`
mr_lonely_hp
  • 浏览: 91157 次
  • 性别: Icon_minigender_1
  • 来自: 湖南
社区版块
存档分类
最新评论

nutch 1.2 分页处理

阅读更多

<%@ page session="false" contentType="text/html; charset=UTF-8"
pageEncoding="UTF-8" import="java.io.*" import="java.util.*"
import="java.net.*" import="javax.servlet.http.*"
import="javax.servlet.*" import="org.apache.nutch.html.Entities"
import="org.apache.nutch.metadata.Nutch"
import="org.apache.nutch.searcher.*" import="org.apache.nutch.plugin.*"
import="org.apache.nutch.clustering.*"
import="org.apache.hadoop.conf.*"
import="org.apache.nutch.util.NutchConfiguration"%>
<%!
/**
 * Number of hits to retrieve and cluster if a clustering extension is
 * available and clustering is on. Read in jspInit() from the
 * "extension.clustering.hits-to-cluster" property; 100 when the property
 * is absent. Configurable via nutch-conf.xml.
 */
private int HITS_TO_CLUSTER;

/**
 * Maximum hits per page to be displayed. Read in jspInit() from the
 * "searcher.max.hits.per.page" property; -1 when the property is absent.
 * The page only applies the cap when this value is greater than zero.
 */
private int MAX_HITS_PER_PAGE;

/**
 * An instance of the clustering extension, if available. Assigned in
 * jspInit(); the page treats a null value as "clustering unavailable".
 */
private OnlineClusterer clusterer;

/**
 * Nutch configuration for this servlet, obtained in jspInit() from the
 * servlet context.
 */
private Configuration nutchConf;

/**
 * Initializes the page-level search settings.
 *
 * Loads the Nutch configuration bound to the servlet context, reads the
 * clustering and page-size limits from it, and tries to obtain an online
 * clusterer. A clusterer failure is not fatal: it is logged together with
 * its stack trace and the clusterer field is left unassigned, which the
 * page interprets as "clustering unavailable".
 */
public void jspInit() {
  super.jspInit();

  final ServletContext application = getServletContext();
  nutchConf = NutchConfiguration.get(application);
  HITS_TO_CLUSTER = nutchConf.getInt("extension.clustering.hits-to-cluster", 100);
  MAX_HITS_PER_PAGE = nutchConf.getInt("searcher.max.hits.per.page", -1);

  try {
    clusterer = new OnlineClustererFactory(nutchConf).getOnlineClusterer();
  } catch (PluginRuntimeException e) {
    // Hand the exception itself to the servlet log so the stack trace is kept.
    super.log("Could not initialize online clusterer: " + e.toString(), e);
  }
}
%>

<%--
// Uncomment this to enable query refinement.
// Do the same to "refine-query.jsp" below.,
<%@ include file="./refine-query-init.jsp" %>
--%>

<%
// The Nutch bean instance is initialized through a ServletContextListener
// that is set up in the web.xml file.
NutchBean bean = NutchBean.get(application, nutchConf);
// Set the character encoding to use when interpreting request values.
// This is done before any request.getParameter() call below.
request.setCharacterEncoding("UTF-8");

bean.LOG.info("query request from " + request.getRemoteAddr());

// Get the query from the request; a missing query is treated as empty.
String queryString = request.getParameter("query");
if (queryString == null) {
  queryString = "";
}
String htmlQueryString = Entities.encode(queryString);

// A flag to make the code a bit cleaner.
boolean clusteringAvailable = (clusterer != null);

String clustering = "";
if (clusteringAvailable && "yes".equals(request.getParameter("clustering"))) {
  clustering = "yes";
}

// Index of the first hit to display. Malformed or negative values fall back
// to 0 instead of failing the whole request with a NumberFormatException.
int start = 0;
String startString = request.getParameter("start");
if (startString != null) {
  try {
    start = Math.max(0, Integer.parseInt(startString.trim()));
  } catch (NumberFormatException e) {
    start = 0;
  }
}

// Maximum number of hits shown per page. Only positive values are accepted,
// because the pagination code divides by this number.
int hitsPerPage = 10;
String hitsString = request.getParameter("hitsPerPage");
if (hitsString != null) {
  try {
    int requested = Integer.parseInt(hitsString.trim());
    if (requested > 0) {
      hitsPerPage = requested;
    }
  } catch (NumberFormatException e) {
    hitsPerPage = 10;
  }
}
if (MAX_HITS_PER_PAGE > 0 && hitsPerPage > MAX_HITS_PER_PAGE) {
  hitsPerPage = MAX_HITS_PER_PAGE;
}

// Hits per site; the default must stay zero. Malformed or negative values
// fall back to that default.
int hitsPerSite = 0;
String hitsPerSiteString = request.getParameter("hitsPerSite");
if (hitsPerSiteString != null) {
  try {
    hitsPerSite = Math.max(0, Integer.parseInt(hitsPerSiteString.trim()));
  } catch (NumberFormatException e) {
    hitsPerSite = 0;
  }
}

String sort = request.getParameter("sort");
boolean reverse =
    sort != null && "true".equals(request.getParameter("reverse"));

String params = "&hitsPerPage=" + hitsPerPage
    + (sort == null ? "" : "&sort=" + sort + (reverse ? "&reverse=true" : ""));

int hitsToCluster = HITS_TO_CLUSTER; // number of hits to cluster

// Get the language from the request; a missing value is treated as empty.
String queryLang = request.getParameter("lang");
if (queryLang == null) {
  queryLang = "";
}
Query query = Query.parse(queryString, queryLang, nutchConf);
bean.LOG.info("query: " + queryString);
bean.LOG.info("lang: " + queryLang);

String language =
    ResourceBundle.getBundle("org.nutch.jsp.search", request.getLocale())
        .getLocale().getLanguage();
// request.getRequestURL() replaces the deprecated HttpUtils.getRequestURL(request).
String requestURI = request.getRequestURL().toString();
String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
String rss = "../opensearch?query=" + htmlQueryString
    + "&hitsPerSite=" + hitsPerSite + "&lang=" + queryLang + params;
%>

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<%
// Flush the output buffer before the i18n tag library is used.
// NOTE(review): presumably this commits the response so the content type
// declared by the page directive cannot be changed by the taglib - confirm.
out.flush();
%>

<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n"%>
<i18n:bundle baseName="org.nutch.jsp.search" />
<html lang="<%= language %>">
<head>
<title><i18n:message key="title" /></title>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<link href="resources/test/news.css" type="text/css" rel="stylesheet">
<link href="resources/test/base.css" type="text/css" rel="stylesheet">
<script src="resources/test/ajax-min.js"></script>
<script src="resources/test/gosoBox-min.js"></script>
<script src="resources/test/helpers-min.js"></script>
<script src="resources/test/autoComplete-min.js"></script>
<script src="resources/test/goso-zoomImg-min.js"></script>
<script src="resources/test/page-min.js"></script>
<script language="javascript" src="resources/test/navMore-min.js"></script>
<base href="<%= base + "/" + language %>/">
<script type="text/javascript">

// Requests "/ses" for the given sub-event id and, when the response is not
// empty, writes it into the element "d_<ulId>". Unless num is 'little', every
// child node of "dl_<ulId>" is then marked "TimeOff" and the element
// "dd_<ulId>_<num>" is marked "TimeOn". The flag argument is not used.
function subEvent(subId, ulId, num, flag) {
  var onLoaded = function (data) {
    if (data == "") {
      return;
    }
    document.getElementById("d_" + ulId).innerHTML = data;
    if ('little' == num) {
      return;
    }
    var entries = document.getElementById("dl_" + ulId).childNodes;
    var count = entries.length;
    for (var idx = 0; idx < count; idx++) {
      entries[idx].className = "TimeOff";
    }
    var selected = document.getElementById("dd_" + ulId + "_" + num);
    selected.className = "TimeOn";
  };
  new AJAX.Request("/ses", {parameters: {id: subId}, success: onLoaded});
}
// Copies the markup of the element with the given id into the shared
// "zz_show_div" container and opens that container through gosoBox.
function showSource(id) {
  var source = document.getElementById(id);
  var popup = document.getElementById("zz_show_div");
  var markup = source.innerHTML;
  popup.innerHTML = markup;
  gosoBox.showBox("zz_show_div");
}
// Closes the shared "zz_show_div" container through gosoBox.
// The id argument is accepted but not used.
function closeDiv_zz(id) {
  var popupId = "zz_show_div";
  gosoBox.hideBox(popupId);
}
// Re-targets the search form "frm" to "/nso" and submits it.
// The keyword field is filled with the fixed term only when an element with
// id "keyword" exists; without this check a missing element aborted the
// function with a TypeError before the form could be submitted.
function searchNews() {
  var frm = document.getElementById("frm");
  if (!frm) {
    return; // nothing to submit on pages without the search form
  }
  frm.action = "/nso";
  var keywordInput = document.getElementById("keyword");
  if (keywordInput) {
    keywordInput.value = '地震';
  }
  frm.submit();
}

// Comma-terminated list of ids for which at least one request returned data,
// e.g. "3,17,".
var showIDs = "";

// Requests "/pis" for the given codes and type. A non-empty response is
// appended to the element "<type>_dl_<id>", the id is recorded in showIDs and
// the "about_<id>" element is shown. An empty response hides "<type>_dl_<id>",
// and hides "about_<id>" as well unless the id was recorded earlier.
function getInfoByCode(codes, id, type) {
  var key = '地震';
  new AJAX.Request("/pis", {parameters: {codes: codes, type: type, key: key}, success: function (data) {
    if (data != "") {
      showIDs = showIDs + id + ",";
      document.getElementById(type + "_dl_" + id).innerHTML += data;
      document.getElementById("about_" + id).style.display = "";
    } else {
      document.getElementById(type + "_dl_" + id).style.display = "none";
      // Match whole list entries: a plain indexOf(id) would treat id "1" as
      // present when only "11" had been recorded.
      var alreadyShown = ("," + showIDs).indexOf("," + id + ",") != -1;
      if (!alreadyShown) {
        document.getElementById("about_" + id).style.display = "none";
      }
    }
  }});
}


// Replaces the source of the given image with the shared placeholder picture.
// NOTE(review): presumably wired up as an image error handler - confirm.
// The handler is cleared first so that a placeholder that itself fails to
// load cannot trigger this function again in an endless loop.
function showErrorImg(img) {
  img.onerror = null;
  img.src = 'http://multimedia.goso.cn/goso/common/newEvent/images/pic/ZWImg.gif';
}



// Requests "/ms" with the given media string and, when the response is not
// empty, writes it into the element "temp_a_<id>".
function getMediaData(mediaStr, id) {
  var showMedia = function (data) {
    if (data == "") {
      return;
    }
    var target = document.getElementById("temp_a_" + id);
    target.innerHTML = data;
  };
  new AJAX.Request("/ms", {parameters: {mediaStr: mediaStr}, success: showMedia});
}

// Passes the image to a zoomImg helper constructed with the values 130 and 85.
function loadImg(img) {
  new zoomImg(130, 85).zoom(img);
}
</script>
<script type="text/javascript">
<!--
// Moves the keyboard focus to the query input when the page has loaded.
// The search form on this page is named "frm"; "search" is kept as a
// fallback name. Missing form or field is ignored instead of raising a
// TypeError from the body onLoad handler.
function queryfocus() {
  var form = document.forms["frm"] || document.forms["search"];
  if (form && form.query) {
    form.query.focus();
  }
}
// -->
</script>
</head>
<body class="w100BI" onLoad="queryfocus();">
<%
// A negative offset can never address a hit; treat it as the first page.
if (start < 0) {
  start = 0;
}

// How many hits to retrieve? If clustering is on and available,
// take "hitsToCluster", otherwise just get hitsPerPage.
int hitsToRetrieve = (clusteringAvailable && clustering.equals("yes") ? hitsToCluster : hitsPerPage);

if (clusteringAvailable && clustering.equals("yes")) {
  bean.LOG.info("Clustering is on, hits to retrieve: " + hitsToRetrieve);
}

// Perform the query.
// NOTE by Dawid Weiss:
// The 'clustering' window actually moves with the start
// position.... this is good, bad?... ugly?....
Hits hits;
try {
  query.getParams().initFrom(start + hitsToRetrieve, hitsPerSite, "site", sort, reverse);
  hits = bean.search(query);
} catch (IOException e) {
  // Keep rendering an empty result page, but leave a trace of the failure.
  bean.LOG.warn("search failed for query: " + queryString, e);
  hits = new Hits(0, new Hit[0]);
}

// "end" is the exclusive index of the last hit shown on this page.
int end = (int) Math.min(hits.getLength(), start + hitsPerPage);
// When start lies beyond the available hits the differences below would be
// negative; clamp them to zero so no negative length is requested.
int length = Math.max(0, end - start);
int realEnd = (int) Math.min(hits.getLength(), start + hitsToRetrieve);

Hit[] show = hits.getHits(start, Math.max(0, realEnd - start));
HitDetails[] details = bean.getDetails(show);
Summary[] summaries = bean.getSummary(details, query);
bean.LOG.info("total hits: " + hits.getTotal());
%>
<div class="NewNav">
<dl>
<dd>
<a href="http://www.goso.cn/">首页</a>
</dd>
<dd class="red">
<b>新闻</b>
<img src="resources/test/ico1_0.gif" alt="new" class="icoNew">
</dd>
<dd>
<a href="http://www.goso.cn/so?q=%E5%9C%B0%E9%9C%87">网页</a>
</dd>
<dd>
<a href="http://image.goso.cn/so?q=%E5%9C%B0%E9%9C%87">图片</a>
</dd>
<dd>
<a href="http://video.goso.cn/so?q=%E5%9C%B0%E9%9C%87">视频</a>
</dd>
<dd>
<a href="http://blog.goso.cn/so?q=%E5%9C%B0%E9%9C%87">博客</a>
</dd>
<dd>
<a href="http://bbs.goso.cn/so?q=%E5%9C%B0%E9%9C%87">论坛</a>
</dd>
<dd class="red">
<a href="http://mobile.goso.cn/" target="_blank">移动</a>
<img src="resources/test/icoNew.jpg" alt="" class="icoNew">
</dd>
<dt>
|
</dt>
<dd>
<a href="javascript:void(0)" onclick="showMoreA(this)">实验室<small>▼</small>
</a>
</dd>
<dd>
<a href="javascript:void(0)" onclick="showMore(this)">实用导航<small>▼</small>
</a>
</dd>
</dl>
</div>
<div class="H10"></div>
<div style="width: 100%;" class="header" id="header">
<div class="logo">
<a href="http://news.goso.cn/"><img
src="resources/test/logoTing.jpg" alt="人民搜索"> </a>
</div>
<div class="searchNew">
<form action="../search.jsp" method="get" class="searchBox"
name="frm" id="frm">
<label>
<input name="query" size=44 value="<%=htmlQueryString%>">
<input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
<input type="hidden" name="lang" value="<%=language%>">
<input type="submit" value="<i18n:message key="search"/>">
</label>
<% if (clusteringAvailable) { %>
<input id="clustbox" type="checkbox" name="clustering" value="yes"
<% if (clustering.equals("yes")) { %> CHECKED <% } %>>
<label for="clustbox">
<i18n:message key="clustering" />
</label>
<% } %>
</form>
</div>
<div class="clear"></div>
<div class="searchinfo">
<div class="titleTing">
<h1>
<label class="NewListOn" id="blogLabel">
<b>事件模式</b>
</label>
<label class="NewListOff" id="bbsLabel">
<b onclick="searchNews()">新闻模式</b>
</label>
</h1>
<label class="txt">
搜索
<%=hits.getTotal()%>个
</label>
</div>
</div>
</div>
<div style="width: 100%;" id="bodyer">

<div class="leftchoose">

</div>

<div class="searchlist searchTing">
<i18n:message key="hits">
<i18n:messageArg value="<%=new Long((end==0)?0:(start+1))%>" />
<i18n:messageArg value="<%=new Long(end)%>" />
<i18n:messageArg value="<%=new Long(hits.getTotal())%>" />
</i18n:message>

<%
// be responsive
out.flush();
%>
<ul id="searchResult">


<li>
<div class="textcontent">
<%
// Render one entry (title link, summary, URL, optional cached link) per hit
// on the current page.
for (int i = 0; i < length; i++) {
  Hit hit = show[i];
  HitDetails detail = details[i];
  String title = detail.getValue("title");
  String url = detail.getValue("url");
  String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getUniqueKey();
  String summary = summaries[i].toHtml(true);
  String caching = detail.getValue("cache");
  boolean showSummary = true;
  boolean showCached = true;
  if (caching != null) {
    showSummary = !caching.equals(Nutch.CACHING_FORBIDDEN_ALL);
    showCached = !caching.equals(Nutch.CACHING_FORBIDDEN_NONE);
  }

  if (title == null || title.equals("")) { // use url for docs w/o title
    title = url;
  }
  // The URL comes from the index, so it is escaped once and the escaped form
  // is used both inside the href attribute and as visible text.
  String htmlUrl = Entities.encode(url);
%>
<h2>
<a href="<%=htmlUrl%>"><%=Entities.encode(title)%></a>
</h2>

<p>
<span class="wrap"> <% if (!"".equals(summary) && showSummary) { %>
<br><%=summary%> <% } %>
</span>
</p>
<%=htmlUrl%>
<%
if (showCached) {
%>(
<a href="../cached.jsp?<%=id%>"><i18n:message key="cached" />
</a>)
<%
}
%>
<br>
<% } %>
</div>
</li>

</ul>

<div class="clear"></div>
<div class="fanye">

<table align="center">
<tr>
<td>
<%
// Page size used for the page arithmetic below; never below 1 so that
// start / pageSize cannot divide by zero.
final int pageSize = Math.max(1, hitsPerPage);
// The lang parameter is echoed into hidden inputs, so it is HTML-escaped.
final String htmlQueryLang = Entities.encode(queryLang);

if (start >= pageSize) { // a previous page exists
%>
<form name="pre" action="../search.jsp" method="get">
<input type="hidden" name="query" value="<%=htmlQueryString%>">
<input type="hidden" name="lang" value="<%=htmlQueryLang%>">
<input type="hidden" name="start" value="<%=start - pageSize%>">
<input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
<input type="hidden" name="hitsPerSite" value="<%=hitsPerSite%>">
<input type="hidden" name="clustering" value="<%=clustering%>">
<input type="submit" value="上一页">
</form>
<%
}

// Numbered page buttons: at most 10 are shown. Once the current page is the
// sixth or later, the window slides so that the current page is the sixth
// button. Offsets are derived from the page number and the page size, and a
// page is only listed when its first hit exists.
final int currentPage = start / pageSize + 1; // one-based
final int firstPage = (currentPage > 5) ? currentPage - 5 : 1;
final long totalHits = hits.getTotal();
for (int pageNo = firstPage; pageNo < firstPage + 10; pageNo++) {
  long pageStart = (long) (pageNo - 1) * pageSize;
  if (pageStart >= totalHits) {
    break;
  }
%>
<span>
<form name="next" action="../search.jsp" method="get">
<input type="hidden" name="query" value="<%=htmlQueryString%>">
<input type="hidden" name="lang" value="<%=htmlQueryLang%>">
<input type="hidden" name="start" value="<%=pageStart%>">
<input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
<input type="hidden" name="hitsPerSite" value="<%=hitsPerSite%>">
<input type="hidden" name="clustering" value="<%=clustering%>">
<input type="submit" value="<%=pageNo%>">
</form>
</span>
<%
}
%>
<%
if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show
    || (!hits.totalIsExact() && (hits.getLength() > start + hitsPerPage))) {
%>

<form name="next" action="../search.jsp" method="get">
<input type="hidden" name="query" value="<%=htmlQueryString%>">
<input type="hidden" name="lang" value="<%=htmlQueryLang%>">
<input type="hidden" name="start" value="<%=end%>">
<input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
<input type="hidden" name="hitsPerSite" value="<%=hitsPerSite%>">
<input type="hidden" name="clustering" value="<%=clustering%>">
<input type="submit" value="<i18n:message key="next"/>">
</form>
<% } %>
</td>
</tr>
</table>

</div>

</div>

<div class="clear"></div>
<div class="footsearch">
<div class="searchNew">
<form method="get" action="/so" class="searchBox">
<label>
<input name="q" class="text" id="keyword2" value="地震"
autocomplete="off" disableautocomplete="" type="text">
<input value="人民搜索" class="submit" type="submit">
</label>
</form>
</div>
</div>
</div>

<div style="display: none;" class="NewsPop" id="zz_show_div">

</div>

<div class="foot">
<p>
<a href="javascript:void(0)"
onclick="GosoHelper.setHome(this,'http://www.goso.cn')">设为首页</a>
<a target="_blank" href="http://www.goso.cn/aboutus.html">关于我们</a>
<a target="_blank" style="color: rgb(255, 0, 0);"
href="http://www.goso.cn/zhaopin/zplist.html">诚聘英才</a>
<a href="http://about.goso.cn/about?pid=1" target="_blank">征求意见</a>
</p>
<p>
版权所有©2011 人民搜索 保留所有权利
<a target="_blank" href="http://www.miibeian.gov.cn/"
class="linkICP">京ICP备10216100号</a>
<a target="_blank" href="http://www.goso.cn/duty.html"
class="linkICP m0I">免责声明</a>
</p>
</div>


<script type="text/javascript">

// Creates the AutoComplete widget for the input with id "keyword", using
// "/live?k=" as its url option and naming doSearch as the click callback.
// NOTE(review): no element with id "keyword" is visible in this page's markup
// (the search box only carries name="query") - confirm the intended target.
var auto =new AutoComplete({
inputID:"keyword",
url:"/live?k=",
callClickBack:"doSearch"
});
// Submits the search form with id "frm".
function doSearch() {
  var searchForm = document.getElementById("frm");
  searchForm.submit();
}

// Requests "/lns" for the fixed keyword and, when the response is not empty,
// writes it into the element "lastNews" and makes that element visible.
// The request is only sent while nowPage is '1' and listSize is positive.
// Pages without a "lastNews" element are left untouched instead of raising a
// TypeError inside the response callback.
function getLastNews() {
  var nowPage = '1';
  var listSize = 10;
  if ('1' == nowPage && listSize > 0) {
    new AJAX.Request("/lns", {parameters: {key: '地震'}, success: function (data) {
      if (data != "") {
        var obj = document.getElementById("lastNews");
        if (!obj) {
          return;
        }
        obj.innerHTML = data;
        obj.style.display = "block";
      }
    }});
  }
}
getLastNews();
</script>
<div
style="width: 491px; left: 207px; top: 87px; position: absolute; z-index: 99999999; display: none;"
class="frameUi">
<ul style="padding: 0pt; width: 97%; margin: 0pt;"></ul>
</div>
<div style="position: absolute; display: none; z-index: 9999;"
id="livemargins_control">
<img src="resources/test/monitor-background-horizontal.png"
style="position: absolute; left: -77px; top: -5px;" width="77"
height="5">
<img src="resources/test/monitor-background-vertical.png"
style="position: absolute; left: 0pt; top: -5px;">
<img id="monitor-play-button"
src="resources/test/monitor-play-button.png"
onmouseover="this.style.opacity=1"
onmouseout="this.style.opacity=0.5"
style="position: absolute; left: 1px; top: 0pt; opacity: 0.5; cursor: pointer;">
</div>
</body>
</html>

分享到:
评论

相关推荐

    nutch1.2 java的project

    1. **导入项目**:在Eclipse中选择“File” > “Import” > “Existing Projects into Workspace”,然后浏览到下载的`nutch1.2+Project`目录,导入项目。 2. **添加库**:确保你的Eclipse环境中已经安装了Apache ...

    nutch1.2 java project

    Nutch 1.2 是一个开源的网络爬虫项目,基于 Java 编写,用于抓取互联网上的网页并建立索引。这个项目是 Apache Software Foundation 的一部分,它为大规模的数据采集提供了强大的工具。Nutch 1.2 版本相对于早期版本...

    Nutch 1.2源码阅读

    ### Nutch 1.2 源码阅读深入解析 #### Crawl类核心作用与流程概览 在深入了解Nutch 1.2源码之前,我们先明确Nutch的架构和工作流程。Nutch作为一款开源搜索引擎框架,其功能涵盖网页抓取、索引构建以及查询处理。...

    nutch1.2源码

    总的来说,Apache Nutch 1.2是构建大规模搜索引擎或进行网络数据分析的理想工具,它与Hadoop的深度集成使得处理大量网页数据变得可能。通过深入理解Nutch的架构和工作流程,你可以构建自己的定制化网络爬虫系统,...

    myeclipse8.5导入nutch1.2源码

    - 在 Default output folder 设置中,将输出目录更改为 `nutch1.2/bin/tmp_nutch`。 - 转到 Libraries 标签页,点击 Add Class Folder,选择 `nutch1.2/conf` 目录。 3. **调整库顺序**: - 在 Order and Export...

    nutch1.2测试文档

    nutch1.2测试文档

    nutch-1.2.war

    nutch官方简单案例,版本是nutch-1.2.war

    Windows下cygwin+MyEclipse 8.5+Nutch1.2+Tomcat 6.0

    ### Windows下cygwin+MyEclipse 8.5+Nutch1.2+Tomcat 6.0 本文旨在详细介绍如何在Windows环境下搭建基于cygwin、MyEclipse 8.5、Nutch 1.2及Tomcat 6.0的开发环境,并对每个步骤进行深入解析。 #### 一、Cygwin的...

    nutch-1.2.part02

    nutch Nutch是一个由Java实现的,刚刚诞生开放源代码(open-source)的web搜索引擎。 尽管Web搜索是漫游Internet的基本要求, 但是现有web搜索引擎的数目却在下降。 并且这很有可能进一步演变成为一个公司垄断了几乎...

    实验报告(利用Nutch和IKanalyzer构造中文分词搜索引擎)

    在Nutch 1.2中集成IKAnalyzer,需要修改NutchAnalysis.jj文件,将SIGRAM规则调整为支持连续的汉字,然后在代码中初始化IKTokenizer,使其能够处理输入的文本流。通过这种方式,Nutch现在能够对抓取的网页内容进行...

    nutch 0.9分页代码(粘贴可用)

    ### Nutch 0.9 分页代码解析与应用 #### 一、背景介绍 Nutch 是一个开源的网络爬虫项目,它提供了高度可扩展且可靠的网页抓取框架。随着互联网的发展,数据量日益增大,如何高效地处理这些数据成为了一个重要的...

    Nutch搜索引擎培训讲义

    - 选择“Source”选项卡,将默认输出目录从`nutch1.2/bin`修改为`nutch1.2/_bin`。 - 对于bin文件夹,可以通过右键点击“Team” > “Restore”来恢复其内容。 3. **添加JAR包** - 通过“Add JARs”功能,将`...

    nutch-1.2.part06

    nutch Nutch是一个由Java实现的,刚刚诞生开放源代码(open-source)的web搜索引擎。 尽管Web搜索是漫游Internet的基本要求, 但是现有web搜索引擎的数目却在下降。 并且这很有可能进一步演变成为一个公司垄断了几乎...

    nutch部分网页乱码BUG修正

    Nutch是Apache开发的一款开源网络爬虫项目,用于抓取互联网上的网页并建立索引,以便于搜索引擎进行数据处理。然而,在实际使用过程中,由于编码问题,Nutch可能会出现部分网页乱码的情况。本篇文章将深入探讨这个...

    nutch-1.2.part07

    nutch Nutch是一个由Java实现的,刚刚诞生开放源代码(open-source)的web搜索引擎。 尽管Web搜索是漫游Internet的基本要求, 但是现有web搜索引擎的数目却在下降。 并且这很有可能进一步演变成为一个公司垄断了几乎...

    nutch-1.2.part05

    nutch Nutch是一个由Java实现的,刚刚诞生开放源代码(open-source)的web搜索引擎。 尽管Web搜索是漫游Internet的基本要求, 但是现有web搜索引擎的数目却在下降。 并且这很有可能进一步演变成为一个公司垄断了几乎...

    nutch-1.2.part03

    nutch Nutch是一个由Java实现的,刚刚诞生开放源代码(open-source)的web搜索引擎。 尽管Web搜索是漫游Internet的基本要求, 但是现有web搜索引擎的数目却在下降。 并且这很有可能进一步演变成为一个公司垄断了几乎...

    搭建nutch web开发环境

    3. **Hadoop**:Nutch可以与Hadoop集成,用于分布式处理和存储数据。安装Hadoop 1.x或2.x版本,并配置好`HADOOP_HOME`环境变量。 **获取Nutch源代码** 1. 使用Git克隆Nutch 1.2的源代码库: ``` git clone ...

    nutch-1.2.part10

    Nutch是一个由Java实现的,刚刚诞生开放源代码(open-source)的web搜索引擎。 尽管Web搜索是漫游Internet的基本要求, 但是现有web搜索引擎的数目却在下降。 并且这很有可能进一步演变成为一个公司垄断了几乎所有的...

Global site tag (gtag.js) - Google Analytics