运用Luence建索和检索的方法

小丑虾

浏览: 53388 次
性别:
来自: 郑州

最近访客更多访客>>

zxlfast

广东林发

lzj0.0

扮猪吃老虎

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

虾米-Luence检索

初学者还在进步勿批！

链接地址： http://2814704901.iteye.com/admin/blogs/1930415

建索：

import java.io.File;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Date;

import java.util.List;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import com.jpsycn.kfwggl.common.tools.GetRootPath;

import com.jpsycn.kfwggl.common.tools.HandlerSummary;

import com.jpsycn.kfwggl.system.entity.ResultGetInfo;

public class CreateIndex {

//抓取到的页面存放的路径

//String filesPath="F:/kfwlyqTxtList";

//分词

private Analyzer analyzer = new StandardAnalyzer();

public void createIndex(List<ResultGetInfo> lt){

Date d=new Date();

String root=GetRootPath.getIndexesPath();

if(!new File(root).exists()){

new File(root).mkdir();

}

//创建的索引存放路径

String INDEXPATH=root+new SimpleDateFormat("yyyyMMddHHMMSS").format(d)+d.getTime();

System.out.println(INDEXPATH);

if(!new File(INDEXPATH).exists()){

new File(INDEXPATH).mkdir();

}

//获取存放索引的文件夹

try {

SimpleDateFormat ft=new SimpleDateFormat("yyyy-MM-dd");

Directory directory = FSDirectory.getDirectory(INDEXPATH);

IndexWriter indexWriter = new IndexWriter(directory, analyzer ,true, IndexWriter.MaxFieldLength.LIMITED);

long begin = new Date().getTime();

for(ResultGetInfo rg:lt){

//获取一个List<esultGetInfo>遍历里面的值建索

//其中红色titleResult 就是索引如字典中的索引蓝色就是你要建索的字符串

//Field.Store.YES 表示是否存储以后可以检索

//Field.Index.ANALYZED 表示是否分词

Document doc = new Document();

String titleResult=rg.getTitle()==null?"":rg.getTitleResult().trim();

String content =rg.getContent()==null?"":rg.getContent();

String link=rg.getLink()==null?"":rg.getLink().trim();

String releaseDate=rg.getReleaseDate()==null?"":ft.format(rg.getReleaseDate());

doc.add(new Field("titleResult", titleResult, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));

doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));

doc.add(new Field("link", link, Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.YES));

doc.add(new Field("releaseDate", releaseDate, Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.YES));

indexWriter.addDocument(doc);

}

long end = new Date().getTime();

System.out.println(">>> 1.存入索引完毕.. 共花费：" + (end - begin) +"毫秒...");

indexWriter.optimize();

indexWriter.close();

} catch (Exception e) {

e.printStackTrace();

}

检索：

import java.io.File;

import java.util.ArrayList;

import java.util.Date;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.queryParser.MultiFieldQueryParser;

import org.apache.lucene.search.BooleanClause;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

public class GetResultInfo {

public static List<String> getResultInfos(String keyName,String INDEXPATH,String titleOrContent){

List<String> list=new ArrayList<String>();

Analyzer analyzer = new StandardAnalyzer();

//String titleResult="titleResult";

String link="link";

//String content=titleOrContent;

//String releaseDate="releaseDate";

//索引存放位置

try {

IndexSearcher indexSearcher = new IndexSearcher(INDEXPATH);

//System.out.println(">>> 2.开始读取索引... ... 通过关键字：【 "+ keyName +" 】");

BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD };

Query queryOBJ = MultiFieldQueryParser.parse(keyName, new String[]{titleOrContent}, clauses, analyzer);//parser.parse(query);

//Filter filter = null;

//################# 搜索相似度最高的记录 ###################

//TopDocs topDocs = indexSearcher.search(queryOBJ, filter, 1000);

TopDocs topDocs = indexSearcher.search(queryOBJ , 10000);

//System.out.println("*** 共匹配：" + topDocs.totalHits + "个 ***");

//ResultGetInfo rg = null;

//输出结果

for (ScoreDoc scoreDoc : topDocs.scoreDocs){

* 这里我就返回一个List<String>集合里面存放路径 url

* 这里的link是需要和建索的时候的 link对应的而且相同

Document targetDoc = indexSearcher.doc(scoreDoc.doc);

list.add(targetDoc.get(link).trim());

/*rg = new ResultGetInfo();

//注释掉的是关于高亮显示的部分获取到的是含有html标签的字符串需要你转换

//设置高亮显示格式

SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'><strong>", "</strong></font>");

/* 语法高亮显示设置 */

/*Highlighter highlighter = new Highlighter(simpleHTMLFormatter,new QueryScorer(queryOBJ));

highlighter.setTextFragmenter(new SimpleFragmenter(100));

// 设置高亮设置 title,content 字段

/*String title = targetDoc.get("titleResult");

String contents = targetDoc.get("content");

TokenStream titleTokenStream = analyzer.tokenStream(titleResult,new StringReader(title));

TokenStream contentTokenStream = analyzer.tokenStream(content,new StringReader(contents));

String highLightTitle = highlighter.getBestFragment(titleTokenStream, title);

String highLightContent = highlighter.getBestFragment(contentTokenStream, contents);

if(highLightTitle == null){

highLightTitle = title;

}

if(highLightContent == null) {

highLightContent = content;

}

rg.setLink(targetDoc.get(link));

rg.setTitleResult(highLightTitle);

rg.setContent(highLightContent);

rg.setReleaseDate(new SimpleDateFormat("yyyy-MM-dd hh:mm:ss").parse(targetDoc.get(releaseDate)+" 00:00:00"));

list.add(rg);*/

}

indexSearcher.close();

return list;

} catch (Exception e) {

e.printStackTrace();

return null;

}

public static List<String> getDirPath(String path){

List<String> dirPaths=new ArrayList<String>();

File f=new File(path);

File files[]=f.listFiles();

if(files.length==0){

System.out.println("没有存放索引的文件夹");

}else{

for(int i=0;i<files.length;i++){

//检索每个存放索引的文件夹

dirPaths.add(files[i].getAbsolutePath());

}

return dirPaths;

}

public static Map<String,String> getInfos(String path,String str[],String titleOrContent){

long begin = new Date().getTime();

Map<String,String> map=new HashMap<String,String>();

//获取存放索引的所有文件夹

List<String> dirPaths=GetResultInfo.getDirPath(path);

for(int k=0;k<str.length;k++){

for(int i=0;i<dirPaths.size();i++){

List<String> infoList=GetResultInfo.getResultInfos(str[k],dirPaths.get(i),titleOrContent);

for(int j=0;j<infoList.size();j++){

map.put(infoList.get(j),infoList.get(j));

}

long end = new Date().getTime();

System.out.println(">>> 搜索完毕... ... 共花费：" + (end - begin) +"毫秒...");

System.out.println("一共检索到"+map.size()+"条");

return map;

}

这个例子是我对解析到的网页的路径、标题、内容、发布日期、来源进行建索

然后通过对内容的检索获取该网页的路径

lucene-core-2.4.1.jar (803.5 KB)
下载次数: 0

lucene-demos-2.4.1.jar (54.7 KB)
下载次数: 0

lucene-highlighter-2.4.0.jar (88.8 KB)
下载次数: 0

分享到：

通过Js实现高亮显示 | 登录拦截器

2013-08-24 17:57
浏览 712
评论(0)
分类:Web前端
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论