java的搜索引擎之Lucene的创建和搜索高亮 -

hi_beijing

浏览: 251156 次
性别:
来自: 济南

最近访客更多访客>>

linchers

cicely_123456

搬运代码的MrLiu

spedit

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

java的搜索引擎之Lucene的创建和搜索高亮

博客分类：

Lucene

TermQuery 关键字查询;

RangeQuery 范围查询;

WildcardQuery 通配符查询;

PhraseQuery 短语查询;

BooleanQuery 布尔查询;

 package cn.mmclbs.com;

import java.io.File;

import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.junit.Test;
/**
 * Demonstrates building a Lucene index from one text file and then running a
 * highlighted search against that index.
 *
 * <p>Both entry points are JUnit tests. {@link #SearchTest()} reads the index
 * directory that {@link #IndexTest()} writes, so the index must exist before
 * searching.
 *
 * @author Administrator
 */
public class Lucenes
{
	// Path of the source file whose content gets indexed
	private static String path = "F:\\deng\\Lucene\\datasource\\IndexWriter addDocument's a javadoc .txt";
	// Directory in which the index is stored
	private static String indexPath = "F:\\deng\\Lucene\\luceneIndexs";
	// Chinese word-segmentation analyzer, shared by indexing, query parsing and highlighting
	Analyzer analyzer = new MMAnalyzer();

	/**
	 * Builds the index: converts the source file into a {@link Document} and
	 * writes it to the index directory.
	 *
	 * @throws Exception if the file cannot be read or the index cannot be written
	 */
	@Test
	public void IndexTest() throws Exception
	{
		// Wrap the source path in a File and convert it into a Document
		File file = new File(path);
		Document doc = DocumentAndStringUtil.getString2Docuement(file);

		// create == true: start a fresh index (replacing any existing one);
		// MaxFieldLength.LIMITED caps how many terms are indexed per field
		IndexWriter index = new IndexWriter(indexPath,analyzer,true,MaxFieldLength.LIMITED);
		try
		{
			index.addDocument(doc);
			index.commit(); // flush the pending changes to disk
		}
		finally
		{
			// Always close the writer so the index write lock is released,
			// even when adding or committing fails
			index.close();
		}
	}

	/**
	 * Searches the "content" field of the index for a fixed query string and
	 * prints every hit with the matched terms highlighted.
	 *
	 * @throws Exception if the query cannot be parsed or the index cannot be read
	 */
	@Test
	public void SearchTest() throws Exception
	{
		// Text to search for
		String queryString = "高德";
		// Fields the query is matched against
		String[] field = {"content"};

		// Parse the query string with the same analyzer that built the index
		QueryParser queryParser = new MultiFieldQueryParser(field,analyzer);
		Query query = queryParser.parse(queryString);

		// Open the index for searching
		IndexSearcher search = new IndexSearcher(indexPath);
		try
		{
			Filter filter = null; // no filtering of results
			// Collect at most 10000 top-scoring hits
			TopDocs topDocs = search.search(query,filter,10000);
			// Total number of matching documents
			System.out.println("一共搜索出["+ topDocs.totalHits +"]条记录");

			// Walk through the returned hits
			for(ScoreDoc scoreDoc:topDocs.scoreDocs)
			{
				// Lucene's internal document number for this hit
				int docSn = scoreDoc.doc;
				// Load the stored document by its number and print it
				Document doc = search.doc(docSn);
				DocumentAndStringUtil.getDocumentoString(doc,analyzer,query);
			}
		}
		finally
		{
			// Release the files held open by the searcher
			search.close();
		}
	}
}

工具类：

package cn.mmclbs.com;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

/**
 * Conversion helpers between files, Lucene {@link Document}s and printable,
 * highlighted text.
 *
 * @author Administrator
 */
public class DocumentAndStringUtil
{

	/**
	 * Builds a Lucene {@link Document} describing the given file.
	 *
	 * <p>The document carries four stored fields: "name" (file name, analyzed),
	 * "content" (full text, analyzed), "size" (length in bytes, indexed as a
	 * single un-analyzed term) and "url" (absolute path, stored only, not indexed).
	 *
	 * @param file the file to convert
	 * @return the populated document
	 * @throws Exception if the file cannot be read
	 */
	public static Document getString2Docuement(File file) throws Exception
	{
		Document doc = new Document();
		doc.add(new Field("name",file.getName(),Store.YES,Index.ANALYZED));
		doc.add(new Field("content",readFile(file),Store.YES,Index.ANALYZED));
		doc.add(new Field("size",String.valueOf(file.length()),Store.YES,Index.NOT_ANALYZED));
		doc.add(new Field("url",file.getAbsolutePath(),Store.YES,Index.NO));
		return doc;
	}

	/**
	 * Reads a whole text file into a String, terminating every line with "\n".
	 *
	 * <p>NOTE(review): the bytes are decoded with the platform default charset,
	 * as in the original code; confirm that this matches the encoding of the
	 * indexed files.
	 *
	 * @param file the file to read
	 * @return the file content, one "\n" appended after each line
	 * @throws Exception if the file cannot be opened or read
	 */
	private static String readFile(File file) throws Exception
	{
		InputStream inStream = new FileInputStream(file);
		try
		{
			Reader reader = new InputStreamReader(inStream);
			BufferedReader br = new BufferedReader(reader);
			StringBuilder buffer = new StringBuilder();
			String line;
			while((line = br.readLine()) != null)
			{
				buffer.append(line).append("\n");
			}
			return buffer.toString();
		}
		finally
		{
			// Close the underlying stream on every path so the file handle is not leaked
			inStream.close();
		}
	}

	/**
	 * Prints the name, highlighted content, size and path of a document to
	 * standard output.
	 *
	 * @param doc the document to print
	 * @param analyzer analyzer used to tokenize the content for highlighting
	 * @param query the query whose matching terms are highlighted
	 * @throws Exception if highlighting fails
	 */
	public static void getDocumentoString(Document doc, Analyzer analyzer,Query query) throws Exception
	{
		System.out.println("名称："+doc.get("name"));
		System.out.println("内容："+getHighLight(doc,analyzer,query,"content"));
		System.out.println("大小："+doc.get("size"));
		System.out.println("地址："+doc.get("url"));
	}

	/**
	 * Returns the best-matching fragment of a stored field with the query terms
	 * wrapped in {@code <b>...</b>}.
	 *
	 * @param doc the document holding the field
	 * @param analyzer analyzer used to tokenize the field value
	 * @param query the query whose matching terms are highlighted
	 * @param field name of the field to highlight
	 * @return the highlighted fragment (fragment size 100); the unmodified field
	 *         value when the highlighter produces no fragment; or {@code null}
	 *         when the document has no stored value for the field
	 * @throws Exception if tokenizing or highlighting fails
	 */
	public static String getHighLight(Document doc, Analyzer analyzer, Query query, String field) throws Exception
	{
		// Fetch the stored field value that is to be highlighted
		String fieldValue = doc.get(field);
		if (fieldValue == null)
		{
			// Nothing stored under this name: there is no text to highlight,
			// and new StringReader(null) would throw a NullPointerException
			return null;
		}

		// Highlighter configuration: bold tags around hits, fragments of 100 characters
		SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b>", "</b>");
		Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(100));

		TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(fieldValue));
		// Produce the highlighted fragment
		String highLightFieldValue = highlighter.getBestFragment(tokenStream, fieldValue);
		if (highLightFieldValue == null)
		{
			// No fragment was produced: fall back to the raw field value
			highLightFieldValue = fieldValue;
		}
		return highLightFieldValue;
	}
}

分享到：

HTML的HtmlParser解析器 | lucene文本高亮

2012-01-16 15:20
浏览 946
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

java的搜索引擎之Lucene的创建和搜索高亮

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

java的搜索引擎之Lucene的创建和搜索高亮

评论

发表评论

相关推荐

Lucene升序降序排序

Lucene相关度排序

Lucene的增删改查的操作

Lucene创建索引的优化

lucene的jar包

lucene文本高亮

最近访客更多访客>>