`
TRAMP_ZZY
  • 浏览: 141027 次
社区版块
存档分类
最新评论

Lucene 搜索

阅读更多
/** 
 * Project Name:docsearch 
 * File Name:Search.java 
 * Package Name:cn.tramp.docsearch.search 
 * Date:2014年2月27日 下午6:40:58 
 * Copyright (c) 2014, zhangzhaoyu0524@163.com All Rights Reserved. 
 * 
*/  
  
package cn.tramp.docsearch.search;  

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
import org.apache.lucene.queryparser.flexible.core.nodes.RangeQueryNode;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import cn.tramp.docsearch.domain.DocumentInfo;
import cn.tramp.docsearch.index.Index;
import cn.tramp.docsearch.util.IndexPropertyUtil;
import cn.tramp.docsearch.util.Page;

/**
 * Read-side facade over the document Lucene index.
 * <p>
 * Every public query method builds a Lucene {@link Query}, runs it, stores the
 * number of collected hits (capped at {@link #MAX_HITS}) on the supplied
 * {@link Page}, and returns the hits of the requested page converted into
 * {@link DocumentInfo} objects with the matched terms wrapped in highlight markup.
 * <p>
 * Failure contract, kept for existing callers: query methods log the exception
 * and return {@code null}; they never throw.
 * <p>
 * NOTE(review): the shared {@link IndexReader} is a static field that is lazily
 * opened and closed without synchronization; confirm how instances are shared
 * across threads before relying on this class concurrently.
 *
 * @author   zhangzhaoyu
 * @since    JDK 1.7
 */
public class Search {

	private static final Log logger = LogFactory.getLog(Search.class);

	/** Maximum number of hits collected to compute the total record count. */
	private static final int MAX_HITS = 1000;

	/** Milliseconds per day, as a long so that multi-day offsets cannot overflow int. */
	private static final long MILLIS_PER_DAY = 24L * 60 * 60 * 1000;

	public Directory directory;
	private String indexPath;
	private String docmentPath;

	/** Reader shared by all instances; opened lazily by {@link #getIndexSearcher()}. */
	private static IndexReader reader = null;

	/**
	 * Reads the "indexPath" and "docmentPath" properties and opens the index directory.
	 * On an I/O failure the error is logged and {@link #directory} stays {@code null}.
	 */
	public Search() {
		try {
			indexPath = IndexPropertyUtil.getKeyValueByName("indexPath");
			docmentPath = IndexPropertyUtil.getKeyValueByName("docmentPath");
			directory = FSDirectory.open(new File(indexPath));
		} catch (IOException e) {
			logger.error("Unable to open index directory: " + indexPath, e);
		}
	}

	/**
	 * Returns a searcher over the shared reader, opening the reader on first use.
	 *
	 * @return a new searcher, or {@code null} if the reader could not be opened
	 */
	public IndexSearcher getIndexSearcher() {
		try {
			if (reader == null) {
				reader = DirectoryReader.open(directory);
			}
			return new IndexSearcher(reader);
		} catch (Exception e) {
			logger.error("Unable to open index reader", e);
			return null;
		}
	}

	/**
	 * Parses {@code searchText} with the classic query parser against {@code field}
	 * and returns the requested page of hits.
	 * <p>
	 * Unlike the other query methods this one opens its own reader for the call
	 * instead of using the shared one; the reader is now closed before returning
	 * (it used to be leaked).
	 *
	 * @param field      default field for the query parser
	 * @param searchText query string in classic query-parser syntax
	 * @param page       paging input (current page, page size); receives the total record count
	 * @return the page of results, or {@code null} if the search failed
	 */
	public List<DocumentInfo> queryForDocumentByPage(String field, String searchText, Page<DocumentInfo> page) {
		IndexReader localReader = null;
		try {
			localReader = DirectoryReader.open(directory);
			IndexSearcher searcher = new IndexSearcher(localReader);
			QueryParser parser = new QueryParser(Version.LUCENE_43, field, new StandardAnalyzer(Version.LUCENE_43));
			Query query = parser.parse(searchText);
			//Query query = new FuzzyQuery(new Term(field, searchText));
			//Query query = new TermQuery(new Term(field, searchText));
			return searchPage(searcher, query, query, page);
		} catch (Exception e) {
			logger.error("queryForDocumentByPage failed for field=" + field + ", text=" + searchText, e);
			return null;
		} finally {
			// The result list is fully materialized by now, so the reader can be released.
			closeQuietly(localReader);
		}
	}

	/**
	 * Prefix search: matches documents whose "doc_name" starts with {@code doc_name}
	 * or whose "content" starts with {@code content}.
	 * <p>
	 * The content clause previously reused {@code doc_name} and ignored
	 * {@code content} entirely. It now uses {@code content}; when {@code content}
	 * is {@code null} or empty it falls back to {@code doc_name}, so callers that
	 * never supplied a content value see the same results as before.
	 *
	 * @param doc_name prefix for the "doc_name" field
	 * @param content  prefix for the "content" field (optional)
	 * @param page     paging input; receives the total record count
	 * @return the page of results, or {@code null} if the search failed
	 */
	public List<DocumentInfo> queryForListByWildcard(String doc_name, String content, Page<DocumentInfo> page) {
		try {
			IndexSearcher searcher = getIndexSearcher();
			String contentPrefix = (content == null || content.isEmpty()) ? doc_name : content;
			Query docNamequery = new WildcardQuery(new Term("doc_name", doc_name + "*"));
			Query contentQuery = new WildcardQuery(new Term("content", contentPrefix + "*"));

			BooleanQuery booleanQuery = new BooleanQuery();
			booleanQuery.add(docNamequery, Occur.SHOULD);
			booleanQuery.add(contentQuery, Occur.SHOULD);

			return searchPage(searcher, booleanQuery, booleanQuery, page);
		} catch (Exception e) {
			logger.error("queryForListByWildcard failed for doc_name=" + doc_name, e);
			return null;
		}
	}

	/**
	 * Runs the conjunction (every clause {@code MUST} match) of the given queries.
	 *
	 * @param termQueryList clauses to AND together
	 * @param page          paging input; receives the total record count
	 * @return the page of results, or {@code null} if the search failed
	 */
	public List<DocumentInfo> queryByComplexBoolean(List<Query> termQueryList, Page<DocumentInfo> page) {
		try {
			IndexSearcher searcher = getIndexSearcher();
			BooleanQuery query = new BooleanQuery();
			for (Query termQuery : termQueryList) {
				query.add(termQuery, Occur.MUST);
			}
			return searchPage(searcher, query, query, page);
		} catch (Exception e) {
			logger.error("queryByComplexBoolean failed", e);
			return null;
		}
	}

	/**
	 * Parses {@code searchContent + "*"} once against "doc_name" (SmartChineseAnalyzer)
	 * and once against "content" (StandardAnalyzer) and ORs the two queries.
	 *
	 * @param searchContent user query text in classic query-parser syntax
	 * @param page          paging input; receives the total record count
	 * @return the page of results, or {@code null} if the search failed
	 */
	public List<DocumentInfo> queryByQueryParse(String searchContent, Page<DocumentInfo> page) {
		try {
			IndexSearcher searcher = getIndexSearcher();
			QueryParser parser = new QueryParser(Version.LUCENE_43, "doc_name", new SmartChineseAnalyzer(Version.LUCENE_43));

			//parser.setDefaultOperator(Operator.AND);
			Query docNameQuery = parser.parse(searchContent + "*");
			parser = new QueryParser(Version.LUCENE_43, "content", new StandardAnalyzer(Version.LUCENE_43));
			Query contentQuery = parser.parse(searchContent + "*");

			BooleanQuery booleanQuery = new BooleanQuery();
			booleanQuery.add(docNameQuery, Occur.SHOULD);
			booleanQuery.add(contentQuery, Occur.SHOULD);

			// NOTE(review): highlighting is driven by the doc_name query only, exactly as
			// before, while every sibling method highlights with the full boolean query.
			// Left unchanged because it may be deliberate - confirm.
			return searchPage(searcher, booleanQuery, docNameQuery, page);
		} catch (Exception e) {
			logger.error("queryByQueryParse failed for text=" + searchContent, e);
			return null;
		}
	}

	/**
	 * ORs a parsed "doc_name" query, a parsed "content" query and an exact
	 * "doc_type" term query built from {@code searchBean}.
	 * <p>
	 * NOTE(review): the doc_type clause is {@code SHOULD}, so it does not restrict
	 * results to that type; confirm that this is intended.
	 *
	 * @param searchBean carrier of doc_name, content and doc_type
	 * @param page       paging input; receives the total record count
	 * @return the page of results, or {@code null} if the search failed
	 */
	public List<DocumentInfo> queryByDocType(TypeSearchBean searchBean, Page<DocumentInfo> page) {
		try {
			IndexSearcher searcher = getIndexSearcher();
			QueryParser parser = new QueryParser(Version.LUCENE_43, "doc_name", new SmartChineseAnalyzer(Version.LUCENE_43));
			Query docNamequery = parser.parse(searchBean.getDoc_name() + "*");
			parser = new QueryParser(Version.LUCENE_43, "content", new StandardAnalyzer(Version.LUCENE_43));
			Query contentQuery = parser.parse(searchBean.getContent() + "*");
			Query typeQuery = new TermQuery(new Term("doc_type", searchBean.getDoc_type()));

			// The original called parser.setDefaultOperator(Operator.AND) at this point,
			// after both parses had already run, so it never influenced any query. The
			// dead call was removed; the parsed queries keep the parser's default operator.

			BooleanQuery booleanQuery = new BooleanQuery();
			booleanQuery.add(docNamequery, Occur.SHOULD);
			booleanQuery.add(contentQuery, Occur.SHOULD);
			booleanQuery.add(typeQuery, Occur.SHOULD);

			return searchPage(searcher, booleanQuery, booleanQuery, page);
		} catch (Exception e) {
			logger.error("queryByDocType failed", e);
			return null;
		}
	}

	/**
	 * Matches documents whose "add_datetime" AND "modify_datetime" terms both fall
	 * inside the inclusive string range {@code [beginDate, endDate]}.
	 *
	 * @param beginDate lower bound, compared as a string term
	 * @param endDate   upper bound, compared as a string term
	 * @param page      paging input; receives the total record count
	 * @return the page of results, or {@code null} if the search failed
	 */
	public List<DocumentInfo> queryByDocDate(String beginDate, String endDate, Page<DocumentInfo> page) {
		try {
			IndexSearcher searcher = getIndexSearcher();
			TermRangeQuery addDaterangeQuery = TermRangeQuery.newStringRange("add_datetime", beginDate, endDate, true, true);
			TermRangeQuery modifyDaterangeQuery = TermRangeQuery.newStringRange("modify_datetime", beginDate, endDate, true, true);
			//NumericRangeQuery<Long>  addDaterangeQuery = NumericRangeQuery.newLongRange("add_datetime", beginDate, endDate, true, true);
			BooleanQuery booleanQuery = new BooleanQuery();
			booleanQuery.add(addDaterangeQuery, Occur.MUST);
			booleanQuery.add(modifyDaterangeQuery, Occur.MUST);

			return searchPage(searcher, booleanQuery, booleanQuery, page);
		} catch (Exception e) {
			logger.error("queryByDocDate failed for range [" + beginDate + ", " + endDate + "]", e);
			return null;
		}
	}

	/**
	 * Advanced search: doc_name prefix AND exact doc_type AND fuzzy author
	 * (max 1 edit, 1-character exact prefix), plus an optional "added within"
	 * date restriction selected by {@code searchBean.getLimit_date()}.
	 *
	 * @param searchBean carrier of doc_name, doc_type, author and limit_date
	 * @param page       paging input; receives the total record count
	 * @return the page of results, or {@code null} if the search failed
	 */
	public List<DocumentInfo> queryByAccurate(AccurateSearchBean searchBean, Page<DocumentInfo> page) {
		try {
			IndexSearcher searcher = getIndexSearcher();
			Query docNameQuery = new WildcardQuery(new Term("doc_name", searchBean.getDoc_name() + "*"));
			//Query contentQuery = parser.parse(searchBean.getContent() + "*");
			Query docTypeQuery = new TermQuery(new Term("doc_type", searchBean.getDoc_type()));
			Query docAuthorQuery = new FuzzyQuery(new Term("author", searchBean.getAuthor()), 1, 1);
			TermRangeQuery dateRangeQuery = getTermRangeQueryByDate(searchBean.getLimit_date());

			BooleanQuery booleanQuery = new BooleanQuery();
			booleanQuery.add(docNameQuery, Occur.MUST);
			//booleanQuery.add(contentQuery, Occur.MUST);
			booleanQuery.add(docTypeQuery, Occur.MUST);
			booleanQuery.add(docAuthorQuery, Occur.MUST);
			if (dateRangeQuery != null) {
				booleanQuery.add(dateRangeQuery, Occur.MUST);
			}

			return searchPage(searcher, booleanQuery, booleanQuery, page);
		} catch (Exception e) {
			logger.error("queryByAccurate failed", e);
			return null;
		}
	}

	/**
	 * Shared paging routine: collects up to {@link #MAX_HITS} hits to compute the
	 * total, then fetches the requested page with {@code searchAfter}.
	 *
	 * @param searcher       searcher to run against; must not be {@code null}
	 * @param searchQuery    query that selects the documents
	 * @param highlightQuery query whose terms are highlighted in the results
	 * @param page           paging input; receives the total record count
	 * @return the hits of the requested page; empty when the page lies past the last hit
	 */
	private List<DocumentInfo> searchPage(IndexSearcher searcher, Query searchQuery, Query highlightQuery,
			Page<DocumentInfo> page) throws IOException, InvalidTokenOffsetsException {
		if (searcher == null) {
			throw new IllegalStateException("Index searcher is unavailable");
		}
		TopDocs docs = searcher.search(searchQuery, MAX_HITS);
		int collected = docs.scoreDocs.length;
		page.setTotalRecord(collected);

		int offset = (page.getCurrentPage() - 1) * page.getPageSize();
		if (offset > collected) {
			// Requested page starts beyond the collected hits; indexing scoreDocs[offset - 1]
			// would throw, so report an empty page instead.
			return new ArrayList<DocumentInfo>();
		}
		// searchAfter(null, ...) starts from the first hit; otherwise continue after the
		// last hit of the previous page.
		ScoreDoc after = (offset > 0) ? docs.scoreDocs[offset - 1] : null;
		TopDocs hits = searcher.searchAfter(after, searchQuery, page.getPageSize());

		// Assemble the results.
		return convert(hits, searcher, highlightQuery);
	}

	/**
	 * Loads the stored document of every hit and maps it to a {@link DocumentInfo},
	 * highlighting the "content" and "doc_name" values against {@code query}.
	 * <p>
	 * "doc_id" is parsed as an int and "add_datetime" / "modify_datetime" as epoch
	 * milliseconds; a missing or malformed value raises a {@link NumberFormatException}.
	 */
	private List<DocumentInfo> convert(TopDocs hits, IndexSearcher searcher, Query query) throws IOException, InvalidTokenOffsetsException {
		List<DocumentInfo> list = new ArrayList<DocumentInfo>();
		Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_43);
		try {
			// Highlight markup; the formatter and highlighter do not depend on the hit,
			// so they are built once instead of once per document.
			//SimpleHTMLFormatter simplehtml=new SimpleHTMLFormatter("<font color='red'>", "</font>");
			//SimpleHTMLFormatter simplehtml=new SimpleHTMLFormatter("<font class='highLight'>", "</font>");
			SimpleHTMLFormatter simplehtml = new SimpleHTMLFormatter("<span class='label label-warning'>", "</span>");
			Highlighter highlighter = new Highlighter(simplehtml, new QueryScorer(query));

			for (ScoreDoc doc : hits.scoreDocs) {
				Document d = searcher.doc(doc.doc);
				DocumentInfo info = new DocumentInfo();

				info.setContent(highlightOrOriginal(highlighter, analyzer, "content", d.get("content")));
				info.setDoc_name(highlightOrOriginal(highlighter, analyzer, "doc_name", d.get("doc_name")));

				info.setDoc_id(Integer.parseInt(d.get("doc_id")));
				info.setDoc_type(d.get("doc_type"));
				info.setDoc_location(d.get("doc_location"));
				info.setAdd_datetime(new Date(Long.parseLong(d.get("add_datetime"))));
				info.setModify_datetime(new Date(Long.parseLong(d.get("modify_datetime"))));
				info.setAuthor(d.get("author"));
				info.setUpload_author(d.get("upload_author"));
				list.add(info);
			}
		} finally {
			analyzer.close();
		}
		return list;
	}

	/**
	 * Returns the best highlighted fragment of {@code text}, or {@code text} itself
	 * when it is {@code null} or contains no highlightable term.
	 */
	private String highlightOrOriginal(Highlighter highlighter, Analyzer analyzer, String field, String text)
			throws IOException, InvalidTokenOffsetsException {
		if (text == null) {
			return null;
		}
		TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(text));
		String highlighted = highlighter.getBestFragment(tokenStream, text);
		return (highlighted != null) ? highlighted : text;
	}

	/**
	 * Builds the inclusive "add_datetime" range query for a look-back option.
	 * <p>
	 * Options: 2 = 1 day, 3 = 7 days, 4 = 210 days, 5 = 365 days back from now.
	 * The offset is computed in {@code long}; the former int expressions
	 * {@code 24*60*60*1000*210} and {@code 24*60*60*1000*365} overflowed and
	 * produced windows of roughly 11 and 17 days.
	 * <p>
	 * NOTE(review): the bounds are formatted with {@code DateTools} at DAY
	 * resolution, while {@code convert} reads the stored "add_datetime" value as
	 * epoch milliseconds - confirm the indexed term format matches these bounds.
	 *
	 * @param dateRange look-back option code
	 * @return the range query, or {@code null} for any other option (no date restriction)
	 */
	private TermRangeQuery getTermRangeQueryByDate(int dateRange) {
		final int days;
		switch (dateRange) {
		case 2:
			days = 1;
			break;
		case 3:
			days = 7;
			break;
		case 4:
			days = 210;
			break;
		case 5:
			days = 365;
			break;
		default:
			return null;
		}

		Date nowDate = new Date();
		Date oldDate = new Date(nowDate.getTime() - days * MILLIS_PER_DAY);
		return TermRangeQuery.newStringRange("add_datetime", DateTools.dateToString(oldDate, Resolution.DAY),
				DateTools.dateToString(nowDate, Resolution.DAY), true, true);
	}

	/** Closes {@code target} if it is non-null, logging instead of propagating an I/O failure. */
	private void closeQuietly(IndexReader target) {
		if (target == null) {
			return;
		}
		try {
			target.close();
		} catch (IOException e) {
			logger.error("Failed to close index reader", e);
		}
	}

	/**
	 * Closes the shared reader, if one was opened.
	 * <p>
	 * The static reference is cleared afterwards so the next
	 * {@link #getIndexSearcher()} call reopens the index instead of handing out a
	 * searcher over a closed reader. Calling this when nothing is open is a no-op.
	 */
	public void close() {
		if (reader == null) {
			return;
		}
		try {
			reader.close();
		} catch (IOException e) {
			logger.error("Failed to close shared index reader", e);
		} finally {
			reader = null;
		}
	}
}
 
分享到:
评论

相关推荐

    Lucene搜索引擎开发权威经典(附盘源码)【于天恩】.zip

    《Lucene搜索引擎开发权威经典》是由于天恩编著的一本深入探讨Lucene搜索引擎开发的专业书籍,这本书结合源码分析,旨在帮助读者掌握Lucene的核心技术和应用实践。Lucene是Apache软件基金会的一个开放源代码项目,它...

    Lucene搜索引擎 JSP + JAVA

    **Lucene搜索引擎 JSP + JAVA** Lucene是一个高性能、全文本搜索库,由Apache软件基金会开发,它提供了索引和搜索大量文本数据的能力。在这个项目中,Lucene被结合了JSP(JavaServer Pages)和JAVA技术,创建了一个...

    LUCENE搜索引擎基本工作原理

    **LUCENE搜索引擎基本工作原理** Lucene是一个开源的全文搜索引擎库,被广泛应用于构建复杂的搜索引擎系统。它的设计目标是高效、灵活且可扩展。理解Lucene的工作原理有助于开发人员更好地利用这一强大的工具。 **...

    lucene 搜索引擎 compass

    《Lucene搜索引擎与Compass框架详解》 在信息化飞速发展的今天,搜索引擎已经成为了我们获取信息的重要工具。Apache Lucene是一个高性能、全文本检索库,它提供了基础的索引和搜索功能,使得开发者能够轻松地在应用...

    Lucene搜索引擎开发权威经典随书资源1-6章

    本书基于Lucene的当前最新版本(2.1)精解了Lucene搜索引擎的相关知识,从基础知识到应用开发,精炼简洁,恰到好处。  本书包含了必要的理论,但以实践为主。所讲的理论都不是纸上谈兵,都是可以立即付诸实践进行...

    Lucene搜索-引擎开发权威经典pdf+源码第二部分

    《Lucene搜索-引擎开发权威经典》是一本深入解析Apache Lucene搜索引擎库的专业书籍,它为读者提供了构建高效全文搜索引擎的全面指南。Lucene是Java领域最著名的全文检索库,被广泛应用于各种信息检索系统中,包括...

    [Lucene搜索引擎开发权威经典].zip

    《Lucene搜索引擎开发权威经典》是一本深入探讨Apache Lucene的专著,作者于天恩在书中详尽地阐述了Lucene的核心概念、工作原理以及实际应用。这本书旨在帮助读者理解如何利用Lucene构建高性能、可扩展的全文搜索...

    基于lucene搜索引擎的java源码

    **基于Lucene搜索引擎的Java源码详解** Lucene是一个高性能、全文检索库,它由Apache软件基金会开发并维护。此Java源码包提供了一个全面的示例,展示了如何利用Lucene进行索引创建、更新(增量索引)以及搜索操作。...

    lucene搜索引擎项目

    《深入理解Lucene搜索引擎项目》 Lucene是一个高性能、全文本搜索库,它为开发者提供了在Java应用程序中实现全文检索的工具集。这个名为“lucene搜索引擎项目”的资源,旨在帮助用户更好地理解和应用Lucene来构建...

    lucene 搜索中文PDF文档

    **正文** ...总结,利用Lucene搜索中文PDF文档涉及多个技术层面,包括中文分词、PDF解析、索引构建、搜索执行和性能优化。通过理解这些关键技术,开发者可以构建出高效、准确的中文PDF文档检索系统。

    Lucene4.X实战类baidu搜索的大型文档海量搜索系统-10.Lucene搜索深入实战2 共11页.pptx

    【课程大纲】01.Lucene4入门精通实战课程-概述 共23页02.Lucene系统架构 共16页03.Lucene索引里有什么 共17页04.Lucene索引深入 共24页05.Lucene索引深入优化 共10页06.Lucene索引搜索 共13页07.Lucene搜索实战1 共4...

    Lucene4.X实战类baidu搜索的大型文档海量搜索系统-09.Lucene搜索深入实战1 共5页.pptx

    【课程大纲】01.Lucene4入门精通实战课程-概述 共23页02.Lucene系统架构 共16页03.Lucene索引里有什么 共17页04.Lucene索引深入 共24页05.Lucene索引深入优化 共10页06.Lucene索引搜索 共13页07.Lucene搜索实战1 共4...

    Lucene4.X实战类baidu搜索的大型文档海量搜索系统-13.Lucene搜索深入实战进阶3 共5页.pptx

    【课程大纲】01.Lucene4入门精通实战课程-概述 共23页02.Lucene系统架构 共16页03.Lucene索引里有什么 共17页04.Lucene索引深入 共24页05.Lucene索引深入优化 共10页06.Lucene索引搜索 共13页07.Lucene搜索实战1 共4...

    Lucene 搜索方法(多短语搜索)

    1. **DemoData.java** - 这个文件很可能是包含测试数据或者示例数据的类,用于演示Lucene搜索功能。它可能包含了创建索引所需的文档对象,以及用于搜索的关键词。 2. **MultiPhraseQueryDemo.java** - 这个文件是多...

    [Lucene搜索引擎开发权威经典].于天恩著.扫描版.7z.002

    [Lucene搜索引擎开发权威经典].于天恩著.扫描版.7z.001[Lucene搜索引擎开发权威经典].于天恩著.扫描版.7z.001

    Solr Elasticsearch lucene 搜索引擎

    Solr、Elasticsearch和Lucene是三个在搜索引擎领域中至关重要的技术,它们共同构建了现代数据检索的基础架构。下面将分别对这三个组件进行详细解释,并探讨它们之间的关系。 **Lucene** Lucene是一个高性能、全文本...

    Lucene搜索-引擎开发权威经典pdf+源码

    Lucene搜索-引擎开发权威经典pdf+源码第一部分共2个

    ssh+lucene搜索实例

    在这个"ssh+lucene搜索实例"中,我们可以理解为结合了SSH和Lucene两个技术,以实现远程服务器上的全文检索功能。例如,可能有一个需求是在多个远程服务器上存储大量数据,而这些数据需要通过关键词进行快速搜索。在...

    Lucene搜索引擎开发权威经典随书资源7-10

    本书基于Lucene的当前最新版本(2.1)精解了Lucene搜索引擎的相关知识,从基础知识到应用开发,精炼简洁,恰到好处。  本书包含了必要的理论,但以实践为主。所讲的理论都不是纸上谈兵,都是可以立即付诸实践进行...

    Lucene搜索引擎2

    【Lucene搜索引擎2】入门教程 Lucene是一个由Apache软件基金会Jakarta项目组开发的开源全文检索引擎工具包。它的核心作者是Doug Cutting,一位在全文索引和检索领域具有深厚经验的专家。Lucene并非一个完整的应用,...

Global site tag (gtag.js) - Google Analytics