`
gcgmh
  • 浏览: 359221 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

Lucene TermVector用法:相关搜索功能及提高高亮显示性能

阅读更多
/**
 * Demonstrates two uses of Lucene term vectors against a small in-memory index:
 * building a "more like this" query from the terms of a matched document, and
 * feeding the highlighter from a stored term vector instead of re-analyzing the text.
 *
 * <p>The "subject" field is indexed with {@code TermVector.WITH_POSITIONS_OFFSETS},
 * which is what both demos read back through {@code IndexReader.getTermFreqVector}.
 */
public class TermVectorTest {

	Analyzer analyzer = new SimpleAnalyzer();
	Directory ramDir = new RAMDirectory();

	/**
	 * Writes three sample documents into {@link #ramDir} and optimizes the index.
	 *
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws LockObtainFailedException if the write lock cannot be obtained
	 * @throws IOException on any other low-level I/O failure
	 */
	public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
		IndexWriter writer = new IndexWriter(ramDir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
		try {
			writer.addDocument(newDocument("java", "callan",
					"java一门编程语言,用java的人很多,编程语言也不少,但是java最流行"));
			writer.addDocument(newDocument("english", "wcq", "英语用的人很多"));
			writer.addDocument(newDocument("asp", "ca", "英语用的人很多"));

			writer.optimize();
		} finally {
			// Close even when adding/optimizing fails so the writer is not left open.
			writer.close();
		}
	}

	/**
	 * Builds one sample document. Only "subject" stores a term vector
	 * (with positions and offsets); "title" and "author" are plain analyzed fields.
	 */
	private static Document newDocument(String title, String author, String subject) {
		Document doc = new Document();
		doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
		doc.add(new Field("author", author, Store.YES, Index.ANALYZED));
		doc.add(new Field("subject", subject, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
		return doc;
	}

	/**
	 * Looks up documents whose "title" contains the term "java", prints their
	 * title and subject, and runs a related-documents search for each hit.
	 *
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws IOException on a low-level I/O failure
	 */
	public void search() throws CorruptIndexException, IOException {
		IndexReader reader = IndexReader.open(ramDir);
		try {
			IndexSearcher searcher = new IndexSearcher(reader);
			try {
				// Query the "title" field for the term "java".
				TermQuery query = new TermQuery(new Term("title", "java"));
				Hits hits = searcher.search(query);
				for (int i = 0; i < hits.length(); i++) {
					Document doc = hits.doc(i);
					System.out.println(doc.get("title"));
					System.out.println(doc.get("subject"));
					System.out.println("moreLike search: ");

					morelikeSearch(reader, hits.id(i));
				}
			} finally {
				searcher.close();
			}
		} finally {
			// A searcher built from a reader does not own it, so close the reader explicitly.
			reader.close();
		}
	}

	/**
	 * Builds an OR query from every term in the "subject" term vector of the
	 * given document and runs it against the same index.
	 *
	 * @param reader open reader on the index; it is not closed by this method
	 * @param id     internal Lucene document id whose terms seed the query
	 * @throws IOException on a low-level I/O failure
	 */
	private void morelikeSearch(IndexReader reader, int id) throws IOException {
		// The term vector holds the analyzed terms of the field together with
		// their frequencies (and, as indexed here, positions and offsets).
		TermFreqVector vector = reader.getTermFreqVector(id, "subject");
		if (vector == null) {
			// No term vector stored for this document/field: nothing to build a query from.
			return;
		}

		// Fetch the term array once instead of on every loop iteration.
		String[] terms = vector.getTerms();
		BooleanQuery query = new BooleanQuery();
		for (int i = 0; i < terms.length; i++) {
			query.add(new TermQuery(new Term("subject", terms[i])), BooleanClause.Occur.SHOULD);
		}

		// Reuse the caller's reader rather than opening (and leaking) a second one.
		IndexSearcher searcher = new IndexSearcher(reader);
		try {
			Hits hits = searcher.search(query);

			// Display code omitted.
		} finally {
			searcher.close();
		}
	}

	/**
	 * Uses the stored term vector to speed up highlighting: searches "subject"
	 * for "java" and prints each hit's title and its best highlighted fragment.
	 * The token stream is rebuilt from the term vector, so the stored text does
	 * not have to be analyzed again.
	 *
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws IOException on a low-level I/O failure
	 */
	public void highterLightSearch() throws CorruptIndexException, IOException {
		IndexReader reader = IndexReader.open(ramDir);
		try {
			IndexSearcher searcher = new IndexSearcher(reader);
			try {
				TermQuery query = new TermQuery(new Term("subject", "java"));
				Hits hits = searcher.search(query);

				// Highlighter setup: matched terms are wrapped in a red <font> tag.
				SimpleHTMLFormatter simpleHTMLFormatter =
						new SimpleHTMLFormatter("<font color='red'>", "</font>");
				Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

				// 100 is the fragment size passed to the fragmenter, i.e. how much
				// context is returned around a match instead of the whole text.
				highlighter.setTextFragmenter(new SimpleFragmenter(100));

				for (int i = 0; i < hits.length(); i++) {
					Document doc = hits.doc(i);

					// The cast relies on "subject" having been indexed with positions/offsets.
					TermPositionVector positionVector =
							(TermPositionVector) reader.getTermFreqVector(hits.id(i), "subject");
					TokenStream tokenStream = TokenSources.getTokenStream(positionVector);

					String result = highlighter.getBestFragment(tokenStream, doc.get("subject"));

					System.out.println(doc.get("title"));
					System.out.println(result);
				}
			} finally {
				searcher.close();
			}
		} finally {
			reader.close();
		}
	}

	/**
	 * Builds the in-memory index and runs the related-documents demo.
	 */
	public static void main(String[] args) throws CorruptIndexException, IOException {
		TermVectorTest t = new TermVectorTest();
		t.createRamIndex();
		t.search();
	}

}
分享到:
评论

相关推荐

    lucene 高亮显示. java

    对于深入理解和掌握Lucene的高亮显示功能及中文分词性能优化,建议参考官方文档和相关技术博客,同时进行实际的编码实践,以便更好地理解和运用这一知识点。此外,关注Lucene社区的最新动态,可以获取更多关于性能...

    lucene.NET 中文分词

    - **缓存策略**:合理使用Lucene.NET的缓存机制,如TermVector缓存,可以提升查询性能。 总的来说,Lucene.NET在中文分词和高亮显示方面的应用需要结合合适的分词器,并进行适当的配置和优化。开发者可以根据实际...

    lucene高亮显示

    Apache Lucene是一个强大的全文搜索引擎库,它提供了多种功能,包括高亮显示搜索结果。高亮显示有助于提高用户体验,使用户能够一目了然地看到哪些词在文档中匹配了查询。 **1. Lucene高亮器概述** Lucene提供了一...

    lucene站内搜索引擎

    - **优化索引**:索引优化(merge索引)能合并多个段,减少段的数量,提高搜索性能。 3. **查询过程** - **解析查询**:使用QueryParser将用户的查询字符串转化为Query对象,可以指定查询语法和默认字段。 - **...

    apache-solr-ref-guide-7.1.pdf

    最后,“The Terms Component”、“The TermVector Component”、“The Stats Component”、“The Query Elevation Component”等章节,介绍了Solr的高级搜索组件,这些组件增强了Solr的搜索能力,能够处理更复杂、...

Global site tag (gtag.js) - Google Analytics