/*
* MoreLikeThis.java
*
* Created on March 11, 2008, 3:31 PM
*
* To change this template, choose Tools | Template Manager
* and open the template in the editor.
*/
package Similarity;
import java.util.*;
import java.io.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.PriorityQueue;
/**
* Builds queries that find documents similar to a target document or to
* free text; a similarity query builder in the style of Lucene's contrib
* MoreLikeThis.
*
* @author Administrator
*/
public final class MoreLikeThis {
public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; // cap on tokens read from free-text input
public static final Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();
public static final int DEFAULT_MIN_TERM_FREQ = 2; // terms occurring fewer times in the source doc are ignored
public static final int DEFAULT_MIN_DOC_FREQ = 5; // terms appearing in fewer documents are ignored
public static final boolean DEFAULT_BOOST = false; // whether generated terms are boosted by score
public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"};
public static final int DEFAULT_MIN_WORD_LENGTH = 0; // 0 disables the minimum-length filter
public static final int DEFAULT_MAX_WORD_LENGTH = 0; // 0 disables the maximum-length filter
public static final Hashtable DEFAULT_STOP_WORDS = null;
private Hashtable stopWords = DEFAULT_STOP_WORDS;
public static final int DEFAULT_MAX_QUERY_TERMS = 25; // cap on terms in the generated query
private Analyzer analyzer = DEFAULT_ANALYZER;
private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
private boolean boost = DEFAULT_BOOST;
private String[] fieldNames = DEFAULT_FIELD_NAMES;
private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
private org.apache.lucene.search.Similarity similarity = new DefaultSimilarity();
private IndexReader ir;
/** Creates a new instance of MoreLikeThis */
public MoreLikeThis(IndexReader ir) {
this.ir = ir;
}
public Analyzer GetAnalyzer() {
return analyzer;
}
public void SetAnalyzer(Analyzer analyzer) {
this.analyzer = analyzer;
}
public int GetMinTermFreq() {
return minTermFreq;
}
public void SetMinTermFreq(int minTermFreq) {
this.minTermFreq = minTermFreq;
}
public int GetMinDocFreq() {
return minDocFreq;
}
public void SetMinDocFreq(int minDocFreq) {
this.minDocFreq = minDocFreq;
}
public boolean IsBoost() {
return boost;
}
public void SetBoost(boolean boost) {
this.boost = boost;
}
public String[] GetFieldNames() {
return fieldNames;
}
public void SetFieldNames(String[] fieldNames) {
this.fieldNames = fieldNames;
}
public int GetMinWordLen() {
return minWordLen;
}
public void SetMinWordLen(int minWordLen) {
this.minWordLen = minWordLen;
}
public int GetMaxWordLen() {
return maxWordLen;
}
public void SetMaxWordLen(int maxWordLen) {
this.maxWordLen = maxWordLen;
}
public void SetStopWords(Hashtable stopWords) {
this.stopWords = stopWords;
}
public Hashtable GetStopWords() {
return stopWords;
}
public int GetMaxQueryTerms() {
return maxQueryTerms;
}
public void SetMaxQueryTerms(int maxQueryTerms) {
this.maxQueryTerms = maxQueryTerms;
}
public int GetMaxNumTokensParsed() {
return maxNumTokensParsed;
}
public void SetMaxNumTokensParsed(int i) {
maxNumTokensParsed = i;
}
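/**
* Builds a "more like this" query from a document that is already in the
* index, identified by its Lucene document number.
*/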
public Query Like(int docNum)
{
if (fieldNames == null)
{
Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
Iterator e = fields.iterator();
fieldNames = new String[fields.size()];
int index = 0;
while (e.hasNext())
fieldNames[index++] = (String) e.next();
}
return CreateQuery(RetrieveTerms(docNum));
}
public Query Like(File f)
{
try
{
if (fieldNames == null)
{
Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
Iterator e = fields.iterator();
fieldNames = new String[fields.size()];
int index = 0;
while (e.hasNext())
fieldNames[index++] = (String) e.next();
}
return Like(new FileInputStream(f)); // open the File itself; f.getName() would resolve against the working directory
}
catch(IOException e)
{
System.out.println(e);
}
return null;
}
public Query Like(FileInputStream input)
{
return Like(new InputStreamReader(input));
}
public Query Like(Reader r)
{
return CreateQuery(RetrieveTerms(r));
}
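/**
* Converts the scored term queue into a BooleanQuery of SHOULD clauses.
* When boosting is enabled, each term is boosted by its score relative to
* the best-scoring term.
*/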
private Query CreateQuery(PriorityQueue q)
{
BooleanQuery query = new BooleanQuery();
Object cur;
int qterms = 0;
float bestScore = 0;
if(q!=null)
{
while (((cur = q.pop()) != null))
{
Object[] ar = (Object[]) cur;
TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
if (boost)
{
if (qterms == 0)
{
bestScore = ((Float)ar[2]).floatValue();
}
float myScore = ((Float)ar[2]).floatValue();
tq.setBoost(myScore / bestScore);
}
try
{
query.add(tq, BooleanClause.Occur.SHOULD);
}
catch (BooleanQuery.TooManyClauses ignore)
{
break;
}
qterms++;
if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
{
break;
}
}
}
return query;
}
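/**
* Scores each candidate term with tf*idf, skipping terms below the
* minimum term frequency or minimum document frequency, and returns the
* survivors in a priority queue ordered by score.
*/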
private PriorityQueue CreateQueue(Dictionary words)
{
try
{
int numDocs = ir.numDocs();
FreqQ res = new FreqQ(words.size());
Enumeration it = words.keys();
while (it.hasMoreElements())
{
String word = (String) it.nextElement();
int tf = ((Int32) words.get(word)).x; // frequency of the term in the source document
if (tf == 0)
{
tf = 1;
}
if (minTermFreq > 0 && tf < minTermFreq)
{
continue;
}
String topField = fieldNames[0];
int docFreq = 0;
for (int i = 0; i < fieldNames.length; i++)
{
int freq = ir.docFreq(new Term(fieldNames[i], word));
topField = (freq > docFreq) ? fieldNames[i] : topField;
docFreq = (freq > docFreq) ? freq : docFreq;
}
if (minDocFreq > 0 && docFreq < minDocFreq)
{
continue;
}
if (docFreq == 0)
{
continue;
}
float idf = similarity.idf(docFreq, numDocs);
float score = tf * idf;
res.insert(new Object[]{word, topField, (float) score, (float) idf, (long) docFreq, (long) tf});
}
return res; // return the full queue only after every term has been scored
}
catch(IOException e)
{
System.out.println(e);
}
return null;
}
public String DescribeParams()
{
StringBuilder sb = new StringBuilder();
sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
sb.append("\t" + "minWordLen : " + minWordLen + "\n");
sb.append("\t" + "maxWordLen : " + maxWordLen + "\n");
sb.append("\t" + "fieldNames : \"");
String delim = "";
for (int i = 0; i < fieldNames.length; i++)
{
String fieldName = fieldNames[i];
sb.append(delim).append(fieldName);
delim = ", ";
}
sb.append("\n");
sb.append("\t" + "boost : " + boost + "\n");
sb.append("\t" + "minTermFreq : " + minTermFreq + "\n");
sb.append("\t" + "minDocFreq : " + minDocFreq + "\n");
return sb.toString();
}
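/**
* Gathers term frequencies for an indexed document, reading its stored
* term-frequency vector when one exists and re-analyzing the stored field
* values otherwise.
*/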
private PriorityQueue RetrieveTerms(int docNum)
{
try
{
Dictionary termFreqMap = new Hashtable();
for (int i = 0; i < fieldNames.length; i++)
{
String fieldName = fieldNames[i];
TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);
if (vector == null)
{
Document d = ir.document(docNum);
String[] text = d.getValues(fieldName);
if (text != null)
{
for (int j = 0; j < text.length; j++)
{
AddTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName);
}
}
}
else
{
AddTermFrequencies(termFreqMap, vector);
}
}
return CreateQueue(termFreqMap);
}
catch(IOException e)
{
System.out.println(e);
}
return null;
}
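/** Accumulates term frequencies from a precomputed term-frequency vector. */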
private void AddTermFrequencies(Dictionary termFreqMap, TermFreqVector vector)
{
String[] terms = vector.getTerms();
int[] freqs = vector.getTermFrequencies();
for (int j = 0; j < terms.length; j++)
{
String term = terms[j];
if (IsNoiseWord(term))
{
continue;
}
Int32 cnt = (Int32) termFreqMap.get(term);
if (cnt == null)
{
cnt = new Int32();
termFreqMap.put(term,cnt);
cnt.x = freqs[j];
}
else
{
cnt.x += freqs[j];
}
}
}
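/** Tokenizes text with the configured analyzer, counting term frequencies for up to maxNumTokensParsed tokens. */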
private void AddTermFrequencies(Reader r, Dictionary termFreqMap, String fieldName)
{
try{
TokenStream ts = analyzer.tokenStream(fieldName,r);
org.apache.lucene.analysis.Token token;
int tokenCount = 0;
while ((token = ts.next()) != null)
{
String word = token.termText();
tokenCount++;
if (tokenCount > maxNumTokensParsed)
{
break;
}
if (IsNoiseWord(word))
{
continue;
}
Int32 cnt = (Int32) termFreqMap.get(word);
if (cnt == null)
{
termFreqMap.put(word,new Int32());
}
else
{
cnt.x++;
}
}
}
catch(IOException e)
{
System.out.println(e);
}
}
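/** A term is noise if it is shorter than minWordLen, longer than maxWordLen, or in the stop-word table. */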
private boolean IsNoiseWord(String term)
{
int len = term.length();
if (minWordLen > 0 && len < minWordLen)
{
return true;
}
if (maxWordLen > 0 && len > maxWordLen)
{
return true;
}
if (stopWords != null && stopWords.containsKey(term)) // containsKey: Hashtable.contains() would test values, not keys
{
return true;
}
return false;
}
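/** Gathers term frequencies from free text across each configured field. */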
public PriorityQueue RetrieveTerms(Reader r)
{
Dictionary words = new Hashtable();
for (int i = 0; i < fieldNames.length; i++)
{
String fieldName = fieldNames[i];
AddTermFrequencies(r, words, fieldName);
}
return CreateQueue(words);
}
public String[] RetrieveInterestingTerms(StringReader r)
{
ArrayList al = new ArrayList(maxQueryTerms);
PriorityQueue pq = RetrieveTerms(r);
if (pq == null)
{
return new String[0]; // RetrieveTerms returns null if an IOException occurred
}
Object cur;
int lim = maxQueryTerms;
while (((cur = pq.pop()) != null) && lim-- > 0)
{
Object[] ar = (Object[]) cur;
al.add(ar[0]);
}
// toArray() with no argument returns Object[], which cannot be cast to String[]
return (String[]) al.toArray(new String[al.size()]);
}
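/** Priority queue over {word, field, score, idf, docFreq, tf} arrays, ordered by score. */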
private class FreqQ extends PriorityQueue
{
private FreqQ(int s)
{
super.initialize(s);
}
protected boolean lessThan(Object a, Object b)
{
Object[] aa = (Object[]) a;
Object[] bb = (Object[]) b;
float fa = ((Float) aa[2]).floatValue();
float fb = ((Float) bb[2]).floatValue();
return fa > fb; // inverted comparison so that pop() yields the highest-scoring entry first
}
}
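/** Mutable int wrapper used as a term-frequency counter. */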
private class Int32
{
int x;
Int32()
{
x = 1;
}
}
}
Test code:
String indexName = "e:\\index\\indexForML"; // path to the index
String fn = "c:\\0.txt"; // path to the test file
IndexReader r = IndexReader.open(indexName);
MoreLikeThis mlt = new MoreLikeThis(r); // the IndexReader provides the index to query against
mlt.SetAnalyzer(new StandardAnalyzer()); // analyzer used to tokenize the input
mlt.SetFieldNames(new String[]{"content"}); // fields to compare on
Query query = mlt.Like(new FileReader(fn)); // build the query; any Reader subclass works as input
IndexSearcher searcher = new IndexSearcher(indexName);
Hits hits = searcher.search(query); // retrieve similar documents
int len = hits.length();
for (int i = 0; i < Math.min(25, len); i++) // print at most the top 25 hits
{
Document d = hits.doc(i);
System.out.println("score : " + hits.score(i));
System.out.println("filename : " + d.get("fullname")); // stored field "fullname"
System.out.println("type : " + d.get("ttype")); // stored field "ttype"
System.out.println("___________________________");
}
r.close(); // close the index reader
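The test above never installs a stop-word table, so IsNoiseWord only applies the word-length filters. A minimal sketch of adding one before building the query (the word list below is illustrative, not part of the original code):

Hashtable stopWords = new Hashtable();
String[] common = {"the", "a", "an", "and", "of"}; // illustrative stop words
for (int i = 0; i < common.length; i++)
{
stopWords.put(common[i], common[i]); // IsNoiseWord looks terms up by key
}
mlt.SetStopWords(stopWords); // call before mlt.Like(...) so the filter takes effect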