lucene学习笔记

fushengfei

浏览: 211304 次
性别:
来自: 北京

最近访客更多访客>>

Sobfist

kidlovec

413899327

gaoshaoye

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

搜索引擎

lucene Apache

1、工具类

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;

public class File2DocumentUtils {
                //文件转化为Document对象
	public static Document file2Document(String filePath) {
	     File file=new File(filePath);
	     Document doc=new Document();
	     doc.add(new Field("name",file.getName(),Store.YES,Index.ANALYZED));
	     doc.add(new Field("content",readFileContent(file),Store.YES,Index.ANALYZED));
	     doc.add(new Field("size",NumberTools.longToString(file.length()),Store.YES,Index.ANALYZED));
	     doc.add(new Field("path",file.getAbsolutePath(),Store.YES,Index.ANALYZED));
	     System.out.println(doc);
	     return doc;
	}

	private static String readFileContent(File file){
       try {
		BufferedReader bufer=new BufferedReader(new InputStreamReader(new FileInputStream(file))); 
		   StringBuffer buf=new StringBuffer();
		   String str="";
		
		 while( (str= bufer.readLine())!=null)
		 {
			 buf.append(str).append("\n");
		 }
	     return buf.toString();
	} catch (Exception e) {
		throw new RuntimeException(e);
	} 
		
	}
               //打印document
	public static void printDocumentInfo(Document doc) {
		System.out.println("------------------------------");
		System.out.println("name     = " + doc.get("name"));
		System.out.println("content  = " + doc.get("content"));
		System.out.println("size     = " + NumberTools.stringToLong(doc.get("size")));
		System.out.println("path     = " + doc.get("path"));		
	}

}

2、索引的正删改和搜索

public class IndexDao {
	String indexPath = "索引文件存放路径";

	// Analyzer analyzer = new StandardAnalyzer();
	Analyzer analyzer = new MMAnalyzer();// 词库分词

	/**
	 * 添加/创建索引
	 * 
	 * @param doc
	 */
	public void save(Document doc) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(indexPath, analyzer, MaxFieldLength.LIMITED);
			indexWriter.addDocument(doc);// Adds a document to this index.
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			try {
				indexWriter.close();
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Term是搜索的最小单位，代表某个 Field 中的一个关键词，如：<title, lucene>
	 * 
	 * new Term( "title", "lucene" );
	 * 
	 * new Term( "id", "5" );
	 * 
	 * new Term( "id", UUID );
	 * 
	 * @param term
	 */
	public void delete(Term term) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(indexPath, analyzer, MaxFieldLength.LIMITED);
			indexWriter.deleteDocuments(term);//Deletes the document(s) containing term.
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			try {
				indexWriter.close();
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * 更新索引
	 * 
	 * <pre>
	 * indexWriter.deleteDocuments(term);
	 * indexWriter.addDocument(doc);
	 * </pre>
	 * 
	 * @param term
	 * @param doc
	 */
	public void update(Term term, Document doc) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(indexPath, analyzer, MaxFieldLength.LIMITED);
			indexWriter.updateDocument(term, doc);//Updates a document by first deleting the document(s) containing term and then adding the new document.
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			try {
				indexWriter.close();
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * <pre>
	 * totalPage = recordCount / pageSize;
	 * if (recordCount % pageSize &gt; 0)
	 * 	totalPage++;
	 * </pre>
	 * 
	 * @param queryString
	 * @param firstResult
	 * @param maxResults
	 * @return
	 */
	public QueryResult search(String queryString, int firstResult, int maxResults) {
		try {
			// 1，把要搜索的文本解析为 Query
			String[] fields = { "name", "content" };
			Map<String, Float> boosts = new HashMap<String, Float>();
			//boosts.put("name", 3f);
			// boosts.put("content", 1.0f); 默认为1.0f

			QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer, boosts);
			Query query = queryParser.parse(queryString);

			return search(query, firstResult, maxResults);
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	public QueryResult search(Query query, int firstResult, int maxResults) {
		IndexSearcher indexSearcher = null;

		try {
			// 2，进行查询
			indexSearcher = new IndexSearcher(indexPath);
			Filter filter = new RangeFilter("size", NumberTools.longToString(20)
					, NumberTools.longToString(1000), true, true);//效率很低，不建议使用

			// ========== 排序
			Sort sort = new Sort();
			sort.setSort(new SortField("size")); // 默认为升序
			// sort.setSort(new SortField("size", true));
			// ==========

			TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort);
			//Finds the top n hits for query, applying filter if non-null, and sorting the hits by the criteria in sort. 
			int recordCount = topDocs.totalHits;
			List<Document> recordList = new ArrayList<Document>();

			// ============== 准备高亮器
			Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
			Scorer scorer = new QueryScorer(query);
			Highlighter highlighter = new Highlighter(formatter, scorer);

			Fragmenter fragmenter = new SimpleFragmenter(50);
			highlighter.setTextFragmenter(fragmenter);
			// ==============

			// 3，取出当前页的数据
			int end = Math.min(firstResult + maxResults, topDocs.totalHits);
			for (int i = firstResult; i < end; i++) {
				ScoreDoc scoreDoc = topDocs.scoreDocs[i];

				int docSn = scoreDoc.doc; // 文档内部编号
				Document doc = indexSearcher.doc(docSn); // 根据编号取出相应的文档

				// =========== 高亮
				// 返回高亮后的结果，如果当前属性值中没有出现关键字，会返回 null
				String hc = highlighter.getBestFragment(analyzer, "content", doc.get("content"));
				if (hc == null) {
					String content = doc.get("content");
					int endIndex = Math.min(50, content.length());
					hc = content.substring(0, endIndex);// 最多前50个字符
				}
				doc.getField("content").setValue(hc);
				// ===========

				recordList.add(doc);
			}

			// 返回结果
			return new QueryResult(recordCount, recordList);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			try {
				indexSearcher.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
}

分享到：

浅谈正则表达式 | lucene的简单使用

2010-11-10 15:25
浏览 941
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene学习笔记

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene学习笔记

评论

发表评论

相关推荐

nutch中文分词(通过插件的方式)

nutch中文分词（修改源码的方式）

lucene的简单使用

最近访客更多访客>>