lucene全文检索入门实例

ln_ydc

浏览: 273971 次
性别:
来自: 青岛

最近访客更多访客>>

jxjxtang

czldl

pcdlrzxx

bruce_ma

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

其它

基本概念：

前期准备：

lucene-2.4.0

junit-4.9

实例代码：

package com.ln.ydc.lucene.test;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.junit.Test;

import com.ln.ydc.lucene.util.LuceneUtil;

/**
 * Introductory Lucene example, written as three JUnit tests:
 * <ol>
 * <li>build the index from a single data source file;</li>
 * <li>build the index from every file in a data source directory;</li>
 * <li>search the index for a keyword.</li>
 * </ol>
 * 
 * @author ydc
 * 
 */
public class HelloWorld {
	// Upper bound on the number of hits fetched by search()
	private static final int MAX_RESULTS = 10000;

	// Path of the single data source file
	String filePath = "D:\\logs\\lucene\\datasource\\eng_article.txt";
	// Directory that holds the index
	String indexPath = "D:\\logs\\lucene\\luceneIndex";
	// Directory that holds the data source files
	String dirPath = "D:\\logs\\lucene\\datasource";

	// Analyzer shared by the index writers and the query parser below.
	// The original author's note: swap in MMAnalyzer for Chinese word segmentation.
	Analyzer analyzer = new StandardAnalyzer();

	/**
	 * Builds the index from the single file at {@link #filePath}.
	 * 
	 * IndexWriter is the object used to modify the index (add, delete, update).
	 * 
	 * @throws Exception
	 *             if the file cannot be converted or the index cannot be written
	 */
	@Test
	public void createIndexByFile() throws Exception {
		// file --> doc
		Document doc = LuceneUtil.file2Document(filePath);

		IndexWriter indexWriter = newIndexWriter();
		try {
			indexWriter.addDocument(doc);
			// optimize() optimizes the index
			indexWriter.optimize();
		} finally {
			// Close the writer even when indexing fails.
			indexWriter.close();
		}
	}

	/**
	 * Builds the index from every regular file directly inside {@link #dirPath}.
	 * Sub-directories are skipped because they cannot be read as documents.
	 * 
	 * @throws Exception
	 *             if the directory cannot be listed, a file cannot be converted,
	 *             or the index cannot be written
	 */
	@Test
	public void createIndexByDir() throws Exception {
		File filesDir = new File(dirPath);
		// listFiles() returns null when the path is not a directory or cannot be read.
		File[] sourceFiles = filesDir.listFiles();
		if (sourceFiles == null) {
			throw new IOException("Cannot list data source directory: " + dirPath);
		}

		IndexWriter indexWriter = newIndexWriter();
		try {
			for (File sourceFile : sourceFiles) {
				if (!sourceFile.isFile()) {
					continue;
				}
				Document doc = LuceneUtil.file2Document(sourceFile.getPath());
				indexWriter.addDocument(doc);
			}
			// optimize() optimizes the index
			indexWriter.optimize();
		} finally {
			// Close the writer even when indexing fails.
			indexWriter.close();
		}
	}

	/**
	 * Searches the "name" and "content" fields of the index for a fixed keyword
	 * and prints every matching document.
	 * 
	 * @throws ParseException
	 *             if the query string cannot be parsed
	 * @throws IOException
	 *             if the index cannot be opened or read
	 */
	@Test
	public void search() throws ParseException, IOException {
		String queryString = "村上春树";

		// 1. Parse the search text into a Query
		String[] fields = { "name", "content" };
		QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer);
		Query query = queryParser.parse(queryString);

		// 2. Run the query
		IndexSearcher indexSearcher = new IndexSearcher(indexPath);
		try {
			Filter filter = null; // no filtering
			TopDocs topDocs = indexSearcher.search(query, filter, MAX_RESULTS);

			// 3. Print the results
			System.out.println("总共有【" + topDocs.totalHits + "】条匹配结果");
			for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
				int docSn = scoreDoc.doc; // internal document number
				Document doc = indexSearcher.doc(docSn); // load the document by its number
				LuceneUtil.printDocumentInfo(doc); // print the document's fields
			}
		} finally {
			// Close the searcher once the stored documents have been read.
			indexSearcher.close();
		}
	}

	/**
	 * Opens a writer on {@link #indexPath} with the create flag set to true, as
	 * both index-building tests did originally.
	 */
	private IndexWriter newIndexWriter() throws IOException {
		return new IndexWriter(indexPath, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
	}
}

package com.ln.ydc.lucene.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;

public class LuceneUtil {

	/**
	 * 将一个具体的文件转换成document 
	 * 保存文件的如下信息：
	 * name 	文件名 
	 * content 	文件内容
	 * size 	文件大小 
	 * path		文件路径
	 * 
	 * @param filePath
	 * @return
	 */
	public static Document file2Document(String filePath) {
		File file = new File(filePath);
		Document doc = new Document();
		doc.add(new Field("name", file.getName().trim(), Field.Store.YES, Field.Index.ANALYZED));
		doc.add(new Field("content", readFileContent(file), Field.Store.YES, Field.Index.ANALYZED));
		doc.add(new Field("size", NumberTools.longToString(file.length()), Field.Store.YES, Field.Index.ANALYZED));
		doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.NO));
		return doc;
	}

	/**
	 * 读取文件内容
	 * 
	 * @param file
	 * @return
	 */
	public static String readFileContent(File file) {
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
			StringBuffer content = new StringBuffer();
			for (String line = null; (line = reader.readLine()) != null;) {
				if(line!=null && !"".equals(line=line.trim()))
					content.append(line).append("\n");
			}
			return content.toString();
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * <pre>
	 * 获取 name 属性的值的两种方法：
	 * 1.Field field = doc.getField(&quot;name&quot;);
	 *  field.stringValue();
	 *  该方法已过时
	 * 2.doc.get(&quot;name&quot;);
	 * </pre>
	 * 
	 * @param doc
	 */
	public static void printDocumentInfo(Document doc) {
		System.out.println("--------------------------------------------");
		System.out.println("【name】 \t " + doc.get("name"));
		System.out.println("【content】 \t " + doc.get("content"));
		System.out.println("【size】 \t " + doc.get("size"));
		System.out.println("【path】 \t " + doc.get("path"));
	}
	/*
	public static void printDocumentInfo(QueryResult<Document> qr) {
		System.out.println("总共有【" + qr.getRecordCount() + "】条匹配结果");
		for(Document doc : qr.getRecordList()) {
			printDocumentInfo(doc);
		}
	}
*/
 }

查看图片附件

分享到：

myeclipse插件之FindBugs(静态分析工具) | myeclipse自定义java注释

2012-02-29 23:00
浏览 3037
评论(0)
分类:开源软件
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene全文检索入门实例

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene全文检索入门实例

评论

发表评论

相关推荐

批量删除订阅kindle、kindle touch 推送

svn应用笔记

dos批处理-删除eclipse配置

《学得少却考得好》(Learn More Study Less)笔记

myeclipse插件之FindBugs(静态分析工具)

制作 Google Chrome 绿色版

myeclipse自定义java注释

keytool 用法总结

配置Tomcat使用https协议(配置SSL协议)

JNDI对LDAP的基本操作

windows下搭建并配置OpenLDAP服务器

程序员应该知道的100个vim命令

阅读计算机图书的一些心得

学习vim的几篇文章

oracle权限表

如何学好oracle课程

oracle服务启动、关闭脚本(windows下)

计算机术语：日语---汉语---英语

最近访客更多访客>>