Lucene入门示例

liuzidong

浏览: 2299812 次
性别:
来自: 成都

最近访客更多访客>>

joechl

njdccy

mushroom12

musicxujun

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Lucene

apache java lucene 全文检索 junit

主要参考了Lucene的官方示例

环境：Win7 + JDK1.6 + Eclipse37
Lucene版本：3.5
官方：http://www.apache.org/dyn/closer.cgi
检索的基本概念
一 信息检索：从信息集合中查找出与用户相关的信息。
1 信息检索的分类
全文检索:把用户的查询请求和全文中的每一个词进行比较，不考虑查询请求与文本语义的匹配。
数据检索:查询要求和信息系统中的数据都有一定的结构，语义匹配能力差.
知识检索:强调基于知识语义上的匹配
说明以下介绍来自于百科名片,http://baike.baidu.com/view/371811.htm
二 Lucene介绍
Lucene是apache软件基金会jakarta项目组的一个子项目,是一个开放源代码的全文检索引擎工具包，即它不是一个完整的全文检索引擎，而是一个全文检索引擎的架构，提供了完整的查询引擎和索引引擎.Lucene的原作者是Doug Cutting，他是一位资深全文索引/检索专家.
优点如下：
1 索引文件格式独立于应用平台。Lucene定义了一套以8位字节为基础的索引文件格式，使得兼容系统或者不同平台的应用能够共享建立的索引文件。
2 在传统全文检索引擎的倒排索引的基础上，实现了分块索引，能够针对新的文件建立小文件索引，提升索引速度。
3 设计了独立于语言和文件格式的文本分析接口，索引器通过接受Token流完成索引文件的创立，用户扩展新的语言和文件格式，只需要实现文本分析的接口。
4 Lucene的查询实现中默认实现了布尔操作、模糊查询（Fuzzy Search）、分组查询等.
三工程图片如下,所用jar文件包含：lucene-core-3.5.0.jar，lucene-analyzers-3.5.0.jar.

四想要搜索任何内容，必须先收集数据，建立索引库,之后才能进行搜索。
具体实现类如下：

package net.liuzd.lucene.test;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.junit.Test;

/**
 * JUnit-driven walkthrough of the two basic Lucene operations: building an
 * index from a single source file and running a query against that index.
 *
 * <p>Run {@link #createIndex()} first; {@link #search()} reads the index it
 * produces. All Lucene plumbing (paths, analyzer, version) comes from
 * {@code LuceneUtils}.
 */
public class IndeSearchFiles {

	/**
	 * Builds the index: wraps the source file in one {@link Document} with the
	 * fields {@code name}, {@code path}, {@code modified} and {@code contents},
	 * then writes it through an {@link IndexWriter} opened in
	 * {@link OpenMode#CREATE} mode.
	 *
	 * @throws Exception if the writer cannot be opened, the source file cannot
	 *                   be read, or the document cannot be written
	 */
	@Test
	public void createIndex() throws Exception{

		// The writer handles add / delete / update operations on the index.
		IndexWriter writer = LuceneUtils.createIndexWriter(OpenMode.CREATE);
		try {
			// Location of the data to be indexed.
			File sourceFile = LuceneUtils.createSourceFile();
			System.out.println("文件路径：" + sourceFile.getAbsolutePath());

			Document doc = new Document();
			doc.add(new Field("name", sourceFile.getName(), Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));

			// File path. It must be indexed as a single un-analyzed term:
			// updateDocument(new Term("path", ...)) below deletes by this term,
			// and with Field.Index.NO the term would never exist in the index,
			// so every update would silently add a duplicate document.
			Field pathField = new Field("path", sourceFile.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
			pathField.setIndexOptions(org.apache.lucene.index.FieldInfo.IndexOptions.DOCS_ONLY);
			doc.add(pathField);

			// Last-modified timestamp; stored for display only, not searchable.
			doc.add(new Field("modified", String.valueOf(sourceFile.lastModified()), Field.Store.YES, Field.Index.NO));

			// File contents: analyzed for full-text search and stored so that
			// search() can print them back.
			String content = LuceneUtils.readFileContext(sourceFile);
			System.out.println("content: " + content);
			doc.add(new Field("contents", content, Field.Store.YES, Field.Index.ANALYZED));

			if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE)
			{
				// Fresh index: nothing to replace, just add.
				writer.addDocument(doc);
			}
			else
			{
				// Existing index: replace any older document for the same path.
				writer.updateDocument(new Term("path", sourceFile.getPath()), doc);
			}
		} finally {
			// Always release the writer, even when building the document failed.
			writer.close();
		}
	}

	/**
	 * Searches the {@code contents} field for the term {@code "Lucene"} and
	 * prints the stored fields of every hit (at most 100).
	 *
	 * @throws Exception if the index cannot be opened, the query cannot be
	 *                   parsed, or the search fails
	 */
	@Test
	public void search() throws Exception{

		// Query text; a term that does not occur in the index (e.g. 中国) yields no hits.
		String queryString = "Lucene";
		// Fields the query is run against.
		String [] queryFileds = {"contents"};
		IndexSearcher searcher = LuceneUtils.createIndexSearcher();
		try {
			Query query = LuceneUtils.createQuery(queryFileds, queryString);
			// No additional filtering of the result set.
			Filter filter = null;
			// Maximum number of hits fetched in one search call.
			int queryCount = 100;
			TopDocs results = searcher.search(query,filter,queryCount);
			System.out.println("总符合: " + results.totalHits + "条数！");

			// Print each hit.
			for(ScoreDoc sr : results.scoreDocs){
				// Internal document number of the hit.
				int docID = sr.doc;
				// Load the stored fields of that document.
				Document doc = searcher.doc(docID);
				System.out.println("name = " + doc.get("name"));
				System.out.println("path = " + doc.get("path"));
				System.out.println("modified = " + doc.get("modified"));
				System.out.println("contents = " + doc.get("contents"));
			}
		} finally {
			// The searcher was built on top of an explicitly opened reader, so
			// the reader has to be closed separately from the searcher.
			try {
				searcher.close();
			} finally {
				searcher.getIndexReader().close();
			}
		}
	}
}

工具类代码如下：

package net.liuzd.lucene.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Static helpers shared by the Lucene examples: fixed locations for the index
 * and the source file, plus factories for the analyzer, index writer, index
 * searcher and query.
 */
public class LuceneUtils {

	/** Current working directory of the JVM ({@code user.dir}). */
	public static final String USERDIR = System.getProperty("user.dir");
	/** Directory in which the index is stored. */
	private static final String INDEXPATH = USERDIR + File.separator + "index";
	/** File whose content is indexed. */
	private static final String INDEXSOURCE = USERDIR + File.separator
			+ "source" + File.separator + "lucene.txt";
	/** Lucene compatibility version used for the analyzer, writer and parser. */
	public static final Version version = Version.LUCENE_35;

	/**
	 * Creates the analyzer used for both indexing and query parsing.
	 *
	 * @return a new {@link StandardAnalyzer} configured with {@link #version}
	 */
	public static Analyzer getAnalyzer(){
		return new StandardAnalyzer(version);
	}

	/**
	 * Opens an {@link IndexWriter} on the index directory.
	 *
	 * @param openMode how the index is opened (create, append, ...)
	 * @return a new writer; the caller is responsible for closing it
	 * @throws Exception if the directory or the writer cannot be opened
	 */
	public static IndexWriter createIndexWriter(OpenMode openMode)
			throws Exception {
		// Where the index lives on disk.
		Directory dir = FSDirectory.open(new File(INDEXPATH));
		// Writer configuration: Lucene version, analyzer and open mode.
		IndexWriterConfig iwc = new IndexWriterConfig(version,
				getAnalyzer());
		iwc.setOpenMode(openMode);
		return new IndexWriter(dir, iwc);
	}

	/**
	 * Opens an {@link IndexSearcher} over the index directory.
	 *
	 * <p>The searcher wraps an {@link IndexReader} opened here; callers should
	 * close both the searcher and {@code searcher.getIndexReader()} when done.
	 *
	 * @return a new searcher over the current state of the index
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws IOException if the index cannot be read
	 */
	public static IndexSearcher createIndexSearcher() throws CorruptIndexException, IOException {
		IndexReader reader = IndexReader.open(FSDirectory.open(new File(INDEXPATH)));
		return new IndexSearcher(reader);
	}

	/**
	 * Parses a query string into a {@link Query} spanning several fields.
	 *
	 * @param queryFileds  fields to search in
	 * @param queryString  text to search for
	 * @return the parsed query
	 * @throws ParseException if {@code queryString} is not valid query syntax
	 */
	public static Query createQuery(String [] queryFileds,String queryString) throws ParseException{
		QueryParser parser = new MultiFieldQueryParser(version, queryFileds, getAnalyzer());
		return parser.parse(queryString);
	}

	/**
	 * Reads a text file into a string, terminating every line with
	 * {@code "\n"} regardless of the line separator used in the file.
	 *
	 * @param file the file to read
	 * @return the file content, or an empty string for an empty file
	 * @throws RuntimeException wrapping any failure to open, read or close the file
	 */
	public static String readFileContext(File file){
		try {
			// NOTE(review): decodes with the platform default charset, as before;
			// consider an explicit charset if the source file is known to be UTF-8.
			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
			try {
				StringBuilder content = new StringBuilder();
				String line;
				while ((line = br.readLine()) != null) {
					content.append(line).append("\n");
				}
				return content.toString();
			} finally {
				// Release the file handle whether or not reading succeeded.
				br.close();
			}
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Prints several ways of locating the classpath root and the working
	 * directory; useful for checking where the index and source paths resolve.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {

		System.out.println(Thread.currentThread().getContextClassLoader()
				.getResource(""));
		System.out.println(LuceneUtils.class.getClassLoader().getResource(""));
		System.out.println(ClassLoader.getSystemResource(""));
		System.out.println(LuceneUtils.class.getResource(""));
		System.out.println(LuceneUtils.class.getResource("/")); // location of the class files
		System.out.println(new File("/").getAbsolutePath());
		System.out.println(System.getProperty("user.dir"));
	}

	/**
	 * Returns a handle to the file that serves as the data source for indexing.
	 *
	 * @return a {@link File} pointing at the configured source path; the file
	 *         is not created or checked for existence
	 */
	public static File createSourceFile() {
		return new File(INDEXSOURCE);
	}
}

附件有工程源码与jar文件