lucene中文分词

gjs622520

浏览: 42132 次
性别:
来自: 北京

最近访客更多访客>>

yufei2999

xionghaoming

sky1yy

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (19)

社区版块

存档分类

lucene Apache Ajax 搜索引擎

最近看了看lucene，看了一下那本《征服AJAX.LUCENE构建搜索引擎》，不能不说这本书有骗钱嫌疑，因为书中写了一大堆js跟ajax的内容，而似乎书中并没有结合ajax、lucene写出什么例子，当然书名取得好呀，如果拆开来看的话，就不能怪作者了，也就是说你相当于买了两本书，一本是《征服Ajax》，另一本是《LUCENE构建搜索引擎》，也可能这就是作者的本意吧，买一送一，作者好人也。
ajax部分没看，lucene部分还是花了工夫写的，只是书中用的版本有点旧了（1.4），而现在已经是2.4，有些方法已经不存在了，所以我只能下载1.4、1.9、2.4三个版本的包，在2.4里找不到的方法就去1.9里看它被什么替换了，这版本问题真是…………
CreateIndex.java

package test;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.LockObtainFailedException;

/**
 * Builds a Lucene index from the files found under a source directory.
 */
public class CreateIndex {
	/**
	 * Creates the index at {@code indexPath} from the matching files under
	 * {@code sourcePath}. The writer is opened with {@code create=true}.
	 * Each file contributes one document with the fields {@code title}
	 * (file name), {@code content} (file text, read with the platform default
	 * charset) and {@code path} (absolute path).
	 * <p>
	 * Failures are reported with {@code printStackTrace()} and are not
	 * propagated to the caller. If {@code sourcePath} is not a directory,
	 * nothing is indexed and the index directory is left untouched.
	 *
	 * @param indexPath directory in which the index is written
	 * @param sourcePath directory that is scanned for files to index
	 * @param fileSuffix file extensions to include
	 * @param recursive whether sub-directories are scanned as well
	 * @param analyzer analyzer class; it is instantiated through its no-arg constructor
	 */
	@SuppressWarnings("unchecked")
	public void createIndex(String indexPath,String sourcePath,String[] fileSuffix,boolean recursive,Class analyzer){
		// Resolve the input first so an invalid source neither fails with a
		// NullPointerException nor overwrites the index at indexPath.
		Collection<File> files=getFiles(sourcePath, fileSuffix, recursive);
		if(files==null){
			System.err.println("搜索目录不是有效目录："+sourcePath);
			return;
		}
		IndexWriter writer=null;
		try {
			Analyzer an=(Analyzer) analyzer.newInstance();
			writer=new IndexWriter(indexPath,an,true,MaxFieldLength.LIMITED);
			for (File file : files) {
				addFile(writer, file);
			}
			// Optimize once after all documents are added instead of once per file.
			writer.optimize();
		} catch (IOException e) {
			// Also covers CorruptIndexException and LockObtainFailedException,
			// which are IOException subclasses and were handled identically.
			e.printStackTrace();
		} catch (InstantiationException e) {
			e.printStackTrace();
		} catch (IllegalAccessException e) {
			e.printStackTrace();
		} finally {
			// Always close the writer so the index write lock is not left
			// behind when indexing fails half-way.
			if(writer!=null){
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Adds one file to the index as a single document.
	 *
	 * @param writer open index writer
	 * @param file file to index
	 * @throws IOException if the file cannot be read or the document cannot be added
	 */
	private void addFile(IndexWriter writer,File file) throws IOException{
		Reader reader = new FileReader(file);
		try {
			Document doc=new Document();
			doc.add(new Field("title",file.getName(),Field.Store.YES,Field.Index.ANALYZED));
			doc.add(new Field("content",reader));
			doc.add(new Field("path",file.getAbsolutePath(),Field.Store.YES,Field.Index.ANALYZED));
			writer.addDocument(doc);
		} finally {
			// Release the file handle even when addDocument fails.
			reader.close();
		}
	}
	
	/**
	 * Lists the files under a directory that have one of the given extensions.
	 *
	 * @param path directory to scan
	 * @param fileSuffix file extensions to include
	 * @param recursive whether sub-directories are scanned as well
	 * @return the matching files, or {@code null} if {@code path} is not a directory
	 */
	@SuppressWarnings("unchecked")
	public Collection<File> getFiles(String path,String[] fileSuffix,boolean recursive){
		File src=new File(path);
		if(src.isDirectory()){
			return FileUtils.listFiles(src, fileSuffix, recursive);
		}
		return null;
	}
	
}

Searcher.java

package test;

import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;

/**
 * Runs keyword queries against the {@code content} field of a Lucene index
 * and prints the {@code path} field of each hit.
 */
public class Searcher {
	private Analyzer an;
	private IndexSearcher searcher;

	/**
	 * Opens the index and instantiates the analyzer used to parse queries.
	 * Failures are reported with {@code printStackTrace()} and leave the
	 * corresponding field unset.
	 *
	 * @param analyzer analyzer class; it is instantiated through its no-arg constructor
	 * @param indexPath directory that contains the index
	 */
	@SuppressWarnings("unchecked")
	public Searcher(Class analyzer,String indexPath){		
		try {
			an=(Analyzer) analyzer.newInstance();
			searcher=new IndexSearcher(IndexReader.open(new File(indexPath)));
		} catch (CorruptIndexException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (InstantiationException e) {
			e.printStackTrace();
		} catch (IllegalAccessException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Parses {@code keyWord} as a query on the {@code content} field and
	 * collects at most 10 top-scoring hits.
	 *
	 * @param keyWord query string in QueryParser syntax
	 * @return the hits, or {@code null} if parsing or searching failed
	 */
	public ScoreDoc[] doSearch(String keyWord){
		QueryParser parser=new QueryParser("content",an);
		ScoreDoc[] hits=null;
		try {
			Query q=parser.parse(keyWord);
			TopDocCollector collector = new TopDocCollector(10);
			searcher.search(q, collector);
			hits = collector.topDocs().scoreDocs;
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return hits;
	}

	/**
	 * Prints the number of hits followed by the stored {@code path} of each one.
	 *
	 * @param hits hits as returned by {@link #doSearch(String)}; {@code null}
	 *             (a failed search) is reported as zero results
	 */
	public void displayResult(ScoreDoc[] hits){		
		if(hits==null){
			// doSearch returns null on failure; report it as an empty result
			// instead of throwing a NullPointerException.
			System.out.println("共找到0个文件：");
			return;
		}
		System.out.println("共找到"+hits.length+"个文件：");
		for(ScoreDoc hit : hits){
			try {
				int docId=hit.doc;
				// Document.get yields null for a missing field rather than
				// throwing a NullPointerException on getField(...).stringValue().
				System.out.println(searcher.doc(docId).get("path"));
			} catch (CorruptIndexException e) {
				e.printStackTrace();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
}

/**
 * Demo driver: indexes the .txt files under D:/source into D:/index with
 * MIK_CAnalyzer, then runs a fixed list of queries and prints the hits and
 * the elapsed time of each query.
 */
public class App {
	public static void main(String[] args) {
		CreateIndex ci=new CreateIndex();
		ci.createIndex("D:/index", "D:/source", new String[]{"txt"}, true,MIK_CAnalyzer.class);
		Searcher s=new Searcher(MIK_CAnalyzer.class,"D:/index");		
		String[] keyWords={"朱元璋 李寻欢","黄蓉","郭靖","李寻欢","姚广孝"};
		for (String keyWord : keyWords) {
			timedSearch(s, keyWord);
		}
	}

	/**
	 * Runs one query, prints its results, then prints the elapsed wall-clock
	 * time (search plus result display) and a separator line.
	 *
	 * @param s searcher to query
	 * @param keyWord query string
	 */
	private static void timedSearch(Searcher s,String keyWord){
		long start=System.currentTimeMillis();
		s.displayResult(s.doSearch(keyWord));
		System.out.println("查询  "+keyWord+"  用了："+(System.currentTimeMillis()-start)+"毫秒");
		System.out.println("***************************************");
	}
}

结果：
共找到10个文件：
D:\source\古龙全集\多情剑客无情剑.txt
D:\source\明朝那些事\明朝那些事儿1朱元璋卷.txt
D:\source\古龙全集\飞刀，又见飞刀.txt
D:\source\古龙全集\古龙传奇.txt
D:\source\明朝那些事\明朝那些事儿2.txt
D:\source\古龙全集\大旗英雄传.txt
D:\source\古龙全集\天涯明月刀.txt
D:\source\古龙全集\怒剑狂花.txt
D:\source\明朝那些事\明朝那些事儿 4.txt
D:\source\明朝那些事\明朝那些事儿3.txt
查询朱元璋李寻欢用了：1063毫秒
***************************************
共找到2个文件：
D:\source\金庸全集\神雕侠侣.txt
D:\source\金庸全集\倚天屠龙记.txt
查询黄蓉用了：0毫秒
***************************************
共找到4个文件：
D:\source\金庸全集\神雕侠侣.txt
D:\source\金庸全集\倚天屠龙记.txt
D:\source\金庸全集\射雕英雄传.txt
D:\source\古龙全集\古龙传奇.txt
查询郭靖用了：0毫秒
***************************************
共找到6个文件：
D:\source\古龙全集\多情剑客无情剑.txt
D:\source\古龙全集\飞刀，又见飞刀.txt
D:\source\古龙全集\古龙传奇.txt
D:\source\古龙全集\大旗英雄传.txt
D:\source\古龙全集\天涯明月刀.txt
D:\source\古龙全集\怒剑狂花.txt
查询李寻欢用了：0毫秒
***************************************
共找到1个文件：
D:\source\明朝那些事\明朝那些事儿2.txt
查询姚广孝用了：0毫秒
***************************************
射雕英雄传.txt里应该有“黄蓉”，可是没有搜到，虽然IKAnalyzer2.0.2已有很大改进。
改为使用lucene的StandardAnalyzer测试：

/**
 * Demo driver: indexes the .txt files under D:/source into D:/index2 with
 * StandardAnalyzer, then runs a fixed list of queries and prints the hits and
 * the elapsed time of each query.
 */
public class App {
	public static void main(String[] args) {
		CreateIndex ci=new CreateIndex();
		ci.createIndex("D:/index2", "D:/source", new String[]{"txt"}, true,StandardAnalyzer.class);
		Searcher s=new Searcher(StandardAnalyzer.class,"D:/index2");		
		String[] keyWords={"朱元璋 李寻欢","黄蓉","郭靖","李寻欢","姚广孝"};
		for (String keyWord : keyWords) {
			timedSearch(s, keyWord);
		}
	}

	/**
	 * Runs one query, prints its results, then prints the elapsed wall-clock
	 * time (search plus result display) and a separator line.
	 *
	 * @param s searcher to query
	 * @param keyWord query string
	 */
	private static void timedSearch(Searcher s,String keyWord){
		long start=System.currentTimeMillis();
		s.displayResult(s.doSearch(keyWord));
		System.out.println("查询  "+keyWord+"  用了："+(System.currentTimeMillis()-start)+"毫秒");
		System.out.println("***************************************");
	}
}

结果：
共找到7个文件：
D:\source\明朝那些事\明朝那些事儿1朱元璋卷.txt
D:\source\明朝那些事\明朝那些事儿2.txt
D:\source\古龙全集\多情剑客无情剑.txt
D:\source\明朝那些事\明朝那些事儿3.txt
D:\source\古龙全集\飞刀，又见飞刀.txt
D:\source\古龙全集\大旗英雄传.txt
D:\source\古龙全集\怒剑狂花.txt
查询朱元璋李寻欢用了：62毫秒
***************************************
共找到2个文件：
D:\source\金庸全集\倚天屠龙记.txt
D:\source\金庸全集\神雕侠侣.txt
查询黄蓉用了：0毫秒
***************************************
共找到2个文件：
D:\source\金庸全集\倚天屠龙记.txt
D:\source\金庸全集\神雕侠侣.txt
查询郭靖用了：16毫秒
***************************************
共找到4个文件：
D:\source\古龙全集\多情剑客无情剑.txt
D:\source\古龙全集\飞刀，又见飞刀.txt
D:\source\古龙全集\大旗英雄传.txt
D:\source\古龙全集\怒剑狂花.txt
查询李寻欢用了：0毫秒
***************************************
共找到1个文件：
D:\source\明朝那些事\明朝那些事儿2.txt
查询姚广孝用了：0毫秒
***************************************

两种分词器的准确率差了好多。

分享到：

oracle webservice axis | flex3与java通信

2008-12-18 13:59
浏览 1674
评论(1)
查看更多

1 楼 lwlsoftware 2009-01-03

Java基础不错学习了

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论