lucene入门到项目开发

笑我痴狂

浏览: 287472 次
性别:
来自: 湖南

最近访客更多访客>>

lvye351

xiangshouxiyang

fhtwins

wueuru

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

lucene

lucene Apache 互联网 junit F#

加入以下jar包：lucene-core-2.4.0.jar、je-analysis-1.4.0.jar、lucene-highlighter-2.4.1.jar、lucene-analyzers-2.4.1.jar

先准备下工具类

package com.cs.lucene.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;

public class File2DocumentUtiles {

	/**
	 * Converts a file on disk into a Lucene {@link Document}.
	 *
	 * <p>The resulting document carries four stored fields:
	 * <ul>
	 * <li>{@code name} - the file name, indexed and analyzed</li>
	 * <li>{@code content} - the file text, indexed and analyzed</li>
	 * <li>{@code size} - the file length encoded with
	 * {@link NumberTools#longToString(long)}, indexed but not analyzed</li>
	 * <li>{@code path} - the value of {@link File#getPath()}, not indexed</li>
	 * </ul>
	 *
	 * @param filepath path of the file to convert
	 * @return the populated document
	 * @throws RuntimeException if the file content cannot be read
	 */
	public static Document file2Document(String filepath) {

		File file = new File(filepath);

		Document doc = new Document();
		doc.add(new Field("name", file.getName(), Store.YES, Index.ANALYZED)); // indexed, tokenized
		doc.add(new Field("content", readFileContent(file), Store.YES, Index.ANALYZED)); // indexed, tokenized
		doc.add(new Field("size", NumberTools.longToString(file.length()), Store.YES, Index.NOT_ANALYZED)); // indexed as a single term
		doc.add(new Field("path", file.getPath(), Store.YES, Index.NO)); // stored only

		return doc;
	}

	/**
	 * Reads the whole file as text, terminating every line with {@code "\n"}.
	 *
	 * <p>The bytes are decoded with the platform default charset, because no
	 * charset is passed to the {@link InputStreamReader}.
	 *
	 * @param file the file to read
	 * @return the file content, never {@code null}
	 * @throws RuntimeException wrapping the underlying I/O error; failing here
	 *             avoids handing a {@code null} value to the {@code Field}
	 *             constructor in {@link #file2Document(String)}
	 */
	private static String readFileContent(File file) {
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
			StringBuilder content = new StringBuilder();

			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line).append("\n");
			}
			return content.toString();
		} catch (java.io.IOException e) {
			// Keep the cause and the offending path instead of printing and returning null.
			throw new RuntimeException("Unable to read file content: " + file.getPath(), e);
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (java.io.IOException ignored) {
					// Nothing useful can be done if closing fails.
				}
			}
		}
	}

	/**
	 * Prints the {@code name}, {@code content}, {@code size} and {@code path}
	 * fields of a document to standard output.
	 *
	 * @param doc a document whose {@code size} field was written with
	 *            {@link NumberTools#longToString(long)}
	 */
	public static void printDocInfo(Document doc) {
		System.out.println("--------------------------");
		System.out.println("name          =" + doc.get("name"));
		System.out.println("content       =" + doc.get("content"));
		System.out.println("size          =" + NumberTools.stringToLong(doc.get("size")));
		System.out.println("path          =" + doc.get("path"));
	}

}

先了解下分词器

package com.cs.lucene.analyzer;

import java.io.StringReader;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;

/**
 * Prints the tokens that the je-analysis {@code MMAnalyzer} produces for a
 * sample sentence mixing Chinese and Latin text.
 */
public class AnalyzerTest {
	String text = "资源来自互联网吴朝辉wwwa的a-b放到";
	Analyzer analyzer = new MMAnalyzer();

	/** Tokenizes the sample text with the analyzer under test. */
	@Test
	public void testAnalyze() throws Exception {
		analyze(analyzer, text);
	}

	/**
	 * Tokenizes {@code input} with {@code targetAnalyzer} and prints every
	 * token on its own line.
	 *
	 * @param targetAnalyzer the analyzer to exercise
	 * @param input          the text to tokenize
	 */
	private void analyze(Analyzer targetAnalyzer, String input) throws Exception {
		System.out.println("----------分词器-------------------");
		// Use the arguments rather than the fields so the helper honours its parameters.
		TokenStream tokenStream = targetAnalyzer.tokenStream("content", new StringReader(input));
		try {
			for (Token token = new Token(); (token = tokenStream.next(token)) != null;) {
				System.out.println(token);
			}
		} finally {
			tokenStream.close();
		}
	}

}

现在看看FSDirectory和RAMDirectory

package com.cs.lucene.directory;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;

import com.cs.lucene.utils.File2DocumentUtiles;

/**
 * Demonstrates building an index in an {@link FSDirectory}, in a
 * {@link RAMDirectory}, and with the two combined.
 */
public class DirectoryTest {
	// Source file that gets indexed
	String filePath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网，版权归原创作者或原单位公司所有.txt";
	// Directory that holds the index
	String indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex";
	// Analyzer (je-analysis)
	Analyzer analyzer = new MMAnalyzer();

	/**
	 * Builds the index with an FSDirectory, i.e. stored on the file system.
	 *
	 * @throws Exception if indexing fails
	 */
	@Test
	public void testFSDirectory() throws Exception {
		Directory dir = FSDirectory.getDirectory(indexPath);
		indexSampleFile(dir);
	}

	/**
	 * Builds the index with a RAMDirectory, i.e. held in memory.
	 * Advantage: fast access. Drawback: the index is gone once the process ends.
	 *
	 * @throws Exception if indexing fails
	 */
	@Test
	public void testRAMDirectory() throws Exception {
		Directory dir = new RAMDirectory();
		indexSampleFile(dir);
	}

	/**
	 * In practice FSDirectory and RAMDirectory are used together: the
	 * in-memory index is manipulated at run time because it is faster, and
	 * everything is written back to the file system on exit.
	 *
	 * @throws Exception if indexing fails
	 */
	@Test
	public void testRAMDirectoryAndFSDirectory() throws Exception {
		Directory fsDir = FSDirectory.getDirectory(indexPath);
		// 1. Start-up: load the on-disk index into memory.
		Directory ramDir = new RAMDirectory(fsDir);
		// Run time: work on the in-memory copy. The writer is closed before
		// merging so that its buffered changes are visible.
		indexSampleFile(ramDir);

		// 2. Exit: persist. create=true discards the old on-disk index and rewrites it.
		IndexWriter fsIndexWriter = new IndexWriter(fsDir, analyzer, true, MaxFieldLength.LIMITED);
		try {
			// Merge the in-memory index without optimizing as part of the merge.
			fsIndexWriter.addIndexesNoOptimize(new Directory[] { ramDir });
			// commit() replaces the deprecated flush(); IndexDao.save uses it as well.
			fsIndexWriter.commit();
			// Optimize before closing; an optimized index is faster to search.
			fsIndexWriter.optimize();
		} finally {
			fsIndexWriter.close();
		}
	}

	/**
	 * Optimizes the existing on-disk index.
	 *
	 * @throws Exception if the index cannot be opened or optimized
	 */
	@Test
	public void testOptimize() throws Exception {
		Directory fsDir = FSDirectory.getDirectory(indexPath);
		IndexWriter fsIndexWriter = new IndexWriter(fsDir, analyzer, MaxFieldLength.LIMITED);
		try {
			fsIndexWriter.optimize();
		} finally {
			fsIndexWriter.close();
		}
	}

	/**
	 * Adds the sample file to the index in {@code dir}.
	 *
	 * <p>The writer is opened without a create flag, so documents are added
	 * to the index rather than replacing it. MaxFieldLength.LIMITED indexes
	 * only the first 10000 terms of a field.
	 *
	 * @param dir the directory holding the index
	 * @throws Exception if the file cannot be converted or indexed
	 */
	private void indexSampleFile(Directory dir) throws Exception {
		Document doc = File2DocumentUtiles.file2Document(filePath);
		IndexWriter indexWriter = new IndexWriter(dir, analyzer, MaxFieldLength.LIMITED);
		try {
			indexWriter.addDocument(doc);
		} finally {
			// Always close so a failure does not leave the writer (and its lock) open.
			indexWriter.close();
		}
	}
}

现在来测测索引如何建立以及搜索

package com.cs.lucene.lucene;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import com.cs.lucene.utils.File2DocumentUtiles;

/**
 * Data-access helper that adds files to a Lucene index and runs paged,
 * highlighted searches against it.
 */
public class IndexDao {
	/** Maximum number of hits fetched from the searcher per query. */
	private static final int MAX_HITS = 10000;
	/** Fragment size handed to the highlighter's SimpleFragmenter. */
	private static final int FRAGMENT_SIZE = 500;
	/** Length of the plain-text summary used when nothing could be highlighted. */
	private static final int SUMMARY_LENGTH = 200;

	// Directory that holds the index
	private String indexPath;
	private Analyzer analyzer = null; // analyzer used for both indexing and searching

	/** Creates a DAO with the tutorial's default index path and the je-analysis analyzer. */
	public IndexDao() {
		this.indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex";
		this.analyzer = new MMAnalyzer(); // je-analysis handles Chinese text well

	}

	/**
	 * Creates a DAO for a specific analyzer and index location.
	 *
	 * @param analyzer  analyzer used for indexing and query parsing
	 * @param indexPath directory that holds the index
	 */
	public IndexDao(Analyzer analyzer, String indexPath) {
		this.analyzer = analyzer;
		this.indexPath = indexPath;
	}

	/**
	 * Parses a query string against the {@code name} and {@code content}
	 * fields and returns one page of results.
	 *
	 * @param queryString query in Lucene query-parser syntax
	 * @param firstResult zero-based index of the first hit to return
	 * @param maxResults  maximum number of hits to return
	 * @return total hit count plus the requested page of documents
	 * @throws Exception if the query cannot be parsed or the search fails
	 */
	public QueryResult search(String queryString, int firstResult,
			int maxResults) throws Exception {
		// 1. Turn the query string into a Query over several fields.
		String[] fields = { "name", "content" };
		// A keyword found in the name should score higher than one found in the content.
		Map<String, Float> boosts = new HashMap<String, Float>();
		boosts.put("name", 3.0f);
		boosts.put("content", 1.0f); // default weight

		QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer,
				boosts);
		Query query = queryParser.parse(queryString);

		return search(query, firstResult, maxResults);
	}

	/**
	 * Runs a query and returns one page of results, sorted by the
	 * {@code size} field in descending order.
	 *
	 * <p>The {@code content} field of every returned document is replaced by
	 * the best highlighted fragment, or by the first 200 characters of the
	 * content when no fragment matched. Documents without a stored
	 * {@code content} value are returned unchanged.
	 *
	 * @param query       the query to execute
	 * @param firstResult zero-based index of the first hit to return; negative
	 *                    values are treated as 0
	 * @param maxResults  maximum number of hits to return
	 * @return total hit count plus the requested page of documents
	 * @throws Exception if the index cannot be opened or searched
	 */
	public QueryResult search(Query query, int firstResult, int maxResults)
			throws Exception {
		// 2. Execute the query.
		IndexSearcher indexSearcher = new IndexSearcher(indexPath);
		try {
			// No filter is applied. A filter such as a RangeFilter on "size"
			// could be plugged in here, at the cost of a second pass over the results.
			Filter filter = null;

			// Ascending is the default; reverse=true switches to descending order.
			Sort sort = new Sort();
			sort.setSort(new SortField[] { new SortField("size", true) });
			TopDocs topDocs = indexSearcher.search(query, filter, MAX_HITS, sort);

			int recordCount = topDocs.totalHits;
			Highlighter highlighter = createHighlighter(query);

			// 3. Collect the requested page. The window is bounded by the hits
			// actually fetched, which can be fewer than totalHits.
			List<Document> recordList = new ArrayList<Document>();
			int start = Math.max(firstResult, 0);
			int end = (int) Math.min((long) start + maxResults, (long) topDocs.scoreDocs.length);
			for (int i = start; i < end; i++) {
				ScoreDoc scoreDoc = topDocs.scoreDocs[i];
				Document doc = indexSearcher.doc(scoreDoc.doc);

				String content = doc.get("content");
				if (content != null) {
					String hc = highlighter.getBestFragment(analyzer, "content", content);
					// No keyword in the content: fall back to its leading characters.
					if (hc == null) {
						hc = content.substring(0, Math.min(SUMMARY_LENGTH, content.length()));
					}
					doc.getField("content").setValue(hc);
				}

				recordList.add(doc);
			}

			// 4. Return the result.
			return new QueryResult(recordCount, recordList);
		} finally {
			// The documents were already loaded, so the searcher can be released.
			indexSearcher.close();
		}
	}

	/**
	 * Builds the highlighter that wraps matched terms of {@code query} in a
	 * red {@code <font>} tag.
	 */
	private Highlighter createHighlighter(Query query) {
		Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",
				"</font>");
		Scorer scorer = new QueryScorer(query);
		Highlighter highlighter = new Highlighter(formatter, scorer);

		Fragmenter fragmenter = new SimpleFragmenter(FRAGMENT_SIZE);
		highlighter.setTextFragmenter(fragmenter);
		return highlighter;
	}

	/**
	 * Indexes a single file and saves it to the index.
	 *
	 * @param filePath path of the file to index
	 * @throws Exception if the file cannot be read or the index cannot be written
	 */
	public void save(String filePath) throws Exception {
		Document doc = File2DocumentUtiles.file2Document(filePath);
		// A document boost could be set here (doc.setBoost), but that is not recommended.

		// create=false: add to the existing index instead of recreating it.
		// MaxFieldLength.LIMITED indexes only the first 10000 terms of a field.
		IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, false,
				MaxFieldLength.LIMITED);
		try {
			indexWriter.addDocument(doc);
			indexWriter.commit();
			indexWriter.optimize();
		} finally {
			// Always close so a failure does not leave the writer open.
			indexWriter.close();
		}

	}

	/**
	 * Indexes a single file and saves it to the index.
	 *
	 * @param file the file to index
	 * @throws Exception if the file cannot be read or the index cannot be written
	 */
	public void save(File file) throws Exception {
		save(file.getAbsolutePath());
	}

	/**
	 * Indexes a file, or every file below a directory, recursively.
	 *
	 * @param file a file or a directory
	 * @throws IllegalArgumentException if {@code file} is not a regular file
	 *                                  and its children cannot be listed
	 * @throws Exception                if a file cannot be read or indexed
	 */
	public void saveDirectory(File file) throws Exception {
		if (file.isFile()) { // a plain file: index it directly
			save(file.getAbsolutePath());
			return;
		}
		File[] childs = file.listFiles();
		if (childs == null) {
			// listFiles() returns null when the path is not a directory or cannot be read.
			throw new IllegalArgumentException(
					"Not a readable file or directory: " + file.getAbsolutePath());
		}
		for (int i = 0; i < childs.length; i++) {
			File f = childs[i];
			if (f.isDirectory()) { // recurse into sub-directories
				saveDirectory(f);
			} else {
				save(f.getAbsolutePath());
			}
		}
	}

	/**
	 * Recursion demo: prints the directory tree below {@code file} and indexes
	 * nothing.
	 *
	 * @param file    a file or a directory
	 * @param pointer current depth; each level adds a {@code "--"} prefix
	 */
	public void save(File file, int pointer) throws Exception {
		StringBuilder str = new StringBuilder();
		for (int i = 0; i < pointer; i++) {
			str.append("--");
		}
		if (file.isFile()) {
			System.out.println(str + file.getName());
			return;
		}
		File[] childs = file.listFiles();
		if (childs == null) {
			// Not a directory or not readable: nothing to print.
			return;
		}
		for (int i = 0; i < childs.length; i++) {
			File f = childs[i];
			System.out.println(str + f.getName());
			if (f.isDirectory()) { // recurse into sub-directories
				save(f, pointer + 1);
			}
		}
	}
}

package com.cs.lucene.lucene;

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;

/**
 * One page of search results together with the total number of hits.
 */
public class QueryResult {
	/** Total number of hits reported for the query. */
	private int recordCount;
	/** Documents that make up the current page. */
	private List<Document> recordResults;

	/**
	 * @param recordCount   total number of hits
	 * @param recordResults documents of the current page
	 */
	public QueryResult(int recordCount, List<Document> recordResults) {
		this.recordResults = recordResults;
		this.recordCount = recordCount;
	}

	/** @return the total number of hits */
	public int getRecordCount() {
		return this.recordCount;
	}

	/** @param recordCount the total number of hits */
	public void setRecordCount(int recordCount) {
		this.recordCount = recordCount;
	}

	/** @return the documents of the current page */
	public List<Document> getRecordResults() {
		return this.recordResults;
	}

	/** @param recordResults the documents of the current page */
	public void setRecordResults(List<Document> recordResults) {
		this.recordResults = recordResults;
	}

}

测试索引

package com.cs.lucene.lucene;

import java.io.File;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.junit.Test;

import com.cs.lucene.utils.File2DocumentUtiles;

/**
 * Exercises {@link IndexDao}: searching the index, indexing a single file and
 * indexing a whole directory.
 */
public class IndexDaoTest {

	private IndexDao indexDao = new IndexDao();

	/**
	 * Searches the index and prints the first ten matches.
	 */
	@Test
	public void testSearch() throws Exception {
		QueryResult queryResults = indexDao.search("www*", 0, 10);

		// Report the total first, then every document of the page.
		System.out.println("总共有【" + queryResults.getRecordCount() + "】条匹配结果");
		for (Document doc : queryResults.getRecordResults()) {
			File2DocumentUtiles.printDocInfo(doc);
		}
	}

	/**
	 * Indexes one source file and saves it to the index.
	 */
	@Test
	public void testSave() throws Exception {
		// Alternative source file:
		// "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网，版权归原创作者或原单位公司所有吴朝辉.txt"
		String filePath2 = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\readme2.txt";

		indexDao.save(filePath2);
	}

	/**
	 * Indexes every file below a directory and saves them to the index.
	 */
	@Test
	public void testSaveDir() throws Exception {
		File file = new File("E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\www");
		indexDao.saveDirectory(file);
	}
}

最后我们来看看lucene的查询功能

package com.cs.lucene.query;

import java.util.Date;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.junit.Test;

import com.cs.lucene.lucene.IndexDao;
import com.cs.lucene.lucene.QueryResult;
import com.cs.lucene.utils.File2DocumentUtiles;

public class QueryTest {
	
	IndexDao indexDao = new IndexDao() ;
	
	/*
	 * 关键词查询
	 */
	@Test
	public void testTermQuery() throws Exception{
		Term term = new Term("name","资源");
		Query query = new TermQuery(term);
	
		//查询打印结果
		 QueryAndPrintResult(query) ;
	}
	/*
	 * 范围索引
	 * 数字在query中都是字符串，所以要借助NumberTools工具类做转换
	 */
	@Test
	public void testRangeQuery() throws Exception{
		Term lowerTerm = new Term("size",NumberTools.longToString(200));
		Term upperTerm = new Term("size",NumberTools.longToString(500));
		//true表示是否包含边界
		Query query = new RangeQuery(lowerTerm,upperTerm,true) ;
		
		/*
		Term lowerTerm2 = new Term("size","200");
		Term upperTerm2 = new Term("size","500");
		Query query = new RangeQuery(lowerTerm2,upperTerm2,true) ; //true表示是否包含边界
	*/
		//查询打印结果
		 QueryAndPrintResult(query) ;
	}
	
	/*
	 * 测试NumberTools和DateTools
	 */
	@Test
	public void testNumberToolsAndDateTools() throws Exception{
		
	System.out.println("数字测试：");	
		 System.out.println(NumberTools.longToString(200));
		 System.out.println(NumberTools.longToString(500));
		 System.out.println(NumberTools.stringToLong("000000000000dw"));
		
	System.out.println("日期测试：");	
		 System.out.println(DateTools.dateToString(new Date(), Resolution.SECOND));
		 System.out.println(DateTools.dateToString(new Date(), Resolution.DAY));
		 System.out.println(DateTools.stringToDate("20101005080855"));
	}
	
	/*
	 * 通配符查询
	 * ？：代表一个字符，*：代表0个或多个字符
	 */
	@Test
	public void testWildcardQuery() throws Exception{
		Term term = new Term("name","*me");
		Query query = new WildcardQuery(term) ; 
	
		//查询打印结果
		 QueryAndPrintResult(query) ;
	}
	/*
	 * 短语查询：查询包含多个短语的query
	 */
	@Test
	public void testPhraseQuery() throws Exception{
		PhraseQuery phraseQuery = new PhraseQuery() ; 
		phraseQuery.add(new Term("name","资源")) ;
		phraseQuery.add(new Term("name","作者")) ;
		
		//setSlop：用来设置两个短语之间的最多可以隔多少个字符
		phraseQuery.setSlop(20);
	
		//查询打印结果
		 QueryAndPrintResult(phraseQuery) ;
	}
	/**
	 * 布尔查询：非常重要
	 * 三种关系：
	 * 1.MUST和MUST：取得两个查询子句的交集。
	 * 2.MUST和MUST_NOT：包含MUST但并且查询结果中不包含MUST_NOT的检索结果。
	 * 3.SHOULT和SHOULT：表示"或"关系，最终检索结果为所有检索子句的并集。
	 * 注意：有些组合是没有意义的
	 * @throws Exception
	 */
	@Test
	public void testBooleanQuery() throws Exception{
		//条件1
		PhraseQuery phraseQuery = new PhraseQuery() ; 
		phraseQuery.add(new Term("name","资源")) ;
		phraseQuery.add(new Term("name","作者")) ;
		phraseQuery.setSlop(20);
		
		//条件2
		Term lowerTerm2 = new Term("size","200");
		Term upperTerm2 = new Term("size","500");
		Query rangeQuery = new RangeQuery(lowerTerm2,upperTerm2,true) ; //true表示是否包含边界
	
		//合并两个查询
		BooleanQuery booleanQuery = new BooleanQuery() ;
	    booleanQuery.add(phraseQuery, Occur.MUST) ;
		booleanQuery.add(rangeQuery,Occur.MUST) ;
	
		//查询打印结果
		QueryAndPrintResult(booleanQuery) ;
	}
	
	
	
    private void QueryAndPrintResult(Query query) throws Exception{
    	
    	System.out.println("相对应的查询字符串："+query);
    	  QueryResult qr = indexDao.search(query, 0, 100) ;
		 System.out.println("总共有【"+qr.getRecordCount()+"】条匹配结果");
		
    	//打印结果
		  for(int i =0 ; i<qr.getRecordResults().size();i++){
	    	  Document doc = qr.getRecordResults().get(i) ;
	    	  File2DocumentUtiles.printDocInfo(doc) ;
	      }
    }
}

通过以上学习，应该对lucene开发没什么问题了。恭喜您，您又向前迈进了一步！

分享到：

dwr配置 | lucene根据文件类型自动解析的工厂类

2010-10-10 15:53
浏览 3369
评论(2)
分类:企业架构
查看更多

2 楼笑我痴狂 2010-10-19

出入并没有关系

1 楼 guofengcn 2010-10-11

我怎么感觉和ITCAse那个视频上讲的东西很像呢！呵呵

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene入门到项目开发

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene入门到项目开发

评论

发表评论

相关推荐

lucene根据文件类型自动解析的工厂类

lucene如何解析Doc文档

lucene如何解析pdf文档

lucene如何解析PPT文档

lucene如何抽取html网页

最近访客更多访客>>