【转】lucene3.0入门实例

liuxinglanyue

浏览: 557740 次
性别:
来自: 杭州

最近访客更多访客>>

hui963966800

lhc98

guoshun0321

kidding87

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

2011-02 ( 10)
2011-01 ( 22)
2010-12 ( 165)
更多存档...

博客分类：

lucene

lucene F#Blog

转自：http://cumtfirefly.iteye.com/blog/543664

lucene3.0已于2009-11-25发布啦，但网上的入门实例都是针对lucene3.0以前的，相对于以前的版本，貌似改动不小。
本人从头开始学习lucene，现在用的是《lucene in action中文版》，结合lucene3.0文档写了个入门实例，可供像我一样直接从lucene3.0开始学习的新手参考！

入门实例：

1.预处理：先把网上下载的一个《三国演义》电子书“三国演义.txt”（可用其他代替，呵呵）切割成多个小文件。

/**
 * Preprocessing step: cuts one large text file into a series of small files so that each
 * piece can later be handled as a separate document.
 *
 * @author ht
 */
public class FilePreprocess {
    /** A chunk is flushed once its encoded size (platform default charset) reaches this many bytes. */
    private static final int MAX_SIZE = 10240;

    /** Base name of every generated file; the chunk number and ".txt" are appended to it. */
    private static final String OUTPUT_PREFIX = "output";

    /**
     * Splits the hard-coded sample book into small files, creating the output directory first
     * when it does not exist yet.
     *
     * @param arg ignored
     */
    public static void main(String[] arg) {
        String outputpath = "D:\\test\\small\\"; // directory that receives the small files
        String filename = "D:\\test\\三国演义.txt"; // the large source file
        File outputDir = new File(outputpath);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        splitToSmallFiles(new File(filename), outputpath);
    }

    /**
     * Splits a large text file into small ones named {@code output0.txt}, {@code output1.txt}, ...
     *
     * <p>The input is read line by line; every line is re-terminated with CRLF. A chunk is written
     * as soon as its encoded size reaches {@link #MAX_SIZE} bytes, so lines are never cut in half
     * and a chunk may exceed the threshold by up to one line. Whatever remains after the last
     * line is written as a final, smaller chunk; no file is created for an empty remainder.
     *
     * <p>Reading and writing both use the platform default charset. I/O failures are reported on
     * the console and are not propagated to the caller.
     *
     * @param file       the large file to split
     * @param outputpath directory prefix for the generated files; it is concatenated directly
     *                   with the file name, so it must end with a path separator
     */
    public static void splitToSmallFiles(File file, String outputpath) {
        int filePointer = 0;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            StringBuilder buffer = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                buffer.append(line).append("\r\n");
                String chunk = buffer.toString();
                if (chunk.getBytes().length >= MAX_SIZE) {
                    writeChunk(outputpath + OUTPUT_PREFIX + filePointer + ".txt", chunk);
                    filePointer++;
                    buffer = new StringBuilder();
                }
            }
            // Skip the trailing chunk when the input ended exactly on a chunk boundary (or was
            // empty); otherwise an empty file would be produced.
            if (buffer.length() > 0) {
                writeChunk(outputpath + OUTPUT_PREFIX + filePointer + ".txt", buffer.toString());
            }
            System.out.println("The file hava splited to small files !");
        } catch (FileNotFoundException e) {
            System.out.println("file not found !");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
    }

    /** Writes {@code content} to the file at {@code path}, closing the writer even on failure. */
    private static void writeChunk(String path, String content) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(path));
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }

    /** Closes the reader if it was opened; a failure to close is only reported, not rethrown. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

2.用lucene3.0生成索引类:用lucene3.0对生成的多个小文件进行索引，中文分词用的是lucene3.0自带的StandardAnalyzer.

/**
 * Builds a Lucene index over the {@code .txt} files found under a data directory.
 *
 * @author ht
 */
public class Indexer {
    private static final String INDEX_DIR = "D:\\test\\index"; // directory that receives the index
    private static final String DATA_DIR = "D:\\test\\small\\"; // directory holding the small files

    /**
     * Indexes {@code DATA_DIR} into {@code INDEX_DIR} and prints how many documents were indexed
     * and how long it took.
     *
     * @param args ignored
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }

    /**
     * Indexes every {@code .txt} file below {@code dataDir} and stores the index in
     * {@code indexDir}.
     *
     * @param indexDir directory in which the index is created
     * @param dataDir  directory that is scanned (recursively) for {@code .txt} files
     * @return the number of documents in the index after all files were added
     * @throws IOException if {@code dataDir} does not exist or is not a directory, or if
     *                     reading the files or writing the index fails
     */
    public static int index(File indexDir, File dataDir) throws IOException {
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory");
        }

        // Constructor signature changed in Lucene 3.0. The boolean is the "create" flag.
        IndexWriter writer = new IndexWriter(
                FSDirectory.open(indexDir),
                new StandardAnalyzer(Version.LUCENE_CURRENT),
                true,
                IndexWriter.MaxFieldLength.LIMITED);
        try {
            indexDirectory(writer, dataDir);
            int numIndexed = writer.numDocs();
            writer.optimize();
            return numIndexed;
        } finally {
            // Always release the writer, even when indexing fails half-way, so that it does
            // not stay open after an error.
            writer.close();
        }
    }

    /**
     * Walks {@code dir} recursively and indexes every file whose name ends with {@code .txt}.
     *
     * @param writer the writer that receives the documents
     * @param dir    directory to scan
     * @throws IOException if a file cannot be indexed
     */
    private static void indexDirectory(IndexWriter writer, File dir) throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when dir is not a directory or cannot be read.
            return;
        }

        for (File f : files) {
            if (f.isDirectory()) {
                indexDirectory(writer, f); // recurse
            } else if (f.getName().endsWith(".txt")) {
                indexFile(writer, f);
            }
        }
    }

    /**
     * Adds a single text file to the index as one document with two fields: {@code contents}
     * (fed from a reader over the file) and {@code filename} (the canonical path, stored and
     * analyzed). Hidden, missing or unreadable files are skipped silently.
     *
     * @param writer the writer that receives the document
     * @param f      the file to index
     * @throws IOException if the file cannot be read or the document cannot be added
     */
    private static void indexFile(IndexWriter writer, File f) throws IOException {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }

        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = new Document();
        // Field construction changed in Lucene 3.0.
        // NOTE(review): FileReader decodes with the platform default charset, and the reader is
        // handed over to Lucene without being closed here - presumably Lucene closes it after
        // consuming the field; confirm.
        doc.add(new Field("contents", new FileReader(f)));
        doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED));

        writer.addDocument(doc);
    }
}

3.查询类：查询“玄德”！

/**
 * Runs a keyword query against an existing Lucene index and prints the matching documents.
 *
 * @author ht
 */
public class Searcher {
    private static final String INDEX_DIR = "D:\\test\\index\\"; // directory holding the index
    private static final String KEYWORD = "玄德"; // the keyword to search for
    private static final int TOP_NUM = 100; // number of top hits to collect and print

    /**
     * Searches the index in {@code INDEX_DIR} for {@code KEYWORD}.
     *
     * @param args ignored
     * @throws Exception if the index directory is missing or the search fails
     */
    public static void main(String[] args) throws Exception {
        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir +
                " does not exist or is not a directory.");
        }
        search(indexDir, KEYWORD);
    }

    /**
     * Parses {@code q} as a query on the {@code contents} field, collects the top
     * {@code TOP_NUM} hits and prints, for each hit, its {@code filename} field and the hit
     * itself, followed by a summary line with the total hit count and the elapsed time.
     *
     * @param indexDir directory holding the index
     * @param q        the query string
     * @throws Exception if the index cannot be opened, the query cannot be parsed, or the
     *                   search fails
     */
    public static void search(File indexDir, String q) throws Exception {
        IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir), true); // read-only
        try {
            String field = "contents";

            // QueryParser construction changed in Lucene 3.0.
            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field,
                    new StandardAnalyzer(Version.LUCENE_CURRENT));
            Query query = parser.parse(q);

            // Collector-based search API introduced with Lucene 3.0.
            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);

            long start = System.currentTimeMillis();

            is.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println(hits.length);
            for (int i = 0; i < hits.length; i++) {
                Document doc = is.doc(hits[i].doc); // fetch the stored document for this hit
                System.out.println(doc.getField("filename") + "   " + hits[i].toString() + "  ");
            }
            // The measured time deliberately spans the search and the printing of the hits.
            long end = System.currentTimeMillis();

            System.out.println("Found " + collector.getTotalHits() +
                    " document(s) (in " + (end - start) +
                    " milliseconds) that matched query '" +
                    q + "':");
        } finally {
            // Release the searcher on every path, including a failed query parse.
            is.close();
        }
    }
}

分享到：

htmlparser使用指南（转） | lucene3 搜索例子

2010-11-16 21:52
浏览 848
评论(0)
分类:互联网
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

【转】lucene3.0入门实例

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

【转】lucene3.0入门实例

评论

发表评论

相关推荐

关于Lucene的讨论

有关Lucene的问题（收藏）推荐

Lucene 学习总结（收藏）推荐

基于Lucene的Compass 资源（收藏）

Lucene 3.0.2索引文件官方文档（二）

Lucene 3.0.2索引文件官方文档（一）

Lucene 3.0 索引文件学习总结（收藏）

Lucene 字符编码问题

Lucene 字符编码问题

Annotated Lucene(源码剖析中文版)

Lucene 学习推荐博客

Lucene3.0 初窥 总结（收藏）

转：基于lucene实现自己的推荐引擎

加速 lucene 的搜索速度 ImproveSearchingSpeed（二）

加速 lucene 索引建立速度 ImproveIndexingSpeed

lucene 3.0 中的demo项目部署

Lucene 3.0.2 源码 - final class Document

Lucene 3.0.2 源码 - final class Field

Lucene 3.0.2 源码 - abstract class AbstractField

Lucene 3.0.2 源码 - interface Fieldable

最近访客更多访客>>

Lucene3.0 初窥总结（收藏）