lucene

have_life

浏览: 153447 次

最近访客更多访客>>

siyu3223

dong_junshuai

thornbird313

xiaomabobo

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

数据挖掘

我找了一些hello world想试一下，不少文档冗长，又看不到效果。

http://onlyonetoone.iteye.com/blog/1546097
上面这个链接上的代码我的确试了，可以运行。
我用的 Lucene 版本是 3.6.1。

package com.yale.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * 创建一个索引
 * 
 * 
 * 
 */
/**
 * Builds a Lucene index from the {@code .txt} files found directly inside one
 * data directory (sub-directories are not descended into).
 *
 * <p>Each indexed file becomes one document with three stored fields:
 * {@code contents} (analyzed), {@code filename} and {@code fullpath} (both
 * indexed verbatim, not analyzed).
 *
 * <p>Usage: {@code java com.yale.lucene.Indexer [indexDir [dataDir]]}. When an
 * argument is omitted the built-in default path is used.
 */
public class Indexer
{
	/** Directory the index is written to when no argument is supplied. */
	private static final String DEFAULT_INDEX_DIR = "F://新建文件夹//luceneTest//indexFile";

	/** Directory scanned for files to index when no argument is supplied. */
	private static final String DEFAULT_DATA_DIR = "F://新建文件夹//luceneTest//dataSource";

	/**
	 * Indexes the data directory and prints how many documents were indexed
	 * and how long it took.
	 *
	 * @param args optional: {@code args[0]} = index directory,
	 *             {@code args[1]} = data directory
	 * @throws Exception if the index cannot be opened, written or closed, or
	 *                   a data file cannot be read
	 */
	public static void main(String[] args) throws Exception
	{
		// Fall back to the original hard-coded locations when no arguments are given.
		String indexDir = (args != null && args.length > 0) ? args[0] : DEFAULT_INDEX_DIR;
		String dataDir = (args != null && args.length > 1) ? args[1] : DEFAULT_DATA_DIR;

		long start = System.currentTimeMillis();
		Indexer indexer = new Indexer(indexDir);
		int numIndexed;
		try
		{
			numIndexed = indexer.index(dataDir, new TextFilesFilter());
		}
		finally
		{
			// Always release the writer, even when indexing fails.
			indexer.close();
		}
		long end = System.currentTimeMillis();
		System.out.println("Indexing " + numIndexed + " files took "
				+ (end - start) + " milliseconds");
	}

	/** Writer that every document is added to; released by {@link #close()}. */
	private final IndexWriter writer;

	/**
	 * Opens an {@link IndexWriter} on the given directory.
	 *
	 * @param indexDir file-system path of the index directory
	 * @throws Exception if the directory or the writer cannot be opened
	 */
	public Indexer(String indexDir) throws Exception
	{
		Directory dir = FSDirectory.open(new File(indexDir));
		// The boolean argument is the writer's "create" flag; with true an
		// existing index at this location is presumably replaced -- see the
		// Lucene 3.x IndexWriter constructor documentation.
		writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
				true, IndexWriter.MaxFieldLength.LIMITED);
	}

	/**
	 * Closes the underlying {@link IndexWriter}.
	 *
	 * @throws Exception if closing the writer fails
	 */
	public void close() throws Exception
	{
		writer.close();
	}

	/**
	 * Indexes every regular, visible, readable file directly inside
	 * {@code dir} that is accepted by {@code filter}.
	 *
	 * @param dir    path of the directory whose files are indexed
	 * @param filter decides which files are indexed; {@code null} accepts all
	 * @return the value of {@code writer.numDocs()} after indexing
	 * @throws FileNotFoundException if {@code dir} is not a listable directory
	 * @throws Exception             if reading a file or adding a document fails
	 */
	public int index(String dir, FileFilter filter) throws Exception
	{
		File[] files = new File(dir).listFiles();
		if (files == null)
		{
			// listFiles() returns null (not an empty array) when the path is
			// missing, is not a directory, or cannot be read.
			throw new FileNotFoundException(
					"Data directory does not exist or cannot be listed: " + dir);
		}
		for (File f : files)
		{
			if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()
					&& (filter == null || filter.accept(f)))
			{
				indexFile(f);
			}
		}
		return writer.numDocs();
	}

	/**
	 * Logs the file's canonical path and adds its document to the index.
	 *
	 * @param f file to index
	 * @throws Exception if the file cannot be read or the document cannot be added
	 */
	private void indexFile(File f) throws Exception
	{
		System.out.println("Indexing " + f.getCanonicalPath());
		Document doc = getDocument(f);
		writer.addDocument(doc);
	}

	/**
	 * Builds the Lucene document for one file.
	 *
	 * @param f file to convert
	 * @return a document with stored {@code contents}, {@code filename} and
	 *         {@code fullpath} fields
	 * @throws Exception if the file cannot be read or its path cannot be resolved
	 */
	private Document getDocument(File f) throws Exception
	{
		Document doc = new Document();
		doc.add(new Field("contents", File2Reader(f), Field.Store.YES,
				Field.Index.ANALYZED));
		doc.add(new Field("filename", f.getName(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));
		doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES,
				Field.Index.NOT_ANALYZED));

		return doc;
	}

	/**
	 * Reads a whole text file into a string. Despite its name this method
	 * returns the file's text, not a reader.
	 *
	 * <p>The file is decoded with the platform default charset. Every line,
	 * including the last one, is terminated with a single {@code '\n'}.
	 *
	 * @param f file to read
	 * @return the file's text with normalized line terminators
	 * @throws Exception if the file cannot be opened or read
	 */
	public static String File2Reader(File f) throws Exception
	{
		BufferedReader bf = new BufferedReader(new InputStreamReader(
				new FileInputStream(f)));
		try
		{
			StringBuilder sb = new StringBuilder();
			String line;
			while ((line = bf.readLine()) != null)
			{
				sb.append(line).append("\n");
			}
			return sb.toString();
		}
		finally
		{
			// Closing the outer reader also closes the wrapped file stream;
			// without this one file handle leaks per indexed file.
			bf.close();
		}
	}

	/** Accepts files whose name ends with {@code .txt}, ignoring case. */
	private static class TextFilesFilter implements FileFilter
	{
		@Override
		public boolean accept(File pathname)
		{
			return pathname.getName().toLowerCase().endsWith(".txt");
		}
	}
}

package com.yale.lucene;

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a query against the {@code contents} field of an existing Lucene index
 * and prints the stored fields of the best matches.
 *
 * <p>Usage: {@code java com.yale.lucene.Searcher [indexDir [queryString]]}.
 * When an argument is omitted the built-in default is used.
 */
public class Searcher
{
	/** Index location used when no argument is supplied. */
	private static final String DEFAULT_INDEX_DIR = "F://新建文件夹//luceneTest//indexFile";

	/** Query used when no argument is supplied. */
	private static final String DEFAULT_QUERY = "Apache";

	/**
	 * Searches the index and prints the results.
	 *
	 * @param args optional: {@code args[0]} = index directory,
	 *             {@code args[1]} = query string
	 * @throws Exception if the index cannot be opened or the query is invalid
	 */
	public static void main(String[] args) throws Exception
	{
		// Fall back to the original hard-coded values when no arguments are given.
		String indexDir = (args != null && args.length > 0) ? args[0] : DEFAULT_INDEX_DIR;
		String queryString = (args != null && args.length > 1) ? args[1] : DEFAULT_QUERY;
		search(indexDir, queryString);
	}

	/**
	 * Parses {@code queryString} against the {@code contents} field, fetches
	 * at most 10 hits, reports the total hit count and the search time on
	 * standard error, and prints the {@code fullpath}, {@code filename} and
	 * {@code contents} fields of each returned hit on standard output.
	 *
	 * @param indexDir    file-system path of the index directory
	 * @param queryString query in Lucene query-parser syntax
	 * @throws Exception if the index cannot be opened or read, or the query
	 *                   cannot be parsed
	 */
	public static void search(String indexDir, String queryString)
			throws Exception
	{
		Directory dir = FSDirectory.open(new File(indexDir));
		try
		{
			IndexSearcher is = new IndexSearcher(dir);
			try
			{
				QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
						new StandardAnalyzer(Version.LUCENE_30));

				Query query = parser.parse(queryString);

				// Only the search call itself is timed.
				long start = System.currentTimeMillis();
				TopDocs hits = is.search(query, 10);
				long end = System.currentTimeMillis();
				System.err.println("找到   " + hits.totalHits + "个文件 在" + (end - start)
						+ "毫秒匹配 要查询的字符串	'" + queryString + "'");

				for (ScoreDoc scoreDoc : hits.scoreDocs)
				{
					Document doc = is.doc(scoreDoc.doc);

					System.out.println(doc.get("fullpath"));

					System.out.println(doc.get("filename"));

					System.out.println(doc.get("contents"));
				}
			}
			finally
			{
				// Release the searcher even when parsing or searching fails.
				is.close();
			}
		}
		finally
		{
			// The directory is opened here, so it is released here as well.
			dir.close();
		}
	}
}

他的博客上还有一些后续的其他示例，我再看看能不能运行。

分享到：

Mahout 使用的例子 | Java 的Runnable接口和继承Thread的区别

2012-10-05 16:22
浏览 957
评论(0)
分类:互联网
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene

评论

发表评论

相关推荐

使用lingpipe自然语言处理包进行文本分类

如何运行LingPipe

搜狗实验室 提供了一些数据挖掘加工的数据

Mahout 使用的例子

决策树

最近访客更多访客>>

搜狗实验室提供了一些数据挖掘加工的数据