搜索引擎工具类

ruanzy888888

浏览: 89684 次
性别:
来自: 宜昌

最近访客更多访客>>

fanan_666

清风不识字何故乱翻书

aa8945163

xuyongff

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

开发宝典

搜索引擎 lucene F#

/**
 * Search-engine utility class: splits a large text file into small files,
 * builds a Lucene index over those small files and runs a keyword query
 * against the index.
 *
 * @author ruanzhiyong6496
 * @version 1.0
 */
public class Lucene
{
	private static String INDEX_DIR = "D:\\index";// directory where the index is stored
	private static String DATA_DIR = "D:\\small";// directory where the small files are stored

	/**
	 * Splits a large text file into small files of roughly 10 KB each. The
	 * small files are written to {@code DATA_DIR} and are named after the
	 * source file (without directory and extension) followed by a running
	 * number and ".txt". Errors are reported on the console, not propagated.
	 *
	 * @param filepath
	 *            path of the large file
	 */
	private static void splitToSmallFiles(String filepath)
	{
		int filePointer = 0;
		int maxSize = 1024 * 10;

		// Accept both separator styles: the class's own constants use Windows
		// paths, so looking for "/" only would leave the directory in the name.
		int sepIndex = Math.max(filepath.lastIndexOf('/'), filepath
				.lastIndexOf('\\'));
		int dotIndex = filepath.lastIndexOf('.');
		// Only strip an extension that belongs to the file name itself; a path
		// without one used to fail with StringIndexOutOfBoundsException.
		String fileName = dotIndex > sepIndex ? filepath.substring(
				sepIndex + 1, dotIndex) : filepath.substring(sepIndex + 1);

		BufferedReader reader = null;
		try
		{
			File dir = new File(DATA_DIR);
			if (!dir.exists() && !dir.mkdirs())
			{
				throw new IOException("cannot create directory " + dir);
			}
			reader = new BufferedReader(new FileReader(filepath));
			StringBuilder buffer = new StringBuilder();
			// Running byte count of the buffer, so the whole buffer does not
			// have to be re-encoded after every appended line.
			int bufferedBytes = 0;
			String line = reader.readLine();
			while (line != null)
			{
				String terminated = line + "\r\n";
				buffer.append(terminated);
				bufferedBytes += terminated.getBytes().length;
				if (bufferedBytes >= maxSize)
				{
					writeSmallFile(new File(dir, fileName + filePointer
							+ ".txt"), buffer.toString());
					filePointer++;
					buffer = new StringBuilder();
					bufferedBytes = 0;
				}
				line = reader.readLine();
			}
			// Flush the remainder; otherwise the tail of the input (or the
			// whole input when it is smaller than maxSize) is never written.
			if (buffer.length() > 0)
			{
				writeSmallFile(new File(dir, fileName + filePointer + ".txt"),
						buffer.toString());
			}
			System.out.println("The file hava splited to small files !");
		}
		catch (FileNotFoundException e)
		{
			System.out.println("file not found !");
			e.printStackTrace();
		}
		catch (IOException e)
		{
			e.printStackTrace();
		}
		finally
		{
			if (reader != null)
			{
				try
				{
					reader.close();
				}
				catch (IOException e)
				{
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Writes one chunk of text to the given file and always closes the writer.
	 *
	 * @param file
	 *            target file (overwritten if it exists)
	 * @param content
	 *            text to write
	 * @throws IOException
	 *             if the file cannot be opened or written
	 */
	private static void writeSmallFile(File file, String content)
			throws IOException
	{
		BufferedWriter writer = new BufferedWriter(new FileWriter(file));
		try
		{
			writer.write(content);
		}
		finally
		{
			writer.close();
		}
	}

	/**
	 * Indexes the .txt files under {@code DATA_DIR}, stores the index in
	 * {@code INDEX_DIR} (a new index is created on every call) and returns
	 * the number of indexed documents.
	 *
	 * @return number of documents in the index
	 * @throws IOException
	 *             if {@code DATA_DIR} does not exist or is not a directory,
	 *             or if indexing fails
	 */
	private static int index() throws IOException
	{
		File dataDr = new File(DATA_DIR);
		if (!dataDr.exists() || !dataDr.isDirectory())
		{
			throw new IOException(dataDr
					+ " does not exist or is not a directory");
		}

		IndexWriter writer = new IndexWriter(FSDirectory.open(new File(
				INDEX_DIR)), new StandardAnalyzer(Version.LUCENE_CURRENT),
				true, IndexWriter.MaxFieldLength.LIMITED);
		try
		{
			indexDirectory(writer, DATA_DIR);
			int numIndexed = writer.numDocs();
			writer.optimize();
			return numIndexed;
		}
		finally
		{
			// Close even when indexing fails so the writer is not left open.
			writer.close();
		}
	}

	/**
	 * Recursively walks a directory and indexes every .txt file found.
	 *
	 * @param writer
	 *            index writer the documents are added to
	 * @param dir
	 *            path of the directory to walk
	 * @throws IOException
	 *             if indexing a file fails
	 */
	private static void indexDirectory(IndexWriter writer, String dir)
			throws IOException
	{
		File dr = new File(dir);
		if (!dr.exists())
		{
			return;
		}
		File[] files = dr.listFiles();
		if (files == null)
		{
			// listFiles() returns null for a non-directory or on an I/O error.
			return;
		}
		for (int i = 0; i < files.length; i++)
		{
			File f = files[i];
			if (f.isDirectory())
			{
				// Recurse with the full path; the bare name would be resolved
				// against the working directory instead of the parent.
				indexDirectory(writer, f.getPath());
			}
			else if (f.getName().endsWith(".txt"))
			{
				indexFile(writer, f);
			}
		}
	}

	/**
	 * Indexes a single text file: its content goes into the "contents" field
	 * and its canonical path into the stored "filename" field. Hidden,
	 * missing or unreadable files are skipped.
	 *
	 * @param writer
	 *            index writer the document is added to
	 * @param f
	 *            file to index
	 * @throws IOException
	 *             if reading the file or adding the document fails
	 */
	private static void indexFile(IndexWriter writer, File f)
			throws IOException
	{
		if (f.isHidden() || !f.exists() || !f.canRead())
		{
			return;
		}

		FileReader contents = new FileReader(f);
		try
		{
			Document doc = new Document();
			doc.add(new Field("contents", contents));
			doc.add(new Field("filename", f.getCanonicalPath(),
					Field.Store.YES, Field.Index.ANALYZED));
			writer.addDocument(doc);
		}
		finally
		{
			// Release the file handle even if addDocument fails part-way.
			contents.close();
		}
	}

	/**
	 * Splits the given file into small files, re-builds the index and prints
	 * the "filename" field of the best matching documents together with the
	 * total hit count and the elapsed time. Errors are printed to the
	 * console, not propagated.
	 *
	 * @param filepath
	 *            path of the large file to split and index
	 * @param keyword
	 *            query string, parsed against the "contents" field
	 * @param topnum
	 *            maximum number of hits to print
	 */
	public static void search(String filepath, String keyword, int topnum)
	{
		IndexSearcher is = null;
		try
		{
			splitToSmallFiles(filepath);
			index();
			is = new IndexSearcher(FSDirectory.open(new File(INDEX_DIR)),
					true);// read-only
			String field = "contents";

			QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field,
					new StandardAnalyzer(Version.LUCENE_CURRENT));
			Query query = parser.parse(keyword);

			TopScoreDocCollector collector = TopScoreDocCollector.create(
					topnum, false);

			long start = new Date().getTime();// start time

			is.search(query, collector);
			ScoreDoc[] hits = collector.topDocs().scoreDocs;

			for (int i = 0; i < hits.length; i++)
			{
				Document doc = is.doc(hits[i].doc);
				// Prints the Field object itself, as the original did.
				System.out.println(doc.getField("filename"));
			}
			long end = new Date().getTime();// end time

			System.out.println("Found " + collector.getTotalHits()
					+ " document(s) (in " + (end - start)
					+ " milliseconds) that matched query '" + keyword + "':");
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
		finally
		{
			if (is != null)
			{
				try
				{
					is.close();
				}
				catch (IOException e)
				{
					e.printStackTrace();
				}
			}
		}
	}

}

分享到：

多线程任务 | 验证码

2011-04-07 10:37
浏览 732
评论(0)
分类:编程语言
查看更多

发表评论

文章已被作者锁定，不允许评论。

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

搜索引擎工具类

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

搜索引擎工具类

评论

发表评论

相关推荐

dialog

box.js

IE6 autocomplete

Callable Future

RequestContext

XUtil

ActionFilter

最新基于Json 协议的架构

权限脚本

主框架

java 备份mysql

权限管理

FTPUPLOAD

layout

ligerui.css

Accordion

LigerUI

js 控件集

fileutil

语法高亮

最近访客更多访客>>