Lucene开发实例--代码篇

rwg109

浏览: 223747 次
性别:
来自: 南京

最近访客更多访客>>

w603257390

extjos

china-shrimp

huting0211

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Lucene

lucene maven Apache Eclipse F#

上次已经介绍了Lucene的功能及基本语法，这次再与大家分享一下实际开发中用Lucene查询数据的代码。

我是使用maven来添加jar包的。在maven的pom.xml中添加lucene的jar包的方法是：

<!-- Maven build descriptor for the Lucene example project (com.rwg.lucene:cms). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.rwg.lucene</groupId>
  <artifactId>cms</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <!-- NOTE(review): packaging is "jar" although maven-war-plugin and the Jetty plugin are configured below; confirm which is intended. -->
  <packaging>jar</packaging>

  <name>lucene</name>
  <url>http://maven.apache.org</url>

  <properties>
    <!-- Source files are read as UTF-8 by the build. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <build>
  	<finalName>webapp</finalName>
  	<sourceDirectory>src/main/java/</sourceDirectory>
  	<testSourceDirectory>src/test/java/</testSourceDirectory>
  	<plugins>
  		<!-- Compile with Java 1.6 source/target level and UTF-8 encoding. -->
  		<plugin>
  			<groupId>org.apache.maven.plugins</groupId>
  			<artifactId>maven-compiler-plugin</artifactId>
  			<version>2.3.2</version>
  			<configuration>
  				<source>1.6</source>
  				<target>1.6</target>
  				<encoding>UTF-8</encoding>
  			</configuration>
  		</plugin>
  		<!-- Copy resources using UTF-8 encoding. -->
  		<plugin>
  			<groupId>org.apache.maven.plugins</groupId>
  			<artifactId>maven-resources-plugin</artifactId>
  			<version>2.4.3</version>
  			<configuration>
  				<encoding>UTF-8</encoding>
  			</configuration>
  		</plugin>
  		<!-- Jetty Maven plugin (no extra configuration). -->
  		<plugin>
  			<groupId>org.mortbay.jetty</groupId>
  			<artifactId>jetty-maven-plugin</artifactId>
  			<version>8.0.0.M1</version>
  		</plugin>
  		<!-- Eclipse project generation, with the WTP settings and extra build commands listed here. -->
  		<plugin>
  			<groupId>org.apache.maven.plugins</groupId>
  			<artifactId>maven-eclipse-plugin</artifactId>
  			<version>2.8</version>
  			<configuration>
  				<addVersionToProjectName>false</addVersionToProjectName>
  				<useProjectReferences>false</useProjectReferences>
  				<wtpmanifest>false</wtpmanifest>
  				<wtpapplicationxml>true</wtpapplicationxml>
  				<wtpversion>1.6</wtpversion>
  				<additionalBuildcommands>
  					<buildcommand>org.eclipse.jdt.core.javabuilder</buildcommand>
  					<buildcommand>org.eclipse.wst.common.project.facet.core.builder</buildcommand>
  					<buildcommand>org.eclipse.wst.validation.validationbuilder</buildcommand>
  				</additionalBuildcommands>
  			</configuration>
  		</plugin>
  		<!-- WAR plugin configured with the WAR name "webapp". -->
  		<plugin>
  			<groupId>org.apache.maven.plugins</groupId>
  			<artifactId>maven-war-plugin</artifactId>
  			<version>2.1.1</version>
  			<configuration>
  				<warName>webapp</warName>
  			</configuration>
  		</plugin>
  	</plugins>
  </build>



<dependencies>
     <!-- The two Lucene jars this project depends on. -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>3.0.2</version>
			<type>jar</type>
			<scope>compile</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-demos</artifactId>
			<version>3.0.2</version>
			<type>jar</type>
			<scope>compile</scope>
		</dependency>
</dependencies>

</project>

     没有使用maven的同学，请在附件中下载lucene 3.0.2的两个jar包。


1：运行第一步(建立索引)代码,目的是：将要搜索的txt文件的内容根据字段替换为索引,
并将替换后的索引保存到你所指定的文件夹中。
   简单来说就是：创建索引文件。

package com.rwg.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * 创建索引 Lucene 3.0(第一步)
 * 
 * @author RenWeigang
 * 
 * @version 2010.12.13
 * 
 */
public class Indexer {
	
	//保存索引文件的地方
	private static String INDEX_DIR = "E:\\renwg\\茶余饭后\\新建文件夹";
	//将要搜索TXT文件的地方
	private static String DATA_DIR = "E:\\renwg\\茶余饭后";

	public static void main(String[] args) throws Exception {
		long start = new Date().getTime();
		int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));
		long end = new Date().getTime();

		System.out.println("Indexing " + numIndexed + " files took "
				+ (end - start) + " milliseconds");
	}

	/**
	 * 索引dataDir下.txt文件，并储存在indexDir下，返回索引的文件数量
	 * 
	 * @param indexDir
	 * @param dataDir
	 * @return
	 * @throws IOException
	 */
	public static int index(File indexDir, File dataDir) throws IOException {
		if (!dataDir.exists() || !dataDir.isDirectory()) {
			throw new IOException(dataDir
					+ " does not exist or is not a directory");
		}
		/**
		 * 创建IndexWriter对象,
		 * 第一个参数是Directory,也可以为：Directory dir = new SimpleFSDirectory(new File(indexDir));
		 * 第二个是分词器,
		 * 第三个表示是否是创建,
		 * 如果为false为在此基础上面修改,
		 * 第四表示表示分词的最大值，比如说new MaxFieldLength(2)，
		 * 就表示两个字一分，一般用IndexWriter.MaxFieldLength.LIMITED
		 *     
		 */
		IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
				new StandardAnalyzer(Version.LUCENE_30), true,
				IndexWriter.MaxFieldLength.LIMITED);
		indexDirectory(writer, dataDir);
		
		//查看IndexWriter里面有多少个索引 
		int numIndexed = writer.numDocs();
		writer.optimize();
		writer.close();
		return numIndexed;
	}

	/**
	 * 循环遍历dir下的所有.txt文件并进行索引
	 * 
	 * @param writer
	 * @param dir
	 * @throws IOException
	 */
	private static void indexDirectory(IndexWriter writer, File dir)
			throws IOException {

		File[] files = dir.listFiles();

		for (int i = 0; i < files.length; i++) {
			if (files[i].isDirectory()) {
				//递归
				indexDirectory(writer,files[i]);
			} else if (files[i].getName().endsWith(".txt")){
				indexFile(writer,files[i]);
			}
		}
	}

	/**
	 * 对单个txt文件进行索引
	 * 
	 * @param writer
	 * @param f
	 * @throws IOException
	 */
	private static void indexFile(IndexWriter writer, File f)
			throws IOException {

		if (f.isHidden() || !f.exists() || !f.canRead()) {
			return;
		}

		System.out.println("Indexing " + f.getCanonicalPath());
		Document doc = new Document();
		doc.add(new Field("contents", new FileReader(f)));
		doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES,Field.Index.ANALYZED));
		
		/**
		 * Field.Index有五个属性，分别是： 
         * Field.Index.ANALYZED：分词索引 
         * Field.Index.NOT_ANALYZED：分词进行索引，如作者名，日期等，Rod Johnson本身为一单词，不再需要分词。 
         * Field.Index.NO：不进行索引，存放不能被搜索的内容如文档的一些附加属性如文档类型, URL等。 
         * Field.Index.NOT_ANALYZED_NO_NORMS：不使用分词索引，不使用存储规则。 
         * Field.Index.ANALYZED_NO_NORMS：使用分词索引，不使用存储规则。
		 */
		writer.addDocument(doc);
	}

2：运行第二步（搜索索引）,目的是：根据创建索引的字段到所指定的索引文件夹中去寻找要搜索的字符。
简单来说就是：从索引文件中查找你想要得到的信息

package com.rwg.lucene;

import java.io.File;
import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Step 2 of the Lucene 3.0 example: searches the index built in step 1.
 * 
 * @author RenWeigang
 * 
 * @version 2010.12.13
 * 
 */
public class Searcher {
	
	// Directory that holds the index files.
	private static final String INDEX_DIR = "E:\\renwg\\茶余饭后\\新建文件夹";
	
	// Query string to search for.
	private static final String KEYWORD = "青云";
	// Maximum number of hits to collect.
	private static final int TOP_NUM = 100;

	/**
	 * Searches the index in INDEX_DIR for KEYWORD.
	 * 
	 * @param args unused
	 * @throws Exception if the index directory is missing or the search fails
	 */
	public static void main(String[] args) throws Exception {
		File indexDir = new File(INDEX_DIR);
		if (!indexDir.exists() || !indexDir.isDirectory()) {
			throw new Exception(indexDir
					+ " does not exist or is not a directory.");
		}

		search(indexDir, KEYWORD);
	}

	/**
	 * Runs a query against the "contents" field and prints the stored
	 * "filename" of every hit, followed by a summary line.
	 * 
	 * @param indexDir
	 *        directory containing the index
	 * @param q
	 *        query string, parsed with Lucene's QueryParser
	 * @throws Exception if the index cannot be opened, the query cannot be
	 *                   parsed, or the search fails
	 */
	public static void search(File indexDir, String q) throws Exception {
		
		// Unlike IndexWriter, IndexSearcher only needs the index directory.
		// The second argument opens the index read-only.
		IndexSearcher indexSearch = new IndexSearcher(FSDirectory.open(indexDir), true);
		try {
			// Name of the field that was indexed in step 1.
			String field = "contents";

			/*
			 * QueryParser arguments:
			 * 1. the Lucene version to be compatible with;
			 * 2. the default field to search;
			 * 3. the analyzer used to tokenize the query text.
			 */
			QueryParser parser = new QueryParser(Version.LUCENE_30, field,
					new StandardAnalyzer(Version.LUCENE_30));

			Query query = parser.parse(q);

			// Collects at most TOP_NUM hits ordered by score.
			TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);

			long start = System.currentTimeMillis();

			indexSearch.search(query, collector);
			// The collector exposes TopDocs, whose scoreDocs array holds the
			// internal document ids and scores of the hits.
			ScoreDoc[] hits = collector.topDocs().scoreDocs;

			System.out.println("找到了"+hits.length+"个");

			// Load each hit's Document and print its stored file name.
			for (int i = 0; i < hits.length; i++) {
				Document doc = indexSearch.doc(hits[i].doc);
				// doc.get() returns the stored value; getField() would print
				// the Field object's debug representation instead.
				System.out.println(doc.get("filename") + "------------"+ hits[i].toString());
			}

			long end = System.currentTimeMillis();

			System.out.println("Found " + collector.getTotalHits()
					+ " document(s) (in " + (end - start)
					+ " milliseconds) that matched query '" + q + "':");
		} finally {
			// Release the underlying index reader even when the search fails.
			indexSearch.close();
		}
	}
}

把代码贴上来了，大家先熟练的使用后再详细研究它的原理。

lucene-demos-3.0.2.jar (55.7 KB)
下载次数: 695

lucene-core-3.0.2.jar (1008.8 KB)
下载次数: 759

5
顶

4
踩

分享到：

Lucene原理简单介绍 | 物联网入门相关知识

2010-12-13 11:40
浏览 7389
评论(6)
分类:编程语言
查看更多

6 楼 fendou3754 2014-06-28

程序可以运行,不过对于中文的搜索,貌似要将txt文件存为UTF8编码格式才能搜索得到,why???这种东西应该由lucene内部自己解决吧;另外对于英文的搜索,遇到点疑问,txt文件里面有AA这两个字母,为毛一定要搜索AA才能搜索得出,搜索一个A居然没结果;
以上是使用体会；总之程序可以运行,3q

5 楼 ewf_momo 2013-06-02

4 楼 dbh0512 2013-04-22

我的是一段文本每次只能创建一个索引但是搜索不到求解答

3 楼 lyj57 2012-11-21

那个"E:\\renwg\\茶余饭后\\新建文件夹"里存放的是要建立索引的文档吗

2 楼 xutao5641745 2011-03-13

为什么，我用你的代码运行后，生成的文件的内容就是乱码呢？

1 楼 pch272215690 2011-01-05

3.0.2啊，新版本，我喜欢。

发表评论

文章已被作者锁定，不允许评论。

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论