Lucene5学习之创建索引入门示例

lxwt909

浏览: 576720 次
性别:
来自: 北京

最近访客更多访客>>

akingde

chenghu209

14252316

yinxin2745154

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Lucene

Lucene Maven

Lucene更新实在太快了，只好紧跟脚步开始学习Lucene5，花了点时间写了一个demo：程序根据用户提供的一个文件夹，遍历该文件夹下的所有文件，然后读取文件里的内容写入索引。读取文件部分采用的是JDK7引入的NIO.2 API（java.nio.file），因此JDK必须使用1.7及以上版本。Lucene5开发压缩包请在Lucene官网下载。不多说了，对于码农来说，最直接的就是上代码。

package com.yida.framework.lucene5.core;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Reads files from disk and writes them into a Lucene index.
 *
 * <p>Each file becomes one document with three fields: {@code path}
 * (stored, not tokenized), {@code modified} (last-modified time in
 * milliseconds, not stored) and {@code contents} (tokenized from the file
 * decoded as UTF-8; built from a {@code Reader}, so the text itself is not
 * stored in the index and cannot be read back from search hits).
 *
 * @author Lanxiaowei
 */
public class IndexFile {
	/**
	 * Indexes the hard-coded directory {@code D:/docPath} into
	 * {@code D:/lucenedir}.
	 *
	 * @param args ignored
	 * @throws IOException if reading the files or writing the index fails
	 */
	public static void main(String[] args) throws IOException {
		String dirPath = "D:/docPath";
		String indexPath = "D:/lucenedir";
		createIndex(dirPath, indexPath);
	}

	/**
	 * Indexes {@code dirPath} into {@code indexPath}, opening an existing
	 * index if there is one or creating it otherwise.
	 *
	 * @param dirPath   file or directory whose content should be indexed
	 * @param indexPath directory in which the index is stored
	 * @throws IOException if reading the files or writing the index fails
	 */
	public static void createIndex(String dirPath, String indexPath) throws IOException {
		createIndex(dirPath, indexPath, false);
	}

	/**
	 * Indexes {@code dirPath} into {@code indexPath} and prints the elapsed
	 * time.
	 *
	 * @param dirPath        file or directory whose content should be indexed
	 * @param indexPath      directory in which the index is stored
	 * @param createOrAppend {@code true} to open the writer in
	 *                       {@code OpenMode.CREATE}; {@code false} to open it
	 *                       in {@code OpenMode.CREATE_OR_APPEND}
	 * @throws IOException if reading the files or writing the index fails
	 */
	public static void createIndex(String dirPath, String indexPath,
			boolean createOrAppend) throws IOException {
		long start = System.currentTimeMillis();
		Path docDirPath = Paths.get(dirPath);
		Analyzer analyzer = new StandardAnalyzer();
		IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
		indexWriterConfig.setOpenMode(createOrAppend
				? IndexWriterConfig.OpenMode.CREATE
				: IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

		// try-with-resources closes the writer first and then the directory,
		// also when indexing fails half-way, so neither is leaked.
		try (Directory dir = FSDirectory.open(Paths.get(indexPath));
				IndexWriter writer = new IndexWriter(dir, indexWriterConfig)) {
			indexDocs(writer, docDirPath);
		}
		long end = System.currentTimeMillis();
		System.out.println("Time consumed:" + (end - start) + " ms");
	}

	/**
	 * Indexes a single file, or every file below a directory, and commits the
	 * writer once when all documents have been handed over.
	 *
	 * <p>If indexing fails part-way, the documents written so far are left
	 * uncommitted in {@code writer}; the caller decides what to do with them.
	 *
	 * @param writer
	 *            index writer that receives the documents
	 * @param path
	 *            file or directory to index
	 * @throws IOException if a file cannot be read or the index cannot be
	 *             written
	 */
	public static void indexDocs(final IndexWriter writer, Path path)
			throws IOException {
		if (Files.isDirectory(path)) {
			System.out.println("directory");
			Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
				@Override
				public FileVisitResult visitFile(Path file,
						BasicFileAttributes attrs) throws IOException {
					System.out.println(file.getFileName());
					writeDocument(writer, file, attrs.lastModifiedTime().toMillis());
					return FileVisitResult.CONTINUE;
				}
			});
		} else {
			writeDocument(writer, path,
					Files.getLastModifiedTime(path).toMillis());
		}
		// One commit for the whole batch instead of one per visited file.
		writer.commit();
	}

	/**
	 * Indexes one file and commits the writer.
	 *
	 * @param writer
	 *            index writer that receives the document
	 * @param file
	 *            file to index
	 * @param lastModified
	 *            last-modified time of the file, in milliseconds
	 * @throws IOException if the file cannot be read or the index cannot be
	 *             written
	 */
	public static void indexDoc(IndexWriter writer, Path file, long lastModified)
			throws IOException {
		writeDocument(writer, file, lastModified);
		writer.commit();
	}

	/**
	 * Builds the document for one file and adds or updates it, without
	 * committing.
	 */
	private static void writeDocument(IndexWriter writer, Path file,
			long lastModified) throws IOException {
		// The document must be written inside the try block: the "contents"
		// field wraps the stream in a Reader that is only consumed while the
		// writer processes the document.
		try (InputStream stream = Files.newInputStream(file)) {
			Document doc = new Document();
			doc.add(new StringField("path", file.toString(), Field.Store.YES));
			doc.add(new LongField("modified", lastModified, Field.Store.NO));
			doc.add(new TextField("contents", new BufferedReader(
					new InputStreamReader(stream, StandardCharsets.UTF_8))));

			if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
				System.out.println("adding " + file);
				writer.addDocument(doc);
			} else {
				// The index may already hold this file: replace the document
				// whose "path" term matches instead of adding a duplicate.
				System.out.println("updating " + file);
				writer.updateDocument(new Term("path", file.toString()), doc);
			}
		}
	}
}

项目采用的是Maven构建，怎么创建Maven Project就不用介绍了吧，我就贴下pom配置吧。

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<!-- Maven build descriptor for the Lucene 5 demo project. -->
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.yida.framework</groupId>
	<artifactId>lucene5</artifactId>
	<!-- NOTE(review): packaged as a war although the only class shown in the article is a command-line main(); confirm this is intended. -->
	<packaging>war</packaging>
	<version>1.0</version>
	<name>lucene5 Maven Webapp</name>
	<url>http://maven.apache.org</url>
	
	<properties>
	    <!-- Single place to set the version shared by every Lucene artifact below. -->
	    <lucene.version>5.0.0</lucene.version>
	</properties>
	
	<dependencies>
		<!-- Test scope only; not part of the packaged artifact. -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<scope>test</scope>
		</dependency>
		<!-- Lucene modules; each one takes its version from ${lucene.version}. -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analyzers-common</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<!-- The IndexFile class shown in the article imports nothing from the query parser or highlighter packages; presumably these two are for later search examples (verify). -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queryparser</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-highlighter</artifactId>
			<version>${lucene.version}</version>
		</dependency>
	</dependencies>
	<build>
		<!-- Name of the built artifact, without a version suffix. -->
		<finalName>lucene5</finalName>
	</build>
</project>

项目结构图如图：

运行之前，先在D盘新建两个文件夹，如图：

然后在docPath文件夹里随便放几个文本文件，如图：

然后运行IndexFile类的main方法，就会在lucenedir文件夹下创建索引。

代码很简单，没什么需要过多解释的，demo源码请在附件里下载。

希望能对大家学习Lucene有所帮助，其次也算是对自己学习轨迹的一个记录，写博客这个习惯我会努力保持下去。

若你还有什么疑问，请加我Ｑ－Ｑ：７－３－６－０－３－１－３－０－５，或者加裙：

，欢迎你加入一起交流学习。

lucene5-demo1.rar (11.4 KB)
下载次数: 208

查看图片附件

分享到：

Lucene5学习之分页查询 | ExtJS5学习之TreePanel(续)

2015-03-16 20:49
浏览 5849
评论(3)
分类:编程语言
查看更多

3 楼徐小白520 2015-12-05

不知道你这篇Lucene5学习之分页查询是否基于这个IndexFile 产生的索引文件。我想说的是

doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

这个不会把文件内容存储到索引文件中，也就是查询索引文件，输出contents内容会为null。
如果你们继续学博主下一篇文章，你们要注意，不要奇怪为什么输出结果不一样。

2 楼 majiedota 2015-06-29

1 楼 rubricate 2015-03-31

终于看到写lucene5的牛人了
欢迎加入我们的elasticsearch群 211682609
还有我们的问答社区 http://elasticsearch.cn

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论