- 浏览: 65633 次
- 性别:
- 来自: 上海
文章分类
最新评论
lucene入门到项目开发
加入jar包 lucene-core-2.4.0.jar je-analysis-1.4.0.jar lucene-highlighter-2.4.1.jar lucene-analyzers-2.4.1.jar
先准备下工具类
Java代码
package com.cs.lucene.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
public class File2DocumentUtiles {

    /**
     * Converts a file on disk into a Lucene {@link Document} with four fields:
     * "name" (analyzed), "content" (analyzed), "size" (indexed, not analyzed)
     * and "path" (stored only, not indexed).
     *
     * @param filepath path of the file to convert
     * @return a Document holding the file's name, content, size and path
     */
    public static Document file2Document(String filepath) {
        File file = new File(filepath);
        Document doc = new Document();
        doc.add(new Field("name", file.getName(), Store.YES, Index.ANALYZED)); // indexed and tokenized
        doc.add(new Field("content", readFileContent(file), Store.YES, Index.ANALYZED)); // indexed and tokenized
        // NumberTools pads the long so lexicographic order equals numeric order.
        doc.add(new Field("size", NumberTools.longToString(file.length()), Store.YES, Index.NOT_ANALYZED)); // indexed, not tokenized
        doc.add(new Field("path", file.getPath(), Store.YES, Index.NO)); // stored only, not indexed
        return doc;
    }

    /**
     * Reads the whole text content of a file, line by line.
     * NOTE(review): uses the platform default charset — confirm the indexed
     * files match it, otherwise pass an explicit charset to the reader.
     *
     * @param file file to read
     * @return file content joined with '\n', or null on I/O error
     */
    private static String readFileContent(File file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            StringBuffer content = new StringBuffer();
            for (String line; (line = reader.readLine()) != null;) {
                content.append(line).append("\n");
            }
            return content.toString();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Fix: the original leaked the reader; always close it.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // nothing useful to do if close itself fails
                }
            }
        }
        return null;
    }

    /**
     * Prints the main fields of a Document to stdout (debug helper).
     *
     * @param doc document previously built by {@link #file2Document(String)}
     */
    public static void printDocInfo(Document doc) {
        System.out.println("--------------------------");
        System.out.println("name =" + doc.get("name"));
        System.out.println("content =" + doc.get("content"));
        System.out.println("size =" + NumberTools.stringToLong(doc.get("size")));
        System.out.println("path =" + doc.get("path"));
    }
}
先了解下分词器
Java代码
package com.cs.lucene.analyzer;
import java.io.StringReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
public class AnalyzerTest {

    // Sample text mixing Chinese and ASCII to exercise the tokenizer.
    String text = "资源来自互联网吴朝辉wwwa的a-b放到";

    // JE analyzer — good Chinese word segmentation.
    Analyzer analyzer = new MMAnalyzer();

    @Test
    public void testAnalyze() throws Exception {
        analyze(analyzer, text);
    }

    /**
     * Tokenizes the given text with the given analyzer and prints each token.
     * Fix: the original ignored both parameters and read the fields instead,
     * so callers could never analyze anything but the hard-coded sample.
     *
     * @param analyzer analyzer to tokenize with
     * @param text     text to tokenize
     */
    private void analyze(Analyzer analyzer, String text) throws Exception {
        System.out.println("----------分词器-------------------");
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        for (Token token = new Token(); (token = tokenStream.next(token)) != null;) {
            System.out.println(token);
        }
    }
}
现在看看FSDirectory和RAMDirectory
Java代码
package com.cs.lucene.directory;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;
import com.cs.lucene.utils.File2DocumentUtiles;
public class DirectoryTest {
// Path of the source file used to build the index.
String filePath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网,版权归原创作者或原单位公司所有.txt";
// Directory where the index is stored.
String indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex" ;
// Tokenizer/analyzer.
Analyzer analyzer = new MMAnalyzer(); // JE analyzer
/**
 * Builds an index with FSDirectory.
 * FSDirectory: the index lives on the file system.
 * @throws Exception
 */
@Test
public void testFSDirectory() throws Exception{
// File-system-backed directory.
Directory dir = FSDirectory.getDirectory(indexPath) ;
Document doc = File2DocumentUtiles.file2Document(filePath);
// Without the boolean 'create' argument the writer appends to an existing
// index; MaxFieldLength.LIMITED indexes only the first 10000 terms.
IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; // no 'true' flag: append to index
indexWriter.addDocument(doc) ;
indexWriter.close() ;
}
/**
 * Builds an index with RAMDirectory.
 * RAMDirectory: the index lives in memory.
 * Pro: fast reads.
 * Con: the index is lost when the JVM exits.
 * @throws Exception
 */
@Test
public void testRAMDirectory() throws Exception{
// In-memory directory.
Directory dir = new RAMDirectory() ;
Document doc = File2DocumentUtiles.file2Document(filePath);
// MaxFieldLength.LIMITED indexes only the first 10000 terms.
IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; // no 'true' flag: append to index
indexWriter.addDocument(doc) ;
indexWriter.close() ;
}
/**
 * In practice FSDirectory and RAMDirectory are combined:
 * the in-memory index is fast, so the RAMDirectory is used at runtime,
 * but on shutdown everything must be persisted to the file system,
 * so the FSDirectory is used on exit.
 * @throws Exception
 */
@Test
public void testRAMDirectoryAndFSDirectory() throws Exception{
// Whole flow: load the index from disk into memory on startup, add to it at
// runtime (all updates live in memory), then write everything back on exit.
Directory fsDir = FSDirectory.getDirectory(indexPath) ;
// 1. Load on startup.
Directory ramDir = new RAMDirectory(fsDir) ;
// Work against ramDir at runtime.
IndexWriter ramIndexWriter = new IndexWriter(ramDir,analyzer,MaxFieldLength.LIMITED);
// Add the document.
Document doc = File2DocumentUtiles.file2Document(filePath) ;
ramIndexWriter.addDocument(doc) ;
ramIndexWriter.close() ;// must close before merging so buffered changes are flushed
// 2. Persist on exit.
// 'true' discards the previous index and rewrites it from scratch (default false).
IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,true,MaxFieldLength.LIMITED);
// new Directory[]{ramDir}: the directories to merge in.
// addIndexesNoOptimize: merge without optimizing; searching is somewhat
// slower but the merge uses less storage.
fsIndexWriter.addIndexesNoOptimize(new Directory[]{ramDir}) ;
fsIndexWriter.flush() ; // flush buffered changes before optimizing
fsIndexWriter.optimize() ; // optimize before closing; speeds up later searches
fsIndexWriter.close() ;
}
@Test
public void testOptimize() throws Exception{
Directory fsDir = FSDirectory.getDirectory(indexPath) ;
IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,MaxFieldLength.LIMITED);
fsIndexWriter.optimize() ;
fsIndexWriter.close() ;
}
}
现在来测测索引如何建立以及搜索
Java代码
package com.cs.lucene.lucene;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import com.cs.lucene.utils.File2DocumentUtiles;
public class IndexDao {

    /** Directory where the index is stored. */
    private String indexPath;

    /** Analyzer used both for indexing and for query parsing. */
    private Analyzer analyzer = null;

    public IndexDao() {
        this.indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex";
        this.analyzer = new MMAnalyzer(); // JE analyzer — good Chinese support
    }

    public IndexDao(Analyzer analyzer, String indexPath) {
        this.analyzer = analyzer;
        this.indexPath = indexPath;
    }

    /**
     * Parses a query string over the "name" and "content" fields and runs it.
     *
     * @param queryString user query string
     * @param firstResult zero-based offset of the first hit to return
     * @param maxResults  maximum number of hits to return
     * @return hit count plus the requested page of documents
     */
    public QueryResult search(String queryString, int firstResult,
            int maxResults) throws Exception {
        // 1. Parse the query string into a Query over both fields.
        String[] fields = { "name", "content" };
        // Boosts: a hit in the title should score higher than one in the body.
        Map<String, Float> boosts = new HashMap<String, Float>();
        boosts.put("name", 3.0f);
        boosts.put("content", 1.0f); // default value
        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer,
                boosts); // multi-field search
        Query query = queryParser.parse(queryString);
        return search(query, firstResult, maxResults);
    }

    /**
     * Runs a pre-built Query, highlights the matched keywords in "content",
     * and returns the requested page of results.
     * Fix: the searcher is now released in a finally block — the original
     * leaked one IndexSearcher (and its file handles) per call.
     */
    public QueryResult search(Query query, int firstResult, int maxResults)
            throws Exception {
        IndexSearcher indexSearcher = null;
        try {
            // 2. Run the query.
            indexSearcher = new IndexSearcher(indexPath);
            Filter filter = null; // optional post-filter (re-filters all hits, slow)
            // filter = new RangeFilter("size", NumberTools.longToString(200),
            //         NumberTools.longToString(500), true, true);
            Sort sort = new Sort();
            // Default order is ascending; 'true' switches to descending.
            sort.setSort(new SortField[] { new SortField("size", true) });
            TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort);
            int recordCount = topDocs.totalHits;

            // Prepare the highlighter.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",
                    "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // Show at most 500 characters around the best-matching keywords.
            // (The original comment said 50 but the code has always used 500.)
            Fragmenter fragmenter = new SimpleFragmenter(500);
            highlighter.setTextFragmenter(fragmenter);

            // 3. Collect the requested page of hits.
            List<Document> recordList = new ArrayList<Document>();
            int end = Math.min(firstResult + maxResults, recordCount);
            for (int i = firstResult; i < end; i++) {
                ScoreDoc scoreDoc = topDocs.scoreDocs[i];
                int docSn = scoreDoc.doc; // internal document number
                Document doc = indexSearcher.doc(docSn);
                // Highlight the keywords in the content.
                String hc = highlighter.getBestFragment(analyzer, "content",
                        doc.get("content"));
                // No keyword in content — fall back to the first 200 characters.
                if (hc == null) {
                    String content = doc.get("content");
                    int endIndex = Math.min(200, content.length());
                    hc = content.substring(0, endIndex);
                }
                doc.getField("content").setValue(hc);
                recordList.add(doc);
            }
            // 4. Return the result.
            return new QueryResult(recordCount, recordList);
        } finally {
            // Fix: release the searcher; the original never closed it.
            if (indexSearcher != null) {
                indexSearcher.close();
            }
        }
    }

    /**
     * Indexes a single file and appends it to the index library.
     * Fix: the writer is closed in a finally block so an exception during
     * addDocument/commit/optimize no longer leaves the index write-locked.
     *
     * @param filePath path of the source file to index
     */
    public void save(String filePath) throws Exception {
        Document doc = File2DocumentUtiles.file2Document(filePath);
        // doc.setBoost(1.0f); // per-document boost (1.0f is the default) — not recommended
        // 'false' appends to the existing index instead of recreating it;
        // MaxFieldLength.LIMITED indexes only the first 10000 terms.
        IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, false,
                MaxFieldLength.LIMITED);
        try {
            indexWriter.addDocument(doc);
            indexWriter.commit();
            indexWriter.optimize(); // speeds up later searches
        } finally {
            indexWriter.close();
        }
    }

    /** Convenience overload taking a File. */
    public void save(File file) throws Exception {
        save(file.getAbsolutePath());
    }

    /**
     * Recursively indexes every regular file under the given file or directory.
     * Fix: listFiles() can return null (I/O error, permission denied) — the
     * original would throw a NullPointerException in that case.
     *
     * @param file a file to index, or a directory to walk recursively
     */
    public void saveDirectory(File file) throws Exception {
        if (file.isFile()) { // plain file: index it directly
            save(file.getAbsolutePath());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) { // unreadable directory or I/O error
            return;
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // recurse into subdirectories
                saveDirectory(f);
            } else {
                save(f.getAbsolutePath());
            }
        }
    }

    /**
     * Debug helper: prints the directory tree, indenting "--" per level.
     * Does not touch the index.
     *
     * @param file    root file or directory
     * @param pointer current recursion depth
     */
    public void save(File file, int pointer) throws Exception {
        StringBuffer str = new StringBuffer();
        for (int i = 0; i < pointer; i++) {
            str.append("--");
        }
        if (file.isFile()) {
            System.out.println(str + file.getName());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) { // unreadable directory or I/O error
            return;
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // recurse into subdirectories
                System.out.println(str + f.getName());
                save(f, pointer + 1);
            } else {
                System.out.println(str + f.getName());
            }
        }
    }
}
Java代码
package com.cs.lucene.lucene;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
public class QueryResult {

    /** Total number of documents that matched the query. */
    private int recordCount = 0;

    /** The page of matching documents that was requested. */
    private List<Document> recordResults = new ArrayList<Document>();

    /**
     * @param recordCount   total hit count
     * @param recordResults documents for the requested page
     */
    public QueryResult(int recordCount, List<Document> recordResults) {
        this.recordCount = recordCount;
        this.recordResults = recordResults;
    }

    /** @return the total hit count */
    public int getRecordCount() {
        return recordCount;
    }

    public void setRecordCount(int recordCount) {
        this.recordCount = recordCount;
    }

    /** @return the documents for the requested page */
    public List<Document> getRecordResults() {
        return recordResults;
    }

    public void setRecordResults(List<Document> recordResults) {
        this.recordResults = recordResults;
    }
}
测试索引
Java代码
package com.cs.lucene.lucene;
import java.io.File;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.junit.Test;
import com.cs.lucene.utils.File2DocumentUtiles;
public class IndexDaoTest {

    private IndexDao indexDao = new IndexDao();

    /**
     * Searches the index library and prints every matching document.
     */
    @Test
    public void testSearch() throws Exception {
        String queryString = "www*";
        QueryResult result = indexDao.search(queryString, 0, 10);
        System.out.println("总共有【" + result.getRecordCount() + "】条匹配结果");
        for (Document doc : result.getRecordResults()) {
            File2DocumentUtiles.printDocInfo(doc);
        }
    }

    /**
     * Indexes one source file and saves it into the index library.
     */
    @Test
    public void testSave() throws Exception {
        String filePath2 = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\readme2.txt";
        indexDao.save(filePath2);
    }

    /**
     * Recursively indexes a whole directory into the index library.
     */
    @Test
    public void testSaveDir() throws Exception {
        File root = new File("E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\www");
        indexDao.saveDirectory(root);
    }
}
最后我们来看看lucene的查询功能
Java代码
package com.cs.lucene.query;
import java.util.Date;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.junit.Test;
import com.cs.lucene.lucene.IndexDao;
import com.cs.lucene.lucene.QueryResult;
import com.cs.lucene.utils.File2DocumentUtiles;
public class QueryTest {

    IndexDao indexDao = new IndexDao();

    /**
     * Term query: exact match on a single indexed term.
     */
    @Test
    public void testTermQuery() throws Exception {
        Term term = new Term("name", "资源");
        Query query = new TermQuery(term);
        queryAndPrintResult(query);
    }

    /**
     * Range query over the "size" field.
     * Numbers are stored as strings in the index, so NumberTools is used to
     * pad them into a lexicographically sortable form.
     */
    @Test
    public void testRangeQuery() throws Exception {
        Term lowerTerm = new Term("size", NumberTools.longToString(200));
        Term upperTerm = new Term("size", NumberTools.longToString(500));
        // true = bounds are inclusive
        Query query = new RangeQuery(lowerTerm, upperTerm, true);
        queryAndPrintResult(query);
    }

    /**
     * Demonstrates NumberTools and DateTools conversions.
     */
    @Test
    public void testNumberToolsAndDateTools() throws Exception {
        System.out.println("数字测试:");
        System.out.println(NumberTools.longToString(200));
        System.out.println(NumberTools.longToString(500));
        System.out.println(NumberTools.stringToLong("000000000000dw"));
        System.out.println("日期测试:");
        System.out.println(DateTools.dateToString(new Date(), Resolution.SECOND));
        System.out.println(DateTools.dateToString(new Date(), Resolution.DAY));
        System.out.println(DateTools.stringToDate("20101005080855"));
    }

    /**
     * Wildcard query: '?' matches exactly one character, '*' matches zero or more.
     */
    @Test
    public void testWildcardQuery() throws Exception {
        Term term = new Term("name", "*me");
        Query query = new WildcardQuery(term);
        queryAndPrintResult(query);
    }

    /**
     * Phrase query: matches documents containing several terms near each other.
     */
    @Test
    public void testPhraseQuery() throws Exception {
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));
        // setSlop: maximum number of positions allowed between the terms
        phraseQuery.setSlop(20);
        queryAndPrintResult(phraseQuery);
    }

    /**
     * Boolean query — very important. Three combinations:
     * 1. MUST and MUST: the intersection of both sub-queries.
     * 2. MUST and MUST_NOT: results matching MUST but not MUST_NOT.
     * 3. SHOULD and SHOULD: "or" — the union of all sub-queries.
     * Note: some combinations are meaningless.
     *
     * @throws Exception
     */
    @Test
    public void testBooleanQuery() throws Exception {
        // Clause 1: phrase query.
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));
        phraseQuery.setSlop(20);
        // Clause 2: range query (true = inclusive bounds).
        Term lowerTerm2 = new Term("size", "200");
        Term upperTerm2 = new Term("size", "500");
        Query rangeQuery = new RangeQuery(lowerTerm2, upperTerm2, true);
        // Combine both clauses with AND semantics.
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(phraseQuery, Occur.MUST);
        booleanQuery.add(rangeQuery, Occur.MUST);
        queryAndPrintResult(booleanQuery);
    }

    /**
     * Runs the query through IndexDao and prints every returned document.
     * Renamed from QueryAndPrintResult to follow Java naming conventions
     * (private method; all call sites are inside this class).
     *
     * @param query query to execute
     */
    private void queryAndPrintResult(Query query) throws Exception {
        System.out.println("相对应的查询字符串:" + query);
        QueryResult qr = indexDao.search(query, 0, 100);
        System.out.println("总共有【" + qr.getRecordCount() + "】条匹配结果");
        for (int i = 0; i < qr.getRecordResults().size(); i++) {
            Document doc = qr.getRecordResults().get(i);
            File2DocumentUtiles.printDocInfo(doc);
        }
    }
}
先准备下工具类
Java代码
package com.cs.lucene.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
public class File2DocumentUtiles {

    /**
     * Converts a file on disk into a Lucene {@link Document} with four fields:
     * "name" (analyzed), "content" (analyzed), "size" (indexed, not analyzed)
     * and "path" (stored only, not indexed).
     *
     * @param filepath path of the file to convert
     * @return a Document holding the file's name, content, size and path
     */
    public static Document file2Document(String filepath) {
        File file = new File(filepath);
        Document doc = new Document();
        doc.add(new Field("name", file.getName(), Store.YES, Index.ANALYZED)); // indexed and tokenized
        doc.add(new Field("content", readFileContent(file), Store.YES, Index.ANALYZED)); // indexed and tokenized
        // NumberTools pads the long so lexicographic order equals numeric order.
        doc.add(new Field("size", NumberTools.longToString(file.length()), Store.YES, Index.NOT_ANALYZED)); // indexed, not tokenized
        doc.add(new Field("path", file.getPath(), Store.YES, Index.NO)); // stored only, not indexed
        return doc;
    }

    /**
     * Reads the whole text content of a file, line by line.
     * NOTE(review): uses the platform default charset — confirm the indexed
     * files match it, otherwise pass an explicit charset to the reader.
     *
     * @param file file to read
     * @return file content joined with '\n', or null on I/O error
     */
    private static String readFileContent(File file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            StringBuffer content = new StringBuffer();
            for (String line; (line = reader.readLine()) != null;) {
                content.append(line).append("\n");
            }
            return content.toString();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Fix: the original leaked the reader; always close it.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // nothing useful to do if close itself fails
                }
            }
        }
        return null;
    }

    /**
     * Prints the main fields of a Document to stdout (debug helper).
     *
     * @param doc document previously built by {@link #file2Document(String)}
     */
    public static void printDocInfo(Document doc) {
        System.out.println("--------------------------");
        System.out.println("name =" + doc.get("name"));
        System.out.println("content =" + doc.get("content"));
        System.out.println("size =" + NumberTools.stringToLong(doc.get("size")));
        System.out.println("path =" + doc.get("path"));
    }
}
先了解下分词器
Java代码
package com.cs.lucene.analyzer;
import java.io.StringReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
public class AnalyzerTest {

    // Sample text mixing Chinese and ASCII to exercise the tokenizer.
    String text = "资源来自互联网吴朝辉wwwa的a-b放到";

    // JE analyzer — good Chinese word segmentation.
    Analyzer analyzer = new MMAnalyzer();

    @Test
    public void testAnalyze() throws Exception {
        analyze(analyzer, text);
    }

    /**
     * Tokenizes the given text with the given analyzer and prints each token.
     * Fix: the original ignored both parameters and read the fields instead,
     * so callers could never analyze anything but the hard-coded sample.
     *
     * @param analyzer analyzer to tokenize with
     * @param text     text to tokenize
     */
    private void analyze(Analyzer analyzer, String text) throws Exception {
        System.out.println("----------分词器-------------------");
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        for (Token token = new Token(); (token = tokenStream.next(token)) != null;) {
            System.out.println(token);
        }
    }
}
现在看看FSDirectory和RAMDirectory
Java代码
package com.cs.lucene.directory;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;
import com.cs.lucene.utils.File2DocumentUtiles;
public class DirectoryTest {
// Path of the source file used to build the index.
String filePath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网,版权归原创作者或原单位公司所有.txt";
// Directory where the index is stored.
String indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex" ;
// Tokenizer/analyzer.
Analyzer analyzer = new MMAnalyzer(); // JE analyzer
/**
 * Builds an index with FSDirectory.
 * FSDirectory: the index lives on the file system.
 * @throws Exception
 */
@Test
public void testFSDirectory() throws Exception{
// File-system-backed directory.
Directory dir = FSDirectory.getDirectory(indexPath) ;
Document doc = File2DocumentUtiles.file2Document(filePath);
// Without the boolean 'create' argument the writer appends to an existing
// index; MaxFieldLength.LIMITED indexes only the first 10000 terms.
IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; // no 'true' flag: append to index
indexWriter.addDocument(doc) ;
indexWriter.close() ;
}
/**
 * Builds an index with RAMDirectory.
 * RAMDirectory: the index lives in memory.
 * Pro: fast reads.
 * Con: the index is lost when the JVM exits.
 * @throws Exception
 */
@Test
public void testRAMDirectory() throws Exception{
// In-memory directory.
Directory dir = new RAMDirectory() ;
Document doc = File2DocumentUtiles.file2Document(filePath);
// MaxFieldLength.LIMITED indexes only the first 10000 terms.
IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; // no 'true' flag: append to index
indexWriter.addDocument(doc) ;
indexWriter.close() ;
}
/**
 * In practice FSDirectory and RAMDirectory are combined:
 * the in-memory index is fast, so the RAMDirectory is used at runtime,
 * but on shutdown everything must be persisted to the file system,
 * so the FSDirectory is used on exit.
 * @throws Exception
 */
@Test
public void testRAMDirectoryAndFSDirectory() throws Exception{
// Whole flow: load the index from disk into memory on startup, add to it at
// runtime (all updates live in memory), then write everything back on exit.
Directory fsDir = FSDirectory.getDirectory(indexPath) ;
// 1. Load on startup.
Directory ramDir = new RAMDirectory(fsDir) ;
// Work against ramDir at runtime.
IndexWriter ramIndexWriter = new IndexWriter(ramDir,analyzer,MaxFieldLength.LIMITED);
// Add the document.
Document doc = File2DocumentUtiles.file2Document(filePath) ;
ramIndexWriter.addDocument(doc) ;
ramIndexWriter.close() ;// must close before merging so buffered changes are flushed
// 2. Persist on exit.
// 'true' discards the previous index and rewrites it from scratch (default false).
IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,true,MaxFieldLength.LIMITED);
// new Directory[]{ramDir}: the directories to merge in.
// addIndexesNoOptimize: merge without optimizing; searching is somewhat
// slower but the merge uses less storage.
fsIndexWriter.addIndexesNoOptimize(new Directory[]{ramDir}) ;
fsIndexWriter.flush() ; // flush buffered changes before optimizing
fsIndexWriter.optimize() ; // optimize before closing; speeds up later searches
fsIndexWriter.close() ;
}
@Test
public void testOptimize() throws Exception{
Directory fsDir = FSDirectory.getDirectory(indexPath) ;
IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,MaxFieldLength.LIMITED);
fsIndexWriter.optimize() ;
fsIndexWriter.close() ;
}
}
现在来测测索引如何建立以及搜索
Java代码
package com.cs.lucene.lucene;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import com.cs.lucene.utils.File2DocumentUtiles;
public class IndexDao {

    /** Directory where the index is stored. */
    private String indexPath;

    /** Analyzer used both for indexing and for query parsing. */
    private Analyzer analyzer = null;

    public IndexDao() {
        this.indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex";
        this.analyzer = new MMAnalyzer(); // JE analyzer — good Chinese support
    }

    public IndexDao(Analyzer analyzer, String indexPath) {
        this.analyzer = analyzer;
        this.indexPath = indexPath;
    }

    /**
     * Parses a query string over the "name" and "content" fields and runs it.
     *
     * @param queryString user query string
     * @param firstResult zero-based offset of the first hit to return
     * @param maxResults  maximum number of hits to return
     * @return hit count plus the requested page of documents
     */
    public QueryResult search(String queryString, int firstResult,
            int maxResults) throws Exception {
        // 1. Parse the query string into a Query over both fields.
        String[] fields = { "name", "content" };
        // Boosts: a hit in the title should score higher than one in the body.
        Map<String, Float> boosts = new HashMap<String, Float>();
        boosts.put("name", 3.0f);
        boosts.put("content", 1.0f); // default value
        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer,
                boosts); // multi-field search
        Query query = queryParser.parse(queryString);
        return search(query, firstResult, maxResults);
    }

    /**
     * Runs a pre-built Query, highlights the matched keywords in "content",
     * and returns the requested page of results.
     * Fix: the searcher is now released in a finally block — the original
     * leaked one IndexSearcher (and its file handles) per call.
     */
    public QueryResult search(Query query, int firstResult, int maxResults)
            throws Exception {
        IndexSearcher indexSearcher = null;
        try {
            // 2. Run the query.
            indexSearcher = new IndexSearcher(indexPath);
            Filter filter = null; // optional post-filter (re-filters all hits, slow)
            // filter = new RangeFilter("size", NumberTools.longToString(200),
            //         NumberTools.longToString(500), true, true);
            Sort sort = new Sort();
            // Default order is ascending; 'true' switches to descending.
            sort.setSort(new SortField[] { new SortField("size", true) });
            TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort);
            int recordCount = topDocs.totalHits;

            // Prepare the highlighter.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",
                    "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // Show at most 500 characters around the best-matching keywords.
            // (The original comment said 50 but the code has always used 500.)
            Fragmenter fragmenter = new SimpleFragmenter(500);
            highlighter.setTextFragmenter(fragmenter);

            // 3. Collect the requested page of hits.
            List<Document> recordList = new ArrayList<Document>();
            int end = Math.min(firstResult + maxResults, recordCount);
            for (int i = firstResult; i < end; i++) {
                ScoreDoc scoreDoc = topDocs.scoreDocs[i];
                int docSn = scoreDoc.doc; // internal document number
                Document doc = indexSearcher.doc(docSn);
                // Highlight the keywords in the content.
                String hc = highlighter.getBestFragment(analyzer, "content",
                        doc.get("content"));
                // No keyword in content — fall back to the first 200 characters.
                if (hc == null) {
                    String content = doc.get("content");
                    int endIndex = Math.min(200, content.length());
                    hc = content.substring(0, endIndex);
                }
                doc.getField("content").setValue(hc);
                recordList.add(doc);
            }
            // 4. Return the result.
            return new QueryResult(recordCount, recordList);
        } finally {
            // Fix: release the searcher; the original never closed it.
            if (indexSearcher != null) {
                indexSearcher.close();
            }
        }
    }

    /**
     * Indexes a single file and appends it to the index library.
     * Fix: the writer is closed in a finally block so an exception during
     * addDocument/commit/optimize no longer leaves the index write-locked.
     *
     * @param filePath path of the source file to index
     */
    public void save(String filePath) throws Exception {
        Document doc = File2DocumentUtiles.file2Document(filePath);
        // doc.setBoost(1.0f); // per-document boost (1.0f is the default) — not recommended
        // 'false' appends to the existing index instead of recreating it;
        // MaxFieldLength.LIMITED indexes only the first 10000 terms.
        IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, false,
                MaxFieldLength.LIMITED);
        try {
            indexWriter.addDocument(doc);
            indexWriter.commit();
            indexWriter.optimize(); // speeds up later searches
        } finally {
            indexWriter.close();
        }
    }

    /** Convenience overload taking a File. */
    public void save(File file) throws Exception {
        save(file.getAbsolutePath());
    }

    /**
     * Recursively indexes every regular file under the given file or directory.
     * Fix: listFiles() can return null (I/O error, permission denied) — the
     * original would throw a NullPointerException in that case.
     *
     * @param file a file to index, or a directory to walk recursively
     */
    public void saveDirectory(File file) throws Exception {
        if (file.isFile()) { // plain file: index it directly
            save(file.getAbsolutePath());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) { // unreadable directory or I/O error
            return;
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // recurse into subdirectories
                saveDirectory(f);
            } else {
                save(f.getAbsolutePath());
            }
        }
    }

    /**
     * Debug helper: prints the directory tree, indenting "--" per level.
     * Does not touch the index.
     *
     * @param file    root file or directory
     * @param pointer current recursion depth
     */
    public void save(File file, int pointer) throws Exception {
        StringBuffer str = new StringBuffer();
        for (int i = 0; i < pointer; i++) {
            str.append("--");
        }
        if (file.isFile()) {
            System.out.println(str + file.getName());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) { // unreadable directory or I/O error
            return;
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // recurse into subdirectories
                System.out.println(str + f.getName());
                save(f, pointer + 1);
            } else {
                System.out.println(str + f.getName());
            }
        }
    }
}
Java代码
package com.cs.lucene.lucene;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
public class QueryResult {

    /** Total number of documents that matched the query. */
    private int recordCount = 0;

    /** The page of matching documents that was requested. */
    private List<Document> recordResults = new ArrayList<Document>();

    /**
     * @param recordCount   total hit count
     * @param recordResults documents for the requested page
     */
    public QueryResult(int recordCount, List<Document> recordResults) {
        this.recordCount = recordCount;
        this.recordResults = recordResults;
    }

    /** @return the total hit count */
    public int getRecordCount() {
        return recordCount;
    }

    public void setRecordCount(int recordCount) {
        this.recordCount = recordCount;
    }

    /** @return the documents for the requested page */
    public List<Document> getRecordResults() {
        return recordResults;
    }

    public void setRecordResults(List<Document> recordResults) {
        this.recordResults = recordResults;
    }
}
测试索引
Java代码
package com.cs.lucene.lucene;
import java.io.File;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.junit.Test;
import com.cs.lucene.utils.File2DocumentUtiles;
public class IndexDaoTest {

    private IndexDao indexDao = new IndexDao();

    /**
     * Searches the index library and prints every matching document.
     */
    @Test
    public void testSearch() throws Exception {
        String queryString = "www*";
        QueryResult result = indexDao.search(queryString, 0, 10);
        System.out.println("总共有【" + result.getRecordCount() + "】条匹配结果");
        for (Document doc : result.getRecordResults()) {
            File2DocumentUtiles.printDocInfo(doc);
        }
    }

    /**
     * Indexes one source file and saves it into the index library.
     */
    @Test
    public void testSave() throws Exception {
        String filePath2 = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\readme2.txt";
        indexDao.save(filePath2);
    }

    /**
     * Recursively indexes a whole directory into the index library.
     */
    @Test
    public void testSaveDir() throws Exception {
        File root = new File("E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\www");
        indexDao.saveDirectory(root);
    }
}
最后我们来看看lucene的查询功能
Java代码
package com.cs.lucene.query;
import java.util.Date;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.junit.Test;
import com.cs.lucene.lucene.IndexDao;
import com.cs.lucene.lucene.QueryResult;
import com.cs.lucene.utils.File2DocumentUtiles;
public class QueryTest {

    IndexDao indexDao = new IndexDao();

    /**
     * Term query: exact match on a single indexed term.
     */
    @Test
    public void testTermQuery() throws Exception {
        Term term = new Term("name", "资源");
        Query query = new TermQuery(term);
        queryAndPrintResult(query);
    }

    /**
     * Range query over the "size" field.
     * Numbers are stored as strings in the index, so NumberTools is used to
     * pad them into a lexicographically sortable form.
     */
    @Test
    public void testRangeQuery() throws Exception {
        Term lowerTerm = new Term("size", NumberTools.longToString(200));
        Term upperTerm = new Term("size", NumberTools.longToString(500));
        // true = bounds are inclusive
        Query query = new RangeQuery(lowerTerm, upperTerm, true);
        queryAndPrintResult(query);
    }

    /**
     * Demonstrates NumberTools and DateTools conversions.
     */
    @Test
    public void testNumberToolsAndDateTools() throws Exception {
        System.out.println("数字测试:");
        System.out.println(NumberTools.longToString(200));
        System.out.println(NumberTools.longToString(500));
        System.out.println(NumberTools.stringToLong("000000000000dw"));
        System.out.println("日期测试:");
        System.out.println(DateTools.dateToString(new Date(), Resolution.SECOND));
        System.out.println(DateTools.dateToString(new Date(), Resolution.DAY));
        System.out.println(DateTools.stringToDate("20101005080855"));
    }

    /**
     * Wildcard query: '?' matches exactly one character, '*' matches zero or more.
     */
    @Test
    public void testWildcardQuery() throws Exception {
        Term term = new Term("name", "*me");
        Query query = new WildcardQuery(term);
        queryAndPrintResult(query);
    }

    /**
     * Phrase query: matches documents containing several terms near each other.
     */
    @Test
    public void testPhraseQuery() throws Exception {
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));
        // setSlop: maximum number of positions allowed between the terms
        phraseQuery.setSlop(20);
        queryAndPrintResult(phraseQuery);
    }

    /**
     * Boolean query — very important. Three combinations:
     * 1. MUST and MUST: the intersection of both sub-queries.
     * 2. MUST and MUST_NOT: results matching MUST but not MUST_NOT.
     * 3. SHOULD and SHOULD: "or" — the union of all sub-queries.
     * Note: some combinations are meaningless.
     *
     * @throws Exception
     */
    @Test
    public void testBooleanQuery() throws Exception {
        // Clause 1: phrase query.
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));
        phraseQuery.setSlop(20);
        // Clause 2: range query (true = inclusive bounds).
        Term lowerTerm2 = new Term("size", "200");
        Term upperTerm2 = new Term("size", "500");
        Query rangeQuery = new RangeQuery(lowerTerm2, upperTerm2, true);
        // Combine both clauses with AND semantics.
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(phraseQuery, Occur.MUST);
        booleanQuery.add(rangeQuery, Occur.MUST);
        queryAndPrintResult(booleanQuery);
    }

    /**
     * Runs the query through IndexDao and prints every returned document.
     * Renamed from QueryAndPrintResult to follow Java naming conventions
     * (private method; all call sites are inside this class).
     *
     * @param query query to execute
     */
    private void queryAndPrintResult(Query query) throws Exception {
        System.out.println("相对应的查询字符串:" + query);
        QueryResult qr = indexDao.search(query, 0, 100);
        System.out.println("总共有【" + qr.getRecordCount() + "】条匹配结果");
        for (int i = 0; i < qr.getRecordResults().size(); i++) {
            Document doc = qr.getRecordResults().get(i);
            File2DocumentUtiles.printDocInfo(doc);
        }
    }
}
相关推荐
`lucene入门小实例.txt` 文件中可能包含了一个简单的Lucene使用示例,例如: 1. 创建 `Directory` 对象,比如使用 `FSDirectory.open()` 打开一个文件系统的目录来存储索引。 2. 实例化 `Analyzer`,如使用 `...
Lucene 是一个高性能、全文本搜索库,由 Apache 软件基金会开发。它提供了完整的搜索功能,包括索引、查询、评分等,广泛应用于各种项目和产品中。在这个入门案例中,我们将深入理解如何使用 Lucene 3.6 版本来构建...
**Lucene入门学习文档** **一、什么是Lucene** Lucene是Apache软件基金会下的一个开源全文检索库,它提供了一个高性能、可扩展的信息检索服务。Lucene最初由Doug Cutting开发,现在已经成为Java社区中事实上的标准...
【Lucene】Lucene入门心得 Lucene是一个高性能、全文本搜索库,由Apache软件基金会开发,被广泛应用于各种搜索引擎的构建。它提供了一个简单的API,使得开发者可以方便地在自己的应用程序中集成全文检索功能。...
**正文** Lucene.Net是一个基于Apache Lucene的开源全文搜索引擎库,它被移植到...教程可能涵盖从安装步骤、基本概念介绍,到实战案例的详细讲解,帮助初学者快速入门并熟练运用Lucene.Net进行全文搜索引擎的开发。
Lucene常被集成到各种项目中,例如Elasticsearch就是一个基于Lucene的分布式搜索引擎。开发者也可以利用工具如Solr来简化Lucene的使用。在给定的标签“工具”中,可能是指使用Lucene作为开发搜索引擎的基础工具。 ...
**Lucene 入门指南** Lucene 是一个高性能、全文本搜索库,由 Apache 软件基金会开发并维护。它是 Java 开发人员用来构建搜索引擎应用程序的基础工具。本指南将帮助初学者理解 Lucene 的核心概念,以及如何利用它来...
【标题】"Lucene全文检索入门项目 Java实现Maven项目 Elasticsearch 基础实战" 提供了一个学习如何在Java环境中运用全文检索技术的起点。这个项目涵盖了两个主要的开源工具:Lucene和Elasticsearch,它们都是业界...
doc.add(new Field("title", "Lucene入门", Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("content", "这是Lucene 3.0的实例", Field.Store.YES, Field.Index.ANALYZED)); ``` 3. **查询...
总的来说,王学松的“Lucene+Nutch搜索引擎开发实例代码”是一份宝贵的教育资源,它可以帮助开发者快速入门搜索引擎开发,并深入了解这两个项目的内部工作机制。通过实践这些代码,不仅可以提升技术能力,还能为构建...
Lucene 是一个高性能、全文本搜索库,由 Apache 软件基金会开发并维护。它是 Java 编写的,但提供了多种语言的接口,包括 Python、.NET 和 PHP 等。Lucene 提供了完整的搜索功能,包括索引创建、查询解析、评分和...
在《开发自己的搜索引擎》一书中,通过`ch2-lucene入门小例子`,读者可以了解到如何使用Lucene 2.0创建简单的搜索引擎,例如建立索引、执行搜索等基本操作。而`myReserch-可用的网络搜索引擎`可能包含一个完整的搜索...
### Lucene入门指南 #### 一、Lucene简介 **Lucene** 是一款高性能的全文检索引擎工具包,由 **Apache 软件基金会** 的 **Jakarta 项目组** 开发并维护。作为一款完全开放源代码的工具,Lucene 提供了一系列的功能...