package com.lucene;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.sql.DataSource;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.mapping.Environment;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;
import com.dao.core.SqlSessionSingle;
import com.dao.reptile.WriteAlreadyUrlDao;
import com.reptile.util.GlobalContains;
/**
 * Command-line helper that builds and queries a Lucene (3.x API) full-text index.
 *
 * <p>Records to index are fetched through {@link WriteAlreadyUrlDao}; each record supplies a
 * {@code title}, a {@code url} and the {@code path} of a file whose text becomes the
 * {@code content} field. Tokenization uses {@link IKAnalyzer}.
 */
public class Lucene {

    /**
     * Initializes MyBatis and then indexes every pending record into
     * {@code GlobalContains.index_path}.
     *
     * <p>To run a search instead of (or after) indexing, call for example
     * {@code lucene.indexSearch(GlobalContains.index_path, "title", "111")}; the field name may
     * also be {@code "content"}.
     *
     * @param args unused
     * @throws SQLException if the database connection check in {@link #loadMybatis()} fails
     * @throws InvalidTokenOffsetsException kept for signature compatibility with callers
     */
    public static void main(String[] args) throws SQLException, InvalidTokenOffsetsException {
        Lucene lucene = new Lucene();
        loadMybatis();
        lucene.createIndex(GlobalContains.index_path);
    }

    /**
     * Adds one Lucene document per record returned by the DAO for {@code is_index = "0"}.
     *
     * <p>Records whose {@code path} is missing or does not point to an existing file are
     * reported and skipped. Any other failure is printed and aborts the run; in that case the
     * writer is rolled back so no partial batch is committed and the index write lock is
     * released.
     *
     * <p>NOTE(review): nothing here marks the indexed rows as done ({@code is_index} is only
     * read), so a second run will presumably add the same documents again - confirm whether
     * another component updates that flag.
     *
     * @param indexFile directory on disk that holds (or will hold) the Lucene index
     */
    private void createIndex(String indexFile) {
        Analyzer analyzer = new IKAnalyzer();
        Directory d = null;
        IndexWriter indexWriter = null;
        try {
            WriteAlreadyUrlDao alreadyDao = new WriteAlreadyUrlDao();
            Map paramMap = new HashMap();
            paramMap.put("is_index", "0");
            List list = alreadyDao.queryList(paramMap);
            if (list == null || list.isEmpty()) {
                System.out.println("没有任何数据需要被索引。");
                return;
            }

            d = FSDirectory.open(new File(indexFile));
            IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_33, analyzer);
            indexWriter = new IndexWriter(d, conf);

            for (int i = 0; i < list.size(); i++) {
                Map map = (Map) list.get(i);

                // A null path is treated like a missing file instead of aborting the whole run.
                Object pathValue = map.get("path");
                File ff = pathValue == null ? null : new File(pathValue.toString());
                if (ff == null || !ff.exists()) {
                    System.out.println("文件:" + pathValue + "不存在。");
                    continue;
                }

                String title = textOf(map.get("title"));
                String url = textOf(map.get("url"));

                Document doc = new Document();
                doc.add(new Field("title", title, Store.YES, Index.ANALYZED));
                doc.add(new Field("url", url, Store.YES, Index.ANALYZED));

                // The reader is consumed synchronously by addDocument, so it can be closed
                // right afterwards; closing it here guarantees the file handle is released
                // even when adding the document fails.
                Reader content = new FileReader(ff);
                try {
                    doc.add(new Field("content", content));
                    indexWriter.addDocument(doc);
                } finally {
                    content.close();
                }
                System.out.println(url + "\tcount:" + "\t当前:" + (i + 1) + ",总共:" + list.size());
            }

            // Normal completion: close() commits everything that was added.
            indexWriter.close();
            indexWriter = null;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (indexWriter != null) {
                // Only reached after a failure: discard uncommitted documents and release
                // the write lock so the next run can open the index.
                try {
                    indexWriter.rollback();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (d != null) {
                try {
                    d.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Converts a column value to text, mapping {@code null} to the empty string.
     */
    private static String textOf(Object value) {
        return value == null ? "" : value.toString();
    }

    /**
     * Searches the index and returns the hits with their titles highlighted.
     *
     * <p>At most 1000 top-scoring hits are fetched. For every hit that has a stored
     * {@code title}, a {@code LuceneResultBean} is produced carrying the stored {@code url} and
     * the title with matching terms wrapped in {@code <font color='red'>...</font>}. When the
     * title contains no matching term (for instance when searching the {@code content} field),
     * the plain title is used.
     *
     * @param indexFile directory on disk that holds the Lucene index
     * @param key name of the field to search, e.g. {@code "title"} or {@code "content"}
     * @param keywork the search keywords
     * @return the list of {@code LuceneResultBean} hits, or {@code null} when the query has no
     *         hits or the index cannot be read
     * @throws InvalidTokenOffsetsException if highlighting fails on a title
     */
    public List indexSearch(String indexFile, String key, String keywork) throws InvalidTokenOffsetsException {
        Analyzer analyzer = new IKAnalyzer();
        Directory d = null;
        IndexSearcher isearcher = null;
        try {
            d = FSDirectory.open(new File(indexFile));
            isearcher = new IndexSearcher(d);
            // Score hits with the IK similarity implementation.
            isearcher.setSimilarity(new IKSimilarity());
            Query query = IKQueryParser.parse(key, keywork);
            // Fetch up to 1000 of the best-scoring hits.
            TopDocs topDocs = isearcher.search(query, 1000);
            System.out.println("命中:" + topDocs.totalHits);

            ScoreDoc[] result = topDocs.scoreDocs;
            if (result.length == 0) {
                return null;
            }

            List resultList = new ArrayList();
            for (int i = 0; i < result.length; i++) {
                Document document = isearcher.doc(result[i].doc);
                System.out.println("找到:" + document.get("url") + "\t" + document.get("title"));

                String text = document.get("title");
                System.out.println("key:" + text);
                if (text == null) {
                    continue;
                }

                SimpleHTMLFormatter simpleHTMLFormatter =
                        new SimpleHTMLFormatter("<font color='red'>", "</font>");
                Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
                // One fragment as long as the title, so the whole title is returned.
                highlighter.setTextFragmenter(new SimpleFragmenter(text.length()));
                TokenStream tokenStream = analyzer.tokenStream(key, new StringReader(text));
                String highlighterText = highlighter.getBestFragment(tokenStream, text);
                System.out.println("【高亮显示第】" + (i + 1) + "条,检索结果如下:" + highlighterText);

                LuceneResultBean luceneResultBean = new LuceneResultBean();
                luceneResultBean.setUrl(document.get("url"));
                // getBestFragment yields null when no query term occurs in the title;
                // fall back to the plain title rather than handing callers a null.
                luceneResultBean.setTitle(highlighterText != null ? highlighterText : text);
                resultList.add(luceneResultBean);
            }
            return resultList;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (isearcher != null) {
                try {
                    isearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // The searcher does not own the directory, so it is closed separately.
            if (d != null) {
                try {
                    d.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Loads the MyBatis configuration {@code myBatis3.xml} by hand (no Spring container is
     * involved), verifies that a database connection can be obtained, and publishes the
     * resulting factory through {@code SqlSessionSingle.sqlSession}.
     *
     * @throws SQLException if a connection cannot be obtained from, or returned to, the
     *         configured data source
     * @throws IllegalStateException if the configuration resource cannot be read
     */
    private static void loadMybatis() throws SQLException {
        Reader reader;
        try {
            reader = Resources.getResourceAsReader("myBatis3.xml");
        } catch (IOException e) {
            // Building a factory from a missing configuration cannot succeed; fail here with
            // the real cause instead of continuing with a null reader.
            throw new IllegalStateException("Unable to read MyBatis configuration myBatis3.xml", e);
        }

        SqlSessionFactory sqlSession = new SqlSessionFactoryBuilder().build(reader);
        Environment en = sqlSession.getConfiguration().getEnvironment();
        DataSource ds = en.getDataSource();

        // Connectivity check only: release the connection once it has been reported.
        java.sql.Connection connection = ds.getConnection();
        try {
            System.out.println("连接:" + connection.toString());
        } finally {
            connection.close();
        }

        SqlSessionSingle.sqlSession = sqlSession;
        System.out.println(sqlSession);
    }
}
分享到:
相关推荐
- **Analyzer API**: 如`org.apache.lucene.analysis.Analyzer`,提供了创建自定义分析器的方法,如`createComponents(String fieldName, Reader reader)`,用于创建TokenStream。 - **IndexWriter API**: 如`org....
首先,我们需要导入必要的包,如`org.apache.lucene.analysis.Analyzer`,`org.apache.lucene.document.Document`和`org.apache.lucene.index.IndexWriter`等。接着,我们可以创建一个`IndexWriter`对象,它负责将...
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org...
config.setOpenMode(OpenMode.CREATE_OR_APPEND); // 如果索引不存在则创建,存在则追加 IndexWriter indexWriter = new IndexWriter(directory, config); ``` ### 5. 添加文档到索引 每个文档由一个`Document`...
在`test_index`类中,我们定义了一个静态的`Analyzer`对象`luceneAnalyzer`,初始化为`IKAnalyzer`实例,用于对输入的文本进行分词处理。这里我们选择使用IK分词器,是因为它对中文的支持较为优秀,能更好地理解并...
Lucene.NET的核心组件包括文档(Document)、字段(Field)、索引(Index)和查询(Query)。文档是信息的基本单位,可以包含多个字段,如标题、内容等。字段定义了文档的属性,如是否存储原始值、是否可搜索等。...
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_29); PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(analyzer); perFieldAnalyzer.addAnalyzer("mm", new MMAnalyzer...
IndexWriter(Directory d, Analyzer a, boolean create, IndexWriter.MaxFieldLength mfl) create为true时,原索引文件不存在就创建,存在就覆盖。 create为false时,原索引文件不存在就报错,存在就追加。 b) ...
- 使用`QueryParser`解析查询字符串,`Query query = new QueryParser(Version.LUCENE_21, "content", analyzer).parse(queryString);` - 初始化`IndexSearcher`,执行查询,`TopDocs topDocs = searcher.search...
- **设置 `IndexWriterConfig`**:配置 `IndexWriter` 的行为,如版本控制(`Version.LUCENE_35`)、分词器及写入模式(`OpenMode.CREATE_OR_APPEND`)。 2. **添加文档到索引** - 创建 `Document` 对象并填充...
Lucene的强大之处在于其灵活性和效率,能够处理复杂查询,支持多种查询类型,如`TermQuery`、`BooleanQuery`、`RangeQuery`、`PrefixQuery`、`PhraseQuery`、`FuzzyQuery`、`WildcardQuery`等。理解这些概念和操作,...
Lucene对中文处理需要特殊的Analyzer,如`IKAnalyzer`或`SmartChineseAnalyzer`。这些Analyzer能对中文进行分词,便于索引和搜索。例如: ```java Analyzer analyzer = new IKAnalyzer(); ``` 3. **创建...
Query query = parser.parse("Lucene搜索"); // 初始化IndexSearcher IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory)); // 执行搜索 TopDocs topDocs = searcher.search(query, 10)...
- **建立索引**:运行命令`java org.apache.lucene.demo.IndexHTML -create -index [索引数据存放路径] [被索引文件路径]`,例如`java org.apache.lucene.demo.IndexHTML -create -index D:\lucene\temp\index D:\...
二、创建索引(Create) 创建索引是Lucene的第一步,它涉及到对原始数据的分析和存储。在Java代码中,我们需要定义一个`Document`对象来存储每个要索引的记录。然后,使用`IndexWriter`类来写入这些文档到索引中。...
在“Lucene 实现控制台操作 C/R/U/D”这一主题中,我们将深入探讨如何利用 Lucene 来完成创建(Create)、读取(Read)、更新(Update)和删除(Delete)这四个基本数据库操作在文本搜索中的应用。 1. **创建...
IndexWriterConfig iwc = new IndexWriterConfig(analyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); IndexWriter writer = new IndexWriter(directory, iwc); writer.updateDocument(new Term("path", ...
Analyzer queryAnalyzer = new StandardAnalyzer(); QueryParser parser = new QueryParser("content", queryAnalyzer); Query query = parser.parse("示例内容"); // 构建查询 TopDocs topDocs = searcher.search...
"analyzer": "ik_smart" }, "total_fee": { "type": "keyword" }, "create_time": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss" } } } } ``` - **查询命令**: ```json GET index_name/_...
es.indices.create(index=index_name) # 添加文档 doc = { 'title': 'Elasticsearch: The Definitive Guide', 'author': 'Clinton Gormley', 'summary': 'A comprehensive guide to Elasticsearch.' } res = es...