`
543089122
  • 浏览: 153915 次
  • 性别: Icon_minigender_1
  • 来自: 武汉
社区版块
存档分类
最新评论

lucene create index and analyzer query

阅读更多
package com.lucene;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.sql.DataSource;

import org.apache.ibatis.io.Resources;
import org.apache.ibatis.mapping.Environment;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;

import com.dao.core.SqlSessionSingle;
import com.dao.reptile.WriteAlreadyUrlDao;
import com.reptile.util.GlobalContains;

/**
 * Builds a Lucene index from crawled pages and searches it using the IK analyzer.
 *
 * <p>Pages to index are read through {@link WriteAlreadyUrlDao}; each record is
 * expected to carry {@code path}, {@code title} and {@code url} entries. Search
 * results are returned as {@code LuceneResultBean} instances whose title has the
 * matched terms wrapped in red {@code <font>} tags.
 */
public class Lucene {
	/** Upper bound on the number of hits fetched by a single search. */
	private static final int MAX_HITS = 1000;

	/**
	 * Entry point: loads the MyBatis configuration, then indexes all pending pages
	 * into {@code GlobalContains.index_path}.
	 */
	public static void main(String[] args) throws SQLException, InvalidTokenOffsetsException {
		Lucene lucene = new Lucene();
		loadMybatis();
		lucene.createIndex(GlobalContains.index_path);
		// Example search (second argument is the field name, e.g. "title" or "content"):
		// lucene.indexSearch(GlobalContains.index_path,"title","111");
	}

	/**
	 * Indexes every record returned by the DAO for {@code is_index = "0"}.
	 *
	 * <p>Each record becomes one document with a stored, analyzed {@code title} and
	 * {@code url}, plus an unstored {@code content} field read from the file at
	 * {@code path}. Records whose file does not exist, or that lack one of the three
	 * entries, are skipped. Any other failure is printed and ends the run; the writer
	 * and directory are closed in every case.
	 *
	 * <p>NOTE(review): nothing in this method marks a record as indexed afterwards, so
	 * unless that happens elsewhere a second run will add the same documents again.
	 *
	 * @param indexFile directory in which the index is stored
	 */
	private void createIndex(String indexFile) {
		Analyzer analyzer = new IKAnalyzer();
		Directory d = null;
		IndexWriter indexWriter = null;
		try {
			WriteAlreadyUrlDao alreadyDao = new WriteAlreadyUrlDao();
			// Raw types are kept here because the DAO's signature is declared elsewhere.
			Map paramMap = new HashMap();
			paramMap.put("is_index","0");
			List list = alreadyDao.queryList(paramMap);
			if (list == null || list.isEmpty()) {
				System.out.println("没有任何数据需要被索引。");
				return;
			}

			d = FSDirectory.open(new File(indexFile));
			IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_33, analyzer);
			indexWriter = new IndexWriter(d, conf);

			int total = list.size();
			for (int i = 0; i < total; i++) {
				Map map = (Map) list.get(i);
				Object path = map.get("path");
				Object title = map.get("title");
				Object url = map.get("url");
				if (path == null || title == null || url == null) {
					// One incomplete row must not abort the whole batch.
					System.out.println("记录缺少path/title/url,已跳过:" + map);
					continue;
				}
				File ff = new File(path.toString());
				if (!ff.exists()) {
					System.out.println("文件:"+path+"不存在。");
					continue;
				}
				Document doc = new Document();
				doc.add(new Field("title", title.toString(), Store.YES, Index.ANALYZED));
				doc.add(new Field("url", url.toString(), Store.YES, Index.ANALYZED));
				// NOTE(review): FileReader decodes with the platform default charset;
				// confirm that matches the encoding the crawler wrote the files in.
				Reader content = new FileReader(ff);
				try {
					doc.add(new Field("content", content));
					indexWriter.addDocument(doc);
				} finally {
					closeReader(content);
				}
				System.out.println(url.toString()+"\tcount:"+"\t当前:" + (i + 1)+",总共:"+total);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Close the writer before the directory it writes to.
			if (indexWriter != null) {
				try {
					indexWriter.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			closeDirectory(d);
		}
	}

	/**
	 * Searches the index and returns the hits with highlighted titles.
	 *
	 * @param indexFile directory in which the index is stored
	 * @param key       name of the field to search, e.g. {@code "title"} or {@code "content"}
	 * @param keywork   the text to search for
	 * @return a list of {@code LuceneResultBean}, or {@code null} when there are no
	 *         hits or the index could not be read
	 * @throws InvalidTokenOffsetsException if the highlighter rejects the title's token offsets
	 */
	public List indexSearch(String indexFile,String key,String keywork) throws InvalidTokenOffsetsException {
		Analyzer analyzer = new IKAnalyzer();
		Directory d = null;
		IndexSearcher isearcher = null;
		try {
			d = FSDirectory.open(new File(indexFile));
			isearcher = new IndexSearcher(d);
			// Score hits with the IK similarity implementation.
			isearcher.setSimilarity(new IKSimilarity());
			Query query = IKQueryParser.parse(key, keywork);
			// Fetch at most MAX_HITS of the best-scoring documents.
			TopDocs topDocs = isearcher.search(query, MAX_HITS);
			System.out.println("命中:" + topDocs.totalHits);
			ScoreDoc[] result = topDocs.scoreDocs;
			if (result.length == 0) {
				return null;
			}

			List<LuceneResultBean> resultList = new ArrayList<LuceneResultBean>();
			SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
			for (int i = 0; i < result.length; i++) {
				Document document = isearcher.doc(result[i].doc);
				System.out.println("找到:" + document.get("url")+"\t"+
						document.get("title"));

				String text = document.get("title");
				System.out.println("key:"+text);
				if (text == null) {
					continue;
				}
				Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
				// Use a single fragment spanning the whole title so it is never truncated.
				highlighter.setTextFragmenter(new SimpleFragmenter(text.length()));

				TokenStream tokenStream = analyzer.tokenStream(key, new StringReader(text));
				String highlighterText = highlighter.getBestFragment(tokenStream, text);
				System.out.println("【高亮显示第】"+(i+1)+"条,检索结果如下:"+highlighterText);

				LuceneResultBean luceneResultBean = new LuceneResultBean();
				luceneResultBean.setUrl(document.get("url"));
				// getBestFragment yields null when no query term occurs in the title
				// (typical when searching another field); keep the plain title then.
				luceneResultBean.setTitle(highlighterText != null ? highlighterText : text);

				resultList.add(luceneResultBean);
			}
			return resultList;
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		} finally {
			if (isearcher != null) {
				try {
					isearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			closeDirectory(d);
		}
	}

	/**
	 * Loads {@code myBatis3.xml} from the classpath, builds the session factory and
	 * publishes it through {@code SqlSessionSingle.sqlSession}. Needed because the
	 * configuration is not loaded by Spring here.
	 *
	 * @throws SQLException          if a test connection cannot be opened or closed
	 * @throws IllegalStateException if the configuration resource cannot be read
	 */
	private static void loadMybatis() throws SQLException{
		Reader reader;
		try {
			reader = Resources.getResourceAsReader("myBatis3.xml");
		} catch (IOException e) {
			// Building a factory from a missing configuration cannot succeed; fail fast.
			throw new IllegalStateException("Cannot load MyBatis configuration myBatis3.xml", e);
		}
		SqlSessionFactory sqlSession = new SqlSessionFactoryBuilder()
				.build(reader);
		Environment en = sqlSession.getConfiguration().getEnvironment();
		DataSource ds = en.getDataSource();
		// Open one connection as a connectivity check and release it straight away.
		java.sql.Connection conn = ds.getConnection();
		try {
			System.out.println("连接:" + conn.toString());
		} finally {
			conn.close();
		}
		SqlSessionSingle.sqlSession = sqlSession;
		System.out.println(sqlSession);
	}

	/** Closes a reader, printing rather than propagating any failure. */
	private static void closeReader(Reader reader) {
		try {
			reader.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/** Closes a directory if it was opened, printing rather than propagating any failure. */
	private static void closeDirectory(Directory directory) {
		if (directory == null) {
			return;
		}
		try {
			directory.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

分享到:
评论

相关推荐

    lucene核心资源包以及lucene的api

    - **Analyzer API**: 如`org.apache.lucene.analysis.Analyzer`,提供了创建自定义分析器的方法,如`createComponents(String fieldName, Reader reader)`,用于创建TokenStream。 - **IndexWriter API**: 如`org....

    lucene3 例子

    首先,我们需要导入必要的包,如`org.apache.lucene.analysis.Analyzer`,`org.apache.lucene.document.Document`和`org.apache.lucene.index.IndexWriter`等。接着,我们可以创建一个`IndexWriter`对象,它负责将...

    lucene示例代码

    import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org...

    Lucene索引的基本操作

    config.setOpenMode(OpenMode.CREATE_OR_APPEND); // 如果索引不存在则创建,存在则追加 IndexWriter indexWriter = new IndexWriter(directory, config); ``` ### 5. 添加文档到索引 每个文档由一个`Document`...

    Lucene3.5实例

    在`test_index`类中,我们定义了一个静态的`Analyzer`对象`luceneAnalyzer`,初始化为`IKAnalyzer`实例,用于对输入的文本进行分词处理。这里我们选择使用IK分词器,是因为它对中文的支持较为优秀,能更好地理解并...

    使用lucene.net盘古分词实现站内搜索demo

    Lucene.NET的核心组件包括文档(Document)、字段(Field)、索引(Index)和查询(Query)。文档是信息的基本单位,可以包含多个字段,如标题、内容等。字段定义了文档的属性,如是否存储原始值、是否可搜索等。...

    lucene 分组统计

    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_29); PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(analyzer); perFieldAnalyzer.addAnalyzer("mm", new MMAnalyzer...

    lucene2.9.1所有最新开发包及源码及文档

    IndexWriter(Directory d, Analyzer a, boolean create, IndexWriter.MaxFieldLength mfl) create为true时,原索引文件不存在就创建,存在就覆盖。 create为false时,原索引文件不存在就报错,存在就追加。 b) ...

    lucene 2.1.0 好用实例

    - 使用`QueryParser`解析查询字符串,`Query query = new QueryParser(Version.LUCENE_21, "content", analyzer).parse(queryString);` - 初始化`IndexSearcher`,执行查询,`TopDocs topDocs = searcher.search...

    lucene个人总结

    - **设置 `IndexWriterConfig`**:配置 `IndexWriter` 的行为,如版本控制(`Version.LUCENE_35`)、分词器及写入模式(`OpenMode.CREATE_OR_APPEND`)。 2. **添加文档到索引** - 创建 `Document` 对象并填充...

    lucene搜索过程代码详解

    Lucene的强大之处在于其灵活性和效率,能够处理复杂查询,支持多种查询类型,如`TermQuery`、`BooleanQuery`、`RangeQuery`、`PrefixQuery`、`PhraseQuery`、`FuzzyQuery`、`WildcardQuery`等。理解这些概念和操作,...

    LuceneUtils_lucenejava_全文检索_lucene_

    Lucene对中文处理需要特殊的Analyzer,如`IKAnalyzer`或`SmartChineseAnalyzer`。这些Analyzer能对中文进行分词,便于索引和搜索。例如: ```java Analyzer analyzer = new IKAnalyzer(); ``` 3. **创建...

    Lucene 2.4 入门例子

    Query query = parser.parse("Lucene搜索"); // 初始化IndexSearcher IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory)); // 执行搜索 TopDocs topDocs = searcher.search(query, 10)...

    lucene 教程详解

    - **建立索引**:运行命令`java org.apache.lucene.demo.IndexHTML -create -index [索引数据存放路径] [被索引文件路径]`,例如`java org.apache.lucene.demo.IndexHTML -create -index D:\lucene\temp\index D:\...

    lucene的curd

    二、创建索引(Create) 创建索引是Lucene的第一步,它涉及到对原始数据的分析和存储。在Java代码中,我们需要定义一个`Document`对象来存储每个要索引的记录。然后,使用`IndexWriter`类来写入这些文档到索引中。...

    Lucene 实现控制台操作 C/R/U/D

    在“Lucene 实现控制台操作 C/R/U/D”这一主题中,我们将深入探讨如何利用 Lucene 来完成创建(Create)、读取(Read)、更新(Update)和删除(Delete)这四个基本数据库操作在文本搜索中的应用。 1. **创建...

    Lucene4.X实战类baidu搜索的大型文档海量搜索系统-16.Lucene高级进阶2 共4页.pptx

    IndexWriterConfig iwc = new IndexWriterConfig(analyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); IndexWriter writer = new IndexWriter(directory, iwc); writer.updateDocument(new Term("path", ...

    lucene 建立索引与查询示例

    Analyzer queryAnalyzer = new StandardAnalyzer(); QueryParser parser = new QueryParser("content", queryAnalyzer); Query query = parser.parse("示例内容"); // 构建查询 TopDocs topDocs = searcher.search...

    Elasticsearch(简称ES)是一个基于Apache Lucene(TM)的开源搜索引擎,无论在开源还是专有领域,Luce

    "analyzer": "ik_smart" }, "total_fee": { "type": "keyword" }, "create_time": { "type": "date", "format": "yyyy-MM-dd HH:mm:ss" } } } } ``` - **查询命令**: ```json GET index_name/_...

    elasticsearch数据库下载以及 配置、使用案例,项目代码

    es.indices.create(index=index_name) # 添加文档 doc = { 'title': 'Elasticsearch: The Definitive Guide', 'author': 'Clinton Gormley', 'summary': 'A comprehensive guide to Elasticsearch.' } res = es...

Global site tag (gtag.js) - Google Analytics