Lucene入门，小例子，笔记

liangjian103

浏览: 177963 次
性别:
来自: 北京

最近访客更多访客>>

belle-liang

java_byh

kasuo123

Geeweir

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

lucene Apache 搜索引擎

最近在研究Lucene的用法，经过这两天的努力，在网上搜索资料，还专门买了本书《开发自己的搜索引擎---Lucene+Heritrix》，打算系统地学习一下这东西，大的项目是肯定离不开搜索引擎的，学吧，没错~ 这两天有过无助、有过失落，也有过新发现时的欣喜若狂，总之最后还是做出了个小例子，怕以后再忘记，还是记录一下吧~也记录自己的成长，只有把学到的东西讲给别人，才算是真的会了，此例子也献给那些正在搜索Lucene资料的朋友们吧~愿对你们有所帮助~

好了先贴代码吧！

主类：TestIndex.java

package com.lj.test;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import com.lj.entity.Product;
import com.lj.util.Configuration;

/**
 * Introductory Lucene example: builds an index of {@link Product} records, runs a
 * multi-field keyword search against it and highlights the matches.
 * @author LiangJian
 * 2011-6-17 11:56:14
 */
public class TestIndex {

    /**
     * Builds a fresh Lucene index at {@code indexPath} containing one document per product.
     *
     * @param indexPath   directory in which the index is created (an existing index is overwritten)
     * @param productList products to index
     * @throws Exception if the index cannot be created or written
     */
    public void createIndex(String indexPath,List<Product> productList) throws Exception{
        long startTime = System.currentTimeMillis();
        // create=true starts a new index; text is tokenized with the Paoding Chinese analyzer.
        IndexWriter indexWriter = new IndexWriter(indexPath, new PaodingAnalyzer(), true, MaxFieldLength.LIMITED);
        try {
            /*
             * Reference: the optional Field.TermVector argument (not used below) controls term vectors:
             *   Field.TermVector.NO                     - do not store term vectors
             *   Field.TermVector.YES                    - store term vectors
             *   Field.TermVector.WITH_POSITIONS         - store term vectors plus token positions
             *   Field.TermVector.WITH_OFFSETS           - store term vectors plus token offsets
             *   Field.TermVector.WITH_POSITIONS_OFFSETS - store term vectors plus positions and offsets
             */
            for (Product product : productList) {
                Document doc = new Document();
                // p_id and p_price are stored for retrieval only (Index.NO: not searchable).
                doc.add(new Field("p_id", product.getP_id() + "", Field.Store.YES, Field.Index.NO));
                doc.add(new Field("p_name", product.getP_name(), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("p_price", product.getP_price(), Field.Store.YES, Field.Index.NO));
                doc.add(new Field("p_content", product.getP_content(), Field.Store.YES, Field.Index.ANALYZED));
                indexWriter.addDocument(doc);
            }
            // optimize() merges index segments to speed up later searches. It is optional:
            // the added documents become searchable once the writer is closed.
            indexWriter.optimize();
        } finally {
            // Always close the writer so the index write lock is released even when indexing fails.
            indexWriter.close();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("这花费了 " + (endTime - startTime)+ "毫秒来把数据增加到索引里面去!");
    }

    /**
     * Searches the {@code p_content} and {@code p_name} fields for the given keyword(s)
     * and returns at most the top 1000 matching products. The returned content is the
     * best highlighted fragment of {@code p_content}.
     *
     * @param indexPath path of the index directory
     * @param keyword   query text
     * @return matching products in the order returned by the searcher; empty if nothing matches
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    public List<Product> searchByKeyWord(String indexPath,String keyword)throws Exception{
        List<Product> productList = new ArrayList<Product>();
        IndexSearcher search = new IndexSearcher(indexPath);

        long startTime = System.currentTimeMillis();
        try {
            // Fields to search in.
            String[] fields = new String[]{"p_content","p_name"};
            // One Occur flag per field describes how the per-field clauses are combined:
            // MUST = and, MUST_NOT = not, SHOULD = or.
            BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD,BooleanClause.Occur.SHOULD};
            // Use the same Paoding analyzer as at index time so query terms match indexed terms.
            Analyzer analyzer = new PaodingAnalyzer();
            Query query = MultiFieldQueryParser.parse(keyword, fields, clauses, analyzer);
            Filter filter = null; // no additional filtering
            TopDocs topDocs = search.search(query, filter, 1000);
            System.out.println("共匹配到："+topDocs.totalHits+"个.");

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = search.doc(scoreDoc.doc);
                Product product = new Product();
                product.setP_id(Integer.parseInt(doc.get("p_id")));
                product.setP_name(doc.get("p_name"));
                product.setP_price(doc.get("p_price"));
                // For the raw, non-highlighted text use doc.get("p_content") instead.
                product.setP_content(this.getHighLight(doc, analyzer, query, "p_content"));
                productList.add(product);
            }
        } finally {
            // Release the underlying index reader even if parsing or searching fails.
            search.close();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("检索耗时： " + (endTime - startTime)+ "毫秒!");
        return productList;
    }

    /**
     * Returns the stored value of {@code field} with the terms matching {@code query}
     * wrapped in {@code <b>...</b>}.
     *
     * @param doc      document holding the stored field
     * @param analyzer analyzer used to re-tokenize the stored text
     * @param query    query whose terms are highlighted
     * @param field    name of the field to highlight
     * @return the best highlighted fragment (fragments are about 100 characters long);
     *         the unmodified field value if nothing in it matches;
     *         {@code null} if the document has no stored value for the field
     * @throws Exception if tokenizing or highlighting fails
     */
    public String getHighLight(Document doc,Analyzer analyzer,Query query,String field)throws Exception{
        String fieldValue = doc.get(field);
        if (fieldValue == null) {
            // Field not stored on this document: nothing to highlight (StringReader would throw NPE).
            return null;
        }
        // Highlight markup; e.g. "<font color='red'><strong>" / "</strong></font>" also works.
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b>", "</b>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter,new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(100));
        TokenStream tokenStream = analyzer.tokenStream(field,new StringReader(fieldValue));
        String highLightFieldValue = highlighter.getBestFragment(tokenStream, fieldValue);
        if (highLightFieldValue == null) {
            // No query term occurs in this field: fall back to the plain value.
            highLightFieldValue = fieldValue;
        }
        return highLightFieldValue;
    }

    /**
     * Creates 20 sample products used as test data.
     *
     * @return the generated products, with ids 1 to 20
     */
    public List<Product> createProductList(){
        List<Product> productList = new ArrayList<Product>();
        for(int i=1;i<=20;i++){
            Product product = new Product();
            product.setP_id(i);
            product.setP_name("手表"+i);
            product.setP_price((i*i+Math.random())+"元");
            product.setP_content("手表的描述"+i+"块");
            productList.add(product);
        }
        return productList;
    }

    /**
     * Demo entry point: builds the index, runs one search and prints the results.
     * @param args unused
     * @throws Exception if indexing or searching fails
     */
    public static void main(String[] args) throws Exception {
        TestIndex test = new TestIndex();
        String indexPath = Configuration.getInstance().read("config.properties", "indexPath");
        if (indexPath == null) {
            // Without this check the index would silently be created under "nullProduct/index".
            throw new IllegalStateException("indexPath is not configured in config.properties");
        }
        String productIndexPath = indexPath+"Product/index";
        // Build the Lucene index.
        test.createIndex(productIndexPath, test.createProductList());

        // Search the index.
        List<Product> productList = test.searchByKeyWord(productIndexPath, "手表4 描述3");
        // Print the search results.
        for(Product product:productList){
            System.out.println("---------------");
            System.out.println("p_id:"+product.getP_id());
            System.out.println("p_name:"+product.getP_name());
            System.out.println("p_price:"+product.getP_price());
            System.out.println("p_content:"+product.getP_content());
            System.out.println("---------------");
        }

    }

}

读取配置文件类：Configuration.java

package com.lj.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Singleton helper that reads values from properties files on the classpath.
 */
public class Configuration {

    // Eagerly created singleton; class initialization makes it safely visible to all threads,
    // so getInstance() needs no synchronization.
    private static final Configuration configuration = new Configuration();

    private Configuration(){}

    /**
     * Returns the shared {@code Configuration} instance.
     *
     * @return the singleton instance
     */
    public static Configuration getInstance(){
        return configuration;
    }

    /**
     * Reads the value of {@code key} from a properties file located on the classpath.
     * The file is loaded again on every call.
     *
     * @param properties classpath-relative name of the properties file
     * @param key        property key to look up
     * @return the property value, or {@code null} if the key is absent or the file could not be read
     * @throws IllegalArgumentException if the properties file is not found on the classpath
     */
    public String read(String properties,String key){
        InputStream in = this.getClass().getClassLoader().getResourceAsStream(properties);
        if (in == null) {
            // getResourceAsStream returns null for a missing resource; fail with a clear
            // message instead of an opaque NullPointerException from Properties.load.
            throw new IllegalArgumentException("Properties resource not found on classpath: " + properties);
        }
        Properties p = new Properties();
        try {
            p.load(in);
        } catch (IOException e) {
            // Best effort, as before: report the problem and return whatever was loaded.
            e.printStackTrace();
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a read-only stream fails.
            }
        }

        return p.getProperty(key);
    }
}

配置文件：config.properties

#配置索引路径
indexPath=d:/LuceneIndex/LuceneTest02/

运行结果：

这花费了 1672毫秒来把数据增加到索引里面去!
共匹配到：2个.
检索耗时： 109毫秒!
---------------
p_id:4
p_name:手表4
p_price:16.29956430691176元
p_content:手表的描述4块
---------------
---------------
p_id:3
p_name:手表3
p_price:9.465650388124237元
p_content:手表的描述3块
---------------

你需要导入几个Jar文件：

lucene-core-2.4.1.jar（Lucene核心包）

lucene-highlighter-2.4.0.jar（用于高亮显示的）

commons-logging.jar

paoding-analysis.jar（到网上搜索paoding-analysis-2.0.4-beta，解压后把Jar拷贝过来就OK，把dic文件夹拷贝到工程的根目录下，或者配置PAODING_DIC_HOME环境变量到dic目录下）

Lucene小例子-梁健.rar (964.9 KB)
下载次数: 674

分享到：

抓取防爬虫的网站信息(梁健-原创) | lucene的中文分词器

2011-06-17 13:57
浏览 1196
评论(0)
论坛回复 / 浏览 (6 / 6811)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论