- 浏览: 65633 次
- 性别:
- 来自: 上海
文章分类
最新评论
lucene入门到项目开发
加入jar包 lucene-core-2.4.0.jar je-analysis-1.4.0.jar lucene-highlighter-2.4.1.jar lucene-analyzers-2.4.1.jar
先准备下工具类
Java代码
package com.cs.lucene.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
public class File2DocumentUtiles {

    /**
     * Converts a file on disk into a Lucene {@link Document} with four fields:
     * "name" (analyzed), "content" (analyzed), "size" (indexed, not analyzed)
     * and "path" (stored only, not indexed).
     *
     * @param filepath path of the file to convert
     * @return a Document holding the file's name, content, size and path
     */
    public static Document file2Document(String filepath) {
        File file = new File(filepath);
        Document doc = new Document();
        doc.add(new Field("name", file.getName(), Store.YES, Index.ANALYZED)); // indexed and tokenized
        doc.add(new Field("content", readFileContent(file), Store.YES, Index.ANALYZED)); // indexed and tokenized
        // NumberTools pads the long so lexicographic order equals numeric order.
        doc.add(new Field("size", NumberTools.longToString(file.length()), Store.YES, Index.NOT_ANALYZED)); // indexed, not tokenized
        doc.add(new Field("path", file.getPath(), Store.YES, Index.NO)); // stored only, not indexed
        return doc;
    }

    /**
     * Reads the whole text content of a file, line by line.
     * NOTE(review): uses the platform default charset — confirm the indexed
     * files match it, otherwise pass an explicit charset to the reader.
     *
     * @param file file to read
     * @return file content joined with '\n', or null on I/O error
     */
    private static String readFileContent(File file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            StringBuffer content = new StringBuffer();
            for (String line; (line = reader.readLine()) != null;) {
                content.append(line).append("\n");
            }
            return content.toString();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Fix: the original leaked the reader; always close it.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // nothing useful to do if close itself fails
                }
            }
        }
        return null;
    }

    /**
     * Prints the main fields of a Document to stdout (debug helper).
     *
     * @param doc document previously built by {@link #file2Document(String)}
     */
    public static void printDocInfo(Document doc) {
        System.out.println("--------------------------");
        System.out.println("name =" + doc.get("name"));
        System.out.println("content =" + doc.get("content"));
        System.out.println("size =" + NumberTools.stringToLong(doc.get("size")));
        System.out.println("path =" + doc.get("path"));
    }
}
先了解下分词器
Java代码
package com.cs.lucene.analyzer;
import java.io.StringReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
public class AnalyzerTest {

    // Sample text mixing Chinese and ASCII to exercise the tokenizer.
    String text = "资源来自互联网吴朝辉wwwa的a-b放到";

    // JE analyzer — good Chinese word segmentation.
    Analyzer analyzer = new MMAnalyzer();

    @Test
    public void testAnalyze() throws Exception {
        analyze(analyzer, text);
    }

    /**
     * Tokenizes the given text with the given analyzer and prints each token.
     * Fix: the original ignored both parameters and read the fields instead,
     * so callers could never analyze anything but the hard-coded sample.
     *
     * @param analyzer analyzer to tokenize with
     * @param text     text to tokenize
     */
    private void analyze(Analyzer analyzer, String text) throws Exception {
        System.out.println("----------分词器-------------------");
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        for (Token token = new Token(); (token = tokenStream.next(token)) != null;) {
            System.out.println(token);
        }
    }
}
现在看看FSDirectory和RAMDirectory
Java代码
package com.cs.lucene.directory;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;
import com.cs.lucene.utils.File2DocumentUtiles;
public class DirectoryTest {
// Path of the source file used to build the index.
String filePath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网,版权归原创作者或原单位公司所有.txt";
// Directory where the index is stored.
String indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex" ;
// Tokenizer/analyzer.
Analyzer analyzer = new MMAnalyzer(); // JE analyzer
/**
 * Builds an index with FSDirectory.
 * FSDirectory: the index lives on the file system.
 * @throws Exception
 */
@Test
public void testFSDirectory() throws Exception{
// File-system-backed directory.
Directory dir = FSDirectory.getDirectory(indexPath) ;
Document doc = File2DocumentUtiles.file2Document(filePath);
// Without the boolean 'create' argument the writer appends to an existing
// index; MaxFieldLength.LIMITED indexes only the first 10000 terms.
IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; // no 'true' flag: append to index
indexWriter.addDocument(doc) ;
indexWriter.close() ;
}
/**
 * Builds an index with RAMDirectory.
 * RAMDirectory: the index lives in memory.
 * Pro: fast reads.
 * Con: the index is lost when the JVM exits.
 * @throws Exception
 */
@Test
public void testRAMDirectory() throws Exception{
// In-memory directory.
Directory dir = new RAMDirectory() ;
Document doc = File2DocumentUtiles.file2Document(filePath);
// MaxFieldLength.LIMITED indexes only the first 10000 terms.
IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; // no 'true' flag: append to index
indexWriter.addDocument(doc) ;
indexWriter.close() ;
}
/**
 * In practice FSDirectory and RAMDirectory are combined:
 * the in-memory index is fast, so the RAMDirectory is used at runtime,
 * but on shutdown everything must be persisted to the file system,
 * so the FSDirectory is used on exit.
 * @throws Exception
 */
@Test
public void testRAMDirectoryAndFSDirectory() throws Exception{
// Whole flow: load the index from disk into memory on startup, add to it at
// runtime (all updates live in memory), then write everything back on exit.
Directory fsDir = FSDirectory.getDirectory(indexPath) ;
// 1. Load on startup.
Directory ramDir = new RAMDirectory(fsDir) ;
// Work against ramDir at runtime.
IndexWriter ramIndexWriter = new IndexWriter(ramDir,analyzer,MaxFieldLength.LIMITED);
// Add the document.
Document doc = File2DocumentUtiles.file2Document(filePath) ;
ramIndexWriter.addDocument(doc) ;
ramIndexWriter.close() ;// must close before merging so buffered changes are flushed
// 2. Persist on exit.
// 'true' discards the previous index and rewrites it from scratch (default false).
IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,true,MaxFieldLength.LIMITED);
// new Directory[]{ramDir}: the directories to merge in.
// addIndexesNoOptimize: merge without optimizing; searching is somewhat
// slower but the merge uses less storage.
fsIndexWriter.addIndexesNoOptimize(new Directory[]{ramDir}) ;
fsIndexWriter.flush() ; // flush buffered changes before optimizing
fsIndexWriter.optimize() ; // optimize before closing; speeds up later searches
fsIndexWriter.close() ;
}
@Test
public void testOptimize() throws Exception{
Directory fsDir = FSDirectory.getDirectory(indexPath) ;
IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,MaxFieldLength.LIMITED);
fsIndexWriter.optimize() ;
fsIndexWriter.close() ;
}
}
现在来测测索引如何建立以及搜索
Java代码
package com.cs.lucene.lucene;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import com.cs.lucene.utils.File2DocumentUtiles;
public class IndexDao {

    /** Directory where the index is stored. */
    private String indexPath;

    /** Analyzer used both for indexing and for query parsing. */
    private Analyzer analyzer = null;

    public IndexDao() {
        this.indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex";
        this.analyzer = new MMAnalyzer(); // JE analyzer — good Chinese support
    }

    public IndexDao(Analyzer analyzer, String indexPath) {
        this.analyzer = analyzer;
        this.indexPath = indexPath;
    }

    /**
     * Parses a query string over the "name" and "content" fields and runs it.
     *
     * @param queryString user query string
     * @param firstResult zero-based offset of the first hit to return
     * @param maxResults  maximum number of hits to return
     * @return hit count plus the requested page of documents
     */
    public QueryResult search(String queryString, int firstResult,
            int maxResults) throws Exception {
        // 1. Parse the query string into a Query over both fields.
        String[] fields = { "name", "content" };
        // Boosts: a hit in the title should score higher than one in the body.
        Map<String, Float> boosts = new HashMap<String, Float>();
        boosts.put("name", 3.0f);
        boosts.put("content", 1.0f); // default value
        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer,
                boosts); // multi-field search
        Query query = queryParser.parse(queryString);
        return search(query, firstResult, maxResults);
    }

    /**
     * Runs a pre-built Query, highlights the matched keywords in "content",
     * and returns the requested page of results.
     * Fix: the searcher is now released in a finally block — the original
     * leaked one IndexSearcher (and its file handles) per call.
     */
    public QueryResult search(Query query, int firstResult, int maxResults)
            throws Exception {
        IndexSearcher indexSearcher = null;
        try {
            // 2. Run the query.
            indexSearcher = new IndexSearcher(indexPath);
            Filter filter = null; // optional post-filter (re-filters all hits, slow)
            // filter = new RangeFilter("size", NumberTools.longToString(200),
            //         NumberTools.longToString(500), true, true);
            Sort sort = new Sort();
            // Default order is ascending; 'true' switches to descending.
            sort.setSort(new SortField[] { new SortField("size", true) });
            TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort);
            int recordCount = topDocs.totalHits;

            // Prepare the highlighter.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",
                    "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // Show at most 500 characters around the best-matching keywords.
            // (The original comment said 50 but the code has always used 500.)
            Fragmenter fragmenter = new SimpleFragmenter(500);
            highlighter.setTextFragmenter(fragmenter);

            // 3. Collect the requested page of hits.
            List<Document> recordList = new ArrayList<Document>();
            int end = Math.min(firstResult + maxResults, recordCount);
            for (int i = firstResult; i < end; i++) {
                ScoreDoc scoreDoc = topDocs.scoreDocs[i];
                int docSn = scoreDoc.doc; // internal document number
                Document doc = indexSearcher.doc(docSn);
                // Highlight the keywords in the content.
                String hc = highlighter.getBestFragment(analyzer, "content",
                        doc.get("content"));
                // No keyword in content — fall back to the first 200 characters.
                if (hc == null) {
                    String content = doc.get("content");
                    int endIndex = Math.min(200, content.length());
                    hc = content.substring(0, endIndex);
                }
                doc.getField("content").setValue(hc);
                recordList.add(doc);
            }
            // 4. Return the result.
            return new QueryResult(recordCount, recordList);
        } finally {
            // Fix: release the searcher; the original never closed it.
            if (indexSearcher != null) {
                indexSearcher.close();
            }
        }
    }

    /**
     * Indexes a single file and appends it to the index library.
     * Fix: the writer is closed in a finally block so an exception during
     * addDocument/commit/optimize no longer leaves the index write-locked.
     *
     * @param filePath path of the source file to index
     */
    public void save(String filePath) throws Exception {
        Document doc = File2DocumentUtiles.file2Document(filePath);
        // doc.setBoost(1.0f); // per-document boost (1.0f is the default) — not recommended
        // 'false' appends to the existing index instead of recreating it;
        // MaxFieldLength.LIMITED indexes only the first 10000 terms.
        IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, false,
                MaxFieldLength.LIMITED);
        try {
            indexWriter.addDocument(doc);
            indexWriter.commit();
            indexWriter.optimize(); // speeds up later searches
        } finally {
            indexWriter.close();
        }
    }

    /** Convenience overload taking a File. */
    public void save(File file) throws Exception {
        save(file.getAbsolutePath());
    }

    /**
     * Recursively indexes every regular file under the given file or directory.
     * Fix: listFiles() can return null (I/O error, permission denied) — the
     * original would throw a NullPointerException in that case.
     *
     * @param file a file to index, or a directory to walk recursively
     */
    public void saveDirectory(File file) throws Exception {
        if (file.isFile()) { // plain file: index it directly
            save(file.getAbsolutePath());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) { // unreadable directory or I/O error
            return;
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // recurse into subdirectories
                saveDirectory(f);
            } else {
                save(f.getAbsolutePath());
            }
        }
    }

    /**
     * Debug helper: prints the directory tree, indenting "--" per level.
     * Does not touch the index.
     *
     * @param file    root file or directory
     * @param pointer current recursion depth
     */
    public void save(File file, int pointer) throws Exception {
        StringBuffer str = new StringBuffer();
        for (int i = 0; i < pointer; i++) {
            str.append("--");
        }
        if (file.isFile()) {
            System.out.println(str + file.getName());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) { // unreadable directory or I/O error
            return;
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // recurse into subdirectories
                System.out.println(str + f.getName());
                save(f, pointer + 1);
            } else {
                System.out.println(str + f.getName());
            }
        }
    }
}
Java代码
package com.cs.lucene.lucene;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
public class QueryResult {

    /** Total number of documents that matched the query. */
    private int recordCount = 0;

    /** The page of matching documents that was requested. */
    private List<Document> recordResults = new ArrayList<Document>();

    /**
     * @param recordCount   total hit count
     * @param recordResults documents for the requested page
     */
    public QueryResult(int recordCount, List<Document> recordResults) {
        this.recordCount = recordCount;
        this.recordResults = recordResults;
    }

    /** @return the total hit count */
    public int getRecordCount() {
        return recordCount;
    }

    public void setRecordCount(int recordCount) {
        this.recordCount = recordCount;
    }

    /** @return the documents for the requested page */
    public List<Document> getRecordResults() {
        return recordResults;
    }

    public void setRecordResults(List<Document> recordResults) {
        this.recordResults = recordResults;
    }
}
测试索引
Java代码
package com.cs.lucene.lucene;
import java.io.File;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.junit.Test;
import com.cs.lucene.utils.File2DocumentUtiles;
public class IndexDaoTest {

    private IndexDao indexDao = new IndexDao();

    /**
     * Searches the index library and prints every matching document.
     */
    @Test
    public void testSearch() throws Exception {
        String queryString = "www*";
        QueryResult result = indexDao.search(queryString, 0, 10);
        System.out.println("总共有【" + result.getRecordCount() + "】条匹配结果");
        for (Document doc : result.getRecordResults()) {
            File2DocumentUtiles.printDocInfo(doc);
        }
    }

    /**
     * Indexes one source file and saves it into the index library.
     */
    @Test
    public void testSave() throws Exception {
        String filePath2 = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\readme2.txt";
        indexDao.save(filePath2);
    }

    /**
     * Recursively indexes a whole directory into the index library.
     */
    @Test
    public void testSaveDir() throws Exception {
        File root = new File("E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\www");
        indexDao.saveDirectory(root);
    }
}
最后我们来看看lucene的查询功能
Java代码
package com.cs.lucene.query;
import java.util.Date;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.junit.Test;
import com.cs.lucene.lucene.IndexDao;
import com.cs.lucene.lucene.QueryResult;
import com.cs.lucene.utils.File2DocumentUtiles;
public class QueryTest {

    IndexDao indexDao = new IndexDao();

    /**
     * Term query: exact match on a single indexed term.
     */
    @Test
    public void testTermQuery() throws Exception {
        Term term = new Term("name", "资源");
        Query query = new TermQuery(term);
        queryAndPrintResult(query);
    }

    /**
     * Range query over the "size" field.
     * Numbers are stored as strings in the index, so NumberTools is used to
     * pad them into a lexicographically sortable form.
     */
    @Test
    public void testRangeQuery() throws Exception {
        Term lowerTerm = new Term("size", NumberTools.longToString(200));
        Term upperTerm = new Term("size", NumberTools.longToString(500));
        // true = bounds are inclusive
        Query query = new RangeQuery(lowerTerm, upperTerm, true);
        queryAndPrintResult(query);
    }

    /**
     * Demonstrates NumberTools and DateTools conversions.
     */
    @Test
    public void testNumberToolsAndDateTools() throws Exception {
        System.out.println("数字测试:");
        System.out.println(NumberTools.longToString(200));
        System.out.println(NumberTools.longToString(500));
        System.out.println(NumberTools.stringToLong("000000000000dw"));
        System.out.println("日期测试:");
        System.out.println(DateTools.dateToString(new Date(), Resolution.SECOND));
        System.out.println(DateTools.dateToString(new Date(), Resolution.DAY));
        System.out.println(DateTools.stringToDate("20101005080855"));
    }

    /**
     * Wildcard query: '?' matches exactly one character, '*' matches zero or more.
     */
    @Test
    public void testWildcardQuery() throws Exception {
        Term term = new Term("name", "*me");
        Query query = new WildcardQuery(term);
        queryAndPrintResult(query);
    }

    /**
     * Phrase query: matches documents containing several terms near each other.
     */
    @Test
    public void testPhraseQuery() throws Exception {
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));
        // setSlop: maximum number of positions allowed between the terms
        phraseQuery.setSlop(20);
        queryAndPrintResult(phraseQuery);
    }

    /**
     * Boolean query — very important. Three combinations:
     * 1. MUST and MUST: the intersection of both sub-queries.
     * 2. MUST and MUST_NOT: results matching MUST but not MUST_NOT.
     * 3. SHOULD and SHOULD: "or" — the union of all sub-queries.
     * Note: some combinations are meaningless.
     *
     * @throws Exception
     */
    @Test
    public void testBooleanQuery() throws Exception {
        // Clause 1: phrase query.
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));
        phraseQuery.setSlop(20);
        // Clause 2: range query (true = inclusive bounds).
        Term lowerTerm2 = new Term("size", "200");
        Term upperTerm2 = new Term("size", "500");
        Query rangeQuery = new RangeQuery(lowerTerm2, upperTerm2, true);
        // Combine both clauses with AND semantics.
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(phraseQuery, Occur.MUST);
        booleanQuery.add(rangeQuery, Occur.MUST);
        queryAndPrintResult(booleanQuery);
    }

    /**
     * Runs the query through IndexDao and prints every returned document.
     * Renamed from QueryAndPrintResult to follow Java naming conventions
     * (private method; all call sites are inside this class).
     *
     * @param query query to execute
     */
    private void queryAndPrintResult(Query query) throws Exception {
        System.out.println("相对应的查询字符串:" + query);
        QueryResult qr = indexDao.search(query, 0, 100);
        System.out.println("总共有【" + qr.getRecordCount() + "】条匹配结果");
        for (int i = 0; i < qr.getRecordResults().size(); i++) {
            Document doc = qr.getRecordResults().get(i);
            File2DocumentUtiles.printDocInfo(doc);
        }
    }
}
先准备下工具类
Java代码
package com.cs.lucene.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
public class File2DocumentUtiles {

    /**
     * Converts a file on disk into a Lucene {@link Document} with four fields:
     * "name" (analyzed), "content" (analyzed), "size" (indexed, not analyzed)
     * and "path" (stored only, not indexed).
     *
     * @param filepath path of the file to convert
     * @return a Document holding the file's name, content, size and path
     */
    public static Document file2Document(String filepath) {
        File file = new File(filepath);
        Document doc = new Document();
        doc.add(new Field("name", file.getName(), Store.YES, Index.ANALYZED)); // indexed and tokenized
        doc.add(new Field("content", readFileContent(file), Store.YES, Index.ANALYZED)); // indexed and tokenized
        // NumberTools pads the long so lexicographic order equals numeric order.
        doc.add(new Field("size", NumberTools.longToString(file.length()), Store.YES, Index.NOT_ANALYZED)); // indexed, not tokenized
        doc.add(new Field("path", file.getPath(), Store.YES, Index.NO)); // stored only, not indexed
        return doc;
    }

    /**
     * Reads the whole text content of a file, line by line.
     * NOTE(review): uses the platform default charset — confirm the indexed
     * files match it, otherwise pass an explicit charset to the reader.
     *
     * @param file file to read
     * @return file content joined with '\n', or null on I/O error
     */
    private static String readFileContent(File file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            StringBuffer content = new StringBuffer();
            for (String line; (line = reader.readLine()) != null;) {
                content.append(line).append("\n");
            }
            return content.toString();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Fix: the original leaked the reader; always close it.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // nothing useful to do if close itself fails
                }
            }
        }
        return null;
    }

    /**
     * Prints the main fields of a Document to stdout (debug helper).
     *
     * @param doc document previously built by {@link #file2Document(String)}
     */
    public static void printDocInfo(Document doc) {
        System.out.println("--------------------------");
        System.out.println("name =" + doc.get("name"));
        System.out.println("content =" + doc.get("content"));
        System.out.println("size =" + NumberTools.stringToLong(doc.get("size")));
        System.out.println("path =" + doc.get("path"));
    }
}
先了解下分词器
Java代码
package com.cs.lucene.analyzer;
import java.io.StringReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Test;
public class AnalyzerTest {

    // Sample text mixing Chinese and ASCII to exercise the tokenizer.
    String text = "资源来自互联网吴朝辉wwwa的a-b放到";

    // JE analyzer — good Chinese word segmentation.
    Analyzer analyzer = new MMAnalyzer();

    @Test
    public void testAnalyze() throws Exception {
        analyze(analyzer, text);
    }

    /**
     * Tokenizes the given text with the given analyzer and prints each token.
     * Fix: the original ignored both parameters and read the fields instead,
     * so callers could never analyze anything but the hard-coded sample.
     *
     * @param analyzer analyzer to tokenize with
     * @param text     text to tokenize
     */
    private void analyze(Analyzer analyzer, String text) throws Exception {
        System.out.println("----------分词器-------------------");
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        for (Token token = new Token(); (token = tokenStream.next(token)) != null;) {
            System.out.println(token);
        }
    }
}
现在看看FSDirectory和RAMDirectory
Java代码
package com.cs.lucene.directory;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;
import com.cs.lucene.utils.File2DocumentUtiles;
public class DirectoryTest {
// Path of the source file used to build the index.
String filePath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\资源来自互联网,版权归原创作者或原单位公司所有.txt";
// Directory where the index is stored.
String indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex" ;
// Tokenizer/analyzer.
Analyzer analyzer = new MMAnalyzer(); // JE analyzer
/**
 * Builds an index with FSDirectory.
 * FSDirectory: the index lives on the file system.
 * @throws Exception
 */
@Test
public void testFSDirectory() throws Exception{
// File-system-backed directory.
Directory dir = FSDirectory.getDirectory(indexPath) ;
Document doc = File2DocumentUtiles.file2Document(filePath);
// Without the boolean 'create' argument the writer appends to an existing
// index; MaxFieldLength.LIMITED indexes only the first 10000 terms.
IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; // no 'true' flag: append to index
indexWriter.addDocument(doc) ;
indexWriter.close() ;
}
/**
 * Builds an index with RAMDirectory.
 * RAMDirectory: the index lives in memory.
 * Pro: fast reads.
 * Con: the index is lost when the JVM exits.
 * @throws Exception
 */
@Test
public void testRAMDirectory() throws Exception{
// In-memory directory.
Directory dir = new RAMDirectory() ;
Document doc = File2DocumentUtiles.file2Document(filePath);
// MaxFieldLength.LIMITED indexes only the first 10000 terms.
IndexWriter indexWriter = new IndexWriter(dir,analyzer,MaxFieldLength.LIMITED) ; // no 'true' flag: append to index
indexWriter.addDocument(doc) ;
indexWriter.close() ;
}
/**
 * In practice FSDirectory and RAMDirectory are combined:
 * the in-memory index is fast, so the RAMDirectory is used at runtime,
 * but on shutdown everything must be persisted to the file system,
 * so the FSDirectory is used on exit.
 * @throws Exception
 */
@Test
public void testRAMDirectoryAndFSDirectory() throws Exception{
// Whole flow: load the index from disk into memory on startup, add to it at
// runtime (all updates live in memory), then write everything back on exit.
Directory fsDir = FSDirectory.getDirectory(indexPath) ;
// 1. Load on startup.
Directory ramDir = new RAMDirectory(fsDir) ;
// Work against ramDir at runtime.
IndexWriter ramIndexWriter = new IndexWriter(ramDir,analyzer,MaxFieldLength.LIMITED);
// Add the document.
Document doc = File2DocumentUtiles.file2Document(filePath) ;
ramIndexWriter.addDocument(doc) ;
ramIndexWriter.close() ;// must close before merging so buffered changes are flushed
// 2. Persist on exit.
// 'true' discards the previous index and rewrites it from scratch (default false).
IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,true,MaxFieldLength.LIMITED);
// new Directory[]{ramDir}: the directories to merge in.
// addIndexesNoOptimize: merge without optimizing; searching is somewhat
// slower but the merge uses less storage.
fsIndexWriter.addIndexesNoOptimize(new Directory[]{ramDir}) ;
fsIndexWriter.flush() ; // flush buffered changes before optimizing
fsIndexWriter.optimize() ; // optimize before closing; speeds up later searches
fsIndexWriter.close() ;
}
@Test
public void testOptimize() throws Exception{
Directory fsDir = FSDirectory.getDirectory(indexPath) ;
IndexWriter fsIndexWriter = new IndexWriter(fsDir,analyzer,MaxFieldLength.LIMITED);
fsIndexWriter.optimize() ;
fsIndexWriter.close() ;
}
}
现在来测测索引如何建立以及搜索
Java代码
package com.cs.lucene.lucene;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import com.cs.lucene.utils.File2DocumentUtiles;
public class IndexDao {

    /** Directory where the index is stored. */
    private String indexPath;

    /** Analyzer used both for indexing and for query parsing. */
    private Analyzer analyzer = null;

    public IndexDao() {
        this.indexPath = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceIndex";
        this.analyzer = new MMAnalyzer(); // JE analyzer — good Chinese support
    }

    public IndexDao(Analyzer analyzer, String indexPath) {
        this.analyzer = analyzer;
        this.indexPath = indexPath;
    }

    /**
     * Parses a query string over the "name" and "content" fields and runs it.
     *
     * @param queryString user query string
     * @param firstResult zero-based offset of the first hit to return
     * @param maxResults  maximum number of hits to return
     * @return hit count plus the requested page of documents
     */
    public QueryResult search(String queryString, int firstResult,
            int maxResults) throws Exception {
        // 1. Parse the query string into a Query over both fields.
        String[] fields = { "name", "content" };
        // Boosts: a hit in the title should score higher than one in the body.
        Map<String, Float> boosts = new HashMap<String, Float>();
        boosts.put("name", 3.0f);
        boosts.put("content", 1.0f); // default value
        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer,
                boosts); // multi-field search
        Query query = queryParser.parse(queryString);
        return search(query, firstResult, maxResults);
    }

    /**
     * Runs a pre-built Query, highlights the matched keywords in "content",
     * and returns the requested page of results.
     * Fix: the searcher is now released in a finally block — the original
     * leaked one IndexSearcher (and its file handles) per call.
     */
    public QueryResult search(Query query, int firstResult, int maxResults)
            throws Exception {
        IndexSearcher indexSearcher = null;
        try {
            // 2. Run the query.
            indexSearcher = new IndexSearcher(indexPath);
            Filter filter = null; // optional post-filter (re-filters all hits, slow)
            // filter = new RangeFilter("size", NumberTools.longToString(200),
            //         NumberTools.longToString(500), true, true);
            Sort sort = new Sort();
            // Default order is ascending; 'true' switches to descending.
            sort.setSort(new SortField[] { new SortField("size", true) });
            TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort);
            int recordCount = topDocs.totalHits;

            // Prepare the highlighter.
            Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",
                    "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // Show at most 500 characters around the best-matching keywords.
            // (The original comment said 50 but the code has always used 500.)
            Fragmenter fragmenter = new SimpleFragmenter(500);
            highlighter.setTextFragmenter(fragmenter);

            // 3. Collect the requested page of hits.
            List<Document> recordList = new ArrayList<Document>();
            int end = Math.min(firstResult + maxResults, recordCount);
            for (int i = firstResult; i < end; i++) {
                ScoreDoc scoreDoc = topDocs.scoreDocs[i];
                int docSn = scoreDoc.doc; // internal document number
                Document doc = indexSearcher.doc(docSn);
                // Highlight the keywords in the content.
                String hc = highlighter.getBestFragment(analyzer, "content",
                        doc.get("content"));
                // No keyword in content — fall back to the first 200 characters.
                if (hc == null) {
                    String content = doc.get("content");
                    int endIndex = Math.min(200, content.length());
                    hc = content.substring(0, endIndex);
                }
                doc.getField("content").setValue(hc);
                recordList.add(doc);
            }
            // 4. Return the result.
            return new QueryResult(recordCount, recordList);
        } finally {
            // Fix: release the searcher; the original never closed it.
            if (indexSearcher != null) {
                indexSearcher.close();
            }
        }
    }

    /**
     * Indexes a single file and appends it to the index library.
     * Fix: the writer is closed in a finally block so an exception during
     * addDocument/commit/optimize no longer leaves the index write-locked.
     *
     * @param filePath path of the source file to index
     */
    public void save(String filePath) throws Exception {
        Document doc = File2DocumentUtiles.file2Document(filePath);
        // doc.setBoost(1.0f); // per-document boost (1.0f is the default) — not recommended
        // 'false' appends to the existing index instead of recreating it;
        // MaxFieldLength.LIMITED indexes only the first 10000 terms.
        IndexWriter indexWriter = new IndexWriter(indexPath, analyzer, false,
                MaxFieldLength.LIMITED);
        try {
            indexWriter.addDocument(doc);
            indexWriter.commit();
            indexWriter.optimize(); // speeds up later searches
        } finally {
            indexWriter.close();
        }
    }

    /** Convenience overload taking a File. */
    public void save(File file) throws Exception {
        save(file.getAbsolutePath());
    }

    /**
     * Recursively indexes every regular file under the given file or directory.
     * Fix: listFiles() can return null (I/O error, permission denied) — the
     * original would throw a NullPointerException in that case.
     *
     * @param file a file to index, or a directory to walk recursively
     */
    public void saveDirectory(File file) throws Exception {
        if (file.isFile()) { // plain file: index it directly
            save(file.getAbsolutePath());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) { // unreadable directory or I/O error
            return;
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // recurse into subdirectories
                saveDirectory(f);
            } else {
                save(f.getAbsolutePath());
            }
        }
    }

    /**
     * Debug helper: prints the directory tree, indenting "--" per level.
     * Does not touch the index.
     *
     * @param file    root file or directory
     * @param pointer current recursion depth
     */
    public void save(File file, int pointer) throws Exception {
        StringBuffer str = new StringBuffer();
        for (int i = 0; i < pointer; i++) {
            str.append("--");
        }
        if (file.isFile()) {
            System.out.println(str + file.getName());
            return;
        }
        File[] childs = file.listFiles();
        if (childs == null) { // unreadable directory or I/O error
            return;
        }
        for (int i = 0; i < childs.length; i++) {
            File f = childs[i];
            if (f.isDirectory()) { // recurse into subdirectories
                System.out.println(str + f.getName());
                save(f, pointer + 1);
            } else {
                System.out.println(str + f.getName());
            }
        }
    }
}
Java代码
package com.cs.lucene.lucene;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
public class QueryResult {

    /** Total number of documents that matched the query. */
    private int recordCount = 0;

    /** The page of matching documents that was requested. */
    private List<Document> recordResults = new ArrayList<Document>();

    /**
     * @param recordCount   total hit count
     * @param recordResults documents for the requested page
     */
    public QueryResult(int recordCount, List<Document> recordResults) {
        this.recordCount = recordCount;
        this.recordResults = recordResults;
    }

    /** @return the total hit count */
    public int getRecordCount() {
        return recordCount;
    }

    public void setRecordCount(int recordCount) {
        this.recordCount = recordCount;
    }

    /** @return the documents for the requested page */
    public List<Document> getRecordResults() {
        return recordResults;
    }

    public void setRecordResults(List<Document> recordResults) {
        this.recordResults = recordResults;
    }
}
测试索引
Java代码
package com.cs.lucene.lucene;
import java.io.File;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.junit.Test;
import com.cs.lucene.utils.File2DocumentUtiles;
public class IndexDaoTest {

    private IndexDao indexDao = new IndexDao();

    /**
     * Searches the index library and prints every matching document.
     */
    @Test
    public void testSearch() throws Exception {
        String queryString = "www*";
        QueryResult result = indexDao.search(queryString, 0, 10);
        System.out.println("总共有【" + result.getRecordCount() + "】条匹配结果");
        for (Document doc : result.getRecordResults()) {
            File2DocumentUtiles.printDocInfo(doc);
        }
    }

    /**
     * Indexes one source file and saves it into the index library.
     */
    @Test
    public void testSave() throws Exception {
        String filePath2 = "E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\readme2.txt";
        indexDao.save(filePath2);
    }

    /**
     * Recursively indexes a whole directory into the index library.
     */
    @Test
    public void testSaveDir() throws Exception {
        File root = new File("E:\\EclipseStudyWorkspace\\LucenceTest\\lucenceDataSource\\www");
        indexDao.saveDirectory(root);
    }
}
最后我们来看看lucene的查询功能
Java代码
package com.cs.lucene.query;
import java.util.Date;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.junit.Test;
import com.cs.lucene.lucene.IndexDao;
import com.cs.lucene.lucene.QueryResult;
import com.cs.lucene.utils.File2DocumentUtiles;
public class QueryTest {

    IndexDao indexDao = new IndexDao();

    /**
     * Term query: exact match on a single indexed term.
     */
    @Test
    public void testTermQuery() throws Exception {
        Term term = new Term("name", "资源");
        Query query = new TermQuery(term);
        queryAndPrintResult(query);
    }

    /**
     * Range query over the "size" field.
     * Numbers are stored as strings in the index, so NumberTools is used to
     * pad them into a lexicographically sortable form.
     */
    @Test
    public void testRangeQuery() throws Exception {
        Term lowerTerm = new Term("size", NumberTools.longToString(200));
        Term upperTerm = new Term("size", NumberTools.longToString(500));
        // true = bounds are inclusive
        Query query = new RangeQuery(lowerTerm, upperTerm, true);
        queryAndPrintResult(query);
    }

    /**
     * Demonstrates NumberTools and DateTools conversions.
     */
    @Test
    public void testNumberToolsAndDateTools() throws Exception {
        System.out.println("数字测试:");
        System.out.println(NumberTools.longToString(200));
        System.out.println(NumberTools.longToString(500));
        System.out.println(NumberTools.stringToLong("000000000000dw"));
        System.out.println("日期测试:");
        System.out.println(DateTools.dateToString(new Date(), Resolution.SECOND));
        System.out.println(DateTools.dateToString(new Date(), Resolution.DAY));
        System.out.println(DateTools.stringToDate("20101005080855"));
    }

    /**
     * Wildcard query: '?' matches exactly one character, '*' matches zero or more.
     */
    @Test
    public void testWildcardQuery() throws Exception {
        Term term = new Term("name", "*me");
        Query query = new WildcardQuery(term);
        queryAndPrintResult(query);
    }

    /**
     * Phrase query: matches documents containing several terms near each other.
     */
    @Test
    public void testPhraseQuery() throws Exception {
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));
        // setSlop: maximum number of positions allowed between the terms
        phraseQuery.setSlop(20);
        queryAndPrintResult(phraseQuery);
    }

    /**
     * Boolean query — very important. Three combinations:
     * 1. MUST and MUST: the intersection of both sub-queries.
     * 2. MUST and MUST_NOT: results matching MUST but not MUST_NOT.
     * 3. SHOULD and SHOULD: "or" — the union of all sub-queries.
     * Note: some combinations are meaningless.
     *
     * @throws Exception
     */
    @Test
    public void testBooleanQuery() throws Exception {
        // Clause 1: phrase query.
        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("name", "资源"));
        phraseQuery.add(new Term("name", "作者"));
        phraseQuery.setSlop(20);
        // Clause 2: range query (true = inclusive bounds).
        Term lowerTerm2 = new Term("size", "200");
        Term upperTerm2 = new Term("size", "500");
        Query rangeQuery = new RangeQuery(lowerTerm2, upperTerm2, true);
        // Combine both clauses with AND semantics.
        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(phraseQuery, Occur.MUST);
        booleanQuery.add(rangeQuery, Occur.MUST);
        queryAndPrintResult(booleanQuery);
    }

    /**
     * Runs the query through IndexDao and prints every returned document.
     * Renamed from QueryAndPrintResult to follow Java naming conventions
     * (private method; all call sites are inside this class).
     *
     * @param query query to execute
     */
    private void queryAndPrintResult(Query query) throws Exception {
        System.out.println("相对应的查询字符串:" + query);
        QueryResult qr = indexDao.search(query, 0, 100);
        System.out.println("总共有【" + qr.getRecordCount() + "】条匹配结果");
        for (int i = 0; i < qr.getRecordResults().size(); i++) {
            Document doc = qr.getRecordResults().get(i);
            File2DocumentUtiles.printDocInfo(doc);
        }
    }
}
相关推荐
`lucene入门小实例.txt` 文件中可能包含了一个简单的Lucene使用示例,例如: 1. 创建 `Directory` 对象,比如使用 `FSDirectory.open()` 打开一个文件系统的目录来存储索引。 2. 实例化 `Analyzer`,如使用 `...
Lucene 是一个高性能、全文本搜索库,由 Apache 软件基金会开发。它提供了完整的搜索功能,包括索引、查询、评分等,广泛应用于各种项目和产品中。在这个入门案例中,我们将深入理解如何使用 Lucene 3.6 版本来构建...
**Lucene入门学习文档** **一、什么是Lucene** Lucene是Apache软件基金会下的一个开源全文检索库,它提供了一个高性能、可扩展的信息检索服务。Lucene最初由Doug Cutting开发,现在已经成为Java社区中事实上的标准...
【Lucene】Lucene入门心得 Lucene是一个高性能、全文本搜索库,由Apache软件基金会开发,被广泛应用于各种搜索引擎的构建。它提供了一个简单的API,使得开发者可以方便地在自己的应用程序中集成全文检索功能。...
**正文** Lucene.Net是一个基于Apache Lucene的开源全文搜索引擎库,它被移植到...教程可能涵盖从安装步骤、基本概念介绍,到实战案例的详细讲解,帮助初学者快速入门并熟练运用Lucene.Net进行全文搜索引擎的开发。
Lucene常被集成到各种项目中,例如Elasticsearch就是一个基于Lucene的分布式搜索引擎。开发者也可以利用工具如Solr来简化Lucene的使用。在给定的标签“工具”中,可能是指使用Lucene作为开发搜索引擎的基础工具。 ...
**Lucene 入门指南** Lucene 是一个高性能、全文本搜索库,由 Apache 软件基金会开发并维护。它是 Java 开发人员用来构建搜索引擎应用程序的基础工具。本指南将帮助初学者理解 Lucene 的核心概念,以及如何利用它来...
【标题】"Lucene全文检索入门项目 Java实现Maven项目 Elasticsearch 基础实战" 提供了一个学习如何在Java环境中运用全文检索技术的起点。这个项目涵盖了两个主要的开源工具:Lucene和Elasticsearch,它们都是业界...
doc.add(new Field("title", "Lucene入门", Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("content", "这是Lucene 3.0的实例", Field.Store.YES, Field.Index.ANALYZED)); ``` 3. **查询...
总的来说,王学松的“Lucene+Nutch搜索引擎开发实例代码”是一份宝贵的教育资源,它可以帮助开发者快速入门搜索引擎开发,并深入了解这两个项目的内部工作机制。通过实践这些代码,不仅可以提升技术能力,还能为构建...
Lucene 是一个高性能、全文本搜索库,由 Apache 软件基金会开发并维护。它是 Java 编写的,但提供了多种语言的接口,包括 Python、.NET 和 PHP 等。Lucene 提供了完整的搜索功能,包括索引创建、查询解析、评分和...
在《开发自己的搜索引擎》一书中,通过`ch2-lucene入门小例子`,读者可以了解到如何使用Lucene 2.0创建简单的搜索引擎,例如建立索引、执行搜索等基本操作。而`myReserch-可用的网络搜索引擎`可能包含一个完整的搜索...
### Lucene入门指南 #### 一、Lucene简介 **Lucene** 是一款高性能的全文检索引擎工具包,由 **Apache 软件基金会** 的 **Jakarta 项目组** 开发并维护。作为一款完全开放源代码的工具,Lucene 提供了一系列的功能...