Lucene4全文索引示例

xiang37

浏览: 431549 次
性别:
来自: 南京

最近访客更多访客>>

xiaomabobo

sxlkk

jenny825

long-will

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Java综合知识

这是一个基于Lucene 4.2.1的示例，之前我也写过3.6版本的示例。在3.6中，需要借助IKAnalyzer或其他第三方分词器才能较好地支持中文分词；而4.2自带了SmartChineseAnalyzer这个中文分词器。

下面是一个简单的示例程序，分别对应增删改查：

package com.xiva.test.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index over the {@code .java} files found below a source directory.
 *
 * <p>Each indexed file becomes one document with a stored, untokenized {@code path} field and
 * an analyzed {@code body} field read from the file content.
 */
public class IvFileIndex
{

    /** Files collected so far by {@link #listAllFile(File)}; entries accumulate across calls. */
    private static List<File> fileList = new ArrayList<File>(1024);

    /**
     * Recursively collects every non-directory entry below {@code fileDir} into the shared
     * file list.
     *
     * <p>Paths that are not directories, or that cannot be listed, are skipped.
     *
     * @param fileDir the directory to walk
     */
    public static void listAllFile(File fileDir)
    {
        File[] files = fileDir.listFiles();
        if (files == null)
        {
            // listFiles() returns null when the path is not a directory or an I/O error
            // occurs; without this guard one unreadable sub-directory aborts the whole walk.
            return;
        }

        for (File file : files)
        {
            if (file.isDirectory())
            {
                listAllFile(file);
            }
            else
            {
                fileList.add(file);
            }
        }
    }

    /**
     * Rebuilds the index from scratch and prints the elapsed time.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or written
     */
    public static void main(String[] args) throws Exception
    {
        File fileDir = new File("F:\\WorkSpace");
        File indexDir = new File("F:\\WorkSpace\\EclipseProjects\\luceneIndex");

        Analyzer luceneAnalyzer = new SmartChineseAnalyzer(Version.LUCENE_42);

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42, luceneAnalyzer);
        config.setOpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE);

        Directory fsDir = new SimpleFSDirectory(indexDir);
        IndexWriter indexWriter = new IndexWriter(fsDir, config);

        final long startTime;
        try
        {
            listAllFile(fileDir);
            startTime = System.currentTimeMillis();

            // Start from an empty index before re-adding every file.
            indexWriter.deleteAll();

            // Add one document per .java file.
            for (File txtFile : fileList)
            {
                if (txtFile.isFile() && txtFile.getName().endsWith(".java"))
                {
                    System.out.println(txtFile.getName());
                    FileInputStream fis;
                    try
                    {
                        fis = new FileInputStream(txtFile);
                    }
                    catch (FileNotFoundException fnfe)
                    {
                        // Best effort: a file that vanished or cannot be opened is skipped.
                        continue;
                    }

                    try
                    {
                        Document document = new Document();
                        Field fieldPath = new StringField("path", txtFile.getPath(), Field.Store.YES);
                        // The body is streamed from the file and decoded as GBK.
                        Field fieldBody = new TextField("body", new BufferedReader(new InputStreamReader(fis, "GBK")));

                        document.add(fieldPath);
                        document.add(fieldBody);
                        indexWriter.addDocument(document);
                    }
                    finally
                    {
                        fis.close();
                    }

                    System.out.println("被索引文件:" + txtFile.getCanonicalPath());
                }
            }

            // Merge the index down to at most 10 segments.
            indexWriter.forceMerge(10);
        }
        finally
        {
            // Close the writer even when indexing fails part-way.
            indexWriter.close();
        }

        // Report how long indexing took.
        long endTime = System.currentTimeMillis();
        System.out.println("索引耗费时间：" + (endTime - startTime) + " 毫秒!");
    }

}

package com.xiva.test.lucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Deletes documents from an existing index.
 *
 * <p>For every file in the source directory whose name ends with {@code u.txt}, all documents
 * whose {@code path} term equals that file's path are removed from the index.
 *
 * @author xiva
 * @version 2013-4-30
 */
public class IvIndexDelete
{
    /**
     * Runs the deletion and prints the elapsed time.
     *
     * @param args unused
     * @throws Exception if the source directory cannot be listed or the index cannot be
     *         opened or written
     */
    public static void main(String[] args) throws Exception
    {
        File fileDir = new File("E:\\data\\lucene");
        File indexDir = new File("E:\\data\\index");

        // Fail early, before touching the index, when the source directory is unusable;
        // listFiles() returns null in that case.
        File[] txtFiles = fileDir.listFiles();
        if (txtFiles == null)
        {
            throw new FileNotFoundException("Not a readable directory: " + fileDir.getPath());
        }

        Analyzer luceneAnalyzer = new SmartChineseAnalyzer(Version.LUCENE_42);

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42,
                luceneAnalyzer);
        config.setOpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode.APPEND);

        Directory fsDir = new SimpleFSDirectory(indexDir);
        IndexWriter indexWriter = new IndexWriter(fsDir, config);
        long startTime = System.currentTimeMillis();

        try
        {
            // Delete the indexed document of each matching file. Deleting only needs the
            // path term, so the file itself is never opened.
            for (File txtFile : txtFiles)
            {
                if (txtFile.isFile() && txtFile.getName().endsWith("u.txt"))
                {
                    indexWriter.deleteDocuments(new Term("path", txtFile.getPath()));

                    System.out.println("被删除索引文件:" + txtFile.getCanonicalPath());
                }
            }

            indexWriter.forceMerge(10);
        }
        finally
        {
            // Close the writer even when a deletion fails.
            indexWriter.close();
        }

        // Report how long the deletion took.
        long endTime = System.currentTimeMillis();
        System.out.println("删除索引耗费时间：" + (endTime - startTime) + " 毫秒!");
    }
}

package com.xiva.test.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/**
 * Re-indexes files in an existing index.
 *
 * <p>For every file in the source directory whose name ends with {@code u.txt}, the document
 * with the same {@code path} term is replaced by a freshly built one.
 */
public class IvIndexUpdate
{
    /**
     * Updates the index entry of each matching file and prints the elapsed time.
     *
     * @throws Exception if the source directory cannot be listed or the index cannot be
     *         opened or written
     */
    public static void updateIndex() throws Exception
    {
        File fileDir = new File("E:\\data\\lucene");
        File indexDir = new File("E:\\data\\index");

        // Fail early, before touching the index, when the source directory is unusable;
        // listFiles() returns null in that case.
        File[] txtFiles = fileDir.listFiles();
        if (txtFiles == null)
        {
            throw new FileNotFoundException("Not a readable directory: " + fileDir.getPath());
        }

        Analyzer luceneAnalyzer = new SmartChineseAnalyzer(Version.LUCENE_42);

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42, luceneAnalyzer);

        config.setOpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode.APPEND);

        Directory fsDir = new SimpleFSDirectory(indexDir);
        IndexWriter indexWriter = new IndexWriter(fsDir, config);
        long startTime = System.currentTimeMillis();

        try
        {
            // Replace the indexed document of each matching file.
            for (File txtFile : txtFiles)
            {
                if (txtFile.isFile() && txtFile.getName().endsWith("u.txt"))
                {
                    FileInputStream fis;
                    try
                    {
                        fis = new FileInputStream(txtFile);
                    }
                    catch (FileNotFoundException fnfe)
                    {
                        // Best effort: a file that vanished or cannot be opened is skipped.
                        continue;
                    }

                    try
                    {
                        Document document = new Document();
                        Field fieldPath = new StringField("path", txtFile.getPath(), Field.Store.YES);
                        // The body is streamed from the file and decoded as GBK.
                        Field fieldBody = new TextField("body", new BufferedReader(new InputStreamReader(fis, "GBK")));

                        document.add(fieldPath);
                        document.add(fieldBody);

                        indexWriter.updateDocument(new Term("path", txtFile.getPath()), document);
                    }
                    finally
                    {
                        fis.close();
                    }

                    System.out.println("被更新索引文件:" + txtFile.getCanonicalPath());
                }
            }

            indexWriter.forceMerge(10);
        }
        finally
        {
            // Close the writer even when an update fails.
            indexWriter.close();
        }

        // Report how long the update took.
        long endTime = System.currentTimeMillis();
        System.out.println("更新索引耗费时间：" + (endTime - startTime) + " 毫秒!");
    }

    /**
     * Command-line entry point; delegates to {@link #updateIndex()}.
     *
     * @param args unused
     * @throws Exception propagated from {@link #updateIndex()}
     */
    public static void main(String[] args) throws Exception
    {
        updateIndex();
    }
}

package com.xiva.test.lucene;

import java.io.File;
import java.io.IOException;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the {@code body} field of the index for a fixed query string and prints the stored
 * {@code path} of each returned hit.
 */
public class IvFileSearch
{
    /**
     * Parses the query, fetches at most 25 hits and prints their paths.
     *
     * @param args unused
     * @throws IOException if the index cannot be opened or read
     */
    public static void main(String[] args) throws IOException
    {
        String queryString = "索引";
        String field = "body";

        File indexDir = new File("F:\\WorkSpace\\EclipseProjects\\luceneIndex");
        IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir));
        try
        {
            IndexSearcher searcher = new IndexSearcher(reader);
            Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_42);

            Query query;
            try
            {
                long startTime = System.currentTimeMillis();
                QueryParser qp = new QueryParser(Version.LUCENE_42, field, analyzer);
                query = qp.parse(queryString);

                // The time printed here covers query parsing only.
                long endTime = System.currentTimeMillis();
                System.out.println("索引耗费时间：" + (endTime - startTime) + " 毫秒!");
            }
            catch (ParseException e)
            {
                e.printStackTrace();
                // Without a parsed query there is nothing to search for.
                return;
            }

            // Only the top 25 hits are fetched; paging would re-query for later windows.
            TopDocs docs = searcher.search(query, 25);

            // Iterate over the hits actually returned: totalHits counts every match in the
            // index and can exceed the length of scoreDocs.
            ScoreDoc[] scoreDocs = docs.scoreDocs;
            for (ScoreDoc scoreDoc : scoreDocs)
            {
                Document targetDoc = searcher.doc(scoreDoc.doc);
                String path = targetDoc.get("path");
                System.out.println("path:" + path);
            }
        }
        finally
        {
            reader.close();
        }
    }
}

PS：对于数据库中的数据，相信大家都有相应的方法去更新或者删除索引，比如实时更新，或者定时扫描表。另外，数据库本身也具有全文索引的特性，比如Oracle和MSSQL。

对于文件的操作，我的解决方法是：可以采用《利用JNA对文件进行监听之观察者模式》一文中给出的方法监听文件变化，从而更新或者删除索引。

分享到：

Eclipse安装git插件 | HttpClient4示例

2013-04-30 02:20
浏览 1568
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Lucene4全文索引示例

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Lucene4全文索引示例

评论

发表评论

相关推荐

Tesseract-OCR的简单使用与训练

JNA与动态链接库交互之使用结构体与结构体数组

ElasticSearch1.7.3 报错Root type mapping not empty after parsing!

TopN问题的算法实现

NIO之Socket通信

阻塞与非阻塞通讯

[续]Java调用DLL视频解帧，并保存第一关键帧到JPG格式文件

Jconsole连接之JVM设置

Lucene4.x SmartChineseAnalyzer添加扩展词

Java ORC

OSCache的对action响应的配置

Java PING一个IP地址 isReachable

Java后台返回easyUI的comboxTree数据

利用JDBC生成数据库表对应的Class

HttpClient4示例

http client

Java6新特性之动态生成Class，并加载

利用JNA对文件进行监听之观察者模式

改进后的归并排序，对大文件归并排序

Servlet ZIP文件下载

最近访客更多访客>>