lucene 3.6 全文检索 -

yuan_bin1990

浏览: 4439 次

最近访客更多访客>>

jjxlcsw

buyu123

zhudiyuan

u014619297

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

2012-10 ( 2)
更多存档...

lucene 3.6 全文检索

博客分类：

lucene

lucene

/**

*索引字段，可根据需要修改

*/

package com.beyondbit.entity;

import java.util.Date;

/**
 * One indexed / searched record. The fields mirror what is written to and
 * read back from the full-text index and may be adjusted as needed.
 */
public class ResultInfo {

  /** Primary-key id. */
  private Long id;

  /** Title. */
  private String ct_title;

  /** Summary. */
  private String ct_brief;

  /** Body content. */
  private String ct_content;

  /** Column name, or category name. */
  private String sj_name;

  /** Address of the static file. */
  private String url;

  /**
   * Column id under the call center, or category id under the knowledge
   * classification; this is the second level below either of them.
   */
  private String topid;

  /** Publication date of the knowledge item or information item. */
  private Date create_time;

  /** Whether the record is recommended; "1" means recommended. */
  private String recommend;

  /** Enterprise credit value. */
  private Integer bscredit;

  /** Creates an instance with every field left {@code null}. */
  public ResultInfo() {
  }

  /**
   * Creates an instance with every field supplied by the caller; each
   * argument is stored as given, without validation or copying.
   */
  public ResultInfo(
      Long id,
      String ct_title,
      String ct_brief,
      String ct_content,
      String sj_name,
      String url,
      String topid,
      Date create_time,
      String recommend,
      Integer bscredit) {
    this.id = id;
    this.ct_title = ct_title;
    this.ct_brief = ct_brief;
    this.ct_content = ct_content;
    this.sj_name = sj_name;
    this.url = url;
    this.topid = topid;
    this.create_time = create_time;
    this.recommend = recommend;
    this.bscredit = bscredit;
  }

  // Getters and setters are omitted here.

}

package com.beyondbit.util;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.transaction.annotation.Transactional;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.beyondbit.entity.ResultInfo;

/**
 * Maintains the Lucene index for {@link ResultInfo} records: builds the
 * Lucene document for a record and adds, deletes or replaces it in an index.
 *
 * <p>Each operation has two overloads. The one without a {@link Directory}
 * opens the index at {@code Constants.luceneIndex} and closes that directory
 * when done; the one taking a {@link Directory} leaves the directory open,
 * because the caller owns it.
 *
 * <p>NOTE(review): the index-writing methods carry
 * {@code @Transactional(readOnly = true)}; kept as in the original, but the
 * intent should be confirmed.
 */
public class LuceneContent {

  public static final String LUCENE_PATH = "lucene";

  /**
   * Builds the Lucene document for one record.
   *
   * <p>{@code id}, {@code url}, {@code date}, {@code recommend} and
   * {@code bscredit} are indexed as single untokenized terms; {@code title},
   * {@code brief}, {@code content}, {@code sjname} and {@code topid} are
   * analyzed. All fields are stored. Null text fields are indexed as the
   * empty string, null {@code recommend}/{@code bscredit} as {@code "0"}.
   *
   * @param c the record to convert; its id must not be null (it is
   *     dereferenced here), and its creation time is presumably required to
   *     be non-null by {@code DateTools.dateToString} as well
   * @return the document ready to be added to the index
   */
  public Document createDocument(ResultInfo c) {
    Document doc = new Document();
    // The id is kept as one exact term so delete() can match it with a Term.
    doc.add(new Field("id", c.getId().toString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("title", orDefault(c.getCt_title(), ""), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("brief", orDefault(c.getCt_brief(), ""), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("content", orDefault(c.getCt_content(), ""), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("url", orDefault(c.getUrl(), ""), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("sjname", orDefault(c.getSj_name(), ""), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("topid", orDefault(c.getTopid(), ""), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("date", DateTools.dateToString(c.getCreate_time(), Resolution.DAY),
        Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("recommend", orDefault(c.getRecommend(), "0"),
        Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    doc.add(new Field("bscredit", c.getBscredit() == null ? "0" : c.getBscredit().toString(),
        Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
    return doc;
  }

  /**
   * Adds one record to the index at {@code Constants.luceneIndex}.
   *
   * @param content the record to index
   * @throws IOException if the index cannot be opened or written
   */
  @Transactional(readOnly = true)
  public void createIndex(ResultInfo content) throws IOException {
    Directory dir = new SimpleFSDirectory(new File(Constants.luceneIndex));
    try {
      createIndex(content, dir);
    } finally {
      // This overload opened the directory, so it must release it.
      dir.close();
    }
  }

  /**
   * Adds one record to the index in the given directory.
   *
   * @param content the record to index
   * @param dir directory holding the index; left open for the caller
   * @throws IOException if the index cannot be opened or written
   */
  @Transactional(readOnly = true)
  public void createIndex(ResultInfo content, Directory dir) throws IOException {
    IndexWriter writer = openWriter(dir);
    try {
      writer.addDocument(createDocument(content));
    } finally {
      writer.close();
    }
  }

  /**
   * Removes the record with the given id from the index at
   * {@code Constants.luceneIndex}.
   *
   * @param contentId id of the record to remove
   * @throws IOException if the index cannot be opened or written
   * @throws ParseException declared for callers; not thrown by this code
   */
  @Transactional(readOnly = true)
  public void deleteIndex(Long contentId) throws IOException,
      ParseException {
    Directory dir = new SimpleFSDirectory(new File(Constants.luceneIndex));
    try {
      deleteIndex(contentId, dir);
    } finally {
      dir.close();
    }
  }

  /**
   * Removes the record with the given id from the index in the given
   * directory. Does nothing when the directory holds no index yet.
   *
   * @param contentId id of the record to remove
   * @param dir directory holding the index; left open for the caller
   * @throws IOException if the index cannot be opened or written
   * @throws ParseException declared for callers; not thrown by this code
   */
  @Transactional(readOnly = true)
  public void deleteIndex(Long contentId, Directory dir)
      throws IOException, ParseException {
    if (!IndexReader.indexExists(dir)) {
      return;
    }
    IndexWriter writer = openWriter(dir);
    try {
      delete(contentId, writer);
    } finally {
      writer.close();
    }
  }

  /**
   * Queues deletion of every document whose {@code id} term equals the
   * given id on an already open writer. The writer is not closed here.
   *
   * @param contentId id of the record to remove
   * @param writer open writer to issue the delete on
   * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if the delete cannot be recorded
   * @throws ParseException declared for callers; not thrown by this code
   */
  public void delete(Long contentId, IndexWriter writer)
      throws CorruptIndexException, IOException, ParseException {
    writer.deleteDocuments(new Term("id", contentId.toString()));
  }

  /**
   * Replaces one record in the index at {@code Constants.luceneIndex}.
   *
   * @param content the record to re-index
   * @throws IOException if the index cannot be opened or written
   * @throws ParseException declared for callers; not thrown by this code
   */
  public void updateIndex(ResultInfo content) throws IOException, ParseException {
    Directory dir = new SimpleFSDirectory(new File(Constants.luceneIndex));
    try {
      updateIndex(content, dir);
    } finally {
      dir.close();
    }
  }

  /**
   * Replaces one record in the index in the given directory: any existing
   * document with the same id is deleted, then the record is added again,
   * both through the same writer.
   *
   * @param content the record to re-index
   * @param dir directory holding the index; left open for the caller
   * @throws IOException if the index cannot be opened or written
   * @throws ParseException declared for callers; not thrown by this code
   */
  public void updateIndex(ResultInfo content, Directory dir) throws IOException,
      ParseException {
    // Checked before the writer is opened, since opening it may create the index.
    boolean exist = IndexReader.indexExists(dir);
    IndexWriter writer = openWriter(dir);
    try {
      if (exist) {
        delete(content.getId(), writer);
      }
      writer.addDocument(createDocument(content));
    } finally {
      writer.close();
    }
  }

  /** Opens a writer on {@code dir} configured for Lucene 3.6 with a new IKAnalyzer. */
  private IndexWriter openWriter(Directory dir) throws IOException {
    IndexWriterConfig iwconfig = new IndexWriterConfig(Version.LUCENE_36, new IKAnalyzer());
    return new IndexWriter(dir, iwconfig);
  }

  /** Returns {@code value}, or {@code fallback} when {@code value} is null. */
  private static String orDefault(String value, String fallback) {
    return value == null ? fallback : value;
  }
}

/**
 * Runs the full-text search for {@code keyword} and appends one page of
 * highlighted hits to {@code infos}.
 *
 * <p>The {@code content} field selects the search mode:
 * <ul>
 *   <li>null or empty: {@code keyword} is parsed against both title and content;</li>
 *   <li>"1": {@code keyword} must match the content field;</li>
 *   <li>"2": {@code keyword} must match the title field.</li>
 * </ul>
 * In modes "1" and "2", an {@code sjid} other than "0" must additionally
 * match the topid field; a null {@code sjid} is treated like "0" (no column
 * restriction). Any other mode yields no query and therefore no hits.
 *
 * <p>Search failures are printed and swallowed, as before, so the action
 * still returns {@code SUCCESS} with whatever was collected. The reader and
 * directory opened here are always closed.
 *
 * @return {@code SUCCESS}
 * @throws Exception if setting the page size fails; failures inside the
 *     search itself are caught
 */
public String execute() throws Exception {
  page.setPageSize(getCookieCount());
  Analyzer analyzer=new IKAnalyzer();
  FSDirectory directory=null;
  IndexReader reader=null;
  IndexSearcher searcher=null;
  try {
   directory=FSDirectory.open(new File(PropertyManager.getProperty("articleindex")));
   reader=IndexReader.open(directory);
   searcher=new IndexSearcher(reader);

   Query query=null;
   // A missing column id means "no column restriction", same as "0".
   boolean allColumns=sjid==null||"0".equals(sjid);
   if(content==null||content.equals("")){
    QueryParser parse = new MultiFieldQueryParser(Version.LUCENE_36,new String[]{"title","content"}, analyzer);
    query=parse.parse(keyword);
   }else if(content.equals("1")||content.equals("2")){
    // "1" searches the full text, "2" searches the title.
    String field=content.equals("1")?"content":"title";
    if(allColumns){
     query=MultiFieldQueryParser.parse(Version.LUCENE_36,new String[]{keyword},new String[]{field},new BooleanClause.Occur[]{BooleanClause.Occur.MUST},analyzer);
    }else{
     // Both the keyword and the column must match.
     query=MultiFieldQueryParser.parse(Version.LUCENE_36,new String[]{keyword,sjid},new String[]{field,"topid"},new BooleanClause.Occur[]{BooleanClause.Occur.MUST,BooleanClause.Occur.MUST},analyzer);
    }
   }
   if(query==null){
    // Unsupported search mode: nothing to search for.
    return SUCCESS;
   }

   // The collector rejects a size of 0, which maxDoc() returns for an empty index.
   TopScoreDocCollector topCollector = TopScoreDocCollector.create(Math.max(1,searcher.maxDoc()),false);
   searcher.search(query, topCollector);

   SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
   Highlighter highlighter = new Highlighter(simpleHtmlFormatter,new QueryScorer(query));
   ScoreDoc[] docs=topCollector.topDocs((page.getPageNo()-1)*page.getPageSize(),page.getPageSize()).scoreDocs;
   for (int i = 0; i < docs.length; i++) {
    Document doc=searcher.doc(docs[i].doc);
    String storedContent = doc.get("content");
    String storedTitle = doc.get("title");
    // Highlight only text that is actually present; otherwise fall back to the stored value.
    String markedContent = null;
    if(storedContent!=null&&!storedContent.equals("")){
     TokenStream contentTokens = analyzer.tokenStream("content", new StringReader(storedContent));
     markedContent = highlighter.getBestFragment(contentTokens,storedContent);
    }
    String markedTitle = null;
    if(storedTitle!=null&&!storedTitle.equals("")){
     TokenStream titleTokens = analyzer.tokenStream("title", new StringReader(storedTitle));
     markedTitle = highlighter.getBestFragment(titleTokens, storedTitle);
    }
    ResultInfo info = new ResultInfo();
    info.setId(Long.parseLong(doc.get("id")));
    info.setCt_title(markedTitle==null?storedTitle:markedTitle);
    info.setCt_content(markedContent==null?storedContent:markedContent);
    info.setUrl(doc.get("url")==null?"":doc.get("url"));
    info.setSj_name(doc.get("sjname")==null?"":doc.get("sjname"));
    info.setCreate_time(DateTools.stringToDate(doc.get("date")));
    infos.add(info);
   }
   page.setTotalCount(topCollector.getTotalHits());
   //subs=subjectMananger.getAllLeafSubject();
  }catch (Exception e) {
   e.printStackTrace();
  }finally {
   // The searcher was built on a caller-supplied reader, so the reader and
   // the directory have to be released here.
   try {
    if(searcher!=null){
     searcher.close();
    }
    if(reader!=null){
     reader.close();
    }
   }catch (Exception e) {
    e.printStackTrace();
   }finally {
    if(directory!=null){
     try {
      directory.close();
     }catch (Exception e) {
      e.printStackTrace();
     }
    }
   }
  }
  return SUCCESS;
}

还可以对查询结果用sort进行排序，如：

// Fields the keyword is parsed against.
String[] fields = { "title","brief","content" };

QueryParser parse = new MultiFieldQueryParser(Version.LUCENE_36,fields, analyzer);

// keyword is the term to search for.
Query query=parse.parse(keyword);

// Sort by recommend in descending order first (reverse = true; false would be ascending),
// then by relevance score.
// NOTE(review): with SortField.SCORE the "title"/"content" field names are presumably
// ignored and both entries sort by the overall query score - confirm.
Sort sort=new Sort(new SortField[]{new SortField("recommend",SortField.INT,true),new SortField("title",SortField.SCORE,false),new SortField("content",SortField.SCORE,false)});
// A hit limit of 0 is rejected, and maxDoc() is 0 for an empty index.
TopFieldDocs topFieldDocs = searcher.search(query,Math.max(1,searcher.maxDoc()), sort);

// Set up highlighting of the matched terms.
SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
Highlighter highlighter = new Highlighter(simpleHtmlFormatter,new QueryScorer(query));
ScoreDoc[] docs=topFieldDocs.scoreDocs;
// Clamp the page window to the hits actually returned.
int firstHit=Math.max(0,(page.getPageNo()-1)*page.getPageSize());
int lastHit=Math.min(page.getPageNo()*page.getPageSize(),docs.length);
for (int i = firstHit; i < lastHit; i++) {
 Document doc=searcher.doc(docs[i].doc);
 String content2 = doc.get("content");
 String title2 = doc.get("title");
 String content=null;
 String title=null;
 if(content2!=null&&!content2.equals("")){
  TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(content2));
  // Use one fragment as long as the whole text so the caller controls the
  // displayed length; otherwise the highlighter cuts the text to its own length.
  highlighter.setTextFragmenter(new SimpleFragmenter(content2.length()));
  content = highlighter.getBestFragment(tokenStream,content2);
 }
 if(title2!=null&&!title2.equals("")){
  TokenStream tokenStream1 = analyzer.tokenStream("title", new StringReader(title2));
  // Size the fragmenter for the title itself instead of reusing the one
  // left over from the content of this or a previous hit.
  highlighter.setTextFragmenter(new SimpleFragmenter(title2.length()));
  title = highlighter.getBestFragment(tokenStream1, title2);
 }
 info = new ResultInfo();
 info.setId(Long.parseLong(doc.get("id")));
 info.setCt_title(title==null?title2:title);
 info.setCt_content(content==null?content2:content);
 info.setUrl(doc.get("url")==null?"":doc.get("url"));
 // The column name is stored under "sjname" when the document is indexed.
 info.setSj_name(doc.get("sjname")==null?"":doc.get("sjname"));
 info.setCreate_time(DateTools.stringToDate(doc.get("date")));
 info.setRecommend(doc.get("recommend")==null?"":doc.get("recommend"));
 infos.add(info);
 info = null;
}

IKAnalyzer3.2.5Stable.jar (1.1 MB)
下载次数: 55

lucene-highlighter-3.6.0.jar (87.1 KB)
下载次数: 60

lucene-core-3.6.0.jar (1.5 MB)
下载次数: 57

lucene-memory-3.6.0.jar (29.1 KB)
下载次数: 51

GJSearchDemo.rar (1.6 KB)
下载次数: 44

分享到：

axis2 发布webservice

2012-10-17 14:34
浏览 2271
评论(0)
分类:互联网
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene 3.6 全文检索

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene 3.6 全文检索

评论

发表评论

相关推荐

最近访客更多访客>>