`
bevis.cn
  • 浏览: 153913 次
  • 性别: Icon_minigender_1
  • 来自: 杭州
社区版块
存档分类
最新评论

Lucene2.0中最常用的基本操作

阅读更多
<script>function StorePage(){d=document;t=d.selection?(d.selection.type!='None'?d.selection.createRange().text:''):(d.getSelection?d.getSelection():'');void(keyit=window.open('http://www.365key.com/storeit.aspx?t='+escape(d.title)+'&u='+escape(d.location.href)+'&c='+escape(t),'keyit','scrollbars=no,width=475,height=575,left=75,top=20,status=no,resizable=yes'));keyit.focus();}</script>

我已经在两个项目中接触过 Lucene。由于之前使用的 Lucene 版本是 2.0,这里整理一下 Lucene 2.0 中的一些常用操作:

package com.wisekernel.em.business.index.impl;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Maintains and searches the Lucene (2.0 API) full-text index of magazine PDF pages.
 *
 * <p>Each non-advertisement {@code PdfPage} of a periodical becomes one Lucene document. The
 * class can add, delete and rebuild the documents of a periodical, delete a single page, rebuild
 * every periodical of a magazine (also available through {@link #run()} so it can be handed to a
 * thread), and run a paged search.
 *
 * <p>The mutating methods are {@code synchronized} on this instance only.
 * NOTE(review): several instances pointing at the same index folder are not coordinated by this
 * class, and the delete/add paths force-release the Lucene write lock; confirm that only one
 * instance writes at a time.
 */
public class IndexManager implements IndexIF, Runnable {

    private PathUtil pathUtil;

    private PersistenceIF persistence;

    private String magazineId;

    private final Log log = LogFactory.getLog(this.getClass());

    /** Creates an unconfigured manager; collaborators are supplied through the setters. */
    public IndexManager() {
    }

    /**
     * Creates a manager that rebuilds the index of one magazine when {@link #run()} is invoked.
     *
     * @param mid         id of the magazine processed by {@link #run()}
     * @param persistence persistence facade used to load pages and periodicals
     * @param pathUtil    resolver for the index folder and the extracted-text folder
     */
    public IndexManager(String mid, PersistenceIF persistence, PathUtil pathUtil) {
        this.magazineId = mid;
        this.persistence = persistence;
        this.pathUtil = pathUtil;
    }

    /**
     * Adds all indexable pages of a periodical to the existing index and optimizes it.
     *
     * <p>An {@link IOException} raised while opening or writing the index is logged and not
     * rethrown (as before); the write lock is then force-released so later calls are not blocked
     * by a stale lock. Only a failure to close the directory propagates.
     *
     * @param perodicalId id of the periodical whose pages are indexed
     * @throws IOException if closing the index directory fails
     */
    public synchronized void addIndex(final String perodicalId) throws IOException {
        log.info("in addIndex");
        Directory indexDir = null;
        IndexWriter indexWriter = null;
        try {
            indexDir = getDirectory();
            indexWriter = new IndexWriter(indexDir, getAnalyzer(), false);
            addDocument(indexWriter, perodicalId);
            indexWriter.optimize();
            indexWriter.close();
        } catch (IOException e) {
            log.error("addIndex failed for periodicalId:" + perodicalId, e);
            // Best-effort cleanup: release the writer's resources, then clear a possibly stale lock.
            closeQuietly(indexWriter);
            forceUnlockQuietly(indexDir);
        } finally {
            // indexDir is still null when getDirectory() itself failed.
            if (indexDir != null) {
                indexDir.close();
            }
        }
        // Retained from the original. NOTE(review): presumably meant to release leaked file
        // handles; confirm whether it is still needed now that resources are closed explicitly.
        System.gc();
    }

    /**
     * Removes every indexed page of a periodical and optimizes the index.
     *
     * @param perodicalId id of the periodical whose documents are deleted
     * @throws IOException if the index cannot be opened, modified or closed
     */
    public synchronized void deleteIndex(String perodicalId) throws IOException {
        this.log.info("deleteIndex(String perodicalId) begin..");
        log.info("perodicalId:" + perodicalId);
        deleteByTerm(new Term("periodicalId", perodicalId));
        this.log.info("deleteIndex(String perodicalId) end..");
    }

    /**
     * Rebuilds the index entries of a periodical by deleting and re-adding them.
     *
     * @param perodicalId id of the periodical to re-index
     * @throws IOException if deleting fails or the directory cannot be closed after adding
     */
    public synchronized void modifyIndex(String perodicalId) throws IOException {
        this.log.info("modifyIndex(String perodicalId) begin..");
        deleteIndex(perodicalId);
        addIndex(perodicalId);
        log.info("modifyIndex(String perodicalId) end..");
    }

    /**
     * Deletes all documents matching {@code term}, then optimizes the index. Reader, writer and
     * directory are closed even when a step fails.
     */
    private void deleteByTerm(Term term) throws IOException {
        Directory indexDir = getDirectory();
        try {
            IndexReader reader = IndexReader.open(indexDir);
            try {
                // Retained from the original: force-release any write lock before deleting.
                // NOTE(review): this removes a lock even if another writer legitimately holds it.
                IndexReader.unlock(indexDir);
                int num = reader.deleteDocuments(term);
                log.info("delete num:" + num);
            } finally {
                // Closed on the normal path too, so a failure here reaches the caller.
                reader.close();
            }

            IndexWriter indexWriter = new IndexWriter(indexDir, getAnalyzer(), false);
            try {
                indexWriter.optimize();
            } finally {
                indexWriter.close();
            }
        } finally {
            indexDir.close();
        }
    }

    /**
     * Opens the index directory, creating the folder on disk if it does not exist yet.
     * The directory is opened with {@code create=false}, so existing index files are kept.
     */
    private Directory getDirectory() throws IOException {
        File homePath = new File(pathUtil.getIndexFolderPath().getFile().getAbsolutePath());
        if (!homePath.exists()) {
            homePath.mkdirs();
        }
        return FSDirectory.getDirectory(homePath.getAbsolutePath(), false);
    }

    /** Returns the analyzer used when writing to the index. */
    private static Analyzer getAnalyzer() {
        XAnalyzer analyzer = XFactory.getWriterAnalyzer();
        return analyzer;
    }

    /**
     * Adds one document per indexable page of the periodical to {@code indexWriter}.
     *
     * <p>Pages flagged as advertisement pages, post-ads, or having a non-zero cross-page number
     * are skipped. Any exception stops the loop; it is logged and not rethrown, so pages added
     * before the failure stay in the writer.
     */
    @SuppressWarnings("unchecked")
    private void addDocument(IndexWriter indexWriter, String periodicalId) {
        log.info("begin add docuemnt");
        log.info("periodicalId:" + periodicalId);
        Collection pdfCol = getPagesByPId(periodicalId);
        // One formatter for the whole batch; it is local, so there is no sharing across threads.
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
        try {
            for (Iterator iter = pdfCol.iterator(); iter.hasNext();) {
                PdfPage element = (PdfPage) iter.next();
                // Decide whether to skip before doing any per-page work.
                if (element.isSetadpage() || element.isPostad()
                        || element.getCrosspagenum() != 0) {
                    continue;
                }
                indexWriter.addDocument(buildDocument(element, periodicalId, dateFormat));
                log.info("add one document");
            }
            log.info("Build Index Success...");
        } catch (Exception e) {
            log.error("add document error", e);
        }
    }

    /** Builds the Lucene document (stored ids, page numbers, text, magazine name, date) for one page. */
    private Document buildDocument(PdfPage element, String periodicalId,
            SimpleDateFormat dateFormat) {
        String pathstr = pathUtil.getPdfTxtPath().getFile().getAbsolutePath()
                + File.separator + element.getMaterial().getNewname();

        Document document = new Document();
        document.add(new Field("periodicalId", periodicalId,
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        document.add(new Field("materialId", element.getMaterial().getId(),
                Field.Store.YES, Field.Index.NO));
        document.add(new Field("pdfPageId", element.getId(),
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        document.add(new Field("pagenum", String.valueOf(element.getPagenum()),
                Field.Store.YES, Field.Index.NO));
        document.add(new Field("crossnum", String.valueOf(element.getCrosspagenum()),
                Field.Store.YES, Field.Index.NO));

        String currentTxt = readTxt(pathstr, element.getPagenum());
        // Field.Index.TOKENIZED = the value is run through the analyzer before indexing.
        document.add(new Field("isad", Boolean.toString(element.isSetadpage()),
                Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("pdfPageText", currentTxt,
                Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("magazineName",
                element.getMaterial().getMagazine().getMagazinename(),
                Field.Store.YES, Field.Index.TOKENIZED));

        String publishDate = dateFormat.format(element.getPeriodical().getPublishDate());
        // NOTE(review): getByPage sorts on this field while it is indexed TOKENIZED; confirm the
        // analyzer keeps the date as a single term, otherwise sorting may misbehave.
        document.add(new Field("publishDate", publishDate,
                Field.Store.YES, Field.Index.TOKENIZED));
        return document;
    }

    /**
     * Reads the extracted text of one page from {@code <pathDir>/<pagenum>.txt}.
     *
     * @return the file content with every line terminated by {@code "\n"}; an empty string when
     *         {@code pathDir} is not a directory; whatever was read so far when reading fails
     */
    private String readTxt(String pathDir, int pagenum) {
        StringBuilder buffer = new StringBuilder();
        if (!new File(pathDir).isDirectory()) {
            return buffer.toString();
        }
        File realTextFile = new File(pathDir + File.separator + pagenum + ".txt");
        BufferedReader reader = null;
        try {
            // FileReader decodes with the platform default charset, as the original did.
            reader = new BufferedReader(new FileReader(realTextFile));
            String line;
            while ((line = reader.readLine()) != null) {
                buffer.append(line).append("\n");
            }
        } catch (IOException ex) {
            // Best effort: a missing or unreadable text file yields an empty/partial page text.
            log.warn("failed to read page text from " + realTextFile, ex);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ex) {
                    log.warn("failed to close " + realTextFile, ex);
                }
            }
        }
        return buffer.toString();
    }

    /** Loads the pages whose periodical id equals {@code pid}. */
    private Collection getPagesByPId(String pid) {
        PdfPageFilter ppf = new PdfPageFilter();
        ppf.addEqualTo(ppf.PERIODICAL_ID, pid);
        return this.persistence.query(ppf);
    }

    public PathUtil getPathUtil() {
        return pathUtil;
    }

    public void setPathUtil(PathUtil pathUtil) {
        this.pathUtil = pathUtil;
    }

    /**
     * Searches the index and fills {@code page} with the requested slice of results.
     *
     * <p>Results are sorted by the {@code publishDate} field. On success the page receives its
     * items, the total hit count and the total number of pages. A failure during the search is
     * logged and the page is returned as far as it was filled.
     *
     * @param page            paging request; supplies start index and page size, receives results
     * @param magazineTitle   optional substring to match in the magazine name
     * @param magazineContext optional substring to match in the page text
     * @return the same {@code page} instance
     * @throws IOException      if the index directory or searcher cannot be opened
     * @throws RuntimeException with message {@code "em.indexNotFind"} if the index files are missing
     */
    @SuppressWarnings("unchecked")
    public Page getByPage(Page page, String magazineTitle, String magazineContext)
            throws IOException {
        this.log.info("Index getByPage(Page page) begin..");
        File homePath = new File(pathUtil.getIndexFolderPath().getFile().getAbsolutePath());
        Directory indexDir = FSDirectory.getDirectory(homePath.getAbsolutePath(), false);
        IndexSearcher indexSearcher = null;
        try {
            try {
                indexSearcher = new IndexSearcher(indexDir);
            } catch (FileNotFoundException e) {
                throw new RuntimeException("em.indexNotFind", e);
            }
            try {
                Query query = getQuery(magazineTitle, magazineContext);
                Sort sort = new Sort(new SortField("publishDate", false));
                Hits hits = indexSearcher.search(query, sort);

                int start = page.getStartAtIndex();
                int end = start + page.getPs();
                Collection<Object[]> items = processHits(hits, start, end);
                page.setItems(items);
                page.setCount(hits.length());

                // Ceiling division: a partially filled last page still counts as a page.
                int totalPages = page.getCount() / page.getPs();
                if (page.getCount() % page.getPs() != 0) {
                    totalPages = totalPages + 1;
                }
                page.setTotalPage(totalPages);
            } catch (Exception e) {
                // Search problems never reached the caller before; keep that, but leave a trace.
                log.error("Index getByPage(Page page) failed", e);
            }
        } finally {
            closeQuietly(indexSearcher);
            closeQuietly(indexDir);
        }
        // Retained from the original. NOTE(review): confirm whether this is still needed.
        System.gc();
        log.info("Index getByPage(Page page) end..");
        return page;
    }

    /**
     * Converts the hits in {@code [start, end)} into {@code SearchedBean}s; {@code end} is capped
     * at the number of hits.
     */
    private Collection processHits(Hits hits, int start, int end) throws IOException {
        if (end >= hits.length()) {
            end = hits.length();
        }
        Collection<SearchedBean> articles = new ArrayList<SearchedBean>();
        for (int i = start; i < end; i++) {
            articles.add(getArticle(hits.doc(i)));
        }
        return articles;
    }

    /** Copies the stored fields of a document into a bean; the page text is cut to 200 chars. */
    private SearchedBean getArticle(Document doc) {
        SearchedBean searchedBean = new SearchedBean();
        searchedBean.setPeriodicalId(doc.get("periodicalId"));
        searchedBean.setCrossnum(doc.get("crossnum"));
        searchedBean.setMagazineName(doc.get("magazineName"));
        searchedBean.setMaterialId(doc.get("materialId"));
        searchedBean.setPagenum(doc.get("pagenum"));
        searchedBean.setPdfPageId(doc.get("pdfPageId"));
        searchedBean.setIsad(doc.get("isad"));
        String str = doc.get("pdfPageText");
        // Guard against a document without stored page text.
        if (str != null && str.length() > 200) {
            str = str.substring(0, 200);
        }
        searchedBean.setPdfPageText(str);
        searchedBean.setPublishDate(doc.get("publishDate"));
        return searchedBean;
    }

    /**
     * Builds the search query: optional "contains" clauses on magazine name and page text, plus a
     * mandatory clause restricting results to documents whose {@code isad} field is
     * {@code "false"}.
     */
    private Query getQuery(String magazineTitile, String magazineContext) {
        BooleanQuery query = new BooleanQuery();
        addContainsClause(query, "magazineName", magazineTitile);
        addContainsClause(query, "pdfPageText", magazineContext);
        query.add(new TermQuery(new Term("isad", "false")), BooleanClause.Occur.MUST);
        return query;
    }

    /**
     * Adds a mandatory {@code *text*} wildcard clause on {@code field}; null or blank text adds
     * nothing. The text is trimmed before the blank check so whitespace-only input does not
     * produce a match-everything {@code **} pattern.
     */
    private static void addContainsClause(BooleanQuery query, String field, String text) {
        if (text == null) {
            return;
        }
        String trimmed = text.trim();
        if (trimmed.length() == 0) {
            return;
        }
        Query clause = new WildcardQuery(new Term(field, "*" + trimmed + "*"));
        query.add(clause, BooleanClause.Occur.MUST);
    }

    public PersistenceIF getPersistence() {
        return persistence;
    }

    public void setPersistence(PersistenceIF persistence) {
        this.persistence = persistence;
    }

    /**
     * Removes the document of a single PDF page and optimizes the index.
     *
     * @param pdfPageId id of the page whose document is deleted
     * @throws IOException if the index cannot be opened, modified or closed
     */
    public synchronized void deleteIndexByPage(String pdfPageId) throws IOException {
        this.log.info("deleteIndex(String pdfPageId) begin..");
        log.info("pdfPageId:" + pdfPageId);
        deleteByTerm(new Term("pdfPageId", pdfPageId));
        this.log.info("deleteIndex(String pdfPageId) end..");
    }

    /**
     * Rebuilds the index entries of every periodical that belongs to a magazine.
     *
     * @param magazineId id of the magazine whose periodicals are re-indexed
     * @throws IOException if re-indexing one of the periodicals fails
     */
    public void modifyIndexByMagazine(String magazineId) throws IOException {
        // Lookup retained from the original although its result is unused.
        // NOTE(review): confirm whether the call matters for a side effect before removing it.
        Magazine m = (Magazine) this.persistence.get(Magazine.class, magazineId);
        PeriodicalFilter pfilter = new PeriodicalFilter();
        pfilter.addEqualTo(pfilter.MAGAZINE_ID, magazineId);
        Collection periodicalList = this.persistence.query(pfilter);
        for (Iterator it = periodicalList.iterator(); it.hasNext();) {
            Periodical p = (Periodical) it.next();
            this.modifyIndex(p.getId());
        }
    }

    /** Rebuilds the index of the configured magazine; an {@link IOException} is logged, not thrown. */
    public void run() {
        log.info("thread begin===================");
        try {
            modifyIndexByMagazine(magazineId);
        } catch (IOException e) {
            log.error("modifyIndexByMagazine failed for magazineId:" + magazineId, e);
        }
    }

    public String getMagazineId() {
        return magazineId;
    }

    public void setMagazineId(String magazineId) {
        this.magazineId = magazineId;
    }

    /** Closes the writer if present, logging instead of throwing. */
    private void closeQuietly(IndexWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (Exception ex) {
            log.warn("failed to close IndexWriter", ex);
        }
    }

    /** Closes the searcher if present, logging instead of throwing. */
    private void closeQuietly(IndexSearcher searcher) {
        if (searcher == null) {
            return;
        }
        try {
            searcher.close();
        } catch (Exception ex) {
            log.warn("failed to close IndexSearcher", ex);
        }
    }

    /** Closes the directory if present, logging instead of throwing. */
    private void closeQuietly(Directory directory) {
        if (directory == null) {
            return;
        }
        try {
            directory.close();
        } catch (Exception ex) {
            log.warn("failed to close index Directory", ex);
        }
    }

    /** Force-releases the index write lock if a directory is available, logging any failure. */
    private void forceUnlockQuietly(Directory directory) {
        if (directory == null) {
            return;
        }
        try {
            IndexReader.unlock(directory);
        } catch (Exception ex) {
            log.error("failed to unlock index directory", ex);
        }
    }
}

分享到:
评论

相关推荐

    Local Lucene-开源

    Lucene是Java领域内最知名的全文搜索引擎库,而Local Lucene则是在其基础上,针对地理数据进行的特化处理,使得在海量地理信息中快速定位和查询变得轻而易举。 首先,我们需要理解Apache Lucene的基本原理。Lucene...

    querydsl使用文档

    描述中的“描述了与maven自动生成、jpa等的使用方法”强调了文档内容涵盖了与Maven构建工具集成、Java Persistence API(JPA)查询、以及可能的其他技术(如JDO、SQL、Lucene、Hibernate Search)的集成使用指南。...

    chm格式文档(包含JAVA所有基本文档)

    【描述】中提到的"chm格式文档 包含基本常用的CHM帮助文档",暗示了这个压缩包内含有的资料是Java初学者和开发者必备的参考资料。"下了肯定值,不下后悔"表达了这些文档的重要性和实用性,对于想要深入理解和掌握...

    solr6.5英文版操作说明文档

    - **配置问题**:解决配置文件中出现的常见问题。 #### 十二、社区支持与资源 - **官方论坛**:获取最新动态和技术支持。 - **文档资料**:查阅详细的用户手册和技术文档。 - **示例代码**:学习实际应用案例中的 ...

    Liferay_Portal_门户解决方案

    Portlet 是 Portal 中最重要的组件,负责在 Portal 中呈现信息内容,有相应的生命周期。通过自定义 Portlet,用户很容易定义个性化的 Portal 页面。 5. Liferay portal 工作原理 Portal 系统根据需要由一个或者多...

    千锋2018elasticsearch笔记修改.docx

    Elasticsearch使用标准的HTTP方法如GET、POST、PUT和DELETE来执行常见的CRUD操作。 #### 1.3 架构 **Gateway层:** 负责存储索引文件,支持多种存储类型,如本地文件系统、Hadoop HDFS、Amazon S3等。该层确保数据...

    jeefuseMDA用户开发手册1

    Lucene是核心库,提供了索引和搜索的基本功能;Solr在其基础上提供了更高级的特性,如集群、分布式搜索和更友好的API。 综上所述,《jeefuseMDA用户开发手册1》覆盖了从底层数据访问到用户界面展示,再到网络通信和...

    pdfbox所有jar包以及源码

    这个压缩包中包含的"commons-logging.jar"是一个常用的日志抽象库,它允许你在程序中使用统一的日志接口,而实际的日志实现可以通过配置选择log4j、Java内置的日志系统或其他日志框架。 在使用这些jar包之前,建议...

    最新Java-培训大纲.docx

    - **Windows到Linux**:学员将从常见的Windows环境过渡到更适用于服务器端开发的Linux系统,学习其基本操作和管理,以适应企业级开发的需求。 2. **中间件**: - **Tomcat到JBoss**:学习如何配置和使用两种流行...

    微信公众平台应用开发:方法、技巧与案例.(机械工业.柳峰)

     9.3.2 MySQL的常用操作 228  9.3.3 JDBC的基本使用 232  9.3.4 案例:使用JDBC查询数据 234  9.4 BAE的MySQL服务 235  9.4.1 创建数据库 235  9.4.2 使用phpMyAdmin操作MySQL 237  9.4.3 案例:使用...

    JAVA上百实例源码以及开源项目源代码

    2个目标文件 摘要:Java源码,文件操作,TCP,服务器 Tcp服务端与客户端的JAVA实例源代码,一个简单的Java TCP服务器端程序,别外还有一个客户端的程序,两者互相配合可以开发出超多的网络程序,这是最基础的部分。...

    CSDN TUP第二期:王鹏云演讲PPT

    - **Zoie**:基于Lucene构建,来自LinkedIn,适用于中小规模的实时搜索需求。 - **Sphinx**:自1.10-beta版本起支持实时搜索功能,被广泛应用于craigslist、netlog等网站。 通过以上分析可以看出,实时搜索不仅是一...

    JeeSite开发说明文档

    - **常用工具封装**:提供了丰富的工具类封装,如日志工具、缓存工具、数据字典工具等,以及常用的前端标签(taglib),便于开发者快速调用这些工具和组件。 - **兼容性**:完全兼容主流浏览器,包括IE6/IE7+/Fire...

    史上最好传智播客就业班.net培训教程60G 不下会后悔

    常用数据结构(List、Dictionary、Array)、多态、常用设计模式、反射、常用.net类库、泛型、IO流、委托事件、正则表达式、XML、反射、GC等。 2、数据库开发及ADO.Net(6天) 核心技术课程 数据库开发基础、...

    opencms资料

    通过对简单页面的创建和配置,可以对OpenCMS的基本操作有一个初步的认识。 #### 四、OpenCMS XML内容管理 ##### 4.1 创建一个简单的XSD - **定义XML Schema**:用于描述XML文档的结构。 - **创建XML内容类型**:...

    flora-solr:用于Flora的Solr连接

    提到的“执照”通常是指软件的许可证,这可能关乎到该组件的使用、修改和分发规则,对于开源项目而言,常见的有MIT、Apache 2.0等许可方式。 从标签“JavaScript”我们可以推断,flora-solr可能是用JavaScript编写...

Global site tag (gtag.js) - Google Analytics