- 浏览: 106565 次
- 性别:
- 来自: 吉林
文章分类
最新评论
lucene第二步,lucene搜索
出自:http://blog.csdn.net/wxwzy738/article/details/8799656 的整理
1、工程结构
2、查询语法代码
[java]
view plaincopy
- packageorg.itat.index;
- importjava.io.File;
- importjava.io.IOException;
- importjava.io.StringReader;
- importjava.text.ParseException;
- importjava.text.SimpleDateFormat;
- importjava.util.Date;
- importjava.util.HashMap;
- importjava.util.Map;
- importorg.apache.lucene.analysis.Analyzer;
- importorg.apache.lucene.analysis.TokenStream;
- importorg.apache.lucene.analysis.standard.StandardAnalyzer;
- importorg.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- importorg.apache.lucene.document.Document;
- importorg.apache.lucene.document.Field;
- importorg.apache.lucene.document.NumericField;
- importorg.apache.lucene.index.CorruptIndexException;
- importorg.apache.lucene.index.IndexReader;
- importorg.apache.lucene.index.IndexWriter;
- importorg.apache.lucene.index.IndexWriterConfig;
- importorg.apache.lucene.index.Term;
- importorg.apache.lucene.queryParser.QueryParser;
- importorg.apache.lucene.search.BooleanClause.Occur;
- importorg.apache.lucene.search.BooleanQuery;
- importorg.apache.lucene.search.FuzzyQuery;
- importorg.apache.lucene.search.IndexSearcher;
- importorg.apache.lucene.search.NumericRangeQuery;
- importorg.apache.lucene.search.PhraseQuery;
- importorg.apache.lucene.search.PrefixQuery;
- importorg.apache.lucene.search.Query;
- importorg.apache.lucene.search.ScoreDoc;
- importorg.apache.lucene.search.TermQuery;
- importorg.apache.lucene.search.TermRangeQuery;
- importorg.apache.lucene.search.TopDocs;
- importorg.apache.lucene.search.WildcardQuery;
- importorg.apache.lucene.store.Directory;
- importorg.apache.lucene.store.FSDirectory;
- importorg.apache.lucene.store.LockObtainFailedException;
- importorg.apache.lucene.util.Version;
- importorg.wltea.analyzer.lucene.IKAnalyzer;
- publicclassSearcherUtil{
- privateDirectorydirectory;
- privateAnalyzeranalyzer=newIKAnalyzer();
- privateIndexReaderreader;
- privateString[]ids={"1","2","3","4","5","6"};
- privateString[]emails={"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
- privateString[]contents={
- "welcometovisitedthespace,Ilikebook",
- "helloboy,Ilikepingpengball",
- "mynameisccIlikegame",
- "Ilikefootball",
- "IlikefootballandIlikebasketballtoo",
- "Ilikemovieandswim"
- };
- privateDate[]dates=null;
- privateint[]attachs={2,3,1,4,5,5};
- privateString[]names={"zhangsan","lisi","john","jetty","mike","jake"};
- privateMap<String,Float>scores=newHashMap<String,Float>();
/**
 * Opens the on-disk index directory, then prepares the sample dates and the
 * per-domain boost table. If opening the directory fails the exception is
 * printed and the remaining setup steps are skipped.
 */
public SearcherUtil() {
    try {
        File indexDir = new File("F:\\Workspaces\\lucenes\\02_lucene_searcher\\index");
        directory = FSDirectory.open(indexDir);
        setDates();
        scores.put("itat.org", 2.0f);
        scores.put("zttc.edu", 1.5f);
        // The sample index is not rebuilt here; call index() explicitly when needed.
    } catch (IOException openFailure) {
        openFailure.printStackTrace();
    }
}
- privatevoidsetDates(){
- SimpleDateFormatsdf=newSimpleDateFormat("yyyy-MM-dd");
- try{
- dates=newDate[ids.length];
- dates[0]=sdf.parse("2010-02-19");
- dates[1]=sdf.parse("2012-01-11");
- dates[2]=sdf.parse("2011-09-19");
- dates[3]=sdf.parse("2010-12-22");
- dates[4]=sdf.parse("2012-01-01");
- dates[5]=sdf.parse("2011-05-19");
- }catch(ParseExceptione){
- e.printStackTrace();
- }
- }
- publicvoidindex(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- writer.deleteAll();
- Documentdoc=null;
- for(inti=0;i<ids.length;i++){
- doc=newDocument();
- doc.add(newField("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(newField("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(newField("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(newField("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- //存储数字
- doc.add(newNumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));
- //存储日期
- doc.add(newNumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
- Stringet=emails[i].substring(emails[i].lastIndexOf("@")+1);
- if(scores.containsKey(et)){
- doc.setBoost(scores.get(et));
- }else{
- doc.setBoost(0.5f);
- }
- writer.addDocument(doc);
- }
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicIndexSearchergetSearcher(){
- try{
- if(reader==null){
- reader=IndexReader.open(directory);
- }else{
- IndexReadertr=IndexReader.openIfChanged(reader);
- if(tr!=null){
- reader.close();
- reader=tr;
- }
- }
- returnnewIndexSearcher(reader);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- returnnull;
- }
- publicIndexSearchergetSearcher(Directorydirectory){
- try{
- if(reader==null){
- reader=IndexReader.open(directory);
- }else{
- IndexReadertr=IndexReader.openIfChanged(reader);
- if(tr!=null){
- reader.close();
- reader=tr;
- }
- }
- returnnewIndexSearcher(reader);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- returnnull;
- }
- publicvoidsearchByTerm(Stringfield,Stringname,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- Queryquery=newTermQuery(newTerm(field,name));
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByTermToken(Stringfield,Stringname,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- //Queryquery=newTermQuery(newTerm(field,name));
- //当用户输入两个关键字时,QueryParser默认它们之间的关系为“或”关系
- //下面这么写的话在对用户输入进行扫描时,就会用空格分开的关键字理解为“与”,
- //其实也就是构建了一个“与”关系的布尔型查询
- //parser.setDefaultOperator(Operator.AND);
- QueryParserparser=newQueryParser(Version.LUCENE_35,field,analyzer);
- Stringk=analyzerKey(name);
- Queryquery=parser.parse(name);
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(Exceptione){
- e.printStackTrace();
- }
- }
- privateStringanalyzerKey(Stringkey){
- //StandardAnalyzeranalyzer=newStandardAnalyzer(Version.LUCENE_35);
- StringReaderreader=newStringReader(key);
- TokenStreamtokenStream=analyzer.tokenStream("",reader);
- CharTermAttributetermattr=tokenStream.addAttribute(CharTermAttribute.class);
- StringBuildersb=newStringBuilder();
- try{
- while(tokenStream.incrementToken()){
- Stringk=termattr.toString();
- sb.append(k).append("");
- }
- }catch(IOExceptione){
- e.printStackTrace();
- }
- key=sb.toString().trim();
- key=key.replaceAll("\\s+","AND");
- returnsb.toString();
- }
- publicvoidprintDocument(IndexSearchersearcher,TopDocstds){
- System.out.println("共查询了【"+tds.totalHits+"】条");
- for(ScoreDocsd:tds.scoreDocs){
- try{
- Documentdoc=searcher.doc(sd.doc);
- System.out.println("filename:"+doc.get("filename"));
- System.out.println("path:"+doc.get("path"));
- System.out.println("date:"+doc.get("date"));
- System.out.println("size:"+doc.get("size"));
- System.out.println("content:"+doc.get("content"));
- System.out.println("-------------------------------------------");
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidsearchByTermRange(Stringfield,Stringstart,Stringend,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- Queryquery=newTermRangeQuery(field,start,end,true,true);
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *建立索引时:使用的Field,而使用NumericRangeQuery,必须使用NumericField
- *@paramfield
- *@paramstart
- *@paramend
- *@paramnum
- */
- publicvoidsearchByNumricRange(Stringfield,intstart,intend,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- Queryquery=NumericRangeQuery.newIntRange(field,start,end,true,true);
- //DateTools.dateToString(newDate(),null);
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByPrefix(Stringfield,Stringvalue,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- Queryquery=newPrefixQuery(newTerm(field,value));
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByWildcard(Stringfield,Stringvalue,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- //在传入的value中可以使用通配符:?和*,?表示匹配一个字符,*表示匹配任意多个字符
- Queryquery=newWildcardQuery(newTerm(field,value));
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByBoolean(intnum){
- try{
- IndexSearchersearcher=getSearcher();
- BooleanQueryquery=newBooleanQuery();
- /*
- *BooleanQuery可以连接多个子查询
- *Occur.MUST表示必须出现
- *Occur.SHOULD表示可以出现
- *Occur.MUSE_NOT表示不能出现
- */
- query.add(newTermQuery(newTerm("name","3")),Occur.MUST_NOT);
- query.add(newTermQuery(newTerm("content","健壮")),Occur.SHOULD);
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByPhrase(intnum){
- try{
- IndexSearchersearcher=getSearcher();
- PhraseQueryquery=newPhraseQuery();
- query.setSlop(10);
- query.add(newTerm("content","java"));
- //第一个Term
- query.add(newTerm("content","程序"));
- //产生距离之后的第二个Term
- //query.add(newTerm("content","football"));
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *查询用于匹配与指定项相似的项
- *默认是匹配一个有不同的,其他一样的,比如like和mike,就是距离算法的相似距离为1
- *这种方式少用,影响效率
- */
- publicvoidsearchByFuzzy(intnum){
- try{
- IndexSearchersearcher=getSearcher();
- //最后两个参数为匹配率和距离
- FuzzyQueryquery=newFuzzyQuery(newTerm("content","总统"),0.4f,0);
- System.out.println(query.getPrefixLength());
- System.out.println(query.getMinSimilarity());
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByQueryParse(Queryquery,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- TopDocstds=searcher.search(query,num);
- System.out.println("一共查询了:"+tds.totalHits);
- for(ScoreDocsd:tds.scoreDocs){
- Documentdoc=searcher.doc(sd.doc);
- System.out.println(doc.get("id")+"---->"+
- doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
- doc.get("attach")+","+doc.get("date")+"=="+sd.score);
- }
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *lucene3.5之前采用的是一种再查询的方式,也就是说先把全部的结果的docid查询出来,然后
- *分页得到该页的docid,然后根据docid得到document信息,
- *lucene官方是说他的速度已经够快,再查询不会有效率问题
- *@paramquery
- *@parampageIndex
- *@parampageSize
- */
- publicvoidsearchPage(Stringquery,intpageIndex,intpageSize){
- try{
- Directorydir=FileIndexUtils.getDirectory();
- IndexSearchersearcher=getSearcher(dir);
- QueryParserparser=newQueryParser(Version.LUCENE_35,"content",analyzer);
- Queryq=parser.parse(query);
- TopDocstds=searcher.search(q,500);
- ScoreDoc[]sds=tds.scoreDocs;
- intstart=(pageIndex-1)*pageSize;
- intend=pageIndex*pageSize;
- for(inti=start;i<end;i++){
- Documentdoc=searcher.doc(sds[i].doc);
- System.out.println("filename:"+doc.get("filename"));
- System.out.println("path:"+doc.get("path"));
- System.out.println("date:"+doc.get("date"));
- System.out.println("size:"+doc.get("size"));
- System.out.println("content:"+doc.get("content"));
- System.out.println("-------------------------------------------");
- }
- searcher.close();
- }catch(org.apache.lucene.queryParser.ParseExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *目前没有办法只取当前这页的数据,而是要全部查询然后得到docid
- *一种增加效率的方式是取的条数做下限制,比如不要每次都取500条,
- *也是把取的条数设置为当前页的所在位置数,比如每页10条,
- *取第一页数据则取10条,取第二页则取20条,取五页则去50条
- *根据页码和分页大小获取上一次的最后一个ScoreDoc
- */
- privateScoreDocgetLastScoreDoc(intpageIndex,intpageSize,Queryquery,IndexSearchersearcher)throwsIOException{
- if(pageIndex==1)returnnull;//如果是第一页就返回空
- intnum=pageSize*(pageIndex-1);//获取上一页的数量
- TopDocstds=searcher.search(query,num);
- returntds.scoreDocs[num-1];
- }
- /**
- *使用这种方式的话是把上一页的最后一个元素给拿到,然后再把pagesize传入,
- *就可以得到当页的数据,其实就是简便了查询,原理还是把全部的docid查询后在得到document
- *@paramquery
- *@parampageIndex
- *@parampageSize
- */
- publicvoidsearchPageByAfter(Stringquery,intpageIndex,intpageSize){
- try{
- Directorydir=FileIndexUtils.getDirectory();
- IndexSearchersearcher=getSearcher(dir);
- QueryParserparser=newQueryParser(Version.LUCENE_35,"content",analyzer);
- Queryq=parser.parse(query);
- //先获取上一页的最后一个元素
- ScoreDoclastSd=getLastScoreDoc(pageIndex,pageSize,q,searcher);
- //通过最后一个元素搜索下页的pageSize个元素
- TopDocstds=searcher.searchAfter(lastSd,q,pageSize);
- printDocument(searcher,tds);
- searcher.close();
- }catch(org.apache.lucene.queryParser.ParseExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchNoPage(Stringquery){
- try{
- Directorydir=FileIndexUtils.getDirectory();
- IndexSearchersearcher=getSearcher(dir);
- QueryParserparser=newQueryParser(Version.LUCENE_35,"content",newStandardAnalyzer(Version.LUCENE_35));
- Queryq=parser.parse(query);
- TopDocstds=searcher.search(q,20);
- ScoreDoc[]sds=tds.scoreDocs;
- for(inti=0;i<sds.length;i++){
- Documentdoc=searcher.doc(sds[i].doc);
- System.out.println(sds[i].doc+":"+doc.get("path")+"-->"+doc.get("filename"));
- }
- searcher.close();
- }catch(org.apache.lucene.queryParser.ParseExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
3、查询语法的测试单元类
[java]
view plaincopy
- packageorg.itat.test;
- importjava.io.File;
- importjava.io.IOException;
- importorg.apache.commons.io.FileUtils;
- importorg.apache.commons.io.FilenameUtils;
- importorg.apache.lucene.analysis.Analyzer;
- importorg.apache.lucene.analysis.standard.StandardAnalyzer;
- importorg.apache.lucene.queryParser.ParseException;
- importorg.apache.lucene.queryParser.QueryParser;
- importorg.apache.lucene.search.Query;
- importorg.apache.lucene.util.Version;
- importorg.itat.index.FileIndexUtils;
- importorg.itat.index.SearcherUtil;
- importorg.junit.Before;
- importorg.junit.Test;
- importorg.wltea.analyzer.lucene.IKAnalyzer;
- publicclassTestSearch{
- privateSearcherUtilsu;
- privateAnalyzeranalyzer=newIKAnalyzer();
- @Before
- publicvoidinit(){
- su=newSearcherUtil();
- }
- @Test
- publicvoidtestCopyFiles(){
- try{
- Filefile=newFile("F:\\Workspaces\\lucenes\\02_lucene_searcher\\resource");
- for(Filef:file.listFiles()){
- StringdestFileName=FilenameUtils.getFullPath(f.getAbsolutePath())+
- FilenameUtils.getBaseName(f.getName())+".she";
- FileUtils.copyFile(f,newFile(destFileName));
- }
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- @Test
- publicvoidsearchByTerm(){
- //su.searchByTerm("content","",10);
- su.searchByTermToken("content","头脑风暴",10);
- }
- @Test
- publicvoidsearchByTermRange(){
- //查询name以a开头和s结尾的
- //su.searchByTermRange("name","a","s",10);
- //由于attachs是数字类型,使用TermRange无法查询
- //su.searchByTermRange("size",newNumericField("200").stringValue(),newNumericField("500").stringValue(),10);
- QueryParserparser=newQueryParser(Version.LUCENE_35,"size",analyzer);
- Queryquery;
- try{
- query=parser.parse("size:[100TO500]");
- su.searchByQueryParse(query,10);
- }catch(ParseExceptione){
- e.printStackTrace();
- }
- }
- @Test
- publicvoidsearchByNumRange(){
- //su.searchByNumricRange("attach",2,10,5);
- su.searchByNumricRange("size",100,300,10);
- }
- @Test
- publicvoidsearchByPrefix(){
- su.searchByPrefix("content","人",10);
- }
- @Test
- publicvoidsearchByWildcard(){
- //匹配@itat.org结尾的所有字符
- //su.searchByWildcard("email","*@itat.org",10);
- //匹配j开头的有三个字符的name
- //su.searchByWildcard("name","j???",10);
- su.searchByWildcard("content","类?",10);
- }
- @Test
- publicvoidsearchByBoolean(){
- su.searchByBoolean(10);
- }
- @Test
- publicvoidsearchByPhrase(){
- su.searchByPhrase(10);
- }
- @Test
- publicvoidsearchByFuzzy(){
- su.searchByFuzzy(10);
- }
- @Test
- publicvoidsearchByQueryParse()throwsParseException{
- //1、创建QueryParser对象,默认搜索域为content
- QueryParserparser=newQueryParser(Version.LUCENE_35,"content",newStandardAnalyzer(Version.LUCENE_35));
- //改变空格的默认操作符,以下可以改成AND
- //parser.setDefaultOperator(Operator.AND);
- //开启第一个字符的通配符匹配,默认关闭因为效率不高
- parser.setAllowLeadingWildcard(true);
- //搜索content中包含有like的
- Queryquery=parser.parse("like");
- //有basketball或者football的,空格默认就是OR
- query=parser.parse("basketballfootball");
- //改变搜索域为name为mike
- //query=parser.parse("content:like");
- //同样可以使用*和?来进行通配符匹配
- //query=parser.parse("name:j*");
- //通配符默认不能放在首位
- //query=parser.parse("email:*@itat.org");
- //匹配name中没有mike但是content中必须有football的,+和-要放置到域说明前面
- query=parser.parse("-name:mike+like");
- //匹配一个区间,注意:TO必须是大写
- //query=parser.parse("id:[1TO6]");
- //闭区间匹配只会匹配到2
- //query=parser.parse("id:{1TO3}");
- //完全匹配ILikeFootball的
- //query=parser.parse("\"Ilikefootball\"");
- //匹配I和football之间有一个单词距离的
- //query=parser.parse("\"Ifootball\"~1");
- //模糊查询
- //query=parser.parse("name:make~");
- //没有办法匹配数字范围(自己扩展Parser)
- //query=parser.parse("attach:[2TO10]");
- su.searchByQueryParse(query,10);
- }
- @Test
- publicvoidindexFile(){
- FileIndexUtils.index(true);
- }
- @Test
- publicvoidtestSearchPage01(){
- su.searchPage("java",2,5);
- System.out.println("-------------------------------");
- //su.searchNoPage("java");
- su.searchPageByAfter("java",2,2);
- }
- @Test
- publicvoidtestSearchPage02(){
- su.searchPageByAfter("java",3,20);
- }
- }
4、创建索引的类
[java]
view plaincopy
- packageorg.itat.index;
- importjava.io.File;
- importjava.io.FileReader;
- importjava.io.IOException;
- importorg.apache.commons.io.FileUtils;
- importorg.apache.lucene.analysis.Analyzer;
- importorg.apache.lucene.analysis.standard.StandardAnalyzer;
- importorg.apache.lucene.document.Document;
- importorg.apache.lucene.document.Field;
- importorg.apache.lucene.document.NumericField;
- importorg.apache.lucene.index.CorruptIndexException;
- importorg.apache.lucene.index.IndexWriter;
- importorg.apache.lucene.index.IndexWriterConfig;
- importorg.apache.lucene.store.Directory;
- importorg.apache.lucene.store.FSDirectory;
- importorg.apache.lucene.store.LockObtainFailedException;
- importorg.apache.lucene.util.Version;
- importorg.wltea.analyzer.lucene.IKAnalyzer;
- publicclassFileIndexUtils{
- privatestaticDirectorydirectory=null;
- privatestaticAnalyzeranalyzer=newIKAnalyzer();
- static{
- try{
- directory=FSDirectory.open(newFile("F:\\Workspaces\\lucenes\\02_lucene_searcher\\index"));
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicstaticDirectorygetDirectory(){
- returndirectory;
- }
- publicstaticvoidindex(booleanhasNew){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,newIndexWriterConfig(Version.LUCENE_35,analyzer));
- if(hasNew){
- writer.deleteAll();
- }
- Filefile=newFile("F:\\Workspaces\\lucenes\\02_lucene_searcher\\resource");
- Documentdoc=null;
- for(Filef:file.listFiles()){
- doc=newDocument();
- doc.add(newField("content",FileUtils.readFileToString(f),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(newField("filename",f.getName(),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(newField("path",f.getAbsolutePath(),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(newNumericField("date",Field.Store.YES,true).setLongValue(f.lastModified()));
- doc.add(newNumericField("size",Field.Store.YES,true).setIntValue((int)(f.length())));
- writer.addDocument(doc);
- }
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- }
5、对索引进行操作的类
[java]
view plaincopy
- packageorg.itat.index;
- importjava.io.IOException;
- importjava.text.ParseException;
- importjava.text.SimpleDateFormat;
- importjava.util.Date;
- importjava.util.HashMap;
- importjava.util.Map;
- importorg.apache.lucene.analysis.standard.StandardAnalyzer;
- importorg.apache.lucene.document.Document;
- importorg.apache.lucene.document.Field;
- importorg.apache.lucene.document.NumericField;
- importorg.apache.lucene.index.CorruptIndexException;
- importorg.apache.lucene.index.IndexReader;
- importorg.apache.lucene.index.IndexWriter;
- importorg.apache.lucene.index.IndexWriterConfig;
- importorg.apache.lucene.index.StaleReaderException;
- importorg.apache.lucene.index.Term;
- importorg.apache.lucene.store.Directory;
- importorg.apache.lucene.store.LockObtainFailedException;
- importorg.apache.lucene.store.RAMDirectory;
- importorg.apache.lucene.util.Version;
- publicclassIndexUtil{
- privateString[]ids={"1","2","3","4","5","6"};
- privateString[]emails={"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
- privateString[]contents={
- "welcometovisitedthespace,Ilikebook",
- "helloboy,Ilikepingpengball",
- "mynameisccIlikegame",
- "Ilikefootball",
- "IlikefootballandIlikebasketballtoo",
- "Ilikemovieandswim"
- };
- privateDate[]dates=null;
- privateint[]attachs={2,3,1,4,5,5};
- privateString[]names={"zhangsan","lisi","john","jetty","mike","jake"};
- privateDirectorydirectory=null;
- privateMap<String,Float>scores=newHashMap<String,Float>();
- publicIndexUtil(){
- setDates();
- scores.put("itat.org",2.0f);
- scores.put("zttc.edu",1.5f);
- directory=newRAMDirectory();
- index();
- }
- privatevoidsetDates(){
- SimpleDateFormatsdf=newSimpleDateFormat("yyyy-MM-dd");
- try{
- dates=newDate[ids.length];
- dates[0]=sdf.parse("2010-02-19");
- dates[1]=sdf.parse("2012-01-11");
- dates[2]=sdf.parse("2011-09-19");
- dates[3]=sdf.parse("2010-12-22");
- dates[4]=sdf.parse("2012-01-01");
- dates[5]=sdf.parse("2011-05-19");
- }catch(ParseExceptione){
- e.printStackTrace();
- }
- }
- publicvoidundelete(){
- //使用IndexReader进行恢复
- try{
- IndexReaderreader=IndexReader.open(directory,false);
- //恢复时,必须把IndexReader的只读(readOnly)设置为false
- reader.undeleteAll();
- reader.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(StaleReaderExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidmerge(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- //会将索引合并为2段,这两段中的被删除的数据会被清空
- //特别注意:此处Lucene在3.5之后不建议使用,因为会消耗大量的开销,
- //Lucene会根据情况自动处理的
- writer.forceMerge(2);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidforceDelete(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- writer.forceMergeDeletes();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoiddelete(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- //参数是一个选项,可以是一个Query,也可以是一个term,term是一个精确查找的值
- //此时删除的文档并不会被完全删除,而是存储在一个回收站中的,可以恢复
- writer.deleteDocuments(newTerm("id","1"));
- writer.commit();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidupdate(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- /*
- *Lucene并没有提供更新,这里的更新操作其实是如下两个操作的合集
- *先删除之后再添加
- */
- Documentdoc=newDocument();
- doc.add(newField("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(newField("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(newField("content",contents[0],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(newField("name",names[0],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- writer.updateDocument(newTerm("id","1"),doc);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidquery(){
- try{
- IndexReaderreader=IndexReader.open(directory);
- //通过reader可以有效的获取到文档的数量
- System.out.println("numDocs:"+reader.numDocs());
- System.out.println("maxDocs:"+reader.maxDoc());
- System.out.println("deleteDocs:"+reader.numDeletedDocs());
- reader.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidindex(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- writer.deleteAll();
- Documentdoc=null;
- for(inti=0;i<ids.length;i++){
- doc=newDocument();
- doc.add(newField("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(newField("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(newField("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(newField("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- //存储数字
- doc.add(newNumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));
- //存储日期
- doc.add(newNumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
- Stringet=emails[i].substring(emails[i].lastIndexOf("@")+1);
- System.out.println(et);
- if(scores.containsKey(et)){
- doc.setBoost(scores.get(et));
- }else{
- doc.setBoost(0.5f);
- }
- writer.addDocument(doc);
- }
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- }
工程下载地址:http://download.csdn.net/detail/wxwzy738/5256553
相关推荐
《Lucene in Action 第二版》是一本专门介绍如何使用Lucene搜索引擎框架的书籍。Lucene是一个高性能的全文检索库,它允许开发者在应用程序中实现搜索功能。第二版意味着这本书经过了更新,以适应Lucene版本的变化。...
本篇文章将带你迈出使用Lucene的第一步,重点关注如何实现分页检索,这对于构建高效、用户友好的搜索系统至关重要。 Lucene的核心功能包括文档索引、查询解析和结果排序。在分页检索方面,我们需要考虑如何有效地...
《Lucene实战(中文版第二版)》是针对搜索引擎开发领域的经典著作,它详细介绍了如何使用Apache Lucene这个强大的全文搜索引擎库。Lucene是Java语言实现的开源项目,被广泛应用于各种信息检索系统中,包括网站搜索...
标题中的“lucene第一步---5.中文分词IKAnalyzer和高亮highlighter的使用”指出,这个主题将探讨如何在Lucene中应用IKAnalyzer进行中文分词,以及如何使用高亮器(highlighter)来突出搜索结果中的关键词。Lucene是...
本书《Lucene实战第二版》是一本关于如何使用Lucene进行文本检索的实用教程。这本书详细介绍了Lucene的使用方法和内部工作机制,并提供了丰富的代码示例和清晰的解释。它不仅适合那些计划在应用中使用Lucene的开发者...
《Lucene In Action 第二版》是一本深入探讨Apache Lucene全文搜索引擎库的专业书籍,高清中文版的提供为中文读者提供了便利。这本书由Michael McCandless等作者编写,旨在帮助开发者充分利用Lucene的强大功能,构建...
《Lucene in Action 第二版》是一本深入探讨Apache Lucene全文检索库的专业书籍,它在Java开发领域具有很高的权威性。这本书详细介绍了如何利用Lucene进行高效的文本搜索和索引构建,是Java开发者和信息检索爱好者的...
经典的Lucene资源
Lucene实战第二版完整清晰中文版是一本介绍Lucene开源全文搜索引擎开发包的书籍。Lucene是一个用Java编写的功能强大的全文搜索引擎库,它以出色的可扩展性和快速的搜索特性获得了广泛的赞誉。本书详细介绍了如何有效...
《Lucene实战(第二版)》是一本深入探讨Apache Lucene全文搜索引擎库的权威书籍,主要面向对Java和搜索引擎技术感兴趣的开发者。这本书详尽地介绍了如何利用Lucene进行信息检索、文本分析和索引构建,同时也涵盖了...
《Lucene实战第二版》是关于全文搜索引擎Lucene的一本权威指南,由Michael McCandless、Erik Hatcher和Dave Bollinger共同撰写。这本书详细介绍了如何使用Java库Lucene来构建高性能、可扩展的搜索功能。以下是该书的...
第二个是 RAMDirectory,它表示一个存储在内存当中的索引的位置。 public void add(Query query, BooleanClause.Occur occur) BooleanClause用于表示布尔查询子句关系的类,包括: BooleanClause.Occur.MUST,...
《Lucene in Action》第二版是一本专注于开源全文搜索引擎库Lucene的专业著作,由美国的Otis Gospodnetic和Erik Hatcher共同撰写。这本书深入浅出地讲解了如何利用Lucene进行高效的文本搜索和索引构建,是Java开发者...
5. **术语(Term)**:经过分词后的单个词或短语称为术语,是Lucene搜索的基本单位。 ### 二、Lucene工作流程 1. **创建索引**:首先,开发者需要创建一个`IndexWriter`实例,然后调用`addDocument()`方法添加文档...
**Lucene 高级搜索项目概述** Lucene 是一个高性能、全文检索库,它提供了文本分析、索引和搜索功能,被广泛应用于各种搜索引擎的构建。在这个“Lucene 高级搜索项目”中,我们将深入探讨如何利用Lucene实现附件...
本书的第二版更新了与最新Lucene版本相关的技术,确保读者能掌握最前沿的搜索技术。 Lucene的核心功能包括索引构建、查询解析、搜索执行和结果排序。索引构建是Lucene的第一步,它将文本数据转换为可快速检索的结构...
《Lucene实战(第2版)》基于Apache的Lucene 3.0,从Lucene核心、Lucene应用、案例分析3个方面详细系统地介绍了Lucene,包括认识Lucene、建立索引、为应用程序添加搜索功能、高级搜索技术、扩展搜索、使用Tika提取文本...
《Lucene实战(第2版)》基于Apache的Lucene 3.0,从Lucene核心、Lucene应用、案例分析3个方面详细系统地介绍了Lucene,包括认识Lucene、建立索引、为应用程序添加搜索功能、高级搜索技术、扩展搜索、使用Tika提取...
《搜索引擎Lucene+Heritrix(第二版)4》是一本深入探讨搜索引擎技术的专业书籍,主要围绕两个核心组件——Lucene和Heritrix展开。Lucene是Apache软件基金会的一个开源全文检索库,而Heritrix则是一个网络爬虫工具,...