lucene基本的搜索功能

gordon20082008

浏览: 25563 次
性别:
来自: 合肥

最近访客更多访客>>

fireqiao

gaochunhu

wl80917

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

技术杂类

lucene Apache C C++ C#

对网上盛传的 Lucene 使用方法进行了一些修改。网上的代码是对某个目录下的所有 HTML 文件进行索引和搜索，但不支持对多级子目录的索引和搜索，这里做了一点修改，使其可以递归遍历子目录。大部分还是网上的代码。
Constants.java

package testlucene;

/**
 * Shared file-system locations used by the indexer and the searcher.
 *
 * <p>This is a pure constants holder: it is final and has a private
 * constructor so it can neither be instantiated nor subclassed.
 */
public final class Constants {
        /** Directory that holds the files to be indexed. */
        public static final String INDEX_FILE_PATH = "c:\\dataDir";

        /** Directory in which the index is stored. */
        public static final String INDEX_STORE_PATH = "c:\\indexDir";

        private Constants() {
                // not instantiable
        }
}

LuceneIndex.java

package testlucene;
import java.io.*;
import java.util.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.mira.lucene.analysis.IK_CAnalyzer;

/**
 * Builds an index over every regular file found under
 * {@link Constants#INDEX_FILE_PATH}, descending into sub-directories, and
 * stores it at {@link Constants#INDEX_STORE_PATH}.
 *
 * <p>Each file becomes one document with two fields: {@code contents}
 * (the file's text, supplied as a reader) and {@code path} (the canonical
 * path, stored so that search results can point back to the file).
 */
public class LuceneIndex {
    private IndexWriter writer = null;

    /**
     * Opens the index writer on {@link Constants#INDEX_STORE_PATH}.
     *
     * <p>A failure is only printed, so the instance may be left without a
     * writer; {@link #writeToIndex()} reports that state explicitly.
     */
    public LuceneIndex() {
        try {
            // The third argument is Lucene's "create" flag; the original author
            // understood it as "may overwrite an existing index".
            writer = new IndexWriter(Constants.INDEX_STORE_PATH, new IK_CAnalyzer(), true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates the document for one file.
     *
     * @param f the file to describe
     * @return a document with {@code contents} and {@code path} fields, or an
     *         empty document when {@code f} is not a regular file
     * @throws Exception if the canonical path cannot be resolved or the file
     *         cannot be opened
     */
    @SuppressWarnings("deprecation")
    private Document getDocument(File f) throws Exception {
        Document doc = new Document();
        if (f.isFile()) {
            // Resolve the path before opening the stream so that a failure here
            // cannot leave an open stream behind.
            String canonicalPath = f.getCanonicalPath();
            // The reader is handed over to the field and is not closed here.
            // NOTE(review): presumably Lucene consumes and closes it while the
            // document is added - confirm for the Lucene version in use.
            // The text is decoded with the platform default charset.
            Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
            doc.add(new Field("contents", reader));
            doc.add(new Field("path", canonicalPath, Field.Store.YES, Field.Index.TOKENIZED));
        }
        return doc;
    }

    /**
     * Adds one document per file found under {@link Constants#INDEX_FILE_PATH}.
     * Does nothing when that path is not a directory.
     *
     * @throws IllegalStateException if the index writer could not be opened
     *         by the constructor
     * @throws Exception if a file cannot be read or added to the index
     */
    public void writeToIndex() throws Exception {
        if (writer == null) {
            throw new IllegalStateException(
                    "index writer was not opened for " + Constants.INDEX_STORE_PATH);
        }
        File folder = new File(Constants.INDEX_FILE_PATH);
        if (!folder.isDirectory()) {
            return;
        }
        File[] files = getFileList(folder);
        for (int i = 0; i < files.length; i++) {
            File file = files[i];
            Document doc = getDocument(file);
            System.out.println("正在为文件   (" + file + ") 建立索引...");
            writer.addDocument(doc);
        }
    }

    /**
     * Closes the index writer; a no-op when the writer was never opened.
     *
     * @throws Exception if closing the writer fails
     */
    public void close() throws Exception {
        if (writer != null) {
            writer.close();
        }
    }

    /**
     * Indexes the configured directory and prints the elapsed time.
     * The writer is closed even when indexing fails.
     */
    public static void main(String[] args) throws Exception {
        LuceneIndex indexer = new LuceneIndex();
        try {
            long startMillis = System.currentTimeMillis();
            indexer.writeToIndex();
            long endMillis = System.currentTimeMillis();
            System.out.println("建立索引用时 " + (endMillis - startMillis) + "毫秒");
        } finally {
            indexer.close();
        }
    }

    /**
     * Lists the regular files reachable from {@code file}.
     *
     * @param file a file or a directory
     * @return {@code file} itself when it is a regular file; otherwise every
     *         regular file below it, in directory-listing order with each
     *         sub-directory expanded in place; never {@code null}
     */
    private File[] getFileList(File file) {
        List<File> found = new ArrayList<File>();
        collectFiles(file, found);
        return found.toArray(new File[found.size()]);
    }

    /** Appends {@code entry}, or every regular file below it, to {@code found}. */
    private void collectFiles(File entry, List<File> found) {
        if (entry.isFile()) {
            found.add(entry);
            return;
        }
        // listFiles() returns null for non-directories and when the directory
        // cannot be read; treat both as "nothing to add".
        File[] children = entry.listFiles();
        if (children == null) {
            return;
        }
        for (File child : children) {
            collectFiles(child, found);
        }
    }

}

package testlucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.util.*;
import org.apache.lucene.document.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.search.*;
import org.mira.lucene.analysis.IK_CAnalyzer;

/**
 * Searches the index stored at {@link Constants#INDEX_STORE_PATH} and extracts
 * the text lying between two marker strings of a matching file.
 *
 * <p>{@link #main(String[])} uses this to pull category and shop details out
 * of an indexed HTML page and write them to a text file under
 * {@code c:/temp/}.
 */
public class LuceneSearch {
    private IndexSearcher searcher = null;
    private Query query = null;
    private File shopInfoTxt = null;
    private RandomAccessFile bw;

    /**
     * Opens a searcher on {@link Constants#INDEX_STORE_PATH}. A failure is only
     * printed; later searches then fail and return {@code null}.
     */
    public LuceneSearch() {
        try {
            searcher = new IndexSearcher(Constants.INDEX_STORE_PATH);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs {@code keyword} as a query against the {@code contents} field and
     * prints how long the search took.
     *
     * @param keyword the query text
     * @return the hits, or {@code null} when parsing or searching failed
     */
    @SuppressWarnings("deprecation")
    public final Hits Search(String keyword) {
        System.out.println("正在检索关键字 " + keyword);
        try {
            query = new QueryParser("contents", new IK_CAnalyzer()).parse(keyword);
            Date start = new Date();
            Hits hits = searcher.search(query);
            Date end = new Date();
            System.out.println("检索完成，用时" + (end.getTime() - start.getTime())
                    + "毫秒");
            return hits;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Prints the path of each hit and looks for the first line that literally
     * contains {@code test}.
     *
     * @param h    the hits to inspect; {@code null} is treated as "no hits"
     * @param test the text to look for inside each hit's file
     * @return {@code "<path>|<line number>"} (1-based) for the first hit whose
     *         file contains {@code test}, or {@code ""} when there is none
     */
    @SuppressWarnings("deprecation")
    public String printResult(Hits h, String test) {
        if (h == null || h.length() == 0) {
            System.out.println("对不起，没有找到您要的结果。");
            return "";
        }
        for (int i = 0; i < h.length(); i++) {
            try {
                Document doc = h.doc(i);
                String path = doc.get("path");
                System.out.print("这是第" + (i + 1) + "个检索到的结果，文件名为 ：");
                System.out.println(path);

                int lineNum = findLine(path, test);
                if (lineNum > 0) {
                    return path + "|" + lineNum;
                }
            } catch (Exception e) {
                // A hit that cannot be read is reported and skipped.
                e.printStackTrace();
            }
        }
        return "";
    }

    /**
     * Returns the 1-based number of the first line of {@code path} containing
     * {@code needle}, or -1 when no line does. The reader is always closed.
     */
    private static int findLine(String path, String needle) throws Exception {
        // NOTE(review): this reads with the platform default charset while
        // getInnerContent3 reads the same file as utf-8 - confirm that is intended.
        BufferedReader br = new BufferedReader(new FileReader(path));
        try {
            String line;
            int lineNum = 0;
            while ((line = br.readLine()) != null) {
                lineNum++;
                if (line.indexOf(needle) != -1) {
                    return lineNum;
                }
            }
            return -1;
        } finally {
            br.close();
        }
    }

    /**
     * Extracts the shop details from the indexed page and writes them, one
     * section per line, to the output file opened by
     * {@link #getInnerContent3(String, String)}.
     *
     * @throws IllegalStateException if no output file could be opened because
     *         the page markers were not found
     */
    public static void main(String[] args) throws Exception {
        LuceneSearch temp = new LuceneSearch();

        // Category information.
        String typeinfo = temp.getInnerContent3("nav_w", "main_w");
        typeinfo = typeinfo.replaceAll("&nbsp;", "");
        typeinfo = typeinfo.replaceAll("&gt;", ">");
        System.out.println("==========分类信息:=============\n" + typeinfo + "\n================");

        // Detailed shop information; this call also (re)opens the output file.
        String shopInfo = temp.getInnerContent3("shopInfo", "shopRemark");

        RandomAccessFile out = temp.getBw();
        if (out == null) {
            throw new IllegalStateException(
                    "no output file was opened: the page markers were not found in one indexed file");
        }
        try {
            // NOTE(review): bytes are written in the platform default charset
            // although the page is read as utf-8 - confirm that is intended.
            out.write(typeinfo.getBytes());

            System.out.println("得到商家信息:\n" + shopInfo);

            // The skip counts are the original offsets; presumably they cover
            // the marker plus a following separator character - TODO confirm.

            // Address.
            String address = extractBetween(shopInfo, "地址", 3, "电话");
            if (address != null) {
                System.out.println("==========地址信息:=============\n" + address + "\n===============");
                writeLine(out, address);
            }

            // Telephone.
            String phone = extractBetween(shopInfo, "电话", 3, "报错");
            if (phone != null) {
                System.out.println("=================电话信息:================\n" + phone + "\n==================");
                writeLine(out, phone);
            }

            // Shop introduction.
            String intro = extractBetween(shopInfo, "商户简介", 5, "分类标签");
            if (intro != null) {
                System.out.println("=================商家介绍:===============\n" + intro + "\n===================");
                writeLine(out, intro);
            }

            // Category tags.
            String tags = extractBetween(shopInfo, "分类标签", 4, "网友推荐");
            if (tags != null) {
                System.out.println("=================分类标签:===============\n" + tags + "\n===================");
                writeLine(out, tags);
            }

            // User recommendations: from the marker up to and including the last ")".
            int suggestStart = shopInfo.indexOf("网友推荐");
            int suggestEnd = shopInfo.lastIndexOf(")");
            if (suggestStart >= 0 && suggestEnd >= 0 && suggestStart + 4 <= suggestEnd + 1) {
                String suggest = shopInfo.substring(suggestStart + 4, suggestEnd + 1)
                        .replaceAll("&nbsp;", "");
                System.out.println("=================网友推荐:===============\n" + suggest + "\n===================");
                writeLine(out, suggest);
            }
        } finally {
            out.close();
        }
    }

    /**
     * Returns the part of {@code text} that starts {@code skip} characters after
     * the first {@code startMarker} and ends before the first {@code endMarker},
     * with every {@code &nbsp;} removed; {@code null} when either marker is
     * missing or the end marker does not lie behind the start position.
     */
    private static String extractBetween(String text, String startMarker, int skip, String endMarker) {
        int start = text.indexOf(startMarker);
        int end = text.indexOf(endMarker);
        if (start < 0 || end < 0 || start + skip > end) {
            return null;
        }
        return text.substring(start + skip, end).replaceAll("&nbsp;", "");
    }

    /** Writes {@code text} followed by a newline to {@code out}. */
    private static void writeLine(RandomAccessFile out, String text) throws Exception {
        out.write(text.getBytes());
        out.write("\n".getBytes());
    }

    /**
     * Returns the tag-stripped text found between the line containing
     * {@code first} (inclusive) and the line containing {@code sec}
     * (exclusive), provided both markers are located in the same file.
     *
     * <p>On success the output file {@code c:/temp/<page name>.txt} is opened,
     * emptied and made available through {@link #getBw()}; a handle left open
     * by an earlier call is closed first. Everything up to and including the
     * first full-width colon of the extracted text is dropped.
     *
     * @param first text marking the first line to include
     * @param sec   text marking the line at which to stop
     * @return the extracted text, or {@code ""} when a marker is not found or
     *         the two markers are found in different files
     * @throws Exception if the page or the output file cannot be accessed
     */
    @SuppressWarnings("deprecation")
    public String getInnerContent3(String first, String sec) throws Exception, FileNotFoundException {
        // Start position.
        String startHit = printResult(Search(first), first);
        // End position.
        String endHit = printResult(Search(sec), sec);

        // printResult yields "<path>|<line>", or "" when nothing matched; the
        // last '|' is the separator even if the path itself contains one.
        int startSep = startHit.lastIndexOf("|");
        int endSep = endHit.lastIndexOf("|");
        if (startSep < 0 || endSep < 0) {
            return "";
        }
        String fileName = startHit.substring(0, startSep);
        int startLine = Integer.parseInt(startHit.substring(startSep + 1));
        String fileName2 = endHit.substring(0, endSep);
        int endLine = Integer.parseInt(endHit.substring(endSep + 1));

        if (!fileName2.equalsIgnoreCase(fileName)) {
            return "";
        }

        shopInfoTxt = new File("c:/temp/", toTxtName(baseName(fileName)));
        if (!shopInfoTxt.exists()) {
            shopInfoTxt.createNewFile();
        }
        if (bw != null) {
            bw.close();
        }
        bw = new RandomAccessFile(shopInfoTxt, "rw");
        // "rw" does not truncate; drop whatever an earlier run left behind.
        bw.setLength(0);

        StringBuilder collected = new StringBuilder();
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), "utf-8"));
        try {
            System.out.println("sentences from " + startLine + " to " + endLine);
            String line;
            int lineNum = 0;
            while ((line = br.readLine()) != null) {
                lineNum++;
                if (lineNum >= startLine && lineNum < endLine) {
                    collected.append(line);
                }
            }
        } finally {
            br.close();
        }

        String screenString = getShortFormat(collected.toString());
        return screenString.substring(screenString.indexOf("：") + 1);
    }

    /** Returns the part of {@code path} after its last {@code \} or {@code /}. */
    private static String baseName(String path) {
        int cut = Math.max(path.lastIndexOf('\\'), path.lastIndexOf('/'));
        return path.substring(cut + 1);
    }

    /** Replaces a trailing {@code .html} or {@code .htm} with {@code .txt}. */
    private static String toTxtName(String name) {
        if (name.endsWith(".html")) {
            return name.substring(0, name.length() - ".html".length()) + ".txt";
        }
        if (name.endsWith(".htm")) {
            return name.substring(0, name.length() - ".htm".length()) + ".txt";
        }
        return name;
    }

    /** Returns the output file chosen by the last successful extraction, or {@code null}. */
    public File getShopInfoTxt() {
        return shopInfoTxt;
    }

    /** Returns the open handle on the output file, or {@code null} when none was opened. */
    public RandomAccessFile getBw() {
        return bw;
    }

    /**
     * Removes every {@code <...>} span from {@code content}, trimming the
     * whitespace on both sides of each removed span.
     *
     * <p>The closing {@code >} is searched for after the opening {@code <}, so a
     * stray {@code >} earlier in the text is left alone; an unterminated
     * {@code <} ends the stripping.
     */
    private static String getShortFormat(String content) {
        String text = content.trim();
        int open = text.indexOf('<');
        while (open > -1) {
            int close = text.indexOf('>', open + 1);
            if (close == -1) {
                break;
            }
            text = text.substring(0, open).trim() + text.substring(close + 1).trim();
            open = text.indexOf('<');
        }
        return text;
    }

}

分享到：

捣鼓ssh上手的例子-----hibernate

2009-05-26 10:11
浏览 847
评论(0)
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene基本的搜索功能

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lucene基本的搜索功能

评论

发表评论

相关推荐

dfdfsd

111

页面播放视频

funambol相关资料

openfire+spark的过程

openfire 信息

搜集的可能对公司的JAVAEYE博客

最近访客更多访客>>