`
ckwang17
  • 浏览: 26254 次
  • 性别: Icon_minigender_1
  • 来自: 深圳
社区版块
存档分类
最新评论

compass对文件建立索引

阅读更多

在已有的 SSH 框架中,增加一个 Spring 配置文件 applicationContext-compassConfig.xml。

文件内容:

<?xml version="1.0" encoding="UTF-8"?>
<beans
	xmlns="http://www.springframework.org/schema/beans"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd">
	
  
	<!-- Spring wiring for Compass: the Compass instance, the Hibernate GPS device,
	     the CompassTemplate and the index builder bean. -->
	<!-- compass2.2 config start -->
	<bean id="annotationConfiguration" class="org.compass.annotations.config.CompassAnnotationsConfiguration"></bean>
	<bean id="compass" class="org.compass.spring.LocalCompassBean">
		<property name="resourceDirectoryLocations">
			<list>
				<value>classpath:com/tjsoft/SearchEngines</value>
			</list>
		</property>
		<!-- Location where the index is stored -->
		<property name="connection">
            <value>/lucene/indexes</value>
        </property>
		
		
		<!-- Annotation-based mapping configuration -->
		<property name="classMappings">
			<list>
				<!-- Compass test classes -->
				<value>com.tjsoft.SearchEngines.compass.model.Compassbean</value>
				<value>com.tjsoft.SearchEngines.compass.model.TextFile</value>
				<value>com.tjsoft.SearchEngines.compass.model.ExternalDBBean</value>
			</list>
		</property>
		<property name="compassConfiguration" ref="annotationConfiguration"/>
		<property name="compassSettings">
			<props>
				<!-- Alternative way to set the index location  
                <prop key="compass.engine.connection">  
                    file://${user.home}/lucene/indexes  
                </prop>  
                 --> 
                <prop key="compass.engine.mergeFactor">100</prop> 
				<prop key="compass.engine.maxBufferedDocs">1000</prop> 
				<prop key="compass.engine.maxFieldLength">100000</prop>
				<prop key="compass.transaction.factory">org.compass.spring.transaction.SpringSyncTransactionFactory</prop>
				<prop key="compass.engine.highlighter.default.formatter.simple.pre"><![CDATA[<span style='background-color:yellow;color:red;'>]]></prop>
				<prop key="compass.engine.highlighter.default.formatter.simple.post"><![CDATA[</span>]]></prop>
				<!-- Length of the highlighted summary fragment -->  
                <prop key="compass.engine.highlighter.default.fragmenter.simple.size">200</prop>
				<!-- Analyzer (tokenizer) to use -->
                <prop key="compass.engine.analyzer.default.type">net.paoding.analysis.analyzer.PaodingAnalyzer</prop> 
			</props>  
		</property>  
		<property name="transactionManager" ref="transactionManager"/>
	</bean>
	<bean id="hibernateGpsDevice" class="org.compass.spring.device.hibernate.dep.SpringHibernate3GpsDevice">
		<property name="name" value="hibernateDevice"/>
		<property name="sessionFactory" ref="sessionFactory"/>
		<property name="mirrorDataChanges" value="true"/>
	
	</bean>
	<!-- Keeps the index updated in sync with data changes -->
	<bean id="compassGps" class="org.compass.gps.impl.SingleCompassGps" init-method="start" destroy-method="stop">  
		<property name="compass" ref="compass"/>  
		<property name="gpsDevices">  
			<list>  
				<ref local="hibernateGpsDevice"/>  
			</list>   
		</property>
	</bean>	
	<!-- Compass template -->
	<bean id="compassTemplate" class="org.compass.core.CompassTemplate">
		<property name="compass" ref="compass" />
	</bean>
	<!-- Rebuilds the index on a schedule (via Quartz) or when the Spring ApplicationContext starts.
	     With buildIndex set to false the builder does not start indexing at startup. -->
	<bean id="compassIndexBuilder" class="com.tjsoft.SearchEngines.compass.service.imp.CompassIndexBuilderImp" lazy-init="false">
		<property name="compassGps" ref="compassGps" />
		<property name="buildIndex" value="false" />
		<property name="compassTemplate" ref="compassTemplate" />
		<property name="compassService" ref="compassService" />
	</bean>
	<!-- compass2.2 config end -->
</beans>

 

最后一个bean中引用的com.tjsoft.SearchEngines.compass.service.imp.CompassIndexBuilderImp 代码如下

 

package com.tjsoft.SearchEngines.compass.service.imp;

import java.io.File;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import javax.naming.NameNotFoundException;
import javax.sql.DataSource;

import org.apache.log4j.Logger;
import org.compass.core.CompassTemplate;
import org.compass.gps.CompassGps;
import org.springframework.beans.factory.InitializingBean;

import com.tjsoft.SearchEngines.common.util.IDFactory;
import com.tjsoft.SearchEngines.common.util.PreparatorUtil;
import com.tjsoft.SearchEngines.compass.dbConnection.ConnectionFactory;
import com.tjsoft.SearchEngines.compass.dbConnection.ConnectionParam;
import com.tjsoft.SearchEngines.compass.model.Compassbean;
import com.tjsoft.SearchEngines.compass.model.ExternalDBBean;
import com.tjsoft.SearchEngines.compass.model.TCompassIndexinfo;
import com.tjsoft.SearchEngines.compass.model.TCompassIndexinfoSql;
import com.tjsoft.SearchEngines.compass.model.TextFile;
import com.tjsoft.SearchEngines.compass.service.CompassIndexBuilder;
import com.tjsoft.SearchEngines.compass.service.TCompassService;

/**
 * Builds the Compass search index after Spring has injected this bean's collaborators.
 * <p>
 * When {@code buildIndex} is true, {@link #afterPropertiesSet()} starts a daemon thread
 * that waits {@code lazyTime} seconds, calls {@code compassGps.index()}, and then indexes
 * every target returned by {@code compassService.getIndexTargetList()}: targets of type
 * "file" are walked recursively, targets of type "db" are indexed row by row from the
 * result sets of their configured SQL statements.
 *
 * @author wbin
 */
public class CompassIndexBuilderImp implements InitializingBean, CompassIndexBuilder {
	Logger logger = Logger.getLogger(CompassIndexBuilderImp.class.getName());

	/** Whether to build the index at startup; set to false to disable the builder. */
	private boolean buildIndex;
	/** Compass GPS whose {@code index()} is invoked to rebuild the index. */
	private CompassGps compassGps;
	/** Template used to create and delete individual index entries. */
	private CompassTemplate compassTemplate;
	/** Service that supplies the list of file/db targets to index. */
	private TCompassService compassService;
	/** Delay, in seconds, before an indexing run starts working. */
	private int lazyTime = 10;

	/** Background thread that performs the startup indexing run. */
	private Thread indexThread = new Thread() {
		@Override
		public void run() {
			runIndexing();
		}
	};

	/**
	 * Runs one complete indexing pass on the calling thread: waits {@code lazyTime}
	 * seconds, rebuilds the GPS index, then indexes every configured file and db target.
	 * If the wait is interrupted the pass is abandoned and the interrupt flag is restored.
	 */
	private void runIndexing() {
		long beginTime = System.currentTimeMillis();
		try {
			// Thread.sleep is static: it always pauses the thread executing this method.
			Thread.sleep(lazyTime * 1000L);
			System.out.println("搜索引擎开始创建索引...");
			logger.info("搜索引擎开始创建索引...");
			// Rebuild the index. Per the original author's note, when index files already
			// exist a temporary index is built and swapped in once indexing completes.
			compassGps.index();

			List<TCompassIndexinfo> infoList = compassService.getIndexTargetList();
			if (infoList != null) {
				for (TCompassIndexinfo info : infoList) {
					// Constant-first equals: a target without a type is skipped instead of
					// aborting the whole run with a NullPointerException.
					if ("file".equals(info.getTargetType())) {
						index(info.getIndextargetpath());
					} else if ("db".equals(info.getTargetType())) {
						indexDatabaseTarget(info);
					}
				}
			}
		} catch (InterruptedException e) {
			// Restore the flag so code further up the stack can observe the interruption,
			// and do not report completion for a run that never finished.
			Thread.currentThread().interrupt();
			logger.warn("Index build interrupted; the index may be incomplete", e);
			return;
		}

		long costTime = System.currentTimeMillis() - beginTime;
		System.out.println("创建索引完成。");
		System.out.println("耗时 " + costTime + " 毫秒");
		logger.info("创建索引完成。");
		logger.info("耗时 " + costTime + " 毫秒");
	}

	/**
	 * Indexes the rows returned by every SQL statement configured for a "db" target.
	 * A data source is bound under the target's data source name for the duration of
	 * the call and unbound afterwards; failures are logged and do not propagate.
	 *
	 * @param info the db target describing the connection and its SQL statements
	 */
	private void indexDatabaseTarget(TCompassIndexinfo info) {
		// 1. Register a connection pool for this target.
		// NOTE(review): the trailing numeric/boolean arguments are presumably pool
		// sizing and timeout settings - confirm against ConnectionParam.
		ConnectionParam param = new ConnectionParam(info.getDataSourceName(), info.getDriver(), info.getUrl(),
				info.getUserName(), info.getPassword(),
				1, 5, 20000, false, 100, "");
		ConnectionFactory cFactory = ConnectionFactory.getInstance();

		try {
			cFactory.bind(info.getDataSourceName(), param, false);

			// 2. Once bound, the data source is obtained by name.
			DataSource ds = cFactory.lookUp(info.getDataSourceName());
			Connection conn = ds.getConnection();
			try {
				for (Iterator<TCompassIndexinfoSql> ite = info.getCompassIndexinfoSqls().iterator(); ite.hasNext();) {
					indexQuery(conn, ite.next().getSqlstr());
				}
			} finally {
				try {
					conn.close();
				} catch (Exception e) {
					logger.error(e.getMessage(), e);
				}
			}
		} catch (Exception e) {
			logger.error(e.getMessage(), e);
		} finally {
			try {
				cFactory.unbind(info.getDataSourceName());
			} catch (NameNotFoundException e) {
				logger.error(e.getMessage(), e);
			}
			logger.info("释放数据源");
		}
	}

	/**
	 * Executes one query and indexes its rows, always releasing the statement and the
	 * result set afterwards so repeated queries do not leak JDBC resources.
	 *
	 * @param conn open connection to run the query on
	 * @param sql  the query text
	 * @throws SQLException if preparing or executing the query fails
	 */
	private void indexQuery(Connection conn, String sql) throws SQLException {
		PreparedStatement ps = null;
		ResultSet res = null;
		try {
			ps = conn.prepareStatement(sql);
			res = ps.executeQuery();
			indexDbRs(res);
		} finally {
			if (res != null) {
				try {
					res.close();
				} catch (SQLException e) {
					logger.error(e.getMessage(), e);
				}
			}
			if (ps != null) {
				try {
					ps.close();
				} catch (SQLException e) {
					logger.error(e.getMessage(), e);
				}
			}
		}
	}

	/**
	 * Implements InitializingBean: once all properties are injected, starts the
	 * background indexing thread if {@code buildIndex} is enabled.
	 */
	public void afterPropertiesSet() throws Exception {
		if (buildIndex) {
			indexThread.setDaemon(true);
			indexThread.setName("Compass Indexer");
			indexThread.start();
		}
	}

	/**
	 * Runs a full indexing pass synchronously on the calling thread, including the
	 * initial {@code lazyTime} delay. It does not start a new thread.
	 */
	public void restartUpdateIndex() {
		runIndexing();
	}

	/**
	 * Indexes a single file: detects its type, extracts its text and stores a
	 * {@link TextFile} entry. Any failure is logged and the file is skipped.
	 *
	 * @param file the file to index
	 */
	private void indexFile(File file) {
		try {
			String fileType = PreparatorUtil.getfiletypeByFile(file);
			String content = extractContent(file, fileType);

			// Wrap the extracted data in an index entity.
			TextFile tf = new TextFile();
			tf.setFileId(IDFactory.getId());
			tf.setTitle(file.getName());
			// Store the canonical path with forward slashes regardless of platform.
			tf.setPath(file.getCanonicalPath().replace('\\', '/'));
			tf.setContent(content);
			tf.setLastModifyTime(file.lastModified());
			tf.setFileType(fileType);

			// Index the entity.
			compassTemplate.create(tf);
		} catch (Exception e) {
			logger.error("读取文件:"+file.getAbsolutePath()+"出错!");
			logger.error(e.getMessage(), e);
		}
	}

	/**
	 * Returns the text to index for a file, chosen by its detected type. Unknown or
	 * unsupported types fall back to the file's absolute path.
	 *
	 * @param file     the file being indexed
	 * @param fileType the detected type, or null when detection found nothing
	 * @return the extracted text, or the absolute path as a fallback
	 * @throws Exception if the underlying extractor fails
	 */
	private String extractContent(File file, String fileType) throws Exception {
		String path = file.getAbsolutePath();
		if (fileType == null) {
			return path;
		}
		// Plain-text sources: txt, sql, java.
		if (fileType.equals("txt") || fileType.equals("sql") || fileType.equals("java")) {
			return PreparatorUtil.readTxt(path);
		}
		// PDF documents.
		if (fileType.equals("pdf")) {
			return PreparatorUtil.readPdf(path);
		}
		// Office documents (the original list also routes "pst" here).
		if (fileType.equals("doc") || fileType.equals("docx") || fileType.equals("pst")
				|| fileType.equals("xls") || fileType.equals("xlsx")
				|| fileType.equals("pptx") || fileType.equals("ppt")) {
			return PreparatorUtil.readOffic(path);
		}
		// HTML: read the markup, then strip it down to text.
		if (fileType.equals("html") || fileType.equals("htm")) {
			return PreparatorUtil.html2text(PreparatorUtil.readHtml(path));
		}
		// Any other type: only the name and path are indexed.
		return path;
	}

	/**
	 * Indexes a file, or every file below a directory (recursively).
	 *
	 * @param file file or directory to index
	 */
	private void index(File file) {
		if (file.isFile()) {
			indexFile(file);
			return;
		}

		File[] children = file.listFiles();
		if (children == null) {
			// Not a listable directory (or an I/O error): nothing to do.
			return;
		}
		for (File child : children) {
			if (child.isDirectory()) {
				index(child);
			} else {
				indexFile(child);
			}
		}
	}

	/**
	 * Indexes the file at the given path, or every file below it if it is a directory.
	 *
	 * @param filePath path of the file or directory; a null path is logged and ignored
	 */
	public void index(String filePath) {
		if (filePath == null) {
			logger.warn("Skipping index target with no path");
			return;
		}
		index(new File(filePath));
	}

	/**
	 * Removes a file from the index. Per the original author's note, deletion is keyed
	 * on the index id.
	 * NOTE(review): this deletes a Compassbean keyed by the absolute path, whereas
	 * indexFile creates TextFile entries with a generated id and a canonical,
	 * slash-normalised path - confirm that this actually matches indexed entries.
	 *
	 * @param file the file whose index entry should be removed
	 */
	public void unIndex(File file) {
		Compassbean tf = new Compassbean();
		tf.setPath(file.getAbsolutePath());

		compassTemplate.delete(tf);
	}

	/**
	 * Creates one index entry per row of the given result set, storing every column
	 * value as a string. The result set is not closed here; SQL errors are logged.
	 *
	 * @param res the result set to index; null is ignored
	 */
	public void indexDbRs(ResultSet res) {
		if (res == null) {
			return;
		}
		try {
			// The column count is fixed for the whole result set, so read it once.
			int columnCount = res.getMetaData().getColumnCount();
			while (res.next()) {
				ExternalDBBean externalDBBean = new ExternalDBBean();
				externalDBBean.setBeanID(IDFactory.getId());
				List<String> parmList = new ArrayList<String>(columnCount);
				for (int column = 1; column <= columnCount; column++) {
					parmList.add(res.getString(column));
				}
				externalDBBean.setPramList(parmList);
				externalDBBean.setFileType("otherDB");
				// Index the entity.
				compassTemplate.create(externalDBBean);
			}
		} catch (SQLException e) {
			logger.error(e.getMessage(), e);
		}
	}

	/**
	 * @param buildIndex true to build the index when the bean is initialised
	 */
	public void setBuildIndex(boolean buildIndex) {
		this.buildIndex = buildIndex;
	}

	/**
	 * @param compassGps the Compass GPS used to rebuild the index
	 */
	public void setCompassGps(CompassGps compassGps) {
		this.compassGps = compassGps;
	}

	public CompassTemplate getCompassTemplate() {
		return compassTemplate;
	}

	public void setCompassTemplate(CompassTemplate compassTemplate) {
		this.compassTemplate = compassTemplate;
	}

	public TCompassService getCompassService() {
		return compassService;
	}

	public void setCompassService(TCompassService compassService) {
		this.compassService = compassService;
	}

}

 

 

接口为

/**
 * Contract for a component that builds the search index.
 */
public interface CompassIndexBuilder {

	/**
	 * Called after the implementation's properties have been set.
	 *
	 * @throws Exception if initialisation fails
	 */
	void afterPropertiesSet() throws Exception;

	/**
	 * Indexes the target at the given path.
	 *
	 * @param filePath path of the target to index
	 */
	void index(String filePath);

	/** Runs the index update again. */
	void restartUpdateIndex();
}

 

读取文件使用到的工具类PreparatorUtil

package com.tjsoft.SearchEngines.common.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.xmlbeans.XmlException;

/**
 * Static helpers that detect a file's type from its leading bytes and extract plain
 * text from text, PDF, Office and HTML files.
 *
 * @author wbin
 */
public class PreparatorUtil {
	/** Maps a file-type key (e.g. "pdf") to the upper-case hex prefix expected at the start of such a file. */
	public final static Map<String, String> FILE_TYPE_MAP = new HashMap<String, String>();

	/** Number of leading bytes read from a file for signature matching. */
	private static final int HEADER_LENGTH = 50;

	private PreparatorUtil(){}
	static{
		getAllFileType(); // initialise the file-type signature table
	}

	/**
	 * Created on 2011-02-15
	 * <p>Description: fills {@link #FILE_TYPE_MAP} with the header signatures of common file types.</p>
	 * <p>The insertion order is kept as-is because {@link #getFileTypeByStream(byte[])}
	 * returns the first match in map iteration order.</p>
	 * @author:wbin
	 */
	private static void getAllFileType()
	{
		FILE_TYPE_MAP.put("jpg", "FFD8FF"); //JPEG (jpg)
		FILE_TYPE_MAP.put("png", "89504E47"); //PNG (png)
		FILE_TYPE_MAP.put("gif", "47494638"); //GIF (gif)
		FILE_TYPE_MAP.put("tif", "49492A00"); //TIFF (tif)
		// NOTE(review): this value starts with the same bytes as the "png" entry; BMP files
		// normally start with "BM" (424D), so it looks copied from a PNG sample - confirm.
		FILE_TYPE_MAP.put("bmp", "89504E470D0A1A0A0000000D4948445200000060000000600806000000E2987738000000017352474200AECE1CE900000004"); //Windows Bitmap (bmp)
		FILE_TYPE_MAP.put("dwg", "41433130"); //CAD (dwg)
		FILE_TYPE_MAP.put("html", "68746D6C3E"); //HTML (html); the bytes spell "html>"
		FILE_TYPE_MAP.put("htm", "3C21444F435459504520"); //HTML (htm); the bytes spell "<!DOCTYPE "
		FILE_TYPE_MAP.put("rtf", "7B5C727466"); //Rich Text Format (rtf)
		FILE_TYPE_MAP.put("xml", "3C3F786D6C");
		FILE_TYPE_MAP.put("zip", "504B03040A00000000009");
		FILE_TYPE_MAP.put("rar", "52617221");
		FILE_TYPE_MAP.put("psd", "38425053"); //Photoshop (psd)
		FILE_TYPE_MAP.put("eml", "44656C69766572792D646174653A"); //Email [thorough only] (eml)
		FILE_TYPE_MAP.put("dbx", "CFAD12FEC5FD746F"); //Outlook Express (dbx)
		FILE_TYPE_MAP.put("pst", "2142444E"); //Outlook (pst)
		FILE_TYPE_MAP.put("xls", "D0CF11E0A1B11AE1000000000000000000000000000000003E000300FEFF0900060000000000000000000000010000000100"); //MS Excel (xls)
		FILE_TYPE_MAP.put("xlsx", "504B030414000600080000002100C8A3");
		FILE_TYPE_MAP.put("doc", "D0CF11E0A1B11AE1000000000000000000000000000000003E000300FEFF09000600000000000000000000004E0000005600");
		FILE_TYPE_MAP.put("docx", "504B030414000600080000002100729");
		FILE_TYPE_MAP.put("pptx", "504B03041400060008000000210036F7");
		FILE_TYPE_MAP.put("ppt", "D0CF11E0A1B11AE1000000000000000000000000000000003E000300FEFF0900060000000000000000000000020000000100");
		FILE_TYPE_MAP.put("mdb", "5374616E64617264204A"); //MS Access (mdb)
		FILE_TYPE_MAP.put("wpd", "FF575043"); //WordPerfect (wpd)
		FILE_TYPE_MAP.put("eps", "252150532D41646F6265");
		FILE_TYPE_MAP.put("ps", "252150532D41646F6265");
		FILE_TYPE_MAP.put("pdf", "255044462D312E"); //Adobe Acrobat (pdf)
		FILE_TYPE_MAP.put("qdf", "AC9EBD8F"); //Quicken (qdf)
		FILE_TYPE_MAP.put("pwl", "E3828596"); //Windows Password (pwl)
		FILE_TYPE_MAP.put("wav", "57415645"); //Wave (wav)
		FILE_TYPE_MAP.put("avi", "41564920");
		FILE_TYPE_MAP.put("ram", "2E7261FD"); //Real Audio (ram)
		FILE_TYPE_MAP.put("rm", "2E524D46"); //Real Media (rm)
		FILE_TYPE_MAP.put("mpg", "000001BA"); //
		FILE_TYPE_MAP.put("mov", "6D6F6F76"); //Quicktime (mov)
		FILE_TYPE_MAP.put("asf", "3026B2758E66CF11"); //Windows Media (asf)
		FILE_TYPE_MAP.put("mid", "4D546864"); //MIDI (mid)
		// The next three are literal text prefixes, not format signatures: "sql" and "txt"
		// both begin with "select " and "java" is "package com.", so only files starting
		// with exactly that text are recognised.
		FILE_TYPE_MAP.put("sql", "73656C656374200D0");
		FILE_TYPE_MAP.put("txt", "73656C6563742032303");
		FILE_TYPE_MAP.put("java", "7061636B61676520636F6D2E");
	}

	/**
	 * Closes a resource if it was opened, reporting (but not propagating) close failures.
	 *
	 * @param resource the resource to close; may be null
	 */
	private static void closeQuietly(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Extracts the text of an Office document.
	 *
	 * @param path path of the document
	 * @return the document text, trimmed
	 * @throws IllegalStateException if no extractor can be created for the file; the
	 *         underlying error is kept as the cause (previously this surfaced as a
	 *         NullPointerException with no message)
	 */
	public static String readOffic(String path) {
		File inputFile = new File(path);
		POITextExtractor extractor;
		try {
			extractor = ExtractorFactory.createExtractor(inputFile);
		} catch (InvalidFormatException e) {
			throw new IllegalStateException("Cannot extract text from " + path, e);
		} catch (IOException e) {
			throw new IllegalStateException("Cannot extract text from " + path, e);
		} catch (OpenXML4JException e) {
			throw new IllegalStateException("Cannot extract text from " + path, e);
		} catch (XmlException e) {
			throw new IllegalStateException("Cannot extract text from " + path, e);
		}
		return extractor.getText().trim();
	}

	/**
	 * Extracts the text of a PDF file.
	 *
	 * @param path path of the PDF file
	 * @return the document text, trimmed
	 * @throws Exception if the file cannot be read or parsed; I/O failures now propagate
	 *         to the caller instead of terminating the JVM
	 */
	public static String readPdf(String path) throws Exception {
		PDDocument pdfDocument = null;
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(path);
			PDFTextStripper stripper = new PDFTextStripper();
			pdfDocument = PDDocument.load(fis);
			StringWriter writer = new StringWriter();
			stripper.writeText(pdfDocument, writer);
			return writer.toString().trim();
		} finally {
			try {
				if (pdfDocument != null) {
					// Closing the PDDocument also closes its underlying COSDocument.
					pdfDocument.close();
				}
			} finally {
				closeQuietly(fis);
			}
		}
	}

	/**
	 * Reads an HTML file as text, keeping the HTML tags and CSS.
	 * Read errors are reported and whatever was read so far is returned.
	 *
	 * @param urlString path of the HTML file
	 * @return the file content with lines joined by "\n"
	 */
	public static String readHtml(String urlString) {
		StringBuilder content = new StringBuilder();
		File file = new File(urlString);
		FileInputStream fis = null;
		BufferedReader reader = null;
		try {
			fis = new FileInputStream(file);
			// The charset must match the one declared in the HTML header, otherwise
			// the text comes out garbled.
			reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line).append("\n");
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// The reader is null when the file could not be opened; closing the reader
			// also closes the stream it wraps.
			if (reader != null) {
				closeQuietly(reader);
			} else {
				closeQuietly(fis);
			}
		}
		return content.toString();
	}

	/**
	 * Reads a plain-text file using the platform default charset.
	 * Read errors are reported and whatever was read so far is returned.
	 *
	 * @param path path of the text file
	 * @return the file content with lines joined by "\r", trimmed
	 */
	public static String readTxt(String path) {
		StringBuilder content = new StringBuilder();
		FileReader reader = null;
		BufferedReader br = null;
		try {
			reader = new FileReader(path);
			br = new BufferedReader(reader);
			String line;
			while ((line = br.readLine()) != null) {
				content.append(line).append("\r");
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// br is null when the file could not be opened; closing br also closes reader.
			if (br != null) {
				closeQuietly(br);
			} else {
				closeQuietly(reader);
			}
		}
		return content.toString().trim();
	}

	/**
	 * Converts bytes to a lower-case hex string, two digits per byte.
	 *
	 * @param b the bytes to convert
	 * @return the hex string, or null when {@code b} is null or empty
	 */
	public static String getFileHexString(byte[] b)
	{
		if (b == null || b.length <= 0)
		{
			return null;
		}
		StringBuilder hex = new StringBuilder(b.length * 2);
		for (int i = 0; i < b.length; i++)
		{
			int value = b[i] & 0xFF;
			if (value < 0x10)
			{
				hex.append(0); // left-pad single-digit values
			}
			hex.append(Integer.toHexString(value));
		}
		return hex.toString();
	}

	/**
	 * Finds the file type whose signature is a prefix of the given header bytes.
	 * When several signatures match, the first one in map iteration order wins.
	 *
	 * @param b the leading bytes of a file
	 * @return the matching key of {@link #FILE_TYPE_MAP}, or null if nothing matches
	 */
	public static String getFileTypeByStream(byte[] b)
	{
		// String.valueOf turns a null hex string (null or empty input) into "null",
		// which matches no signature. Upper-case once instead of once per entry.
		String headerHex = String.valueOf(getFileHexString(b)).toUpperCase();
		for (Entry<String, String> entry : FILE_TYPE_MAP.entrySet()) {
			if (headerHex.startsWith(entry.getValue())) {
				return entry.getKey();
			}
		}
		return null;
	}

	/**
	 * Detects a file's type from its first bytes.
	 *
	 * @param file the file to inspect
	 * @return the detected type key, or null if the file cannot be read or no
	 *         signature matches
	 */
	public static String getfiletypeByFile(File file)
	{
		String filetype = null;
		byte[] header = new byte[HEADER_LENGTH];
		InputStream is = null;
		try
		{
			is = new FileInputStream(file);
			// A single read() may return fewer bytes than requested, so keep reading
			// until the buffer is full or the file ends; unread bytes stay zero.
			int filled = 0;
			while (filled < header.length)
			{
				int count = is.read(header, filled, header.length - filled);
				if (count < 0)
				{
					break;
				}
				filled += count;
			}
			filetype = getFileTypeByStream(header);
		}
		catch (IOException e)
		{
			// Also covers FileNotFoundException.
			e.printStackTrace();
		}
		finally
		{
			closeQuietly(is);
		}
		return filetype;
	}

	//---------html: strip tags, comments and scripts   start-------
	/**
	 * Converts HTML to plain text by splitting it into tokens and concatenating the
	 * text each token contributes.
	 *
	 * @param html the HTML markup
	 * @return the text of the markup
	 */
	public static String html2text(String html) {
		StringBuilder text = new StringBuilder(html.length());
		char[] data = html.toCharArray();
		int position = 0;
		boolean previousIsPre = false;
		Token token;
		while ((token = parse(data, position, previousIsPre)) != null) {
			previousIsPre = token.isPreTag();
			text.append(token.getText());
			position += token.getLength();
		}
		return text.toString();
	}

	/**
	 * Reads the next token (text, tag, comment or script) starting at {@code start}.
	 *
	 * @param data          the HTML characters
	 * @param start         index of the first character of the token
	 * @param previousIsPre whether the previous token was a pre tag
	 * @return the token, or null when {@code start} is at or past the end of the data
	 */
	private static Token parse(char[] data, int start, boolean previousIsPre) {
		if (start >= data.length) {
			return null;
		}
		if (data[start] != '<') {
			// Plain text running up to the next tag, or to the end of the data.
			int nextTag = indexOf(data, start + 1, '<');
			int textEnd = (nextTag == -1) ? data.length : nextTag;
			return new Token(Token.TOKEN_TEXT, data, start, textEnd, previousIsPre);
		}

		// A tag, comment or script.
		int tagEnd = indexOf(data, start + 1, '>');
		if (tagEnd == -1) {
			// No closing '>': the remainder is all text.
			return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
		}
		String tag = new String(data, start, tagEnd - start + 1);
		if (tag.startsWith("<!--")) {
			// An unterminated comment is treated as running to the end of the data.
			int commentEnd = indexOf(data, start + 1, "-->");
			int end = (commentEnd == -1) ? data.length : commentEnd + 3;
			return new Token(Token.TOKEN_COMMENT, data, start, end, previousIsPre);
		}
		if (tag.toLowerCase().startsWith("<script")) {
			// The opening tag is matched case-insensitively, so the closing tag must be
			// too; otherwise "</SCRIPT>" would swallow the rest of the document.
			int scriptEnd = indexOf(data, start + 1, "</script>", true);
			int end = (scriptEnd == -1) ? data.length : scriptEnd + 9;
			return new Token(Token.TOKEN_SCRIPT, data, start, end, previousIsPre);
		}
		return new Token(Token.TOKEN_TAG, data, start, tagEnd + 1, previousIsPre);
	}

	/**
	 * Finds the first case-sensitive occurrence of {@code s} at or after {@code start}.
	 *
	 * @return the index of the match, or -1 if there is none
	 */
	private static int indexOf(char[] data, int start, String s) {
		return indexOf(data, start, s, false);
	}

	/**
	 * Finds the first occurrence of {@code s} at or after {@code start}.
	 *
	 * @param data       the characters to search
	 * @param start      index to start searching from
	 * @param s          the text to look for
	 * @param ignoreCase whether characters are compared case-insensitively
	 * @return the index of the match, or -1 if there is none
	 */
	private static int indexOf(char[] data, int start, String s, boolean ignoreCase) {
		char[] pattern = s.toCharArray();
		// "<=" so that a match ending exactly at the end of the data is found.
		for (int i = start; i <= data.length - pattern.length; i++) {
			boolean match = true;
			for (int j = 0; j < pattern.length; j++) {
				char actual = data[i + j];
				char expected = pattern[j];
				if (ignoreCase) {
					actual = Character.toLowerCase(actual);
					expected = Character.toLowerCase(expected);
				}
				if (actual != expected) {
					match = false;
					break;
				}
			}
			if (match) {
				return i;
			}
		}
		return (-1);
	}

	/**
	 * Finds the first occurrence of {@code c} at or after {@code start}.
	 *
	 * @return the index of the match, or -1 if there is none
	 */
	private static int indexOf(char[] data, int start, char c) {
		for (int i = start; i < data.length; i++) {
			if (data[i] == c) {
				return i;
			}
		}
		return (-1);
	}
}
@SuppressWarnings("unchecked")
	/**
	 * One lexical piece of an HTML document (text run, comment, tag or script)
	 * together with its plain-text rendering.
	 *
	 * The plain text is computed once in the constructor:
	 *   - TOKEN_TAG: a few structural tags are turned into line breaks, bullets,
	 *     rules or tabs; every other tag renders as the empty string.
	 *   - TOKEN_TEXT: whitespace is collapsed (unless inside a pre block) and
	 *     character references such as &amp;lt; or &amp;#65; are decoded.
	 *   - TOKEN_COMMENT / TOKEN_SCRIPT: render as the empty string.
	 */
	class Token {

    public static final int TOKEN_TEXT    = 0; // html text.
    public static final int TOKEN_COMMENT = 1; // comment like <!-- comments... -->
    public static final int TOKEN_TAG     = 2; // tag like <pre>, <font>, etc.
    public static final int TOKEN_SCRIPT  = 3;

    // Lower-case tag prefixes, matched case-insensitively by compareTag().
    private static final char[] TAG_BR  = "<br".toCharArray();
    private static final char[] TAG_P   = "<p".toCharArray();
    private static final char[] TAG_LI  = "<li".toCharArray();
    private static final char[] TAG_PRE = "<pre".toCharArray();
    private static final char[] TAG_HR  = "<hr".toCharArray();

    // Lower-case closing tags, matched case-insensitively by compareString().
    private static final char[] END_TAG_TD = "</td>".toCharArray();
    private static final char[] END_TAG_TR = "</tr>".toCharArray();
    private static final char[] END_TAG_LI = "</li>".toCharArray();

    // Longest character reference (from '&' to ';' inclusive) toText() will look for.
    private static final int MAX_ENTITY_LENGTH = 10;

    // Named character references -> replacement text. Keys must be the full
    // reference including '&' and ';' because that is what toText() looks up.
    private static final Map<String, String> SPECIAL_CHARS = new HashMap<String, String>();

    static {
        SPECIAL_CHARS.put("&quot;",  "\"");
        SPECIAL_CHARS.put("&lt;",    "<");
        SPECIAL_CHARS.put("&gt;",    ">");
        SPECIAL_CHARS.put("&amp;",   "&");
        SPECIAL_CHARS.put("&reg;",   "(r)");
        SPECIAL_CHARS.put("&copy;",  "(c)");
        SPECIAL_CHARS.put("&nbsp;",  " ");
        SPECIAL_CHARS.put("&pound;", "\u00A3"); // pound sign, escaped to be source-encoding safe
    }

    private final int type;
    private final String html;      // original html
    private final int length;       // html length
    private String text = null;     // plain-text rendering, null means "nothing to emit"
    private boolean isPre = false;  // true when this token is an opening <pre> tag

    /**
     * Creates a token from {@code data[start..end)} and computes its text.
     *
     * @param type          one of the TOKEN_* constants
     * @param data          the full document buffer
     * @param start         index of the first character of the token (inclusive)
     * @param end           index just past the last character of the token (exclusive)
     * @param previousIsPre whether text should be treated as preformatted, i.e.
     *                      spaces and line breaks are preserved instead of collapsed
     */
    public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
        this.type = type;
        this.length = end - start;
        this.html = new String(data, start, length);
        parseText(previousIsPre);
    }

    /** Returns the number of characters of original HTML covered by this token. */
    public int getLength() {
        return length;
    }

    /** Returns true if this token is a tag token that matched {@code <pre}. */
    public boolean isPreTag() {
        return isPre;
    }

    /** Returns the plain-text rendering of this token, never null. */
    public String getText() {
        return text == null ? "" : text;
    }

    /** Returns the original HTML of this token. */
    public String toString() {
        return html;
    }

    // Fills in 'text' / 'isPre' according to the token type.
    private void parseText(boolean previousIsPre) {
        if (type == TOKEN_TAG) {
            char[] cs = html.toCharArray();
            if (compareTag(TAG_BR, cs) || compareTag(TAG_P, cs)) {
                text = "\n";
            } else if (compareTag(TAG_LI, cs)) {
                text = "\n* ";
            } else if (compareTag(TAG_PRE, cs)) {
                isPre = true;
            } else if (compareTag(TAG_HR, cs)) {
                text = "\n--------\n";
            } else if (compareString(END_TAG_TD, cs)) {
                text = "\t";
            } else if (compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs)) {
                text = "\n";
            }
        } else if (type == TOKEN_TEXT) {
            text = toText(html, previousIsPre);
        }
        // comments and scripts produce no text
    }

    /**
     * Converts a run of HTML text into plain text.
     *
     * Outside a pre block, consecutive spaces collapse into one and line breaks
     * are dropped; inside a pre block both are kept (CRLF and lone CR become LF).
     * Named references listed in SPECIAL_CHARS and decimal numeric references
     * in the range 1..65535 are decoded; anything else starting with '&' is
     * copied through literally.
     */
    private String toText(String html, final boolean isPre) {
        char[] cs = html.toCharArray();
        StringBuilder buffer = new StringBuilder(cs.length);
        int start = 0;
        boolean continueSpace = false;
        while (start < cs.length) {
            char current = cs[start];
            char next = (start + 1 < cs.length) ? cs[start + 1] : '\0';

            if (current == ' ') {
                if (isPre || !continueSpace) {
                    buffer.append(' ');
                }
                continueSpace = true;
                start++;
                continue;
            }
            if (current == '\r' && next == '\n') {
                if (isPre) {
                    buffer.append('\n');
                }
                start += 2;
                continue;
            }
            if (current == '\n' || current == '\r') {
                if (isPre) {
                    buffer.append('\n');
                }
                start++;
                continue;
            }

            // Any other character ends a run of spaces.
            continueSpace = false;
            if (current != '&') {
                buffer.append(current);
                start++;
                continue;
            }

            // Possibly a character reference: look for the terminating ';'.
            int entityLength = readUtil(cs, start, ';', MAX_ENTITY_LENGTH);
            if (entityLength == -1) { // just a bare '&'
                buffer.append('&');
                start++;
                continue;
            }
            String spec = new String(cs, start, entityLength);
            String specChar = SPECIAL_CHARS.get(spec);
            if (specChar != null) {
                buffer.append(specChar);
                start += entityLength;
                continue;
            }
            if (next != '#') { // unknown named reference: keep the '&' as text
                buffer.append('&');
                start++;
                continue;
            }

            // Numeric reference "&#NNN;": the digits sit between "&#" and ";".
            int code = parseCharCode(new String(cs, start + 2, entityLength - 3));
            if (code > 0 && code < 65536) {
                buffer.append((char) code);
                // Skip the whole reference, not just the '&', so the digits
                // are not emitted again as text.
                start += entityLength;
            } else {
                buffer.append("&#");
                start += 2;
            }
        }
        return buffer.toString();
    }

    // Parses a decimal character code; returns -1 when 'digits' is not a number.
    private static int parseCharCode(String digits) {
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException ignored) {
            // Not a decimal number: the caller copies the reference through as text.
            return -1;
        }
    }

    /**
     * Scans forward from {@code cs[start]} for the character {@code util},
     * looking at no more than {@code maxLength} characters and never past the
     * end of the array.
     *
     * @return the number of characters from {@code start} up to and including
     *         the match, or -1 if it was not found within the window
     */
    private int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
        int end = start + maxLength;
        if (end > cs.length) {
            end = cs.length;
        }
        // Loop to the clamped 'end' so a window that overruns the buffer is safe.
        for (int i = start; i < end; i++) {
            if (cs[i] == util) {
                return i - start + 1;
            }
        }
        return -1;
    }

    /**
     * Checks whether {@code tag} starts with the tag name {@code ori_tag},
     * ignoring case, e.g. standard tag "<input" against "<INPUT value=aa>".
     * The character following the name must not be a letter a-z, so "<p" does
     * not match "<pre>".
     */
    private boolean compareTag(final char[] ori_tag, char[] tag) {
        if (ori_tag.length >= tag.length) {
            return false;
        }
        for (int i = 0; i < ori_tag.length; i++) {
            if (Character.toLowerCase(tag[i]) != ori_tag[i]) {
                return false;
            }
        }
        char following = Character.toLowerCase(tag[ori_tag.length]);
        return following < 'a' || following > 'z';
    }

    /** Checks whether {@code comp} starts with {@code ori}, ignoring the case of {@code comp}. */
    private boolean compareString(final char[] ori, char[] comp) {
        if (ori.length > comp.length) {
            return false;
        }
        for (int i = 0; i < ori.length; i++) {
            if (Character.toLowerCase(comp[i]) != ori[i]) {
                return false;
            }
        }
        return true;
    }
}

 

 

分享到:
评论
1 楼 kobe6111 2011-08-05  
博主写的不错,有没有代码下载链接

相关推荐

    基于Compass2.2与Spring 结合建立索引的实例

    ### 基于Compass2.2与Spring结合建立索引的实例 #### 一、Compass简介 Compass是一个开源的.NET和Java框架,它为应用程序提供了完整的文本搜索功能。该框架支持多种搜索引擎(如Elasticsearch, Solr等),使得开发...

    compass jar包

    一旦映射配置完成,你可以启动Compass实例,它会自动建立索引。之后,每当数据发生变化时,Compass可以实时更新索引,保持搜索结果的准确性。在你的应用程序中,可以使用Compass提供的API来执行搜索查询,它支持多种...

    compass搜索引擎技术

    6. **建立索引** 应用启动时,通常会执行一次全量索引,将所有数据导入到Compass中。这可以通过调用Compass的批处理索引API完成。在运行时,Compass可以监听数据库的变化,自动对新插入、更新或删除的数据进行索引...

    使用compass+lucene实现简单的全文检索功能

    这通常涉及读取数据源中的信息,然后通过 Compass 将每个文档的关键信息转换为 Lucene 可理解的格式并建立索引。索引过程可以配置为实时、批量或定期执行。 4. **索引映射**:在 Compass 中,需要定义索引字段与...

    Compass原理深入学习笔记

    3. 索引生成:根据处理后的词生成词典并排序,建立文档倒排链表,记录每个词在哪些文档中出现,以及出现的频率。 搜索过程包括: 1. 用户输入查询。 2. 分析查询语句,生成词元和查询树。 3. 加载索引到内存。 4. ...

    S2SH+compass (实现站内全文检索)

    2. 引入Compass:在项目中添加Compass的依赖,配置相应的compass.xml文件,定义索引的存储位置、搜索引擎的分析器以及需要建立索引的数据源。 3. 创建索引:使用Compass的API,根据需要索引的数据实体,在应用启动...

    compass_src

    如果没有Compass,我们一般会在每天深夜重建一次索引。相比Compass的做法, 一来反应迟缓,平均延时半天; 二来效率没有Compass高。... 三来不支持事务,如果建立索引过程中出现异常,索引文件的状态是不可控的。

    Compass学习文档1

    Compass 是一个强大的开源搜索引擎框架,它建立在 Lucene 的基础之上,为 Java 开发者提供了更为简洁的搜索引擎 API。Compass 的设计目标是简化搜索引擎的集成,使其能够无缝地与现有的应用程序,如 Hibernate 和 ...

    Compass+SSH搜索引擎简单项目

    3. 查看Compass的配置,了解它是如何与Hibernate集成,以及如何建立索引的。 4. 理解Action类如何接收并处理搜索请求,以及如何通过Service层调用Compass进行搜索。 5. 跟踪视图层的JSP文件,了解搜索结果是如何呈现...

    基于Java的Luncene的compass框架说明使用技术文档.pdf

    Lucene的主要任务是为文件中的每个词建立索引,这样可以大大提高搜索效率。Lucene提供了一套API用于解析、过滤、分析文件并构建和使用索引。开发者可以将Lucene视为支持全文索引的数据库系统。 - **Compass**:...

    Lucene 3.0完成入门

    - **分词**:Lucene 使用分词器(Tokenizer)将文档拆分为单独的词汇项(Tokens),这是建立索引的基础。 - **字段**:每个文档可以包含多个字段,如标题、内容、作者等,每个字段都有独立的索引。 2. **Java 中...

    lucene.ppt

    2. **分块索引**:能快速对新内容建立索引,并通过合并优化索引。 3. **面向对象设计**:易于学习和扩展,方便添加新功能。 4. **文本分析接口**:允许用户自定义语言和文件格式的分析,只需实现Token流处理。 5. **...

    全文检索技术(自己总结)

    该技术的核心在于分析文档内容,提取关键字,并建立索引,使得在海量数据中查找特定信息变得迅速。 **全文检索系统** 全文检索系统通常由几个关键组件构成: 1. **网页爬取器**:负责自动遍历互联网,按照网页之间...

    Dosya Arayıcı:遍历Windows文件,对其进行索引并使用“拼写正确”呈现给用户-开源

    【标题】"Dosya Arayıcı"是一款专为Windows操作系统设计的开源文件搜索工具,它具有独特的功能,能够遍历用户的文件系统,建立一个全面的索引,以便用户能够更快速、更准确地找到所需文件。这个软件的核心价值在于...

    lunece

    其次,Lucene引入了分块索引,可以快速为新内容建立索引,并通过与旧索引合并来优化整个索引结构。此外,其面向对象的设计使得扩展和自定义功能变得简单。Lucene还提供了一个独立于语言和文件格式的文本分析接口,...

    MongoDB安装配置过程

    4. **完全索引**:支持对数据建立索引,包括内部对象,有助于提高查询效率。 5. **复制与故障恢复**:MongoDB 支持数据复制,以实现高可用性和故障恢复。 6. **自动处理碎片**:能够自动管理数据的碎片,适应...

    全文检索Lucene 全文检索Lucene

    - **索引(Index)**:Lucene通过建立索引来实现快速搜索。索引是预处理的结果,将原始文档转换为倒排索引结构,便于高效的关键词查找。 - **文档(Document)**:在Lucene中,每个要索引的对象被称为一个文档,...

    自己动手写搜索引擎

    - 针对非纯文本文件的索引和搜索也是搜索引擎的重要功能之一。这部分内容详细介绍了如何将Word、Excel和PDF等格式的文档转化为可搜索的文本内容,并建立相应的索引。 #### 七、Compass搜索引擎框架 - Compass是一...

    开源企业搜索引擎SOLR的应用教程

    索引过程包括解析文档、提取字段、建立索引等步骤。 - **1.3.2 搜索**:用户通过提交查询请求到Solr,Solr根据请求条件从索引中检索数据并返回结果。 #### 二、Solr的安装与配置 **2.1 在Tomcat下Solr安装** - **...

Global site tag (gtag.js) - Google Analytics