DFA算法过滤敏感词，替换为*

yhhazr

浏览: 48409 次
性别:
来自: 重庆

最近访客 | 更多访客>>

fdayok

Peak_

5180466

sosoab

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (19)

社区版块

存档分类

DFA 敏感词过滤

import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Properties;

/**
 * Sensitive-word filter backed by a byte-level trie (DFA).
 *
 * <p>Keywords are encoded with the configured charset and inserted byte by
 * byte into a tree of {@link TreeNode}s. Searching walks the encoded text
 * through the tree and overwrites every byte of a matched keyword with
 * {@code '*'}. Because masking works on bytes, a multi-byte character that is
 * part of a match produces one asterisk per encoded byte.
 */
public class test {
    /** Byte written over every byte of a matched keyword. */
    private static final byte MASK = (byte) '*';

    /** Root node of the keyword tree. */
    private TreeNode rootNode = new TreeNode();

    /** Charset name used to encode keywords and text and to decode the result. */
    private String charset = "utf-8";

    /**
     * Builds (or extends) the DFA from the given keywords.
     *
     * <p>{@code null} entries and entries that are empty after trimming are
     * skipped. A {@code null} list is treated as empty.
     *
     * @param keywordList keywords to add to the tree
     * @throws UnsupportedEncodingException if the configured charset is not supported
     */
    public void createKeywordTree(List<String> keywordList) throws UnsupportedEncodingException {
        if (keywordList == null) {
            return;
        }
        for (String keyword : keywordList) {
            if (keyword == null) {
                continue;
            }
            byte[] bytes = keyword.trim().getBytes(charset);
            if (bytes.length == 0) {
                continue;
            }
            TreeNode current = rootNode;
            for (byte b : bytes) {
                // Mask to 0..255 so negative bytes map to a valid child slot.
                int index = b & 0xff;
                TreeNode next = current.getSubNode(index);
                if (next == null) {
                    next = new TreeNode();
                    current.setSubNode(index, next);
                }
                current = next;
            }
            // The node reached by the last byte marks the end of a keyword.
            current.setKeywordEnd(true);
        }
    }

    /**
     * Masks every keyword occurrence in {@code text}.
     *
     * @param text text to filter; {@code null} yields an empty string
     * @return the text with each byte of every matched keyword replaced by {@code '*'}
     * @throws UnsupportedEncodingException if the configured charset is not supported
     */
    public String searchKeyword(String text) throws UnsupportedEncodingException {
        if (text == null) {
            return "";
        }
        return searchKeyword(text.getBytes(charset));
    }

    /**
     * Masks every keyword occurrence in the given encoded text.
     *
     * <p>The array is modified in place: matched bytes are overwritten with
     * {@code '*'}. The (modified) array is then decoded with the configured
     * charset, the same one used to build the tree.
     *
     * @param bytes text encoded with the configured charset; {@code null} or
     *              empty yields an empty string
     * @return the decoded, masked text
     * @throws IllegalStateException if the configured charset is not supported
     */
    public String searchKeyword(byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return "";
        }
        TreeNode current = rootNode;
        // Number of bytes consumed since the current match attempt started
        // (or since the last completed keyword).
        int rollback = 0;
        int position = 0;
        while (position < bytes.length) {
            current = current.getSubNode(bytes[position] & 0xFF);
            if (current == null) {
                // Dead end: rewind so that, after the increment below, the scan
                // restarts from the root one byte after the attempt's start.
                position -= rollback;
                rollback = 0;
                current = rootNode;
            } else if (current.isKeywordEnd()) {
                // Mask the byte at position and the rollback bytes before it.
                for (int i = 0; i <= rollback; i++) {
                    bytes[position - i] = MASK;
                }
                // Keep walking from this node so a longer keyword sharing this
                // prefix can still match; on failure only this byte is rewound.
                rollback = 1;
            } else {
                rollback++;
            }
            position++;
        }
        try {
            return new String(bytes, charset);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("Unsupported charset: " + charset, e);
        }
    }

    /**
     * Sets the charset name used for encoding keywords/text and decoding results.
     *
     * @param charset charset name, e.g. {@code "utf-8"}
     */
    public void setCharset(String charset) {
        this.charset = charset;
    }
}

import java.util.ArrayList;
import java.util.List;

/**
 * A single node of the keyword tree: a flag telling whether a keyword ends
 * here, plus a fixed table of 256 child slots addressed by index.
 */
public class TreeNode {
    /** Number of child slots held by every node. */
    private static final int NODE_LEN = 256;

    /** {@code true} when a keyword ends at this node; {@code false} otherwise. */
    private boolean terminal = false;

    /** Child table; every slot starts out as {@code null}. */
    private List<TreeNode> children = new ArrayList<TreeNode>(NODE_LEN);

    /** Creates a node whose {@code NODE_LEN} child slots are all empty. */
    public TreeNode() {
        int remaining = NODE_LEN;
        while (remaining-- > 0) {
            children.add(null);
        }
    }

    /**
     * Stores a child node in the given slot, replacing whatever was there.
     *
     * @param index slot to write
     * @param node  child node to store
     */
    public void setSubNode(int index, TreeNode node) {
        children.set(index, node);
    }

    /**
     * Reads the child stored in the given slot.
     *
     * @param index slot to read
     * @return the child node, or {@code null} if the slot is empty
     */
    public TreeNode getSubNode(int index) {
        TreeNode child = children.get(index);
        return child;
    }

    /**
     * @return {@code true} if a keyword ends at this node
     */
    public boolean isKeywordEnd() {
        return terminal;
    }

    /**
     * Marks or unmarks this node as the end of a keyword.
     *
     * @param end {@code true} to mark a keyword end
     */
    public void setKeywordEnd(boolean end) {
        terminal = end;
    }
}

分享到：