浏览 13363 次
锁定老帖子 主题:字符串多模式匹配算法:关键字过滤技术
精华帖 (0) :: 良好帖 (0) :: 新手帖 (1) :: 隐藏帖 (0)
|
|
---|---|
作者 | 正文 |
发表时间:2009-11-08
最后修改:2010-01-21
ps:09年网上掀起了敏感字词过滤热,一时想到就动手干起来了,此算法借鉴经典的WM算法改写而成,不足之处还需优化和改进,测试时只对屏蔽的illegal keywords 进行了个数统计!
有需要测试用的,请在最后自己下载这两个肮脏的文件,请勿在JE上到处传播,大家都是文明人哈!
1.MutiPatternParser.java
package com.mutiplepatternmatch; import java.util.Vector; /** * * @author Daniel * */ public class MutiPatternParser { public MutiPatternParser() { } private boolean initFlag = false; private UnionPatternSet unionPatternSet = new UnionPatternSet(); private int maxIndex = (int) java.lang.Math.pow(2, 16); private int shiftTable[] = new int[maxIndex]; public Vector<AtomicPattern> hashTable[] = new Vector[maxIndex]; private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet(); public boolean addFilterKeyWord(String keyWord, int level) { if (initFlag == true) return false; UnionPattern unionPattern = new UnionPattern(); String []strArray = keyWord.split(" "); for (int i = 0; i < strArray.length; i++) { Pattern pattern = new Pattern(strArray[i]); AtomicPattern atomicPattern = new AtomicPattern(pattern); unionPattern.addNewAtomicPattrn(atomicPattern); unionPattern.setLevel(level); atomicPattern.setBelongUnionPattern(unionPattern); } tmpUnionPatternSet.addNewUnionPattrn(unionPattern); return true; } private boolean isValidChar(char ch) { if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) return true; if ((ch >= 0x4e00 && ch <= 0x7fff) || (ch >= 0x8000 && ch <= 0x952f)) return true;//简体中文汉字编码 return false; } public String parse(String content, Vector<Integer> levelSet) { if (initFlag == false) init(); Vector<AtomicPattern> aps = new Vector<AtomicPattern>(); String preContent = preConvert(content); for (int i = 0; i < preContent.length();) { char checkChar = preContent.charAt(i); if (shiftTable[checkChar] == 0) { Vector<AtomicPattern> tmpAps = new Vector<AtomicPattern>(); tmpAps = findMathAps(preContent.substring(0, i + 1), hashTable[checkChar]); aps.addAll(tmpAps); i++; } else i = i + shiftTable[checkChar]; } parseAtomicPatternSet(aps, levelSet); return content; } private void parseAtomicPatternSet(Vector<AtomicPattern> aps, Vector<Integer> levelSet) { while (aps.size() > 0) { AtomicPattern ap = aps.get(0); UnionPattern up = ap.belongUnionPattern; if (up.isIncludeAllAp(aps) == true) { levelSet.add(new Integer(up.getLevel())); } aps.remove(0); } } private Vector<AtomicPattern> findMathAps(String src, Vector<AtomicPattern> destAps) { Vector<AtomicPattern> aps = new Vector<AtomicPattern>(); for (int i = 0; i < destAps.size(); i++) { AtomicPattern ap = destAps.get(i); if (ap.findMatchInString(src) == true) aps.add(ap); } return aps; } private String preConvert(String content) { String retStr = new String(); for (int i = 0; i < content.length(); i++) { char ch = content.charAt(i); if (this.isValidChar(ch) == true) { retStr = retStr + ch; } } return retStr; } // shift table and hash table of initialize private void init() { initFlag = true; for (int i = 0; i < maxIndex; i++) hashTable[i] = new Vector<AtomicPattern>(); shiftTableInit(); hashTableInit(); } public void clear() { tmpUnionPatternSet.clear(); initFlag = false; } private void shiftTableInit() { for (int i = 0; i < maxIndex; i++) shiftTable[i] = 2; Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet(); for (int i = 0; i < upSet.size(); i++) { Vector<AtomicPattern> apSet = upSet.get(i).getSet(); for (int j = 0; j < apSet.size(); j++) { AtomicPattern ap = apSet.get(j); Pattern pattern = ap.getPattern(); if (shiftTable[pattern.charAtEnd(1)] != 0) shiftTable[pattern.charAtEnd(1)] = 1; if (shiftTable[pattern.charAtEnd(0)] != 0) shiftTable[pattern.charAtEnd(0)] = 0; } } } private void hashTableInit() { Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet(); for (int i = 0; i < upSet.size(); i++) { Vector<AtomicPattern> apSet = upSet.get(i).getSet(); for (int j = 0; j < apSet.size(); j++) { AtomicPattern ap = apSet.get(j); Pattern pattern = ap.getPattern(); if (pattern.charAtEnd(0) != 0) { hashTable[pattern.charAtEnd(0)].add(ap); } } } } } class Pattern { // string Pattern(String str) { this.str = str; } public char charAtEnd(int index) { if (str.length() > index) { return str.charAt(str.length() - index - 1); } else return 0; } public String str; public String getStr() { return str; }; } class AtomicPattern { public boolean findMatchInString(String str) { if (this.pattern.str.length() > str.length()) return false; int beginIndex = str.length() - this.pattern.str.length(); String eqaulLengthStr = str.substring(beginIndex); if (this.pattern.str.equalsIgnoreCase(eqaulLengthStr)) return true; return false; } AtomicPattern(Pattern pattern) { this.pattern = pattern; }; private Pattern pattern; public UnionPattern belongUnionPattern; public UnionPattern getBelongUnionPattern() { return belongUnionPattern; } public void setBelongUnionPattern(UnionPattern belongUnionPattern) { this.belongUnionPattern = belongUnionPattern; } public Pattern getPattern() { return pattern; } public void setPattern(Pattern pattern) { this.pattern = pattern; } } class SameAtomicPatternSet { SameAtomicPatternSet() { SAPS = new Vector<AtomicPattern>(); }; public Vector<AtomicPattern> SAPS; } class UnionPattern { // union string UnionPattern() { this.apSet = new Vector<AtomicPattern>(); } public Vector<AtomicPattern> apSet; public void addNewAtomicPattrn(AtomicPattern ap) { this.apSet.add(ap); } public Vector<AtomicPattern> getSet() { return apSet; } public boolean isIncludeAllAp(Vector<AtomicPattern> inAps) { if (apSet.size() > inAps.size()) return false; for (int i = 0; i < apSet.size(); i++) { AtomicPattern ap = apSet.get(i); if (isInAps(ap, inAps) == false) return false; } return true; } private boolean isInAps(AtomicPattern ap, Vector<AtomicPattern> inAps) { for (int i = 0; i < inAps.size(); i++) { AtomicPattern destAp = inAps.get(i); if (ap.getPattern().str.equalsIgnoreCase(destAp.getPattern().str) == true) return true; } return false; } public void setLevel(int level) { this.level = level; } public int getLevel() { return this.level; } private int level; } class UnionPatternSet { // union string set UnionPatternSet() { this.unionPatternSet = new Vector<UnionPattern>(); } public void addNewUnionPattrn(UnionPattern up) { this.unionPatternSet.add(up); } public Vector<UnionPattern> unionPatternSet; public Vector<UnionPattern> getSet() { return unionPatternSet; } public void clear() { unionPatternSet.clear(); } } 2.TxtReader.java package com.mutiplepatternmatch; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.InputStreamReader; import java.io.Serializable; import java.io.UnsupportedEncodingException; /** * * @author Daniel * */ public class TxtReader implements Serializable{ public TxtReader() { super(); } public static BufferedReader keywordReader(String fileName){ File file=new File(fileName); BufferedReader br=null; try { FileInputStream in=new FileInputStream(file); InputStreamReader inReader = new InputStreamReader(in, "UTF-8"); br=new BufferedReader(inReader); } catch (FileNotFoundException e) { System.out.println("你想加载的文件没有找到!!!"); e.printStackTrace(); } catch (UnsupportedEncodingException e) { System.out.println("你指定的编码类型不支持哦!!!"); e.printStackTrace(); } return br; } } 3.FilterTest.java package com.mutiplepatternmatch; import java.io.BufferedReader; import java.io.IOException; import java.util.Vector; public class FilterTest { public static void main(String args[]) { MutiPatternParser filterEngine = new MutiPatternParser(); BufferedReader brKeyword = TxtReader .keywordReader("file/illegalkeyword.txt"); BufferedReader brArticle = TxtReader.keywordReader("file/article.txt"); String keyword = null; String article = null; long startTime=System.currentTimeMillis(); StringBuffer buffer=new StringBuffer(); Vector<Integer> levelSet = new Vector<Integer>(); try { while ((keyword = brKeyword.readLine()) != null) { filterEngine.addFilterKeyWord(keyword, 1); } while ((article = brArticle.readLine()) != null) { buffer.append(article); } } catch (IOException e) { // TODO Auto-generated catch block System.out.println("读取文件IO异常!!!"); e.printStackTrace(); } String content=filterEngine.parse(buffer.toString(), levelSet); System.out.println("article.txt test:levelSet.size=" + levelSet.size()+"\n this article's content: "+content); long endTime=System.currentTimeMillis(); System.out.println("Cost:"+(endTime-startTime)+"ms"); /* 清除过滤算法引擎,带来的后果是引擎中将找不到任何的字符,测试代码,用来重载关键字时使用! */ filterEngine.clear(); levelSet.clear(); filterEngine.parse("我我我 你你你 他他他", levelSet); System.out.println("test:levelSet.size=" + levelSet.size());// 测试正确结果是找不到匹配字符 levelSet.clear(); filterEngine.parse("a'b'c a b c", levelSet); System.out.println("test:tz levelSet.size=" + levelSet.size());// 测试正确结果是找不到匹配字符*/ } } 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。
推荐链接
|
|
返回顶楼 | |
发表时间:2010-01-06
就是想要你的关键词呢.发出来吧!
|
|
返回顶楼 | |
发表时间:2010-01-06
测试数据呢?
|
|
返回顶楼 | |
发表时间:2010-02-09
这个很久之前就看到了~就是看不懂啊~~~~~~
|
|
返回顶楼 | |