HTML过滤和补齐（一）

sillycat

浏览: 2555184 次
性别:
来自: 成都

最近访客更多访客>>

huageng520

learnmore

u012363178

ymgjava

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

JAVA

HTML Apache Security XML J#

主要使用了一个工具类（TBStringUtil）来过滤 HTML，
其中用到了 alibaba 的以下几个类：
import com.alibaba.common.lang.ObjectUtil;
import com.alibaba.common.lang.StringEscapeUtil;
import com.alibaba.common.lang.i18n.LocaleUtil;
import com.alibaba.common.lang.internal.Entities;
import com.alibaba.common.lang.StringUtil;

package com.megaeyes.ipcamera.service.util;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.codec.binary.Base64;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.PatternMatcherInput;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Substitution;
import org.apache.oro.text.regex.Util;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.alibaba.common.lang.ObjectUtil;
import com.alibaba.common.lang.StringEscapeUtil;
import com.alibaba.common.lang.i18n.LocaleUtil;
import com.alibaba.common.lang.internal.Entities;
import com.alibaba.common.lang.StringUtil;

public class TBStringUtil {
// Shared digest instance used by hash(); assigned in the static initializer
// with MessageDigest.getInstance("MD5") and left null if that lookup fails.
private static MessageDigest mHasher;
// Lowercase hexadecimal digits, indexed by nibble value; used by hash().
private static char[] digits = { '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
// Compiled in the static initializer from "(http://[a-zA-Z0-9_/&=?\\.;]*)".
// Not referenced by the methods visible in this part of the file.
private static Pattern escapeURLsInHTMLPattern = null;
// Compiled in the static initializer (case-insensitive) from
// "^http://[a-z0-9]+\\.taobao\\.com.*$".
// Not referenced by the methods visible in this part of the file.
private static Pattern escapeSpecialHTMLPattern = null;
// Attribute whitelists handed to ElementRemover.acceptElement(...) in
// escapeSpecialHTML(String, boolean). Each array lists the attributes that
// are registered together with the corresponding tag(s).
// commonAttribute: used for the basic formatting/block tags (b, i, p, span, ...).
private static String[] commonAttribute = new String[] { "style", "align",
    "valign", "class", "bgcolor", "background", "title" };
// divAttribute: same entries as commonAttribute except that "style" is absent.
private static String[] divAttribute = new String[] { "align", "valign",
    "class", "bgcolor", "background", "title" };
// imgAttribute: registered for <img>.
private static String[] imgAttribute = new String[] { "style", "align",
    "valign", "class", "bgcolor", "background", "title", "src",
    "border", "width", "height", "alt", "usemap" };
// fontAttribute: registered for <font>.
private static String[] fontAttribute = new String[] { "style", "align",
    "valign", "class", "bgcolor", "background", "title", "color",
    "size", "face" };
// tableAttribute: registered for <table>.
// NOTE(review): "blockquote" is an element name, not a table attribute --
// looks like a stray entry; confirm before removing.
private static String[] tableAttribute = new String[] { "style", "align",
    "valign", "class", "bgcolor", "background", "title", "border",
    "width", "height", "cellpadding", "cellspacing", "bordercolor",
    "blockquote" };
// tdAttribute: registered for both <tr> and <td>.
private static String[] tdAttribute = new String[] { "style", "align",
    "valign", "class", "bgcolor", "background", "title", "width",
    "height", "colspan", "rowspan" };
// marqueeAttribute: registered for <marquee>.
private static String[] marqueeAttribute = new String[] { "style", "align",
    "valign", "class", "bgcolor", "background", "title",
    "scrollamount", "direction", "behavior", "width", "height",
    "scrolldelay" };
// aAttribute: registered for <a>.
private static String[] aAttribute = new String[] { "style", "align",
    "valign", "class", "bgcolor", "background", "title", "target",
    "name", "href" };

// Whitelists for <bgsound>, <map> and <area> respectively.
private static String[] bgsoundAttribute = new String[] { "src", "loop" };
private static String[] mapAttribute = new String[] { "name" };
private static String[] areaAttribute = new String[] { "href", "shape",
    "coords" };

// Filled with "img", "br" and "input" by the static initializer.
// NOTE(review): the name suggests these tags are emitted in self-closed form
// when HTML is written back out; the consumer is not visible in this part of
// the file -- confirm.
private static Set INLINE_CLOSED_TAG = new HashSet();

static {
   INLINE_CLOSED_TAG.add("img");
   INLINE_CLOSED_TAG.add("br");
   INLINE_CLOSED_TAG.add("input");

   try {
    escapeURLsInHTMLPattern = (new Perl5Compiler())
      .compile("(http://[a-zA-Z0-9_/&=?\\.;]*)");
    escapeSpecialHTMLPattern = (new Perl5Compiler()).compile(
      "^http://[a-z0-9]+\\.taobao\\.com.*$",
      Perl5Compiler.CASE_INSENSITIVE_MASK);
   } catch (Exception e) {
    e.printStackTrace();
   }
   try {
    mHasher = MessageDigest.getInstance("MD5");
   } catch (NoSuchAlgorithmException nex) {
    mHasher = null;
    nex.printStackTrace();
   }
}

/**
 * Returns the digest of the given string as a lowercase hexadecimal string.
 * <p>
 * The digest is computed with the shared {@code mHasher} instance (MD5, as
 * set up by the static initializer); access to it is serialized because a
 * {@link MessageDigest} must not be used by two threads at once.
 * <p>
 * NOTE(review): the input is converted with {@code String.getBytes()}, i.e.
 * the platform default charset, so the result for non-ASCII input depends on
 * the JVM's configuration. This is left unchanged because switching the
 * charset would alter the values produced for existing data.
 *
 * @param str the text to hash; must not be {@code null}
 * @return two lowercase hex digits per digest byte
 * @throws IllegalStateException if the MD5 digest could not be obtained when
 *             the class was initialized
 * @throws NullPointerException if {@code str} is {@code null}
 */
public static String hash(String str) {
    MessageDigest hasher = mHasher;
    if (hasher == null) {
        // The static initializer stores null when MD5 is unavailable; fail
        // with a clear message instead of an NPE from synchronized(null).
        throw new IllegalStateException("MD5 MessageDigest is not available");
    }

    byte[] digest;
    synchronized (hasher) {
        digest = hasher.digest(str.getBytes());
    }

    char[] hex = new char[digest.length << 1];
    for (int i = 0, j = 0; i < digest.length; i++) {
        hex[j++] = digits[(0xF0 & digest[i]) >>> 4]; // high nibble
        hex[j++] = digits[0x0F & digest[i]];         // low nibble
    }

    return new String(hex);
}

/**
 * Converts a string into a form suitable for HTML output.
 * <p>
 * Delegates to {@link #escapeEntities(Entities, String, Writer)} with the
 * {@code Entities.HTML40} entity set.
 *
 * @param strInput the text to escape; may be {@code null}
 * @return the escaped text when {@code escapeEntities} reports a change,
 *         otherwise {@code strInput} itself; {@code ""} when
 *         {@code strInput} is {@code null}
 */
public static String escapeHTML(String strInput) {
    if (strInput == null) {
        return "";
    }

    StringWriter buffer = new StringWriter(strInput.length());
    boolean changed;
    try {
        changed = escapeEntities(Entities.HTML40, strInput, buffer);
    } catch (IOException e) {
        // A StringWriter does not actually raise this.
        return "";
    }

    return changed ? buffer.toString() : strInput;
}

/**
 * Writes {@code str} to {@code out}, replacing some characters on the way:
 * <ul>
 * <li>a character for which {@code entities} knows a name is written as
 * {@code &name;};</li>
 * <li>a line feed ({@code '\n'}) is written as {@code <br/>};</li>
 * <li>a carriage return ({@code '\r'}) is dropped;</li>
 * <li>every other character is copied unchanged.</li>
 * </ul>
 *
 * @param entities the entity set used to look up names; must not be
 *            {@code null}
 * @param str the text to escape; when {@code null} nothing is written
 * @param out the destination writer; must not be {@code null}
 *
 * @return {@code true} if at least one character was replaced or dropped,
 *         {@code false} if the text written equals {@code str} (or
 *         {@code str} is {@code null})
 *
 * @throws IllegalArgumentException if {@code entities} or {@code out} is
 *             {@code null}
 * @throws IOException if writing to {@code out} fails
 */
protected static boolean escapeEntities(Entities entities, String str,
        Writer out) throws IOException {
    if (entities == null) {
        throw new IllegalArgumentException("The Entities must not be null");
    }

    if (out == null) {
        throw new IllegalArgumentException("The Writer must not be null");
    }

    if (str == null) {
        return false;
    }

    boolean changed = false;

    for (int i = 0; i < str.length(); ++i) {
        char ch = str.charAt(i);
        String entityName = entities.getEntityName(ch);

        if (entityName != null) {
            out.write('&');
            out.write(entityName);
            out.write(';');
            changed = true;
        } else if (ch == '\n') {
            out.write("<br/>");
            changed = true;
        } else if (ch == '\r') {
            // Dropped on purpose; the accompanying '\n' yields the <br/>.
            changed = true;
        } else {
            // Copied verbatim: this must NOT mark the text as changed,
            // otherwise every non-empty input would report true.
            out.write(ch);
        }
    }

    return changed;
}

/**
 * Compares two strings, treating all blank values as equal to each other.
 * <p>
 * If both arguments are blank according to {@code StringUtil.isBlank} they
 * are considered equal (so {@code null} equals {@code ""}); otherwise the
 * result is that of {@code StringUtil.equals}.
 *
 * @param str1 first string, may be {@code null}
 * @param str2 second string, may be {@code null}
 *
 * @return {@code true} if both are blank or {@code StringUtil.equals}
 *         reports them equal
 */
public static boolean equals(String str1, String str2) {
    boolean bothBlank = StringUtil.isBlank(str1) && StringUtil.isBlank(str2);
    return bothBlank || StringUtil.equals(str1, str2);
}

/**
 * Removes special HTML markup and completes incomplete HTML.
 * <p>
 * Shorthand for {@link #escapeSpecialHTML(String, boolean)} with the
 * {@code check} flag set to {@code true}.
 *
 * @param str the HTML before conversion
 *
 * @return the HTML after conversion
 */
public static String escapeSpecialHTML(String str) {
    final boolean check = true;
    return escapeSpecialHTML(str, check);
}

/**
 * Removes HTML markup from a string.
 * <p>
 * The input is parsed as an HTML fragment through an {@code ElementRemover}
 * filter on which no element is accepted and {@code script}, {@code style},
 * {@code head} and {@code select} are registered for removal. Per the
 * original author's note, removed elements disappear together with their
 * children, while every other tag is dropped but its content kept. The
 * parsed fragment is then passed to {@code getHTML(fragment, false)}.
 * <p>
 * If anything in that process throws an {@link Exception}, the method falls
 * back to {@link #escapeHTML(String)} on the raw input.
 *
 * @param str the HTML text; may be {@code null}
 * @return the processed text, or {@code ""} when {@code str} is blank
 */
public static String stripHTML(String str) {
    if (StringUtil.isBlank(str)) {
        return "";
    }

    try {
        ElementRemover tagFilter = new ElementRemover();
        String[] removedWithContent = { "script", "style", "head", "select" };
        for (int i = 0; i < removedWithContent.length; i++) {
            tagFilter.removeElement(removedWithContent[i]);
        }

        DOMFragmentParser fragmentParser = new DOMFragmentParser();
        fragmentParser.setProperty(
                "http://cyberneko.org/html/properties/filters",
                new XMLDocumentFilter[] { tagFilter });

        HTMLDocument ownerDocument = new HTMLDocumentImpl();
        DocumentFragment parsed = ownerDocument.createDocumentFragment();

        InputSource input = new InputSource(new StringReader(str));
        input.setEncoding("GBK");
        fragmentParser.parse(input, parsed);

        return getHTML(parsed, false).toString();
    } catch (Exception ignored) {
        // Best effort: any failure falls through to plain escaping below.
    }

    return escapeHTML(str);
}

/**
 * Removes special HTML markup and completes incomplete HTML.
 * <p>
 * The input is parsed as an HTML fragment through an {@code ElementRemover}
 * filter configured with a whitelist of tags, each paired with the
 * attributes registered for it; {@code script}, {@code style}, {@code head}
 * and {@code select} are registered for removal. Per the original author's
 * note, accepted elements are kept, removed elements disappear together
 * with their children, and any other tag is dropped while its content is
 * kept. The parsed fragment is then passed to
 * {@code getHTML(fragment, check)}.
 * <p>
 * If anything in that process throws an {@link Exception}, the method falls
 * back to {@link #escapeHTML(String)} on the raw input.
 *
 * @param str the HTML before conversion; may be {@code null}
 * @param check forwarded to {@code getHTML}. Per the original author's
 *            note: {@code true} verifies that tables and links are legal,
 *            {@code false} skips that check; the system used {@code true}
 *            before storing an item in the database and {@code false} when
 *            displaying it.
 *
 * @return the HTML after conversion, or {@code ""} when {@code str} is blank
 */
public static String escapeSpecialHTML(String str, boolean check) {
    if (StringUtil.isBlank(str)) {
        return "";
    }

    try {
        ElementRemover whitelist = new ElementRemover();

        // Tags that all share the common attribute whitelist.
        String[] commonTags = { "b", "i", "u", "br", "hr", "sup", "sub",
                "strong", "em", "strike", "ol", "li", "ul", "h1", "h3",
                "h2", "h4", "h5", "span", "p", "caption", "blockquote",
                "cite" };
        for (int i = 0; i < commonTags.length; i++) {
            whitelist.acceptElement(commonTags[i], commonAttribute);
        }

        // Tags with their own attribute whitelist.
        whitelist.acceptElement("div", divAttribute);
        whitelist.acceptElement("a", aAttribute);
        whitelist.acceptElement("img", imgAttribute);
        whitelist.acceptElement("font", fontAttribute);
        whitelist.acceptElement("table", tableAttribute);
        whitelist.acceptElement("tr", tdAttribute);
        whitelist.acceptElement("td", tdAttribute);
        whitelist.acceptElement("bgsound", bgsoundAttribute);
        whitelist.acceptElement("map", mapAttribute);
        whitelist.acceptElement("area", areaAttribute);
        whitelist.acceptElement("marquee", marqueeAttribute);

        // Elements discarded together with everything inside them.
        String[] removedWithContent = { "script", "style", "head", "select" };
        for (int i = 0; i < removedWithContent.length; i++) {
            whitelist.removeElement(removedWithContent[i]);
        }

        DOMFragmentParser fragmentParser = new DOMFragmentParser();
        fragmentParser.setProperty(
                "http://cyberneko.org/html/properties/default-encoding",
                "GBK");
        fragmentParser.setProperty(
                "http://cyberneko.org/html/properties/filters",
                new XMLDocumentFilter[] { whitelist });

        HTMLDocument ownerDocument = new HTMLDocumentImpl();
        DocumentFragment parsed = ownerDocument.createDocumentFragment();

        InputSource input = new InputSource(new StringReader(str));
        input.setEncoding("GBK");
        fragmentParser.parse(input, parsed);

        return getHTML(parsed, check).toString();
    } catch (Exception ignored) {
        // Best effort: any failure falls through to plain escaping below.
    }

    return escapeHTML(str);
}

分享到：

HTML过滤和补齐（二） | 关键字过滤

2010-01-05 23:21
浏览 2228
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论