浏览 4592 次
锁定老帖子 主题:HTML工具类
精华帖 (0) :: 良好帖 (1) :: 新手帖 (1) :: 隐藏帖 (0)
|
|
---|---|
作者 | 正文 |
发表时间:2011-05-11
package ssh.util; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * HTML工具 * @author gary * */ public class HTMLUtil { //> public static final String GT = ">"; //< public static final String LT = "<"; //" public static final String QUOT = """; //& public static final String AMP = "&"; //空格 public static final String SPACE = " "; //© public static final String COPYRIGHT = "©"; //® public static final String REG = "®"; //™ public static final String TM = "™"; //¥ public static final String RMB = "¥"; /** * 删除script标签 * @param str * @return */ public static String delScriptTag(String str){ String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; Pattern p_script = Pattern.compile(regEx_script,Pattern.CASE_INSENSITIVE); Matcher m_script = p_script.matcher(str); str = m_script.replaceAll(""); return str.trim(); } /** * 删除style标签 * @param str * @return */ public static String delStyleTag(String str){ String regEx_style="<style[^>]*?>[\\s\\S]*?<\\/style>"; Pattern p_style = Pattern.compile(regEx_style,Pattern.CASE_INSENSITIVE); Matcher m_style = p_style.matcher(str); str = m_style.replaceAll(""); return str; } /** * 删除HTML标签 * @param str * @return */ public static String delHTMLTag(String str){ String regEx_html = "<[^>]+>"; Pattern p_html = Pattern.compile(regEx_html,Pattern.CASE_INSENSITIVE); Matcher m_html = p_html.matcher(str); str = m_html.replaceAll(""); return str; } /** * 删除所有标签 * @param str * @return */ public static String delAllTag(String str){ //删script str = delScriptTag(str); //删style str = delStyleTag(str); //删HTML str = delHTMLTag(str); return str; } /** * 清除标签,恢复HTML转义字符 * @param str * @return */ public static String clean(String str){ str = delAllTag(str); str = str.replaceAll(SPACE, " "); str = str.replaceAll(GT, ">"); str = str.replaceAll(LT, "<"); str = str.replaceAll(QUOT, "\""); str = str.replaceAll(AMP, "&"); str = str.replaceAll(COPYRIGHT, "©"); str = str.replaceAll(REG,"®"); str = str.replaceAll(TM,"™"); str = 
str.replaceAll(RMB,"¥"); return str; } /** * 过滤指定标签 * @param str * @param tag * 指定标签 * @return String */ public static String fiterHtmlTag(String str, String tag) { String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>"; Pattern pattern = Pattern.compile(regxp); Matcher matcher = pattern.matcher(str); StringBuffer sb = new StringBuffer(); boolean result1 = matcher.find(); while (result1) { matcher.appendReplacement(sb, ""); result1 = matcher.find(); } matcher.appendTail(sb); return sb.toString(); } /** * 替换指定的标签 * @param str * @param beforeTag * 要替换的标签 * @param tagAttrib * 要替换的标签属性值 * @param startTag * 新标签开始标记 * @param endTag * 新标签结束标记 * @return String * example: 替换img标签的src属性值为[img]属性值[/img] */ public static String replaceHtmlTag(String str, String beforeTag, String tagAttrib, String startTag, String endTag) { String regxpForTag = "<\\s*" + beforeTag + "\\s+([^>]*)\\s*>"; String regxpForTagAttrib = tagAttrib + "=\"([^\"]+)\""; Pattern patternForTag = Pattern.compile(regxpForTag); Pattern patternForAttrib = Pattern.compile(regxpForTagAttrib); Matcher matcherForTag = patternForTag.matcher(str); StringBuffer sb = new StringBuffer(); boolean result = matcherForTag.find(); while (result) { StringBuffer sbreplace = new StringBuffer(); Matcher matcherForAttrib = patternForAttrib.matcher(matcherForTag .group(1)); if (matcherForAttrib.find()) { matcherForAttrib.appendReplacement(sbreplace, startTag + matcherForAttrib.group(1) + endTag); } matcherForTag.appendReplacement(sb, sbreplace.toString()); result = matcherForTag.find(); } matcherForTag.appendTail(sb); return sb.toString(); } public static void main(String[] args) { System.out.println(clean(URLUtil.url2Str("http://www.baidu.com"))); } } 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。
推荐链接
|
|
返回顶楼 | |
发表时间:2011-05-13
Pattern 是线程安全的(实例不可变),所以可以考虑把各个 Pattern 提升为静态常量(static final)。
|
|
返回顶楼 | |