html to txt

medius

浏览: 3949 次
性别:
来自: 北京

最近访客 更多访客>>

cymlancy

woodding2008

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based HTML-to-plain-text converter.
 *
 * <p>This is a lightweight tag stripper, not an HTML parser: it removes script and
 * style elements, strips remaining tags, drops character references and collapses
 * whitespace. All patterns are compiled once and shared between calls.
 */
public class HtmltoText {

    /** A complete script element (opening tag, body, closing tag), whitespace-tolerant. */
    private static final Pattern SCRIPT_BLOCK = Pattern.compile(
            "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** A complete style element (opening tag, body, closing tag), whitespace-tolerant. */
    private static final Pattern STYLE_BLOCK = Pattern.compile(
            "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Any remaining tag. */
    private static final Pattern TAG = Pattern.compile("<[^>]+>");

    /**
     * A well-formed character reference: named (&amp;nbsp;), decimal (&amp;#160;) or
     * hexadecimal (&amp;#xA0;). The body is restricted on purpose: matching "anything
     * up to the next semicolon" would delete ordinary text such as "Tom & Jerry; ...".
     */
    private static final Pattern ENTITY = Pattern.compile(
            "&(?:#\\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);",
            Pattern.CASE_INSENSITIVE);

    /** Any run of whitespace (spaces, tabs, line breaks, carriage returns, ...). */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** File read by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_INPUT_FILE = "f:\\UPGRADING-2.0.html";

    /**
     * Converts an HTML string to plain text.
     *
     * <p>Steps, in order: remove script elements, remove style elements, remove all
     * other tags, remove character references (they are dropped, not decoded), then
     * replace every run of whitespace with a single space. The result is not trimmed,
     * so it may start or end with one space.
     *
     * @param inputString the string containing HTML markup; may be {@code null}
     * @return the text with markup removed, or an empty string when the input is
     *         {@code null}
     */
    public static String HtmlToText(String inputString) {
        if (inputString == null) {
            return "";
        }

        String text = inputString;
        // Script and style bodies must go before the generic tag pass; otherwise only
        // their tags would be stripped and the code/CSS would remain as text.
        text = SCRIPT_BLOCK.matcher(text).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        text = TAG.matcher(text).replaceAll("");
        text = ENTITY.matcher(text).replaceAll("");
        // A single pass over all whitespace, so mixed runs such as " \n\t " collapse
        // to one space instead of leaving one space per whitespace kind.
        text = WHITESPACE.matcher(text).replaceAll(" ");
        return text;
    }

    /**
     * Reads an HTML file, converts it with {@link #HtmlToText(String)} and prints the
     * elapsed time in milliseconds (file reading included).
     *
     * <p>The file is decoded with the platform default charset.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to read.
     *             When absent, the built-in default path is used.
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_FILE;

        BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(path)));
        long startMillis = System.currentTimeMillis();
        StringBuilder html = new StringBuilder();
        try {
            String line = reader.readLine();
            while (line != null) {
                html.append(line).append('\n');
                line = reader.readLine();
            }
        } finally {
            // Closing the outermost reader also closes the underlying file stream.
            reader.close();
        }

        HtmlToText(html.toString());
        long endMillis = System.currentTimeMillis();
        System.out.println(endMillis - startMillis);
    }
}

分享到：

jquery插件写法

2013-03-28 15:45
浏览 816
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

html to txt

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

html to txt

评论

发表评论

相关推荐

最近访客 更多访客>>