使用Htmlparser解析网页的一种方法（除去中文乱码）

zhouwendong006

浏览: 88618 次
性别:
来自: 河北

最近访客更多访客>>

lch1985110

fly_chao

suncong1024

hl_wu

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

htmlparser网页解析

IBM 网络协议 .net SUN F#

import java.lang.reflect.InvocationTargetException;

import java.lang.reflect.Method;

import java.net.HttpURLConnection;

import java.net.URL;

import org.htmlparser.Node;

import org.htmlparser.PrototypicalNodeFactory;

import org.htmlparser.Tag;

import org.htmlparser.lexer.Lexer;

import org.htmlparser.lexer.Page;

import org.htmlparser.nodes.TagNode;

import org.htmlparser.nodes.TextNode;

import org.htmlparser.tags.ScriptTag;

import org.htmlparser.tags.StyleTag;

import org.htmlparser.util.ParserException;

import com.lietu.tag.CnTagMaker.WordWeight;

public class TagExt {

	private static StringBuffer body;

	private static String title;

	public static void main(String[] args) throws Exception {

		String path;

		if (0 >= args.length) {

			path =

			/** 以下是测试地址 * */

			// "http://www.ibm.com/developerworks/cn/webservices/0901_haoxf_humantask/";
			// "http://developers.sun.com.cn/Java/xref_index.html";
			"http://mil.news.sina.com.cn/2009-01-09/0839538126.html";

			// "http://www.sina.com.cn/";

			// "http://www.ibm.com/";

			// "http://hao861002.iteye.com/blog/301581";

		} else {

			path = args[0];

		}

		/**
		 * 
		 * 构造 URL ，并打开网络链接。
		 * 
		 */

		URL url = new URL(path);

		HttpURLConnection httpUrl = (HttpURLConnection) url.openConnection();

		/** 对该网页进行解析 * */

		parseHTML(httpUrl);

		/** 打印解析后的内容 * */

		System.out.println("body=" + body);

	}

	/**
	 * 
	 * 解析网页内容
	 * 
	 * @param uc
	 *            传入一个 HttpURLConnection 链接对象
	 * 
	 * @throws ParserException
	 * 
	 */

	public static void parseHTML(HttpURLConnection uc) throws ParserException { 

       /** 声明节点 * */ 

       Node node; 

  

       String stringText; 

       body = new StringBuffer(); 

 

       /***********************************************************************
		 * 从 head 头获取网页编码格式。该方式取决于服务器是否设置 charSet 值，如果没有，该
		 * 
		 * 方式将无法获取 charSet 值
		 **********************************************************************/ 

       String contentType = uc.getContentType();   

       String charSet = getCharset (contentType); 

       Lexer lexer = null ; 

  

       if (charSet == null ) { 

           charSet = "UTF-8" ; 

       } 

  

       try { 

           lexer = new Lexer( new Page(uc.getInputStream(), charSet)); 

       } catch (Exception e) { 

           e.printStackTrace(); 

           return ; 

       } 

       

       /** 对网页内容进行解析 * */ 

       lexer.setNodeFactory( new PrototypicalNodeFactory()); 

       

       /** 设置开关，决定网页是否重新解析 * */ 

       boolean tryAgain = false ; 

       

       while ( null != (node = lexer.nextNode())) { 

           /** 以下是判断节点的类型，并作相应的处理 * */ 

           if (node instanceof ScriptTag) { 

              while ( null != (node = lexer.nextNode())) { 

                  if (node instanceof Tag) { 

                     Tag tag = (Tag) node; 

                    if (tag.isEndTag() && "SCRIPT" .equals(tag.getTagName())) { 

                         break ; 

                     } 

                  } 

              } 

              if ( null == node) 

                  break ; 

           } else if (node instanceof StyleTag) { 

              while ( null != (node = lexer.nextNode())) { 

                  if (node instanceof Tag) { 

                     Tag tag = (Tag) node; 

                     if (tag.isEndTag()) 

                         break ; 

                  } 

              } 

              if ( null == node) 

                  break ; 

           } else if (node instanceof TextNode) { 

              stringText = node.toPlainTextString(); 

              if ( "" .equals( title )) 

                  continue

else if (node instanceof TextNode) { 

              stringText = node.toPlainTextString(); 

              if ( "" .equals( title )) 

                  continue ; 

              stringText = stringText.replaceAll( "[ \t\n\f\r 　 ]+" , " " ); 

              stringText = TextHtml.html2text (stringText.trim()); 

  

              if (! "" .equals(stringText)) { 

                  body .append(stringText); 

                  body .append( " " ); 

              } 

           } else if (node instanceof TagNode) { 

              TagNode tagNode = (TagNode) node; 

              String name = ((TagNode) node).getTagName(); 

              if (name.equals( "TITLE" ) && !tagNode.isEndTag()) { 

                  node = lexer.nextNode(); 

                  stringText = node.toPlainTextString().trim(); 

                  if (! "" .equals(stringText)) { 

                     title = stringText; 

                  } 

              } else if (name.equals( "META" )) { 

                  String contentCharSet = tagNode.getAttribute( "CONTENT" ); 

                  // System.out.println("contentCharset="+contentCharSet);

                  int b = contentCharSet.toLowerCase().indexOf( "charset" ); 

                  if (b > -1) { 

                     String newCharSet = getCharset (contentCharSet); 

                     // System.out.println("newCharSet=" + newCharSet);

                     if (!charSet.equals(newCharSet)) { 

                         tryAgain = true ; 

                         charSet = newCharSet; 

                         // System.out.println("charSet=" + charSet);

                         // System.out.println("newCharSet=" + newCharSet);

                         break ; 

                     } 

                  } 

              } 

           } 

       } 

  

       /***********************************************************************
		 * 如果在 Meta 信息中检测到新的字符编码，则需要按照 meta 信息中的编码再次解析网页 。
		 **********************************************************************/ 

       if (tryAgain) { 

           body = new StringBuffer(); 

  

           try { 

                uc = (HttpURLConnection) uc.getURL().openConnection(); 

              lexer = new Lexer( new Page(uc.getInputStream(), charSet)); 

           } catch (Exception e) { 

              e.printStackTrace(); 

           } 

  

           lexer.setNodeFactory( new PrototypicalNodeFactory()); 

  

           while ( null != (node = lexer.nextNode())) { 

              if (node instanceof TextNode) { 

                  stringText = node.toPlainTextString(); 

                  if ( "" .equals( title )) 

                     continue ; 

                  stringText = stringText.replaceAll( "[ \t\n\f\r 　 ]+" , " " ); 

                  stringText = TextHtml.html2text (stringText.trim()); 

                  if (! "" .equals(stringText)) { 

                     body .append(stringText); 

                     body .append( " " ); 

                  } 

              } 

           } 

       } 

    }

	/**
	 * 
	 * 找出最终的网页编码
	 * 
	 * @param name
	 *            经过 getCharset 方法处理后 meta 标签的值
	 * 
	 * @param _default
	 *            默认的编码集
	 * 
	 * @return
	 * 
	 */

	public static String findCharset(String name, String _default) {

		String ret;

		try {

			Class<java.nio.charset.Charset> cls;

			Method method;

			Object object;

			cls = java.nio.charset.Charset.class;

			method = cls.getMethod("forName", new Class[] { String.class });

			object = method.invoke(null, new Object[] { name });

			method = cls.getMethod("name", new Class[] {});

			object = method.invoke(object, new Object[] {});

			ret = (String) object;

		} catch (NoSuchMethodException nsme) {

			ret = name;

		} catch (IllegalAccessException ia) {

			ret = name;

		} catch (InvocationTargetException ita) {

			ret = _default;

			System.out

			.println("unable to determine cannonical charset name for "

			+ name + " - using " + _default);

		}

		return (ret);

	}

	/**
	 * 
	 * 处理 meta 中的内容，并调用 findCharset() 方法获取编码值
	 * 
	 * @param content
	 *            Meta 中的内容
	 * 
	 * @return
	 * 
	 */

	public static String getCharset(String content) {

		final String CHARSET_STRING = "charset";

		int index;

		String ret;

		ret = null;

		if (null != content) {

			index = content.indexOf(CHARSET_STRING);

			if (index != -1) {

				content = content.substring(index + CHARSET_STRING.length())

				.trim();

				if (content.startsWith("=")) {

					content = content.substring(1).trim();

					index = content.indexOf(";");

					if (index != -1)

						content = content.substring(0, index);

					if (content.startsWith("\"") && content.endsWith("\"")

					&& (1 < content.length()))

						content = content.substring(1, content.length() - 1);

					if (content.startsWith("'") && content.endsWith("'")

					&& (1 < content.length()))

						content = content.substring(1, content.length() - 1);

					ret = findCharset(content, ret);

				}

			}

		}

		return (ret);

	}

}

说明：
该类使用 Boolean 型 tryAgain 来判断是否进行第二次解析！

首先通过 String contentType = uc.getContentType() 获得 HTTP 响应头中 Content-Type 的值，并使用 String charSet = getCharset(contentType); 从中提取 charSet 值。如果此时 charSet 为空，则先用 charSet=UTF-8（即：默认情况下，charSet 值为 UTF-8）进行解析。使用 org.htmlparser.tags 包中的类对比节点 node，并对合适的节点的内容进行处理；本例中是对类型为 TextNode 的 node 节点获取其中的内容，并将其添加到类型为 StringBuffer 的 body 当中。当解析到 <meta> 标签的时候，会试图从 <meta> 中获取 newCharSet 值，并将 newCharSet 和 charSet 进行比较：如果相同，则继续当前的解析而不进行二次解析，解析完成后由主函数将 body 打印；如果不同，说明默认的 charSet 值并不是网页的编码格式，刚才的解析可能不正确，会出现乱码。这时候，将 newCharSet 中的值赋给 charSet，并将 tryAgain 值设为 true。当程序进行到 if(tryAgain){ …… } 判断时，结果为真，则根据新得到的 charSet 值进行二次解析。这时候，我们就会得到正确的解析结果。

分享到：

中文搜索引擎技术揭密：中文分词 | Jersey初次使用有感

2009-01-20 17:10
浏览 7597
评论(2)
查看更多

2 楼 zhouwendong006 2010-11-20

ww20042005 写道

我试了一下，有点问题，TextHtml是什么jar包下的类？

不好意思，没有发现这个类！

1 楼 ww20042005 2010-11-08

我试了一下，有点问题，TextHtml是什么jar包下的类？

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论