package test;
import java.io.*;
import org.htmlparser.Parser;
import org.htmlparser.visitors.HtmlPage;
/**
 * Demo program: fetches a web page with HTMLParser, then prints the page
 * title followed by the body text with runs of whitespace collapsed.
 */
class Test {
    /** Address of the page that is downloaded and parsed. */
    private static final String PAGE_URL = "http://www.hao123.com";

    /**
     * Matches one or more consecutive "noise" tokens: a whitespace character
     * or the literal entity text "&nbsp;" / "&gt;".
     *
     * The previous pattern wrapped "( |gt)" inside a bracket expression, where
     * '(', '|', 'g', 't' and ')' are plain set members rather than a group, so
     * every 'g' and 't' in the page text was replaced by a space.
     *
     * NOTE(review): the entity alternatives are the presumed intent of the
     * original "( |gt)" fragment - confirm against the expected output.
     */
    private static final String NOISE_REGEX = "(?:[ \\t\\n\\r\\f]|&nbsp;|&gt;)+";

    /**
     * Downloads {@link #PAGE_URL}, prints its title, then its cleaned-up body text.
     * Any failure while fetching or parsing is reported on standard error and
     * the program then ends normally.
     *
     * @param argv command-line arguments (unused)
     * @throws IOException          declared for interface compatibility
     * @throws InterruptedException declared for interface compatibility
     */
    public static void main(String[] argv) throws IOException, InterruptedException {
        try {
            Parser parser = new Parser(PAGE_URL);
            parser.setEncoding("UTF-8");

            // The visitor collects the title and body while the parser walks the page.
            HtmlPage htmlpage = new HtmlPage(parser);
            parser.visitAllNodesWith(htmlpage);

            // Body content extracted by HTMLParser, with noise collapsed.
            String body = collapseNoise(htmlpage.getBody().asString());
            // Title content extracted by HTMLParser.
            String title = htmlpage.getTitle();

            System.out.println(title);
            System.out.println(body);
        } catch (Exception e) {
            // Demo-level boundary: report the failure instead of crashing.
            e.printStackTrace();
        }
    }

    /** Replaces every run of whitespace / leftover entity text with a single space. */
    private static String collapseNoise(String text) {
        return text.replaceAll(NOISE_REGEX, " ");
    }
}
// Fetch the page source: build a parser for `address` (defined outside this
// fragment), read it as GBK, and print the parse result serialized back to HTML.
Parser parser = new Parser(address);
parser.setEncoding("gbk");
// NOTE(review): parse() is given a null filter; presumably that means
// "keep every node" - confirm against the HTMLParser documentation.
System.out.println(parser.parse(null).toHtml());
分享到:
评论