/**
* Parses an HTML page and wraps its title and body text in a searchhtmlpage.
*
* The title and body both start out as empty strings and are only overwritten
* inside the try block, so when the parser throws, the result carries whatever
* had been assigned up to that point (empty strings if nothing was).
*
* @param resource file path or URL of the page to parse
* @return a searchhtmlpage built from the extracted title and combined body text
*/
public static searchhtmlpage parsehtmlpage(string resource)
{
string pagetitle = "";
string pagebody = "";
try
{
parser pageparser = new parser(resource);
// The encoding is hard-coded; adjust it to match the pages actually being parsed.
pageparser.setencoding("gbk");
htmlpage pagevisitor = new htmlpage(pageparser);
pageparser.visitallnodeswith(pagevisitor);
// Title first, then body: a failure while combining the body keeps the title.
pagetitle = pagevisitor.gettitle();
pagebody = combinenodetext(pagevisitor.getbody().tonodearray());
}
catch (parserexception e)
{
// NOTE(review): the exception itself is not handed to the logger here,
// so the cause of the failure is not recorded - confirm whether that is intended.
logman.error("parse html page " + resource + " error!");
}
return new searchhtmlpage(pagetitle, pagebody);
}
/**
* Parses a piece of HTML content and returns the text of its text nodes and links.
*
* Only text nodes and link tags are selected from the parsed content; meta tags
* are not handled for now. The content is parsed with the "gbk" charset.
*
* @param content the HTML content to parse
* @return the combined text of the selected nodes, or an empty string when
*         parsing fails or yields no node list
*/
public static string parsehtmlcontent(string content)
{
parser contentparser = parser.createparser(content, "gbk");
// Accept a node when it is either a text node or a link tag.
nodefilter[] accepted = new nodefilter[] {
new nodeclassfilter(textnode.class),
new nodeclassfilter(linktag.class)
};
orfilter textorlink = new orfilter();
textorlink.setpredicates(accepted);
nodelist matched;
try
{
matched = contentparser.parse(textorlink);
}
catch (parserexception e)
{
// Parsing failed: log it and bail out with an empty result.
logman.warn("parse content error", e);
return "";
}
if (matched == null)
{
return "";
}
return combinenodetext(matched.tonodearray());
}
// Combines the useful content of the given nodes into a single string.
// Text nodes contribute their text and link tags contribute their link; any
// other node type contributes nothing. Each kept piece is preceded by one
// space, so a non-empty result starts with a space.
private static string combinenodetext(node[] nodes)
{
stringbuffer joined = new stringbuffer();
for (node current : nodes)
{
string piece = "";
if (current instanceof textnode)
{
// Uses the node's text as-is (gettext), not a trimmed plain-text form.
piece = ((textnode) current).gettext();
}
else if (current instanceof linktag)
{
piece = ((linktag) current).getlink();
// Meant to filter out JSP tags.
// NOTE(review): both the search and the replacement arguments are empty
// strings here, so no JSP pattern is visible - it may have been lost; confirm.
piece = stringfunc.replace(piece, "", "");
}
// Pieces that stringfunc.istrimempty reports as empty are skipped.
if (!stringfunc.istrimempty(piece))
{
joined.append(" ").append(piece);
}
}
return joined.tostring();
}
|