HtmlUnit解析html会丢掉不可见的Element

fuliang

浏览: 1664224 次
性别:
来自: 北京

最近访客更多访客>>

依然任逍遥

stephenworld

lli

samwalt

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Data/Web Mining

HTML

最近使用HtmlUnit作为后端来抽取数据，在用HtmlUnit解析DOM、使用XPath定位结点的过程中，发现它会丢掉不可见的Element。

不知道是故意这么做，还是个bug。

于是重写了前端获取XPath的代码，让它也忽略不可见的Element，与HtmlUnit保持一致，这样就可以解决我们的问题：

/**
 * Builds an XPath expression that locates `element`.
 *
 * An element with a non-empty id is addressed directly as `//*[@id=...]`.
 * Otherwise an absolute path is produced with one step per ancestor element.
 * A step receives a `[n]` predicate only when at least one preceding sibling
 * has the same localName AND isVisible() reports it as visible; hidden
 * siblings are skipped when counting positions.
 *
 * @param {Element} element - node to locate; null/undefined is tolerated.
 * @returns {?string} the XPath, or null when `element` is not an element node.
 */
function getXPath(element){
	// Renders `value` as an XPath 1.0 string literal. XPath 1.0 literals have
	// no escape sequences, so a value containing both quote characters has to
	// be assembled from pieces with concat().
	function quoteXPathString(value){
		var text = String(value);
		if (text.indexOf('"') === -1)
			return '"' + text + '"';
		if (text.indexOf("'") === -1)
			return "'" + text + "'";
		return 'concat("' + text.split('"').join('", \'"\', "') + '")';
	}

	if (element && element.id)
		return '//*[@id=' + quoteXPathString(element.id) + ']';

	var steps = [];

	// Walk upwards until we leave the element nodes (nodeType 1).
	for (; element && element.nodeType === 1; element = element.parentNode){
		var precedingVisible = 0;
		for (var sibling = element.previousSibling; sibling; sibling = sibling.previousSibling){
			// The name comparison runs first, so isVisible() is only ever
			// called for siblings that share this element's localName.
			if (sibling.localName === element.localName && isVisible(sibling))
				++precedingVisible;
		}
		var tagName = element.localName.toLowerCase();
		// The first visible match needs no predicate; XPath positions are 1-based.
		var predicate = precedingVisible ? "[" + (precedingVisible + 1) + "]" : "";
		steps.unshift(tagName + predicate);
	}
	return steps.length ? "/" + steps.join("/") : null;
}

判断是否可见：

/**
 * Reports whether `element` is rendered as visible according to its computed
 * style: it is treated as hidden when the computed `visibility` is "hidden"
 * or the computed `display` is "none".
 *
 * Only the element's own computed values are inspected.
 *
 * @param {Element} element - element whose computed style is examined.
 * @returns {boolean} false when hidden by either property, true otherwise.
 */
function isVisible(element){
	// Reach the CSS view of the element's window through the XPCOM interfaces.
	var view = XPCOMUtils.QI(element.ownerDocument, _CI.nsIDOMDocumentView).defaultView;
	var style = XPCOMUtils.QI(view, _CI.nsIDOMViewCSS).getComputedStyle(element, "");

	// Reads one computed property as a plain string.
	function read(propertyName){
		return style.getPropertyCSSValue(propertyName).getStringValue();
	}

	var visibility = read("visibility");
	var display = read("display");
	return !(visibility == "hidden" || display == "none");
}

其中XPCOMUtils是XPCOM的QueryInterface, createInstance,getService的一个代码简化的封装：

// Short aliases for the XPCOM registries exposed on the global Components object.
var _CC = Components.classes;    // component classes, looked up by contract ID
var _CI = Components.interfaces; // interfaces, looked up by name

/**
 * Namespace object for thin XPCOM convenience wrappers. The function body is
 * intentionally empty; the helpers below are attached to the function object
 * itself, so they are called as XPCOMUtils.CCSV(...), XPCOMUtils.CCIN(...)
 * and XPCOMUtils.QI(...).
 */
function XPCOMUtils() {}

(function() {

/**
 * Fetches a service from the class registered under `cName`.
 *
 * Best-effort: when no class is registered under `cName`, the user is alerted
 * and undefined is returned instead of throwing.
 *
 * @param {string} cName - key into _CC.
 * @param {string} ifaceName - key into _CI.
 * @returns {*} result of getService(_CI[ifaceName]), or undefined.
 */
this.CCSV = function(cName, ifaceName)
{
	var componentClass = _CC[cName];
	if (!componentClass) {
		alert("Can't get the components class name: " + cName);
		return undefined;
	}
	return componentClass.getService(_CI[ifaceName]);
};

/**
 * Creates a new instance of the class registered under `cName`.
 *
 * @param {string} cName - key into _CC.
 * @param {string} ifaceName - key into _CI.
 * @returns {*} result of createInstance(_CI[ifaceName]).
 * @throws {TypeError} when no class is registered under `cName`. The message
 *     names the missing class, matching the wording used by CCSV.
 */
this.CCIN = function(cName, ifaceName)
{
	var componentClass = _CC[cName];
	if (!componentClass)
		throw new TypeError("Can't get the components class name: " + cName);
	return componentClass.createInstance(_CI[ifaceName]);
};

/**
 * Shorthand for obj.QueryInterface(iface).
 *
 * @param {*} obj - object exposing a QueryInterface method.
 * @param {*} iface - interface passed through to QueryInterface.
 * @returns {*} whatever QueryInterface returns.
 */
this.QI = function(obj, iface)
{
	return obj.QueryInterface(iface);
};

}).apply(XPCOMUtils);

分享到：

runtime.exec()执行进程block死锁问题 | XStream序列化对象，java.util.Map自定义 ...

2010-01-15 21:06
浏览 2923
评论(0)
分类:互联网
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

HtmlUnit解析html会丢掉不可见的Element

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

HtmlUnit解析html会丢掉不可见的Element

评论

发表评论

相关推荐

推荐系统note

[zz]推荐系统-从入门到精通

[ZZ]计算机视觉、模式识别、机器学习常用牛人主页链接

计算广告学

期望最大（EM）算法推导

Large-Scale Support Vector Machines: Algorithms and Theory

[zz]数据挖掘邻域的5篇经典文章

大规模数据挖掘-第三章 学习笔记二

大规模数据挖掘-第三章 学习笔记一

信息抽取思考笔记

基于模式发现的信息抽取(1)

分享一本文本挖掘的书

《Web Data Mining Exploring Hyperlinks, Contents, and Usage Data》列入读书单中

机器学习的开放源代码项目mahout

网页分析/挖掘中常用数据结构和算法

一个很好的Machine Learning的开源工具网站

基于firefox浏览器的Deep Web Navigation总结

一份夭折了的Information Extraction的总体设计

Programming Collective Intelligence读书笔记三 推荐系统（续）

今天听了fanwei博士的Data Mining的讲座

最近访客更多访客>>

大规模数据挖掘-第三章学习笔记二

大规模数据挖掘-第三章学习笔记一

Programming Collective Intelligence读书笔记三推荐系统（续）