基于文本标签方差分析和链接分析的网页正文、正文图片地址抽取算法 -

cesul

浏览: 31919 次
性别:
来自: 成都

最近访客更多访客>>

javamingming

happylzq

temfulX

lhzzxa

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (19)

社区版块

存档分类

基于文本标签方差分析和链接分析的网页正文、正文图片地址抽取算法

网页正文 网页主题 主题抽取


/**
 * Locates the main-content node of an HTML page and collects the image
 * addresses found inside it.
 *
 * <p>The page is parsed into a DOM tree. Starting from the children of
 * {@code <body>}, {@link #loop(Node)} looks at the text length of every child
 * of a node: when the lengths are evenly spread (low variance relative to the
 * total) the node is a candidate content node; otherwise the walk descends into
 * the child that holds the most non-link text. The winning node is then pruned
 * of link-heavy children and scanned for {@code <img>} elements.
 *
 * <p>Instances are stateful and not thread-safe. {@link #imgSrcList} keeps
 * accumulating across parse calls on the same instance.
 */
public class CopyOfContentExtractor {

	/** A node is only considered when its children hold more text than this in total. */
	private static final int MIN_NODE_TEXT_LENGTH = 20;
	/**
	 * Upper bound on variance/sum for a node to count as a content candidate.
	 * Original author's note: can be set fairly high (loose) because link
	 * analysis filters the result afterwards.
	 */
	private static final int MIN_K = 30;
	/** Maximum link ratio: a child whose link-text/text ratio reaches this is removed. */
	private static final double MAX_LINK_RATE = 0.5;

	/** Largest total text length seen so far among candidate nodes of the current parse. */
	private double maxTextLength = 0;

	/** Content node selected by the current parse, or {@code null} if none was found. */
	private Node targetNode = null;
	public String title = "";
	/** Image addresses collected from the selected content node(s). */
	public List<String> imgSrcList = new ArrayList<String>();
	/** Address of the page being parsed; used as the base for relative image addresses. */
	public String address;

	/**
	 * Parses a local HTML file. Parse and I/O failures are reported on stderr
	 * and otherwise ignored (best effort).
	 *
	 * @param location path of the HTML file on disk
	 * @param address  address the page was fetched from, used to resolve
	 *                 relative image addresses
	 */
	public void parseHTM(String location, String address) {
		this.address = address;
		InputStream in = null;
		try {
			in = new FileInputStream(new File(location));
			parse(in);
		} catch (SAXException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(in);
		}
	}

	/**
	 * Downloads and parses the page at the given address. Parse and I/O
	 * failures are reported on stderr and otherwise ignored (best effort).
	 *
	 * @param address URL of the page to fetch
	 */
	public void parseURL(String address) {
		this.address = address;
		InputStream in = null;
		try {
			URL url = new URL(address);
			in = url.openConnection().getInputStream();
			parse(in);
		} catch (SAXException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(in);
		}
	}

	/** Closes the stream if it was opened; a failure while closing is not actionable here. */
	private static void closeQuietly(InputStream in) {
		if (in != null) {
			try {
				in.close();
			} catch (IOException ignored) {
				// The parse result (if any) is already in memory; nothing to recover.
			}
		}
	}

	/**
	 * Parses the stream into a DOM tree, selects the content node, prunes its
	 * link-heavy children and collects its image addresses.
	 *
	 * @param ips HTML input; the caller is responsible for closing it
	 * @throws SAXException if the parser rejects the input
	 * @throws IOException  if reading the input fails
	 */
	private void parse(InputStream ips) throws SAXException, IOException {

		// Forget the previous document's selection so a reused instance cannot
		// keep a stale node just because it was longer.
		targetNode = null;
		maxTextLength = 0;

		DOMParser parser = new DOMParser();
		parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "utf-8");
		parser.parse(new InputSource(ips));
		Document doc = parser.getDocument();

		Node body = doc.getElementsByTagName("body").item(0);

		if (body != null) {
			NodeList list = body.getChildNodes();
			for (int i = 0; i < list.getLength(); i++) {
				loop(list.item(i)); // recursive descent; may assign targetNode
			}
			if (targetNode != null) {
				// Drop link-dominated children, then collect the remaining images.
				linkNodeFilter(targetNode);
				imgExtractor(targetNode);
			} else {
				System.out.println("extractor is NULL!!! ");
			}
		}
	}

	/**
	 * Examines one node and either records it as the best content candidate so
	 * far or descends towards the child carrying the most non-link text.
	 *
	 * <p>Non-element nodes and {@code STYLE}/{@code SCRIPT} elements are
	 * ignored. A node with exactly one child is skipped in favour of that child.
	 *
	 * @param node node to examine
	 */
	public void loop(Node node) {

		if (node.getNodeType() != Node.ELEMENT_NODE || isIgnoredTag((Element) node)) {
			return;
		}

		NodeList list = node.getChildNodes();
		if (list.getLength() == 1) {
			loop(list.item(0));
			return;
		}

		// Text length of every child, and their total.
		double[] lengthArr = new double[list.getLength()];
		double sum = 0.0;
		for (int i = 0; i < lengthArr.length; i++) {
			lengthArr[i] = textExtractor(list.item(i)).length();
			sum += lengthArr[i];
		}
		if (sum <= MIN_NODE_TEXT_LENGTH) {
			return;
		}

		double mean = sum / lengthArr.length;
		double varianceSum = 0.0;
		for (double d : lengthArr) {
			varianceSum += (d - mean) * (d - mean);
		}
		double variance = varianceSum / lengthArr.length;

		// Variance normalised by the total text length.
		double k = variance / sum;
		if (k < MIN_K) {
			// Text is spread evenly over the children: candidate content node.
			// Keep the candidate with the most text.
			if (sum > maxTextLength) {
				maxTextLength = sum;
				targetNode = node;
			}
		} else {
			// Text is concentrated in few children: discount link text and
			// follow the child with the most remaining text.
			for (int i = 0; i < lengthArr.length; i++) {
				lengthArr[i] = lengthArr[i] - linkAnalysis(list.item(i));
			}
			loop(list.item(getMaxIndex(lengthArr)));
		}
	}

	/** Returns whether the element is a STYLE or SCRIPT element, in any letter case. */
	private static boolean isIgnoredTag(Element elmt) {
		String tag = elmt.getTagName();
		return "STYLE".equalsIgnoreCase(tag) || "SCRIPT".equalsIgnoreCase(tag);
	}

	/**
	 * Recursively extracts the text of a node.
	 *
	 * <p>A text node yields its trimmed value. An element yields the
	 * concatenation of its children's text, each followed by one space, where
	 * any child contributing 5 characters or fewer is dropped.
	 * {@code STYLE}/{@code SCRIPT} elements and all other node types yield the
	 * empty string.
	 *
	 * @param root node to extract text from
	 * @return extracted text, never {@code null}
	 */
	public String textExtractor(Node root) {
		if (root.getNodeType() == Node.TEXT_NODE) {
			return root.getNodeValue().trim();
		}
		if (root.getNodeType() == Node.ELEMENT_NODE) {
			Element elmt = (Element) root;

			if (isIgnoredTag(elmt)) {
				return "";
			}

			NodeList children = elmt.getChildNodes();
			StringBuilder text = new StringBuilder();
			for (int i = 0; i < children.getLength(); i++) {
				String innerText = textExtractor(children.item(i));
				if (innerText.length() > 5) {
					text.append(innerText).append(' ');
				}
			}
			return text.toString();
		}
		return "";
	}

	/**
	 * Recursively collects image addresses below {@code root} into
	 * {@link #imgSrcList}. The {@code real_src} attribute is preferred over
	 * {@code src} when present. Images without any address are skipped.
	 */
	private void imgExtractor(Node root) {

		if (root.getNodeType() != Node.ELEMENT_NODE) {
			return;
		}
		Element elmt = (Element) root;

		if ("IMG".equalsIgnoreCase(elmt.getTagName())) {
			String realSrc = elmt.getAttribute("real_src");
			String src = (realSrc.isEmpty() ? elmt.getAttribute("src") : realSrc).trim();
			if (!src.isEmpty()) {
				imgSrcList.add(resolveImageAddress(src));
			}
		}
		NodeList children = elmt.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {
			imgExtractor(children.item(i));
		}
	}

	/**
	 * Turns an image address into an absolute one. Addresses that already start
	 * with {@code http://} or {@code https://} are returned unchanged; anything
	 * else is resolved against {@link #address}. If that is impossible (no page
	 * address, page address is not a URL, or an unsupported scheme) the address
	 * is returned as found.
	 */
	private String resolveImageAddress(String src) {
		boolean absolute = src.regionMatches(true, 0, "http://", 0, 7)
				|| src.regionMatches(true, 0, "https://", 0, 8);
		if (absolute || address == null) {
			return src;
		}
		try {
			return new URL(new URL(address), src).toString();
		} catch (IOException e) {
			// MalformedURLException: keep the raw value rather than lose the image.
			return src;
		}
	}

	/**
	 * Computes the length of link text below a node without modifying the tree.
	 * Only text nodes whose direct parent is an {@code A} element are counted.
	 *
	 * <p>NOTE(review): text nested deeper inside an anchor (for example
	 * {@code <a><b>text</b></a>}) is not counted because only the direct parent
	 * is checked — confirm whether that is intended.
	 *
	 * @param root node to measure
	 * @return total trimmed length of link text below {@code root}
	 */
	private double linkAnalysis(Node root) {

		if (root.getNodeType() == Node.TEXT_NODE) {
			Node parent = root.getParentNode();
			// Exact tag match: a suffix test would also hit TEXTAREA, AREA, ...
			if (parent != null && "A".equalsIgnoreCase(parent.getNodeName())) {
				return root.getNodeValue().trim().length();
			}
			return 0.0;
		}

		if (root.getNodeType() == Node.ELEMENT_NODE) {

			Element elmt = (Element) root;
			if (isIgnoredTag(elmt)) {
				return 0.0;
			}

			NodeList children = elmt.getChildNodes();
			double total = 0.0;
			for (int i = 0; i < children.getLength(); i++) {
				total += linkAnalysis(children.item(i));
			}
			return total;
		}
		return 0.0;
	}

	/**
	 * Returns the index of the first largest positive value, or 0 when no
	 * value is greater than zero.
	 */
	private int getMaxIndex(double[] input) {
		int index = 0;
		double best = 0.0;
		for (int i = 0; i < input.length; i++) {
			if (best < input[i]) {
				best = input[i];
				index = i;
			}
		}
		return index;
	}

	/**
	 * Prunes the subtree in place: every child with text whose
	 * link-text/text ratio is at least {@link #MAX_LINK_RATE} is removed, and
	 * the remaining children with text are filtered recursively.
	 */
	private void linkNodeFilter(Node node) {
		if (node == null) {
			return;
		}

		// NodeList is live: removing a child while iterating it by index would
		// shift the following sibling into the current slot and skip it.
		NodeList children = node.getChildNodes();
		Node[] snapshot = new Node[children.getLength()];
		for (int i = 0; i < snapshot.length; i++) {
			snapshot[i] = children.item(i);
		}

		for (Node subNode : snapshot) {
			double textLength = textExtractor(subNode).length();
			if (textLength > 0) {
				double linkRate = linkAnalysis(subNode) / textLength;
				if (linkRate < MAX_LINK_RATE) {
					linkNodeFilter(subNode);
				} else {
					node.removeChild(subNode);
				}
			}
		}
	}

	/**
	 * Returns the text of the selected content node, or the literal string
	 * {@code "NULL"} when no node was selected.
	 */
	private String getContent() {
		if (targetNode != null) {
			return textExtractor(targetNode);
		}
		return "NULL";
	}

	/**
	 * Batch driver: extracts the content of every file in a fixed input
	 * directory and writes each result longer than 200 characters to a fixed
	 * output directory, printing a running count.
	 */
	public static void main(String[] argv) throws IOException {

		File[] pages = new File("D:/fxreader/data/2012-09-29-00-13").listFiles();
		if (pages == null) {
			// listFiles() returns null when the path is not a readable directory.
			System.err.println("input directory not found or not readable");
			return;
		}

		int count = 0;
		for (File f : pages) {
			if (!f.isFile()) {
				continue;
			}
			CopyOfContentExtractor extractor = new CopyOfContentExtractor();
			extractor.parseHTM(f.getAbsolutePath(), Options.rules_startsWith);

			String content = extractor.getContent();

			if (content.length() > 200) {
				count++;
				PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(new File("D:/fxreader/content/2012-09-29-00-13/" + f.getName()+ "_" + content.length() + "_" + ".txt")), "utf-8"));
				try {
					pw.append(content);
					pw.flush();
				} finally {
					pw.close();
				}

				// NOTE(review): this is printed for files whose content WAS
				// written; the "missing" label looks inverted — confirm intent.
				System.out.println("missing: " + f.getName());
			}
			System.out.println(count);
		}

	}

}

实验测试：错误率为 42/500。其中，这 42 篇大多是原网页本身不含“正文”的网页。

分享到：