`
yuhai.china
  • 浏览: 160531 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

使用watij和xpath实现自动spider(完善中)

阅读更多
watij本来是用于web测试的,但是我发现利用它来做垂直爬虫,效果也很好。
以下的代码抓取了三个网站:
package com.example.tests;

import watij.runtime.ie.IE;
import watij.finders.AttributeFinder;
import watij.finders.Finder;
import watij.finders.NameFinder;
import watij.finders.XPathFinder;
import watij.finders.FinderFactory.*;
import watij.elements.*;


/**
 * Demo "vertical spider" built on Watij browser automation.
 *
 * <p>The program opens http://hotel.qunar.com in Internet Explorer, runs a
 * search, opens the first result in a child browser window, and then follows
 * the booking links labelled with the Ctrip and eLong site names, printing the
 * text it finds on each page to standard output.
 */
public class WatijHotel {
	/** Text of the link on the hotel detail page that is clicked before a site-specific link. */
	private static final String BOOKING_SITES_LINK = "预订网站";

	/** CSS class of the table whose text is printed and whose first link is followed. */
	private static final String BOOKING_TABLE_CLASS = "bookingTable";

	/**
	 * Runs the crawl end to end. All failures are reported with
	 * {@code printStackTrace()}; every browser window that was opened is closed
	 * before returning.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){
		IE ie = new IE(),new_ie=null;
		IE iectrip=null,ieelong=null;
		try {
			ie.start("http://hotel.qunar.com");
			ie.checkbox(new AttributeFinder("id","hchkParaSeachElong")).click();
			ie.button(new AttributeFinder("id","hbtnSearch")).click();

			// Result links open in a new window; follow the first one.
			Links ls = ie.links(new AttributeFinder("target","_blank"));
			ls.link(0).click();
			new_ie = ie.childBrowser();
			new_ie.waitUntilReady(1000);

			String text = new_ie.div(new AttributeFinder("class","detailInfoLinks")).text();
			System.out.println(text);
			// Splitting on ")" drops the delimiter, so it is re-appended below
			// to rebuild the text of each link.
			String[] links = text.split("\\)");
			String ctrip = "携程旅行网";
			String elong = "艺龙旅行网";
			System.out.println(new_ie.childBrowserCount());

			for(String link : links){
				if(link.indexOf(ctrip)>=0){
					try {
						clickThroughToBookingSite(new_ie, link + ")");
						System.out.println(new_ie.childBrowserCount());
						iectrip = newestChildBrowser(new_ie);
					} catch (Exception e) {
						// A failure on one site must not stop the other from being crawled.
						e.printStackTrace();
					}
				} else if(link.indexOf(elong)>=0){
					try {
						clickThroughToBookingSite(new_ie, link + ")");
						ieelong = newestChildBrowser(new_ie);
					} catch (Exception e) {
						e.printStackTrace();
					}
				}
			}
			if(ieelong != null){
				ieelong.waitUntilReady(10000);
				// NOTE(review): second, much shorter wait kept from the original; confirm whether it is needed.
				ieelong.waitUntilReady(20);
				ieelong.div(new AttributeFinder("class", "taL left10_dbk2")).link(0).click();
				ieelong.waitUntilReady(10000);
				System.out.println(ieelong.text());
			}
			if(iectrip != null){
				iectrip.waitUntilReady(10000);
				String ctripprice = iectrip.table(new AttributeFinder("class","pubGlobal_romList01")).text();
				System.out.println(ctripprice);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally{
			// Close each window independently so that one failing close()
			// cannot leave the remaining browser windows open.
			closeQuietly(ie);
			closeQuietly(new_ie);
			closeQuietly(ieelong);
			closeQuietly(iectrip);
		}

	}

	/**
	 * On the hotel detail page, clicks the booking-sites link, then the link
	 * whose text is {@code siteLinkText}, prints the booking table's text and
	 * clicks the first link inside that table.
	 *
	 * @param hotelPage browser showing the hotel detail page
	 * @param siteLinkText text of the site-specific link to click
	 * @throws Exception if any element lookup or click fails
	 */
	private static void clickThroughToBookingSite(IE hotelPage, String siteLinkText) throws Exception {
		hotelPage.link(BOOKING_SITES_LINK).click();
		hotelPage.link(siteLinkText).click();
		String qunarprice = hotelPage.table(
				new AttributeFinder("class", BOOKING_TABLE_CLASS))
				.text();
		System.out.println(qunarprice);
		hotelPage.table(
				new AttributeFinder("class", BOOKING_TABLE_CLASS))
				.links().get(0).click();
	}

	/**
	 * Returns the child browser of {@code parent} with the highest index.
	 *
	 * @param parent browser whose child windows are inspected
	 * @return the child browser at index {@code childBrowserCount() - 1}
	 * @throws Exception if the child browser cannot be obtained
	 */
	private static IE newestChildBrowser(IE parent) throws Exception {
		int count = parent.childBrowserCount();
		return parent.childBrowser(count - 1);
	}

	/**
	 * Closes {@code browser} if it is non-null, reporting (not propagating)
	 * any failure.
	 *
	 * @param browser browser window to close; may be {@code null}
	 */
	private static void closeQuietly(IE browser) {
		if(browser == null){
			return;
		}
		try {
			browser.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}
分享到:
评论
2 楼 lsb_48 2009-09-02  
你好,有关于watij的问题想请教,麻烦你了,很急
QQ:2024486
1 楼 kqy929 2008-09-05  
能否介绍下你的解决方案?
最近由于工作需要也得做过类似的spider。
kqy929@126.com
Global site tag (gtag.js) - Google Analytics