java网页抓取问题

0 0

java网页抓取问题20

在这个网站中：http://wwwapps.ups.com/WebTracking/track?HTMLVersion=5.0&loc=zh_CN&Requester=UPSHome&WBPM_lid=homepage%2Fct1.html_pnl_trk&trackNums=H8947154378&track.x=%E8%BF%BD%E8%B8%AA

　　需要抓取：页面中“运输进程”的部分。该运输进程查看源码为一个div层（<div class="secBody" >），点击层后URL地址改变为：http://wwwapps.ups.com/WebTracking/detail 。因为抓取的信息需要第一个链接中的H8947154378参数，所以URL改变后就不知道怎么抓取了。

　　通过普通抓取只能抓取到层中的第一条数据和最后一条数据；用火狐和其他浏览器查看第一个页面的源码，div中也只有第一条数据和最后一条数据。
　　public String getPageContent(String strUrl, String strPostRequest,int maxLength) {

　　// 读取结果网页
　　StringBuffer buffer = new StringBuffer();
　　System.setProperty("sun.net.client.defaultConnectTimeout", "5000");
　　System.setProperty("sun.net.client.defaultReadTimeout", "5000");
　　try {
　　URL newUrl = new URL(strUrl);
　　HttpURLConnection hConnect = (HttpURLConnection) newUrl.openConnection();
　　// POST方式的额外数据
　　if (strPostRequest.length() > 0) {
　　hConnect.setDoOutput(true);
　　OutputStreamWriter out = new OutputStreamWriter(hConnect.getOutputStream());
　　out.write(strPostRequest);
　　out.flush();
　　out.close();
　　}
　　// 读取内容

　　BufferedReader rd = new BufferedReader(new InputStreamReader(hConnect.getInputStream(),"utf-8"));
　　int ch;
　　for (int length = 0; (ch = rd.read()) > -1 && (maxLength <= 0 || length < maxLength); length++)
　　buffer.append((char) ch);
　　String s = buffer.toString();
　　s.replaceAll("//&[a-zA-Z]{1,10};", "").replaceAll("<[^>]*>", "");
　　System.out.println(s);

　　rd.close();
　　hConnect.disconnect();
　　return buffer.toString().trim();
　　} catch (Exception e) {
　　return "错误:读取网页失败！";
　　//

　　}
　　}
　　public static void main(String[] args) {

　　String url = "http://wwwapps.ups.com/WebTracking/track?HTMLVersion=5.0&loc=zh_CN&Requester=UPSHome&WBPM_lid=homepage%2Fct1.html_pnl_trk&trackNums=H8947154378&track.x=%E8%BF%BD%E8%B8%AA";

　　String url2 = "http://wwwapps.ups.com/WebTracking/detail";

　　Test p = new Test();
　　p.getPageContent(url, "post", 100500);

　　Test3 p3 = new Test3();
　　p3.getPageContent(url2, "post", 100500);
　　System.out.print("已经执行！");
　　}
　　上面是我写的普通抓取办法。
　　想请教大家是否有其他解决办法（该网站没有公开的API接口）。

Java 网页抓取二次抓取

2012年6月21日 16:42

qizi456258
1
0 0 5

4个答案按时间排序按投票排序

0 0

采纳的答案

这里是使用HttpClient和nekohtml的完整实现，能够完整抓取出来运输进程一览：

public class UpsDetail {
	
	private static final String HTML_TACK_HTML = "html/tack.html";
	private static final String HTML_DETAIL_HTML = "html/detail.html";

	private static String url1 = "http://wwwapps.ups.com/WebTracking/track?HTMLVersion=5.0&loc=zh_CN&Requester=UPSHome&WBPM_lid=homepage%2Fct1.html_pnl_trk&trackNums=H8947154378&track.x=%E8%BF%BD%E8%B8%AA";
	private static String url2 = "http://wwwapps.ups.com/WebTracking/detail"; 

	public static void main(String[] args) {
		
		try {
			
			//抓取追踪信息页面HTML
			getHtml(url1, HTML_TACK_HTML, null);

			//获取 抓取运输进程页面HTML时 需要的参数
			Map<String, String> data = getHiddenValue(HTML_TACK_HTML);

			//抓取运输进程页面HTML		
			getHtml(url2, HTML_DETAIL_HTML, data);
			
			//获取运输进程
			List<DetailBean> list = getDetailList(HTML_DETAIL_HTML);
			
			//打印详细的运输进程
			DetailBean bean = null;
			System.out.println("地点" + "\t" + "日期" + "\t" + "当地时间" + "\t" + "处理");
			for (int i = 0; i < list.size(); i++) {
				bean = list.get(i);
				System.out.println(bean.getLocation() + "\t" + bean.getDate() + "\t" + bean.getTime() + "\t" + bean.getOperation());
			}
		
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	private static List<DetailBean> getDetailList(String html) throws Exception {
		List<DetailBean> list = new ArrayList<DetailBean>();
		
		DOMParser parser = new DOMParser();
		parser.parse(html);
		Node node = parser.getDocument();
				
		Node tb = XPathAPI.selectSingleNode(node, "//TABLE[@class='dataTable']");
		NodeList tdlist = XPathAPI.selectNodeList(tb, "//TR/TD");

		int line = 0;
		while (line < tdlist.getLength() / 4) {
			DetailBean bean = new DetailBean();
			
			bean.setLocation(deleteSpace(tdlist.item(line * 4 + 0).getTextContent()));
			bean.setDate(deleteSpace(tdlist.item(line * 4 + 1).getTextContent()));
			bean.setTime(deleteSpace(tdlist.item(line * 4 + 2).getTextContent()));
			bean.setOperation(deleteSpace(tdlist.item(line * 4 + 3).getTextContent()));
			
			line++;
			
			list.add(bean);
		}
		
		return list;
	}

	private static Map<String, String> getHiddenValue(String html) throws Exception {		
		Map<String, String> data = new HashMap<String, String>();
		
		List<String> params = new ArrayList<String>();
		params.add("loc".toLowerCase());
		params.add("USER_HISTORY_LIST".toLowerCase());
		params.add("progressIsLoaded".toLowerCase());
		params.add("refresh_sii".toLowerCase());
		params.add("showSpPkgProg1".toLowerCase());
		params.add("datakey".toLowerCase());
		params.add("HIDDEN_FIELD_SESSION".toLowerCase());
		params.add("trackNums".toLowerCase());
		
		DOMParser parser = new DOMParser();
		parser.parse(html);
		Node node = parser.getDocument();
		
		NodeList nodeList = XPathAPI.selectNodeList(node, "//INPUT");
		for (int i = 0; i < nodeList.getLength(); i++) {
			Element e = (Element) nodeList.item(i);
			if ("hidden".equalsIgnoreCase(e.getAttribute("type"))
					&& params.contains(e.getAttribute("name").toLowerCase())) {
				data.put(e.getAttribute("name"), e.getAttribute("value"));
			}
		}
		
		System.out.println("订单编号:" + data.get("trackNums"));
		return data;
	}

	private static void getHtml(String url, String filename, Map<String, String> data) throws Exception {
		
		//创建一个客户端
		DefaultHttpClient client = new DefaultHttpClient();

		HttpResponse res = null;
		if (data == null) {
			//创建一个get方法
			HttpGet get = new HttpGet(url);
			//执行请求
			res = client.execute(get);
		} else {
			
			client.setRedirectStrategy(new DefaultRedirectStrategy() {                
			        public boolean isRedirected(HttpRequest request, HttpResponse response, HttpContext context)  {
			            boolean isRedirect = false;
			            try {
			                isRedirect = super.isRedirected(request, response, context);
			            } catch (ProtocolException e) {
			                e.printStackTrace();
			            }
			            if (!isRedirect) {
			                int responseCode = response.getStatusLine().getStatusCode();
			                if (responseCode == 301 || responseCode == 302) {
			                    return true;
			                }
			            }
			            return isRedirect;
			        }
			    });
			
			//作成post参数Entity
			List<NameValuePair> formparams = new ArrayList<NameValuePair>();
			Iterator i = data.keySet().iterator();
			while(i.hasNext()) {
				String key = (String)i.next();
				formparams.add(new BasicNameValuePair(key, data.get(key)));
			}
			UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, "UTF-8");

			//创建一个post方法
			HttpPost post = new HttpPost(url);
			//设置post参数
			post.setEntity(entity);

			//执行请求
			res = client.execute(post);
		}

		//获取完整的StatusLine・・・「HTTP/1.1 200 OK」
		System.out.println(res.getStatusLine().toString());
		
		//获取返回内容
		if (res.getEntity() != null) {
			String result = EntityUtils.toString(res.getEntity());
			//System.out.println(result);			
			//生成HTML文件保存到本地（测试用可以不保存直接解析）
			createHtmlFile(filename, result);
		}
		
		//关闭流
		EntityUtils.consume(res.getEntity());
		
		//关闭连接
		client.getConnectionManager().shutdown();
	}
	
	private static void createHtmlFile(String filename, String data) throws Exception {
		File file = new File(filename);
		OutputStream os = new FileOutputStream(file);
		os.write(data.getBytes("UTF-8"));
		os.close();
	}
	
	private static String deleteSpace(String in) {
		Pattern pattern = Pattern.compile("\\s*|\t|\r|\n");
	    Matcher re = pattern.matcher(in);
	    
	    return re.replaceAll("");
	}

}

其中用到的DetailBean

/**
 * Mutable holder for one entry of the shipment progress table. Every property is a plain
 * string and stays {@code null} until it is set.
 */
public class DetailBean {

	/** Location. */
	private String location;

	/** Date. */
	private String date;

	/** Local time. */
	private String time;

	/** Operation (processing step). */
	private String operation;

	/** @return the location, or {@code null} if it was never set */
	public String getLocation() {
		return this.location;
	}

	/** @param location the location to store */
	public void setLocation(final String location) {
		this.location = location;
	}

	/** @return the date, or {@code null} if it was never set */
	public String getDate() {
		return this.date;
	}

	/** @param date the date to store */
	public void setDate(final String date) {
		this.date = date;
	}

	/** @return the local time, or {@code null} if it was never set */
	public String getTime() {
		return this.time;
	}

	/** @param time the local time to store */
	public void setTime(final String time) {
		this.time = time;
	}

	/** @return the operation, or {@code null} if it was never set */
	public String getOperation() {
		return this.operation;
	}

	/** @param operation the operation to store */
	public void setOperation(final String operation) {
		this.operation = operation;
	}

}

2012年6月22日 11:15

rensanning
216
0 0 12

1条评论

0 0

注意：点击运输进程后提交的是POST请求。
虽然这个网址 http://wwwapps.ups.com/WebTracking/detail 后面没有参数，但它是POST请求，里面藏着2个cookie要传给网站：
UPS_SHARED_SESSION:
webappcommon.cclamp.usb.acceptsCookie
我想H8947154378 参数已经通过session/cookie藏在里面了。所以你要在第一个网页先找到以上两个cookie参数。并通过POST方式提交给第二个网页。

建议用firebug监控每次提交网页的动作（GET，POST）到底传哪些参数给网站，以及网站返回给你哪些信息（set-cookie）

单用JDK搞定这类工作会很累，建议用一些第三方类包，比如用httpclient抓网页、用htmlparser解析html（建议用这两个），或者用webharvest搞定（但这个要写xml，初学者会比较累）。可选用的开源包很多，比自己写来得方便多了。

同时建议了解一下http协议，不然做这类东西会云里雾里，知其然不知其所以然。了解了http协议，你就知道为什么要这样GET，POST了，cookie，session的作用。

2012年6月22日 11:34