Jsoup数据抓取

_yanbh

浏览: 18297 次
性别:
来自: 上海

最近访客 | 更多访客>>

xiyukongjian

xcly

grafthzqx

ThomasBarsoe

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

/**
	 * Scrapes the gallery listing at http://www.tupianzj.com/meinv/ and downloads
	 * the images it finds through {@link #saveImg(String, String, String)}.
	 *
	 * <p>For every {@code .meinv970 dl} entry on the listing page:
	 * <ol>
	 *   <li>the inner HTML of {@code dt h2 a} is used as the title (and therefore
	 *       as the target directory name passed to {@code saveImg});</li>
	 *   <li>every {@code dd li a img} thumbnail is saved with the tag "小";</li>
	 *   <li>every {@code dd li .moxflashtext a} link is followed, the
	 *       {@code .pictu900 img} image of that page is saved with the tag "大",
	 *       and each usable {@code .pages li a} pagination link is followed the
	 *       same way.</li>
	 * </ol>
	 *
	 * <p>Progress is printed to standard output. Any {@link IOException} raised
	 * while fetching a page stops the remaining crawl and is printed to standard
	 * error; nothing is thrown to the caller.
	 */
	public static void search2() {

		String httpUrl = "http://www.tupianzj.com/meinv/";

		try {

			Document doc = Jsoup.connect(httpUrl).get();

			Elements items = doc.select(".meinv970 dl");

			System.out.println(items);

			for (Element item : items) {

				// The title doubles as the directory name handed to saveImg.
				String title = item.select("dt h2 a").html();

				System.out.println(title);

				// Thumbnails shown directly on the listing page.
				for (Element thumbnail : item.select("dd li a img")) {

					String picUrl0 = thumbnail.attr("src");

					saveImg("小", title, picUrl0);

					System.out.println(picUrl0);
				}

				Elements detailLinks = item.select("dd li .moxflashtext a");

				System.out.println(detailLinks);

				for (Element detailLink : detailLinks) {

					// hrefs are concatenated onto the site root, as in the listing markup.
					String secondUrl = "http://www.tupianzj.com" + detailLink.attr("href");

					Document childDoc = Jsoup.connect(secondUrl).get();

					String picUrl = childDoc.select(".pictu900 img").attr("src");

					System.out.println(picUrl);

					saveImg("大", title, picUrl);

					// Pagination hrefs are resolved against the detail page's directory.
					String pageBase = secondUrl.substring(0, secondUrl.lastIndexOf("/") + 1);

					for (Element pageLink : childDoc.select(".pages li a")) {

						String href = pageLink.attr("href");

						if (!isFollowablePageHref(href)) {
							continue;
						}

						Document pageDoc = Jsoup.connect(pageBase + href).get();

						String picUrl1 = pageDoc.select(".pictu900 img").attr("src");

						System.out.println(picUrl1);

						saveImg("大", title, picUrl1);
					}
				}

			}

		} catch (IOException e) {

			e.printStackTrace();
		}
	}

	/**
	 * Tells whether a pagination {@code href} points at another page that can be
	 * fetched. Empty values, in-page anchors ({@code #...}) and script handlers
	 * ({@code javascript:...}, e.g. {@code javascript:dPlayPre();}) are rejected
	 * because appending them to the base URL does not yield a page address.
	 */
	private static boolean isFollowablePageHref(String href) {
		if (href == null || href.isEmpty()) {
			return false;
		}
		if (href.startsWith("#")) {
			return false;
		}
		String scriptScheme = "javascript:";
		// Case-insensitive prefix test: the scheme may be written in any case.
		return !href.regionMatches(true, 0, scriptScheme, 0, scriptScheme.length());
	}
	
	
	/**
	 * Downloads the resource at {@code picUrl} into {@code D:/pic/<name>/}.
	 *
	 * <p>The file is named {@code tag + System.currentTimeMillis() + ".jpg"}.
	 * The directory is created when it does not exist yet. Because {@code name}
	 * may come from scraped page content, characters that are illegal in
	 * directory names or that could redirect the path (separators, control
	 * characters, dot-only names such as "..") are replaced with {@code _}.
	 *
	 * <p>This is a best-effort operation: any failure (malformed URL, network
	 * error, unwritable target) is printed to standard error and the generated
	 * file name is still returned. Both streams are always closed, including
	 * when the transfer fails midway.
	 *
	 * @param tag    prefix of the generated file name
	 * @param name   name of the sub-directory under {@code D:/pic/}
	 * @param picUrl address of the image to download
	 * @return the generated file name (without directory), whether or not the
	 *         download succeeded
	 */
	public static String saveImg(String tag,String name,String picUrl) {

		String fileName = tag + System.currentTimeMillis() + ".jpg";

		File realDirectory = new File("D:/pic/" + sanitizeDirectoryName(name) + "/");

		if (!realDirectory.exists()) {
			// A failed mkdirs surfaces below as a FileNotFoundException.
			realDirectory.mkdirs();
		}

		try {
			URLConnection con = new URL(picUrl).openConnection();

			// The output file is only opened once the input stream is available,
			// so a failed connection does not leave an empty file behind.
			try (InputStream is = con.getInputStream();
					OutputStream os = new FileOutputStream(new File(realDirectory, fileName))) {

				// 1 KiB copy buffer.
				byte[] bs = new byte[1024];
				int len;
				while ((len = is.read(bs)) != -1) {
					os.write(bs, 0, len);
				}
			}

		} catch (Exception e) {
			// Deliberately broad: a single bad image must not stop the caller.
			e.printStackTrace();
		}

		return fileName;

	}

	/**
	 * Turns an arbitrary string into a single safe directory name by replacing
	 * path separators, Windows-reserved characters and control characters with
	 * {@code _}; names made only of dots become {@code _}.
	 */
	private static String sanitizeDirectoryName(String name) {
		String cleaned = String.valueOf(name).replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
		if (cleaned.matches("\\.+")) {
			return "_";
		}
		return cleaned;
	}

分享到：