[小代码]蜘蛛爬虫，抓取某网站所有图片文章中的图片~

wangshu3000

浏览: 138160 次
性别:
来自: 大连

最近访客更多访客>>

nosuchperson

k-ping

itsfh

anyinger023

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

J2EE企业计算

spider 爬虫 java html httpclient

为朋友的网站写了个小代码，把所有图片下载到本地，有点不道德了，哈哈。。。

package com.ai.picpicker;

import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.htmlparser.util.ParserException;

/**
 * Command-line entry point of the picture spider.
 *
 * <p>Creates one {@link PickerThread} per listing page, lets the workers run
 * in parallel on a shared {@link HttpClient}, waits for all of them and prints
 * the total number of pictures they report via {@code getCount()}.
 */
public class PicPicker {
	public static final String siteUrlPrefix = "http://www.****.com/a********";
	public static final String siteUrlSuffix = ".html";
	// Exclusive upper bound of the page loop; the original note says the
	// site has 20 pages in total.
	public static final int pageNum = 4;// Sum 20 pages.
	// One-based number of the first category each worker processes.
	public static final int startCategory = 1;
	// Zero-based index of the first page to process; the worker for index p
	// is asked to fetch page number p + 1.
	private static final int startPageIndex = 3;

	/**
	 * Starts the workers, waits for them and prints the picture total.
	 *
	 * @param args ignored
	 * @throws InterruptedException if the main thread is interrupted while
	 *         waiting for a worker to finish
	 */
	public static void main(String args[]) throws ParserException, HttpException, IOException, InterruptedException {
		MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
		HttpClient httpClient = new HttpClient(connectionManager);

		int picCount = 0;
		// Main Loop, all page.
		PickerThread[] ppt = new PickerThread[pageNum];
		for (int p = startPageIndex; p < pageNum; p++) {
			ppt[p] = new PickerThread(httpClient, p + 1, startCategory);
			ppt[p].start();
			System.out.println("Thread " + (p + 1) + " Started~~");
		}
		// Join only after every worker has been started; joining inside the
		// start loop would run the pages strictly one after another.
		for (int p = startPageIndex; p < pageNum; p++) {
			ppt[p].join();
		}
		// join() has returned for every worker, so the counters are final.
		for (int p = startPageIndex; p < pageNum; p++) {
			picCount = picCount + ppt[p].getCount();
		}
		System.out.println("All downloaded file num:" + picCount);
	}
}

package com.ai.picpicker;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

/**
 * Worker thread that downloads the pictures referenced from one listing page.
 *
 * <p>The thread parses the listing page, takes the link inside every
 * {@code h4} heading as a category page, fetches that page, scans the text
 * between the first {@code "start"} and the following {@code "end"} for
 * {@code "file"} / {@code "pic"} entries and stores each referenced picture
 * under the directory {@code P<page>_C<category>/}. Progress and errors are
 * written to the file {@code log4thread<page>}.
 */
public class PickerThread extends Thread {

	private HttpClient httpClient = null;
	private int pageNum = 0;
	private int picCountT = 0;
	private int startCategory = 0;

	/**
	 * @param httpClient    client used for all category and picture requests
	 * @param pageNum       number of the listing page handled by this thread
	 * @param startCategory one-based index of the first category to process
	 */
	public PickerThread(HttpClient httpClient, int pageNum, int startCategory) {
		this.httpClient = httpClient;
		this.pageNum = pageNum;
		// Stored zero-based so it can be used directly as a NodeList index.
		this.startCategory = startCategory - 1;
	}

	/**
	 * @return number of pictures this thread has written to disk so far
	 */
	public int getCount() {
		return picCountT;
	}

	/**
	 * Processes every category of the page. Any exception ends the thread
	 * after being reported on standard output together with the picture path
	 * that was being handled; the log file is closed in all cases.
	 */
	@Override
	public void run() {
		System.out.println("Thread " + pageNum + " Running~~");
		File forLogFile = new File("log4thread" + pageNum);
		FileWriter fwl = null;
		String picStr = null;
		try {
			Parser parser = new Parser("http://www.******.com/a*******" + pageNum + ".html");
			fwl = new FileWriter(forLogFile);
			NodeList nodelist = parser.parse(null);
			NodeFilter categoryFilter = new TagNameFilter("h4");
			NodeList categoryList = nodelist.extractAllNodesThatMatch(categoryFilter, true);
			for (int i = startCategory; i < categoryList.size(); i++) {
				HeadingTag ht = (HeadingTag) categoryList.elementAt(i);
				LinkTag lt = (LinkTag) ht.getChild(0);
				String oneUrl = lt.getLink();
				fwl.write("[INFO]" + "Category Num" + i + " Downloading! Url:" + oneUrl + "\n");
				byte[] pageBody = fetchCategoryPage(oneUrl, fwl);
				if (pageBody != null) {
					String picListHtml = new String(pageBody, "UTF-8").trim();
					int startIdx = picListHtml.indexOf("start");
					// Look for the end marker only behind the start marker.
					int endIdx = startIdx == -1 ? -1 : picListHtml.indexOf("end", startIdx);
					if (endIdx == -1) {
						// A page without the markers must not abort the
						// remaining categories of this thread.
						fwl.write("[ERROR]" + "Picture list markers not found! Url:" + oneUrl + "\n");
					} else {
						String picSubStr = picListHtml.substring(startIdx, endIdx);
						int fileIdx;
						while ((fileIdx = picSubStr.indexOf("\"file\"")) != -1) {
							int picIdx = picSubStr.indexOf("\"pic\"");
							if (picIdx - 2 < fileIdx + 8) {
								fwl.write("[ERROR]" + "Malformed picture entry, skipping rest of category! Url:"
										+ oneUrl + "\n");
								break;
							}
							// The value sits between "file":" and the
							// separator in front of "pic"; drop the
							// backslashes that escape the slashes.
							picStr = picSubStr.substring(fileIdx + 8, picIdx - 2).replace("\\", "");
							File picFile = targetFileFor(picStr, i);
							if (picFile.exists()) {
								fwl.write("[ERROR]" + "Duplication picture! FileName:" + picFile.getPath() + "\n");
							} else {
								downloadPicture(picStr, picFile, fwl);
							}
							// System.out.println(picStr);
							String rest = skipCurrentEntry(picSubStr);
							if (rest == null) {
								// Cursor cannot advance; stop instead of
								// handling the same entry forever.
								break;
							}
							picSubStr = rest;
						}
					}
					fwl.write("[DEBUG]" + lt.getLink() + "\n");
					fwl.flush();
				}
				System.out.println();
				fwl.flush();
			}
			fwl.write("[INFO]" + "Thread " + pageNum + " run over " + picCountT + "pic!!!\n");
			fwl.flush();
		} catch (Exception e) {
			System.out.println("Thread " + pageNum + " Exception!!! PicUrl:" + picStr);
			e.printStackTrace();
		} finally {
			try {
				if (fwl != null)
					fwl.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}

	}

	/**
	 * Fetches one category page and logs the outcome.
	 *
	 * @return the response body, or {@code null} when the status was not 200
	 *         or the response carried no body
	 */
	private byte[] fetchCategoryPage(String oneUrl, FileWriter fwl) throws IOException {
		GetMethod getPageMethod = new GetMethod(oneUrl);
		try {
			int statusCode = httpClient.executeMethod(getPageMethod);
			if (statusCode != HttpStatus.SC_OK) {
				fwl.write("[ERROR]" + "Method failed: " + getPageMethod.getStatusLine() + "\n");
				return null;
			}
			fwl.write("[INFO]" + "Page" + pageNum + "_" + getPageMethod.getStatusLine() + "\n");
			fwl.flush();
			return getPageMethod.getResponseBody();
		} finally {
			// Release on every path, including non-200 and exceptions, so
			// the shared connection pool is not drained.
			getPageMethod.releaseConnection();
		}
	}

	/**
	 * Builds the local target file for a picture and creates the category
	 * directory {@code P<page>_C<category>/} if it does not exist yet.
	 */
	private File targetFileFor(String picStr, int categoryIndex) {
		StringBuilder sb = new StringBuilder();
		sb.append("P").append(pageNum).append("_C").append(categoryIndex + 1).append("/");
		File dir = new File(sb.toString());
		if (!dir.exists()) {
			dir.mkdir();
		}
		// File name is whatever follows the first '/' found at or after
		// index 16 of the picture path.
		sb.append(picStr.substring(picStr.indexOf("/", 16) + 1, picStr.length()));
		return new File(sb.toString());
	}

	/**
	 * Downloads one picture into {@code picFile}, logs the outcome and
	 * increments the picture counter on success.
	 */
	private void downloadPicture(String picStr, File picFile, FileWriter fwl) throws IOException {
		GetMethod getPicMethod = new GetMethod("http://www.******.com/" + picStr);
		byte[] picBody;
		try {
			int statusCode = httpClient.executeMethod(getPicMethod);
			if (statusCode != HttpStatus.SC_OK) {
				fwl.write("[ERROR]" + "Method failed: " + " URL:" + "http://www.********.com/" + picStr
						+ getPicMethod.getStatusLine() + "\n");
				return;
			}
			picBody = getPicMethod.getResponseBody();
		} finally {
			getPicMethod.releaseConnection();
		}
		if (picBody == null) {
			fwl.write("[ERROR]" + "Empty response body! URL:" + "http://www.********.com/" + picStr + "\n");
			return;
		}
		FileOutputStream picOutPut = new FileOutputStream(picFile);
		try {
			picOutPut.write(picBody);
		} finally {
			// Close even when the write fails so the file handle is freed.
			picOutPut.close();
		}
		fwl.write("[INFO]" + "Pic" + picCountT++ + " URL:" + "http://www.*********.com/" + picStr + "\n");
		fwl.flush();
		System.out.print('.');
	}

	/**
	 * Returns the text behind the first {@code "pic"} key found at or after
	 * index 7, skipping 7 characters from the start of the key, or
	 * {@code null} when there is no such key.
	 */
	private static String skipCurrentEntry(String picSubStr) {
		int nextPic = picSubStr.indexOf("\"pic\"", 7);
		if (nextPic == -1) {
			return null;
		}
		return picSubStr.substring(Math.min(nextPic + 7, picSubStr.length()));
	}
}

2
顶

6
踩

分享到：

[疑问]关于NIO的耗时服务端业务逻辑问题 ... | 写了个小代码，统计史蒂夫乔布斯传英文版的 ...

2012-01-02 12:02
浏览 4536
评论(2)
分类:非技术
查看更多

2 楼 wangshu3000 2012-01-03

greatghoul 写道

很不错，我也经常抓取些图片，当然都是**网站的，如果只是抓取图片，建议使用正则，会快很多。
还有这个run方法大了点儿，可以再分离下。。。

不好意思，完全没考虑设计，

就是实现功能，一次性的。呵呵。所以也不顾什么设计模式，什么代码结构了。实现功能就OK了。。

1 楼 greatghoul 2012-01-03

很不错，我也经常抓取些图片，当然都是**网站的，如果只是抓取图片，建议使用正则，会快很多。
还有这个run方法大了点儿，可以再分离下。。。

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

[小代码]蜘蛛爬虫，抓取某网站所有图片文章中的图片~

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

[小代码]蜘蛛爬虫，抓取某网站所有图片文章中的图片~

评论

发表评论

相关推荐

MyLab

Learning

Eclipse 绘制草图的plugin

My Environment

话单匹配问题

Senior Solution Architect(Systems)

从一篇文章中筛选出辞典生词本中没有的单词，导出成txt文件

关于火车订票系统瓶颈的分析及解决方案

[疑问]关于NIO的耗时服务端业务逻辑问题？？？？

写了个小代码，统计史蒂夫乔布斯传英文版的单词数量

软件项目版本号的命名规则及格式介绍

中文编码基础知识介绍

软件架构

理解架构师

架构和架构设计师

构架师之路

软件架构师成长之路

大型网站架构之:MySpace的体系架构

IT网站

一位软件工程师的6年总结

最近访客更多访客>>