使用动态代理IP让爬虫效率提高N倍

mcj8089

浏览: 194561 次
性别:
来自: 北京

最近访客更多访客>>

msj_0529

sf_dream

laical

lubin83

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Java

动态代理IP 数据抓取数据爬虫 IP被封禁换IP抓数据

package com.goubanjia.test;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class TestDynamicIp {
	public static List ipList = new ArrayList();
	public static boolean gameOver = false;
	public static void main(String[] args) {
		long fetchIpSeconds = 5;
		int threadNum = 10;
		int testTime = 3;
		String order = "这里换成你的订单号，百度全网代理IP获取";
		System.out.println(">>>>>>>>>>>>>>全网代理动态IP测试开始<<<<<<<<<<<<<<");
		System.out.println("***************");
		System.out.println("接口返回IP为国内各地区，每次最多返回10个");
		System.out.println("提取IP间隔 " + fetchIpSeconds + " 秒 ");
		System.out.println("开启爬虫线程 " + threadNum);
		System.out.println("爬虫目标网址  http://1212.ip138.com/ic.asp");
		System.out.println("测试次数 3 ");
		System.out.println("***************\n");
		TestDynamicIp tester = new TestDynamicIp();
		new Thread(tester.new GetIP(fetchIpSeconds * 1000, testTime, order)).start();
		for (int i = 0; i < threadNum; i++) {
			tester.new Ip138Tester(100).start();
		}
		while(!gameOver){
			try {
				Thread.sleep(100);
			} catch (InterruptedException e) {
				e.printStackTrace();
			}
		}
		System.out.println(">>>>>>>>>>>>>>全网代理动态IP测试结束<<<<<<<<<<<<<<");
		System.exit(0);
	}
    
	// 抓取IP138，检测IP
	public class Ip138Tester extends Thread{
		@Override
		public void run() {
			while(!gameOver){
				webParseHtml("http://1212.ip138.com/ic.asp");
				try {
					Thread.sleep(sleepMs);
				} catch (InterruptedException e) {
					e.printStackTrace();
				}
			}
		}
		
		long sleepMs = 200;
		public Ip138Tester(long sleepMs) {
			this.sleepMs = sleepMs;
		}
		public String webParseHtml(String parentUrl) {
			String html = "";
			WebClient client = new WebClient();
			try {
				client.getOptions().setThrowExceptionOnFailingStatusCode(false);
				client.getOptions().setJavaScriptEnabled(false);
				client.getOptions().setCssEnabled(false);
				client.getOptions().setThrowExceptionOnScriptError(false);
				client.getOptions().setTimeout(10000); // 10s超时
				client.getOptions().setAppletEnabled(true);
				client.getOptions().setGeolocationEnabled(true);
				client.getOptions().setRedirectEnabled(true);
				
				String ipport = getAProxy();
				if (ipport != null) {
					ProxyConfig proxyConfig = new ProxyConfig(ipport.split(":")[0], Integer.parseInt(ipport.split(":")[1]));
					client.getOptions().setProxyConfig(proxyConfig);
				}else {
					System.out.print(".");
					return "";
				}
			
				HtmlPage page = client.getPage(parentUrl);
				html = page.asXml();
				
				if (html.length() > 0) {
					html = Jsoup.parse(html).select("center").first().text();
				}
				
				System.out.println(getName() + " 使用代理 " + ipport + "请求IP138返回：" + html);
				
			} catch (Exception e) {
				return webParseHtml(parentUrl);
			} finally {
				client.close();
			}
			return html;
		}
		
	    private String getAProxy() {
	    	if (ipList.size() > 0) {
	    		String ip = ipList.get((int)(Math.random() * ipList.size()));
	    		return ip ;
			}
			return null;
		}
	}
	
	// 定时获取动态IP
	public class GetIP implements Runnable{
		long sleepMs = 1000;
		int maxTime = 3;
		String order = "";
		
		public GetIP(long sleepMs, int maxTime, String order) {
			this.sleepMs = sleepMs;
			this.maxTime = maxTime;
			this.order = order;
		}
		
		@Override
		public void run() {
			long getIpTime = 0;
			int time = 1;
			while(!gameOver){
				if(time >= 4){
					gameOver = true;
					break;
				}
				try {
					java.net.URL url = new java.net.URL("http://dynamic.goubanjia.com/dynamic/get/" + order + ".html?ttl");
			    	HttpURLConnection connection = (HttpURLConnection)url.openConnection();
			    	connection.setConnectTimeout(3000);
			    	connection = (HttpURLConnection)url.openConnection();
			    	
			        InputStream raw = connection.getInputStream();  
			        InputStream in = new BufferedInputStream(raw);  
			        byte[] data = new byte[in.available()];
			        int bytesRead = 0;  
			        int offset = 0;  
			        while(offset < data.length) {  
			            bytesRead = in.read(data, offset, data.length - offset);  
			            if(bytesRead == -1) {  
			                break;  
			            }  
			            offset += bytesRead;  
			        }  
			        in.close();  
			        raw.close();
					String[] res = new String(data, "UTF-8").split("\n");
					List ipList = new ArrayList();
					for (String ip : res) {
						try {
							String[] parts = ip.split(",");
							if (Integer.parseInt(parts[1]) > 0) {
								ipList.add(parts[0]);
							}
						} catch (Exception e) {
						}
					}
					if (ipList.size() > 0) {
						TestDynamicIp.ipList = ipList;
						System.out.println("第" + ++getIpTime + "次获取动态IP " + ipList.size() + " 个");
						time += 1;
					}
				} catch (Exception e) {
					e.printStackTrace();
					System.err.println(">>>>>>>>>>>>>>获取IP出错");
				}
				try {
					Thread.sleep(sleepMs);
				} catch (InterruptedException e) {
					e.printStackTrace();
				}
			}
		}
	}
}

完整项目包下载地址：http://www.goubanjia.com/download/test-dynamic-ip.zip

使用动态代理IP，完全避免了被封IP的风险，爬虫效率直接提升了3倍以上。

请填写全网代理IP订单号，填写之后才可以提取到IP哦

分享到：

JVM性能调优工具 | SpringMVC的MAVEN项目报错ClassNotFoundE ...

2016-08-16 14:38
浏览 11813
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

使用动态代理IP让爬虫效率提高N倍

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

使用动态代理IP让爬虫效率提高N倍

评论

发表评论

相关推荐

数据抓取原理及常见爬虫框架、代理示例

如何使用动态代理IP并开启多线程做数据抓取？

JVM性能调优工具

Mybatis批量更新配置（Mysql batch update）

POI读取word2003和word2007

Java中使用代理IP获取网址内容（防IP被封，做数据爬虫）

Nginx学习笔记

springmvc与mybatis整合，log4j输出sql语句

spring与mybatis项目出现 java.lang.NumberFormatException: For input string: "${jdbc.ma

使用spring做java的swing客户端报错：找不到元素 'beans' 的声明 或者 找不到元素 'tx' 的声明。

SVN Ubuntu创建仓库

Java annotation

Mybatis获取刚插入数据库中的记录的ID（MYSQL）

Enum

JAVA泛型(Generic)

工作中的问题总结（1）

MyEclipse插件注册方法

JAVA正则表达式-捕获组与非捕获组

Java中的静态代理和动态代理

最近访客更多访客>>

使用spring做java的swing客户端报错：找不到元素 'beans' 的声明 或者 找不到元素 'tx' 的声明。