Python验证IP是否可用

uule

浏览: 6387314 次
性别:
来自: 一片神奇的土地

最近访客更多访客>>

myzcm

wangenbao1

hyjqdy

逆光下的轮廓

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Python

python爬虫-爬取代理IP并通过多线程快速验证

scrapy爬虫代理——利用crawlera神器，无需再寻找代理IP

python爬虫成长之路（二）：抓取代理IP并多线程验证

在使用爬虫爬取网络数据时，如果长时间对同一个网站进行抓取，可能会遇到IP被封的情况，这时可以使用代理更换IP来突破服务器封IP的限制。

随手在百度上搜索免费代理IP，可以得到一系列的网站，这里我们通过对西刺网站的抓取来举例。

验证代理IP是否可用，原理是使用代理IP访问指定网站，如果返回状态为200，表示这个代理是可以使用的。

http://www.jianshu.com/p/588241a313e7

方法1：

import requests

# Probe the proxy by fetching a page through it.  The proxy counts as usable
# only when the request completes and the server answers with HTTP 200.
try:
    response = requests.get(
        'http://wenshu.court.gov.cn/',
        proxies={"http": "http://121.31.154.12:8123"},
        timeout=10,  # without a timeout a dead proxy can block forever
    )
except requests.RequestException:
    # Connection refused, proxy error, timeout, ...: the proxy is unusable.
    print('connect failed')
else:
    if response.status_code == 200:
        print('success')
    else:
        # Something answered, but not with the page that was requested.
        print('connect failed')

方法2：

import telnetlib

# Check that a TCP connection to host:port can be established at all.
# NOTE: telnetlib is deprecated since Python 3.11 and removed in Python 3.13.
try:
    connection = telnetlib.Telnet('127.0.0.1', port=80, timeout=20)
except OSError:
    # Refused, unreachable or timed out.
    print('connect failed')
else:
    # Only reachability matters here; close the socket instead of leaking it.
    connection.close()
    print('success')

例子：

#coding=UTF-8
import urllib.request



def validateIp():
    """Check every proxy listed in ``proxy.txt`` and save the working ones.

    Each line of ``proxy.txt`` is expected to look like ``scheme=host:port``
    (for example ``http=218.21.169.19:8998``).  The test page is fetched
    through each proxy; lines whose proxy delivered the page are written
    unchanged to ``proxy2.txt``.  Failing lines are reported on stdout.
    """
    url = "http://www.baidu.com/"

    # Both files are closed when the block ends, even if an error escapes.
    with open('proxy.txt', 'r') as inFile, open("proxy2.txt", "w") as f:
        for line in inFile:
            line = line.strip()
            if not line:
                continue  # ignore blank lines

            try:
                scheme, address = line.split('=', 1)
                proxy_temp = {scheme: '://'.join((scheme, address))}
                print(proxy_temp)
                # urlopen() accepts no ``proxies`` argument; the proxy has to
                # be configured through a ProxyHandler on a dedicated opener.
                opener = urllib.request.build_opener(
                    urllib.request.ProxyHandler(proxy_temp))
                # The timeout keeps one dead proxy from stalling the run.
                with opener.open(url, timeout=10) as response:
                    response.read()
            except Exception:
                # Best-effort check: any failure just marks this proxy bad.
                print('%s connect failed' % line)
            else:
                f.write(line + '\n')
 
    
# Run the proxy check only when this file is executed as a script.
if __name__ == '__main__':
    validateIp()

结果：

{'http': 'http://218.21.169.19:8998'}

http=218.21.169.19:8998 connect failed

{'http': 'http://27.46.74.38:9999'}

http=27.46.74.38:9999 connect failed

{'http': 'http://60.173.35.99:808'}

http=60.173.35.99:808 connect failed

{'http': 'http://218.4.95.182:80'}

http=218.4.95.182:80 connect failed

{'http': 'http://218.56.132.155:8080'}

http=218.56.132.155:8080 connect failed

。。

网上例子：

python爬虫成长之路（二）：抓取代理IP并多线程验证

#coding:utf-8
import urllib2

def url_user_agent(url):
    """Fetch *url* through a fixed HTTP proxy with a browser User-Agent.

    Returns the response body when the final URL reported by the response
    equals *url*; returns ``None`` otherwise.
    """
    # Route HTTP traffic through the proxy by installing an opener that
    # urllib2.urlopen() will use from now on.
    handler = urllib2.ProxyHandler({'http': '27.24.158.155:84'})
    urllib2.install_opener(urllib2.build_opener(handler))

    # Imitate a browser so the server does not answer 403 Forbidden.
    browser_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    response = urllib2.urlopen(urllib2.Request(url, headers=browser_headers))

    if response.geturl() != url:
        return None
    return response.read()

# Fetch a sample page through the proxy and print what came back.
url = 'http://www.dianping.com/search/category/2/10/g311'
doc = url_user_agent(url)
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(doc)

自己写的可用例子：

#coding=UTF-8
import urllib.request
import chardet

def url_user_agent():
    """Test every proxy in ``proxy.txt`` and record the working ones.

    Each input line is expected to look like ``scheme=host:port`` (for
    example ``http=58.33.37.205:8118``).  The test page is requested through
    each proxy with a browser-like User-Agent and a 5 second timeout.  Lines
    whose proxy returned a non-empty body are written, UTF-8 encoded, to
    ``available.txt``; failures are reported on stdout.
    """
    url = 'http://www.baidu.com/'

    # Both files are closed when the block ends, even if an error escapes.
    with open('proxy.txt', 'r') as inFile, open("available.txt", "wb") as f:
        for line in inFile:
            line = line.strip('\n')
            parts = line.split('=')
            if len(parts) < 2:
                # Blank or malformed line: there is no proxy to test.
                continue
            proxy_temp = {parts[0]: parts[1]}
            print(proxy_temp)

            # Build an opener that goes through this proxy and make it the
            # default used by urllib.request.urlopen().
            opener = urllib.request.build_opener(
                urllib.request.ProxyHandler(proxy_temp))
            # Imitate a browser so the server does not answer 403 Forbidden.
            opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64)")]
            urllib.request.install_opener(opener)

            try:
                with urllib.request.urlopen(url, timeout=5) as html:
                    content = html.read()
            except Exception:
                # Best-effort check: any failure just marks this proxy bad.
                print('%s connect failed' % line)
                continue

            print(content)
            print("==============================")
            # ``content`` is bytes, so comparing it with '' is always unequal;
            # test for emptiness directly instead.
            if content.strip():
                f.write((line + '\n').encode(encoding="UTF-8"))

    print("Test End !")
   
   
# Run the proxy test only when this file is executed as a script.
if __name__ == '__main__':  
    url_user_agent()

多线程例子：

#coding=UTF-8
import urllib.request
import urllib
import re
import time
import socket
import threading

# Parse proxy.txt (one ``scheme=host:port`` entry per line) into the list of
# {scheme: host:port} mappings that ProxyHandler expects.
proxys = []
with open('proxy.txt', 'r') as inFile:  # closed as soon as parsing is done
    for line in inFile:
        line = line.strip('\n')
        parts = line.split('=')
        if len(parts) < 2:
            # Blank or malformed line: there is no proxy to record.
            continue
        proxy_host = parts[1]
        proxy_temp = {parts[0]: proxy_host}
        print(proxy_temp)
        proxys.append(proxy_temp)

# Output file for the proxies that pass validation.  It stays open on purpose:
# the worker threads write to it and the script closes it after they finish.
proxy_ip = open('proxy_ip.txt', 'w')


# Serialises console output and writes to ``proxy_ip`` across worker threads.
lock = threading.Lock()


def test(i):
    """Check whether ``proxys[i]`` can fetch the test page and record it.

    On success the proxy mapping is printed and appended to ``proxy_ip``;
    on failure the proxy and the exception are printed.  Nothing is returned.

    Args:
        i: Index into the module-level ``proxys`` list.
    """
    url = "http://www.baidu.com/"  # page used to test the proxy
    try:
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler(proxys[i]))
        opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64)")]
        # Use this thread's own opener.  install_opener() replaces a single
        # global opener, so concurrent threads would overwrite each other's
        # proxy and a result could be credited to the wrong one.
        with opener.open(url, timeout=5) as response:
            response.read()
    except Exception as e:
        with lock:
            print(proxys[i], e)
        return

    # ``with`` releases the lock even if the write fails; a manual
    # acquire/release pair would leave it held and block the other threads.
    with lock:
        print(proxys[i], 'is OK')
        proxy_ip.write('%s\n' % str(proxys[i]))  # record the working proxy
# Single-threaded alternative:
#     for i in range(len(proxys)):
#         test(i)

# Multi-threaded validation: start one worker thread per proxy.
threads = []
# time.clock() was removed in Python 3.8.  perf_counter() has an arbitrary
# reference point, so only the difference between two readings is meaningful.
start = time.perf_counter()
for i in range(len(proxys)):
    thread = threading.Thread(target=test, args=[i])
    threads.append(thread)
    thread.start()

# Block the main thread until every worker has finished.
for thread in threads:
    thread.join()

proxy_ip.close()  # all workers are done writing

end = time.perf_counter()
print("开始时间: %f s" % start)
print("结束时间: %f s" % end)
print("校验IP耗时: %f s" % (end - start))

结果：

...

{'http': '221.197.1.210:14515'} <urlopen error timed out>

{'http': '183.78.183.156:82'} is OK

开始时间: 0.000000 s

结束时间: 6.352310 s

校验IP耗时: 6.352309 s

#coding:gbk
import requests
# Proxy address (high anonymity).
# NOTE(review): the 'https' entry uses an https:// proxy URL, which makes the
# client speak TLS to the proxy itself.  If this is a plain HTTP proxy the
# value should probably be 'http://117.85.105.170:808' -- confirm before
# using it for https:// targets.  The request below is plain http.
proxy = {
    'http': 'http://117.85.105.170:808',
    'https': 'https://117.85.105.170:808'
}
# Request headers.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 
             'Connection': 'keep-alive'}
# http://icanhazip.com returns the IP address the request came from, so the
# printed value shows whether the proxy was actually used.
# The timeout keeps a dead proxy from blocking the script forever.
p = requests.get('http://icanhazip.com', headers=head, proxies=proxy, timeout=10)
print(p.text)

分享到：

【第三方包总结】 | jQuery中map函数

2017-04-07 18:16
浏览 17640
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论