新浪微博小爬虫(转)

wbj0110

浏览: 1637579 次
性别:
来自: 上海

最近访客更多访客>>

一往无前bhz

ninja2006

loginboot

u012363178

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Data Mining

Data Mining

一直琢磨着写个爬虫玩，上学期都没实行，于是花了大概一天写了这个东西

其实半天就把程序调试好了，可是在往mysql数据库里保存数据的时候出了问题

python的中文编码实在是非常麻烦，不光如此，因为有些用户的简介里有一些特殊符号，®或者笑脸之类的，于是在这里纠结了很久，好歹最后是成功了(其实也就是过滤掉了那些特殊符号)

效率方面，开始的时候一个小时能采集大概1.4w条微博用户信息；但是由于我是从每个人的关注列表里采集的，很快就会爬到许多已经爬过的用户，所以总的来说效率不是很高，怪不得那个“中国爬盟”要发动群众的力量去爬

而且有些担心爬久了微博账号被封，我也没敢尝试太久，最后爬了5w条用户信息，8w条关系数据，我拿着数据目前也没什么用，所以就这样吧

python没看多久，代码有冗余的地方，其实主要就是三个函数save_user(),creepy_myself(),creepy_others()

具体的就看代码的注释吧。下载地址里的代码和下面贴出来的一样(代码有冗余，因为要先爬出用户的关注数目，才能计算一共有多少页)

[python]view plaincopy 
#coding=utf8  
  
import urllib2  
import re  
from BeautifulSoup import *  
import MySQLdb  
import sys  
""" 
Login to Sina Weibo with cookie 
setdefaultencoding 用于对中文编码的处理 
"""  
# Python 2 idiom: reload(sys) is called first so that sys.setdefaultencoding
# can be used, then UTF-8 becomes the default codec for implicit str/unicode
# conversions of the Chinese text handled below.
reload(sys)
sys.setdefaultencoding('utf8')

# Cookie string of a logged-in Weibo session (placeholder: paste your own).
COOKIE = '你的cookie'
# Request headers carrying the cookie; handed to every urllib2.Request.
HEADERS = dict(cookie=COOKIE)
# The 10 characters found 4 positions after the first "uid" in the cookie.
UID = COOKIE[COOKIE.find('uid') + 4:][:10]
  
# Open the MySQL connection (`conn`) and cursor (`cur`) used by the rest of
# the module to store the crawled data.
# NOTE(review): host, port and credentials are hard-coded here.
try:
    conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='root',
                           db='weibodata', port=3309, charset='utf8',
                           use_unicode=False)
    cur = conn.cursor()
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    # Without a connection `conn` and `cur` are never bound, so every later
    # database call would fail with a NameError that hides the real cause.
    # Re-raise so the script stops here with the original error.
    raise
  
  
  
def save_user(uuid, uid, name, common):
    """Store one "uuid follows uid" relation and, if new, the followed user.

    Args:
        uuid: id of the following account (stored as relations.uid1).
        uid: id of the followed account (relations.uid2 and users.uid).
        name: display name of the followed account.
        common: introduction text of the followed account.

    setup.ini holds two integers, one per line, that act as the two pointers
    of a queue: ``now`` is the id given to the most recently stored user and
    ``point`` is the id of the user currently being scanned. This function
    advances ``now`` (and rewrites the file) only when a new user row is
    inserted; ``point`` is written back unchanged.

    Uses the module-level ``cur`` and ``conn``.
    """
    # Read both counters and release the file before touching the database,
    # so the handle is closed even if a query raises.
    with open('setup.ini', 'r') as state_file:
        now = int(state_file.readline()) + 1
        point = int(state_file.readline())
    print(now)

    follower_id = str(uuid)
    followed_id = str(uid)

    # Values are passed as query parameters so that quotes or other special
    # characters in names and introductions cannot break the statement.
    count = cur.execute(
        'select * from relations where uid1=%s and uid2=%s',
        (follower_id, followed_id))
    if count == 0:
        cur.execute(
            'insert into relations(uid1,uid2) values(%s,%s)',
            (follower_id, followed_id))
        conn.commit()

    count = cur.execute('select * from users where uid=%s', (followed_id,))
    if count == 0:
        # Decode byte strings explicitly instead of relying on the implicit
        # default-codec conversion, dropping undecodable bytes.
        if isinstance(common, bytes):
            common = common.decode('utf-8', 'ignore')
        # Round-trip through GBK to filter out symbols GBK cannot represent
        # (the special characters that broke the insert), then store UTF-8.
        cs = common.encode('gbk', 'ignore').decode('gbk', 'ignore') \
                   .encode('utf-8', 'ignore')
        cur.execute(
            'insert into users(id,uid,name,common) values(%s,%s,%s,%s)',
            (str(now), followed_id, str(name), cs))
        conn.commit()
        # Persist the advanced "now" pointer together with the old "point".
        with open('setup.ini', 'w') as state_file:
            state_file.write(str(now) + '\n' + str(point))
  
def _save_myfollow_page(text):
    """Save every followed account listed in one "myfollow" page of HTML."""
    marker = ('<script>STK && STK.pageletM && STK.pageletM.view('
              '{"pid":"pl_relation_myf')
    for line in text.splitlines():
        if not line.startswith(marker):
            continue
        n = line.find('html":"')
        if n <= 0:
            continue
        # The list markup is embedded in a script call: take the value after
        # 'html":"', cut the 12 trailing characters and drop the escaping
        # backslashes.
        html = line[n + 7:-12].replace("\\", "")
        soup = BeautifulSoup(html)
        for follow in soup.findAll('div', 'myfollow_list S_line2 SW_fun'):
            name = follow.find('ul', 'info').find('a')['title']
            raw = str(follow)
            # The uid sits between "uid" plus one separator character and the
            # next '&'; search for '&' only after that position.
            start = raw.find('uid') + 4
            followed_uid = raw[start:raw.find('&', start)]
            # The first 6 characters of the intro text are skipped.
            intro = follow.find('div', 'intro S_txt2').contents[0][6:]
            save_user(UID, followed_uid, name, intro)


def creepy_myself():
    """Crawl the follow list of the account the cookie belongs to.

    This seeds the queue: call it once on first use so that there are some
    users to expand from. Every followed account is stored through
    save_user() with the module-level UID as the follower.
    """
    base_url = 'http://weibo.com/' + str(UID) + '/myfollow?t=1&page='

    req = urllib2.Request(base_url + '1', headers=HEADERS)
    text = urllib2.urlopen(req).read()

    # The total number of followed accounts is the number in parentheses
    # inside the "lev2" div of the first page.
    count_text = str(BeautifulSoup(text).find('div', 'lev2'))
    num = int(count_text[count_text.find('(') + 1:count_text.find(')')])
    _save_myfollow_page(text)

    # 30 entries per page (the page size assumed by the original code);
    # round up so that a final, partially filled page is crawled as well.
    page_count = (num + 29) // 30
    for page in range(2, page_count + 1):
        req = urllib2.Request(base_url + str(page), headers=HEADERS)
        _save_myfollow_page(urllib2.urlopen(req).read())
  
                      
                       
def _his_follow_soups(text):
    """Yield a parsed soup for each "hisFollow" pagelet found in a page."""
    marker = ('<script>STK && STK.pageletM && STK.pageletM.view('
              '{"pid":"pl_relation_hisFollow')
    for line in text.splitlines():
        if not line.startswith(marker):
            continue
        n = line.find('html":"')
        if n <= 0:
            continue
        # Take the value after 'html":"', cut the 12 trailing characters and
        # strip the escaped newlines, tabs and remaining backslashes.
        html = line[n + 7:-12].replace("\\n", "")
        html = html.replace("\\t", "")
        html = html.replace("\\", '')
        yield BeautifulSoup(html)


def _save_follows(uid, soup):
    """Save every account listed in `soup` as being followed by `uid`."""
    for follow in soup.findAll('li', 'clearfix S_line1'):
        raw = str(follow)
        # The uid sits between "uid" plus one separator character and the
        # next '&'; search for '&' only after that position.
        start = raw.find('uid') + 4
        followed_uid = raw[start:raw.find('&', start)]
        nick = follow.find('div').find('a')['title']
        # Strip the 18 leading and 6 trailing characters of the rendered
        # "info" div, as the original code did.
        intro = str(follow.find('div', 'info'))[18:-6]
        save_user(uid, followed_uid, nick, intro)


def creepy_others(uid):
    """Crawl the follow list of the user `uid` and store it via save_user().

    The first page is fetched to read how many accounts the user follows;
    that count determines how many further pages are requested.

    Args:
        uid: id of the user whose follow list is scanned.
    """
    base_url = "http://weibo.com/" + str(uid) + "/follow?page="
    req = urllib2.Request(base_url, headers=HEADERS)
    text = urllib2.urlopen(req).read()

    label = '关注了'
    for soup in _his_follow_soups(text):
        # The follow count is the number between the label and '人</div'
        # inside the "patch_title" div.
        title = str(soup.find('div', 'patch_title'))
        num = int(title[title.find(label) + len(label):title.find('人</div')])
        _save_follows(uid, soup)

        # 20 entries per page (the page size assumed by the original code);
        # round up so that a final, partially filled page is crawled as well.
        page_count = (num + 19) // 20
        for page in range(2, page_count + 1):
            page_req = urllib2.Request(base_url + str(page), headers=HEADERS)
            page_text = urllib2.urlopen(page_req).read()
            for page_soup in _his_follow_soups(page_text):
                _save_follows(uid, page_soup)
                         
                                     
  
                
if __name__ == '__main__':
    # On first use the queue has to be seeded: call creepy_myself() once
    # before running this loop.
    #
    # Encoding problems still show up from time to time even though the
    # Chinese text is handled carefully, so a failure while crawling one user
    # is reported and skipped instead of stopping the whole run.
    try:
        while True:
            # setup.ini line 1 is "now" (id of the last stored user), line 2
            # is "point" (tail pointer of the queue). The next user to scan
            # is the one whose id is point + 1.
            with open('setup.ini', 'r') as state_file:
                now = int(state_file.readline())
                point = int(state_file.readline()) + 1

            cur.execute('select uid from users where id=%s', (str(point),))
            row = cur.fetchone()
            if row is None:
                # No stored user has this id: the queue is exhausted. Stop
                # without advancing the pointer past the end.
                break

            # Persist the advanced pointer before crawling, as the original
            # did, so the same user is not picked again on the next pass.
            with open('setup.ini', 'w') as state_file:
                state_file.write(str(now) + '\n' + str(point))

            uuid = row[0]
            if len(uuid) == 10:
                try:
                    creepy_others(uuid)
                except Exception as e:
                    # Best effort: report the failure and move on.
                    print("creepy_others(%s) failed: %r" % (uuid, e))
    finally:
        # Runs on normal completion and on interruption alike.
        cur.close()
        conn.close()

大家可以加我个人微信号：scccdgf
 
 
或者关注soledede的微信公众号：soledede
微信公众号：