【Python真的很强大】使用scrapy爬取百度贴吧-上海吧

Ihavegotyou

浏览: 238413 次
性别:
来自: 深圳

最近访客更多访客>>

skynothing

zjfmail

jackyin5918

waxuanxuan

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

python

scrapy

需求是这样的：需要获取贴吧最近20天的主题帖以及直接回复（过滤掉“回复的回复”），并把数据输出到MySQL。这里以百度贴吧-上海吧为例子。

上海吧的结构如下所示，主题帖列表和帖子内的回复都带有分页：

subject

post

定义全局变量(settings.py)：

# -*- coding: utf-8 -*-

# Scrapy settings for tieba project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'tieba'

SPIDER_MODULES = ['tieba.spiders']
NEWSPIDER_MODULE = 'tieba.spiders'

# First listing page of the forum to crawl; ``kw`` is the URL-encoded forum
# name (here the Shanghai forum).
START_URL = 'http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7'
# Alternative target (Shanghai Jiao Tong University forum):
#START_URL = 'http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6'

# Only threads created within this many days are stored. Kept as a string
# for backward compatibility; the pipeline converts it with int().
TOTAL_DAYS = "20"

# Pipeline path -> order. The dict form is the documented one; passing a
# plain list is deprecated in Scrapy.
ITEM_PIPELINES = {'tieba.pipelines.MySQLDBPipeline': 300}

# Connection parameters read by tieba.pipelines.MySQLDBPipeline.
MySQL_SERVER = "localhost"
MySQL_SERVER_PORT = 3306
MySQL_SERVER_DB = "tieba"
MySQL_SERVER_USER = "mysql"
MySQL_SERVER_PWD = "xyz"



# NOTE: this sends a desktop Firefox User-Agent instead of one identifying
# the bot.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:35.0) Gecko/20100101 Firefox/35.0'

数据抓取部分（TiebaSpider.py，目前只完成了主题帖的解析，回复内容的解析还未完成）：

#coding=utf-8
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.selector import HtmlXPathSelector
from tieba.items import SubjectItem
from tieba.items import CommentItem
from tieba import settings
import scrapy
import json

class TiebaSpider(CrawlSpider):
    """Spider for the topic threads of one Baidu Tieba forum.

    The listing pages in ``start_urls`` are fetched first; every link on
    them matching ``/p/<digits>`` is passed to ``parse_subject_shanghai``.
    Reply parsing (``parse_comments_shanghai``) is still a stub.
    """

    name = 'tieba'
    # NOTE: promoted (ad) threads currently appear to live on other domains,
    # so restricting the domain also filters them out of the topic list.
    allowed_domains = ['tieba.baidu.com']
    # Assumes fewer than 1000 * 50 topic threads within the wanted period.
    # Adjust the page count, or derive it from the thread dates on the
    # listing pages, if that assumption does not hold.
    start_urls = [settings.START_URL] + [
        settings.START_URL + "&pn=" + str(page * 50) for page in range(1, 1001)
    ]
    # Only topic threads are extracted from the listing pages.
    rules = [Rule(LinkExtractor(allow=[r'/p/\d+']), 'parse_subject_shanghai')]

    def parse_subject_shanghai(self, response):
        """Parse the first page of a topic thread.

        Yields one request per additional reply page (pages 2 up to and
        including the last one, handled by ``parse_comments_shanghai``) and
        then the ``SubjectItem`` itself.

        If the subject cannot be parsed, the failure is logged, the item is
        yielded with ``id`` set to ``None`` and no reply pages are requested.
        """
        torrent = SubjectItem()
        try:
            self._fill_subject(torrent, response)
        except Exception as exc:
            # Best effort: a page with an unexpected layout must not abort
            # the crawl. Downstream code skips items whose id is None.
            self.log("Could not parse subject %s: %r" % (response.url, exc))
            torrent['id'] = None
            yield torrent
            return

        # The subject is complete at this point; pagination is handled
        # separately so that a missing pager does not discard the subject.
        try:
            total_comment_pages = int(self._first(
                response,
                "//div[@id='thread_theme_5']/div[1]/ul/li[2]/span[2]/text()"))
        except (IndexError, ValueError) as exc:
            self.log("Could not read reply page count of %s: %r"
                     % (response.url, exc))
            total_comment_pages = 1

        # Page 1 is this response; "+ 1" makes the last page inclusive.
        for page in range(2, total_comment_pages + 1):
            url = torrent['url'] + ("?pn=%s" % page)
            yield scrapy.Request(url=url, callback=self.parse_comments_shanghai)

        yield torrent

    def _first(self, response, query):
        """Return the first string matched by XPath *query*; IndexError if none."""
        return response.xpath(query).extract()[0]

    def _fill_subject(self, torrent, response):
        """Populate *torrent* from a thread page; raises if a node is missing."""
        torrent['url'] = response.url
        torrent['id'] = response.url.split('/p')[1].split('/')[1].split('?')[0]
        torrent['commentNum'] = self._first(
            response, "//*[@id='thread_theme_5']/div[1]/ul/li[2]/span[1]/text()")
        # Locating the content by id did not work here; one possible reason
        # is the custom <cc> tag wrapping it.
        torrent['content'] = self._first(response, "//*/cc/div/text()")
        # Much of the information is not in the HTML source but is generated
        # client-side by JS, so it is read from the data-field JSON attribute.
        # json.loads takes the extracted text directly; wrapping it in str()
        # could fail on non-ASCII characters under Python 2.
        data_field = json.loads(self._first(
            response, "//*[@id='j_p_postlist']/div[1]/@data-field"))
        torrent['created'] = data_field['content']['date'].strip() + ":00"
        torrent['title'] = self._first(
            response, "//*[@id='j_core_title_wrap']/div/h1/text()")
        torrent['tiebaName'] = self._first(
            response,
            "//*[@id='container']/div/div[1]/div[2]/div[2]/a/text()").strip()
        torrent['authorName'] = self._first(
            response, "//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[3]/a/text()")
        torrent['authorUrl'] = self._first(
            response, "//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[3]/a/@href")
        torrent['authorAvatar'] = self._first(
            response,
            "//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[1]/div/a/img/@src")
        if "http://tieba.baidu.com" not in torrent['authorUrl']:
            torrent['authorUrl'] = "http://tieba.baidu.com" + torrent['authorUrl']

        # Keep at most the first three images of the opening post and pad
        # the remaining slots with empty strings.
        subject_post_div = response.xpath("//*/cc/div")[0]
        imgs = subject_post_div.xpath(".//img/@src").extract()[:3]
        imgs += [''] * (3 - len(imgs))
        torrent['image1'], torrent['image2'], torrent['image3'] = imgs

    def parse_comments_shanghai(self, response):
        """Stub for reply pages: log each reply node, return no items yet."""
        items = []
        try:
            j_p_postlist = response.xpath("//div[@id='j_p_postlist']").xpath(
                ".//div[@class='l_post l_post_bright ']")
            self.log("%s: found %d reply nodes"
                     % (response.url, len(j_p_postlist)))
            for child_node in j_p_postlist:
                self.log(child_node.extract())
        except Exception as exc:
            # Best effort, as in the subject parser: log and carry on.
            self.log("Could not parse replies of %s: %r" % (response.url, exc))
        return items

数据存取部分（pipelines.py，只完成主题帖）：

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import settings
from scrapy import log
import traceback
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from datetime import datetime

def strtodatetime(datestr,format):
    """Parse *datestr* with the ``strptime`` pattern *format*.

    Returns a ``datetime``; raises ``ValueError`` when the text does not
    match the pattern.
    """
    parsed = datetime.strptime(datestr, format)
    return parsed

class MySQLDBPipeline(object):
    """Item pipeline that upserts recent subject items into MySQL.

    Rows go to the ``tieba_articles`` table through a Twisted ``adbapi``
    connection pool, so the database work runs outside the reactor thread.
    """

    def __init__(self):
        # Format of the item's 'created' value.
        self.date_time_format = "%Y-%m-%d %H:%M:%S"
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                    host = settings.MySQL_SERVER,
                    db = settings.MySQL_SERVER_DB,
                    port = settings.MySQL_SERVER_PORT,
                    user = settings.MySQL_SERVER_USER,
                    passwd = settings.MySQL_SERVER_PWD,
                    cp_reconnect = True,
                    cursorclass = MySQLdb.cursors.DictCursor,
                    charset = 'utf8',
                    use_unicode = True)

    def process_item(self, item, spider):
        """Schedule the insert in the thread pool and pass *item* on unchanged."""
        deferred = self.dbpool.runInteraction(self._conditional_insert, item)
        deferred.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        """Insert or update *item* if it has an id and is recent enough.

        Items without ``id`` or ``created`` are skipped, and so are items
        older than ``settings.TOTAL_DAYS`` days. For an existing primary key
        only ``commentNum`` is refreshed.
        """
        if not (item.get('id') and item.get('created')):
            return

        post_day = strtodatetime(item.get('created'), self.date_time_format)
        # Only data from the last TOTAL_DAYS days is stored.
        if (datetime.now() - post_day).days > int(settings.TOTAL_DAYS):
            return

        args = (item['id'],
                item['title'],
                item['url'],
                item['tiebaName'],
                item['authorName'],
                item['authorUrl'],
                item['authorAvatar'],
                item['content'],
                item['created'],
                item['image1'],
                item['image2'],
                item['image3'],
                item['commentNum'],
                item['commentNum'],  # value for the ON DUPLICATE KEY clause
                )

        # The values are scraped forum text, so the driver must escape them.
        # Interpolating them into the statement breaks on any quote
        # character and allows SQL injection.
        sql = '''insert into tieba_articles(id, title, url, tiebaName, authorName, authorUrl, authorAvatar,content,created,image1,image2,image3,commentNum)
                 VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE commentNum = %s
              '''

        tx.execute(sql, args)

    def handle_error(self, e):
        """Errback for failed database interactions: log the failure."""
        log.err(e)

忘记了，补上数据结构部分(items.py):

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
class SubjectItem(scrapy.Item):
    """Fields scraped for one topic thread (its opening post)."""

    # Thread identity.
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    tiebaName = scrapy.Field()

    # Opening-post body and metadata.
    content = scrapy.Field()
    created = scrapy.Field()
    commentNum = scrapy.Field()

    # Author of the opening post.
    authorName = scrapy.Field()
    authorUrl = scrapy.Field()
    authorAvatar = scrapy.Field()

    # Image slots for the opening post.
    image1 = scrapy.Field()
    image2 = scrapy.Field()
    image3 = scrapy.Field()
    
    
    
class CommentItem(scrapy.Item):
    """Fields declared for one reply to a topic thread."""

    # Link back to the thread (presumably SubjectItem.id, to be confirmed)
    # and, presumably, the reply's position within it.
    article_id = scrapy.Field()
    index = scrapy.Field()

    # Reply body and timestamp.
    content = scrapy.Field()
    created = scrapy.Field()

    # Author of the reply.
    authorName = scrapy.Field()
    authorUrl = scrapy.Field()
    authorAvatar = scrapy.Field()

总结：scrapy定义了清晰的类层次结构，使得开发者只需要关注业务逻辑本身。对于分页数据处理，可以使用两种模式：1) 把已知的所有url预先添加到start_urls列表中（如上面的主题帖列表页）；2) 在回调函数中使用yield scrapy.Request(url, callback=...)动态生成后续请求（如上面的回复分页）。

数据解析可以使用scrapy自带的XPath选择器，也可以选用其他第三方模块，如BeautifulSoup。