`

python 爬虫小结2

 
阅读更多
1 lxml 是比 BeautifulSoup 速度更快的解析库,使用的是 XPath 语法,来个例子:
from lxml import etree
import requests
import csv

# Crawl the douban book Top250 list (10 pages of 25) and save one CSV row
# per book: name, url, author, publisher, date, price, rate, first comment.
# `with` guarantees the CSV file is flushed and closed even if a request fails
# (the original relied on a trailing fp.close() that an exception would skip).
with open('d://doubanbook.csv', 'wt', newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    writer.writerow(('name', 'url',  'author', 'publisher', 'date', 'price', 'rate', 'comment'))

    # start=0,25,50,...,225 — douban paginates 25 books per page
    urls = ['https://book.douban.com/top250?start={}'.format(str(i)) for i in range(0, 250, 25)]

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }

    for url in urls:
        html = requests.get(url, headers=headers)
        selector = etree.HTML(html.text)
        infos = selector.xpath('//tr[@class="item"]')
        for info in infos:
            name = info.xpath('td/div/a/@title')[0]
            # renamed from `url` — the original clobbered the outer loop variable
            book_url = info.xpath('td/div/a/@href')[0]
            # split once instead of re-splitting for every field; the info line
            # is "author / publisher / date / price" (extra '/' in multi-author
            # entries only affects the author field thanks to negative indexing)
            fields = info.xpath('td/p/text()')[0].split('/')
            author = fields[0]
            publisher = fields[-3]
            date = fields[-2]
            price = fields[-1]
            rate = info.xpath('td/div/span[2]/text()')[0]
            comments = info.xpath('td/p/span/text()')
            comment = comments[0] if comments else "空"
            writer.writerow((name, book_url, author, publisher, date, price, rate, comment))

注意写成 CSV 后,需先用记事本打开并另存为 UTF-8 编码,Excel 才能正常打开显示;

2) 针对 EXCEL 的写入(用 xlwt 把抓取结果保存为 .xls 文件)
import xlwt
import requests
from lxml import etree
import time

# Accumulates one [title, author, style, complete, introduce, word] row per book.
all_info_list = []

def get_info(url):
    """Scrape one qidian.com listing page and append each book's fields
    to the module-level all_info_list. Sleeps 5s afterwards to be polite."""
    response = requests.get(url)
    page = etree.HTML(response.text)
    for node in page.xpath('//ul[@class="all-img-list cf"]/li'):
        title = node.xpath('div[2]/h4/a/text()')[0]
        author = node.xpath('div[2]/p[1]/a[1]/text()')[0]
        # the two genre links joined as "主分类·子分类"
        genre_main = node.xpath('div[2]/p[1]/a[2]/text()')[0]
        genre_sub = node.xpath('div[2]/p[1]/a[3]/text()')[0]
        style = genre_main + '·' + genre_sub
        complete = node.xpath('div[2]/p[1]/span/text()')[0]
        introduce = node.xpath('div[2]/p[2]/text()')[0].strip()
        # word count with the trailing "万字" unit stripped
        word = node.xpath('div[2]/p[3]/span/span/text()')[0].strip('万字')
        all_info_list.append([title, author, style, complete, introduce, word])
    time.sleep(5)

if __name__ == '__main__':
    # only page 1 here; widen the range to crawl more pages
    urls = ['http://a.qidian.com/?page={}'.format(str(i)) for i in range(1, 2)]
    for url in urls:
        get_info(url)
    # dump all collected rows into an .xls workbook via xlwt
    header = ['title', 'author', 'style', 'complete', 'introduce', 'word']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for col, title in enumerate(header):
        sheet.write(0, col, title)
    # enumerate replaces the hand-rolled i/j counters; start=1 leaves row 0
    # for the header. (The original loop variable `list` shadowed the builtin.)
    for row, info_list in enumerate(all_info_list, start=1):
        for col, value in enumerate(info_list):
            sheet.write(row, col, value)
    book.save('xiaoshuo.xls')

3  selenium 和phantomjs 配合使用,比如登录网页
     from selenium import webdriver
# Log in to douban with a headless PhantomJS browser, then dump the page HTML.
driver = webdriver.PhantomJS()
driver.get('https://www.douban.com/')
# implicit wait: each find_element_* below polls up to 10s for its element
driver.implicitly_wait(10)
driver.find_element_by_id('form_email').clear()
driver.find_element_by_id('form_email').send_keys('用户名')
driver.find_element_by_id('form_password').clear()
driver.find_element_by_id('form_password').send_keys('密码')
# submit the login form; page_source afterwards shows the logged-in page
driver.find_element_by_class_name('bn-submit').click()
print(driver.page_source)
     也可以轻松处理 AJAX 动态加载的页面,无需对接口做逆向工程,
    比如爬QQ空间的说说:
from selenium import webdriver
import time
import csv
#import pymongo

#client = pymongo.MongoClient('localhost', 27017)
#mydb = client['mydb']
#qq_shuo = mydb['qq_shuo']

driver = webdriver.PhantomJS()
driver.maximize_window()

def get_info(qq):
    """Open one QQ-zone "说说" page, log in if the login iframe is shown,
    and print every visible post (time + content pairs).

    qq: the QQ number, interpolated into the qzone URL.
    """
    driver.get('http://user.qzone.qq.com/{}/311'.format(qq))
    driver.implicitly_wait(10)
    # presence of 'login_div' means we were redirected to the login page.
    # `except Exception` replaces the original bare except, which would even
    # swallow KeyboardInterrupt/SystemExit.
    try:
        driver.find_element_by_id('login_div')
        need_login = True
    except Exception:
        need_login = False
    if need_login:
        # the login form lives inside an iframe; switch before interacting
        driver.switch_to.frame('login_frame')
        driver.find_element_by_id('switcher_plogin').click()
        driver.find_element_by_id('u').clear()
        driver.find_element_by_id('u').send_keys('XXXX')
        driver.find_element_by_id('p').clear()
        driver.find_element_by_id('p').send_keys('XXXX')
        driver.find_element_by_id('login_button').click()
        time.sleep(5)
    driver.implicitly_wait(3)
    # 'QM_OwnerInfo_Icon' only renders when the profile is accessible to us
    try:
        driver.find_element_by_id('QM_OwnerInfo_Icon')
        profile_visible = True
    except Exception:
        profile_visible = False
    if profile_visible:
        driver.switch_to.frame('app_canvas_frame')
        contents = driver.find_elements_by_css_selector('.content')
        times = driver.find_elements_by_css_selector('.c_tx.c_tx3.goDetail')
        for content, tim in zip(contents, times):
            data = {
                'time': tim.text,
                'content': content.text
            }
            print(content.text)
         #   qq_shuo.insert_one(data)

if __name__ == '__main__':
    qq_lists = []
    # QQ numbers are the local part of each exported mail address
    # (user@qq.com -> user); `with` closes the file even if a row is malformed
    with open('C:/Users/lyr/Downloads/QQmail.csv') as fp:
        reader = csv.DictReader(fp)
        for row in reader:
            qq_lists.append(row['电子邮件'].split('@')[0])
    for item in qq_lists:
        get_info(item)
  

4 from selenium import webdriver
from lxml import etree
import time
#import pymongo

#client = pymongo.MongoClient('localhost', 27017)
#mydb = client['mydb']
#taobao = mydb['taobao']

# Shared headless browser used by get_info/NextPage below
driver = webdriver.PhantomJS()
driver.maximize_window()

def get_info(url, page):
    """Parse one taobao search-result page rendered by the shared driver,
    print each item, then recurse via NextPage until page 50.

    url:  the search-result URL to load.
    page: the page counter; incremented on entry, recursion stops past 50.
    """
    page = page + 1
    driver.get(url)
    driver.implicitly_wait(10)
    # parse the JS-rendered DOM with lxml rather than selenium selectors
    selector = etree.HTML(driver.page_source)
    infos = selector.xpath('//div[@class="item J_MouserOnverReq  "]')
    for info in infos:
        data = info.xpath('div/div/a')[0]
        # string(.) concatenates all text nodes of the title link
        goods = data.xpath('string(.)').strip()
        price = info.xpath('div/div/div/strong/text()')[0]
        sell = info.xpath('div/div/div[@class="deal-cnt"]/text()')[0]
        shop = info.xpath('div[2]/div[3]/div[1]/a/span[2]/text()')[0]
        address = info.xpath('div[2]/div[3]/div[2]/text()')[0]
        print(goods)
        print(price)

        commodity = {
            'good':goods,
            'price':price,
            'sell':sell,
            'shop':shop,
            'address':address
        }
      #  taobao.insert_one(commodity)

    # dropped the original no-op `else: pass` branch
    if page <= 50:
        NextPage(url, page)

def NextPage(url, page):
    """Click the "next page" button on the current result page, then hand the
    new URL back to get_info (mutual recursion drives the pagination)."""
    driver.get(url)
    driver.implicitly_wait(10)
    # click the next-page control (the original had a C-style `//` comment
    # here, which is a Python SyntaxError)
    driver.find_element_by_xpath('//a[@trace="srp_bottom_pagedown"]').click()
    time.sleep(4)
    driver.get(driver.current_url)
    driver.implicitly_wait(10)
    get_info(driver.current_url, page)

if __name__ == '__main__':
    page = 1
    url = 'https://www.taobao.com/'
    driver.get(url)
    driver.implicitly_wait(10)
    # type the keyword into the search box ('q') and submit; crawling then
    # starts from whatever result URL the click lands on
    driver.find_element_by_id('q').clear()
    driver.find_element_by_id('q').send_keys('男士短袖')
    driver.find_element_by_class_name('btn-search').click()
    get_info(driver.current_url,page)

4 scrapy快速使用
    在某个目录下,可以scrapy startproject 项目名
   
然后要抓取的项,写在items.py 中
  from scrapy.item import Item,Field

class XiaozhuItem(Item):
    """Scrapy item holding the fields scraped from one xiaozhu.com listing."""
    title= Field()       # listing title
    address = Field()    # listing address
    price = Field()      # nightly price
    lease_type = Field() # e.g. entire place / single room
    suggestion = Field() # suggested number of guests
    bed = Field()        # bed description
  
  然后在spiders目录下新建立文件:
   from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from xiaozhu.items import XiaozhuItem

class xiaozhu(CrawlSpider):
    """Spider that scrapes a single xiaozhu.com listing page into a XiaozhuItem.

    NOTE(review): overriding parse() on CrawlSpider disables its rule-based
    link following; for a single fixed URL, scrapy.Spider would be the
    conventional base class — confirm before changing.
    """
    name = 'xiaozhu'
    start_urls = ['http://bj.xiaozhu.com/fangzi/6937392816.html']

    def parse(self, response):
        # extract()[0] raises IndexError when an XPath matches nothing,
        # so a layout change fails loudly rather than yielding bad data
        item = XiaozhuItem()
        selector = Selector(response)
        title = selector.xpath('//h4/em/text()').extract()[0]
        address = selector.xpath('//p/span[@class="pr5"]/text()').extract()[0].strip()
        price = selector.xpath('//*[@id="pricePart"]/div[1]/span/text()').extract()[0]
        lease_type = selector.xpath('//*[@id="introduce"]/li[1]/h6/text()').extract()[0]
        suggestion = selector.xpath('//*[@id="introduce"]/li[2]/h6/text()').extract()[0]
        bed = selector.xpath('//*[@id="introduce"]/li[3]/h6/text()').extract()[0]

        item['title'] = title
        item['address'] = address
        item['price'] = price
        item['lease_type'] = lease_type
        item['suggestion'] = suggestion
        item['bed'] = bed

        yield item

对于抓取后的字段保存和处理,使用pipeline:
class XiaozhuPipeline(object):
    """Append each scraped item to a plain-text file, one field per line."""

    def process_item(self, item, spider):
        # `with` closes the handle after every item — the original opened the
        # file anew per item and never closed it, leaking file handles
        with open('d:/xiaozhu.txt', 'a+') as fp:
            fp.write(item['title']+'\n')
            fp.write(item['address']+'\n')
            fp.write(item['price'] + '\n')
            fp.write(item['lease_type'] + '\n')
            fp.write(item['suggestion'] + '\n')
            fp.write(item['bed'] + '\n')
        return item

    最后进行设置:
ITEM_PIPELINES = {'xiaozhu.pipelines.XiaozhuPipeline':300}
  可以搞个MAIN程序,就可以不在命令行下运行了,在spiders目录下,设置
main.py
   from scrapy import cmdline
# equivalent to running "scrapy crawl xiaozhu" from the command line
cmdline.execute("scrapy crawl xiaozhu".split())
5  如果要scrapy 设置请求头和导出CSV,可以
  设置settings.py:
ROBOTSTXT_OBEY = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
DOWNLOAD_DELAY=4
FEED_URI = 'file:d:/photo/zhuanti.csv'
FEED_FORMAT = 'csv'
  

6 scrapy后, 保存到MYSQL文件中

import pymysql
class JianshuitPipeline(object):
    """Persist scraped jianshu items into the MySQL table `jianshu1`."""

    def __init__(self):
        # keep a reference to the connection as well as the cursor — the
        # original discarded the connection object and reached it back only
        # through cursor.connection
        self.conn = pymysql.connect(host='localhost', user='root', passwd='123456', db='mydb', port=3306, charset='utf8')
        self.post = self.conn.cursor()

    def process_item(self, item, spider):
        cursor = self.post
        cursor.execute("use mydb")
        # parameterized placeholders — pymysql escapes the values, so item
        # text cannot inject SQL
        sql = "insert into jianshu1 (user,time,title,view,comment,lik,gain) values(%s,%s,%s,%s,%s,%s,%s)"
        cursor.execute(sql,(item['user'],item['time'],item['title'],item['view'],item['comment'],item['like'],item['gain']))
        self.conn.commit()
        return item

分享到:
评论

相关推荐

    Python爬虫总结 (2).pdf

    Python爬虫总结 (2).pdfPython爬虫总结 (2).pdfPython爬虫总结 (2).pdfPython爬虫总结 (2).pdfPython爬虫总结 (2).pdfPython爬虫总结 (2).pdfPython爬虫总结 (2).pdfPython爬虫总结 (2).pdfPython爬虫总结 (2).pdf

    Python爬虫教学视频-最全的Python爬虫视频教程全集

    本Python爬虫教学视频,全集共51天课程,整套课程以Python语言为核心,通过各种经典案例的讲解,很好的演示了python爬虫如何抓取数据的全过程,非常值得Python爬虫工程师和想掌握python爬虫技术的同学借鉴学习。...

    最全的Python爬虫视频教程全集

    Python爬虫实战学习day2 1response网络详细信息 2agent代{过}{滤】理解决网站屏蔽 3agent也可以冒充手机或者ipad浏览器 4get模拟百度 5get模拟智联招聘 6get小结 7post通信 8postcgi 9get与post小结 10综合应用模拟...

    Python网络爬虫实习报告总结归纳.docx

    Python网络爬虫是一种用于自动化获取网页内容的技术,广泛应用于互联网数据采集、数据分析和信息监控等领域。在Python中,有许多强大的库和框架可以帮助开发者构建高效、稳定的爬虫程序。 一、选题背景 随着互联网...

    Python爬虫总结材料.pdf

    Python爬虫总结材料.pdfPython爬虫总结材料.pdfPython爬虫总结材料.pdfPython爬虫总结材料.pdfPython爬虫总结材料.pdfPython爬虫总结材料.pdfPython爬虫总结材料.pdfPython爬虫总结材料.pdfPython爬虫总结材料.pdf

    基于python爬虫的中国疫情数据可视化分析

    总结来说,这个项目涵盖了Python爬虫技术的应用,包括网页数据的抓取、清洗、存储以及数据分析和可视化。通过实践,不仅可以提升编程技能,还能增强对疫情数据的理解,为疫情防控提供科学支持。对于初学者,这是一个...

    python爬虫大作业报告+代码

    所学Python技术设计并实现一个功能完整的系统,并撰写总结报告。 要求: (1)实现时需要至少使用图形界面、多线程、文件操作、数据库编程、网页爬虫、统计 分析并绘图(或数据挖掘)六项技术,缺一不可。少一项则...

    Python爬虫总结教学提纲.pdf

    Python爬虫总结教学提纲.pdfPython爬虫总结教学提纲.pdfPython爬虫总结教学提纲.pdfPython爬虫总结教学提纲.pdfPython爬虫总结教学提纲.pdfPython爬虫总结教学提纲.pdfPython爬虫总结教学提纲.pdfPython爬虫总结教学...

    Python爬虫总结 (3).pdf

    Python爬虫总结 (3).pdfPython爬虫总结 (3).pdfPython爬虫总结 (3).pdfPython爬虫总结 (3).pdfPython爬虫总结 (3).pdfPython爬虫总结 (3).pdfPython爬虫总结 (3).pdfPython爬虫总结 (3).pdfPython爬虫总结 (3).pdf

    python的爬虫技术归纳总结

    +python爬虫知识点总结 个人学习的历程和知识点的总结。其中包括内容比较丰富

    Python-python爬虫由浅入深

    《Python Python爬虫由浅入深》 Python作为一门易学且功能强大的编程语言,尤其在Web爬虫领域,有着广泛的应用。Web爬虫是一种自动提取网页信息的程序,它能够帮助我们批量获取网络上的数据,进行数据分析、信息...

    Python爬虫总结.pdf

    Python爬虫总结.pdf

    python 爬虫爬取简历

    总结起来,Python爬虫爬取简历模板涉及的主要知识点包括:使用`requests`库进行HTTP请求,使用BeautifulSoup或Scrapy解析和提取HTML内容,理解HTML结构并定位目标元素,处理分页和登录验证,以及注意网络爬虫的道德...

    Python爬虫总结.rar

    Python爬虫总结.rar

    Python爬虫总结.docx

    Python爬虫总结.docxPython爬虫总结.docxPython爬虫总结.docxPython爬虫总结.docxPython爬虫总结.docxPython爬虫总结.docxPython爬虫总结.docxPython爬虫总结.docx

    python爬虫小实例.docx

    ### Python爬虫小实例知识点详解 #### 一、Python爬虫简介及应用场景 Python作为一种流行的编程语言,在数据抓取方面有着广泛的应用。Python爬虫主要应用于数据采集、数据分析、搜索引擎优化等多个领域。对于初学...

    Python爬虫实战+数据分析+数据可视化.zip

    在这个"Python爬虫实战+数据分析+数据可视化.zip"的压缩包中,包含了一个名为“nba-master”的项目,我们可以推测这是一个关于利用Python进行NBA篮球数据的爬取、分析和可视化的实例。 首先,让我们深入了解一下...

    Python爬虫情况总结.pdf

    Python爬虫情况总结.pdfPython爬虫情况总结.pdfPython爬虫情况总结.pdfPython爬虫情况总结.pdfPython爬虫情况总结.pdfPython爬虫情况总结.pdfPython爬虫情况总结.pdfPython爬虫情况总结.pdfPython爬虫情况总结.pdf

    Python爬虫总结教学提纲.docx

    ### Python爬虫总结教学知识点详解 #### 一、Python爬虫概述 Python作为一种高级编程语言,因其简洁易读的语法特性、丰富的第三方库资源及强大的社区支持,成为了编写网络爬虫程序的首选语言之一。本教学提纲旨在...

Global site tag (gtag.js) - Google Analytics