使用python备份搜狐博客

mushme

浏览: 797156 次
性别:
来自: 西安

最近访客更多访客>>

mumume123

sker

odpsoft

西红柿炒笨蛋

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

python

博客日落西山，已经是不争的事实了，只怕哪天会停掉的。
用 Python 把旧的内容做下备份，分两步进行。

1.抓取文章列表，将每篇文章的链接保存到数据库中

# -*- coding:utf-8 -*-

import urllib.request
from urllib import request
from bs4 import BeautifulSoup
import sqlite3

# Sub-domain of the Sohu blog to back up -- change this to your own blog name.
domain = "TTTT"
url = "http://" + domain + ".blog.sohu.com/entry/"

# Download the entry index page; the context manager closes the response
# even when read() fails.
with urllib.request.urlopen(url) as urlFile:
    data = urlFile.read()
data = data.decode('utf-8', errors='ignore')
print("get page success")


def _between(text, prefix, terminator):
    """Return the part of *text* between *prefix* and the next *terminator*.

    Raises:
        ValueError: if either marker is missing, instead of silently
            slicing from a bogus ``find() == -1`` offset.
    """
    start = text.find(prefix)
    if start == -1:
        raise ValueError("marker not found in page: " + prefix)
    start += len(prefix)
    stop = text.find(terminator, start)
    if stop == -1:
        raise ValueError("terminator not found after marker: " + prefix)
    return text[start:stop]


# The page embeds the value used in list-page URLs as:  var _ebi = '<value>'
ebi = _between(data, "var _ebi = '", "'")
print("ebi:" + ebi)

# ...and the total number of entries as:  var totalCount = <number>;
totalCount = _between(data, "var totalCount = ", ";")
print("totalcount:" + totalCount)

# Ceiling division by 20 (presumably the number of entries per list page --
# TODO confirm).  Kept as a string because the rest of the script converts
# it with int() where needed.
totalPage = str(-(-int(totalCount) // 20))
print("totalpage:" + totalPage)

    
def getBlogList(pageId):
    """Fetch one page of the blog's entry list and save every link on it.

    Args:
        pageId: Page number as a string; it is concatenated into the URL.

    The page is downloaded, decoded as UTF-8 (undecodable bytes ignored) and
    parsed.  For each ``<div class="newBlog-list-title">`` that contains an
    ``<a>`` element, the anchor's ``href`` is printed and passed to ``save``.
    """
    page_url = "".join([
        "http://", domain, ".blog.sohu.com/action/v_frag-ebi_", ebi,
        "-pg_", pageId, "/entry/",
    ])
    print("get url:" + page_url)

    # Step 1: download the raw HTML of the list page.
    with request.urlopen(page_url) as response:
        markup = response.read().decode('utf-8', errors='ignore')

    # Step 2: pull the entry links out of the title containers.
    parsed = BeautifulSoup(markup, "html.parser")
    for title_div in parsed.find_all('div', {'class': 'newBlog-list-title'}):
        anchor = title_div.a
        if not anchor:
            # Title container without a link: nothing to record.
            continue
        href = anchor.get("href")
        print(href)
        save(href)

def save(link,title=None):
    """Record a blog entry link in the local SQLite database ``blog.db``.

    The ``blog`` table is created on first use.  A new row with ``status`` 0
    is inserted for *link* unless a row with the same link already exists.

    Args:
        link: URL (or path) of the blog entry, as a string.
        title: Optional entry title; stored as an empty string when omitted.
    """
    if title is None:
        title = ""
    conn = sqlite3.connect('blog.db')
    try:
        cursor = conn.cursor()
        # NOTE: "vachar" is a typo in the original schema; it is kept as-is so
        # the statement matches databases that were already created with it.
        cursor.execute('create table IF NOT EXISTS blog (id INTEGER PRIMARY KEY, title varchar(100),link vachar(100),content text,postdate varchar(100),status Integer)')
        # Bound parameters instead of string concatenation: links or titles
        # containing quotes no longer break (or inject into) the SQL.
        cursor.execute('select 1 from blog where link=? limit 1', (link,))
        if cursor.fetchone() is not None:
            # The link was stored by an earlier run.
            print('链接已经存在:'+link)
        else:
            cursor.execute('insert into blog (title, link,status) values (?, ?, 0)', (title, link))
            conn.commit()
            print("save success."+link)
        cursor.close()
        # Final commit kept from the original flow; harmless when nothing is pending.
        conn.commit()
    finally:
        # Always release the connection, even if a statement above failed.
        conn.close()
    
# Crawl every list page, 1..totalPage inclusive, and remember which pages
# failed.  The list is created once, before the loop, so failures accumulate
# across pages and the final report also works when there are no pages.
errorLink = []
for x in range(1, int(totalPage) + 1):
    try:
        getBlogList(str(x))
    except Exception as e:
        # Best-effort crawl: report the failure and continue with the next page.
        print('except:', e)
        errorLink.append(x)
print("errorLink:" + str(errorLink))

2.抓取内容页面，将内容保存到数据库中

# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request
from urllib import request
# 导入SQLite驱动:
import sqlite3

	
def updateContent():
    """Fill in title, post date and content for every stored link with status 0.

    Each pending row of the ``blog`` table in ``blog.db`` is fetched with
    ``getContent``.  On success the row is updated and its ``status`` set to 1.
    A page that cannot be parsed is reported and left at status 0, so a later
    run picks it up again.
    """
    # Marker that closes the article body inside the content container.
    end = "<div class=\"clear\"></div>"
    # Offset kept from the original code; presumably it skips the opening tag
    # of the content container -- TODO confirm against a real page.
    content_start = 45

    conn = sqlite3.connect('blog.db')
    try:
        cursor = conn.cursor()
        # Select the two columns by name rather than relying on the position
        # of columns in ``select *``.
        cursor.execute('select id, link from blog where status=0')
        pending = cursor.fetchall()

        for row_id, link in pending:
            soup = getContent(link)

            try:
                title = soup.find('div', {'class': 'item-body'}).h2.span.get_text()
                postdate = soup.find('span', {'class': 'date'}).get_text()
                content_div = soup.find('div', {'class': 'item-content'})
                if content_div is None:
                    # Do not mark the row as done with empty content.
                    raise ValueError('no item-content element in ' + link)
                content = str(content_div)
                end_at = content.find(end)
                if end_at == -1:
                    # Marker missing: keep the rest of the markup rather than
                    # letting a -1 index chop off the final character.
                    end_at = len(content)
                content = content[content_start:end_at]

                print(link)
                cursor.execute('update blog set title=?,content=?,status=1,postdate=? where id=?',(title,content, postdate,row_id))
                conn.commit()
            except Exception as e:
                # Best effort: report the failure and move on to the next link.
                print('except:', e)
        cursor.close()
        conn.commit()
    finally:
        # Release the connection even if fetching or parsing raised unexpectedly.
        conn.close()


#根据链接获取内容
def getContent(link):
    """Download *link* and return its HTML parsed as a ``BeautifulSoup`` tree.

    Args:
        link: URL of the page to fetch.

    Returns:
        The parsed document.  If the download fails, the error is printed and
        the tree is built from an empty string, so lookups on it find nothing.
    """
    # Step 1: download the page.
    html_doc = ""
    # Send a browser-like User-Agent and a Referer; per the original author's
    # note these two headers were taken from captured browser traffic.
    headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0',
           'Referer' : link}

    try:
        # Named ``req`` so it does not shadow the imported ``request`` module.
        req = urllib.request.Request(link, None, headers)
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(req) as response:
            html_doc = response.read()
    except Exception as e:
        # Best effort: report the failure and fall through with empty HTML.
        print('except:', e)

    # Step 2: parse whatever was downloaded.
    soup = BeautifulSoup(html_doc,"html.parser")
    return soup

# Script entry point: fetch every stored entry that has no content yet and fill its content in.
updateContent()

分享到：

入门一门语言的顺序 | 使用python从360doc上抓取内容

2016-02-24 15:17
浏览 548
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

使用python备份搜狐博客

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

使用python备份搜狐博客

评论

发表评论

相关推荐

useragent

整理一个python工具类

scrapy抓取cnblog新闻

scrapy抓取dmoz内容

安装scrapy

获取天气预报的接口

选择一个好的驾校，用数据说话，我用python

python版wobot

运用百度语音识别来读文字

多线程下载cnblog新闻图片

使用python从360doc上抓取内容

python连接telnet

获取可用的代理服务器

每日自动下载bing背景图片做桌面之python

python数据抓取

用python自动登录iteye

最近访客更多访客>>