用Python写的抓取天气预报程序

wangdei

浏览: 379426 次

最近访客更多访客>>

u012363178

陈民刚

limuzi13

halloffame

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

长见识的东西

Python HTML MySQL SQL Windows

最近用 Java 写网站有点累了，发一些写于一年前的 Python 代码，这些代码一直没有用在实际系统中。不知道针对现在的天气预报网站是否仍然有效，不过对各位应该有很大的参考价值。

使用BeautifulSoup做HTML分析。

抓取最近的5天数据，并保存到mysql数据库中。

如果出现处理失败，会向指定的邮件地址，发送报警。这是一个比较完善的天气预报抓取程序。

#! /usr/bin/env python

# -*- coding: utf-8 -*-

"""

TODO: add a field that records whether an update succeeded; on success record the ID and perform the update once the run has finished. Otherwise send an SMS alert. Make sure the alert is sent only once.

"""

import os,urllib2,re,MySQLdb,datetime,time,smtplib

from BeautifulSoup import BeautifulSoup

from StringIO import StringIO

from email.mime.text import MIMEText

# Default User-Agent header value used by getFiveDayWeather().
# NOTE(review): the literal starts with a space character - confirm that is intended.
USER_AGENT = ' Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1'

# A forecast page URL is built as BASE_URL_BEGIN + pageid + BASE_URL_END.
BASE_URL_BEGIN= 'http://www.weather.com.cn/html/weather/'

BASE_URL_END = '.shtml'

# Shared MySQL connection, opened as a side effect of loading this module.
# Used by insertDB() and by the __main__ script body.
conn = MySQLdb.connect(host= "localhost" , user= "fun" , passwd= "fun" , db= "fun" ,use_unicode= 1 , charset= 'utf8' )

# Mail: recipient addresses for the alert e-mail

mailto_list=[ "ealpha.shi@mobimtech.com" ]

# SMTP server, login user name, password, and the mailbox domain suffix
# (send_mail() builds the From address as mail_user@mail_postfix).

mail_host= "imichat.com"

mail_user= "imichat"

mail_pass= "imichat"

mail_postfix= "imichat.com"

# WIDs whose processing failed; sendMonitor() sends an alert mail only when
# this list is non-empty.

faultwid = []

# Retry counter.
# NOTE(review): doworking() takes its own `dotime` parameter, and nothing in
# the code visible here reads this module-level value - confirm it is needed.

dotime = 0

def send_mail(to_list,sub,content):
    """Send a plain-text e-mail through the configured SMTP server.

    The sender address is built from the module-level ``mail_user`` and
    ``mail_postfix``; the server and credentials come from ``mail_host``,
    ``mail_user`` and ``mail_pass``.

    Args:
        to_list: list of recipient addresses. It must be a list (or other
            iterable of addresses), not a bare string, because it is joined
            to build the ``To`` header.
        sub: subject line.
        content: message body text.

    Returns:
        True when the message was accepted by the server, False when
        connecting, logging in or sending raised (the error is printed).

    Example:
        send_mail(["aaa@126.com"], "sub", "content")
    """
    me = mail_user + "<" + mail_user + "@" + mail_postfix + ">"
    msg = MIMEText(content)
    msg['Subject'] = sub
    msg['From'] = me
    # Address lists in a To header are comma-separated (RFC 5322); the
    # semicolon form is only understood by some mail clients.
    msg['To'] = ", ".join(to_list)
    try:
        s = smtplib.SMTP()
        try:
            s.connect(mail_host)
            s.login(mail_user, mail_pass)
            s.sendmail(me, to_list, msg.as_string())
        finally:
            # Release the socket even when connect/login/sendmail fails.
            s.close()
        return True
    except Exception as e:
        # Best effort: alerting must never crash the caller.
        print(str(e))
        return False

def getFiveDayWeather(wid,pageid,agent=USER_AGENT):
    """Download one forecast page and hand its forecast section on for parsing.

    The page URL is ``BASE_URL_BEGIN + pageid + BASE_URL_END``. The element
    ``<div id="dd_0">`` is located and the children of its parent are passed
    to ``getWeatherList`` together with ``wid``.

    Args:
        wid: identifier forwarded unchanged to ``getWeatherList``.
        pageid: page id string inserted into the URL.
        agent: value of the ``User-Agent`` request header.

    Returns:
        The ``contents`` list of the parent of the ``dd_0`` div.

    Raises:
        ValueError: the page has no ``<div id="dd_0">``.
        Any error raised by urllib2 while fetching, or by the parsing and
        storing steps further down the call chain.
    """
    url = BASE_URL_BEGIN + pageid + BASE_URL_END
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)

    response = urllib2.build_opener().open(request)
    try:
        raw_page = response.read()
    finally:
        # Always release the HTTP connection, even if read() fails.
        response.close()

    soup = BeautifulSoup(StringIO(str(raw_page)), fromEncoding="utf-8")
    anchor = soup.find('div', id='dd_0')
    if anchor is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError("forecast container 'dd_0' not found in %s" % url)

    html = anchor.parent.contents
    getWeatherList(wid, html)
    return html

def getWeatherList(wid,html):
    """Extract the publication time and every per-day forecast box.

    ``html`` is converted with ``str()`` and re-parsed. The last child node
    of the first ``<h2>`` is used as the update time (an empty string when
    the heading has no children). Each ``<div class="fut_weatherbox">`` is
    then passed to ``getOneDayWeather`` with a 1-based day index.

    Args:
        wid: identifier forwarded unchanged to ``getOneDayWeather``.
        html: parsed page fragment (anything whose ``str()`` is HTML).
    """
    fragment = BeautifulSoup(str(html))
    heading = fragment.find('h2')

    # Iterating the heading yields its child nodes; only the last one is kept.
    heading_children = list(heading)
    update_time = heading_children[-1] if heading_children else ''

    day_boxes = fragment.findAll('div', {"class": "fut_weatherbox"})
    for position, day_box in enumerate(day_boxes):
        getOneDayWeather(wid, position + 1, update_time, day_box)

def getOneDayWeather(wid,dayid,update_time,html):
    """Parse a single day's forecast box and store it via ``insertDB``.

    Values read from the box:
      * the content of the first ``<h3>`` (the day label),
      * the file-name stems of the first two ``<img>`` sources
        (``path/to/name.ext`` gives ``name``),
      * the contents of the ``<h4>`` elements with classes ``temp00_dn``,
        ``temp01_dn``, ``temp02_dn`` and ``temp03_dn``, stored as weather,
        max temperature, min temperature and wind respectively.

    Args:
        wid: identifier forwarded unchanged to ``insertDB``.
        dayid: 1-based index of this day within the forecast.
        update_time: publication time forwarded unchanged to ``insertDB``.
        html: parsed fragment for one day's box.
    """
    def reparse(fragment):
        # Round-trip through str() to obtain a standalone soup for the fragment.
        return BeautifulSoup(StringIO(str(fragment)), fromEncoding="UTF-8")

    def icon_code(img_tag):
        # Last path segment of the src, then the part just before the extension.
        src = reparse(img_tag).first('img')['src']
        file_name = src.split('/')[-1]
        return file_name.split('.')[-2]

    box = reparse(html)
    day_tags = box.findAll('h3')
    img_tags = box.findAll('img')
    h4_groups = [
        box.findAll('h4', {"class": css_class})
        for css_class in ("temp00_dn", "temp01_dn", "temp02_dn", "temp03_dn")
    ]

    day_value = reparse(day_tags).h3.renderContents()

    # Indexed access instead of a loop so that each value's meaning is fixed.
    d_pic_value = icon_code(img_tags[0])
    n_pic_value = icon_code(img_tags[1])

    weather_value, max_temp, min_temp, wind = [
        reparse(group).h4.renderContents() for group in h4_groups
    ]

    insertDB(wid, dayid, update_time, day_value, d_pic_value, n_pic_value,
             weather_value, max_temp, min_temp, wind)

def insertDB(wid,dayid,update_time,day_value,d_pic_value,n_pic_value,weather_value,max_temp,min_temp,wind ):
    """Insert one day's forecast into ``weatherdetail`` and commit.

    The arguments map, in order, onto the columns ``wid, dayid, lastupdate,
    currdate, dpic, npic, weather, maxtemp, mintemp, wind``. Values are
    passed as query parameters, never interpolated into the SQL text.

    On failure the transaction is rolled back and the error is re-raised so
    the caller can retry; the cursor is closed in every case.

    TODO: commit once per five-day batch instead of once per row.
    """
    sql = "INSERT INTO weatherdetail( wid, dayid, lastupdate, currdate, dpic, npic,weather, maxtemp, mintemp, wind) VALUES( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    param = (wid, dayid, update_time, day_value, d_pic_value, n_pic_value,
             weather_value, max_temp, min_temp, wind)

    cursor = conn.cursor()
    try:
        cursor.execute(sql, param)
        conn.commit()
    except Exception:
        # Leave the shared connection in a clean state before propagating.
        conn.rollback()
        raise
    finally:
        cursor.close()

def sendMonitor():
    """Send an alert e-mail when at least one WID failed.

    Does nothing when ``faultwid`` is empty. Otherwise mails the list of
    failed WIDs to ``mailto_list`` and prints whether delivery succeeded.
    """
    if not faultwid:
        return

    subject = "Error: Get Weather Error " + str(datetime.datetime.now())
    body = str(faultwid)
    delivered = send_mail(mailto_list, subject, body)

    if delivered:
        print("监控邮件发送成功.")
    else:
        print("监控邮件发送失败.")

def doworking(dotime,wid,pageid):
    """Fetch and store the forecast for one record, retrying on failure.

    ``getFiveDayWeather`` is attempted while the attempt counter is at most
    3, so a call with ``dotime=0`` makes up to four attempts. A failed
    attempt is followed by a five second pause, except for the last one.
    When the last attempt fails, ``wid`` is appended to ``faultwid``.

    Args:
        dotime: number of attempts already made for this record (0 for a
            fresh record).
        wid: record identifier; collected in ``faultwid`` on final failure.
        pageid: page id passed to ``getFiveDayWeather``.

    NOTE(review): ``insertDB`` commits each row separately, so a retry after
    a failure part-way through a page may insert rows that were already
    stored by the failed attempt - confirm whether duplicates matter.
    """
    attempt = dotime
    while True:
        try:
            getFiveDayWeather(wid, pageid)
            return
        except Exception as e:
            # Deliberately broad: one bad page must not abort the whole run.
            # %r keeps reporting safe for non-ASCII error messages.
            if attempt >= 3:
                print("has one error on %s %s (%r) , giving up." % (wid, pageid, e))
                faultwid.append(wid)
                return
            print("has one error on %s %s , then do it again , waiting five secs." % (wid, pageid))
            print(repr(e))
            time.sleep(5)
            attempt += 1

if __name__ == "__main__":
    # Script entry point: read every (id, page id) pair from the `weather`
    # table, fetch and store its forecast, send an alert mail if any record
    # failed, and finally print the elapsed time in seconds.
    starttime = datetime.datetime.now()
    print("Start." + str(starttime))

    cursor = conn.cursor()
    try:
        cursor.execute("SELECT id,weather_com_cn_pageid FROM weather")
        result = cursor.fetchall()
    finally:
        # All rows are already in memory; the cursor is no longer needed.
        cursor.close()

    for record in result:
        # dotime is 0 here: this is the first attempt for the record.
        doworking(0, str(record[0]), record[1])
        print('\r')

    endtime = datetime.datetime.now()
    print("End." + str(endtime))
    print("-------------------------------------------------")
    sendMonitor()
    # timedelta exposes `.seconds`; the former `.seconde` raised
    # AttributeError at the very end of every run.
    print((endtime - starttime).seconds)

3
顶

1
踩

分享到：

iostat(vmstat)来对linux硬盘IO性能进行了 ... | 用java 画几个函数曲线

2009-02-09 21:58
浏览 4258
评论(0)
分类:企业架构
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论