`
metaphy
  • 浏览: 344619 次
  • 性别: Icon_minigender_1
  • 来自: 大西洋底
社区版块
存档分类
最新评论

VOA special English 下载 py

阅读更多
发一个有点用的 Python 程序：它会从 wwenglish 网站下载 VOA Special English 的资料。

# -*- coding: utf-8 -*-   
"""   
Author: metaphy
2008-3-10
"""   
import urllib, httplib, urlparse   
import re   
import random   

# Base URL of the site; the site-relative links scraped from its pages are
# appended to this to form absolute download URLs.
SITE = "http://www.wwenglish.com"

"""处理每个页面"""
def handleSubpage(url):
    """Download one article page and the MP3 file it links to.

    The page at *url* is stored as ``<name>.htm`` and the audio as
    ``<name>.mp3`` inside the hard-coded ``save`` folder, where ``<name>`` is
    whatever ``getFileName(url)`` returns.

    Args:
        url: Absolute URL of an article page.

    Errors raised while fetching are not handled here; they propagate to
    the caller.
    """
    save = "D:\\English\\people\\"
    baseName = getFileName(url)
    htmlFile = baseName + ".htm"
    mp3File = baseName + ".mp3"

    # Keep a local copy of the article page itself.
    download(url, save + htmlFile)

    page = urllib.urlopen(url)
    try:
        html = page.readlines()
    finally:
        # Release the connection even when reading fails part-way.
        page.close()

    # Captures everything between href=" and ".mp3"; compiled once and
    # reused for every line.
    mp3Link = re.compile(r"""href="?(\S+)\.mp3""")

    for line in html:
        matches = mp3Link.findall(line)
        if matches:
            # Only the first link on a line is fetched. Every matching line
            # writes to the same mp3File, so a later line overwrites the
            # download made for an earlier one.
            download(SITE + matches[0] + ".mp3", save + mp3File)
            
""" Get file name from URL"""
def getFileName(url):
    """Return the last ``/``-separated segment of *url* minus its final four characters.

    The callers in this script pass URLs that end in ``.htm``, so the result
    is the page's base name without that extension. Nothing checks what the
    four removed characters actually are.
    """
    segments = url.split("/")
    lastSegment = segments[-1]
    return lastSegment[:-4]

"""根据url下载文件 """   
def download(url,fileName):
    """Fetch *url* and write it to the local path *fileName*.

    The URL is echoed before the transfer and a confirmation line is printed
    afterwards. Errors raised by ``urllib.urlretrieve`` are not caught here.
    """
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print(url)
    urllib.urlretrieve(url, fileName)
    print("Save file succeed!")
                
def main():
    """Crawl the index page and download every 2007 article it links to.

    Each link of the form ``/en/voa/spec/2007/<name>.htm`` found on the index
    page is turned into an absolute URL and passed to ``handleSubpage``.

    Any ordinary error aborts the whole crawl: a message is printed together
    with the exception that caused it. KeyboardInterrupt and SystemExit are
    deliberately not intercepted so the script can still be stopped.
    """
    url = "http://www.wwenglish.com/en/voa/spec/people_in_america.htm"

    try:
        page = urllib.urlopen(url)
        try:
            html = page.readlines()
        finally:
            # Close the connection even when reading the index fails.
            page.close()

        # Captures <name> from href="/en/voa/spec/2007/<name>.htm; compiled
        # once and reused for every line.
        articleLink = re.compile(r"""href="/en/voa/spec/2007/?(\S+)\.htm""")

        for line in html:
            for subpage in articleLink.findall(line):
                handleSubpage(SITE + "/en/voa/spec/2007/" + subpage + ".htm")
    except Exception as err:
        print("Err: main() error!")
        # Report what actually went wrong instead of hiding it.
        print("  cause: %r" % (err,))

# Start the crawl; this runs whenever the module is executed or imported.
main()

       
下面是另一个 Python 脚本，用来下载 VOA Standard English 的资料：
# -*- coding: utf-8 -*-   
"""   
Author: metaphy
2008-5-4
"""   
import urllib, httplib, urlparse   
import re   
import random   

# Base URL of the site (ends with "/"); scraped relative paths are appended to it.
SITE = "http://www.51voa.com/"
# Index page, relative to SITE, that main() scans for article links.
SITE_MAINPAGE = "VOA_Standard_2.html"
# Local folder (ends with a path separator) that receives the downloaded files.
DOWNLOAD_FOLDER = "D:\\English\\VOAStandard\\"

"""处理每个页面"""
def handleSubpage(url):
    """Save one article page and its embedded MP3 into DOWNLOAD_FOLDER.

    The page title returned by ``getFileName(url)`` is used as the base name
    of both files (``<title>.html`` and ``<title>.mp3``).

    Args:
        url: Absolute URL of an article page.

    Ordinary errors are reported on stdout and the page is skipped, so one
    bad page does not stop the crawl. KeyboardInterrupt and SystemExit are
    not intercepted.
    """
    try:
        print("URL:"+url)
        # getFileName returns None when it fails; the concatenation below
        # then raises and the page is skipped by the handler at the bottom.
        title = getFileName(url)
        htmlFile = title + ".html"
        mp3File = title + ".mp3"

        # Keep a local copy of the article page itself.
        download(url, DOWNLOAD_FOLDER + htmlFile)

        page = urllib.urlopen(url)
        try:
            html = page.readlines()
        finally:
            # Release the connection even when reading fails part-way.
            page.close()

        # Captures the site-relative path in front of ".mp3" in an <EMBED>
        # tag; compiled once and reused for every line.
        embedSrc = re.compile(r"""<EMBED src="/?(\S+)\.mp3""")

        for line in html:
            matches = embedSrc.findall(line)
            if matches:
                # Only the first match on a line is fetched; every matching
                # line writes to the same mp3File.
                download(SITE + matches[0] + ".mp3", DOWNLOAD_FOLDER + mp3File)
    except Exception as err:
        print("handleSubpage(url) error!")
        # Report what actually went wrong instead of hiding it.
        print("  cause: %r" % (err,))

""" Get file name from URL"""
def getFileName(url):
    """Derive a file base name from the ``<title>`` of the page at *url*.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The first ``<title>...</title>`` match found on a single line, with
        its first 7 and last 39 characters removed; ``""`` when no line
        contains a title; ``None`` when fetching or parsing fails (a message
        is printed in that case).
    """
    try:
        page = urllib.urlopen(url)
        try:
            html = page.readlines()
        finally:
            # Release the connection even when reading fails part-way.
            page.close()

        # Compiled once and reused for every line.
        titleTag = re.compile("""<title>.*</title>""")
        for line in html:
            found = titleTag.search(line)
            if found:
                # [7:] drops the opening "<title>"; [-39:] drops "</title>"
                # plus 31 more trailing characters -- presumably a fixed
                # site-name suffix in every title; verify against the site.
                return found.group(0)[7:-39]
        return ""
    except Exception as err:
        print("getFileName(url) error!")
        # Report what actually went wrong instead of hiding it.
        print("  cause: %r" % (err,))
        return None

"""根据url下载文件 """   
def download(url,fileName):
    """Fetch *url* and write it to the local path *fileName*.

    The URL is echoed before the transfer and a confirmation line is printed
    afterwards. Errors raised by ``urllib.urlretrieve`` are not caught here.
    """
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print(url)
    urllib.urlretrieve(url, fileName)
    print("Save file succeed!")

def main():
    """Crawl the index page and download a window of the articles it lists.

    Links of the form ``/VOA_Standard_English/<name>.html`` are collected
    line by line from ``SITE + SITE_MAINPAGE``. On each line, the 4th to
    50th links (1-based) are passed to ``handleSubpage``; the position
    counter restarts on every line.

    Any ordinary error aborts the crawl: a message is printed together with
    the exception that caused it. KeyboardInterrupt and SystemExit are
    deliberately not intercepted so the script can still be stopped.
    """
    try:
        page = urllib.urlopen(SITE+SITE_MAINPAGE)
        try:
            html = page.readlines()
        finally:
            # Close the connection even when reading the index fails.
            page.close()

        # Captures <name> from href="/VOA_Standard_English/<name>.html;
        # compiled once and reused for every line.
        articleLink = re.compile(r"""href="/VOA_Standard_English/?(\S+)\.html""")

        for line in html:
            position = 0
            for subpage in articleLink.findall(line):
                position += 1
                # Skip the first three links on the line and stop taking
                # links after the 50th.
                if 4 <= position <= 50:
                    handleSubpage(SITE + "VOA_Standard_English/" + subpage + ".html")
    except Exception as err:
        print("Err: main() error!")
        # Report what actually went wrong instead of hiding it.
        print("  cause: %r" % (err,))

# Start the crawl; this runs whenever the module is executed or imported.
main()

       
3
2
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics