#!/usr/bin/env python

"""Web Crawler/Spider

This module implements a web crawler. This is very _basic_ only
and needs to be extended to do anything useful with the traversed pages.
"""

__author__ = 'chenyang'

import re
import sys
import time
import math
import urllib2
import urlparse
import optparse
from cgi import escape
from traceback import format_exc
from Queue import Queue, Empty as QueueEmpty

from BeautifulSoup import BeautifulSoup

__version__ = "0.2"
__copyright__ = "CopyRight (C) 2008-2011 by James Mills"
__license__ = "MIT"
__author__ = "James Mills"
__author_email__ = "James Mills, James dot Mills st dotred dot com dot au"

USAGE = "%prog [options] <url>"
VERSION = "%prog v" + __version__
AGENT = "%s/%s" % (__name__, __version__)


class Crawler(object):

    def __init__(self, root, depth, locked=True):
        self.root = root
        self.depth = depth
        self.locked = locked          # if True, only follow URLs on the same host
        self.host = urlparse.urlparse(root)[1]
        self.urls = []
        self.links = 0
        self.followed = 0

    def crawl(self):
        page = Fetcher(self.root)
        page.fetch()

        q = Queue()
        for url in page.urls:
            q.put(url)
        followed = [self.root]

        n = 0

        while True:
            try:
                # Single-threaded crawl: an empty queue means we are done,
                # so do not block waiting for more work.
                url = q.get_nowait()
            except QueueEmpty:
                break

            n += 1

            if url not in followed:
                try:
                    host = urlparse.urlparse(url)[1]
                    if self.locked and re.match(".*%s" % self.host, host):
                        followed.append(url)
                        self.followed += 1

                        page = Fetcher(url)
                        page.fetch()
                        for i, url in enumerate(page):
                            if url not in self.urls:
                                self.links += 1
                                q.put(url)
                                self.urls.append(url)

                        # Note: this caps the number of URLs processed,
                        # not the actual link depth.
                        if n > self.depth and self.depth > 0:
                            break
                except Exception, e:
                    print "ERROR: Can't process url '%s' (%s)" % (url, e)
                    print format_exc()


class Fetcher(object):

    def __init__(self, url):
        self.url = url
        self.urls = []

    def __getitem__(self, x):
        return self.urls[x]

    def _addHeaders(self, request):
        request.add_header("User-Agent", AGENT)

    def open(self):
        url = self.url
        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None, None
        return request, handle

    def fetch(self):
        request, handle = self.open()
        if handle is None:
            return
        self._addHeaders(request)
        try:
            content = unicode(handle.open(request).read(), "utf-8",
                              errors="replace")
            soup = BeautifulSoup(content)
            # BeautifulSoup: soup('a') collects all <a> tags
            tags = soup('a')
        except urllib2.HTTPError, error:
            if error.code == 404:
                print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
            else:
                print >> sys.stderr, "ERROR: %s" % error
            tags = []
        except urllib2.URLError, error:
            print >> sys.stderr, "ERROR: %s" % error
            tags = []

        for tag in tags:
            href = tag.get("href")
            if href is not None:
                url = urlparse.urljoin(self.url, escape(href))
                if url not in self:
                    self.urls.append(url)


def getLinks(url):
    page = Fetcher(url)
    page.fetch()
    for i, url in enumerate(page):
        print "%d. %s" % (i, url)


def parse_options():
    """parse_options() -> opts, args

    Parse any command-line options given returning both
    the parsed options and arguments.
    """

    parser = optparse.OptionParser(usage=USAGE, version=VERSION)

    parser.add_option("-q", "--quiet",
                      action="store_true", default=False, dest="quiet",
                      help="Enable quiet mode")

    parser.add_option("-l", "--links",
                      action="store_true", default=False, dest="links",
                      help="Get links for specified url only")

    parser.add_option("-d", "--depth",
                      action="store", type="int", default=30, dest="depth",
                      help="Maximum depth to traverse")

    opts, args = parser.parse_args()

    if len(args) < 1:
        parser.print_help()
        raise SystemExit, 1

    return opts, args


def main():
    opts, args = parse_options()

    url = args[0]

    if opts.links:
        getLinks(url)
        raise SystemExit, 0

    depth = opts.depth

    sTime = time.time()

    print "Crawling %s (Max Depth: %d)" % (url, depth)
    crawler = Crawler(url, depth)
    crawler.crawl()
    print "\n".join(crawler.urls)

    eTime = time.time()
    tTime = eTime - sTime

    print "Found: %d" % crawler.links
    print "Followed: %d" % crawler.followed
    print "Stats: (%d/s after %0.2fs)" % (
        int(math.ceil(float(crawler.links) / tTime)), tTime)


if __name__ == "__main__":
    main()
Python BeautifulSoup quick notes
http://rsj217.diandian.com/post/2012-11-01/40041235132
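The linked notes cover basic BeautifulSoup usage. As a minimal sketch in the same vein (assuming the same Python 2 / BeautifulSoup 3 setup as the crawler above; the HTML string and variable names are purely illustrative):

    from BeautifulSoup import BeautifulSoup

    html = '<html><body><a href="/a">A</a> <a href="/b">B</a></body></html>'
    soup = BeautifulSoup(html)

    # soup('a') is shorthand for soup.findAll('a'), the same call the crawler uses
    for tag in soup('a'):
        print tag.get('href'), tag.string   # prints "/a A" then "/b B"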
Tk calculator
import Tkinter as tk

calc = tk.Tk()
calc.title("CrappyCalc")

buttons = [
    '7', '8', '9', '*', 'C',
    '4', '5', '6', '/', 'Neg',
    '1', '2', '3', '-', '$',
    '0', '.', '=', '+', '@',
]

# set up GUI: a 5-wide grid of buttons below the display row
row = 1
col = 0
for i in buttons:
    button_style = 'raised'
    # bind the current label as a default argument so each button
    # passes its own text to click_event
    action = lambda x=i: click_event(x)
    tk.Button(calc, text=i, width=4, height=3, relief=button_style,
              command=action).grid(row=row, column=col, sticky='nesw')
    col += 1
    if col > 4:
        col = 0
        row += 1

display = tk.Entry(calc, width=40, bg="white")
display.grid(row=0, column=0, columnspan=5)


def click_event(key):
    # = -> evaluate the expression currently in the display
    if key == '=':
        # crude safeguard against Python 2 integer division
        # (only affects the last term of the expression)
        if '/' in display.get() and '.' not in display.get():
            display.insert(tk.END, ".0")
        # attempt to evaluate the expression
        try:
            result = eval(display.get())
            display.insert(tk.END, " = " + str(result))
        except Exception:
            display.insert(tk.END, " Error, use only valid chars")
    # C -> clear display
    elif key == 'C':
        display.delete(0, tk.END)
    # $ -> easter egg: replace the display contents
    elif key == '$':
        display.delete(0, tk.END)
        display.insert(tk.END, "$$$$C.$R.$E.$A.$M.$$$$")
    # @ -> easter egg: replace the display contents
    elif key == '@':
        display.delete(0, tk.END)
        display.insert(tk.END, "wwwwwwwwwwwwwwwwebsite")
    # Neg -> negate the current term
    # (the button is labelled 'Neg', so match that exact string)
    elif key == 'Neg':
        if '=' in display.get():
            display.delete(0, tk.END)
        try:
            if display.get()[0] == '-':
                display.delete(0)
            else:
                display.insert(0, '-')
        except IndexError:
            pass
    # any other key: clear a finished result, then append the key
    else:
        if '=' in display.get():
            display.delete(0, tk.END)
        display.insert(tk.END, key)

# RUNTIME
calc.mainloop()