import os
import sys
from os.path import join, abspath, dirname

BOT_NAME = 'scrapybot'
CLOSESPIDER_TIMEOUT = 0
CLOSESPIDER_PAGECOUNT = 0
CLOSESPIDER_ITEMCOUNT = 0
CLOSESPIDER_ERRORCOUNT = 0
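# Illustrative sketch (not part of the defaults): the four CLOSESPIDER_* values
# above are 0, i.e. disabled. A project's settings.py can bound a crawl by
# overriding them; the numbers below are arbitrary examples.
#
#   CLOSESPIDER_TIMEOUT = 3600      # stop the spider after one hour
#   CLOSESPIDER_PAGECOUNT = 10000   # or after 10000 downloaded responses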
COMMANDS_MODULE = ''
CONCURRENT_ITEMS = 100
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0
COOKIES_ENABLED = True
COOKIES_DEBUG = False
DEFAULT_ITEM_CLASS = 'scrapy.item.Item'
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
DEPTH_LIMIT = 0
DEPTH_STATS = True
DEPTH_PRIORITY = 0
DNSCACHE_ENABLED = True
DOWNLOAD_DELAY = 0
DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
}
DOWNLOAD_TIMEOUT = 180 # 3mins
DOWNLOADER_DEBUG = False
DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.webclient.ScrapyClientContextFactory'
DOWNLOADER_MIDDLEWARES = {}
DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
    'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
    'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800,
    'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
    'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}
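# Illustrative sketch (not part of the defaults): a project tweaks this chain
# through the empty DOWNLOADER_MIDDLEWARES dict above, not by editing the _BASE
# dict. A value of None disables a built-in middleware; an integer sets the
# order of a custom one. 'myproject.middlewares.CustomProxyMiddleware' is a
# hypothetical class path.
#
#   DOWNLOADER_MIDDLEWARES = {
#       'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
#       'myproject.middlewares.CustomProxyMiddleware': 410,
#   }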
DOWNLOADER_STATS = True
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
try:
    EDITOR = os.environ['EDITOR']
except KeyError:
    if sys.platform == 'win32':
        EDITOR = '%s -m idlelib.idle'
    else:
        EDITOR = 'vi'
EXTENSIONS = {}
EXTENSIONS_BASE = {
    'scrapy.contrib.corestats.CoreStats': 0,
    'scrapy.webservice.WebService': 0,
    'scrapy.telnet.TelnetConsole': 0,
    'scrapy.contrib.memusage.MemoryUsage': 0,
    'scrapy.contrib.memdebug.MemoryDebugger': 0,
    'scrapy.contrib.closespider.CloseSpider': 0,
    'scrapy.contrib.feedexport.FeedExporter': 0,
    'scrapy.contrib.logstats.LogStats': 0,
    'scrapy.contrib.spiderstate.SpiderState': 0,
    'scrapy.contrib.throttle.AutoThrottle': 0,
}
FEED_URI = None
FEED_URI_PARAMS = None # a function to extend uri arguments
FEED_FORMAT = 'jsonlines'
FEED_STORE_EMPTY = False
FEED_STORAGES = {}
FEED_STORAGES_BASE = {
    '': 'scrapy.contrib.feedexport.FileFeedStorage',
    'file': 'scrapy.contrib.feedexport.FileFeedStorage',
    'stdout': 'scrapy.contrib.feedexport.StdoutFeedStorage',
    's3': 'scrapy.contrib.feedexport.S3FeedStorage',
    'ftp': 'scrapy.contrib.feedexport.FTPFeedStorage',
}
FEED_EXPORTERS = {}
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.contrib.exporter.JsonItemExporter',
    'jsonlines': 'scrapy.contrib.exporter.JsonLinesItemExporter',
    'csv': 'scrapy.contrib.exporter.CsvItemExporter',
    'xml': 'scrapy.contrib.exporter.XmlItemExporter',
    'marshal': 'scrapy.contrib.exporter.MarshalItemExporter',
    'pickle': 'scrapy.contrib.exporter.PickleItemExporter',
}
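# Illustrative sketch (not part of the defaults): with the storages and
# exporters above, a feed export only needs a URI and a format, either in the
# project's settings.py or via "scrapy crawl myspider -o items.json -t json".
# %(name)s and %(time)s are placeholders filled in per spider run; the path
# below is a hypothetical example.
#
#   FEED_URI = 'file:///tmp/%(name)s-%(time)s.jsonlines'
#   FEED_FORMAT = 'jsonlines'
#   FEED_STORE_EMPTY = True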
HTTPCACHE_ENABLED = False
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.DbmCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_DBM_MODULE = 'anydbm'
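# Illustrative sketch (not part of the defaults): enabling the HTTP cache is a
# common override during development, so repeated runs replay responses from
# disk instead of hitting the network. The values below are arbitrary examples.
#
#   HTTPCACHE_ENABLED = True
#   HTTPCACHE_EXPIRATION_SECS = 86400         # re-download pages older than a day
#   HTTPCACHE_IGNORE_HTTP_CODES = [500, 503]  # don't cache server errors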
ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'
# Item pipelines are typically set in specific commands settings
ITEM_PIPELINES = []
LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
LOG_STDOUT = False
LOG_LEVEL = 'DEBUG'
LOG_FILE = None
LOG_UNSERIALIZABLE_REQUESTS = False
LOGSTATS_INTERVAL = 60.0
MAIL_DEBUG = False
MAIL_HOST = 'localhost'
MAIL_PORT = 25
MAIL_FROM = 'scrapy@localhost'
MAIL_PASS = None
MAIL_USER = None
MEMDEBUG_ENABLED = False # enable memory debugging
MEMDEBUG_NOTIFY = [] # send memory debugging report by mail at engine shutdown
MEMUSAGE_ENABLED = False
MEMUSAGE_LIMIT_MB = 0
MEMUSAGE_NOTIFY_MAIL = []
MEMUSAGE_REPORT = False
MEMUSAGE_WARNING_MB = 0
NEWSPIDER_MODULE = ''
RANDOMIZE_DOWNLOAD_DELAY = True
REDIRECT_ENABLED = True
REDIRECT_MAX_METAREFRESH_DELAY = 100
REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2
REFERER_ENABLED = True
RETRY_ENABLED = True
RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = [500, 503, 504, 400, 408]
RETRY_PRIORITY_ADJUST = -1
ROBOTSTXT_OBEY = False
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
SPIDER_MANAGER_CLASS = 'scrapy.spidermanager.SpiderManager'
SPIDER_MIDDLEWARES = {}
SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
    # Spider side
}
SPIDER_MODULES = []
STATS_CLASS = 'scrapy.statscol.MemoryStatsCollector'
STATS_DUMP = True
STATSMAILER_RCPTS = []
TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))
URLLENGTH_LIMIT = 2083
USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % __import__('scrapy').__version__
TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]
TELNETCONSOLE_HOST = '0.0.0.0'
WEBSERVICE_ENABLED = True
WEBSERVICE_LOGFILE = None
WEBSERVICE_PORT = [6080, 7030]
WEBSERVICE_HOST = '0.0.0.0'
WEBSERVICE_RESOURCES = {}
WEBSERVICE_RESOURCES_BASE = {
    'scrapy.contrib.webservice.crawler.CrawlerResource': 1,
    'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
    'scrapy.contrib.webservice.stats.StatsResource': 1,
}
SPIDER_CONTRACTS = {}
SPIDER_CONTRACTS_BASE = {
    'scrapy.contracts.default.UrlContract': 1,
    'scrapy.contracts.default.ReturnsContract': 2,
    'scrapy.contracts.default.ScrapesContract': 3,
}
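# Illustrative sketch (not part of the defaults): a project's settings.py only
# overrides the handful of values that matter for that project and inherits the
# rest from this file. 'myproject' and all values below are hypothetical.
#
#   BOT_NAME = 'myproject'
#   SPIDER_MODULES = ['myproject.spiders']
#   NEWSPIDER_MODULE = 'myproject.spiders'
#   USER_AGENT = 'myproject (+http://www.example.com)'
#   DOWNLOAD_DELAY = 0.5
#   CONCURRENT_REQUESTS_PER_DOMAIN = 4
#   ITEM_PIPELINES = ['myproject.pipelines.MyItemPipeline']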