# Scrapy settings for douban_book project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "douban_book"

SPIDER_MODULES = ["douban_book.spiders"]
NEWSPIDER_MODULE = "douban_book.spiders"

ADDONS = {}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "douban_book (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings.
# Douban throttles aggressively, so crawl one request at a time with a long
# delay instead of the default 16 concurrent requests.
# CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 10  # seconds between requests to the same domain

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers.
# A desktop-browser User-Agent is sent with every request (the lowercase
# "user-agent" key works: header names are matched case-insensitively).
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "douban_book.middlewares.DoubanBookSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
RETRY_ENABLED = True
RETRY_TIMES = 2  # 2 retries, i.e. up to 3 attempts total per request
# Extra codes beyond Scrapy's defaults: 408 (request timeout) and 400.
# NOTE(review): retrying 400 Bad Request is unusual — it is normally a
# deterministic client error; confirm it is intentional here.
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
    # "douban_book.middlewares.DoubanBookDownloaderMiddleware": 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Order: scrape-time processing (300) -> field standardisation (400)
# -> persistence to MongoDB (410).
ITEM_PIPELINES = {
    "douban_book.pipelines.DoubanBookPipeline": 300,
    "douban_book.pipelines.DoubanBookInfoStandard": 400,
    "douban_book.pipelines.MongoPipeline": 410,
}

# Storage backends used by the pipelines / scheduler.
# NOTE(review): credentials are committed in plaintext here — consider moving
# them to environment variables or a local, uncommitted settings override.
MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
MONGO_DATABASE = "science2"
REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"