From 601cc86af8fab5e89c41a9853e011ac5b5fd98a6 Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Thu, 21 Aug 2025 10:59:35 +0800
Subject: [PATCH] =?UTF-8?q?add:=20=E8=B1=86=E7=93=A3=E5=9B=BE=E4=B9=A6top2?=
 =?UTF-8?q?50?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Crawl douban book top250: list page + detail page, standardize field
values, persist to MongoDB.

Review fixes applied to this patch (line-count neutral, hunks unchanged):
- book.py: producer XPath matched the "author" label and used the invalid
  step `following-sibling/a` (a child element test, not an axis); now
  matches the producer label via `following-sibling::a`.
- pipelines.py: removed the argument-less `self.f.write()` call
  (TypeError) and made ToCSVPipeline.process_item return the item so
  downstream pipelines still receive it.
- settings.py: RETRY_TIMES comment now matches the configured value (2).
- book.py: leftover XPATH template lines commented out (dead rebinds).

---
 douban_book/douban_book/items.py              | 12 +++
 douban_book/douban_book/main.py               |  9 ++
 douban_book/douban_book/pipelines.py          | 84 ++++++++++++++++++-
 .../douban_book/selector_cfg/__init__.py      |  4 +
 douban_book/douban_book/selector_cfg/book.py  | 51 +++++++++++
 douban_book/douban_book/settings.py           | 70 +++++++++-------
 .../douban_book/spiders/douban_top250.py      | 70 +++++++++++++++-
 douban_book/run.py                            | 10 +++
 8 files changed, 278 insertions(+), 32 deletions(-)
 create mode 100644 douban_book/douban_book/main.py
 create mode 100644 douban_book/douban_book/selector_cfg/__init__.py
 create mode 100644 douban_book/douban_book/selector_cfg/book.py

diff --git a/douban_book/douban_book/items.py b/douban_book/douban_book/items.py
index 224bc2d..b7c57ad 100644
--- a/douban_book/douban_book/items.py
+++ b/douban_book/douban_book/items.py
@@ -16,6 +16,18 @@ class DoubanBookTop250Item(scrapy.Item):
     book_name = scrapy.Field() # 书名
     author_name = scrapy.Field() # 作者
     publisher = scrapy.Field() # 出版社
+    producer = scrapy.Field() # 出品方
+    title_subhead = scrapy.Field() # 副标题
+    book_name_original = scrapy.Field() # 原作名
+    translator = scrapy.Field() # 译者
+    pages = scrapy.Field() # 页数
+    price = scrapy.Field() # 定价
+    binding = scrapy.Field() # 装帧
+    series = scrapy.Field() # 丛书
+    rating_nums = scrapy.Field() # 评分
+    comment = scrapy.Field() # 一言
+    book_catalog = scrapy.Field() # 目录
+    author_summery = scrapy.Field() # 作者简介
     pub_year = scrapy.Field() # 出版年
     book_isbn = scrapy.Field() # ISBN
     book_summery = scrapy.Field() # 简介
diff --git a/douban_book/douban_book/main.py b/douban_book/douban_book/main.py
new file mode 100644
index 0000000..fdfc092
--- /dev/null
+++ b/douban_book/douban_book/main.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/8/20 15:05
+# @Author : zhaoxiangpeng
+# @File : main.py
+
+from scrapy.cmdline import execute
+
+
+execute('scrapy crawl douban_top250 -o data_douban_top250_3.csv'.split())
diff --git a/douban_book/douban_book/pipelines.py b/douban_book/douban_book/pipelines.py
index c93c479..e30aa5c 100644
--- a/douban_book/douban_book/pipelines.py
+++ b/douban_book/douban_book/pipelines.py
@@ -2,12 +2,94 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-
+from typing import Union
 
 # useful for handling different item types with a single interface
 from itemadapter import ItemAdapter
+import pymongo
+from pymongo.errors import DuplicateKeyError
 
 
 class DoubanBookPipeline:
     def process_item(self, item, spider):
         return item
+
+
+def standard_list_or_str(list_or_str: Union[list, str, int]) -> Union[str, int, None]:
+    new_arr = []
+    if not list_or_str:
+        return None
+    if isinstance(list_or_str, int):
+        return list_or_str
+    elif isinstance(list_or_str, str):
+        text = list_or_str.strip()
+        return text
+    elif isinstance(list_or_str, list):
+        for text in list_or_str:
+            text = text.strip()
+            text = text.replace('.intro p{text-indent:2em;word-break:normal;}', '')
+            if text:
+                new_arr.append(text)
+    else:
+        return list_or_str
+    return '; '.join(new_arr)
+
+
+class DoubanBookInfoStandard:
+    def process_item(self, item, spider):
+        adapter = ItemAdapter(item)
+        # item['author_name'] = standard_list_or_str(adapter.get('author_name'))
+        # item['translator'] = standard_list_or_str(adapter.get('translator'))
+        # item['book_catalog'] = standard_list_or_str(adapter.get('book_catalog'))
+
+        for key in adapter.keys():
+            item[key] = standard_list_or_str(adapter.get(key))
+
+        return item
+
+
+class ToCSVPipeline:
+    def open_spider(self, spider):
+        print('爬虫开始')
+        self.f = open('data_douban_top250.csv', 'a', encoding='utf-8')
+        # NOTE(review): removed bare self.f.write() here - write() requires an argument (TypeError)
+
+    def close_spider(self, spider):
+        if self.f:
+            self.f.close()
+        print('爬虫结束')
+
+    def process_item(self, item, spider):
+        return item  # pipelines must return the item so downstream pipelines still receive it
+
+
+class MongoPipeline:
+    collection_name = "data_douban_top250"
+
+    def __init__(self, mongo_uri, mongo_db):
+        self.mongo_uri = mongo_uri
+        self.mongo_db = mongo_db
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            mongo_uri=crawler.settings.get("MONGO_URI"),
+            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
+        )
+
+    def open_spider(self, spider):
+        self.client = pymongo.MongoClient(self.mongo_uri)
+        self.db = self.client[self.mongo_db]
+
+    def close_spider(self, spider):
+        self.client.close()
+
+    def process_item(self, item, spider):
+        try:
+            self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
+        except DuplicateKeyError as dup_err:
+            spider.logger.warning(dup_err)
+        except Exception:
+            raise
+        return item
+
diff --git a/douban_book/douban_book/selector_cfg/__init__.py b/douban_book/douban_book/selector_cfg/__init__.py
new file mode 100644
index 0000000..5654e2b
--- /dev/null
+++ b/douban_book/douban_book/selector_cfg/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/8/20 9:29
+# @Author : zhaoxiangpeng
+# @File : __init__.py
diff --git a/douban_book/douban_book/selector_cfg/book.py b/douban_book/douban_book/selector_cfg/book.py
new file mode 100644
index 0000000..6ea013b
--- /dev/null
+++ b/douban_book/douban_book/selector_cfg/book.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/8/20 9:30
+# @Author : zhaoxiangpeng
+# @File : book.py
+
+# 书名
+SHUOMING_XPATH = book_name_xpath = '//*[@id="wrapper"]/h1/span/text()'
+# 封面
+FENGMIAN_XPATH = book_cover_xpath = '//*[@id="mainpic"]/a/@href'
+# 作者[多个]
+ZUOZHE_XPATH = author_xpath = '//span[contains(text(), "作者")]/following-sibling::a/text()'
+# 出版社
+CHUBANSHE_XPATH = press_xpath = '//span[contains(text(), "出版社")]/following-sibling::a[1]/text()'
+# 出品方[多个]
+CHUPINFANG_XPATH = producer_xpath = '//span[contains(text(), "出品方")]/following-sibling::a/text()'
+# 副标题
+FUBIAOTI_XPATH = title_subhead_xpath = '//span[contains(text(), "副标题")]/following-sibling::text()[1]'
+# 原作名
+YUANZUOMING_XPATH = book_name_original_xpath = '//span[contains(text(), "原作名")]/following-sibling::text()[1]'
+# 译者[多个]
+YIZHE_XPATH = translator_xpath = '//span[contains(text(), "译者")]/following-sibling::a/text()'
+# 出版年
+CHUBANNIAN_XPATH = publish_year_xpath = '//span[contains(text(), "出版年")]/following-sibling::text()[1]'
+# 页数
+YESHU_XPATH = pages_xpath = '//span[contains(text(), "页数")]/following-sibling::text()[1]'
+# 定价
+DINGJIA_XPATH = price_xpath = '//span[contains(text(), "定价")]/following-sibling::text()[1]'
+# 装帧
+ZHUANGZHEN_XPATH = binding_xpath = '//span[contains(text(), "装帧")]/following-sibling::text()[1]'
+# 丛书
+CONGSHU_XPATH = series_xpath = '//span[contains(text(), "丛书")]/following-sibling::a[1]/text()'
+# 评分
+PINGFEN_XPATH = rating_nums_xpath = '//*[@id="interest_sectl"]/div/div[2]/strong/text()'
+# ISBN
+ISBN_XPATH = isbn_xpath = '//span[contains(text(), "ISBN")]/following-sibling::text()[1]'
+# 内容简介
+NEIRONGJIANJIE_XPATH = book_summery_xpath = '//span[contains(text(), "内容简介")]/parent::*/following-sibling::div[1]/span[2]//text() | //*[@class="all hidden"]/div/div//text() | //*[@id="content"]/div/div[1]/div[3]/div[1]/div/div//text()'
+# 目录
+MULU_XPATH = book_catalog_xpath = '//span[contains(text(), "目录")]/parent::*/following-sibling::div[2]//text() | //*[@class="related_info"]/div[5]/text()'
+# 作者简介
+ZUOZHEJIANJIE_XPATH = author_summery_xpath = '//span[contains(text(), "作者简介")]/parent::*/following-sibling::div[1]//text()'
+
+
+
+
+
+
+# XPATH = '//span[contains(text(), "作者")]/following-sibling::a/text()' # XPATH[多个] (template, unused)
+# XPATH = '//span[contains(text(), "作者")]/following-sibling::a/text()' # XPATH[多个] (template, unused)
+# XPATH = '//span[contains(text(), "作者")]/following-sibling::a/text()' # XPATH[多个] (template, unused)
+# XPATH = '//span[contains(text(), "作者")]/following-sibling::a/text()' # XPATH[多个] (template, unused)
diff --git a/douban_book/douban_book/settings.py b/douban_book/douban_book/settings.py
index bf07438..97b5714 100644
--- a/douban_book/douban_book/settings.py
+++ b/douban_book/douban_book/settings.py
@@ -14,75 +14,85 @@ NEWSPIDER_MODULE = "douban_book.spiders"
 
 ADDONS = {}
 
-
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = "douban_book (+http://www.yourdomain.com)"
+# USER_AGENT = "douban_book (+http://www.yourdomain.com)"
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Concurrency and throttling settings
-#CONCURRENT_REQUESTS = 16
+# CONCURRENT_REQUESTS = 16
 CONCURRENT_REQUESTS_PER_DOMAIN = 1
-DOWNLOAD_DELAY = 1
+DOWNLOAD_DELAY = 10
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
 DEFAULT_REQUEST_HEADERS = {
-  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-  "Accept-Language": "en",
-  "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
 }
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    "douban_book.middlewares.DoubanBookSpiderMiddleware": 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    "douban_book.middlewares.DoubanBookDownloaderMiddleware": 543,
-#}
+RETRY_ENABLED = True
+RETRY_TIMES = 2 # 在初次请求之外最多重试2次
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400] # 增加了一些常见的错误码
+
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
+    # "douban_book.middlewares.DoubanBookDownloaderMiddleware": 543,
+}
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    "scrapy.extensions.telnet.TelnetConsole": None,
-#}
+# }
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    "douban_book.pipelines.DoubanBookPipeline": 300,
-#}
+ITEM_PIPELINES = {
+    "douban_book.pipelines.DoubanBookPipeline": 300,
+    "douban_book.pipelines.DoubanBookInfoStandard": 400,
+    "douban_book.pipelines.MongoPipeline": 410,
+}
+MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
+MONGO_DATABASE = "science2"
+
+REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = "httpcache"
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
 
 # Set settings whose default value is deprecated to a future-proof value
 FEED_EXPORT_ENCODING = "utf-8"
diff --git a/douban_book/douban_book/spiders/douban_top250.py b/douban_book/douban_book/spiders/douban_top250.py
index 95bcae3..a1ed5a8 100644
--- a/douban_book/douban_book/spiders/douban_top250.py
+++ b/douban_book/douban_book/spiders/douban_top250.py
@@ -1,4 +1,8 @@
 import scrapy
+from douban_book.items import DoubanBookTop250Item
+
+from douban_book.selector_cfg import book
+# from douban_book.douban_book.selector_cfg import book
 
 
 class DoubanTop250Spider(scrapy.Spider):
@@ -7,4 +7,68 @@ class DoubanTop250Spider(scrapy.Spider):
     start_urls = ["https://book.douban.com/top250?start=0"]
 
     def parse(self, response):
-        pass
+        start = response.meta.get('start', 0)
+        nodes = response.xpath('//div[@class="article"]/div/table//tr[@class="item"]')
+        for node in nodes:
+            cover_link = node.xpath('./td[1]/a/img/@src').get() # 封面链接
+            book_name = node.xpath('./td[2]/div[1]/a/@title').get() # 书名
+            book_link = node.xpath('./td[2]/div[1]/a/@href').get() # 图书链接
+            rating_nums = node.xpath('./td[2]/div[2]/span[@class="rating_nums"]/text()').get() # 评分
+            comment = node.xpath('./td[2]/p[@class="quote"]/span/text()').get() # 一言
+            start += 1
+            yield response.follow(book_link, callback=self.parse_detail,
+                                  # 携带列表页部分信息
+                                  meta=dict(
+                                      item=dict(book_name=book_name, book_link=book_link, cover_link=cover_link, ranking=start, rating_nums=rating_nums, comment=comment),
+                                  ))
+        next_page = response.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href').get() # 下一页
+        if next_page:
+            yield scrapy.Request(next_page, meta=dict(start=start))
+
+    def parse_detail(self, response):
+        req_meta = response.meta
+        meta_item = req_meta.get('item')
+        book_name = response.xpath(book.SHUOMING_XPATH)
+        author = response.xpath(book.ZUOZHE_XPATH)
+        publisher = response.xpath(book.CHUBANSHE_XPATH)
+        pub_year = response.xpath(book.CHUBANNIAN_XPATH)
+        book_isbn = response.xpath(book.ISBN_XPATH)
+        book_summery = response.xpath(book.NEIRONGJIANJIE_XPATH)
+        producer = response.xpath(book.producer_xpath)
+        title_subhead = response.xpath(book.title_subhead_xpath)
+        book_name_original = response.xpath(book.book_name_original_xpath)
+        translator = response.xpath(book.translator_xpath)
+        pages = response.xpath(book.pages_xpath)
+        price = response.xpath(book.price_xpath)
+        binding = response.xpath(book.binding_xpath)
+        series = response.xpath(book.series_xpath)
+        rating_nums = response.xpath(book.rating_nums_xpath)
+        book_catalog = response.xpath(book.book_catalog_xpath)
+        author_summery = response.xpath(book.author_summery_xpath)
+
+        book_item = DoubanBookTop250Item()
+        book_item['book_name'] = book_name.get()
+        book_item['author_name'] = author.getall()
+        book_item['publisher'] = publisher.get()
+        book_item['producer'] = producer.getall()
+        book_item['title_subhead'] = title_subhead.get()
+        book_item['book_name_original'] = book_name_original.get()
+        book_item['translator'] = translator.getall()
+        book_item['pages'] = pages.get()
+        book_item['price'] = price.get()
+        book_item['binding'] = binding.get()
+        book_item['series'] = series.get()
+        book_item['rating_nums'] = meta_item.get('rating_nums')
+        book_item['comment'] = meta_item.get('comment')
+        book_item['book_catalog'] = book_catalog.getall()
+        book_item['author_summery'] = author_summery.getall()
+        book_item['pub_year'] = pub_year.get()
+        book_item['book_isbn'] = book_isbn.get()
+        book_item['book_summery'] = book_summery.getall()
+        book_item['rank_type'] = 'douban-top250'
+        book_item['rank_name'] = 'top250'
+        book_item['ranking'] = meta_item.get('ranking')
+        book_item['book_link'] = meta_item.get('book_link')
+        book_item['book_cover_link'] = meta_item.get('cover_link')
+        yield book_item
+
diff --git a/douban_book/run.py b/douban_book/run.py
index f19f63f..3c3b68b 100644
--- a/douban_book/run.py
+++ b/douban_book/run.py
@@ -2,3 +2,13 @@
 # @Time : 2025/8/19 17:13
 # @Author : zhaoxiangpeng
 # @File : run.py
+
+import os
+import re
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+process = CrawlerProcess(get_project_settings())
+
+process.crawl('douban_top250')
+process.start()