From 601cc86af8fab5e89c41a9853e011ac5b5fd98a6 Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Thu, 21 Aug 2025 10:59:35 +0800
Subject: [PATCH] =?UTF-8?q?add:=20=E8=B1=86=E7=93=A3=E5=9B=BE=E4=B9=A6top2?=
 =?UTF-8?q?50?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Crawl douban book top250: list page + detail page, standardize field
values, persist to MongoDB.

Review fixes applied to this patch (line-count neutral, hunks unchanged):
- book.py: producer XPath matched the "author" label and used the invalid
  step `following-sibling/a` (a child element test, not an axis); now
  matches the producer label via `following-sibling::a`.
- pipelines.py: removed the argument-less `self.f.write()` call
  (TypeError) and made ToCSVPipeline.process_item return the item so
  downstream pipelines still receive it.
- settings.py: RETRY_TIMES comment now matches the configured value (2).
- book.py: leftover XPATH template lines commented out (dead rebinds).

---
 douban_book/douban_book/items.py              | 12 +++
 douban_book/douban_book/main.py               |  9 ++
 douban_book/douban_book/pipelines.py          | 84 ++++++++++++++++++-
 .../douban_book/selector_cfg/__init__.py      |  4 +
 douban_book/douban_book/selector_cfg/book.py  | 51 +++++++++++
 douban_book/douban_book/settings.py           | 70 +++++++++-------
 .../douban_book/spiders/douban_top250.py      | 70 +++++++++++++++-
 douban_book/run.py                            | 10 +++
 8 files changed, 278 insertions(+), 32 deletions(-)
 create mode 100644 douban_book/douban_book/main.py
 create mode 100644 douban_book/douban_book/selector_cfg/__init__.py
 create mode 100644 douban_book/douban_book/selector_cfg/book.py

diff --git a/douban_book/douban_book/items.py b/douban_book/douban_book/items.py
index 224bc2d..b7c57ad 100644
--- a/douban_book/douban_book/items.py
+++ b/douban_book/douban_book/items.py
@@ -16,6 +16,18 @@ class DoubanBookTop250Item(scrapy.Item):
     book_name = scrapy.Field() # 书名
     author_name = scrapy.Field() # 作者
     publisher = scrapy.Field() # 出版社
+    producer = scrapy.Field() # 出品方
+    title_subhead = scrapy.Field() # 副标题
+    book_name_original = scrapy.Field() # 原作名
+    translator = scrapy.Field() # 译者
+    pages = scrapy.Field() # 页数
+    price = scrapy.Field() # 定价
+    binding = scrapy.Field() # 装帧
+    series = scrapy.Field() # 丛书
+    rating_nums = scrapy.Field() # 评分
+    comment = scrapy.Field() # 一言
+    book_catalog = scrapy.Field() # 目录
+    author_summery = scrapy.Field() # 作者简介
     pub_year = scrapy.Field() # 出版年
     book_isbn = scrapy.Field() # ISBN
     book_summery = scrapy.Field() # 简介
diff --git a/douban_book/douban_book/main.py b/douban_book/douban_book/main.py
new file mode 100644
index 0000000..fdfc092
--- /dev/null
+++ b/douban_book/douban_book/main.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/8/20 15:05
+# @Author : zhaoxiangpeng
+# @File : main.py
+
+from scrapy.cmdline import execute
+
+
+execute('scrapy crawl douban_top250 -o data_douban_top250_3.csv'.split())
diff --git a/douban_book/douban_book/pipelines.py b/douban_book/douban_book/pipelines.py
index c93c479..e30aa5c 100644
--- a/douban_book/douban_book/pipelines.py
+++ b/douban_book/douban_book/pipelines.py
@@ -2,12 +2,94 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-
+from typing import Union
 
 # useful for handling different item types with a single interface
 from itemadapter import ItemAdapter
+import pymongo
+from pymongo.errors import DuplicateKeyError
 
 
 class DoubanBookPipeline:
     def process_item(self, item, spider):
         return item
+
+
+def standard_list_or_str(list_or_str: Union[list, str, int]) -> Union[str, int, None]:
+    new_arr = []
+    if not list_or_str:
+        return None
+    if isinstance(list_or_str, int):
+        return list_or_str
+    elif isinstance(list_or_str, str):
+        text = list_or_str.strip()
+        return text
+    elif isinstance(list_or_str, list):
+        for text in list_or_str:
+            text = text.strip()
+            text = text.replace('.intro p{text-indent:2em;word-break:normal;}', '')
+            if text:
+                new_arr.append(text)
+    else:
+        return list_or_str
+    return '; '.join(new_arr)
+
+
+class DoubanBookInfoStandard:
+    def process_item(self, item, spider):
+        adapter = ItemAdapter(item)
+        # item['author_name'] = standard_list_or_str(adapter.get('author_name'))
+        # item['translator'] = standard_list_or_str(adapter.get('translator'))
+        # item['book_catalog'] = standard_list_or_str(adapter.get('book_catalog'))
+
+        for key in adapter.keys():
+            item[key] = standard_list_or_str(adapter.get(key))
+
+        return item
+
+
+class ToCSVPipeline:
+    def open_spider(self, spider):
+        print('爬虫开始')
+        self.f = open('data_douban_top250.csv', 'a', encoding='utf-8')
+        # NOTE(review): removed bare self.f.write() here - write() requires an argument (TypeError)
+
+    def close_spider(self, spider):
+        if self.f:
+            self.f.close()
+        print('爬虫结束')
+
+    def process_item(self, item, spider):
+        return item  # pipelines must return the item so downstream pipelines still receive it
+
+
+class MongoPipeline:
+    collection_name = "data_douban_top250"
+
+    def __init__(self, mongo_uri, mongo_db):
+        self.mongo_uri = mongo_uri
+        self.mongo_db = mongo_db
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            mongo_uri=crawler.settings.get("MONGO_URI"),
+            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
+        )
+
+    def open_spider(self, spider):
+        self.client = pymongo.MongoClient(self.mongo_uri)
+        self.db = self.client[self.mongo_db]
+
+    def close_spider(self, spider):
+        self.client.close()
+
+    def process_item(self, item, spider):
+        try:
+            self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
+        except DuplicateKeyError as dup_err:
+            spider.logger.warning(dup_err)
+        except Exception:
+            raise
+        return item
+
diff --git a/douban_book/douban_book/selector_cfg/__init__.py b/douban_book/douban_book/selector_cfg/__init__.py
new file mode 100644
index 0000000..5654e2b
--- /dev/null
+++ b/douban_book/douban_book/selector_cfg/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/8/20 9:29
+# @Author : zhaoxiangpeng
+# @File : __init__.py
diff --git a/douban_book/douban_book/selector_cfg/book.py b/douban_book/douban_book/selector_cfg/book.py
new file mode 100644
index 0000000..6ea013b
--- /dev/null
+++ b/douban_book/douban_book/selector_cfg/book.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/8/20 9:30
+# @Author : zhaoxiangpeng
+# @File : book.py
+
+# 书名
+SHUOMING_XPATH = book_name_xpath = '//*[@id="wrapper"]/h1/span/text()'
+# 封面
+FENGMIAN_XPATH = book_cover_xpath = '//*[@id="mainpic"]/a/@href'
+# 作者[多个]
+ZUOZHE_XPATH = author_xpath = '//span[contains(text(), "作者")]/following-sibling::a/text()'
+# 出版社
+CHUBANSHE_XPATH = press_xpath = '//span[contains(text(), "出版社")]/following-sibling::a[1]/text()'
+# 出品方[多个]
+CHUPINFANG_XPATH = producer_xpath = '//span[contains(text(), "出品方")]/following-sibling::a/text()'
+# 副标题
+FUBIAOTI_XPATH = title_subhead_xpath = '//span[contains(text(), "副标题")]/following-sibling::text()[1]'
+# 原作名
+YUANZUOMING_XPATH = book_name_original_xpath = '//span[contains(text(), "原作名")]/following-sibling::text()[1]'
+# 译者[多个]
+YIZHE_XPATH = translator_xpath = '//span[contains(text(), "译者")]/following-sibling::a/text()'
+# 出版年
+CHUBANNIAN_XPATH = publish_year_xpath = '//span[contains(text(), "出版年")]/following-sibling::text()[1]'
+# 页数
+YESHU_XPATH = pages_xpath = '//span[contains(text(), "页数")]/following-sibling::text()[1]'
+# 定价
+DINGJIA_XPATH = price_xpath = '//span[contains(text(), "定价")]/following-sibling::text()[1]'
+# 装帧
+ZHUANGZHEN_XPATH = binding_xpath = '//span[contains(text(), "装帧")]/following-sibling::text()[1]'
+# 丛书
+CONGSHU_XPATH = series_xpath = '//span[contains(text(), "丛书")]/following-sibling::a[1]/text()'
+# 评分
+PINGFEN_XPATH = rating_nums_xpath = '//*[@id="interest_sectl"]/div/div[2]/strong/text()'
+# ISBN
+ISBN_XPATH = isbn_xpath = '//span[contains(text(), "ISBN")]/following-sibling::text()[1]'
+# 内容简介
+NEIRONGJIANJIE_XPATH = book_summery_xpath = '//span[contains(text(), "内容简介")]/parent::*/following-sibling::div[1]/span[2]//text() | //*[@class="all hidden"]/div/div//text() | //*[@id="content"]/div/div[1]/div[3]/div[1]/div/div//text()'
+# 目录
+MULU_XPATH = book_catalog_xpath = '//span[contains(text(), "目录")]/parent::*/following-sibling::div[2]//text() | //*[@class="related_info"]/div[5]/text()'
+# 作者简介
+ZUOZHEJIANJIE_XPATH = author_summery_xpath = '//span[contains(text(), "作者简介")]/parent::*/following-sibling::div[1]//text()'
+
+
+
+
+
+
+# XPATH = '//span[contains(text(), "作者")]/following-sibling::a/text()' # XPATH[多个] (template, unused)
+# XPATH = '//span[contains(text(), "作者")]/following-sibling::a/text()' # XPATH[多个] (template, unused)
+# XPATH = '//span[contains(text(), "作者")]/following-sibling::a/text()' # XPATH[多个] (template, unused)
+# XPATH = '//span[contains(text(), "作者")]/following-sibling::a/text()' # XPATH[多个] (template, unused)
diff --git a/douban_book/douban_book/settings.py b/douban_book/douban_book/settings.py
index bf07438..97b5714 100644
--- a/douban_book/douban_book/settings.py
+++ b/douban_book/douban_book/settings.py
@@ -14,75 +14,85 @@ NEWSPIDER_MODULE = "douban_book.spiders"
 
 ADDONS = {}
 
-
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = "douban_book (+http://www.yourdomain.com)"
+# USER_AGENT = "douban_book (+http://www.yourdomain.com)"
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Concurrency and throttling settings
-#CONCURRENT_REQUESTS = 16
+# CONCURRENT_REQUESTS = 16
 CONCURRENT_REQUESTS_PER_DOMAIN = 1
-DOWNLOAD_DELAY = 1
+DOWNLOAD_DELAY = 10
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
 DEFAULT_REQUEST_HEADERS = {
-  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-  "Accept-Language": "en",
-  "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
 }
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    "douban_book.middlewares.DoubanBookSpiderMiddleware": 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
-#    "douban_book.middlewares.DoubanBookDownloaderMiddleware": 543,
-#}
+RETRY_ENABLED = True
+RETRY_TIMES = 2 # 在初次请求之外最多重试2次
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400] # 增加了一些常见的错误码
+
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
+    # "douban_book.middlewares.DoubanBookDownloaderMiddleware": 543,
+}
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    "scrapy.extensions.telnet.TelnetConsole": None,
-#}
+# }
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    "douban_book.pipelines.DoubanBookPipeline": 300,
-#}
+ITEM_PIPELINES = {
+    "douban_book.pipelines.DoubanBookPipeline": 300,
+    "douban_book.pipelines.DoubanBookInfoStandard": 400,
+    "douban_book.pipelines.MongoPipeline": 410,
+}
+MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
+MONGO_DATABASE = "science2"
+
+REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = "httpcache"
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
 
 # Set settings whose default value is deprecated to a future-proof value
 FEED_EXPORT_ENCODING = "utf-8"
diff --git a/douban_book/douban_book/spiders/douban_top250.py b/douban_book/douban_book/spiders/douban_top250.py
index 95bcae3..a1ed5a8 100644
--- a/douban_book/douban_book/spiders/douban_top250.py
+++ b/douban_book/douban_book/spiders/douban_top250.py
@@ -1,4 +1,8 @@
 import scrapy
+from douban_book.items import DoubanBookTop250Item
+
+from douban_book.selector_cfg import book
+# from douban_book.douban_book.selector_cfg import book
 
 
 class DoubanTop250Spider(scrapy.Spider):
@@ -7,4 +7,68 @@ class DoubanTop250Spider(scrapy.Spider):
     start_urls = ["https://book.douban.com/top250?start=0"]
 
     def parse(self, response):
-        pass
+        start = response.meta.get('start', 0)
+        nodes = response.xpath('//div[@class="article"]/div/table//tr[@class="item"]')
+        for node in nodes:
+            cover_link = node.xpath('./td[1]/a/img/@src').get() # 封面链接
+            book_name = node.xpath('./td[2]/div[1]/a/@title').get() # 书名
+            book_link = node.xpath('./td[2]/div[1]/a/@href').get() # 图书链接
+            rating_nums = node.xpath('./td[2]/div[2]/span[@class="rating_nums"]/text()').get() # 评分
+            comment = node.xpath('./td[2]/p[@class="quote"]/span/text()').get() # 一言
+            start += 1
+            yield response.follow(book_link, callback=self.parse_detail,
+                                  # 携带列表页部分信息
+                                  meta=dict(
+                                      item=dict(book_name=book_name, book_link=book_link, cover_link=cover_link, ranking=start, rating_nums=rating_nums, comment=comment),
+                                  ))
+        next_page = response.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href').get() # 下一页
+        if next_page:
+            yield scrapy.Request(next_page, meta=dict(start=start))
+
+    def parse_detail(self, response):
+        req_meta = response.meta
+        meta_item = req_meta.get('item')
+        book_name = response.xpath(book.SHUOMING_XPATH)
+        author = response.xpath(book.ZUOZHE_XPATH)
+        publisher = response.xpath(book.CHUBANSHE_XPATH)
+        pub_year = response.xpath(book.CHUBANNIAN_XPATH)
+        book_isbn = response.xpath(book.ISBN_XPATH)
+        book_summery = response.xpath(book.NEIRONGJIANJIE_XPATH)
+        producer = response.xpath(book.producer_xpath)
+        title_subhead = response.xpath(book.title_subhead_xpath)
+        book_name_original = response.xpath(book.book_name_original_xpath)
+        translator = response.xpath(book.translator_xpath)
+        pages = response.xpath(book.pages_xpath)
+        price = response.xpath(book.price_xpath)
+        binding = response.xpath(book.binding_xpath)
+        series = response.xpath(book.series_xpath)
+        rating_nums = response.xpath(book.rating_nums_xpath)
+        book_catalog = response.xpath(book.book_catalog_xpath)
+        author_summery = response.xpath(book.author_summery_xpath)
+
+        book_item = DoubanBookTop250Item()
+        book_item['book_name'] = book_name.get()
+        book_item['author_name'] = author.getall()
+        book_item['publisher'] = publisher.get()
+        book_item['producer'] = producer.getall()
+        book_item['title_subhead'] = title_subhead.get()
+        book_item['book_name_original'] = book_name_original.get()
+        book_item['translator'] = translator.getall()
+        book_item['pages'] = pages.get()
+        book_item['price'] = price.get()
+        book_item['binding'] = binding.get()
+        book_item['series'] = series.get()
+        book_item['rating_nums'] = meta_item.get('rating_nums')
+        book_item['comment'] = meta_item.get('comment')
+        book_item['book_catalog'] = book_catalog.getall()
+        book_item['author_summery'] = author_summery.getall()
+        book_item['pub_year'] = pub_year.get()
+        book_item['book_isbn'] = book_isbn.get()
+        book_item['book_summery'] = book_summery.getall()
+        book_item['rank_type'] = 'douban-top250'
+        book_item['rank_name'] = 'top250'
+        book_item['ranking'] = meta_item.get('ranking')
+        book_item['book_link'] = meta_item.get('book_link')
+        book_item['book_cover_link'] = meta_item.get('cover_link')
+        yield book_item
+
diff --git a/douban_book/run.py b/douban_book/run.py
index f19f63f..3c3b68b 100644
--- a/douban_book/run.py
+++ b/douban_book/run.py
@@ -2,3 +2,13 @@
 # @Time : 2025/8/19 17:13
 # @Author : zhaoxiangpeng
 # @File : run.py
+
+import os
+import re
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+process = CrawlerProcess(get_project_settings())
+
+process.crawl('douban_top250')
+process.start()