add: 豆瓣图书top250

main
zhaoxiangpeng 1 week ago
parent 6e21275234
commit 601cc86af8

@ -16,6 +16,18 @@ class DoubanBookTop250Item(scrapy.Item):
# Item fields scraped for each Douban Top-250 book.
# NOTE(review): 'summery' is an existing misspelling of 'summary'; the names
# are kept as-is because the spider and pipelines reference them literally.
book_name = scrapy.Field() # book title
author_name = scrapy.Field() # authors
publisher = scrapy.Field() # publisher
producer = scrapy.Field() # producer / imprint
title_subhead = scrapy.Field() # subtitle
book_name_original = scrapy.Field() # original (untranslated) title
translator = scrapy.Field() # translators
pages = scrapy.Field() # page count
price = scrapy.Field() # list price
binding = scrapy.Field() # binding
series = scrapy.Field() # book series
rating_nums = scrapy.Field() # rating score
comment = scrapy.Field() # one-line quote
book_catalog = scrapy.Field() # table of contents
author_summery = scrapy.Field() # author bio
pub_year = scrapy.Field() # publication year
book_isbn = scrapy.Field() # ISBN
book_summery = scrapy.Field() # book summary

@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
# @Time : 2025/8/20 15:05
# @Author : zhaoxiangpeng
# @File : main.py
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Guard so importing this module does not launch a crawl as a side effect.
    # Runs the douban_top250 spider and exports scraped items to CSV;
    # execute() does not return (it calls sys.exit when the command finishes).
    execute('scrapy crawl douban_top250 -o data_douban_top250_3.csv'.split())

@ -2,12 +2,94 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from typing import Union
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymongo
from pymongo.errors import DuplicateKeyError
class DoubanBookPipeline:
    """Default no-op pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        # Nothing to transform here; hand the item to the next pipeline stage.
        return item
def standard_list_or_str(list_or_str: Union[list, str, int]) -> Union[str, int, None]:
    """Normalize a scraped value into a flat string (ints pass through).

    Behavior:
      - falsy input (None, '', 0, []) -> None
      - int -> returned unchanged
      - str -> stripped, with Douban's embedded CSS artifact removed
      - list -> each entry stripped/cleaned, empties dropped, joined with '; '
      - any other type -> returned unchanged
    """
    # CSS fragment that Douban leaks into the "intro" text nodes.
    css_junk = '.intro p{text-indent:2em;word-break:normal;}'
    if not list_or_str:
        return None
    if isinstance(list_or_str, int):
        return list_or_str
    if isinstance(list_or_str, str):
        # Fix: apply the same CSS-artifact cleanup the list branch already had;
        # previously plain strings kept the junk fragment.
        return list_or_str.strip().replace(css_junk, '').strip()
    if isinstance(list_or_str, list):
        cleaned = []
        for text in list_or_str:
            # Same order as before: strip first, then drop the CSS fragment.
            text = text.strip().replace(css_junk, '')
            if text:
                cleaned.append(text)
        return '; '.join(cleaned)
    return list_or_str
class DoubanBookInfoStandard:
    """Normalize every populated field of an item via standard_list_or_str."""

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        # Snapshot the key set before writing back; only existing keys are
        # touched, so the result is identical to iterating the live view.
        for field_name in list(adapter.keys()):
            item[field_name] = standard_list_or_str(adapter.get(field_name))
        return item
class ToCSVPipeline:
    """CSV export pipeline skeleton — row writing is not implemented yet."""

    def open_spider(self, spider):
        print('爬虫开始')
        # Append mode so repeated runs accumulate into the same file.
        self.f = open('data_douban_top250.csv', 'a', encoding='utf-8')
        # Fix: the original called self.f.write() with no argument, which
        # raised TypeError as soon as the spider opened.

    def close_spider(self, spider):
        if self.f:
            self.f.close()
        print('爬虫结束')

    def process_item(self, item, spider):
        # Fix: return the item so later pipeline stages still receive it
        # (returning None silently dropped every item). Writing the item as a
        # CSV row is still TODO.
        return item
class MongoPipeline:
    """Persist each scraped item into a MongoDB collection."""

    collection_name = "data_douban_top250"

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection settings from the project settings (settings.py).
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Duplicate-key inserts (a unique index on the collection is assumed —
        # TODO confirm) are logged and skipped; any other error propagates.
        # Fix: removed the redundant `except Exception: raise` clause, which
        # re-raised unchanged and had no effect.
        try:
            self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        except DuplicateKeyError as dup_err:
            spider.logger.warning(dup_err)
        return item

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2025/8/20 9:29
# @Author : zhaoxiangpeng
# @File : __init__.py

@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
# @Time : 2025/8/20 9:30
# @Author : zhaoxiangpeng
# @File : book.py
# XPath selectors for a Douban book detail page.
# Every selector is exported under two names (pinyin alias = descriptive name);
# other modules reference both spellings, so keep the aliases intact.

# Book title (NOTE(review): the pinyin alias "SHUOMING" means "description",
# but the selector targets the page title — name kept for compatibility)
SHUOMING_XPATH = book_name_xpath = '//*[@id="wrapper"]/h1/span/text()'
# Cover image link
FENGMIAN_XPATH = book_cover_xpath = '//*[@id="mainpic"]/a/@href'
# Authors [multiple]
ZUOZHE_XPATH = author_xpath = '//span[contains(text(), "作者")]/following-sibling::a/text()'
# Publisher
CHUBANSHE_XPATH = press_xpath = '//span[contains(text(), "出版社")]/following-sibling::a[1]/text()'
# Producer / imprint [multiple]
# Fix: the original matched the "作者" (author) label and used
# "following-sibling/a" — without "::" XPath parses that as a child element
# literally named "following-sibling", so the selector never matched anything.
CHUPINFANG_XPATH = producer_xpath = '//span[contains(text(), "出品方")]/following-sibling::a/text()'
# Subtitle
FUBIAOTI_XPATH = title_subhead_xpath = '//span[contains(text(), "副标题")]/following-sibling::text()[1]'
# Original (untranslated) title
YUANZUOMING_XPATH = book_name_original_xpath = '//span[contains(text(), "原作名")]/following-sibling::text()[1]'
# Translators [multiple]
YIZHE_XPATH = translator_xpath = '//span[contains(text(), "译者")]/following-sibling::a/text()'
# Publication year
CHUBANNIAN_XPATH = publish_year_xpath = '//span[contains(text(), "出版年")]/following-sibling::text()[1]'
# Page count
YESHU_XPATH = pages_xpath = '//span[contains(text(), "页数")]/following-sibling::text()[1]'
# List price
DINGJIA_XPATH = price_xpath = '//span[contains(text(), "定价")]/following-sibling::text()[1]'
# Binding
ZHUANGZHEN_XPATH = binding_xpath = '//span[contains(text(), "装帧")]/following-sibling::text()[1]'
# Series
CONGSHU_XPATH = series_xpath = '//span[contains(text(), "丛书")]/following-sibling::a[1]/text()'
# Rating score
PINGFEN_XPATH = rating_nums_xpath = '//*[@id="interest_sectl"]/div/div[2]/strong/text()'
# ISBN
ISBN_XPATH = isbn_xpath = '//span[contains(text(), "ISBN")]/following-sibling::text()[1]'
# Book summary (three alternative page layouts, OR-ed together)
NEIRONGJIANJIE_XPATH = book_summery_xpath = '//span[contains(text(), "内容简介")]/parent::*/following-sibling::div[1]/span[2]//text() | //*[@class="all hidden"]/div/div//text() | //*[@id="content"]/div/div[1]/div[3]/div[1]/div/div//text()'
# Table of contents
MULU_XPATH = book_catalog_xpath = '//span[contains(text(), "目录")]/parent::*/following-sibling::div[2]//text() | //*[@class="related_info"]/div[5]/text()'
# Author bio
ZUOZHEJIANJIE_XPATH = author_summery_xpath = '//span[contains(text(), "作者简介")]/parent::*/following-sibling::div[1]//text()'
# Fix: removed four leftover copy-paste template lines that rebound a junk
# module-level name `XPATH` four times with an (also broken) selector.

@ -14,17 +14,16 @@ NEWSPIDER_MODULE = "douban_book.spiders"
ADDONS = {}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "douban_book (+http://www.yourdomain.com)"

# Obey robots.txt rules
# Fix: removed the stale duplicate `ROBOTSTXT_OBEY = True` line that preceded
# this one; only the final assignment ever took effect.
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
# CONCURRENT_REQUESTS = 16
# Keep the crawl polite: one request at a time with a long delay.
# Fix: removed the stale duplicate `DOWNLOAD_DELAY = 1` line.
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 10

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
@ -47,9 +46,14 @@ DEFAULT_REQUEST_HEADERS = {
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
RETRY_ENABLED = True
RETRY_TIMES = 2  # retry failed requests up to 2 extra times (comment previously said 3)
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400]  # includes a few common transient error codes
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
    # "douban_book.middlewares.DoubanBookDownloaderMiddleware": 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
@ -59,9 +63,15 @@ DEFAULT_REQUEST_HEADERS = {
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Order: pass-through (300) -> field normalization (400) -> MongoDB storage (410).
ITEM_PIPELINES = {
    "douban_book.pipelines.DoubanBookPipeline": 300,
    "douban_book.pipelines.DoubanBookInfoStandard": 400,
    "douban_book.pipelines.MongoPipeline": 410,
}

# SECURITY NOTE(review): credentials are hard-coded below; move them to
# environment variables or an untracked local settings file before sharing
# or publishing this repository.
MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
MONGO_DATABASE = "science2"
REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html

@ -1,4 +1,8 @@
import scrapy
from douban_book.items import DoubanBookTop250Item
from douban_book.selector_cfg import book
# from douban_book.douban_book.selector_cfg import book
class DoubanTop250Spider(scrapy.Spider):
@ -7,4 +11,68 @@ class DoubanTop250Spider(scrapy.Spider):
start_urls = ["https://book.douban.com/top250?start=0"]
def parse(self, response):
    """Parse one Top-250 list page: yield a detail request per book, then follow pagination.

    Fix: removed the stray `pass` statement that sat before the real body.
    """
    # Running rank counter, carried across pages via request meta.
    start = response.meta.get('start', 0)
    nodes = response.xpath('//div[@class="article"]/div/table//tr[@class="item"]')
    for node in nodes:
        cover_link = node.xpath('./td[1]/a/img/@src').get()  # cover image URL
        book_name = node.xpath('./td[2]/div[1]/a/@title').get()  # title
        book_link = node.xpath('./td[2]/div[1]/a/@href').get()  # detail-page URL
        rating_nums = node.xpath('./td[2]/div[2]/span[@class="rating_nums"]/text()').get()  # rating
        comment = node.xpath('./td[2]/p[@class="quote"]/span/text()').get()  # one-line quote
        start += 1
        yield response.follow(
            book_link,
            callback=self.parse_detail,
            # Carry the list-page fields into the detail parse.
            meta=dict(
                item=dict(
                    book_name=book_name,
                    book_link=book_link,
                    cover_link=cover_link,
                    ranking=start,
                    rating_nums=rating_nums,
                    comment=comment,
                ),
            ),
        )
    # Follow the "next page" link, propagating the rank counter.
    next_page = response.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href').get()
    if next_page:
        yield scrapy.Request(next_page, meta=dict(start=start))
def parse_detail(self, response):
    """Parse a book detail page, merge in list-page meta, and yield the finished item.

    Fix: the detail-page rating extraction (book.rating_nums_xpath) was a dead
    local — the meta value always won — so it has been removed.
    """
    meta_item = response.meta.get('item')
    book_item = DoubanBookTop250Item()

    # Fields where only the first matching text node is wanted (.get()).
    single_valued = {
        'book_name': book.SHUOMING_XPATH,
        'publisher': book.CHUBANSHE_XPATH,
        'title_subhead': book.title_subhead_xpath,
        'book_name_original': book.book_name_original_xpath,
        'pages': book.pages_xpath,
        'price': book.price_xpath,
        'binding': book.binding_xpath,
        'series': book.series_xpath,
        'pub_year': book.CHUBANNIAN_XPATH,
        'book_isbn': book.ISBN_XPATH,
    }
    # Fields where every matching text node is kept (.getall()).
    multi_valued = {
        'author_name': book.ZUOZHE_XPATH,
        'producer': book.producer_xpath,
        'translator': book.translator_xpath,
        'book_catalog': book.book_catalog_xpath,
        'author_summery': book.author_summery_xpath,
        'book_summery': book.NEIRONGJIANJIE_XPATH,
    }
    for field, xpath in single_valued.items():
        book_item[field] = response.xpath(xpath).get()
    for field, xpath in multi_valued.items():
        book_item[field] = response.xpath(xpath).getall()

    # List-page data carried over via request meta.
    book_item['rating_nums'] = meta_item.get('rating_nums')
    book_item['comment'] = meta_item.get('comment')
    book_item['ranking'] = meta_item.get('ranking')
    book_item['book_link'] = meta_item.get('book_link')
    book_item['book_cover_link'] = meta_item.get('cover_link')

    # Constant ranking metadata for this spider.
    book_item['rank_type'] = 'douban-top250'
    book_item['rank_name'] = 'top250'
    yield book_item

@ -2,3 +2,13 @@
# @Time : 2025/8/19 17:13
# @Author : zhaoxiangpeng
# @File : run.py
import os  # NOTE(review): unused here, kept in case another part of the file relies on it
import re  # NOTE(review): unused here, kept in case another part of the file relies on it
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    """Run the douban_top250 spider in-process using the project settings."""
    process = CrawlerProcess(get_project_settings())
    process.crawl('douban_top250')
    process.start()  # blocks until the crawl finishes


if __name__ == '__main__':
    # Guard so importing this module does not launch a crawl as a side effect.
    main()

Loading…
Cancel
Save