add: 豆瓣图书top250

main
zhaoxiangpeng 1 week ago
parent 6e21275234
commit 601cc86af8

@ -16,6 +16,18 @@ class DoubanBookTop250Item(scrapy.Item):
# Item fields scraped for each Douban Top-250 book.
# NOTE(review): 'summery' is an existing misspelling of 'summary'; the names
# are kept as-is because the spider and pipelines reference them literally.
book_name = scrapy.Field() # book title
author_name = scrapy.Field() # authors
publisher = scrapy.Field() # publisher
producer = scrapy.Field() # producer / imprint
title_subhead = scrapy.Field() # subtitle
book_name_original = scrapy.Field() # original (untranslated) title
translator = scrapy.Field() # translators
pages = scrapy.Field() # page count
price = scrapy.Field() # list price
binding = scrapy.Field() # binding
series = scrapy.Field() # book series
rating_nums = scrapy.Field() # rating score
comment = scrapy.Field() # one-line quote
book_catalog = scrapy.Field() # table of contents
author_summery = scrapy.Field() # author bio
pub_year = scrapy.Field() # publication year
book_isbn = scrapy.Field() # ISBN
book_summery = scrapy.Field() # book summary

@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
# @Time : 2025/8/20 15:05
# @Author : zhaoxiangpeng
# @File : main.py
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Guard so importing this module does not launch a crawl as a side effect.
    # Runs the douban_top250 spider and exports scraped items to CSV;
    # execute() does not return (it calls sys.exit when the command finishes).
    execute('scrapy crawl douban_top250 -o data_douban_top250_3.csv'.split())

@ -2,12 +2,94 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from typing import Union
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymongo
from pymongo.errors import DuplicateKeyError
class DoubanBookPipeline:
    """Default no-op pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        # Nothing to transform here; hand the item to the next pipeline stage.
        return item
def standard_list_or_str(list_or_str: Union[list, str, int]) -> Union[str, int, None]:
    """Normalize a scraped value into a flat string (ints pass through).

    Behavior:
      - falsy input (None, '', 0, []) -> None
      - int -> returned unchanged
      - str -> stripped, with Douban's embedded CSS artifact removed
      - list -> each entry stripped/cleaned, empties dropped, joined with '; '
      - any other type -> returned unchanged
    """
    # CSS fragment that Douban leaks into the "intro" text nodes.
    css_junk = '.intro p{text-indent:2em;word-break:normal;}'
    if not list_or_str:
        return None
    if isinstance(list_or_str, int):
        return list_or_str
    if isinstance(list_or_str, str):
        # Fix: apply the same CSS-artifact cleanup the list branch already had;
        # previously plain strings kept the junk fragment.
        return list_or_str.strip().replace(css_junk, '').strip()
    if isinstance(list_or_str, list):
        cleaned = []
        for text in list_or_str:
            # Same order as before: strip first, then drop the CSS fragment.
            text = text.strip().replace(css_junk, '')
            if text:
                cleaned.append(text)
        return '; '.join(cleaned)
    return list_or_str
class DoubanBookInfoStandard:
    """Normalize every populated field of an item via standard_list_or_str."""

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        # Snapshot the key set before writing back; only existing keys are
        # touched, so the result is identical to iterating the live view.
        for field_name in list(adapter.keys()):
            item[field_name] = standard_list_or_str(adapter.get(field_name))
        return item
class ToCSVPipeline:
    """CSV export pipeline skeleton — row writing is not implemented yet."""

    def open_spider(self, spider):
        print('爬虫开始')
        # Append mode so repeated runs accumulate into the same file.
        self.f = open('data_douban_top250.csv', 'a', encoding='utf-8')
        # Fix: the original called self.f.write() with no argument, which
        # raised TypeError as soon as the spider opened.

    def close_spider(self, spider):
        if self.f:
            self.f.close()
        print('爬虫结束')

    def process_item(self, item, spider):
        # Fix: return the item so later pipeline stages still receive it
        # (returning None silently dropped every item). Writing the item as a
        # CSV row is still TODO.
        return item
class MongoPipeline:
    """Persist each scraped item into a MongoDB collection."""

    collection_name = "data_douban_top250"

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection settings from the project settings (settings.py).
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Duplicate-key inserts (a unique index on the collection is assumed —
        # TODO confirm) are logged and skipped; any other error propagates.
        # Fix: removed the redundant `except Exception: raise` clause, which
        # re-raised unchanged and had no effect.
        try:
            self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        except DuplicateKeyError as dup_err:
            spider.logger.warning(dup_err)
        return item

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2025/8/20 9:29
# @Author : zhaoxiangpeng
# @File : __init__.py

@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
# @Time : 2025/8/20 9:30
# @Author : zhaoxiangpeng
# @File : book.py
# XPath selectors for a Douban book detail page.
# Every selector is exported under two names (pinyin alias = descriptive name);
# other modules reference both spellings, so keep the aliases intact.

# Book title (NOTE(review): the pinyin alias "SHUOMING" means "description",
# but the selector targets the page title — name kept for compatibility)
SHUOMING_XPATH = book_name_xpath = '//*[@id="wrapper"]/h1/span/text()'
# Cover image link
FENGMIAN_XPATH = book_cover_xpath = '//*[@id="mainpic"]/a/@href'
# Authors [multiple]
ZUOZHE_XPATH = author_xpath = '//span[contains(text(), "作者")]/following-sibling::a/text()'
# Publisher
CHUBANSHE_XPATH = press_xpath = '//span[contains(text(), "出版社")]/following-sibling::a[1]/text()'
# Producer / imprint [multiple]
# Fix: the original matched the "作者" (author) label and used
# "following-sibling/a" — without "::" XPath parses that as a child element
# literally named "following-sibling", so the selector never matched anything.
CHUPINFANG_XPATH = producer_xpath = '//span[contains(text(), "出品方")]/following-sibling::a/text()'
# Subtitle
FUBIAOTI_XPATH = title_subhead_xpath = '//span[contains(text(), "副标题")]/following-sibling::text()[1]'
# Original (untranslated) title
YUANZUOMING_XPATH = book_name_original_xpath = '//span[contains(text(), "原作名")]/following-sibling::text()[1]'
# Translators [multiple]
YIZHE_XPATH = translator_xpath = '//span[contains(text(), "译者")]/following-sibling::a/text()'
# Publication year
CHUBANNIAN_XPATH = publish_year_xpath = '//span[contains(text(), "出版年")]/following-sibling::text()[1]'
# Page count
YESHU_XPATH = pages_xpath = '//span[contains(text(), "页数")]/following-sibling::text()[1]'
# List price
DINGJIA_XPATH = price_xpath = '//span[contains(text(), "定价")]/following-sibling::text()[1]'
# Binding
ZHUANGZHEN_XPATH = binding_xpath = '//span[contains(text(), "装帧")]/following-sibling::text()[1]'
# Series
CONGSHU_XPATH = series_xpath = '//span[contains(text(), "丛书")]/following-sibling::a[1]/text()'
# Rating score
PINGFEN_XPATH = rating_nums_xpath = '//*[@id="interest_sectl"]/div/div[2]/strong/text()'
# ISBN
ISBN_XPATH = isbn_xpath = '//span[contains(text(), "ISBN")]/following-sibling::text()[1]'
# Book summary (three alternative page layouts, OR-ed together)
NEIRONGJIANJIE_XPATH = book_summery_xpath = '//span[contains(text(), "内容简介")]/parent::*/following-sibling::div[1]/span[2]//text() | //*[@class="all hidden"]/div/div//text() | //*[@id="content"]/div/div[1]/div[3]/div[1]/div/div//text()'
# Table of contents
MULU_XPATH = book_catalog_xpath = '//span[contains(text(), "目录")]/parent::*/following-sibling::div[2]//text() | //*[@class="related_info"]/div[5]/text()'
# Author bio
ZUOZHEJIANJIE_XPATH = author_summery_xpath = '//span[contains(text(), "作者简介")]/parent::*/following-sibling::div[1]//text()'
# Fix: removed four leftover copy-paste template lines that rebound a junk
# module-level name `XPATH` four times with an (also broken) selector.

@ -14,17 +14,16 @@ NEWSPIDER_MODULE = "douban_book.spiders"
ADDONS = {}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "douban_book (+http://www.yourdomain.com)"

# Obey robots.txt rules
# Fix: removed the stale duplicate `ROBOTSTXT_OBEY = True` line that preceded
# this one; only the final assignment ever took effect.
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
# CONCURRENT_REQUESTS = 16
# Keep the crawl polite: one request at a time with a long delay.
# Fix: removed the stale duplicate `DOWNLOAD_DELAY = 1` line.
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 10

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
@ -47,9 +46,14 @@ DEFAULT_REQUEST_HEADERS = {
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
RETRY_ENABLED = True
RETRY_TIMES = 2  # retry failed requests up to 2 extra times (comment previously said 3)
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400]  # includes a few common transient error codes
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
    # "douban_book.middlewares.DoubanBookDownloaderMiddleware": 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
@ -59,9 +63,15 @@ DEFAULT_REQUEST_HEADERS = {
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Order: pass-through (300) -> field normalization (400) -> MongoDB storage (410).
ITEM_PIPELINES = {
    "douban_book.pipelines.DoubanBookPipeline": 300,
    "douban_book.pipelines.DoubanBookInfoStandard": 400,
    "douban_book.pipelines.MongoPipeline": 410,
}

# SECURITY NOTE(review): credentials are hard-coded below; move them to
# environment variables or an untracked local settings file before sharing
# or publishing this repository.
MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
MONGO_DATABASE = "science2"
REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html

@ -1,4 +1,8 @@
import scrapy
from douban_book.items import DoubanBookTop250Item
from douban_book.selector_cfg import book
# from douban_book.douban_book.selector_cfg import book
class DoubanTop250Spider(scrapy.Spider):
@ -7,4 +11,68 @@ class DoubanTop250Spider(scrapy.Spider):
start_urls = ["https://book.douban.com/top250?start=0"]
def parse(self, response):
    """Parse one Top-250 list page: yield a detail request per book, then follow pagination.

    Fix: removed the stray `pass` statement that sat before the real body.
    """
    # Running rank counter, carried across pages via request meta.
    start = response.meta.get('start', 0)
    nodes = response.xpath('//div[@class="article"]/div/table//tr[@class="item"]')
    for node in nodes:
        cover_link = node.xpath('./td[1]/a/img/@src').get()  # cover image URL
        book_name = node.xpath('./td[2]/div[1]/a/@title').get()  # title
        book_link = node.xpath('./td[2]/div[1]/a/@href').get()  # detail-page URL
        rating_nums = node.xpath('./td[2]/div[2]/span[@class="rating_nums"]/text()').get()  # rating
        comment = node.xpath('./td[2]/p[@class="quote"]/span/text()').get()  # one-line quote
        start += 1
        yield response.follow(
            book_link,
            callback=self.parse_detail,
            # Carry the list-page fields into the detail parse.
            meta=dict(
                item=dict(
                    book_name=book_name,
                    book_link=book_link,
                    cover_link=cover_link,
                    ranking=start,
                    rating_nums=rating_nums,
                    comment=comment,
                ),
            ),
        )
    # Follow the "next page" link, propagating the rank counter.
    next_page = response.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href').get()
    if next_page:
        yield scrapy.Request(next_page, meta=dict(start=start))
def parse_detail(self, response):
    """Parse a book detail page, merge in list-page meta, and yield the finished item.

    Fix: the detail-page rating extraction (book.rating_nums_xpath) was a dead
    local — the meta value always won — so it has been removed.
    """
    meta_item = response.meta.get('item')
    book_item = DoubanBookTop250Item()

    # Fields where only the first matching text node is wanted (.get()).
    single_valued = {
        'book_name': book.SHUOMING_XPATH,
        'publisher': book.CHUBANSHE_XPATH,
        'title_subhead': book.title_subhead_xpath,
        'book_name_original': book.book_name_original_xpath,
        'pages': book.pages_xpath,
        'price': book.price_xpath,
        'binding': book.binding_xpath,
        'series': book.series_xpath,
        'pub_year': book.CHUBANNIAN_XPATH,
        'book_isbn': book.ISBN_XPATH,
    }
    # Fields where every matching text node is kept (.getall()).
    multi_valued = {
        'author_name': book.ZUOZHE_XPATH,
        'producer': book.producer_xpath,
        'translator': book.translator_xpath,
        'book_catalog': book.book_catalog_xpath,
        'author_summery': book.author_summery_xpath,
        'book_summery': book.NEIRONGJIANJIE_XPATH,
    }
    for field, xpath in single_valued.items():
        book_item[field] = response.xpath(xpath).get()
    for field, xpath in multi_valued.items():
        book_item[field] = response.xpath(xpath).getall()

    # List-page data carried over via request meta.
    book_item['rating_nums'] = meta_item.get('rating_nums')
    book_item['comment'] = meta_item.get('comment')
    book_item['ranking'] = meta_item.get('ranking')
    book_item['book_link'] = meta_item.get('book_link')
    book_item['book_cover_link'] = meta_item.get('cover_link')

    # Constant ranking metadata for this spider.
    book_item['rank_type'] = 'douban-top250'
    book_item['rank_name'] = 'top250'
    yield book_item

@ -2,3 +2,13 @@
# @Time : 2025/8/19 17:13
# @Author : zhaoxiangpeng
# @File : run.py
import os  # NOTE(review): unused here, kept in case another part of the file relies on it
import re  # NOTE(review): unused here, kept in case another part of the file relies on it
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    """Run the douban_top250 spider in-process using the project settings."""
    process = CrawlerProcess(get_project_settings())
    process.crawl('douban_top250')
    process.start()  # blocks until the crawl finishes


if __name__ == '__main__':
    # Guard so importing this module does not launch a crawl as a side effect.
    main()

Loading…
Cancel
Save