You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

280 lines
10 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from __future__ import annotations
import re
import json
import logging
from datetime import datetime
from typing import TYPE_CHECKING, Tuple, Union, Optional
import scrapy
from itemadapter import ItemAdapter
from kafka import KafkaProducer
from pymongo import MongoClient
from pymongo.errors import (
DuplicateKeyError,
BulkWriteError
)
from science_article_cssci.db_utils.mongo import MongoDBUtils, build_update_query
from science_article_cssci.db_utils.kafka import KafkaUtil
from science_article_cssci.scripts.field_assembly import FieldAssembly
if TYPE_CHECKING:
from scrapy.crawler import Crawler
from scrapy.statscollectors import StatsCollector
from pymongo.collection import Collection
# Quiet the chatty third-party driver loggers: only warnings and above
# from pymongo / kafka-python reach the crawl log.
mongo_logger = logging.getLogger('pymongo')
mongo_logger.setLevel(logging.WARNING)
logging.getLogger('kafka').setLevel(logging.WARNING)
# Module-level logger for this pipeline module.
logger = logging.getLogger(__name__)
class ScienceArticleCssciPipeline:
    """Default no-op pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        # Placeholder hook required by Scrapy's item-pipeline interface.
        return item
class MongoPipeline(MongoDBUtils):
    """Persist scraped items into MongoDB, one collection per item type.

    Counters are reported through Scrapy's ``StatsCollector`` under the
    ``item2db_inserted/<table>``, ``item2db_duplicate/<table>`` and
    ``item2db_updated/<table>`` keys so they show up in the crawl stats.
    """

    def __init__(self, mongo_uri, mongo_db, stats: StatsCollector):
        super().__init__(mongo_uri, mongo_db)
        self.stats: StatsCollector = stats
        # When an insert hits a duplicate key, fall back to updating the
        # existing document (used by process_item_update).
        self.insert_failure_update_enable = True
        # Replace duplicate documents wholesale instead of merging fields.
        self.duplicate_cover_enable = False

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Standard Scrapy factory: read connection settings off the crawler."""
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
            stats=crawler.stats,
        )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider) -> scrapy.Item:
        """Insert the item; duplicate keys are counted and silently dropped."""
        adapter = ItemAdapter(item)
        tablename = self._get_item_table(item)
        collection = self.db.get_collection(tablename)
        doc = adapter.asdict()
        try:
            collection.insert_one(doc)
            self.stats.inc_value("item2db_inserted/{}".format(tablename))
        except DuplicateKeyError:
            # Duplicates are expected on re-crawls; just count them.
            self.stats.inc_value("item2db_duplicate/{}".format(tablename))
            self.stats.inc_value("item_dropped_reasons_count/duplicate")
        return item

    def process_item_update(self, item, spider) -> scrapy.Item:
        """Insert the item; on a duplicate-key error, merge into the existing doc.

        The ``DuplicateKeyError.details`` payload carries the offending key
        pattern/values and the original document, which drive the update.
        """
        adapter = ItemAdapter(item)
        tablename = self._get_item_table(item)
        collection = self.db.get_collection(tablename)
        d = adapter.asdict()
        try:
            collection.insert_one(d)
            self.stats.inc_value("item2db_inserted/{}".format(tablename))
        except DuplicateKeyError as duplicate_error:
            if self.insert_failure_update_enable:
                write_error = duplicate_error.details
                filter_query, update_query = self._pick_filter_update(write_error, doc=d)
                # Defer the volatile updated_at so it cannot disturb the main
                # update; fields other than task_id do not need this handling.
                updated_at_query = None
                key_pattern = write_error.get('keyPattern')
                key_value = write_error.get('keyValue')
                logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value)
                # Special handling for incremental-crawl tasks.
                task_ids = update_query.pop("task_ids", None)
                if task_ids:
                    # task_id always changes, so merge it first.
                    task_id_query = {'task_ids': task_ids}
                    collection.update_one(filter=filter_query, update=build_update_query(task_id_query, replace=False))
                    updated_at_query = {"updated_at": update_query.pop('updated_at', None)}
                update_q = build_update_query(update_query, replace=self.duplicate_cover_enable)
                # NOTE(review): the filter here is the raw keyValue from the
                # error, not the filter_query built above — confirm intentional.
                up_result = collection.update_one(filter=key_value, update=update_q, upsert=True)
                if up_result.matched_count == up_result.modified_count == 1:
                    # Stamp updated_at on modification so it is possible to
                    # tell when this record was last touched.
                    if updated_at_query:
                        collection.update_one(filter=key_value, update={"$set": updated_at_query})
                    self.stats.inc_value("item2db_updated/{}".format(tablename))
        return item

    @staticmethod
    def _pick_filter_update(write_error, doc: dict = None):
        """Split a failed insert into a ``(filter, update)`` pair.

        ``write_error['op']`` (falling back to *doc*) is the document whose
        insert failed; the unique-index keys from ``keyPattern`` become the
        filter and every remaining truthy field becomes the update payload.
        """
        original_doc = write_error.get('op', doc)  # the document that failed to insert
        key_pattern = write_error.get('keyPattern')
        original_doc.pop("_id", None)  # drop the _id generated by the failed insert
        filter_query = {}
        # Falsy values are dropped so they do not clobber existing data.
        update_query = {key: val for key, val in original_doc.items() if val}
        for key in key_pattern.keys():
            filter_query.update({key: update_query.pop(key, None)})
        return filter_query, update_query

    def close_spider(self, spider):
        self.client.close()

    @staticmethod
    def _get_item_table(item) -> str:
        """Return the collection name declared on the item class.

        Falls back to ``items_null_table`` for items without ``__tablename__``.
        """
        if hasattr(item, '__tablename__'):
            return item.__class__.__tablename__
        return 'items_null_table'
class Mongo2SciencePipeline(MongoPipeline):
    """Transform raw CSSCI responses into the "science" document layout.

    NOTE(review): ``process_item`` returns the transformed dict rather than
    the original item, so downstream pipelines receive a plain dict.
    """

    def process_item(self, item, spider):
        # The original implementation also fetched the target collection here
        # but never wrote to it; that dead lookup has been removed.
        return self.parse2science(item)

    @staticmethod
    def parse2science(item) -> dict:
        """Map one crawled CSSCI record onto the science-article schema.

        Reads ``third_id`` and the JSON string ``resp_raw`` off the item and
        returns a dict keyed by the CSSCI field codes (sno, lypm, zzjg, ...).
        """

        def change_qi(tmp_str):
            # Strip a leading/trailing zero from the issue number, mirroring
            # the site's JS ``.replace(/^0|0$/g,'')``.
            # NOTE(review): currently unused — ``qi`` below is passed through
            # unchanged; confirm whether it should be wrapped with this.
            return re.sub(r'^0|0$', '', tmp_str)

        def change_string(tmp_str):
            # Keywords arrive "aaa"-delimited; keep fragments longer than one char.
            return [part for part in tmp_str.split("aaa") if len(part) > 1]

        adapter = ItemAdapter(item)
        third_id = adapter['third_id']
        resp_raw = json.loads(adapter['resp_raw'])
        # author / school / department triples
        authors = [
            dict(zzmc=au_info['zzmc'], jgmc=au_info['jgmc'], bmmc=au_info['bmmc'])
            for au_info in (resp_raw['author'] or [])
        ]
        catations: list = resp_raw.get('catation')
        contents: list = resp_raw.get('contents', [])
        # Pick the content entry whose serial number matches this item.
        body = [c for c in contents if c.get("sno") == third_id]
        content = body[0] if body else {}
        # NOTE(review): the direct ``content['...']`` lookups below raise
        # KeyError when no matching content entry exists — confirm upstream
        # guarantees a match.
        return dict(
            sno=third_id,
            lypm=content['lypm'],  # title
            blpm=content['blpm'],  # English title
            zzjg=authors,  # authors and affiliations
            wzlx=content['wzlx'],  # document type
            xkfl1=content['xkfl1'],  # subject category 1
            xkfl2=content.get('xkfl2', ''),  # subject category 2
            xkdm1=content['xkdm1'],  # CLC classification code 1
            xkdm2=content.get('xkdm2', ''),  # CLC classification code 2
            xmlb=content.get('xmlb', ''),  # funding project
            qkmc=content.get('qkmc', ''),  # source journal
            # display format: (nian) year (juan) vol., issue (qi).replace(/^0|0$/g,'') : (ym)
            nian=content.get('nian', ''),  # year
            juan=content.get('juan', ''),  # volume
            qi=content.get('qi', ''),  # issue
            ym=content.get('ym', ''),  # page numbers
            byc=change_string(content.get('byc', '')),  # keywords
            ckwx=catations,  # cited references
        )
class BuildDetailPipeline:
    """Attach an assembled ``detailed`` payload to every passing item."""

    def process_item(self, item, spider):
        # Build the detail structure from the raw response and stash it on
        # the item for downstream pipelines.
        item['detailed'] = self.build_detailed(ItemAdapter(item))
        return item

    @staticmethod
    def build_detailed(item):
        """Build the detail dict from the item's raw response payload."""
        raw = item.get("resp_raw")
        parsed = FieldAssembly.parse_detail(
            content=raw.get('content'),
            author=raw.get('author'),
            catation=raw.get('catation'),
        )
        return dict(**parsed)
class KafkaPipeline:
    """Publish items to a Kafka topic as gzip-compressed JSON messages."""

    def __init__(self, kafka_servers, topic):
        self.kafka_servers = kafka_servers
        self.topic = topic
        self.producer: KafkaProducer | None = None

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy factory: read broker/topic settings off the crawler."""
        return cls(
            kafka_servers=crawler.settings.get('KAFKA_SERVERS', 'localhost:9092'),
            topic=crawler.settings.get('KAFKA_TOPIC', 'scrapy_items'),
        )

    def open_spider(self, spider):
        self.producer = KafkaProducer(
            bootstrap_servers=self.kafka_servers,
            value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
            acks='all',  # wait for full ISR acknowledgement
            linger_ms=50,  # batch sends for up to 50 ms before shipping a packet
            compression_type='gzip',
        )
        spider.logger.info(f"Connected to Kafka at {self.kafka_servers}")

    def close_spider(self, spider):
        if self.producer:
            self.producer.flush()
            self.producer.close()
            spider.logger.info("Kafka connection closed")

    def process_item(self, item, spider):
        """Serialize the item and send it asynchronously to Kafka."""
        payload = self.build2kafka(ItemAdapter(item).asdict())
        future = self.producer.send(
            topic=self.topic,
            value=payload,
            headers=[('source_type', b'cssci')],
        )
        future.add_callback(self.on_send_success)
        # Bug fix: on_send_success was registered twice via add_callback and
        # on_send_error was never wired up; failures go through add_errback.
        future.add_errback(self.on_send_error)
        return item

    def on_send_success(self, record_metadata):
        """Delivery-success callback: nothing to do, sends are fire-and-forget."""

    def on_send_error(self, excp):
        """Delivery-failure callback: surface the error instead of dropping it."""
        logger.error("Kafka send failed: %s", excp)

    def build2kafka(self, item: dict) -> dict:
        """Shape the flat item dict into the downstream Kafka message format.

        Flattens ``detailed`` into the top level, drops the bulky
        ``references`` field and stamps the current wall-clock time.
        """
        dd = dict(
            id=item.get("third_id"),
            school_id="999",  # NOTE(review): hard-coded tenant/school id — confirm
            **item.get('detailed'),
            # Bug fix: this was a hard-coded literal timestamp left over from
            # testing; stamp the actual time instead.
            updated_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        )
        dd.pop("references", None)
        return dd