add:校验下载的数据的完整性

main
zhaoxiangpeng 1 month ago
parent a86cbc9952
commit 6d4b0a7dd9

@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# @Time : 2025/12/2 13:34
# @Author : zhaoxiangpeng
# @File : verify_data.py
import logging
from itemadapter import ItemAdapter
from pymongo import MongoClient
from science_article_add.items import ArticleItem
class VerifyDataIntegrity:
def __init__(self, mongo_uri, mongo_db):
self.successful_delete = False
self.batch_ids = set()
self.successful = []
self.logger = logging.getLogger(__name__)
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
self.client: MongoClient = None
self.db = None
@classmethod
def from_crawler(cls, crawler):
settings = crawler.settings
c = cls(
mongo_uri=crawler.settings.get("MONGO_URI"),
mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
)
return c
def init_db(self):
self.client = MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def open_spider(self, spider):
spider_batch_ids = spider.get_batch_ids()
for batch in spider_batch_ids:
if batch.get("field", "UT") == "UT":
self.batch_ids.add(batch.get("third_id"))
self.init_db()
def process_item(self, item, spider):
adapter = ItemAdapter(item)
if isinstance(item, ArticleItem):
unique_id = adapter.get("third_id")
self.successful.append(unique_id)
if self.successful_delete:
self.batch_ids.discard(unique_id)
return item
def close_spider(self, spider):
failure = self.batch_ids - set(self.successful)
coll = self.db.get_collection("todo_ids_wos")
if self.successful:
if self.successful_delete:
coll.delete_many(filter={"third_id": {"$in": self.successful}})
self.logger.info("Successfully deleted %d articles", len(self.successful))
else:
coll.update_many(filter={"third_id": {"$in": self.successful}}, update={"$set": {"state": 1}})
self.logger.info("Successfully updated %d articles", len(self.successful))
if failure:
self.logger.warning("未下载到: %s" % list(failure))
coll.update_many(filter={"third_id": {"$in": list(failure)}}, update={"$set": {"state": -1}})
else:
self.logger.info("Successfully verified: %s" % "下载完整无异常")
Loading…
Cancel
Save