From 43b26550e706a34a2fa4d1ac1c00d604ad363dff Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Tue, 10 Mar 2026 15:10:26 +0800
Subject: [PATCH] wos: add article/verification pipelines, read Redis URL from settings

---
Review notes (this section is ignored by `git am`):
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. If a hunk is rejected, retry with
  `git apply --ignore-whitespace`.
* The `index` blob ids below are those of the original commit and no longer
  match the edited post-image.
* The line removed from main() in cookie_manager.py contains a Redis password.
  It is no longer re-added as a comment, but the credential should be rotated.

 .../science_article_wos/pipelines.py          | 101 +++++++++++++++++-
 .../scripts/cookie_manager.py                 |   9 +-
 2 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/science_article_wos/science_article_wos/pipelines.py b/science_article_wos/science_article_wos/pipelines.py
index 64b1ba3..6d98cb6 100644
--- a/science_article_wos/science_article_wos/pipelines.py
+++ b/science_article_wos/science_article_wos/pipelines.py
@@ -19,7 +19,7 @@ from pymongo.errors import (
     DuplicateKeyError,
     BulkWriteError
 )
-from science_article_wos.items import WosIdRelationItem, WosArticleTodoIdItem, WosCitedNumberItem
+from science_article_wos.items import ArticleItem, WosArticleItem, WosIdRelationItem, WosArticleTodoIdItem, WosCitedNumberItem
 from science_article_wos.db_utils.mongo import MongoDBUtils, update_document, build_update_query
 
 if TYPE_CHECKING:
@@ -139,6 +139,17 @@ class MongoPipeline(MongoDBUtils):
         return 'items_null_table'
 
 
+class Article2MongoPipeline(MongoPipeline):
+    """Persist ``ArticleItem`` objects through ``process_item_update``."""
+
+    def process_item(self, item, spider):
+        # Only article items are written; any other item type passes through untouched.
+        if isinstance(item, ArticleItem):
+            super().process_item_update(item, spider=spider)
+
+        return item
+
+
 class CitedRelation2MongoPipeline(MongoPipeline):
     def process_item(self, item, spider):
         # 确定Item类型
@@ -206,3 +217,91 @@ class DupTodoBySciencePipeline(DupTodoPipeline):
             self.inc_item_dropped_count("exists")
             return True
         return False
+
+
+class VerifyDataIntegrity:
+    """Reconcile the ids of a spider's batch against the articles actually scraped.
+
+    On ``close_spider`` the matching ``todo_ids_wos`` documents are marked as
+    done (``state=1``, or deleted when ``successful_delete`` is set) and ids
+    that never produced an ``ArticleItem`` are marked as failed (``state=-1``).
+    """
+
+    def __init__(self, mongo_uri, mongo_db):
+        self.successful_delete = False  # delete finished todo documents instead of flagging them
+        self.batch_ids = set()  # "third_id" values reported by spider.get_batch_ids()
+        self.successful = []  # "third_id" of every ArticleItem seen by process_item
+        self.logger = logging.getLogger(__name__)
+
+        self.mongo_uri = mongo_uri
+        self.mongo_db = mongo_db
+        self.client: MongoClient = None
+        self.db = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline from the ``MONGO_URI`` / ``MONGO_DATABASE`` settings."""
+        settings = crawler.settings
+        return cls(
+            mongo_uri=settings.get("MONGO_URI"),
+            mongo_db=settings.get("MONGO_DATABASE", "items"),
+        )
+
+    def init_db(self):
+        """Open the MongoDB client and select the configured database."""
+        self.client = MongoClient(self.mongo_uri)
+        self.db = self.client[self.mongo_db]
+
+    def open_spider(self, spider):
+        """Collect the ``UT`` ids of the spider's batch, then connect to MongoDB."""
+        for batch in spider.get_batch_ids():
+            if batch.get("field", "UT") == "UT":
+                self.batch_ids.add(batch.get("third_id"))
+        self.init_db()
+
+    def process_item(self, item, spider):
+        """Record the ``third_id`` of each ``ArticleItem`` and pass the item on."""
+        if isinstance(item, ArticleItem):
+            unique_id = ItemAdapter(item).get("third_id")
+            self.successful.append(unique_id)
+            if self.successful_delete:
+                self.batch_ids.discard(unique_id)
+        return item
+
+    def close_spider(self, spider):
+        """Write the download results back to ``todo_ids_wos`` and close the client."""
+        failure = self.batch_ids - set(self.successful)
+        try:
+            coll = self.db.get_collection("todo_ids_wos")
+            if self.successful:
+                if self.successful_delete:
+                    coll.delete_many(filter={"third_id": {"$in": self.successful}})
+                    self.logger.info("Successfully deleted %d articles", len(self.successful))
+                else:
+                    coll.update_many(filter={"third_id": {"$in": self.successful}}, update={"$set": {"state": 1}})
+                    self.logger.info("Successfully updated %d articles", len(self.successful))
+            if failure:
+                self.logger.warning("未下载到: %s", list(failure))
+                coll.update_many(filter={"third_id": {"$in": list(failure)}}, update={"$set": {"state": -1}})
+            else:
+                self.logger.info("Successfully verified: %s", "下载完整无异常")
+        finally:
+            # The client is opened in init_db(); release it even when a write above fails.
+            if self.client is not None:
+                self.client.close()
+
+    def spider_end(self):
+        """
+        Combine the search queries and write the result to the database.
+
+        TODO: not implemented yet; the record below is built and then discarded.
+        """
+        # NOTE(review): "qeury_id" / "perfact" look like typos of "query_id" / "perfect" - confirm before persisting.
+        dict(
+            content="",
+            qeury_id="",
+            records_found=0,
+            perfact=1,
+            state=1,
+            reason=""
+        )
diff --git a/science_article_wos/science_article_wos/scripts/cookie_manager.py b/science_article_wos/science_article_wos/scripts/cookie_manager.py
index d89fd65..cbdf119 100644
--- a/science_article_wos/science_article_wos/scripts/cookie_manager.py
+++ b/science_article_wos/science_article_wos/scripts/cookie_manager.py
@@ -16,7 +16,7 @@ import redis
 import requests
 from DrissionPage import Chromium
 
-from science_article_wos.utils.xpath_cfg import Settings
+from science_article_wos.configs.wos_dp import Settings
 
 if TYPE_CHECKING:
     from DrissionPage import ChromiumPage, ChromiumOptions
@@ -97,7 +97,7 @@ class DPOperations:
         if clear_input:
             input_area_ele.clear()  # 清空
         if content is None:
-            content = "(OG=(Shanghai Jiao Tong University)) AND PY=(2025)"
+            content = "(OG=(Shanghai Jiao Tong University)) AND PY=(2026)"
         input_area_ele.input(content)  # 输入检索内容
 
     @staticmethod
@@ -328,11 +328,13 @@ class CookieManager:
             logger.warning("cookie使用次数超限/需要验证,准备进行验证。。。")
             # 验证逻辑,导出一次过验证
             self.intercept_verify(op_func=self.dp_ins.bypass_ops)
+            self.sid2redis()
 
         elif status == "expired":
             logger.warning("cookie已过期,准备重新获取。。。")
             # 刷新页面或者重新进行搜索/导出
             self.intercept_verify(op_func=self.refresh_page)
+            self.sid2redis()
 
         else:
             logger.info(f"Cookie状态正常: {status}")
@@ -392,7 +394,8 @@ class CookieManager:
 
 
 def main():
-    manager = CookieManager(redis_uri="redis://:kcidea1509@192.168.1.211:6379/10", keep_browser_alive=True)
+    from science_article_wos.settings import REDIS_URL
+    manager = CookieManager(redis_uri=REDIS_URL, keep_browser_alive=True)
 
     try:
         manager.start_monitor()