From cf20521b1ccf887522a7e23db5b033fccb182508 Mon Sep 17 00:00:00 2001 From: zhaoxiangpeng <1943364377@qq.com> Date: Wed, 7 Jan 2026 17:56:13 +0800 Subject: [PATCH] =?UTF-8?q?cnki:=E9=87=8D=E5=A4=8D=E9=A1=B9=E6=9B=B4?= =?UTF-8?q?=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- science_article_cnki/science_article_cnki/pipelines.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/science_article_cnki/science_article_cnki/pipelines.py b/science_article_cnki/science_article_cnki/pipelines.py index 715dc65..dc9a93a 100644 --- a/science_article_cnki/science_article_cnki/pipelines.py +++ b/science_article_cnki/science_article_cnki/pipelines.py @@ -41,6 +41,7 @@ class MongoPipeline(MongoDBUtils): super().__init__(mongo_uri, mongo_db) self.stats: StatsCollector = stats self.insert_failure_update_enable = True + self.duplicate_cover_enable = False # 重复项覆盖 @classmethod def from_crawler(cls, crawler: Crawler): @@ -71,7 +72,8 @@ class MongoPipeline(MongoDBUtils): logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value) d.pop("_id", None) [d.pop(k, None) for k in key_pattern.keys()] - up_result = collection.update_one(filter=key_value, update={"$set": d}, upsert=True) + update_q = build_update_query(d, replace=self.duplicate_cover_enable) + up_result = collection.update_one(filter=key_value, update=update_q, upsert=True) self.stats.inc_value("item2db_updated/{}".format(item_type)) except Exception: raise