diff --git a/science_article_cnki/science_article_cnki/pipelines.py b/science_article_cnki/science_article_cnki/pipelines.py index 715dc65..dc9a93a 100644 --- a/science_article_cnki/science_article_cnki/pipelines.py +++ b/science_article_cnki/science_article_cnki/pipelines.py @@ -41,6 +41,7 @@ class MongoPipeline(MongoDBUtils): super().__init__(mongo_uri, mongo_db) self.stats: StatsCollector = stats self.insert_failure_update_enable = True + self.duplicate_cover_enable = False # 重复项覆盖 @classmethod def from_crawler(cls, crawler: Crawler): @@ -71,7 +72,8 @@ class MongoPipeline(MongoDBUtils): logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value) d.pop("_id", None) [d.pop(k, None) for k in key_pattern.keys()] - up_result = collection.update_one(filter=key_value, update={"$set": d}, upsert=True) + update_q = build_update_query(d, replace=self.duplicate_cover_enable) + up_result = collection.update_one(filter=key_value, update=update_q, upsert=True) self.stats.inc_value("item2db_updated/{}".format(item_type)) except Exception: raise