main
zhaoxiangpeng 1 week ago
parent c94aba0245
commit 43b26550e7

@@ -19,7 +19,7 @@ from pymongo.errors import (
     DuplicateKeyError,
     BulkWriteError
 )
-from science_article_wos.items import WosIdRelationItem, WosArticleTodoIdItem, WosCitedNumberItem
+from science_article_wos.items import ArticleItem, WosArticleItem, WosIdRelationItem, WosArticleTodoIdItem, WosCitedNumberItem
 from science_article_wos.db_utils.mongo import MongoDBUtils, update_document, build_update_query
 
 if TYPE_CHECKING:
@@ -139,6 +139,15 @@ class MongoPipeline(MongoDBUtils):
         return 'items_null_table'
 
 
+class Article2MongoPipeline(MongoPipeline):
+    def process_item(self, item, spider):
+        # Determine the item type
+        if isinstance(item, ArticleItem):
+            super().process_item_update(item, spider=spider)
+        return item
+
+
 class CitedRelation2MongoPipeline(MongoPipeline):
     def process_item(self, item, spider):
         # Determine the item type
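For context, the new Article2MongoPipeline only takes effect once it is registered in the Scrapy settings. A minimal sketch, assuming the class lives in science_article_wos.pipelines (the module path and priority value are illustrative, not taken from the diff):

# settings.py -- hypothetical registration sketch
ITEM_PIPELINES = {
    "science_article_wos.pipelines.Article2MongoPipeline": 300,
}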
@@ -206,3 +215,75 @@ class DupTodoBySciencePipeline(DupTodoPipeline):
                 self.inc_item_dropped_count("exists")
                 return True
         return False
+
+
+class VerifyDataIntegrity:
+    def __init__(self, mongo_uri, mongo_db):
+        self.successful_delete = False
+        self.batch_ids = set()
+        self.successful = []
+        self.logger = logging.getLogger(__name__)
+        self.mongo_uri = mongo_uri
+        self.mongo_db = mongo_db
+        self.client: MongoClient = None
+        self.db = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        settings = crawler.settings
+        c = cls(
+            mongo_uri=settings.get("MONGO_URI"),
+            mongo_db=settings.get("MONGO_DATABASE", "items"),
+        )
+        return c
+
+    def init_db(self):
+        self.client = MongoClient(self.mongo_uri)
+        self.db = self.client[self.mongo_db]
+
+    def open_spider(self, spider):
+        spider_batch_ids = spider.get_batch_ids()
+        for batch in spider_batch_ids:
+            if batch.get("field", "UT") == "UT":
+                self.batch_ids.add(batch.get("third_id"))
+        self.init_db()
+
+    def process_item(self, item, spider):
+        adapter = ItemAdapter(item)
+        if isinstance(item, ArticleItem):
+            unique_id = adapter.get("third_id")
+            self.successful.append(unique_id)
+            if self.successful_delete:
+                self.batch_ids.discard(unique_id)
+        return item
+
+    def close_spider(self, spider):
+        failure = self.batch_ids - set(self.successful)
+        coll = self.db.get_collection("todo_ids_wos")
+        if self.successful:
+            if self.successful_delete:
+                coll.delete_many(filter={"third_id": {"$in": self.successful}})
+                self.logger.info("Successfully deleted %d articles", len(self.successful))
+            else:
+                coll.update_many(filter={"third_id": {"$in": self.successful}}, update={"$set": {"state": 1}})
+                self.logger.info("Successfully updated %d articles", len(self.successful))
+        if failure:
+            self.logger.warning("Not downloaded: %s", list(failure))
+            coll.update_many(filter={"third_id": {"$in": list(failure)}}, update={"$set": {"state": -1}})
+        else:
+            self.logger.info("Successfully verified: download is complete with no anomalies")
+
+    def spider_end(self):
+        """
+        Combine the search query and write the result to the database.
+        """
+        dict(
+            content="",
+            query_id="",
+            records_found=0,
+            perfect=1,
+            state=1,
+            reason=""
+        )
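As committed, spider_end only builds the result record and then discards it. A minimal sketch of the write it presumably leads to, assuming a hypothetical search_results collection name and that init_db() has already populated self.db:

    def spider_end(self):
        # Sketch only: persist the combined-query result record.
        record = dict(
            content="",        # the combined search expression
            query_id="",
            records_found=0,
            perfect=1,
            state=1,
            reason="",
        )
        # "search_results" is a hypothetical collection name, not shown in the diff
        self.db.get_collection("search_results").insert_one(record)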

@@ -16,7 +16,7 @@ import redis
 import requests
 from DrissionPage import Chromium
 
-from science_article_wos.utils.xpath_cfg import Settings
+from science_article_wos.configs.wos_dp import Settings
 
 if TYPE_CHECKING:
     from DrissionPage import ChromiumPage, ChromiumOptions
@@ -97,7 +97,7 @@ class DPOperations:
         if clear_input:
             input_area_ele.clear()  # clear the input box
         if content is None:
-            content = "(OG=(Shanghai Jiao Tong University)) AND PY=(2025)"
+            content = "(OG=(Shanghai Jiao Tong University)) AND PY=(2026)"
         input_area_ele.input(content)  # enter the search query
 
     @staticmethod
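The default content string uses Web of Science advanced-search field tags: OG is Organization-Enhanced and PY is Year Published. A hypothetical helper (not in the repo) that assembles such queries:

def build_wos_query(org: str, year: int) -> str:
    # OG = Organization-Enhanced, PY = Year Published (WOS advanced search)
    return f"(OG=({org})) AND PY=({year})"

build_wos_query("Shanghai Jiao Tong University", 2026)
# -> '(OG=(Shanghai Jiao Tong University)) AND PY=(2026)'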
@@ -328,11 +328,13 @@ class CookieManager:
             logger.warning("Cookie usage limit exceeded / verification required; preparing to verify...")
             # Verification logic: run one export to pass the check
             self.intercept_verify(op_func=self.dp_ins.bypass_ops)
+            self.sid2redis()
         elif status == "expired":
             logger.warning("Cookie expired; preparing to fetch a new one...")
             # Refresh the page, or redo the search/export
             self.intercept_verify(op_func=self.refresh_page)
+            self.sid2redis()
         else:
             logger.info(f"Cookie status normal: {status}")
@@ -392,7 +394,9 @@ class CookieManager:
 
 def main():
-    manager = CookieManager(redis_uri="redis://:kcidea1509@192.168.1.211:6379/10", keep_browser_alive=True)
+    from science_article_wos.settings import REDIS_URL
+    # manager = CookieManager(redis_uri="redis://:kcidea1509@192.168.1.211:6379/10", keep_browser_alive=True)
+    manager = CookieManager(redis_uri=REDIS_URL, keep_browser_alive=True)
     try:
         manager.start_monitor()
