From 1b0da2c41e68ce12033964e795011d903d4e6bf7 Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Mon, 12 Jan 2026 10:19:11 +0800
Subject: [PATCH] wos: startup scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../starts/crawl_article_by_qid.py | 44 ++++++++++++++++++++
 .../starts/crawl_article_by_ut.py  | 49 +++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 science_article_add/starts/crawl_article_by_qid.py
 create mode 100644 science_article_add/starts/crawl_article_by_ut.py

diff --git a/science_article_add/starts/crawl_article_by_qid.py b/science_article_add/starts/crawl_article_by_qid.py
new file mode 100644
index 0000000..11da7f0
--- /dev/null
+++ b/science_article_add/starts/crawl_article_by_qid.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/12/11 13:56
+# @Author : zhaoxiangpeng
+# @File : crawl_article_by_qid.py
+import math
+
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from science_article_add.spiders.download_by_qid import DownloadByQidSpider
+
+# WoS caps a single export request at 500 records, so larger result
+# sets have to be fetched in consecutive windows of this size.
+BATCH_DOWNLOAD_LIMIT = 500
+
+RECORDS_FOUND = 1486
+wos_download_todo = []
+
+
+def iter_download_batches(record_id: str, records_found: int):
+    """Split a query with records_found hits into consecutive
+    [mark_from, mark_to] windows of at most BATCH_DOWNLOAD_LIMIT records."""
+    mark_start = 1
+    mark_end = 0
+    for batch_no in range(1, math.ceil(records_found / BATCH_DOWNLOAD_LIMIT) + 1):
+        # Clamp the last window to the total number of records.
+        mark_end = min(mark_end + BATCH_DOWNLOAD_LIMIT, records_found)
+        yield dict(
+            record_id=record_id, batch=batch_no,
+            mark_from=mark_start, mark_to=mark_end, records_found=records_found,
+        )
+        mark_start += BATCH_DOWNLOAD_LIMIT
+
+
+if __name__ == '__main__':
+    process = CrawlerProcess(get_project_settings())
+    # Download one window of a stored query; iter_download_batches()
+    # enumerates the remaining windows when a full export is needed.
+    init_params = dict(
+        record_id='02f30273-1342-4d61-9e51-c1ea1f5b2423-0190efdd10',
+        mark_from=1, mark_to=500, records_found=10641,
+    )
+    process.crawl(DownloadByQidSpider, **init_params)
+    process.start()
diff --git a/science_article_add/starts/crawl_article_by_ut.py b/science_article_add/starts/crawl_article_by_ut.py
new file mode 100644
index 0000000..9470c24
--- /dev/null
+++ b/science_article_add/starts/crawl_article_by_ut.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/12/11 17:07
+# @Author : zhaoxiangpeng
+# @File : crawl_article_by_ut.py
+import logging
+
+from pymongo import MongoClient
+from twisted.internet import defer, task
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from science_article_add.spiders.wos_download import WosDownloadSpider
+
+logging.getLogger('pymongo').setLevel(logging.WARNING)
+logger = logging.getLogger(__name__)
+BATCH_DOWNLOAD_LIMIT = 500
+
+
+@defer.inlineCallbacks
+def crawl_sequentially():
+    # Imported here, after CrawlerProcess has been created, so we get
+    # the reactor that Scrapy configured rather than installing one.
+    from twisted.internet import reactor
+
+    settings = get_project_settings()
+    client = MongoClient(settings.get("MONGO_URI"))
+    db = client.get_database(settings.get("MONGO_DATABASE"))
+    collection = db.get_collection("todo_ids_wos")
+
+    def count_pending():
+        # Documents with state == 0 are still waiting to be downloaded.
+        return collection.count_documents(filter={"state": 0})
+
+    while count_doc := count_pending():
+        logger.info('pending downloads: %d', count_doc)
+        yield process.crawl(WosDownloadSpider)
+        # Pause 60s between runs without blocking the event loop;
+        # time.sleep() here would freeze the Twisted reactor.
+        yield task.deferLater(reactor, 60, lambda: None)
+
+    reactor.stop()  # shut down the event loop once the queue is empty
+
+
+if __name__ == '__main__':
+    process = CrawlerProcess(get_project_settings())
+    crawl_sequentially()
+    # stop_after_crawl=False keeps the reactor alive between batches;
+    # crawl_sequentially() stops it once the todo collection is drained.
+    process.start(stop_after_crawl=False)
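
Note on the window arithmetic in crawl_article_by_qid.py: with the RECORDS_FOUND example of 1486 hits, iter_download_batches yields three windows, the last one clamped to the total ('some-qid' below is a placeholder query id):

    >>> [(b['mark_from'], b['mark_to'])
    ...  for b in iter_download_batches('some-qid', 1486)]
    [(1, 500), (501, 1000), (1001, 1486)]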
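
As committed, crawl_article_by_qid.py downloads only the first window (mark_from=1, mark_to=500) of a 10641-hit query. A minimal sketch of scheduling every window, assuming DownloadByQidSpider accepts the keyword arguments that iter_download_batches yields (the spider itself is not part of this patch):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from science_article_add.spiders.download_by_qid import DownloadByQidSpider
    from science_article_add.starts.crawl_article_by_qid import iter_download_batches

    process = CrawlerProcess(get_project_settings())
    # One crawler per 500-record window; Scrapy runs them all in the same
    # reactor once start() is called, so for very large queries the
    # sequential pattern in crawl_article_by_ut.py may be preferable.
    for params in iter_download_batches(
            record_id='02f30273-1342-4d61-9e51-c1ea1f5b2423-0190efdd10',
            records_found=10641):
        process.crawl(DownloadByQidSpider, **params)
    process.start()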