From 1b0da2c41e68ce12033964e795011d903d4e6bf7 Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Mon, 12 Jan 2026 10:19:11 +0800
Subject: [PATCH] wos: startup scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../starts/crawl_article_by_qid.py | 44 ++++++++++++++++++++
 .../starts/crawl_article_by_ut.py  | 49 +++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 science_article_add/starts/crawl_article_by_qid.py
 create mode 100644 science_article_add/starts/crawl_article_by_ut.py

diff --git a/science_article_add/starts/crawl_article_by_qid.py b/science_article_add/starts/crawl_article_by_qid.py
new file mode 100644
index 0000000..11da7f0
--- /dev/null
+++ b/science_article_add/starts/crawl_article_by_qid.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/12/11 13:56
+# @Author : zhaoxiangpeng
+# @File : crawl_article_by_qid.py
+import math
+
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from science_article_add.spiders.download_by_qid import DownloadByQidSpider
+
+# WoS caps a single export request at 500 records, so larger result
+# sets have to be fetched in consecutive windows of this size.
+BATCH_DOWNLOAD_LIMIT = 500
+
+RECORDS_FOUND = 1486
+wos_download_todo = []
+
+
+def iter_download_batches(record_id: str, records_found: int):
+    """Split a query with records_found hits into consecutive
+    [mark_from, mark_to] windows of at most BATCH_DOWNLOAD_LIMIT records."""
+    mark_start = 1
+    mark_end = 0
+    for batch_no in range(1, math.ceil(records_found / BATCH_DOWNLOAD_LIMIT) + 1):
+        # Clamp the last window to the total number of records.
+        mark_end = min(mark_end + BATCH_DOWNLOAD_LIMIT, records_found)
+        yield dict(
+            record_id=record_id, batch=batch_no,
+            mark_from=mark_start, mark_to=mark_end, records_found=records_found,
+        )
+        mark_start += BATCH_DOWNLOAD_LIMIT
+
+
+if __name__ == '__main__':
+    process = CrawlerProcess(get_project_settings())
+    # Download one window of a stored query; iter_download_batches()
+    # enumerates the remaining windows when a full export is needed.
+    init_params = dict(
+        record_id='02f30273-1342-4d61-9e51-c1ea1f5b2423-0190efdd10',
+        mark_from=1, mark_to=500, records_found=10641,
+    )
+    process.crawl(DownloadByQidSpider, **init_params)
+    process.start()
diff --git a/science_article_add/starts/crawl_article_by_ut.py b/science_article_add/starts/crawl_article_by_ut.py
new file mode 100644
index 0000000..9470c24
--- /dev/null
+++ b/science_article_add/starts/crawl_article_by_ut.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/12/11 17:07
+# @Author : zhaoxiangpeng
+# @File : crawl_article_by_ut.py
+import logging
+
+from pymongo import MongoClient
+from twisted.internet import defer, task
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from science_article_add.spiders.wos_download import WosDownloadSpider
+
+logging.getLogger('pymongo').setLevel(logging.WARNING)
+logger = logging.getLogger(__name__)
+BATCH_DOWNLOAD_LIMIT = 500
+
+
+@defer.inlineCallbacks
+def crawl_sequentially():
+    # Imported here, after CrawlerProcess has been created, so we get
+    # the reactor that Scrapy configured rather than installing one.
+    from twisted.internet import reactor
+
+    settings = get_project_settings()
+    client = MongoClient(settings.get("MONGO_URI"))
+    db = client.get_database(settings.get("MONGO_DATABASE"))
+    collection = db.get_collection("todo_ids_wos")
+
+    def count_pending():
+        # Documents with state == 0 are still waiting to be downloaded.
+        return collection.count_documents(filter={"state": 0})
+
+    while count_doc := count_pending():
+        logger.info('pending downloads: %d', count_doc)
+        yield process.crawl(WosDownloadSpider)
+        # Pause 60s between runs without blocking the event loop;
+        # time.sleep() here would freeze the Twisted reactor.
+        yield task.deferLater(reactor, 60, lambda: None)
+
+    reactor.stop()  # shut down the event loop once the queue is empty
+
+
+if __name__ == '__main__':
+    process = CrawlerProcess(get_project_settings())
+    crawl_sequentially()
+    # stop_after_crawl=False keeps the reactor alive between batches;
+    # crawl_sequentially() stops it once the todo collection is drained.
+    process.start(stop_after_crawl=False)
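
Note on the window arithmetic in crawl_article_by_qid.py: with the RECORDS_FOUND example of 1486 hits, iter_download_batches yields three windows, the last one clamped to the total ('some-qid' below is a placeholder query id):

    >>> [(b['mark_from'], b['mark_to'])
    ...  for b in iter_download_batches('some-qid', 1486)]
    [(1, 500), (501, 1000), (1001, 1486)]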
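
As committed, crawl_article_by_qid.py downloads only the first window (mark_from=1, mark_to=500) of a 10641-hit query. A minimal sketch of scheduling every window, assuming DownloadByQidSpider accepts the keyword arguments that iter_download_batches yields (the spider itself is not part of this patch):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from science_article_add.spiders.download_by_qid import DownloadByQidSpider
    from science_article_add.starts.crawl_article_by_qid import iter_download_batches

    process = CrawlerProcess(get_project_settings())
    # One crawler per 500-record window; Scrapy runs them all in the same
    # reactor once start() is called, so for very large queries the
    # sequential pattern in crawl_article_by_ut.py may be preferable.
    for params in iter_download_batches(
            record_id='02f30273-1342-4d61-9e51-c1ea1f5b2423-0190efdd10',
            records_found=10641):
        process.crawl(DownloadByQidSpider, **params)
    process.start()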