# -*- coding: utf-8 -*-
# @Time : 2025/12/11 17:07
# @Author : zhaoxiangpeng
# @File : crawl_article_by_ut.py

import logging

from pymongo import MongoClient
from twisted.internet import defer, task
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_add.spiders.wos_download import WosDownloadSpider

logging.getLogger('pymongo').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

BATCH_DOWNLOAD_LIMIT = 500


@defer.inlineCallbacks
def crawl_sequentially():
    """Run WosDownloadSpider repeatedly until no pending documents remain."""
    settings = get_project_settings()
    client = MongoClient(settings.get("MONGO_URI"))
    db = client.get_database(settings.get("MONGO_DATABASE"))
    collection = db.get_collection("todo_ids_wos")

    def count_pending():
        # Documents with state == 0 are still waiting to be downloaded.
        return collection.count_documents(filter={"state": 0})

    # Imported here, after CrawlerProcess has been created, so the process can
    # install its own reactor first.
    from twisted.internet import reactor

    while count_doc := count_pending():
        logger.info('Pending downloads: %d', count_doc)
        yield process.crawl(WosDownloadSpider)
        # Wait 60 seconds before re-checking the queue; use a deferred sleep
        # instead of time.sleep() so the Twisted reactor is not blocked.
        yield task.deferLater(reactor, 60, lambda: None)

    reactor.stop()  # shut down the event loop once all crawls have finished


if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    crawl_sequentially()
    process.start()  # blocks until all crawls are complete