# -*- coding: utf-8 -*-
# @Time : 2025/12/11 13:56
# @Author : zhaoxiangpeng
# @File : crawl_article_by_qid.py
"""Script that schedules ``DownloadByQidSpider`` on a ``CrawlerProcess`` and starts it."""
import math

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_add.spiders.download_by_qid import DownloadByQidSpider

# Default number of records covered by one batch yielded by ``f``.
BATCH_DOWNLOAD_LIMIT = 500

process = CrawlerProcess(get_project_settings())

# NOTE(review): neither of the two names below is read anywhere in this
# script as shown -- confirm whether they are still needed.
RECORDS_FOUND = 1486
wos_download_todo = [

]


def f(record_id: str, records_found: int, batch_size: int = BATCH_DOWNLOAD_LIMIT):
    """Yield one parameter dict per consecutive batch of records.

    The 1-based, inclusive range ``1..records_found`` is cut into windows of
    ``batch_size`` records; the last window is clamped to ``records_found``.
    Nothing is yielded when ``records_found`` is zero or negative.

    Args:
        record_id: Identifier copied unchanged into every yielded dict.
        records_found: Total number of records to cover.
        batch_size: Maximum number of records per batch. Defaults to
            ``BATCH_DOWNLOAD_LIMIT``.

    Yields:
        dict with the keys ``record_id``, ``batch`` (1-based batch number),
        ``mark_from``, ``mark_to`` (inclusive bounds) and ``records_found``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    batch_count = math.ceil(records_found / batch_size)
    for batch in range(1, batch_count + 1):
        # Derive both bounds from the batch number instead of keeping
        # running counters; only the final batch can be clamped.
        mark_from = (batch - 1) * batch_size + 1
        mark_to = min(batch * batch_size, records_found)
        yield dict(
            record_id=record_id,
            batch=batch,
            mark_from=mark_from,
            mark_to=mark_to,
            records_found=records_found,
        )


# NOTE(review): these parameters are hard-coded rather than produced by
# ``f``, and ``records_found`` here (10641) differs from ``RECORDS_FOUND``
# (1486) above -- confirm which value is intended.
init_params = dict(
    record_id='02f30273-1342-4d61-9e51-c1ea1f5b2423-0190efdd10',
    mark_from=1,
    mark_to=500,
    records_found=10641,
)

# NOTE(review): the crawl is started at import time; consider an
# ``if __name__ == "__main__":`` guard if this module is ever imported.
process.crawl(DownloadByQidSpider, **init_params)
process.start()