# -*- coding: utf-8 -*-
# @Time : 2025/12/11 13:56
# @Author : zhaoxiangpeng
# @File : crawl_article_by_qid.py
import math
from typing import Iterator, Optional

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_wos.spiders.download_by_search_record import DownloadBySearchRecordSpider

# Maximum number of records requested in a single download shard.
BATCH_DOWNLOAD_LIMIT = 500


def f(record_id: str, records_found: int, shard_count: Optional[int] = None) -> Iterator[dict]:
    """Split a search record into download shards of at most BATCH_DOWNLOAD_LIMIT records.

    Yields one task dict per shard with the 1-based record range
    (mark_from, mark_to) to request.
    """
    shard_count = shard_count or math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)
    mark_start = 1
    mark_end = 0
    for idx in range(1, shard_count + 1):
        # Advance the window, clamping the last shard to the total hit count.
        mark_end = min(mark_end + BATCH_DOWNLOAD_LIMIT, records_found)
        yield dict(
            record_id=record_id,
            mark_from=mark_start,
            mark_to=mark_end,
            shard=idx,
            shard_count=shard_count,
            records_found=records_found,
        )
        mark_start += BATCH_DOWNLOAD_LIMIT


def ready():
    """Store the tasks to be collected into the database.

    Not implemented yet; the task store is still undecided.
    :return:
    """
    RECORDS_FOUND = 1486  # noqa: F841 -- total hits for the pending search record


def test_starter():
    """Run the spider against a single hard-coded shard as a quick smoke test."""
    init_params = dict(
        record_id='68ce1627-b4c3-4938-adcb-476c7dcde004-0192d3c012',
        mark_from=1,
        mark_to=50,
        shard=1,
        shard_count=51,
        records_found=25256,
    )
    process = CrawlerProcess(get_project_settings())
    process.crawl(DownloadBySearchRecordSpider, **init_params)
    process.start()


def starter():
    """Run the spider without init params; it is expected to pick up queued tasks itself."""
    process = CrawlerProcess(get_project_settings())
    process.crawl(DownloadBySearchRecordSpider)
    process.start()
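

# --- Hedged sketch: materializing tasks for ready() --------------------------
# ready() above is still a stub and the real task store is unknown. This is a
# minimal sketch of how its body might look, assuming a JSON Lines file stands
# in for the database; the function name, the path argument, and the record_id
# used in the docstring example are hypothetical, not part of the original.
def ready_sketch(record_id: str, records_found: int, path: str = 'tasks.jsonl') -> None:
    """Write one task dict per shard produced by f() to a JSON Lines file."""
    import json

    with open(path, 'w', encoding='utf-8') as fh:
        for task in f(record_id, records_found):
            fh.write(json.dumps(task) + '\n')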
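

# --- Hedged entry-point sketch ------------------------------------------------
# The original file defines starters but never calls them. A guard like this
# would make the script runnable directly; which starter to invoke by default
# is an assumption (test_starter() runs the hard-coded smoke-test shard, while
# starter() relies on the spider pulling queued tasks itself).
if __name__ == '__main__':
    test_starter()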