# -*- coding: utf-8 -*-
# @Time : 2025/12/11 13:56
# @Author : zhaoxiangpeng
# @File : crawl_article_by_qid.py
import math
from typing import Optional

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_wos.spiders.download_by_search_record import DownloadBySearchRecordSpider

# Maximum number of records a single download task may cover.
BATCH_DOWNLOAD_LIMIT = 500


def f(record_id: str, records_found: int, shard_count: Optional[int] = None):
    """Split a search record into download shards of at most
    BATCH_DOWNLOAD_LIMIT records each and yield the task
    parameters for every shard.
    """
    mark_start = 1
    mark_end = 0
    idx = 0
    # Derive the shard count from the total hits when it is not given.
    shard_count = shard_count or math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)
    for _ in range(shard_count):
        idx += 1
        mark_end += BATCH_DOWNLOAD_LIMIT
        # Clamp the last shard to the actual number of records found.
        if mark_end > records_found:
            mark_end = records_found
        yield dict(
            record_id=record_id,
            mark_from=mark_start, mark_to=mark_end,
            shard=idx, shard_count=shard_count,
            records_found=records_found
        )
        mark_start += BATCH_DOWNLOAD_LIMIT
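
# A minimal usage sketch (the record id here is made up, not from this repo):
# for records_found=1200 and the default limit of 500, f() yields three tasks
# covering marks 1-500, 501-1000 and 1001-1200, with shard_count=3:
#
#   for task in f('example-record-id', records_found=1200):
#       print(task['mark_from'], task['mark_to'])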


def ready():
    """Store the tasks to be crawled into the database.
    :return:
    """
    # Total number of hits for the query to be sharded.
    RECORDS_FOUND = 1486
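

# A hedged sketch of how ready() could feed f() into storage; store_task and
# the record id are hypothetical stand-ins, not part of the original file.
def _ready_sketch(store_task):
    # Generate one task per shard and hand each to the caller's storage hook.
    for task in f('example-record-id', records_found=1486):
        store_task(task)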


def test_starter():
    # Run the spider for a single hard-coded shard of one search record.
    init_params = dict(
        record_id='68ce1627-b4c3-4938-adcb-476c7dcde004-0192d3c012',
        mark_from=1, mark_to=50,
        shard=1, shard_count=51,
        records_found=25256
    )
    process = CrawlerProcess(get_project_settings())
    process.crawl(DownloadBySearchRecordSpider, **init_params)
    process.start()


def starter():
    # Run the spider without shard parameters, leaving shard selection to it.
    process = CrawlerProcess(get_project_settings())
    process.crawl(DownloadBySearchRecordSpider)
    process.start()
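

# Entry-point sketch (an assumption: the file shows no __main__ guard);
# test_starter() is chosen here only for illustration.
if __name__ == '__main__':
    test_starter()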