You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
60 lines
1.5 KiB
Python
60 lines
1.5 KiB
Python
# -*- coding: utf-8 -*-
|
|
# @Time : 2025/12/11 13:56
|
|
# @Author : zhaoxiangpeng
|
|
# @File : crawl_article_by_qid.py
|
|
import math
|
|
from scrapy.crawler import CrawlerProcess
|
|
from scrapy.utils.project import get_project_settings
|
|
from science_article_wos.spiders.download_by_search_record import DownloadBySearchRecordSpider
|
|
|
|
BATCH_DOWNLOAD_LIMIT = 500  # upper bound on the number of records covered by one shard


def f(record_id: str, records_found: int, shard_count: int = None):
    """Split a search record into consecutive, 1-based download ranges.

    Each yielded dict describes one shard covering at most
    ``BATCH_DOWNLOAD_LIMIT`` records; the last shard is truncated so that it
    never extends past ``records_found``.

    :param record_id: identifier copied unchanged into every yielded dict.
    :param records_found: total number of records to cover.
    :param shard_count: optional cap on the number of shards. When omitted
        (or falsy) it is derived from ``records_found``. A value larger than
        the number of shards actually needed is clamped, so no empty or
        inverted range (``mark_from > mark_to``) is ever produced.
    :return: generator of dicts with the keys ``record_id``, ``mark_from``,
        ``mark_to``, ``shard``, ``shard_count`` and ``records_found``.
    """
    needed = math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)
    # Never plan more shards than there are records to fill them with.
    shard_count = min(shard_count, needed) if shard_count else needed

    for shard in range(1, shard_count + 1):
        # Ranges are inclusive and 1-based: 1-500, 501-1000, ...
        mark_from = (shard - 1) * BATCH_DOWNLOAD_LIMIT + 1
        mark_to = min(shard * BATCH_DOWNLOAD_LIMIT, records_found)
        yield dict(
            record_id=record_id,
            mark_from=mark_from, mark_to=mark_to,
            shard=shard, shard_count=shard_count,
            records_found=records_found
        )
|
|
|
|
|
|
def ready():
    """
    Store the pending crawl tasks in the database.

    NOTE(review): this is still a stub -- it only binds a local constant and
    returns ``None``; nothing is written anywhere yet.

    :return:
    """
    # Currently unused; presumably the total hit count of the search to be
    # sharded -- TODO confirm once the function is implemented.
    RECORDS_FOUND = 1486
|
|
|
|
|
|
def test_starter():
    """Run ``DownloadBySearchRecordSpider`` once with hard-coded shard parameters.

    The parameters are passed to the spider as keyword arguments through
    ``CrawlerProcess.crawl``; the crawl is then started with the project
    settings.
    """
    spider_kwargs = {
        'record_id': '68ce1627-b4c3-4938-adcb-476c7dcde004-0192d3c012',
        'mark_from': 1,
        'mark_to': 50,
        'shard': 1,
        'shard_count': 51,
        'records_found': 25256,
    }

    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    crawler.crawl(DownloadBySearchRecordSpider, **spider_kwargs)
    crawler.start()
|
|
|
|
|
|
def starter():
    """Run ``DownloadBySearchRecordSpider`` with the project settings.

    No spider arguments are supplied here, so the spider runs with whatever
    defaults it defines itself.
    """
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    crawler.crawl(DownloadBySearchRecordSpider)
    crawler.start()
|