# -*- coding: utf-8 -*-
# @Time : 2026/1/19 10:01
# @Author : zhaoxiangpeng
# @File : crawl_article_by_id.py
import logging

from twisted.internet import defer, reactor, task
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_wos.spiders.wos_article_download_by_id import WosArticleDownloadByIdSpider

logger = logging.getLogger(__name__)


def starter_forever():
    """Keep launching crawls while MongoDB still holds pending tasks."""

    def check_task() -> bool:
        # Query MongoDB for a task that has not been executed yet (state == 0).
        from pymongo import MongoClient
        cli = MongoClient(settings.get("MONGO_URI"))
        db = cli[settings.get("MONGO_DATABASE")]
        r = db['todo_ids_wos'].find_one(filter={"state": 0})
        return r is not None

    def check_session() -> bool:
        # Scan the Redis cookie pool for at least one session
        # whose "status" hash field is "normal".
        from redis import Redis
        cli = Redis.from_url(settings.get("REDIS_URL"), decode_responses=True)
        for key in cli.keys('cookie_pool:wos_sid:*'):
            if cli.hget(key, "status") == "normal":
                return True
        return False

    @defer.inlineCallbacks
    def f(running: bool = True):
        while running:
            # Ask MongoDB whether any task is still pending.
            if not check_task():
                logger.info("No pending download tasks; shutting down.")
                running = False
                continue
            # Ask Redis whether a usable cookie is available.
            if not check_session():
                logger.info("No usable cookie available; waiting.")
                # Sleep without blocking the Twisted reactor
                # (time.sleep() here would freeze the event loop).
                yield task.deferLater(reactor, 60 * 5, lambda: None)
                continue
            yield process.crawl(WosArticleDownloadByIdSpider)
            yield task.deferLater(reactor, 60 * 2, lambda: None)
        reactor.stop()

    settings = get_project_settings()
    process = CrawlerProcess(settings)
    # Start the loop only once the reactor is running, so reactor.stop() is
    # always safe, and keep the reactor alive between consecutive crawls.
    reactor.callWhenRunning(f, True)
    process.start(stop_after_crawl=False)


def starter():
    """Run the spider exactly once."""
    process = CrawlerProcess(get_project_settings())
    process.crawl(WosArticleDownloadByIdSpider)
    process.start()


if __name__ == '__main__':
    starter_forever()
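

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): one way the two
# stores polled by check_task()/check_session() might be seeded. The
# collection name ('todo_ids_wos', with state == 0 marking a pending task)
# and the Redis key layout ('cookie_pool:wos_sid:<sid>' hashes carrying a
# "status" field) are taken from the checks above; the document's "uid"
# field, the sample WOS UT, and the session id are hypothetical.
# ---------------------------------------------------------------------------
def seed_example():
    from pymongo import MongoClient
    from redis import Redis

    settings = get_project_settings()

    # A pending task: check_task() matches documents whose state is 0.
    mongo = MongoClient(settings.get("MONGO_URI"))
    db = mongo[settings.get("MONGO_DATABASE")]
    db['todo_ids_wos'].insert_one({"uid": "WOS:000000000000001", "state": 0})

    # A usable session: check_session() requires status == "normal".
    redis_cli = Redis.from_url(settings.get("REDIS_URL"), decode_responses=True)
    redis_cli.hset('cookie_pool:wos_sid:EXAMPLESID', mapping={"status": "normal"})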