You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

71 lines
2.1 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
# @Time : 2026/1/19 10:01
# @Author : zhaoxiangpeng
# @File : crawl_article_by_id.py
import time
import logging
from typing import List
import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_wos.spiders.wos_article_download_by_id import WosArticleDownloadByIdSpider
from science_article_wos.utils import tools
logger = logging.getLogger(__name__)
def starter_forever():
    """Run the WoS article-download spider in a supervised loop.

    Polls MongoDB for pending download tasks and Redis for a usable
    session cookie.  When both are available, a crawl is scheduled via
    the Twisted-aware ``process.crawl``; when no task remains the loop
    ends and the reactor is started to drain the scheduled crawls.
    """

    def check_task() -> bool:
        """Return True if MongoDB still holds an unprocessed task (state == 0)."""
        from pymongo import MongoClient
        cli = MongoClient(settings.get("MONGO_URI"))
        try:
            db = cli[settings.get("MONGO_DATABASE")]
            doc = db['todo_ids_wos'].find_one(filter={"state": 0})
            return doc is not None
        finally:
            # FIX: close the client so each poll does not leak a connection pool.
            cli.close()

    def check_session() -> bool:
        """Return True if the Redis cookie pool has at least one 'normal' session."""
        from redis import Redis
        cli = Redis.from_url(settings.get("REDIS_URL"), decode_responses=True)
        try:
            # Each key holds a hash describing one session; only its status matters.
            for key in cli.keys('cookie_pool:wos_sid:*'):
                if cli.hget(key, "status") == "normal":
                    # FIX: dropped the unused `real_sid` extraction.
                    return True
            return False
        finally:
            cli.close()

    @defer.inlineCallbacks
    def f(running: bool = True):
        # NOTE(review): time.sleep blocks the reactor thread; preserved from the
        # original — confirm this stall is acceptable before changing it.
        while running:
            # Query MongoDB for an unfinished task.
            if not check_task():
                logger.info("没有可下载的任务,即将结束")
                running = False
                continue
            # Query Redis for a usable cookie.
            if not check_session():
                # FIX: message typo ("没有有...cookie等待" -> "没有...cookie,等待").
                logger.info("没有可用的cookie,等待")
                time.sleep(60 * 5)
                continue
            yield process.crawl(WosArticleDownloadByIdSpider)
            time.sleep(60 * 2)

    settings = get_project_settings()
    process = CrawlerProcess(settings)
    f(True)
    process.start()
def starter():
    """Run the WoS article-download spider exactly once and block until done."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(WosArticleDownloadByIdSpider)
    crawler_process.start()
if __name__ == '__main__':
    # Entry point: run the polling loop that keeps crawling until no tasks remain.
    starter_forever()