|
|
# -*- coding: utf-8 -*-
|
|
|
# @Time : 2026/1/19 10:01
|
|
|
# @Author : zhaoxiangpeng
|
|
|
# @File : crawl_article_by_id.py
|
|
|
|
|
|
import time
|
|
|
import logging
|
|
|
from typing import List
|
|
|
import pymysql
|
|
|
from pymysql import cursors
|
|
|
from twisted.internet import defer
|
|
|
from scrapy.crawler import CrawlerProcess
|
|
|
from scrapy.utils.project import get_project_settings
|
|
|
from science_article_wos.spiders.wos_article_download_by_id import WosArticleDownloadByIdSpider
|
|
|
from science_article_wos.utils import tools
|
|
|
# Module-level logger, named after this module; used for the status messages
# emitted by the polling loop in starter_forever().
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
def starter_forever():
    """Run the WoS download-by-id spider repeatedly until no task is pending.

    Before every crawl the loop checks two preconditions:

    1. MongoDB collection ``todo_ids_wos`` contains at least one document
       with ``state == 0`` -- otherwise the loop ends.
    2. Redis holds at least one ``cookie_pool:wos_sid:*`` hash whose
       ``status`` field equals ``"normal"`` -- otherwise the loop sleeps
       five minutes and checks again.

    When both hold, one crawl of ``WosArticleDownloadByIdSpider`` is run,
    followed by a two-minute pause before the next round.
    """

    def check_task() -> bool:
        """Return True if ``todo_ids_wos`` has a document with ``state == 0``."""
        from pymongo import MongoClient

        cli = MongoClient(settings.get("MONGO_URI"))
        try:
            db = cli[settings.get("MONGO_DATABASE")]
            return db['todo_ids_wos'].find_one(filter={"state": 0}) is not None
        finally:
            # This helper runs on every loop iteration; close the client so
            # repeated polling does not pile up connection pools.
            cli.close()

    def check_session() -> bool:
        """Return True if any ``cookie_pool:wos_sid:*`` hash has status ``"normal"``."""
        from redis import Redis

        cli = Redis.from_url(settings.get("REDIS_URL"), decode_responses=True)
        try:
            # SCAN iterates the keyspace incrementally, avoiding the
            # server-blocking KEYS command; stop at the first usable entry.
            return any(
                cli.hget(key, "status") == "normal"
                for key in cli.scan_iter(match='cookie_pool:wos_sid:*')
            )
        finally:
            # Release the connection opened for this single check.
            cli.close()

    @defer.inlineCallbacks
    def f(running: bool = True):
        """Polling loop: crawl while tasks remain and a usable cookie exists.

        :param running: initial loop flag; passing False skips the loop.
        """
        while running:
            # Ask MongoDB whether any task is still waiting to be executed.
            if not check_task():
                logger.info("没有可下载的任务,即将结束")
                break

            # Ask Redis whether a usable cookie is available.
            if not check_session():
                logger.info("没有有可用的cookie,等待")
                # NOTE(review): time.sleep blocks the calling thread (the
                # reactor thread once the reactor is running). Kept as-is to
                # preserve the original timing behaviour.
                time.sleep(60 * 5)
                continue

            # Wait for the crawl's Deferred to fire before the next round.
            yield process.crawl(WosArticleDownloadByIdSpider)

            # Pause between consecutive crawls (blocking, see note above).
            time.sleep(60 * 2)

    settings = get_project_settings()
    process = CrawlerProcess(settings)

    # Kick off the loop first so the first crawl (if any) is scheduled
    # before process.start() is called.
    f(True)
    process.start()
|
|
|
|
|
|
|
|
|
def starter():
    """Run a single crawl of ``WosArticleDownloadByIdSpider``.

    Builds a ``CrawlerProcess`` from the project settings, schedules the
    spider once and then starts the process.
    """
    project_settings = get_project_settings()
    crawler_process = CrawlerProcess(project_settings)

    # Schedule exactly one crawl, then hand control to the crawler process.
    crawler_process.crawl(WosArticleDownloadByIdSpider)
    crawler_process.start()
|
|
|
|
|
|
|
|
|
# Script entry point: when executed directly, run the continuous polling loop.
if '__main__' == __name__:
    starter_forever()
|