wos:add spider

main
zhaoxiangpeng 1 week ago
parent 3e50a7acef
commit 2bd56aeb10

@ -0,0 +1,32 @@
class Settings:
    """XPath selectors and API routes for driving the Web of Science web UI.

    This is the dev/English-UI configuration; ``ProSettings`` overrides the
    selectors whose ``@title`` text differs on the Chinese production UI.
    """
    env = "dev"  # environment tag for this selector set
    # API routes observed in the WoS browser traffic.
    SEARCH_ROUTE = '/api/wosnx/core/runQuerySearch'
    EXPORT_ROUTE = '/api/wosnx/indic/export/saveToFile'
    # Database picker: select the "Web of Science Core Collection" entry.
    DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science Core Collection"]'
    QUERY_INPUT_ELE = '//*[@id="advancedSearchInputArea"]'
    SEARCH_BUTTON_ELE = '//button[@data-ta="run-search"]/span[@class="mat-mdc-button-touch-target"]'
    EXPORT_BUTTON_ELE = '//*[@id="export-trigger-btn"]'
    TABWIN_BUTTON_ELE = '//*[@id="exportToTabWinButton"]'  # tab-delimited file export button
    RECORD_TYPE_SELECT_ELE = '//div[@class="ng-star-inserted"]/wos-select/button[@aria-haspopup="listbox"]'  # record-content dropdown
    FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record"]'  # "Full Record" option
    FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record and Cited References"]'  # full record incl. cited references
    RECORD_RANGE_ELE = '//*[@id="radio3-input"]'  # "records from ... to ..." radio button
    RECORD_EXPORT_START_ELE = '//input[@name="markFrom"]'
    RECORD_EXPORT_END_ELE = '//input[@name="markTo"]'
    EXPORT_FILE_ELE = '//*[@id="exportButton"]'
    # Default advanced-search query typed into QUERY_INPUT_ELE.
    INPUT_CONTENT = '(OG=(Anhui University of Science & Technology)) AND PY=(2025)'
class ProSettings(Settings):
    """Selector overrides for the production (Chinese-language) WoS UI.

    Only the selectors whose visible ``@title`` text is localized differ
    from :class:`Settings`.
    """
    # Fixed: the override must be named DB_CHANGE_ELE to actually shadow the
    # parent attribute (the original name DB_CHANGE never overrode anything).
    DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science 核心合集"]'
    # Kept for backward compatibility with any caller reading the old name.
    DB_CHANGE = DB_CHANGE_ELE
    # Fixed: tag-name typo 'botton' -> 'button' (the bad XPath matched nothing).
    EXPORT_BUTTON_ELE = '//button[@id="export-trigger-btn"]'
    FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="完整记录"]'  # "Full Record" option (localized)
    FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="全记录与引用的参考文献"]'  # full record incl. cited references (localized)
# Module-level settings instance shared by the spider code.
settings = Settings()

@ -23,7 +23,7 @@ ROBOTSTXT_OBEY = False
# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 0

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/19 10:01
# @Author : zhaoxiangpeng
# @File : crawl_article_by_id.py
import time
import logging
from typing import List
import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_wos.spiders.wos_article_download_by_id import WosArticleDownloadByIdSpider
from science_article_wos.utils import tools
logger = logging.getLogger(__name__)
def starter_forever():
    """Run the WoS article-download spider in a loop until no tasks remain.

    Polls MongoDB for pending ids and Redis for a usable session cookie,
    launching one crawl per iteration; exits when the task queue is empty.
    Blocks in :func:`CrawlerProcess.start` until the loop finishes.
    """

    def check_task() -> bool:
        # True when MongoDB still holds at least one pending (state == 0) id.
        from pymongo import MongoClient
        # Fixed: the client is now closed after each poll instead of leaking
        # one connection per loop iteration.
        with MongoClient(settings.get("MONGO_URI")) as cli:
            db = cli[settings.get("MONGO_DATABASE")]
            return db['todo_ids_wos'].find_one(filter={"state": 0}) is not None

    def check_session() -> bool:
        # True when the Redis cookie pool holds at least one "normal" session.
        from redis import Redis
        cli = Redis.from_url(settings.get("REDIS_URL"), decode_responses=True)
        try:
            for key in cli.keys('cookie_pool:wos_sid:*'):
                if cli.hget(key, "status") == "normal":
                    return True
            return False
        finally:
            # Fixed: close the connection instead of leaking it each poll.
            cli.close()

    @defer.inlineCallbacks
    def f(running: bool = True):
        while running:
            # Stop once MongoDB has no more pending download tasks.
            if not check_task():
                logger.info("没有可下载的任务,即将结束")
                running = False
                continue
            # Wait for a usable cookie before starting a crawl.
            if not check_session():
                logger.info("没有可用的cookie,等待")  # fixed doubled 有 typo
                # NOTE(review): time.sleep blocks the Twisted reactor while
                # waiting; consider twisted.internet.task.deferLater instead.
                time.sleep(60 * 5)
                continue
            yield process.crawl(WosArticleDownloadByIdSpider)
            time.sleep(60 * 2)

    settings = get_project_settings()
    process = CrawlerProcess(settings)
    f(True)
    process.start()
def starter():
    """Launch a single run of the WoS download-by-id spider and block until done."""
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(WosArticleDownloadByIdSpider)
    crawler.start()
if __name__ == '__main__':
    # Loop mode: keep launching crawls until the MongoDB task queue is empty.
    starter_forever()
Loading…
Cancel
Save