diff --git a/science_article_wos/science_article_wos/configs/wos_dp.py b/science_article_wos/science_article_wos/configs/wos_dp.py new file mode 100644 index 0000000..ced89fe --- /dev/null +++ b/science_article_wos/science_article_wos/configs/wos_dp.py @@ -0,0 +1,32 @@ +class Settings: + env = "dev" + SEARCH_ROUTE = '/api/wosnx/core/runQuerySearch' + EXPORT_ROUTE = '/api/wosnx/indic/export/saveToFile' + DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science Core Collection"]' + QUERY_INPUT_ELE = '//*[@id="advancedSearchInputArea"]' + SEARCH_BUTTON_ELE = '//button[@data-ta="run-search"]/span[@class="mat-mdc-button-touch-target"]' + + EXPORT_BUTTON_ELE = '//*[@id="export-trigger-btn"]' + TABWIN_BUTTON_ELE = '//*[@id="exportToTabWinButton"]' # 制表符分割文件button + + RECORD_TYPE_SELECT_ELE = '//div[@class="ng-star-inserted"]/wos-select/button[@aria-haspopup="listbox"]' # 记录内容选择框 + FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record"]' # 完整记录 + FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record and Cited References"]' # 全记录与参考文献 + + RECORD_RANGE_ELE = '//*[@id="radio3-input"]' # 记录范围 + RECORD_EXPORT_START_ELE = '//input[@name="markFrom"]' + RECORD_EXPORT_END_ELE = '//input[@name="markTo"]' + + EXPORT_FILE_ELE = '//*[@id="exportButton"]' + + INPUT_CONTENT = '(OG=(Anhui University of Science & Technology)) AND PY=(2025)' + + +class ProSettings(Settings): + DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science 核心合集"]' + EXPORT_BUTTON_ELE = '//button[@id="export-trigger-btn"]' + FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="完整记录"]' # 完整记录 + FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="全记录与引用的参考文献"]' # 全记录与参考文献 + + +settings = Settings() diff --git 
a/science_article_wos/science_article_wos/settings.py b/science_article_wos/science_article_wos/settings.py index 1536b74..f2e36be 100644 --- a/science_article_wos/science_article_wos/settings.py +++ b/science_article_wos/science_article_wos/settings.py @@ -23,7 +23,7 @@ ROBOTSTXT_OBEY = False # Concurrency and throttling settings #CONCURRENT_REQUESTS = 16 CONCURRENT_REQUESTS_PER_DOMAIN = 1 -DOWNLOAD_DELAY = 1 +DOWNLOAD_DELAY = 0 # Disable cookies (enabled by default) #COOKIES_ENABLED = False diff --git a/science_article_wos/starter/crawl_article_by_id.py b/science_article_wos/starter/crawl_article_by_id.py new file mode 100644 index 0000000..087b074 --- /dev/null +++ b/science_article_wos/starter/crawl_article_by_id.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# @Time : 2026/1/19 10:01 +# @Author : zhaoxiangpeng +# @File : crawl_article_by_id.py + +import time +import logging +from typing import List +import pymysql +from pymysql import cursors +from twisted.internet import defer +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings +from science_article_wos.spiders.wos_article_download_by_id import WosArticleDownloadByIdSpider +from science_article_wos.utils import tools +logger = logging.getLogger(__name__) + + +def starter_forever(): + def check_task() -> bool: + from pymongo import MongoClient + cli = MongoClient(settings.get("MONGO_URI")) + db = cli[settings.get("MONGO_DATABASE")] + r = db['todo_ids_wos'].find_one(filter={"state": 0}) + if r: + return True + return False + + def check_session() -> bool: + from redis import Redis + cli = Redis.from_url(settings.get("REDIS_URL"), decode_responses=True) + keys = cli.keys('cookie_pool:wos_sid:*') + for key in keys: + # 获取所有的信息 + status = cli.hget(key, "status") + if status == "normal": + real_sid = key.rsplit(':', maxsplit=1)[-1] + return True + return False + + @defer.inlineCallbacks + def f(running: bool = True): + while running: + # 连接到mongodb查询是否有未执行的任务 + if not 
check_task(): + logger.info("没有可下载的任务,即将结束") + running = False + continue + # 查询redis中是否有可用的cookie + if not check_session(): + logger.info("没有可用的cookie,等待") + time.sleep(60 * 5) + continue + yield process.crawl(WosArticleDownloadByIdSpider) + time.sleep(60 * 2) + + settings = get_project_settings() + process = CrawlerProcess(settings) + f(True) + process.start() + + +def starter(): + process = CrawlerProcess(get_project_settings()) + process.crawl(WosArticleDownloadByIdSpider) + process.start() + + +if __name__ == '__main__': + starter_forever()