wos:add spider

main
zhaoxiangpeng 1 week ago
parent 3e50a7acef
commit 2bd56aeb10

@ -0,0 +1,32 @@
class Settings:
    """XPath selectors and API routes for driving the Web of Science web UI.

    This is the dev/English-UI configuration; ``ProSettings`` overrides the
    selectors whose ``@title`` text differs on the Chinese production UI.
    """
    env = "dev"  # environment tag for this selector set
    # API routes observed in the WoS browser traffic.
    SEARCH_ROUTE = '/api/wosnx/core/runQuerySearch'
    EXPORT_ROUTE = '/api/wosnx/indic/export/saveToFile'
    # Database picker: select the "Web of Science Core Collection" entry.
    DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science Core Collection"]'
    QUERY_INPUT_ELE = '//*[@id="advancedSearchInputArea"]'
    SEARCH_BUTTON_ELE = '//button[@data-ta="run-search"]/span[@class="mat-mdc-button-touch-target"]'
    EXPORT_BUTTON_ELE = '//*[@id="export-trigger-btn"]'
    TABWIN_BUTTON_ELE = '//*[@id="exportToTabWinButton"]'  # tab-delimited file export button
    RECORD_TYPE_SELECT_ELE = '//div[@class="ng-star-inserted"]/wos-select/button[@aria-haspopup="listbox"]'  # record-content dropdown
    FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record"]'  # "Full Record" option
    FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record and Cited References"]'  # full record incl. cited references
    RECORD_RANGE_ELE = '//*[@id="radio3-input"]'  # "records from ... to ..." radio button
    RECORD_EXPORT_START_ELE = '//input[@name="markFrom"]'
    RECORD_EXPORT_END_ELE = '//input[@name="markTo"]'
    EXPORT_FILE_ELE = '//*[@id="exportButton"]'
    # Default advanced-search query typed into QUERY_INPUT_ELE.
    INPUT_CONTENT = '(OG=(Anhui University of Science & Technology)) AND PY=(2025)'
class ProSettings(Settings):
    """Selector overrides for the production (Chinese-language) WoS UI.

    Only the selectors whose visible ``@title`` text is localized differ
    from :class:`Settings`.
    """
    # Fixed: the override must be named DB_CHANGE_ELE to actually shadow the
    # parent attribute (the original name DB_CHANGE never overrode anything).
    DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science 核心合集"]'
    # Kept for backward compatibility with any caller reading the old name.
    DB_CHANGE = DB_CHANGE_ELE
    # Fixed: tag-name typo 'botton' -> 'button' (the bad XPath matched nothing).
    EXPORT_BUTTON_ELE = '//button[@id="export-trigger-btn"]'
    FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="完整记录"]'  # "Full Record" option (localized)
    FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="全记录与引用的参考文献"]'  # full record incl. cited references (localized)
# Module-level settings instance shared by the spider code.
settings = Settings()

@ -23,7 +23,7 @@ ROBOTSTXT_OBEY = False
# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 0

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/19 10:01
# @Author : zhaoxiangpeng
# @File : crawl_article_by_id.py
import time
import logging
from typing import List
import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_wos.spiders.wos_article_download_by_id import WosArticleDownloadByIdSpider
from science_article_wos.utils import tools
logger = logging.getLogger(__name__)
def starter_forever():
    """Run the WoS article-download spider in a loop until no tasks remain.

    Polls MongoDB for pending ids and Redis for a usable session cookie,
    launching one crawl per iteration; exits when the task queue is empty.
    Blocks in :func:`CrawlerProcess.start` until the loop finishes.
    """

    def check_task() -> bool:
        # True when MongoDB still holds at least one pending (state == 0) id.
        from pymongo import MongoClient
        # Fixed: the client is now closed after each poll instead of leaking
        # one connection per loop iteration.
        with MongoClient(settings.get("MONGO_URI")) as cli:
            db = cli[settings.get("MONGO_DATABASE")]
            return db['todo_ids_wos'].find_one(filter={"state": 0}) is not None

    def check_session() -> bool:
        # True when the Redis cookie pool holds at least one "normal" session.
        from redis import Redis
        cli = Redis.from_url(settings.get("REDIS_URL"), decode_responses=True)
        try:
            for key in cli.keys('cookie_pool:wos_sid:*'):
                if cli.hget(key, "status") == "normal":
                    return True
            return False
        finally:
            # Fixed: close the connection instead of leaking it each poll.
            cli.close()

    @defer.inlineCallbacks
    def f(running: bool = True):
        while running:
            # Stop once MongoDB has no more pending download tasks.
            if not check_task():
                logger.info("没有可下载的任务,即将结束")
                running = False
                continue
            # Wait for a usable cookie before starting a crawl.
            if not check_session():
                logger.info("没有可用的cookie,等待")  # fixed doubled 有 typo
                # NOTE(review): time.sleep blocks the Twisted reactor while
                # waiting; consider twisted.internet.task.deferLater instead.
                time.sleep(60 * 5)
                continue
            yield process.crawl(WosArticleDownloadByIdSpider)
            time.sleep(60 * 2)

    settings = get_project_settings()
    process = CrawlerProcess(settings)
    f(True)
    process.start()
def starter():
    """Launch a single run of the WoS download-by-id spider and block until done."""
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(WosArticleDownloadByIdSpider)
    crawler.start()
if __name__ == '__main__':
    # Loop mode: keep launching crawls until the MongoDB task queue is empty.
    starter_forever()
Loading…
Cancel
Save