add: shelve changes
parent 12e9ed53a9
commit abdad5b786
@@ -0,0 +1,296 @@
# -*- coding: utf-8 -*-
# @Time : 2025/11/24 09:25
# @Author : zhaoxiangpeng
# @File : wos_search_export.py
import math
import json
import logging
from typing import Any
from datetime import datetime

import redis
from DrissionPage import Chromium
from DrissionPage import ChromiumPage, ChromiumOptions
from DrissionPage._pages.chromium_tab import ChromiumTab
from DrissionPage._units.listener import DataPacket, Response
from DrissionPage.errors import ElementNotFoundError

from science_article_add.utils import tools
from science_article_add.scripts.wos_parse_data import parse_full_records_txt

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
LINK = "https://webofscience.clarivate.cn/wos/woscc/advanced-search"
BATCH_DOWNLOAD_LIMIT = 500
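# (Added note) Exports are pulled in slices of BATCH_DOWNLOAD_LIMIT records per
# file, matching the from/to range inputs in the WoS export dialog; see
# distribute_page() below for how the slices are computed.

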
class Settings:
    env = "dev"
    SEARCH_ROUTE = '/api/wosnx/core/runQuerySearch'
    EXPORT_ROUTE = '/api/wosnx/indic/export/saveToFile'
    DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science Core Collection"]'
    QUERY_INPUT_ELE = '//*[@id="advancedSearchInputArea"]'
    SEARCH_BUTTON_ELE = '//button[@data-ta="run-search"]/span[@class="mat-mdc-button-touch-target"]'

    EXPORT_BUTTON_ELE = '//*[@id="export-trigger-btn"]'
    TABWIN_BUTTON_ELE = '//*[@id="exportToTabWinButton"]'  # tab-delimited file button

    RECORD_TYPE_SELECT_ELE = '//div[@class="ng-star-inserted"]/wos-select/button[@aria-haspopup="listbox"]'  # record-content select box
    FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record"]'  # Full Record
    FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record and Cited References"]'  # Full Record and Cited References

    RECORD_RANGE_ELE = '//*[@id="radio3-input"]'  # records-range radio
    RECORD_EXPORT_START_ELE = '//input[@name="markFrom"]'
    RECORD_EXPORT_END_ELE = '//input[@name="markTo"]'

    EXPORT_FILE_ELE = '//*[@id="exportButton"]'

    INPUT_CONTENT = '(OG=(Anhui University of Science & Technology)) AND PY=(2025)'


class ProSettings(Settings):
    DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science 核心合集"]'
    EXPORT_BUTTON_ELE = '//button[@id="export-trigger-btn"]'
    FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="完整记录"]'  # Full Record
    FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="全记录与引用的参考文献"]'  # Full Record and Cited References


settings = Settings()
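# (Added note) ProSettings appears to target the Chinese-locale WoS UI (its
# XPath titles are the zh equivalents of the English ones above); switching to
# `settings = ProSettings()` here would presumably be needed for accounts that
# render that interface.

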
class WosSearchExport:
    _records_found = 0
    inited: bool = False
    is_running = False

    def __init__(self, query_content: Any, options=None):
        self._records_found = 0
        self._query_id = None
        self.query_content = query_content
        self.options = options

    @classmethod
    def create_instance(cls, config: dict):
        return cls(
            query_content=config.get("query_content"),
            options=config.get('options')
        )

    def set_records_found(self, val):
        self._records_found = val

    def get_records_found(self) -> int:
        return self._records_found

    def set_query_id(self, query_id):
        self._query_id = query_id

    def get_query_id(self):
        return self._query_id

    def _initialize(self):
        self.browser = Chromium(self.options)
        self.tab = self.browser.latest_tab
        # each of these only needs to run once
        self.open_url(LINK)
        # handle the cookie preferences banner
        self.operate_cookie_first()
        self.change_db()
        self.inited = True

    def open_url(self, url):
        logger.debug('Opening url: %s' % url)
        self.tab.get(url)

    def operate_cookie_first(self):
        # handle the cookie-consent manager
        logger.debug('Operating cookie first...')
        ck_m_div = self.tab.ele('xpath://*[@id="onetrust-banner-sdk"]')
        if ck_m_div:
            ele = self.tab.ele('xpath://*[@id="onetrust-accept-btn-handler"]')
            ele.click()

    def change_db(self):
        logger.info('Changing database...')
        default_db_ele = self.tab.ele('xpath://*[@id="snSelectDb"]/button')
        c1 = default_db_ele.raw_text
        default_db_ele.click()
        self.tab.ele(
            'xpath:%(xpath)s' % {"xpath": settings.DB_CHANGE_ELE}).click()

    def input_query(self, content: str, clear_input: bool = True, tab=None):
        tab = tab or self.tab
        input_area_ele = tab.ele('xpath:%(xpath)s' % {"xpath": settings.QUERY_INPUT_ELE})
        if clear_input:
            input_area_ele.clear()  # clear the input box

        input_area_ele.input(content)  # type the search query

        def listen_func():
            tab.listen.start(settings.SEARCH_ROUTE, method="POST")

        def operation_func():
            search_button_ele = tab.ele('xpath:%(xpath)s' % {"xpath": settings.SEARCH_BUTTON_ELE})
            search_button_ele.click()

        def capture_packet(packet: DataPacket):
            search_url = tab.url
            record_id, records_found = self.get_record_info(packet.response.body)
            self.set_records_found(records_found)
            self.set_query_id(record_id)
            if not self.get_query_id():
                logger.warning('No query id found in response: %s' % packet.response.body)
            if records_found == 0:
                logger.warning('Query "%s" matched %s records' % (self.query_content, records_found))
                return
            else:
                logger.info('Query "%s" matched %s records' % (self.query_content, records_found))

            return True

        self.intercept(listen=listen_func, operation=operation_func, callback=capture_packet, tab=tab)

    def download_records(self):
        for b in self.distribute_page():
            query_id, batch_id, mark_start, mark_end = b
            self.rpa_download(mark_start, mark_end, batch=batch_id, tab=self.tab)

    def distribute_page(self):
        # compute the record range for each download batch
        logger.info("prepare downloading...")
        records_found = self.get_records_found()
        query_id = self.get_query_id()
        mark_start = 1
        mark_end = 0
        batch_id = 0
        for i in range(math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)):
            mark_end += BATCH_DOWNLOAD_LIMIT
            if mark_end > records_found:
                mark_end = records_found
            batch_id += 1
            yield query_id, batch_id, mark_start, mark_end
            mark_start += BATCH_DOWNLOAD_LIMIT
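
    # (Added example) With records_found = 1234 and BATCH_DOWNLOAD_LIMIT = 500,
    # distribute_page() yields three batches:
    #   (query_id, 1,    1,  500)
    #   (query_id, 2,  501, 1000)
    #   (query_id, 3, 1001, 1234)
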
    def clear_query(self):
        pass

    def reflush_query(self):
        pass

    def reflush_page(self):
        pass

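    # (Added note) clear_query / reflush_query / reflush_page are recovery hooks
    # that are not implemented yet; rpa_download() already falls back to
    # reflush_page() when an export control cannot be found.
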
    def rpa_download(self, start: int = 1, end: int = 500, batch: str | int = None, tab=None):
        """
        Intercept the export API before clicking download.
        """
        try:
            logger.debug("download starting...")
            tab = tab or self.tab
            tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_BUTTON_ELE}).click()  # click Export
            tab.ele('xpath:%(xpath)s' % {"xpath": settings.TABWIN_BUTTON_ELE}).click()  # choose the tab-delimited file format
            # wait for the dialog,
            # then switch the export format to Full Record and Cited References
            tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_TYPE_SELECT_ELE}).click()
            tab.ele('xpath:%(xpath)s' % {"xpath": settings.FULL_RECORD_REFERENCE_ELE}).click()

            # enter the start/end record numbers
            tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_RANGE_ELE}).click()  # switch to the records-range option
            tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_START_ELE}).input(start, clear=True)
            tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_END_ELE}).input(end, clear=True)
        except ElementNotFoundError:
            self.reflush_page()

        def listen_func():
            tab.listen.start(settings.EXPORT_ROUTE, method="POST")

        def operation_func():
            # tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_FILE_ELE}).click()  # click the export button
            tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_FILE_ELE}).click.to_download(
                save_path=DOWNLOAD_PATH,  # module-level path, assigned in the __main__ block below
                rename='%s.txt' % batch
            )

        def capture_packet(packet: DataPacket):
            g = self._parse_download(packet.response)
            for i in g:
                print(i)
            return True

        self.intercept(listen=listen_func, operation=operation_func, callback=capture_packet, tab=tab)

    def intercept(self, listen, operation, callback, tab=None):
        listen()
        operation()
        for packet in tab.listen.steps(count=3):
            print(packet.response.body)
            if not self.intercept_verify(packet):
                continue
            r = callback(packet)
            if r:
                break
        return

    @staticmethod
    def intercept_verify(packet: DataPacket):
        content = packet.response.body
        if isinstance(content, bytes) and content.find(b'"Server.passiveVerificationRequired"') != -1:
            return False
        else:
            return True
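
    # (Added note) intercept() factors out the listen pattern shared by
    # input_query() and rpa_download(): `listen` arms DrissionPage's tab.listen
    # on a route, `operation` triggers the UI action, and each captured packet
    # is handed to `callback` until it returns True (at most count=3 packets
    # are inspected). Responses containing "Server.passiveVerificationRequired"
    # appear to be a passive bot-verification challenge, so intercept_verify()
    # skips them instead of treating them as data.
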
    def _parse_download(self, response: Response):
        batch_time = datetime.now()
        item_g = parse_full_records_txt(response.body.encode())
        parse_count = 0
        for data_dic in item_g:
            t_id = data_dic.pop('ut', None)
            if t_id:
                parse_count += 1
                yield dict(third_id=t_id, exported=data_dic, updated_at=batch_time)
                # parse the times-cited count
                if cited_num := tools.str2int(data_dic.get("tc", 0), 0):
                    yield dict(third_id=t_id, cited=cited_num, updated_at=batch_time)

    @staticmethod
    def get_record_info(body: bytes):
        resp_texts = body.strip().split(b'\n')
        query_id = None
        records_found = 0
        for resp_text in resp_texts:
            resp_row_dict: dict = json.loads(resp_text)
            if resp_row_dict.get("key") == "searchInfo":
                query_id = resp_row_dict.get("payload", {}).get("QueryID")
                records_found = resp_row_dict.get("payload", {}).get("RecordsFound")  # number of records matched
                break  # stop at the first match
        return query_id, records_found
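
    # (Added note) The runQuerySearch response body is newline-delimited JSON;
    # the line this method scans for looks roughly like (other payload fields
    # omitted):
    #   {"key": "searchInfo", "payload": {"QueryID": 1, "RecordsFound": 1234}}
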
    def execute(self):
        if not self.inited:
            logger.info('Initializing page')
            self._initialize()
        self.input_query(self.query_content)
        self.download_records()

    def start(self):
        pass

    def stop(self):
        self.tab.close()


if __name__ == '__main__':
    DOWNLOAD_PATH = r'Y:\wos-metadata\wos increment-202512\00'
    conf = dict(
        query_content="(OG=(Southwest University of Science & Technology - China)) AND PY=(2025)",
        download_dir=DOWNLOAD_PATH
    )
    co = ChromiumOptions()  # .headless()
    co.set_pref('download.default_directory', conf['download_dir'])
    conf['options'] = co

    ins = WosSearchExport.create_instance(config=conf)
    ins.execute()
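
    # (Added note) Running this module opens Chromium, switches the database to
    # the Core Collection, runs query_content, and saves one tab-delimited
    # <batch_id>.txt per 500-record slice into DOWNLOAD_PATH.
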

@@ -0,0 +1,95 @@
from typing import Any, List, Union
from datetime import datetime
import scrapy
from scrapy.http import Response
from scrapy.http.request.json_request import JsonRequest
from scrapy.crawler import Crawler

from science_article_add.items.wos import WosArticleItem, WosCitedNumberItem, WosIdRelationItem
from science_article_add.scripts.wos_parse_data import parse_full_records
from science_article_add.models import wos_model as model
from science_article_add.utils import tools
from science_article_add.configs import wos as config


def maybe_list(val: Union[int, List[int]]) -> List[int]:
    if isinstance(val, int):
        return [val]
    return list(val)
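
# (Added note) maybe_list() normalizes a scalar id or an iterable of ids to a
# list, e.g. maybe_list(3) -> [3] and maybe_list([3, 4]) -> [3, 4].

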
class DownloadByQidSpider(scrapy.Spider):
    name = "download_by_qid"

    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_add.middlewares.wos.WosSidParamMiddleware": 500
        },
        ITEM_PIPELINES={
            "science_article_add.pipelines.mongo.MongoPipeline": 300,
        },
        LOG_LEVEL="INFO"
    )

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        return super().from_crawler(crawler, *args, **kwargs)

    def __init__(self, record_id: str, mark_from: int = 1, mark_to: int = 500, records_found: int = None, **kwargs):
        super().__init__()
        self.record_id = record_id
        self.records_found = records_found
        self.mark_from = mark_from
        self.mark_to = mark_to
        self.task_id = None
        self.org_id = None
        self.query_id = None
        self.bind_relation_enable = False
        self.bind_relation_d = None
        if self.bind_relation_enable:
            self.build_relation()
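
    # (Added note) task_id / org_id / query_id start as None and
    # bind_relation_enable is hard-coded to False, so build_relation() is
    # effectively disabled until these are wired up (e.g. from crawler kwargs).
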
    def build_relation(self):
        bind_relation_d = dict()
        if self.task_id:
            bind_relation_d.setdefault("task_ids", maybe_list(self.task_id))
        if self.org_id:
            bind_relation_d.setdefault("school_ids", maybe_list(self.org_id))
        if self.query_id:
            bind_relation_d.setdefault("query_ids", maybe_list(self.query_id))
        self.bind_relation_d = bind_relation_d
        return bind_relation_d

    async def start(self):
        query_id = self.record_id
        records_found = self.records_found
        mark_start = self.mark_from
        mark_end = self.mark_to
        yield JsonRequest(config.WOS_EXPORT_FILE_API, method='POST',
                          data=model.export_search_data_to_txt(query_id, mark_from=mark_start,
                                                               mark_to=mark_end),
                          callback=self.download_parse)
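
    # (Added note) This spider mirrors the browser flow in wos_search_export.py
    # but POSTs the export payload directly; config.WOS_EXPORT_FILE_API is
    # presumably the same saveToFile endpoint, record_id is the QueryID captured
    # from a runQuerySearch response, and mark_from/mark_to select the same
    # <=500-record slice the export dialog would.
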
    def download_parse(self, response: Response, **kwargs: Any) -> Any:
        parse_count = 0
        batch_time = datetime.now()
        records = parse_full_records(response.body)
        for data_dic in records:
            t_id = data_dic.pop('ut', None)
            if t_id:
                parse_count += 1
                article_item = WosArticleItem()
                article_item['third_id'] = t_id
                article_item['exported'] = data_dic
                article_item['updated_at'] = batch_time
                yield article_item
                # parse the times-cited count
                if cited_num := tools.str2int(data_dic.get("tc", 0), 0):
                    cited_item = WosCitedNumberItem()
                    cited_item['third_id'] = t_id
                    cited_item['cited'] = cited_num
                    cited_item['updated_at'] = batch_time
                    yield cited_item
                if self.bind_relation_enable and self.bind_relation_d:
                    # relations are bound only when relation binding is enabled
                    relation_item = WosIdRelationItem()
                    relation_item['third_id'] = t_id
                    relation_item.update(**self.bind_relation_d)
                    yield relation_item