add:搁置更改
parent
12e9ed53a9
commit
abdad5b786
@ -0,0 +1,296 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2025/11/24 09:25
|
||||
# @Author : zhaoxiangpeng
|
||||
# @File : wos_search_export.py
|
||||
import math
|
||||
import json
|
||||
import logging
|
||||
from typing import Any
|
||||
from datetime import datetime
|
||||
|
||||
import redis
|
||||
from DrissionPage import Chromium
|
||||
from DrissionPage import ChromiumPage, ChromiumOptions
|
||||
from DrissionPage._pages.chromium_tab import ChromiumTab
|
||||
from DrissionPage._units.listener import DataPacket, Response
|
||||
from DrissionPage.errors import ElementNotFoundError
|
||||
|
||||
from science_article_add.utils import tools
|
||||
from science_article_add.scripts.wos_parse_data import parse_full_records_txt
|
||||
|
||||
# Entry page of the Web of Science Core Collection advanced search.
LINK = "https://webofscience.clarivate.cn/wos/woscc/advanced-search"

# Number of records requested per export batch.
BATCH_DOWNLOAD_LIMIT = 500

# Module logger; its level is forced to DEBUG at import time.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class Settings:
    """API routes and XPath selectors used to drive the search/export pages.

    The element titles in these selectors are the English UI labels.
    """

    env = "dev"

    # API routes that the browser listener watches for.
    SEARCH_ROUTE = '/api/wosnx/core/runQuerySearch'
    EXPORT_ROUTE = '/api/wosnx/indic/export/saveToFile'

    # Database selector entry.
    DB_CHANGE_ELE = (
        '//*[@id="global-select"]/div/div[@aria-label="Select database"]'
        '/div[@title="Web of Science Core Collection"]'
    )

    # Advanced-search query box and its "run search" button.
    QUERY_INPUT_ELE = '//*[@id="advancedSearchInputArea"]'
    SEARCH_BUTTON_ELE = (
        '//button[@data-ta="run-search"]'
        '/span[@class="mat-mdc-button-touch-target"]'
    )

    # Export menu trigger and the tab-delimited file button.
    EXPORT_BUTTON_ELE = '//*[@id="export-trigger-btn"]'
    TABWIN_BUTTON_ELE = '//*[@id="exportToTabWinButton"]'

    # Record-content select box and two of its options.
    RECORD_TYPE_SELECT_ELE = (
        '//div[@class="ng-star-inserted"]/wos-select'
        '/button[@aria-haspopup="listbox"]'
    )
    # "Full Record" option.
    FULL_RECORD_ELE = (
        '//div[@id="global-select"]//div[@class="options options-menu"]'
        '/div[@title="Full Record"]'
    )
    # "Full Record and Cited References" option.
    FULL_RECORD_REFERENCE_ELE = (
        '//div[@id="global-select"]//div[@class="options options-menu"]'
        '/div[@title="Full Record and Cited References"]'
    )

    # Record-range radio button and its from/to inputs.
    RECORD_RANGE_ELE = '//*[@id="radio3-input"]'
    RECORD_EXPORT_START_ELE = '//input[@name="markFrom"]'
    RECORD_EXPORT_END_ELE = '//input[@name="markTo"]'

    # Final "export" button of the export dialog.
    EXPORT_FILE_ELE = '//*[@id="exportButton"]'

    # Sample query expression.
    INPUT_CONTENT = '(OG=(Anhui University of Science & Technology)) AND PY=(2025)'
|
||||
|
||||
|
||||
class ProSettings(Settings):
    """Selector overrides whose element titles are the Chinese UI labels.

    NOTE(review): the name suggests a production profile -- confirm. The
    module-level ``settings`` below still uses the base ``Settings``.
    """

    # Must be named DB_CHANGE_ELE to override the selector the code reads;
    # the original DB_CHANGE name never took effect.
    DB_CHANGE_ELE = (
        '//*[@id="global-select"]/div/div[@aria-label="Select database"]'
        '/div[@title="Web of Science 核心合集"]'
    )
    # Kept so any external reference to the old attribute name still works.
    DB_CHANGE = DB_CHANGE_ELE

    # Fixed tag name: "botton" is not an HTML element, so it never matched.
    EXPORT_BUTTON_ELE = '//button[@id="export-trigger-btn"]'

    # "Full Record" option.
    FULL_RECORD_ELE = (
        '//div[@id="global-select"]//div[@class="options options-menu"]'
        '/div[@title="完整记录"]'
    )
    # "Full Record and Cited References" option.
    FULL_RECORD_REFERENCE_ELE = (
        '//div[@id="global-select"]//div[@class="options options-menu"]'
        '/div[@title="全记录与引用的参考文献"]'
    )


# Active settings object read by the automation code in this module.
settings = Settings()
|
||||
|
||||
|
||||
class WosSearchExport:
|
||||
_records_found = 0
|
||||
inited: bool = False
|
||||
is_running = False
|
||||
|
||||
def __init__(self, query_content: Any, options=None):
|
||||
self._records_found = 0
|
||||
self._query_id = None
|
||||
self.query_content = query_content
|
||||
self.options = options
|
||||
|
||||
@classmethod
|
||||
def create_instance(cls, config: dict):
|
||||
return cls(
|
||||
query_content=config.get("query_content"),
|
||||
options=config.get('options')
|
||||
)
|
||||
|
||||
def set_records_found(self, val):
|
||||
self._records_found = val
|
||||
|
||||
def get_records_found(self) -> int:
|
||||
return self._records_found
|
||||
|
||||
def set_query_id(self, query_id):
|
||||
self._query_id = query_id
|
||||
|
||||
def get_query_id(self):
|
||||
return self._query_id
|
||||
|
||||
def _initialize(self):
|
||||
self.browser = Chromium(self.options)
|
||||
self.tab = self.browser.latest_tab
|
||||
# 都只需要执行一次
|
||||
self.open_url(LINK)
|
||||
# 处理cookie的首选项
|
||||
self.operate_cookie_first()
|
||||
self.change_db()
|
||||
self.inited = True
|
||||
|
||||
def open_url(self, url):
|
||||
logger.debug('Opening url: %s' % url)
|
||||
self.tab.get(url)
|
||||
|
||||
def operate_cookie_first(self):
|
||||
# cookie管理处理
|
||||
logger.debug('Operating cookie first...')
|
||||
ck_m_div = self.tab.ele('xpath://*[@id="onetrust-banner-sdk"]')
|
||||
if ck_m_div:
|
||||
ele = self.tab.ele('xpath://*[@id="onetrust-accept-btn-handler"]')
|
||||
ele.click()
|
||||
|
||||
def change_db(self):
|
||||
logger.info('Changing database...')
|
||||
default_db_ele = self.tab.ele('xpath://*[@id="snSelectDb"]/button')
|
||||
c1 = default_db_ele.raw_text
|
||||
default_db_ele.click()
|
||||
self.tab.ele(
|
||||
'xpath:%(xpath)s' % {"xpath": settings.DB_CHANGE_ELE}).click()
|
||||
|
||||
def input_query(self, content: str, clear_input: bool = True, tab=None):
|
||||
tab = tab or self.tab
|
||||
input_area_ele = tab.ele('xpath:%(xpath)s' % {"xpath": settings.QUERY_INPUT_ELE})
|
||||
if clear_input:
|
||||
input_area_ele.clear() # 清空
|
||||
|
||||
input_area_ele.input(content) # 输入检索内容
|
||||
|
||||
def listen_func():
|
||||
tab.listen.start(settings.SEARCH_ROUTE, method="POST")
|
||||
|
||||
def operation_func():
|
||||
search_button_ele = tab.ele('xpath:%(xpath)s' % {"xpath": settings.SEARCH_BUTTON_ELE})
|
||||
search_button_ele.click()
|
||||
|
||||
def capture_packet(packet: DataPacket):
|
||||
search_url = tab.url
|
||||
record_id, records_found = self.get_record_info(packet.response.body)
|
||||
self.set_records_found(records_found)
|
||||
self.set_query_id(record_id)
|
||||
if not self.get_query_id():
|
||||
logger.warning('未找到记录 %s' % packet.response.body)
|
||||
|
||||
if records_found == 0:
|
||||
logger.warning('检索式 "%s" 找到记录 %s 条' % (self.query_content, records_found))
|
||||
return
|
||||
|
||||
else:
|
||||
logger.info('检索式 "%s" 找到记录 %s 条' % (self.query_content, records_found))
|
||||
|
||||
return True
|
||||
|
||||
self.intercept(listen=listen_func, operation=operation_func, callback=capture_packet, tab=tab)
|
||||
|
||||
def download_records(self):
|
||||
for b in self.distribute_page():
|
||||
query_id, batch_id, mark_start, mark_end = b
|
||||
self.rpa_download(mark_start, mark_end, batch=batch_id, tab=self.tab)
|
||||
|
||||
def distribute_page(self):
|
||||
# 计算页码
|
||||
logger.info("prepare downloading...")
|
||||
records_found = self.get_records_found()
|
||||
query_id = self.get_query_id()
|
||||
mark_start = 1
|
||||
mark_end = 0
|
||||
batch_id = 0
|
||||
for i in range(math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)):
|
||||
mark_end += BATCH_DOWNLOAD_LIMIT
|
||||
if mark_end > records_found:
|
||||
mark_end = records_found
|
||||
batch_id += 1
|
||||
yield query_id, batch_id, mark_start, mark_end
|
||||
|
||||
mark_start += BATCH_DOWNLOAD_LIMIT
|
||||
|
||||
def clear_query(self):
|
||||
pass
|
||||
|
||||
def reflush_query(self):
|
||||
pass
|
||||
|
||||
def reflush_page(self):
|
||||
pass
|
||||
|
||||
def rpa_download(self, start: int = 1, end: int = 500, batch: str | int = None, tab=None):
|
||||
"""
|
||||
点击下载前拦截api
|
||||
"""
|
||||
try:
|
||||
logger.debug("download starting...")
|
||||
tab = tab or self.tab
|
||||
tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_BUTTON_ELE}).click() # 点击导出
|
||||
tab.ele('xpath:%(xpath)s' % {"xpath": settings.TABWIN_BUTTON_ELE}).click() # 选择制表符分割
|
||||
# 等待弹框
|
||||
# 切换导出格式选择全记录与参考文献
|
||||
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_TYPE_SELECT_ELE}).click()
|
||||
tab.ele('xpath:%(xpath)s' % {"xpath": settings.FULL_RECORD_REFERENCE_ELE}).click()
|
||||
|
||||
# 输入记录起止
|
||||
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_RANGE_ELE}).click() # 切换到范围
|
||||
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_START_ELE}).input(start, clear=True)
|
||||
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_END_ELE}).input(end, clear=True)
|
||||
except ElementNotFoundError:
|
||||
self.reflush_page()
|
||||
|
||||
def listen_func():
|
||||
tab.listen.start(settings.EXPORT_ROUTE, method="POST")
|
||||
|
||||
def operation_func():
|
||||
# tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_FILE_ELE}).click() # 点击导出按钮
|
||||
tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_FILE_ELE}).click.to_download(
|
||||
save_path=DOWNLOAD_PATH,
|
||||
rename='%s.txt' % batch
|
||||
)
|
||||
|
||||
def capture_packet(packet: DataPacket):
|
||||
g = self._parse_download(packet.response)
|
||||
for i in g:
|
||||
print(i)
|
||||
return True
|
||||
|
||||
self.intercept(listen=listen_func, operation=operation_func, callback=capture_packet, tab=tab)
|
||||
|
||||
def intercept(self, listen, operation, callback, tab=None):
|
||||
listen()
|
||||
operation()
|
||||
for packet in tab.listen.steps(count=3):
|
||||
print(packet.response.body)
|
||||
if not self.intercept_verify(packet):
|
||||
continue
|
||||
r = callback(packet)
|
||||
if r:
|
||||
break
|
||||
return
|
||||
|
||||
@staticmethod
|
||||
def intercept_verify(packet: DataPacket):
|
||||
content = packet.response.body
|
||||
if isinstance(content, bytes) and content.find(b'"Server.passiveVerificationRequired"') != -1:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def _parse_download(self, response: Response):
|
||||
batch_time = datetime.now()
|
||||
item_g = parse_full_records_txt(response.body.encode())
|
||||
parse_count = 0
|
||||
for data_dic in item_g:
|
||||
t_id = data_dic.pop('ut', None)
|
||||
if t_id:
|
||||
parse_count += 1
|
||||
yield dict(third_id=t_id, exported=data_dic, updated_at=batch_time)
|
||||
# 解析被引量
|
||||
if cited_num := tools.str2int(data_dic.get("tc", 0), 0):
|
||||
yield dict(third_id=t_id, cited=cited_num, updated_at=batch_time)
|
||||
|
||||
@staticmethod
|
||||
def get_record_info(body: bytes):
|
||||
resp_texts = body.strip().split(b'\n')
|
||||
query_id = None
|
||||
records_found = 0
|
||||
for resp_text in resp_texts:
|
||||
resp_row_dict: dict = json.loads(resp_text)
|
||||
if resp_row_dict.get("key") == "searchInfo":
|
||||
query_id = resp_row_dict.get("payload", {}).get("QueryID")
|
||||
records_found = resp_row_dict.get("payload", {}).get("RecordsFound") # 找到的记录
|
||||
break # 找到就结束
|
||||
return query_id, records_found
|
||||
|
||||
def execute(self):
|
||||
if not self.inited:
|
||||
logger.info('初始化页面')
|
||||
self._initialize()
|
||||
self.input_query(self.query_content)
|
||||
self.download_records()
|
||||
|
||||
def start(self):
|
||||
pass
|
||||
|
||||
def stop(self):
|
||||
self.tab.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Directory that receives the exported files.
    DOWNLOAD_PATH = r'Y:\wos-metadata\wos increment-202512\00'

    # Browser options; the download preference points at the same directory.
    co = ChromiumOptions()  # .headless()
    co.set_pref('download.default_directory', DOWNLOAD_PATH)

    conf = {
        "query_content": "(OG=(Southwest University of Science & Technology - China)) AND PY=(2025)",
        "download_dir": DOWNLOAD_PATH,
        "options": co,
    }

    ins = WosSearchExport.create_instance(config=conf)
    ins.execute()
|
||||
@ -0,0 +1,95 @@
|
||||
from typing import Any, List, Union
|
||||
from datetime import datetime
|
||||
import scrapy
|
||||
from scrapy.http import Response
|
||||
from scrapy.http.request.json_request import JsonRequest
|
||||
from scrapy.crawler import Crawler
|
||||
|
||||
from science_article_add.items.wos import WosArticleItem, WosCitedNumberItem, WosIdRelationItem
|
||||
from science_article_add.scripts.wos_parse_data import parse_full_records
|
||||
from science_article_add.models import wos_model as model
|
||||
from science_article_add.utils import tools
|
||||
from science_article_add.configs import wos as config
|
||||
|
||||
|
||||
def maybe_list(val: Union[int, List[int]]) -> List[int]:
    """Return ``val`` as a list: a lone int is wrapped, an iterable is copied."""
    return [val] if isinstance(val, int) else list(val)
|
||||
|
||||
|
||||
class DownloadByQidSpider(scrapy.Spider):
    """Export one record range of an existing query id and emit items.

    A single POST is sent to ``config.WOS_EXPORT_FILE_API``; the response is
    parsed into article items, citation-count items and, when relation
    binding is enabled, id-relation items.
    """

    name = "download_by_qid"

    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_add.middlewares.wos.WosSidParamMiddleware": 500
        },
        ITEM_PIPELINES={
            "science_article_add.pipelines.mongo.MongoPipeline": 300,
        },
        LOG_LEVEL="INFO"
    )

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Delegate to the default Scrapy factory."""
        return super().from_crawler(crawler, *args, **kwargs)

    def __init__(self, record_id: str, mark_from: int = 1, mark_to: int = 500, records_found: int = None, **kwargs):
        """Store the export parameters.

        Args:
            record_id: Query id whose records are exported.
            mark_from: First record position to export.
            mark_to: Last record position to export.
            records_found: Total record count of the query; only stored.
            **kwargs: Accepted for compatibility; not used.
        """
        super().__init__()
        self.record_id = record_id
        self.records_found = records_found
        # Spider arguments given on the command line arrive as strings.
        self.mark_from = int(mark_from)
        self.mark_to = int(mark_to)
        self.task_id = None
        self.org_id = None
        self.query_id = None
        self.bind_relation_enable = False
        self.bind_relation_d = None
        if self.bind_relation_enable:
            self.build_relation()

    def build_relation(self):
        """Build, store and return the relation dict.

        The dict holds ``task_ids`` / ``school_ids`` / ``query_ids`` lists for
        whichever of ``task_id`` / ``org_id`` / ``query_id`` is set.
        """
        # Fill a local dict: ``self.bind_relation_d`` is still None here, so
        # calling ``setdefault`` on it raised AttributeError.
        bind_relation_d = dict()
        if self.task_id:
            bind_relation_d["task_ids"] = maybe_list(self.task_id)
        if self.org_id:
            bind_relation_d["school_ids"] = maybe_list(self.org_id)
        if self.query_id:
            bind_relation_d["query_ids"] = maybe_list(self.query_id)
        self.bind_relation_d = bind_relation_d
        return bind_relation_d

    async def start(self):
        """Yield the single export request for the configured record range."""
        yield JsonRequest(
            config.WOS_EXPORT_FILE_API,
            method='POST',
            data=model.export_search_data_to_txt(
                self.record_id, mark_from=self.mark_from, mark_to=self.mark_to),
            callback=self.download_parse,
        )

    def download_parse(self, response: Response, **kwargs: Any) -> Any:
        """Parse the export response and yield the items of each record.

        Records without a ``ut`` value are skipped.
        """
        batch_time = datetime.now()
        for data_dic in parse_full_records(response.body):
            t_id = data_dic.pop('ut', None)
            if not t_id:
                continue

            article_item = WosArticleItem()
            article_item['third_id'] = t_id
            article_item['exported'] = data_dic
            article_item['updated_at'] = batch_time
            yield article_item

            # Citation count.
            if cited_num := tools.str2int(data_dic.get("tc", 0), 0):
                cited_item = WosCitedNumberItem()
                cited_item['third_id'] = t_id
                cited_item['cited'] = cited_num
                cited_item['updated_at'] = batch_time
                yield cited_item

            # Relations are only bound when the feature is switched on.
            if self.bind_relation_enable and self.bind_relation_d:
                relation_item = WosIdRelationItem()
                relation_item['third_id'] = t_id
                relation_item.update(**self.bind_relation_d)
                yield relation_item
|
||||
Loading…
Reference in New Issue