|
|
|
|
@ -0,0 +1,409 @@
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
# @Time : 2025/12/16 15:24
|
|
|
|
|
# @Author : zhaoxiangpeng
|
|
|
|
|
# @File : cookie_manager.py
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
from typing import TYPE_CHECKING, Generator
|
|
|
|
|
import logging
|
|
|
|
|
import time
|
|
|
|
|
import threading
|
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
|
from typing import Optional, Callable
|
|
|
|
|
|
|
|
|
|
import redis
|
|
|
|
|
import requests
|
|
|
|
|
from DrissionPage import Chromium
|
|
|
|
|
|
|
|
|
|
from science_article_wos.utils.xpath_cfg import Settings
|
|
|
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
|
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
|
|
|
from scrapy_drissionpage.response import DrissionResponse
|
|
|
|
|
from DrissionPage._pages.chromium_tab import ChromiumTab
|
|
|
|
|
from DrissionPage._units.listener import DataPacket, Response
|
|
|
|
|
|
|
|
|
|
# Path fragment of the verification endpoint; the browser listener is started
# on it and request URLs are matched against it.
VERIFY_ROUTER = "/api/wosnx/core/verify"

# Project settings object; the element locators used by DPOperations are read from it.
settings = Settings()

# NOTE: this configures the root logger at import time, for every importer of this module.
logging.basicConfig(level=logging.DEBUG)

logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_self_ip():
    """Return this machine's public IP address as reported by httpbin.

    The lookup is best-effort: any failure (network error, non-200 status,
    undecodable JSON, missing ``origin`` key) is logged and the placeholder
    ``"unknown"`` is returned instead of raising.

    Returns:
        str: The ``origin`` field of the response, or ``"unknown"`` on failure.
    """
    try:
        resp = requests.get("https://www.httpbin.org/ip", timeout=10)
        # Explicit check rather than ``assert``: assertions are stripped under
        # ``python -O`` and carry no message for the log line below.
        if resp.status_code != 200:
            raise RuntimeError(f"unexpected status code: {resp.status_code}")
        return resp.json()['origin']
    except Exception as e:
        logger.error(f"获取IP失败: {str(e)}")
        return "unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def intercept(self, listen, operation, callback, tab=None):
    """Start listening, run an operation, then feed captured packets to a callback.

    Args:
        self: Unused by the body; kept because it is part of the signature.
        listen: Zero-argument callable invoked first (expected to start the listener).
        operation: Zero-argument callable invoked after ``listen``.
        callback: Called with each packet for which ``intercept_verify`` is truthy.
        tab: Object whose ``listen.steps(count=3)`` yields the packets.
            NOTE(review): defaults to ``None`` but is dereferenced unconditionally.

    Returns:
        The callback's result when it is a generator; otherwise ``None``.
    """
    listen()
    operation()
    for packet in tab.listen.steps(count=3):
        if intercept_verify(packet):
            outcome = callback(packet)
            if isinstance(outcome, Generator):
                return outcome
            if isinstance(outcome, bool):
                # A boolean result ends the scan; nothing is returned to the caller.
                return None
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def intercept_verify(packet: DataPacket):
    """Tell whether a captured packet is free of the passive-verification marker.

    Args:
        packet: Captured packet; only ``packet.response.body`` is read.

    Returns:
        bool: ``False`` when the body is ``bytes`` containing
        ``"Server.passiveVerificationRequired"`` (quotes included), else ``True``.
    """
    body = packet.response.body
    needs_verification = (
        isinstance(body, bytes)
        and b'"Server.passiveVerificationRequired"' in body
    )
    return not needs_verification
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DPOperations:
    """Scripted browser interactions with the Web of Science search page.

    Each step is a static method that receives the tab to act on.
    ``first_ops`` and ``bypass_ops`` chain those steps on the tab given at
    construction time.
    """

    def __init__(self, browser, tab):
        """Remember the browser and the tab that the composite operations drive."""
        self.browser = browser
        self.tab = tab

    @staticmethod
    def _locator(xpath):
        """Return the ``xpath:``-prefixed locator string for a raw XPath expression."""
        return f'xpath:{xpath}'

    @staticmethod
    def operate_cookie_first(tab):
        """Click the accept button of the cookie-preferences banner when the banner exists."""
        logger.debug('Operating cookie first...')
        if tab.ele('xpath://*[@id="onetrust-banner-sdk"]'):
            tab.ele('xpath://*[@id="onetrust-accept-btn-handler"]').click()

    @staticmethod
    def change_db(tab):
        """Open the database selector and pick "Web of Science Core Collection"."""
        logger.info('Changing database...')
        tab.ele('xpath://*[@id="snSelectDb"]/button').click()
        option_xpath = (
            '//*[@id="global-select"]/div/div[@aria-label="Select database"]'
            '/div[@title="Web of Science Core Collection"]'
        )
        tab.ele(DPOperations._locator(option_xpath)).click()

    @staticmethod
    def input_ops(tab, content=None, clear_input: bool = True):
        """Type a query into the search input.

        Args:
            tab: Tab to act on.
            content: Query text; a built-in sample query is used when ``None``.
            clear_input: Clear the input element before typing.
        """
        logger.debug('Input operation...')
        query_box = tab.ele(DPOperations._locator(settings.QUERY_INPUT_ELE))
        if clear_input:
            query_box.clear()
        if content is None:
            content = "(OG=(Shanghai Jiao Tong University)) AND PY=(2025)"
        query_box.input(content)

    @staticmethod
    def search_ops(tab):
        """Click the search button."""
        logger.debug('Search operation...')
        tab.ele(DPOperations._locator(settings.SEARCH_BUTTON_ELE)).click()

    @staticmethod
    def export_ops(tab, start: int = 1, end: int = 50):
        """Walk through the export dialog and download records ``start``..``end``.

        Args:
            tab: Tab to act on.
            start: Value typed into the range-start field.
            end: Value typed into the range-end field.
        """
        loc = DPOperations._locator

        tab.ele(loc(settings.EXPORT_BUTTON_ELE)).click()  # open the export menu
        tab.ele(loc(settings.TABWIN_BUTTON_ELE)).click()  # choose the tab-delimited format

        # In the dialog: switch the record content to "full record and cited references".
        tab.ele(loc(settings.RECORD_TYPE_SELECT_ELE)).click()
        tab.ele(loc(settings.FULL_RECORD_REFERENCE_ELE)).click()

        # Switch to range mode and fill in the record range.
        tab.ele(loc(settings.RECORD_RANGE_ELE)).click()
        tab.ele(loc(settings.RECORD_EXPORT_START_ELE)).input(start, clear=True)
        tab.ele(loc(settings.RECORD_EXPORT_END_ELE)).input(end, clear=True)

        # Trigger the download; the file is renamed to savedrecs.txt.
        tab.ele(loc(settings.EXPORT_FILE_ELE)).click.to_download(
            rename='savedrecs.txt'
        )

    def first_ops(self):
        """Run the first-visit sequence: cookie banner, database switch, query input, search."""
        tab = self.tab
        self.operate_cookie_first(tab)
        self.change_db(tab)
        self.input_ops(tab)
        self.search_ops(tab)

    def bypass_ops(self):
        """Run one export with the default range on the bound tab."""
        self.export_ops(self.tab)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class CookieManager:
    """Obtain a Web of Science session id through a real browser and keep it in Redis.

    The manager drives a Chromium tab, reads ``wos_sid`` from the page's local
    storage, stores it as a Redis hash under ``cookie_pool:wos_sid:<sid>`` and
    then polls that hash's ``status`` field to decide whether to re-verify
    (``"validate"``) or refresh the page (``"expired"``).
    """

    def __init__(
            self,
            redis_uri: str = "redis://localhost:6379/0",
            cookie_lifetime: int = 60 * 60 * 4,  # cookie lifetime in seconds
            check_interval: int = 60,
            keep_browser_alive: bool = True,
    ):
        """Create the Redis client and initialise state; no browser is started here.

        Args:
            redis_uri: Redis connection URI.
            cookie_lifetime: Seconds added to "now" to compute the stored expiry time.
            check_interval: Seconds between two status checks in ``monitor_loop``.
            keep_browser_alive: When false, ``start_monitor`` closes the browser
                after the first cookie acquisition.
        """
        self.url = "https://webofscience.clarivate.cn/wos/woscc/advanced-search"
        self.cookie_lifetime = cookie_lifetime

        # Redis connection
        self.redis_key_prefix = 'cookie_pool:wos_sid'
        self.check_interval = check_interval
        self.redis_client = redis.Redis.from_url(
            redis_uri,
            decode_responses=True
        )
        # The URI may embed a password, so only a redacted form is logged.
        logger.info(f"Redis连接成功: {self._redact_uri(redis_uri)}")
        self.dp_ins: Optional[DPOperations] = None
        # NOTE(review): nothing in this class ever sets ``first`` to False.
        self.first = True

        # Browser state
        self.browser = None
        self.tab = None
        self.keep_browser_alive = keep_browser_alive
        self.current_sid = None
        self.bypass_ok_tag = False

        # Control flags
        self._running = False
        self._monitor_thread = None

    @staticmethod
    def _redact_uri(uri: str) -> str:
        """Return *uri* with the password part of its user-info replaced by ``***``."""
        from urllib.parse import urlsplit, urlunsplit

        parts = urlsplit(uri)
        userinfo, at, hostport = parts.netloc.rpartition('@')
        if not at or ':' not in userinfo:
            return uri
        user = userinfo.split(':', 1)[0]
        return urlunsplit(parts._replace(netloc=f"{user}:***@{hostport}"))

    def start_browser(self):
        """Start the browser if none is running and bind the operations helper to its latest tab."""
        if self.browser is None:
            logger.info("启动浏览器...")
            self.browser = Chromium()
            self.tab = self.browser.latest_tab
            logger.info("浏览器启动成功")
            self.dp_ins = DPOperations(self.browser, self.tab)

    def close_browser(self):
        """Quit the browser, if any, and drop the browser and tab references."""
        if self.browser:
            logger.info("关闭浏览器...")
            self.browser.quit()
            self.browser = None
            self.tab = None
            logger.info("浏览器已关闭")

    def refresh_page(self):
        """Refresh the current tab; failures are logged, never raised."""
        try:
            logger.info("正在刷新页面")
            if self.tab:
                self.tab.refresh()
        except Exception as e:
            # The original logged the "refreshing" text here, which hid the failure.
            logger.error(f"刷新页面失败: {str(e)}")

    def intercept_verify(self, op_func: Callable[[], None]):
        """Run *op_func* while listening for POSTs to the verification endpoint.

        Up to three captured packets are inspected (60 s timeout passed to the
        listener). The method returns as soon as ``verify_hook`` accepts one.
        If packets were seen but none was accepted a warning is logged; if no
        packet was seen at all the current sid is written to Redis.

        Args:
            op_func: Zero-argument callable performing the page operation.
        """
        logger.debug("监听 %s", VERIFY_ROUTER)
        # NOTE(review): ``self.tab`` is None after ``close_browser`` (e.g. when
        # ``keep_browser_alive`` is false), in which case this line raises.
        self.tab.listen.start(VERIFY_ROUTER, method="POST")
        op_func()

        verify_count = 0
        for packet in self.tab.listen.steps(count=3, timeout=60):
            verify_count += 1
            if self.verify_hook(packet):
                # A successful verification returns here, so the branches below are skipped.
                # NOTE(review): this path does not write the sid back to Redis — confirm intended.
                return

        if verify_count:
            logger.warning("获取失败")
        else:
            logger.info("没有触发验证, cookie有效")
            self.sid2redis()

    @staticmethod
    def get_wos_sid_from_localstorage(tab):
        """Read ``wos_sid`` from the tab's local storage with surrounding quotes removed.

        Returns:
            The sid string, or ``None`` when local storage has no such entry
            (the original raised ``AttributeError`` in that case).
        """
        raw = tab.local_storage('wos_sid')
        if raw is None:
            return None
        return raw.strip('"')

    def get_cookie_from_browser(self):
        """Open the search page, run the first-visit operations and store the sid.

        All exceptions are logged (with traceback) and swallowed; callers detect
        failure by checking ``current_sid``.
        """
        try:
            if self.tab is None:
                self.start_browser()

            if self.first:
                logger.info(f"第一次访问页面: {self.url}")
                self.tab.get(self.url)
                time.sleep(3)  # give the page time to load

            self.intercept_verify(op_func=self.dp_ins.first_ops)
            time.sleep(2)
            self.sid2redis()
        except Exception as e:
            logger.exception(e)

    def save_cookie_to_redis(self, wos_sid: str):
        """Write the metadata hash for *wos_sid* to Redis; failures are logged only.

        The hash holds the current public IP, status ``normal``, generation and
        expiry timestamps (ISO format) and a ``used_times`` counter reset to 0.
        """
        try:
            generated_at = datetime.now()
            expires_at = generated_at + timedelta(seconds=self.cookie_lifetime)
            cookie_data = {
                'ip': get_self_ip(),
                'status': 'normal',
                'generated_time': generated_at.isoformat(),
                'expired_time': expires_at.isoformat(),
                'used_times': 0
            }
            self.redis_client.hset(
                name=f'{self.redis_key_prefix}:{wos_sid}',
                mapping=cookie_data
            )
            logger.info(f"Cookie已保存到Redis: {self.redis_key_prefix}:{wos_sid}")
        except Exception as e:
            logger.error(f"保存cookie到Redis失败: {str(e)}")

    def sid2redis(self):
        """Read the sid from the browser and, if present, remember and store it in Redis."""
        wos_sid = self.get_wos_sid_from_localstorage(self.tab)
        if wos_sid:
            logger.info("保存 %s 到redis...", wos_sid)
            self.current_sid = wos_sid
            self.save_cookie_to_redis(wos_sid)

    def verify_hook(self, packet: DataPacket):
        """Decide from a captured packet whether verification passed.

        Args:
            packet: Captured packet; its request URL, request post data and
                response body are read.

        Returns:
            bool: ``True`` when the request did not target the verification
            endpoint, or when the response body carries the ``verified`` tag;
            ``False`` otherwise.

        Raises:
            TypeError: The response body is not bytes, str, dict or a non-empty list.
        """
        verified_tag = 'verified'
        request_url = packet.request.url

        if VERIFY_ROUTER not in request_url:
            logger.info("无需验证")
            return True

        logger.debug(f"正在验证: {request_url}\n"
                     f"请求body: {packet.request.postData}")
        response_body = packet.response.body
        if isinstance(response_body, bytes):
            verify_success = verified_tag.encode() in response_body
        elif isinstance(response_body, str):
            verify_success = verified_tag in response_body
        elif isinstance(response_body, dict):
            verify_success = response_body.get('key') == verified_tag
        elif isinstance(response_body, list) and len(response_body) > 0:
            verify_success = response_body[0].get('key') == verified_tag
        else:
            raise TypeError("未知的response_body类型")

        if verify_success:
            logger.info(f"验证成功: {request_url}")
        return verify_success

    def check_cookie_status(self, sid: Optional[str] = None, default_status: str = "expired"):
        """Return the ``status`` field stored in Redis for a sid.

        Args:
            sid: Session id to look up; defaults to ``current_sid``.
            default_status: Returned when there is no sid to look up or when
                Redis holds no status for it.

        Returns:
            The stored status string, or *default_status*.
        """
        if sid is None:
            sid = self.current_sid
        if not sid:
            return default_status

        status = self.redis_client.hget(name=f'{self.redis_key_prefix}:{sid}', key='status')
        # A missing hash/field yields None; report it as the default instead of
        # letting the monitor treat it as a healthy cookie.
        return default_status if status is None else status

    def monitor_loop(self):
        """Poll the cookie status until ``_running`` is cleared.

        ``"validate"`` triggers one export through ``intercept_verify``;
        ``"expired"`` triggers a page refresh through ``intercept_verify``;
        any other status is only logged. Errors inside one iteration are
        logged and the loop continues after the usual interval.
        """
        logger.info(f"开始监控cookie,检查间隔: {self.check_interval}秒")
        while self._running:
            try:
                status = self.check_cookie_status()
                if status == "validate":
                    logger.warning("cookie使用次数超限/需要验证,准备进行验证。。。")
                    # One export is used to pass the verification.
                    self.intercept_verify(op_func=self.dp_ins.bypass_ops)
                elif status == "expired":
                    logger.warning("cookie已过期,准备重新获取。。。")
                    self.intercept_verify(op_func=self.refresh_page)
                else:
                    logger.info(f"Cookie状态正常: {status}")
            except Exception as e:
                logger.exception(e)
            # Sleep outside the try so that a failing iteration cannot turn the
            # loop into a busy spin.
            time.sleep(self.check_interval)

    def start_monitor(self):
        """Acquire the first cookie and then run the monitor loop in the calling thread.

        Returns immediately when monitoring is already running or when the
        first acquisition fails. Otherwise this call blocks until the loop ends.
        """
        if self._running:
            logger.warning("监控已在运行中")
            return

        if self.browser is None:
            self.start_browser()

        logger.info("首次获取cookie...")
        self.get_cookie_from_browser()
        if not self.current_sid:
            logger.error("首次获取cookie失败")
            if not self.keep_browser_alive:
                self.close_browser()
            return
        logger.info("首次获取cookie成功")

        # Close the browser when it is not meant to be kept alive.
        if not self.keep_browser_alive:
            self.close_browser()

        self._running = True
        # The loop below blocks, so the start message has to be logged first.
        logger.info("监控已启动")
        self.monitor_loop()

    def stop_monitor(self):
        """Clear the running flag, join the monitor thread if one exists and close the browser."""
        if not self._running:
            logger.warning("监控未在运行")
            return

        logger.info("正在停止监控...")
        self._running = False

        if self._monitor_thread:
            self._monitor_thread.join(timeout=5)

        self.close_browser()
        logger.info("监控已停止")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the cookie manager in the foreground until it ends or Ctrl+C is pressed.

    The Redis URI is taken from the ``WOS_COOKIE_REDIS_URI`` environment
    variable when set; otherwise the previous built-in value is used.
    """
    import os  # local import: only this entry point needs it

    # TODO(security): the fallback embeds a password in source control; rotate
    # it and rely on the environment variable instead.
    redis_uri = os.environ.get(
        "WOS_COOKIE_REDIS_URI",
        "redis://:kcidea1509@192.168.1.211:6379/10",
    )
    manager = CookieManager(redis_uri=redis_uri, keep_browser_alive=True)
    try:
        # start_monitor() runs the monitor loop in this thread and blocks, so
        # the hint has to be logged before the call rather than after it.
        logger.info("Cookie管理器正在运行,按Ctrl+C停止...")
        manager.start_monitor()
    except KeyboardInterrupt:
        logger.info("收到停止信号")
    finally:
        manager.stop_monitor()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: start the manager only when this file is executed directly.
if __name__ == "__main__":
    main()
|