Compare commits

No commits in common. 'a95f242bd5eb8849613f849ffb7512bfe2cb7fd6' and '9b9b14a3839b3acea1fc45108d93b2bdb10c5801' have entirely different histories.

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="pydevenv" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

@@ -1,54 +0,0 @@
# Use the official Python image
FROM python:3.11-slim
# Set the working directory
WORKDIR /app
# Install system dependencies (including browser-related libraries)
RUN apt-get update && apt-get install -y \
wget \
curl \
gnupg \
ca-certificates \
fonts-liberation \
libasound2 \
libatk-bridge2.0-0 \
libatk1.0-0 \
libatspi2.0-0 \
libcups2 \
libdbus-1-3 \
libdrm2 \
libgbm1 \
libgtk-3-0 \
libnspr4 \
libnss3 \
libxcomposite1 \
libxdamage1 \
libxfixes3 \
libxrandr2 \
xdg-utils \
--no-install-recommends \
&& rm -rf /var/lib/apt/lists/*
# Install the Chrome browser (if needed)
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
&& apt-get update \
&& apt-get install -y google-chrome-stable \
--no-install-recommends \
&& rm -rf /var/lib/apt/lists/*
# Copy the dependency file
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy the application code
COPY . .
# Set environment variables
ENV PYTHONUNBUFFERED=1
# Run the application
CMD ["python", "app.py"]

@@ -1,10 +0,0 @@
requests~=2.32.4
scrapy~=2.13.3
pymongo~=4.13.0
itemadapter~=0.11.0
happybase~=1.2.0
fastapi~=0.116.1
redis~=6.2.0
parsel~=1.10.0
sympy~=1.14.0
pydantic~=2.0.3

@@ -0,0 +1,12 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_add.scripts.get_db_task import TaskManager
tm = TaskManager()
process = CrawlerProcess(get_project_settings())
task = tm.get_task_from_mysql()
process.crawl('wos_latest_increment', task_obj=task)
process.start()

@@ -1,296 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2025/11/24 09:25
# @Author : zhaoxiangpeng
# @File : wos_search_export.py
import math
import json
import logging
from typing import Any
from datetime import datetime
import redis
from DrissionPage import Chromium
from DrissionPage import ChromiumPage, ChromiumOptions
from DrissionPage._pages.chromium_tab import ChromiumTab
from DrissionPage._units.listener import DataPacket, Response
from DrissionPage.errors import ElementNotFoundError
from science_article_add.utils import tools
from science_article_add.scripts.wos_parse_data import parse_full_records_txt
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
LINK = "https://webofscience.clarivate.cn/wos/woscc/advanced-search"
BATCH_DOWNLOAD_LIMIT = 500
class Settings:
env = "dev"
SEARCH_ROUTE = '/api/wosnx/core/runQuerySearch'
EXPORT_ROUTE = '/api/wosnx/indic/export/saveToFile'
DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science Core Collection"]'
QUERY_INPUT_ELE = '//*[@id="advancedSearchInputArea"]'
SEARCH_BUTTON_ELE = '//button[@data-ta="run-search"]/span[@class="mat-mdc-button-touch-target"]'
EXPORT_BUTTON_ELE = '//*[@id="export-trigger-btn"]'
TABWIN_BUTTON_ELE = '//*[@id="exportToTabWinButton"]' # tab-delimited file button
RECORD_TYPE_SELECT_ELE = '//div[@class="ng-star-inserted"]/wos-select/button[@aria-haspopup="listbox"]' # record-content selector
FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record"]' # Full Record
FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record and Cited References"]' # Full Record and Cited References
RECORD_RANGE_ELE = '//*[@id="radio3-input"]' # record range radio
RECORD_EXPORT_START_ELE = '//input[@name="markFrom"]'
RECORD_EXPORT_END_ELE = '//input[@name="markTo"]'
EXPORT_FILE_ELE = '//*[@id="exportButton"]'
INPUT_CONTENT = '(OG=(Anhui University of Science & Technology)) AND PY=(2025)'
class ProSettings(Settings):
DB_CHANGE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science 核心合集"]'
EXPORT_BUTTON_ELE = '//button[@id="export-trigger-btn"]'
FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="完整记录"]' # Full Record
FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="全记录与引用的参考文献"]' # Full Record and Cited References
settings = Settings()
class WosSearchExport:
_records_found = 0
inited: bool = False
is_running = False
def __init__(self, query_content: Any, options=None):
self._records_found = 0
self._query_id = None
self.query_content = query_content
self.options = options
@classmethod
def create_instance(cls, config: dict):
return cls(
query_content=config.get("query_content"),
options=config.get('options')
)
def set_records_found(self, val):
self._records_found = val
def get_records_found(self) -> int:
return self._records_found
def set_query_id(self, query_id):
self._query_id = query_id
def get_query_id(self):
return self._query_id
def _initialize(self):
self.browser = Chromium(self.options)
self.tab = self.browser.latest_tab
# these only need to run once
self.open_url(LINK)
# handle the cookie consent preference
self.operate_cookie_first()
self.change_db()
self.inited = True
def open_url(self, url):
logger.debug('Opening url: %s' % url)
self.tab.get(url)
def operate_cookie_first(self):
# handle the cookie-consent banner
logger.debug('Operating cookie first...')
ck_m_div = self.tab.ele('xpath://*[@id="onetrust-banner-sdk"]')
if ck_m_div:
ele = self.tab.ele('xpath://*[@id="onetrust-accept-btn-handler"]')
ele.click()
def change_db(self):
logger.info('Changing database...')
default_db_ele = self.tab.ele('xpath://*[@id="snSelectDb"]/button')
c1 = default_db_ele.raw_text
default_db_ele.click()
self.tab.ele(
'xpath:%(xpath)s' % {"xpath": settings.DB_CHANGE_ELE}).click()
def input_query(self, content: str, clear_input: bool = True, tab=None):
tab = tab or self.tab
input_area_ele = tab.ele('xpath:%(xpath)s' % {"xpath": settings.QUERY_INPUT_ELE})
if clear_input:
input_area_ele.clear() # clear the input box
input_area_ele.input(content) # type the search query
def listen_func():
tab.listen.start(settings.SEARCH_ROUTE, method="POST")
def operation_func():
search_button_ele = tab.ele('xpath:%(xpath)s' % {"xpath": settings.SEARCH_BUTTON_ELE})
search_button_ele.click()
def capture_packet(packet: DataPacket):
search_url = tab.url
record_id, records_found = self.get_record_info(packet.response.body)
self.set_records_found(records_found)
self.set_query_id(record_id)
if not self.get_query_id():
logger.warning('No record found: %s' % packet.response.body)
if records_found == 0:
logger.warning('Query "%s" found %s records' % (self.query_content, records_found))
return
else:
logger.info('Query "%s" found %s records' % (self.query_content, records_found))
return True
self.intercept(listen=listen_func, operation=operation_func, callback=capture_packet, tab=tab)
def download_records(self):
for b in self.distribute_page():
query_id, batch_id, mark_start, mark_end = b
self.rpa_download(mark_start, mark_end, batch=batch_id, tab=self.tab)
def distribute_page(self):
# compute batch page ranges
logger.info("prepare downloading...")
records_found = self.get_records_found()
query_id = self.get_query_id()
mark_start = 1
mark_end = 0
batch_id = 0
for i in range(math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)):
mark_end += BATCH_DOWNLOAD_LIMIT
if mark_end > records_found:
mark_end = records_found
batch_id += 1
yield query_id, batch_id, mark_start, mark_end
mark_start += BATCH_DOWNLOAD_LIMIT
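# Illustrative example of the batching above: with records_found = 1230 and
# BATCH_DOWNLOAD_LIMIT = 500, distribute_page() yields
# (query_id, 1, 1, 500), (query_id, 2, 501, 1000), (query_id, 3, 1001, 1230).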
def clear_query(self):
pass
def reflush_query(self):
pass
def reflush_page(self):
pass
def rpa_download(self, start: int = 1, end: int = 500, batch: str | int = None, tab=None):
"""
Intercept the export API before clicking download.
"""
try:
logger.debug("download starting...")
tab = tab or self.tab
tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_BUTTON_ELE}).click() # click Export
tab.ele('xpath:%(xpath)s' % {"xpath": settings.TABWIN_BUTTON_ELE}).click() # choose tab-delimited file
# wait for the dialog
# switch the export format to Full Record and Cited References
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_TYPE_SELECT_ELE}).click()
tab.ele('xpath:%(xpath)s' % {"xpath": settings.FULL_RECORD_REFERENCE_ELE}).click()
# enter the record range
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_RANGE_ELE}).click() # switch to range mode
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_START_ELE}).input(start, clear=True)
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_END_ELE}).input(end, clear=True)
except ElementNotFoundError:
self.reflush_page()
def listen_func():
tab.listen.start(settings.EXPORT_ROUTE, method="POST")
def operation_func():
# tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_FILE_ELE}).click() # click the export button
tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_FILE_ELE}).click.to_download(
save_path=DOWNLOAD_PATH,
rename='%s.txt' % batch
)
def capture_packet(packet: DataPacket):
g = self._parse_download(packet.response)
for i in g:
print(i)
return True
self.intercept(listen=listen_func, operation=operation_func, callback=capture_packet, tab=tab)
def intercept(self, listen, operation, callback, tab=None):
listen()
operation()
for packet in tab.listen.steps(count=3):
print(packet.response.body)
if not self.intercept_verify(packet):
continue
r = callback(packet)
if r:
break
return
@staticmethod
def intercept_verify(packet: DataPacket):
content = packet.response.body
if isinstance(content, bytes) and content.find(b'"Server.passiveVerificationRequired"') != -1:
return False
else:
return True
def _parse_download(self, response: Response):
batch_time = datetime.now()
item_g = parse_full_records_txt(response.body.encode())
parse_count = 0
for data_dic in item_g:
t_id = data_dic.pop('ut', None)
if t_id:
parse_count += 1
yield dict(third_id=t_id, exported=data_dic, updated_at=batch_time)
# parse the citation count
if cited_num := tools.str2int(data_dic.get("tc", 0), 0):
yield dict(third_id=t_id, cited=cited_num, updated_at=batch_time)
@staticmethod
def get_record_info(body: bytes):
resp_texts = body.strip().split(b'\n')
query_id = None
records_found = 0
for resp_text in resp_texts:
resp_row_dict: dict = json.loads(resp_text)
if resp_row_dict.get("key") == "searchInfo":
query_id = resp_row_dict.get("payload", {}).get("QueryID")
records_found = resp_row_dict.get("payload", {}).get("RecordsFound") # number of records found
break # stop once found
return query_id, records_found
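# get_record_info() above expects an NDJSON body; the relevant line, with
# illustrative values, looks roughly like:
# {"key": "searchInfo", "payload": {"QueryID": "02f30273-...", "RecordsFound": 1486}}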
def execute(self):
if not self.inited:
logger.info('Initializing page')
self._initialize()
self.input_query(self.query_content)
self.download_records()
def start(self):
pass
def stop(self):
self.tab.close()
if __name__ == '__main__':
DOWNLOAD_PATH = r'Y:\wos-metadata\wos increment-202512\00'
conf = dict(
query_content="(OG=(Southwest University of Science & Technology - China)) AND PY=(2025)",
download_dir=DOWNLOAD_PATH
)
co = ChromiumOptions() # .headless()
co.set_pref('download.default_directory', conf['download_dir'])
conf['options'] = co
ins = WosSearchExport.create_instance(config=conf)
ins.execute()

@@ -11,36 +11,6 @@ if TYPE_CHECKING:
from pymongo.results import InsertManyResult, BulkWriteResult
def build_update_query(update_data: dict, replace: bool = True) -> dict:
"""
If replace is True, overwrite the fields of the existing document directly.
"""
update_query = {}
if not update_data:
return {}
for key, val in update_data.items():
if replace:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
else:
if isinstance(val, list):
update_query.setdefault(
"$addToSet", {}
).update({
key: {"$each": val}
})
else:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
return update_query
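# A small sketch of build_update_query() behaviour with illustrative inputs:
# build_update_query({"cited": 5}, replace=True)
#   -> {"$set": {"cited": 5}}
# build_update_query({"task_ids": [12], "cited": 5}, replace=False)
#   -> {"$addToSet": {"task_ids": {"$each": [12]}}, "$set": {"cited": 5}}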
def update_document(filter_query: dict = None, update_data: dict = None, replace: bool = True) -> Tuple[dict, dict]:
update_query = {}
if not update_data:

@@ -13,25 +13,6 @@ class ScienceArticleAddItem(scrapy.Item):
updated_at = scrapy.Field()
class AddItemBase(scrapy.Item):
third_id = scrapy.Field()
updated_at = scrapy.Field()
class ArticleItem(AddItemBase):
exported = scrapy.Field()
class IdRelationItem(AddItemBase):
query_ids = scrapy.Field()
school_ids = scrapy.Field()
task_ids = scrapy.Field()
class ArticleCitedItem(AddItemBase):
cited = scrapy.Field()
class WosLiteAddItem(ScienceArticleAddItem):
year = scrapy.Field()
query_ids = scrapy.Field()

@@ -1,97 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2025/10/23 17:22
# @Author : zhaoxiangpeng
# @File : wos.py
from __future__ import annotations
from typing import TYPE_CHECKING
import sys
import logging
import redis
from scrapy.exceptions import CloseSpider
if TYPE_CHECKING:
from scrapy.crawler import Crawler
from scrapy import Request
logger = logging.getLogger(__name__)
class WosLiteApiXkeyDownloaderMiddleware:
async def process_request(self, request, spider):
key_param = {
'X-ApiKey': '941a216f25cbef0f80ee4ba58a08ef1e19dee7a4'
}
if not request.headers:
request.headers = key_param
return request
request.headers.update(key_param)
return request
class WosStarterApiXkeyDownloaderMiddleware:
async def process_request(self, request, spider):
key_param = {
'X-ApiKey': '53b8164e7543ccebe489988287e8b871bc2c0880'
}
request.headers.update(key_param)
# return request
class WosSidParamMiddleware:
def __init__(self, redis_uri: str):
self.redis_cli = redis.from_url(redis_uri)
self.cookiepool_key = 'cookies_pool:wos:sid-sjtu'
@classmethod
def from_crawler(cls, crawler: Crawler, *args, **kwargs):
settings = crawler.settings
return cls(
redis_uri=settings.get("REDIS_URL")
)
def process_request(self, request: Request, spider):
has_wos_sid = hasattr(request, 'wos_sid')
if not has_wos_sid:
sid = self.get_sid_from_redis()
if not sid:
raise CloseSpider(f"没有获取导sid: ")
# 把获取到的wos_sid绑定到request可以在parse方法中获取到wos_sid的值
setattr(request, 'wos_sid', sid)
else:
sid = getattr(request, 'wos_sid')
cookie_1 = {'dotmatics.elementalKey': 'SLsLWlMhrHnTjDerSrlG'}
headers = {
'authority': 'webofscience.clarivate.cn',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'no-cache',
'origin': 'https://webofscience.clarivate.cn',
'pragma': 'no-cache',
# 'referer': 'https://webofscience.clarivate.cn/wos/woscc/advanced-search',
}
request.cookies = cookie_1
if request.url.endswith('runQuerySearch'):
# search requests must carry the SID parameter
request._set_url(request.url + "?SID=%s" % sid)
headers.update(
{'accept': 'application/x-ndjson', 'content-type': 'text/plain;charset=UTF-8'})
else:
headers.update(
{'accept': 'application/json, text/plain, */*', 'content-type': 'application/json',
'x-1p-wos-sid': sid})
for hk, hv in headers.items():
request.headers[hk] = hv
return None
def get_sid_from_redis(self):
sid = self.redis_cli.get(self.cookiepool_key)
if not sid:
return None
logger.warning("没有可用cookie, 退出!!!")
sys.exit()
return sid.decode()

@@ -314,18 +314,5 @@ def get_refine_count(q_id: str, count: int = 5):
return model
def get_record_info(body: bytes, sep: Union[str, bytes] = b'\n'):
resp_texts = body.strip().split(sep)
query_id = None
records_found = 0
for resp_text in resp_texts:
resp_row_dict: dict = json.loads(resp_text)
if resp_row_dict.get("key") == "searchInfo":
query_id = resp_row_dict.get("payload", {}).get("QueryID")
records_found = resp_row_dict.get("payload", {}).get("RecordsFound") # number of records found
break # stop once found
return query_id, records_found
if __name__ == '__main__':
m1 = lite_base_model(WosDB.WOS)

@@ -5,7 +5,7 @@
from __future__ import annotations
import logging
from datetime import datetime
from typing import TYPE_CHECKING, Tuple, Union
from typing import TYPE_CHECKING, Tuple, Generator
from pymongo import MongoClient
from itemadapter import ItemAdapter
@@ -15,8 +15,7 @@ from pymongo.errors import (
)
from science_article_add.db_utils.buffer_component import SimpleBuffer
from science_article_add.db_utils.mongo import MongoDBUtils, update_document,build_update_query
from science_article_add.db_utils.mongo import MongoDBUtils, update_document
if TYPE_CHECKING:
from scrapy.crawler import Crawler
from scrapy.statscollectors import StatsCollector
@@ -52,17 +51,14 @@ class MongoPipeline(MongoDBUtils):
d = adapter.asdict()
try:
insert_result = collection.insert_one(d)
self.stats.inc_value("item2db_inserted/{}".format(item_type))
except DuplicateKeyError as duplicate_error:
if self.insert_failure_update_enable:
write_error = duplicate_error.details
key_pattern = write_error.get('keyPattern')
key_value = write_error.get('keyValue')
logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value)
d.pop("_id", None)
[d.pop(k, None) for k in key_pattern.keys()]
up_result = collection.update_one(filter=key_value, update={"$set": d}, upsert=True)
self.stats.inc_value("item2db_updated/{}".format(item_type))
except Exception:
raise
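# For reference, the duplicate-key fallback above relies on the error details MongoDB
# returns for E11000 errors; on recent server versions duplicate_error.details looks
# roughly like (illustrative values):
# {"code": 11000, "keyPattern": {"third_id": 1}, "keyValue": {"third_id": "WOS:000123456700001"}, ...}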
@@ -75,22 +71,20 @@ class MongoPipeline(MongoDBUtils):
def _get_item_type(item) -> str:
"""获取Item类型"""
if hasattr(item, '__tablename__'):
return item.__class__.__tablename__
return item.item_type
return 'items_null_table'
class MongoPipelineMulti(MongoDBUtils):
def __init__(self, mongo_uri, mongo_db, stats: StatsCollector, buffer_max_size=None):
def __init__(self, mongo_uri, mongo_db, buffer_max_size=None):
super().__init__(mongo_uri, mongo_db)
self.buffer = SimpleBuffer(buffer_max_size=buffer_max_size, flush_interval=10)
self.stats: StatsCollector = stats
@classmethod
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get("MONGO_URI"),
mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
stats=crawler.stats,
buffer_max_size=crawler.settings.get("BUFFER_MAX_SIZE", 100),
)
@@ -133,15 +127,11 @@ class MongoPipelineMulti(MongoDBUtils):
write_errors = bulk_write_e.details.get('writeErrors')
current_time = datetime.now()
up_time_requests = []
errors = self._build__update(write_errors)
collection = self.db.get_collection(item_type)
for write_error in write_errors:
filter_query, update_query = self._pick_filter_update(write_error)
original_doc = write_error.get('op') # the document we tried to insert
task_ids = update_query.pop('task_ids', None)
if task_ids:
task_id_query = {'task_ids': task_ids}
collection.update_one(filter=filter_query, update=build_update_query(task_id_query, replace=False))
up_result = collection.update_one(filter=filter_query, update=build_update_query(update_query, replace=False))
for new_item in errors:
filter_query, update_query = new_item
up_result = collection.update_one(filter=filter_query, update=update_query)
affect_count -= 1
if up_result.matched_count == up_result.modified_count == 1:
@@ -159,29 +149,16 @@ class MongoPipelineMulti(MongoDBUtils):
finally:
# clear the buffer
self.buffer.clear_buffer(item_type)
self.stats.inc_value("item2db_inserted/{}".format(item_type), count=affect_count)
self.stats.inc_value("item2db_updated/{}".format(item_type), count=item_count - affect_count)
logger.info('✅ Stored into %s: %s rows, %s inserted, %s updated' % (
item_type, item_count, affect_count, item_count - affect_count))
def _build__update(self, write_error) -> Union[Tuple[dict, dict], Tuple[None, None]]:
update_one = None, None
if write_error.get('code') == 11000:
update_one = self._pick_filter_update(write_error)
return update_one
@staticmethod
def _pick_filter_update(write_error):
original_doc = write_error.get('op') # the document we tried to insert
key_pattern = write_error.get('keyPattern')
original_doc.pop("_id", None) # drop the _id generated by the failed insert
filter_query = {}
update_query = {key: val for key, val in original_doc.items() if val}
update_query.pop('updated_at', None) # drop the volatile timestamp so it does not affect the update
for key in key_pattern.keys():
filter_query.update({key: update_query.pop(key, None)})
return filter_query, update_query
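# Sketch of _pick_filter_update() on an illustrative E11000 writeError:
# write_error = {"code": 11000, "keyPattern": {"third_id": 1},
#                "op": {"_id": ..., "third_id": "WOS:000123456700001", "cited": 5, "updated_at": ...}}
# -> filter_query = {"third_id": "WOS:000123456700001"}, update_query = {"cited": 5}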
item_type, item_count, affect_count, item_count - affect_count))
def _build__update(self, write_errors) -> Generator[Tuple[dict, dict], Tuple[None, None]]:
for write_error in write_errors:
update_one = None, None
if write_error.get('code') == 11000:
update_one = self._build_dup_error(write_error)
if update_one:
yield update_one
@staticmethod
def _build_dup_error(write_error) -> tuple[None, None] | tuple[dict, dict]:

@@ -65,16 +65,3 @@ class VerifyDataIntegrity:
coll.update_many(filter={"third_id": {"$in": list(failure)}}, update={"$set": {"state": -1}})
else:
self.logger.info("Successfully verified: %s" % "下载完整无异常")
def spider_end(self):
"""
Compose the query and write the result into the database.
"""
dict(
content="",
qeury_id="",
records_found=0,
perfact=1,
state=1,
reason=""
)

@@ -1,11 +1,7 @@
# pipelines.py
import logging
import pymongo
from itemadapter import ItemAdapter
from science_article_add.items.wos import WosArticleItem, WosCitedNumberItem, WosIdRelationItem
from science_article_add.pipelines.verify_data import VerifyDataIntegrity
logger = logging.getLogger(__name__)
from science_article_add.items.wos import WosCitedNumberItem, WosIdRelationItem
class MongoDBPipeline:
@@ -42,12 +38,3 @@ class MongoDBPipeline:
self.db[collection_name].insert_one(dict(adapter))
return item
class WosVerifyDataIntegrity(VerifyDataIntegrity):
def open_spider(self, spider):
spider_batch_ids = spider.get_batch_ids()
for batch in spider_batch_ids:
if batch.get("field") == "UT":
self.batch_ids.add(batch.get("third_id"))

@@ -1,139 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2025/10/31 10:24
# @Author : zhaoxiangpeng
# @File : distribute_task.py
from typing import Any
import pymysql
from science_article_add.utils import tools
SELECT_STRATEGY_SQL = '''SELECT
r.org_id, q.id, q.content, q.param, q.disable_flag, q.state
FROM relation_org_query AS r JOIN task_search_strategy AS q
ON r.query_id = q.id
WHERE
r.org_name="%(org_name)s" AND disable_flag = 0'''
CREATE_RECORD_SQL = '''insert into task_batch_record (batch_date, query_id, task_condition) VALUES ("%(batch_date)s", %(query_id)s, %(task_condition)s)'''
ORG_STRATEGY_SQL = """
SELECT r.%(org_id)s, r.%(org_name)s, r.%(query_id)s, q.%(content)s, q.%(source_type)s
FROM task_search_strategy AS q JOIN relation_org_query AS r ON r.query_id = q.id
WHERE q.id = %(q_id)s
"""
ORG_STRATEGY_FIELDS = ['org_id', 'org_name', 'query_id', 'content', 'source_type']
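# In get_crawler_task() below, the %(...)s placeholders are filled with the field names
# themselves (via dict(zip(..., ...)) over these fields) plus the real q_id, so the
# rendered SQL looks roughly like (1542 is an illustrative query id):
# SELECT r.org_id, r.org_name, r.query_id, q.content, q.source_type
# FROM task_search_strategy AS q JOIN relation_org_query AS r ON r.query_id = q.id
# WHERE q.id = 1542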
class CrawlTaskManager:
def __init__(self):
self.client: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306,
database='science_data_dept', user='science-data-dept',
passwd='datadept1509', )
def execute_sql(self, sql):
cursor = self.client.cursor()
try:
cursor.execute(sql)
results = cursor.fetchall()
return results
except Exception as e:
raise e
finally:
cursor.close()
def find_task_by_school_name(self, school_name, source_type: int = None):
cursor = self.client.cursor()
try:
# query the strategy rows
select_fields = ['org_id', 'id', 'content', 'disable_flag', 'state']
select_sql = 'select %(fields)s from task_search_strategy as q join relation_org_query as r ON q.id = r.query_id where q.source_type = %(source_type)s' % {
'fields': ', '.join(select_fields), 'source_type': source_type}
cursor.execute(
select_sql
)
find_result = cursor.fetchall()
except pymysql.MySQLError as e:
pass
def create_crawler_task(self, query_id: int, condition: Any = None, source_type: int = None):
cursor = self.client.cursor()
try:
insert_sql = CREATE_RECORD_SQL % {
'batch_date': tools.get_today_date(),
'query_id': query_id,
'task_condition': condition
}
cursor.execute(
insert_sql
)
cursor.connection.commit()
return cursor.lastrowid
except pymysql.MySQLError as e:
print(e)
return None
finally:
cursor.close()
def get_crawler_task(self, task_id: int = None, source_type: int = None, state: int = None):
STRATEGY_FIELDS = ['org_id', 'org_name', 'query_id', 'content', 'source_type']
cursor = self.client.cursor()
try:
record_fields = ['id', 'batch_date', 'query_id', 'task_condition', 'is_done']
condition = {}
if state is not None:
condition['is_done'] = state
else:
condition['is_done'] = 0
if task_id:
condition['id'] = task_id
sql = "select %(fields)s from task_batch_record where %(condition)s" % {
'fields': ', '.join(record_fields), 'condition': ' and '.join([f'{k}={v}' for k, v in condition.items()])
}
if source_type:
pass
cursor.execute(sql)
result = cursor.fetchone()
if result is None:
return
task_record_dic = dict(zip(record_fields, result))
fill = dict(zip(STRATEGY_FIELDS, STRATEGY_FIELDS))
fill.update(q_id=task_record_dic.get("query_id"))
cursor.execute(
ORG_STRATEGY_SQL % fill,
)
result = cursor.fetchone()
task_dic = dict(zip(STRATEGY_FIELDS, result))
task_dic.update(task_record_dic)
return task_dic
finally:
cursor.close()
def _build_condition(self, source_type: int = None):
if source_type is None:
source_type = 1
if source_type == 1:
condition = 'AND PY=()'
# def test_create_one():
# manager = CrawlTaskManager()
# manager.create_crawler_task(1542, condition='NULL', source_type=1)
def main():
manager = CrawlTaskManager()
rr = manager.execute_sql('select id from task_search_strategy where disable_flag=0 and source_type=1 and state=0 limit 20')
# rr = manager.execute_sql('select id from task_search_strategy where disable_flag=0 and source_type=1 and id in (1124, 1148, 1159, 1162, 1163, 1164, 1534, 1535)')
query_ids = []
for c in rr:
record_id = manager.create_crawler_task(c[0], condition='"AND PY=(2025-2026)"', source_type=1)
query_ids.append(c[0])
print(record_id)
changed = [str(i) for i in query_ids]
print(changed)
ok = 'update task_search_strategy set state=1 where id in (%s)' % ', '.join(changed)
print(ok)
if __name__ == '__main__':
main()

@@ -22,12 +22,10 @@ class TaskManager:
def get_task_from_mysql(self):
cursor = self.client.cursor()
record_fields = ['id', 'batch_date', 'query_id', 'task_condition', 'is_done']
sql = "select %(fields)s from task_batch_record where is_done=0" % {'fields': ', '.join(record_fields)}
sql = "select %(fields)s from task_batch_record" % {'fields': ', '.join(record_fields)}
try:
cursor.execute(sql)
result = cursor.fetchone()
if result is None:
return
task_record_dic = dict(zip(record_fields, result))
fill = dict(zip(STRATEGY_FIELDS, STRATEGY_FIELDS))
fill.update(q_id=task_record_dic.get("query_id"))
@@ -45,34 +43,6 @@ class TaskManager:
finally:
cursor.close()
def create_task_from_mysql(self, school_name=None, school_id=None):
cursor = self.client.cursor()
sql = """
SELECT
r.org_id,
q.id,
q.content,
q.param,
q.interval_unit,
q.disable_flag,
q.state
FROM
relation_org_query AS r
JOIN task_search_strategy AS q ON r.query_id = q.id
WHERE
r.org_name="%(school_name)s"
AND source_type = 1
AND disable_flag = 0""" % {'school_name': school_name}
try:
cursor.execute(sql)
result = cursor.fetchone()
sql = "insert into %s (batch_date, query_id, task_condition, result_count, is_done, created_time) values ('%s', %s, '%s', %s, %s, CURRENT_TIMESTAMP)" % (
"", batch_date, query_id, task_condition, result_count, is_done
)
except Exception as exc:
pass
if __name__ == '__main__':
tm = TaskManager()

@@ -1,95 +0,0 @@
from typing import Any, List, Union
from datetime import datetime
import scrapy
from scrapy.http import Response
from scrapy.http.request.json_request import JsonRequest
from scrapy.crawler import Crawler
from science_article_add.items.wos import WosArticleItem, WosCitedNumberItem, WosIdRelationItem
from science_article_add.scripts.wos_parse_data import parse_full_records
from science_article_add.models import wos_model as model
from science_article_add.utils import tools
from science_article_add.configs import wos as config
def maybe_list(val: Union[int, List[int]]) -> List[int]:
if isinstance(val, int):
return [val]
return list(val)
class DownloadByQidSpider(scrapy.Spider):
name = "download_by_qid"
custom_settings = dict(
DOWNLOADER_MIDDLEWARES={
"science_article_add.middlewares.wos.WosSidParamMiddleware": 500
},
ITEM_PIPELINES={
"science_article_add.pipelines.mongo.MongoPipeline": 300,
},
LOG_LEVEL="INFO"
)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
return super().from_crawler(crawler, *args, **kwargs)
def __init__(self, record_id: str, mark_from: int = 1, mark_to: int = 500, records_found: int = None, **kwargs):
super().__init__()
self.record_id = record_id
self.records_found = records_found
self.mark_from = mark_from
self.mark_to = mark_to
self.task_id = None
self.org_id = None
self.query_id = None
self.bind_relation_enable = False
self.bind_relation_d = None
if self.bind_relation_enable:
self.build_relation()
def build_relation(self):
bind_relation_d = dict()
if self.task_id: bind_relation_d.setdefault("task_ids", maybe_list(self.task_id))
if self.org_id: bind_relation_d.setdefault("school_ids", maybe_list(self.org_id))
if self.query_id: bind_relation_d.setdefault("query_ids", maybe_list(self.query_id))
self.bind_relation_d = bind_relation_d
return bind_relation_d
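# For example, with task_id=12, org_id=7 and query_id=1542 (illustrative values),
# build_relation() returns {'task_ids': [12], 'school_ids': [7], 'query_ids': [1542]}.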
async def start(self):
query_id = self.record_id
records_found = self.records_found
mark_start = self.mark_from
mark_end = self.mark_to
yield JsonRequest(config.WOS_EXPORT_FILE_API, method='POST',
data=model.export_search_data_to_txt(query_id, mark_from=mark_start,
mark_to=mark_end),
callback=self.download_parse)
def download_parse(self, response: Response, **kwargs: Any) -> Any:
parse_count = 0
batch_time = datetime.now()
records = parse_full_records(response.body)
for data_dic in records:
t_id = data_dic.pop('ut', None)
if t_id:
parse_count += 1
article_item = WosArticleItem()
article_item['third_id'] = t_id
article_item['exported'] = data_dic
article_item['updated_at'] = batch_time
yield article_item
# parse the citation count
if cited_num := tools.str2int(data_dic.get("tc", 0), 0):
cited_item = WosCitedNumberItem()
cited_item['third_id'] = t_id
cited_item['cited'] = cited_num
cited_item['updated_at'] = batch_time
yield cited_item
if self.bind_relation_enable and self.bind_relation_d:
# only bind the relations when relation binding is enabled
relation_item = WosIdRelationItem()
relation_item['third_id'] = t_id
relation_item.update(**self.bind_relation_d)
yield relation_item

@@ -1,149 +0,0 @@
import os
import json
from datetime import datetime
from typing import List, Dict, Union, Any, Self
import scrapy
from scrapy.http.request.json_request import JsonRequest
from scrapy.crawler import Crawler
from science_article_add.items.wos import WosArticleItem, WosCitedNumberItem
from science_article_add.scripts.wos_parse_data import parse_full_records_txt
from science_article_add.models import wos_model as model
from science_article_add.utils import tools
from science_article_add.configs import wos as config
def _parse_download(body: Union[bytes, str]):
"""
Parse the downloaded response body.
"""
batch_time = datetime.now()
if isinstance(body, str):
body = body.encode()
item_g = parse_full_records_txt(body)
parse_count = 0
for data_dic in item_g:
t_id = data_dic.pop('ut', None)
if t_id:
parse_count += 1
article_item = WosArticleItem()
article_item['third_id'] = t_id
article_item['exported'] = data_dic
article_item['updated_at'] = batch_time
yield article_item
# parse the citation count
if cited_num := tools.str2int(data_dic.get("tc", 0), 0):
cited_item = WosCitedNumberItem()
cited_item['third_id'] = t_id
cited_item['cited'] = cited_num
cited_item['updated_at'] = batch_time
yield cited_item
class WosDownloadSpider(scrapy.Spider):
name = "wos_download"
custom_settings = dict(
FILE_STORAGE_DIR=r"Y:\wos-metadata\wos increment-202512\03",
DOWNLOADER_MIDDLEWARES={
"science_article_add.middlewares.wos.WosSidParamMiddleware": 500
},
ITEM_PIPELINES={
"science_article_add.pipelines.mongo.MongoPipeline": 300,
"science_article_add.pipelines.verify_data.VerifyDataIntegrity": 400,
},
LOG_LEVEL="INFO"
)
def __init__(self, task_obj, file_storage_dir: str = None, **kwargs):
scrapy.Spider.__init__(self)
self.file_storage_dir = file_storage_dir
self.id_list: List[Dict[str, str]] = task_obj
self._records_found = 0
@classmethod
def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any) -> Self:
settings = crawler.settings
from pymongo import MongoClient
client = MongoClient(settings.get("MONGO_URI"))
db = client.get_database(settings.get("MONGO_DATABASE"))
collection = db.get_collection("todo_ids_wos")
def f():
cursor = collection.find(filter={"state": 0}, projection={"state": 0}).limit(500)
d = [c for c in cursor]
if not d:
cursor = collection.find(filter={"state": 2}, projection={"_id": 0, "state": 0}).limit(500)
d = [c for c in cursor]
else:
_ids = [x.pop("_id", None) for x in d]
collection.update_many(filter={"_id": {"$in": _ids}}, update={"$set": {"state": 2}})
return d
tasks = f()
kwargs.update({"task_obj": tasks})
kwargs['file_storage_dir'] = settings.get("FILE_STORAGE_DIR")
return super().from_crawler(crawler, *args, **kwargs)
def make_query(self) -> str:
third_ids = []
for idT in self.id_list:
third_ids.append('%s=(%s)' % (idT.get('field', 'UT'), idT.get('third_id')))
todo_query = ' OR '.join(third_ids)
return todo_query
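# For example, given id_list = [{'field': 'UT', 'third_id': 'WOS:000123456700001'},
# {'field': 'UT', 'third_id': 'WOS:000123456700002'}] (illustrative IDs),
# make_query() returns 'UT=(WOS:000123456700001) OR UT=(WOS:000123456700002)'.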
def get_batch_ids(self) -> List[Dict[str, str]]:
return self.id_list
async def start(self):
if not os.path.exists(self.file_storage_dir):
os.makedirs(self.file_storage_dir)
qu = self.make_query()
yield JsonRequest(
config.WOS_ADVANCED_SEARCH_API, method='POST', data=model.make_advanced_search_ut(query=qu),
)
def parse(self, response, **kwargs):
meta = response.meta
request = response.request
query_id, records_found = model.get_record_info(response.body)
if (not query_id) or (records_found == 0):
self.logger.warning("""
No records found
Error info: %s
Request info: %s""" % (response.text, request))
return
else:
self.set_records_found(records_found)
mark_start = 1
yield JsonRequest(config.WOS_EXPORT_FILE_API, method='POST',
data=model.export_search_data_to_txt(query_id, mark_from=mark_start,
mark_to=records_found),
meta={'QUERY_ID': query_id, 'QUERY': meta.get('QUERY'),
'FILENAME': meta.get("FILENAME"),
'RECORDS_FOUND': records_found, 'MARK_START': mark_start,
'MARK_END': records_found},
cb_kwargs=dict(filename=meta.get("FILENAME"), query_id=query_id),
callback=self.download_parse)
def download_parse(self, response, query_id: str = None, **kwargs):
filename = query_id or response.meta.get('FILENAME')
file_export_path = os.path.join(self.file_storage_dir, '%s.txt' % filename)
with open(file_export_path, 'wb') as f:
f.write(response.body)
yield from _parse_download(response.body)
def set_records_found(self, val):
self._records_found = val
def get_records_found(self) -> int:
return self._records_found
if __name__ == '__main__':
from scrapy.crawler import CrawlerProcess, Crawler
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
process.crawl(WosDownloadSpider, task_obj=[])
process.start()

@@ -1,250 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2025/11/25 14:44
# @Author : zhaoxiangpeng
# @File : wos_dp_download.py
from __future__ import annotations
import math
from datetime import datetime
from typing import TYPE_CHECKING, Generator
from scrapy_drissionpage.spider import DrissionSpider
from science_article_add.items.wos import WosArticleItem, WosCitedNumberItem
from science_article_add.models.wos_model import get_record_info
from science_article_add.configs.wos_dp import settings as wos_dp_settings
from science_article_add.configs.wos import BATCH_DOWNLOAD_LIMIT
from science_article_add.utils import tools
from science_article_add.scripts.wos_parse_data import parse_full_records_txt
if TYPE_CHECKING:
from DrissionPage import ChromiumPage, ChromiumOptions
from scrapy_drissionpage.response import DrissionResponse
from DrissionPage._pages.chromium_tab import ChromiumTab
from DrissionPage._units.listener import DataPacket, Response
settings = wos_dp_settings
DOWNLOAD_PATH = r'Y:\wos-metadata\wos increment-202512\00'
class DpWosFileSpider(DrissionSpider):
name = "dp_wos_file"
start_urls = ["https://webofscience.clarivate.cn/wos/woscc/advanced-search"]
custom_settings = dict(
# enable the middleware
DOWNLOADER_MIDDLEWARES={
'scrapy_drissionpage.middleware.DrissionPageMiddleware': 543,
},
ITEM_PIPELINES={
"science_article_add.pipelines.mongo.MongoPipeline": 300,
},
EXTENSIONS={},
CONCURRENT_REQUESTS=1,
# DrissionPage configuration
DRISSIONPAGE_HEADLESS=False, # headless mode
DRISSIONPAGE_LOAD_MODE='normal', # page load mode: normal, eager, none
DRISSIONPAGE_DOWNLOAD_PATH='downloads', # download path
DRISSIONPAGE_TIMEOUT=30, # request timeout
DRISSIONPAGE_RETRY_TIMES=3, # retry count
DRISSIONPAGE_RETRY_INTERVAL=2, # retry interval (seconds)
# browser settings
DRISSIONPAGE_BROWSER_PATH=None, # browser path; None uses the default browser
DRISSIONPAGE_INCOGNITO=True, # incognito mode
DRISSIONPAGE_CHROME_OPTIONS=['--disable-gpu'], # Chrome launch options
)
_records_found = 0
_records_id = 0
query_content = "(OG=(Southwest University of Science & Technology - China)) AND PY=(2025)"
async def start(self):
yield self.drission_request(
url=self.start_urls[0],
callback=self.before_search,
page_type='chromium'
)
def before_search(self, response: DrissionResponse, **kwargs):
page: ChromiumPage = response.page # reuse the page object
def operate_cookie_first():
# handle the cookie-consent banner
ck_m_div = page.ele('xpath://*[@id="onetrust-banner-sdk"]')
if ck_m_div:
ele = page.ele('xpath://*[@id="onetrust-accept-btn-handler"]')
ele.click()
operate_cookie_first() # handle the cookie-consent banner
# switch the database type
page.ele('xpath://*[@id="snSelectDb"]/button').click()
page.ele('xpath:%(xpath)s' % {"xpath": settings.DB_CHANGE_ELE}).click()
# start the search flow
input_area_ele = page.ele('xpath:%(xpath)s' % {"xpath": settings.QUERY_INPUT_ELE})
input_area_ele.clear() # clear the input box
input_area_ele.input(self.query_content)
def listen_func():
page.listen.start(settings.SEARCH_ROUTE, method="POST")
def operation_func():
search_button_ele = page.ele('xpath:%(xpath)s' % {"xpath": settings.SEARCH_BUTTON_ELE})
search_button_ele.click()
def capture(packet: DataPacket):
search_url = page.url
record_id, records_found = get_record_info(packet.response.body)
self.set_records_found(records_found)
self.set_records_id(record_id)
if not self.get_records_id():
self.logger.warning('No record found: %s' % packet.response.body)
if records_found == 0:
self.logger.warning('Query "%s" found %s records' % (self.query_content, records_found))
return
else:
self.logger.info('Query "%s" found %s records' % (self.query_content, records_found))
return True
r = self.intercept(listen_func, operation=operation_func, callback=capture, tab=page)
print(r)
yield from self.download_records()
def before_download(self, response: DrissionResponse, **kwargs):
resp_meta = response.meta['wos_download_info']
g = self.rpa_download(
start=resp_meta['mark_start'],
end=resp_meta['mark_end'],
batch=resp_meta['batch_id'],
tab=self.current_tab
)
yield from g
def download_records(self):
for b in self.distribute_page():
query_id, batch_id, mark_start, mark_end = b
yield self.drission_request(
self.current_tab.url,
callback=self.before_download,
meta={'wos_download_info': dict(query_id=query_id, batch_id=batch_id, mark_start=mark_start,
mark_end=mark_end)}
)
# self.rpa_download(mark_start, mark_end, batch=batch_id, tab=self.current_tab)
def distribute_page(self):
# compute batch page ranges
self.logger.info("prepare downloading...")
records_found = self.get_records_found()
query_id = self.get_records_id()
mark_start = 1
mark_end = 0
batch_id = 0
for i in range(math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)):
mark_end += BATCH_DOWNLOAD_LIMIT
if mark_end > records_found:
mark_end = records_found
batch_id += 1
yield query_id, batch_id, mark_start, mark_end
mark_start += BATCH_DOWNLOAD_LIMIT
def rpa_download(self, start: int = 1, end: int = 500, batch: str | int = None, tab=None):
"""
Intercept the export API before clicking download.
"""
self.logger.debug("download starting...")
tab = tab or self.current_tab
tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_BUTTON_ELE}).click() # click Export
tab.ele('xpath:%(xpath)s' % {"xpath": settings.TABWIN_BUTTON_ELE}).click() # choose tab-delimited file
# wait for the dialog
# switch the export format to Full Record and Cited References
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_TYPE_SELECT_ELE}).click()
tab.ele('xpath:%(xpath)s' % {"xpath": settings.FULL_RECORD_REFERENCE_ELE}).click()
# enter the record range
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_RANGE_ELE}).click() # switch to range mode
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_START_ELE}).input(start, clear=True)
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_END_ELE}).input(end, clear=True)
def listen_func():
tab.listen.start(settings.EXPORT_ROUTE, method="POST")
def operation_func():
tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_FILE_ELE}).click.to_download(
save_path=DOWNLOAD_PATH,
rename='%s.txt' % batch
)
def capture_packet(packet: DataPacket):
g = self._parse_download(packet.response)
yield from g
return self.intercept(listen=listen_func, operation=operation_func, callback=capture_packet, tab=tab)
def _parse_download(self, response: Response):
batch_time = datetime.now()
item_g = parse_full_records_txt(response.body.encode())
parse_count = 0
for data_dic in item_g:
t_id = data_dic.pop('ut', None)
if t_id:
parse_count += 1
article_item = WosArticleItem()
article_item['third_id'] = t_id
article_item['exported'] = data_dic
article_item['updated_at'] = batch_time
yield article_item
# parse the citation count
if cited_num := tools.str2int(data_dic.get("tc", 0), 0):
cited_item = WosCitedNumberItem()
cited_item['third_id'] = t_id
cited_item['cited'] = cited_num
cited_item['updated_at'] = batch_time
yield cited_item
def intercept(self, listen, operation, callback, tab=None):
listen()
operation()
for packet in tab.listen.steps(count=3):
if not self.intercept_verify(packet):
continue
r = callback(packet)
if isinstance(r, Generator):
return r
else:
if isinstance(r, bool):
break
return
@staticmethod
def intercept_verify(packet: DataPacket):
content = packet.response.body
if isinstance(content, bytes) and content.find(b'"Server.passiveVerificationRequired"') != -1:
return False
else:
return True
def set_records_found(self, val):
self._records_found = val
def get_records_found(self) -> int:
return self._records_found
def set_records_id(self, val):
self._records_id = val
def get_records_id(self) -> str:
return self._records_id
if __name__ == '__main__':
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
process.crawl(DpWosFileSpider)
process.start()

@@ -28,10 +28,6 @@ class WosLatestIncrementSpider(scrapy.Spider):
"science_article_add.pipelines.mongo.MongoPipelineMulti": 300,
"science_article_add.pipelines.duptodo.DupTodoPipeline": 400,
},
EXTENSIONS={
"science_article_add.extensions.ackextension.ACKExtension": 0,
# "science_article_add.extensions.dingtalk_extension.DingTalkExtension": 0,
},
LOG_LEVEL="INFO"
)
source = "wos"

@@ -1,378 +0,0 @@
import asyncio
import aiohttp
from typing import Dict, List, Any, Optional
from enum import Enum
import logging
from dataclasses import dataclass
import time
logger = logging.getLogger(__name__)
class DingTalkMessageType(Enum):
"""钉钉消息类型枚举"""
TEXT = "text"
LINK = "link"
MARKDOWN = "markdown"
ACTION_CARD = "actionCard"
FEED_CARD = "feedCard"
@dataclass
class DingTalkConfig:
"""钉钉配置数据类"""
webhook: str
secret: Optional[str] = None
at_mobiles: Optional[List[str]] = None
at_user_ids: Optional[List[str]] = None
at_all: bool = False
class DingTalkSender:
"""
DingTalk message sender
Features:
1. Supports multiple message types: text, link, Markdown, ActionCard, FeedCard
2. Supports @-mentioning specific users or @all
3. Supports signature-based security settings
4. Supports async sending and batch sending
5. Built-in retry mechanism and error handling
"""
def __init__(self, config: DingTalkConfig):
"""
Initialize the DingTalk message sender
Args:
config: DingTalk bot configuration
"""
self.config = config
self.session: Optional[aiohttp.ClientSession] = None
self._retry_count = 3
self._retry_delay = 1
async def __aenter__(self):
"""异步上下文管理器入口"""
await self._ensure_session()
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""异步上下文管理器出口"""
await self.close()
async def _ensure_session(self):
"""确保会话存在"""
if self.session is None:
self.session = aiohttp.ClientSession(
timeout=aiohttp.ClientTimeout(total=10)
)
async def close(self):
"""关闭会话"""
if self.session:
await self.session.close()
self.session = None
def _generate_signature(self, timestamp: int) -> str:
"""
Generate the signature
Args:
timestamp: timestamp in milliseconds
Returns:
signature string
"""
if not self.config.secret:
return ""
import hmac
import hashlib
import base64
import urllib.parse
string_to_sign = f"{timestamp}\n{self.config.secret}"
hmac_code = hmac.new(
self.config.secret.encode('utf-8'),
string_to_sign.encode('utf-8'),
digestmod=hashlib.sha256
).digest()
sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
return sign
def _build_webhook_url(self) -> str:
"""
Build the full webhook URL, including the signature
Returns:
the full webhook URL
"""
if not self.config.secret:
return self.config.webhook
timestamp = int(time.time() * 1000)
sign = self._generate_signature(timestamp)
return f"{self.config.webhook}&timestamp={timestamp}&sign={sign}"
def _build_at_info(self) -> Dict[str, Any]:
"""
Build the @-mention info
Returns:
@-mention info dict
"""
at_info = {}
if self.config.at_mobiles:
at_info["atMobiles"] = self.config.at_mobiles
if self.config.at_user_ids:
at_info["atUserIds"] = self.config.at_user_ids
if self.config.at_all:
at_info["isAtAll"] = True
return at_info
async def _send_request(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""
Send the request to DingTalk
Args:
data: request payload
Returns:
response data
Raises:
Exception: raised when sending fails
"""
await self._ensure_session()
webhook_url = self._build_webhook_url()
headers = {
"Content-Type": "application/json",
"User-Agent": "DingTalk-Bot/1.0"
}
last_exception = None
for attempt in range(self._retry_count):
try:
logger.info(f"发送钉钉消息,尝试 {attempt + 1}/{self._retry_count}")
async with self.session.post(
webhook_url,
json=data,
headers=headers
) as response:
result = await response.json()
if response.status == 200 and result.get("errcode") == 0:
logger.info("钉钉消息发送成功")
return result
else:
error_msg = f"钉钉消息发送失败: {result.get('errmsg', 'Unknown error')}"
logger.error(error_msg)
last_exception = Exception(error_msg)
except asyncio.TimeoutError:
error_msg = f"钉钉消息发送超时,尝试 {attempt + 1}/{self._retry_count}"
logger.warning(error_msg)
last_exception = Exception(error_msg)
except Exception as e:
error_msg = f"钉钉消息发送异常: {str(e)},尝试 {attempt + 1}/{self._retry_count}"
logger.error(error_msg)
last_exception = e
# 如果不是最后一次尝试,等待重试
if attempt < self._retry_count - 1:
await asyncio.sleep(self._retry_delay * (attempt + 1))
# 所有重试都失败,抛出异常
raise last_exception or Exception("钉钉消息发送失败")
async def send_text(self, content: str, at_mobiles: Optional[List[str]] = None,
at_user_ids: Optional[List[str]] = None, at_all: Optional[bool] = None) -> Dict[str, Any]:
"""
Send a text message
Args:
content: message content
at_mobiles: list of mobile numbers to @
at_user_ids: list of user IDs to @
at_all: whether to @ everyone
Returns:
send result
"""
at_info = self._build_at_info()
# override the default @ settings
if at_mobiles is not None:
at_info["atMobiles"] = at_mobiles
if at_user_ids is not None:
at_info["atUserIds"] = at_user_ids
if at_all is not None:
at_info["isAtAll"] = at_all
data = {
"msgtype": DingTalkMessageType.TEXT.value,
"text": {
"content": content
},
"at": at_info
}
return await self._send_request(data)
async def send_markdown(self, title: str, text: str, at_mobiles: Optional[List[str]] = None,
at_user_ids: Optional[List[str]] = None, at_all: Optional[bool] = None) -> Dict[str, Any]:
"""
Send a Markdown message
Args:
title: message title
text: message content in Markdown format
at_mobiles: list of mobile numbers to @
at_user_ids: list of user IDs to @
at_all: whether to @ everyone
Returns:
send result
"""
at_info = self._build_at_info()
if at_mobiles is not None:
at_info["atMobiles"] = at_mobiles
if at_user_ids is not None:
at_info["atUserIds"] = at_user_ids
if at_all is not None:
at_info["isAtAll"] = at_all
data = {
"msgtype": DingTalkMessageType.MARKDOWN.value,
"markdown": {
"title": title,
"text": text
},
"at": at_info
}
return await self._send_request(data)
async def send_link(self, title: str, text: str, message_url: str,
pic_url: Optional[str] = None) -> Dict[str, Any]:
"""
Send a link message
Args:
title: message title
text: message content
message_url: URL opened when the message is clicked
pic_url: image URL
Returns:
send result
"""
data = {
"msgtype": DingTalkMessageType.LINK.value,
"link": {
"title": title,
"text": text,
"messageUrl": message_url,
}
}
if pic_url:
data["link"]["picUrl"] = pic_url
return await self._send_request(data)
async def send_action_card(self, title: str, text: str, single_title: str,
single_url: str, btn_orientation: str = "0") -> Dict[str, Any]:
"""
Send a whole-card-jump ActionCard message
Args:
title: message title
text: message content
single_title: title of the single button
single_url: URL the single button opens
btn_orientation: button layout direction, 0 = vertical, 1 = horizontal
Returns:
send result
"""
data = {
"msgtype": DingTalkMessageType.ACTION_CARD.value,
"actionCard": {
"title": title,
"text": text,
"singleTitle": single_title,
"singleURL": single_url,
"btnOrientation": btn_orientation
}
}
return await self._send_request(data)
async def send_feed_card(self, links: List[Dict[str, str]]) -> Dict[str, Any]:
"""
Send a FeedCard message
Args:
links: list of links; each link contains title, messageURL, picURL
Returns:
send result
"""
data = {
"msgtype": DingTalkMessageType.FEED_CARD.value,
"feedCard": {
"links": links
}
}
return await self._send_request(data)
async def send_alert(self, title: str, message: str, level: str = "info",
at_users: bool = False) -> Dict[str, Any]:
"""
Convenience method for sending alert messages
Args:
title: alert title
message: alert content
level: alert level (info, warning, error, critical)
at_users: whether to @ the relevant people
Returns:
send result
"""
level_emojis = {
"info": "",
"warning": "⚠️",
"error": "",
"critical": "🚨"
}
emoji = level_emojis.get(level, "")
markdown_text = f"""
## {emoji} {title}
**Level**: {level.upper()}
**Time**: {time.strftime('%Y-%m-%d %H:%M:%S')}
**Details**:
{message}
""".strip()
at_all = at_users and self.config.at_all
at_mobiles = self.config.at_mobiles if at_users else None
at_user_ids = self.config.at_user_ids if at_users else None
return await self.send_markdown(
title=f"{emoji} {title}",
text=markdown_text,
at_mobiles=at_mobiles,
at_user_ids=at_user_ids,
at_all=at_all
)
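# A minimal usage sketch of DingTalkSender (the webhook token and secret below are placeholders):
#
#     import asyncio
#
#     async def main():
#         config = DingTalkConfig(webhook="https://oapi.dingtalk.com/robot/send?access_token=xxx",
#                                 secret="SEC_xxx")
#         async with DingTalkSender(config) as sender:
#             await sender.send_alert("Crawl finished", "All batches downloaded", level="info")
#
#     asyncio.run(main())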

@@ -1,43 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2025/12/11 13:56
# @Author : zhaoxiangpeng
# @File : crawl_article_by_qid.py
import math
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_add.spiders.download_by_qid import DownloadByQidSpider
BATCH_DOWNLOAD_LIMIT = 500
process = CrawlerProcess(get_project_settings())
RECORDS_FOUND = 1486
wos_download_todo = [
]
def f(record_id: str, records_found: int):
mark_start = 1
mark_end = 0
idx = 0
for i in range(math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)):
idx += 1
mark_end += BATCH_DOWNLOAD_LIMIT
if mark_end > records_found:
mark_end = records_found
yield dict(
record_id=record_id, batch=idx,
mark_from=mark_start, mark_to=mark_end, records_found=records_found
)
mark_start += BATCH_DOWNLOAD_LIMIT
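# For example, with records_found=1486 (as in RECORDS_FOUND above), f() yields three batches:
# (mark_from=1, mark_to=500), (501, 1000) and (1001, 1486).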
init_params = dict(
record_id='02f30273-1342-4d61-9e51-c1ea1f5b2423-0190efdd10',
mark_from=1, mark_to=500, records_found=10641
)
process.crawl(DownloadByQidSpider, **init_params)
process.start()

@@ -1,41 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2025/12/11 17:07
# @Author : zhaoxiangpeng
# @File : crawl_article_by_ut.py
import math
import time
import logging
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_add.spiders.wos_download import WosDownloadSpider
logging.getLogger('pymongo').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
BATCH_DOWNLOAD_LIMIT = 500
@defer.inlineCallbacks
def crawl_sequentially():
settings = get_project_settings()
from pymongo import MongoClient
client = MongoClient(settings.get("MONGO_URI"))
db = client.get_database(settings.get("MONGO_DATABASE"))
collection = db.get_collection("todo_ids_wos")
def f():
count = collection.count_documents(filter={"state": 0})
return count
while count_doc := f():
logger.info('Pending downloads: %d' % count_doc)
yield process.crawl(WosDownloadSpider)
time.sleep(60)
process.stop() # shut down the event loop after all spiders finish
if __name__ == '__main__':
process = CrawlerProcess(get_project_settings())
crawl_sequentially()
process.start() # block until all spiders finish

@@ -1,88 +0,0 @@
import time
from typing import List
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import defer
from scrapy import Spider
import pymysql
from pymysql import cursors
from science_article_add.spiders.wos_latest_increment import WosLatestIncrementSpider
sql = """
SELECT
b.id AS task_id,
r.org_id AS org_id,
r.org_name AS org_name,
q.id AS query_id,
q.content AS content,
b.task_condition AS task_condition,
q.source_type AS source_type,
b.is_done AS is_done
FROM
task_batch_record AS b
JOIN task_search_strategy AS q ON q.id = b.query_id
JOIN relation_org_query AS r ON r.query_id = b.query_id
WHERE
b.is_done = 2
AND q.source_type = 1
LIMIT %(limit)s
"""
sql2 = """
SELECT
b.id AS task_id,
q.id AS query_id,
q.content AS content,
b.task_condition AS task_condition,
q.source_type AS source_type,
b.is_done AS is_done
FROM
task_batch_record AS b
JOIN task_search_strategy AS q ON q.id = b.query_id
WHERE
b.is_done = 0
AND q.source_type = 1
LIMIT %(limit)s
"""
def get_task(limit: int = 1):
client: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306,
database='science_data_dept', user='science-data-dept',
passwd='datadept1509', )
cursor = client.cursor(cursors.DictCursor)
try:
cursor.execute(sql2 % {'limit': limit})
results = cursor.fetchall()
except Exception as e:
raise e
else:
for result in results:
query_id = result['query_id']
cursor.execute('select org_id, org_name from relation_org_query where query_id=%s', (query_id,))
org_results: List[dict] = cursor.fetchall()
result['org_id'] = [org_result['org_id'] for org_result in org_results]
result['org_name'] = [org_result['org_name'] for org_result in org_results]
print(result)
yield result
finally:
cursor.close()
client.close()
@defer.inlineCallbacks
def crawl_sequentially(targets):
for target in targets:
print(f"\n=== 正在启动 Spider参数: {target} ===")
yield process.crawl(WosLatestIncrementSpider, task_obj=target)
print(f"=== Spider 完成: {target} ===\n")
time.sleep(60)
process.stop() # 所有爬虫结束后关闭事件循环
# ====== 主程序部分 ======
if __name__ == '__main__':
process = CrawlerProcess(get_project_settings())
targets = get_task(10)
crawl_sequentially(targets)
process.start() # block until all spiders finish

@@ -21,32 +21,10 @@ class ArticleItem(AddItemBase):
exported = scrapy.Field()
class CnkiArticleItem(ArticleItem):
"""cnki文章item"""
__tablename__ = 'data_cnki_article'
third_id = scrapy.Field()
exported = scrapy.Field()
updated_at = scrapy.Field()
class CnkiIdRelationItem(AddItemBase):
__tablename__ = 'relation_school_cnki'
class IdRelationItem(AddItemBase):
query_ids = scrapy.Field()
school_ids = scrapy.Field()
task_ids = scrapy.Field()
year = scrapy.Field()
class CnkiArticleTodoIdItem(scrapy.Item):
__tablename__ = 'todo_ids_cnki'
third_id = scrapy.Field()
db_code = scrapy.Field()
state = scrapy.Field()
ti = scrapy.Field()
v = scrapy.Field()
class ArticleCitedItem(AddItemBase):

@@ -9,7 +9,6 @@ from datetime import timedelta
class ResourceType(enum.Enum):
"""Resource types"""
总库 = ALL = CROSSDB = "CROSSDB" # defaults to Chinese; Chinese and foreign-language sources are separated
学术期刊 = JOURNAL = "JOURNAL" # academic journals
学位论文 = DISSERTATION = "DISSERTATION" # dissertations
会议 = CONFERENCE = "CONFERENCE" # conferences
@@ -23,7 +22,6 @@ class ResourceType(enum.Enum):
class SourceDatabaseEnum(enum.Enum):
"""Source database IDs"""
CROSSDB = 总库 = "WD0FTY92"
JOURNAL = 学术期刊 = "YSTT4HG0" # academic journals
DISSERTATION = 学位论文 = "LSTPFY1C" # dissertations
CONFERENCE = 会议 = "JUP3MUPD" # conferences
@@ -35,15 +33,6 @@ class SourceDatabaseEnum(enum.Enum):
ACHIEVEMENTS = "BLZOG7CK"
class ProductsEnum(enum.Enum):
pass
class ResourceLanguageEnum(enum.Enum):
中文 = "CHINESE"
外文 = "FOREIGN"
class SearchTypeId(enum.Enum):
"""知网的检索类型"""
ADV = 1
@@ -160,3 +149,4 @@ class UpdatedTimeEnum(enum.Enum):
最近半年 = timedelta(days=180)
最近一年 = timedelta(days=180)
今年迄今 = timedelta(days=180)

@@ -24,13 +24,12 @@ from pymongo.errors import (
DuplicateKeyError,
BulkWriteError
)
from science_article_cnki.items import CnkiArticleTodoIdItem
from science_article_cnki.db_utils.mongo import MongoDBUtils, update_document, build_update_query
if TYPE_CHECKING:
from scrapy.crawler import Crawler
from scrapy.statscollectors import StatsCollector
from pymongo.collection import Collection
mongo_logger = logging.getLogger('pymongo')
mongo_logger.setLevel(logging.WARNING)
@@ -58,11 +57,8 @@ class MongoPipeline(MongoDBUtils):
def process_item(self, item, spider):
# determine the item type
if isinstance(item, CnkiArticleTodoIdItem):
return item
adapter = ItemAdapter(item)
item_type = self._get_item_table(item)
item_type = self._get_item_type(item)
collection = self.db.get_collection(item_type)
d = adapter.asdict()
try:
@@ -75,14 +71,10 @@ class MongoPipeline(MongoDBUtils):
key_value = write_error.get('keyValue')
logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value)
d.pop("_id", None)
updated_at = d.pop('updated_at', None)
[d.pop(k, None) for k in key_pattern.keys()]
update_q = build_update_query(d, replace=self.duplicate_cover_enable)
up_result = collection.update_one(filter=key_value, update=update_q, upsert=True)
if up_result.matched_count == up_result.modified_count == 1:
current_time = datetime.now()
collection.update_one(filter=key_value, update={"$set": {"updated_at": updated_at}})
self.stats.inc_value("item2db_updated/{}".format(item_type))
self.stats.inc_value("item2db_updated/{}".format(item_type))
except Exception:
raise
@@ -92,63 +84,9 @@ class MongoPipeline(MongoDBUtils):
self.client.close()
@staticmethod
def _get_item_table(item) -> str:
def _get_item_type(item) -> str:
"""获取Item类型"""
if hasattr(item, '__tablename__'):
return item.__class__.__tablename__
return 'items_null_table'
class DupTodoPipeline(MongoDBUtils):
def __init__(self, mongo_uri, mongo_db, stats: StatsCollector):
super().__init__(mongo_uri, mongo_db)
self.stats: StatsCollector = stats
@classmethod
def from_crawler(cls, crawler: Crawler):
return cls(
mongo_uri=crawler.settings.get("MONGO_URI"),
mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
stats=crawler.stats
)
def open_spider(self, spider):
self.client = MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def process_item(self, item, spider):
if isinstance(item, CnkiArticleTodoIdItem):
fingerprints = item.get('third_id')
try:
if not self.is_exists(item, filter_key=self._get_dup_key(spider)):
table_name = self._get_item_table(spider)
coll = self.db.get_collection(table_name)
adapter = ItemAdapter(item)
d = adapter.asdict()
insert_result = coll.insert_one(d)
self.stats.inc_value("item2db_inserted/{}".format(table_name), count=1)
except DuplicateKeyError as duplicate_error:
logger.warning(duplicate_error)
except Exception as e:
raise e
return item
def is_exists(self, item, filter_key) -> bool:
fingerprints = item.get('third_id')
collection: Collection = self.db.get_collection(filter_key)
results = collection.find_one(filter={"third_id": fingerprints}, projection={"_id": 0, "third_id": 1})
if results and results.get('third_id') == fingerprints:
self.inc_item_dropped_count("duplicate")
return True
return False
def _get_dup_key(self, spider):
return 'data_%(source_type)s_article' % {"source_type": spider.source}
def _get_item_table(self, spider) -> str:
"""获取Item类型"""
return 'todo_ids_%(source_type)s' % {"source_type": spider.source}
def inc_item_dropped_count(self, reason):
self.stats.inc_value("item_dropped_count")
self.stats.inc_value(f"item_dropped_reasons_count/{reason}")

@ -16,7 +16,7 @@ ADDONS = {}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
@ -39,11 +39,11 @@ COOKIES_ENABLED = True
#}
SEARCH_REQUEST_HEADERS = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'Hm_lvt_dcec09ba2227fd02c55623c1bb82776a=1739256689; UM_distinctid=197b0769b48ea3-0de0b4b2dd761f-26001051-1fa400-197b0769b49cc6; Ecp_ClientId=e250627180800765334; Ecp_ClientIp=111.186.53.36; cnkiUserKey=1b8e7dbe-3c98-864f-2b80-84b544af32af; _c_WBKFRo=UO8UFAxWLjMjlOxhuKvmtkZ4yYaXr8dPZXuhVFea; Ecp_loginuserbk=SJTU; tfstk=g5GqYEZ0ZId4NHSWG0FNzQCb6QNYs5-QjfZ_SV0gloqDDdFa7uoTCSMjSA5ZJuEOhdn6_lmxYPZ0DxMNb0nUXt99nAPZ2q5jhfuO_P0iXEE6kLgxk5FMAHTBOq3vhen9f3NMS4V_773PuGuxk5Q-60hJAqQN2mSLS5mgZz4gS540ItYPZPqliPf0SgYzWuVgSrX0ZT4_uGb0Sc0kzPEuolmgsUPu2PVgjcViG50mS_zQnU-thdfV8NPaxqqPs67Lu-cB9u5Mabzqzugc-1fiaryqZpcfbM2jI2eKGqONwSgEE74qjBx0ex0r_Jh9Csg0ZoPxa-bMXocxSfPYTNAmzSr4KbwXO1mnzVDQUbTH9SP0mANx5w-jzjojkbu1STV4GYyEgWAdmlMS8fzZ6hdrYqDnjASP1GUobXlt3GXanzUzAU8z4y3oBzrYp_6OB8VLzkTblOBTnzUzAU8PBOeu2zrBlr1..; Ecp_session=1; SID_sug=018104; knsLeftGroupSelectItem=; dsorders=CF; dsortypes=cur%20DESC; knsadv-searchtype=%7B%22BLZOG7CK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22MPMFIG1A%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22T2VC03OH%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JQIRZIYA%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22S81HNSV3%22%3A%22gradeSearch%22%2C%22YSTT4HG0%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22ML4DRIDX%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WQ0UVIAA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22VUDIXAIY%22%3A%22gradeSearch%2CmajorSearch%22%2C%22LIQN9Z3G%22%3A%22gradeSearch%22%2C%22NN3FJMUV%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22LSTPFY1C%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HHCPM1F8%22%3A%22gradeSearch%2CmajorSearch%22%2C%22OORPU5FE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WD0FTY92%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22BPBAFJ5S%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22EMRPGLPA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22PWFIRAGL%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22U8J8LYLV%22%3A%22gradeSearch%2CmajorSearch%22%2C%22R79MZMCB%22%3A%22gradeSearch%22%2C%22J708GVCE%22%3A%22gradeSearch%2CmajorSearch%22%2C%228JBZLDJQ%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HR1YT1Z9%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JUP3MUPD%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NLBO1Z6R%22%3A%22gradeSearch%2CmajorSearch%22%2C%22RMJLXHZ3%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%221UR4K4HZ%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NB3BWEHK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22XVLO76FD%22%3A%22gradeSearch%2CmajorSearch%22%7D; Ecp_IpLoginFail=25121149.65.252.186; SID_kns_new=kns018106; SID_restapi=kns018110; KNS2COOKIE=1765437722.656.114388.232155|b25e41a932fd162af3b8c5cff4059fc3; dblang=both; createtime-advInput=2025-12-11%2015%3A22%3A21; searchTimeFlags=1',
'Origin': 'https://kns.cnki.net',
'Referer': 'https://kns.cnki.net/kns8s/AdvSearch?crossids=YSTT4HG0%2CLSTPFY1C%2CJUP3MUPD%2CMPMFIG1A%2CWQ0UVIAA%2CBLZOG7CK%2CPWFIRAGL%2CEMRPGLPA%2CNLBO1Z6R%2CNN3FJMUV',
'User-Agent': USER_AGENT,
}
SEARCH_REQUEST_COOKIES_STR = 'Ecp_notFirstLogin=qkFgu9; Ecp_ClientId=o240823084800102418; Ecp_loginuserbk=SJTU; cnkiUserKey=eef4d3aa-1096-bc9e-dff0-74349179c2cc; Ecp_ClientIp=111.186.52.67; UM_distinctid=19366f14e7a832-0f92ef85a35cb5-26001051-1fa400-19366f14e7c14f2; Hm_lvt_dcec09ba2227fd02c55623c1bb82776a=1734079899; Ecp_session=1; SID_kns_new=kns018104; SID_sug=018104; knsLeftGroupSelectItem=; updatetime-advInput=2024-12-19+17%3A42%3A08; knsadv-searchtype=%7B%22BLZOG7CK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22MPMFIG1A%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22T2VC03OH%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JQIRZIYA%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22S81HNSV3%22%3A%22gradeSearch%22%2C%22YSTT4HG0%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22ML4DRIDX%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WQ0UVIAA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22VUDIXAIY%22%3A%22gradeSearch%2CmajorSearch%22%2C%22NN3FJMUV%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22LSTPFY1C%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HHCPM1F8%22%3A%22gradeSearch%2CmajorSearch%22%2C%22OORPU5FE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WD0FTY92%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22BPBAFJ5S%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22EMRPGLPA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22PWFIRAGL%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22U8J8LYLV%22%3A%22gradeSearch%2CmajorSearch%22%2C%22R79MZMCB%22%3A%22gradeSearch%22%2C%22J708GVCE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22HR1YT1Z9%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JUP3MUPD%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NLBO1Z6R%22%3A%22gradeSearch%2CmajorSearch%22%2C%22RMJLXHZ3%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%221UR4K4HZ%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NB3BWEHK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22XVLO76FD%22%3A%22gradeSearch%2CmajorSearch%22%7D; createtime-advInput=2024-12-20%2014%3A37%3A03; LID=WEEvREcwSlJHSldSdmVpanJGNW9JQS9sbkNrOUFycHJkRzF3eXgyTGlWbz0=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!; Ecp_LoginStuts={"IsAutoLogin":false,"UserName":"SJTU","ShowName":"%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6","UserType":"bk","BUserName":"","BShowName":"","BUserType":"","r":"qkFgu9","Members":[]}; KNS2COOKIE=1734680479.883.14106.830885|b25e41a932fd162af3b8c5cff4059fc3; dblang=both; c_m_LinID=LinID=WEEvREcwSlJHSldSdmVpanJGNW9JQS9sbkNrOUFycHJkRzF3eXgyTGlWbz0=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!&ot=12%2F20%2F2024%2016%3A01%3A27; c_m_expire=2024-12-20%2016%3A01%3A27; tfstk=gnXZLQYMKRewdgBaoHvqL9aIUYp9sd45ntTXmijDfFYG5iTcTZbBCGsccx-D-NdjCxY18pQRVAC_6ITq0dBC1xT_WKScPKz7P8w5XGpynzaShW0gBdKqnncilpDHmK-i1ZwdGGpvnyaM9UCdXabz7TCMnkJH4ncDnxYMtk-6qKDMiAcn-eKDnKADjDYH4nmioAYgYMYpDKxcoCcmtGjmL3Og25LCsWPKUCYljekmU0KHslSnGAMsnhA9rBxrnH6ebC8ljOHkrv-hd9RWOmayKgCCSHJz3vvwaOBytO4K3BQ2-IWMh0kcYNshNIWgD5IF3FRlIBoS3dIpmZAV9zkWbd1eaO5TD2jGPF5kBiiz5MRPTQKHtmlMC_s5HQXgQ4LBwn7y4NuN4DuvxG5lH1umgCxpYUZUY7E40mtBH0LEMjdHeH87fhGxMCxpYUZUYjhvteKePlt1.; searchTimeFlags=1; updatetime-advInput=2024-12-19+17%3A42%3A08'
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
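# The CnkiSearchHeadersDownloaderMiddleware referenced by the spiders is not included in this diff.
# A minimal sketch of what such a middleware could look like, assuming it only injects the
# SEARCH_REQUEST_HEADERS defined above into every outgoing request (class name is hypothetical):
class CnkiSearchHeadersDownloaderMiddlewareSketch:
    def __init__(self, headers: dict):
        self.headers = headers

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getdict("SEARCH_REQUEST_HEADERS"))

    def process_request(self, request, spider):
        # only fill headers the request does not already carry
        for k, v in self.headers.items():
            request.headers.setdefault(k, v)
        return None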

@ -1,185 +0,0 @@
from __future__ import annotations
import math
from copy import deepcopy
from datetime import datetime
from typing import TYPE_CHECKING, Any, Self
from pprint import pformat
import scrapy
from science_article_cnki.items import CnkiIdRelationItem, CnkiArticleTodoIdItem, CnkiCitedNumberItem
from science_article_cnki.models.enum_cls import SingleResultEnum
from science_article_cnki.models import cnki_model as model
from science_article_cnki.utils import tools
from science_article_cnki.utils.tools import parse_datetime, add_year2item
from science_article_cnki.utils.ti_match_id import ti2format, ti2unique_type2
from science_article_cnki.configs import cnki as config
class CnkiArticleCrossdbSpider(scrapy.Spider):
name = "cnki_article_crossdb"
custom_settings = dict(
DOWNLOADER_MIDDLEWARES={
"science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
},
ITEM_PIPELINES={
"science_article_cnki.pipelines.MongoPipeline": 300,
"science_article_cnki.pipelines.DupTodoPipeline": 310,
# "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
},
# LOG_LEVEL="INFO"
)
source = 'cnki'
resource_type: str = "总库"
query_id: int
query: str
filters: list = list()
def open_spider(self):
"""
"""
pass
async def start(self):
m = dict(query=self.query, resource_type=self.resource_type, page=1)
m.update(filters=self.filters)
query_body = model.adv_refine_search(**m)
# merge the filter conditions into the query body
model.add_muti_filters(base_query=query_body, filters=m.get("filters"))
form_d = model.adv_query_search(query_body, **m)
yield scrapy.FormRequest(url=config.CNKI_ADV_SEARCH_API, method="POST",
formdata=form_d, meta=dict(REQUEST_Q=m))
def parse(self, response, **kwargs):
"""
The first request is handled by this parse callback.
"""
request_q = response.meta["REQUEST_Q"]
msg = """当前检索: %(query)s,\n筛选项: %(filters)s,\n页数: %(page)s"""
kws = {
"query": request_q.get("query"),
"filters": pformat(request_q.get("filters", [])),
"page": '{c}/{m}'.format(c=request_q.get("page", 1), m=request_q.get("max_page", 'null'))
}
self.logger.info(msg % kws)
# -------------------------------------------- work out how many result pages there are --------------------------------------------
# extract the total number of hits
total_prm = response.xpath('//span[@class="pagerTitleCell"]/em/text()').get()
if not total_prm:
return
total = tools.str2int(total_prm.replace(',', ''))  # normalise the count string and convert it to an int
# compute the total number of pages
max_page = math.ceil(total / config.BATCH_SEARCH_RESULT_LIMIT)
request_q['max_page'] = max_page
batch_time = datetime.now()
# ---------------------------------------------- extract the articles in the result list ----------------------------------------------
tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr')
for tr_node in tr_nodes:
check_v = tr_node.xpath('./td[@class="seq"]/input/@value').get()  # the v value used for download/export
article_title = tr_node.xpath('./td[@class="name"]/a//text()').getall()  # article title
article_title = article_title and ''.join(article_title)
article_link = tr_node.xpath('./td[@class="name"]/a/@href').get()  # article link (carries the v value)
source_title = tr_node.xpath('./td[@class="source"]/*/a/text()').get()  # publication (journal) name
db_name = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-dbname').get()  # source database
third_id = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-filename').get()  # third-party id
cited_str = tr_node.xpath('./td[@class="quote"]/span/a/text()').get()  # citation-count string
param = tools.url_parse(article_link)
v = check_v
ti_format = ti2format(article_title)
ti_unique = ti2unique_type2(ti=ti_format, so=source_title)
if third_id:
relation_item = CnkiIdRelationItem()
relation_item['third_id'] = third_id
relation_item['query_ids'] = [self.query_id]
# attach the year to the relation
add_year2item(relation_item, request_q.get("year"), tr_node.xpath('./td[@class="date"]/text()').get())
relation_item['updated_at'] = batch_time
yield relation_item
if cited_str:
cited_item = CnkiCitedNumberItem(**dict(third_id=third_id, cited=tools.str2int(cited_str, 0), updated_at=batch_time))
yield cited_item
yield CnkiArticleTodoIdItem(**dict(third_id=third_id, db_code=db_name, ti=ti_unique, v=v, state=0))
q_bak: dict = deepcopy(request_q)
q_bak['page'] += 1
query_body = model.adv_refine_search(**q_bak)
model.add_muti_filters(base_query=query_body, filters=q_bak.get("filters"))
search_param = model.adv_query_search(query_body, **q_bak)
yield scrapy.FormRequest(
url=config.CNKI_ADV_SEARCH_API, method="POST",
formdata=search_param,
callback=self.parse_other_page,
meta=dict(REQUEST_Q=q_bak)
)
async def parse_other_page(self, response, **kwargs):
priority = response.request.priority
request_q = response.meta["REQUEST_Q"]
msg = """当前检索: %(query)s,\n筛选项: %(filters)s,\n页数: %(page)s"""
kws = {
"query": request_q.get("query"),
"filters": pformat(request_q.get("filters", [])),
"page": '{c}/{m}'.format(c=request_q.get("page", 1), m=request_q.get("max_page", 'null'))
}
self.logger.info(msg % kws)
batch_time = datetime.now()
# ---------------------------------------------- extract the articles in the result list ----------------------------------------------
tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr')
for tr_node in tr_nodes:
check_v = tr_node.xpath('./td[@class="seq"]/input/@value').get()  # the v value used for download/export
article_title = tr_node.xpath('./td[@class="name"]/a/text()').get()  # article title
article_link = tr_node.xpath('./td[@class="name"]/a/@href').get()  # article link (carries the v value)
source_title = tr_node.xpath('./td[@class="source"]/*/a/text()').get()  # publication (journal) name
db_name = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-dbname').get()  # source database
third_id = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-filename').get()  # third-party id
cited_str = tr_node.xpath('./td[@class="quote"]/span/a/text()').get()  # citation-count string
param = tools.url_parse(article_link)
v = check_v
ti_format = ti2format(article_title)
ti_unique = ti2unique_type2(ti=ti_format, so=source_title)
if third_id:
relation_item = CnkiIdRelationItem()
relation_item['third_id'] = third_id
relation_item['query_ids'] = [self.query_id]
# attach the year to the relation
add_year2item(relation_item, request_q.get("year"), tr_node.xpath('./td[@class="date"]/text()').get())
relation_item['updated_at'] = batch_time
yield relation_item
if cited_str:
cited_item = CnkiCitedNumberItem(**dict(third_id=third_id, cited=tools.str2int(cited_str, 0), updated_at=batch_time))
yield cited_item
yield CnkiArticleTodoIdItem(**dict(third_id=third_id, db_code=db_name, ti=ti_unique, v=v, state=0))
"""
# -------------------------------------------------- 翻页逻辑 --------------------------------------------------
"""
if request_q['page'] < request_q['max_page']:
q_bak = deepcopy(request_q)
"""
2023年6月29日14:56:44 处理倒序逻辑
cnki单次检索限制6000条即6000/50=120当6000<数量<12000可以使用倒序来进行补充
"""
# 限制6000条的逻辑
if q_bak['page'] >= 120 and q_bak.get('sort') != 'asc':
q_bak['page'] = 0
q_bak['sort'] = 'asc'
q_bak['max_page_sum'] = q_bak['max_page']
q_bak['max_page'] = q_bak['max_page_sum'] - 120 + 2
# 倒序处理逻辑结束
q_bak['page'] += 1
query_body = model.adv_refine_search(**q_bak)
model.add_muti_filters(base_query=query_body, filters=q_bak.get("filters"))
search_param = model.adv_query_search(query_body, **q_bak)
yield scrapy.FormRequest(
url=config.CNKI_ADV_SEARCH_API, method="POST",
formdata=search_param, priority=priority,
callback=self.parse_other_page,
meta=dict(REQUEST_Q=q_bak)
)
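# Worked example of the page arithmetic used above, assuming BATCH_SEARCH_RESULT_LIMIT is 50
# (the "6000 / 50 = 120" figure in the comment implies that page size; the total is made up):
import math

total = 9000                          # hypothetical number of hits
max_page = math.ceil(total / 50)      # 180 pages when sorted descending
# pages 1..120 cover the first 6000 records; switching to sort='asc' restarts from page 1
# with a budget of max_page - 120 + 2 = 62 pages, i.e. the remaining 3000 records (60 pages)
# plus a small overlap.
print(max_page, max_page - 120 + 2)   # -> 180 62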

@ -33,7 +33,7 @@ class CnkiCitedNumberSpider(scrapy.Spider):
# 比如判断如果没有参数从数据库中读取
return super().from_crawler(crawler, *args, **kwargs)
def __init__(self, query: str = None, resource_type: str = "学术期刊", query_condition: dict = None, **kwargs: Any):
def __init__(self, query: str = None, resource_type: str = "JOURNAL", query_condition: dict = None, **kwargs: Any):
super().__init__(**kwargs)
self.query = query
self.resource_type = resource_type

@ -1,30 +0,0 @@
from typing import Any, List, Dict, Self, AsyncIterator
import scrapy
from scrapy.crawler import Crawler
from science_article_cnki.models import cnki_model as model
from science_article_cnki.configs import cnki as config
class CnkiIdsDownloadSpider(scrapy.Spider):
name = "cnki_ids_download"
allowed_domains = ["cnki.net"]
start_urls = ["https://cnki.net"]
@classmethod
def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any) -> Self:
return super().from_crawler(crawler, *args, **kwargs)
def __init__(self):
scrapy.Spider.__init__(self)
self.id_list: List[Dict[str, str]] = None
async def start(self):
yield scrapy.FormRequest(
config.CNKI_EXPORT_XLS_OLD_API,
method='POST',
formdata=model.export_data(self.id_list),
)
def parse(self, response):
pass

@ -1,15 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2024/5/13 16:53
# @Author : zhaoxiangpeng
# @File : extract_rule.py
# extract the ISSN number
ISSN_REGEX_PATTERN = r'ISSN(\d{4}-[\dX]{4})'
# extract the CN number (domestic unified serial number), https://baike.baidu.com/item/%E5%9B%BD%E5%86%85%E7%BB%9F%E4%B8%80%E5%88%8A%E5%8F%B7/386463
CN_REGEX_PATTERN = r'CN(\d{2}-\d{4}/?[A-Z]?)'
# strip / replace special characters in titles
DEL_TITLE_SYMBOL_PATTERN = '[!"#$%&\'()*+,-.·/:;<=>—?@,。?★、…()【】《》?“”‘’![\\]^_`{|}~\s]+'
# pattern used to strip special characters from source (journal) names
DEL_SOURCE_SYMBOL_PATTERN = DEL_TITLE_SYMBOL_PATTERN
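# Illustrative matches for the patterns above (the sample string is made up):
import re

_sample = "某期刊 ISSN1009-3370 CN11-4083/T"
print(re.search(ISSN_REGEX_PATTERN, _sample).group(1))  # -> 1009-3370
print(re.search(CN_REGEX_PATTERN, _sample).group(1))    # -> 11-4083/T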

@ -1,8 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/12 14:31
# @Author : zhaoxiangpeng
# @File : logformat.py
def pformat_dict(**kwargs):
return ', '.join([f'{k}={v}' for k, v in kwargs.items()])
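# Illustrative call of pformat_dict (the argument names are made up):
print(pformat_dict(query_id=1609, page=3, sort='asc'))  # -> query_id=1609, page=3, sort=asc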

@ -1,409 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2024/5/23 16:22
# @Author : zhaoxiangpeng
# @File : ti_match_id.py
from .tools import ji2format, ti2format
TYPE1 = '1@'
TYPE2 = '2@'
TAG_TYPE_LENGTH = 2
GET_TYPE_MAP = {TYPE1, TYPE2}
def ti2unique(ti=None, ji=None, y=None, i=None):
"""
Build a unique key from the title plus other markers
:param ti: title
:param ji: journal ISSN
:param y: year
:param i: issue
:return:
"""
if ji:
ji = ji2format(ji)
ti_unique = '{ti}:{j}:{y}{i}'.format(ti=ti, j=ji, y=y, i=i)
return ti_unique
def ti2unique_type1(ti=None, ji=None, y=None, i=None):
"""
Build a unique key from the title plus other markers
:param ti: title
:param ji: journal ISSN
:param y: year
:param i: issue
:return:
"""
if ji:
ji = ji2format(ji)
ti_unique = TYPE1 + '{ti}:{j}:{y}{i}'.format(ti=ti, j=ji, y=y, i=i)
return ti_unique
def ti2unique_type2(ti=None, so=None):
"""
Generate the unique id of a search-result record
:param ti:
:param so: journal (source) name
:return:
"""
if so:
so = ti2format(so)
ti_unique = TYPE2 + '{ti}:{so}'.format(ti=ti, so=so)
return ti_unique
class TitleMatchIdTool:
def __init__(self, id_ti_info: str = None):
self._id_ti_info = id_ti_info
self._map_count = 0
# self._unfold_map = self.make_ti_id_relation(id_ti_info)
self._unfold_map_extend = self.ti_id_relation(id_ti_info)
def make_ti_id_relation(self, ti_map_str: str) -> dict:
"""
Expand the id-title relation string into a nested lookup dict
:return:
"""
result = {}
count = 0
del_top_id_set = set()
del_top_issue = {}
del_issn_id_coll = {}
del_issue_coll = {}
id_ti_list = ti_map_str.split(';')
for s in id_ti_list:
count += 1
i, ti = s.split(',')
# 标题可能会重复,那么标题需要时一对多
# 预处理标题
ti_format, is_format, issue_format = ti.split(':')
if not result.get(ti_format):
result.setdefault(ti_format, {})
else:
del_top_id_set.add(ti_format)
# 需要先把标题的字典里有一个id字段
if not result[ti_format].get('id'):
result[ti_format].setdefault('id', i)
else:
# 设置过id说明标题有重复就要删除掉id字段进行下层处理
del_top_id_set.add(ti_format)
# 再加一层结构
if not result[ti_format].get(is_format):
result[ti_format].setdefault(is_format, {})
# 在issn层中设置id
result[ti_format][is_format].setdefault('id', i)
else:
# 说明issn有重复删除掉id
del_issn_id_coll.setdefault(ti_format, []).append(is_format)
# 在issn层中设置期关联id
if not result[ti_format][is_format].get(issue_format):
result[ti_format][is_format].setdefault(issue_format, i)
# 在标题层设置期关联id
if not result[ti_format].get(issue_format):
result[ti_format].setdefault(issue_format, i)
else:
del_top_issue.setdefault('issue', []).append(issue_format)
else:
# 说明同一本期刊的期也重复了,那么重复的期数都不可信
del_issue_coll.setdefault(ti_format, {}).setdefault(is_format, []).append(issue_format)
count -= 1
if del_top_id_set:
for del_top in del_top_id_set:
result[del_top].pop('id', None)
if del_issn_id_coll:
for key_ti, vals in del_issn_id_coll.items():
for val in vals:
result[key_ti][val].pop('id', None)
# 删除期重复的
if del_issue_coll:
for key_ti, is_info in del_issue_coll.items():
for ii in is_info:
for x in is_info[ii]:
count -= 1
# 移除issn中的的期
result[key_ti][ii].pop(x, None)
# 移除最上层的期
result[key_ti].pop(x, None)
self._map_count = count
return result
def get_id_use_ti_unique(self, ti_unique: str):
"""
Match an id using a normalised TYPE1 key (title:issn:year+issue)
:param ti_unique:
:return:
"""
unfold_map = self._unfold_map_extend.get(TYPE1)
if not unfold_map:
return None
ti_format, is_format, issue_format = ti_unique.split(':')
obj = unfold_map.get(ti_format)
if not obj:
return None
# 直接获取id key如果可以获取到说明只有一个没有重复直接返回
t_id = obj.get('id')
if t_id:
return t_id
# 没有获取到走issn的逻辑issn可能会因为过刊的原因变化
is_obj = obj.get(is_format) # 获取到issn层的对象
if not is_obj:
return None
# 如果获取到,那么唯一
t_id = is_obj.get('id')
if t_id:
return t_id
# 获取不到可能不唯一,进行期获取
t_id = is_obj.get(issue_format)
if t_id:
return t_id
# 以上流程走完如果还没有匹配到,那么直接通过标题和期进行匹配
issue_t_id = obj.get(issue_format, None)
# 如果依然获取不到,返回 None流程结束
return issue_t_id
def get_id_use_ti_unique2(self, ti_unique: str):
"""
Match an id using a normalised TYPE2 key (title:journal-name)
:param ti_unique:
:return:
"""
unfold_map = self._unfold_map_extend.get(TYPE2)
if not unfold_map:
return None
ti_format, so_format = ti_unique.split(':')
obj = unfold_map.get(ti_format)
if not obj:
return None
# try the 'id' key first; if it exists the title is unique and can be returned directly
t_id = obj.get('id')
if t_id:
return t_id
# otherwise fall back to the journal-name level
is_obj = obj.get(so_format)  # the object stored under the journal name
if not is_obj:
return None
return is_obj.get('id')  # the title repeats across journals: return the id stored under this journal, or None if it is ambiguous
def get_id_control(self, ti_unique: str):
type_ = ti_unique[:TAG_TYPE_LENGTH]
if type_ not in GET_TYPE_MAP:
type_ = TYPE1
ti2uni = ti_unique
else:
ti2uni = ti_unique[TAG_TYPE_LENGTH:]
func = {TYPE1: self.get_id_use_ti_unique, TYPE2: self.get_id_use_ti_unique2}
return func[type_](ti2uni)
def count(self):
return self._map_count
@staticmethod
def format_article_title(title: str) -> str:
"""
Strip special characters from the title
:param title:
:return:
"""
@staticmethod
def format_journal_issn(issn: str) -> str:
"""
Strip special characters from the ISSN
:param issn:
:return:
"""
def make_ti_id_type1(self, datas):
result = {}
count = 0
del_top_id_set = set()
del_top_issue = {}
del_issn_id_coll = {}
del_issue_coll = {}
for s in datas:
count += 1
i, ti = s
# 标题可能会重复,那么标题需要时一对多
# 预处理标题
ti_format, is_format, issue_format = ti.split(':')
if not result.get(ti_format):
result.setdefault(ti_format, {})
else:
del_top_id_set.add(ti_format)
# 需要先把标题的字典里有一个id字段
if not result[ti_format].get('id'):
result[ti_format].setdefault('id', i)
else:
# 设置过id说明标题有重复就要删除掉id字段进行下层处理
del_top_id_set.add(ti_format)
# 再加一层结构
if not result[ti_format].get(is_format):
result[ti_format].setdefault(is_format, {})
# 在issn层中设置id
result[ti_format][is_format].setdefault('id', i)
else:
# 说明issn有重复删除掉id
del_issn_id_coll.setdefault(ti_format, []).append(is_format)
# 在issn层中设置期关联id
if not result[ti_format][is_format].get(issue_format):
result[ti_format][is_format].setdefault(issue_format, i)
# 在标题层设置期关联id
if not result[ti_format].get(issue_format):
result[ti_format].setdefault(issue_format, i)
else:
del_top_issue.setdefault('issue', []).append(issue_format)
else:
# 说明同一本期刊的期也重复了,那么重复的期数都不可信
del_issue_coll.setdefault(ti_format, {}).setdefault(is_format, []).append(issue_format)
count -= 1
if del_top_id_set:
for del_top in del_top_id_set:
result[del_top].pop('id', None)
if del_issn_id_coll:
for key_ti, vals in del_issn_id_coll.items():
for val in vals:
result[key_ti][val].pop('id', None)
# 删除期重复的
if del_issue_coll:
for key_ti, is_info in del_issue_coll.items():
for ii in is_info:
for x in is_info[ii]:
count -= 1
# 移除issn中的的期
result[key_ti][ii].pop(x, None)
# 移除最上层的期
result[key_ti].pop(x, None)
self._map_count += count
return result
def make_ti_id_type2(self, datas: list):
result = {}
count = 0
del_top_id_set = set()
del_so_coll = {}
for data in datas:
count += 1
t_id, ti_uni = data
ti_format, so_format = ti_uni.split(':')
if not result.get(ti_format):
result.setdefault(ti_format, {})
else:
del_top_id_set.add(ti_format)
# 需要先把标题的字典里有一个id字段
if not result[ti_format].get('id'):
result[ti_format].setdefault('id', t_id)
else:
# 设置过id说明标题有重复就要删除掉id字段进行下层处理
del_top_id_set.add(ti_format)
# 处理期刊名称
if not result[ti_format].get(so_format):
result[ti_format].setdefault(so_format, {})
# 在期刊名称层中设置id
result[ti_format][so_format].setdefault('id', t_id)
else:
# 说明期刊名称有重复删除掉id
del_so_coll.setdefault(ti_format, []).append(so_format)
if del_top_id_set:
for del_top in del_top_id_set:
result[del_top].pop('id', None)
if del_so_coll:
for key_ti, vals in del_so_coll.items():
for val in vals:
result[key_ti][val].pop('id', None)
self._map_count += count
return result
def ti_id_relation(self, ti_map_str: str):
count = 0
# group the entries by key type
var_map = dict()
id_ti_list = ti_map_str.split(';')
for s in id_ti_list:
count += 1
i, ti = s.split(',')
t = ti[:TAG_TYPE_LENGTH]  # read the type tag
# -------------------- fall back to the default when no type tag is present --------------------
if t not in GET_TYPE_MAP:
t = TYPE1  # default to TYPE1
else:
ti = ti[TAG_TYPE_LENGTH:]
var_map.setdefault(t, []).append((i, ti))
case = dict()
while var_map:
t, info = var_map.popitem()
if t == TYPE1:
case[t] = self.make_ti_id_type1(info)
elif t == TYPE2:
case[t] = self.make_ti_id_type2(info)
else:
raise ValueError("%s 类型解析未实现" % t)
return case
if __name__ == '__main__':
"""
ti_map = TitleMatchIdTool(
id_ti_info='BLDS201706005,区域大气污染排放效率变化趋势地区差距与影响因素基于长江经济带11省市的面板数据:10093370:20176;BLDS201706004,北京机动车环境外部成本的测算:10093370:20176;BLDS201706003,个体异质性与环境公共物品的私人有效供给:10093370:20176;BLDS201706002,绿色治理变迁逻辑政策反思与展望基于19782016年政策文本分析:10093370:20176;BLDS201706001,新能源汽车产业专利池的形成机制:10093370:20176;BLDS201806021,北京理工大学学报社会科学版征稿简则:10093370:20186;BLDS201806020,北京理工大学学报社会科学版2018年总目录:10093370:20186;BLDS201806019,双一流建设高校的全要素科技创新效率研究:10093370:20186;BLDS201806018,公共财政如何促进教育公平基于广东省基础教育创强专项资金绩效评价:10093370:20186;BLDS201806017,行政裁量行为的合理性审查研究:10093370:20186;BLDS201806016,大数据视野下环境侵权诉讼证据制度的优化:10093370:20186;BLDS201806015,食品安全监管国际软法变革论食品安全全球治理的视角:10093370:20186;BLDS201806014,不动产善意取得中无权处分认定研究:10093370:20186;BLDS201806013,国民经济动员立法的必要性及重难点:10093370:20186;BLDS201806012,中国经济增长方式转变的影响因素及路径选择:10093370:20186;BLDS201806011,农地闲置治理中的村民互助地方经验与缺陷补正以四川省G村为例:10093370:20186;BLDS201806010,房价波动银行信贷与产业升级基于银行信贷中介效应检验及区域差异对比分析:10093370:20186;BLDS201806009,中国股票市场信息流关联网络基于转移熵的实证研究:10093370:20186;BLDS201806008,国家创新型城市效率评价研究基于两阶段DEA模型:10093370:20186;BLDS201806007,共享经济监管机制对感知隐私风险消费者信任及持续共享意愿的影响:10093370:20186;BLDS201806006,区域旅游业碳排放的时空差异以山东省为例:10093370:20186;BLDS201806005,中国储能产业中动力电池梯次利用的商业价值:10093370:20186;BLDS201806004,环境规制空间溢出与区域生态效率基于空间杜宾面板模型的实证分析:10093370:20186;BLDS201806003,环境约束下中国工业部门能源投入的拥塞效应:10093370:20186;BLDS201806002,陷入惩戒牢笼失信惩戒是否抑制了企业创新来自废水国控重点监测企业的证据:10093370:20186;BLDS201806001,基准线法下企业最优碳减排和产品定价决策:10093370:20186;BLDS201805022,北京理工大学学报社会科学版征稿简则:10093370:20185;BLDS201805021,社会自主性的三种提升路径:10093370:20185;BLDS201805020,朱子学与日本近世儒学的一元论倾向:10093370:20185;BLDS201805019,从共同体之善的定位到价值中立原则的悖论反思权利绝对化及其隐忧:10093370:20185;BLDS201805018,法律效力的道德条件比较分析:10093370:20185;BLDS201805017,中国刑事证人保护制度的问题与完善:10093370:20185;BLDS201805016,诉权层次论视域下的行政诉权要件探析基于诉权本质学说与诉权要件之关联性考察:10093370:20185;BLDS201805015,岛礁之辨的分歧及其消解路径:10093370:20185;BLDS201805014,中国自然灾害与长期经济增长基于VAR与VEC模型的协整分析:10093370:20185;BLDS201805013,区块链技术在政府数据治理中的应用优势挑战与对策:10093370:20185;BLDS201805012,基于三方演化博弈的网约车出行市场规制策略:10093370:20185;BLDS201805011,中国工业行业产能利用率测度分析:10093370:20185;BLDS201805010,基于VAR模型P2P网络借贷与传统金融市场之间的动态变化:10093370:20185;BLDS201805009,电子口碑平台对感知可信度及购买意愿的影响:10093370:20185;BLDS201805008,协同创新网络与组织创新绩效的关系:10093370:20185;BLDS201805007,基于网络搜索指数的股票市场微观结构特征:10093370:20185;BLDS201805006,公众环境关心指数编制及其影响因素以北京市为例:10093370:20185;BLDS201805005,公众环境治理参与行为的多层分析:10093370:20185;BLDS201805004,政府规制下废旧汽车非正规回收渠道的演化博弈:10093370:20185;BLDS201805003,中国电力消费周期的路径演化识别基于Markov区制转移模型:10093370:20185;BLDS201805002,基于改进希尔伯特黄变换算法的碳市场价格多尺度分解:10093370:20185;BLDS201805001,地方政府环境规制竞争背景下地区间的企业污染排放行为:10093370:20185;BLDS201804020,北京理工大学学报社会科学版征稿简则:10093370:20184;BLDS201804019,建国初期劳动教育的兴起与上海的地方性实践:10093370:20184;'
'BLDS201806001,北京理工大学学报社会科学版征稿简则:10093370:20186;GDWZ202206019,2肝癌转移的免疫微环境:肝胆外科杂志;HDLG2022S2014,2U型管式蒸汽发生器内改性壁面强化传热数值研究:核动力工程;HDLG2022S2012,2高温下锆合金包壳切向微动磨蚀行为研究:核动力工程;HDLG2022S2003,2铅铋螺旋管壳侧流动传热数值模拟研究:核动力工程;ZLJS202206003,2基于对抗网络的冷水机组制冷剂泄漏故障跨工况诊断研究:制冷技术;ZLJS202206001,2高温梯级相变胶囊堆积床储热系统数值研究:制冷技术;QHDL202204013,2调相机整流电路电阻绝缘垫块灼烧问题分析及处理:青海电力;ZLJS202206014,2模糊控制与模型预测控制在空调系统中应用的研究现状:制冷技术;CCJY2022S1005,2双一流背景下一流大学毕业生就业质量评估模型的构建与应用:成才与就业;TJCX202206007,2高速铁路半封闭式声屏障脉动风压特性实车测试研究:铁路技术创新;SHGL202204029,2上海市国家高速公路命名编号调整工作实施效果评价:上海公路;SHGL202204025,2基于轨迹和气象数据的高速公路行车安全风险研究:上海公路;NTKT2022S2015,2地铁站台与轨行区的非均匀非稳态流场及热平衡仿真分析:暖通空调;XNYJ202206007,2密闭空间内10氢气浓度的氢气空气混合气体燃爆的仿真和实验研究:新能源进展;SLJX202206003,2浮式电站黑启动方式下谐波影响因素及其表征:发电技术;ZUAN202212023,2失代偿期肝硬化的新定义:肝脏;ZUAN202212005,2门静脉成纤维细胞有望成为可再生肌成纤维细胞的新来源:肝脏;ZUAN202212004,2肝窦内皮细胞介导的细胞串扰在肝纤维化中的作用:肝脏;ZUAN202212003,2药物性肝损伤的生物标志物研究进展:肝脏;YYXX202206017,22型糖尿病患者血清叶酸和维生素B:营养学报;ZUAN202212024,2原发性肝脏神经内分泌肿瘤的诊治现状:肝脏;ZUAN202212002,2固有淋巴样细胞抗肿瘤免疫新进展:肝脏;SHGL202204017,2克服无人机拍摄扰动的高精度车辆轨迹数据提取方法:上海公路;XJZZ202206009,2北京小剧场戏曲节的发展节点与编剧养成:戏剧中央戏剧学院学报;GDYJ202212006,2双极性方波场下电晕老化对环氧树脂空间电荷特性的影响:高电压技术;SDLJ202205011,2尺度效应对船舶在受限水域航行时的流场偏移影响研究:水动力学研究与进展A辑;LSBL202212014,2EBV相关性胃癌21例临床病理学分析:临床与实验病理学杂志;SXGC202212015,2面向PHF工艺的7075T6铝合金高温变形行为:塑性工程学报;LSBL202212001,2第五版WHO肾脏肿瘤新分类主要变化解读:临床与实验病理学杂志;TSZM202212001,2增强极端天气下城市治理的韧性:探索与争鸣;TSZM202212024,2提升以个人为中心的城市应急管理能力:探索与争鸣;TSZM202212019,2践行人民城市重要理念扎实推进气候适应型城市建设:探索与争鸣;TSZM202212023,2发挥新媒体平台在城市重大气象灾害风险治理中的社会协同作用:探索与争鸣;HGSZ202212001,2微反应器内连续制备拓扑结构聚合物的研究进展:化工学报;MYSY202206005,2略论电影想象力消费的三个层面:民族艺术研究;XXCB202206005,2广州市5岁以下腹泻儿童人芽囊原虫感染流行病学特征及影响因素:中国血吸虫病防治杂志;ZLDT202212006,2超长鸡舍夏季湿帘通风时舍内温度分布研究:制冷与空调;GCSJ202206010,2超精密大行程麦克斯韦磁阻驱动器磁场建模与推力分析:工程设计学报;HEBG202212014,2U50Zr螺旋十字燃料热力耦合特性分析:哈尔滨工程大学学报;HJGC202212003,2填埋场好氧修复过程碳排放特征及削减研究:环境工程;GZTX202206006,2双减背景下社会力量参与学校体育的价值困境与对策:广州体育学院学报;SDLJ202206001,2基于神经网络的船舶剖面参数化建模与辐射水动力系数预测:水动力学研究与进展A辑;ZZLL202211001,2PI3KAKT信号转导通路关键蛋白在皮肤光老化及皮肤鳞状细胞癌中的表达研究:肿瘤;SDLJ202206003,2基于虚实结合的波浪环境下船舶操纵运动机器学习建模研究:水动力学研究与进展A辑;SDLJ202206015,2仿鸮前缘突节风机叶片气动流场的数值模拟:水动力学研究与进展A辑;ZJJB202202005,2高校实验室仪器设备管理维护现存问题及对策探究:中国教育技术装备;DZXU202212013,2基于分形超表面的小型化宽带高透射率平面透镜天线:电子学报;QHMS202204002,2大学生铸牢中华民族共同体意识的内涵特征本质规律与实践进路:青海民族大学学报社会科学版;JJYS202206009,2新时代社会主义意识形态凝聚力和引领力提升的路径研究:经济与社会发展;ZJJB202214034,2航空航天方向本科实验教学改革探讨:中国教育技术装备'
)
print(ti_map.count())
print(ti_map.get_id_control('六地企业顾客关系管理现状调研报告:10035192:20054'))
print(ti_map.get_id_control('北京理工大学学报社会科学版征稿简则:10093370:20185'))
print(ti_map.get_id_control('北京理工大学学报社会科学版征稿简则:10093370:20186'))
print(ti_map.get_id_control('2肝癌转移的免疫微环境:肝胆外科杂志'))
"""
ti_map = TitleMatchIdTool('BJTJ200310003,当前北京经济运行中的主要问题:10065954:200310;'
'BJTJ200310002,北京市2003年19月份主要经济指标:10065954:200310;'
'BJTJ200310001,今年北京经济呈V字型走势:10065954:200310;'
'BJTJ200310000,让我轻轻地告诉你:10065954:200310;'
'BJTJ2003Z1057,动态简讯:10065954:2003Z1;'
'BJTJ2003Z1055,统计员颂歌歌词:10065954:2003Z1;'
'BJTJ2003Z1054,庆祝北京市统计局建局五十周年有感:10065954:2003Z1;'
'BJTJ2003Z1053,第二回美女午后品茶费雪突发灵感统计学的故事一:10065954:2003Z1;'
'BJTJ2003Z1052,享受快乐统计:10065954:2003Z1;'
'BJTJ2003Z1051,数字的回响:10065954:2003Z1;'
'BJTJ2003Z1050,从统计资料调查报告看美国社会现象之一斑:10065954:2003Z1;'
'BJTJ2003Z1049,国际统计学会职业道德宣言讲了些什么:10065954:2003Z1;'
'BJTJ2003Z1048,怎样避免统计基本概念与方法的误用:10065954:2003Z1;'
'BJTJ2003Z1047,裁文匠笔戒律为先统计分析报告写作十戒:10065954:2003Z1;'
'BJTJ2003Z1046,谈谈怎样用活统计数据:10065954:2003Z1;'
'BJTJ2003Z1045,摩托罗拉公司成功运作电子商务案例摩托罗拉公司房地产项目网上审批系统的实施:10065954:2003Z1;'
'BJTJ2003Z1044,应实行能源的全社会统计:10065954:2003Z1;'
'BJTJ2003Z1043,统计方法制度改革的思考:10065954:2003Z1;'
'BJTJ2003Z1042,北京市物流现状调查设计的问题与思考兼与北京市物流现状调查表设计者商榷:10065954:2003Z1;'
'BJTJ2003Z1041,行政诉讼中的补证问题:10065954:2003Z1;'
'BJTJ2003Z1040,统计执法程序及执法文书的使用与制作十三:10065954:2003Z1;'
'BJTJ2003Z1039,兼职做统计工作也要认真对待:10065954:2003Z1;'
'BJTJ2003Z1038,统计违法行为处罚难难在何处:10065954:2003Z1;'
'BJTJ2003Z1037,居民睡眠用时多少北京居民生活时间分配调查系列报告之八:10065954:2003Z1;'
'BJTJ2003Z1036,北京市劳动岗位人员需求知多少:10065954:2003Z1;'
'BJTJ2003Z1035,难忘在希望的田野上:10065954:2003Z1;'
'BJTJ2003Z1034,政府统计为企业微观评价提供了丰富营养:10065954:2003Z1;'
'BJTJ2003Z1033,难说再见:10065954:2003Z1;'
'BJTJ2003Z1032,天道酬勤记市统计局新闻发言人于秀琴:10065954:2003Z1;'
'BJTJ2003Z1031,追忆跨越祝福:10065954:2003Z1;'
'BJTJ2003Z1030,50年的评说抒怀寄语:10065954:2003Z1;'
'BJTJ2003Z1029,2001年首都经济六大行业前10名按2001年经营收入排序:10065954:2003Z1;'
'BJTJ2003Z1028,首都经济200强强在何处:10065954:2003Z1;'
'BJTJ2003Z1027,新企业会计制度与会计准则和股份有限公司会计制度的主要差异二十:10065954:2003Z1;'
'BJTJ2003Z1026,企业效绩评价操作细则修订八:10065954:2003Z1;'
'BJTJ2003Z1025,2002年批发零售贸易业餐饮业年报培训测试题及答案:10065954:2003Z1;'
'BJTJ2003Z1024,2003年固定资产投资房地产开发定期报表制度填报方法二:10065954:2003Z1;'
'BJTJ2003Z1023,如何看待居民消费价格指数和商品零售价格指数的数据差异:10065954:2003Z1;'
'BJTJ2003Z1022,工业主要产品产量统计数据审核要点:10065954:2003Z1;'
'BJTJ2003Z1021,北京市消费者信心指数是怎样编制的:10065954:2003Z1;'
'BJTJ2003Z1020,北京市第二次投入产出工作会议召开:10065954:2003Z1;'
'BJTJ2003Z1019,消费需求扩张环境问题突出北京市人均GDP突破3000美元究竟意味着什么之二:10065954:2003Z1;'
'BJTJ2003Z1018,北京知识经济发展进程及分析:10065954:2003Z1;'
'BJTJ2003Z1017,北京城市竞争力状况与变化:10065954:2003Z1;'
'BJTJ2003Z1016,北京应在哪些领域巩固和培育经济增长点:10065954:2003Z1;'
'BJTJ2003Z1015,京房景气指数京投景气指数均呈降势:10065954:2003Z1;'
'BJTJ2003Z1014,关于征集北京市第十二届统计科学讨论会论文的通知:10065954:2003Z1;'
'BJTJ2003Z1013,怎样科学分析经济形势:10065954:2003Z1;'
'BJTJ2003Z1012,稳健统计在经济指标中的应用探讨:10065954:2003Z1;'
'BJTJ2003Z1011,关于我国数理统计学发展中存在的问题的几点思考:10065954:2003Z1')
print(ti_map)
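# A second, minimal demo of the two key formats handled above (the ids and titles are made up):
demo = TitleMatchIdTool('YUCE202202005,配额约束下考虑回收维修努力的共享单车供应链决策与协调:20970145:20222;'
                        'GDWZ202206019,2@肝癌转移的免疫微环境:肝胆外科杂志')
print(demo.get_id_control('配额约束下考虑回收维修努力的共享单车供应链决策与协调:20970145:20222'))  # untagged TYPE1 key -> 'YUCE202202005'
print(demo.get_id_control('2@肝癌转移的免疫微环境:肝胆外科杂志'))  # '2@'-tagged TYPE2 key -> 'GDWZ202206019'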

@ -1,16 +1,5 @@
# -*- coding: utf-8 -*-
# @Time : 2024/5/15 17:40
# @Author : zhaoxiangpeng
# @File : tools.py
import enum
import re
from typing import Dict, Union
from datetime import datetime, timedelta
from urllib.parse import urlparse, quote, unquote, parse_qs
from . import extract_rule
from typing import List, Tuple
from datetime import datetime
def str2int(val, replace=0):
@ -23,258 +12,6 @@ def str2int(val, replace=0):
return val
def replace_str(source_str, regex, replace_str=""):
"""
@summary: replace parts of a string
---------
@param source_str: the original string
@param regex: the regular expression
@param replace_str: what matches are replaced with; '' by default
---------
@result: the string after replacement
"""
str_info = re.compile(regex)
return str_info.sub(replace_str, source_str)
def url_parse(url: str):
"""
Parse a URL's query string into a dict
:param url:
:return:
"""
query = urlparse(url).query
params = parse_qs(query)
result = {key: params[key][0] if params[key].__len__() == 1 else params[key] for key in params}
return result
def parse_datetime(datetime_str):
"""
Parse a datetime string in one of several formats and return a datetime object.
Supported formats:
1. YYYY-MM-DD
2. YYYY-MM-DD HH:MM
3. YYYY-MM-DD HH:MM:SS
Args:
datetime_str (str): the datetime string
Returns:
datetime: the parsed datetime object, or None if no format matches
"""
formats = [
"%Y-%m-%d", # 2025-05-09
"%Y-%m-%d %H:%M", # 2025-05-08 16:16
"%Y-%m-%d %H:%M:%S" # 2025-04-15 14:40:03
]
for fmt in formats:
try:
return datetime.strptime(datetime_str, fmt)
except ValueError:
continue
return None
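# A quick, illustrative check of parse_datetime against the three supported formats:
print(parse_datetime("2025-05-09"))           # -> 2025-05-09 00:00:00
print(parse_datetime("2025-05-08 16:16"))     # -> 2025-05-08 16:16:00
print(parse_datetime("2025-04-15 14:40:03"))  # -> 2025-04-15 14:40:03
print(parse_datetime("2025/04/15"))           # -> None (unsupported format)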
def add_year2item(item, year: Union[int, None], pub_datetime):
"""
Attach a year to the relation item
:param item: CnkiIdRelationItem
:param year: the preferred year; when None it is derived from pub_datetime
:param pub_datetime: the date string taken from the page node; supported formats are those of parse_datetime
:return:
"""
if not year:
# if the meta does not carry a year, parse it from the page
dt = parse_datetime(pub_datetime)
if dt:
year = dt.year
if year:
item['year'] = year
return item
def parse_retrieval(query: str):
"""
Parse the retrieval (aside) string into the pieces used to build queryJson
:param query:
:return:
"""
def func(string: str):
stand = string[1:-1]  # strip the full-width parentheses on both sides
title, value = stand.split(":", maxsplit=1)  # split "作者单位:湖南中医药大学(模糊)" -> ["作者单位", "湖南中医药大学(模糊)"]
return title, value[:-4], value[-3:-1]
cond_list = re.split(r'(AND|NOT|OR)', query)
logic = 'AND'
content = cond_list[0]
yield logic, func(content)
for i in range(1, len(cond_list), 2):
chunk = cond_list[i:i + 2] # 获取两个元素
logic, content = chunk
yield logic, func(content)
def parse_updatedtime_symbol(symbol: str, today: str = None) -> tuple:
"""
Resolve a time-range symbol (e.g. 最近一周) into a (start, end) date pair
:param symbol:
:param today:
:return:
"""
if today and isinstance(today, str):
today = datetime.strptime(today, "%Y-%m-%d")
else:
today = datetime.now()
if symbol == "最近一周":
ago_day = today - timedelta(days=7)
elif symbol == "最近一月":
ago_day = today - timedelta(days=30)
elif symbol == "最近半年":
ago_day = today - timedelta(days=181)
elif symbol == "最近一年":
ago_day = today.replace(year=today.year-1)
elif symbol == "今年迄今":
ago_day = today.replace(month=1, day=1)
else:
ago_day = today
return ago_day.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d")
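# Illustrative calls with a fixed reference date so the output is deterministic:
print(parse_updatedtime_symbol("最近一周", today="2025-05-09"))  # -> ('2025-05-02', '2025-05-09')
print(parse_updatedtime_symbol("今年迄今", today="2025-05-09"))  # -> ('2025-01-01', '2025-05-09')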
def id_ti2map(ti_map_str: str) -> Dict[str, str]:
"""
Convert the "third_id,title" relation string into a dict that maps titles to ids
:param ti_map_str:
:return:
"""
example = {
'配额约束下考虑回收维修努力的共享单车供应链决策与协调': {
'id': 'YUCE202202005',  # present when the title occurs only once
'20970145': 'YUCE202202005',  # the ISSN maps to a single id
'20222': 'YUCE202202005',  # the issue number maps to a single id
# when the title occurs more than once:
# 1. the same title appears in different journals
'00010002': 'ZHXP200101001',
'20011': 'ZHXP200101001',
# 2. the same title appears in the same journal and issue
'00010003': {
'id': 'ZHXP200101001',
'20011': 'ZHXP200101001'
}
}
}
result = {}
id_ti_list = ti_map_str.split(';')
for s in id_ti_list:
i, ti = s.split(',')
# a title may repeat, so a title has to be able to map to more than one id
# pre-process the title key
ti_format, is_format, issue_format = ti.split(':')
if not result.get(ti_format):
result.setdefault(ti_format, {})
# make sure the title's dict carries an 'id' field first
if not result[ti_format].get('id'):
result[ti_format].setdefault('id', i)
else:
result[ti_format].pop('id')
result[ti_format].setdefault(is_format, i)
result[ti_format].setdefault(issue_format, i)
# if ti in result:
# continue
# result.setdefault(ti, i)
return result
def get_id_from_map(ti_unique, ti_map: dict):
ti_format, is_format, issue_format = ti_unique.split(':')
obj = ti_map.get(ti_format)
if not obj:
return None
t_id = obj.get('id')
if t_id:
return t_id
for p in [is_format, issue_format]:
t_id = ti_map.get(p)
if t_id:
return t_id
def so2format(data):
"""
Strip the special suffix from a journal name
:param data:
:return:
"""
if not data:
return ''
split_result = re.split(extract_rule.DEL_SOURCE_SYMBOL_PATTERN, data)
return split_result[0]
def ji2format(data: str):
"""
Normalise an ISSN
:param data:
:return:
"""
if not data:
return ''
data = data.upper()
return re.sub(r'-', '', data)
def ti2format(data):
"""
Strip spaces and special characters from a title
:param data:
:return:
"""
return replace_str(data, extract_rule.DEL_TITLE_SYMBOL_PATTERN, "")
def ti2unique(ti=None, ji=None, y=None, i=None):
"""
Build a unique key from the title plus other markers
:param ti: title
:param ji: journal ISSN
:param y: year
:param i: issue
:return:
"""
if ji:
ji = ji2format(ji)
ti_unique = '{ti}:{j}:{y}{i}'.format(ti=ti, j=ji, y=y, i=i)
return ti_unique
def func_0(ti, todo_dic: dict):
"""
Match an id from the todo dict
:param ti:
:param todo_dic:
:return:
"""
t_id = todo_dic.get(ti)
if not t_id:
# retry the match with the journal name stripped
ti1, jn, q = ti.split(':')
# likewise the dict keys would need the journal name stripped (not implemented)
return t_id
if __name__ == '__main__':
# so2format('中国农业文摘-农业工程')
# id_ti2map('YUCE200504016,六地企业顾客关系管理现状调研报告:10035192:20054;YUCE200504015,中国电信业市场结构与X效率的实证研究:10035192:20054;YUCE200504004,基于质量合约的风险化管理初探:10035192:20054')
def get_today_date(fmt: str = "%Y-%m-%d"):
return datetime.today().strftime(fmt)
q1 = '(作者单位:湖南中医药大学(模糊)OR作者单位湖南中医学院(精确)'
q2 = '(作者单位:湖南中医药大学(模糊)OR作者单位湖南中医学院(模糊)OR篇名基于PINK1LETM1信号通路探讨何首乌苷减轻脑缺血再灌注损伤的作用机制(精确)'
q3 = '(作者单位:湖南中医药大学(模糊)OR作者单位湖南中医学院(模糊)AND篇名基于PINK1LETM1信号通路探讨何首乌苷减轻脑缺血再灌注损伤的作用机制(精确)'
g = parse_retrieval(q3)
i = 1
for _, s in g:
print(i, _, s)
i += 1
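# A few quick checks of the normalisation helpers defined above (the sample values are arbitrary):
print(ji2format('1009-3370'))  # -> '10093370'
print(ti2format('让我轻轻地告诉你!'))  # -> '让我轻轻地告诉你'
print(ti2unique(ti='让我轻轻地告诉你', ji='1009-3370', y=2003, i=10))  # -> '让我轻轻地告诉你:10093370:200310'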

@ -1,54 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/12 14:13
# @Author : zhaoxiangpeng
# @File : crawl_crossdb_article.py
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_cnki.spiders.cnki_article_crossdb import CnkiArticleCrossdbSpider
def starter_by_year():
@defer.inlineCallbacks
def f(range_list: list = None):
for y in range_list:
init_params = {
'query_id': 1609,
'query': '(作者单位:河北工程技术学院(模糊)',
# 'query_condition': {'year': str(y)},
'filters': [
dict(project="年度", value=f"{y}", text_or_title=f"{y}"),
]
}
yield process.crawl(CnkiArticleCrossdbSpider, **init_params)
process = CrawlerProcess(get_project_settings())
f(list(range(2021, 2022)))
process.start()
def starter_more_year():
@defer.inlineCallbacks
def f(years: list = None):
init_params = {
'query_id': 1611,
'query': '(作者单位:武昌首义学院(模糊)',
'filters': [
dict(project="年度", value=[f"{y}" for y in years], text_or_title=[f"{y}" for y in years]),
]
}
yield process.crawl(CnkiArticleCrossdbSpider, **init_params)
process = CrawlerProcess(get_project_settings())
f(list(range(2021, 2026)))
process.start()
def starter():
process = CrawlerProcess(get_project_settings())
process.crawl(CnkiArticleCrossdbSpider)
process.start()
if __name__ == '__main__':
starter_more_year()

@ -1,3 +0,0 @@
sqlalchemy~=1.3.24
scrapy~=2.13.3
itemadapter~=0.11.0

@ -1,13 +0,0 @@
# my_scrapy_project/models/base.py
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, DateTime
from datetime import datetime
Base = declarative_base()
class BaseModel(Base):
"""基础模型类"""
__abstract__ = True
id = Column(Integer, primary_key=True, autoincrement=True)
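# Hypothetical concrete model built on BaseModel (the table and column names are made up;
# only types already imported above are used):
class SearchTaskExample(BaseModel):
    __tablename__ = 'search_task_example'
    query_id = Column(Integer, index=True)
    created_at = Column(DateTime, default=datetime.now)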

@ -1,225 +0,0 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from __future__ import annotations
import json
from typing import TYPE_CHECKING
import redis
from scrapy import signals, Spider
from scrapy.exceptions import CloseSpider
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
if TYPE_CHECKING:
from scrapy.crawler import Crawler
from scrapy import Request
class ScienceArticleWosSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
async def process_start(self, start):
# Called with an async iterator over the spider start() method or the
# maching method of an earlier spider middleware.
async for item_or_request in start:
yield item_or_request
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class ScienceArticleWosDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class WosCookieMiddleware:
def __init__(self, redis_uri: str):
self.redis_cli = redis.from_url(redis_uri, decode_responses=True)
self.redis_key_prefix = 'cookie_pool:wos_sid'
self.cookiepool_cache_key = 'cookie_pool:wos:sid_q'
@classmethod
def from_crawler(cls, crawler: Crawler, *args, **kwargs):
settings = crawler.settings
middle = cls(
redis_uri=settings.get("REDIS_URL")
)
crawler.signals.connect(middle.open_spider, signal=signals.spider_opened)
crawler.signals.connect(middle.close_spider, signal=signals.spider_closed)
return middle
def open_spider(self, spider: Spider):
self.loading_sid_from_redis()
def close_spider(self, spider: Spider, reason: str = None):
self.del_sid_from_redis()
def process_request(self, request: Request, spider):
req_wos_sid = request.meta.get('wos_sid')
if not req_wos_sid:
sid = self.get_sid_from_redis()
if not sid:
raise CloseSpider(f"没有获取到sid即将退出")
# bind the fetched wos_sid to the request so parse callbacks can read it from meta
request.meta['wos_sid'] = sid
else:
sid = req_wos_sid
cookie_1 = {'dotmatics.elementalKey': 'SLsLWlMhrHnTjDerSrlG'}
headers = {
'authority': 'webofscience.clarivate.cn',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'no-cache',
'origin': 'https://webofscience.clarivate.cn',
'pragma': 'no-cache',
# 'referer': 'https://webofscience.clarivate.cn/wos/woscc/advanced-search',
}
request.cookies = cookie_1
if request.url.endswith('runQuerySearch'):
# search requests must carry the SID query parameter
request._set_url(request.url + "?SID=%s" % sid)
headers.update(
{'accept': 'application/x-ndjson', 'content-type': 'text/plain;charset=UTF-8'})
else:
headers.update(
{'accept': 'application/json, text/plain, */*', 'content-type': 'application/json',
'x-1p-wos-sid': sid})
for hk, hv in headers.items():
request.headers[hk] = hv
return None
def process_response(self, request, response, spider):
if response.status != 200:
self.mark_sid_status(request.meta.get('wos_sid'))
return response
def get_sid_from_redis(self):
val = self.redis_cli.rpoplpush(self.cookiepool_cache_key, self.cookiepool_cache_key)
if val:
self.redis_cli.hincrby(f'{self.redis_key_prefix}:{val}', 'used_times', 1)
return val
return None
def mark_sid_status(self, sid: str, status: str = 'validate'):
"""
:param sid:
:param status: validate/expired
:return:
"""
if status == "expired":
# expired: drop the sid's hash entirely
self.redis_cli.delete(f'{self.redis_key_prefix}:{sid}')
else:
self.redis_cli.hset(f'{self.redis_key_prefix}:{sid}', 'status', status)
def loading_sid_from_redis(self) -> list:
"""
Load every usable sid into a Redis list so sids can be taken from the cache queue
:return:
"""
valid_sid = []
keys = self.redis_cli.keys(f'{self.redis_key_prefix}:*')
for key in keys:
# read the whole hash for this sid
key_obj: dict = self.redis_cli.hgetall(key)
if key_obj.get("status") == "normal":
real_sid = key.rsplit(':', maxsplit=1)[-1]
valid_sid.append(real_sid)
self.redis_cli.lpush(self.cookiepool_cache_key, real_sid)
return valid_sid
def del_sid_from_redis(self):
self.redis_cli.delete(f'{self.cookiepool_cache_key}')
class A:
def __init__(self, redis_cli):
self.redis_cli = redis_cli
def load_keys(self, name):
return self.redis_cli.keys(r'cookie_pool:wos_sid:*')
def get_one_sid(self, name):
return self.redis_cli.rpoplpush(name, name)
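# Hypothetical snippet showing the Redis layout the middleware above expects: one hash per sid
# under cookie_pool:wos_sid:<sid> plus the rotation queue cookie_pool:wos:sid_q
# (the sid value and connection URL below are made up):
import redis

r = redis.from_url("redis://localhost:6379/10", decode_responses=True)
sid = "EXAMPLESID123"
r.hset(f"cookie_pool:wos_sid:{sid}", mapping={"status": "normal", "used_times": 0})
# loading_sid_from_redis() would then push the sid onto the rotation queue:
r.lpush("cookie_pool:wos:sid_q", sid)
# get_sid_from_redis() rotates the queue and bumps the usage counter:
print(r.rpoplpush("cookie_pool:wos:sid_q", "cookie_pool:wos:sid_q"))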

@ -1,409 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2025/12/16 15:24
# @Author : zhaoxiangpeng
# @File : cookie_manager.py
from __future__ import annotations
import json
from typing import TYPE_CHECKING, Generator
import logging
import time
import threading
from datetime import datetime, timedelta
from typing import Optional, Callable
import redis
import requests
from DrissionPage import Chromium
from science_article_wos.utils.xpath_cfg import Settings
if TYPE_CHECKING:
from DrissionPage import ChromiumPage, ChromiumOptions
from scrapy_drissionpage.response import DrissionResponse
from DrissionPage._pages.chromium_tab import ChromiumTab
from DrissionPage._units.listener import DataPacket, Response
VERIFY_ROUTER = "/api/wosnx/core/verify"
settings = Settings()
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
def get_self_ip():
"""获取当前IP地址"""
try:
resp = requests.get("https://www.httpbin.org/ip", timeout=10)
assert resp.status_code == 200
data = resp.json()
ipaddr = data['origin']
return ipaddr
except Exception as e:
logger.error(f"获取IP失败: {str(e)}")
return "unknown"
def intercept(self, listen, operation, callback, tab=None):
listen()
operation()
for packet in tab.listen.steps(count=3):
if not intercept_verify(packet):
continue
r = callback(packet)
if isinstance(r, Generator):
return r
else:
if isinstance(r, bool):
break
return
def intercept_verify(packet: DataPacket):
content = packet.response.body
if isinstance(content, bytes) and content.find(b'"Server.passiveVerificationRequired"') != -1:
return False
else:
return True
class DPOperations:
def __init__(self, browser, tab):
self.browser = browser
self.tab = tab
@staticmethod
def operate_cookie_first(tab):
# dismiss the cookie-preferences pop-up
logger.debug('Operating cookie first...')
ck_m_div = tab.ele('xpath://*[@id="onetrust-banner-sdk"]')
if ck_m_div:
ele = tab.ele('xpath://*[@id="onetrust-accept-btn-handler"]')
ele.click()
@staticmethod
def change_db(tab):
logger.info('Changing database...')
default_db_ele = tab.ele('xpath://*[@id="snSelectDb"]/button')
c1 = default_db_ele.raw_text
default_db_ele.click()
xpath = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science Core Collection"]'
tab.ele(
'xpath:%(xpath)s' % {"xpath": xpath}).click()
@staticmethod
def input_ops(tab, content=None, clear_input: bool = True):
logger.debug('Input operation...')
input_area_ele = tab.ele('xpath:%(xpath)s' % {"xpath": settings.QUERY_INPUT_ELE})
if clear_input:
input_area_ele.clear()  # clear the box
if content is None:
content = "(OG=(Shanghai Jiao Tong University)) AND PY=(2025)"
input_area_ele.input(content)  # type the search query
@staticmethod
def search_ops(tab):
logger.debug('Search operation...')
search_button_ele = tab.ele('xpath:%(xpath)s' % {"xpath": settings.SEARCH_BUTTON_ELE})
search_button_ele.click()
@staticmethod
def export_ops(tab, start: int = 1, end: int = 50):
tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_BUTTON_ELE}).click()  # click export
tab.ele('xpath:%(xpath)s' % {"xpath": settings.TABWIN_BUTTON_ELE}).click()  # choose the tab-delimited file option
# wait for the dialog
# switch the export format to full record and cited references
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_TYPE_SELECT_ELE}).click()
tab.ele('xpath:%(xpath)s' % {"xpath": settings.FULL_RECORD_REFERENCE_ELE}).click()
# enter the record range
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_RANGE_ELE}).click()  # switch to the range input
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_START_ELE}).input(start, clear=True)
tab.ele('xpath:%(xpath)s' % {"xpath": settings.RECORD_EXPORT_END_ELE}).input(end, clear=True)
# click export
tab.ele('xpath:%(xpath)s' % {"xpath": settings.EXPORT_FILE_ELE}).click.to_download(
# save_path=DOWNLOAD_PATH,
rename='%s.txt' % 'savedrecs'
)
def first_ops(self):
tab = self.tab
self.operate_cookie_first(tab)
self.change_db(tab)
self.input_ops(tab)
self.search_ops(tab)
def bypass_ops(self):
tab = self.tab
self.export_ops(tab)
class CookieManager:
def __init__(
self,
redis_uri: str = "redis://localhost:6379/0",
cookie_lifetime: int = 60 * 60 * 4,  # cookie lifetime
check_interval: int = 60,
keep_browser_alive: bool = True,
):
self.url = "https://webofscience.clarivate.cn/wos/woscc/advanced-search"
self.cookie_lifetime = cookie_lifetime
# Redis连接
self.redis_key_prefix = 'cookie_pool:wos_sid'
self.check_interval = check_interval
self.redis_client = redis.Redis.from_url(
redis_uri,
decode_responses=True
)
logger.info(f"Redis连接成功: {redis_uri}")
self.dp_ins: DPOperations = None
self.first = True
# 浏览器实例
self.browser = None
self.tab = None
self.keep_browser_alive = keep_browser_alive
self.current_sid = None
self.bypass_ok_tag = False
# 控制标志
self._running = False
self._monitor_thread = None
def start_browser(self):
"""启动浏览器"""
if self.browser is None:
logger.info("启动浏览器...")
self.browser = Chromium()
self.tab = self.browser.latest_tab
logger.info("浏览器启动成功")
self.dp_ins = DPOperations(self.browser, self.tab)
def close_browser(self):
"""关闭浏览器"""
if self.browser:
logger.info("关闭浏览器...")
self.browser.quit()
self.browser = None
self.tab = None
logger.info("浏览器已关闭")
def refresh_page(self):
try:
logger.info("正在刷新页面")
if self.tab:
self.tab.refresh()
except Exception as e:
logger.error(f"正在刷新页面: {str(e)}")
def intercept_verify(self, op_func: Callable[[], None]):
"""
Every refresh or XHR operation must listen on the hCaptcha verification endpoint
:return:
"""
logger.debug("监听 %s" % VERIFY_ROUTER)
self.tab.listen.start(VERIFY_ROUTER, method="POST") # 开启监听
op_func() # 指定操作方法
verify_count = 0
for packet in self.tab.listen.steps(count=3, timeout=60):
verify_count += 1
if self.verify_hook(packet):
# a successful verification returns here, so the fallback below is not triggered
return
if verify_count:
logger.warning("获取失败")
else:
logger.info("没有触发验证, cookie有效")
self.sid2redis()
@staticmethod
def get_wos_sid_from_localstorage(tab):
s = tab.local_storage('wos_sid')
sid = s.strip('"')
return sid
def get_cookie_from_browser(self):
try:
if self.tab is None:
self.start_browser()
if self.first:
logger.info(f"第一次访问页面: {self.url}")
self.tab.get(self.url)
time.sleep(3)  # wait for the page to load
# run the scripted page operations
self.intercept_verify(op_func=self.dp_ins.first_ops)
time.sleep(2)
self.sid2redis()
except Exception as e:
logger.error(e)
def save_cookie_to_redis(self, wos_sid: str):
try:
current_time = datetime.now()
expired_time = current_time + timedelta(seconds=self.cookie_lifetime)
ip = get_self_ip()
cookie_data = {
'ip': ip,
'status': 'normal',
'generated_time': current_time.isoformat(),
'expired_time': expired_time.isoformat(),
'used_times': 0
}
self.redis_client.hset(
name=f'{self.redis_key_prefix}:{wos_sid}',
mapping=cookie_data
)
logger.info(f"Cookie已保存到Redis: {self.redis_key_prefix}:{wos_sid}")
except Exception as e:
logger.error(f"保存cookie到Redis失败: {str(e)}")
def sid2redis(self):
"""
Store the current sid in Redis
:return:
"""
wos_sid = self.get_wos_sid_from_localstorage(self.tab)
if wos_sid:
logger.info("保存 %s 到redis..." % wos_sid)
self.current_sid = wos_sid
self.save_cookie_to_redis(wos_sid)
def verify_hook(self, packet: DataPacket):
verified_tag = 'verified'
request_url = packet.request.url
verify_success = False
if request_url.find(VERIFY_ROUTER) != -1:  # the verification endpoint was hit
logger.debug(f"正在验证: {request_url}\n"
f"请求body: {packet.request.postData}")
response_body = packet.response.body
if isinstance(response_body, bytes):
verify_success = packet.response.body.find(verified_tag.encode()) != -1
elif isinstance(response_body, str):
verify_success = packet.response.body.find(verified_tag) != -1
elif isinstance(response_body, dict):
verify_success = response_body.get('key') == verified_tag
elif isinstance(response_body, list) and len(response_body) > 0:
verify_success = response_body[0].get('key') == verified_tag
else:
raise TypeError("未知的response_body类型")
if verify_success:
logger.info(f"验证成功: {request_url}")
return True
else:
return False
else:
logger.info("无需验证")
return True
def check_cookie_status(self, sid: str = None, default_status: str = "expired"):
if sid is None:
sid = self.current_sid
if not sid:
return default_status
status = self.redis_client.hget(name=f'{self.redis_key_prefix}:{sid}', key='status')
return status
def monitor_loop(self):
"""
Monitoring loop: periodically check the cookie status and react to it.
"""
logger.info(f"开始监控cookie检查间隔: {self.check_interval}")
while self._running:
try:
status = self.check_cookie_status()
if status == "validate":
logger.warning("cookie使用次数超限/需要验证,准备进行验证。。。")
# verification path: run a single export to pass the check
self.intercept_verify(op_func=self.dp_ins.bypass_ops)
elif status == "expired":
logger.warning("cookie已过期准备重新获取。。。")
# refresh the page or re-run the search/export
self.intercept_verify(op_func=self.refresh_page)
else:
logger.info(f"Cookie状态正常: {status}")
# 等待下次检查
time.sleep(self.check_interval)
except Exception as e:
logger.error(e)
def start_monitor(self):
if self._running:
logger.warning("监控已在运行中")
return
if self.browser is None:
self.start_browser()
# 首次获取cookie
logger.info("首次获取cookie...")
self.get_cookie_from_browser()
if self.current_sid:
logger.error("首次获取cookie成功")
else:
logger.error("首次获取cookie失败")
if not self.keep_browser_alive:
self.close_browser()
return
# 如果不需要浏览器保活,关闭
if not self.keep_browser_alive:
self.close_browser()
# 启动监控线程
self._running = True
# self._monitor_thread = threading.Thread(
# target=self.monitor_loop,
# name="CookieMonitorThread",
# daemon=True
# )
# self._monitor_thread.start()
self.monitor_loop()
logger.info("监控已启动")
def stop_monitor(self):
"""停止监控"""
if not self._running:
logger.warning("监控未在运行")
return
logger.info("正在停止监控...")
self._running = False
if self._monitor_thread:
self._monitor_thread.join(timeout=5)
self.close_browser()
logger.info("监控已停止")
def main():
manager = CookieManager(redis_uri="redis://:kcidea1509@192.168.1.211:6379/10", keep_browser_alive=True)
try:
manager.start_monitor()
# 主程序运行
logger.info("Cookie管理器正在运行按Ctrl+C停止...")
except KeyboardInterrupt:
logger.info("收到停止信号")
# manager.close_browser()
finally:
manager.stop_monitor()
if __name__ == '__main__':
main()

@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@ -1,110 +0,0 @@
from typing import Any, List, Union
from datetime import datetime
import scrapy
from scrapy import signals
from scrapy.http import Response
from scrapy.http.request.json_request import JsonRequest
from .database import DatabaseSpider
from science_article_wos.items import WosArticleItem, WosCitedNumberItem, WosIdRelationItem
from science_article_wos.scripts.wos_parse_data import parse_full_records
from science_article_wos.utils import model
from science_article_wos.utils import tools
from science_article_wos.utils import config
def maybe_list(val: Union[int, List[int]]) -> List[int]:
if isinstance(val, int):
return [val]
return list(val)
class DownloadBySearchRecordSpider(DatabaseSpider):
name = "download_by_search_record"
custom_settings = dict(
DOWNLOADER_MIDDLEWARES={
"science_article_wos.middlewares.WosCookieMiddleware": 500
},
# ITEM_PIPELINES={
# "science_article_wos.pipelines.MongoPipeline": 300,
# },
REDIS_URL='redis://:kcidea1509@192.168.1.211:6379/10',
LOG_LEVEL="INFO"
)
    def spider_opened(self, spider):
        if self.record_id is None:
            # pull a pending task from the database
            from science_article_wos.dao.database.connection import DatabaseManager
            from science_article_wos.dao.models.search_record import SearchRecord
            db_url = ""
            db_manager = DatabaseManager(db_url)
            with db_manager.session_scope() as session:
                record = session.query(SearchRecord).filter_by(state="pending").first()
                if record:
                    print(f"Found record: {record}")
                    self.record_id = record.record_id
                    self.records_found = record.records_found
                    self.mark_from = record.mark_from
                    self.mark_to = record.mark_to
                    self.shard = record.shard
def __init__(self, record_id: str = None, mark_from: int = 1, mark_to: int = 500, shard: str | int = None, records_found: int = None, **kwargs):
super().__init__()
self.record_id = record_id
self.records_found = records_found
self.mark_from = mark_from
self.mark_to = mark_to
self.shard = shard
self.task_id = None
self.org_id = None
self.query_id = None
self.bind_relation_enable = False
self.bind_relation_d = None
if self.bind_relation_enable:
self.build_relation()
    def build_relation(self):
        bind_relation_d = dict()
        if self.task_id: bind_relation_d.setdefault("task_ids", maybe_list(self.task_id))
        if self.org_id: bind_relation_d.setdefault("school_ids", maybe_list(self.org_id))
        if self.query_id: bind_relation_d.setdefault("query_ids", maybe_list(self.query_id))
        self.bind_relation_d = bind_relation_d
        return bind_relation_d
async def start(self):
query_id = self.record_id
records_found = self.records_found
mark_start = self.mark_from
mark_end = self.mark_to
yield JsonRequest(config.WOS_EXPORT_FILE_API, method='POST',
data=model.export_search_data_to_txt(query_id, mark_from=mark_start,
mark_to=mark_end),
callback=self.download_parse)
    def download_parse(self, response: Response, **kwargs: Any) -> Any:
        parse_count = 0
        batch_time = datetime.now()
        records = parse_full_records(response.body)
        for data_dic in records:
            t_id = data_dic.pop('ut', None)
            if t_id:
                parse_count += 1
                article_item = WosArticleItem()
                article_item['third_id'] = t_id
                article_item['exported'] = data_dic
                article_item['updated_at'] = batch_time
                yield article_item
                # parse the times-cited count
                if cited_num := tools.str2int(data_dic.get("tc", 0), 0):
                    cited_item = WosCitedNumberItem()
                    cited_item['third_id'] = t_id
                    cited_item['cited'] = cited_num
                    cited_item['updated_at'] = batch_time
                    yield cited_item
                if self.bind_relation_enable and self.bind_relation_d:
                    # bind id relations only when relation binding is enabled
                    relation_item = WosIdRelationItem()
                    relation_item['third_id'] = t_id
                    relation_item.update(**self.bind_relation_d)
                    yield relation_item
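
The MongoPipeline wired up (but commented out) in custom_settings is not part of this diff. A minimal sketch of what such a pipeline could look like, reusing the collection-name patterns from config.py; the Mongo URI, database name, and routing logic are assumptions:

import pymongo
from itemadapter import ItemAdapter

class MongoPipelineSketch:
    """Illustrative only; the repo's real MongoPipeline is not shown here."""
    def open_spider(self, spider):
        self.client = pymongo.MongoClient("mongodb://localhost:27017")  # assumed URI
        self.db = self.client["science_article"]  # assumed database name

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        # route by item shape and upsert by third_id so re-downloads update
        # existing documents instead of duplicating them
        collection = "data_wos_article" if "exported" in adapter else "relation_cited_number_wos"
        self.db[collection].update_one(
            {"third_id": adapter["third_id"]},
            {"$set": adapter.asdict()},
            upsert=True,
        )
        return item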

@ -1,96 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2024/1/16 8:41
# @Author : zhaoxiangpeng
# @File : config.py
from datetime import datetime
# data source name
SOURCE_NAME = 'wos'
WOS_SEARCH_API = "https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch"
WOS_DETAIL_LINK = 'https://webofscience.clarivate.cn/wos/woscc/full-record/{wos_id}'
WOS_DETAIL_API = 'https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch'
WOS_ADVANCED_SEARCH_API = 'https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch'
WOS_EXPORT_FILE_API = 'https://webofscience.clarivate.cn/api/wosnx/indic/export/saveToFile'
WOS_RECORD_STREAM_API = "https://webofscience.clarivate.cn/api/wosnx/core/runQueryGetRecordsStream"
WOS_REFINE_API = "https://webofscience.clarivate.cn/api/wosnx/core/runQueryRefine"
# WOS starter api
WOS_STARTER_DOCUMENT_UID_API = "https://api.clarivate.com/apis/wos-starter/v1/documents/{uid}" # Unique Identifier/Accession Number
WOS_STARTER_DOCUMENT_API = "https://api.clarivate.com/apis/wos-starter/v1/documents"
WOS_STARTER_PER_PAGE_LIMIT = 50  # per-page record limit
# WOS lite api
WOS_LITE_QUERY_FIRST_API = 'https://wos-api.clarivate.com/api/woslite'  # first request; the response carries a query id
WOS_LITE_QUERY_API = 'https://wos-api.clarivate.com/api/woslite/query'  # page through results using that query id
# publications collection
WOS_ARTICLE_COLLECTION = 'data_{}_article'.format(SOURCE_NAME)
# times-cited collection
WOS_CITED_NUMBER_COLLECTION = "relation_cited_number_{}".format(SOURCE_NAME)
# school publication-relation collection
SCHOOL_RELATION_COLLECTION = 'relation_school_{}'.format(SOURCE_NAME)
# references collection
WOS_REFERENCE_COLLECTION = "relation_reference_{}".format(SOURCE_NAME)
# pending-download id collection
ARTICLE_TODO_IDS_COLLECTION = "todo_ids_{}".format(SOURCE_NAME)
# publications collection for CSCD-sourced records
WOS_CSCD_ARTICLE_COLLECTION = 'data_{}_article_{}'.format(SOURCE_NAME, 'cscd')
# cookie pool configuration
# COOKIE_POOL_CONFIG = dict(host=setting.REDIS_HOST, port=6379, db=setting.REDIS_DB, password=setting.REDIS_PASSWORD)
COOKIE_POOL_GROUP = 'cookies_pool:wos:sid*'
COOKIE_POOL_KEY = 'cookies_pool:wos:sid-sjtu'
COOKIE_TTL = 60 * 60 * 4
# max records per downloaded file
BATCH_DOWNLOAD_LIMIT = 500
# default record filter when exporting files
DEFAULT_EXPORT_RECORD_FILTER = "fullRecordPlus"  # fullRecordPlus
# table-header validation: expected leading bytes of an exported file
SUCCESS_TABLE_HEAD_START = b'\xef\xbb\xbfPT'
LOST_TABLE_HEAD_START = b'\xef\xbb\xbfnull'
AUTO_TABLE_HEAD_START = b'\xef\xbb\xbfPT\tAU\tBA\tBE\tGP\tAF\tBF\tCA\tTI\tSO\tSE\tBS\tLA\tDT\tCT\tCY\tCL\tSP\tHO\tDE\tID\tAB\tC1\tC3\tRP\tEM\tRI\tOI\tFU\tFP\tFX\tCR\tNR\tTC\tZ9\tU1\tU2\tPU\tPI\tPA\tSN\tEI\tBN\tJ9\tJI\tPD\tPY\tVL\tIS\tPN\tSU\tSI\tMA\tBP\tEP\tAR\tDI\tDL\tD2\tEA\tPG\tWC\tWE\tSC\tGA\tPM\tOA\tHC\tHP\tDA\tUT\r\n'
CORE_NAME_TABLE = dict(
WOSCC="Web of Science Core Collection",
BCI="BIOSIS Citation Index",
SCIELO="SciELO Citation Index",
RSCI="Russian Science Citation Index",
CSCD="Chinese Science Citation Database℠",
ARCI="Arabic Citation Index",
DIIDW="Derwent Innovations Index",
PPRN="",
PQDT="ProQuest ™ Dissertations & Theses Citation Index"
)
NAV_NAME_TABLE = dict(
SCI="Science Citation Index Expanded (SCI-Expanded)",
ESCI="Emerging Sources Citation Index (ESCI)",
SSCI="Social Sciences Citation Index (SSCI)",
ISTP="Conference Proceedings Citation Index Science (CPCI-S)",
BSCI="Book Citation Index Science (BKCI-S)",
AHCI="Arts & Humanities Citation Index (A&HCI)",
IC="Index Chemicus (IC)",
ISSHP="Conference Proceedings Citation Index Social Sciences & Humanities (CPCI-SSH)"
)
TASK_CONFIG = {
"school_id": 83,
"school_name": "北京林业大学",
"search_policy": """OG=(Beijing Forestry University)""",
"crawl_year": [2021, 2022, 2023],
"source_type": 1,
"priority": 10,
"is_important": 1,
"update_interval": 60 * 60 * 24 * 14,
"create_time": datetime.now(),
"last_time": datetime.now(),
"next_time": datetime.now(),
"state": 0
}
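
The three *_TABLE_HEAD_START constants above imply a byte-prefix check on exported files. The repo's actual validator is not shown in this diff, but a sketch consistent with those constants might be:

def classify_export_head(raw: bytes) -> str:
    """Classify an exported file by its leading bytes (illustrative helper)."""
    if raw.startswith(SUCCESS_TABLE_HEAD_START):
        return "ok"       # UTF-8 BOM followed by the 'PT' tab header
    if raw.startswith(LOST_TABLE_HEAD_START):
        return "lost"     # BOM + 'null': the export returned no records
    return "unknown"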

@ -1,59 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2025/12/11 13:56
# @Author : zhaoxiangpeng
# @File : crawl_article_by_qid.py
import math
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_wos.spiders.download_by_search_record import DownloadBySearchRecordSpider
BATCH_DOWNLOAD_LIMIT = 500
def f(record_id: str, records_found: int, shard_count: int = None):
    """Split a result set of `records_found` records into download shards of
    at most BATCH_DOWNLOAD_LIMIT records, yielding one task dict per shard."""
    mark_start = 1
    mark_end = 0
    idx = 0
    shard_count = shard_count or math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)
    for i in range(shard_count):
        idx += 1
        mark_end += BATCH_DOWNLOAD_LIMIT
        if mark_end > records_found:
            mark_end = records_found
        yield dict(
            record_id=record_id,
            mark_from=mark_start, mark_to=mark_end,
            shard=idx, shard_count=shard_count,
            records_found=records_found
        )
        mark_start += BATCH_DOWNLOAD_LIMIT
def ready():
    """
    Insert the pending crawl tasks into the database.
    :return:
    """
    RECORDS_FOUND = 1486
def test_starter():
init_params = dict(
record_id='68ce1627-b4c3-4938-adcb-476c7dcde004-0192d3c012',
mark_from=1, mark_to=50,
shard=1, shard_count=51,
records_found=25256
)
process = CrawlerProcess(get_project_settings())
process.crawl(DownloadBySearchRecordSpider, **init_params)
process.start()
def starter():
process = CrawlerProcess(get_project_settings())
process.crawl(DownloadBySearchRecordSpider)
process.start()
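
As a worked example of the sharding arithmetic in f(): the 1,486 records referenced in ready() split into ceil(1486 / 500) = 3 shards (the record id below is hypothetical):

for shard in f('example-record-id', 1486):
    print(shard['shard'], shard['mark_from'], shard['mark_to'])
# -> 1 1 500
# -> 2 501 1000
# -> 3 1001 1486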

@ -1,70 +0,0 @@
# -*- coding: utf-8 -*-
# @Time : 2025/12/15 16:47
# @Author : zhaoxiangpeng
# @File : search_records_orm.py
import math
from science_article_wos.dao.database.connection import DatabaseManager
from science_article_wos.dao.models.search_record import SearchRecord
BATCH_DOWNLOAD_LIMIT = 500
def f(record_id: str, records_found: int, shard_count: int = None):
mark_start = 1
mark_end = 0
idx = 0
shard_count = shard_count or math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)
for i in range(shard_count):
idx += 1
mark_end += BATCH_DOWNLOAD_LIMIT
if mark_end > records_found:
mark_end = records_found
yield dict(
record_id=record_id,
mark_from=mark_start, mark_to=mark_end,
shard=idx, shard_count=shard_count,
records_found=records_found
)
mark_start += BATCH_DOWNLOAD_LIMIT
if __name__ == "__main__":
    # choose a connection string for your database
    # MySQL
    db_url = "mysql+pymysql://root:admin000@localhost/crawler"
    # SQLite
    # db_url = "sqlite:///search_records.db"
    # initialise the database manager
    db_manager = DatabaseManager(db_url)
    # create the tables
    db_manager.create_tables()
    # usage example
    with db_manager.session_scope() as session:
        # search_record_id = "02f30273-1342-4d61-9e51-c1ea1f5b2423-0190efdd10"
        # for d in f(search_record_id, 10641):
        #     # create a new record
        #     new_record = SearchRecord(
        #         **d
        #     )
        #
        #     session.add(new_record)
        #     print(f"Record added: {new_record}")
        # session.commit()
        # query a pending record
        record = session.query(SearchRecord).filter_by(state="pending").first()
        if record:
            print(f"Found record: {record}")
        # update the record
        if record:
            record.state = "processing"
            record.reason = "processing data"
            session.commit()
            print(f"Record updated: {record}")

@ -3,25 +3,10 @@
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
from datetime import datetime
import scrapy
class WosItem(scrapy.Item):
    # fields shared by every WoS item
    third_id = scrapy.Field()
    updated_at = scrapy.Field()

class WosArticleItem(WosItem):
    """WoS publication item"""
    exported = scrapy.Field()

class WosCitedNumberItem(WosItem):
    """times-cited item; third_id and updated_at are inherited from WosItem"""
    cited = scrapy.Field()
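
WosIdRelationItem is imported by download_by_search_record.py but does not appear in this hunk. Judging from the keys the spider sets on it, a sketch might be:

class WosIdRelationItemSketch(WosItem):
    """Sketch only; field set inferred from the spider's bind_relation_d keys."""
    task_ids = scrapy.Field()
    school_ids = scrapy.Field()
    query_ids = scrapy.Field()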
