cnki: collect citation counts

main
zhaoxiangpeng 3 weeks ago
parent 9a29a8ace7
commit 6c0d732877

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
# @Time : 2024/5/9 11:11
# @Author : zhaoxiangpeng
# @File : config.py
# Data source name
SOURCE_NAME = 'cnki'
# Journal navigator
# Home page
CNKI_JOURNAL_NAVIGATOR_INDEX = 'https://navi.cnki.net/knavi/journals/index?uniplatform=NZKPT'
# Search API
CNKI_JOURNAL_NAVIGATOR_SEARCH_API = 'https://navi.cnki.net/knavi/journals/searchbaseinfo'
# Legacy xls export API (exports by filename)
CNKI_EXPORT_XLS_OLD_API = 'https://kns.cnki.net/dm/manage/FileToText'
# xls export API
CNKI_EXPORT_XLS_API = 'https://kns.cnki.net/dm8/FileToText'
# Journal detail page
CNKI_JOURNAL_DETAIL = 'https://navi.cnki.net/knavi/journals/{journal_no}/detail?uniplatform=NZKPT'
# API on the journal detail page that returns the year/issue list
CNKI_JOURNAL_ISSUE = 'https://navi.cnki.net/knavi/journals/{journal_no}/yearList' # ZDJY
# API on the journal detail page that returns the article list for a given year/issue
CNKI_JOURNAL_ISSUE_ARTICLE = 'https://navi.cnki.net/knavi/journals/{journal_no}/papers'
# Article detail page
CNKI_ARTICLE_DETAIL = 'https://kns.cnki.net/kcms/detail/detail.aspx?dbcode={db_code}&filename={article_id}'
# -- legacy API
CNKI_ADV_SEARCH_API = 'https://kns.cnki.net/kns8s/brief/grid'
# Request headers for search
SEARCH_HEADERS = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'Ecp_notFirstLogin=qkFgu9; Ecp_ClientId=o240823084800102418; Ecp_loginuserbk=SJTU; cnkiUserKey=eef4d3aa-1096-bc9e-dff0-74349179c2cc; Ecp_ClientIp=111.186.52.67; UM_distinctid=19366f14e7a832-0f92ef85a35cb5-26001051-1fa400-19366f14e7c14f2; Hm_lvt_dcec09ba2227fd02c55623c1bb82776a=1734079899; Ecp_session=1; SID_kns_new=kns018104; SID_sug=018104; knsLeftGroupSelectItem=; updatetime-advInput=2024-12-19+17%3A42%3A08; knsadv-searchtype=%7B%22BLZOG7CK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22MPMFIG1A%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22T2VC03OH%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JQIRZIYA%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22S81HNSV3%22%3A%22gradeSearch%22%2C%22YSTT4HG0%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22ML4DRIDX%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WQ0UVIAA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22VUDIXAIY%22%3A%22gradeSearch%2CmajorSearch%22%2C%22NN3FJMUV%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22LSTPFY1C%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HHCPM1F8%22%3A%22gradeSearch%2CmajorSearch%22%2C%22OORPU5FE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WD0FTY92%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22BPBAFJ5S%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22EMRPGLPA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22PWFIRAGL%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22U8J8LYLV%22%3A%22gradeSearch%2CmajorSearch%22%2C%22R79MZMCB%22%3A%22gradeSearch%22%2C%22J708GVCE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22HR1YT1Z9%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JUP3MUPD%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NLBO1Z6R%22%3A%22gradeSearch%2CmajorSearch%22%2C%22RMJLXHZ3%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%221UR4K4HZ%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NB3BWEHK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22XVLO76FD%22%3A%22gradeSearch%2CmajorSearch%22%7D; createtime-advInput=2024-12-20%2014%3A37%3A03; LID=WEEvREcwSlJHSldSdmVpanJGNW9JQS9sbkNrOUFycHJkRzF3eXgyTGlWbz0=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!; Ecp_LoginStuts={"IsAutoLogin":false,"UserName":"SJTU","ShowName":"%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6","UserType":"bk","BUserName":"","BShowName":"","BUserType":"","r":"qkFgu9","Members":[]}; KNS2COOKIE=1734680479.883.14106.830885|b25e41a932fd162af3b8c5cff4059fc3; dblang=both; c_m_LinID=LinID=WEEvREcwSlJHSldSdmVpanJGNW9JQS9sbkNrOUFycHJkRzF3eXgyTGlWbz0=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!&ot=12%2F20%2F2024%2016%3A01%3A27; c_m_expire=2024-12-20%2016%3A01%3A27; tfstk=gnXZLQYMKRewdgBaoHvqL9aIUYp9sd45ntTXmijDfFYG5iTcTZbBCGsccx-D-NdjCxY18pQRVAC_6ITq0dBC1xT_WKScPKz7P8w5XGpynzaShW0gBdKqnncilpDHmK-i1ZwdGGpvnyaM9UCdXabz7TCMnkJH4ncDnxYMtk-6qKDMiAcn-eKDnKADjDYH4nmioAYgYMYpDKxcoCcmtGjmL3Og25LCsWPKUCYljekmU0KHslSnGAMsnhA9rBxrnH6ebC8ljOHkrv-hd9RWOmayKgCCSHJz3vvwaOBytO4K3BQ2-IWMh0kcYNshNIWgD5IF3FRlIBoS3dIpmZAV9zkWbd1eaO5TD2jGPF5kBiiz5MRPTQKHtmlMC_s5HQXgQ4LBwn7y4NuN4DuvxG5lH1umgCxpYUZUY7E40mtBH0LEMjdHeH87fhGxMCxpYUZUYjhvteKePlt1.; searchTimeFlags=1; updatetime-advInput=2024-12-19+17%3A42%3A08',
'Origin': 'https://kns.cnki.net',
'Referer': 'https://kns.cnki.net/kns8s/AdvSearch?crossids=YSTT4HG0%2CLSTPFY1C%2CJUP3MUPD%2CMPMFIG1A%2CWQ0UVIAA%2CBLZOG7CK%2CPWFIRAGL%2CEMRPGLPA%2CNLBO1Z6R%2CNN3FJMUV',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
}
# Request headers for the journal navigator page
JOURNAL_NAVIGATOR_HEADERS = {
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': 'https://navi.cnki.net',
'Referer': 'https://navi.cnki.net/knavi/journals/index?uniplatform=NZKPT',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
'uniplatform': 'NZKPT',
}
# MongoDB collection config
# Redis key for cached anti-bot (risk-control) parameters
FEC_REDIS_KEY = "cookies_pool:cnki:crypt"
FEC_REDIS_TTL = 3600
# Cookies for detail pages
COOKIES_REDIS_KEY = "cookies_pool:cnki:detail_cookies"
# Journal info collection
CNKI_JOURNAL_INFO_COLLECTION = 'task_journal_info_{}'.format(SOURCE_NAME)
# Journal year/volume/issue task collection
CNKI_JOURNAL_ISSUE_COLLECTION = 'task_journal_issue_{}'.format(SOURCE_NAME)
# Journal article collection
CNKI_JOURNAL_ARTICLE_COLLECTION = 'data_{}_article'.format(SOURCE_NAME)
# IDs pending download
CNKI_ARTICLE_TODO_IDS_COLLECTION = 'todo_ids_{}'.format(SOURCE_NAME)
# Detail-page IDs pending download
CNKI_ARTICLE_DETAIL_TODO_IDS_COLLECTION = 'todo_ids_cnki_detail'
# Article author-affiliation relation
CNKI_ARTICLE_AUTHOR_ORG_COLLECTION = "relation_author_org_cnki"
# Article-school relation collection
SCHOOL_RELATION_COLLECTION = 'relation_school_{}'.format(SOURCE_NAME)
# Collections used by the Chinese journal list check
CHECK_JOURNAL_INFO_TABLE = "check_journal_info_{}".format(SOURCE_NAME)  # info table
CHECK_JOURNAL_MIDDLE_TABLE = "check_journal_middle_{}".format(SOURCE_NAME)  # intermediate task table
CHECK_JOURNAL_ISDOWN_TABLE = "check_journal_isdown_{}".format(SOURCE_NAME)  # result table
# Header row of the exported xls file
TABLE_HEAD = ['SrcDatabase-来源库', 'Title-题名', 'Author-作者', 'Organ-单位', 'Source-文献来源', 'Keyword-关键词', 'Summary-摘要', 'PubTime-发表时间', 'FirstDuty-第一责任人', 'Fund-基金', 'Year-年', 'Volume-卷', 'Period-期', 'PageCount-页码', 'CLC-中图分类号', 'ISSN-国际标准刊号', 'URL-网址', 'DOI-DOI']
# Corresponding JSON field names
TABLE_HEAD_EN = ['src_db', 'title', 'author', 'org', 'journal', 'keyword', 'abstract', 'pub_time', 'first_duty', 'fund', 'year', 'volum', 'issue', 'page', 'classification_code', 'issn', 'url', 'doi']
# Number of records per download request
BATCH_DOWNLOAD_LIMIT = 50
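# --- Illustrative usage (a sketch, not part of the committed config) ---
# The detail/issue URL templates above take a journal number, e.g. the 'ZDJY' code noted
# next to CNKI_JOURNAL_ISSUE:
#   CNKI_JOURNAL_DETAIL.format(journal_no='ZDJY')
#   -> 'https://navi.cnki.net/knavi/journals/ZDJY/detail?uniplatform=NZKPT'
#   CNKI_JOURNAL_ISSUE.format(journal_no='ZDJY')
#   -> 'https://navi.cnki.net/knavi/journals/ZDJY/yearList'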

@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
# @Time : 2024/5/13 16:53
# @Author : zhaoxiangpeng
# @File : extract_rule.py
# Extract the ISSN number
ISSN_REGEX_PATTERN = r'ISSN(\d{4}-[\dX]{4})'
# Extract the CN number, see https://baike.baidu.com/item/%E5%9B%BD%E5%86%85%E7%BB%9F%E4%B8%80%E5%88%8A%E5%8F%B7/386463
CN_REGEX_PATTERN = r'CN(\d{2}-\d{4}/?[A-Z]?)'
# Special characters to strip/replace in titles
DEL_TITLE_SYMBOL_PATTERN = '[!"#$%&\'()*+,-.·/:;<=>—?@,。?★、…()【】《》?“”‘’![\\]^_`{|}~\s]+'
# Same pattern, reused for stripping special characters from source names
DEL_SOURCE_SYMBOL_PATTERN = DEL_TITLE_SYMBOL_PATTERN
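# --- Illustrative usage (a sketch, not part of the committed module; example strings are made up) ---
#   import re
#   re.search(ISSN_REGEX_PATTERN, 'ISSN1000-0054').group(1)   # -> '1000-0054'
#   re.search(CN_REGEX_PATTERN, 'CN11-2223/N').group(1)       # -> '11-2223/N'
#   re.sub(DEL_TITLE_SYMBOL_PATTERN, '', '深度学习:现状与展望!')  # -> '深度学习现状与展望'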

@ -0,0 +1,87 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Optional, Dict, Tuple
from pymongo import MongoClient
from pymongo import UpdateOne
from pymongo.errors import DuplicateKeyError, BulkWriteError
if TYPE_CHECKING:
from pymongo.database import Database
from pymongo.collection import Collection
from pymongo.results import InsertManyResult, BulkWriteResult
def build_update_query(update_data: dict, replace: bool = True) -> dict:
"""
    If replace is True, every field is written with $set (overwriting the existing document's
    fields); otherwise list values are accumulated with $addToSet/$each and scalars use $set.
"""
update_query = {}
if not update_data:
return {}
for key, val in update_data.items():
if replace:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
else:
if isinstance(val, list):
update_query.setdefault(
"$addToSet", {}
).update({
key: {"$each": val}
})
else:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
return update_query
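# Illustrative outputs of the two modes (a sketch, not part of the module):
#   build_update_query({"cited": 12, "task_ids": ["t1"]}, replace=True)
#   -> {"$set": {"cited": 12, "task_ids": ["t1"]}}
#   build_update_query({"cited": 12, "task_ids": ["t1"]}, replace=False)
#   -> {"$set": {"cited": 12}, "$addToSet": {"task_ids": {"$each": ["t1"]}}}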
def update_document(filter_query: dict = None, update_data: dict = None, replace: bool = True) -> Tuple[dict, dict]:
    """
    Build a (filter, update) pair for an UpdateOne; the update document is built
    the same way as build_update_query above.
    """
    if not update_data:
        return {}, {}
    return filter_query, build_update_query(update_data, replace=replace)
class MongoDBUtils:
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
self.client: MongoClient = None
self.db: Database = None
def _insert2db(self, items, tablename, ordered: bool = False, **kwargs) -> InsertManyResult:
collection: Collection = self.db.get_collection(tablename)
result: InsertManyResult = collection.insert_many(items, ordered=ordered, **kwargs)
return result
def _update2db(self, items, tablename, ordered: bool = False, **kwargs) -> BulkWriteResult:
collection: Collection = self.db.get_collection(tablename)
bulk_results: BulkWriteResult = collection.bulk_write(items, ordered=ordered, **kwargs)
return bulk_results
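# Illustrative bulk-update usage (a sketch, not part of the module; the id value is made up
# and `utils` is assumed to be a MongoDBUtils whose client/db were opened, as MongoPipeline does):
#   filter_q, update_q = update_document({"third_id": "XXXX202401001"}, {"cited": 12})
#   ops = [UpdateOne(filter_q, update_q, upsert=True)]
#   result = utils._update2db(ops, tablename="relation_cited_number_cnki")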

@ -0,0 +1,40 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ScienceArticlCnkiItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
class AddItemBase(scrapy.Item):
third_id = scrapy.Field()
updated_at = scrapy.Field()
class ArticleItem(AddItemBase):
exported = scrapy.Field()
class IdRelationItem(AddItemBase):
query_ids = scrapy.Field()
school_ids = scrapy.Field()
task_ids = scrapy.Field()
class ArticleCitedItem(AddItemBase):
cited = scrapy.Field()
class CnkiCitedNumberItem(ArticleCitedItem):
    """Item for an article's citation count."""
    __tablename__ = 'relation_cited_number_cnki'
third_id = scrapy.Field()
cited = scrapy.Field()
updated_at = scrapy.Field()
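# Illustrative routing (a sketch, not part of the module): MongoPipeline._get_item_type reads
# __tablename__, so a CnkiCitedNumberItem is written to the 'relation_cited_number_cnki' collection:
#   item = CnkiCitedNumberItem(third_id='XXXX202401001', cited=3, updated_at=datetime.now())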

@ -9,7 +9,7 @@ from scrapy import signals
from itemadapter import ItemAdapter
class ScienceArticleAddSpiderMiddleware:
class ScienceArticlCnkiSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@ -53,7 +53,7 @@ class ScienceArticleAddSpiderMiddleware:
spider.logger.info("Spider opened: %s" % spider.name)
class ScienceArticleAddDownloaderMiddleware:
class ScienceArticlCnkiDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@ -100,14 +100,17 @@ class ScienceArticleAddDownloaderMiddleware:
spider.logger.info("Spider opened: %s" % spider.name)
class WosLiteApiXkeyDownloaderMiddleware:
async def process_request(self, request, spider):
key_param = {
'X-ApiKey': '941a216f25cbef0f80ee4ba58a08ef1e19dee7a4'
}
if not request.headers:
request.headers = key_param
return request
from scrapy.http.headers import Headers
        request.headers.update(key_param)
        # returning None lets the request continue through the remaining middlewares
        return None
class CnkiSearchHeadersDownloaderMiddleware:
def __init__(self, custom_headers: dict):
self.custom_headers = custom_headers
@classmethod
def from_crawler(cls, crawler):
return cls(custom_headers=crawler.settings['SEARCH_REQUEST_HEADERS'])
def process_request(self, request, spider):
request.headers = Headers(self.custom_headers)
return None

@ -0,0 +1,630 @@
# -*- coding: utf-8 -*-
# @Time : 2023/2/9 14:24
# @Author : zhaoxiangpeng
# @File : model.py
import re
import json
from typing import Union, List, Dict
from datetime import datetime, timedelta
from science_article_cnki.models.enum_cls import ResourceType, SearchTypeId, SearchFieldEnum, OperatorEnum, LogicEnum, SearchFromId
DB_CODE = {
'CFLS': '总库',
'CFLQ': '期刊',
'CDMD': '学位论文',
'CFLP': '会议',
'CCND': '报纸'
}
def export2adv(query):
"""
    Convert a professional-search expression into an advanced-search expression.
:param query:
:return:
"""
if query.find('%=') != -1:
query = '(作者单位:%(name)s(模糊)' % {"name": query.split('%=')[-1]}
return query
def navigator_body(query: str = None, db_code: str = 'CFLS', **kwargs):
if query is None:
raise ValueError("query 不能为空应为检索式AF=上海交通大学,具体检索式见 "
"https://piccache.cnki.net/2022/kdn/index/helper/manual.html#frame2-1-5")
_param = {
'queryJson': json.dumps({
"Platform": "",
"DBCode": db_code,
"KuaKuCode": "CJFQ,CDMD,CIPD,CCND,CISD,SNAD,BDZK,CCJD,CCVD,CJFN",
"QNode": {
"QGroup": [
{"Key": "Subject", "Title": "", "Logic": 4, "Items": [
{"Key": "Expert", "Title": "", "Logic": 0, "Name": "", "Operate": "", "Value": query,
"ExtendType": 12, "ExtendValue": "中英文对照", "Value2": "", "BlurType": ""}],
"ChildItems": []},
{"Key": "ControlGroup", "Title": "", "Logic": 1, "Items": [], "ChildItems": []}
]
},
"CodeLang": ""
})
}
return _param
def signal_body(query: str = None, resource_type: str = 'JOURNAL', group_id: str = 'YE', **kwargs):
"""
    Get the aggregation for a single facet in the left navigation bar.
    :group_id: main topic 1; discipline 2; publication year 3; research level 4; document type 5; source 6; author 7; institution 8; fund 9
:return:
"""
if query is None:
raise ValueError("query 不能为空应为检索式AF=上海交通大学,具体检索式见 "
"https://piccache.cnki.net/2022/kdn/index/helper/manual.html#frame2-1-5")
_param = {
'queryJson': json.dumps({
"Platform": "",
"Resource": ResourceType[resource_type].name,
"Classid": ResourceType[resource_type].value,
"Products": "",
"QNode": {
"QGroup": [{
"Key": "Subject",
"Title": "",
"Logic": 0,
"Items": [{
"Key": "Expert",
"Title": "",
"Logic": 0,
"Field": "EXPERT",
"Operator": 0,
"Value": query,
"Value2": ""
}],
"ChildItems": []
}, {
"Key": "ControlGroup",
"Title": "",
"Logic": 0,
"Items": [],
"ChildItems": []
}]
},
"ExScope": "1",
"SearchType": SearchTypeId.GROUP.value,
"Rlang": "CHINESE",
"KuaKuCode": ""
}, ensure_ascii=False),
'groupId': group_id
}
return _param
def refine_search(query: str, resource_type: str = 'JOURNAL', year=None, subject=None, code=None, **kwargs):
"""
    Refine the results of a professional search with an additional year/subject filter.
"""
_query = {
"Platform": "",
"Resource": ResourceType[resource_type].name,
"Classid": ResourceType[resource_type].value,
"Products": "",
"QNode": {
"QGroup": [{
"Key": "Subject",
"Title": "",
"Logic": 0,
"Items": [{
"Key": "Expert",
"Title": "",
"Logic": 0,
"Field": "EXPERT",
"Operator": 0,
"Value": query,
"Value2": ""
}],
"ChildItems": []
}, {
"Key": "ControlGroup",
"Title": "",
"Logic": 0,
"Items": [],
"ChildItems": []
}]
},
"ExScope": "1",
"SearchType": SearchTypeId.GROUP.value,
"Rlang": "CHINESE",
"KuaKuCode": "",
"View": "changeDBOnlyFT"
}
_group2 = {
"Key": "MutiGroup",
"Title": "",
"Logic": 0,
"Items": [],
"ChildItems": []
}
if year:
year_param = {
"Key": "YE",
"Title": "",
"Logic": 0,
"Items": [{
"Key": year,
"Title": "%s" % year,
"Logic": 1,
"Field": "YE",
"Operator": "DEFAULT",
"Value": year,
"Value2": "",
"Name": "YE",
"ExtendType": 0
}],
"ChildItems": []
}
_group2['ChildItems'].append(year_param)
if subject:
subject_param = {
'Key': '6',
'Title': '',
'Logic': 1,
'Items': [{
'Key': code + '?',
'Title': subject,
'Logic': 2,
'Name': '专题子栏目代码',
'Operate': '',
'Value': code + '?',
'ExtendType': 14,
'ExtendValue': '',
'Value2': '',
'BlurType': ''
}],
'ChildItems': []
}
_group2['ChildItems'].append(subject_param)
_query['QNode']['QGroup'].append(_group2)
return _query
def query_search(query_body, page: int = 1, handler_id: int = 18, sql: str = None,
                 sort: str = 'desc', sort_field: str = 'PT', **kwargs):
    """
    Build the search request body.
    :param query_body: the detailed query used for searching, same as the left-navigation body
    :param page: page number to request
    :param handler_id: may need to be sent along; taken from the page source
    :param sql: taken from the page source, usually not needed
    :param sort: sort order, desc/asc
    :param sort_field: sort field, PT (publication time) / CF (citations)
    :return:
    """
if page == 1:
base_query = query_body.get("QNode", {}).get("QGroup", [{}])[0].get("Items", [{}])[0].get("Value")
aside = '( %s)' % base_query if page == 1 else ''
_query = {
"boolSearch": "true",
"QueryJson": json.dumps(query_body, ensure_ascii=False),
"pageNum": "1",
"pageSize": "50",
"dstyle": "listmode",
"boolSortSearch": "false",
"aside": aside,
"searchFrom": "资源范围:学术期刊; 仅看有全文,中英文扩展; 时间范围:更新时间:不限; 来源类别:全部期刊;",
"CurPage": "1"
}
else:
_query = {
'boolSearch': "false",
'QueryJson': json.dumps(query_body, ensure_ascii=False),
'pageNum': page,
'pageSize': 50,
'sortField': sort_field,
'sortType': sort,
'dstyle': 'listmode',
'boolSortSearch': "false",
# 'sentenceSearch': "false",
# 'productStr': 'YSTT4HG0,LSTPFY1C,RMJLXHZ3,JQIRZIYA,JUP3MUPD,1UR4K4HZ,BPBAFJ5S,R79MZMCB,MPMFIG1A,EMRPGLPA,J708GVCE,ML4DRIDX,WQ0UVIAA,NB3BWEHK,XVLO76FD,HR1YT1Z9,BLZOG7CK,PWFIRAGL,NN3FJMUV,NLBO1Z6R,',
'aside': '',
'searchFrom': '资源范围:学术期刊; 仅看有全文,中英文扩展; 时间范围:更新时间:不限; 来源类别:全部期刊;',
}
return _query
def get_cnki_export_data(ids: str):
"""
    Export header fields:
'SrcDatabase-来源库,Title-题名,Author-作者,Organ-单位,Source-文献来源,Keyword-关键词,Summary-摘要,PubTime-发表时间,FirstDuty-第一责任人,Fund-基金,Year-年,Volume-卷,Period-期,PageCount-页码,CLC-中图分类号,ISSN-国际标准刊号,URL-网址,DOI-DOI,',
:param ids:
:return:
"""
data = {
'FileName': ids,
'DisplayMode': 'selfDefine',
'OrderParam': 0,
'OrderType': 'desc',
'SelectField': 'DB,TI,AU,AF,LY,KY,AB,PT,FI,FU,YE,JU,QI,PM,CLC,SN,ABSTRACT,DI,',
'PageIndex': 1,
'PageSize': 20,
'language': 'CHS',
'uniplatform': 'NZKPT',
'Type': 'xls',
}
return data
def export_data(ids: str):
"""
https://kns.cnki.net/dm/manage/FileToText
:param ids:
:return:
"""
data = {
'FileName': ids,
'DisplayMode': 'selfDefine',
'OrderParam': 0,
'OrderType': 'desc',
'SelectField': 'SrcDatabase-来源库,Title-题名,Author-作者,Organ-单位,Source-文献来源,Keyword-关键词,Summary-摘要,PubTime-发表时间,FirstDuty-第一责任人,Fund-基金,Year-年,Volume-卷,Period-期,PageCount-页码,CLC-中图分类号,ISSN-国际标准刊号,URL-网址,DOI-DOI,',
'PageIndex': 1,
'PageSize': 20,
'language': 'CHS',
'uniplatform': '',
'Type': 'xls',
}
return data
def journal_nav_all(page: int = 1, page_size: int = 21):
model = dict(
searchStateJson=json.dumps(
{"StateID": "", "Platfrom": "", "QueryTime": "", "Account": "knavi", "ClientToken": "", "Language": "",
"CNode": {"PCode": "OYXNO5VW", "SMode": "", "OperateT": ""},
"QNode": {"SelectT": "", "Select_Fields": "", "S_DBCodes": "", "QGroup": [], "OrderBy": "OTA|DESC",
"GroupBy": "", "Additon": ""}}, ensure_ascii=False),
displaymode=1,
pageindex=page,
pagecount=page_size,
index='JSTMWT6S',
searchType='刊名(曾用刊名)',
clickName='',
switchdata=''
)
return model
def journal_article_by_year_issue(year_issue, page: int = 0, pcode: str = None):
"""
    Get the articles of a journal issue (year/issue).
:param year_issue:
:param page:
:param pcode:
:return:
"""
if pcode is None:
pcode = 'CJFD,CCJD'
model = {
"yearIssue": year_issue,
"pageIdx": page,
"pcode": pcode
}
return model
def add_limit_2query_body(limit_query: Union[List[dict], dict], body_key: str, query_body: dict):
"""
    Add a limit/filter group to the search queryJson.
:param limit_query:
:param body_key:
:param query_body:
:return:
"""
    # If the group key does not exist yet, add an empty group for it
if body_key not in {g["Key"] for g in query_body["QNode"]["QGroup"]}:
query_body["QNode"]["QGroup"].append({
"Key": body_key,
"Title": "",
"Logic": LogicEnum.AND.value,
"Items": [],
"ChildItems": []
})
    # Walk all groups and append the limit to the matching one
for group in query_body["QNode"]["QGroup"]:
if group["Key"] == body_key:
if isinstance(limit_query, dict):
group["ChildItems"].append(limit_query)
elif isinstance(limit_query, list):
group["ChildItems"].extend(limit_query)
else:
raise ValueError("不支持的limit类型 \n%s" % limit_query)
break
def parse_retrieval(query: str):
"""
    Parse the aside string so it can be stitched into queryJson.
:param query:
:return:
"""
    def func(string: str):
        stand = string[1:-1]  # strip the surrounding full-width parentheses
        # Split "作者单位：湖南中医药大学(模糊)" -> ["作者单位", "湖南中医药大学(模糊)"].
        # The delimiter was lost in this diff rendering; it is assumed to be the full-width colon "：".
        title, value = stand.split("：", maxsplit=1)
        return title, value[:-4], value[-3:-1]  # (field title, search word, match mode)
cond_list = re.split(r'(AND|NOT|OR)', query)
logic = 'AND'
content = cond_list[0]
yield logic, func(content)
for i in range(1, len(cond_list), 2):
        chunk = cond_list[i:i + 2]  # take the logic operator and the condition that follows it
logic, content = chunk
yield logic, func(content)
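# Illustrative parse (a sketch, not part of the module; assumes the full-width "：" delimiter
# reconstructed above, and the aside string itself is made up):
#   list(parse_retrieval('（作者单位：湖南中医药大学(模糊)）OR（篇名：针灸(精确)）'))
#   -> [('AND', ('作者单位', '湖南中医药大学', '模糊')),
#       ('OR', ('篇名', '针灸', '精确'))]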
def add_search_word(search_content: str, base_query: dict = None):
"""
    Add search terms (from an advanced-search aside string) to a query.
    :param search_content: the aside string copied from the advanced search page
:param base_query:
:return:
"""
words_query = []
g = parse_retrieval(search_content)
i = 1
for logic, sequence in g:
field_name, word, way = sequence
input_select = "input[data-tipid=gradetxt-%(input_no)s]" % {"input_no": i}
logic_operator = LogicEnum[logic].value
q = {
"Key": input_select,
"Title": field_name,
"Logic": logic_operator,
"Items": [{
"Key": input_select,
"Title": field_name,
"Logic": logic_operator,
"Field": SearchFieldEnum(field_name).name,
"Operator": OperatorEnum[way].value,
"Value": word,
"Value2": ""
}],
"ChildItems": []
}
words_query.append(q)
i += 1
    # If a base query was passed in, attach the search-term clauses to it automatically
if base_query:
add_limit_2query_body(words_query, "Subject", base_query)
return words_query
def limit_year_range(year: int, base_query: dict = None):
"""
    Add a publication-year filter.
:param year:
:param base_query:
:return:
"""
year = str(year)
ye_query = {
"Key": "YE",
"Title": "",
"Logic": 0,
"Items": [{
"Key": year,
"Title": "%s" % year,
"Logic": 1,
"Field": "YE",
"Operator": "DEFAULT",
"Value": year,
"Value2": "",
"Name": "YE",
"ExtendType": 0
}],
"ChildItems": []
}
if base_query:
add_limit_2query_body(ye_query, "MutiGroup", base_query)
return ye_query
def parse_updatedtime_symbol(symbol: str, today: str = None) -> tuple:
"""
    Parse a date range from a time-window label.
:param symbol:
:param today:
:return:
"""
if today and isinstance(today, str):
today = datetime.strptime(today, "%Y-%m-%d")
else:
today = datetime.now()
if symbol == "最近一周":
ago_day = today - timedelta(days=7)
elif symbol == "最近一月":
ago_day = today - timedelta(days=30)
elif symbol == "最近半年":
ago_day = today - timedelta(days=181)
elif symbol == "最近一年":
ago_day = today.replace(year=today.year-1)
elif symbol == "今年迄今":
ago_day = today.replace(month=1, day=1)
else:
ago_day = today
return ago_day.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d")
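# Illustrative output (a sketch, not part of the module):
#   parse_updatedtime_symbol("最近一周", "2024-05-09")  -> ("2024-05-02", "2024-05-09")
#   parse_updatedtime_symbol("今年迄今", "2024-05-09")  -> ("2024-01-01", "2024-05-09")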
def limit_updated_time(range_str: str, today: str = None, base_query: dict = None):
    """
    Build the "updated time" filter clause.
    :param range_str:
    :param today:
    :param base_query:
    :return:
    """
    start_date, end_date = parse_updatedtime_symbol(range_str, today)
rt_query = {
"Key": ".tit-dropdown-box>.sort",
"Title": "",
"Logic": 0,
"Items": [
{
"Key": ".tit-dropdown-box>.sort",
"Title": "更新时间",
"Logic": 0,
"Field": "RT",
"Operator": 7,
"Value": start_date,
"Value2": end_date
}
],
"ChildItems": []
}
    # When base_query is given, attach the date-range filter to it automatically
if base_query:
add_limit_2query_body(rt_query, "ControlGroup", base_query)
return rt_query
def temp_refine_search(
query: str,
year: int = None,
updated_date: str = None,
resource_type: str = 'JOURNAL',
**kwargs
):
"""
    Build the value of the queryJson field.
    :param query: search expression, e.g. (作者单位：湖南中医药大学(模糊)) OR (作者单位：湖南中医学院(模糊))
    :param updated_date: update-time window: 不限, 最近一周/一月/半年/一年, 今年迄今, 上一年度
    :param year: the publication year to filter on; if combined with updated_date, the year filter is applied after the update-time limit
:param resource_type:
:param kwargs:
:return:
"""
_query = {
"Platform": "",
"Resource": ResourceType[resource_type].name,
"Classid": ResourceType[resource_type].value,
"Products": "",
"QNode": {
"QGroup": [
{
"Key": "Subject", "Title": "", "Logic": 0, "Items": [], "ChildItems": []
},
{
"Key": "ControlGroup", "Title": "", "Logic": 0, "Items": [], "ChildItems": []
}
]
},
"ExScope": "1",
"SearchType": 1,
"Rlang": "CHINESE",
"KuaKuCode": "",
"Expands": {},
"View": "changeDBOnlyFT",
"SearchFrom": 1
}
add_search_word(search_content=query, base_query=_query)
if updated_date and updated_date != "不限":
limit_updated_time(updated_date, base_query=_query)
if year:
limit_year_range(year=year, base_query=_query)
return _query
adv_refine_search = temp_refine_search
def temp_query_search(query_body, query: str = None, page: int = 1, page_size: int = 50,
sort: str = 'desc', sort_field: str = 'PT', updated_date: str = "不限", **kwargs):
"""
    Build the search request body.
    :param query_body: the detailed query used for searching, same as the left-navigation body
    :param query: the aside / search-expression string
    :param page: page number to request
    :param page_size: number of records per page
    :param sort: sort order, desc/asc
    :param sort_field: sort field, PT (publication time) / CF (citations)
    :param updated_date: defaults to 不限 (no limit)
:return:
"""
page = str(page)
page_size = str(page_size)
if page == '1':
aside = query or ''
_query = {
"boolSearch": "true",
"QueryJson": json.dumps(query_body, ensure_ascii=False),
"pageNum": "1",
"pageSize": page_size,
'sortField': sort_field,
'sortType': sort,
"dstyle": "listmode",
"boolSortSearch": "false",
"aside": aside,
"searchFrom": "资源范围:学术期刊; 仅看有全文,中英文扩展; 时间范围:更新时间:%(updated_date)s; 来源类别:全部期刊; " % {"updated_date": updated_date},
"subject": "",
"language": "",
"uniplatform": "",
"CurPage": "1"
}
else:
_query = {
'boolSearch': "false",
'QueryJson': json.dumps(query_body, ensure_ascii=False),
'pageNum': page,
'pageSize': page_size,
'sortField': sort_field,
'sortType': sort,
'dstyle': 'listmode',
'boolSortSearch': "false",
'aside': '',
'searchFrom': '资源范围:学术期刊; 时间范围:更新时间:%(updated_date)s; 来源类别:全部期刊; ' % {"updated_date": updated_date},
"subject": "",
"language": "",
"uniplatform": ""
}
return _query
adv_query_search = temp_query_search
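# Illustrative chaining of the two helpers above, mirroring what the cited-number spider does
# (a sketch, not part of the module; the query string is only an example and assumes the
# full-width "：" delimiter used by parse_retrieval):
#   query_body = adv_refine_search(query='(作者单位：西安建筑科技大学(模糊))', year=2024)
#   form_data = adv_query_search(query_body, query='(作者单位：西安建筑科技大学(模糊))', page=1, sort_field='CF')
#   # POST form_data to config.CNKI_ADV_SEARCH_API with the configured search headers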
class SearchPaperArgModel:
pass
class briefParam:
@staticmethod
def getDbCode():
return 'CFLS'
@staticmethod
def getPageSize(isSearch):
return 50
@staticmethod
def getCurPage():
return 1
    @staticmethod
    def getSearchPaperArgModel(isSearch, cPage):
        dbCode = briefParam.getDbCode()
        pSize = briefParam.getPageSize(isSearch)
        cPage = cPage if cPage else briefParam.getCurPage()
        argModel = {
            'IsSearch': isSearch,
            'QueryJson': ''
        }
        return argModel
if __name__ == '__main__':
print(SearchTypeId.GROUP)
print(add_search_word(
'(作者单位:湖南中医药大学(模糊)OR作者单位湖南中医学院(模糊)OR篇名基于PINK1LETM1信号通路探讨何首乌苷减轻脑缺血再灌注损伤的作用机制(精确)'))

@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
# @Time : 2025/5/13 10:41
# @Author : zhaoxiangpeng
# @File : enum_cls.py
import enum
from datetime import timedelta
class ResourceType(enum.Enum):
"""资源类型"""
JOURNAL = "YSTT4HG0" # 学术期刊
DISSERTATION = "LSTPFY1C" # 学位论文
CONFERENCE = "JUP3MUPD" # 会议
NEWSPAPER = "MPMFIG1A" # 报纸
ALMANAC = "HHCPM1F8"
BOOK = "EMRPGLPA"
PATENT = "VUDIXAIY"
STANDARD = "WQ0UVIAA"
ACHIEVEMENTS = "BLZOG7CK"
class SearchTypeId(enum.Enum):
"""知网的检索类型"""
ADV = 1
SIMPLE = 2
AUTHOR = 3
    EXPERT = 4  # professional search
SENTENCE = 5
'''
GROUP = 6
PAGE = 7
SORT = 8
ABSTRACT = 9
MORESENTENCE = 10
HISTORY = 11
SIZE = 12
RESULT = 13
ADVRESULT = 14
EXPERTRESULT = 15
AUTHORRESULT = 16
SENRESULT = 17
CROSSDBCHANGEDB = 18
COMBOHISTORY = 19
'''
class SearchFromId(enum.Enum):
SEARCH = 1
GROUPSEARCH = 2
RESULT = 3
PAGE = 4
SORT = 5
CHANGEDB = 6
DISPLAYMODEL = 7
NAVISEARCH = 8
HISTORY = 9
COMBOHISTORY = 10
CROSSDBCHANGEDB = 11
CHANGELANG = 12
GROUP = 99
class SearchFieldEnum(enum.Enum):
"""文献元数据字段枚举类"""
SU = "主题"
TKA = "篇关摘"
TI = "篇名"
KY = "关键词"
AB = "摘要"
CO = "小标题"
FT = "全文"
AU = "作者"
FI = "第一作者"
RP = "通讯作者"
AF = "作者单位"
LY = "期刊名称"
RF = "参考文献"
FU = "基金"
CLC = "中图分类号"
SN = "ISSN"
CN = "CN"
DOI = "DOI"
QKLM = "栏目信息"
FAF = "第一单位"
CF = "被引频次"
class OperatorEnum(enum.Enum):
    """Match modes; member names are the literal text used in CNKI aside strings (模糊 = fuzzy, 精确 = exact)."""
模糊 = "FUZZY"
精确 = "DEFAULT"
class OperatorTypeEnum(enum.Enum):
DEFAULT = 0
TOPRANK = 1
FUZZY = 2
GT = 3
GE = 4
LT = 5
LE = 6
BETWEEN = 7
FREQUENCY = 8
PREFIX = 9
SUFFIX = 10
CONTAINS = 11
NEAR = 12
SENTENCE = 13
IS = 14
FUZZYFREQUENCY = 15
class LogicEnum(enum.Enum):
AND = 0
OR = 1
NOT = 2
class UpdatedTimeEnum(enum.Enum):
"""
    Presets for the "recent period" time windows (member names are the labels shown on the CNKI page)
"""
最近一周 = timedelta(days=7)
最近一月 = timedelta(days=30)
最近半年 = timedelta(days=180)
最近一年 = timedelta(days=180)
今年迄今 = timedelta(days=180)
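# Illustrative lookups (a sketch, not part of the module):
#   ResourceType['JOURNAL'].value     -> 'YSTT4HG0'
#   SearchFieldEnum('作者单位').name   -> 'AF'
#   OperatorEnum['模糊'].value         -> 'FUZZY'
#   LogicEnum['OR'].value             -> 1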

@ -0,0 +1,90 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from __future__ import annotations
from itemadapter import ItemAdapter
class ScienceArticlCnkiPipeline:
def process_item(self, item, spider):
return item
import logging
from datetime import datetime
from typing import TYPE_CHECKING, Tuple, Union
from pymongo import MongoClient
from itemadapter import ItemAdapter
from pymongo.errors import (
DuplicateKeyError,
BulkWriteError
)
from science_article_cnki.db_utils.mongo import MongoDBUtils, update_document, build_update_query
if TYPE_CHECKING:
from scrapy.crawler import Crawler
from scrapy.statscollectors import StatsCollector
mongo_logger = logging.getLogger('pymongo')
mongo_logger.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
class MongoPipeline(MongoDBUtils):
def __init__(self, mongo_uri, mongo_db, stats: StatsCollector):
super().__init__(mongo_uri, mongo_db)
self.stats: StatsCollector = stats
self.insert_failure_update_enable = True
@classmethod
def from_crawler(cls, crawler: Crawler):
return cls(
mongo_uri=crawler.settings.get("MONGO_URI"),
mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
stats=crawler.stats
)
def open_spider(self, spider):
self.client = MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def process_item(self, item, spider):
        # determine the item type (target collection name)
adapter = ItemAdapter(item)
item_type = self._get_item_type(item)
collection = self.db.get_collection(item_type)
d = adapter.asdict()
try:
insert_result = collection.insert_one(d)
self.stats.inc_value("item2db_inserted/{}".format(item_type))
except DuplicateKeyError as duplicate_error:
if self.insert_failure_update_enable:
write_error = duplicate_error.details
key_pattern = write_error.get('keyPattern')
key_value = write_error.get('keyValue')
logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value)
d.pop("_id", None)
                for k in key_pattern.keys():
                    d.pop(k, None)
up_result = collection.update_one(filter=key_value, update={"$set": d}, upsert=True)
self.stats.inc_value("item2db_updated/{}".format(item_type))
except Exception:
raise
return item
def close_spider(self, spider):
self.client.close()
@staticmethod
def _get_item_type(item) -> str:
"""获取Item类型"""
if hasattr(item, '__tablename__'):
return item.__class__.__tablename__
return 'items_null_table'

@ -0,0 +1,105 @@
# Scrapy settings for science_article_cnki project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "science_article_cnki"
SPIDER_MODULES = ["science_article_cnki.spiders"]
NEWSPIDER_MODULE = "science_article_cnki.spiders"
ADDONS = {}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 3
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
SEARCH_REQUEST_HEADERS = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'Hm_lvt_dcec09ba2227fd02c55623c1bb82776a=1739256689; UM_distinctid=197b0769b48ea3-0de0b4b2dd761f-26001051-1fa400-197b0769b49cc6; Ecp_ClientId=e250627180800765334; Ecp_ClientIp=111.186.53.36; cnkiUserKey=1b8e7dbe-3c98-864f-2b80-84b544af32af; _c_WBKFRo=UO8UFAxWLjMjlOxhuKvmtkZ4yYaXr8dPZXuhVFea; Ecp_loginuserbk=SJTU; tfstk=g5GqYEZ0ZId4NHSWG0FNzQCb6QNYs5-QjfZ_SV0gloqDDdFa7uoTCSMjSA5ZJuEOhdn6_lmxYPZ0DxMNb0nUXt99nAPZ2q5jhfuO_P0iXEE6kLgxk5FMAHTBOq3vhen9f3NMS4V_773PuGuxk5Q-60hJAqQN2mSLS5mgZz4gS540ItYPZPqliPf0SgYzWuVgSrX0ZT4_uGb0Sc0kzPEuolmgsUPu2PVgjcViG50mS_zQnU-thdfV8NPaxqqPs67Lu-cB9u5Mabzqzugc-1fiaryqZpcfbM2jI2eKGqONwSgEE74qjBx0ex0r_Jh9Csg0ZoPxa-bMXocxSfPYTNAmzSr4KbwXO1mnzVDQUbTH9SP0mANx5w-jzjojkbu1STV4GYyEgWAdmlMS8fzZ6hdrYqDnjASP1GUobXlt3GXanzUzAU8z4y3oBzrYp_6OB8VLzkTblOBTnzUzAU8PBOeu2zrBlr1..; Ecp_session=1; SID_sug=018104; knsLeftGroupSelectItem=; dsorders=CF; dsortypes=cur%20DESC; knsadv-searchtype=%7B%22BLZOG7CK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22MPMFIG1A%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22T2VC03OH%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JQIRZIYA%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22S81HNSV3%22%3A%22gradeSearch%22%2C%22YSTT4HG0%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22ML4DRIDX%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WQ0UVIAA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22VUDIXAIY%22%3A%22gradeSearch%2CmajorSearch%22%2C%22LIQN9Z3G%22%3A%22gradeSearch%22%2C%22NN3FJMUV%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22LSTPFY1C%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HHCPM1F8%22%3A%22gradeSearch%2CmajorSearch%22%2C%22OORPU5FE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WD0FTY92%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22BPBAFJ5S%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22EMRPGLPA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22PWFIRAGL%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22U8J8LYLV%22%3A%22gradeSearch%2CmajorSearch%22%2C%22R79MZMCB%22%3A%22gradeSearch%22%2C%22J708GVCE%22%3A%22gradeSearch%2CmajorSearch%22%2C%228JBZLDJQ%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HR1YT1Z9%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JUP3MUPD%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NLBO1Z6R%22%3A%22gradeSearch%2CmajorSearch%22%2C%22RMJLXHZ3%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%221UR4K4HZ%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NB3BWEHK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22XVLO76FD%22%3A%22gradeSearch%2CmajorSearch%22%7D; Ecp_IpLoginFail=25121149.65.252.186; SID_kns_new=kns018106; SID_restapi=kns018110; KNS2COOKIE=1765437722.656.114388.232155|b25e41a932fd162af3b8c5cff4059fc3; dblang=both; createtime-advInput=2025-12-11%2015%3A22%3A21; searchTimeFlags=1',
'Origin': 'https://kns.cnki.net',
'Referer': 'https://kns.cnki.net/kns8s/AdvSearch?crossids=YSTT4HG0%2CLSTPFY1C%2CJUP3MUPD%2CMPMFIG1A%2CWQ0UVIAA%2CBLZOG7CK%2CPWFIRAGL%2CEMRPGLPA%2CNLBO1Z6R%2CNN3FJMUV',
'User-Agent': USER_AGENT,
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "science_article_cnki.middlewares.ScienceArticlCnkiSpiderMiddleware": 543,
#}
RETRY_ENABLED = True
RETRY_TIMES = 2  # 2 retries (3 attempts in total)
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # includes some common client error codes as well
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550
# "org_news.middlewares.OrgNewsDownloaderMiddleware": 543,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "science_article_cnki.middlewares.ScienceArticlCnkiDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "science_article_cnki.pipelines.ScienceArticlCnkiPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
MONGO_URI = "mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true"
MONGO_DATABASE = 'science2'

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@ -0,0 +1,101 @@
from __future__ import annotations
import math
from copy import deepcopy
from datetime import datetime
from typing import TYPE_CHECKING, Any, Self
import scrapy
from science_article_cnki.items import CnkiCitedNumberItem
from science_article_cnki.utils.tools import str2int
from science_article_cnki.models import cnki_model as model
from science_article_cnki.configs import cnki as config
if TYPE_CHECKING:
from scrapy.crawler import Crawler
class CnkiCitedNumberSpider(scrapy.Spider):
name = "cnki_cited_number"
custom_settings = dict(
DEFAULT_REQUEST_HEADERS={
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en",
},
DOWNLOADER_MIDDLEWARES={
"science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
},
ITEM_PIPELINES={
"science_article_cnki.pipelines.MongoPipeline": 300,
# "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
},
LOG_LEVEL="INFO"
)
@classmethod
def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any) -> Self:
        # Custom start-up logic,
        # e.g. read the task from the database when no arguments are given
return super().from_crawler(crawler, *args, **kwargs)
def __init__(self, query: str = None, resource_type: str = "JOURNAL", query_condition: dict = None, **kwargs: Any):
super().__init__(**kwargs)
self.query = query
self.resource_type = resource_type
self.sort_field = 'CF'
        self.query_condition = query_condition or {}  # e.g. {'year': '2021'}
self.page_size = 50
async def start(self):
m = dict(query=self.query, resource_type=self.resource_type, page=1, sort_field=self.sort_field, **self.query_condition)
query_body = model.adv_refine_search(**m)
search_param = model.adv_query_search(query_body, **m)
yield scrapy.FormRequest(
url=config.CNKI_ADV_SEARCH_API, method="POST", formdata=search_param, meta=m
)
def parse(self, response, **kwargs):
meta = response.meta
        # -------------------------------------------- work out how many result pages there are --------------------------------------------
        # extract the total number of search results
total_prm = response.xpath('//span[@class="pagerTitleCell"]/em/text()').get()
if not total_prm:
self.logger.warning("""
当前 {query}
响应 {resp}""".format(query=meta.get('query'), resp=response.body))
return
        total = str2int(total_prm.replace(',', ''))  # normalize the count string and convert to int
        # number of pages in total
max_page = math.ceil(total / self.page_size)
meta['max_page'] = max_page
batch_time = datetime.now()
tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr')
for tr_node in tr_nodes:
            third_id = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-filename').get()  # third-party (CNKI) id
            cited_str = tr_node.xpath('./td[@class="quote"]/span/a/text()').get()  # citation count text
if third_id and cited_str:
cited_item = CnkiCitedNumberItem()
cited_item['third_id'] = third_id
cited_item['cited'] = str2int(cited_str, 0)
cited_item['updated_at'] = batch_time
yield cited_item
        meta_copy: dict = deepcopy(meta)
        meta_copy['page'] += 1
        if meta_copy['page'] > max_page:  # stop once the last page has been parsed
            return
query_body = model.adv_refine_search(**meta_copy)
search_param = model.adv_query_search(query_body, **meta_copy)
yield scrapy.FormRequest(
url=config.CNKI_ADV_SEARCH_API, method="POST", formdata=search_param,
meta=meta_copy
)
if __name__ == '__main__':
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
task_params = dict()
task_params.setdefault('query', '(作者单位:西安建筑科技大学(模糊)')
task_params.setdefault('query_condition', {'year': '2026'})
process.crawl(CnkiCitedNumberSpider, **task_params)
    process.start()  # blocks until the crawl finishes

@ -0,0 +1,10 @@
import scrapy
class ExampleSpider(scrapy.Spider):
name = "example"
allowed_domains = ["example.com"]
start_urls = ["https://example.com"]
def parse(self, response):
pass

@ -0,0 +1,17 @@
from typing import List, Tuple
from datetime import datetime
def str2int(val, replace=0):
    """Convert val to int, returning `replace` when conversion fails."""
    try:
        val = int(val)
    except (ValueError, TypeError):
        val = replace
    return val
def get_today_date(fmt: str = "%Y-%m-%d"):
return datetime.today().strftime(fmt)
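# Illustrative usage (a sketch, not part of the module):
#   str2int('1,234'.replace(',', ''))  -> 1234
#   str2int(None)                      -> 0
#   str2int('n/a', -1)                 -> -1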

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = science_article_cnki.settings
[deploy]
#url = http://localhost:6800/
project = science_article_cnki