From 6c0d7328773e7b164ec51d7b7dfe768353887fc7 Mon Sep 17 00:00:00 2001 From: zhaoxiangpeng <1943364377@qq.com> Date: Tue, 16 Dec 2025 09:40:22 +0800 Subject: [PATCH] =?UTF-8?q?cnki:=E9=87=87=E9=9B=86=E8=A2=AB=E5=BC=95?= =?UTF-8?q?=E9=87=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../science_article_cnki/__init__.py | 0 .../science_article_cnki/configs/__init__.py | 0 .../science_article_cnki/configs/cnki.py | 88 +++ .../configs/extract_rule.py | 15 + .../science_article_cnki/db_utils/__init__.py | 0 .../science_article_cnki/db_utils/mongo.py | 87 +++ .../science_article_cnki/items.py | 40 ++ .../science_article_cnki}/middlewares.py | 27 +- .../science_article_cnki/models/__init__.py | 0 .../science_article_cnki/models/cnki_model.py | 630 ++++++++++++++++++ .../science_article_cnki/models/enum_cls.py | 128 ++++ .../science_article_cnki/pipelines.py | 90 +++ .../science_article_cnki/settings.py | 105 +++ .../science_article_cnki/spiders/__init__.py | 4 + .../spiders/cnki_cited_number.py | 101 +++ .../science_article_cnki/spiders/example.py | 10 + .../science_article_cnki/utils/__init__.py | 0 .../science_article_cnki/utils/tools.py | 17 + science_article_cnki/scrapy.cfg | 11 + 19 files changed, 1341 insertions(+), 12 deletions(-) create mode 100644 science_article_cnki/science_article_cnki/__init__.py create mode 100644 science_article_cnki/science_article_cnki/configs/__init__.py create mode 100644 science_article_cnki/science_article_cnki/configs/cnki.py create mode 100644 science_article_cnki/science_article_cnki/configs/extract_rule.py create mode 100644 science_article_cnki/science_article_cnki/db_utils/__init__.py create mode 100644 science_article_cnki/science_article_cnki/db_utils/mongo.py create mode 100644 science_article_cnki/science_article_cnki/items.py rename {science_article_add/science_article_add => science_article_cnki/science_article_cnki}/middlewares.py (87%) create mode 100644 science_article_cnki/science_article_cnki/models/__init__.py create mode 100644 science_article_cnki/science_article_cnki/models/cnki_model.py create mode 100644 science_article_cnki/science_article_cnki/models/enum_cls.py create mode 100644 science_article_cnki/science_article_cnki/pipelines.py create mode 100644 science_article_cnki/science_article_cnki/settings.py create mode 100644 science_article_cnki/science_article_cnki/spiders/__init__.py create mode 100644 science_article_cnki/science_article_cnki/spiders/cnki_cited_number.py create mode 100644 science_article_cnki/science_article_cnki/spiders/example.py create mode 100644 science_article_cnki/science_article_cnki/utils/__init__.py create mode 100644 science_article_cnki/science_article_cnki/utils/tools.py create mode 100644 science_article_cnki/scrapy.cfg diff --git a/science_article_cnki/science_article_cnki/__init__.py b/science_article_cnki/science_article_cnki/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/science_article_cnki/science_article_cnki/configs/__init__.py b/science_article_cnki/science_article_cnki/configs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/science_article_cnki/science_article_cnki/configs/cnki.py b/science_article_cnki/science_article_cnki/configs/cnki.py new file mode 100644 index 0000000..e4770bb --- /dev/null +++ b/science_article_cnki/science_article_cnki/configs/cnki.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/5/9 11:11 +# @Author : zhaoxiangpeng +# @File : config.py + +# 数据来源名 +SOURCE_NAME 
= 'cnki' + +# 期刊导航 +# 主页 +CNKI_JOURNAL_NAVIGATOR_INDEX = 'https://navi.cnki.net/knavi/journals/index?uniplatform=NZKPT' +# 搜索接口 +CNKI_JOURNAL_NAVIGATOR_SEARCH_API = 'https://navi.cnki.net/knavi/journals/searchbaseinfo' +# 导出xls接口(旧版, 通过filename导出) +CNKI_EXPORT_XLS_OLD_API = 'https://kns.cnki.net/dm/manage/FileToText' +# 导出xls接口 +CNKI_EXPORT_XLS_API = 'https://kns.cnki.net/dm8/FileToText' + +# 期刊详情页 +CNKI_JOURNAL_DETAIL = 'https://navi.cnki.net/knavi/journals/{journal_no}/detail?uniplatform=NZKPT' + +# 期刊详情页获取发文年份/期列表的接口 +CNKI_JOURNAL_ISSUE = 'https://navi.cnki.net/knavi/journals/{journal_no}/yearList' # ZDJY + +# 期刊详情页获取年/期发文列表的接口 +CNKI_JOURNAL_ISSUE_ARTICLE = 'https://navi.cnki.net/knavi/journals/{journal_no}/papers' + +# 文章详情页 +CNKI_ARTICLE_DETAIL = 'https://kns.cnki.net/kcms/detail/detail.aspx?dbcode={db_code}&filename={article_id}' + +# -- 旧版的接口 +CNKI_ADV_SEARCH_API = 'https://kns.cnki.net/kns8s/brief/grid' + +# 搜索用的请求头 +SEARCH_HEADERS = { + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Cookie': 'Ecp_notFirstLogin=qkFgu9; Ecp_ClientId=o240823084800102418; Ecp_loginuserbk=SJTU; cnkiUserKey=eef4d3aa-1096-bc9e-dff0-74349179c2cc; Ecp_ClientIp=111.186.52.67; UM_distinctid=19366f14e7a832-0f92ef85a35cb5-26001051-1fa400-19366f14e7c14f2; Hm_lvt_dcec09ba2227fd02c55623c1bb82776a=1734079899; Ecp_session=1; SID_kns_new=kns018104; SID_sug=018104; knsLeftGroupSelectItem=; updatetime-advInput=2024-12-19+17%3A42%3A08; knsadv-searchtype=%7B%22BLZOG7CK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22MPMFIG1A%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22T2VC03OH%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JQIRZIYA%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22S81HNSV3%22%3A%22gradeSearch%22%2C%22YSTT4HG0%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22ML4DRIDX%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WQ0UVIAA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22VUDIXAIY%22%3A%22gradeSearch%2CmajorSearch%22%2C%22NN3FJMUV%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22LSTPFY1C%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HHCPM1F8%22%3A%22gradeSearch%2CmajorSearch%22%2C%22OORPU5FE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WD0FTY92%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22BPBAFJ5S%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22EMRPGLPA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22PWFIRAGL%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22U8J8LYLV%22%3A%22gradeSearch%2CmajorSearch%22%2C%22R79MZMCB%22%3A%22gradeSearch%22%2C%22J708GVCE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22HR1YT1Z9%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JUP3MUPD%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NLBO1Z6R%22%3A%22gradeSearch%2CmajorSearch%22%2C%22RMJLXHZ3%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%221UR4K4HZ%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NB3BWEHK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22XVLO76FD%22%3A%22gradeSearch%2CmajorSearch%22%7D; createtime-advInput=2024-12-20%2014%3A37%3A03; LID=WEEvREcwSlJHSldSdmVpanJGNW9JQS9sbkNrOUFycHJkRzF3eXgyTGlWbz0=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!; Ecp_LoginStuts={"IsAutoLogin":false,"UserName":"SJTU","ShowName":"%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6","UserType":"bk","BUserName":"","BShowName":"","BUserType":"","r":"qkFgu9","Members":[]}; 
KNS2COOKIE=1734680479.883.14106.830885|b25e41a932fd162af3b8c5cff4059fc3; dblang=both; c_m_LinID=LinID=WEEvREcwSlJHSldSdmVpanJGNW9JQS9sbkNrOUFycHJkRzF3eXgyTGlWbz0=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!&ot=12%2F20%2F2024%2016%3A01%3A27; c_m_expire=2024-12-20%2016%3A01%3A27; tfstk=gnXZLQYMKRewdgBaoHvqL9aIUYp9sd45ntTXmijDfFYG5iTcTZbBCGsccx-D-NdjCxY18pQRVAC_6ITq0dBC1xT_WKScPKz7P8w5XGpynzaShW0gBdKqnncilpDHmK-i1ZwdGGpvnyaM9UCdXabz7TCMnkJH4ncDnxYMtk-6qKDMiAcn-eKDnKADjDYH4nmioAYgYMYpDKxcoCcmtGjmL3Og25LCsWPKUCYljekmU0KHslSnGAMsnhA9rBxrnH6ebC8ljOHkrv-hd9RWOmayKgCCSHJz3vvwaOBytO4K3BQ2-IWMh0kcYNshNIWgD5IF3FRlIBoS3dIpmZAV9zkWbd1eaO5TD2jGPF5kBiiz5MRPTQKHtmlMC_s5HQXgQ4LBwn7y4NuN4DuvxG5lH1umgCxpYUZUY7E40mtBH0LEMjdHeH87fhGxMCxpYUZUYjhvteKePlt1.; searchTimeFlags=1; updatetime-advInput=2024-12-19+17%3A42%3A08', + 'Origin': 'https://kns.cnki.net', + 'Referer': 'https://kns.cnki.net/kns8s/AdvSearch?crossids=YSTT4HG0%2CLSTPFY1C%2CJUP3MUPD%2CMPMFIG1A%2CWQ0UVIAA%2CBLZOG7CK%2CPWFIRAGL%2CEMRPGLPA%2CNLBO1Z6R%2CNN3FJMUV', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', +} +# 期刊导航页的请求头 +JOURNAL_NAVIGATOR_HEADERS = { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Origin': 'https://navi.cnki.net', + 'Referer': 'https://navi.cnki.net/knavi/journals/index?uniplatform=NZKPT', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36', + 'uniplatform': 'NZKPT', +} + +# mongodb 集合配置 + +# 风控参数缓存key +FEC_REDIS_KEY = "cookies_pool:cnki:crypt" +FEC_REDIS_TTL = 3600 + +# 详情页cookies +COOKIES_REDIS_KEY = "cookies_pool:cnki:detail_cookies" + + +# 期刊信息表 +CNKI_JOURNAL_INFO_COLLECTION = 'task_journal_info_{}'.format(SOURCE_NAME) +# 期刊年卷期任务表 +CNKI_JOURNAL_ISSUE_COLLECTION = 'task_journal_issue_{}'.format(SOURCE_NAME) +# 期刊发文表 +CNKI_JOURNAL_ARTICLE_COLLECTION = 'data_{}_article'.format(SOURCE_NAME) +# 待下载Id表 +CNKI_ARTICLE_TODO_IDS_COLLECTION = 'todo_ids_{}'.format(SOURCE_NAME) +# 待下载详情id表 +CNKI_ARTICLE_DETAIL_TODO_IDS_COLLECTION = 'todo_ids_cnki_detail' +# 发文作者地址关系 +CNKI_ARTICLE_AUTHOR_ORG_COLLECTION = "relation_author_org_cnki" +# 发文关系表 +SCHOOL_RELATION_COLLECTION = 'relation_school_{}'.format(SOURCE_NAME) + + +# 中文期刊列表需要用到的集合 +CHECK_JOURNAL_INFO_TABLE = "check_journal_info_{}".format(SOURCE_NAME) # 信息表 +CHECK_JOURNAL_MIDDLE_TABLE = "check_journal_middle_{}".format(SOURCE_NAME) # 中间任务表 +CHECK_JOURNAL_ISDOWN_TABLE = "check_journal_isdown_{}".format(SOURCE_NAME) # 结果存储表 + + +# xls文件表头 +TABLE_HEAD = ['SrcDatabase-来源库', 'Title-题名', 'Author-作者', 'Organ-单位', 'Source-文献来源', 'Keyword-关键词', 'Summary-摘要', 'PubTime-发表时间', 'FirstDuty-第一责任人', 'Fund-基金', 'Year-年', 'Volume-卷', 'Period-期', 'PageCount-页码', 'CLC-中图分类号', 'ISSN-国际标准刊号', 'URL-网址', 'DOI-DOI'] +# json字段表头 +TABLE_HEAD_EN = ['src_db', 'title', 'author', 'org', 'journal', 'keyword', 'abstract', 'pub_time', 'first_duty', 'fund', 'year', 'volum', 'issue', 'page', 'classification_code', 'issn', 'url', 'doi'] +# 每次下载数量 +BATCH_DOWNLOAD_LIMIT = 50 diff --git a/science_article_cnki/science_article_cnki/configs/extract_rule.py b/science_article_cnki/science_article_cnki/configs/extract_rule.py new file mode 100644 index 0000000..d402f06 --- /dev/null +++ b/science_article_cnki/science_article_cnki/configs/extract_rule.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/5/13 16:53 +# @Author : zhaoxiangpeng +# @File : extract_rule.py + +# 提取ISSN号 +ISSN_REGEX_PATTERN = r'ISSN:(\d{4}-[\dX]{4})' +# 提取CN号, 
https://baike.baidu.com/item/%E5%9B%BD%E5%86%85%E7%BB%9F%E4%B8%80%E5%88%8A%E5%8F%B7/386463
+CN_REGEX_PATTERN = r'CN:(\d{2}-\d{4}/?[A-Z]?)'
+
+# Strip/replace special characters in titles
+DEL_TITLE_SYMBOL_PATTERN = '[’!"#$%&\'()*+,-.·/::;<=>—?@,。?★、…()【】《》?“”‘’![\\]^_`{|}~\s]+'
+
+# Same pattern, reused for cleaning source (journal) names
+DEL_SOURCE_SYMBOL_PATTERN = DEL_TITLE_SYMBOL_PATTERN
diff --git a/science_article_cnki/science_article_cnki/db_utils/__init__.py b/science_article_cnki/science_article_cnki/db_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/science_article_cnki/science_article_cnki/db_utils/mongo.py b/science_article_cnki/science_article_cnki/db_utils/mongo.py
new file mode 100644
index 0000000..7b81a9e
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/db_utils/mongo.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, Optional, Tuple
+from pymongo import MongoClient
+
+if TYPE_CHECKING:
+ from pymongo.database import Database
+ from pymongo.collection import Collection
+ from pymongo.results import InsertManyResult, BulkWriteResult
+
+
+def build_update_query(update_data: dict, replace: bool = True) -> dict:
+ """
+ Build a MongoDB update document from update_data.
+ When replace is True every field is overwritten via $set; otherwise
+ list fields are merged via $addToSet/$each and scalars still use $set.
+ """
+ update_query = {}
+ if not update_data:
+ return {}
+ for key, val in update_data.items():
+ if replace:
+ update_query.setdefault("$set", {}).update({key: val})
+ elif isinstance(val, list):
+ update_query.setdefault("$addToSet", {}).update({key: {"$each": val}})
+ else:
+ update_query.setdefault("$set", {}).update({key: val})
+ return update_query
+
+
+def update_document(filter_query: dict = None, update_data: dict = None, replace: bool = True) -> Tuple[dict, dict]:
+ """Return (filter_query, update_query), delegating to build_update_query."""
+ if not update_data:
+ return {}, {}
+ return filter_query, build_update_query(update_data, replace=replace)
+
+
+class MongoDBUtils:
+ def __init__(self, mongo_uri, mongo_db):
+ self.mongo_uri = mongo_uri
+ self.mongo_db = mongo_db
+ self.client: Optional[MongoClient] = None
+ self.db: Optional[Database] = None
+
+ def _insert2db(self, items, tablename, ordered: bool = False, **kwargs) -> InsertManyResult:
+ collection: Collection = self.db.get_collection(tablename)
+ result: InsertManyResult = collection.insert_many(items, ordered=ordered, **kwargs)
+ return result
+
+ def _update2db(self, items, tablename, ordered: bool = False, **kwargs) -> BulkWriteResult:
+ collection: Collection = self.db.get_collection(tablename)
+ bulk_results: BulkWriteResult = collection.bulk_write(items, ordered=ordered, **kwargs)
+ return bulk_results
diff --git a/science_article_cnki/science_article_cnki/items.py b/science_article_cnki/science_article_cnki/items.py
new file mode 100644
index 0000000..dfa31a9
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/items.py
@@ -0,0 +1,40 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class ScienceArticlCnkiItem(scrapy.Item):
+ # define the fields for your item here like:
+ # name = scrapy.Field()
+ pass
+
+
+class AddItemBase(scrapy.Item):
+ third_id = scrapy.Field()
+ 
updated_at = scrapy.Field() + + +class ArticleItem(AddItemBase): + exported = scrapy.Field() + + +class IdRelationItem(AddItemBase): + query_ids = scrapy.Field() + school_ids = scrapy.Field() + task_ids = scrapy.Field() + + +class ArticleCitedItem(AddItemBase): + cited = scrapy.Field() + + +class CnkiCitedNumberItem(ArticleCitedItem): + __tablename__ = 'relation_cited_number_cnki' + + """发文被引量item""" + third_id = scrapy.Field() + cited = scrapy.Field() + updated_at = scrapy.Field() diff --git a/science_article_add/science_article_add/middlewares.py b/science_article_cnki/science_article_cnki/middlewares.py similarity index 87% rename from science_article_add/science_article_add/middlewares.py rename to science_article_cnki/science_article_cnki/middlewares.py index 7a4e77b..fdd76df 100644 --- a/science_article_add/science_article_add/middlewares.py +++ b/science_article_cnki/science_article_cnki/middlewares.py @@ -9,7 +9,7 @@ from scrapy import signals from itemadapter import ItemAdapter -class ScienceArticleAddSpiderMiddleware: +class ScienceArticlCnkiSpiderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @@ -53,7 +53,7 @@ class ScienceArticleAddSpiderMiddleware: spider.logger.info("Spider opened: %s" % spider.name) -class ScienceArticleAddDownloaderMiddleware: +class ScienceArticlCnkiDownloaderMiddleware: # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @@ -100,14 +100,17 @@ class ScienceArticleAddDownloaderMiddleware: spider.logger.info("Spider opened: %s" % spider.name) -class WosLiteApiXkeyDownloaderMiddleware: - async def process_request(self, request, spider): - key_param = { - 'X-ApiKey': '941a216f25cbef0f80ee4ba58a08ef1e19dee7a4' - } - if not request.headers: - request.headers = key_param - return request +from scrapy.http.headers import Headers - request.headers.update(key_param) - return request + +class CnkiSearchHeadersDownloaderMiddleware: + def __init__(self, custom_headers: dict): + self.custom_headers = custom_headers + + @classmethod + def from_crawler(cls, crawler): + return cls(custom_headers=crawler.settings['SEARCH_REQUEST_HEADERS']) + + def process_request(self, request, spider): + request.headers = Headers(self.custom_headers) + return None diff --git a/science_article_cnki/science_article_cnki/models/__init__.py b/science_article_cnki/science_article_cnki/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/science_article_cnki/science_article_cnki/models/cnki_model.py b/science_article_cnki/science_article_cnki/models/cnki_model.py new file mode 100644 index 0000000..ee4cfd7 --- /dev/null +++ b/science_article_cnki/science_article_cnki/models/cnki_model.py @@ -0,0 +1,630 @@ +# -*- coding: utf-8 -*- +# @Time : 2023/2/9 14:24 +# @Author : zhaoxiangpeng +# @File : model.py +import re +import json +from typing import Union, List, Dict + +from datetime import datetime, timedelta +from science_article_cnki.models.enum_cls import ResourceType, SearchTypeId, SearchFieldEnum, OperatorEnum, LogicEnum, SearchFromId + +DB_CODE = { + 'CFLS': '总库', + 'CFLQ': '期刊', + 'CDMD': '学位论文', + 'CFLP': '会议', + 'CCND': '报纸' +} + + +def export2adv(query): + """ + 检索式的专业检索转高级检索 + :param query: + :return: + """ + if query.find('%=') != -1: + query = '(作者单位:%(name)s(模糊))' % {"name": query.split('%=')[-1]} + return query + + +def navigator_body(query: str = None, 
db_code: str = 'CFLS', **kwargs): + if query is None: + raise ValueError("query 不能为空,应为检索式,如:AF=上海交通大学,具体检索式见 " + "https://piccache.cnki.net/2022/kdn/index/helper/manual.html#frame2-1-5") + _param = { + 'queryJson': json.dumps({ + "Platform": "", + "DBCode": db_code, + "KuaKuCode": "CJFQ,CDMD,CIPD,CCND,CISD,SNAD,BDZK,CCJD,CCVD,CJFN", + "QNode": { + "QGroup": [ + {"Key": "Subject", "Title": "", "Logic": 4, "Items": [ + {"Key": "Expert", "Title": "", "Logic": 0, "Name": "", "Operate": "", "Value": query, + "ExtendType": 12, "ExtendValue": "中英文对照", "Value2": "", "BlurType": ""}], + "ChildItems": []}, + {"Key": "ControlGroup", "Title": "", "Logic": 1, "Items": [], "ChildItems": []} + ] + }, + "CodeLang": "" + }) + } + return _param + + +def signal_body(query: str = None, resource_type: str = 'JOURNAL', group_id: str = 'YE', **kwargs): + """ + 获取左侧导航栏单类目的聚合 + :group_id: 主要主题1; 学科2; 发表年度3; 研究层次4; 文献类型5; 文献来源6; 作者7; 机构8; 基金9 + :return: + """ + if query is None: + raise ValueError("query 不能为空,应为检索式,如:AF=上海交通大学,具体检索式见 " + "https://piccache.cnki.net/2022/kdn/index/helper/manual.html#frame2-1-5") + _param = { + 'queryJson': json.dumps({ + "Platform": "", + "Resource": ResourceType[resource_type].name, + "Classid": ResourceType[resource_type].value, + "Products": "", + "QNode": { + "QGroup": [{ + "Key": "Subject", + "Title": "", + "Logic": 0, + "Items": [{ + "Key": "Expert", + "Title": "", + "Logic": 0, + "Field": "EXPERT", + "Operator": 0, + "Value": query, + "Value2": "" + }], + "ChildItems": [] + }, { + "Key": "ControlGroup", + "Title": "", + "Logic": 0, + "Items": [], + "ChildItems": [] + }] + }, + "ExScope": "1", + "SearchType": SearchTypeId.GROUP.value, + "Rlang": "CHINESE", + "KuaKuCode": "" + }, ensure_ascii=False), + 'groupId': group_id + } + return _param + + +def refine_search(query: str, resource_type: str = 'JOURNAL', year=None, subject=None, code=None, **kwargs): + """ + 使用专业检索式,检索后再次检索年份 + """ + _query = { + "Platform": "", + "Resource": ResourceType[resource_type].name, + "Classid": ResourceType[resource_type].value, + "Products": "", + "QNode": { + "QGroup": [{ + "Key": "Subject", + "Title": "", + "Logic": 0, + "Items": [{ + "Key": "Expert", + "Title": "", + "Logic": 0, + "Field": "EXPERT", + "Operator": 0, + "Value": query, + "Value2": "" + }], + "ChildItems": [] + }, { + "Key": "ControlGroup", + "Title": "", + "Logic": 0, + "Items": [], + "ChildItems": [] + }] + }, + "ExScope": "1", + "SearchType": SearchTypeId.GROUP.value, + "Rlang": "CHINESE", + "KuaKuCode": "", + "View": "changeDBOnlyFT" + } + _group2 = { + "Key": "MutiGroup", + "Title": "", + "Logic": 0, + "Items": [], + "ChildItems": [] + } + if year: + year_param = { + "Key": "YE", + "Title": "", + "Logic": 0, + "Items": [{ + "Key": year, + "Title": "%s年" % year, + "Logic": 1, + "Field": "YE", + "Operator": "DEFAULT", + "Value": year, + "Value2": "", + "Name": "YE", + "ExtendType": 0 + }], + "ChildItems": [] + } + _group2['ChildItems'].append(year_param) + if subject: + subject_param = { + 'Key': '6', + 'Title': '', + 'Logic': 1, + 'Items': [{ + 'Key': code + '?', + 'Title': subject, + 'Logic': 2, + 'Name': '专题子栏目代码', + 'Operate': '', + 'Value': code + '?', + 'ExtendType': 14, + 'ExtendValue': '', + 'Value2': '', + 'BlurType': '' + }], + 'ChildItems': [] + } + _group2['ChildItems'].append(subject_param) + _query['QNode']['QGroup'].append(_group2) + return _query + + +def query_search(query_body, page: int = 1, handler_id: str = 18, sql: str = None, + sort: str = 'desc', sort_field: str = 'PT', **kwargs): + """ + 搜索请求body + 
:param query_body: 用来搜索的详细query, 与左侧导航body相同 + :param page: 请求的页码 + :param handler_id: 可能需要携带此参数, 在源码中获取 + :param sql: 源码中,一般不需要 + :param sort: 排序方式, desc/asc + :param sort_field: 排序字段, PT(发表时间)/CF(被引) + :return: + """ + if page == 1: + base_query = query_body.get("QNode", {}).get("QGroup", [{}])[0].get("Items", [{}])[0].get("Value") + aside = '( %s)' % base_query if page == 1 else '' + _query = { + "boolSearch": "true", + "QueryJson": json.dumps(query_body, ensure_ascii=False), + "pageNum": "1", + "pageSize": "50", + "dstyle": "listmode", + "boolSortSearch": "false", + "aside": aside, + "searchFrom": "资源范围:学术期刊; 仅看有全文,中英文扩展; 时间范围:更新时间:不限; 来源类别:全部期刊;", + "CurPage": "1" + } + else: + _query = { + 'boolSearch': "false", + 'QueryJson': json.dumps(query_body, ensure_ascii=False), + 'pageNum': page, + 'pageSize': 50, + 'sortField': sort_field, + 'sortType': sort, + 'dstyle': 'listmode', + 'boolSortSearch': "false", + # 'sentenceSearch': "false", + # 'productStr': 'YSTT4HG0,LSTPFY1C,RMJLXHZ3,JQIRZIYA,JUP3MUPD,1UR4K4HZ,BPBAFJ5S,R79MZMCB,MPMFIG1A,EMRPGLPA,J708GVCE,ML4DRIDX,WQ0UVIAA,NB3BWEHK,XVLO76FD,HR1YT1Z9,BLZOG7CK,PWFIRAGL,NN3FJMUV,NLBO1Z6R,', + 'aside': '', + 'searchFrom': '资源范围:学术期刊; 仅看有全文,中英文扩展; 时间范围:更新时间:不限; 来源类别:全部期刊;', + } + return _query + + +def get_cnki_export_data(ids: str): + """ + 表头 + 'SrcDatabase-来源库,Title-题名,Author-作者,Organ-单位,Source-文献来源,Keyword-关键词,Summary-摘要,PubTime-发表时间,FirstDuty-第一责任人,Fund-基金,Year-年,Volume-卷,Period-期,PageCount-页码,CLC-中图分类号,ISSN-国际标准刊号,URL-网址,DOI-DOI,', + :param ids: + :return: + """ + data = { + 'FileName': ids, + 'DisplayMode': 'selfDefine', + 'OrderParam': 0, + 'OrderType': 'desc', + 'SelectField': 'DB,TI,AU,AF,LY,KY,AB,PT,FI,FU,YE,JU,QI,PM,CLC,SN,ABSTRACT,DI,', + 'PageIndex': 1, + 'PageSize': 20, + 'language': 'CHS', + 'uniplatform': 'NZKPT', + 'Type': 'xls', + } + return data + + +def export_data(ids: str): + """ + https://kns.cnki.net/dm/manage/FileToText + :param ids: + :return: + """ + data = { + 'FileName': ids, + 'DisplayMode': 'selfDefine', + 'OrderParam': 0, + 'OrderType': 'desc', + 'SelectField': 'SrcDatabase-来源库,Title-题名,Author-作者,Organ-单位,Source-文献来源,Keyword-关键词,Summary-摘要,PubTime-发表时间,FirstDuty-第一责任人,Fund-基金,Year-年,Volume-卷,Period-期,PageCount-页码,CLC-中图分类号,ISSN-国际标准刊号,URL-网址,DOI-DOI,', + 'PageIndex': 1, + 'PageSize': 20, + 'language': 'CHS', + 'uniplatform': '', + 'Type': 'xls', + } + return data + + +def journal_nav_all(page: int = 1, page_size: int = 21): + model = dict( + searchStateJson=json.dumps( + {"StateID": "", "Platfrom": "", "QueryTime": "", "Account": "knavi", "ClientToken": "", "Language": "", + "CNode": {"PCode": "OYXNO5VW", "SMode": "", "OperateT": ""}, + "QNode": {"SelectT": "", "Select_Fields": "", "S_DBCodes": "", "QGroup": [], "OrderBy": "OTA|DESC", + "GroupBy": "", "Additon": ""}}, ensure_ascii=False), + displaymode=1, + pageindex=page, + pagecount=page_size, + index='JSTMWT6S', + searchType='刊名(曾用刊名)', + clickName='', + switchdata='' + ) + return model + + +def journal_article_by_year_issue(year_issue, page: int = 0, pcode: str = None): + """ + 获取期刊每一期的文章 + :param year_issue: + :param page: + :param pcode: + :return: + """ + if pcode is None: + pcode = 'CJFD,CCJD' + model = { + "yearIssue": year_issue, + "pageIdx": page, + "pcode": pcode + } + return model + + +def add_limit_2query_body(limit_query: Union[List[dict], dict], body_key: str, query_body: dict): + """ + 把limit添加到检索的queryJson中 + :param limit_query: + :param body_key: + :param query_body: + :return: + """ + # 判断组的key是否存在,不存在的话添加一个组 + if body_key not in {g["Key"] 
for g in query_body["QNode"]["QGroup"]}:
+ query_body["QNode"]["QGroup"].append({
+ "Key": body_key,
+ "Title": "",
+ "Logic": LogicEnum.AND.value,
+ "Items": [],
+ "ChildItems": []
+ })
+ # walk the groups and attach the limit to the matching one
+ for group in query_body["QNode"]["QGroup"]:
+ if group["Key"] == body_key:
+ if isinstance(limit_query, dict):
+ group["ChildItems"].append(limit_query)
+ elif isinstance(limit_query, list):
+ group["ChildItems"].extend(limit_query)
+ else:
+ raise ValueError("unsupported limit type \n%s" % limit_query)
+ break
+
+
+def parse_retrieval(query: str):
+ """
+ Parse an aside string into (logic, (field, word, mode)) pieces for building queryJson.
+ :param query:
+ :return:
+ """
+ def func(string: str):
+ stand = string[1:-1]  # strip the surrounding full-width parentheses
+ title, value = stand.split(":", maxsplit=1)  # split "作者单位:湖南中医药大学(模糊)" -> [作者单位, 湖南中医药大学(模糊)]
+ return title, value[:-4], value[-3:-1]
+ cond_list = re.split(r'(AND|NOT|OR)', query)
+ logic = 'AND'
+ content = cond_list[0]
+ yield logic, func(content)
+ for i in range(1, len(cond_list), 2):
+ chunk = cond_list[i:i + 2]  # take the operator and the condition that follows it
+ logic, content = chunk
+ yield logic, func(content)
+
+
+def add_search_word(search_content: str, base_query: dict = None):
+ """
+ Add advanced-search term clauses to a query body.
+ :param search_content: the aside string copied from the advanced-search page
+ :param base_query:
+ :return:
+ """
+ words_query = []
+ g = parse_retrieval(search_content)
+ i = 1
+ for logic, sequence in g:
+ field_name, word, way = sequence
+ input_select = "input[data-tipid=gradetxt-%(input_no)s]" % {"input_no": i}
+ logic_operator = LogicEnum[logic].value
+ q = {
+ "Key": input_select,
+ "Title": field_name,
+ "Logic": logic_operator,
+ "Items": [{
+ "Key": input_select,
+ "Title": field_name,
+ "Logic": logic_operator,
+ "Field": SearchFieldEnum(field_name).name,
+ "Operator": OperatorEnum[way].value,
+ "Value": word,
+ "Value2": ""
+ }],
+ "ChildItems": []
+ }
+ words_query.append(q)
+ i += 1
+
+ # when a query body is passed in, attach the term clauses automatically
+ if base_query:
+ add_limit_2query_body(words_query, "Subject", base_query)
+
+ return words_query
+
+
+def limit_year_range(year: int, base_query: dict = None):
+ """
+ Add a publication-year filter.
+ :param year:
+ :param base_query:
+ :return:
+ """
+ year = str(year)
+ ye_query = {
+ "Key": "YE",
+ "Title": "",
+ "Logic": 0,
+ "Items": [{
+ "Key": year,
+ "Title": "%s年" % year,
+ "Logic": 1,
+ "Field": "YE",
+ "Operator": "DEFAULT",
+ "Value": year,
+ "Value2": "",
+ "Name": "YE",
+ "ExtendType": 0
+ }],
+ "ChildItems": []
+ }
+ if base_query:
+ add_limit_2query_body(ye_query, "MutiGroup", base_query)
+
+ return ye_query
+
+
+def parse_updatedtime_symbol(symbol: str, today: str = None) -> tuple:
+ """
+ Resolve a time-range label into a (start, end) date pair.
+ :param symbol:
+ :param today:
+ :return:
+ """
+ if today and isinstance(today, str):
+ today = datetime.strptime(today, "%Y-%m-%d")
+ else:
+ today = datetime.now()
+ if symbol == "最近一周":
+ ago_day = today - timedelta(days=7)
+ elif symbol == "最近一月":
+ ago_day = today - timedelta(days=30)
+ elif symbol == "最近半年":
+ ago_day = today - timedelta(days=181)
+ elif symbol == "最近一年":
+ ago_day = today.replace(year=today.year-1)
+ elif symbol == "今年迄今":
+ ago_day = today.replace(month=1, day=1)
+ else:
+ ago_day = today
+ return ago_day.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d")
+
+
+def limit_updated_time(range_str: str, today: str = None, base_query: dict = None):
+ """
+ Build the updated-time filter clause.
+ :param range_str:
+ :param today:
+ :param base_query:
+ :return:
+ """
+ start_date, end_date = parse_updatedtime_symbol(range_str, today)
+ rt_query = {
+ "Key": ".tit-dropdown-box>.sort",
+ "Title": "",
+ "Logic": 0,
+ "Items": [
+ {
+ "Key": ".tit-dropdown-box>.sort",
+ "Title": "更新时间",
"Logic": 0, + "Field": "RT", + "Operator": 7, + "Value": start_date, + "Value2": end_date + } + ], + "ChildItems": [] + } + # 当base_query参数存在时,自动添加筛选日期范围的query + if base_query: + add_limit_2query_body(rt_query, "ControlGroup", base_query) + + return rt_query + + +def temp_refine_search( + query: str, + year: int = None, + updated_date: str = None, + resource_type: str = 'JOURNAL', + **kwargs +): + """ + 构造queryJson字段的值 + :param query: 检索式,例:(作者单位:湖南中医药大学(模糊))OR(作者单位:湖南中医学院(模糊)) + :param updated_date: 更新时间:不限、最近一周/一月/半年/一年、今年迄今、上一年度 + :param year: 指定筛选的年份,如果需要与updated_date参数同时使用,需要在限制更新时间后再筛选 + :param resource_type: + :param kwargs: + :return: + """ + _query = { + "Platform": "", + "Resource": ResourceType[resource_type].name, + "Classid": ResourceType[resource_type].value, + "Products": "", + "QNode": { + "QGroup": [ + { + "Key": "Subject", "Title": "", "Logic": 0, "Items": [], "ChildItems": [] + }, + { + "Key": "ControlGroup", "Title": "", "Logic": 0, "Items": [], "ChildItems": [] + } + ] + }, + "ExScope": "1", + "SearchType": 1, + "Rlang": "CHINESE", + "KuaKuCode": "", + "Expands": {}, + "View": "changeDBOnlyFT", + "SearchFrom": 1 + } + add_search_word(search_content=query, base_query=_query) + if updated_date and updated_date != "不限": + limit_updated_time(updated_date, base_query=_query) + if year: + limit_year_range(year=year, base_query=_query) + return _query + + +adv_refine_search = temp_refine_search + + +def temp_query_search(query_body, query: str = None, page: int = 1, page_size: int = 50, + sort: str = 'desc', sort_field: str = 'PT', updated_date: str = "不限", **kwargs): + """ + 搜索请求body + :param query_body: 用来搜索的详细query, 与左侧导航body相同 + :param query: aside/检索式字符串 + :param page: 请求的页码 + :param page_size: 每页的数量 + :param sort: 排序方式, desc/asc + :param sort_field: 排序字段, PT(发表时间)/CF(被引) + :param updated_date: 默认不限 + :return: + """ + page = str(page) + page_size = str(page_size) + if page == '1': + aside = query or '' + _query = { + "boolSearch": "true", + "QueryJson": json.dumps(query_body, ensure_ascii=False), + "pageNum": "1", + "pageSize": page_size, + 'sortField': sort_field, + 'sortType': sort, + "dstyle": "listmode", + "boolSortSearch": "false", + "aside": aside, + "searchFrom": "资源范围:学术期刊; 仅看有全文,中英文扩展; 时间范围:更新时间:%(updated_date)s; 来源类别:全部期刊; " % {"updated_date": updated_date}, + "subject": "", + "language": "", + "uniplatform": "", + "CurPage": "1" + } + else: + _query = { + 'boolSearch': "false", + 'QueryJson': json.dumps(query_body, ensure_ascii=False), + 'pageNum': page, + 'pageSize': page_size, + 'sortField': sort_field, + 'sortType': sort, + 'dstyle': 'listmode', + 'boolSortSearch': "false", + 'aside': '', + 'searchFrom': '资源范围:学术期刊; 时间范围:更新时间:%(updated_date)s; 来源类别:全部期刊; ' % {"updated_date": updated_date}, + "subject": "", + "language": "", + "uniplatform": "" + } + return _query + + +adv_query_search = temp_query_search + + +class SearchPaperArgModel: + pass + + +class briefParam: + @staticmethod + def getDbCode(): + return 'CFLS' + + @staticmethod + def getPageSize(isSearch): + return 50 + + @staticmethod + def getCurPage(): + return 1 + + @staticmethod + def getSearchPaperArgModel(isSearch, cPage): + argModel = {} + dbCode = briefParam.getDbCode() + pSize = briefParam.getPageSize(isSearch) + cPage = cPage if cPage else briefParam.getCurPage() + argModel = { + 'IsSearch': isSearch, + 'QueryJson': '' + } + + +if __name__ == '__main__': + print(SearchTypeId.GROUP) + print(add_search_word( + 
+ '(作者单位:湖南中医药大学(模糊))OR(作者单位:湖南中医学院(模糊))OR(篇名:基于PINK1LETM1信号通路探讨何首乌苷减轻脑缺血再灌注损伤的作用机制(精确))'))
diff --git a/science_article_cnki/science_article_cnki/models/enum_cls.py b/science_article_cnki/science_article_cnki/models/enum_cls.py
new file mode 100644
index 0000000..e1bc80b
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/models/enum_cls.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/5/13 10:41
+# @Author : zhaoxiangpeng
+# @File : enum_cls.py
+
+import enum
+from datetime import timedelta
+
+
+class ResourceType(enum.Enum):
+ """Resource (database) types"""
+ JOURNAL = "YSTT4HG0"  # academic journals
+ DISSERTATION = "LSTPFY1C"  # theses and dissertations
+ CONFERENCE = "JUP3MUPD"  # conference proceedings
+ NEWSPAPER = "MPMFIG1A"  # newspapers
+ ALMANAC = "HHCPM1F8"
+ BOOK = "EMRPGLPA"
+ PATENT = "VUDIXAIY"
+ STANDARD = "WQ0UVIAA"
+ ACHIEVEMENTS = "BLZOG7CK"
+
+
+class SearchTypeId(enum.Enum):
+ """CNKI search-type ids"""
+ ADV = 1
+ SIMPLE = 2
+ AUTHOR = 3
+ EXPERT = 4  # expert (professional) search
+ SENTENCE = 5
+ GROUP = 6  # referenced by cnki_model.signal_body / refine_search
+ '''
+ PAGE = 7
+ SORT = 8
+ ABSTRACT = 9
+ MORESENTENCE = 10
+ HISTORY = 11
+ SIZE = 12
+ RESULT = 13
+ ADVRESULT = 14
+ EXPERTRESULT = 15
+ AUTHORRESULT = 16
+ SENRESULT = 17
+ CROSSDBCHANGEDB = 18
+ COMBOHISTORY = 19
+ '''
+
+
+class SearchFromId(enum.Enum):
+ SEARCH = 1
+ GROUPSEARCH = 2
+ RESULT = 3
+ PAGE = 4
+ SORT = 5
+ CHANGEDB = 6
+ DISPLAYMODEL = 7
+ NAVISEARCH = 8
+ HISTORY = 9
+ COMBOHISTORY = 10
+ CROSSDBCHANGEDB = 11
+ CHANGELANG = 12
+ GROUP = 99
+
+
+class SearchFieldEnum(enum.Enum):
+ """Document metadata fields; the values are the Chinese labels CNKI uses in aside strings"""
+ SU = "主题"
+ TKA = "篇关摘"
+ TI = "篇名"
+ KY = "关键词"
+ AB = "摘要"
+ CO = "小标题"
+ FT = "全文"
+ AU = "作者"
+ FI = "第一作者"
+ RP = "通讯作者"
+ AF = "作者单位"
+ LY = "期刊名称"
+ RF = "参考文献"
+ FU = "基金"
+ CLC = "中图分类号"
+ SN = "ISSN"
+ CN = "CN"
+ DOI = "DOI"
+ QKLM = "栏目信息"
+ FAF = "第一单位"
+ CF = "被引频次"
+
+
+class OperatorEnum(enum.Enum):
+ # member names are the Chinese match-mode labels parsed from aside strings
+ 模糊 = "FUZZY"
+ 精确 = "DEFAULT"
+
+
+class OperatorTypeEnum(enum.Enum):
+ DEFAULT = 0
+ TOPRANK = 1
+ FUZZY = 2
+ GT = 3
+ GE = 4
+ LT = 5
+ LE = 6
+ BETWEEN = 7
+ FREQUENCY = 8
+ PREFIX = 9
+ SUFFIX = 10
+ CONTAINS = 11
+ NEAR = 12
+ SENTENCE = 13
+ IS = 14
+ FUZZYFREQUENCY = 15
+
+
+class LogicEnum(enum.Enum):
+ AND = 0
+ OR = 1
+ NOT = 2
+
+
+class UpdatedTimeEnum(enum.Enum):
+ """
+ Rough spans for the "recently updated" labels. Values must stay distinct,
+ otherwise Enum silently turns later members into aliases of 最近半年.
+ """
+ 最近一周 = timedelta(days=7)
+ 最近一月 = timedelta(days=30)
+ 最近半年 = timedelta(days=180)
+ 最近一年 = timedelta(days=365)
+ 今年迄今 = None  # dynamic span; resolved at runtime by parse_updatedtime_symbol
+
diff --git a/science_article_cnki/science_article_cnki/pipelines.py b/science_article_cnki/science_article_cnki/pipelines.py
new file mode 100644
index 0000000..715dc65
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/pipelines.py
@@ -0,0 +1,90 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+from pymongo import MongoClient
+from pymongo.errors import DuplicateKeyError
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+from science_article_cnki.db_utils.mongo import MongoDBUtils
+
+if TYPE_CHECKING:
+ from scrapy.crawler import Crawler
+ from scrapy.statscollectors import StatsCollector
+
+
+class ScienceArticlCnkiPipeline:
+ def process_item(self, item, spider):
+ return item
+
+
+mongo_logger = logging.getLogger('pymongo')
+mongo_logger.setLevel(logging.WARNING)
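+# Collection routing: items that define __tablename__ (e.g. CnkiCitedNumberItem
+# -> 'relation_cited_number_cnki') are written to that collection by
+# MongoPipeline below; anything else falls back to 'items_null_table'.
+# A stored document looks roughly like
+#   {"third_id": "<CNKI filename id>", "cited": 12, "updated_at": datetime(...)}
+# (illustrative values, not real data).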
+logger = logging.getLogger(__name__)
+
+
+class MongoPipeline(MongoDBUtils):
+ def __init__(self, mongo_uri, mongo_db, stats: StatsCollector):
+ super().__init__(mongo_uri, mongo_db)
+ self.stats: StatsCollector = stats
+ self.insert_failure_update_enable = True
+
+ @classmethod
+ def from_crawler(cls, crawler: Crawler):
+ return cls(
+ mongo_uri=crawler.settings.get("MONGO_URI"),
+ mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
+ stats=crawler.stats
+ )
+
+ def open_spider(self, spider):
+ self.client = MongoClient(self.mongo_uri)
+ self.db = self.client[self.mongo_db]
+
+ def process_item(self, item, spider):
+ # resolve the item's collection, insert it, and on a duplicate key
+ # fall back to updating the existing document
+ adapter = ItemAdapter(item)
+ item_type = self._get_item_type(item)
+ collection = self.db.get_collection(item_type)
+ d = adapter.asdict()
+ try:
+ collection.insert_one(d)
+ self.stats.inc_value("item2db_inserted/{}".format(item_type))
+ except DuplicateKeyError as duplicate_error:
+ if self.insert_failure_update_enable:
+ write_error = duplicate_error.details
+ key_pattern = write_error.get('keyPattern')
+ key_value = write_error.get('keyValue')
+ logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value)
+ # drop _id and the unique-key fields, then update the rest in place
+ d.pop("_id", None)
+ for k in key_pattern.keys():
+ d.pop(k, None)
+ collection.update_one(filter=key_value, update={"$set": d}, upsert=True)
+ self.stats.inc_value("item2db_updated/{}".format(item_type))
+ return item
+
+ def close_spider(self, spider):
+ self.client.close()
+
+ @staticmethod
+ def _get_item_type(item) -> str:
+ """Resolve the target collection name for an item."""
+ if hasattr(item, '__tablename__'):
+ return item.__class__.__tablename__
+ return 'items_null_table'
+
diff --git a/science_article_cnki/science_article_cnki/settings.py b/science_article_cnki/science_article_cnki/settings.py
new file mode 100644
index 0000000..7a7c421
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/settings.py
@@ -0,0 +1,105 @@
+# Scrapy settings for science_article_cnki project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "science_article_cnki" + +SPIDER_MODULES = ["science_article_cnki.spiders"] +NEWSPIDER_MODULE = "science_article_cnki.spiders" + +ADDONS = {} + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Concurrency and throttling settings +#CONCURRENT_REQUESTS = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 1 +DOWNLOAD_DELAY = 3 + +# Disable cookies (enabled by default) +COOKIES_ENABLED = True + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +#} +SEARCH_REQUEST_HEADERS = { + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Cookie': 'Hm_lvt_dcec09ba2227fd02c55623c1bb82776a=1739256689; UM_distinctid=197b0769b48ea3-0de0b4b2dd761f-26001051-1fa400-197b0769b49cc6; Ecp_ClientId=e250627180800765334; Ecp_ClientIp=111.186.53.36; cnkiUserKey=1b8e7dbe-3c98-864f-2b80-84b544af32af; _c_WBKFRo=UO8UFAxWLjMjlOxhuKvmtkZ4yYaXr8dPZXuhVFea; Ecp_loginuserbk=SJTU; tfstk=g5GqYEZ0ZId4NHSWG0FNzQCb6QNYs5-QjfZ_SV0gloqDDdFa7uoTCSMjSA5ZJuEOhdn6_lmxYPZ0DxMNb0nUXt99nAPZ2q5jhfuO_P0iXEE6kLgxk5FMAHTBOq3vhen9f3NMS4V_773PuGuxk5Q-60hJAqQN2mSLS5mgZz4gS540ItYPZPqliPf0SgYzWuVgSrX0ZT4_uGb0Sc0kzPEuolmgsUPu2PVgjcViG50mS_zQnU-thdfV8NPaxqqPs67Lu-cB9u5Mabzqzugc-1fiaryqZpcfbM2jI2eKGqONwSgEE74qjBx0ex0r_Jh9Csg0ZoPxa-bMXocxSfPYTNAmzSr4KbwXO1mnzVDQUbTH9SP0mANx5w-jzjojkbu1STV4GYyEgWAdmlMS8fzZ6hdrYqDnjASP1GUobXlt3GXanzUzAU8z4y3oBzrYp_6OB8VLzkTblOBTnzUzAU8PBOeu2zrBlr1..; Ecp_session=1; SID_sug=018104; knsLeftGroupSelectItem=; dsorders=CF; dsortypes=cur%20DESC; 
knsadv-searchtype=%7B%22BLZOG7CK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22MPMFIG1A%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22T2VC03OH%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JQIRZIYA%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22S81HNSV3%22%3A%22gradeSearch%22%2C%22YSTT4HG0%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22ML4DRIDX%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WQ0UVIAA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22VUDIXAIY%22%3A%22gradeSearch%2CmajorSearch%22%2C%22LIQN9Z3G%22%3A%22gradeSearch%22%2C%22NN3FJMUV%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22LSTPFY1C%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HHCPM1F8%22%3A%22gradeSearch%2CmajorSearch%22%2C%22OORPU5FE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WD0FTY92%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22BPBAFJ5S%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22EMRPGLPA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22PWFIRAGL%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22U8J8LYLV%22%3A%22gradeSearch%2CmajorSearch%22%2C%22R79MZMCB%22%3A%22gradeSearch%22%2C%22J708GVCE%22%3A%22gradeSearch%2CmajorSearch%22%2C%228JBZLDJQ%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HR1YT1Z9%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JUP3MUPD%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NLBO1Z6R%22%3A%22gradeSearch%2CmajorSearch%22%2C%22RMJLXHZ3%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%221UR4K4HZ%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NB3BWEHK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22XVLO76FD%22%3A%22gradeSearch%2CmajorSearch%22%7D; Ecp_IpLoginFail=25121149.65.252.186; SID_kns_new=kns018106; SID_restapi=kns018110; KNS2COOKIE=1765437722.656.114388.232155|b25e41a932fd162af3b8c5cff4059fc3; dblang=both; createtime-advInput=2025-12-11%2015%3A22%3A21; searchTimeFlags=1',
+ 'Origin': 'https://kns.cnki.net',
+ 'Referer': 'https://kns.cnki.net/kns8s/AdvSearch?crossids=YSTT4HG0%2CLSTPFY1C%2CJUP3MUPD%2CMPMFIG1A%2CWQ0UVIAA%2CBLZOG7CK%2CPWFIRAGL%2CEMRPGLPA%2CNLBO1Z6R%2CNN3FJMUV',
+ 'User-Agent': USER_AGENT,
+}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+# "science_article_cnki.middlewares.ScienceArticlCnkiSpiderMiddleware": 543,
+#}
+
+RETRY_ENABLED = True
+RETRY_TIMES = 2  # retry failed requests up to 2 times
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # adds some common error codes
+DOWNLOADER_MIDDLEWARES = {
+ 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550
+}
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+# "science_article_cnki.middlewares.ScienceArticlCnkiDownloaderMiddleware": 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+# "scrapy.extensions.telnet.TelnetConsole": None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+# "science_article_cnki.pipelines.ScienceArticlCnkiPipeline": 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = "httpcache"
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+FEED_EXPORT_ENCODING = "utf-8"
+
+
+MONGO_URI = "mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true"
+MONGO_DATABASE = 'science2'
diff --git a/science_article_cnki/science_article_cnki/spiders/__init__.py b/science_article_cnki/science_article_cnki/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/science_article_cnki/science_article_cnki/spiders/cnki_cited_number.py b/science_article_cnki/science_article_cnki/spiders/cnki_cited_number.py
new file mode 100644
index 0000000..5791cce
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/spiders/cnki_cited_number.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+import math
+from copy import deepcopy
+from datetime import datetime
+from typing import TYPE_CHECKING, Any, Self
+
+import scrapy
+from science_article_cnki.items import CnkiCitedNumberItem
+from science_article_cnki.utils.tools import str2int
+from science_article_cnki.models import cnki_model as model
+from science_article_cnki.configs import cnki as config
+
+if TYPE_CHECKING:
+ from scrapy.crawler import Crawler
+
+
+class CnkiCitedNumberSpider(scrapy.Spider):
+ name = "cnki_cited_number"
+ custom_settings = dict(
+ DEFAULT_REQUEST_HEADERS={
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "Accept-Language": "en",
+ },
+ DOWNLOADER_MIDDLEWARES={
+ "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
+ },
+ ITEM_PIPELINES={
+ "science_article_cnki.pipelines.MongoPipeline": 300,
+ # "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
+ },
+ LOG_LEVEL="INFO"
+ )
+
+ @classmethod
+ def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any) -> Self:
+ # hook for custom logic, e.g. falling back to task params read from
+ # the database when none are passed in
+ return super().from_crawler(crawler, *args, **kwargs)
+
+ def __init__(self, query: str = None, resource_type: str = "JOURNAL", query_condition: dict = None, **kwargs: Any):
+ super().__init__(**kwargs)
+ self.query = query
+ self.resource_type = resource_type
+ self.sort_field = 'CF'
+ self.query_condition = query_condition or {}  # optional extra filters, e.g. {'year': '2021'}
+ self.page_size = 50
+
+ async def start(self):
+ m = dict(query=self.query, resource_type=self.resource_type, page=1, sort_field=self.sort_field, **self.query_condition)
+ query_body = model.adv_refine_search(**m)
+ search_param = model.adv_query_search(query_body, **m)
+ yield scrapy.FormRequest(
+ url=config.CNKI_ADV_SEARCH_API, method="POST", formdata=search_param, meta=m
+ )
+
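+ # A sketch of the kwargs/meta dict driving each search request (values
+ # mirror the __main__ example at the bottom of this file; 'year' comes
+ # from query_condition and is optional):
+ #   m = {'query': '(作者单位:西安建筑科技大学(模糊))', 'resource_type': 'JOURNAL',
+ #        'page': 1, 'sort_field': 'CF', 'year': '2026'}
+ # model.adv_refine_search(**m) builds the queryJson dict and
+ # model.adv_query_search(query_body, **m) wraps it into the form fields
+ # POSTed to config.CNKI_ADV_SEARCH_API; parse() below reuses the same
+ # pair to request the following pages.
+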
+ def parse(self, response, **kwargs):
+ meta = response.meta
+ # -------------------- work out how many result pages there are --------------------
+ # pull the total hit count from the pager
+ total_prm = response.xpath('//span[@class="pagerTitleCell"]/em/text()').get()
+ if not total_prm:
+ self.logger.warning("""
+ query {query}
+ response {resp}""".format(query=meta.get('query'), resp=response.body))
+ return
+ total = str2int(total_prm.replace(',', ''))  # normalize "12,345" and convert to int
+ # total number of pages
+ max_page = math.ceil(total / self.page_size)
+ meta['max_page'] = max_page
+ batch_time = datetime.now()
+ tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr')
+ for tr_node in tr_nodes:
+ third_id = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-filename').get()  # third-party (CNKI) id
+ cited_str = tr_node.xpath('./td[@class="quote"]/span/a/text()').get()  # citation count
+ # rows without a citation link (uncited articles) are skipped
+ if third_id and cited_str:
+ cited_item = CnkiCitedNumberItem()
+ cited_item['third_id'] = third_id
+ cited_item['cited'] = str2int(cited_str, 0)
+ cited_item['updated_at'] = batch_time
+ yield cited_item
+
+ meta_copy: dict = deepcopy(meta)
+ meta_copy['page'] += 1
+ if meta_copy['page'] > max_page:
+ return  # last page reached, stop paginating
+ query_body = model.adv_refine_search(**meta_copy)
+ search_param = model.adv_query_search(query_body, **meta_copy)
+ yield scrapy.FormRequest(
+ url=config.CNKI_ADV_SEARCH_API, method="POST", formdata=search_param,
+ meta=meta_copy
+ )
+
+
+if __name__ == '__main__':
+ from scrapy.crawler import CrawlerProcess
+ from scrapy.utils.project import get_project_settings
+
+ process = CrawlerProcess(get_project_settings())
+ task_params = dict()
+ task_params.setdefault('query', '(作者单位:西安建筑科技大学(模糊))')
+ task_params.setdefault('query_condition', {'year': '2026'})
+ process.crawl(CnkiCitedNumberSpider, **task_params)
+ process.start()  # blocks until all crawls finish
diff --git a/science_article_cnki/science_article_cnki/spiders/example.py b/science_article_cnki/science_article_cnki/spiders/example.py
new file mode 100644
index 0000000..9c0ab11
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/spiders/example.py
@@ -0,0 +1,10 @@
+import scrapy
+
+
+class ExampleSpider(scrapy.Spider):
+ name = "example"
+ allowed_domains = ["example.com"]
+ start_urls = ["https://example.com"]
+
+ def parse(self, response):
+ pass
diff --git a/science_article_cnki/science_article_cnki/utils/__init__.py b/science_article_cnki/science_article_cnki/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/science_article_cnki/science_article_cnki/utils/tools.py b/science_article_cnki/science_article_cnki/utils/tools.py
new file mode 100644
index 0000000..e56525f
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/utils/tools.py
@@ -0,0 +1,17 @@
+from datetime import datetime
+
+
+def str2int(val, replace=0):
+ """Coerce val to int, falling back to replace on bad input."""
+ try:
+ val = int(val)
+ except (ValueError, TypeError):
+ val = replace
+ return val
+
+
+def get_today_date(fmt: str = "%Y-%m-%d"):
+ return datetime.today().strftime(fmt)
+
diff --git a/science_article_cnki/scrapy.cfg b/science_article_cnki/scrapy.cfg
new file mode 100644
index 0000000..fa7d725
--- /dev/null
+++ b/science_article_cnki/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = science_article_cnki.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = science_article_cnki
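-- 
A minimal sketch of reading back the citation counts this patch collects (a
note after the trailer, not part of the commit). It assumes the MONGO_DATABASE
value 'science2' from settings.py and the 'relation_cited_number_cnki'
collection named by CnkiCitedNumberItem.__tablename__; the connection URI
below is a placeholder, not the real deployment:

    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")  # placeholder URI
    col = client["science2"]["relation_cited_number_cnki"]
    # ten most-cited articles gathered by the cnki_cited_number spider
    for doc in col.find().sort("cited", -1).limit(10):
        print(doc["third_id"], doc["cited"], doc["updated_at"])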