From 3507ba07ae4182873d47d30c8fb0cb707b67c464 Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Wed, 7 Jan 2026 17:55:43 +0800
Subject: [PATCH] cnki: tag articles with their source type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../spiders/cnki_article_tag_source.py       | 123 ++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 science_article_cnki/science_article_cnki/spiders/cnki_article_tag_source.py

diff --git a/science_article_cnki/science_article_cnki/spiders/cnki_article_tag_source.py b/science_article_cnki/science_article_cnki/spiders/cnki_article_tag_source.py
new file mode 100644
index 0000000..0ec941c
--- /dev/null
+++ b/science_article_cnki/science_article_cnki/spiders/cnki_article_tag_source.py
@@ -0,0 +1,123 @@
+from __future__ import annotations
+import math
+from copy import deepcopy
+from pprint import pformat
+import scrapy
+from science_article_cnki.items import CnkiArticeSourceItem
+from science_article_cnki.models.enum_cls import SingleResultEnum
+from science_article_cnki.models import cnki_model as model
+from science_article_cnki.utils.tools import str2int
+from science_article_cnki.configs import cnki as config
+
+
+class CnkiArticleTagSourceSpider(scrapy.Spider):
+    name = "cnki_article_tag_source"
+    custom_settings = dict(
+        DOWNLOADER_MIDDLEWARES={
+            "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
+        },
+        ITEM_PIPELINES={
+            "science_article_cnki.pipelines.MongoPipeline": 300,
+            # "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
+        },
+        LOG_LEVEL="INFO",
+    )
+
+    query: str
+    resource_type: str = "学术期刊"  # "academic journals": the literal value the CNKI API expects
+    group: str = "来源类别"  # "source category": used as the key into SingleResultEnum
+    query_condition: dict
+
+    async def start(self):
+        # First fetch one group's aggregation page to see how the articles of each source category are flagged.
+        m = dict(query=self.query, resource_type=self.resource_type, page=1,
+                 **self.query_condition)
+        m.update(filters=[])
+        query_body = model.adv_refine_search(**m)
+        form_d = model.single_result_nav(query_body, groupId=SingleResultEnum[self.group])
+        yield scrapy.FormRequest(url=config.SIGNAL_RESULT_API, method="POST",
+                                 formdata=form_d, meta=dict(REQUEST_Q=m))
+
+    def parse(self, response, **kwargs):
+        request_q = response.meta["REQUEST_Q"]
+        project_field = response.xpath('//dd/@field').get()  # identifier of the single category
+        project_title = response.xpath('//dd/@tit').get()  # display name of the single category
+        nodes = response.xpath('//div[@class="resultlist"]/ul/li')
+        priority = 0
+        for node in nodes:
+            # Parse the entries of the grouped result list.
+            s_code = node.xpath('./input/@value').get()  # input code used for filtering
+            s_text = node.xpath('./input/@text').get()  # value of the filter option
+            s_title = node.xpath('./input/@title').get()  # displayed value (currently identical to the filter value)
+            total_prm = node.xpath('./span/text()').re_first(r'\((.*?)\)')
+            s_total = str2int((total_prm or '0').replace(',', ''), 0)  # the count may be absent
+            # max_page = math.ceil(s_total / config.BATCH_SEARCH_RESULT_LIMIT)
+            # request_q['max_page'] = max_page
+
+            group = dict(
+                project=project_title,
+                value=s_code,
+                text_or_title=s_text,
+            )
+            self.logger.info("group: %s", pformat(group))
+
+            q_bak: dict = deepcopy(request_q)
+            q_bak.update(group)
+            q_bak.update(source_types=s_text)
+            q_bak.setdefault('filters', []).append(group)
+
+            query_body = model.adv_refine_search(**q_bak)
+            model.add_muti_group(**q_bak, base_query=query_body)
+            form_d = model.adv_query_search(query_body, **q_bak)
+            priority -= 100  # give each category its own priority band
+            yield scrapy.FormRequest(
+                url=config.CNKI_ADV_SEARCH_API, method="POST",
+                formdata=form_d, priority=priority,
+                callback=self.parse_result,
+                meta=dict(REQUEST_Q=q_bak)
+            )
+
+    def parse_result(self, response, **kwargs):
+        priority = response.request.priority
+        request_q = response.meta["REQUEST_Q"]
+        msg = """current query: %(query)s,\nfilters: %(filters)s,\npage: %(page)s"""
+        kws = {
+            "query": request_q.get("query"),
+            "filters": pformat(request_q.get("filters", [])),
+            "page": '{c}/{m}'.format(c=request_q.get("page", 1), m=request_q.get("max_page", 'null'))
+        }
+        self.logger.info(msg % kws)
+
+        # Extract the result count of the search.
+        total_prm = response.xpath('//span[@class="pagerTitleCell"]/em/text()').get()
+        if not total_prm:
+            self.logger.warning("response body: \n%s", response.body)
+            return
+        if request_q.get("page", 1) == 1:
+            total = str2int(total_prm.replace(',', ''))  # normalize the count string and convert to int
+            # Compute how many pages there are in total.
+            max_page = math.ceil(total / config.BATCH_SEARCH_RESULT_LIMIT)
+            request_q['max_page'] = max_page
+        tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr')
+        for tr_node in tr_nodes:
+            third_id = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-filename').get()  # third-party id
+            if third_id:
+                st_item = CnkiArticeSourceItem()
+                st_item['third_id'] = third_id
+                st_item['source_types'] = [request_q.get("source_types")]
+                yield st_item
+
+        q_bak: dict = deepcopy(request_q)
+        q_bak['page'] += 1
+        if q_bak['page'] > q_bak['max_page']:
+            self.logger.info("finished collecting the current category")
+            return
+        query_body = model.adv_refine_search(**q_bak)
+        model.add_muti_group(**q_bak, base_query=query_body)
+        search_param = model.adv_query_search(query_body, **q_bak)
+        yield scrapy.FormRequest(
+            url=config.CNKI_ADV_SEARCH_API, method="POST",
+            formdata=search_param, priority=priority,
+            callback=self.parse_result,
+            meta=dict(REQUEST_Q=q_bak)
+        )
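
Note: str2int is imported from science_article_cnki.utils.tools, but its
definition is not part of this patch. The two call sites above,
str2int(s, 0) and str2int(s), suggest a tolerant parser with an optional
default; a minimal sketch under that assumption (not the project's actual
helper):

    def str2int(value, default=0):
        # Parse a numeric string such as "1234" (commas are already
        # stripped by the callers); fall back to `default` on None or
        # any non-numeric input.
        try:
            return int(str(value).strip())
        except (TypeError, ValueError):
            return default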
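
The spider declares query and query_condition as required attributes with
no defaults, and query_condition must be a dict, so it cannot be supplied
through `scrapy crawl -a`, which only forwards strings. A minimal launch
sketch, assuming the project settings are importable; the query expression
and the empty condition dict are hypothetical placeholders. Note that the
async start() entry point requires Scrapy 2.13 or later; on older versions
it would have to be renamed start_requests().

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from science_article_cnki.spiders.cnki_article_tag_source import (
        CnkiArticleTagSourceSpider,
    )

    process = CrawlerProcess(get_project_settings())
    process.crawl(
        CnkiArticleTagSourceSpider,
        query="SU=深度学习",  # hypothetical CNKI search expression
        query_condition={},   # hypothetical extra refine conditions
    )
    process.start()  # blocks until the crawl finishes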