cnki:来源类型打标签

main
zhaoxiangpeng 4 weeks ago
parent bedba6c83f
commit 3507ba07ae

@ -0,0 +1,128 @@
from __future__ import annotations
import math
from copy import deepcopy
from datetime import datetime
from typing import TYPE_CHECKING, Any, Self
from pprint import pformat
import scrapy
from science_article_cnki.items import CnkiArticeSourceItem
from science_article_cnki.models.enum_cls import SingleResultEnum
from science_article_cnki.models import cnki_model as model
from science_article_cnki.utils.tools import str2int
from science_article_cnki.configs import cnki as config
class CnkiArticleTagSourceSpider(scrapy.Spider):
    """Tag CNKI articles with the source category they are listed under.

    Crawl flow:

    1. ``start`` posts the base query to the single-result grouping API to
       obtain the list of categories of the configured ``group``.
    2. ``parse`` turns every category into an additional filter and issues
       one advanced-search request per category.
    3. ``parse_result`` walks the result pages of each category and yields
       one ``CnkiArticeSourceItem`` per row, carrying the article id and the
       category text as its source type.
    """

    name = "cnki_article_tag_source"
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
        },
        ITEM_PIPELINES={
            "science_article_cnki.pipelines.MongoPipeline": 300,
            # "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
        },
        LOG_LEVEL="INFO"
    )

    # Search expression. Declared without a default, so it has to be set on
    # the spider before ``start`` runs.
    query: str
    # Resource type passed to the query builder ("学术期刊" = academic journals).
    resource_type: str = "学术期刊"
    # Member name looked up in ``SingleResultEnum`` ("来源类别" = source category).
    group: str = "来源类别"
    # Extra keyword arguments merged into the base query. Declared without a
    # default as well.
    query_condition: dict

    async def start(self):
        """Request the grouping page that lists the categories of ``group``.

        The query parameters are stored in ``meta["REQUEST_Q"]`` so later
        callbacks can rebuild and extend the same search. No callback is set,
        so the response is handled by ``parse``.
        """
        # Fetch the aggregation page first to learn which categories exist
        # and which filter code identifies each of them.
        m = dict(query=self.query, resource_type=self.resource_type, page=1,
                 **self.query_condition)
        m.update(filters=[])
        query_body = model.adv_refine_search(**m)
        form_d = model.single_result_nav(query_body, groupId=SingleResultEnum[self.group])
        yield scrapy.FormRequest(url=config.SIGNAL_RESULT_API, method="POST",
                                 formdata=form_d, meta=dict(REQUEST_Q=m))

    def parse(self, response, **kwargs):
        """Issue one filtered search request per category on the grouping page.

        Each ``<li>`` of the result list describes one category through an
        ``<input>`` element. The category becomes a filter appended to a copy
        of the base query, and the category text is stored under
        ``source_types`` for use by ``parse_result``.

        The hit count shown next to each category is not read here:
        ``parse_result`` derives the page count from the result page itself.

        Yields:
            ``scrapy.FormRequest`` objects handled by ``parse_result``.
        """
        request_q = response.meta["REQUEST_Q"]
        project_title = response.xpath('//dd/@tit').get()  # name of the grouping

        priority = 0
        for node in response.xpath('//div[@class="resultlist"]/ul/li'):
            s_code = node.xpath('./input/@value').get()  # code used for filtering
            s_text = node.xpath('./input/@text').get()  # text of the filter option
            if not s_code or not s_text:
                # Without both values the filter cannot be built and every
                # item of this category would be tagged with ``None``.
                self.logger.warning(
                    "Skipping category entry without filter code/text: %s", node.get()
                )
                continue

            group_filter = dict(
                project=project_title,
                value=s_code,
                text_or_title=s_text,
            )
            self.logger.info("组: %s", pformat(group_filter))

            # Work on a copy so sibling categories do not share filter state.
            q_bak: dict = deepcopy(request_q)
            q_bak.update(group_filter)
            q_bak.update(source_types=s_text)
            q_bak.setdefault('filters', []).append(group_filter)

            query_body = model.adv_refine_search(**q_bak)
            # The return value is ignored; presumably this extends query_body
            # in place - TODO confirm against model.add_muti_group.
            model.add_muti_group(**q_bak,
                                 base_query=query_body)
            form_d = model.adv_query_search(query_body, **q_bak)

            priority -= 100  # give every category its own priority level
            yield scrapy.FormRequest(
                url=config.CNKI_ADV_SEARCH_API, method="POST",
                formdata=form_d, priority=priority,
                callback=self.parse_result,
                meta=dict(REQUEST_Q=q_bak)
            )

    def parse_result(self, response, **kwargs):
        """Yield tagged article ids from one result page and request the next.

        The total hit count is read from the pager; when it is missing the
        decoded response is logged and the category is abandoned. The number
        of pages is computed on the first page and carried along in
        ``REQUEST_Q["max_page"]``.

        Yields:
            ``CnkiArticeSourceItem`` for every row that exposes an article
            id, then a ``scrapy.FormRequest`` for the following page, issued
            with the same priority as the current request.
        """
        priority = response.request.priority
        request_q = response.meta["REQUEST_Q"]
        current_page = request_q.get("page", 1)

        msg = """当前检索: %(query)s,\n筛选项: %(filters)s,\n页数: %(page)s"""
        kws = {
            "query": request_q.get("query"),
            "filters": pformat(request_q.get("filters", [])),
            "page": '{c}/{m}'.format(c=current_page, m=request_q.get("max_page", 'null'))
        }
        self.logger.info(msg % kws)

        # Extract the number of search results.
        total_prm = response.xpath('//span[@class="pagerTitleCell"]/em/text()').get()
        if not total_prm:
            # Log the decoded text: the raw bytes repr is unreadable for
            # non-ASCII pages.
            self.logger.warning("响应body: \n{resp}".format(resp=response.text))
            return

        if current_page == 1 or "max_page" not in request_q:
            # Strip thousands separators before converting. The second
            # argument is presumably the fallback for unparsable input -
            # confirm against str2int.
            total = str2int(total_prm.replace(',', ''), 0)
            # Number of pages needed to cover all hits.
            request_q['max_page'] = math.ceil(total / config.BATCH_SEARCH_RESULT_LIMIT)

        tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr')
        for tr_node in tr_nodes:
            # Third-party article id, taken from the "collect" icon link.
            third_id = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-filename').get()
            if third_id:
                st_item = CnkiArticeSourceItem()
                st_item['third_id'] = third_id
                st_item['source_types'] = [request_q.get("source_types")]
                yield st_item

        q_bak: dict = deepcopy(request_q)
        q_bak['page'] = current_page + 1
        if q_bak['page'] > q_bak['max_page']:
            self.logger.info("当前采集结束")
            return

        query_body = model.adv_refine_search(**q_bak)
        # Return value ignored, as in ``parse``.
        model.add_muti_group(**q_bak,
                             base_query=query_body)
        search_param = model.adv_query_search(query_body, **q_bak)
        yield scrapy.FormRequest(
            url=config.CNKI_ADV_SEARCH_API, method="POST",
            formdata=search_param, priority=priority,
            callback=self.parse_result,
            meta=dict(REQUEST_Q=q_bak)
        )
Loading…
Cancel
Save