@@ -0,0 +1,183 @@
from __future__ import annotations

import math
from copy import deepcopy
from datetime import datetime
from typing import TYPE_CHECKING, Any, Self
from pprint import pformat

import scrapy

from science_article_cnki.items import CnkiIdRelationItem, CnkiArticleTodoIdItem, CnkiCitedNumberItem
from science_article_cnki.models.enum_cls import SingleResultEnum
from science_article_cnki.models import cnki_model as model
from science_article_cnki.utils import tools
from science_article_cnki.utils.tools import parse_datetime, add_year2item
from science_article_cnki.utils.ti_match_id import ti2format, ti2unique_type2
from science_article_cnki.configs import cnki as config


class CnkiArticleCrossdbSpider(scrapy.Spider):
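    """Crawl one query against CNKI's cross-database ("总库") advanced search.

    Walks every result page and, for each article row, emits a
    CnkiIdRelationItem (query-to-article relation), a CnkiCitedNumberItem
    (citation count, when present) and a CnkiArticleTodoIdItem (a todo
    marker for the detail-page crawl).
    """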
    name = "cnki_article_crossdb"
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
        },
        ITEM_PIPELINES={
            "science_article_cnki.pipelines.MongoPipeline": 300,
            "science_article_cnki.pipelines.DupTodoPipeline": 310,
            # "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
        },
        # LOG_LEVEL="INFO"
    )
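    # Items pass through ITEM_PIPELINES in ascending order of the attached value,
    # so MongoPipeline (300) runs before DupTodoPipeline (310), which presumably
    # deduplicates todo items, judging by its name.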
    source = 'cnki'

    # "总库" is CNKI's "all databases" scope; the literal is sent to the search
    # API verbatim, so it stays in Chinese.
    resource_type: str = "总库"

    query_id: int
    query: str
    filters: list = list()

    def open_spider(self):
        """Placeholder hook; no per-spider setup is needed yet."""
        pass

    async def start(self):
        m = dict(query=self.query, resource_type=self.resource_type, page=1)
        m.update(filters=self.filters)
        query_body = model.adv_refine_search(**m)
        # Merge the filter options into the query body
        model.add_muti_filters(base_query=query_body, filters=m.get("filters"))
        form_d = model.adv_query_search(query_body, **m)
        yield scrapy.FormRequest(url=config.CNKI_ADV_SEARCH_API, method="POST",
                                 formdata=form_d, meta=dict(REQUEST_Q=m))

    def parse(self, response, **kwargs):
        """
        The first search request is parsed by this callback.
        """
        request_q = response.meta["REQUEST_Q"]
        msg = """Current query: %(query)s,\nfilters: %(filters)s,\npage: %(page)s"""
        kws = {
            "query": request_q.get("query"),
            "filters": pformat(request_q.get("filters", [])),
            "page": '{c}/{m}'.format(c=request_q.get("page", 1), m=request_q.get("max_page", 'null'))
        }
        self.logger.info(msg % kws)

        # -------------------------------------------- Page-count computation --------------------------------------------
        # Extract the total result count
        total_prm = response.xpath('//span[@class="pagerTitleCell"]/em/text()').get()
        if not total_prm:
            return
        total = tools.str2int(total_prm.replace(',', ''))  # strip the thousands separator and convert to int

        # Compute the total number of pages
        max_page = math.ceil(total / config.BATCH_SEARCH_RESULT_LIMIT)
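        # e.g. with 50 rows per result page (the 6000/50 = 120 note in the
        # pagination logic below implies BATCH_SEARCH_RESULT_LIMIT == 50),
        # total = 6,321 gives ceil(6321 / 50) = 127 pages.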
        request_q['max_page'] = max_page
        batch_time = datetime.now()
        # ---------------------------------------------- Result-row extraction ----------------------------------------------
        tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr')
        for tr_node in tr_nodes:
            article_title = tr_node.xpath('./td[@class="name"]/a//text()').getall()  # article title (fragments)
            article_title = article_title and ''.join(article_title)
            article_link = tr_node.xpath('./td[@class="name"]/a/@href').get()  # article link (contains the v value)
            source_title = tr_node.xpath('./td[@class="source"]/*/a/text()').get()  # publication name (journal title)
            db_name = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-dbname').get()  # source database
            third_id = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-filename').get()  # third-party id
            cited_str = tr_node.xpath('./td[@class="quote"]/span/a/text()').get()  # citation-count string

            param = tools.url_parse(article_link)
            v = param.get('v')
            ti_format = ti2format(article_title)
            ti_unique = ti2unique_type2(ti=ti_format, so=source_title)

            if third_id:
                relation_item = CnkiIdRelationItem()
                relation_item['third_id'] = third_id
                relation_item['query_ids'] = [self.query_id]
                # Attach the year to the relation item
                add_year2item(relation_item, request_q.get("year"), tr_node.xpath('./td[@class="date"]/text()').get())
                relation_item['updated_at'] = batch_time
                yield relation_item

                if cited_str:
                    cited_item = CnkiCitedNumberItem(**dict(third_id=third_id, cited=tools.str2int(cited_str, 0), updated_at=batch_time))
                    yield cited_item
                yield CnkiArticleTodoIdItem(**dict(third_id=third_id, db_code=db_name, ti=ti_unique, v=v, state=0))

        # Schedule the next page only when one exists
        if request_q['page'] < max_page:
            q_bak: dict = deepcopy(request_q)
            q_bak['page'] += 1
            query_body = model.adv_refine_search(**q_bak)
            model.add_muti_filters(base_query=query_body, filters=q_bak.get("filters"))
            search_param = model.adv_query_search(query_body, **q_bak)
            yield scrapy.FormRequest(
                url=config.CNKI_ADV_SEARCH_API, method="POST",
                formdata=search_param,
                callback=self.parse_other_page,
                meta=dict(REQUEST_Q=q_bak)
            )

    async def parse_other_page(self, response, **kwargs):
        priority = response.request.priority
        request_q = response.meta["REQUEST_Q"]
        msg = """Current query: %(query)s,\nfilters: %(filters)s,\npage: %(page)s"""
        kws = {
            "query": request_q.get("query"),
            "filters": pformat(request_q.get("filters", [])),
            "page": '{c}/{m}'.format(c=request_q.get("page", 1), m=request_q.get("max_page", 'null'))
        }
        self.logger.info(msg % kws)
        batch_time = datetime.now()
        # ---------------------------------------------- Result-row extraction ----------------------------------------------
        tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr')
        for tr_node in tr_nodes:
            article_title = tr_node.xpath('./td[@class="name"]/a//text()').getall()  # article title (fragments)
            article_title = article_title and ''.join(article_title)
            article_link = tr_node.xpath('./td[@class="name"]/a/@href').get()  # article link (contains the v value)
            source_title = tr_node.xpath('./td[@class="source"]/*/a/text()').get()  # publication name (journal title)
            db_name = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-dbname').get()  # source database
            third_id = tr_node.xpath('./td[@class="operat"]/a[@class="icon-collect"]/@data-filename').get()  # third-party id
            cited_str = tr_node.xpath('./td[@class="quote"]/span/a/text()').get()  # citation-count string

            param = tools.url_parse(article_link)
            v = param.get('v')
            ti_format = ti2format(article_title)
            ti_unique = ti2unique_type2(ti=ti_format, so=source_title)
            if third_id:
                relation_item = CnkiIdRelationItem()
                relation_item['third_id'] = third_id
                relation_item['query_ids'] = [self.query_id]
                # Attach the year to the relation item
                add_year2item(relation_item, request_q.get("year"), tr_node.xpath('./td[@class="date"]/text()').get())
                relation_item['updated_at'] = batch_time
                yield relation_item
                if cited_str:
                    cited_item = CnkiCitedNumberItem(**dict(third_id=third_id, cited=tools.str2int(cited_str, 0), updated_at=batch_time))
                    yield cited_item
                yield CnkiArticleTodoIdItem(**dict(third_id=third_id, db_code=db_name, ti=ti_unique, v=v, state=0))

        # -------------------------------------------------- Pagination --------------------------------------------------
        if request_q['page'] < request_q['max_page']:
            q_bak = deepcopy(request_q)
            # 2023-06-29 14:56:44: reverse-order handling.
            # A single CNKI search is capped at 6,000 results, i.e. 6000 / 50 = 120 pages;
            # when 6,000 < total < 12,000, the remainder can be collected by re-running
            # the same search in reverse (ascending) order.
            # 6,000-result cap
            if q_bak['page'] >= 120 and q_bak.get('sort') != 'asc':
                q_bak['page'] = 0
                q_bak['sort'] = 'asc'
                q_bak['max_page_sum'] = q_bak['max_page']
                q_bak['max_page'] = q_bak['max_page_sum'] - 120 + 2
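                # Worked example: total = 9,000 -> max_page was 180; the first pass
                # already covered pages 1-120 (6,000 rows), so the ascending pass
                # needs 180 - 120 = 60 pages, plus 2 extra pages as what looks like
                # a deliberate overlap so no rows are lost at the boundary.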
            # End of reverse-order handling
            q_bak['page'] += 1
            query_body = model.adv_refine_search(**q_bak)
            model.add_muti_filters(base_query=query_body, filters=q_bak.get("filters"))
            search_param = model.adv_query_search(query_body, **q_bak)
            yield scrapy.FormRequest(
                url=config.CNKI_ADV_SEARCH_API, method="POST",
                formdata=search_param, priority=priority,
                callback=self.parse_other_page,
                meta=dict(REQUEST_Q=q_bak)
            )
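# Typical launch (an assumption: query and query_id arrive as spider arguments;
# Scrapy passes -a values as strings, so the scheduling process presumably
# coerces query_id and filters to their annotated types):
#   scrapy crawl cnki_article_crossdb -a query="<search expression>" -a query_id=1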