cnki: add spider

main
zhaoxiangpeng 1 week ago
parent 2bd56aeb10
commit 68306a03ab

@ -39,6 +39,11 @@ class ProductsEnum(enum.Enum):
     pass
+class KuaKuCodeEnum(enum.Enum):
+    总库 = 'YSTT4HG0,LSTPFY1C,JUP3MUPD,MPMFIG1A,EMRPGLPA,WQ0UVIAA,BLZOG7CK,PWFIRAGL,NN3FJMUV,NLBO1Z6R'
+    学术期刊 = ''
 class ResourceLanguageEnum(enum.Enum):
     中文 = "CHINESE"
     外文 = "FOREIGN"
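
For context, 总库 (cross-database search) packs the product codes for all ten CNKI databases into one comma-separated string, while 学术期刊 (academic journals) is still a placeholder. A minimal sketch of consuming the enum, assuming it lives in science_article_cnki.configs.cnki next to the other enums:

from science_article_cnki.configs.cnki import KuaKuCodeEnum  # assumed module path

# The cross-database member is a comma-separated list of product codes.
codes = KuaKuCodeEnum.总库.value.split(',')
print(len(codes))            # 10 product codes, e.g. 'YSTT4HG0'

# 学术期刊 is still empty, so filter out the empty string a bare split() leaves behind.
journal_codes = [c for c in KuaKuCodeEnum.学术期刊.value.split(',') if c]
print(journal_codes)         # []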

@ -0,0 +1,40 @@
from typing import AsyncIterator, Any

import scrapy

from science_article_cnki.models import cnki_model as model
from science_article_cnki.configs import cnki as config


class CnkiLatestIncrementSpider(scrapy.Spider):
    name = "cnki_latest_increment"
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
        },
        ITEM_PIPELINES={
            "science_article_cnki.pipelines.MongoPipeline": 300,
            "science_article_cnki.pipelines.DupTodoPipeline": 310,
            # "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
        },
        # LOG_LEVEL="INFO"
    )
    source = 'cnki'
    resource_type: str = "学术期刊"
    query_id: int
    query: str
    filters: list = list()

    async def start(self) -> AsyncIterator[Any]:
        m = dict(query=self.query, resource_type=self.resource_type, page=1)
        m.update(filters=self.filters)
        query_body = model.adv_refine_search(**m)
        # Merge the selected filters into the query body
        model.add_muti_filters(base_query=query_body, filters=m.get("filters"))
        form_d = model.adv_query_search(query_body, **m)
        yield scrapy.FormRequest(url=config.CNKI_ADV_SEARCH_API, method="POST",
                                 formdata=form_d, meta=dict(REQUEST_Q=m))

    def parse(self, response):
        pass
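
The new spider receives query, query_id and filters as crawl arguments (scrapy.Spider.__init__ copies keyword arguments onto the instance); the filters entries follow the same dict(project=..., value=..., text_or_title=...) shape used by the runner scripts below. A minimal sketch of launching it standalone, with placeholder values:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_cnki.spiders.cnki_latest_increment import CnkiLatestIncrementSpider

process = CrawlerProcess(get_project_settings())
# query_id / query / filters become spider attributes via Spider.__init__ kwargs
process.crawl(
    CnkiLatestIncrementSpider,
    query_id=1609,                                # placeholder strategy id
    query='(作者单位:河北工程技术学院(模糊)',       # advanced-search expression
    filters=[dict(project="年度", value="2024", text_or_title="2024")],  # placeholder year filter
)
process.start()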

@ -94,7 +94,7 @@ def add_year2item(item, year: Union[int, None], pub_datetime):
     if dt:
         year = dt.year
     if year:
-        item.year = year
+        item['year'] = year
     return item
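
This one-line change matters because scrapy.Item only exposes declared fields through dict-style access; attribute assignment on an Item raises AttributeError. A minimal repro (ArticleItem stands in for the project's real item class):

import scrapy

class ArticleItem(scrapy.Item):
    year = scrapy.Field()

item = ArticleItem()
item['year'] = 2024      # OK: dict-style access
try:
    item.year = 2024     # raises AttributeError: use item['year'] = ... instead
except AttributeError as e:
    print(e)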

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/12 14:13
# @Author : zhaoxiangpeng
# @File : crawl_crossdb_article.py
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_cnki.spiders.cnki_article_conference import CnkiArticleConferenceSpider


def starter_by_year():
    @defer.inlineCallbacks
    def f(range_list: list = None):
        for y in range_list:
            init_params = {
                'query': '(作者单位:河北工程技术学院(模糊)',
                'filters': [
                    dict(project="年度", value=f"{y}", text_or_title=f"{y}"),
                ]
            }
            yield process.crawl(CnkiArticleConferenceSpider, **init_params)

    process = CrawlerProcess(get_project_settings())
    f(list(range(2021, 2022)))
    process.start()


def starter():
    process = CrawlerProcess(get_project_settings())
    process.crawl(CnkiArticleConferenceSpider)
    process.start()


if __name__ == '__main__':
    starter_by_year()
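
The @defer.inlineCallbacks wrapper is what makes the per-year crawls run sequentially: each yield process.crawl(...) returns a Deferred that fires when that crawl finishes, so the next year only starts afterwards. The same pattern generalises to any iterable of parameter sets, as in this sketch:

from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_cnki.spiders.cnki_article_conference import CnkiArticleConferenceSpider

def starter_sequential(param_sets: list):
    @defer.inlineCallbacks
    def f():
        for params in param_sets:
            # Each crawl only starts once the previous Deferred has fired
            yield process.crawl(CnkiArticleConferenceSpider, **params)

    process = CrawlerProcess(get_project_settings())
    f()
    process.start()  # blocks until all queued crawls finish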

@ -31,8 +31,8 @@ def starter_more_year():
     @defer.inlineCallbacks
     def f(years: list = None):
         init_params = {
-            'query_id': 1611,
-            'query': '(作者单位:武昌首义学院(模糊)',
+            'query_id': 1609,
+            'query': '(作者单位:河北工程技术学院(模糊)',
             'filters': [
                 dict(project="年度", value=[f"{y}" for y in years], text_or_title=[f"{y}" for y in years]),
             ]

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
# @Time : 2026/2/28 09:36
# @Author : zhaoxiangpeng
# @File : crawl_article_latest.py
import time
from typing import List

import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_cnki.spiders.cnki_latest_increment import CnkiLatestIncrementSpider


def get_connect() -> pymysql.Connection:
    conn: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306,
                                               database='science_data_dept', user='science-data-dept',
                                               passwd='datadept1509', )
    return conn


def starter():
    process = CrawlerProcess(get_project_settings())
    process.crawl(CnkiLatestIncrementSpider)
    process.start()


def starter_latest_by_record(record_id: int):
    @defer.inlineCallbacks
    def f():
        client: pymysql.Connection = get_connect()
        cursor = client.cursor(cursors.DictCursor)
        # Look up the batch record together with its search strategy (source_type=5)
        cursor.execute(
            'select b.id as task_id, q.id as query_id, q.content as content, b.task_condition as task_condition, q.source_type as source_type, b.is_done as is_done '
            'from task_batch_record as b join task_search_strategy as q on b.query_id=q.id '
            'where b.id=%s and q.source_type=5 limit 1',
            (record_id,))
        result = cursor.fetchone()
        query_id = result['query_id']
        # Attach every organisation bound to this query
        cursor.execute('select org_id, org_name from relation_org_query where query_id=%s', (query_id,))
        org_results: List[dict] = cursor.fetchall()
        result['org_id'] = [org_result['org_id'] for org_result in org_results]
        result['org_name'] = [org_result['org_name'] for org_result in org_results]
        cursor.close()
        client.close()
        # Hand the task row (plus org lists) to the spider as-is
        init_params = result
        yield process.crawl(CnkiLatestIncrementSpider, task_obj=init_params)

    process = CrawlerProcess(get_project_settings())
    f()
    process.start()


if __name__ == '__main__':
    starter_latest_by_record(8057)
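
For reference, the task_obj handed to the spider is the joined DB row plus the two org lists, so it carries roughly this shape (column names come from the SQL above; the values here are illustrative):

# Illustrative shape of task_obj after the two queries above; values are made up.
task_obj = {
    'task_id': 8057,            # task_batch_record.id
    'query_id': 1609,           # task_search_strategy.id
    'content': '(作者单位:河北工程技术学院(模糊)',  # advanced-search expression
    'task_condition': None,     # extra batch conditions, if any
    'source_type': 5,           # source filter used in the WHERE clause
    'is_done': 0,
    'org_id': [101],            # from relation_org_query
    'org_name': ['河北工程技术学院'],
}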