From 68306a03ab27e1a9f066287b1deb88850340982f Mon Sep 17 00:00:00 2001 From: zhaoxiangpeng <1943364377@qq.com> Date: Thu, 12 Mar 2026 14:09:53 +0800 Subject: [PATCH] cnki:add spider --- .../science_article_cnki/models/enum_cls.py | 5 ++ .../spiders/cnki_latest_increment.py | 40 +++++++++++++ .../science_article_cnki/utils/tools.py | 2 +- .../starter/crawl_article_conference.py | 35 +++++++++++ .../starter/crawl_article_crossdb.py | 4 +- .../starter/crawl_article_latest.py | 60 +++++++++++++++++++ 6 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 science_article_cnki/science_article_cnki/spiders/cnki_latest_increment.py create mode 100644 science_article_cnki/starter/crawl_article_conference.py create mode 100644 science_article_cnki/starter/crawl_article_latest.py diff --git a/science_article_cnki/science_article_cnki/models/enum_cls.py b/science_article_cnki/science_article_cnki/models/enum_cls.py index 8f0634a..c2860f4 100644 --- a/science_article_cnki/science_article_cnki/models/enum_cls.py +++ b/science_article_cnki/science_article_cnki/models/enum_cls.py @@ -39,6 +39,11 @@ class ProductsEnum(enum.Enum): pass +class KuaKuCodeEnum(enum.Enum): + 总库 = 'YSTT4HG0,LSTPFY1C,JUP3MUPD,MPMFIG1A,EMRPGLPA,WQ0UVIAA,BLZOG7CK,PWFIRAGL,NN3FJMUV,NLBO1Z6R' + 学术期刊 = '' + + class ResourceLanguageEnum(enum.Enum): 中文 = "CHINESE" 外文 = "FOREIGN" diff --git a/science_article_cnki/science_article_cnki/spiders/cnki_latest_increment.py b/science_article_cnki/science_article_cnki/spiders/cnki_latest_increment.py new file mode 100644 index 0000000..80a75ad --- /dev/null +++ b/science_article_cnki/science_article_cnki/spiders/cnki_latest_increment.py @@ -0,0 +1,40 @@ +from typing import AsyncIterator, Any + +import scrapy + +from science_article_cnki.models import cnki_model as model +from science_article_cnki.configs import cnki as config + + +class CnkiLatestIncrementSpider(scrapy.Spider): + name = "cnki_latest_increment" + custom_settings = dict( + 
DOWNLOADER_MIDDLEWARES={ + "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540, + }, + ITEM_PIPELINES={ + "science_article_cnki.pipelines.MongoPipeline": 300, + "science_article_cnki.pipelines.DupTodoPipeline": 310, + # "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400, + }, + # LOG_LEVEL="INFO" + ) + source = 'cnki' + resource_type: str = "学术期刊" + + query_id: int + query: str + filters: list = list() + + async def start(self) -> AsyncIterator[Any]: + m = dict(query=self.query, resource_type=self.resource_type, page=1) + m.update(filters=self.filters) + query_body = model.adv_refine_search(**m) + # 把筛选项加到查询体中 + model.add_muti_filters(base_query=query_body, filters=m.get("filters")) + form_d = model.adv_query_search(query_body, **m) + yield scrapy.FormRequest(url=config.CNKI_ADV_SEARCH_API, method="POST", + formdata=form_d, meta=dict(REQUEST_Q=m)) + + def parse(self, response): + pass diff --git a/science_article_cnki/science_article_cnki/utils/tools.py b/science_article_cnki/science_article_cnki/utils/tools.py index 22c56b0..5bbeb26 100644 --- a/science_article_cnki/science_article_cnki/utils/tools.py +++ b/science_article_cnki/science_article_cnki/utils/tools.py @@ -94,7 +94,7 @@ def add_year2item(item, year: Union[int, None], pub_datetime): if dt: year = dt.year if year: - item.year = year + item['year'] = year return item diff --git a/science_article_cnki/starter/crawl_article_conference.py b/science_article_cnki/starter/crawl_article_conference.py new file mode 100644 index 0000000..dfb5ff7 --- /dev/null +++ b/science_article_cnki/starter/crawl_article_conference.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# @Time : 2026/1/12 14:13 +# @Author : zhaoxiangpeng +# @File : crawl_crossdb_article.py +from twisted.internet import defer +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings +from science_article_cnki.spiders.cnki_article_conference import 
CnkiArticleConferenceSpider + + +def starter_by_year(): + @defer.inlineCallbacks + def f(range_list: list = None): + for y in range_list: + init_params = { + 'query': '(作者单位:河北工程技术学院(模糊))', + 'filters': [ + dict(project="年度", value=f"{y}", text_or_title=f"{y}年"), + ] + } + yield process.crawl(CnkiArticleConferenceSpider, **init_params) + + process = CrawlerProcess(get_project_settings()) + f(list(range(2021, 2022))) + process.start() + + +def starter(): + process = CrawlerProcess(get_project_settings()) + process.crawl(CnkiArticleConferenceSpider) + process.start() + + +if __name__ == '__main__': + starter_by_year() diff --git a/science_article_cnki/starter/crawl_article_crossdb.py b/science_article_cnki/starter/crawl_article_crossdb.py index 8b4990a..25a9694 100644 --- a/science_article_cnki/starter/crawl_article_crossdb.py +++ b/science_article_cnki/starter/crawl_article_crossdb.py @@ -31,8 +31,8 @@ def starter_more_year(): @defer.inlineCallbacks def f(years: list = None): init_params = { - 'query_id': 1611, - 'query': '(作者单位:武昌首义学院(模糊))', + 'query_id': 1609, + 'query': '(作者单位:河北工程技术学院(模糊))', 'filters': [ dict(project="年度", value=[f"{y}" for y in years], text_or_title=[f"{y}年" for y in years]), ] diff --git a/science_article_cnki/starter/crawl_article_latest.py b/science_article_cnki/starter/crawl_article_latest.py new file mode 100644 index 0000000..6700965 --- /dev/null +++ b/science_article_cnki/starter/crawl_article_latest.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +# @Time : 2026/2/28 09:36 +# @Author : zhaoxiangpeng +# @File : crawl_article_latest.py +import time +from typing import List +import pymysql +from pymysql import cursors +from twisted.internet import defer +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings +from science_article_cnki.spiders.cnki_latest_increment import CnkiLatestIncrementSpider + + +def get_connect() -> pymysql.Connection: + conn: pymysql.Connection = 
pymysql.connect(host='43.140.203.187', port=3306,
                                                database='science_data_dept', user='science-data-dept',
                                                passwd='datadept1509', )
    return conn


def starter():
    process = CrawlerProcess(get_project_settings())
    process.crawl(CnkiLatestIncrementSpider)
    process.start()


def starter_latest_by_record(record_id: int):
    @defer.inlineCallbacks
    def f():
        client: pymysql.Connection = get_connect()
        cursor = client.cursor(cursors.DictCursor)
        cursor.execute(
            'select b.id as task_id, q.id as query_id, q.content as content, b.task_condition as task_condition, q.source_type as source_type, b.is_done as is_done from task_batch_record as b join task_search_strategy as q on b.query_id=q.id where b.id=%s and q.source_type=5 limit 1',
            (record_id,))
        result = cursor.fetchone()
        query_id = result['query_id']
        cursor.execute('select org_id, org_name from relation_org_query where query_id=%s', (query_id,))
        org_results: List[dict] = cursor.fetchall()
        result['org_id'] = [org_result['org_id'] for org_result in org_results]
        result['org_name'] = [org_result['org_name'] for org_result in org_results]

        # Use the task parameters loaded from the database as the spider's task_obj.
        # NOTE(review): removed a leftover hard-coded parameter dict (copy-pasted
        # from crawl_article_crossdb.py) that clobbered the DB-derived `result`
        # and referenced an undefined `years` variable, which would raise
        # NameError at runtime.
        init_params = result
        yield process.crawl(CnkiLatestIncrementSpider, task_obj=init_params)

    process = CrawlerProcess(get_project_settings())
    f()
    process.start()
    process.stop()


if __name__ == '__main__':
    starter_latest_by_record(8057)