diff --git a/science_article_cnki/science_article_cnki/spiders/cnki_cited_number.py b/science_article_cnki/science_article_cnki/spiders/cnki_cited_number.py index 5791cce..ea4e045 100644 --- a/science_article_cnki/science_article_cnki/spiders/cnki_cited_number.py +++ b/science_article_cnki/science_article_cnki/spiders/cnki_cited_number.py @@ -17,10 +17,6 @@ if TYPE_CHECKING: class CnkiCitedNumberSpider(scrapy.Spider): name = "cnki_cited_number" custom_settings = dict( - DEFAULT_REQUEST_HEADERS={ - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "en", - }, DOWNLOADER_MIDDLEWARES={ "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540, }, @@ -50,7 +46,8 @@ class CnkiCitedNumberSpider(scrapy.Spider): query_body = model.adv_refine_search(**m) search_param = model.adv_query_search(query_body, **m) yield scrapy.FormRequest( - url=config.CNKI_ADV_SEARCH_API, method="POST", formdata=search_param, meta=m + url=config.CNKI_ADV_SEARCH_API, method="POST", + formdata=search_param, meta=m ) def parse(self, response, **kwargs): @@ -84,7 +81,8 @@ class CnkiCitedNumberSpider(scrapy.Spider): query_body = model.adv_refine_search(**meta_copy) search_param = model.adv_query_search(query_body, **meta_copy) yield scrapy.FormRequest( - url=config.CNKI_ADV_SEARCH_API, method="POST", formdata=search_param, + url=config.CNKI_ADV_SEARCH_API, method="POST", + formdata=search_param, meta=meta_copy ) diff --git a/science_article_cnki/starter/crawl_cited_number.py b/science_article_cnki/starter/crawl_cited_number.py new file mode 100644 index 0000000..0275829 --- /dev/null +++ b/science_article_cnki/starter/crawl_cited_number.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# @Time : 2026/1/5 09:18 +# @Author : zhaoxiangpeng +# @File : crawl_cited_number.py +from twisted.internet import defer +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings +from 
science_article_cnki.spiders.cnki_cited_number import CnkiCitedNumberSpider
+
+
+"""
+def test_starter():
+    y = 2025
+    init_params = {
+        'query': '(作者单位:河北工程技术学院(模糊))',
+        'query_condition': {'year': str(y)}
+    }
+
+    process = CrawlerProcess(get_project_settings())
+    process.crawl(CnkiCitedNumberSpider, **init_params)
+    process.start()
+"""
+
+
+def starter_by_year(years=None, query='(作者单位:大连东软信息学院(模糊))'):
+    """Run one ``CnkiCitedNumberSpider`` crawl per year, one after another.
+
+    Each crawl is scheduled only after the Deferred of the previous crawl
+    has fired, so all crawls share a single ``CrawlerProcess``.
+
+    :param years: iterable of years to crawl; ``None`` means 2021..2025
+        inclusive (the original hard-coded range).
+    :param query: search expression handed to the spider as ``query``.
+    """
+    import logging
+
+    logger = logging.getLogger(__name__)
+    year_list = list(range(2021, 2026)) if years is None else list(years)
+    if not year_list:
+        # Nothing to schedule, so do not build a process at all.
+        return
+
+    process = CrawlerProcess(get_project_settings())
+
+    @defer.inlineCallbacks
+    def crawl_sequentially():
+        for year in year_list:
+            init_params = {
+                'query': query,
+                'query_condition': {'year': str(year)}
+            }
+            # Yielding the Deferred suspends this generator until the
+            # crawl for the current year has finished.
+            yield process.crawl(CnkiCitedNumberSpider, **init_params)
+
+    def report_failure(failure):
+        # Without an errback a failed crawl would end the chain with no
+        # explicit report of which error stopped the remaining years.
+        logger.error("yearly crawl chain aborted:\n%s", failure.getTraceback())
+
+    crawl_sequentially().addErrback(report_failure)
+    process.start()
+
+
+def starter():
+    """Run a single ``CnkiCitedNumberSpider`` crawl with no spider arguments."""
+    process = CrawlerProcess(get_project_settings())
+    process.crawl(CnkiCitedNumberSpider)
+    process.start()
+
+
+if __name__ == '__main__':
+    starter_by_year()