cnki:采集被引量入口

main
zhaoxiangpeng 4 weeks ago
parent ad54448faf
commit 229f2f49f9

@ -17,10 +17,6 @@ if TYPE_CHECKING:
class CnkiCitedNumberSpider(scrapy.Spider):
name = "cnki_cited_number"
custom_settings = dict(
DEFAULT_REQUEST_HEADERS={
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en",
},
DOWNLOADER_MIDDLEWARES={
"science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
},
@ -50,7 +46,8 @@ class CnkiCitedNumberSpider(scrapy.Spider):
query_body = model.adv_refine_search(**m)
search_param = model.adv_query_search(query_body, **m)
yield scrapy.FormRequest(
url=config.CNKI_ADV_SEARCH_API, method="POST", formdata=search_param, meta=m
url=config.CNKI_ADV_SEARCH_API, method="POST",
formdata=search_param, meta=m
)
def parse(self, response, **kwargs):
@ -84,7 +81,8 @@ class CnkiCitedNumberSpider(scrapy.Spider):
query_body = model.adv_refine_search(**meta_copy)
search_param = model.adv_query_search(query_body, **meta_copy)
yield scrapy.FormRequest(
url=config.CNKI_ADV_SEARCH_API, method="POST", formdata=search_param,
url=config.CNKI_ADV_SEARCH_API, method="POST",
formdata=search_param,
meta=meta_copy
)

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/5 09:18
# @Author : zhaoxiangpeng
# @File : crawl_cited_number.py
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_cnki.spiders.cnki_cited_number import CnkiCitedNumberSpider
# Disabled example kept for reference only: the bare string literal below is
# never executed. It sketches a single crawl of CnkiCitedNumberSpider for the
# year 2025 using CrawlerProcess.
"""
def test_starter():
y = 2025
init_params = {
'query': '(作者单位:河北工程技术学院(模糊)',
'query_condition': {'year': str(y)}
}
process = CrawlerProcess(get_project_settings())
process.crawl(CnkiCitedNumberSpider, **init_params)
process.start()
"""
def starter_by_year(years=None, query='(作者单位:大连东软信息学院(模糊)'):
    """Crawl cited numbers one publication year at a time, in sequence.

    One ``CnkiCitedNumberSpider`` crawl is scheduled per year. The crawls are
    chained through ``defer.inlineCallbacks``: the next crawl is only
    requested after the Deferred returned by the previous ``process.crawl``
    call has fired.

    Args:
        years: Iterable of years to crawl. Defaults to 2021 through 2025,
            the range this script previously hard-coded.
        query: Search expression handed to the spider as ``query``. Defaults
            to the affiliation query this script previously hard-coded.
    """
    # Materialise up front so ``None`` never reaches the loop and one-shot
    # iterators are accepted as well.
    year_list = list(range(2021, 2026)) if years is None else list(years)

    process = CrawlerProcess(get_project_settings())

    @defer.inlineCallbacks
    def crawl_each_year(range_list):
        for y in range_list:
            init_params = {
                'query': query,
                'query_condition': {'year': str(y)},
            }
            # Yielding the Deferred suspends this generator until the crawl
            # for year ``y`` is done, which keeps the crawls sequential.
            yield process.crawl(CnkiCitedNumberSpider, **init_params)

    crawl_each_year(year_list)
    # Starts the reactor; with no scheduled crawl (empty ``years``) there is
    # nothing to wait for.
    process.start()
def starter():
    """Run a single ``CnkiCitedNumberSpider`` crawl with the project settings.

    The spider is started without any extra arguments.
    """
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(CnkiCitedNumberSpider)
    # Hands control to the reactor for the duration of the crawl.
    crawler_process.start()
if __name__ == "__main__":
    # Script entry point: run the sequential year-by-year crawl.
    starter_by_year()
Loading…
Cancel
Save