|
|
# -*- coding: utf-8 -*-
|
|
|
# @Time : 2026/1/12 14:13
|
|
|
# @Author : zhaoxiangpeng
|
|
|
# @File : crawl_crossdb_article.py
|
|
|
from twisted.internet import defer
|
|
|
from scrapy.crawler import CrawlerProcess
|
|
|
from scrapy.utils.project import get_project_settings
|
|
|
from science_article_cnki.spiders.cnki_article_crossdb import CnkiArticleCrossdbSpider
|
|
|
|
|
|
|
|
|
def starter_by_year(years=None, query_id: int = 1609,
                    query: str = '(作者单位:河北工程技术学院(模糊))'):
    """Crawl with CnkiArticleCrossdbSpider once per year, sequentially.

    One crawl is scheduled for every entry of ``years``. The next crawl is
    requested only after the Deferred returned by the previous
    ``process.crawl`` call has fired, so the per-year crawls do not overlap.

    Args:
        years: Iterable of years; one crawl is run per entry. ``None``
            (the default) keeps the range this script originally
            hard-coded, ``range(2021, 2022)``, i.e. 2021 only.
        query_id: Passed to the spider as the ``query_id`` argument.
        query: Passed to the spider as the ``query`` argument.
    """
    # Materialize up front so generators and ranges behave the same and the
    # "no argument" case never reaches the loop as ``None``.
    year_list = list(range(2021, 2022)) if years is None else list(years)
    process = CrawlerProcess(get_project_settings())

    @defer.inlineCallbacks
    def crawl_each_year():
        for year in year_list:
            # NOTE: a `query_condition={'year': str(year)}` spider argument
            # was commented out in the original script and stays disabled;
            # the year is selected through `filters` instead.
            yield process.crawl(
                CnkiArticleCrossdbSpider,
                query_id=query_id,
                query=query,
                filters=[
                    dict(project="年度", value=f"{year}", text_or_title=f"{year}年"),
                ],
            )

    # Calling the decorated generator runs it up to the first ``yield``
    # (scheduling the first crawl); it resumes each time that crawl's
    # Deferred fires, which happens once the reactor is running.
    crawl_each_year()
    process.start()
|
|
|
|
|
|
|
|
|
def starter_more_year(years=None, query_id: int = 1611,
                      query: str = '(作者单位:武昌首义学院(模糊))'):
    """Run a single CnkiArticleCrossdbSpider crawl filtered on several years.

    All years are sent in one ``filters`` entry: ``value`` holds each year
    formatted as a string and ``text_or_title`` holds each year followed by
    "年".

    Args:
        years: Iterable of years to include in the filter. ``None`` (the
            default) keeps the range this script originally hard-coded,
            ``range(2021, 2026)``, i.e. 2021 through 2025.
        query_id: Passed to the spider as the ``query_id`` argument.
        query: Passed to the spider as the ``query`` argument.
    """
    # Materialize once: the list is traversed twice below, which would
    # exhaust a one-shot iterator.
    year_list = list(range(2021, 2026)) if years is None else list(years)

    year_filter = dict(
        project="年度",
        value=[f"{y}" for y in year_list],
        text_or_title=[f"{y}年" for y in year_list],
    )

    process = CrawlerProcess(get_project_settings())
    # Only one crawl is scheduled, so it is requested directly (as starter()
    # does) rather than through a single-yield inlineCallbacks wrapper.
    process.crawl(
        CnkiArticleCrossdbSpider,
        query_id=query_id,
        query=query,
        filters=[year_filter],
    )
    process.start()
|
|
|
|
|
|
|
|
|
def starter():
    """Run one CnkiArticleCrossdbSpider crawl with no extra spider arguments.

    The crawler process is built from the project settings, the spider is
    scheduled, and the process is started.
    """
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CnkiArticleCrossdbSpider)
    # Per Scrapy's documentation, start() runs the Twisted reactor and, by
    # default, returns once the scheduled crawls have finished.
    crawler_process.start()
|
|
|
|
|
|
|
|
|
# Script entry point: when this file is executed directly, run the
# multi-year crawl.
if __name__ == "__main__":
    starter_more_year()
|