# -*- coding: utf-8 -*-
# @Time : 2026/2/28 09:36
# @Author : zhaoxiangpeng
# @File : crawl_article_latest.py
import time
from typing import List

import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_cnki.spiders.cnki_latest_increment import CnkiLatestIncrementSpider


def get_connect() -> pymysql.Connection:
    """Open and return a connection to the ``science_data_dept`` MySQL database.

    The caller is responsible for closing the returned connection.

    SECURITY NOTE(review): host and credentials are hard-coded here; they should
    be moved to configuration / environment variables before this code is shared.
    """
    conn: pymysql.Connection = pymysql.connect(
        host='43.140.203.187',
        port=3306,
        database='science_data_dept',
        user='science-data-dept',
        passwd='datadept1509',  # NOTE: `passwd` is PyMySQL's legacy alias for `password`
    )
    return conn


def starter():
    """Run the CNKI latest-increment spider with the project settings and no task object."""
    process = CrawlerProcess(get_project_settings())
    process.crawl(CnkiLatestIncrementSpider)
    process.start()  # blocks until the crawl finishes


def starter_latest_by_record(record_id: int):
    """Run the CNKI latest-increment spider for a single ``task_batch_record`` row.

    Loads the batch record joined with its search strategy (restricted to
    ``source_type=5``), attaches the related org ids/names from
    ``relation_org_query``, and passes the assembled row to the spider as
    ``task_obj``.

    :param record_id: primary key of the ``task_batch_record`` row to crawl.
    :raises ValueError: if no matching source_type=5 record exists.
    """

    @defer.inlineCallbacks
    def f():
        client: pymysql.Connection = get_connect()
        try:
            # DictCursor so rows come back as dicts keyed by column alias.
            with client.cursor(cursors.DictCursor) as cursor:
                cursor.execute(
                    'select b.id as task_id, q.id as query_id, q.content as content, '
                    'b.task_condition as task_condition, q.source_type as source_type, '
                    'b.is_done as is_done '
                    'from task_batch_record as b '
                    'join task_search_strategy as q on b.query_id=q.id '
                    'where b.id=%s and q.source_type=5 limit 1',
                    (record_id,))
                result = cursor.fetchone()
                if result is None:
                    # fetchone() returns None when no row matches; fail loudly
                    # instead of raising an opaque TypeError on result['query_id'].
                    raise ValueError(
                        'no source_type=5 task found for record id %s' % record_id)
                query_id = result['query_id']
                cursor.execute(
                    'select org_id, org_name from relation_org_query where query_id=%s',
                    (query_id,))
                org_results: List[dict] = cursor.fetchall()
                result['org_id'] = [org_result['org_id'] for org_result in org_results]
                result['org_name'] = [org_result['org_name'] for org_result in org_results]
        finally:
            client.close()  # always release the DB connection, even on error

        # BUG FIX: the original overwrote the DB-derived parameters with a
        # hard-coded debug dict that referenced an undefined name `years`,
        # which raised NameError at runtime. The DB row is the real payload.
        init_params = result
        yield process.crawl(CnkiLatestIncrementSpider, task_obj=init_params)

    process = CrawlerProcess(get_project_settings())
    f()  # schedules the crawl on the reactor (inlineCallbacks returns a Deferred)
    process.start()  # runs the reactor until the crawl completes
    process.stop()


if __name__ == '__main__':
    starter_latest_by_record(8057)