You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

61 lines
2.3 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
# @Time : 2026/2/28 09:36
# @Author : zhaoxiangpeng
# @File : crawl_article_latest.py
import time
from typing import List
import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_cnki.spiders.cnki_latest_increment import CnkiLatestIncrementSpider
def get_connect() -> pymysql.Connection:
    """Open a new MySQL connection to the science_data_dept database.

    Returns:
        pymysql.Connection: an open connection; the caller is responsible
        for closing it.
    """
    # SECURITY: credentials are hard-coded in source. Move host/user/password
    # to environment variables or a config file before sharing this repo.
    conn: pymysql.Connection = pymysql.connect(
        host='43.140.203.187',
        port=3306,
        database='science_data_dept',
        user='science-data-dept',
        # 'password' is the supported keyword; 'passwd' is a legacy alias.
        password='datadept1509',
    )
    return conn
def starter():
    """Run the CNKI latest-increment spider with the project's Scrapy settings."""
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CnkiLatestIncrementSpider)
    crawler_process.start()
def starter_latest_by_record(record_id: int):
    """Crawl CNKI latest articles for a single task-batch record.

    Looks up the batch record and its joined search strategy (CNKI tasks,
    source_type=5) in MySQL, attaches every organisation bound to the query,
    and feeds the assembled task object to CnkiLatestIncrementSpider.

    Args:
        record_id: primary key of the row in task_batch_record.

    Raises:
        ValueError: if no CNKI task row matches record_id.
    """

    @defer.inlineCallbacks
    def f():
        client: pymysql.Connection = get_connect()
        try:
            # Cursor as a context manager so it is closed even on error.
            with client.cursor(cursors.DictCursor) as cursor:
                cursor.execute(
                    'select b.id as task_id, q.id as query_id, q.content as content, '
                    'b.task_condition as task_condition, q.source_type as source_type, '
                    'b.is_done as is_done from task_batch_record as b '
                    'join task_search_strategy as q on b.query_id=q.id '
                    'where b.id=%s and q.source_type=5 limit 1',
                    (record_id,))
                result = cursor.fetchone()
                if result is None:
                    raise ValueError(f'no CNKI (source_type=5) task found for record_id={record_id}')
                # Attach every organisation bound to this query.
                cursor.execute(
                    'select org_id, org_name from relation_org_query where query_id=%s',
                    (result['query_id'],))
                org_results: List[dict] = cursor.fetchall()
                result['org_id'] = [row['org_id'] for row in org_results]
                result['org_name'] = [row['org_name'] for row in org_results]
        finally:
            # BUGFIX: the original never closed the connection.
            client.close()
        # BUGFIX: the original overwrote this freshly built task object with a
        # hard-coded debug dict that referenced an undefined name `years`
        # (guaranteed NameError at runtime). Use the database-derived object.
        init_params = result
        yield process.crawl(CnkiLatestIncrementSpider, task_obj=init_params)

    process = CrawlerProcess(get_project_settings())
    f()
    process.start()
    process.stop()
if __name__ == '__main__':
    # Entry point: run a single latest-increment crawl for
    # task_batch_record id 8057 (hard-coded ad-hoc run).
    starter_latest_by_record(8057)