# -*- coding: utf-8 -*-
# @Time : 2026/1/14 13:59
# @Author : zhaoxiangpeng
# @File : crawl_article_latest.py
"""Launchers for the WOS latest-increment spider.

``starter_latest_all`` seeds the spider with one pending task fetched from
MySQL; ``starter`` runs the spider without a seeded task.
"""
import math
from typing import List

import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_wos.spiders.wos_latest_increment import WosLatestIncrementSpider

# NOTE(review): kept byte-identical for external users of this constant, but the
# quoted "%(batch_date)s" placeholder means it is meant for %-formatting, not
# parameterized execution — confirm call sites before converting it.
CREATE_RECORD_SQL = '''insert into task_batch_record (batch_date, query_id, task_condition) VALUES ("%(batch_date)s", %(query_id)s, %(task_condition)s)'''

# Pick up to %(limit)s undone batch-record tasks for source_type=1 (WOS),
# joined with their search strategy. Uses pymysql "pyformat" placeholders so it
# can be passed straight to cursor.execute() with a params dict.
SELECT_RECORD_SQL = """
SELECT
    b.id AS task_id,
    q.id AS query_id,
    q.content AS content,
    b.task_condition AS task_condition,
    q.source_type AS source_type,
    b.is_done AS is_done
FROM task_batch_record AS b
JOIN task_search_strategy AS q ON q.id = b.query_id
WHERE b.is_done = 0 AND q.source_type = 1
LIMIT %(limit)s
"""


def starter_latest_all():
    """Fetch one pending WOS task from MySQL and crawl it.

    Reads the first undone ``task_batch_record`` row (source_type=1),
    attaches the related org ids/names from ``relation_org_query``, and
    passes the assembled dict to :class:`WosLatestIncrementSpider` as
    ``task_obj``. If no pending task exists, the crawl is skipped instead
    of crashing on a ``None`` row.
    """
    process = CrawlerProcess(get_project_settings())

    @defer.inlineCallbacks
    def f():
        # NOTE(security): credentials are hard-coded in source; they should be
        # moved to configuration / environment variables.
        client: pymysql.Connection = pymysql.connect(
            host='43.140.203.187',
            port=3306,
            database='science_data_dept',
            user='science-data-dept',
            passwd='datadept1509',
        )
        try:
            with client.cursor(cursors.DictCursor) as cursor:
                # Parameterized execute() instead of %-string interpolation:
                # pymysql substitutes %(limit)s safely server-side-escaped.
                cursor.execute(SELECT_RECORD_SQL, {'limit': 1})
                result = cursor.fetchone()
                if result is None:
                    # No pending task — nothing to crawl.
                    return
                query_id = result['query_id']
                cursor.execute(
                    'select org_id, org_name from relation_org_query where query_id=%s',
                    (query_id,),
                )
                org_results: List[dict] = cursor.fetchall()
                result['org_id'] = [row['org_id'] for row in org_results]
                result['org_name'] = [row['org_name'] for row in org_results]
        finally:
            # Always release the connection, even if a query fails.
            client.close()
        yield process.crawl(WosLatestIncrementSpider, task_obj=result)

    f()
    process.start()
    process.stop()


def starter():
    """Run the spider once without seeding a task from the database."""
    process = CrawlerProcess(get_project_settings())
    process.crawl(WosLatestIncrementSpider)
    process.start()


if __name__ == '__main__':
    starter_latest_all()