"""Sequentially run WosLatestIncrementSpider for pending tasks pulled from MySQL."""
import time
from typing import List

import pymysql
from pymysql import cursors
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import defer

from science_article_add.spiders.wos_latest_increment import WosLatestIncrementSpider

# Tasks already joined with their organisation (is_done = 2); currently unused.
sql = """
SELECT
    b.id             AS task_id,
    r.org_id         AS org_id,
    r.org_name       AS org_name,
    q.id             AS query_id,
    q.content        AS content,
    b.task_condition AS task_condition,
    q.source_type    AS source_type,
    b.is_done        AS is_done
FROM task_batch_record AS b
JOIN task_search_strategy AS q ON q.id = b.query_id
JOIN relation_org_query AS r ON r.query_id = b.query_id
WHERE b.is_done = 2
  AND q.source_type = 1
LIMIT %(limit)s
"""

# Pending tasks (is_done = 0); organisation info is looked up separately in get_task().
sql2 = """
SELECT
    b.id             AS task_id,
    q.id             AS query_id,
    q.content        AS content,
    b.task_condition AS task_condition,
    q.source_type    AS source_type,
    b.is_done        AS is_done
FROM task_batch_record AS b
JOIN task_search_strategy AS q ON q.id = b.query_id
WHERE b.is_done = 0
  AND q.source_type = 1
LIMIT %(limit)s
"""


def get_task(limit: int = 1):
    """Yield up to `limit` pending task dicts, each enriched with its org ids and names."""
    client: pymysql.Connection = pymysql.connect(
        host='43.140.203.187',
        port=3306,
        database='science_data_dept',
        user='science-data-dept',
        passwd='datadept1509',
    )
    cursor = client.cursor(cursors.DictCursor)
    try:
        # Bind `limit` as a query parameter instead of string-formatting it into the SQL.
        cursor.execute(sql2, {'limit': limit})
        results = cursor.fetchall()
    except Exception:
        raise
    else:
        for result in results:
            query_id = result['query_id']
            cursor.execute(
                'SELECT org_id, org_name FROM relation_org_query WHERE query_id = %s',
                (query_id,),
            )
            org_results: List[dict] = cursor.fetchall()
            result['org_id'] = [org_result['org_id'] for org_result in org_results]
            result['org_name'] = [org_result['org_name'] for org_result in org_results]
            print(result)
            yield result
    finally:
        # The connection stays open until the caller exhausts the generator.
        cursor.close()
        client.close()


@defer.inlineCallbacks
def crawl_sequentially(targets):
    for target in targets:
        print(f"\n=== Starting Spider, task: {target} ===")
        yield process.crawl(WosLatestIncrementSpider, task_obj=target)
        print(f"=== Spider finished: {target} ===\n")
        # Pause between crawls; no crawl is running here, so blocking the reactor is tolerable.
        time.sleep(60)
    process.stop()  # shut down the event loop once all spiders have finished


# ====== Main program ======
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    targets = get_task(10)
    crawl_sequentially(targets)
    process.start()  # blocks until all spiders have finished
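
# --- Reference sketch (an assumption, not the project's actual spider code) ---
# process.crawl() forwards keyword arguments to the spider's constructor, and
# Scrapy's default Spider.__init__ copies them onto the instance, so
# WosLatestIncrementSpider presumably receives the task roughly like this:
#
#     class WosLatestIncrementSpider(scrapy.Spider):
#         name = "wos_latest_increment"
#
#         def __init__(self, task_obj=None, *args, **kwargs):
#             super().__init__(*args, **kwargs)
#             # task_obj is one dict yielded by get_task(), with keys:
#             # task_id, query_id, content, task_condition, source_type,
#             # is_done, org_id (list), org_name (list)
#             self.task_obj = task_obj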