import time
from typing import List

import pymysql
from pymysql import cursors
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import defer

from science_article_add.spiders.wos_latest_increment import WosLatestIncrementSpider
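
# Task query (joined variant): selects tasks with is_done = 2 together with their
# org_id / org_name via relation_org_query. Defined but not executed in this script;
# get_task() below uses `sql2` and looks up org info in a separate query per query_id.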
sql = """
SELECT
b.id AS task_id,
r.org_id AS org_id,
r.org_name AS org_name,
q.id AS query_id,
q.content AS content,
b.task_condition AS task_condition,
q.source_type AS source_type,
b.is_done AS is_done
FROM
task_batch_record AS b
JOIN task_search_strategy AS q ON q.id = b.query_id
JOIN relation_org_query AS r ON r.query_id = b.query_id
WHERE
b.is_done = 2
AND q.source_type = 1
LIMIT %(limit)s
"""
sql2 = """
SELECT
b.id AS task_id,
q.id AS query_id,
q.content AS content,
b.task_condition AS task_condition,
q.source_type AS source_type,
b.is_done AS is_done
FROM
task_batch_record AS b
JOIN task_search_strategy AS q ON q.id = b.query_id
WHERE
b.is_done = 0
AND q.source_type = 1
LIMIT %(limit)s
"""
def get_task(limit: int = 1):
    # MySQL connection to the task database (credentials are hard-coded in this script).
    client: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306,
                                                 database='science_data_dept', user='science-data-dept',
                                                 passwd='datadept1509', )
    cursor = client.cursor(cursors.DictCursor)
    try:
        # Let pymysql bind the limit parameter instead of interpolating it into the SQL string.
        cursor.execute(sql2, {'limit': limit})
        results = cursor.fetchall()
    except Exception as e:
        raise e
    else:
        for result in results:
            query_id = result['query_id']
            # Attach every org linked to this search strategy.
            cursor.execute('select org_id, org_name from relation_org_query where query_id=%s', (query_id,))
            org_results: List[dict] = cursor.fetchall()
            result['org_id'] = [org_result['org_id'] for org_result in org_results]
            result['org_name'] = [org_result['org_name'] for org_result in org_results]
            print(result)
            yield result
    finally:
        cursor.close()
        client.close()
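
# Chain the crawls so each spider runs to completion before the next one starts,
# then stop the CrawlerProcess once the task list is exhausted.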
@defer.inlineCallbacks
def crawl_sequentially(targets):
    for target in targets:
        print(f"\n=== Starting spider with task: {target} ===")
        yield process.crawl(WosLatestIncrementSpider, task_obj=target)
        print(f"=== Spider finished: {target} ===\n")
        # Pause between tasks; note that time.sleep() blocks the Twisted reactor for the full minute.
        time.sleep(60)
    process.stop()  # shut down the event loop once every spider has finished

# ====== Main program ======
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    targets = get_task(10)       # lazy generator of task dicts
    crawl_sequentially(targets)  # schedules the crawls on the reactor
    process.start()              # blocks until all spiders have finished