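"""Run WosLatestIncrementSpider sequentially over pending WoS tasks.

Pending tasks (q.source_type = 1, b.is_done = 0) are read from the
science_data_dept database, each row is enriched with the related org ids
and names from relation_org_query, and the rows are then fed one by one to
the spider through a shared CrawlerProcess.
"""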
import time
from typing import List

import pymysql
from pymysql import cursors
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import defer

from science_article_add.spiders.wos_latest_increment import WosLatestIncrementSpider

# NOTE: `sql` appears unused in this script; get_task() runs `sql2` and fetches
# the org columns with a separate query per task.
sql = """
SELECT
    b.id AS task_id,
    r.org_id AS org_id,
    r.org_name AS org_name,
    q.id AS query_id,
    q.content AS content,
    b.task_condition AS task_condition,
    q.source_type AS source_type,
    b.is_done AS is_done
FROM
    task_batch_record AS b
    JOIN task_search_strategy AS q ON q.id = b.query_id
    JOIN relation_org_query AS r ON r.query_id = b.query_id
WHERE
    b.is_done = 2
    AND q.source_type = 1
LIMIT %(limit)s
"""

sql2 = """
SELECT
    b.id AS task_id,
    q.id AS query_id,
    q.content AS content,
    b.task_condition AS task_condition,
    q.source_type AS source_type,
    b.is_done AS is_done
FROM
    task_batch_record AS b
    JOIN task_search_strategy AS q ON q.id = b.query_id
WHERE
    b.is_done = 0
    AND q.source_type = 1
LIMIT %(limit)s
"""


def get_task(limit: int = 1):
    """Yield up to `limit` pending tasks, each enriched with its org ids and names."""
    client: pymysql.Connection = pymysql.connect(
        host='43.140.203.187',
        port=3306,
        database='science_data_dept',
        user='science-data-dept',
        passwd='datadept1509',
    )
    cursor = client.cursor(cursors.DictCursor)
    try:
        # Let the driver bind the parameter instead of interpolating it with `%`.
        cursor.execute(sql2, {'limit': limit})
        results = cursor.fetchall()
        for result in results:
            query_id = result['query_id']
            cursor.execute(
                'SELECT org_id, org_name FROM relation_org_query WHERE query_id = %s',
                (query_id,),
            )
            org_results: List[dict] = cursor.fetchall()
            result['org_id'] = [org_result['org_id'] for org_result in org_results]
            result['org_name'] = [org_result['org_name'] for org_result in org_results]
            print(result)
            yield result
    finally:
        cursor.close()
        client.close()


@defer.inlineCallbacks
def crawl_sequentially(targets):
    for target in targets:
        print(f"\n=== Starting spider, task: {target} ===")
        yield process.crawl(WosLatestIncrementSpider, task_obj=target)
        print(f"=== Spider finished: {target} ===\n")
        # No crawl is active at this point, so blocking the Twisted reactor
        # for 60 s between runs is tolerable here.
        time.sleep(60)

    process.stop()  # Shut down the event loop once every spider has finished.


# ====== Main entry point ======
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    targets = get_task(10)
    crawl_sequentially(targets)
    process.start()  # Blocks until all crawls have finished.