From 17b5253fde993df8d1298f8df20e3a50334376eb Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Mon, 12 Jan 2026 11:04:29 +0800
Subject: [PATCH] =?UTF-8?q?add:=E5=A4=87=E4=BB=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../.idea/science_article_add.iml            | 12 ---
 science_article_add/requirements.txt         | 10 +++
 science_article_add/run.py                   | 12 ---
 science_article_add/tests/run_crawl_task.py  | 88 +++++++++++++++++++
 science_article_add/wos下载.md                |  0
 5 files changed, 98 insertions(+), 24 deletions(-)
 delete mode 100644 science_article_add/.idea/science_article_add.iml
 create mode 100644 science_article_add/requirements.txt
 delete mode 100644 science_article_add/run.py
 create mode 100644 science_article_add/tests/run_crawl_task.py
 create mode 100644 science_article_add/wos下载.md

diff --git a/science_article_add/.idea/science_article_add.iml b/science_article_add/.idea/science_article_add.iml
deleted file mode 100644
index ecabfc4..0000000
--- a/science_article_add/.idea/science_article_add.iml
+++ /dev/null
@@ -1,12 +0,0 @@
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/science_article_add/requirements.txt b/science_article_add/requirements.txt
new file mode 100644
index 0000000..bcc57ce
--- /dev/null
+++ b/science_article_add/requirements.txt
@@ -0,0 +1,10 @@
+requests~=2.32.4
+scrapy~=2.13.3
+pymongo~=4.13.0
+itemadapter~=0.11.0
+happybase~=1.2.0
+fastapi~=0.116.1
+redis~=6.2.0
+parsel~=1.10.0
+sympy~=1.14.0
+pydantic~=2.0.3
\ No newline at end of file
diff --git a/science_article_add/run.py b/science_article_add/run.py
deleted file mode 100644
index 9ddc6bf..0000000
--- a/science_article_add/run.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from scrapy.crawler import CrawlerProcess
-from scrapy.utils.project import get_project_settings
-
-from science_article_add.scripts.get_db_task import TaskManager
-
-tm = TaskManager()
-process = CrawlerProcess(get_project_settings())
-
-task = tm.get_task_from_mysql()
-
-process.crawl('wos_latest_increment', task_obj=task)
-process.start()
diff --git a/science_article_add/tests/run_crawl_task.py b/science_article_add/tests/run_crawl_task.py
new file mode 100644
index 0000000..8b685fa
--- /dev/null
+++ b/science_article_add/tests/run_crawl_task.py
@@ -0,0 +1,88 @@
+import time
+from typing import List
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+from twisted.internet import defer
+from scrapy import Spider
+import pymysql
+from pymysql import cursors
+from science_article_add.spiders.wos_latest_increment import WosLatestIncrementSpider
+
+sql = """
+SELECT
+    b.id AS task_id,
+    r.org_id AS org_id,
+    r.org_name AS org_name,
+    q.id AS query_id,
+    q.content AS content,
+    b.task_condition AS task_condition,
+    q.source_type AS source_type,
+    b.is_done AS is_done
+FROM
+    task_batch_record AS b
+    JOIN task_search_strategy AS q ON q.id = b.query_id
+    JOIN relation_org_query AS r ON r.query_id = b.query_id
+WHERE
+    b.is_done = 2
+    AND q.source_type = 1
+    LIMIT %(limit)s
+"""
+sql2 = """
+SELECT
+    b.id AS task_id,
+    q.id AS query_id,
+    q.content AS content,
+    b.task_condition AS task_condition,
+    q.source_type AS source_type,
+    b.is_done AS is_done
+FROM
+    task_batch_record AS b
+    JOIN task_search_strategy AS q ON q.id = b.query_id
+WHERE
+    b.is_done = 0
+    AND q.source_type = 1
+    LIMIT %(limit)s
+"""
+
+
+def get_task(limit: int = 1):
+    client: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306,
+                                                 database='science_data_dept', user='science-data-dept',
+                                                 passwd='datadept1509', )
+    cursor = client.cursor(cursors.DictCursor)
+    try:
+        cursor.execute(sql2, {'limit': limit})  # let the driver bind %(limit)s instead of formatting the string
+        results = cursor.fetchall()
+    except Exception as e:
+        raise e
+    else:
+        for result in results:
+            query_id = result['query_id']
+            cursor.execute('select org_id, org_name from relation_org_query where query_id=%s', (query_id,))
+            org_results: List[dict] = cursor.fetchall()
+            result['org_id'] = [org_result['org_id'] for org_result in org_results]
+            result['org_name'] = [org_result['org_name'] for org_result in org_results]
+            print(result)
+            yield result
+    finally:
+        cursor.close()
+        client.close()
+
+
+@defer.inlineCallbacks
+def crawl_sequentially(targets):
+    for target in targets:
+        print(f"\n=== Starting spider, task: {target} ===")
+        yield process.crawl(WosLatestIncrementSpider, task_obj=target)
+        print(f"=== Spider finished: {target} ===\n")
+        time.sleep(60)  # note: blocks the Twisted reactor for 60s between crawls
+
+    process.stop()  # stop the reactor once every spider has finished
+
+
+# ====== main entry point ======
+if __name__ == '__main__':
+    process = CrawlerProcess(get_project_settings())
+    targets = get_task(10)
+    crawl_sequentially(targets)
+    process.start()  # blocks until all crawls have finished
diff --git a/science_article_add/wos下载.md b/science_article_add/wos下载.md
new file mode 100644
index 0000000..e69de29