add: backup

main
zhaoxiangpeng 3 weeks ago
parent 14eea8c9d1
commit 17b5253fde

@@ -1,12 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="pydevenv" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
</module>

@@ -0,0 +1,10 @@
requests~=2.32.4
scrapy~=2.13.3
pymongo~=4.13.0
itemadapter~=0.11.0
happybase~=1.2.0
fastapi~=0.116.1
redis~=6.2.0
parsel~=1.10.0
sympy~=1.14.0
pydantic~=2.0.3

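The pins above can be installed with pip install -r requirements.txt (assuming the file is saved as requirements.txt at the repository root). A quick sanity-check sketch that the pinned libraries import and report a version:

import requests, scrapy, pymongo, itemadapter, happybase, fastapi, redis, parsel, sympy, pydantic

# Print each package's reported version; fall back to 'unknown' if a module does not expose __version__.
for mod in (requests, scrapy, pymongo, itemadapter, happybase, fastapi, redis, parsel, sympy, pydantic):
    print(mod.__name__, getattr(mod, '__version__', 'unknown'))
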
@@ -1,12 +0,0 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_add.scripts.get_db_task import TaskManager
tm = TaskManager()
process = CrawlerProcess(get_project_settings())
task = tm.get_task_from_mysql()
process.crawl('wos_latest_increment', task_obj=task)
process.start()

@@ -0,0 +1,88 @@
import time
from typing import List
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import defer
from scrapy import Spider
import pymysql
from pymysql import cursors
from science_article_add.spiders.wos_latest_increment import WosLatestIncrementSpider
# Query for task batches with is_done = 2, joined to relation_org_query for org info
# (defined for reference; get_task() below uses sql2 instead).
sql = """
SELECT
    b.id AS task_id,
    r.org_id AS org_id,
    r.org_name AS org_name,
    q.id AS query_id,
    q.content AS content,
    b.task_condition AS task_condition,
    q.source_type AS source_type,
    b.is_done AS is_done
FROM
    task_batch_record AS b
    JOIN task_search_strategy AS q ON q.id = b.query_id
    JOIN relation_org_query AS r ON r.query_id = b.query_id
WHERE
    b.is_done = 2
    AND q.source_type = 1
LIMIT %(limit)s
"""

# Query for task batches with is_done = 0; org info is attached row by row in get_task().
sql2 = """
SELECT
    b.id AS task_id,
    q.id AS query_id,
    q.content AS content,
    b.task_condition AS task_condition,
    q.source_type AS source_type,
    b.is_done AS is_done
FROM
    task_batch_record AS b
    JOIN task_search_strategy AS q ON q.id = b.query_id
WHERE
    b.is_done = 0
    AND q.source_type = 1
LIMIT %(limit)s
"""
def get_task(limit: int = 1):
    """Fetch up to `limit` task batches (is_done = 0, source_type = 1) and attach their org info."""
    client: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306,
                                                 database='science_data_dept', user='science-data-dept',
                                                 passwd='datadept1509')
    cursor = client.cursor(cursors.DictCursor)
    try:
        # Let pymysql bind the %(limit)s placeholder instead of interpolating it into the string.
        cursor.execute(sql2, {'limit': limit})
        results = cursor.fetchall()
    except Exception:
        raise
    else:
        for result in results:
            query_id = result['query_id']
            # Attach every organization linked to this query strategy.
            cursor.execute('select org_id, org_name from relation_org_query where query_id=%s', (query_id,))
            org_results: List[dict] = cursor.fetchall()
            result['org_id'] = [org_result['org_id'] for org_result in org_results]
            result['org_name'] = [org_result['org_name'] for org_result in org_results]
            print(result)
            yield result
    finally:
        cursor.close()
        client.close()
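
# Usage sketch: get_task() is a generator, so rows are produced lazily and the MySQL
# connection stays open until the generator is exhausted, e.g.
#
#     for t in get_task(limit=5):
#         print(t['task_id'], t['org_name'])
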
@defer.inlineCallbacks
def crawl_sequentially(targets):
    for target in targets:
        print(f"\n=== Starting spider with task: {target} ===")
        yield process.crawl(WosLatestIncrementSpider, task_obj=target)
        print(f"=== Spider finished: {target} ===\n")
        time.sleep(60)  # pause between tasks (note: this blocks the reactor for the full minute)
    process.stop()  # shut the event loop down once every spider has finished


# ====== Main entry point ======
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    targets = get_task(10)
    crawl_sequentially(targets)
    process.start()  # blocks until all crawls have completed
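
One caveat in the runner above: time.sleep(60) inside an inlineCallbacks generator blocks the Twisted reactor for the whole minute. A non-blocking variant can yield a deferred from twisted.internet.task.deferLater instead; this is only a sketch on top of the module above (the name crawl_sequentially_nonblocking is hypothetical, and process/WosLatestIncrementSpider come from the file shown here):

@defer.inlineCallbacks
def crawl_sequentially_nonblocking(targets):
    # Imported here, after CrawlerProcess has already installed Scrapy's configured reactor.
    from twisted.internet import reactor, task
    for target in targets:
        yield process.crawl(WosLatestIncrementSpider, task_obj=target)
        yield task.deferLater(reactor, 60, lambda: None)  # 60 s pause that does not block the reactor
    process.stop()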