You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

66 lines
2.1 KiB
Python

# -*- coding: utf-8 -*-
# @Time : 2026/1/14 13:59
# @Author : zhaoxiangpeng
# @File : crawl_article_latest.py
import math
from typing import List
import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_wos.spiders.wos_latest_increment import WosLatestIncrementSpider
# INSERT template for a new task_batch_record row (batch_date, query_id, task_condition).
# It is not referenced anywhere in the visible part of this file.
# NOTE(review): only the batch_date placeholder is wrapped in literal double quotes, which
# suggests the template is meant for client-side %-formatting rather than driver-side
# parameter binding — confirm at the call site before reusing it.
CREATE_RECORD_SQL = '''insert into task_batch_record (batch_date, query_id, task_condition) VALUES ("%(batch_date)s", %(query_id)s, %(task_condition)s)'''
# Selects unfinished batch records (is_done = 0) joined to their search strategy, restricted to
# strategies with source_type = 1 and capped at `limit` rows. There is no ORDER BY, so which
# pending rows come back is up to the database.
# NOTE(review): source_type = 1 presumably marks Web of Science strategies — confirm against
# the task_search_strategy table.
SELECT_RECORD_SQL = """
SELECT
b.id AS task_id,
q.id AS query_id,
q.content AS content,
b.task_condition AS task_condition,
q.source_type AS source_type,
b.is_done AS is_done
FROM
task_batch_record AS b
JOIN task_search_strategy AS q ON q.id = b.query_id
WHERE
b.is_done = 0
AND q.source_type = 1
LIMIT %(limit)s
"""
def _load_pending_task():
    """Fetch one pending task row and its related organisations from MySQL.

    Returns:
        The row selected by ``SELECT_RECORD_SQL`` as a dict, extended with the
        list-valued keys ``org_id`` and ``org_name`` taken from
        ``relation_org_query``; ``None`` when no pending task exists.
    """
    # NOTE(review): connection credentials are hard-coded in source; they should
    # be moved to project settings or environment variables.
    client: pymysql.Connection = pymysql.connect(
        host='43.140.203.187', port=3306,
        database='science_data_dept', user='science-data-dept',
        passwd='datadept1509',
    )
    try:
        with client.cursor(cursors.DictCursor) as cursor:
            # Let the driver bind the parameter instead of %-formatting the SQL text.
            cursor.execute(SELECT_RECORD_SQL, {'limit': 1})
            task = cursor.fetchone()
            if task is None:
                return None
            cursor.execute(
                'select org_id, org_name from relation_org_query where query_id=%s',
                (task['query_id'],),
            )
            org_rows: List[dict] = cursor.fetchall()
    finally:
        # Always release the connection, even when a query fails.
        client.close()

    task['org_id'] = [org_row['org_id'] for org_row in org_rows]
    task['org_name'] = [org_row['org_name'] for org_row in org_rows]
    return task


def starter_latest_all():
    """Crawl one pending task batch with ``WosLatestIncrementSpider``.

    One unfinished task is read from MySQL and handed to the spider as the
    ``task_obj`` argument. The call blocks until ``CrawlerProcess.start()``
    returns. When no pending task exists, no crawl is scheduled.
    """
    import logging

    process = CrawlerProcess(get_project_settings())

    @defer.inlineCallbacks
    def schedule_crawl():
        init_params = _load_pending_task()
        if init_params is None:
            # Without this guard the lookup of 'query_id' on None would raise
            # TypeError inside the deferred and only surface as an unhandled error.
            logging.getLogger(__name__).info('No pending task found; nothing to crawl.')
            return
        yield process.crawl(WosLatestIncrementSpider, task_obj=init_params)

    schedule_crawl()
    process.start()
    # Kept from the original flow: request a stop of any crawl still registered
    # once start() has returned.
    process.stop()
def starter():
    """Run ``WosLatestIncrementSpider`` once with the project settings.

    No spider arguments are passed. The call blocks until
    ``CrawlerProcess.start()`` returns.
    """
    project_settings = get_project_settings()
    crawler_process = CrawlerProcess(project_settings)
    crawler_process.crawl(WosLatestIncrementSpider)
    crawler_process.start()
# Script entry point: running this file directly crawls one pending task batch.
if __name__ == '__main__':
    starter_latest_all()