You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

61 lines
2.3 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
# @Time : 2026/2/28 09:36
# @Author : zhaoxiangpeng
# @File : crawl_article_latest.py
import time
from typing import List
import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_cnki.spiders.cnki_latest_increment import CnkiLatestIncrementSpider
def get_connect() -> pymysql.Connection:
    """Open a new MySQL connection to the science_data_dept database.

    Returns:
        pymysql.Connection: an open connection; the caller is responsible
        for closing it.
    """
    # SECURITY: credentials are hard-coded in source. Move host/user/password
    # to environment variables or a config file before sharing this repo.
    conn: pymysql.Connection = pymysql.connect(
        host='43.140.203.187',
        port=3306,
        database='science_data_dept',
        user='science-data-dept',
        # 'password' is the supported keyword; 'passwd' is a legacy alias.
        password='datadept1509',
    )
    return conn
def starter():
    """Run the CNKI latest-increment spider with the project's Scrapy settings."""
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CnkiLatestIncrementSpider)
    crawler_process.start()
def starter_latest_by_record(record_id: int):
    """Crawl CNKI latest articles for a single task-batch record.

    Looks up the batch record and its joined search strategy (CNKI tasks,
    source_type=5) in MySQL, attaches every organisation bound to the query,
    and feeds the assembled task object to CnkiLatestIncrementSpider.

    Args:
        record_id: primary key of the row in task_batch_record.

    Raises:
        ValueError: if no CNKI task row matches record_id.
    """

    @defer.inlineCallbacks
    def f():
        client: pymysql.Connection = get_connect()
        try:
            # Cursor as a context manager so it is closed even on error.
            with client.cursor(cursors.DictCursor) as cursor:
                cursor.execute(
                    'select b.id as task_id, q.id as query_id, q.content as content, '
                    'b.task_condition as task_condition, q.source_type as source_type, '
                    'b.is_done as is_done from task_batch_record as b '
                    'join task_search_strategy as q on b.query_id=q.id '
                    'where b.id=%s and q.source_type=5 limit 1',
                    (record_id,))
                result = cursor.fetchone()
                if result is None:
                    raise ValueError(f'no CNKI (source_type=5) task found for record_id={record_id}')
                # Attach every organisation bound to this query.
                cursor.execute(
                    'select org_id, org_name from relation_org_query where query_id=%s',
                    (result['query_id'],))
                org_results: List[dict] = cursor.fetchall()
                result['org_id'] = [row['org_id'] for row in org_results]
                result['org_name'] = [row['org_name'] for row in org_results]
        finally:
            # BUGFIX: the original never closed the connection.
            client.close()
        # BUGFIX: the original overwrote this freshly built task object with a
        # hard-coded debug dict that referenced an undefined name `years`
        # (guaranteed NameError at runtime). Use the database-derived object.
        init_params = result
        yield process.crawl(CnkiLatestIncrementSpider, task_obj=init_params)

    process = CrawlerProcess(get_project_settings())
    f()
    process.start()
    process.stop()
if __name__ == '__main__':
    # Entry point: run a single latest-increment crawl for
    # task_batch_record id 8057 (hard-coded ad-hoc run).
    starter_latest_by_record(8057)