# -*- coding: utf-8 -*-
# @Time   : 2026/2/28 09:36
# @Author : zhaoxiangpeng
# @File   : crawl_article_latest.py
import time
|
|
|
from typing import List
|
|
|
import pymysql
|
|
|
from pymysql import cursors
|
|
|
from twisted.internet import defer
|
|
|
from scrapy.crawler import CrawlerProcess
|
|
|
from scrapy.utils.project import get_project_settings
|
|
|
from science_article_cnki.spiders.cnki_latest_increment import CnkiLatestIncrementSpider
|
|
|
|
|
|
|
|
|
def get_connect(host: str = '43.140.203.187',
                port: int = 3306,
                database: str = 'science_data_dept',
                user: str = 'science-data-dept',
                passwd: str = 'datadept1509') -> pymysql.Connection:
    """Open and return a new MySQL connection to the task database.

    All connection settings are now overridable keyword parameters; calling
    with no arguments behaves exactly as before.

    SECURITY NOTE(review): credentials are hard-coded in source as the
    defaults here (carried over from the original) — move them to
    environment variables or a config file, and rotate the password.

    Returns:
        pymysql.Connection: an open connection; the caller is responsible
        for closing it.
    """
    return pymysql.connect(host=host, port=port,
                           database=database, user=user,
                           passwd=passwd)
|
|
|
|
|
|
|
|
|
def starter():
    """Run the CNKI latest-increment spider once with the project settings.

    Blocks until the crawl completes (CrawlerProcess.start runs the
    twisted reactor in the foreground).
    """
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CnkiLatestIncrementSpider)
    crawler_process.start()
|
|
|
|
|
|
|
|
|
def starter_latest_by_record(record_id: int):
    """Crawl the CNKI latest-increment spider for one task batch record.

    Loads the task/query metadata for *record_id* (restricted to
    source_type=5, i.e. CNKI) plus its related organisations from MySQL,
    then schedules the spider with that metadata as ``task_obj`` and runs
    the reactor to completion.

    Args:
        record_id: primary key of the row in ``task_batch_record``.

    Raises:
        LookupError: if no matching CNKI record exists for *record_id*.
    """

    @defer.inlineCallbacks
    def run_crawl():
        # Fetch the task metadata; always close the connection, even on error
        # (the original leaked it).
        client: pymysql.Connection = get_connect()
        try:
            cursor = client.cursor(cursors.DictCursor)
            cursor.execute(
                'select b.id as task_id, q.id as query_id, q.content as content, b.task_condition as task_condition, q.source_type as source_type, b.is_done as is_done from task_batch_record as b join task_search_strategy as q on b.query_id=q.id where b.id=%s and q.source_type=5 limit 1',
                (record_id,))
            result = cursor.fetchone()
            if result is None:
                # Original crashed with TypeError on result['query_id'];
                # fail with an explicit, descriptive error instead.
                raise LookupError(
                    'no CNKI task_batch_record found for id %s' % record_id)
            query_id = result['query_id']
            # Attach the organisations linked to this query.
            cursor.execute('select org_id, org_name from relation_org_query where query_id=%s', (query_id,))
            org_results: List[dict] = cursor.fetchall()
            result['org_id'] = [row['org_id'] for row in org_results]
            result['org_name'] = [row['org_name'] for row in org_results]
        finally:
            client.close()

        # BUG FIX: the original overwrote `result` with a hard-coded debug
        # dict that referenced an undefined name (`years`), which raised
        # NameError at runtime and dead-coded the whole DB lookup above.
        # Use the DB-derived metadata as intended.
        init_params = result
        yield process.crawl(CnkiLatestIncrementSpider, task_obj=init_params)

    process = CrawlerProcess(get_project_settings())
    run_crawl()
    process.start()   # blocks until the crawl finishes
    process.stop()
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Entry point: crawl the single task batch record with id 8057.
    starter_latest_by_record(8057)
|