wos: incremental collection
parent a95f242bd5
commit 1bf0703dba
@@ -1,3 +1,11 @@
sqlalchemy~=1.3.24
requests~=2.32.4
scrapy~=2.13.3
itemadapter~=0.11.0
pymongo~=4.13.0
happybase~=1.2.0
fastapi~=0.116.1
redis~=6.2.0
parsel~=1.10.0
sympy~=1.14.0
pydantic~=2.0.3
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/14 16:17
# @Author : zhaoxiangpeng
# @File : extensions.py
import logging

import pymysql
from scrapy import signals
from scrapy.crawler import Crawler

logger = logging.getLogger(__name__)


class LatestSpiderProtocol:
    """Attributes and method a spider must expose for ACKExtension to acknowledge it."""
    name: str
    record_id: int
    org_id: int
    org_name: str
    query_id: int
    query_content: str

    def get_records_found(self) -> int: ...


class ACKExtension:
    """Writes the task state back to the task_batch_record table on spider signals."""

    def __init__(self, crawler: Crawler):
        self.crawler = crawler
        self.change_state_sql = 'update task_batch_record set %(update_kws)s where %(update_cond)s'

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler=crawler)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_opened(self, spider):
        # Spider started: flag the task record as in progress (is_done=2).
        kws = {
            'is_done': 2,
        }
        sql = self.change_state_sql % {
            'update_kws': ', '.join([f'{k}={v}' for k, v in kws.items()]),
            'update_cond': 'id=%(record_id)s' % {'record_id': spider.record_id}
        }
        self._execute_sql(sql)

    def spider_closed(self, spider: LatestSpiderProtocol):
        """
        # Update the task state
        # Send a notification
        """
        # Spider finished: mark the record as done (is_done=1) and store the result count.
        kws = {
            'is_done': 1,
            'result_count': spider.get_records_found(),
            'updated_time': 'CURRENT_TIMESTAMP'
        }
        sql = self.change_state_sql % {
            'update_kws': ', '.join([f'{k}={v}' for k, v in kws.items()]),
            'update_cond': 'id=%(record_id)s' % {'record_id': spider.record_id}
        }
        self._execute_sql(sql)

    def spider_error(self, spider: LatestSpiderProtocol):
        # Spider raised an error: mark the record as failed (is_done=-1).
        kws = {
            'is_done': -1,
            'updated_time': 'CURRENT_TIMESTAMP'
        }
        sql = self.change_state_sql % {
            'update_kws': ', '.join([f'{k}={v}' for k, v in kws.items()]),
            'update_cond': 'id=%(record_id)s' % {'record_id': spider.record_id}
        }
        self._execute_sql(sql)

    def _execute_sql(self, sql):
        settings = self.crawler.settings
        client = pymysql.connect(
            host=settings.get('MYSQL_HOST'),
            port=settings.get('MYSQL_PORT', 3306),
            database=settings.get('MYSQL_DATABASE'),
            user=settings.get('MYSQL_USER'),
            passwd=settings.get('MYSQL_PASSWORD'),
        )
        try:
            cursor = client.cursor()
            cursor.execute(sql)
            cursor.connection.commit()
            logger.info(f'Execute SQL: {sql}')
        except Exception as e:
            logger.exception(e)
        finally:
            client.close()
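ACKExtension builds its UPDATE statements by string interpolation. As a reference point only, here is a minimal sketch of the same "mark record done" update written with pymysql parameter binding; the helper name and connection kwargs are illustrative and not part of this commit:

import pymysql

def mark_record_done(conn_kwargs: dict, record_id: int, result_count: int) -> None:
    # Hypothetical helper: same effect as ACKExtension.spider_closed, with bound parameters.
    client = pymysql.connect(**conn_kwargs)
    try:
        with client.cursor() as cursor:
            cursor.execute(
                "UPDATE task_batch_record "
                "SET is_done=%s, result_count=%s, updated_time=CURRENT_TIMESTAMP "
                "WHERE id=%s",
                (1, result_count, record_id),
            )
        client.commit()
    finally:
        client.close()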
@@ -0,0 +1,58 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScienceArticleWosItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class AddItemBase(scrapy.Item):
    third_id = scrapy.Field()
    updated_at = scrapy.Field()


class ArticleItem(AddItemBase):
    exported = scrapy.Field()


class ArticleCitedItem(AddItemBase):
    cited = scrapy.Field()


class WosArticleItem(ArticleItem):
    """WOS article item."""
    __tablename__ = 'data_wos_article'

    third_id = scrapy.Field()
    exported = scrapy.Field()
    updated_at = scrapy.Field()


class WosIdRelationItem(AddItemBase):
    __tablename__ = 'relation_school_wos'

    query_ids = scrapy.Field()
    school_ids = scrapy.Field()
    task_ids = scrapy.Field()


class WosArticleTodoIdItem(scrapy.Item):
    __tablename__ = 'todo_ids_wos'

    third_id = scrapy.Field()
    state = scrapy.Field()


class WosCitedNumberItem(ArticleCitedItem):
    """Citation-count item for a published article."""
    __tablename__ = 'relation_cited_number_wos'

    third_id = scrapy.Field()
    cited = scrapy.Field()
    updated_at = scrapy.Field()
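A quick usage sketch for the relation item above, converting it to a plain dict with itemadapter (already pinned in requirements.txt); all field values here are fabricated:

from datetime import datetime
from itemadapter import ItemAdapter
# from science_article_wos.items import WosIdRelationItem  # import path as used by the spider

item = WosIdRelationItem(
    third_id="WOS:000000000001",  # fabricated UID
    query_ids=[7],
    school_ids=[3],
    task_ids=[42],
    updated_at=datetime.now(),
)
print(ItemAdapter(item).asdict())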
@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/14 14:20
# @Author : zhaoxiangpeng
# @File : __init__.py
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
# @Time : 2024/3/5 16:05
# @Author : zhaoxiangpeng
# @File : parse_data.py

import logging
from typing import Union
from science_article_wos.utils.tools import str2int

logger = logging.getLogger(__name__)


DEFAULT_TABLE_HEAD = ['PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS', 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'C3', 'RP', 'EM', 'RI', 'OI', 'FU', 'FP', 'FX', 'CR', 'NR', 'TC', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA', 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN', 'SU', 'SI', 'MA', 'BP', 'EP', 'AR', 'DI', 'DL', 'D2', 'EA', 'PG', 'WC', 'WE', 'SC', 'GA', 'PM', 'OA', 'HC', 'HP', 'DA', 'UT']
DEFAULT_TABLE_HEAD_LOWER = [h.lower() for h in DEFAULT_TABLE_HEAD]


def to_dict(data, headers: list):
    """Zip one tab-separated record line with the header row; 'py' (year) is coerced to int."""
    data_text = data.strip().decode()
    _to_dict = {}

    for key, value in zip(headers, data_text.split('\t')):
        if not value:
            value = None
        _to_dict[key] = value

    vyear = None
    try:
        vyear = str2int(_to_dict.get("py"), None)
        if not vyear:
            logger.warning("WOS号: %s,年份异常: %s" % (_to_dict["ut"], _to_dict.get("py")))
    except Exception as e:
        logger.exception("""
        原始数据: %s,
        数据字典: %s
        异常信息: %s""" % (data, _to_dict, e))

    _to_dict["py"] = vyear

    return _to_dict


def parse_full_records_txt(content: bytes):
    """Split a tab-delimited WOS export into dicts, one per record line."""
    lines = content.strip().split(b'\r\n')
    head_line = lines.pop(0)
    try:
        head_start = head_line.index(b'PT')
        head_line = head_line[head_start:]
        head_line = head_line.strip().decode('utf-8')
        HEADERS = head_line.split('\t')
        HEADERS = [s.lower() for s in HEADERS]
    except ValueError:
        logger.error("内容出现异常跳过: %s" % head_line)
        HEADERS = ['PT', 'AU', 'Z2', 'AF', 'BA', 'BF', 'CA', 'GP', 'BE', 'TI', 'Z1', 'SO', 'Z3', 'SE', 'BS', 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'Z5', 'ID', 'AB', 'Z4', 'C1', 'Z6', 'RP', 'EM', 'Z7', 'RI', 'OI', 'FU', 'FX', 'CR', 'NR', 'TC', 'Z9', 'Z8', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA', 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'SI', 'PN', 'SU', 'MA', 'BP', 'EP', 'AR', 'DI', 'D2', 'EA', 'EY', 'PG', 'P2', 'WC', 'SC', 'PM', 'UT', 'OA', 'HP', 'HC', 'DA', 'C3']
        HEADERS = [s.lower() for s in HEADERS]

    while lines:
        line_data = lines.pop(0)
        # print(line_data)
        standard_data = to_dict(line_data, HEADERS)
        # third_id = standard_data.pop('ut', None)
        # if not third_id:
        #     continue
        yield standard_data


def parse_full_records(body: Union[bytes, str]):
    """
    Parse the downloaded response body.
    """
    if isinstance(body, str):
        body = body.encode()
    item_g = parse_full_records_txt(body)
    for data_dic in item_g:
        yield data_dic
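To see what parse_full_records yields, here is a tiny fabricated export with only three of the WOS columns (real exports carry the full DEFAULT_TABLE_HEAD set); keys come out lower-cased and 'py' is coerced to an int:

# Fabricated two-line export: header row plus one record, tab-separated, CRLF-terminated.
sample = b"PT\tPY\tUT\r\nJ\t2024\tWOS:000000000001\r\n"
for record in parse_full_records(sample):
    print(record)  # {'pt': 'J', 'py': 2024, 'ut': 'WOS:000000000001'}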
@@ -0,0 +1,135 @@
# Scrapy settings for science_article_wos project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "science_article_wos"

SPIDER_MODULES = ["science_article_wos.spiders"]
NEWSPIDER_MODULE = "science_article_wos.spiders"

ADDONS = {}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "science_article_wos.middlewares.ScienceArticleAddSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
RETRY_ENABLED = True
RETRY_TIMES = 2  # retry each failed request up to 2 extra times
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # extends the defaults with some common error codes
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550
    # "org_news.middlewares.OrgNewsDownloaderMiddleware": 543,
}
#DOWNLOADER_MIDDLEWARES = {
#    "science_article_wos.middlewares.ScienceArticleAddDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
    # "scrapy.extensions.telnet.TelnetConsole": None,
    # "science_article_wos.extensions.ackextension.ACKExtension": 0,
    # "science_article_wos.extensions.dingtalk_extension.DingTalkExtension": 0,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "science_article_wos.pipelines.ScienceArticleAddPipeline": 300,
#}

# MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
# MONGO_DATABASE = "science2"
MONGO_URI = "mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true"
MONGO_DATABASE = 'science2'

MONGO_URI_SCIENCE = "mongodb://root:kcidea1509%21%25%29%28@43.140.203.187:27017/"
MONGO_DATABASE_SCIENCE = 'science'

# REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'
REDIS_URL = 'redis://:kcidea1509!%)(@43.140.203.187:6379/10'

# MySQL configuration
MYSQL_HOST = '43.140.203.187'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'science_data_dept'
MYSQL_USER = 'science-data-dept'
MYSQL_PASSWORD = 'datadept1509'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"

# DingTalk bot configuration
DINGTALK_WEBHOOK_URL = 'https://oapi.dingtalk.com/robot/send?access_token=1252fe1ef63e95ced11ac87a01e9978670e82036a516c558e524f89e11513f9f'
DINGTALK_SECRET = 'SECe77fe7cd6c0dbfcdd9ebe6ba1941ddc376be86ca717e9d68bb177b7eded71091'
# Custom message templates (optional)
DINGTALK_START_MESSAGE = "🚀 爬虫启动啦!\n**爬虫**: %(spider_name)s\n**时间**: %(started_time)s"
# DINGTALK_CLOSED_MESSAGE = "✅ 爬虫完成!\n**爬虫**: %(spider_name)s\n**项目数**: %(item_scraped_count)s"
# Enable/disable specific notifications
DINGTALK_ENABLE_START = False
DINGTALK_ENABLE_FINISH = True
DINGTALK_ENABLE_ERROR = True
DINGTALK_CLOSED_MESSAGE = """📊 爬虫完成通知\n
**爬虫名称**: %(spider_name)s\n
**机构名称**: %(org_name)s\n
**任务条件**: %(task_condition)s\n
**任务ID**: %(record_id)s\n
**完成时间**: %(finished_time)s\n
**完成原因**: %(finish_reason)s\n
**采集统计**:\n
- 采集项目: %(item_scraped_count)s 条
- 请求响应: %(response_count)s 次
- 错误数量: %(error_count)s 个\n
**状态**: %(state)s"""
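A quick sanity check that the database settings above are what Scrapy hands to the extension and pipelines (run from the project root so scrapy.cfg is picked up):

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get("MYSQL_HOST"), settings.getint("MYSQL_PORT", 3306))
print(settings.get("MONGO_DATABASE"), settings.get("REDIS_URL"))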
@@ -0,0 +1,131 @@
import math
from datetime import datetime
from urllib.parse import urlencode
from copy import deepcopy

import scrapy
from scrapy.http.response.json import JsonResponse

from science_article_wos.items import WosCitedNumberItem, WosIdRelationItem, WosArticleTodoIdItem
from science_article_wos.models import wos_model as model
from science_article_wos.configs import wos as config
from science_article_wos.utils import tools


def calculate_next_page(next_page: int = 1, page_size: int = 100):
    """Return the 1-based index of the first record on the given page."""
    return (next_page - 1) * page_size + 1


class WosLatestIncrementSpider(scrapy.Spider):
    name = "wos_latest_increment"
    # allowed_domains = ["wos-api.clarivate.com"]
    # start_urls = ["https://wos-api.clarivate.com/api/woslite"]
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_wos.middlewares.WosStarterApiXkeyDownloaderMiddleware": 500
        },
        ITEM_PIPELINES={
            "science_article_wos.pipelines.CitedRelation2MongoPipeline": 300,
            "science_article_wos.pipelines.SchoolRelation2MongoPipeline": 350,
            "science_article_wos.pipelines.DupTodoBySciencePipeline": 400,
            # "science_article_wos.pipelines.DupTodoPipeline": 400,
        },
        EXTENSIONS={
            "science_article_wos.extensions.ACKExtension": 0,
            # "science_article_wos.extensions.dingtalk_extension.DingTalkExtension": 0,
        },
        LOG_LEVEL="INFO"
    )
    source = "wos"

    def __init__(self, task_obj):
        scrapy.Spider.__init__(self)
        self.task_obj = task_obj
        self.record_id = task_obj['task_id']
        self.org_id = self.tolist(task_obj['org_id'])
        self.org_name = self.tolist(task_obj['org_name'])
        self.query_id = task_obj['query_id']
        self.query_content = task_obj['content']
        self.query_condition = task_obj['task_condition']

        self.first_page = task_obj.get('first_page', 1)
        self._records_found = 0

    @staticmethod
    def tolist(datas) -> list:
        if isinstance(datas, (list, tuple, set)):
            return list(set(datas))
        else:
            raise TypeError("不支持的类型:%s" % (type(datas)))

    async def start(self):
        # Combine the stored search strategy with the optional task condition.
        full_query = self.query_content
        if self.query_condition is not None:
            full_query = '%(query)s%(condition)s' % {
                'query': f'({self.query_content})' if self.query_condition else self.query_content,
                'condition': ' ' + self.query_condition if self.query_condition else ''
            }
        self.logger.info(f'full_query: {full_query}')
        meta = dict(q=full_query, page=self.first_page, limit=50, detail="short")
        params = model.starter_documents_get(**meta)
        enc_params = urlencode(params, doseq=True)
        yield scrapy.Request(url=config.WOS_STARTER_DOCUMENT_API + '?' + enc_params,
                             meta=meta)

    async def parse(self, response: JsonResponse, **kwargs):
        meta = response.meta
        request: scrapy.Request = response.request
        task_query_id: int = self.query_id
        task_org_id: list = self.org_id
        task_record_id: int = self.record_id

        if response.status != 200:
            self.logger.warning("""
            响应异常
            状态码: %s
            响应内容: %s""" % (response.status, response.text))
            # Error responses carry no usable JSON payload; stop here.
            return
        req_meta = request.meta
        resp_result = response.json()
        metadata: dict = resp_result.get("metadata")
        current_page = metadata.get("page")
        records_found = metadata.get('total')

        max_page = req_meta.get("MAX_PAGE")
        if req_meta.get("page") == self.first_page:
            self.logger.info("""
            检索式: %s
            检索到结果: %s""" % (req_meta.get("q"), records_found))
            self.set_records_found(records_found)
            # Compute the total page count once, from the first response.
            max_page = req_meta["MAX_PAGE"] = math.ceil(records_found / config.WOS_STARTER_PER_PAGE_LIMIT)
        batch_time = datetime.now()
        hits: list = resp_result.get("hits")
        for record in hits:
            third_id = record.get("uid")
            cited_num = tools.get_list_key(array=record.get("citations"), target="count", condition=("db", "WOS"))
            if cited_num:
                cited_item = WosCitedNumberItem()
                cited_item['third_id'] = third_id
                cited_item['cited'] = cited_num
                cited_item['updated_at'] = batch_time
                yield cited_item
            relation_item = WosIdRelationItem()
            relation_item['third_id'] = third_id
            relation_item['query_ids'] = [task_query_id]
            relation_item['school_ids'] = task_org_id
            relation_item['task_ids'] = [task_record_id]
            relation_item['updated_at'] = batch_time
            yield relation_item
            yield WosArticleTodoIdItem(third_id=third_id, state=0)

        # Queue the next page until MAX_PAGE is reached.
        if current_page < max_page:
            meta_copy: dict = deepcopy(req_meta)
            meta_copy.update({'page': meta_copy['page'] + 1})
            yield scrapy.Request(
                config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta_copy)),
                meta=meta_copy)

    def set_records_found(self, val):
        self._records_found = val

    def get_records_found(self) -> int:
        return self._records_found
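Two small worked examples of the spider's request logic: how start() assembles the full query from query_content and task_condition, and how many pages parse() will fetch. The concrete values and the per-page limit of 50 (mirroring the limit used in start(); WOS_STARTER_PER_PAGE_LIMIT is assumed to match) are for illustration only:

import math

query_content = "OG=(Some University)"   # fabricated search strategy
query_condition = "AND PY=2025"          # fabricated task_condition
full_query = f"({query_content}) {query_condition}"  # what start() builds when a condition is set
print(full_query)                        # (OG=(Some University)) AND PY=2025

records_found, per_page = 1234, 50       # per_page assumed to equal WOS_STARTER_PER_PAGE_LIMIT
print(math.ceil(records_found / per_page))  # 25 pages in total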
@@ -0,0 +1,32 @@
from typing import List, Tuple
from datetime import datetime


def str2int(val, replace=0):
    try:
        val = int(val)
    except (ValueError, TypeError):
        val = replace
    return val


def get_today_date(fmt: str = "%Y-%m-%d"):
    return datetime.today().strftime(fmt)


def get_list_key(array: List[dict], target: str, condition: Tuple[str, str]):
    """
    Given a list like [{key: val1, target: val2}, {key: val1, target: val2}],
    return the value of `target` from the first dict whose `condition` key matches.
    :param array: list of dicts to search
    :param target: key whose value should be returned
    :param condition: (key, value) pair a dict must match
    :return: the matched value, or None if nothing matches
    """
    n, v = condition
    for dic in array:
        if dic.get(n) == v:
            return dic.get(target)
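get_list_key is how the spider extracts the WOS citation count from a record's citations array; with a fabricated record it behaves like this:

citations = [{"db": "BIOSIS", "count": 3}, {"db": "WOS", "count": 17}]  # fabricated values
print(get_list_key(array=citations, target="count", condition=("db", "WOS")))     # 17
print(get_list_key(array=citations, target="count", condition=("db", "SCIELO")))  # None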
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = science_article_wos.settings

[deploy]
#url = http://localhost:6800/
project = science_article_wos
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/14 13:59
# @Author : zhaoxiangpeng
# @File : crawl_article_latest.py

import math
from typing import List
import pymysql
from pymysql import cursors
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_wos.spiders.wos_latest_increment import WosLatestIncrementSpider

CREATE_RECORD_SQL = '''insert into task_batch_record (batch_date, query_id, task_condition) VALUES ("%(batch_date)s", %(query_id)s, %(task_condition)s)'''
SELECT_RECORD_SQL = """
SELECT
    b.id AS task_id,
    q.id AS query_id,
    q.content AS content,
    b.task_condition AS task_condition,
    q.source_type AS source_type,
    b.is_done AS is_done
FROM
    task_batch_record AS b
    JOIN task_search_strategy AS q ON q.id = b.query_id
WHERE
    b.is_done = 0
    AND q.source_type = 1
LIMIT %(limit)s
"""


def starter_latest_all():
    @defer.inlineCallbacks
    def f():
        # Pick one pending task (is_done=0) and the search strategy it belongs to.
        client: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306,
                                                     database='science_data_dept', user='science-data-dept',
                                                     passwd='datadept1509', )
        cursor = client.cursor(cursors.DictCursor)
        cursor.execute(SELECT_RECORD_SQL % {'limit': 1})
        result = cursor.fetchone()
        query_id = result['query_id']
        # Attach the organisations linked to this query.
        cursor.execute('select org_id, org_name from relation_org_query where query_id=%s', (query_id,))
        org_results: List[dict] = cursor.fetchall()
        result['org_id'] = [org_result['org_id'] for org_result in org_results]
        result['org_name'] = [org_result['org_name'] for org_result in org_results]
        client.close()

        init_params = result
        yield process.crawl(WosLatestIncrementSpider, task_obj=init_params)

    process = CrawlerProcess(get_project_settings())
    f()
    process.start()
    process.stop()


def starter():
    process = CrawlerProcess(get_project_settings())
    process.crawl(WosLatestIncrementSpider)
    process.start()


if __name__ == '__main__':
    starter_latest_all()
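For reference, the shape of the task_obj dict the runner hands to WosLatestIncrementSpider, following the column aliases in SELECT_RECORD_SQL plus the two org lists; every value here is fabricated:

task_obj = {
    "task_id": 42,                      # task_batch_record.id
    "query_id": 7,                      # task_search_strategy.id
    "content": "OG=(Some University)",  # the stored search strategy
    "task_condition": "AND PY=2025",    # extra condition appended by start()
    "source_type": 1,
    "is_done": 0,
    "org_id": [3],                      # from relation_org_query
    "org_name": ["Some University"],
}
# WosLatestIncrementSpider(task_obj=task_obj) reads task_id, query_id, content,
# task_condition, org_id and org_name from this dict.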