add: add DingTalk notification to Scrapy
parent
f977b8ad51
commit
ea68319ee6
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
# @Time : 2025/10/23 16:30
# @Author : deepseek
# @File : dingtalk.py
# extensions/dingtalk_extension.py
import json
import logging

import requests
from scrapy import signals
from scrapy.exceptions import NotConfigured


class DingTalkExtension:
    """DingTalk robot notification extension."""

    def __init__(self, webhook_url, spider_start_message=None, spider_closed_message=None):
        self.webhook_url = webhook_url
        self.spider_start_message = spider_start_message
        self.spider_closed_message = spider_closed_message
        self.stats = None
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_crawler(cls, crawler):
        # Read the DingTalk webhook URL from the settings
        webhook_url = crawler.settings.get('DINGTALK_WEBHOOK_URL')
        if not webhook_url:
            raise NotConfigured('DINGTALK_WEBHOOK_URL must be set')

        # Optional custom message templates
        start_msg = crawler.settings.get('DINGTALK_START_MESSAGE')
        closed_msg = crawler.settings.get('DINGTALK_CLOSED_MESSAGE')

        ext = cls(webhook_url, start_msg, closed_msg)

        # Connect the signal handlers
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)

        return ext

    def spider_opened(self, spider):
        """Send a notification when the spider starts."""
        self.stats = spider.crawler.stats
        message = self.spider_start_message or self._get_default_start_message(spider)
        self._send_dingtalk_message(message, spider)

    def spider_closed(self, spider, reason):
        """Send a notification when the spider finishes."""
        message = self.spider_closed_message or self._get_default_closed_message(spider, reason)
        self._send_dingtalk_message(message, spider)

    def spider_error(self, failure, response, spider):
        """Send a notification when the spider raises an error."""
        error_message = f"🚨 Spider error\nSpider: {spider.name}\nURL: {response.url}\nError: {failure.value}"
        self._send_dingtalk_message(error_message, spider, is_error=True)

    def _get_default_start_message(self, spider):
        """Default start-message template."""
        return f"🚀 Spider start notification\n**Spider**: {spider.name}\n**Start time**: {self._get_current_time()}\n**Status**: running"

    def _get_default_closed_message(self, spider, reason):
        """Default finished-message template."""
        stats = self.stats

        # Collect crawl statistics
        item_scraped_count = stats.get_value('item_scraped_count', 0)
        response_count = stats.get_value('response_received_count', 0)
        error_count = stats.get_value('log_count/ERROR', 0)
        finish_reason = self._get_reason_display(reason)

        message = f"""📊 Spider finished notification
**Spider**: {spider.name}
**Finish time**: {self._get_current_time()}
**Finish reason**: {finish_reason}
**Crawl statistics**:
- Items scraped: {item_scraped_count}
- Responses received: {response_count}
- Errors: {error_count}
**Status**: {'✅ Finished successfully' if reason == 'finished' else '⚠️ Ended abnormally'}"""

        return message

    def _get_reason_display(self, reason):
        """Return a human-readable description of the finish reason."""
        reason_map = {
            'finished': 'finished normally',
            'shutdown': 'shut down manually',
            'cancelled': 'cancelled',
        }
        return reason_map.get(reason, reason)

    def _get_current_time(self):
        """Return the current time as a formatted string."""
        from datetime import datetime
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def _send_dingtalk_message(self, message, spider, is_error=False):
        """Send a message to the DingTalk robot webhook."""
        try:
            headers = {'Content-Type': 'application/json'}

            # Build the message payload
            data = {
                "msgtype": "markdown",
                "markdown": {
                    "title": f"Spider notification - {spider.name}",
                    "text": message
                },
                "at": {
                    "isAtAll": is_error  # @everyone on errors
                }
            }

            response = requests.post(
                self.webhook_url,
                data=json.dumps(data),
                headers=headers,
                timeout=10
            )

            if response.status_code == 200:
                self.logger.info(f"DingTalk notification sent: {spider.name}")
            else:
                self.logger.error(f"DingTalk notification failed: {response.status_code} - {response.text}")

        except Exception as e:
            self.logger.error(f"Error while sending DingTalk notification: {e}")
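The extension is only active once it is registered in the project settings and a webhook URL is configured. A minimal settings.py sketch follows; only DINGTALK_WEBHOOK_URL, DINGTALK_START_MESSAGE and DINGTALK_CLOSED_MESSAGE are read by the code above, while the extension module path and the access token are assumptions/placeholders, not part of this commit.

# settings.py -- minimal sketch; the module path below is assumed from the
# "# extensions/dingtalk_extension.py" comment and may differ in the real project
EXTENSIONS = {
    'science_article_add.extensions.dingtalk_extension.DingTalkExtension': 500,
}

# Required: DingTalk robot webhook (access_token is a placeholder)
DINGTALK_WEBHOOK_URL = 'https://oapi.dingtalk.com/robot/send?access_token=xxxx'

# Optional: custom message templates; when unset, the defaults above are used
DINGTALK_START_MESSAGE = None
DINGTALK_CLOSED_MESSAGE = None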
@@ -0,0 +1,21 @@
import scrapy


class WosItem(scrapy.Item):
    # define the fields for your item here like:
    third_id = scrapy.Field()
    updated_at = scrapy.Field()


class WosArticleItem(WosItem):
    """WOS publication item."""
    exported = scrapy.Field()


class WosCitedNumberItem(WosItem):
    """WOS publication citation-count item."""
    third_id = scrapy.Field()
    cited = scrapy.Field()
    updated_at = scrapy.Field()
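These items are yielded from spider callbacks like any other Scrapy item. A hypothetical sketch of producing a WosCitedNumberItem is shown below; the selector, the use of response.meta, and the field values are placeholders for illustration only.

# Hypothetical spider callback (selector and meta key are illustrative)
from datetime import datetime

def parse_cited(self, response):
    item = WosCitedNumberItem()
    item['third_id'] = response.meta.get('third_id')  # assumed to be passed via request meta
    item['cited'] = int(response.css('span.cited::text').get(default='0'))
    item['updated_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    yield item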
@@ -0,0 +1,38 @@
# pipelines.py
import pymongo
from itemadapter import ItemAdapter
from science_article_add.items.wos import WosCitedNumberItem


class MongoDBPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'scrapy_data')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)

        # Route items to different collections based on their type
        if isinstance(item, WosCitedNumberItem):
            collection_name = 'relation_cited_number_wos'
        else:
            collection_name = 'relation_cited_number_other'

        # Insert the data
        self.db[collection_name].insert_one(dict(adapter))

        return item
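The pipeline takes its connection settings from settings.py via from_crawler. A minimal sketch follows; MONGO_URI, MONGO_DATABASE and the 'scrapy_data' default come from the code above, while the pipeline module path, priority value and connection URI are assumptions/placeholders.

# settings.py -- minimal sketch (pipeline path and URI are placeholders)
ITEM_PIPELINES = {
    'science_article_add.pipelines.MongoDBPipeline': 300,
}
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'scrapy_data'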