change:任务完成确认和钉钉通知

main
zhaoxiangpeng 2 months ago
parent 576260f52d
commit fe7044292b

@ -24,7 +24,7 @@ class SpiderProtocol:
class ACKExtension: class ACKExtension:
def __init__(self, crawler: Crawler): def __init__(self, crawler: Crawler):
self.crawler = crawler self.crawler = crawler
self.change_state_sql = 'update task_batch_record set %(update_kws)s where id=%(record_id)s' self.change_state_sql = 'update task_batch_record set %(update_kws)s where %(update_cond)s'
@classmethod @classmethod
def from_crawler(cls, crawler): def from_crawler(cls, crawler):

@ -4,61 +4,124 @@
# @File : dingtalk.py # @File : dingtalk.py
# extensions/dingtalk_extension.py # extensions/dingtalk_extension.py
import json import json
import time
import logging import logging
import requests import requests
from scrapy import signals from scrapy import signals
from scrapy.exceptions import NotConfigured from scrapy.exceptions import NotConfigured
SPIDER_START_MSG = """🚀 爬虫启动通知\n
**爬虫名称**: %(spider_name)s\n
**开始时间**: %(started_time)s\n
**状态**: 开始运行"""
# SPIDER_CLOSED_MSG = """📊 爬虫完成通知\n
# **爬虫名称**: %(spider_name)s\n
# **完成时间**: %(finished_time)s\n
# **完成原因**: %(finish_reason)s\n
# **采集统计**:\n
# - 采集项目: %(item_scraped_count)s 条
# - 请求响应: %(response_count)s 次
# - 错误数量: %(error_count)s 个\n
# **状态**: %(state)s"""
SPIDER_ERROR_MSG = """🚨 爬虫错误\n
**爬虫名称**: %(spider_name)s\n
**URL**: %(url)s\n
**错误**: %(err_msg)s"""
SPIDER_CLOSED_MSG = """📊 爬虫完成通知\n
**爬虫名称**: %(spider_name)s\n
**机构名称**: %(org_name)s\n
**任务条件**: %(task_condition)s\n
**任务ID**: %(record_id)s\n
**完成时间**: %(finished_time)s\n
**完成原因**: %(finish_reason)s\n
**采集统计**:\n
- 采集项目: %(item_scraped_count)s
- 请求响应: %(response_count)s
- 错误数量: %(error_count)s \n
**状态**: %(state)s"""
class DingTalkExtension: class DingTalkExtension:
"""钉钉机器人通知扩展""" """钉钉机器人通知扩展"""
def __init__(self, webhook_url, spider_start_message=None, spider_closed_message=None): def __init__(
self,
crawler,
webhook_url=None, secret=None,
start_msg=None, closed_msg=None,
**kwargs
):
self.crawler = crawler
self.stats = crawler.stats
self.webhook_url = webhook_url self.webhook_url = webhook_url
self.spider_start_message = spider_start_message self.secret = secret
self.spider_closed_message = spider_closed_message
self.stats = None self.spider_start_message = start_msg or SPIDER_START_MSG
self.spider_closed_message = closed_msg or SPIDER_CLOSED_MSG
self.enable_start_notify = kwargs.get("enable_start_notify", False)
self.enable_finish_notify = kwargs.get("enable_finish_notify", False)
self.enable_error_notify = kwargs.get("enable_error_notify", False)
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
@classmethod @classmethod
def from_crawler(cls, crawler): def from_crawler(cls, crawler):
# 从配置中获取钉钉webhook URL # 从配置中获取钉钉webhook URL
webhook_url = crawler.settings.get('DINGTALK_WEBHOOK_URL') webhook_url = crawler.settings['DINGTALK_WEBHOOK_URL']
if not webhook_url: if not webhook_url:
raise NotConfigured('DINGTALK_WEBHOOK_URL must be set') raise NotConfigured('DINGTALK_WEBHOOK_URL must be set')
ding_cfg = dict(
# 获取自定义消息模板 webhook_url=crawler.settings.get('DINGTALK_WEBHOOK_URL'),
start_msg = crawler.settings.get('DINGTALK_START_MESSAGE') secret=crawler.settings.get('DINGTALK_SECRET'),
closed_msg = crawler.settings.get('DINGTALK_CLOSED_MESSAGE') # 获取自定义消息模板
start_msg=crawler.settings.get('DINGTALK_START_MESSAGE', SPIDER_START_MSG),
ext = cls(webhook_url, start_msg, closed_msg) closed_msg=crawler.settings.get('DINGTALK_CLOSED_MESSAGE', SPIDER_CLOSED_MSG),
enable_start_notify=crawler.settings.getbool('DINGTALK_ENABLE_START', False),
enable_finish_notify=crawler.settings.getbool('DINGTALK_ENABLE_FINISH', False),
enable_error_notify=crawler.settings.getbool('DINGTALK_ENABLE_ERROR', False),
)
ext = cls(crawler=crawler, **ding_cfg)
# 注册信号处理器 # 注册信号处理器
crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened) if ext.enable_start_notify:
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed) crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(ext.spider_error, signal=signals.spider_error) if ext.enable_finish_notify:
crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
if ext.enable_error_notify:
crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
crawler.signals.connect(ext.item_error, signal=signals.item_error)
return ext return ext
def spider_opened(self, spider): def spider_opened(self, spider):
"""爬虫开始时发送通知""" """爬虫开始时发送通知"""
self.stats = spider.crawler.stats self.stats = spider.crawler.stats
message = self.spider_start_message or self._get_default_start_message(spider) message = self._get_default_start_message(spider)
self._send_dingtalk_message(message, spider) self._send_dingtalk_message(message, spider)
def spider_closed(self, spider, reason): def spider_closed(self, spider, reason):
"""爬虫结束时发送通知""" """爬虫结束时发送通知"""
message = self.spider_closed_message or self._get_default_closed_message(spider, reason) message = self._get_default_closed_message(spider, reason)
self._send_dingtalk_message(message, spider) self._send_dingtalk_message(message, spider)
def spider_error(self, failure, response, spider): def spider_error(self, failure, response, spider):
"""爬虫错误时发送通知""" """爬虫错误时发送通知"""
error_message = f"🚨 爬虫错误\n爬虫: {spider.name}\nURL: {response.url}\n错误: {str(failure.value)}" message = SPIDER_ERROR_MSG % {
self._send_dingtalk_message(error_message, spider, is_error=True) "spider_name": spider.name,
"url": response.url,
"err_msg": str(failure.value),
}
self._send_dingtalk_message(message, spider, is_error=True)
def item_error(self, failure, response, spider):
pass
def _get_default_start_message(self, spider): def _get_default_start_message(self, spider):
"""默认开始消息模板""" """默认开始消息模板"""
return f"🚀 爬虫启动通知\n**爬虫名称**: {spider.name}\n**开始时间**: {self._get_current_time()}\n**状态**: 开始运行" message = self.spider_start_message % {"spider_name": spider.name, "started_time": self._get_current_time()}
return message
def _get_default_closed_message(self, spider, reason): def _get_default_closed_message(self, spider, reason):
"""默认结束消息模板""" """默认结束消息模板"""
@ -69,17 +132,19 @@ class DingTalkExtension:
response_count = stats.get_value('response_received_count', 0) response_count = stats.get_value('response_received_count', 0)
error_count = stats.get_value('log_count/ERROR', 0) error_count = stats.get_value('log_count/ERROR', 0)
finish_reason = self._get_reason_display(reason) finish_reason = self._get_reason_display(reason)
task_obj = spider.task_obj
message = f"""📊 爬虫完成通知 message = self.spider_closed_message % {
**爬虫名称**: {spider.name} "spider_name": spider.name,
**完成时间**: {self._get_current_time()} "org_name": task_obj['org_name'],
**完成原因**: {finish_reason} "task_condition": task_obj['task_condition'],
**采集统计**: "record_id": spider.record_id,
- 采集项目: {item_scraped_count} "finished_time": self._get_current_time(),
- 请求响应: {response_count} "finish_reason": finish_reason,
- 错误数量: {error_count} "item_scraped_count": spider.get_records_found(),
**状态**: {'✅ 成功完成' if reason == 'finished' else '⚠️ 异常结束'}""" "response_count": response_count,
"error_count": error_count,
"state": '✅ 成功完成' if reason == 'finished' else '⚠️ 异常结束'
}
return message return message
def _get_reason_display(self, reason): def _get_reason_display(self, reason):
@ -96,10 +161,56 @@ class DingTalkExtension:
from datetime import datetime from datetime import datetime
return datetime.now().strftime('%Y-%m-%d %H:%M:%S') return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
def _generate_signature(self, timestamp: int) -> str:
"""
生成签名
Args:
timestamp: 时间戳
Returns:
签名字符串
"""
if not self.secret:
return ""
import hmac
import hashlib
import base64
import urllib.parse
string_to_sign = f"{timestamp}\n{self.secret}"
hmac_code = hmac.new(
self.secret.encode('utf-8'),
string_to_sign.encode('utf-8'),
digestmod=hashlib.sha256
).digest()
sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
return sign
def _build_webhook_url(self) -> str:
"""
构建完整的webhook URL包含签名
Returns:
完整的webhook URL
"""
if not self.secret:
return self.webhook_url
timestamp = int(time.time() * 1000)
sign = self._generate_signature(timestamp)
return f"{self.webhook_url}&timestamp={timestamp}&sign={sign}"
def _send_dingtalk_message(self, message, spider, is_error=False): def _send_dingtalk_message(self, message, spider, is_error=False):
"""发送钉钉消息""" """发送钉钉消息"""
try: try:
headers = {'Content-Type': 'application/json'} webhook_url = self._build_webhook_url()
headers = {
"Content-Type": "application/json",
"User-Agent": "DingTalk-Bot/1.0"
}
# 构建消息体 # 构建消息体
data = { data = {
@ -114,7 +225,7 @@ class DingTalkExtension:
} }
response = requests.post( response = requests.post(
self.webhook_url, webhook_url,
data=json.dumps(data), data=json.dumps(data),
headers=headers, headers=headers,
timeout=10 timeout=10

Loading…
Cancel
Save