add: DingTalk notifications for Scrapy
parent f977b8ad51
commit ea68319ee6
@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
# @Time : 2025/10/23 16:30
# @Author : deepseek
# @File : dingtalk.py
# extensions/dingtalk_extension.py
import json
import logging
from datetime import datetime

import requests
from scrapy import signals
from scrapy.exceptions import NotConfigured


class DingTalkExtension:
    """DingTalk bot notification extension."""

    def __init__(self, webhook_url, spider_start_message=None, spider_closed_message=None):
        self.webhook_url = webhook_url
        self.spider_start_message = spider_start_message
        self.spider_closed_message = spider_closed_message
        self.stats = None
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_crawler(cls, crawler):
        # Read the DingTalk webhook URL from settings
        webhook_url = crawler.settings.get('DINGTALK_WEBHOOK_URL')
        if not webhook_url:
            raise NotConfigured('DINGTALK_WEBHOOK_URL must be set')

        # Optional custom message templates
        start_msg = crawler.settings.get('DINGTALK_START_MESSAGE')
        closed_msg = crawler.settings.get('DINGTALK_CLOSED_MESSAGE')

        ext = cls(webhook_url, start_msg, closed_msg)

        # Register the signal handlers
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)

        return ext

    def spider_opened(self, spider):
        """Send a notification when the spider starts."""
        self.stats = spider.crawler.stats
        message = self.spider_start_message or self._get_default_start_message(spider)
        self._send_dingtalk_message(message, spider)

    def spider_closed(self, spider, reason):
        """Send a notification when the spider closes."""
        message = self.spider_closed_message or self._get_default_closed_message(spider, reason)
        self._send_dingtalk_message(message, spider)

    def spider_error(self, failure, response, spider):
        """Send a notification when the spider hits an error."""
        error_message = (
            f"🚨 Spider error\nSpider: {spider.name}\n"
            f"URL: {response.url}\nError: {failure.value}"
        )
        self._send_dingtalk_message(error_message, spider, is_error=True)

    def _get_default_start_message(self, spider):
        """Default start-message template."""
        return (
            f"🚀 Spider started\n**Spider**: {spider.name}\n"
            f"**Start time**: {self._get_current_time()}\n**Status**: running"
        )

    def _get_default_closed_message(self, spider, reason):
        """Default close-message template."""
        stats = self.stats

        # Collect run statistics
        item_scraped_count = stats.get_value('item_scraped_count', 0)
        response_count = stats.get_value('response_received_count', 0)
        error_count = stats.get_value('log_count/ERROR', 0)
        finish_reason = self._get_reason_display(reason)

        message = f"""📊 Spider finished
**Spider**: {spider.name}
**Finish time**: {self._get_current_time()}
**Finish reason**: {finish_reason}
**Crawl statistics**:
- items scraped: {item_scraped_count}
- responses received: {response_count}
- errors logged: {error_count}
**Status**: {'✅ finished normally' if reason == 'finished' else '⚠️ abnormal exit'}"""

        return message

    def _get_reason_display(self, reason):
        """Human-readable description of the finish reason."""
        reason_map = {
            'finished': 'finished normally',
            'shutdown': 'shut down manually',
            'cancelled': 'cancelled',
        }
        return reason_map.get(reason, reason)

    def _get_current_time(self):
        """Current time as a formatted string."""
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def _send_dingtalk_message(self, message, spider, is_error=False):
        """Post a message to the DingTalk webhook."""
        try:
            headers = {'Content-Type': 'application/json'}

            # Build the request body
            data = {
                "msgtype": "markdown",
                "markdown": {
                    "title": f"Spider notification - {spider.name}",
                    "text": message,
                },
                "at": {
                    "isAtAll": is_error  # @everyone if this is an error
                },
            }

            response = requests.post(
                self.webhook_url,
                data=json.dumps(data),
                headers=headers,
                timeout=10,
            )

            if response.status_code == 200:
                self.logger.info(f"DingTalk notification sent: {spider.name}")
            else:
                self.logger.error(
                    f"DingTalk notification failed: {response.status_code} - {response.text}"
                )

        except Exception as e:
            self.logger.error(f"Error sending DingTalk notification: {e}")
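
To wire the extension into a project, it has to be enabled in settings.py along with the webhook URL that from_crawler reads. A minimal sketch follows; the module path is an assumption based on the extensions/dingtalk_extension.py comment above, and the access_token value is a placeholder.

# settings.py (sketch; module path and token are assumptions)
EXTENSIONS = {
    "science_article_add.extensions.dingtalk_extension.DingTalkExtension": 500,
}

# Required: from_crawler raises NotConfigured without it.
DINGTALK_WEBHOOK_URL = "https://oapi.dingtalk.com/robot/send?access_token=<token>"

# Optional overrides; when unset, the _get_default_*_message templates apply.
# DINGTALK_START_MESSAGE = "custom start message"
# DINGTALK_CLOSED_MESSAGE = "custom closed message"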
@ -0,0 +1,21 @@
import scrapy


class WosItem(scrapy.Item):
    # define the fields for your item here like:
    third_id = scrapy.Field()
    updated_at = scrapy.Field()


class WosArticleItem(WosItem):
    """WOS publication item."""
    exported = scrapy.Field()


class WosCitedNumberItem(WosItem):
    """Citation-count item for a publication."""
    # third_id and updated_at are inherited from WosItem
    cited = scrapy.Field()
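
A hypothetical spider callback showing how WosCitedNumberItem might be populated; the spider name, CSS selector, and meta key are illustrative only, not taken from this commit.

# sketch of a spider yielding WosCitedNumberItem (selectors hypothetical)
import scrapy
from datetime import datetime

from science_article_add.items.wos import WosCitedNumberItem


class CitedNumberSpider(scrapy.Spider):
    name = "wos_cited_sketch"  # hypothetical name

    def parse(self, response):
        item = WosCitedNumberItem()
        item["third_id"] = response.meta.get("third_id")  # assumed to arrive via request meta
        item["cited"] = int(response.css("span.cited::text").get("0"))  # hypothetical selector
        item["updated_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        yield item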
@ -0,0 +1,38 @@
# pipelines.py
import pymongo
from itemadapter import ItemAdapter

from science_article_add.items.wos import WosCitedNumberItem


class MongoDBPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'scrapy_data'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)

        # Route each item type to its own collection
        if isinstance(item, WosCitedNumberItem):
            collection_name = 'relation_cited_number_wos'
        else:
            collection_name = 'relation_cited_number_other'

        # Insert the document
        self.db[collection_name].insert_one(dict(adapter))

        return item
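
For completeness, a sketch of the settings that activate this pipeline. MONGO_URI and MONGO_DATABASE are the keys the pipeline reads in from_crawler; the priority and the URI value are illustrative.

# settings.py (sketch)
ITEM_PIPELINES = {
    "science_article_add.pipelines.MongoDBPipeline": 300,
}

MONGO_URI = "mongodb://localhost:27017"  # illustrative value
MONGO_DATABASE = "scrapy_data"           # default used when the key is absent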