当前需求:本机网卡绑定为 bond0,但网络质量不稳定;实时上传文件到存储时,一旦出网流量过小,程序上传就会堵塞,需要人工重启光猫或路由器。由于无法人工实时盯着流量,只能通过脚本监控:当发往目标IP的出网流量低于2MB时触发告警。脚本内容如下:
当前脚本监控发往 111.111.111.111 的出网流量:如果最近两分钟(120秒)内的累计流量小于2MB,则触发告警;同一告警最多每10分钟重复发送一次。
# Install the packet-capture tool and the mail stack used to deliver alerts.
yum -y install tcpdump
yum -y install s-nail postfix
systemctl start postfix && systemctl enable postfix

# Create the log directory; -p keeps the command idempotent on re-runs.
mkdir -p /var/log/traffic/

# Test: send a sample mail to verify that delivery works.
echo "流量告警测试" | mail -s "告警测试" xxxx@163.com
#!/bin/bash
#
# Outbound traffic monitor: samples the send rate towards the storage host,
# appends each sample to LOG_FILE and raises an alert when the traffic summed
# over the last ALERT_DURATION seconds drops below ALERT_THRESHOLD.

# One "timestamp,rate" record is appended here per sample.
LOG_FILE="/var/log/traffic/tx_rate_monitor.log"
# History of alerts that were raised.
ALERT_FILE="/var/log/traffic/traffic_alert.log"
# Alert when the window total is below this many MB.
ALERT_THRESHOLD=2
# Length of the evaluation window, in seconds.
ALERT_DURATION=120
# Marker file; its modification time is the time of the last alert.
ALERT_SENT_FILE="/var/log/traffic/alert_sent.txt"
# Number of sampling-loop iterations between two alert evaluations.
CHECK_INTERVAL=30
# Minimum number of seconds between two alerts.
ALERT_COOLDOWN=600
EMAIL_TO="xxxxxxx@163.com"
EMAIL_SUBJECT="网络流量异常告警"

mkdir -p /var/log/traffic

echo "=== 生产环境流量监控 ==="
echo "开始监控发送流量速率,结果保存到: $LOG_FILE"
echo "告警条件: ${ALERT_DURATION}秒内总流量 < ${ALERT_THRESHOLD}MB"
echo "重复告警间隔: ${ALERT_COOLDOWN}秒"

# Write the three header lines only when the log does not exist yet, so that
# restarts keep appending to the existing history.
if [ ! -f "$LOG_FILE" ]; then
  {
    echo "时间戳,发送速率(MB/s)"
    echo "告警阈值: ${ALERT_THRESHOLD}MB/${ALERT_DURATION}秒"
    echo "重复告警间隔: ${ALERT_COOLDOWN}秒"
  } > "$LOG_FILE"
fi
#######################################
# Send a low-traffic alert by mail; if the mail command fails, print a
# banner on stdout instead. In both cases the alert is appended to
# ALERT_FILE, the cooldown marker is touched and a syslog entry is written.
# Globals:
#   LOG_FILE, ALERT_FILE, ALERT_SENT_FILE, ALERT_DURATION, ALERT_THRESHOLD,
#   ALERT_COOLDOWN, EMAIL_TO, EMAIL_SUBJECT (all read)
# Arguments:
#   $1 - total traffic over the window, in MB
#   $2 - average rate, in MB/s
#   $3 - number of samples that contributed to the total
# Outputs:
#   Status messages on stdout.
#######################################
send_alert() {
  local total_flow=$1
  local avg_rate=$2
  local data_points=$3
  local current_time next_alert_at recent_records mail_content

  current_time=$(date '+%Y-%m-%d %H:%M:%S')
  next_alert_at=$(date -d "+${ALERT_COOLDOWN} seconds" '+%Y-%m-%d %H:%M:%S')
  # Only data records start with a date; this drops the log header lines.
  recent_records=$(tail -n 20 "$LOG_FILE" \
    | grep -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}')

  mail_content="网络流量异常告警

告警时间: $current_time
监控目标: 192.168.0.192同步111.111.111.111 数据
过去${ALERT_DURATION}秒总流量: ${total_flow} MB
数据点数: ${data_points}
平均速率: ${avg_rate} MB/s
告警阈值: ${ALERT_THRESHOLD} MB/${ALERT_DURATION}秒
下次告警时间: ${next_alert_at}

最近流量记录:
${recent_records}

请及时检查网络连接状态。"

  if printf '%s\n' "$mail_content" \
    | mail -s "$EMAIL_SUBJECT" "$EMAIL_TO" 2>/dev/null; then
    echo "✅ 邮件告警发送成功"
  else
    echo "❌ 邮件发送失败,使用本地告警"
    echo ""
    echo "🔴🔴🔴 流量告警 🔴🔴🔴"
    echo "时间: $current_time"
    echo "内容: ${ALERT_DURATION}秒流量 ${total_flow}MB 低于阈值 ${ALERT_THRESHOLD}MB"
    echo "数据点数: ${data_points}"
    echo "平均速率: ${avg_rate} MB/s"
    # "${next_alert_at#* }" keeps only the HH:MM:SS part.
    echo "下次告警: ${next_alert_at#* }"
    echo "🔴🔴🔴🔴🔴🔴🔴🔴🔴"
    echo ""
  fi

  echo "$current_time - 告警已发送: ${ALERT_DURATION}秒流量 ${total_flow} MB, 数据点 ${data_points}, 平均 ${avg_rate} MB/s" >> "$ALERT_FILE"
  # The marker's modification time is what the cooldown check reads.
  touch "$ALERT_SENT_FILE"
  logger "流量告警: ${ALERT_DURATION}秒流量 ${total_flow}MB, 数据点 ${data_points}, 平均 ${avg_rate} MB/s"
}
#######################################
# Tell whether the last alert is recent enough that a new one must be
# suppressed. The time of the last alert is the modification time of
# ALERT_SENT_FILE.
# Globals:
#   ALERT_SENT_FILE, ALERT_COOLDOWN (read)
# Outputs:
#   When in cooldown, prints the remaining seconds on stdout.
# Returns:
#   0 while still in cooldown, 1 when an alert may be sent.
#######################################
is_in_cooldown() {
  local last_alert_time now elapsed

  # No marker file means no alert has been sent yet.
  [ -f "$ALERT_SENT_FILE" ] || return 1

  last_alert_time=$(stat -c %Y "$ALERT_SENT_FILE" 2>/dev/null) \
    || last_alert_time=0
  now=$(date +%s)
  elapsed=$((now - last_alert_time))

  if [ "$elapsed" -ge "$ALERT_COOLDOWN" ]; then
    return 1
  fi

  echo "⚠️ 冷却期中,$((ALERT_COOLDOWN - elapsed))秒后可再次发送告警"
  return 0
}
#######################################
# Sum the logged rates of the last ALERT_DURATION seconds and raise an alert
# when the total is below ALERT_THRESHOLD and no cooldown is active.
#
# The log is scanned in a single awk pass. Timestamps are compared as
# strings, which works because the "YYYY-MM-DD HH:MM:SS" format sorts
# chronologically; this avoids one `date` process per log line and needs
# neither a temporary file nor bc.
# Globals:
#   LOG_FILE, ALERT_DURATION, ALERT_THRESHOLD (read)
# Outputs:
#   Status messages on stdout.
#######################################
check_traffic_alert() {
  local now window_start stats total_flow data_points avg_rate

  now=$(date +%s)
  window_start=$(date -d "@$((now - ALERT_DURATION))" '+%Y-%m-%d %H:%M:%S')

  # Emits "<total> <count> <average>"; header and malformed lines are skipped.
  # LC_ALL=C pins the decimal point and byte-wise string comparison.
  stats=$(LC_ALL=C awk -F',' -v since="$window_start" '
    $1 ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]$/ &&
    $1 >= since &&
    $2 ~ /^[0-9]+(\.[0-9]+)?$/ {
      total += $2
      count++
    }
    END {
      printf "%.2f %d %.3f", total, count, (count > 0 ? total / count : 0)
    }
  ' "$LOG_FILE" 2>/dev/null)
  read -r total_flow data_points avg_rate <<< "$stats"
  # awk prints nothing usable when the log cannot be read: treat as no data.
  total_flow=${total_flow:-0.00}
  data_points=${data_points:-0}
  avg_rate=${avg_rate:-0.000}

  echo "[$(date '+%H:%M:%S')] 检测: ${data_points}个数据点, ${ALERT_DURATION}秒总流量: ${total_flow} MB, 平均: ${avg_rate} MB/s"
  echo "调试: total_flow=$total_flow, ALERT_THRESHOLD=$ALERT_THRESHOLD"

  # NOTE(review): as in the original logic, a window whose total is exactly
  # zero is treated as "no data" and never alerts, so a complete stall is not
  # reported. Confirm whether that is intended.
  if [ "$data_points" -eq 0 ] || [ "$total_flow" = "0.00" ]; then
    echo "❌ 流量数据为空,跳过告警检查"
    return 0
  fi

  if LC_ALL=C awk -v total="$total_flow" -v limit="$ALERT_THRESHOLD" \
    'BEGIN { exit !(total + 0 < limit + 0) }'; then
    # is_in_cooldown prints its own message when it suppresses the alert.
    if is_in_cooldown; then
      return 0
    fi
    echo "🚨 触发告警: ${ALERT_DURATION}秒流量 ${total_flow} MB < ${ALERT_THRESHOLD} MB"
    send_alert "$total_flow" "$avg_rate" "$data_points"
  else
    echo "✅ 流量正常: ${ALERT_DURATION}秒流量 ${total_flow} MB"
  fi
}
#######################################
# Print a summary of the monitor configuration, the time of the last alert
# and the number of records in the log.
# Globals:
#   LOG_FILE, ALERT_SENT_FILE, ALERT_DURATION, ALERT_THRESHOLD,
#   CHECK_INTERVAL, ALERT_COOLDOWN (all read)
# Outputs:
#   The status report on stdout.
#######################################
show_monitor_status() {
  local last_epoch last_alert next_alert total_lines record_count

  echo ""
  echo "=== 监控状态 ==="
  echo "启动时间: $(date)"
  echo "监控目标: 111.111.111.111"
  echo "告警条件: ${ALERT_DURATION}秒内流量 < ${ALERT_THRESHOLD}MB"
  echo "检查间隔: ${CHECK_INTERVAL}秒"
  echo "告警冷却: ${ALERT_COOLDOWN}秒"

  if [ -f "$ALERT_SENT_FILE" ]; then
    # Work in epoch seconds. Appending "+N seconds" to a full timestamp makes
    # GNU date read "+N" as a time-zone offset, which yields a wrong time.
    last_epoch=$(stat -c %Y "$ALERT_SENT_FILE" 2>/dev/null) || last_epoch=0
    last_alert=$(date -d "@${last_epoch}" '+%Y-%m-%d %H:%M:%S')
    next_alert=$(date -d "@$((last_epoch + ALERT_COOLDOWN))" '+%H:%M:%S')
    echo "最后告警: $last_alert"
    echo "下次可告警: $next_alert"
  else
    echo "最后告警: 无"
    echo "下次可告警: 随时"
  fi

  total_lines=0
  if [ -r "$LOG_FILE" ]; then
    total_lines=$(wc -l < "$LOG_FILE")
  fi
  # The log starts with three header lines; never report a negative count.
  record_count=$((total_lines - 3))
  if [ "$record_count" -lt 0 ]; then
    record_count=0
  fi
  echo "数据记录: ${record_count} 条"
  echo "================"
  echo ""
}
#######################################
# Print a debugging view of the alert calculation: the window boundaries, up
# to the ten most recent records inside the window, and the number of samples
# and total traffic that were counted.
#
# Records are selected with awk by comparing the "YYYY-MM-DD HH:MM:SS"
# timestamps as strings (the format sorts chronologically), so `date` is
# only run for the few lines that are actually printed.
# Globals:
#   LOG_FILE, ALERT_DURATION, ALERT_THRESHOLD (read)
# Outputs:
#   The debug report on stdout.
#######################################
debug_traffic_calc() {
  local now threshold_time window_start
  local timestamp rate log_time summary debug_count debug_total

  now=$(date +%s)
  threshold_time=$((now - ALERT_DURATION))
  window_start=$(date -d "@${threshold_time}" '+%Y-%m-%d %H:%M:%S')

  echo "=== 流量计算调试 ==="
  echo "当前时间: $(date -d "@${now}")"
  echo "统计起始: $(date -d "@${threshold_time}")"

  # List the last ten in-window records together with their age in seconds.
  while IFS=',' read -r timestamp rate; do
    log_time=$(date -d "$timestamp" +%s 2>/dev/null) || continue
    echo " $timestamp ($((now - log_time))秒前) - $rate MB/s"
  done < <(
    LC_ALL=C awk -F',' -v since="$window_start" '
      $1 ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]$/ &&
      $1 >= since
    ' "$LOG_FILE" 2>/dev/null | tail -n 10
  )

  # Count and sum only the records whose rate field is a plain number.
  summary=$(LC_ALL=C awk -F',' -v since="$window_start" '
    $1 ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9]$/ &&
    $1 >= since &&
    $2 ~ /^[0-9]+(\.[0-9]+)?$/ {
      total += $2
      count++
    }
    END { printf "%d %.2f", count, total }
  ' "$LOG_FILE" 2>/dev/null)
  read -r debug_count debug_total <<< "$summary"
  # awk prints nothing usable when the log cannot be read: report zeros.
  debug_count=${debug_count:-0}
  debug_total=${debug_total:-0.00}

  echo "实际统计: $debug_count 个数据点, 总流量: $debug_total MB"
  echo "阈值: $ALERT_THRESHOLD MB"
  echo "================"
}
# Iterations since the last alert evaluation / since the last status report.
counter=0
status_counter=0

echo "生产环境流量监控启动"
show_monitor_status
debug_traffic_calc
echo "按 Ctrl+C 停止监控"
echo "=========================================="

# Sampling loop: capture for one second, log the measured send rate, and
# periodically evaluate the alert condition and print the status report.
# NOTE(review): each iteration takes about two seconds (1 s capture plus
# 1 s sleep), so CHECK_INTERVAL and the 300-iteration status period count
# iterations rather than seconds, and only about half of the wall-clock time
# is sampled. Confirm that the alert threshold is calibrated for that.
while true; do
  TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

  # Sum the sizes of packets going from the local address to the monitored
  # host. -l makes tcpdump line-buffer its output so nothing is left in its
  # buffer when timeout stops it. The 58 bytes added per packet are
  # presumably per-packet header overhead -- TODO confirm.
  tx_rate=$(timeout 1s sudo tcpdump -i bond0 -n -q -l "host 111.111.111.111" 2>/dev/null \
    | awk '
      BEGIN { tx_bytes = 0 }
      /IP.*192\.168\.0\.192.*>.*111\.111\.111\.111/ {
        for (i = 1; i <= NF; i++) {
          if ($i == "tcp" || $i == "udp") {
            pkt_size = $(i + 1) + 0
            if (pkt_size > 0) {
              tx_bytes += pkt_size + 58
            }
            break
          }
        }
      }
      END {
        rate = tx_bytes / 1024 / 1024
        printf "%.2f", (rate > 0 ? rate : 0)
      }')

  echo "$TIMESTAMP,$tx_rate" >> "$LOG_FILE"

  # The rate is printed with two decimals, so "0.00" means nothing was sent;
  # only non-zero samples are echoed to the console.
  if [[ -n "$tx_rate" && "$tx_rate" != "0.00" ]]; then
    echo "$(date '+%H:%M:%S') - 当前速率: $tx_rate MB/s"
  fi

  counter=$((counter + 1))
  if [ "$counter" -ge "$CHECK_INTERVAL" ]; then
    check_traffic_alert
    counter=0
  fi

  status_counter=$((status_counter + 1))
  if [ "$status_counter" -ge 300 ]; then
    show_monitor_status
    status_counter=0
  fi

  sleep 1
done
# Run the attachment-upload traffic monitor in the background.
# The script must be executable for nohup to start it; stdout and stderr are
# both discarded because the script keeps its own log files.
chmod +x /root/tcpdump-fujian-TX.sh
nohup /root/tcpdump-fujian-TX.sh > /dev/null 2>&1 &
查看结果

查看邮件发送结果
