当前位置: 首页 > news > 正文

优秀shell脚本搜集——筑梦之路

脚本1:系统健康状态检查

  • 全面检查CPU、内存、磁盘、网络、进程等系统资源

  • 自动告警异常指标

  •  生成详细的巡检报告

  •  可配置告警阈值

  •  支持彩色输出,便于快速定位问题

#!/bin/bash
################################################################################
# Script name : system_health_check.sh
# Description : Comprehensive system health check
# Version     : v2.0
# Author      : DevOps Team
# Last change : 2025-01-15
################################################################################

# ------------------------------- Configuration -------------------------------
HOSTNAME=$(hostname)
DATE=$(date +"%Y-%m-%d %H:%M:%S")
REPORT_FILE="/var/log/system_check_$(date +%Y%m%d_%H%M%S).log"

# Alert thresholds
CPU_WARNING=80          # CPU usage alert threshold (%)
MEM_WARNING=85          # Memory usage alert threshold (%)
DISK_WARNING=85         # Disk usage alert threshold (%)
LOAD_WARNING=4          # Load alert threshold (adjust to the CPU core count)
INODE_WARNING=80        # Inode usage alert threshold (%)

# Colour definitions. The values are backslash escapes that are only turned
# into real ANSI sequences by `echo -e` / `printf %b` in the log helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Logging helpers
# All helpers print to stdout and append the same text to $REPORT_FILE.

# log MESSAGE - timestamped informational line.
log() {
  printf '[%s] %s\n' "$(date +"%Y-%m-%d %H:%M:%S")" "$1" | tee -a "$REPORT_FILE"
}

# log_section TITLE - blank line followed by a ruled section heading.
log_section() {
  local rule="============================================================"
  printf '\n%s\n %s\n%s\n' "$rule" "$1" "$rule" | tee -a "$REPORT_FILE"
}

# log_warning MESSAGE - yellow "[WARNING]" line (counted by generate_summary).
# %b expands the escapes stored in the colour variables; the message itself
# is printed verbatim with %s.
log_warning() {
  printf '%b[WARNING] %s%b\n' "${YELLOW}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_error MESSAGE - red "[ERROR]" line (counted by generate_summary).
log_error() {
  printf '%b[ERROR] %s%b\n' "${RED}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_ok MESSAGE - green "[OK]" line.
log_ok() {
  printf '%b[OK] %s%b\n' "${GREEN}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# 1. Basic information collection
#######################################
# Log host identity and basic OS facts.
# Globals: HOSTNAME, DATE (read)
# Outputs: report lines via log_section/log
#######################################
check_basic_info() {
  local os_version

  log_section "1. 系统基本信息"

  # RHEL-family release file first, otherwise the first line of /etc/issue.
  if [ -r /etc/redhat-release ]; then
    os_version=$(cat /etc/redhat-release)
  else
    os_version=$(head -n 1 /etc/issue 2>/dev/null)
  fi

  log "主机名: $HOSTNAME"
  log "检查时间: $DATE"
  log "系统版本: $os_version"
  log "内核版本: $(uname -r)"
  log "系统架构: $(uname -m)"
  # Text between "up " and the first comma of the uptime banner.
  log "运行时长: $(uptime | awk -F'up ' '{print $2}' | awk -F',' '{print $1}')"
  log "当前用户: $(whoami)"
  log "登录用户数: $(who | wc -l)"
}

# 2. CPU usage check
# Print "<total ticks> <idle ticks>" from the aggregate cpu line of /proc/stat.
# Total sums user..steal (fields 2-9); idle is field 5.
_cpu_ticks() {
  awk '/^cpu / {
         t = 0
         for (i = 2; i <= 9; i++) t += $i
         printf "%.0f %.0f\n", t, $5
       }' /proc/stat
}

#######################################
# Report CPU core count, CPU usage over a 1-second sample and load averages.
# Usage comes from two /proc/stat samples rather than from parsing `top`,
# whose columns shift when a value is 100.0 ("ni,100.0" becomes one field).
# Globals: CPU_WARNING, REPORT_FILE (read)
# Outputs: report lines; a warning when usage exceeds CPU_WARNING or the
#          1-minute load exceeds twice the core count
#######################################
check_cpu() {
  local cpu_cores cpu_usage total1 idle1 total2 idle2
  local load_1 load_5 load_15 load_threshold

  log_section "2. CPU使用率检查"

  cpu_cores=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
  if [ "${cpu_cores:-0}" -le 0 ]; then
    # Some architectures have no "processor" lines in /proc/cpuinfo.
    cpu_cores=$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
  fi
  log "CPU核心数: $cpu_cores"

  read -r total1 idle1 < <(_cpu_ticks)
  sleep 1
  read -r total2 idle2 < <(_cpu_ticks)
  cpu_usage=$(awk -v t1="${total1:-0}" -v i1="${idle1:-0}" \
                  -v t2="${total2:-0}" -v i2="${idle2:-0}" \
    'BEGIN {
       d = t2 - t1
       if (d > 0) printf "%.2f", 100 - (i2 - i1) * 100 / d
       else       print "0.00"
     }')
  log "CPU使用率: ${cpu_usage}%"

  # awk does the floating-point comparison, so bc is not required.
  if awk -v a="$cpu_usage" -v b="$CPU_WARNING" 'BEGIN { exit !(a + 0 > b + 0) }'; then
    log_warning "CPU使用率超过${CPU_WARNING}%,当前${cpu_usage}%"
    log "TOP 5 CPU消耗进程:"
    # NR > 1 skips the ps header line.
    ps aux --sort=-%cpu \
      | awk 'NR > 1 && NR <= 6 {printf "  PID: %-8s User: %-10s CPU: %-6s CMD: %s\n", $2,$1,$3,$11}' \
      | tee -a "$REPORT_FILE"
  else
    log_ok "CPU使用率正常: ${cpu_usage}%"
  fi

  read -r load_1 load_5 load_15 _ < /proc/loadavg
  log "系统负载: 1分钟=${load_1}, 5分钟=${load_5}, 15分钟=${load_15}"

  # Load alert: 1-minute load above twice the number of cores.
  load_threshold=$((cpu_cores * 2))
  if awk -v a="$load_1" -v b="$load_threshold" 'BEGIN { exit !(a + 0 > b + 0) }'; then
    log_warning "系统负载过高! 1分钟负载${load_1}超过阈值${load_threshold}"
  fi
}

# 3. Memory usage check
#######################################
# Report RAM and swap usage from a single `free -m` call.
# Rows are matched by their "Mem:" / "Swap:" labels (old procps prints a
# "-/+ buffers/cache" row in between) and the "available" column is located
# from the header, falling back to "free" when the column does not exist.
# Globals: MEM_WARNING, REPORT_FILE (read)
# Outputs: report lines; warnings for high memory usage or swap use > 100 MB
#######################################
check_memory() {
  local mem_total mem_used mem_available mem_usage swap_total swap_used

  log_section "3. 内存使用检查"

  read -r mem_total mem_used mem_available swap_total swap_used < <(
    LC_ALL=C free -m | awk '
      NR == 1  { for (i = 1; i <= NF; i++) if ($i == "available") col = i + 1 }
      /^Mem:/  { total = $2; used = $3; avail = (col ? $col : $4) }
      /^Swap:/ { stotal = $2; sused = $3 }
      END      { print total + 0, used + 0, avail + 0, stotal + 0, sused + 0 }'
  )

  # Multiply before dividing so the percentage keeps its decimals.
  mem_usage=$(awk -v u="${mem_used:-0}" -v t="${mem_total:-0}" \
    'BEGIN { if (t > 0) printf "%.2f", u * 100 / t; else print "0.00" }')

  log "内存总量: ${mem_total}MB"
  log "已用内存: ${mem_used}MB"
  log "可用内存: ${mem_available}MB"
  log "内存使用率: ${mem_usage}%"

  if awk -v a="$mem_usage" -v b="$MEM_WARNING" 'BEGIN { exit !(a + 0 > b + 0) }'; then
    log_warning "内存使用率超过${MEM_WARNING}%,当前${mem_usage}%"
    log "TOP 5 内存消耗进程:"
    # NR > 1 skips the ps header line.
    ps aux --sort=-%mem \
      | awk 'NR > 1 && NR <= 6 {printf "  PID: %-8s User: %-10s MEM: %-6s CMD: %s\n", $2,$1,$4,$11}' \
      | tee -a "$REPORT_FILE"
  else
    log_ok "内存使用率正常: ${mem_usage}%"
  fi

  log "Swap总量: ${swap_total}MB"
  log "Swap使用: ${swap_used}MB"
  if [ "${swap_total:-0}" -gt 0 ] && [ "${swap_used:-0}" -gt 100 ]; then
    log_warning "Swap使用量较高: ${swap_used}MB, 可能存在内存压力"
  fi
}

# 4. Disk usage check
#######################################
# Report per-filesystem space and inode usage and warn above the thresholds.
# `LC_ALL=C df -P` keeps the English header and one line per filesystem, so
# the header filter and the column parsing are stable.
# Globals: DISK_WARNING, INODE_WARNING, REPORT_FILE (read)
# Outputs: report lines; for every partition above DISK_WARNING, its five
#          largest top-level entries
#######################################
check_disk() {
  local has_disk_warning=0
  local skip='^Filesystem|tmpfs|cdrom|loop'
  local pct mount usage

  log_section "4. 磁盘使用检查"

  log "磁盘分区使用情况:"
  LC_ALL=C df -hP | grep -vE "$skip" | awk '{print "  " $0}' | tee -a "$REPORT_FILE"

  # Process substitution keeps the loop in the current shell so that
  # has_disk_warning survives it.
  while read -r _ _ _ _ pct mount; do
    usage=${pct%\%}
    # Some pseudo filesystems report "-" instead of a percentage.
    [[ "$usage" =~ ^[0-9]+$ ]] || continue
    if [ "$usage" -gt "$DISK_WARNING" ]; then
      log_warning "磁盘分区 $mount 使用率${usage}%超过阈值${DISK_WARNING}%"
      has_disk_warning=1
      log "  $mount 分区占用空间最大的5个目录:"
      # ${mount%/} avoids "//*" for the root filesystem.
      du -xsh "${mount%/}"/* 2>/dev/null | sort -rh | head -5 \
        | awk '{print "    " $0}' | tee -a "$REPORT_FILE"
    fi
  done < <(LC_ALL=C df -hP | grep -vE "$skip")

  if [ "$has_disk_warning" -eq 0 ]; then
    log_ok "所有磁盘分区使用率正常"
  fi

  log ""
  log "Inode使用情况:"
  LC_ALL=C df -iP | grep -vE "$skip" | awk '{print "  " $0}' | tee -a "$REPORT_FILE"

  while read -r _ _ _ _ pct mount; do
    usage=${pct%\%}
    [[ "$usage" =~ ^[0-9]+$ ]] || continue
    if [ "$usage" -gt "$INODE_WARNING" ]; then
      log_warning "分区 $mount 的Inode使用率${usage}%超过阈值${INODE_WARNING}%"
    fi
  done < <(LC_ALL=C df -iP | grep -vE "$skip")
}

# 5. Network status check
#######################################
# Report interface addresses, TCP state distribution, TIME_WAIT count and
# listening sockets. netstat is used when installed; otherwise ss, whose
# state names are normalised to the netstat spelling (TIME-WAIT -> TIME_WAIT,
# ESTAB -> ESTABLISHED).
# Globals: REPORT_FILE (read)
# Outputs: report lines; a warning when TIME_WAIT exceeds 5000
#######################################
check_network() {
  local tcp_states listeners time_wait_count

  log_section "5. 网络状态检查"

  log "网络接口状态:"
  ip -br addr | awk '{print "  " $0}' | tee -a "$REPORT_FILE"

  # Collect the socket data once and reuse it below.
  if command -v netstat >/dev/null 2>&1; then
    tcp_states=$(netstat -ant 2>/dev/null | awk '/^tcp/ {print $6}')
    listeners=$(netstat -tuln 2>/dev/null | grep LISTEN)
  else
    tcp_states=$(ss -tan 2>/dev/null \
      | awk 'NR > 1 {s = $1; gsub(/-/, "_", s); if (s == "ESTAB") s = "ESTABLISHED"; print s}')
    listeners=$(ss -tln 2>/dev/null | awk 'NR > 1')
  fi

  log ""
  log "TCP连接状态统计:"
  if [ -n "$tcp_states" ]; then
    sort <<<"$tcp_states" | uniq -c | sort -rn \
      | awk '{print "  " $0}' | tee -a "$REPORT_FILE"
  fi

  time_wait_count=$(grep -c '^TIME_WAIT$' <<<"$tcp_states")
  log "TIME_WAIT连接数: $time_wait_count"
  if [ "$time_wait_count" -gt 5000 ]; then
    log_warning "TIME_WAIT连接数过多: $time_wait_count"
  fi

  log ""
  log "当前监听端口:"
  if [ -n "$listeners" ]; then
    awk '{print "  " $0}' <<<"$listeners" | tee -a "$REPORT_FILE"
  fi
}

# 6. Process and service check
#######################################
# Report process count, zombie processes and the state of key services.
# Zombies are selected on the STAT column of a fixed `ps -eo` layout, so the
# printed PID/PPID are the real ones (in `ps aux` field 3 is %CPU, not PPID).
# Globals: REPORT_FILE (read)
# Outputs: report lines; an error line for every service not found running
#######################################
check_processes() {
  local process_count zombies zombie_count service
  # Services to verify - adjust to the actual workload.
  local -a critical_services=("sshd" "crond" "rsyslog")

  log_section "6. 进程和服务检查"

  # --no-headers keeps the ps header line out of the count.
  process_count=$(ps -e --no-headers | wc -l)
  log "当前进程总数: $process_count"

  zombies=$(ps -eo pid=,ppid=,stat=,comm= \
    | awk '$3 ~ /^Z/ {print "  PID: " $1 " PPID: " $2 " CMD: " $4}')
  zombie_count=$(grep -c . <<<"$zombies")
  log "僵尸进程数: $zombie_count"
  if [ "$zombie_count" -gt 0 ]; then
    log_warning "发现僵尸进程!"
    printf '%s\n' "$zombies" | tee -a "$REPORT_FILE"
  fi

  log ""
  log "关键服务状态检查:"
  for service in "${critical_services[@]}"; do
    if systemctl is-active --quiet "$service" 2>/dev/null; then
      log_ok "  $service: 运行中"
    elif pgrep "$service" >/dev/null 2>&1; then
      # Fallback for non-systemd hosts: any process whose name matches.
      log_ok "  $service: 运行中"
    else
      log_error "  $service: 未运行"
    fi
  done
}

# 7. System log check
#######################################
# Scan the system log for error keywords and the kernel ring buffer for OOM
# events.
# NOTE(review): the heading says "last hour" but the whole log file is
# scanned; no time filter is applied.
# Globals: REPORT_FILE (read)
# Outputs: report lines; the last 10 matching log lines and last 5 OOM lines
#######################################
check_logs() {
  local pattern='error|fail|critical'
  local candidate log_file="" error_count oom_lines oom_count

  log_section "7. 系统日志检查"

  log "最近1小时系统错误日志:"
  # RHEL family logs to /var/log/messages, Debian family to /var/log/syslog.
  for candidate in /var/log/messages /var/log/syslog; do
    if [ -r "$candidate" ]; then
      log_file=$candidate
      break
    fi
  done

  if [ -n "$log_file" ]; then
    # Count every match; only the display is limited to the last 10 lines.
    error_count=$(grep -ciE "$pattern" "$log_file")
    if [ "${error_count:-0}" -gt 0 ]; then
      log_warning "发现 $error_count 条错误日志"
      grep -iE "$pattern" "$log_file" | tail -10 \
        | awk '{print "  " $0}' | tee -a "$REPORT_FILE"
    else
      log_ok "无严重错误日志"
    fi
  fi

  log ""
  log "OOM(内存溢出)检查:"
  # dmesg may be denied to non-root users (kernel.dmesg_restrict).
  oom_lines=$(dmesg 2>/dev/null | grep -i "out of memory")
  oom_count=$(grep -c . <<<"$oom_lines")
  if [ "$oom_count" -gt 0 ]; then
    log_warning "发现 $oom_count 次OOM事件"
    tail -5 <<<"$oom_lines" | awk '{print "  " $0}' | tee -a "$REPORT_FILE"
  else
    log_ok "无OOM事件"
  fi
}

# 8. Inspection report summary
#######################################
# Count the [WARNING] and [ERROR] lines written to the report so far and log
# an overall verdict.
# Globals: REPORT_FILE (read)
# Outputs: report lines via the log helpers
#######################################
generate_summary() {
  local warning_count error_count

  log_section "8. 巡检报告摘要"

  # grep -c prints nothing when the file is unreadable; default to 0 so the
  # integer tests below never see an empty string.
  warning_count=$(grep -c '\[WARNING\]' "$REPORT_FILE" 2>/dev/null)
  error_count=$(grep -c '\[ERROR\]' "$REPORT_FILE" 2>/dev/null)
  warning_count=${warning_count:-0}
  error_count=${error_count:-0}

  log "巡检完成时间: $(date +"%Y-%m-%d %H:%M:%S")"
  log "告警数量: $warning_count"
  log "错误数量: $error_count"

  if [ "$error_count" -gt 0 ]; then
    log_error "发现 $error_count 个严重问题,请立即处理!"
  elif [ "$warning_count" -gt 0 ]; then
    log_warning "发现 $warning_count 个告警,建议关注"
  else
    log_ok "系统状态良好,无异常"
  fi

  log ""
  log "完整报告已保存至: $REPORT_FILE"
}

# Main function
#######################################
# Entry point: print the banner, warn when not running as root, run every
# check in order and finish with the summary.
#######################################
main() {
  echo "=========================================="
  echo "    服务器健康状态巡检脚本 v2.0"
  echo "=========================================="
  echo ""

  # Some checks need root; warn but carry on.
  if [ "$(id -u)" -ne 0 ]; then
    echo "警告: 非root用户运行,部分检查可能无法执行"
  fi

  check_basic_info
  check_cpu
  check_memory
  check_disk
  check_network
  check_processes
  check_logs
  generate_summary

  echo ""
  echo "=========================================="
  echo "    巡检完成!"
  echo "=========================================="
}

# Script entry point
main "$@"
# Make the script executable
chmod +x system_health_check.sh

# Run it manually
./system_health_check.sh

# Run it every day at 08:00.
# The new entry is appended to the current crontab: piping a single line
# straight into `crontab -` would replace every existing entry of the user.
(crontab -l 2>/dev/null; echo "0 8 * * * /path/to/system_health_check.sh") | crontab -

脚本2:磁盘空间深度检查

  • 检查所有分区的磁盘使用率和Inode使用率

  • 自动分析每个分区占用空间最大的目录和文件

  • 检测大文件和日志文件增长情况

  • 预测磁盘满的时间

  • 生成清理建议

#!/bin/bash
################################################################################
# Script name : disk_space_check.sh
# Description : In-depth disk space check and analysis
# Version     : v1.5
################################################################################

REPORT_FILE="/var/log/disk_check_$(date +%Y%m%d_%H%M%S).log"
DISK_WARNING=80
INODE_WARNING=80
LARGE_FILE_SIZE=1G  # Large-file threshold (passed to `find -size`)

# Colour definitions (expanded by printf %b in the log helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Each helper prints to stdout and appends the same text to $REPORT_FILE.

# log MESSAGE - timestamped informational line.
log() {
  printf '[%s] %s\n' "$(date +"%Y-%m-%d %H:%M:%S")" "$1" | tee -a "$REPORT_FILE"
}

# log_warning MESSAGE - yellow "[WARNING]" line.
log_warning() {
  printf '%b[WARNING] %s%b\n' "${YELLOW}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_error MESSAGE - red "[ERROR]" line.
log_error() {
  printf '%b[ERROR] %s%b\n' "${RED}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_ok MESSAGE - green "[OK]" line.
log_ok() {
  printf '%b[OK] %s%b\n' "${GREEN}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# 1. Disk usage check
#######################################
# Report usage of every real filesystem; for those above DISK_WARNING list
# the ten largest top-level entries and the ten largest files above
# LARGE_FILE_SIZE. Both scans stay on the filesystem being inspected.
# Globals: DISK_WARNING, LARGE_FILE_SIZE, REPORT_FILE (read)
#######################################
check_disk_usage() {
  local avail pct mount usage

  log "============================================================"
  log "1. 磁盘使用率检查"
  log "============================================================"

  # LC_ALL=C keeps the "Filesystem" header; -P keeps one line per filesystem.
  while read -r _ _ _ avail pct mount; do
    usage=${pct%\%}
    [[ "$usage" =~ ^[0-9]+$ ]] || continue

    log "分区: $mount"
    log "  使用率: ${usage}%"
    log "  可用空间: $avail"

    if [ "$usage" -gt "$DISK_WARNING" ]; then
      log_error "  磁盘使用率超过${DISK_WARNING}%!"

      log "  占用空间TOP 10目录:"
      du -xsh "${mount%/}"/* 2>/dev/null | sort -rh | head -10 \
        | awk '{print "    " $0}' | tee -a "$REPORT_FILE"

      log "  大文件(>$LARGE_FILE_SIZE):"
      # du prints "<size><TAB><path>", which is safe to split, unlike `ls -l`.
      find "$mount" -xdev -type f -size "+${LARGE_FILE_SIZE}" \
          -exec du -h --apparent-size {} + 2>/dev/null \
        | sort -rh | head -10 \
        | awk -F'\t' '{print "    " $2 " (" $1 ")"}' | tee -a "$REPORT_FILE"
    else
      log_ok "  使用率正常"
    fi
    log ""
  done < <(LC_ALL=C df -hP | grep -vE '^Filesystem|tmpfs|cdrom|loop')
}

# 2. Inode usage check
#######################################
# Report inode usage of every real filesystem; for those above INODE_WARNING
# list the top-level directories holding the most files and the single
# directories containing more than 1000 files.
# Each listing is produced by one `find` pass over the filesystem instead of
# one `find` per directory.
# Globals: INODE_WARNING, REPORT_FILE (read)
#######################################
check_inode_usage() {
  local pct mount inode_usage

  log "============================================================"
  log "2. Inode使用率检查"
  log "============================================================"

  while read -r _ _ _ _ pct mount; do
    inode_usage=${pct%\%}
    # Filesystems without inode accounting report "-".
    [[ "$inode_usage" =~ ^[0-9]+$ ]] || continue

    log "分区: $mount"
    log "  Inode使用率: ${inode_usage}%"

    if [ "$inode_usage" -gt "$INODE_WARNING" ]; then
      log_error "  Inode使用率超过${INODE_WARNING}%!"

      log "  文件数量TOP 10目录:"
      # %P is the path relative to $mount; its first component is the
      # top-level directory the file belongs to.
      find "$mount" -xdev -type f -printf '%P\n' 2>/dev/null \
        | awk -F/ 'NF > 1 {count[$1]++} END {for (d in count) print count[d], d}' \
        | sort -rn | head -10 \
        | awk -v base="${mount%/}" \
            '{n = $1; sub(/^[0-9]+ /, ""); print "    " n " files: " base "/" $0}' \
        | tee -a "$REPORT_FILE"

      log "  可能的小文件聚集目录:"
      # %h is the directory that directly contains each file.
      find "$mount" -xdev -type f -printf '%h\n' 2>/dev/null \
        | sort | uniq -c | sort -rn | head -10 \
        | awk '{n = $1; sub(/^ *[0-9]+ /, ""); if (n > 1000) print "    " n " files: " $0}' \
        | tee -a "$REPORT_FILE"
    else
      log_ok "  Inode使用率正常"
    fi
    log ""
  done < <(LC_ALL=C df -iP | grep -vE '^Filesystem|tmpfs|cdrom|loop')
}

# 3. Log file check
#######################################
# For each common log directory that exists, list log files larger than
# 100 MB and the directory's total size; then list *.log files under
# /var/log that have not been modified for 30 days.
# Globals: REPORT_FILE (read)
#######################################
check_log_files() {
  local log_dir total_size
  # Common log directories
  local -a log_dirs=("/var/log" "/opt/logs" "/data/logs" "/app/logs")

  log "============================================================"
  log "3. 日志文件大小检查"
  log "============================================================"

  for log_dir in "${log_dirs[@]}"; do
    if [ -d "$log_dir" ]; then
      log "检查目录: $log_dir"

      log "  大日志文件(>100MB):"
      # du prints "<size><TAB><path>", which splits safely, unlike `ls -l`.
      find "$log_dir" -type f -size +100M \
          -exec du -h --apparent-size {} + 2>/dev/null \
        | sort -rh | head -10 \
        | awk -F'\t' '{print "    " $2 " (" $1 ")"}' | tee -a "$REPORT_FILE"

      total_size=$(du -sh "$log_dir" 2>/dev/null | awk '{print $1}')
      log "  总大小: $total_size"
      log ""
    fi
  done

  # Logs that were apparently never rotated
  log "未压缩的旧日志文件(30天前):"
  find /var/log -type f -name "*.log" -mtime +30 ! -name "*.gz" \
      -exec du -h --apparent-size {} + 2>/dev/null \
    | sort -rh | head -10 \
    | awk -F'\t' '{print "  " $2 " (" $1 ")"}' | tee -a "$REPORT_FILE"
}

# 4. Temporary file check
#######################################
# Report size and file count of /tmp, the files not accessed for more than
# seven days (ten largest listed), and the size of /var/tmp.
# Globals: REPORT_FILE (read)
#######################################
check_temp_files() {
  local tmp_size tmp_file_count old_tmp var_tmp_size

  log "============================================================"
  log "4. 临时文件检查"
  log "============================================================"

  if [ -d "/tmp" ]; then
    tmp_size=$(du -sh /tmp 2>/dev/null | awk '{print $1}')
    tmp_file_count=$(find /tmp -type f 2>/dev/null | wc -l)

    log "/tmp目录:"
    log "  总大小: $tmp_size"
    log "  文件数: $tmp_file_count"

    old_tmp=$(find /tmp -type f -atime +7 2>/dev/null | wc -l)
    if [ "$old_tmp" -gt 0 ]; then
      log_warning "  发现 $old_tmp 个超过7天未访问的临时文件"
      log "  占用空间TOP 10:"
      # du prints "<size><TAB><path>", so sort -h orders by the size column.
      find /tmp -type f -atime +7 \
          -exec du -h --apparent-size {} + 2>/dev/null \
        | sort -rh | head -10 \
        | awk -F'\t' '{print "    " $2 " (" $1 ")"}' | tee -a "$REPORT_FILE"
    fi
  fi

  if [ -d "/var/tmp" ]; then
    var_tmp_size=$(du -sh /var/tmp 2>/dev/null | awk '{print $1}')
    log ""
    log "/var/tmp目录:"
    log "  总大小: $var_tmp_size"
  fi
}

# 5. Disk I/O check
# Print "<total ticks> <iowait ticks>" from the aggregate cpu line of
# /proc/stat. Total sums user..steal (fields 2-9); iowait is field 6.
_cpu_iowait_ticks() {
  awk '/^cpu / {
         t = 0
         for (i = 2; i <= 9; i++) t += $i
         printf "%.0f %.0f\n", t, $6
       }' /proc/stat
}

#######################################
# Print extended iostat statistics when iostat is installed and report the
# CPU iowait percentage measured over one second from /proc/stat (the
# position of "wa" in `top` output varies with version and value width).
# Globals: REPORT_FILE (read)
# Outputs: report lines; a warning when iowait exceeds 20 %
#######################################
check_disk_io() {
  local total1 wait1 total2 wait2 io_wait

  log "============================================================"
  log "5. 磁盘IO统计"
  log "============================================================"

  if command -v iostat &>/dev/null; then
    log "磁盘IO统计(最近1分钟):"
    iostat -x 1 2 | tail -n +4 | awk 'NF' | tail -n +2 | tee -a "$REPORT_FILE"
  else
    log_warning "未安装iostat工具 (yum install sysstat)"
  fi

  read -r total1 wait1 < <(_cpu_iowait_ticks)
  sleep 1
  read -r total2 wait2 < <(_cpu_iowait_ticks)
  io_wait=$(awk -v t1="${total1:-0}" -v w1="${wait1:-0}" \
                -v t2="${total2:-0}" -v w2="${wait2:-0}" \
    'BEGIN {
       d = t2 - t1
       if (d > 0) printf "%.1f", (w2 - w1) * 100 / d
       else       print "0.0"
     }')

  log ""
  log "当前IO等待: ${io_wait}%"
  if awk -v a="$io_wait" 'BEGIN { exit !(a + 0 > 20) }'; then
    log_warning "IO等待过高!"
  fi
}

# 6. Cleanup suggestions
#######################################
# Log a static list of cleanup commands the operator may consider.
# Nothing is executed; the commands are only written to the report.
#######################################
generate_cleanup_suggestions() {
  log "============================================================"
  log "6. 磁盘清理建议"
  log "============================================================"

  log "可以考虑清理的内容:"
  log "1. 压缩或删除旧日志文件:"
  log "   find /var/log -type f -name '*.log' -mtime +30 -exec gzip {} \;"
  log ""
  log "2. 清理超过7天的临时文件:"
  log "   find /tmp -type f -atime +7 -delete"
  log ""
  log "3. 清理yum缓存(CentOS/RHEL):"
  log "   yum clean all"
  log ""
  log "4. 清理apt缓存(Ubuntu/Debian):"
  log "   apt-get clean"
  log ""
  log "5. 清理Docker未使用的镜像和容器:"
  log "   docker system prune -a"
  log ""
  log "6. 清理journal日志:"
  log "   journalctl --vacuum-time=7d"
}

# Main function
#######################################
# Entry point: print the banner, run every disk check in order and log where
# the report was saved.
#######################################
main() {
  echo "=========================================="
  echo "    磁盘空间深度检查脚本 v1.5"
  echo "=========================================="
  echo ""

  check_disk_usage
  check_inode_usage
  check_log_files
  check_temp_files
  check_disk_io
  generate_cleanup_suggestions

  log ""
  log "============================================================"
  log "检查完成!"
  log "报告保存至: $REPORT_FILE"
  log "============================================================"
}

main "$@"

脚本3:网络连接和端口检查

专门检查网络状态、连接数、端口监听等网络相关问题。

#!/bin/bash
################################################################################
# Script name : network_check.sh
# Description : In-depth check of network connections and ports
# Version     : v1.0
################################################################################

REPORT_FILE="/var/log/network_check_$(date +%Y%m%d_%H%M%S).log"

# Alert thresholds
TIME_WAIT_WARNING=5000
ESTABLISHED_WARNING=10000

# Key ports that are expected to be listening
CRITICAL_PORTS=(22 80 443 3306 6379 8080)

# Colour definitions (expanded by printf %b in the log helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Each helper prints to stdout and appends the same text to $REPORT_FILE.

# log MESSAGE - timestamped informational line.
log() {
  printf '[%s] %s\n' "$(date +"%Y-%m-%d %H:%M:%S")" "$1" | tee -a "$REPORT_FILE"
}

# log_warning MESSAGE - yellow "[WARNING]" line.
log_warning() {
  printf '%b[WARNING] %s%b\n' "${YELLOW}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_error MESSAGE - red "[ERROR]" line.
log_error() {
  printf '%b[ERROR] %s%b\n' "${RED}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_ok MESSAGE - green "[OK]" line.
log_ok() {
  printf '%b[OK] %s%b\n' "${GREEN}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# 1. Network interface status
# Print the content of a /sys counter file, or 0 when it cannot be read.
_read_net_counter() {
  local value
  value=$(cat "$1" 2>/dev/null)
  printf '%s\n' "${value:-0}"
}

#######################################
# List all interfaces and, for every interface except the loopback, report
# link state, traffic counters, drops and errors.
# Only the interface named exactly "lo" is skipped (a substring filter would
# also hide e.g. "wlo1"), and "name@parent" entries are reduced to "name" so
# the /sys counters can be found.
# Globals: REPORT_FILE (read)
#######################################
check_network_interfaces() {
  local interface status stats_dir
  local rx_bytes tx_bytes rx_drop tx_drop rx_err tx_err

  log "============================================================"
  log "1. 网络接口状态"
  log "============================================================"

  log "接口列表:"
  ip -br addr | tee -a "$REPORT_FILE"

  log ""
  log "接口详细信息:"
  # `ip -br link` prints "<name> <state> <mac> <flags>" per interface.
  while read -r interface status _; do
    interface=${interface%%@*}
    [ -n "$interface" ] || continue
    [ "$interface" = "lo" ] && continue

    log "接口: $interface"
    log "  状态: $status"

    stats_dir="/sys/class/net/$interface/statistics"
    rx_bytes=$(_read_net_counter "$stats_dir/rx_bytes")
    tx_bytes=$(_read_net_counter "$stats_dir/tx_bytes")
    rx_drop=$(_read_net_counter "$stats_dir/rx_dropped")
    tx_drop=$(_read_net_counter "$stats_dir/tx_dropped")
    rx_err=$(_read_net_counter "$stats_dir/rx_errors")
    tx_err=$(_read_net_counter "$stats_dir/tx_errors")

    log "  接收: $(numfmt --to=iec "$rx_bytes" 2>/dev/null || echo "${rx_bytes}B")"
    log "  发送: $(numfmt --to=iec "$tx_bytes" 2>/dev/null || echo "${tx_bytes}B")"

    if [ "$rx_drop" -gt 0 ] || [ "$tx_drop" -gt 0 ]; then
      log_warning "  丢包: RX=$rx_drop TX=$tx_drop"
    fi

    if [ "$rx_err" -gt 0 ] || [ "$tx_err" -gt 0 ]; then
      log_error "  错误: RX=$rx_err TX=$tx_err"
    fi

    log ""
  done < <(ip -br link)
}

# 2. TCP connection state statistics
#######################################
# Report the TCP state distribution, TIME_WAIT and ESTABLISHED counts and the
# ten remote addresses with the most established connections.
# The socket table is read once (netstat, or ss when netstat is missing) as
# "<state> <peer>" pairs. The port is stripped from the peer by removing the
# trailing ":<digits>", so IPv6 addresses stay intact.
# Globals: TIME_WAIT_WARNING, ESTABLISHED_WARNING, REPORT_FILE (read)
# Returns: 1 when neither netstat nor ss is available
#######################################
check_tcp_connections() {
  local conns time_wait_count established_count

  log "============================================================"
  log "2. TCP连接状态统计"
  log "============================================================"

  if command -v netstat >/dev/null 2>&1; then
    conns=$(netstat -ant 2>/dev/null | awk '/^tcp/ {print $6, $5}')
  elif command -v ss >/dev/null 2>&1; then
    # Normalise ss state names to the netstat spelling.
    conns=$(ss -tan 2>/dev/null \
      | awk 'NR > 1 {s = $1; gsub(/-/, "_", s); if (s == "ESTAB") s = "ESTABLISHED"; print s, $5}')
  else
    log_warning "未找到netstat或ss命令,无法统计TCP连接"
    return 1
  fi

  log "连接状态分布:"
  if [ -n "$conns" ]; then
    awk '{print $1}' <<<"$conns" | sort | uniq -c | sort -rn | tee -a "$REPORT_FILE"
  fi

  time_wait_count=$(awk '$1 == "TIME_WAIT" {n++} END {print n + 0}' <<<"$conns")
  log ""
  log "TIME_WAIT连接数: $time_wait_count"
  if [ "$time_wait_count" -gt "$TIME_WAIT_WARNING" ]; then
    log_warning "TIME_WAIT连接数过多,可能需要优化TCP参数"
    log "优化建议:"
    log "  echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse"
    # tcp_tw_recycle was removed in Linux 4.12 and broke clients behind NAT,
    # so it is no longer suggested.
    log "  # 注意: tcp_tw_recycle 已在 Linux 4.12 中移除,且在NAT环境下会导致连接失败,请勿启用"
  else
    log_ok "TIME_WAIT连接数正常"
  fi

  established_count=$(awk '$1 == "ESTABLISHED" {n++} END {print n + 0}' <<<"$conns")
  log ""
  log "ESTABLISHED连接数: $established_count"
  if [ "$established_count" -gt "$ESTABLISHED_WARNING" ]; then
    log_warning "活动连接数过多"
  fi

  log ""
  log "连接数最多的远程IP TOP 10:"
  awk '$1 == "ESTABLISHED" {peer = $2; sub(/:[0-9]+$/, "", peer); print peer}' <<<"$conns" \
    | sort | uniq -c | sort -rn | head -10 | tee -a "$REPORT_FILE"
}

# 3. Listening port check
#######################################
# List listening sockets and verify that every port in CRITICAL_PORTS has a
# listener. The socket table is read once; a port matches only as the exact
# suffix of the local-address column, and the first matching socket supplies
# the PID/program name (a service bound on IPv4 and IPv6 appears twice).
# Globals: CRITICAL_PORTS, REPORT_FILE (read)
# Returns: 1 when netstat is not installed
#######################################
check_listening_ports() {
  local listeners port entry pid process

  log "============================================================"
  log "3. 监听端口检查"
  log "============================================================"

  if ! command -v netstat >/dev/null 2>&1; then
    log_warning "未安装netstat工具 (yum install net-tools)"
    return 1
  fi
  listeners=$(netstat -tulnp 2>/dev/null)

  log "当前监听端口:"
  grep LISTEN <<<"$listeners" | tee -a "$REPORT_FILE"

  log ""
  log "关键端口状态检查:"
  for port in "${CRITICAL_PORTS[@]}"; do
    # UDP rows have no State column, so their PID/Program field is $6.
    if entry=$(awk -v p="$port" '
          $4 ~ (":" p "$") { print ($1 ~ /^udp/ ? $6 : $7); found = 1; exit }
          END { exit !found }' <<<"$listeners"); then
      pid=${entry%%/*}
      process=${entry#*/}
      log_ok "端口 $port: 监听中 (进程:$process PID:$pid)"
    else
      log_warning "端口 $port: 未监听"
    fi
  done
}

# 4. Network connectivity check
#######################################
# Test DNS resolution, reachability of the default gateway and reachability
# of a public address.
# DNS is tested through the system resolver (getent) first; nslookup is only
# a fallback, so a host without bind-utils is not reported as a DNS failure.
#######################################
check_network_connectivity() {
  local gateway

  log "============================================================"
  log "4. 网络连通性检查"
  log "============================================================"

  log "DNS解析测试:"
  if getent hosts www.baidu.com &>/dev/null || nslookup www.baidu.com &>/dev/null; then
    log_ok "  DNS解析正常"
  else
    log_error "  DNS解析失败"
  fi

  log ""
  log "网关连通性测试:"
  gateway=$(ip route | awk '/^default/ {print $3; exit}')
  if [ -n "$gateway" ]; then
    log "  默认网关: $gateway"
    if ping -c 3 -W 2 "$gateway" &>/dev/null; then
      log_ok "  网关可达"
    else
      log_error "  网关不可达"
    fi
  else
    log_warning "  未找到默认网关"
  fi

  log ""
  log "外网连通性测试:"
  if ping -c 3 -W 2 8.8.8.8 &>/dev/null; then
    log_ok "  外网可达"
  else
    log_error "  外网不可达"
  fi
}

# 5. Network parameter check
#######################################
# Log selected TCP/socket kernel parameters and the file-descriptor limits of
# the current shell. A parameter that does not exist on the running kernel
# (e.g. tcp_tw_recycle on recent kernels) is logged as N/A.
#######################################
check_network_parameters() {
  local key value
  local -a params=(
    ipv4/tcp_tw_reuse
    ipv4/tcp_tw_recycle
    ipv4/tcp_fin_timeout
    ipv4/tcp_keepalive_time
    ipv4/tcp_max_syn_backlog
    core/somaxconn
  )

  log "============================================================"
  log "5. 网络内核参数"
  log "============================================================"

  log "关键网络参数:"
  for key in "${params[@]}"; do
    value=$(cat "/proc/sys/net/$key" 2>/dev/null || echo 'N/A')
    # Log only the parameter name, without the ipv4/ or core/ prefix.
    log "  ${key##*/}: $value"
  done

  log ""
  log "文件描述符限制:"
  log "  软限制: $(ulimit -Sn)"
  log "  硬限制: $(ulimit -Hn)"
}

# 6. Firewall status
#######################################
# Report the number of iptables chains (when iptables is installed) and
# whether firewalld is running, with its open services and ports.
# Globals: REPORT_FILE (read)
#######################################
check_firewall() {
  local rule_count

  log "============================================================"
  log "6. 防火墙状态"
  log "============================================================"

  if command -v iptables &>/dev/null; then
    # Listing rules needs root; errors are discarded and the count is then 0.
    rule_count=$(iptables -L -n 2>/dev/null | grep -c "Chain")
    log "iptables规则链数: $rule_count"
  fi

  if systemctl is-active --quiet firewalld 2>/dev/null; then
    log_ok "firewalld: 运行中"
    log "开放的服务:"
    firewall-cmd --list-services 2>/dev/null | tee -a "$REPORT_FILE"
    log "开放的端口:"
    firewall-cmd --list-ports 2>/dev/null | tee -a "$REPORT_FILE"
  else
    log "firewalld: 未运行"
  fi
}

# Main function
#######################################
# Entry point: print the banner, run every network check in order and log
# where the report was saved.
#######################################
main() {
  echo "=========================================="
  echo "    网络状态检查脚本 v1.0"
  echo "=========================================="
  echo ""

  check_network_interfaces
  check_tcp_connections
  check_listening_ports
  check_network_connectivity
  check_network_parameters
  check_firewall

  log ""
  log "============================================================"
  log "检查完成!"
  log "报告保存至: $REPORT_FILE"
  log "============================================================"
}

main "$@"

脚本4:应用进程健康检查

检查关键业务进程的运行状态、资源占用和接口响应。

#!/bin/bash
################################################################################
# Script name : process_health_check.sh
# Description : Application process health check
# Version     : v2.0
################################################################################

REPORT_FILE="/var/log/process_check_$(date +%Y%m%d_%H%M%S).log"

# Processes to check (adapt to the actual workload).
# Value format: "<pgrep pattern>|<comma-separated ports>"
declare -A CRITICAL_PROCESSES=(
  ["nginx"]="nginx|80,443"
  ["mysql"]="mysqld|3306"
  ["redis"]="redis-server|6379"
  ["java"]="java|8080"
)

# HTTP endpoints to check: display name -> URL
declare -A HTTP_ENDPOINTS=(
  ["主页"]="http://localhost/"
  ["健康检查"]="http://localhost:8080/health"
  ["API状态"]="http://localhost:8080/api/status"
)

# Colour definitions (expanded by printf %b in the log helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Each helper prints to stdout and appends the same text to $REPORT_FILE.

# log MESSAGE - timestamped informational line.
log() {
  printf '[%s] %s\n' "$(date +"%Y-%m-%d %H:%M:%S")" "$1" | tee -a "$REPORT_FILE"
}

# log_warning MESSAGE - yellow "[WARNING]" line.
log_warning() {
  printf '%b[WARNING] %s%b\n' "${YELLOW}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_error MESSAGE - red "[ERROR]" line.
log_error() {
  printf '%b[ERROR] %s%b\n' "${RED}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_ok MESSAGE - green "[OK]" line.
log_ok() {
  printf '%b[OK] %s%b\n' "${GREEN}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# 1. Process check
#######################################
# Check that at least one process matches a pattern and report the state and
# resource usage of every matching instance.
# Arguments: $1 - display name
#            $2 - pattern handed to `pgrep -f` (matched against the full
#                 command line)
# Outputs:   report lines; warnings for CPU > 80 %, memory > 80 %,
#            threads > 1000 or open file descriptors > 10000
# Returns:   1 when no process matches, 0 otherwise
#######################################
check_process() {
  local name=$1
  local pattern=$2
  local pids pid pid_count core_pattern
  local stat cpu mem rss vsz etime threads fd_count

  log "检查进程: $name"

  pids=$(pgrep -f "$pattern")
  if [ -z "$pids" ]; then
    log_error "  进程未运行!"
    return 1
  fi

  pid_count=$(wc -l <<<"$pids")
  log_ok "  进程运行中 (实例数: $pid_count)"

  # core_pattern is system-wide, so it is read once rather than per PID.
  core_pattern=$(cat /proc/sys/kernel/core_pattern 2>/dev/null)

  for pid in $pids; do
    # One ps call per PID; "name=" suppresses the column headers.
    stat=""
    read -r stat cpu mem rss vsz etime threads < <(
      ps -p "$pid" -o stat=,pcpu=,pmem=,rss=,vsz=,etime=,nlwp= 2>/dev/null
    )
    # The process may have exited since pgrep ran.
    [ -n "$stat" ] || continue

    if [[ $stat == *"Z"* ]]; then
      log_error "  PID $pid: 僵尸状态"
      continue
    elif [[ $stat == *"T"* ]]; then
      log_warning "  PID $pid: 暂停状态"
      continue
    fi

    log "  PID $pid:"
    log "    CPU: ${cpu}%"
    log "    内存: ${mem}% (RSS:${rss}KB VSZ:${vsz}KB)"
    log "    线程数: $threads"
    log "    运行时长: $etime"

    # awk does the floating-point comparisons, so bc is not required.
    if awk -v a="$cpu" 'BEGIN { exit !(a + 0 > 80) }'; then
      log_warning "    CPU使用率过高!"
    fi

    if awk -v a="$mem" 'BEGIN { exit !(a + 0 > 80) }'; then
      log_warning "    内存使用率过高!"
    fi

    if [ "${threads:-0}" -gt 1000 ]; then
      log_warning "    线程数过多!"
    fi

    if [ -d "/proc/$pid/fd" ]; then
      fd_count=$(ls "/proc/$pid/fd" 2>/dev/null | wc -l)
      log "    文件描述符: $fd_count"
      if [ "$fd_count" -gt 10000 ]; then
        log_warning "    文件描述符过多!"
      fi
    fi

    if [ -d "/proc/$pid" ]; then
      log "    Core dump配置: $core_pattern"
    fi
  done

  return 0
}

# 2. Port check
#######################################
# Check that each port of a service is listening and count its connections
# per state.
# Ports are matched only against the LOCAL address column, so outgoing
# connections to the same remote port number are not counted.
# Arguments: $1 - display name
#            $2 - comma-separated list of ports
# Returns:   1 when netstat is not installed
#######################################
check_ports() {
  local name=$1
  local ports=$2
  local listeners connections port entry pid proc
  local established time_wait close_wait
  local -a port_array

  log ""
  log "检查端口: $name"

  if ! command -v netstat >/dev/null 2>&1; then
    log_warning "  未安装netstat工具 (yum install net-tools)"
    return 1
  fi

  # Read the socket tables once and reuse them for every port.
  listeners=$(netstat -tulnp 2>/dev/null)
  connections=$(netstat -ant 2>/dev/null)

  IFS=',' read -ra port_array <<< "$ports"
  for port in "${port_array[@]}"; do
    # UDP rows have no State column, so their PID/Program field is $6.
    if entry=$(awk -v p="$port" '
          $4 ~ (":" p "$") { print ($1 ~ /^udp/ ? $6 : $7); found = 1; exit }
          END { exit !found }' <<<"$listeners"); then
      pid=${entry%%/*}
      proc=${entry#*/}

      read -r established time_wait close_wait < <(
        awk -v p="$port" '
          /^tcp/ && $4 ~ (":" p "$") {
            if ($6 == "ESTABLISHED") e++
            else if ($6 == "TIME_WAIT") t++
            else if ($6 == "CLOSE_WAIT") c++
          }
          END { print e + 0, t + 0, c + 0 }' <<<"$connections"
      )

      log_ok "  端口 $port: 监听中 (进程:$proc PID:$pid)"
      log "    ESTABLISHED: $established"
      log "    TIME_WAIT: $time_wait"
      log "    CLOSE_WAIT: $close_wait"

      if [ "$close_wait" -gt 100 ]; then
        log_warning "    CLOSE_WAIT连接过多,可能存在连接泄露!"
      fi
    else
      log_error "  端口 $port: 未监听"
    fi
  done
}

# 3. HTTP endpoint check
#######################################
# Request every URL in HTTP_ENDPOINTS and report status code and response
# time.
# The transfer result is taken from curl's own exit status (testing $? after
# a later command would always see 0), and the elapsed time comes from
# curl's %{time_total} instead of subtracting two `date` calls.
# Globals: HTTP_ENDPOINTS (read)
# Outputs: OK for 200, warning for 301/302, error for other codes or failed
#          requests; an extra warning when the response takes over 3 s
#######################################
check_http_endpoints() {
  local name url response http_code time_total response_time

  log ""
  log "============================================================"
  log "HTTP接口健康检查"
  log "============================================================"

  for name in "${!HTTP_ENDPOINTS[@]}"; do
    url="${HTTP_ENDPOINTS[$name]}"
    log "检查: $name ($url)"

    if response=$(curl -s -o /dev/null -w '%{http_code} %{time_total}' \
        --connect-timeout 5 --max-time 10 "$url" 2>/dev/null); then
      read -r http_code time_total <<<"$response"
      response_time=$(awk -v t="$time_total" 'BEGIN { printf "%.3f", t }')

      if [ "$http_code" == "200" ]; then
        log_ok "  HTTP $http_code 响应时间:${response_time}s"
      elif [ "$http_code" == "301" ] || [ "$http_code" == "302" ]; then
        log_warning "  HTTP $http_code (重定向)"
      else
        log_error "  HTTP $http_code (异常)"
      fi

      if awk -v t="$response_time" 'BEGIN { exit !(t + 0 > 3) }'; then
        log_warning "  响应时间过长: ${response_time}s"
      fi
    else
      log_error "  请求失败 - 连接超时或网络错误"
    fi
  done
}

# 4. Process dependency check
#######################################
# For the first running instance of every entry in CRITICAL_PROCESSES,
# resolve its executable and report shared libraries that ldd cannot find.
# Entries with no running process or no readable /proc/<pid>/exe are skipped
# silently.
# Globals: CRITICAL_PROCESSES, REPORT_FILE (read)
#######################################
check_process_dependencies() {
  local proc pattern pid exe missing

  log ""
  log "============================================================"
  log "进程依赖检查"
  log "============================================================"

  log "共享库依赖:"
  for proc in "${!CRITICAL_PROCESSES[@]}"; do
    # The value is "<pattern>|<ports>"; keep the part before the first "|".
    pattern=${CRITICAL_PROCESSES[$proc]%%|*}
    pid=$(pgrep -f "$pattern" | head -1)
    [ -n "$pid" ] || continue

    exe=$(readlink "/proc/$pid/exe" 2>/dev/null)
    [ -n "$exe" ] || continue

    log "  $proc ($exe):"
    missing=$(ldd "$exe" 2>/dev/null | grep "not found")
    if [ -n "$missing" ]; then
      log_error "    发现缺失的依赖:"
      awk '{print "      " $0}' <<<"$missing" | tee -a "$REPORT_FILE"
    else
      log_ok "    依赖完整"
    fi
  done
}

# Main function
#######################################
# Entry point: check process and ports of every entry in CRITICAL_PROCESSES,
# then the HTTP endpoints and the shared-library dependencies.
#######################################
main() {
  local proc pattern ports

  echo "=========================================="
  echo "    应用进程健康检查 v2.0"
  echo "=========================================="
  echo ""

  log "============================================================"
  log "进程和端口检查"
  log "============================================================"

  for proc in "${!CRITICAL_PROCESSES[@]}"; do
    # Each value is "<pattern>|<ports>".
    IFS='|' read -r pattern ports <<< "${CRITICAL_PROCESSES[$proc]}"
    check_process "$proc" "$pattern"
    check_ports "$proc" "$ports"
    log ""
  done

  # HTTP endpoint check
  check_http_endpoints

  # Dependency check
  check_process_dependencies

  log ""
  log "============================================================"
  log "检查完成!"
  log "报告保存至: $REPORT_FILE"
  log "============================================================"
}

main "$@"

脚本5:安全基线检查

检查服务器的安全配置,包括账户安全、权限配置、SSH安全等。

#!/bin/bash
################################################################################
# Script name : security_baseline_check.sh
# Description : Server security baseline check
# Version     : v1.0
################################################################################

REPORT_FILE="/var/log/security_check_$(date +%Y%m%d_%H%M%S).log"

# Colour definitions (expanded by printf %b in the log helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Each helper prints to stdout and appends the same text to $REPORT_FILE.

# log MESSAGE - timestamped informational line.
log() {
  printf '[%s] %s\n' "$(date +"%Y-%m-%d %H:%M:%S")" "$1" | tee -a "$REPORT_FILE"
}

# log_warning MESSAGE - yellow "[WARNING]" line.
log_warning() {
  printf '%b[WARNING] %s%b\n' "${YELLOW}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_error MESSAGE - red "[ERROR]" line.
log_error() {
  printf '%b[ERROR] %s%b\n' "${RED}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# log_ok MESSAGE - green "[OK]" line.
log_ok() {
  printf '%b[OK] %s%b\n' "${GREEN}" "$1" "${NC}" | tee -a "$REPORT_FILE"
}

# 1. Account security check
#######################################
# Audit local accounts: empty passwords, UID-0 accounts, system accounts with
# a login shell, sudo grants and recent failed logins.
# When /etc/shadow cannot be read (not running as root) the empty-password
# check is reported as skipped instead of being reported as passed.
# Globals: REPORT_FILE (read)
#######################################
check_account_security() {
  local empty_pass root_users

  log "============================================================"
  log "1. 账户安全检查"
  log "============================================================"

  log "空密码账户检查:"
  if [ -r /etc/shadow ]; then
    empty_pass=$(awk -F: '($2 == "") {print $1}' /etc/shadow 2>/dev/null)
    if [ -n "$empty_pass" ]; then
      log_error "  发现空密码账户:"
      awk '{print "    " $0}' <<<"$empty_pass" | tee -a "$REPORT_FILE"
    else
      log_ok "  无空密码账户"
    fi
  else
    log_warning "  无法读取 /etc/shadow(需要root权限),已跳过空密码检查"
  fi

  log ""
  log "UID为0的账户(root权限):"
  root_users=$(awk -F: '($3 == 0) {print $1}' /etc/passwd)
  awk '{print "  " $0}' <<<"$root_users" | tee -a "$REPORT_FILE"
  if [ "$(grep -c . <<<"$root_users")" -gt 1 ]; then
    log_warning "  发现多个UID为0的账户!"
  fi

  log ""
  log "可登录的系统账户(UID<1000且shell不是nologin):"
  awk -F: '($3 < 1000 && $7 !~ /nologin|false/) {print "  " $1 " (UID:" $3 " Shell:" $7 ")"}' \
    /etc/passwd | tee -a "$REPORT_FILE"

  log ""
  log "拥有sudo权限的用户:"
  grep -E '^%wheel|^%sudo|^[^#].*ALL=.*ALL' /etc/sudoers /etc/sudoers.d/* 2>/dev/null \
    | awk '{print "  " $0}' | tee -a "$REPORT_FILE"

  log ""
  log "最近10次登录失败记录:"
  # lastb reads /var/log/btmp, which normally requires root.
  lastb 2>/dev/null | head -10 | awk '{print "  " $0}' | tee -a "$REPORT_FILE"
}

# 2. SSH security configuration check
# _sshd_option CONFIG_TEXT KEYWORD DEFAULT
# Print the value of the first line of CONFIG_TEXT whose keyword equals
# KEYWORD (case-insensitive), or DEFAULT when the keyword is not set.
_sshd_option() {
  local text=$1 key=${2,,} default=$3 value
  value=$(awk -v k="$key" '
    tolower($1) == k { $1 = ""; sub(/^[ \t]+/, ""); print; exit }' <<<"$text")
  printf '%s\n' "${value:-$default}"
}

#######################################
# Audit the sshd configuration: root login, password authentication, empty
# passwords, port, MaxAuthTries and AllowUsers.
# The effective configuration from `sshd -T` is used when it can be obtained
# (normally root only); otherwise the active lines of the file are parsed.
# An option that is not set is evaluated with its OpenSSH default, e.g.
# PasswordAuthentication defaults to yes.
# Globals: SSHD_CONFIG (written; an existing value is kept, default
#          /etc/ssh/sshd_config)
# Returns: 1 when the configuration file does not exist
#######################################
check_ssh_security() {
  local config_text root_login pass_auth empty_pass ssh_port max_auth allow_users

  log ""
  log "============================================================"
  log "2. SSH安全配置检查"
  log "============================================================"

  SSHD_CONFIG="${SSHD_CONFIG:-/etc/ssh/sshd_config}"

  if [ ! -f "$SSHD_CONFIG" ]; then
    log_error "SSH配置文件不存在"
    return 1
  fi

  config_text=$(sshd -T -f "$SSHD_CONFIG" 2>/dev/null)
  if [ -z "$config_text" ]; then
    # Fallback: non-comment, non-blank lines of the file itself.
    config_text=$(grep -vE '^[[:space:]]*(#|$)' "$SSHD_CONFIG" 2>/dev/null)
  fi

  root_login=$(_sshd_option "$config_text" PermitRootLogin prohibit-password)
  log "PermitRootLogin: $root_login"
  if [ "$root_login" == "yes" ]; then
    log_warning "  SSH允许root直接登录(不安全)"
  else
    log_ok "  SSH禁止root直接登录"
  fi

  pass_auth=$(_sshd_option "$config_text" PasswordAuthentication yes)
  log "PasswordAuthentication: $pass_auth"
  if [ "$pass_auth" == "yes" ]; then
    log_warning "  SSH允许密码登录(建议仅使用密钥)"
  else
    log_ok "  SSH已禁用密码登录"
  fi

  empty_pass=$(_sshd_option "$config_text" PermitEmptyPasswords no)
  log "PermitEmptyPasswords: $empty_pass"
  if [ "$empty_pass" == "yes" ]; then
    log_error "  SSH允许空密码登录(严重安全风险!)"
  else
    log_ok "  SSH禁止空密码登录"
  fi

  ssh_port=$(_sshd_option "$config_text" Port 22)
  log "SSH端口: $ssh_port"
  if [ "$ssh_port" == "22" ]; then
    log_warning "  使用默认SSH端口22(建议修改)"
  else
    log_ok "  已修改SSH默认端口"
  fi

  max_auth=$(_sshd_option "$config_text" MaxAuthTries 6)
  log "MaxAuthTries: $max_auth"

  allow_users=$(_sshd_option "$config_text" AllowUsers "")
  if [ -n "$allow_users" ]; then
    log "AllowUsers: $allow_users"
  else
    log_warning "  未配置AllowUsers(所有用户均可尝试登录)"
  fi
}

# 3. File permission check
#######################################
# Compare the mode of key system files with the expected mode and list the
# first 20 SUID files found on the system.
# Modes are compared as octal numbers: `stat -c %a` prints "0" for mode 000,
# which never equals the string "000".
# Globals: REPORT_FILE (read)
#######################################
check_file_permissions() {
  local file perm expected
  local -A critical_files=(
    ["/etc/passwd"]="644"
    ["/etc/shadow"]="000"
    ["/etc/group"]="644"
    ["/etc/gshadow"]="000"
    ["/etc/ssh/sshd_config"]="600"
  )

  log ""
  log "============================================================"
  log "3. 关键文件权限检查"
  log "============================================================"

  for file in "${!critical_files[@]}"; do
    if [ -f "$file" ]; then
      perm=$(stat -c "%a" "$file")
      expected="${critical_files[$file]}"
      log "$file: $perm"
      # The regex guard protects the 8#... arithmetic from non-octal input.
      if [[ "$perm" =~ ^[0-7]+$ ]] && ((8#$perm == 8#$expected)); then
        log_ok "  权限正确"
      else
        log_warning "  权限不正确(期望:$expected)"
      fi
    else
      log_warning "$file: 文件不存在"
    fi
  done

  log ""
  log "SUID文件检查(可能的提权风险):"
  find / -type f -perm -4000 2>/dev/null | head -20 \
    | awk '{print "  " $0}' | tee -a "$REPORT_FILE"
}

# 4. Firewall check
#######################################
# Report the size of the iptables listing, whether firewalld is running and
# the SELinux mode. Tools that are not installed are skipped.
#######################################
check_firewall() {
  local rule_count selinux

  log ""
  log "============================================================"
  log "4. 防火墙状态检查"
  log "============================================================"

  if command -v iptables &>/dev/null; then
    # Counts every line of the listing (chain headers included); listing
    # rules needs root, errors are discarded.
    rule_count=$(iptables -L -n 2>/dev/null | wc -l)
    log "iptables规则数: $rule_count"
    if [ "$rule_count" -lt 10 ]; then
      log_warning "  iptables规则较少,可能未配置防火墙"
    fi
  fi

  if systemctl is-active --quiet firewalld 2>/dev/null; then
    log_ok "firewalld: 运行中"
  else
    log_warning "firewalld: 未运行"
  fi

  if command -v getenforce &>/dev/null; then
    selinux=$(getenforce)
    log "SELinux: $selinux"
    if [ "$selinux" == "Disabled" ]; then
      log_warning "  SELinux已禁用(降低了安全性)"
    fi
  fi
}

# 5. System update check
#######################################
# Report the number of pending package updates (yum and/or apt) and the
# running kernel version.
# Only package lines are counted: "name.arch version repo" rows for yum (up
# to the "Obsoleting Packages" section) and "Inst" lines of a simulated
# upgrade for apt, so headers and blank lines do not inflate the numbers.
#######################################
check_system_updates() {
  local update_count kernel

  log ""
  log "============================================================"
  log "5. 系统更新检查"
  log "============================================================"

  # CentOS/RHEL
  if command -v yum &>/dev/null; then
    update_count=$(yum -q check-update 2>/dev/null \
      | awk '/^Obsoleting/ {exit} NF == 3 && $1 ~ /\./ {n++} END {print n + 0}')
    log "可用更新数(yum): $update_count"
    if [ "$update_count" -gt 50 ]; then
      log_warning "  可用更新较多,建议定期更新系统"
    fi
  fi

  # Ubuntu/Debian
  if command -v apt-get &>/dev/null; then
    apt-get update &>/dev/null
    # -s simulates the upgrade; each package to upgrade yields an "Inst" line.
    update_count=$(apt-get -s upgrade 2>/dev/null | grep -c '^Inst ')
    log "可用更新数(apt): $update_count"
  fi

  kernel=$(uname -r)
  log "当前内核版本: $kernel"
}

# 6. Shell history check
#######################################
# Search the current user's bash history for dangerous commands and for
# commands that may expose passwords.
# `dd` is matched as a whole word so commands such as "git add" are not
# reported, and password values are masked before they are written to the
# report, so the report does not become a second copy of the secret.
# Globals: HOME, REPORT_FILE (read)
#######################################
check_history() {
  local history_file="$HOME/.bash_history"
  local danger_re='rm -rf|mkfs|(^|[[:space:]])dd[[:space:]]|:/dev/|wget.*sh|curl.*sh|chmod 777'

  log ""
  log "============================================================"
  log "6. 敏感历史命令检查"
  log "============================================================"

  if [ -f "$history_file" ]; then
    log "危险命令记录:"
    grep -E "$danger_re" "$history_file" 2>/dev/null | tail -10 \
      | awk '{print "  " $0}' | tee -a "$REPORT_FILE"
  fi

  log ""
  log "可能的密码泄露(历史命令):"
  grep -E "password=|passwd|mysql.*-p" "$history_file" 2>/dev/null | tail -5 \
    | sed -E \
        -e 's/([Pp]assword=)[^[:space:]]+/\1****/g' \
        -e 's/(^|[[:space:]])(-p)[^[:space:]]+/\1\2****/g' \
    | awk '{print "  " $0}' | tee -a "$REPORT_FILE"
}

# Main function
#######################################
# Entry point: print the banner, warn when not running as root, run every
# security check in order and log where the report was saved.
#######################################
main() {
  echo "=========================================="
  echo "    安全基线检查脚本 v1.0"
  echo "=========================================="
  echo ""

  if [ "$(id -u)" -ne 0 ]; then
    log_warning "非root用户运行,部分检查可能无法执行"
  fi

  check_account_security
  check_ssh_security
  check_file_permissions
  check_firewall
  check_system_updates
  check_history

  log ""
  log "============================================================"
  log "检查完成!"
  log "报告保存至: $REPORT_FILE"
  log "============================================================"
}

main "$@"

脚本6:MySQL数据库健康检查

检查MySQL数据库的运行状态、性能指标和配置

#!/bin/bash
################################################################################
# Script name : mysql_health_check.sh
# Description : MySQL database health check
# Version     : v1.0
################################################################################

# Report file; every log helper appends to it.
REPORT_FILE="/var/log/mysql_check_$(date +%Y%m%d_%H%M%S).log"

# MySQL connection settings. Each value can be supplied through the
# environment so that real credentials do not have to be written into the
# script; the defaults below are placeholders to adjust.
MYSQL_USER="${MYSQL_USER:-monitor}"
MYSQL_PASS="${MYSQL_PASS:-your_password}"
MYSQL_HOST="${MYSQL_HOST:-localhost}"
MYSQL_PORT="${MYSQL_PORT:-3306}"

# Alert thresholds
CONN_WARNING=100
SLOW_QUERY_WARNING=100
THREAD_RUNNING_WARNING=10

# ANSI color escape sequences used by the log helpers
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'    # reset / no color

# Logging helpers. Each one prints a single line to stdout and appends the
# same line to REPORT_FILE. Color escapes come from RED/GREEN/YELLOW/NC and
# are expanded with %b; the message itself is printed with %s so that
# backslashes inside it are never interpreted.

# Plain, timestamped line. $1 - message
log() {
    printf '[%s] %s\n' "$(date +"%Y-%m-%d %H:%M:%S")" "$1" | tee -a "$REPORT_FILE"
}

# Yellow "[WARNING]" line. $1 - message
log_warning() {
    printf '%b[WARNING] %s%b\n' "$YELLOW" "$1" "$NC" | tee -a "$REPORT_FILE"
}

# Red "[ERROR]" line. $1 - message
log_error() {
    printf '%b[ERROR] %s%b\n' "$RED" "$1" "$NC" | tee -a "$REPORT_FILE"
}

# Green "[OK]" line. $1 - message
log_ok() {
    printf '%b[OK] %s%b\n' "$GREEN" "$1" "$NC" | tee -a "$REPORT_FILE"
}

# MySQL command wrapper
#######################################
# Run one SQL statement through the mysql client.
# Globals:   MYSQL_USER, MYSQL_PASS, MYSQL_HOST, MYSQL_PORT (read)
# Arguments: $1 - SQL text passed to "mysql -e"
# Outputs:   the client's stdout (stderr is discarded)
# Returns:   exit status of the mysql client
#######################################
mysql_exec() {
    # The password is handed over in a one-off option file (process
    # substitution) instead of "-p<password>", so it never shows up in the
    # process list. Backslash and double quote are escaped because the value
    # is written as a quoted option-file string.
    local escaped_pass=${MYSQL_PASS//\\/\\\\}
    escaped_pass=${escaped_pass//\"/\\\"}

    # --defaults-extra-file has to be the first option on the command line.
    mysql --defaults-extra-file=<(printf '[client]\npassword="%s"\n' "$escaped_pass") \
        -u"$MYSQL_USER" -h"$MYSQL_HOST" -P"$MYSQL_PORT" -e "$1" 2>/dev/null
}

# 1. Connection check
#######################################
# Section 1: verify that the server answers, then report its version and
# uptime.
# Calls:     mysql_exec, log, log_ok, log_error
# Returns:   0 when "SELECT 1" succeeds, 1 otherwise
#######################################
check_mysql_connection() {
    local version uptime uptime_days

    log "============================================================"
    log "1. MySQL连接检查"
    log "============================================================"

    if ! mysql_exec "SELECT 1" &>/dev/null; then
        log_error "MySQL连接失败"
        return 1
    fi

    log_ok "MySQL连接正常"

    # Server version (last output line; the first one is the column header)
    version=$(mysql_exec "SELECT VERSION()" | tail -1)
    log "MySQL版本: $version"

    # Uptime in days. awk is used for the division so that values below one
    # day print as "0.50" rather than ".50", and an empty value counts as 0.
    uptime=$(mysql_exec "SHOW STATUS LIKE 'Uptime'" | tail -1 | awk '{print $2}')
    uptime_days=$(LC_ALL=C awk -v s="${uptime:-0}" 'BEGIN { printf "%.2f", s / 86400 }')
    log "运行时长: ${uptime_days}天"

    return 0
}

# 2. Connection count check
#######################################
# Section 2: current / maximum / peak connection counts, connection usage
# percentage and number of running threads.
# Globals:   THREAD_RUNNING_WARNING (read)
# Calls:     mysql_exec, log, log_ok, log_warning
#######################################
check_mysql_connections() {
    local threads_connected max_connections conn_usage max_used threads_running

    log ""
    log "============================================================"
    log "2. 连接数检查"
    log "============================================================"

    # Current connections
    threads_connected=$(mysql_exec "SHOW STATUS LIKE 'Threads_connected'" | tail -1 | awk '{print $2}')
    log "当前连接数: $threads_connected"

    # Configured maximum
    max_connections=$(mysql_exec "SHOW VARIABLES LIKE 'max_connections'" | tail -1 | awk '{print $2}')
    log "最大连接数: $max_connections"

    # Usage percentage. Multiply before dividing: dividing first at two
    # decimals (as "scale=2; a / b * 100" does) turns 5/151 into 3.00%.
    # Skipped when either value is missing or the maximum is zero.
    if [[ "$threads_connected" =~ ^[0-9]+$ && "$max_connections" =~ ^[1-9][0-9]*$ ]]; then
        conn_usage=$(LC_ALL=C awk -v c="$threads_connected" -v m="$max_connections" \
            'BEGIN { printf "%.2f", c * 100 / m }')
        log "连接使用率: ${conn_usage}%"
        if LC_ALL=C awk -v u="$conn_usage" 'BEGIN { exit !(u > 80) }'; then
            log_warning "连接使用率过高!"
        else
            log_ok "连接使用率正常"
        fi
    else
        log_warning "连接使用率: 无法计算"
    fi

    # Peak connections since server start
    max_used=$(mysql_exec "SHOW STATUS LIKE 'Max_used_connections'" | tail -1 | awk '{print $2}')
    log "历史最大连接数: $max_used"

    # Threads currently running
    threads_running=$(mysql_exec "SHOW STATUS LIKE 'Threads_running'" | tail -1 | awk '{print $2}')
    log "正在运行的线程: $threads_running"
    if [[ "$threads_running" =~ ^[0-9]+$ ]] && (( threads_running > THREAD_RUNNING_WARNING )); then
        log_warning "运行线程数较多,可能存在慢查询"
    fi
}

# 3. Query performance check
#######################################
# Section 3: slow-query counter plus average QPS and TPS since server start.
# Globals:   SLOW_QUERY_WARNING (read)
# Calls:     mysql_exec, log, log_warning
#######################################
check_query_performance() {
    local slow_queries slow_query_time questions uptime qps
    local com_commit com_rollback tps
    # Shared awk program: n / t with two decimals, "0.00" when t is not positive.
    local rate_prog='BEGIN { if (t > 0) printf "%.2f", n / t; else printf "0.00" }'

    log ""
    log "============================================================"
    log "3. 查询性能检查"
    log "============================================================"

    # Slow queries
    slow_queries=$(mysql_exec "SHOW STATUS LIKE 'Slow_queries'" | tail -1 | awk '{print $2}')
    log "慢查询数: $slow_queries"
    # The numeric guard keeps an empty answer from breaking the comparison.
    if [[ "$slow_queries" =~ ^[0-9]+$ ]] && (( slow_queries > SLOW_QUERY_WARNING )); then
        log_warning "慢查询数量较多!"
        # Show the configured slow-query threshold
        slow_query_time=$(mysql_exec "SHOW VARIABLES LIKE 'long_query_time'" | tail -1 | awk '{print $2}')
        log "慢查询阈值: ${slow_query_time}秒"
    fi

    # QPS (queries per second), averaged over the server uptime
    questions=$(mysql_exec "SHOW STATUS LIKE 'Questions'" | tail -1 | awk '{print $2}')
    uptime=$(mysql_exec "SHOW STATUS LIKE 'Uptime'" | tail -1 | awk '{print $2}')
    qps=$(LC_ALL=C awk -v n="${questions:-0}" -v t="${uptime:-0}" "$rate_prog")
    log "平均QPS: $qps"

    # TPS (transactions per second): commits plus rollbacks over the uptime
    com_commit=$(mysql_exec "SHOW STATUS LIKE 'Com_commit'" | tail -1 | awk '{print $2}')
    com_rollback=$(mysql_exec "SHOW STATUS LIKE 'Com_rollback'" | tail -1 | awk '{print $2}')
    tps=$(LC_ALL=C awk -v n="$(( ${com_commit:-0} + ${com_rollback:-0} ))" -v t="${uptime:-0}" "$rate_prog")
    log "平均TPS: $tps"
}

# 4. InnoDB status check
#######################################
# Section 4: InnoDB buffer pool size, buffer pool hit ratio and redo log
# file size.
# Calls:     mysql_exec, log, log_ok, log_warning
#######################################
check_innodb_status() {
    local buffer_pool_bytes buffer_pool_gb buffer_reads buffer_read_requests
    local hit_ratio log_file_bytes log_file_mb

    log ""
    log "============================================================"
    log "4. InnoDB存储引擎检查"
    log "============================================================"

    # Buffer pool size, bytes to GB (awk keeps the leading zero, e.g. "0.25")
    buffer_pool_bytes=$(mysql_exec "SHOW VARIABLES LIKE 'innodb_buffer_pool_size'" | tail -1 | awk '{print $2}')
    buffer_pool_gb=$(LC_ALL=C awk -v b="${buffer_pool_bytes:-0}" 'BEGIN { printf "%.2f", b / 1024 / 1024 / 1024 }')
    log "InnoDB缓冲池大小: ${buffer_pool_gb}GB"

    # Buffer pool hit ratio = (1 - physical reads / logical read requests) * 100
    buffer_reads=$(mysql_exec "SHOW STATUS LIKE 'Innodb_buffer_pool_reads'" | tail -1 | awk '{print $2}')
    buffer_read_requests=$(mysql_exec "SHOW STATUS LIKE 'Innodb_buffer_pool_read_requests'" | tail -1 | awk '{print $2}')
    # Only computed when the request counter is a positive number, which also
    # rules out a division by zero.
    if [[ "$buffer_read_requests" =~ ^[0-9]+$ ]] && (( buffer_read_requests > 0 )); then
        hit_ratio=$(LC_ALL=C awk -v r="${buffer_reads:-0}" -v q="$buffer_read_requests" \
            'BEGIN { printf "%.2f", (1 - r / q) * 100 }')
        log "缓冲池命中率: ${hit_ratio}%"
        if LC_ALL=C awk -v h="$hit_ratio" 'BEGIN { exit !(h < 99) }'; then
            log_warning "缓冲池命中率偏低,考虑增大innodb_buffer_pool_size"
        else
            log_ok "缓冲池命中率良好"
        fi
    fi

    # Redo log file size, bytes to MB
    log_file_bytes=$(mysql_exec "SHOW VARIABLES LIKE 'innodb_log_file_size'" | tail -1 | awk '{print $2}')
    log_file_mb=$(LC_ALL=C awk -v b="${log_file_bytes:-0}" 'BEGIN { printf "%.2f", b / 1024 / 1024 }')
    log "InnoDB日志文件大小: ${log_file_mb}MB"
}

# 5. Replication status check
# Print the value of the first "<name>: value" line of a "\G" status dump.
# $1 - status text, $2 - extended regex matching the whole field name
_replication_field() {
    sed -nE "s/^[[:space:]]*($2):[[:space:]]*//p" <<<"$1" | head -n 1
}

#######################################
# Section 5: replication health of this server when it is a replica:
# IO/SQL thread state, last errors and replication delay.
# Calls:     mysql_exec, log, log_ok, log_warning, log_error
#######################################
check_replication_status() {
    local replica_status io_running sql_running
    local last_io_error last_sql_error seconds_behind

    log ""
    log "============================================================"
    log "5. 主从复制状态检查"
    log "============================================================"

    # Try the current statement first and fall back to the legacy one, so the
    # check works on servers that only accept one of the two spellings.
    replica_status=$(mysql_exec "SHOW REPLICA STATUS\G" 2>/dev/null)
    if [ -z "$replica_status" ]; then
        replica_status=$(mysql_exec "SHOW SLAVE STATUS\G" 2>/dev/null)
    fi

    if [ -z "$replica_status" ]; then
        log "非从库或未配置主从复制"
        return 0
    fi

    log "从库复制状态:"

    # Field names are matched exactly (up to the colon) in both spellings, so
    # "Slave_SQL_Running_State" cannot be mistaken for "Slave_SQL_Running".
    io_running=$(_replication_field "$replica_status" '(Slave|Replica)_IO_Running')
    log "  Slave_IO_Running: $io_running"
    sql_running=$(_replication_field "$replica_status" '(Slave|Replica)_SQL_Running')
    log "  Slave_SQL_Running: $sql_running"

    if [ "$io_running" != "Yes" ] || [ "$sql_running" != "Yes" ]; then
        log_error "主从复制异常!"
        # Error texts are trimmed, so an empty "Last_IO_Error: " stays empty
        # and is not reported as an error.
        last_io_error=$(_replication_field "$replica_status" 'Last_IO_Error')
        last_sql_error=$(_replication_field "$replica_status" 'Last_SQL_Error')
        if [ -n "$last_io_error" ]; then
            log "  IO错误: $last_io_error"
        fi
        if [ -n "$last_sql_error" ]; then
            log "  SQL错误: $last_sql_error"
        fi
    else
        log_ok "主从复制正常"
    fi

    # Replication delay; the value may be NULL or missing, so it is compared
    # only when it is a plain number.
    seconds_behind=$(_replication_field "$replica_status" 'Seconds_Behind_(Master|Source)')
    log "  复制延迟: ${seconds_behind}秒"
    if [[ "$seconds_behind" =~ ^[0-9]+$ ]] && (( seconds_behind > 60 )); then
        log_warning "复制延迟较大!"
    fi
}

# 6. Tablespace check
#######################################
# Section 6: list the ten largest tables (data + index size in MB),
# excluding the system schemas.
# Globals:   REPORT_FILE (appended to)
# Calls:     mysql_exec, log
# Outputs:   the query result to stdout and to REPORT_FILE
#######################################
check_tablespace() {
    log ""
    log "============================================================"
    log "6. 表空间检查"
    log "============================================================"

    log "占用空间最大的TOP 10表:"

    # Every SQL clause sits on its own line; without the separating
    # whitespace the keywords run together and the statement is invalid.
    mysql_exec "SELECT
    table_schema AS 'Database',
    table_name AS 'Table',
    ROUND((data_length + index_length) / 1024 / 1024, 2) AS 'Size(MB)'
FROM information_schema.tables
WHERE table_schema NOT IN ('mysql', 'information_schema', 'performance_schema', 'sys')
ORDER BY (data_length + index_length) DESC
LIMIT 10;" | tee -a "$REPORT_FILE"
}

# Main entry point
#######################################
# Entry point of the MySQL health check: prints the banner, aborts with exit
# status 1 when the server cannot be reached, otherwise runs every check
# section in order and prints the location of the report.
# Globals:   REPORT_FILE (read)
# Calls:     log, log_error and the check_* functions of this script
#######################################
main() {
    echo "=========================================="
    echo "    MySQL健康检查脚本 v1.0"
    echo "=========================================="
    echo ""

    # Nothing else can work without a connection, so stop right here.
    if ! check_mysql_connection; then
        log_error "无法连接MySQL,检查终止"
        exit 1
    fi

    check_mysql_connections
    check_query_performance
    check_innodb_status
    check_replication_status
    check_tablespace

    log ""
    log "============================================================"
    log "检查完成!"
    log "报告保存至: $REPORT_FILE"
    log "============================================================"
}

main "$@"

脚本7:服务器批量巡检

这是一个批量执行脚本,可以并发对多台服务器执行巡检并汇总结果

#!/bin/bash
################################################################################
# Script name : batch_check_scheduler.sh
# Description : batch server inspection scheduler
# Version     : v2.0
################################################################################

# Configuration
SERVER_LIST="servers.txt"                   # server list file, one host per line
SSH_USER="root"                             # SSH user
# $HOME instead of "~": a tilde inside quotes is not expanded by the shell.
SSH_KEY="$HOME/.ssh/id_rsa"                 # SSH private key
CONCURRENT=10                               # number of parallel checks
CHECK_SCRIPT="system_health_check.sh"       # local inspection script to upload
RESULT_DIR="/var/log/batch_check_$(date +%Y%m%d_%H%M%S)"

# Create the result directory. Every log helper writes into it, so when the
# default location cannot be created (for example when not running as root)
# a private temporary directory is used instead of failing on each log line.
if ! mkdir -p "$RESULT_DIR" 2>/dev/null; then
    fallback_dir=$(mktemp -d "${TMPDIR:-/tmp}/batch_check_XXXXXX") || {
        echo "无法创建结果目录: $RESULT_DIR" >&2
        exit 1
    }
    echo "无法创建结果目录 $RESULT_DIR,改用 $fallback_dir" >&2
    RESULT_DIR=$fallback_dir
    unset fallback_dir
fi

# ANSI color escape sequences used by the log helpers
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'    # reset / no color

# Scheduler log helpers. Each one prints "[HH:MM:SS] [LEVEL] message" to
# stdout and appends the same line to $RESULT_DIR/scheduler.log. Color
# escapes are expanded with %b; the message is printed with %s so that
# backslashes inside it are never interpreted. A space separates the level
# tag from the message.

# Green "[INFO]" line. $1 - message
log_info() {
    printf '[%s] %b[INFO]%b %s\n' "$(date +"%H:%M:%S")" "$GREEN" "$NC" "$1" \
        | tee -a "$RESULT_DIR/scheduler.log"
}

# Red "[ERROR]" line. $1 - message
log_error() {
    printf '[%s] %b[ERROR]%b %s\n' "$(date +"%H:%M:%S")" "$RED" "$NC" "$1" \
        | tee -a "$RESULT_DIR/scheduler.log"
}

# Yellow "[WARNING]" line. $1 - message
log_warning() {
    printf '[%s] %b[WARNING]%b %s\n' "$(date +"%H:%M:%S")" "$YELLOW" "$NC" "$1" \
        | tee -a "$RESULT_DIR/scheduler.log"
}

# Inspect a single server
#######################################
# Inspect one server: test SSH connectivity, upload the inspection script,
# run it remotely, classify the captured report and remove the remote copy.
# Globals:   RESULT_DIR, SSH_KEY, SSH_USER, CHECK_SCRIPT (read)
# Arguments: $1 - server address
# Outputs:   $RESULT_DIR/<server>_result.log  (captured remote output)
#            $RESULT_DIR/<server>_status.txt  ("STATUS|detail", STATUS being
#            OK, WARNING, ERROR or FAILED)
# Calls:     log_info, log_warning, log_error
# Returns:   0 when the remote script ran successfully, 1 otherwise
#######################################
check_single_server() {
    local server=$1
    local result_file="$RESULT_DIR/${server}_result.log"
    local status_file="$RESULT_DIR/${server}_status.txt"
    local target="$SSH_USER@$server"
    # scp drops the file into /tmp under its base name, so the remote path
    # must use the base name too even when CHECK_SCRIPT contains a directory.
    local remote_script="/tmp/${CHECK_SCRIPT##*/}"
    local remote_script_q
    printf -v remote_script_q '%q' "$remote_script"
    # NOTE(review): StrictHostKeyChecking=no accepts unknown host keys.
    local -a ssh_opts=(-o StrictHostKeyChecking=no -i "$SSH_KEY")
    local warning_count error_count run_status=0

    log_info "开始检查: $server"

    # SSH connectivity. -n keeps ssh from reading the caller's stdin.
    if ! ssh -n -o ConnectTimeout=5 "${ssh_opts[@]}" "$target" "echo ok" &>/dev/null; then
        log_error "$server - SSH连接失败"
        echo "FAILED|SSH连接失败" > "$status_file"
        return 1
    fi

    # Upload the inspection script
    if ! scp "${ssh_opts[@]}" "$CHECK_SCRIPT" "$target:/tmp/" &>/dev/null; then
        log_error "$server - 脚本上传失败"
        echo "FAILED|脚本上传失败" > "$status_file"
        return 1
    fi

    # Run it, then remove the remote copy whether or not the run succeeded.
    ssh -n "${ssh_opts[@]}" "$target" "bash $remote_script_q" > "$result_file" 2>&1 || run_status=$?
    ssh -n "${ssh_opts[@]}" "$target" "rm -f $remote_script_q" &>/dev/null

    if (( run_status != 0 )); then
        log_error "$server - 巡检执行失败"
        echo "FAILED|执行失败" > "$status_file"
        return 1
    fi

    # Classify the report. grep -c already prints 0 when nothing matches (it
    # only exits non-zero), so no "|| echo 0" fallback: that would produce
    # the two-line value "0<newline>0" and break the numeric tests below.
    warning_count=$(grep -cF '[WARNING]' "$result_file")
    error_count=$(grep -cF '[ERROR]' "$result_file")

    if [ "$error_count" -gt 0 ]; then
        echo "ERROR|告警:${warning_count} 错误:${error_count}" > "$status_file"
        log_error "$server - 发现 $error_count 个错误"
    elif [ "$warning_count" -gt 0 ]; then
        echo "WARNING|告警:${warning_count} 错误:${error_count}" > "$status_file"
        log_warning "$server - 发现 $warning_count 个告警"
    else
        echo "OK|正常" > "$status_file"
        log_info "$server - 检查完成,状态正常"
    fi

    return 0
}

# Concurrent batch inspection
#######################################
# Run check_single_server for every host in SERVER_LIST, keeping at most
# CONCURRENT checks running at the same time, and wait for all of them.
# Globals:   SERVER_LIST, CHECK_SCRIPT, CONCURRENT (read)
# Calls:     check_single_server, log_info, log_error
# Exits:     1 when the server list or the inspection script is missing
#######################################
batch_check() {
    local -a servers=()
    local server _rest total

    if [ ! -f "$SERVER_LIST" ]; then
        log_error "服务器列表文件不存在: $SERVER_LIST"
        exit 1
    fi

    if [ ! -f "$CHECK_SCRIPT" ]; then
        log_error "巡检脚本不存在: $CHECK_SCRIPT"
        exit 1
    fi

    # Read the host list once. The first field of each line is the host, so
    # surrounding blanks and trailing text are ignored; a Windows line ending
    # is stripped; the "|| [[ -n ... ]]" keeps a last line that has no
    # trailing newline. Comment and empty lines are skipped.
    while read -r server _rest || [[ -n "$server" ]]; do
        server=${server%$'\r'}
        [[ -z "$server" || "$server" == \#* ]] && continue
        servers+=("$server")
    done < "$SERVER_LIST"
    total=${#servers[@]}

    log_info "开始批量巡检,共 $total 台服务器,并发数 $CONCURRENT"

    for server in "${servers[@]}"; do
        # Throttle: wait until a slot is free.
        while (( $(jobs -r | wc -l) >= CONCURRENT )); do
            sleep 1
        done
        # Run the check in the background.
        check_single_server "$server" &
    done

    # Wait for all remaining checks.
    wait
    log_info "所有服务器巡检完成!"
}

# Generate the HTML summary report
# Escape a string for use inside a double-quoted JSON string that is embedded
# in an inline <script>: backslash, double quote, and "<" (so the value can
# never close the script element).
# $1 - raw text; prints the escaped text
_json_escape() {
    local s=$1
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    s=${s//"<"/\\u003c}
    printf '%s' "$s"
}

#######################################
# Build $RESULT_DIR/summary.html from every *_status.txt file in RESULT_DIR.
# Each status file holds one "STATUS|detail" line; the page lists one row per
# server and the embedded script tallies the rows per status.
# Globals:   RESULT_DIR (read)
# Calls:     log_info
# Outputs:   writes $RESULT_DIR/summary.html
#######################################
generate_html_report() {
    local summary_file="$RESULT_DIR/summary.html"
    local html report_time server_data="[" separator=""
    local status_file server status detail

    log_info "生成汇总报告..."

    # Page template. The quoted delimiter keeps the JavaScript ${...}
    # placeholders literal. read returns non-zero at end of input, which is
    # expected here, hence "|| true".
    IFS= read -r -d '' html <<'EOF' || true
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>服务器巡检汇总报告</title>
    <style>
        * { margin: 0; padding: 0; box-sizing: border-box; }
        body { font-family: 'Arial', 'Microsoft YaHei', sans-serif; background: #f5f5f5; padding: 20px; }
        .container { max-width: 1200px; margin: 0 auto; background: white; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
        .header { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 30px; border-radius: 8px 8px 0 0; }
        .header h1 { font-size: 28px; margin-bottom: 10px; }
        .header p { opacity: 0.9; font-size: 14px; }
        .stats { display: flex; padding: 20px; background: #f8f9fa; border-bottom: 1px solid #e9ecef; }
        .stat-item { flex: 1; text-align: center; padding: 10px; }
        .stat-item .number { font-size: 32px; font-weight: bold; margin-bottom: 5px; }
        .stat-item .label { color: #6c757d; font-size: 14px; }
        .stat-ok .number { color: #28a745; }
        .stat-warning .number { color: #ffc107; }
        .stat-error .number { color: #dc3545; }
        .content { padding: 20px; }
        table { width: 100%; border-collapse: collapse; margin-top: 20px; }
        th, td { padding: 12px; text-align: left; border-bottom: 1px solid #e9ecef; }
        th { background: #f8f9fa; font-weight: 600; color: #495057; position: sticky; top: 0; }
        tr:hover { background: #f8f9fa; }
        .status-ok { background: #d4edda; color: #155724; padding: 4px 8px; border-radius: 4px; display: inline-block; font-size: 12px; }
        .status-warning { background: #fff3cd; color: #856404; padding: 4px 8px; border-radius: 4px; display: inline-block; font-size: 12px; }
        .status-error { background: #f8d7da; color: #721c24; padding: 4px 8px; border-radius: 4px; display: inline-block; font-size: 12px; }
        .status-failed { background: #6c757d; color: white; padding: 4px 8px; border-radius: 4px; display: inline-block; font-size: 12px; }
        a { color: #007bff; text-decoration: none; }
        a:hover { text-decoration: underline; }
        .footer { text-align: center; padding: 20px; color: #6c757d; font-size: 12px; border-top: 1px solid #e9ecef; }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>🖥️ 服务器巡检汇总报告</h1>
            <p>巡检时间: REPORT_TIME</p>
        </div>
        <div class="stats">
            <div class="stat-item stat-ok">
                <div class="number" id="ok-count">0</div>
                <div class="label">正常</div>
            </div>
            <div class="stat-item stat-warning">
                <div class="number" id="warning-count">0</div>
                <div class="label">告警</div>
            </div>
            <div class="stat-item stat-error">
                <div class="number" id="error-count">0</div>
                <div class="label">错误</div>
            </div>
            <div class="stat-item stat-error">
                <div class="number" id="failed-count">0</div>
                <div class="label">失败</div>
            </div>
        </div>
        <div class="content">
            <table>
                <thead>
                    <tr>
                        <th>序号</th>
                        <th>服务器地址</th>
                        <th>状态</th>
                        <th>详情</th>
                        <th>详细报告</th>
                    </tr>
                </thead>
                <tbody id="server-list"></tbody>
            </table>
        </div>
        <div class="footer">Generated by Batch Check Scheduler v2.0</div>
    </div>
    <script>
        let okCount = 0, warningCount = 0, errorCount = 0, failedCount = 0;
        const serverData = SERVER_DATA_PLACEHOLDER;
        const tbody = document.getElementById('server-list');
        serverData.forEach((item, index) => {
            const row = tbody.insertRow();
            row.innerHTML = `
                <td>${index + 1}</td>
                <td>${item.server}</td>
                <td><span class="status-${item.status.toLowerCase()}">${item.status}</span></td>
                <td>${item.detail}</td>
                <td><a href="${item.report}" target="_blank">查看详情</a></td>
            `;
            if (item.status === 'OK') okCount++;
            else if (item.status === 'WARNING') warningCount++;
            else if (item.status === 'ERROR') errorCount++;
            else failedCount++;
        });
        document.getElementById('ok-count').textContent = okCount;
        document.getElementById('warning-count').textContent = warningCount;
        document.getElementById('error-count').textContent = errorCount;
        document.getElementById('failed-count').textContent = failedCount;
    </script>
</body>
</html>
EOF

    # Collect one JSON object per status file.
    for status_file in "$RESULT_DIR"/*_status.txt; do
        [ -f "$status_file" ] || continue
        server=$(basename "$status_file" _status.txt)
        status=""
        detail=""
        IFS='|' read -r status detail < "$status_file"
        server_data+="${separator}{\"server\":\"$(_json_escape "$server")\""
        server_data+=",\"status\":\"$(_json_escape "$status")\""
        server_data+=",\"detail\":\"$(_json_escape "$detail")\""
        server_data+=",\"report\":\"$(_json_escape "${server}_result.log")\"}"
        separator=","
    done
    server_data+="]"

    # Fill in the placeholders with shell string replacement rather than
    # sed: the data may contain "/" or "&", which are special in a sed
    # replacement. The replacement is quoted so "&" is always taken literally.
    report_time=$(date +"%Y-%m-%d %H:%M:%S")
    html=${html//REPORT_TIME/"$report_time"}
    html=${html//SERVER_DATA_PLACEHOLDER/"$server_data"}
    printf '%s' "$html" > "$summary_file"

    log_info "汇总报告已生成: $summary_file"
}

# Send e-mail notification (optional)
#######################################
# Optional notification hook: logs a warning when at least one server ended
# in ERROR, WARNING or FAILED state. The actual mail command is left as a
# commented-out template.
# Globals:   RESULT_DIR (read)
# Calls:     log_warning
#######################################
send_email_notification() {
    # shellcheck disable=SC2034  # only used by the commented-out mail template
    local summary_file="$RESULT_DIR/summary.html"
    local -a status_files=()
    local status_file error_count warning_count failed_count

    # Collect the status files (a glob instead of parsing ls output).
    for status_file in "$RESULT_DIR"/*_status.txt; do
        [ -f "$status_file" ] && status_files+=("$status_file")
    done
    if (( ${#status_files[@]} == 0 )); then
        return 0
    fi

    # Count matching lines across all files. "grep -c" over several files
    # prints one "file:count" line per file, which is not a number, so the
    # matches are piped to wc instead.
    error_count=$(grep -h '^ERROR' "${status_files[@]}" 2>/dev/null | wc -l)
    warning_count=$(grep -h '^WARNING' "${status_files[@]}" 2>/dev/null | wc -l)
    # A server that could not be inspected at all is abnormal as well.
    failed_count=$(grep -h '^FAILED' "${status_files[@]}" 2>/dev/null | wc -l)

    if (( error_count > 0 || warning_count > 0 || failed_count > 0 )); then
        log_warning "发现异常服务器,建议发送邮件通知"
        # Add the mail sending logic here, for example:
        # mail -s "服务器巡检报告" -a "$summary_file" admin@example.com < /dev/null
    fi
}

# Main entry point
#######################################
# Entry point of the batch scheduler: prints the banner, runs the batch
# inspection, generates the HTML summary and prints where the results are.
# Globals:   RESULT_DIR (read)
# Calls:     batch_check, generate_html_report
#######################################
main() {
    echo "=========================================="
    echo "    批量服务器巡检调度器 v2.0"
    echo "=========================================="
    echo ""

    # Run the batch inspection
    batch_check

    # Generate the report
    generate_html_report

    # Send the notification (optional)
    # send_email_notification

    echo ""
    echo "=========================================="
    echo "    巡检完成!"
    echo "    结果目录: $RESULT_DIR"
    echo "    汇总报告: $RESULT_DIR/summary.html"
    echo "=========================================="
}

main "$@"

原文:https://mp.weixin.qq.com/s?__biz=MzU1OTI0NjI1NQ==&mid=2247492836&idx=1&sn=737c803698507dc6d2d4ee2cec56ab13&chksm=fdb5fad8c2302b1cf5f79280ed073ff0216ce571bc2b85d822a7fb0155a68c998c3c8d2a7df3&mpshare=1&scene=23&srcid=1029WB01zDudpdQYGKttCgQz&sharer_shareinfo=5334d20997cbc6a3c530c4d10b1d9248&sharer_shareinfo_first=5334d20997cbc6a3c530c4d10b1d9248#rd

http://www.dtcms.com/a/545629.html

相关文章:

  • uniapp 实现一个底部悬浮面板
  • 中国桥梁空间分布数据
  • MutableStateFlow、StateFlow、LiveData在Compose中的运用
  • 网站建设的总结与评价专业定制网站开发公司
  • 应对AI全球化部署挑战:南凌科技云连接服务实现算法模型全球稳定传输
  • 公司网站建设岗位手机软件定制开发公司
  • 网站推广app软件一级注册工程师
  • LeetCode算法日记 - Day 87: 单词拆分
  • 学术论文写作与发表精讲:融合AI工具的高效方法与实战案例
  • 天津开发网站公司虚拟主机如何建设多个网站
  • 跟公司产品做网站制作app软件工具免费
  • 133-Spring AI Alibaba Vector Redis 功能完整案例
  • 线段树详解
  • AI 大模型应用中的图像,视频,音频的处理
  • 2025年大专建筑工程技术专业前景!
  • @1Panel 全面指南:从部署到高阶使用
  • SAP MM 采购申请转采购订单功能分享
  • FPGA设计中的“幽灵信号:一条走线,两种命运——浅析路径延迟导致的逻辑错误
  • 网站建设将新建用户授权为管理员免费搭建手机网站源码
  • 北京企业网站建设费用新闻最新消息
  • 算法工具箱之二分查找
  • undefined reference to `cJSON_AddStringToObject‘
  • 仓颉语言中TreeMap红黑树结构的实现与优化
  • Rust 的构建系统和包管理器
  • AI驱动嵌入式软件全链路变革:从芯片到系统的深度智能重构
  • 怎么修改网站域名推广网站排行榜
  • 靠谱的电磁阀维护保养
  • 【自动化测试函数 (下)】Web自动化攻坚:应对加载等待、浏览器导航与无头模式的自动化脚本艺术
  • 正向代理工具
  • 攀枝花建设工程有限责任公司网站中国兰州网