Java资源持续监控
说明:根据 Java 主类名(进程名关键字)按固定间隔循环采样匹配到的 Java 进程的资源占用,以及整机内存使用情况,并将结果追加写入文件。
例如:每 60s 记录 Spark 执行器资源使用情况:
./java_resource_monitor.sh CoarseGrainedExecutorBackend logs 60
结果:在 logs 目录(相对路径以脚本所在目录为基准)中,为每个执行器生成一个以其 PID 命名的文件,记录该进程的资源占用;整机内存使用情况写入 free.log。
脚本内容:
#!/bin/bash
# ============================================
# Java process resource monitor with a built-in periodic sampling loop.
# Works on hosts without jps (falls back to ps).
#
# Usage:
#   java_resource_monitor.sh <process_name> <output_directory> \
#       [interval_seconds] [print_to_console]
#
#   process_name      case-insensitive keyword matched against the jps / ps
#                     listing to select Java processes
#   output_directory  directory for the logs; one file per matching PID plus
#                     free.log for machine memory. A relative path is
#                     resolved against the directory containing this script,
#                     because the script changes into that directory first.
#   interval_seconds  seconds between samples (default: 5)
#   print_to_console  "true" to also print a summary after every sample
#                     (default: false; false/0/no/off disable it, any other
#                     value enables it)
#
# Runs until interrupted with Ctrl+C.
# ============================================
set -e
cd "$(dirname "$0")"

if [ $# -lt 2 ]; then
  echo "Usage: $0 <process_name> <output_directory> [interval_seconds] [print_to_console]"
  exit 1
fi

PROCESS_NAME=$1
OUTPUT_DIR=$2
INTERVAL=${3:-5} # sample every 5 seconds by default
PRINT_TO_CONSOLE=${4:-false}

# Normalise the flag to exactly "true" or "false" so later checks are
# unambiguous. Any value other than an explicit "off" spelling enables
# console output, which keeps previously working invocations working.
case "$PRINT_TO_CONSOLE" in
  false | FALSE | False | 0 | no | NO | off | OFF) PRINT_TO_CONSOLE=false ;;
  *) PRINT_TO_CONSOLE=true ;;
esac

mkdir -p "$OUTPUT_DIR"

# Print the resolved location of a required tool, or report which tool is
# missing instead of letting `set -e` abort without any message.
resolve_cmd() {
  command -v "$1" || {
    echo "Error: required command not found: $1" >&2
    return 1
  }
}

# Resolve the tools once up front (PATH may be incomplete in
# non-interactive environments).
PS_CMD=$(resolve_cmd ps) || exit 1
GREP_CMD=$(resolve_cmd grep) || exit 1
AWK_CMD=$(resolve_cmd awk) || exit 1
FREE_CMD=$(resolve_cmd free) || exit 1
DATE_CMD=$(resolve_cmd date) || exit 1
TAIL_CMD=$(resolve_cmd tail) || exit 1

# PID of this script; assigned before the banner so it prints a real value.
CURRENT_PID=$$

# Column headers, shared by the log files and the console summary.
PROC_HEADER="TIME USER PID CPU% MEM% MEM_GB MEM_KB VSZ_KB CMD"
# NOTE(review): the Mem line of `free` differs between procps versions
# (buffers/cached vs. buff/cache/available), so the BUFFERS/CACHE labels may
# not match the values on newer systems - confirm against the target hosts.
FREE_HEADER="TIME TOTAL_GI USED_GI FREE_GI SHARED_GI BUFFERS_GI CACHE_GI SWAP_TOTAL-GI SWAP_USED-GI SWAP_FREE-GI"
FREE_LOG="$OUTPUT_DIR/free.log"

echo "========================================="
echo " Java Resource Watcher"
echo " Process Name : $PROCESS_NAME"
echo " Output Dir : $OUTPUT_DIR"
echo " Interval : ${INTERVAL}s"
echo " Print Console: $PRINT_TO_CONSOLE"
echo " CURRENT_PID : $CURRENT_PID"
echo "========================================="
echo ""

# Stop cleanly on Ctrl+C.
trap 'echo ""; echo "Stopped by user."; exit 0' INT

while true; do
  CURRENT_TIME=$("$DATE_CMD" +"%Y-%m-%d %H:%M:%S")

  # ---------- Find the Java processes ----------
  if command -v jps >/dev/null 2>&1; then
    pids=$(jps | "$GREP_CMD" -i -- "$PROCESS_NAME" | "$AWK_CMD" '{print $1}')
  else
    # '[j]ava' keeps the grep process itself out of the result.
    pids=$("$PS_CMD" -eo pid,cmd | "$GREP_CMD" '[j]ava' |
      "$GREP_CMD" -i -- "$PROCESS_NAME" | "$AWK_CMD" '{print $1}')
  fi

  if [ -z "$pids" ]; then
    echo "[$CURRENT_TIME] ⚠️ No Java process found for name: $PROCESS_NAME"
  else
    # ---------- Record the resources of each process ----------
    # $pids is a whitespace-separated PID list; word splitting is intended.
    for pid in $pids; do
      OUTPUT_FILE="$OUTPUT_DIR/$pid"

      # Never record this script itself.
      if [ "$pid" = "$CURRENT_PID" ]; then
        continue
      fi

      # The process may have exited since it was listed.
      if [ ! -f "/proc/$pid/cmdline" ]; then
        continue
      fi

      # /proc/<pid>/cmdline is NUL-separated; turn the NULs into spaces.
      # A process that vanishes right here is skipped rather than fatal.
      cmdline=$({ tr '\0' ' ' <"/proc/$pid/cmdline"; } 2>/dev/null) || continue

      # Keep only real JVM invocations ("java" followed by arguments). This
      # drops matches such as this script's own subshells, whose command
      # line contains "java_resource_monitor.sh" but no "java " token.
      case "$cmdline" in
        *"java "*) ;;
        *) continue ;;
      esac

      if [ ! -f "$OUTPUT_FILE" ]; then
        echo "$PROC_HEADER" >"$OUTPUT_FILE"
      fi

      # ps reports rss in KiB, so rss/1024/1024 gives GiB. Field 7 is comm
      # and field 8 is the first word of args, which is logged as CMD.
      "$PS_CMD" -p "$pid" -o user=,pid=,%cpu=,%mem=,rss=,vsz=,comm=,args= --no-headers |
        "$AWK_CMD" -v current_time="$CURRENT_TIME" '{
          mem_gb = sprintf("%.2f", $5/1024/1024)
          printf "%-20s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %s\n",
            current_time, $1, $2, $3, $4, mem_gb, $5, $6, $8
        }' >>"$OUTPUT_FILE"
    done
  fi

  # ---------- Machine memory ----------
  if [ ! -f "$FREE_LOG" ]; then
    echo "$FREE_HEADER" >"$FREE_LOG"
  fi

  # Line 1 of `free -g` is the title row (skipped), line 2 the Mem row and
  # line 3 the Swap row; both data rows are joined into one record.
  FREE_OUTPUT=$("$FREE_CMD" -g | "$AWK_CMD" '
    NR == 1 { next }
    NR == 2 { printf "%-10s %-10s %-10s %-10s %-10s %-10s %-10s", $2, $3, $4, $5, $6, $7, $8 }
    NR == 3 { printf " %-10s %-10s %-10s", $2, $3, $4 }
  ')
  echo "$CURRENT_TIME $FREE_OUTPUT" >>"$FREE_LOG"

  # ---------- Console summary ----------
  if [ "$PRINT_TO_CONSOLE" = "true" ]; then
    echo ""
    echo "=== Resource Usage Summary @ $CURRENT_TIME ==="
    if [ -n "$pids" ]; then
      for pid in $pids; do
        if [ -f "$OUTPUT_DIR/$pid" ]; then
          LAST_LINE=$("$TAIL_CMD" -n 1 "$OUTPUT_DIR/$pid")
          echo "Process $pid:"
          echo " $PROC_HEADER"
          echo " $LAST_LINE"
        fi
      done
    fi
    if [ -f "$FREE_LOG" ]; then
      LAST_FREE_LINE=$("$TAIL_CMD" -n 1 "$FREE_LOG")
      echo "Memory Status:"
      echo " $FREE_HEADER"
      echo " $LAST_FREE_LINE"
    fi
  fi

  # ---------- Wait for the next sample ----------
  sleep "$INTERVAL"
done
