当前位置：首页 > news >正文

Go语言实战案例 — 工具开发篇：编写一个进程监控工具

news 2025/9/15 7:59:22

在生产和开发环境中，监控关键进程的存活与资源使用是非常常见的需求：当进程 CPU/内存超限或意外退出时自动告警、记录历史、甚至重启进程，能显著提升系统可靠性。本篇给出一个可运行的 Go 实战案例：一个轻量级的命令行进程监控工具（procmon），支持按进程名或 PID 监控、采样统计、阈值告警（HTTP webhook）、并能执行重启命令。

下面从目标、设计、实现到运行示例一步步展开，并给出可以直接拿去编译运行的完整代码。

功能目标

• 监控指定的进程（按名称或 PID），周期性采样 CPU% 与内存 RSS。
• 当某个进程 CPU% 或内存（MB）超出阈值时触发告警（支持 HTTP webhook + 本地日志）。
• 支持在告警时运行自定义重启命令（可用于 systemd restart、docker restart、或自定义脚本）。
• 支持本地日志、控制台输出、并优雅退出（SIGINT/SIGTERM）。
• 支持批量监控多个进程、简单配置（命令行 flags / JSON）。

技术选型

• 语言：Go
• 进程信息：github.com/shirou/gopsutil/v3/process（跨平台，常用）
• 告警：HTTP POST 到 webhook（简单可扩展到邮件/钉钉/Slack）
• 并发：每个监控项使用独立 goroutine，主循环统一调度与统计

项目结构（示意）

procmon/
├── main.go
├── go.mod

完整代码（main.go）

// main.go
package main

import (
	"bytes"
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"os/exec"
	"os/signal"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/shirou/gopsutil/v3/process"
)

// MonitorConfig describes the monitoring settings for one group of processes.
type MonitorConfig struct {
	Names        []string `json:"names"`         // process names to match, compared case-insensitively
	PIDs         []int32  `json:"pids"`          // explicit PIDs to match
	CPUThreshold float64  `json:"cpu_threshold"` // CPU percent that triggers an alert, e.g. 80.0; values <= 0 disable the check
	MemThreshold float64  `json:"mem_threshold"` // RSS in MB that triggers an alert, e.g. 500.0; values <= 0 disable the check
	RestartCmd   string   `json:"restart_cmd"`   // restart command run when an alert fires (may be empty)
}

// AlertPayload is the JSON document sent when an alert fires.
type AlertPayload struct {Time      time.Time `json:"time"`Host      string    `json:"host"`Process   string    `json:"process"`PID       int32     `json:"pid"`CPU       float64   `json:"cpu_percent"`MemoryMB  float64   `json:"memory_mb"`Triggered string    `json:"triggered"`Msg       string    `json:"msg"`
}func main() {// CLI 参数cfgFile := flag.String("config", "", "配置 JSON 文件（可选），与命令行参数组合使用")names := flag.String("names", "", "要监控的进程名，逗号分隔（例如: nginx,mysqld）")pids := flag.String("pids", "", "要监控的 pid，逗号分隔（例如: 123,456）")interval := flag.Duration("interval", 5*time.Second, "采样间隔")cpuTh := flag.Float64("cpu", 80.0, "默认 CPU 百分比阈值（%）")memTh := flag.Float64("mem", 500.0, "默认 内存阈值（MB）")webhook := flag.String("webhook", "", "告警 webhook URL（POST 接收 JSON）")restart := flag.String("restart", "", "全局重启命令（可选，覆盖 config 中 restart_cmd）")flag.Parse()// 解析配置var monitors []MonitorConfigif *cfgFile != "" {f, err := os.ReadFile(*cfgFile)if err != nil {log.Fatalf("读取配置文件失败: %v", err)}if err := json.Unmarshal(f, &monitors); err != nil {log.Fatalf("解析配置文件失败: %v", err)}}// 命令行 args 补充单一配置（如果用户没传配置文件）if len(monitors) == 0 && (*names != "" || *pids != "") {m := MonitorConfig{CPUThreshold: *cpuTh,MemThreshold: *memTh,}if *names != "" {for _, n := range strings.Split(*names, ",") {n = strings.TrimSpace(n)if n != "" {m.Names = append(m.Names, n)}}}if *pids != "" {for _, ps := range strings.Split(*pids, ",") {if s := strings.TrimSpace(ps); s != "" {id, err := strconv.Atoi(s)if err == nil {m.PIDs = append(m.PIDs, int32(id))}}}}if *restart != "" {m.RestartCmd = *restart}monitors = append(monitors, m)}if len(monitors) == 0 {log.Fatalln("没有任何监控配置。请通过 -config 或 -names/-pids 提供配置。")}hostname, _ := os.Hostname()ctx, cancel := context.WithCancel(context.Background())wg := &sync.WaitGroup{}// 信号优雅退出sigc := make(chan os.Signal, 1)signal.Notify(sigc, syscall.SIGINT, syscall.SIGTERM)go func() {<-sigclog.Println("收到退出信号，正在优雅停止...")cancel()}()// 启动每个监控项的 goroutinefor idx, mc := range monitors {wg.Add(1)go func(id int, cfg MonitorConfig) {defer wg.Done()monitorLoop(ctx, id, cfg, *interval, *webhook, hostname)}(idx, mc)}// 等待退出wg.Wait()log.Println("procmon 已退出")
}// monitorLoop 对单个 MonitorConfig 进行轮询监控
func monitorLoop(ctx context.Context, id int, cfg MonitorConfig, interval time.Duration, webhook, host string) {logPrefix := fmt.Sprintf("[monitor-%d] ", id)logger := log.New(os.Stdout, logPrefix, log.LstdFlags)ticker := time.NewTicker(interval)defer ticker.Stop()// 用于去重告警（避免短时间内频繁告警）alerted := make(map[int32]time.Time)alertCooldown := 30 * time.Second // 同一 pid 告警最小间隔for {select {case <-ctx.Done():logger.Println("停止监控（context canceled）")returncase <-ticker.C:procs, err := process.Processes()if err != nil {logger.Printf("获取进程列表失败: %v\n", err)continue}now := time.Now()for _, p := range procs {match := false// 匹配 PID 列表for _, pid := range cfg.PIDs {if p.Pid == pid {match = truebreak}}// 匹配名字列表（如果未通过 pid 匹配）if !match && len(cfg.Names) > 0 {name, err := p.Name()if err == nil {for _, nm := range cfg.Names {if strings.EqualFold(name, nm) {match = truebreak}}}}if !match {continue}// 获取 CPU & Mem// Percent 需要传入一个间隔来计算；这里使用 0 来获取自上次调用以后的值（某些平台）// 更可靠的做法是调用 Percent(interval)；为了简单与跨平台，这里使用 Percent(0)cpuPercent, errCpu := p.CPUPercent()memInfo, errMem := p.MemoryInfo()if errCpu != nil || errMem != nil || memInfo == nil {// 有时权限原因无法读取某些信息logger.Printf("读取进程 %d 信息失败: cpuErr=%v memErr=%v\n", p.Pid, errCpu, errMem)continue}memMB := float64(memInfo.RSS) / 1024.0 / 1024.0// 打印日志name, _ := p.Name()logger.Printf("进程 %s pid=%d cpu=%.2f%% mem=%.2fMB\n", name, p.Pid, cpuPercent, memMB)// 判断阈值triggered := ""if cfg.CPUThreshold > 0 && cpuPercent >= cfg.CPUThreshold {triggered = "cpu"}if cfg.MemThreshold > 0 && memMB >= cfg.MemThreshold {if triggered == "" {triggered = "mem"} else {triggered = "cpu+mem"}}if triggered != "" {lastAlert, ok := alerted[p.Pid]if ok && now.Sub(lastAlert) < alertCooldown {// 跳过频繁告警logger.Printf("已在 cooldown 中，跳过 pid=%d 的告警\n", p.Pid)continue}alerted[p.Pid] = nowpayload := AlertPayload{Time:      now,Host:      host,Process:   name,PID:       p.Pid,CPU:       cpuPercent,MemoryMB:  memMB,Triggered: triggered,Msg:       fmt.Sprintf("process %s (pid=%d) exceeded 
threshold (%s)", name, p.Pid, triggered),}// 本地日志告警logger.Printf("ALERT: %s\n", payload.Msg)// 发送 webhook（如果配置）if webhook != "" {go func(pl AlertPayload) {if err := postAlert(webhook, pl); err != nil {logger.Printf("发送 webhook 失败: %v\n", err)} else {logger.Printf("告警已发送到 %s\n", webhook)}}(payload)}// 执行重启命令（config 中或全局传入） —— 先尝试 graceful terminate 再执行重启命令（如果提供）if cfg.RestartCmd != "" {go func(cmdStr string, targetPid int32) {logger.Printf("尝试杀掉 pid=%d 并执行重启命令: %s\n", targetPid, cmdStr)// 发送 TERM_ = p.SendSignal(syscall.SIGTERM)// 等待短时间让进程退出time.Sleep(2 * time.Second)// 强制 kill 如果还存在exists, _ := process.PidExists(targetPid)if exists {_ = p.Kill()}// 执行重启命令（通过 shell）cmd := exec.Command("/bin/sh", "-c", cmdStr)out, err := cmd.CombinedOutput()if err != nil {logger.Printf("执行重启命令失败: %v. output: %s\n", err, string(out))} else {logger.Printf("重启命令已执行, output: %s\n", string(out))}}(cfg.RestartCmd, p.Pid)}}} // end for procs} // end ticker select} // end for
}// postAlert 以 JSON POST 方式发送告警
func postAlert(webhook string, payload AlertPayload) error {bs, _ := json.Marshal(payload)req, err := http.NewRequest("POST", webhook, bytes.NewReader(bs))if err != nil {return err}req.Header.Set("Content-Type", "application/json")client := &http.Client{Timeout: 8 * time.Second}resp, err := client.Do(req)if err != nil {return err}defer resp.Body.Close()if resp.StatusCode < 200 || resp.StatusCode >= 300 {return fmt.Errorf("webhook 返回非 2xx: %s", resp.Status)}return nil
}

代码说明 & 要点提醒

1. 依赖：本示例使用 github.com/shirou/gopsutil/v3/process。在项目目录运行：

go mod init procmon
go get github.com/shirou/gopsutil/v3/process

2. 采样 CPU：process.CPUPercent() 的行为受平台和调用频率影响。更精确的 CPU 百分比通常需要两次采样间的时间差（gopsutil 提供相关接口），但本例为简洁使用了库的默认方法。若需精确长期统计，可以保存上次样本并计算 delta。
3. 权限问题：在某些系统上读取其他用户的进程信息需要更高权限（root）。如果监控不到目标进程，请以合适权限运行。
4. 重启策略：示例中通过 RestartCmd 执行自定义 shell 命令来重启服务（例如 systemctl restart myservice 或 docker restart container）。这是最灵活的方式，但要确保命令安全（不要盲目执行来自不可信配置的命令）。
5. 告警去重：示例里使用 alertCooldown 防止短时间内重复告警。你可以把告警状态持久化到 Redis/文件以跨重启保留告警状态。
6. 跨平台：gopsutil 支持多平台，但信号、kill 等行为在 Windows 与 Unix 上不同。Windows 上需用不同方法停止进程。

使用示例

1. 简单按进程名监控 nginx，CPU 超过 70% 或内存超过 300MB 时发 webhook：

./procmon -names nginx -cpu 70 -mem 300 -webhook "https://example.com/webhook"

2. 使用 JSON 配置（config.json）支持多项监控（文件示例）：

[{"names": ["nginx"],"cpu_threshold": 70.0,"mem_threshold": 300,"restart_cmd": "systemctl restart nginx"},{"names": ["mysqld"],"cpu_threshold": 85.0,"mem_threshold": 2048,"restart_cmd": "systemctl restart mysql"}
]

运行：

./procmon -config config.json -interval 5s -webhook "https://example.com/webhook"

可行的扩展与改进（工程化建议）

• 持久化历史：把采样结果写入 InfluxDB/Prometheus 或本地文件，方便后续分析与告警策略优化。
• 更智能的告警：支持平均值/移动窗口、抑制波动（例如短时 spike 不告警）、按时间段不同阈值。
• 进程自恢复：把重启策略从单条命令扩展为“逐步恢复”：先重启、再报警、再回滚；并记录重启次数以避免重启风暴。
• UI 或 API：提供 HTTP 管理接口查看当前监控状态、触发测试告警或调整阈值。
• 容器/Pod 支持：在容器环境下识别容器内进程或直接对容器做重启（Kubernetes 中可使用 K8s API 触发重启）。
• 权限和安全：限制能够执行的 restart_cmd、对 webhook 使用签名/鉴权避免被滥用。

小结

本文实现了一个简单但实用的 Go 进程监控工具，涵盖进程扫描、资源采样、阈值检测、告警与重启动作。示例代码足够作为生产工具的原型，通过增加持久化、更多告警通道与更安全的重启策略，可以逐步把它演化为完整的运维监控组件。

文章转载自：

http://LsN4VEt5.Lthtp.cn
http://TtM8csTo.Lthtp.cn
http://s1AoTd98.Lthtp.cn
http://SMYfdX9c.Lthtp.cn
http://jB26Y3H0.Lthtp.cn
http://wCV70OcU.Lthtp.cn
http://YsG8x06Y.Lthtp.cn
http://4DAptytV.Lthtp.cn
http://6O3shpR6.Lthtp.cn
http://fczQNlkk.Lthtp.cn
http://CNSjyTOO.Lthtp.cn
http://B5VMhPUV.Lthtp.cn
http://NJPP46UL.Lthtp.cn
http://2DOzugLx.Lthtp.cn
http://2iiwKncE.Lthtp.cn
http://oL3MyVPr.Lthtp.cn
http://muQeP1WP.Lthtp.cn
http://pF1Yiw5g.Lthtp.cn
http://iNTA4xkx.Lthtp.cn
http://amR57MM0.Lthtp.cn
http://FcCT6By1.Lthtp.cn
http://rFRpNIyW.Lthtp.cn
http://qejolzwZ.Lthtp.cn
http://Tgq7XS27.Lthtp.cn
http://UaNrrvC2.Lthtp.cn
http://yn1FgwwY.Lthtp.cn
http://KHiMF7zh.Lthtp.cn
http://6koMuKwh.Lthtp.cn
http://SoolXhm1.Lthtp.cn
http://XS86pXb9.Lthtp.cn

查看全文

http://www.dtcms.com/a/383453.html

Roo Code 的检查点功能

【go/gopls/mcp】官方gopls内置mcp server使用

【无标题】神经网络算法初探

Genspark AI 浏览器

Linux内核IPsec接收机制剖析：XFRM框架与xfrm4_input.c的深度解读

Linux 系统下的流量控制工具之tc命令案例解析

数据库造神计划第五天---增删改查（CRUD）（1）

深入理解Java虚拟机：JVM高级特性与最佳实践（第3版）第九章知识点问答（10题）

AI表征了西方的有界，AI+体现了东方的无界

前端基础 —— B / CSS基础

Qwen2.5-VL 实战：用 VLM 实现 “看图对话”，从目标检测到空间推理！【附源码】

vLLM - EngineCoreClient

MySQL专题Day(2)————存储引擎

多文件编程与宏的使用

第5节-连接表-Inner-Join

【Csp - S】图的知识

【图文详解】MCP、A2A的核心技术特点以及架构模式

Java基础 9.13

Shell 正则表达式完全指南

玩转ClaudeCode：用Database-MCP实现自然语言操作数据库

【Android】答题系统Web服务器APP应用开发流程详解

Web服务器VS应用服务器：核心差异解析

分享一个vue2的tinymce配置

spring bean一共有几种作用域

Redie详细入门教程2

Maven入门_简介、安装与配置

Vue组件化开发介绍

new species of flying reptile1 discovered in Scotland

Spring JDBC与KingbaseES深度集成：构建高性能国产数据库应用实战

闪电科创 SCI专业辅导