PostgreSQL 实例运行状态全面检查
PostgreSQL 实例的健康检查是数据库运维的核心工作,以下是系统化的状态检查方法和关键指标解读。
一 基础健康检查
1.1 实例可用性检查
# Quick connectivity probe against the local instance (long-option form).
pg_isready --host=localhost --port=5432 --username=postgres --dbname=postgres
# Reported status and how to read it:
# - "accepting connections" -> healthy
# - "rejecting connections" -> needs urgent attention
# - "no response"           -> instance is down
输出示例:
[pg16@test ~]$ pg_isready -h localhost -p 5777 -U postgres -d postgres
localhost:5777 - accepting connections
1.2 版本与运行时长
-- Server version, postmaster start time, and time elapsed since that start.
SELECT
    version()                 AS postgres_version,
    boot.started_at           AS startup_time,
    now() - boot.started_at   AS uptime
FROM (SELECT pg_postmaster_start_time() AS started_at) AS boot;
输出示例:
white=# SELECT
white-# version() AS postgres_version,
white-# pg_postmaster_start_time() AS startup_time,
white-# current_timestamp - pg_postmaster_start_time() AS uptime;
postgres_version | startup_time | uptime
---------------------------------------------------------------------------------------------------------+------------------------------+-----------------
PostgreSQL 16.2 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-44), 64-bit | 2025-04-12 06:07:34.70749-07 | 03:18:18.458812
(1 row)
二 关键性能指标检查
2.1 系统资源视图
-- Memory-related configuration parameters: current value, unit, and where
-- the value was set from (these are settings, not live memory usage).
SELECT
    s.name,
    s.setting,
    s.unit,
    s.source
FROM pg_settings AS s
WHERE s.name = ANY (ARRAY['shared_buffers', 'work_mem', 'maintenance_work_mem']);
-- Client connections in use versus the configured limit.
-- pg_stat_activity also lists background processes (checkpointer, WAL writer,
-- autovacuum launcher, ...) which do not occupy max_connections slots, so only
-- rows with backend_type = 'client backend' are counted (column exists in PG 10+).
SELECT
    (
        SELECT count(*)
        FROM pg_stat_activity
        WHERE backend_type = 'client backend'
    ) AS current_conn,
    (
        SELECT setting::int
        FROM pg_settings
        WHERE name = 'max_connections'
    ) AS max_conn;
输出示例(注:下方记录中的 current_conn 来自未加过滤的 count(*),其中包含 pg_stat_activity 里的后台进程,并非全部是客户端连接):
white=# SELECT
white-# name,
white-# setting,
white-# unit,
white-# source
white-# FROM pg_settings
white-# WHERE name IN ('shared_buffers', 'work_mem', 'maintenance_work_mem');
name | setting | unit | source
----------------------+---------+------+--------------------
maintenance_work_mem | 65536 | kB | default
shared_buffers | 32768 | 8kB | configuration file
work_mem | 4096 | kB | default
(3 rows)
white=#
white=#
white=# select
white-# (select count(*) from pg_stat_activity) as current_conn,
white-# (select setting::int as max_conn from pg_settings where name='max_connections') max_conn;
current_conn | max_conn
--------------+----------
7 | 100
(1 row)
2.2 实时性能看板
-- Up to ten non-idle sessions, ordered by the time elapsed since their
-- current/last query started, together with what each one is waiting on.
SELECT
    now() - a.query_start AS duration,
    a.datname,
    a.usename,
    a.wait_event_type,
    a.wait_event,
    a.state,
    a.query
FROM pg_stat_activity AS a
WHERE a.state <> 'idle'
ORDER BY now() - a.query_start DESC
LIMIT 10;
输出示例:
white=# SELECT
white-# now() - query_start AS duration,
white-# datname,
white-# usename,
white-# wait_event_type,
white-# wait_event,
white-# state,
white-# query
white-# FROM pg_stat_activity
white-# WHERE state != 'idle'
white-# ORDER BY duration DESC
white-# LIMIT 10;
duration | datname | usename | wait_event_type | wait_event | state | query
----------+---------+----------+-----------------+------------+--------+--------------------------------------
00:00:00 | white | postgres | | | active | SELECT +
| | | | | | now() - query_start AS duration,+
| | | | | | datname, +
| | | | | | usename, +
| | | | | | wait_event_type, +
| | | | | | wait_event, +
| | | | | | state, +
| | | | | | query +
| | | | | | FROM pg_stat_activity +
| | | | | | WHERE state != 'idle' +
| | | | | | ORDER BY duration DESC +
| | | | | | LIMIT 10;
(1 row)
white=#
三 存储与I/O检查
3.1 数据库存储分析
-- Databases ranked by total on-disk size.
-- The previous version joined pg_class.relnamespace (a schema OID) to
-- pg_database.oid (a database OID); those never match meaningfully, so the
-- query returned no rows. pg_class only describes the database this session
-- is connected to, therefore data_size (sum of pg_relation_size over ordinary
-- tables, relkind = 'r') is reported for the current database only and is
-- NULL for every other row.
SELECT
    d.datname AS database,
    pg_size_pretty(pg_database_size(d.oid)) AS size,
    CASE
        WHEN d.datname = current_database() THEN (
            SELECT pg_size_pretty(COALESCE(sum(pg_relation_size(c.oid)), 0))
            FROM pg_class AS c
            WHERE c.relkind = 'r'
        )
    END AS data_size
FROM pg_database AS d
ORDER BY pg_database_size(d.oid) DESC;
-- Per-tablespace disk usage and filesystem location.
SELECT
    ts.spcname,
    pg_size_pretty(pg_tablespace_size(ts.oid)) AS size,
    pg_tablespace_location(ts.oid)             AS location
FROM pg_tablespace AS ts;
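输出示例(注:下方第一段记录使用的是 JOIN pg_class c ON c.relnamespace = d.oid 的写法,该条件把模式 OID 与数据库 OID 相比较,无法正确关联,因此返回 0 行):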
white=# SELECT
white-# d.datname AS database,
white-# pg_size_pretty(pg_database_size(d.datname)) AS size,
white-# pg_size_pretty(sum(pg_relation_size(c.oid))) AS data_size
white-# FROM pg_database d
white-# JOIN pg_class c ON c.relnamespace = d.oid
white-# WHERE c.relkind = 'r'
white-# GROUP BY d.datname
white-# ORDER BY pg_database_size(d.datname) DESC;
database | size | data_size
----------+------+-----------
(0 rows)
white=#
white=# SELECT
white-# spcname,
white-# pg_size_pretty(pg_tablespace_size(spcname)) AS size,
white-# pg_tablespace_location(oid) AS location
white-# FROM pg_tablespace;
spcname | size | location
------------+---------+--------------
pg_default | 1804 MB |
pg_global | 597 kB |
test2 | | /pgdir/test2
test3 | | /pgdir/test3
(4 rows)
white=#
3.2 I/O负载指标
-- Per-table access statistics: sequential vs. index scan counts, tuples read,
-- and main-relation size, for the ten tables with the most tuples read by
-- sequential scans.
-- pg_stat_user_tables is a built-in statistics view; the pg_stat_statements
-- extension is NOT required for this query.
SELECT
    schemaname,
    relname,
    seq_scan,
    seq_tup_read,
    idx_scan,
    idx_tup_fetch,
    pg_size_pretty(pg_relation_size(relid)) AS size
FROM pg_stat_user_tables
ORDER BY seq_tup_read DESC
LIMIT 10;
输出示例:
white=# SELECT
white-# schemaname,
white-# relname,
white-# seq_scan,
white-# seq_tup_read,
white-# idx_scan,
white-# idx_tup_fetch,
white-# pg_size_pretty(pg_relation_size(relid)) AS size
white-# FROM pg_stat_user_tables
white-# ORDER BY seq_tup_read DESC
white-# LIMIT 10;
schemaname | relname | seq_scan | seq_tup_read | idx_scan | idx_tup_fetch | size
------------+------------------+----------+--------------+----------+---------------+------------
yewu1 | t1 | 11 | 100001 | 5 | 5 | 440 kB
yewu1 | t2 | 1 | 10000 | | | 440 kB
yewu1 | t3 | 9 | 630 | | | 8192 bytes
yewu1 | test10 | 4 | 6 | | | 8192 bytes
public | pgbench_tellers | 0 | 0 | 0 | 0 | 8192 bytes
yewu2 | t4 | 0 | 0 | 0 | 0 | 8192 bytes
yewu1 | test5 | 0 | 0 | | | 0 bytes
public | pgbench_branches | 0 | 0 | 0 | 0 | 8192 bytes
yewu1 | test7 | 0 | 0 | 0 | 0 | 490 MB
yewu1 | test3 | 0 | 0 | | | 0 bytes
(10 rows)
white=#
四 查询与锁监控
4.1 慢查询识别
-- Twenty statements with the highest cumulative execution time.
-- Reads from pg_stat_statements, so that extension must already be installed.
-- hit_percent is the share of shared-buffer block accesses served from cache;
-- NULLIF makes it NULL (instead of a division error) when no blocks were touched.
SELECT
    s.query,
    s.calls,
    s.total_exec_time,
    s.mean_exec_time,
    s.rows,
    100.0 * s.shared_blks_hit
        / NULLIF(s.shared_blks_hit + s.shared_blks_read, 0) AS hit_percent
FROM pg_stat_statements AS s
ORDER BY s.total_exec_time DESC
LIMIT 20;
输出示例:
white=# SELECT
white-# query,
white-# calls,
white-# total_exec_time,
white-# mean_exec_time,
white-# rows,
white-# 100.0 * shared_blks_hit / nullif(shared_blks_hit + shared_blks_read, 0) AS hit_percent
white-# FROM pg_stat_statements
white-# ORDER BY total_exec_time DESC
white-# LIMIT 20;
query | calls | total_exec_time | mean_exec_time | rows | hit_percent
--------------------------------------------------------------------+---------+--------------------+----------------------+---------+----------------------
DO $$ +| 2 | 118183.47813199999 | 59091.739065999995 | 0 | 99.9999951067743431
DECLARE aa INTEGER; +| | | | |
BEGIN +| | | | |
FOR aa IN 1..10000000 LOOP +| | | | |
INSERT INTO yewu1.test8 VALUES (aa,'white' || aa); +| | | | |
END LOOP; +| | | | |
COMMIT; +| | | | |
END $$ | | | | |
select pg_backend_pid() | 3 | 93277.79499 | 31092.59833 | 3 |
select pg_backend_pid() | 4 | 77014.163287 | 19253.54082175 | 4 |
DO $$ +| 2 | 58800.028212000005 | 29400.014106000002 | 0 | 100.0000000000000000
DECLARE aa INTEGER; +| | | | |
BEGIN +| | | | |
FOR aa IN 1..10000000 LOOP +| | | | |
INSERT INTO yewu1.test4 VALUES ('white ' || aa); +| | | | |
END LOOP; +| | | | |
COMMIT; +| | | | |
END $$ | | | | |
DO $$ +| 1 | 49685.024169 | 49685.024169 | 0 | 99.9999951067760191
DECLARE aa INTEGER; +| | | | |
BEGIN +| | | | |
FOR aa IN 1..10000000 LOOP +| | | | |
INSERT INTO yewu1.test7 VALUES (aa,'white' || aa); +| | | | |
END LOOP; +| | | | |
COMMIT; +| | | | |
END $$ | | | | |
DO $$ +| 1 | 49267.863302 | 49267.863302 | 0 | 100.0000000000000000
DECLARE aa INTEGER; +| | | | |
BEGIN +| | | | |
FOR aa IN 1..10000000 LOOP +| | | | |
INSERT INTO yewu1.test2 VALUES ('white ' || aa); +| | | | |
END LOOP; +| | | | |
COMMIT; +| | | | |
END $$ | | | | |
DO $$ +| 1 | 41309.762465 | 41309.762465 | 0 | 99.9999307490528696
DECLARE aa INTEGER; +| | | | |
BEGIN +| | | | |
FOR aa IN 1..10000000 LOOP +| | | | |
INSERT INTO yewu1.test1 VALUES ('white ' || aa); +| | | | |
END LOOP; +| | | | |
COMMIT; +| | | | |
END $$ | | | | |
select $1 | 2 | 37706.979766000004 | 18853.489883 | 2 |
DO $$ +| 1 | 27416.225248 | 27416.225248 | 0 | 100.0000000000000000
DECLARE aa INTEGER; +| | | | |
BEGIN +| | | | |
FOR aa IN 1..10000000 LOOP +| | | | |
INSERT INTO yewu1.test3 VALUES ('white ' || aa); +| | | | |
END LOOP; +| | | | |
COMMIT; +| | | | |
END $$ | | | | |
DO $$ +| 1 | 25163.0043 | 25163.0043 | 0 | 100.0000000000000000
DECLARE aa INTEGER; +| | | | |
BEGIN +| | | | |
FOR aa IN 1..10000000 LOOP +| | | | |
INSERT INTO yewu1.test6 VALUES ('white12345678'); +| | | | |
END LOOP; +| | | | |
COMMIT; +| | | | |
END $$ | | | | |
DO $$ +| 1 | 22281.973981 | 22281.973981 | 0 | 100.0000000000000000
DECLARE aa INTEGER; +| | | | |
BEGIN +| | | | |
FOR aa IN 1..10000000 LOOP +| | | | |
INSERT INTO yewu1.test5 VALUES ('white12345678'); +| | | | |
END LOOP; +| | | | |
COMMIT; +| | | | |
END $$ | | | | |
INSERT INTO yewu1.t1 VALUES ($1, $2) | 1000000 | 21842.20592600089 | 0.021842205926000155 | 1000000 | 100.0000000000000000
INSERT INTO public.pgbench_accounts VALUES ($1, $2, $3, $4) | 1000000 | 21328.50800399963 | 0.02132850800400086 | 1000000 | 100.0000000000000000
INSERT INTO public.pgbench_accounts VALUES ($1, $2, $3, $4) | 1000000 | 20541.572565999446 | 0.02054157256600104 | 1000000 | 100.0000000000000000
INSERT INTO public.pgbench_accounts VALUES ($1, $2, $3, $4) | 1000000 | 11018.594005000135 | 0.011018594005000208 | 1000000 | 100.0000000000000000
INSERT INTO yewu1.t1 VALUES ($1, $2) | 1000000 | 6577.947609999874 | 0.006577947610000384 | 1000000 | 100.0000000000000000
INSERT INTO public.pgbench_history VALUES ($1, $2, $3, $4, $5, $6) | 267227 | 5081.26534200007 | 0.019014790204582223 | 267227 | 100.0000000000000000
INSERT INTO public.pgbench_history VALUES ($1, $2, $3, $4, $5, $6) | 267227 | 3785.6832559999243 | 0.014166544757827429 | 267227 | 100.0000000000000000
DO $$ +| 1 | 3359.873687 | 3359.873687 | 0 | 99.9991112234700206
DECLARE aa INTEGER; +| | | | |
BEGIN +| | | | |
FOR aa IN 1..1000000 LOOP +| | | | |
INSERT INTO yewu1.t1 VALUES (aa,'white_' || aa); +| | | | |
END LOOP; +| | | | |
COMMIT; +| | | | |
END $$ | | | | |
INSERT INTO public.pgbench_history VALUES ($1, $2, $3, $4, $5, $6) | 267227 | 2205.2414340000396 | 0.008252315200185536 | 267227 | 100.0000000000000000
(20 rows)
4.2 锁等待分析
-- Blocking relationships: for every ungranted lock request, list the other
-- sessions that have a pg_locks entry on the same lock target, together with
-- both sessions' query text and the waiter's wait event.
SELECT
blocked_locks.pid AS blocked_pid,
blocking_locks.pid AS blocking_pid,
blocked_activity.query AS blocked_query,
blocking_activity.query AS blocking_query,
blocked_activity.wait_event_type,
blocked_activity.wait_event
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
-- Self-join pg_locks on locktype plus every lock-target column.
-- IS NOT DISTINCT FROM treats two NULLs as equal, so columns that do not
-- apply to a given lock type still match.
JOIN pg_catalog.pg_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.DATABASE IS NOT DISTINCT FROM blocked_locks.DATABASE
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
-- A session never blocks itself: drop matches against its own lock entries.
AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
-- Keep only the waiting (ungranted) side as "blocked".
-- NOTE(review): there is no filter on blocking_locks.granted, so another
-- waiter queued on the same target is also reported as "blocking" — confirm
-- whether that is intended.
WHERE NOT blocked_locks.GRANTED;
五 复制状态检查
5.1 主从复制监控
-- Primary-side replication status, one row per connected WAL sender.
-- Each *_lag column is a byte distance between two WAL positions, rendered
-- human-readable: current->sent, sent->written, written->flushed, and
-- current->replayed (overall).
SELECT
    r.client_addr,
    r.usename,
    r.application_name,
    r.state,
    r.sync_state,
    pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), r.sent_lsn))   AS send_lag,
    pg_size_pretty(pg_wal_lsn_diff(r.sent_lsn, r.write_lsn))            AS write_lag,
    pg_size_pretty(pg_wal_lsn_diff(r.write_lsn, r.flush_lsn))           AS flush_lag,
    pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), r.replay_lsn)) AS total_lag
FROM pg_stat_replication AS r;
-- Standby-side status: whether the server is in recovery, the last WAL
-- positions received and replayed, and the timestamp of the last replayed
-- transaction.
SELECT
    pg_is_in_recovery()             AS is_standby,
    pg_last_wal_receive_lsn()       AS received_lsn,
    pg_last_wal_replay_lsn()        AS replayed_lsn,
    pg_last_xact_replay_timestamp() AS last_replay_time;
六 自动化检查脚本
6.1 综合健康检查脚本
#!/bin/bash
# PostgreSQL health-check script.
#
# Runs every query in check_items through psql and prints one
# "<label>: <value>" line per check.
#
# Each entry has the form "<SQL>|<label>". The entry is split at the LAST '|',
# so the SQL part may itself contain '|' (e.g. the '||' operator).
#
# Notes on individual checks:
# - Waiting sessions: restricted to state='active', because idle client
#   sessions and background processes also carry a non-NULL wait_event and
#   would otherwise inflate the count.
# - Cache hit ratio: parentheses are balanced (the earlier form was a SQL
#   syntax error) and nullif() avoids division by zero when no blocks have
#   been read or hit yet; the value is then empty.
check_items=(
"SELECT count(*) FROM pg_stat_activity WHERE state='active';|活跃会话数"
"SELECT count(*) FROM pg_stat_activity WHERE state='active' AND wait_event IS NOT NULL;|等待会话数"
"SELECT round(100 * sum(blks_hit) / nullif(sum(blks_hit + blks_read), 0), 2) FROM pg_stat_database;|缓存命中率"
"SELECT count(*) FROM pg_stat_activity WHERE state='idle in transaction';|空闲事务数"
"SELECT max(age(backend_xmin)) FROM pg_stat_activity;|最老事务年龄"
)
echo "PostgreSQL健康检查报告 - $(date)"
echo "================================="
for item in "${check_items[@]}"; do
    sql=${item%|*}      # everything before the last '|'
    desc=${item##*|}    # everything after the last '|'
    # -X: ignore ~/.psqlrc; -A -t: unaligned, tuples only (no padding/headers).
    # A failing check is reported in place instead of printing an empty value,
    # and the remaining checks still run.
    if ! result=$(psql -X -U postgres -A -t -c "$sql" 2>&1); then
        result="检查失败: ${result}"
    fi
    printf "%-20s: %s\n" "$desc" "$result"
done
6.2 关键阈值告警
| 指标 | 警告阈值 | 严重阈值 | 检查频率 |
|---|---|---|---|
| 连接数使用率 | >80% | >90% | 5分钟 |
| 缓存命中率 | <95% | <90% | 15分钟 |
| 复制延迟 | >1MB | >10MB | 1分钟 |
| 最长事务 | >1小时 | >4小时 | 30分钟 |
PostgreSQL 状态检查建议:
- 日常检查:连接数、缓存命中率、锁等待
- 深度检查:每月执行一次全面的存储分析和索引效率检查
- 紧急情况:当响应时间突增时,首先检查活跃查询和锁等待
- 预防性监控:设置关键指标的自动告警机制
通过系统化的状态检查,可以提前发现90%以上的潜在问题,建议将关键检查项目纳入日常运维流程。
谨记:心存敬畏,行有所止。