pg_stat 监控系统
PostgreSQL 内置了完整的可观测性系统——pg_stat_* 和 pg_statio_* 视图提供实时的性能数据,无需安装额外工具就能了解数据库的健康状况、缓存命中率、索引使用情况和等待事件。
核心监控视图速查
| 视图 | 说明 |
|---|---|
pg_stat_activity | 当前连接和活跃查询 |
pg_stat_statements | 历史查询统计(需安装扩展) |
pg_stat_user_tables | 表级统计(扫描方式、增删改行数) |
pg_stat_user_indexes | 索引使用情况 |
pg_statio_user_tables | 表的 I/O 统计(缓存命中率) |
pg_stat_bgwriter | 后台写进程统计 |
pg_stat_replication | 复制状态 |
pg_stat_database | 数据库级统计 |
pg_stat_activity:实时查询监控
-- 查看所有活跃连接
SELECT
pid,
datname AS database,
usename AS user,
application_name,
client_addr,
state, -- active / idle / idle in transaction
wait_event_type, -- Lock / IO / Client / ...
wait_event, -- 具体等待事件
now() - query_start AS query_age, -- 查询已运行时长
LEFT(query, 80) AS query_snippet
FROM pg_stat_activity
WHERE datname = current_database()
AND pid != pg_backend_pid() -- 排除自身
ORDER BY query_age DESC NULLS LAST;
-- 找出长时间运行的查询(超过 30 秒)
SELECT pid, query_age, wait_event, LEFT(query, 100) AS query
FROM (
SELECT
pid,
now() - query_start AS query_age,
wait_event,
query,
state
FROM pg_stat_activity
WHERE state = 'active'
AND query_start IS NOT NULL
) q
WHERE query_age > INTERVAL '30 seconds'
ORDER BY query_age DESC;
-- 找出"idle in transaction"(忘记 COMMIT 的连接,会持有锁!)
SELECT pid, usename, now() - state_change AS idle_duration, LEFT(query, 80)
FROM pg_stat_activity
WHERE state = 'idle in transaction'
AND state_change < now() - INTERVAL '5 minutes'
ORDER BY idle_duration DESC;
pg_stat_user_tables:表级健康检查
-- 检查所有表的扫描方式和维护状态
SELECT
schemaname,
relname AS table_name,
seq_scan, -- 全表扫描次数
seq_tup_read, -- 全表扫描读取的行数
idx_scan, -- 索引扫描次数
n_dead_tup, -- 待清理的死行数(VACUUM 清理对象)
n_live_tup, -- 活跃行数
ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 1) AS dead_ratio,
last_vacuum,
last_autovacuum,
last_analyze,
last_autoanalyze
FROM pg_stat_user_tables
ORDER BY seq_scan DESC;
-- ⚠️ 需要关注的指标:
-- seq_scan 很高 + 表很大 → 缺少索引
-- dead_ratio > 10% → 需要 VACUUM
-- last_autovacuum 很旧 → autovacuum 配置可能有问题
pg_stat_user_indexes:索引使用情况
-- 找出未使用的索引(浪费磁盘和写入性能)
SELECT
schemaname,
tablename,
indexname,
idx_scan AS times_used,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND indexname NOT LIKE '%pkey' -- 排除主键
AND indexname NOT LIKE '%unique%' -- 排除唯一索引(即使没用也要保留)
ORDER BY pg_relation_size(indexrelid) DESC;
-- 对于 idx_scan=0 的索引,考虑 DROP 以节省空间和提升写入性能
-- 查看最常用的索引
SELECT
tablename,
indexname,
idx_scan,
idx_tup_read,
idx_tup_fetch,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC
LIMIT 20;
pg_statio_user_tables:缓存命中率
-- 检查每张表的缓存命中率
SELECT
relname AS table_name,
heap_blks_read AS disk_reads, -- 从磁盘读取的数据块数
heap_blks_hit AS cache_hits, -- 从缓冲区命中的数据块数
ROUND(
100.0 * heap_blks_hit /
NULLIF(heap_blks_hit + heap_blks_read, 0),
1
) AS cache_hit_rate,
idx_blks_read AS idx_disk_reads,
idx_blks_hit AS idx_cache_hits,
ROUND(
100.0 * idx_blks_hit /
NULLIF(idx_blks_hit + idx_blks_read, 0),
1
) AS idx_cache_hit_rate
FROM pg_statio_user_tables
WHERE heap_blks_hit + heap_blks_read > 0
ORDER BY heap_blks_read DESC;
-- 目标:缓存命中率 > 99%
-- 如果某张表命中率低,考虑增大 shared_buffers
-- 或者检查是否有全表扫描导致大量数据进出缓存
pg_stat_database:数据库级统计
-- 全库概览
SELECT
datname,
numbackends AS active_connections,
xact_commit AS commits,
xact_rollback AS rollbacks,
ROUND(100.0 * xact_rollback / NULLIF(xact_commit + xact_rollback, 0), 2) AS rollback_rate,
blks_read AS disk_reads,
blks_hit AS cache_hits,
ROUND(100.0 * blks_hit / NULLIF(blks_hit + blks_read, 0), 1) AS overall_cache_hit_rate,
tup_returned AS rows_returned,
tup_fetched AS rows_fetched,
tup_inserted AS rows_inserted,
tup_updated AS rows_updated,
tup_deleted AS rows_deleted,
deadlocks,
conflicts
FROM pg_stat_database
WHERE datname = current_database();
等待事件分析
-- 查看当前的等待事件(诊断性能瓶颈)
SELECT
wait_event_type,
wait_event,
COUNT(*) AS waiting_connections
FROM pg_stat_activity
WHERE wait_event IS NOT NULL
AND state = 'active'
GROUP BY wait_event_type, wait_event
ORDER BY waiting_connections DESC;
-- 常见等待事件及含义:
-- Lock / relation → 等待表锁(有 DDL 操作?)
-- Lock / tuple → 等待行锁(高并发更新同一行)
-- Lock / transactionid → 等待事务提交(锁等待链)
-- IO / DataFileRead → 等待磁盘 I/O(shared_buffers 太小?)
-- IO / WALWrite → 等待 WAL 写入(synchronous_commit 相关)
-- Client / ClientRead → 等待客户端发送数据(网络?)
-- IPC / BgWorkerShutdown → 等待后台 worker(正常)
完整健康检查 SQL
-- 一键健康检查(生成报告)
WITH
cache_stats AS (
SELECT
ROUND(
100.0 * SUM(blks_hit) / NULLIF(SUM(blks_hit) + SUM(blks_read), 0),
1
) AS cache_hit_rate
FROM pg_stat_database
WHERE datname = current_database()
),
connection_stats AS (
SELECT
COUNT(*) FILTER (WHERE state = 'active') AS active,
COUNT(*) FILTER (WHERE state = 'idle') AS idle,
COUNT(*) FILTER (WHERE state = 'idle in transaction') AS idle_in_txn,
COUNT(*) FILTER (WHERE wait_event_type = 'Lock') AS waiting_for_lock
FROM pg_stat_activity
WHERE datname = current_database()
),
table_health AS (
SELECT
COUNT(*) FILTER (WHERE n_dead_tup > n_live_tup * 0.1) AS tables_needing_vacuum,
COUNT(*) FILTER (WHERE last_autovacuum < now() - INTERVAL '7 days'
AND n_live_tup > 1000) AS tables_not_vacuumed_7d
FROM pg_stat_user_tables
)
SELECT
'Cache Hit Rate' AS metric, cache_hit_rate::text AS value FROM cache_stats
UNION ALL
SELECT 'Active Connections', active::text FROM connection_stats
UNION ALL
SELECT 'Idle Connections', idle::text FROM connection_stats
UNION ALL
SELECT 'Idle in Transaction', idle_in_txn::text FROM connection_stats
UNION ALL
SELECT 'Lock Waiters', waiting_for_lock::text FROM connection_stats
UNION ALL
SELECT 'Tables Needing VACUUM', tables_needing_vacuum::text FROM table_health
UNION ALL
SELECT 'Tables Not Vacuumed (7d)', tables_not_vacuumed_7d::text FROM table_health;
推荐监控工具
开源工具:
- pg_activity:类似 top 的实时 PostgreSQL 监控(命令行)
- pgBadger:分析 PostgreSQL 慢查询日志,生成 HTML 报告
- Prometheus + postgres_exporter:指标导出到 Prometheus,Grafana 可视化
云原生:
- AWS CloudWatch(RDS 自带)
- Datadog PostgreSQL 集成
- PgAnalyze:专业 PostgreSQL 监控(商业)
可观测性基础设施:
PostgreSQL ──→ postgres_exporter ──→ Prometheus ──→ Grafana Dashboard
└──→ AlertManager
下一章:生产迁移、零停机变更与运维规范——表设计需要调整?字段要改类型?大表要加列?不停机地做 Schema 变更是生产运维的核心技能。