linux hungtask detect机制分析
1,机制概述
hungtask detect 是 Linux 内核用于检测长时间阻塞("hung")任务的机制,主要针对因死锁、死循环或资源竞争导致无法调度的任务
触发条件:任务在 TASK_UNINTERRUPTIBLE 状态持续超过预设阈值(默认 120 秒)。
核心目标:防止系统因单个任务阻塞而完全僵死,提供诊断信息辅助排查问题
2. 实现原理
监控线程:内核线程 khungtaskd 定期扫描所有线程的 task_struct 结构体;扫描间隔由 hung_task_check_interval_secs 控制,该值为 0 时取 hung_task_timeout_secs,且间隔不会超过 hung_task_timeout_secs。
状态检查:
a,检查任务是否处于 TASK_UNINTERRUPTIBLE 状态。
b,通过上下文切换计数(nvcsw + nivcsw)判断任务是否被调度过:计数有变化则刷新 last_switch_count 和 last_switch_time;计数不变则以 last_switch_time 为起点计算阻塞时长。
c,若超时,触发警告并打印任务堆栈(通过 sched_show_task());若 hung_task_panic 为 1,则在本轮扫描结束后触发 panic。
日志输出:超时任务的信息会记录到内核日志(dmesg),包括进程 ID、名称、阻塞时长及调用栈
3,配置参数
通过 /proc/sys/kernel/ 下的文件动态调整:
| 参数文件 | 功能描述 | 默认值 |
|---|---|---|
| hung_task_timeout_secs | 设置任务阻塞超时阈值(秒) | 120 |
| hung_task_panic | 超时后是否触发内核 panic(1=是) | 0 |
| hung_task_check_count | 每次扫描的最大任务数 | 32768 |
打开 hung task 超时后触发 panic 的开关(检测功能本身由 CONFIG_DETECT_HUNG_TASK 启用,且要求 hung_task_timeout_secs 不为 0):
echo 1 > /proc/sys/kernel/hung_task_panic
或者在 /etc/sysctl.conf 中添加如下配置:
kernel.hung_task_panic = 1
进程的D状态时间也可以设置(任务进入 D 状态多久后触发检测)
echo 60 > /proc/sys/kernel/hung_task_timeout_secs # 单位:秒(默认120秒)
内核配置项设置:修改 kernel/arch/arm64/configs/rockchip_linux_defconfig,编译后在 .config 中确认以下选项已生效:
CONFIG_DETECT_HUNG_TASK=y
CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=60
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y
4,代码分析
(1) khungtaskd内核线程在内核启动阶段创建,相关代码为 kernel/hung_task.c
初始化:
/*
 * Thread function of the "khungtaskd" kernel thread (started by
 * hung_task_init() below).
 *
 * Loops forever: works out how long to wait until the next scan, sleeps for
 * that long, and calls check_hung_uninterruptible_tasks() once the wait has
 * expired.
 *
 * @dummy: unused.
 */
static int watchdog(void *dummy)
{
	unsigned long hung_last_checked = jiffies;

	set_user_nice(current, 0);

	for ( ; ; ) {
		/* Sampled on every pass, so the loop always uses the current sysctl values. */
		unsigned long timeout = sysctl_hung_task_timeout_secs;
		unsigned long interval = sysctl_hung_task_check_interval_secs;
		long t;

		/* An interval of 0 falls back to the timeout; the interval is also capped at the timeout. */
		if (interval == 0)
			interval = timeout;
		interval = min_t(unsigned long, interval, timeout);
		/*
		 * hung_timeout_jiffies() is defined outside this excerpt;
		 * presumably it returns the jiffies left until the next scan
		 * is due - TODO confirm.
		 */
		t = hung_timeout_jiffies(hung_last_checked, interval);
		if (t <= 0) {
			/*
			 * The xchg reads reset_hung_task and clears it in one
			 * step. The scan is skipped when that flag was set or
			 * when hung_detector_suspended is set.
			 */
			if (!atomic_xchg(&reset_hung_task, 0) &&
			    !hung_detector_suspended)
				check_hung_uninterruptible_tasks(timeout);
			/* Restart the interval whether or not a scan ran. */
			hung_last_checked = jiffies;
			continue;
		}
		schedule_timeout_interruptible(t);
	}

	return 0;
}

/*
 * Initialisation routine (marked __init): registers panic_block on the panic
 * notifier chain, registers the PM notifier, and starts the watchdog thread
 * under the name "khungtaskd".
 *
 * Always returns 0.
 */
static int __init hung_task_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);

	/* Disable hung task detector on suspend */
	pm_notifier(hungtask_pm_notify, 0);

	/* NOTE(review): the kthread_run() result is stored but not checked here. */
	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");

	return 0;
}
扫描函数:遍历所有线程,对处于 TASK_UNINTERRUPTIBLE 状态的任务调用 check_hung_task()
/*
 * Walk every thread of every process and call check_hung_task() on each one
 * whose state is exactly TASK_UNINTERRUPTIBLE.
 *
 * At most sysctl_hung_task_check_count tasks are visited per call. After the
 * walk, the function acts on the flags that check_hung_task() may have set:
 * dump all held locks, trigger a backtrace on all CPUs, and/or panic.
 *
 * @timeout: hang threshold in seconds, passed through to check_hung_task().
 */
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
	int max_count = sysctl_hung_task_check_count;
	unsigned long last_break = jiffies;
	struct task_struct *g, *t;
	/* NOTE(review): initialised once, before the loop, not per task. */
	bool need_check = true;

	/*
	 * If the system crashed already then all bets are off,
	 * do not report extra hung tasks:
	 */
	if (test_taint(TAINT_DIE) || did_panic)
		return;

	hung_task_show_lock = false;
	rcu_read_lock();
	for_each_process_thread(g, t) {
		/* Stop once the per-scan task budget is used up. */
		if (!max_count--)
			goto unlock;
		/*
		 * Once more than HUNG_TASK_LOCK_BREAK jiffies have passed since
		 * the last break, call rcu_lock_break(). It is defined outside
		 * this excerpt; presumably it briefly leaves the RCU read-side
		 * section - TODO confirm. A false return aborts the scan.
		 */
		if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
			if (!rcu_lock_break(g, t))
				goto unlock;
			last_break = jiffies;
		}
		/*
		 * Android vendor hook; it receives &need_check, so presumably
		 * it can veto the check for this task - TODO confirm.
		 */
		trace_android_vh_check_uninterruptible_tasks(t, timeout, &need_check);
		if (need_check)
			/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
			if (t->state == TASK_UNINTERRUPTIBLE)
				check_hung_task(t, timeout);
	}
	/* Reached only when the walk completes; both goto paths above skip it. */
	trace_android_vh_check_uninterruptible_tasks_dn(NULL);
unlock:
	rcu_read_unlock();
	/* The diagnostics below run after the RCU read-side section has ended. */
	if (hung_task_show_lock)
		debug_show_all_locks();
	if (hung_task_show_all_bt) {
		/* Clear the request before acting on it. */
		hung_task_show_all_bt = false;
		trigger_all_cpu_backtrace();
	}
	if (hung_task_call_panic)
		panic("hung_task: blocked tasks");
}
单个任务的检查函数:判断任务是否超时未被调度,并输出告警信息
/*
 * Decide whether one task has gone unscheduled for longer than the timeout
 * and, if so, report it.
 *
 * Progress is measured by the task's context-switch count (nvcsw + nivcsw).
 * If the count changed since the previous check, the stored count and
 * timestamp are refreshed and the task is treated as healthy. Otherwise the
 * time elapsed since last_switch_time is compared with the timeout.
 *
 * @t:       task to examine.
 * @timeout: hang threshold in seconds (multiplied by HZ below).
 */
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
	unsigned long switch_count = t->nvcsw + t->nivcsw;

	/*
	 * Ensure the task is not frozen.
	 * Also, skip vfork and any other user process that freezer should skip.
	 */
	if (unlikely(frozen_or_skipped(t)))
		return;

	/*
	 * When a freshly created task is scheduled once, changes its state to
	 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
	 * mustn't be checked.
	 */
	if (unlikely(!switch_count))
		return;

	/* The task has been switched since the last check: record the new baseline. */
	if (switch_count != t->last_switch_count) {
		t->last_switch_count = switch_count;
		t->last_switch_time = jiffies;
		return;
	}
	/* No switch since the baseline, but the deadline has not passed yet. */
	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
		return;

	trace_sched_process_hang(t);

	/* Only set flags here; the caller acts on them after its scan. */
	if (sysctl_hung_task_panic) {
		console_verbose();
		hung_task_show_lock = true;
		hung_task_call_panic = true;
	}

	/*
	 * Ok, the task did not get scheduled for longer than the timeout
	 * (the original comment says "more than 2 minutes"), complain:
	 *
	 * sysctl_hung_task_warnings is a report budget: zero suppresses the
	 * report, a positive value is decremented per report, and a negative
	 * value is never decremented.
	 */
	if (sysctl_hung_task_warnings) {
		if (sysctl_hung_task_warnings > 0)
			sysctl_hung_task_warnings--;
		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
		       t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
		/* The "%.*s" prints the version string up to its first space. */
		pr_err(" %s %s %.*s\n",
			print_tainted(), init_utsname()->release,
			(int)strcspn(init_utsname()->version, " "),
			init_utsname()->version);
		pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
			" disables this message.\n");
		sched_show_task(t);
		hung_task_show_lock = true;
		if (sysctl_hung_task_all_cpu_backtrace)
			hung_task_show_all_bt = true;
	}

	/*
	 * NOTE(review): presumably keeps the NMI watchdog from firing while the
	 * report above is being printed - confirm.
	 */
	touch_nmi_watchdog();
}
。