Linux中do_wait函数的实现
一、获取进程所属的进程组IDprocess_group
/*
 * process_group - look up the process-group ID of a task.
 * @tsk: task to query; its ->signal pointer is dereferenced unconditionally.
 *
 * Returns the pgrp value stored in the task's signal structure.
 */
static inline pid_t process_group(struct task_struct *tsk)
{
	pid_t pgid = tsk->signal->pgrp;

	return pgid;
}
1.函数定义
/*
 * process_group - look up the process-group ID of a task.
 * @tsk: task to query; its ->signal pointer is dereferenced unconditionally.
 *
 * Returns the pgrp value stored in the task's signal structure.
 */
static inline pid_t process_group(struct task_struct *tsk)
{
	pid_t pgid = tsk->signal->pgrp;

	return pgid;
}
作用:返回给定进程 tsk
所属的进程组ID
2.关键概念解析
2.1. 进程组 (Process Group)
在 Linux系统中:
- 进程组是一组相关进程的集合
- 每个进程组有一个唯一的进程组ID (PGID)
二、判断进程是否是有子线程的线程组领导delay_group_leader
/*
 * thread_group_leader - true when p is the leader of its thread group,
 * i.e. when its pid equals its tgid.
 *
 * The argument is parenthesized so that expressions such as &task expand
 * correctly (an unparenthesized p would turn &task into &task->pid).
 * Note that p is evaluated twice, so it must not have side effects.
 */
#define thread_group_leader(p) ((p)->pid == (p)->tgid)
/*
 * thread_group_empty - report whether p's PIDTYPE_TGID pid list is empty.
 * @p: task whose thread-group list is inspected.
 *
 * Returns whatever list_empty() reports for that list (non-zero when the
 * list has no entries).
 */
static inline int thread_group_empty(task_t *p)
{
	int empty = list_empty(&p->pids[PIDTYPE_TGID].pid_list);

	return empty;
}
/*
 * delay_group_leader - true when p is a thread-group leader whose thread
 * group is not empty, i.e. the group still has other threads.
 *
 * The backslash below is a real line continuation; when the definition is
 * flattened onto one line the backslash must be dropped, otherwise it is
 * left as a stray token inside the macro body.
 */
#define delay_group_leader(p) \
	(thread_group_leader(p) && !thread_group_empty(p))
1. thread_group_leader(p)
- 线程组领导判断
/*
 * thread_group_leader - true when p is the leader of its thread group,
 * i.e. when its pid equals its tgid.
 *
 * The argument is parenthesized so that expressions such as &task expand
 * correctly (an unparenthesized p would turn &task into &task->pid).
 * Note that p is evaluated twice, so it must not have side effects.
 */
#define thread_group_leader(p) ((p)->pid == (p)->tgid)
作用:判断进程是否是线程组的领导进程。
关键字段:
- p->pid:进程的实际PID
- p->tgid:线程组ID(Thread Group ID)
判断逻辑:
- 如果 pid == tgid:该进程是线程组领导
- 如果 pid != tgid:该进程是线程组中的普通线程
2. thread_group_empty(p)
- 线程组空判断
/*
 * thread_group_empty - report whether p's PIDTYPE_TGID pid list is empty.
 * @p: task whose thread-group list is inspected.
 *
 * Returns whatever list_empty() reports for that list (non-zero when the
 * list has no entries).
 */
static inline int thread_group_empty(task_t *p)
{
	int empty = list_empty(&p->pids[PIDTYPE_TGID].pid_list);

	return empty;
}
作用:判断线程组是否只有当前进程一个成员。
关键数据结构:
p->pids[PIDTYPE_TGID].pid_list
:线程组成员的链表list_empty()
:检查链表是否为空
判断逻辑:
- 链表为空:线程组只有当前进程 → 返回 1
- 链表不为空:线程组有多个成员 → 返回 0
3. delay_group_leader(p)
- 延迟的组领导判断
/*
 * delay_group_leader - true when p is a thread-group leader whose thread
 * group is not empty, i.e. the group still has other threads.
 *
 * The backslash below is a real line continuation; when the definition is
 * flattened onto one line the backslash must be dropped, otherwise it is
 * left as a stray token inside the macro body.
 */
#define delay_group_leader(p) \
	(thread_group_leader(p) && !thread_group_empty(p))
作用:判断进程是否是有子线程的线程组领导。
逻辑分解:
thread_group_leader(p)
:是线程组领导!thread_group_empty(p)
:线程组不空(有子线程)
综合:该进程是线程组领导,并且线程组中还有其他线程。
三、进程等待另一个进程时进行安全检查security_task_wait
/*
 * security_task_wait - ask the installed security operations whether the
 * wait on task p may proceed.
 * @p: task being waited for.
 *
 * Returns the value of the task_wait hook unchanged; callers in this file
 * treat any non-zero value as "skip this task".
 */
static inline int security_task_wait(struct task_struct *p)
{
	int verdict = security_ops->task_wait(p);

	return verdict;
}
1.函数定义
/*
 * security_task_wait - ask the installed security operations whether the
 * wait on task p may proceed.
 * @p: task being waited for.
 *
 * Returns the value of the task_wait hook unchanged; callers in this file
 * treat any non-zero value as "skip this task".
 */
static inline int security_task_wait(struct task_struct *p)
{
	int verdict = security_ops->task_wait(p);

	return verdict;
}
作用:在进程等待另一个进程时进行安全检查。
四、判断子进程是否符合等待条件eligible_child
/*
 * eligible_child - decide whether child p matches a wait request.
 * @pid:     selector: > 0 matches that exact pid, 0 matches the caller's
 *           process group, < -1 matches process group -pid, -1 matches
 *           any child.
 * @options: wait option bits; only __WCLONE and __WALL are examined here.
 * @p:       candidate child.
 *
 * Returns 0 when p must be skipped, 1 when it is eligible, and 2 when it
 * is eligible but is the leader of a different, non-empty thread group.
 */
static int eligible_child(pid_t pid, int options, task_t *p)
{
	/* Apply the pid selector first. */
	if (pid > 0 && p->pid != pid)
		return 0;
	if (pid == 0 && process_group(p) != process_group(current))
		return 0;
	if (pid < -1 && process_group(p) != -pid)
		return 0;

	/*
	 * Do not consider detached threads that are
	 * not ptraced:
	 */
	if (p->exit_signal == -1 && !p->ptrace)
		return 0;

	/*
	 * Wait for all children (clone and not) if __WALL is set;
	 * otherwise, wait for clone children *only* if __WCLONE is
	 * set; otherwise, wait for non-clone children *only*.  (Note:
	 * A "clone" child here is one that reports to its parent
	 * using a signal other than SIGCHLD.)
	 */
	if (!(options & __WALL)) {
		int is_clone = (p->exit_signal != SIGCHLD);
		int want_clone = ((options & __WCLONE) != 0);

		if (is_clone != want_clone)
			return 0;
	}

	/*
	 * Do not consider thread group leaders that are
	 * in a non-empty thread group:
	 */
	if (current->tgid != p->tgid && delay_group_leader(p))
		return 2;

	return security_task_wait(p) ? 0 : 1;
}
1.函数定义
static int eligible_child(pid_t pid, int options, task_t *p)
参数:
- pid:等待的目标进程ID
- options:等待选项
- p:要检查的候选进程
返回值:
- 0:不符合条件
- 1:符合条件
- 2:特殊条件(延迟的组领导)
2.函数逻辑分析
2.1. PID 过滤条件
if (pid > 0) {if (p->pid != pid)return 0;
} else if (!pid) {if (process_group(p) != process_group(current))return 0;
} else if (pid != -1) {if (process_group(p) != -pid)return 0;
}
PID 参数的不同含义:
pid 值 | 含义 | 检查条件 |
---|---|---|
pid > 0 | 等待特定PID的进程 | p->pid == pid |
pid == 0 | 等待与当前进程在同一进程组的任何子进程 | process_group(p) == process_group(current) |
pid < -1 | 等待进程组 | process_group(p) == -pid |
pid == -1 | 等待任何子进程 | 无PID过滤 |
2. 分离线程检查
if (p->exit_signal == -1 && !p->ptrace)return 0;
排除条件:
p->exit_signal == -1
:分离线程(不通知父进程)!p->ptrace
:没有被调试器跟踪
目的:排除那些不会通知父进程的分离线程。
3. 克隆类型检查
if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))&& !(options & __WALL))return 0;
这是一个关于等待克隆进程与非克隆进程的复杂逻辑判断
什么是"克隆进程"(clone child)
在 Linux 中,有两种类型的子进程:
- 普通进程:使用
SIGCHLD
信号通知父进程 - 克隆进程:使用非
SIGCHLD
信号通知父进程
选项 | 含义 |
---|---|
__WALL | 等待所有类型的子进程(包括克隆和非克隆) |
__WCLONE | 只等待克隆进程 |
默认(都不设置) | 只等待普通进程(非克隆) |
第一部分:异或运算 ^
(p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)
异或真值表:
进程类型 | 选项 | 结果 | 含义 |
---|---|---|---|
普通进程 (exit_signal == SIGCHLD ) | 没有 __WCLONE | 0 ^ 0 = 0 | 不排除 |
普通进程 (exit_signal == SIGCHLD ) | 有 __WCLONE | 0 ^ 1 = 1 | 排除 |
克隆进程 (exit_signal != SIGCHLD ) | 没有 __WCLONE | 1 ^ 0 = 1 | 排除 |
克隆进程 (exit_signal != SIGCHLD ) | 有 __WCLONE | 1 ^ 1 = 0 | 不排除 |
第二部分:!__WALL
检查
&& !(options & __WALL)
- 如果设置了
__WALL
,整个条件为假(不排除任何进程) - 如果没有设置
__WALL
,继续应用上述过滤规则
即
__WALL
:我全都要(所有进程类型)__WCLONE
:我只要克隆进程- 默认:我只要普通进程
4. 线程组领导特殊处理
if (current->tgid != p->tgid && delay_group_leader(p))return 2;
条件:
current->tgid != p->tgid
:目标进程与当前进程不在同一线程组delay_group_leader(p)
:目标进程是多线程的线程组领导
返回值 2:说明目标线程组还有工作线程在运行,等待它们运行结束领导线程才可以退出
5. 安全检查
if (security_task_wait(p))return 0;
调用 LSM(Linux安全模块)检查当前进程是否有权限等待目标进程。
五、判断是否被ptrace
跟踪my_ptrace_child
/*
 * my_ptrace_child - tell whether p should be treated as a ptrace child.
 * @p: candidate task.
 *
 * Returns 0 when PT_PTRACED is not set, 1 when the task is traced without
 * PT_ATTACHED, and otherwise whether its parent differs from its
 * real_parent.
 */
static inline int my_ptrace_child(struct task_struct *p)
{
	if (p->ptrace & PT_PTRACED) {
		if (p->ptrace & PT_ATTACHED) {
			/*
			 * This child was PTRACE_ATTACH'd.  We should be seeing
			 * it only if we are the attacher.  If we are the real
			 * parent, this is a race inside ptrace_attach.  It is
			 * waiting for the tasklist_lock, which we have to
			 * switch the parent links, but has already set the
			 * flags in p->ptrace.
			 */
			return (p->parent != p->real_parent);
		}
		return 1;
	}
	return 0;
}
1.函数定义
static inline int my_ptrace_child(struct task_struct *p)
作用:判断目标进程 p 是否是当前进程通过 ptrace 跟踪的子进程。
返回值:
- 0:不是当前进程的 ptrace 子进程
- 1:是当前进程的 ptrace 子进程
2.函数逻辑分析
2.1. 基础 ptrace
检查
if (!(p->ptrace & PT_PTRACED))return 0;
条件:进程必须被 ptrace
跟踪
PT_PTRACED
:表示进程正在被调试器跟踪- 如果没有设置这个标志,直接返回 0
2.2. 附加类型检查
if (!(p->ptrace & PT_ATTACHED))return 1;
条件:检查是否是 PTRACE_ATTACH
操作
PT_ATTACHED
:表示通过ptrace(PTRACE_ATTACH, ...)
附加的- 如果没有这个标志,说明是自然跟踪的子进程,直接返回 1
2.3. 竞争条件处理
return (p->parent != p->real_parent);
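PT_ATTACHED 情况下,只有当 p->parent != p->real_parent(父进程链接已经切换到跟踪者)时才返回 1。如果两者仍然相等,说明 ptrace_attach 已经设置了 p->ptrace 标志,但还在等待 tasklist_lock、尚未切换父进程链接,此时看到它的是真实父进程,返回 0。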
六、子进程状态信息拷贝到用户空间wait_noreap_copyout
/*
 * wait_noreap_copyout - copy a child's wait status to user space and drop
 * the caller's task reference.
 * @p:       task whose reference is released via put_task_struct().
 * @pid:     pid to report and to return on success.
 * @uid:     uid to report.
 * @why:     value stored (as a short) in si_code.
 * @status:  value stored in si_status.
 * @infop:   user siginfo to fill in; written unconditionally.
 * @rusagep: optional user rusage buffer; skipped when NULL.
 *
 * Returns @pid when every copy succeeds, otherwise the first non-zero
 * result from getrusage() or put_user().
 */
static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
			       int why, int status,
			       struct siginfo __user *infop,
			       struct rusage __user *rusagep)
{
	int err = 0;

	if (rusagep)
		err = getrusage(p, RUSAGE_BOTH, rusagep);

	/* The reference is dropped even when getrusage() failed. */
	put_task_struct(p);
	if (err)
		return err;

	err = put_user(SIGCHLD, &infop->si_signo);
	if (err)
		return err;
	err = put_user(0, &infop->si_errno);
	if (err)
		return err;
	err = put_user((short)why, &infop->si_code);
	if (err)
		return err;
	err = put_user(pid, &infop->si_pid);
	if (err)
		return err;
	err = put_user(uid, &infop->si_uid);
	if (err)
		return err;
	err = put_user(status, &infop->si_status);
	if (err)
		return err;

	return pid;
}
1.函数定义
static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,int why, int status,struct siginfo __user *infop,struct rusage __user *rusagep)
参数:
p
:目标进程的任务结构pid
:进程IDuid
:用户IDwhy
:状态改变原因status
:退出状态infop
:用户空间的siginfo
结构指针rusagep
:用户空间的资源使用结构指针
2.函数逻辑分析
2.1. 获取资源使用信息
int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
条件获取:
- 如果
rusagep
不为空,调用getrusage()
获取进程资源使用信息 RUSAGE_BOTH
:获取进程及其所有子进程的资源使用统计- 如果
rusagep
为空,跳过这一步
2.2. 释放任务结构引用
put_task_struct(p);
作用:减少对任务结构的引用计数
- 之前通过
get_task_struct()
增加了引用计数 - 现在使用完毕,减少引用计数
- 如果引用计数归零,会释放任务结构
2.3. 填充 siginfo
结构
一系列 put_user()
调用,将信息拷贝到用户空间:
if (!retval)retval = put_user(SIGCHLD, &infop->si_signo); // 信号类型
if (!retval)retval = put_user(0, &infop->si_errno); // 错误代码
if (!retval)retval = put_user((short)why, &infop->si_code); // 原因代码
if (!retval)retval = put_user(pid, &infop->si_pid); // 进程ID
if (!retval)retval = put_user(uid, &infop->si_uid); // 用户ID
if (!retval)retval = put_user(status, &infop->si_status); // 退出状态
2.4. 设置返回值
if (!retval)retval = pid;
return retval;
返回值逻辑:
- 如果所有拷贝操作都成功,返回进程ID
pid
- 如果任何拷贝操作失败,返回错误码
七、处理停止状态进程wait_task_stopped
/* Take a reference on tsk by incrementing its usage counter. */
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
/* Unlink p's sibling node from the list it is currently on. */
#define remove_parent(p) list_del_init(&(p)->sibling)
/* Append p's sibling node to the tail of parent's children list. */
#define add_parent(p, parent) list_add_tail(&(p)->sibling,&(parent)->children)
/*
 * wait_task_stopped - report a stopped child to a waiter.
 * @p:                    candidate child with a pending stop report.
 * @delayed_group_leader: non-zero when the caller found p to be the leader
 *                        of another, non-empty thread group.
 * @noreap:               non-zero to report without clearing p->exit_code.
 * @infop:                optional user siginfo to fill in.
 * @stat_addr:            optional user int receiving the encoded status.
 * @ru:                   optional user rusage buffer.
 *
 * Every path that returns 0 does so before tasklist_lock is released.
 * Every non-zero return (a pid, an error from the copy-out, or -EAGAIN)
 * happens after read_unlock(&tasklist_lock), so the caller must not touch
 * the task list again without re-taking the lock.
 */
static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
			     struct siginfo __user *infop,
			     int __user *stat_addr, struct rusage __user *ru)
{
	int retval, exit_code;

	/* Nothing to report while exit_code is zero. */
	if (!p->exit_code)
		return 0;
	if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&
	    p->signal && p->signal->group_stop_count > 0)
		/*
		 * A group stop is in progress and this is the group leader.
		 * We won't report until all threads have stopped.
		 */
		return 0;

	/*
	 * Now we are pretty sure this task is interesting.
	 * Make sure it doesn't get reaped out from under us while we
	 * give up the lock and then examine it below.  We don't want to
	 * keep holding onto the tasklist_lock while we call getrusage and
	 * possibly take page faults for user memory.
	 */
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	if (unlikely(noreap)) {
		/* Snapshot the fields to report before copying anything out. */
		pid_t pid = p->pid;
		uid_t uid = p->uid;
		int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;

		exit_code = p->exit_code;
		/* Re-check after dropping the lock: the stop may be gone. */
		if (unlikely(!exit_code) ||
		    unlikely(p->state > TASK_STOPPED))
			goto bail_ref;
		/* wait_noreap_copyout() drops the reference taken above. */
		return wait_noreap_copyout(p, pid, uid,
					   why, (exit_code << 8) | 0x7f,
					   infop, ru);
	}

	write_lock_irq(&tasklist_lock);

	/*
	 * This uses xchg to be atomic with the thread resuming and setting
	 * it.  It must also be done with the write lock held to prevent a
	 * race with the EXIT_ZOMBIE case.
	 */
	exit_code = xchg(&p->exit_code, 0);
	if (unlikely(p->exit_state >= EXIT_ZOMBIE)) {
		/*
		 * The task resumed and then died.  Let the next iteration
		 * catch it in EXIT_ZOMBIE.  Note that exit_code might
		 * already be zero here if it resumed and did _exit(0).
		 * The task itself is dead and won't touch exit_code again;
		 * other processors in this function are locked out.
		 */
		p->exit_code = exit_code;
		exit_code = 0;
	}
	if (unlikely(exit_code == 0)) {
		/*
		 * Another thread in this function got to it first, or it
		 * resumed, or it resumed and then died.
		 */
		write_unlock_irq(&tasklist_lock);
bail_ref:
		/* Also reached from the noreap path, which holds no lock here. */
		put_task_struct(p);
		/*
		 * We are returning to the wait loop without having successfully
		 * removed the process and having released the lock. We cannot
		 * continue, since the "p" task pointer is potentially stale.
		 *
		 * Return -EAGAIN, and do_wait() will restart the loop from the
		 * beginning. Do _not_ re-acquire the lock.
		 */
		return -EAGAIN;
	}

	/* move to end of parent's list to avoid starvation */
	remove_parent(p);
	add_parent(p, p->parent);

	write_unlock_irq(&tasklist_lock);

	/* Copy out; each step is skipped once an earlier one has failed. */
	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
	if (!retval && stat_addr)
		retval = put_user((exit_code << 8) | 0x7f, stat_addr);
	if (!retval && infop)
		retval = put_user(SIGCHLD, &infop->si_signo);
	if (!retval && infop)
		retval = put_user(0, &infop->si_errno);
	if (!retval && infop)
		retval = put_user((short)((p->ptrace & PT_PTRACED)
					  ? CLD_TRAPPED : CLD_STOPPED),
				  &infop->si_code);
	if (!retval && infop)
		retval = put_user(exit_code, &infop->si_status);
	if (!retval && infop)
		retval = put_user(p->pid, &infop->si_pid);
	if (!retval && infop)
		retval = put_user(p->uid, &infop->si_uid);
	/* On success the pid is read while the reference is still held. */
	if (!retval)
		retval = p->pid;
	put_task_struct(p);

	/* A zero return here would be misread by the caller as "lock still held". */
	BUG_ON(!retval);
	return retval;
}
1.函数定义
static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,struct siginfo __user *infop,int __user *stat_addr, struct rusage __user *ru)
参数:
p
:要检查的目标进程delayed_group_leader
:是否是延迟处理的组领导(线程组有多个线程)noreap
:是否不收割进程(保持进程状态)infop
:用户空间的siginfo
结构stat_addr
:用户空间的状态地址ru
:用户空间的资源使用结构
2.函数逻辑分析
2.1. 初始检查
if (!p->exit_code)return 0;
- 如果进程没有退出码,直接返回0
- 停止的进程应该有非零的退出码
2.2. 组停止检查
if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&p->signal && p->signal->group_stop_count > 0)return 0;
条件:
- 等待的进程和当前线程不是一个线程组,并且它是另一个线程组的leader,见
eligible_child
- 没有被
ptrace
跟踪 - 有线程停止正在进行中
作用:先返回,等待整个线程组都停止后再报告,避免部分报告。
2.3. 获取任务引用和释放锁
get_task_struct(p);
read_unlock(&tasklist_lock);
关键操作:
get_task_struct(p)
:增加引用计数,防止进程被释放read_unlock(&tasklist_lock)
:释放任务列表锁
为什么需要这样:避免在拷贝用户数据时持有锁,防止死锁。
2.4. noreap
路径处理
if (unlikely(noreap)) {pid_t pid = p->pid;uid_t uid = p->uid;int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;exit_code = p->exit_code;if (unlikely(!exit_code) ||unlikely(p->state > TASK_STOPPED))goto bail_ref;return wait_noreap_copyout(p, pid, uid,why, (exit_code << 8) | 0x7f,infop, ru);}
条件检查
if (unlikely(noreap)) {
noreap
参数为真表示使用WNOWAIT
选项
提取进程信息
pid_t pid = p->pid;
uid_t uid = p->uid;
int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
- 保存进程的 PID 和 UID
- 根据是否被
ptrace
跟踪确定停止原因:CLD_TRAPPED
:被调试器跟踪停止CLD_STOPPED
:被信号停止
状态验证
exit_code = p->exit_code;
if (unlikely(!exit_code) ||unlikely(p->state > TASK_STOPPED))goto bail_ref;
检查进程是否真的处于停止状态:
!exit_code
:停止的进程应该有非零的退出码(包含停止信号)p->state > TASK_STOPPED
:进程状态比停止状态更靠后(可能是僵尸或死亡)
符合,跳转到 bail_ref
清理并返回错误。
信息拷贝返回
return wait_noreap_copyout(p, pid, uid,why, (exit_code << 8) | 0x7f,infop, ru);
调用专门的函数将进程信息拷贝到用户空间。
状态编码解析
(exit_code << 8) | 0x7f
这个编码符合 waitpid()
的状态返回值规范:
exit_code << 8
:停止信号编号左移8位0x7f
:固定的停止状态标记
2.5. 原子获取退出码
write_lock_irq(&tasklist_lock);
exit_code = xchg(&p->exit_code, 0);
关键操作:
xchg(&p->exit_code, 0)
:原子地获取退出码并清零
2.6. 竞争条件处理
if (unlikely(p->exit_state >= EXIT_ZOMBIE)) {p->exit_code = exit_code; // 恢复退出码exit_code = 0; // 标记为无效
}
处理场景:进程在检查期间被唤醒并执行到退出,变成僵尸或退出状态
- 因为任务被加锁,所以其他CPU修改不了进程退出码,这里进行恢复
2.7. 无效退出码处理
if (unlikely(exit_code == 0)) {write_unlock_irq(&tasklist_lock);
bail_ref:put_task_struct(p);return -EAGAIN;
}
返回 -EAGAIN:告诉调用者重新开始等待循环。
2.8. 防止其他进程饥饿
remove_parent(p);
add_parent(p, p->parent);
task_struct
的成员sibling
初始化以后会作为一个锚点,用来获取task_struct
自身- 父进程的
children
成员是一个链表,记录着每一个子进程 - 而记录的链表节点就是每一个子进程的
sibling
成员 - 通过
list_entry
即container_of
宏可以基于sibling
的偏移得到子进程的task_struct
结构体 - 最终实现可以通过父进程的
children
链表进行遍历 - 这里将
p
放在父进程children
的末尾,这样可以让其他前面的子进程优先遍历到
2.9. 拷贝信息到用户空间
一系列 put_user()
调用,将进程信息拷贝到用户空间:
retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
if (!retval && stat_addr)retval = put_user((exit_code << 8) | 0x7f, stat_addr);
// ... 更多拷贝操作
八、处理僵尸进程wait_task_zombie
/*
 * wait_task_zombie - report (and normally reap) a zombie child.
 * @p:         candidate child in EXIT_ZOMBIE state.
 * @noreap:    non-zero to report the status without changing p's state.
 * @infop:     optional user siginfo to fill in.
 * @stat_addr: optional user int receiving the raw exit status.
 * @ru:        optional user rusage buffer.
 *
 * Every path that returns 0 does so before tasklist_lock is released;
 * every non-zero return (pid or copy-out error) happens after
 * read_unlock(&tasklist_lock).
 *
 * The "TODO" notes are written as block comments on their own lines: as
 * line comments on a flattened source line they would comment out the
 * statements that follow them (the EXIT_ZOMBIE restore and the whole
 * ptrace-unlink path).  The noreap branch also reuses the function-level
 * "status" variable instead of shadowing it.
 */
static int wait_task_zombie(task_t *p, int noreap,
			    struct siginfo __user *infop,
			    int __user *stat_addr, struct rusage __user *ru)
{
	unsigned long state;
	int retval;
	int status;

	if (unlikely(noreap)) {
		/* Snapshot what we report before the lock is dropped. */
		pid_t pid = p->pid;
		uid_t uid = p->uid;
		int exit_code = p->exit_code;
		int why;

		if (unlikely(p->exit_state != EXIT_ZOMBIE))
			return 0;
		if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
			return 0;
		get_task_struct(p);
		read_unlock(&tasklist_lock);
		if ((exit_code & 0x7f) == 0) {
			why = CLD_EXITED;
			status = exit_code >> 8;
		} else {
			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
			status = exit_code & 0x7f;
		}
		/* wait_noreap_copyout() drops the reference taken above. */
		return wait_noreap_copyout(p, pid, uid, why,
					   status, infop, ru);
	}

	/*
	 * Try to move the task's state to DEAD
	 * only one thread is allowed to do this:
	 */
	state = xchg(&p->exit_state, EXIT_DEAD);
	if (state != EXIT_ZOMBIE) {
		BUG_ON(state != EXIT_DEAD);
		return 0;
	}
	if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) {
		/*
		 * This can only happen in a race with a ptraced thread
		 * dying on another processor.
		 */
		return 0;
	}

	if (likely(p->real_parent == p->parent) && likely(p->signal)) {
		/*
		 * The resource counters for the group leader are in its
		 * own task_struct.  Those for dead threads in the group
		 * are in its signal_struct, as are those for the child
		 * processes it has previously reaped.  All these
		 * accumulate in the parent's signal_struct c* fields.
		 *
		 * We don't bother to take a lock here to protect these
		 * p->signal fields, because they are only touched by
		 * __exit_signal, which runs with tasklist_lock
		 * write-locked anyway, and so is excluded here.  We do
		 * need to protect the access to p->parent->signal fields,
		 * as other threads in the parent group can be right
		 * here reaping other children at the same time.
		 */
		spin_lock_irq(&p->parent->sighand->siglock);
		p->parent->signal->cutime +=
			p->utime + p->signal->utime + p->signal->cutime;
		p->parent->signal->cstime +=
			p->stime + p->signal->stime + p->signal->cstime;
		p->parent->signal->cmin_flt +=
			p->min_flt + p->signal->min_flt + p->signal->cmin_flt;
		p->parent->signal->cmaj_flt +=
			p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt;
		p->parent->signal->cnvcsw +=
			p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw;
		p->parent->signal->cnivcsw +=
			p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw;
		spin_unlock_irq(&p->parent->sighand->siglock);
	}

	/*
	 * Now we are sure this task is interesting, and no other
	 * thread can reap it because we set its state to EXIT_DEAD.
	 */
	read_unlock(&tasklist_lock);

	/* Copy out; each step is skipped once an earlier one has failed. */
	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
	status = p->signal->group_exit
		? p->signal->group_exit_code : p->exit_code;
	if (!retval && stat_addr)
		retval = put_user(status, stat_addr);
	if (!retval && infop)
		retval = put_user(SIGCHLD, &infop->si_signo);
	if (!retval && infop)
		retval = put_user(0, &infop->si_errno);
	if (!retval && infop) {
		int why;

		if ((status & 0x7f) == 0) {
			why = CLD_EXITED;
			status >>= 8;
		} else {
			why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
			status &= 0x7f;
		}
		retval = put_user((short)why, &infop->si_code);
		if (!retval)
			retval = put_user(status, &infop->si_status);
	}
	if (!retval && infop)
		retval = put_user(p->pid, &infop->si_pid);
	if (!retval && infop)
		retval = put_user(p->uid, &infop->si_uid);
	if (retval) {
		/* TODO: is this safe? */
		p->exit_state = EXIT_ZOMBIE;
		return retval;
	}
	retval = p->pid;
	if (p->real_parent != p->parent) {
		write_lock_irq(&tasklist_lock);
		/* Double-check with lock held.  */
		if (p->real_parent != p->parent) {
			__ptrace_unlink(p);
			/* TODO: is this safe? */
			p->exit_state = EXIT_ZOMBIE;
			/*
			 * If this is not a detached task, notify the parent.
			 * If it's still not detached after that, don't release
			 * it now.
			 */
			if (p->exit_signal != -1) {
				do_notify_parent(p, p->exit_signal);
				if (p->exit_signal != -1)
					p = NULL;
			}
		}
		write_unlock_irq(&tasklist_lock);
	}
	/* p was set to NULL above when the task must not be released here. */
	if (p != NULL)
		release_task(p);
	BUG_ON(!retval);
	return retval;
}
1.函数定义
static int wait_task_zombie(task_t *p, int noreap,struct siginfo __user *infop,int __user *stat_addr, struct rusage __user *ru)
作用:处理僵尸进程,收集其状态信息并最终释放资源。
2.函数逻辑分析
2.1. noreap
路径处理(只读,不修改进程状态)
if (unlikely(noreap)) {// 检查进程状态if (unlikely(p->exit_state != EXIT_ZOMBIE))return 0;if (unlikely(p->exit_signal == -1 && p->ptrace == 0))return 0;get_task_struct(p);read_unlock(&tasklist_lock);// 解析退出原因和状态if ((exit_code & 0x7f) == 0) {why = CLD_EXITED; // 正常退出status = exit_code >> 8;} else {why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; // 核心转储或被杀死status = exit_code & 0x7f;}return wait_noreap_copyout(p, pid, uid, why, status, infop, ru);
}
p->exit_signal == -1 && p->ptrace == 0
p->exit_signal == -1
:分离线程(不通知父进程)p->ptrace == 0
:没有被调试- 这样的线程不需要处理
(exit_code & 0x7f) == 0
- 低7位为0:表示没有导致退出的信号
why = CLD_EXITED
:正常退出status = exit_code >> 8
:取出高位的退出状态码
(exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED
exit_code & 0x80
:第7位为1表示生成了核心转储文件CLD_DUMPED
:有核心转储的被杀CLD_KILLED
:无核心转储的被杀status = exit_code & 0x7f
:取出导致退出的信号编号
2.2. 状态转换(收割路径)
state = xchg(&p->exit_state, EXIT_DEAD);
if (state != EXIT_ZOMBIE) {BUG_ON(state != EXIT_DEAD);return 0;
}
原子状态转换:
- 使用
xchg
原子地将状态从EXIT_ZOMBIE
改为EXIT_DEAD
- 在
do_wait
函数中检测到EXIT_DEAD
状态会直接跳过,确保只有一个线程能成功收割这个僵尸进程
2.3. 分离线程检查
if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) {return 0;
}
跳过不需要通知父进程的分离线程
2.4. 资源统计累积
if (likely(p->real_parent == p->parent) && likely(p->signal)) {spin_lock_irq(&p->parent->sighand->siglock);// 累积CPU时间统计p->parent->signal->cutime += p->utime + p->signal->utime + p->signal->cutime;p->parent->signal->cstime += p->stime + p->signal->stime + p->signal->cstime;// 累积缺页统计p->parent->signal->cmin_flt += p->min_flt + p->signal->min_flt + p->signal->cmin_flt;p->parent->signal->cmaj_flt += p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt;// 累积上下文切换统计p->parent->signal->cnvcsw += p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw;p->parent->signal->cnivcsw += p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw;spin_unlock_irq(&p->parent->sighand->siglock);
}
资源统计:将子进程的资源使用累加到父进程中。
2.5. 拷贝信息到用户空间
read_unlock(&tasklist_lock);retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;// 确定退出状态(线程组退出码或进程退出码)
status = p->signal->group_exit ? p->signal->group_exit_code : p->exit_code;// 一系列 put_user() 调用拷贝数据到用户空间
2.6. 退出状态解析
if ((status & 0x7f) == 0) {why = CLD_EXITED; // 正常退出status >>= 8; // 获取退出状态码
} else {why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; // 信号导致退出status &= 0x7f; // 获取信号编号
}
2.7. 错误处理
if (retval) {// 如果拷贝失败,恢复僵尸状态p->exit_state = EXIT_ZOMBIE;return retval;
}
2.8. ptrace
分离处理
if (p->real_parent != p->parent) {write_lock_irq(&tasklist_lock);if (p->real_parent != p->parent) {__ptrace_unlink(p); // 解除ptrace关系p->exit_state = EXIT_ZOMBIE; // 恢复僵尸状态if (p->exit_signal != -1) {do_notify_parent(p, p->exit_signal); // 通知真实父进程if (p->exit_signal != -1)p = NULL; // 标记为已处理}}write_unlock_irq(&tasklist_lock);
}
条件检查
if (p->real_parent != p->parent) {
判断是否被 ptrace
跟踪:
- p->real_parent:真实的父进程(创建该进程的进程)
- p->parent:当前的父进程(被 ptrace 附加跟踪时是调试器)
- 两者不同说明进程正在被其他进程通过 ptrace 跟踪
加锁和双重检查
write_lock_irq(&tasklist_lock);
if (p->real_parent != p->parent) {
并发安全:
- 获取写锁保护进程关系修改
- 双重检查防止竞争条件
解除 ptrace
关系
__ptrace_unlink(p);
作用:断开调试器与目标进程的跟踪关系。
内部操作:
- 清除
ptrace
标志 - 恢复真实父进程
- 从调试器列表移除
恢复僵尸状态
p->exit_state = EXIT_ZOMBIE;
为什么需要恢复:
- 之前已经将状态改为
EXIT_DEAD
- 现在需要让真实父进程也能等待这个僵尸进程
通知真实父进程
if (p->exit_signal != -1) {do_notify_parent(p, p->exit_signal);if (p->exit_signal != -1)p = NULL; // 置空以跳过后面的 release_task,留给真实父进程回收
}
2.9. 最终释放
if (p != NULL)release_task(p); // 释放任务结构return retval; // 返回进程PID
九、处理进程继续执行事件wait_task_continued
/*
 * wait_task_continued - report a child that has been continued.
 * @p:         candidate child.
 * @noreap:    non-zero to leave stop_state untouched so the event can be
 *             reported again.
 * @infop:     optional user siginfo; selects the siginfo reporting path.
 * @stat_addr: optional user int; receives 0xffff on the non-siginfo path.
 * @ru:        optional user rusage buffer.
 *
 * Returns 0 (with tasklist_lock still read-held) when there is nothing to
 * report.  Otherwise tasklist_lock has been released and the result is the
 * child's pid or an error from the copy-out.
 *
 * The pid returned on the non-siginfo path is the value saved before the
 * lock was dropped; reading p->pid after put_task_struct(p) would touch a
 * task we no longer hold a reference on.
 */
static int wait_task_continued(task_t *p, int noreap,
			       struct siginfo __user *infop,
			       int __user *stat_addr, struct rusage __user *ru)
{
	int retval;
	pid_t pid;
	uid_t uid;

	if (unlikely(!p->signal))
		return 0;

	/* Cheap unlocked check; repeated below under siglock. */
	if (p->signal->stop_state >= 0)
		return 0;

	spin_lock_irq(&p->sighand->siglock);
	if (p->signal->stop_state >= 0) { /* Re-check with the lock held.  */
		spin_unlock_irq(&p->sighand->siglock);
		return 0;
	}
	if (!noreap)
		p->signal->stop_state = 0;
	spin_unlock_irq(&p->sighand->siglock);

	/* Snapshot identity, pin the task, then drop tasklist_lock. */
	pid = p->pid;
	uid = p->uid;
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	if (!infop) {
		retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
		put_task_struct(p);
		if (!retval && stat_addr)
			retval = put_user(0xffff, stat_addr);
		if (!retval)
			retval = pid;
	} else {
		/* wait_noreap_copyout() drops the reference for us. */
		retval = wait_noreap_copyout(p, pid, uid,
					     CLD_CONTINUED, SIGCONT,
					     infop, ru);
		BUG_ON(retval == 0);
	}

	return retval;
}
1.函数作用
处理那些之前被停止,现在又继续执行的进程,向等待的父进程报告 CLD_CONTINUED
事件。
2.完整工作流程
2.1.阶段1:基础检查
if (unlikely(!p->signal))return 0;if (p->signal->stop_state >= 0)return 0;
检查内容:
- 进程是否有有效的信号结构
- 进程的停止状态是否表示"已继续"
stop_state < 0
:表示进程之前被停止,现在可以报告继续事件stop_state >= 0
:不符合报告条件
2.2.阶段2:加锁重新检查
spin_lock_irq(&p->sighand->siglock);
if (p->signal->stop_state >= 0) {spin_unlock_irq(&p->sighand->siglock);return 0;
}
双重检查:在持有锁的情况下重新检查状态,防止竞争条件。
2.3.阶段3:状态更新
if (!noreap)p->signal->stop_state = 0;
spin_unlock_irq(&p->sighand->siglock);
状态机更新:
- 如果不是
noreap
模式,将stop_state
重置为 0(表示"运行中") - 如果是
noreap
模式,保持状态不变,允许后续再次报告
2.4.阶段4:准备进程信息
pid = p->pid;
uid = p->uid;
get_task_struct(p);
read_unlock(&tasklist_lock);
安全准备:
- 保存进程ID和用户ID
- 增加引用计数防止进程被释放
- 释放任务列表锁
2.5.阶段5:信息报告(两个路径)
2.5.1.路径A:传统模式(无 siginfo
)
if (!infop) {retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;put_task_struct(p);if (!retval && stat_addr)retval = put_user(0xffff, stat_addr);if (!retval)retval = p->pid;
}
传统状态编码:0xffff
表示进程继续执行
// 用户空间解码:
if (status == 0xffff) {printf("进程已继续执行\n");
}
2.5.2.路径B:现代模式(有 siginfo
)
} else {retval = wait_noreap_copyout(p, pid, uid,CLD_CONTINUED, SIGCONT,infop, ru);BUG_ON(retval == 0);
}
现代报告:使用 siginfo
结构提供详细信息:
si_code = CLD_CONTINUED
(继续执行)si_status = SIGCONT
(继续信号)
十、wait()
系统调用的核心实现do_wait
/*
 * do_wait - common implementation behind the wait family of system calls.
 * @pid:       child selector, interpreted by eligible_child().
 * @options:   W* option bits (WNOHANG, WUNTRACED, WEXITED, WCONTINUED,
 *             WNOWAIT, __WNOTHREAD, ...).
 * @infop:     optional user siginfo; when set, a successful (positive)
 *             result is turned into 0 and a non-positive one clears the
 *             siginfo fields.
 * @stat_addr: optional user int for the encoded status.
 * @ru:        optional user rusage buffer.
 *
 * Returns the pid of the reported child (0 when @infop is set), 0 for a
 * WNOHANG call with nothing ready, -ERESTARTSYS when a signal is pending,
 * -ECHILD when no child could ever match, or an error from the helpers.
 *
 * The helpers release tasklist_lock whenever they return non-zero, which
 * is why those paths jump out without calling read_unlock().
 */
static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
		    int __user *stat_addr, struct rusage __user *ru)
{
	DECLARE_WAITQUEUE(wait, current);
	struct task_struct *tsk;
	int flag, retval;

	add_wait_queue(&current->wait_chldexit,&wait);
repeat:
	/*
	 * We will set this flag if we see any child that might later
	 * match our criteria, even if we are not able to reap it yet.
	 */
	flag = 0;
	current->state = TASK_INTERRUPTIBLE;
	read_lock(&tasklist_lock);
	tsk = current;
	do {
		struct task_struct *p;
		struct list_head *_p;
		int ret;

		list_for_each(_p,&tsk->children) {
			p = list_entry(_p,struct task_struct,sibling);

			ret = eligible_child(pid, options, p);
			if (!ret)
				continue;

			switch (p->state) {
			case TASK_TRACED:
				if (!my_ptrace_child(p))
					continue;
				/*FALLTHROUGH*/
			case TASK_STOPPED:
				/*
				 * It's stopped now, so it might later
				 * continue, exit, or stop again.
				 */
				flag = 1;
				if (!(options & WUNTRACED) &&
				    !my_ptrace_child(p))
					continue;
				retval = wait_task_stopped(p, ret == 2,
							   (options & WNOWAIT),
							   infop,
							   stat_addr, ru);
				if (retval == -EAGAIN)
					goto repeat;
				if (retval != 0) /* He released the lock.  */
					goto end;
				break;
			default:
				/* case EXIT_DEAD: */
				if (p->exit_state == EXIT_DEAD)
					continue;
				/* case EXIT_ZOMBIE: */
				if (p->exit_state == EXIT_ZOMBIE) {
					/*
					 * Eligible but we cannot release
					 * it yet:
					 */
					if (ret == 2)
						goto check_continued;
					if (!likely(options & WEXITED))
						continue;
					retval = wait_task_zombie(
						p, (options & WNOWAIT),
						infop, stat_addr, ru);
					/* He released the lock.  */
					if (retval != 0)
						goto end;
					break;
				}
check_continued:
				/*
				 * It's running now, so it might later
				 * exit, stop, or stop and then continue.
				 */
				flag = 1;
				if (!unlikely(options & WCONTINUED))
					continue;
				retval = wait_task_continued(
					p, (options & WNOWAIT),
					infop, stat_addr, ru);
				if (retval != 0) /* He released the lock.  */
					goto end;
				break;
			}
		}
		/*
		 * Nothing found among the children: an eligible task on the
		 * ptrace_children list still counts as something to wait for.
		 */
		if (!flag) {
			list_for_each(_p, &tsk->ptrace_children) {
				p = list_entry(_p, struct task_struct,
						ptrace_list);
				if (!eligible_child(pid, options, p))
					continue;
				flag = 1;
				break;
			}
		}
		/* __WNOTHREAD: look only at the calling thread's lists. */
		if (options & __WNOTHREAD)
			break;
		tsk = next_thread(tsk);
		if (tsk->signal != current->signal)
			BUG();
	} while (tsk != current);

	read_unlock(&tasklist_lock);
	if (flag) {
		retval = 0;
		if (options & WNOHANG)
			goto end;
		retval = -ERESTARTSYS;
		if (signal_pending(current))
			goto end;
		schedule();
		goto repeat;
	}
	retval = -ECHILD;
end:
	current->state = TASK_RUNNING;
	remove_wait_queue(&current->wait_chldexit,&wait);
	if (infop) {
		if (retval > 0)
			retval = 0;
		else {
			/*
			 * For a WNOHANG return, clear out all the fields
			 * we would set so the user can easily tell the
			 * difference.
			 */
			if (!retval)
				retval = put_user(0, &infop->si_signo);
			if (!retval)
				retval = put_user(0, &infop->si_errno);
			if (!retval)
				retval = put_user(0, &infop->si_code);
			if (!retval)
				retval = put_user(0, &infop->si_pid);
			if (!retval)
				retval = put_user(0, &infop->si_uid);
			if (!retval)
				retval = put_user(0, &infop->si_status);
		}
	}
	return retval;
}
1.函数定义
static long do_wait(pid_t pid, int options, struct siginfo __user *infop,int __user *stat_addr, struct rusage __user *ru)
作用:实现 wait()
, waitpid()
, wait4()
等系统调用的核心逻辑。
2.初始化阶段
1. 设置等待队列
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&current->wait_chldexit, &wait);
- 创建等待队列条目
- 将当前进程添加到
wait_chldexit
等待队列
2. 状态设置
current->state = TASK_INTERRUPTIBLE;
将当前进程设置为可中断睡眠状态,准备等待子进程状态变化。
3.主循环逻辑
3.1.外层循环结构
repeat:flag = 0;read_lock(&tasklist_lock);tsk = current;do {// 遍历子进程// 遍历被跟踪进程tsk = next_thread(tsk);} while (tsk != current);
4.进程遍历阶段
4.1. 遍历子进程链表
list_for_each(_p, &tsk->children) {p = list_entry(_p, struct task_struct, sibling);ret = eligible_child(pid, options, p);if (!ret) continue;
遍历当前进程的所有子进程,检查每个子进程是否符合等待条件。
4.2. 进程状态处理
根据子进程的不同状态分别处理:
4.2.3.情况A:被跟踪或停止的进程
case TASK_TRACED:if (!my_ptrace_child(p)) continue;/*FALLTHROUGH*/
case TASK_STOPPED:flag = 1;if (!(options & WUNTRACED) && !my_ptrace_child(p))continue;retval = wait_task_stopped(p, ret == 2, (options & WNOWAIT),infop, stat_addr, ru);
处理逻辑:
-
设置
flag = 1
表示找到相关进程 -
检查
WUNTRACED
选项是否允许返回停止的进程-
设置
WUNTRACED
:报告停止的进程和终止的进程 -
不设置
WUNTRACED
:只报告终止的进程
-
-
my_ptrace_child(p)
函数- 返回
1
:是当前进程的ptrace
子进程 - 返回
0
:不是当前进程的ptrace
子进程
- 返回
-
调用
wait_task_stopped()
处理停止状态
4.2.4.情况B:僵尸进程
case EXIT_ZOMBIE:if (ret == 2) goto check_continued; // 延迟的组领导if (!likely(options & WEXITED)) continue;retval = wait_task_zombie(p, (options & WNOWAIT),infop, stat_addr, ru);
处理逻辑:
- 检查
WEXITED
选项是否允许返回退出的进程 - 调用
wait_task_zombie()
处理僵尸进程
4.2.5.情况C:运行中的进程
check_continued:flag = 1;if (!unlikely(options & WCONTINUED)) continue;retval = wait_task_continued(p, (options & WNOWAIT),infop, stat_addr, ru);
处理逻辑:
- 设置
flag = 1
表示找到相关进程 - 检查
WCONTINUED
选项是否允许返回继续执行的进程
4.3. 遍历被跟踪进程
list_for_each(_p, &tsk->ptrace_children) {p = list_entry(_p, struct task_struct, ptrace_list);if (!eligible_child(pid, options, p)) continue;flag = 1;break;
}
检查当前进程通过 ptrace
跟踪的其他进程。
4.4. 线程组遍历
if (options & __WNOTHREAD) break;
tsk = next_thread(tsk);
如果不是 __WNOTHREAD
选项,遍历线程组中的所有线程。
__WNOTHREAD
标记表示只检查当前线程自己的子进程,不再遍历同一线程组中其他线程的子进程
5.等待逻辑
5.1.找到相关进程但无法立即返回
if (flag) {retval = 0;if (options & WNOHANG) goto end; // 非阻塞,立即返回retval = -ERESTARTSYS;if (signal_pending(current)) goto end; // 有信号待处理schedule(); // 让出CPU,等待子进程状态变化goto repeat; // 重新检查
}
5.2.没有找到相关进程
retval = -ECHILD; // 没有子进程
6.清理阶段
6.1. 恢复状态
end:current->state = TASK_RUNNING;remove_wait_queue(&current->wait_chldexit, &wait);
6.2. 处理返回值
if (infop) {if (retval > 0)retval = 0; // 成功找到进程else {// 清理 siginfo 结构,设置为0put_user(0, &infop->si_signo);put_user(0, &infop->si_errno);// ... 其他字段}
}
7.选项标志解析
7.1.options
参数:
选项 | 含义 |
---|---|
WNOHANG | 非阻塞,立即返回 |
WUNTRACED | 也返回停止的子进程 |
WCONTINUED | 也返回继续执行的子进程 |
WEXITED | 返回退出的子进程 |
WNOWAIT | 不收割进程,保持状态 |
__WNOTHREAD | 只检查当前线程,不检查整个线程组 |
8.返回值含义
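- > 0:找到的子进程PID(仅在 infop 为空时;传入 infop 时,成功的返回值会在 end 处被改写为 0)
- 0:设置了 WNOHANG 选项且没有立即可用的子进程
- -ECHILD:没有符合条件的子进程
- -ERESTARTSYS:被信号中断
- -EAGAIN:不会返回给调用者;它只是 wait_task_stopped 的内部返回值,do_wait 收到后会 goto repeat 重新遍历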
9.实际工作流程
开始等待
  ↓
遍历子进程和被跟踪进程
  ↓
找到可立即返回的进程? —— 是 → 返回进程信息
  ↓ 否
存在符合条件的相关进程(flag == 1)? —— 否 → 返回 -ECHILD
  ↓ 是
设置了 WNOHANG? —— 是 → 返回 0
  ↓ 否
有待处理的信号? —— 是 → 返回 -ERESTARTSYS
  ↓ 否
schedule() 睡眠,被唤醒后回到 repeat 重新遍历
十一、发起等待sys_waitid
/*
 * sys_waitid - entry point for the waitid system call.
 * @which:   P_ALL, P_PID or P_PGID; anything else is rejected.
 * @pid:     process or process-group id, must be > 0 for P_PID/P_PGID.
 * @infop:   user siginfo passed through to do_wait().
 * @options: must contain at least one of WEXITED/WSTOPPED/WCONTINUED and
 *           nothing outside WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED.
 * @ru:      optional user rusage buffer.
 *
 * Translates (which, pid) into the pid selector do_wait() expects:
 * -1 for P_ALL, pid for P_PID, -pid for P_PGID.  Returns -EINVAL for bad
 * arguments, otherwise the result of do_wait().
 */
asmlinkage long sys_waitid(int which, pid_t pid,
			   struct siginfo __user *infop, int options,
			   struct rusage __user *ru)
{
	long ret;

	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
		return -EINVAL;
	if ((options & (WEXITED|WSTOPPED|WCONTINUED)) == 0)
		return -EINVAL;

	if (which == P_ALL) {
		pid = -1;
	} else if (which == P_PID || which == P_PGID) {
		if (pid <= 0)
			return -EINVAL;
		if (which == P_PGID)
			pid = -pid;
	} else {
		return -EINVAL;
	}

	ret = do_wait(pid, options, infop, NULL, ru);

	/* avoid REGPARM breakage on x86: */
	prevent_tail_call(ret);
	return ret;
}
检查非法选项
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))return -EINVAL;
~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)
:创建掩码的反码options & ~mask
:检查是否有不在允许范围内的位被设置
如果有任何其他位被设置,返回 -EINVAL
(无效参数)。
检查必要选项
if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))return -EINVAL;
(WEXITED|WSTOPPED|WCONTINUED)
:等待的事件类型!(options & ...)
:检查是否没有设置任何事件类型
如果没有指定任何事件类型,返回 -EINVAL
。
下面的 switch 语句处理 waitid() 系统调用的 which 参数,把它转换成 do_wait 使用的 pid 取值,各分支含义如下:
switch 的作用
情况1:P_ALL
- 等待任何子进程
case P_ALL:pid = -1;break;
含义:等待当前进程的任何子进程,不考虑进程组。
情况2:P_PID
- 等待特定PID的进程
case P_PID:if (pid <= 0)return -EINVAL;break;
参数验证:
pid
必须大于0(有效的进程ID)- 如果
pid <= 0
,返回-EINVAL
(无效参数)
情况3:P_PGID
- 等待进程组的任何进程
case P_PGID:if (pid <= 0)return -EINVAL;pid = -pid;break;
参数转换:
- 验证
pid
必须大于0(有效的进程组ID) - 将
pid
转换为负数:pid = -pid
情况4:默认情况 - 无效参数
default:return -EINVAL;
错误处理:如果 which
不是上述任何值,返回 -EINVAL
。
参数常量定义
在用户空间头文件中:
#define P_ALL 0 // 任何子进程
#define P_PID 1 // 特定PID的进程
#define P_PGID 2 // 特定进程组的任何进程
/*
 * prevent_tail_call - empty inline asm that lists ret as both its output
 * ("=r") and, through the matching "0" constraint, its input.  The asm
 * emits no instructions; it only makes ret appear to be rewritten after
 * the preceding call, which is how sys_waitid uses it right before
 * returning ("avoid REGPARM breakage on x86").
 */
# define prevent_tail_call(ret) __asm__ ("" : "=r" (ret) : "0" (ret))
-
尾调用优化:当函数在返回前最后一步是调用另一个函数时,编译器可以优化为跳转而不是调用,从而节省栈空间。
-
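这个宏的作用:通过内联汇编"使用"返回值,让编译器认为返回值在返回前被修改了,从而无法进行尾调用优化。代码注释 "avoid REGPARM breakage on x86" 指出了真正的目的:asmlinkage 系统调用的参数位于入口代码保存的栈区域中,如果编译器把 do_wait 调用优化成尾调用,就可能复用并改写这些参数栈槽,破坏保存的用户态寄存器。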
-
保持完整的调用栈信息
-
便于调试和分析