Further Improvements to the Monitor That Captures Task D-State Timeout Events
1. Background
In an earlier post, https://zhaoxin1989.blog.csdn.net/article/details/151353093?spm=1001.2014.3001.5502, we introduced the sched_waking event and gave a detailed analysis of try_to_wake_up, the key kernel scheduling function behind it.
In this post we build on the D-state task timeout events and the waker/wakee stack capture described in https://zhaoxin1989.blog.csdn.net/article/details/145066795 and https://zhaoxin1989.blog.csdn.net/article/details/146482124, and make a further improvement that resolves a puzzling phenomenon seen when capturing D-state tasks: the "spurious wakeup" events described in https://zhaoxin1989.blog.csdn.net/article/details/151353093?spm=1001.2014.3001.5502, which made the captured events disagree with what actually happened.
Let us restate the symptom described in Section 2.1 of https://zhaoxin1989.blog.csdn.net/article/details/151353093?spm=1001.2014.3001.5502.
The figure below shows one D-state timeout event captured by the program:
As shown above, the waker, pid 13230, woke up the task with pid 13264, and that task's current->__state was reported as 258, i.e. TASK_KILLABLE.
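For reference, 258 is 0x102; with the task-state flag values used by recent kernels (include/linux/sched.h):

/* 0x0002 */ TASK_UNINTERRUPTIBLE
/* 0x0100 */ TASK_WAKEKILL
/* 0x0102 */ TASK_KILLABLE == (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) == 258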
However, the trace captured by the debug-triggered dump, viewed in perfetto, does not show this wakeup at 4427.402671841:
As shown above, during this period only two tasks, 13231 and 13233, were woken; task 13264 was not woken at all.
Yet if we locate this period in the raw trace file, the wakeup event is there:
Section 2.2 of the earlier post https://zhaoxin1989.blog.csdn.net/article/details/151353093?spm=1001.2014.3001.5502 explains the underlying mechanism in detail, so it is not repeated here. Chapter 2 gives the modified code, and Chapter 3 explains some of its details.
2. The further-improved code for capturing D-state timeout events
In this chapter we focus only on capturing D-state timeouts. Unlike the earlier posts https://zhaoxin1989.blog.csdn.net/article/details/145066795 and https://zhaoxin1989.blog.csdn.net/article/details/146482124, we do not capture the other D-related events, such as the moment a D state begins, the moment it ends, the iowait-related events and their timeouts, and so on.
The code below relies on some special modifications to the kernel that were made purely for temporary debugging convenience; the kernel changes, and the corresponding parts of the ko, could in fact be removed. The reason is explained in Section 2.2 below.
2.1 The ko code and how to use it
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/init.h>
#include <asm/atomic.h>
#include <trace/events/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/tracepoint.h>
#include <trace/events/osmonitor.h>
#include <trace/events/sched.h>
#include <trace/events/irq.h>
#include <trace/events/kmem.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/sched/task_stack.h>
#include <linux/nmi.h>
#include <linux/version.h>
#include <linux/sched/mm.h>
#include <asm/irq_regs.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/stop_machine.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("zhaoxin");
MODULE_DESCRIPTION("Module for monitor D tasks.");
MODULE_VERSION("1.0");

static unsigned long ns = 8000000ull;
static unsigned long sns = 0ull;
static unsigned long triggerns = 1000000000ull;

static int pid0 = 0;
module_param(pid0, int, S_IRUGO);
static int pid1 = 0;
module_param(pid1, int, S_IRUGO);
static int pid2 = 0;
module_param(pid2, int, S_IRUGO);
static int pid3 = 0;
module_param(pid3, int, S_IRUGO);
static int pid4 = 0;
module_param(pid4, int, S_IRUGO);

bool checkpid(int i_pid)
{
#if 0
	return true;
#else
	if (pid0 == 0) {
		return true;
	}
	if (i_pid == pid0) {
		return true;
	}
	if (pid1 && i_pid == pid1) {
		return true;
	}
	if (pid2 && i_pid == pid2) {
		return true;
	}
	if (pid3 && i_pid == pid3) {
		return true;
	}
	if (pid4 && i_pid == pid4) {
		return true;
	}
	return false;
#endif
}

module_param(ns, ulong, S_IRUGO);
MODULE_PARM_DESC(ns, "threshold nano second");
module_param(triggerns, ulong, S_IRUGO);
module_param(sns, ulong, S_IRUGO);

#define IODELAY_TRACEPOINT_ENABLE
#define TEST_STACK_TRACE_ENTRIES 32

typedef unsigned int (*stack_trace_save_tsk_func)(struct task_struct *task,
		unsigned long *store, unsigned int size, unsigned int skipnr);
stack_trace_save_tsk_func _stack_trace_save_tsk;

typedef int (*get_cmdline_func)(struct task_struct *task, char *buffer, int buflen);
get_cmdline_func _get_cmdline_func;

#define TESTDIOMONITOR_SAMPLEDESC_SWDSTART "swDstart"
#define TESTDIOMONITOR_SAMPLEDESC_WADSTOP "waDstop"
#define TESTDIOMONITOR_SAMPLEDESC_SWDIOSTART "swDiostart"
#define TESTDIOMONITOR_SAMPLEDESC_WADIOSTOP "waDiostop"
#define TESTDIOMONITOR_SAMPLEDESC_DEXCEED "Dexceed"
#define TESTDIOMONITOR_SAMPLEDESC_SDEXCEED "sDexceed"
#define TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED "Dioexceed"
#define TESTDIOMONITOR_SAMPLEDESC_IOEXCEED "Ioexceed"

#define TESTDIOMONITOR_SIMPLE_THRESHOLDNS (ns) //5000000ull
#define TESTDIOMONITOR_SIMPLE_THRESHOLDSWITCHNS (sns) //5000000ull// 1ms
//#define TESTDIOMONITOR_DEXCEED_THRESHOLD 1000ull//1000000ull

#ifdef CONFIG_UCLAMP_TASK
struct uclamp_bucket {unsigned long value : bits_per(SCHED_CAPACITY_SCALE);unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
};struct uclamp_rq {unsigned int value;struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
#endif/* CFS-related fields in a runqueue */
struct cfs_rq {struct load_weight load;unsigned int nr_running;unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */unsigned int idle_nr_running; /* SCHED_IDLE */unsigned int idle_h_nr_running; /* SCHED_IDLE */u64 exec_clock;u64 min_vruntime;
#ifdef CONFIG_SCHED_COREunsigned int forceidle_seq;u64 min_vruntime_fi;
#endif#ifndef CONFIG_64BITu64 min_vruntime_copy;
#endifstruct rb_root_cached tasks_timeline;/** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).*/struct sched_entity *curr;struct sched_entity *next;struct sched_entity *last;struct sched_entity *skip;#ifdef CONFIG_SCHED_DEBUGunsigned int nr_spread_over;
#endif#ifdef CONFIG_SMP/** CFS load tracking*/struct sched_avg avg;
#ifndef CONFIG_64BITu64 last_update_time_copy;
#endifstruct {raw_spinlock_t lock ____cacheline_aligned;int nr;unsigned long load_avg;unsigned long util_avg;unsigned long runnable_avg;} removed;#ifdef CONFIG_FAIR_GROUP_SCHEDunsigned long tg_load_avg_contrib;long propagate;long prop_runnable_sum;/** h_load = weight * f(tg)** Where f(tg) is the recursive weight fraction assigned to* this group.*/unsigned long h_load;u64 last_h_load_update;struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */#ifdef CONFIG_FAIR_GROUP_SCHEDstruct rq *rq; /* CPU runqueue to which this cfs_rq is attached *//** leaf cfs_rqs are those that hold tasks (lowest schedulable entity in* a hierarchy). Non-leaf lrqs hold other higher schedulable entities* (like users, containers etc.)** leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.* This list is used during load balance.*/int on_list;struct list_head leaf_cfs_rq_list;struct task_group *tg; /* group that "owns" this runqueue *//* Locally cached copy of our task_group's idle value */int idle;#ifdef CONFIG_CFS_BANDWIDTHint runtime_enabled;s64 runtime_remaining;u64 throttled_pelt_idle;
#ifndef CONFIG_64BITu64 throttled_pelt_idle_copy;
#endifu64 throttled_clock;u64 throttled_clock_pelt;u64 throttled_clock_pelt_time;int throttled;int throttle_count;struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};/** This is the priority-queue data structure of the RT scheduling class:*/
struct rt_prio_array {DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */struct list_head queue[MAX_RT_PRIO];
};/* Real-Time classes' related field in a runqueue: */
struct rt_rq {struct rt_prio_array active;unsigned int rt_nr_running;unsigned int rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHEDstruct {int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMPint next; /* next highest */
#endif} highest_prio;
#endif
#ifdef CONFIG_SMPunsigned int rt_nr_migratory;unsigned int rt_nr_total;int overloaded;struct plist_head pushable_tasks;#endif /* CONFIG_SMP */int rt_queued;int rt_throttled;u64 rt_time;u64 rt_runtime;/* Nests inside the rq lock: */raw_spinlock_t rt_runtime_lock;#ifdef CONFIG_RT_GROUP_SCHEDunsigned int rt_nr_boosted;struct rq *rq;struct task_group *tg;
#endif
};/* Deadline class' related fields in a runqueue */
struct dl_rq {/* runqueue is an rbtree, ordered by deadline */struct rb_root_cached root;unsigned int dl_nr_running;#ifdef CONFIG_SMP/** Deadline values of the currently executing and the* earliest ready task on this rq. Caching these facilitates* the decision whether or not a ready but not running task* should migrate somewhere else.*/struct {u64 curr;u64 next;} earliest_dl;unsigned int dl_nr_migratory;int overloaded;/** Tasks on this rq that can be pushed away. They are kept in* an rb-tree, ordered by tasks' deadlines, with caching* of the leftmost (earliest deadline) element.*/struct rb_root_cached pushable_dl_tasks_root;
#elsestruct dl_bw dl_bw;
#endif/** "Active utilization" for this runqueue: increased when a* task wakes up (becomes TASK_RUNNING) and decreased when a* task blocks*/u64 running_bw;/** Utilization of the tasks "assigned" to this runqueue (including* the tasks that are in runqueue and the tasks that executed on this* CPU and blocked). Increased when a task moves to this runqueue, and* decreased when the task moves away (migrates, changes scheduling* policy, or terminates).* This is needed to compute the "inactive utilization" for the* runqueue (inactive utilization = this_bw - running_bw).*/u64 this_bw;u64 extra_bw;/** Inverse of the fraction of CPU utilization that can be reclaimed* by the GRUB algorithm.*/u64 bw_ratio;
};/** This is the main, per-CPU runqueue data structure.** Locking rule: those places that want to lock multiple runqueues* (such as the load balancing or the thread migration code), lock* acquire operations must be ordered by ascending &runqueue.*/
struct rq {/* runqueue lock: */raw_spinlock_t __lock;/** nr_running and cpu_load should be in the same cacheline because* remote CPUs use both these fields when doing load calculation.*/unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCINGunsigned int nr_numa_running;unsigned int nr_preferred_running;unsigned int numa_migrate_on;
#endif
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMPunsigned long last_blocked_load_update_tick;unsigned int has_blocked_load;call_single_data_t nohz_csd;
#endif /* CONFIG_SMP */unsigned int nohz_tick_stopped;atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */#ifdef CONFIG_SMPunsigned int ttwu_pending;
#endifu64 nr_switches;#ifdef CONFIG_UCLAMP_TASK/* Utilization clamp values based on CPU's RUNNABLE tasks */struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;unsigned int uclamp_flags;
#define UCLAMP_FLAG_IDLE 0x01
#endifstruct cfs_rq cfs;struct rt_rq rt;struct dl_rq dl;#ifdef CONFIG_FAIR_GROUP_SCHED/* list of leaf cfs_rq on this CPU: */struct list_head leaf_cfs_rq_list;struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED *//** This is part of a global counter where only the total sum* over all CPUs matters. A task can increase this counter on* one CPU and if it got migrated afterwards it may decrease* it on another CPU. Always updated under the runqueue lock:*/unsigned int nr_uninterruptible;struct task_struct __rcu *curr;struct task_struct *idle;struct task_struct *stop;unsigned long next_balance;struct mm_struct *prev_mm;unsigned int clock_update_flags;u64 clock;/* Ensure that all clocks are in the same cache line */u64 clock_task ____cacheline_aligned;u64 clock_pelt;unsigned long lost_idle_time;u64 clock_pelt_idle;u64 clock_idle;
#ifndef CONFIG_64BITu64 clock_pelt_idle_copy;u64 clock_idle_copy;
#endifatomic_t nr_iowait;#ifdef CONFIG_SCHED_DEBUGu64 last_seen_need_resched_ns;int ticks_without_resched;
#endif#ifdef CONFIG_MEMBARRIERint membarrier_state;
#endif#ifdef CONFIG_SMPstruct root_domain *rd;struct sched_domain __rcu *sd;unsigned long cpu_capacity;unsigned long cpu_capacity_orig;struct balance_callback *balance_callback;unsigned char nohz_idle_balance;unsigned char idle_balance;unsigned long misfit_task_load;/* For active balancing */int active_balance;int push_cpu;struct cpu_stop_work active_balance_work;/* CPU of this runqueue: */int cpu;int online;struct list_head cfs_tasks;struct sched_avg avg_rt;struct sched_avg avg_dl;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQstruct sched_avg avg_irq;
#endif
#ifdef CONFIG_SCHED_THERMAL_PRESSUREstruct sched_avg avg_thermal;
#endifu64 idle_stamp;u64 avg_idle;unsigned long wake_stamp;u64 wake_avg_idle;/* This is used to determine avg_idle's max value */u64 max_idle_balance_cost;#ifdef CONFIG_HOTPLUG_CPUstruct rcuwait hotplug_wait;
#endif
#endif /* CONFIG_SMP */#ifdef CONFIG_IRQ_TIME_ACCOUNTINGu64 prev_irq_time;u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRTu64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTINGu64 prev_steal_time_rq;
#endif/* calc_load related fields */unsigned long calc_load_update;long calc_load_active;#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMPcall_single_data_t hrtick_csd;
#endifstruct hrtimer hrtick_timer;ktime_t hrtick_time;
#endif#ifdef CONFIG_SCHEDSTATS/* latency stats */struct sched_info rq_sched_info;unsigned long long rq_cpu_time;/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? *//* sys_sched_yield() stats */unsigned int yld_count;/* schedule() stats */unsigned int sched_count;unsigned int sched_goidle;/* try_to_wake_up() stats */unsigned int ttwu_count;unsigned int ttwu_local;
#endif#ifdef CONFIG_CPU_IDLE/* Must be inspected within a rcu lock section */struct cpuidle_state *idle_state;
#endif#ifdef CONFIG_SMPunsigned int nr_pinned;
#endifunsigned int push_busy;struct cpu_stop_work push_work;#ifdef CONFIG_SCHED_CORE/* per rq */struct rq *core;struct task_struct *core_pick;unsigned int core_enabled;unsigned int core_sched_seq;struct rb_root core_tree;/* shared state -- careful with sched_core_cpu_deactivate() */unsigned int core_task_seq;unsigned int core_pick_seq;unsigned long core_cookie;unsigned int core_forceidle_count;unsigned int core_forceidle_seq;unsigned int core_forceidle_occupation;u64 core_forceidle_start;
#endif
};

// runqueues (not export symbol)
struct rq* _prq = NULL;

struct rq* my_cpu_rq(int i_cpu)
{
	return per_cpu_ptr(_prq, i_cpu);
}

u64 my_rq_clock_task(void)
{
	struct rq* prq = my_cpu_rq(smp_processor_id());
	return prq->clock_task;
}#define TESTDIOMONITOR_FILE_MAXLEN 1024typedef struct testdmonitor_sample {struct timespec64 time;u64 timens;int cpu;int pid;int tgid;int ppid;char comm[TASK_COMM_LEN];char ppidcomm[TASK_COMM_LEN];// 0 or 1int bin_iowait;/** "swDstart" // 在sched_switch里* "waDstop" // 在sched_waking里* "swDiostart" // 在sched_switch里* "waDiostop" // 在sched_waking里* "Dexceed" // 超出阈值,非iowait* "Dioexceed" // 超出阈值,iowait*/const char* desc;u64 dtimens; // 纳秒单位,D状态持续的时间u64 iowaittimens; // 纳秒单位,等待io的时间int stackn;void* parray_stack[TEST_STACK_TRACE_ENTRIES];int wakercpu;int wakerpid;int wakertgid;int wakerppid;char wakercomm[TASK_COMM_LEN];char wakerppidcomm[TASK_COMM_LEN];int wakerstackn;void* parray_wakerstack[TEST_STACK_TRACE_ENTRIES];//char filepath[TESTDIOMONITOR_FILE_MAXLEN];u32 __state;u64 exec_start_begin;u64 exec_start_end;u64 local_clock_now;u64 clock_task_curr;u32 writedone; // 0 or 1
} testdmonitor_sample;#define TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT 8192*4typedef struct testdmonitor_sample_ringbuff {testdmonitor_sample* parray_sample;volatile u64 wp; // Index is wp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1).volatile u64 rp; // Index is rp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1).u32 skipcount; // 0 means no skip any abnormal event
} testdmonitor_sample_ringbuff;#define TESTDIOMONITOR_LINEBUFF 1024typedef struct testdmonitor_env {struct file* file;char file_linebuff[TESTDIOMONITOR_LINEBUFF];int headoffset;loff_t file_pos;testdmonitor_sample_ringbuff ringbuff;
} testdmonitor_env;static testdmonitor_env _env;static struct delayed_work work_write_file;
static struct workqueue_struct *wq_write_file;#define FILENAME "dlog.txt"void init_file(void)
{_env.file = filp_open(FILENAME, O_WRONLY | O_CREAT | O_TRUNC, 0644);if (IS_ERR(_env.file)) {_env.file = NULL;}
}void exit_file(void)
{if (_env.file) {filp_close(_env.file, NULL);}
}void testdmonitor_write_file(char* i_pchar, int i_size)
{if (_env.file) {kernel_write(_env.file, i_pchar, i_size, &_env.file_pos);}
}void testdmonitor_write_file_emptyline(void)
{testdmonitor_write_file("\n", strlen("\n"));
}void testdmonitor_file_oneline(const char* i_format, ...)
{char* pcontent = &_env.file_linebuff[_env.headoffset];va_list args;va_start(args, i_format);vsnprintf(pcontent, TESTDIOMONITOR_LINEBUFF - _env.headoffset, i_format, args);va_end(args);testdmonitor_write_file(_env.file_linebuff, strlen(_env.file_linebuff));
}void testdmonitor_replace_null_with_space(char *str, int n) {for (int i = 0; i < n - 1; i++) {if (str[i] == '\0') {str[i] = ' ';}}
}void testdmonitor_set_cmdline(char* i_pbuff, int i_buffsize, struct task_struct* i_ptask)
{int ret = _get_cmdline_func(i_ptask, i_pbuff, i_buffsize);if (ret <= 0) {i_pbuff[0] = '\0';return;}testdmonitor_replace_null_with_space(i_pbuff, ret);i_pbuff[ret - 1] = '\0';
}void testdmonitor_checkget_parentinfo_and_cmdline(testdmonitor_sample* io_psample, struct task_struct* i_ptask)
{struct task_struct* parent;rcu_read_lock();parent = rcu_dereference(i_ptask->real_parent);io_psample->ppid = parent->pid;strlcpy(io_psample->ppidcomm, parent->comm, TASK_COMM_LEN);rcu_read_unlock();
}void testdmonitor_checkget_parentinfo_and_cmdline_waker(testdmonitor_sample* io_psample, struct task_struct* i_ptask)
{struct task_struct* parent;rcu_read_lock();parent = rcu_dereference(i_ptask->real_parent);io_psample->wakerppid = parent->pid;strlcpy(io_psample->wakerppidcomm, parent->comm, TASK_COMM_LEN);rcu_read_unlock();
}#define TESTDIOMONITOR_COMMANDLINE_MAX 128int contains_ls(char *str) {const char *substr = "ls";size_t len = strlen(substr); // 获取子字符串的长度const char *p = str;while ((p = strchr(p, substr[0])) != NULL) { // 查找第一个字符 'l'if (strncmp(p, substr, len) == 0) { // 比较后续的字符return 1; // 找到了}p++; // 移动到下一个字符}return 0; // 没有找到
}static inline u64 gettscns(void)
{struct timespec64 ts;ktime_get_ts64(&ts);return timespec64_to_ns(&ts);
}void trigger(void)
{
#if 0//char *argv[] = {"/bin/bash", "-x", "/app/saturnv_sw/script/perfetto/one_shot_dump.sh", NULL};//char *argv[] = {"/bin/sh", "-c", "echo hello > /home/zhaoxin/code/git/los/code/test/testdiomonitor/b.txt", NULL};char *argv[] = {"/bin/bash", "-x", "/log/ftraceonly_stop.sh", NULL};char *envp[] ={"HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL};//mm_segment_t old_fs = get_fs();printk("testdmonitor run trigger begin\n");testdmonitor_file_oneline("trigger ftraceonly stop!\n");//set_fs(KERNEL_DS);call_usermodehelper(argv[0],argv,envp,UMH_WAIT_EXEC);//set_fs(old_fs);printk("testdmonitor run trigger end\n");
#endif
}u64 _lastdumptimens = 0;
u64 _gapns = 240000000000ull;void checktrigger(void)
{if (_lastdumptimens == 0) {_lastdumptimens = gettscns();goto trigger;}else {// u64 now = gettscns();// if (now > _lastdumptimens && now - _lastdumptimens > _gapns) {// _lastdumptimens = now;// goto trigger;// }return;}
trigger:trigger();
}static void write_file(struct work_struct *w)
{//ssize_t ret;u32 index;testdmonitor_sample* psample;struct tm t;char timestr[64];char exceedstr[64];char temp_commandline[TESTDIOMONITOR_COMMANDLINE_MAX];struct pid* pid_struct;struct task_struct* ptask;int stacki;while (_env.ringbuff.rp != _env.ringbuff.wp) {index = (_env.ringbuff.rp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1));psample = &_env.ringbuff.parray_sample[index];if (psample->writedone != 1) {break;}testdmonitor_write_file_emptyline();_env.headoffset = sprintf(_env.file_linebuff, "[%llu][%s] ", _env.ringbuff.rp, psample->desc);time64_to_tm(psample->time.tv_sec + 8 * 60 * 60, 0, &t);snprintf(timestr, 64, "%04ld-%02d-%02d-%02d_%02d_%02d.%09ld",1900 + t.tm_year, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec, psample->time.tv_nsec);if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DEXCEED) {snprintf(exceedstr, 64, "dtimens[%llu]", psample->dtimens);if (psample->dtimens > triggerns) {checktrigger();}}else if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_SDEXCEED) {snprintf(exceedstr, 64, "sdtimens[%llu]", psample->dtimens);}else if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED) {snprintf(exceedstr, 64, "iowaittimens[%llu]", psample->iowaittimens);}else if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_IOEXCEED) {snprintf(exceedstr, 64, "delayacct_iowaittimens[%llu]", psample->iowaittimens);}else {exceedstr[0] = '\0';}//if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED) {if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DEXCEED|| psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_SDEXCEED) {testdmonitor_file_oneline("[skipcount:%u]begin...time[%s][%llu]wakercpu[%d]desc[%s]%s\n", _env.ringbuff.skipcount, timestr, psample->timens, psample->wakercpu, psample->desc, "wakerDexceed");testdmonitor_file_oneline("wakertgid[%d]wakerpid[%d]wakercomm[%s]wakerppid[%d]wakerppidcomm[%s]\n",psample->wakertgid, psample->wakerpid, psample->wakercomm, psample->wakerppid, psample->wakerppidcomm);pid_struct = find_get_pid(psample->wakerpid);if (pid_struct) {ptask = get_pid_task(pid_struct, PIDTYPE_PID);if (ptask) {testdmonitor_set_cmdline(temp_commandline, TESTDIOMONITOR_COMMANDLINE_MAX, ptask);put_task_struct(ptask);}else {temp_commandline[0] = '\0';}put_pid(pid_struct);}else {temp_commandline[0] = '\0';}testdmonitor_file_oneline("wakercommandline[%s]\n", temp_commandline);pid_struct = find_get_pid(psample->wakerppid);if (pid_struct) {ptask = get_pid_task(pid_struct, PIDTYPE_PID);if (ptask) {testdmonitor_set_cmdline(temp_commandline, TESTDIOMONITOR_COMMANDLINE_MAX, ptask);put_task_struct(ptask);}else {temp_commandline[0] = '\0';}put_pid(pid_struct);}else {temp_commandline[0] = '\0';}testdmonitor_file_oneline("wakerppid_commandline[%s]\n", temp_commandline);testdmonitor_file_oneline("stack[%d]:\n", psample->wakerstackn);for (stacki = 0; stacki < psample->wakerstackn; stacki++) {testdmonitor_file_oneline("%*c%pS\n", 5, ' ', (void *)psample->parray_wakerstack[stacki]);}testdmonitor_file_oneline("cpu[%d]desc[%s]%s\n", psample->cpu, psample->desc, exceedstr);}else {testdmonitor_file_oneline("begin...time[%s]cpu[%d]desc[%s]%s\n", timestr, psample->cpu, psample->desc, exceedstr);}testdmonitor_file_oneline("tgid[%d]pid[%d]comm[%s]ppid[%d]ppidcomm[%s]\n",psample->tgid, psample->pid, psample->comm, psample->ppid, psample->ppidcomm);{const char *desc = "NA";if (psample->__state == TASK_UNINTERRUPTIBLE) {desc = "D";}else if (psample->__state == TASK_KILLABLE) {desc = "K";}else if (psample->__state == 
TASK_RTLOCK_WAIT) {desc = "RTLOCK";}if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_SDEXCEED) {testdmonitor_file_oneline("iniowait[%u]__state[%u][%s]exec_start_begin[%llu]exec_start_end[%llu]wakingns[%llu]delayns[%llu]\n",psample->bin_iowait ? 1 : 0, psample->__state, desc, psample->exec_start_begin, psample->exec_start_end, psample->local_clock_now, psample->clock_task_curr);}else {testdmonitor_file_oneline("iniowait[%u]__state[%u][%s]exec_start_begin[%llu]exec_start_end[%llu]local_clock[%llu]clock_task_curr[%llu]\n",psample->bin_iowait ? 1 : 0, psample->__state, desc, psample->exec_start_begin, psample->exec_start_end, psample->local_clock_now, psample->clock_task_curr);}}pid_struct = find_get_pid(psample->pid);if (pid_struct) {ptask = get_pid_task(pid_struct, PIDTYPE_PID);if (ptask) {testdmonitor_set_cmdline(temp_commandline, TESTDIOMONITOR_COMMANDLINE_MAX, ptask);put_task_struct(ptask);}else {temp_commandline[0] = '\0';}put_pid(pid_struct);}else {temp_commandline[0] = '\0';}testdmonitor_file_oneline("commandline[%s]\n", temp_commandline);pid_struct = find_get_pid(psample->ppid);if (pid_struct) {ptask = get_pid_task(pid_struct, PIDTYPE_PID);if (ptask) {testdmonitor_set_cmdline(temp_commandline, TESTDIOMONITOR_COMMANDLINE_MAX, ptask);put_task_struct(ptask);}else {temp_commandline[0] = '\0';}put_pid(pid_struct);}else {temp_commandline[0] = '\0';}testdmonitor_file_oneline("ppid_commandline[%s]\n", temp_commandline);//testdmonitor_file_oneline("filepath[%s]\n", psample->filepath);testdmonitor_file_oneline("stack[%d]:\n", psample->stackn);for (stacki = 0; stacki < psample->stackn; stacki++) {testdmonitor_file_oneline("%*c%pS\n", 5, ' ', (void *)psample->parray_stack[stacki]);}testdmonitor_write_file_emptyline();smp_wmb();psample->writedone = 0;_env.ringbuff.rp ++;}queue_delayed_work_on(nr_cpu_ids - 1, wq_write_file,&work_write_file, 1);
}static void init_write_file(void)
{init_file();wq_write_file = alloc_workqueue("testdmonitor_write_file", WQ_MEM_RECLAIM, 0);INIT_DELAYED_WORK(&work_write_file, write_file);queue_delayed_work_on(nr_cpu_ids - 1, wq_write_file,&work_write_file, 3);
}static void exit_write_file(void)
{cancel_delayed_work_sync(&work_write_file);destroy_workqueue(wq_write_file);exit_file();
}void init_testdmonitor_sample_ringbuff(void)
{//testdmonitor_sample* psample;_env.ringbuff.parray_sample = kvzalloc(sizeof(testdmonitor_sample) * TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT, GFP_KERNEL);
}void exit_testdmonitor_sample_ringbuff(void)
{kvfree(_env.ringbuff.parray_sample);
}testdmonitor_sample* testdmonitor_get_psample(void)
{u64 windex_raw, windex_raw_old;u32 windex;while (1) {windex_raw = _env.ringbuff.wp;if (windex_raw - _env.ringbuff.rp >= (u64)(TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT)) {_env.ringbuff.skipcount ++;return NULL;}// atomic_cmpxchg return old valuewindex_raw_old = atomic64_cmpxchg((atomic64_t*)&_env.ringbuff.wp,windex_raw, windex_raw + 1);if (windex_raw_old == windex_raw) {break;}}windex = (u32)(windex_raw & (u64)(TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1));return &_env.ringbuff.parray_sample[windex];
}//static u64 _magic_number = 0xABCDEFull;//void* _dl_sched_class = NULL;int get_file_dir_by_folio(struct folio *i_fo, char* i_path, int i_len);// static inline u64 gettscns(void)
// {
// struct timespec64 ts;
// ktime_get_ts64(&ts);
// return timespec64_to_ns(&ts);
// }void testdmonitor_add_sample(const char* i_desc, struct task_struct* i_task, u64 i_timens, u32 i_state, u64 i_exec_start_begin, u64 i_exec_start_end, u64 i_local_clock, u64 i_clock_task_curr)
{testdmonitor_sample* psample = testdmonitor_get_psample();if (!psample) {return;}ktime_get_real_ts64(&psample->time);psample->timens = gettscns();psample->cpu = task_cpu(i_task);psample->pid = i_task->pid;psample->tgid = i_task->tgid;strlcpy(psample->comm, i_task->comm, TASK_COMM_LEN);testdmonitor_checkget_parentinfo_and_cmdline(psample, i_task);psample->bin_iowait = i_task->in_iowait;psample->desc = i_desc;if (i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DEXCEED || i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_SDEXCEED) {psample->dtimens = i_timens;}else if (i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED || i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_IOEXCEED) {psample->iowaittimens = i_timens;}psample->stackn = _stack_trace_save_tsk(i_task, (unsigned long*)psample->parray_stack, TEST_STACK_TRACE_ENTRIES, 0);//if (i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED) {if (i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DEXCEED || i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_SDEXCEED) {psample->__state = i_state;psample->exec_start_begin = i_exec_start_begin;psample->exec_start_end = i_exec_start_end;psample->local_clock_now = i_local_clock;psample->clock_task_curr = i_clock_task_curr;psample->wakercpu = smp_processor_id();psample->wakerpid = current->pid;psample->wakertgid = current->tgid;strlcpy(psample->wakercomm, current->comm, TASK_COMM_LEN);testdmonitor_checkget_parentinfo_and_cmdline_waker(psample, current);psample->wakerstackn = _stack_trace_save_tsk(current, (unsigned long*)psample->parray_wakerstack, TEST_STACK_TRACE_ENTRIES, 0);// psample->filepath[0] = '\0';// if ((void*)i_task->sched_class != (void*)&_dl_sched_class) {// if (i_task->dl.dl_runtime == _magic_number) {// //if (sched_clock() - i_task->dl.dl_deadline >= TESTDIOMONITOR_SIMPLE_THRESHOLDNS) // {// //printk("__folio_lock_killable wait %llu ns\n", sched_clock() - current->dl.dl_deadline);// //dump_stack();// if (get_file_dir_by_folio((struct folio*)i_task->dl.dl_period, psample->filepath, TESTDIOMONITOR_FILE_MAXLEN) < 0) {// //printk("get_file_dir_by_folio fail!\n");// }// }// current->dl.dl_runtime = 0;// }// }}smp_wmb();psample->writedone = 1;
}

bool checkisd(u32 i_state)
{
	if ((i_state & TASK_UNINTERRUPTIBLE) == TASK_UNINTERRUPTIBLE) {
		return true;
	}
	if (i_state == TASK_RTLOCK_WAIT) {
		return true;
	}
	return false;
}

static void cb_sched_switch(void *i_data, bool i_preempt,
		struct task_struct *i_prev, struct task_struct *i_next,
		unsigned int i_prev_state)
{
	u64 currns = gettscns();

	/* Record when (and in what state) prev is switched out. */
	i_prev->lastswitchns = currns;
	i_prev->lastswitchstate = i_prev_state;
	if (!checkisd(i_prev_state)) {
		i_prev->lastswitchns = 0;
	}

	/* Optionally report the D time including scheduling latency (sns != 0). */
	if (!checkisd(i_next->lastswitchstate)) {
		return;
	}
	if (TESTDIOMONITOR_SIMPLE_THRESHOLDSWITCHNS == 0) {
		return;
	}
	if (i_next->lastswitchns && currns > i_next->lastswitchns
			&& currns - i_next->lastswitchns >= TESTDIOMONITOR_SIMPLE_THRESHOLDSWITCHNS) {
		testdmonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_SDEXCEED, i_next,
				currns - i_next->lastswitchns, i_next->lastswitchstate,
				i_next->lastswitchns, currns, i_next->wakingns,
				currns - i_next->wakingns);
	}
}

static void cb_sched_waking(void *i_data, struct task_struct *i_p)
{
	//struct rq* prq = my_cpu_rq(task_cpu(i_p));
	struct rq* prq_curr = my_cpu_rq(smp_processor_id());
	//u64 currns = prq->clock_task;
	u64 currns = gettscns();
	u64 currns_curr = prq_curr->clock_task;
	//u64 local_c = local_clock();
	//u64 exec_start = i_p->se.exec_start;
	u64 exec_start = i_p->lastswitchns;
	//int cpuid = smp_processor_id();

	/* Filter out spurious wakeups: the task is still on a CPU or already queued. */
	if (i_p->on_cpu || i_p->on_rq == 1) {
		return;
	}
	i_p->wakingns = currns;
	if (!checkisd(i_p->__state)) {
		return;
	}
	if (!checkpid(i_p->tgid)) {
		return;
	}
	if (exec_start == 0) {
		return;
	}
	if (currns > exec_start && currns - exec_start >= TESTDIOMONITOR_SIMPLE_THRESHOLDNS) {
		testdmonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_DEXCEED, i_p,
				currns - exec_start, i_p->__state, exec_start, currns,
				local_clock(), currns_curr);
	}
}

struct kern_tracepoint {
	void *callback;
	struct tracepoint *ptr;
	bool bregister;
};
static void clear_kern_tracepoint(struct kern_tracepoint *tp)
{
	if (tp->bregister) {
		tracepoint_probe_unregister(tp->ptr, tp->callback, NULL);
	}
}

#define INIT_KERN_TRACEPOINT(tracepoint_name) \
	static struct kern_tracepoint mykern_##tracepoint_name = {.callback = NULL, .ptr = NULL, .bregister = false};

#define TRACEPOINT_CHECK_AND_SET(tracepoint_name) \
static void tracepoint_name##_tracepoint_check_and_set(struct tracepoint *tp, void *priv) \
{ \
	if (!strcmp(#tracepoint_name, tp->name)) { \
		((struct kern_tracepoint *)priv)->ptr = tp; \
		return; \
	} \
}

INIT_KERN_TRACEPOINT(sched_switch)
TRACEPOINT_CHECK_AND_SET(sched_switch)
INIT_KERN_TRACEPOINT(sched_waking)
TRACEPOINT_CHECK_AND_SET(sched_waking)

typedef unsigned long (*kallsyms_lookup_name_func)(const char *name);
kallsyms_lookup_name_func _kallsyms_lookup_name_func;void* get_func_by_symbol_name_kallsyms_lookup_name(void)
{int ret;void* pfunc = NULL;struct kprobe kp;memset(&kp, 0, sizeof(kp));kp.symbol_name = "kallsyms_lookup_name";kp.pre_handler = NULL;kp.addr = NULL; // 作为强调,提示使用symbol_nameret = register_kprobe(&kp);if (ret < 0) {printk("register_kprobe fail!\n");return NULL;}printk("register_kprobe succeed!\n");pfunc = (void*)kp.addr;unregister_kprobe(&kp);return pfunc;
}void* get_func_by_symbol_name(const char* i_symbol)
{if (_kallsyms_lookup_name_func == NULL) {return NULL;}return (void*)_kallsyms_lookup_name_func(i_symbol);
}enum behavior {EXCLUSIVE, /* Hold ref to page and take the bit when woken, like* __folio_lock() waiting on then setting PG_locked.*/SHARED, /* Hold ref to page and check the bit when woken, like* folio_wait_writeback() waiting on PG_writeback.*/DROP, /* Drop ref to page before wait, no check when woken,* like folio_put_wait_locked() on PG_locked.*/
};struct kprobe _kp1;//extern void* get_dl_sched_class_pointer(void);
extern struct rq* get_runqueues(void);static int __init testdmonitor_init(void)
{//trigger();if (pid0 == 0) {printk(KERN_INFO "pid0=0! NOT filter pid\n");}printk(KERN_INFO "ns=%lu\n", ns);//printk("offset of mmap_lock in mm_struct [%d]\n", offsetof(struct mm_struct, mmap_lock));_kallsyms_lookup_name_func = get_func_by_symbol_name_kallsyms_lookup_name();// _dl_sched_class = (void*)_kallsyms_lookup_name_func("dl_sched_class");// if (_dl_sched_class == NULL) {// printk(KERN_ERR "get_func_by_symbol_name _dl_sched_class failed!\n");// return -1;// }//_dl_sched_class = get_dl_sched_class_pointer();// _prq = get_func_by_symbol_name("runqueues");// if (_prq == NULL) {// printk(KERN_ERR "get_func_by_symbol_name runqueues failed!\n");// return -1;// }_prq = get_runqueues();init_testdmonitor_sample_ringbuff();init_write_file();_stack_trace_save_tsk = get_func_by_symbol_name("stack_trace_save_tsk");if (_stack_trace_save_tsk == NULL) {printk(KERN_ERR "get_func_by_symbol_name stack_trace_save_tsk failed!\n");return -1;}_get_cmdline_func = get_func_by_symbol_name("get_cmdline");if (_get_cmdline_func == NULL) {printk(KERN_ERR "get_func_by_symbol_name get_cmdline failed!\n");return -1;}mykern_sched_switch.callback = cb_sched_switch;for_each_kernel_tracepoint(sched_switch_tracepoint_check_and_set, &mykern_sched_switch);if (!mykern_sched_switch.ptr) {printk(KERN_ERR "mykern_sched_switch register failed!\n");return -1;}else {printk(KERN_INFO "mykern_sched_switch register succeeded!\n");}tracepoint_probe_register(mykern_sched_switch.ptr, mykern_sched_switch.callback, NULL);mykern_sched_switch.bregister = 1;mykern_sched_waking.callback = cb_sched_waking;for_each_kernel_tracepoint(sched_waking_tracepoint_check_and_set, &mykern_sched_waking);if (!mykern_sched_waking.ptr) {printk(KERN_ERR "mykern_sched_waking register failed!\n");return -1;}else {printk(KERN_INFO "mykern_sched_waking register succeeded!\n");}tracepoint_probe_register(mykern_sched_waking.ptr, mykern_sched_waking.callback, NULL);mykern_sched_waking.bregister = 1;return 0;
}

static void __exit testdmonitor_exit(void)
{
	clear_kern_tracepoint(&mykern_sched_waking);
	clear_kern_tracepoint(&mykern_sched_switch);
	tracepoint_synchronize_unregister();
	exit_write_file();
	exit_testdmonitor_sample_ringbuff();
}

module_init(testdmonitor_init);
module_exit(testdmonitor_exit);
The ko code above is based on the code from the earlier post https://zhaoxin1989.blog.csdn.net/article/details/146482124, with the logic for events other than D timeouts removed and with the problem described in Chapter 1 fixed.
The ko exposes several parameters. ns is the timeout threshold, in nanoseconds:
If sns is non-zero, the interval from sched_waking until the task is actually switched in is also measured, i.e. from the D state until the task actually runs on a CPU (switch in); timeouts of this interval are recorded as well, which means scheduling latency is included:
triggerns is the timeout threshold above which an event is triggered that can wake a user-space program, which is handy for capture scripts or other user-space tools.
If pid0 is 0, D-timeout events are captured for all pids; if pid0 is non-zero, only the pids set to non-zero values are monitored, currently at most 5 processes.
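As a hypothetical usage example (the .ko file name is an assumption), the following loads the module with the default 8 ms threshold, a 1 s trigger threshold, and monitoring restricted to two pids:

insmod testdiomonitor.ko ns=8000000 triggerns=1000000000 pid0=13230 pid1=13264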
The captured output looks like this:
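The screenshot is not reproduced here; based on the format strings in write_file above, a Dexceed record in dlog.txt has roughly the following shape (angle-bracket tokens are placeholders, and every line carries the "[sequence][desc]" prefix):

[<seq>][Dexceed] [skipcount:0]begin...time[<yyyy-mm-dd-hh_mm_ss.ns>][<timens>]wakercpu[<cpu>]desc[Dexceed]wakerDexceed
[<seq>][Dexceed] wakertgid[<tgid>]wakerpid[<pid>]wakercomm[<comm>]wakerppid[<ppid>]wakerppidcomm[<comm>]
[<seq>][Dexceed] wakercommandline[<cmdline>]
[<seq>][Dexceed] wakerppid_commandline[<cmdline>]
[<seq>][Dexceed] stack[<n>]:
[<seq>][Dexceed]      <waker stack, one %pS frame per line>
[<seq>][Dexceed] cpu[<cpu>]desc[Dexceed]dtimens[<ns spent in D>]
[<seq>][Dexceed] tgid[<tgid>]pid[<pid>]comm[<comm>]ppid[<ppid>]ppidcomm[<comm>]
[<seq>][Dexceed] iniowait[0]__state[<state>][D]exec_start_begin[<ns>]exec_start_end[<ns>]local_clock[<ns>]clock_task_curr[<ns>]
[<seq>][Dexceed] commandline[<cmdline>]
[<seq>][Dexceed] ppid_commandline[<cmdline>]
[<seq>][Dexceed] stack[<n>]:
[<seq>][Dexceed]      <stack of the task that was in D, one %pS frame per line>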
The code above contains a number of details already covered in the earlier posts, which are not repeated here.
2.2 Related kernel changes
Under some kernel configurations/versions certain symbols cannot be looked up, for example the runqueues symbol, while on other systems they can be:
Another example is dl_sched_class (on the system in the figure below, the symbol can be obtained via kallsyms_lookup_name):
But on some systems that symbol cannot be obtained.
To keep things uniform, the function shown below was added directly to fair.c:
In addition, the three fields shown below were added to task_struct:
This makes the monitoring logic easier to write, removes the skew caused by inconsistent time bases, and records some state to simplify the checks. From a debugging point of view there is not much difference between adding one field and adding three.
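The figures are not reproduced here, but the extern declaration in the ko (get_runqueues) and the fields the callbacks use (lastswitchns, lastswitchstate, wakingns) imply kernel-side changes roughly like the following sketch; the exact placement and types in the actual patch are assumptions:

/* kernel/sched/fair.c: hand the per-cpu runqueues base to the ko, which
 * indexes it with per_cpu_ptr(); useful on configurations where the
 * "runqueues" symbol cannot be looked up from the module. */
struct rq *get_runqueues(void)
{
	return &runqueues;
}
EXPORT_SYMBOL(get_runqueues);

/* include/linux/sched.h, inside struct task_struct: */
u64          lastswitchns;    /* time of the last switch-out while in a D-like state */
unsigned int lastswitchstate; /* __state recorded at that switch-out */
u64          wakingns;        /* time recorded in the sched_waking callback */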
3. Notes on some code details
3.1 The added lastswitchns field
The lastswitchns field added to task_struct aligns the time base: when a task is switched out, the switch-out time is written into this field. We then do not have to rely on existing task_struct fields such as se.exec_start and worry that, for non-CFS scheduling classes, they are not updated promptly, which would skew the computed D time.
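Condensed from cb_sched_switch and cb_sched_waking above, the time base works as follows:

/* sched_switch: stamp the switch-out time, but keep it only for D-like states */
i_prev->lastswitchns    = gettscns();
i_prev->lastswitchstate = i_prev_state;
if (!checkisd(i_prev_state))
	i_prev->lastswitchns = 0;

/* sched_waking: both ends of the interval come from the same gettscns() clock */
dtimens = gettscns() - i_p->lastswitchns;   /* time spent in the D state */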
Note that after the system has booted, if the ko is insmod-ed, rmmod-ed and then insmod-ed again, some task_structs will still hold an old lastswitchns value: while the module is unloaded, the field is no longer updated at switch-out, so after re-loading the computed D time can come out absurdly large.
3.2 Using the call_usermodehelper mechanism to launch a user-space program
When events are flushed to disk in write_file, once a sample exceeds the threshold (triggerns) the checktrigger logic runs:
checktrigger calls trigger, and trigger uses call_usermodehelper to launch a user-space program.
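In the listing above the body of trigger() is compiled out with #if 0; when enabled it boils down to these lines (the script path is simply the example used in the listing):

char *argv[] = { "/bin/bash", "-x", "/log/ftraceonly_stop.sh", NULL };
char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
/* UMH_WAIT_EXEC: return as soon as the helper process has been started */
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);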
3.3 Filtering spurious wakeups in the sched_waking handler via on_cpu and on_rq
A task with on_cpu == 1 is currently running on a CPU; a task with on_rq == 1 is already on a runqueue or on a CPU.
For more details on on_rq and on_cpu, see Section 2.2.2 of the earlier post https://zhaoxin1989.blog.csdn.net/article/details/151353093?spm=1001.2014.3001.5502.
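Condensed, the filter at the top of cb_sched_waking above is simply:

/* A wakeup aimed at a task that is still on a CPU, or already queued on a
 * runqueue, cannot be the end of a real D-state sleep, so ignore it. */
if (i_p->on_cpu || i_p->on_rq == 1)
	return;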