Kprobe机制是内核提供的一种调试机制,它提供了一种方法,能够在不修改现有代码的基础上,灵活地跟踪内核函数的执行。它的基本工作原理是:用户指定一个探测点,并把一个用户定义的处理函数关联到该探测点,当内核执行到该探测点时,相应的关联函数被执行,然后继续执行正常的代码路径。

Kprobe提供了三种形式的探测点,一种是最基本的kprobe,能够在指定代码执行前、执行后进行探测,但此时不能访问被探测函数内的相关变量信息;一种是jprobe,用于探测某一函数的入口,并且能够访问对应的函数参数;一种是kretprobe,用于完成指定函数返回值的探测功能。其中最基本的就是kprobe机制,jprobe以及kretprobe的实现都依赖于kprobe,但其代码的实现都很巧妙,强烈建议每一个内核爱好者阅读。
好了,闲话少叙,开始上代码:
首先是struct kprobe结构,每一个探测点的基本结构

点击(此处)折叠或打开

  1. struct kprobe {

  2. /*用于保存kprobe的全局hash表,以被探测的addr为key*/

  3. struct hlist_node hlist;


  4. /* list of kprobes for multi-handler support */

  5. /*当对同一个探测点存在多个探测函数时,所有的函数挂在这条链上*/

  6. struct list_head list;


  7. /*count the number of times this probe was temporarily disarmed */

  8. unsigned long nmissed;


  9. /* location of the probe point */

  10. /*被探测的目标地址*/

  11. kprobe_opcode_t *addr;


  12. /* Allow user to indicate symbol name of the probe point */

  13. /*symbol_name的存在,允许用户指定函数名而非确定的地址*/

  14. const char *symbol_name;


  15. /* Offset into the symbol */

  16. /*如果被探测点为函数内部某个指令,需要使用addr + offset的方式*/

  17. unsigned int offset;


  18. /* Called before addr is executed. */

  19. /*探测函数,在目标探测点执行之前调用*/

  20. kprobe_pre_handler_t pre_handler;


  21. /* Called after addr is executed, unless... */

  22. /*探测函数,在目标探测点执行之后调用*/

  23. kprobe_post_handler_t post_handler;


  24. /*

  25. * ... called if executing addr causes a fault (eg. page fault).

  26. * Return 1 if it handled fault, otherwise kernel will see it.

  27. */

  28. kprobe_fault_handler_t fault_handler;


  29. /*

  30. * ... called if breakpoint trap occurs in probe handler.

  31. * Return 1 if it handled break, otherwise kernel will see it.

  32. */

  33. kprobe_break_handler_t break_handler;


  34. /*opcode 以及 ainsn 用于保存被替换的指令码*/


  35. /* Saved opcode (which has been replaced with breakpoint) */

  36. kprobe_opcode_t opcode;


  37. /* copy of the original instruction */

  38. struct arch_specific_insn ainsn;


  39. /*

  40. * Indicates various status flags.

  41. * Protected by kprobe_mutex after this kprobe is registered.

  42. */

  43. u32 flags;

  44. };

对于kprobe功能的实现主要利用了内核中的两个功能特性:异常(尤其是int 3),单步执行(EFLAGS中的TF标志)。
大概的流程:
1)在注册探测点的时候,对被探测函数的指令码进行替换,替换为int 3的指令码;
2)在int 3的异常处理过程中,通过通知链的方式调用kprobe的异常处理函数;
3)在kprobe的异常处理函数中,判断是否存在pre_handler钩子,存在则执行;
4)执行完后,准备进入单步调试:设置EFLAGS中的TF标志位,并且把异常返回的地址修改为保存原指令码副本的地址;
5)代码返回,执行原有指令,执行结束后触发单步异常;
6)在单步异常的处理中,清除单步标志,执行post_handler流程,并最终返回;

下面又进入代码时间,首先看一下kprobe模块的初始化代码,初始化代码主要做了两件事:标记出哪些代码是不能被探测的,这些代码属于kprobe实现的关键代码;注册通知链到die_notifier,用于接收异常通知。

点击(此处)折叠或打开

  1. 初始化代码位于kernel/kprobes.c中

  2. static int __init init_kprobes(void)

  3. {

  4. int i, err = 0;

  5. ....


  6. /*kprobe_blacklist中保存的是kprobe实现的关键代码路径,这些函数不应该被kprobe探测*/

  7. /*

  8. * Lookup and populate the kprobe_blacklist.

  9. *

  10. * Unlike the kretprobe blacklist, we'll need to determine

  11. * the range of addresses that belong to the said functions,

  12. * since a kprobe need not necessarily be at the beginning

  13. * of a function.

  14. */

  15. for(kb = kprobe_blacklist; kb->name!=NULL; kb++){

  16. kprobe_lookup_name(kb->name, addr);

  17. if(!addr)

  18. continue;


  19. kb->start_addr =(unsigned long)addr;

  20. symbol_name = kallsyms_lookup(kb->start_addr,

  21. &size,&offset,&modname, namebuf);

  22. if(!symbol_name)

  23. kb->range = 0;

  24. else

  25. kb->range =size;

  26. }

  27. ....

  28. if(!err)

  29. /*注册通知链到die_notifier,用于接收int 3的异常信息*/

  30. err = register_die_notifier(&kprobe_exceptions_nb);

  31. ....

  32. }

  33. 其中的通知链:

  34. static struct notifier_block kprobe_exceptions_nb ={

  35. .notifier_call = kprobe_exceptions_notify,

  36. /*优先级最高,保证最先执行*/

  37. .priority = 0x7fffffff /* we need to be notified first */

  38. };

kprobe的注册流程register_kprobe。

点击(此处)折叠或打开

  1. int __kprobes register_kprobe(struct kprobe *p)

  2. {

  3. int ret = 0;

  4. struct kprobe *old_p;

  5. struct module *probed_mod;

  6. kprobe_opcode_t *addr;


  7. /*获取被探测点的地址,指定了symbol_name,则从kallsyms中获取;指定了offset,则返回addr + offset*/

  8. addr = kprobe_addr(p);

  9. if(!addr)

  10. return-EINVAL;

  11. p->addr = addr;


  12. /*判断同一个kprobe是否被重复注册*/

  13. ret = check_kprobe_rereg(p);

  14. if(ret)

  15. return ret;


  16. jump_label_lock();

  17. preempt_disable();

  18. /*判断被注册的函数是否位于内核的代码段内,或位于不能探测的kprobe实现路径中*/

  19. if(!kernel_text_address((unsigned long) p->addr)||

  20. in_kprobes_functions((unsigned long) p->addr)||

  21. ftrace_text_reserved(p->addr, p->addr)||

  22. jump_label_text_reserved(p->addr, p->addr))

  23. goto fail_with_jump_label;


  24. /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */

  25. p->flags&= KPROBE_FLAG_DISABLED;


  26. /*

  27. * Check if are we probing a module.

  28. */

  29. /*判断被探测的地址是否属于某一个模块,并且位于模块的text section内*/

  30. probed_mod = __module_text_address((unsigned long) p->addr);

  31. if(probed_mod){

  32. /*如果被探测的为模块地址,首先要增加模块的引用计数*/

  33. /*

  34. * We must hold a refcount of the probed module while updating

  35. * its code to prohibit unexpected unloading.

  36. */

  37. if(unlikely(!try_module_get(probed_mod)))

  38. goto fail_with_jump_label;


  39. /*

  40. * If the module freed .init.text, we couldn't insert

  41. * kprobes in there.

  42. */

  43. /*如果被探测的地址位于模块的init地址段内,但该段代码区间已被释放,则直接退出*/

  44. if(within_module_init((unsigned long)p->addr, probed_mod)&&

  45. probed_mod->state!= MODULE_STATE_COMING){

  46. module_put(probed_mod);

  47. goto fail_with_jump_label;

  48. }

  49. }

  50. preempt_enable();

  51. jump_label_unlock();


  52. p->nmissed = 0;

  53. INIT_LIST_HEAD(&p->list);

  54. mutex_lock(&kprobe_mutex);


  55. jump_label_lock();/* needed to call jump_label_text_reserved() */


  56. get_online_cpus();/* For avoiding text_mutex deadlock. */

  57. mutex_lock(&text_mutex);


  58. /*判断在同一个探测点是否已经注册了其他的探测函数*/

  59. old_p = get_kprobe(p->addr);

  60. if(old_p){

  61. /* Since this may unoptimize old_p, locking text_mutex. */

  62. /*如果已经存在注册过的kprobe,则将探测点的函数修改为aggr_pre_handler,并将所有的handler挂载到其链表上,由其负责所有handler函数的执行*/

  63. ret = register_aggr_kprobe(old_p, p);

  64. goto out;

  65. }


  66. /* 分配特定的内存地址用于保存原有的指令

  67. * 按照内核注释,被分配的地址必须位于x86上特殊的可执行页中(must be on special executable page on x86)。

  68. * 该地址被保存在kprobe->ainsn.insn

  69. */

  70. ret = arch_prepare_kprobe(p);

  71. if(ret)

  72. goto out;


  73. /*将kprobe加入到相应的hash表内*/

  74. INIT_HLIST_NODE(&p->hlist);

  75. hlist_add_head_rcu(&p->hlist,

  76. &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);


  77. if(!kprobes_all_disarmed &&!kprobe_disabled(p))

  78. /*将探测点的指令码修改为int 3指令*/

  79. __arm_kprobe(p);


  80. /* Try to optimize kprobe */

  81. try_to_optimize_kprobe(p);


  82. out:

  83. mutex_unlock(&text_mutex);

  84. put_online_cpus();

  85. jump_label_unlock();

  86. mutex_unlock(&kprobe_mutex);


  87. if(probed_mod)

  88. module_put(probed_mod);


  89. return ret;


  90. fail_with_jump_label:

  91. preempt_enable();

  92. jump_label_unlock();

  93. return-EINVAL;

注册完毕,就开始kprobe的执行流程了。对于该探测点,由于其起始指令已经被修改为int3,因此在执行到该地址时,必然会触发3号中断向量的处理流程do_int3.

点击(此处)折叠或打开

  1. /* May run on IST stack. */

  2. dotraplinkage void __kprobes do_int3(struct pt_regs *regs,long error_code)

  3. {

  4. #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP

  5. if(kgdb_ll_trap(DIE_INT3,"int3", regs, error_code, 3, SIGTRAP)

  6. == NOTIFY_STOP)

  7. return;

  8. #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */

  9. #ifdef CONFIG_KPROBES

  10. /*在这里以DIE_INT3,通知kprobe注册的通知链*/

  11. if(notify_die(DIE_INT3,"int3", regs, error_code, 3, SIGTRAP)

  12. == NOTIFY_STOP)

  13. return;

  14. #else

  15. if(notify_die(DIE_TRAP,"int3", regs, error_code, 3, SIGTRAP)

  16. == NOTIFY_STOP)

  17. return;

  18. #endif


  19. preempt_conditional_sti(regs);

  20. do_trap(3, SIGTRAP,"int3", regs, error_code,NULL);

  21. preempt_conditional_cli(regs);

  22. }

在do_int3中触发kprobe注册的通知链函数,kprobe_exceptions_notify。由于kprobe以及jprobe等机制的处理核心都在此函数内,这里只针对kprobe的流程进行分析:进入函数的原因是DIE_INT3,并且是第一次进入该函数。

点击(此处)折叠或打开

  1. int __kprobes kprobe_exceptions_notify(struct notifier_block *self,

  2. unsigned long val,void*data)

  3. {

  4. struct die_args *args = data;

  5. int ret = NOTIFY_DONE;


  6. if(args->regs && user_mode_vm(args->regs))

  7. return ret;


  8. switch(val){

  9. case DIE_INT3:

  10. /*对于kprobe,进入kprobe_handler*/

  11. if(kprobe_handler(args->regs))

  12. ret = NOTIFY_STOP;

  13. break;

  14. case DIE_DEBUG:

  15. if(post_kprobe_handler(args->regs)){

  16. /*

  17. * Reset the BS bit in dr6 (pointed by args->err) to

  18. * denote completion of processing

  19. */

  20. (*(unsigned long*)ERR_PTR(args->err))&=~DR_STEP;

  21. ret = NOTIFY_STOP;

  22. }

  23. break;

  24. case DIE_GPF:

  25. /*

  26. * To be potentially processing a kprobe fault and to

  27. * trust the result from kprobe_running(), we have

  28. * be non-preemptible.

  29. */

  30. if(!preemptible()&& kprobe_running()&&

  31. kprobe_fault_handler(args->regs, args->trapnr))

  32. ret = NOTIFY_STOP;

  33. break;

  34. default:

  35. break;

  36. }

  37. return ret;

  38. }

点击(此处)折叠或打开

  1. static int __kprobes kprobe_handler(struct pt_regs *regs)

  2. {

  3. kprobe_opcode_t *addr;

  4. struct kprobe *p;

  5. struct kprobe_ctlblk *kcb;


  6. /*对于int 3中断,其被Intel定义为Trap,那么异常发生时EIP寄存器内指向的为异常指令的后一条指令*/

  7. addr =(kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));

  8. /*

  9. * We don't want to be preempted for the entire

  10. * duration of kprobe processing. We conditionally

  11. * re-enable preemption at the end of this function,

  12. * and also in reenter_kprobe() and setup_singlestep().

  13. */

  14. preempt_disable();


  15. kcb = get_kprobe_ctlblk();

  16. /*获取addr对应的kprobe*/

  17. p = get_kprobe(addr);


  18. if(p){

  19. /*如果异常的进入是由kprobe导致,则进入reenter_kprobe(jprobe需要,到时候分析)*/

  20. if(kprobe_running()){

  21. if(reenter_kprobe(p, regs, kcb))

  22. return 1;

  23. }else{

  24. set_current_kprobe(p, regs, kcb);

  25. kcb->kprobe_status = KPROBE_HIT_ACTIVE;


  26. /*

  27. * If we have no pre-handler or it returned 0, we

  28. * continue with normal processing. If we have a

  29. * pre-handler and it returned non-zero, it prepped

  30. * for calling the break_handler below on re-entry

  31. * for jprobe processing, so get out doing nothing

  32. * more here.

  33. */

  34. /*执行在此地址上挂载的pre_handler函数*/

  35. if(!p->pre_handler ||!p->pre_handler(p, regs))

  36. /*设置单步调试模式,为post_handler函数的执行做准备*/

  37. setup_singlestep(p, regs, kcb, 0);

  38. return 1;

  39. }

  40. }else if(*addr != BREAKPOINT_INSTRUCTION){

  41. /*

  42. * The breakpoint instruction was removed right

  43. * after we hit it. Another cpu has removed

  44. * either a probepoint or a debugger breakpoint

  45. * at this address. In either case, no further

  46. * handling of this interrupt is appropriate.

  47. * Back up over the (now missing) int3 and run

  48. * the original instruction.

  49. */

  50. regs->ip =(unsigned long)addr;

  51. preempt_enable_no_resched();

  52. return 1;

  53. }else if(kprobe_running()){

  54. p = __this_cpu_read(current_kprobe);

  55. if(p->break_handler && p->break_handler(p, regs)){

  56. setup_singlestep(p, regs, kcb, 0);

  57. return 1;

  58. }

  59. }/* else: not a kprobe fault; let the kernel handle it */


  60. preempt_enable_no_resched();

  61. return 0;

  62. }

点击(此处)折叠或打开

  1. static void __kprobes setup_singlestep(struct kprobe *p,struct pt_regs *regs,

  2. struct kprobe_ctlblk *kcb,int reenter)

  3. {

  4. if(setup_detour_execution(p, regs, reenter))

  5. return;


  6. #if!defined(CONFIG_PREEMPT)

  7. if(p->ainsn.boostable == 1 &&!p->post_handler){

  8. /* Boost up -- we can execute copied instructions directly */

  9. if(!reenter)

  10. reset_current_kprobe();

  11. /*

  12. * Reentering boosted probe doesn't reset current_kprobe,

  13. * nor set current_kprobe, because it doesn't use single

  14. * stepping.

  15. */

  16. regs->ip =(unsigned long)p->ainsn.insn;

  17. preempt_enable_no_resched();

  18. return;

  19. }

  20. #endif

  21. /*jprobe*/

  22. if(reenter){

  23. save_previous_kprobe(kcb);

  24. set_current_kprobe(p, regs, kcb);

  25. kcb->kprobe_status = KPROBE_REENTER;

  26. }else

  27. kcb->kprobe_status = KPROBE_HIT_SS;

  28. /* Prepare real single stepping */

  29. /*准备单步模式,设置EFLAGS的TF标志位,清除IF标志位(禁止中断)*/

  30. clear_btf();

  31. regs->flags|= X86_EFLAGS_TF;

  32. regs->flags&=~X86_EFLAGS_IF;

  33. /* single step inline if the instruction is an int3 */

  34. if(p->opcode == BREAKPOINT_INSTRUCTION)

  35. regs->ip =(unsigned long)p->addr;

  36. else

  37. /*设置异常返回的指令为保存的被探测点的指令*/

  38. regs->ip =(unsigned long)p->ainsn.insn;

  39. }

对于kprobe,pre_handler的执行就结束了,按照代码,程序开始执行保存的被探测点的指令,由于开启了单步调试模式,执行完指令后会继续触发异常,这次的是do_debug异常处理流程。

点击(此处)折叠或打开

  1. dotraplinkage void __kprobes do_debug(struct pt_regs *regs,long error_code)

  2. {

  3. ....


  4. /*在do_debug中,以DIE_DEBUG再一次触发kprobe的通知链*/

  5. if(notify_die(DIE_DEBUG,"debug", regs, PTR_ERR(&dr6), error_code,

  6. SIGTRAP)== NOTIFY_STOP)

  7. return;


  8. ....

  9. return;

  10. }

点击(此处)折叠或打开

  1. /*对于kprobe_exceptions_notify,其DIE_DEBUG处理流程*/

  2. case DIE_DEBUG:

  3. if(post_kprobe_handler(args->regs)){

  4. /*

  5. * Reset the BS bit in dr6 (pointed by args->err) to

  6. * denote completion of processing

  7. */

  8. (*(unsigned long*)ERR_PTR(args->err))&=~DR_STEP;

  9. ret = NOTIFY_STOP;

  10. }

  11. break;


  12. static int __kprobes post_kprobe_handler(struct pt_regs *regs)

  13. {

  14. struct kprobe *cur = kprobe_running();

  15. struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();


  16. if(!cur)

  17. return 0;


  18. /*设置异常返回的EIP为下一条需要执行的指令*/

  19. resume_execution(cur, regs, kcb);

  20. /*恢复异常执行前的EFLAGS*/

  21. regs->flags|= kcb->kprobe_saved_flags;


  22. /*执行post_handler函数*/

  23. if((kcb->kprobe_status != KPROBE_REENTER)&& cur->post_handler){

  24. kcb->kprobe_status = KPROBE_HIT_SSDONE;

  25. cur->post_handler(cur, regs, 0);

  26. }


  27. /* Restore back the original saved kprobes variables and continue. */

  28. if(kcb->kprobe_status == KPROBE_REENTER){

  29. restore_previous_kprobe(kcb);

  30. goto out;

  31. }

  32. reset_current_kprobe();

  33. out:

  34. preempt_enable_no_resched();


  35. /*

  36. * if somebody else is singlestepping across a probe point, flags

  37. * will have TF set, in which case, continue the remaining processing

  38. * of do_debug, as if this is not a probe hit.

  39. */

  40. if(regs->flags& X86_EFLAGS_TF)

  41. return 0;


  42. return 1;

  43. }

至此,一个典型的kprobe的流程已经执行完毕了。