最近处理一个骨灰级的历史遗留问题:内核模块DPI的内存数据被无故“光顾”(篡改),导致系统panic。环境为linux内核3.18、x86_64。由于我们要精简系统,许多调试工具已经被阉割,SLAB_DEBUG、KASAN均不支持。由于这部分数据主要用于查询,在初始化后不会再对其进行修改,所以想到一个办法:初始化完DPI后,将其使用的内存页设置为只读,再通过非法写入触发异常时的stack信息找到元凶。
按照以上的分析总共分为以下步骤:
- 查找 虚拟地址的PTE
- 设置PTE的属性为只读
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/mm.h>
MODULE_LICENSE("GPL");
static void *address = NULL;
static __init int test_init(void)
{
int level;
pte_t *ptep;
struct page *page = alloc_pages(GFP_KERNEL, 1);
if (unlikely(page == NULL)) {
pr_err("alloc page err %p\n", page);
return -1;
}
address = page_address(page);
pr_info("lookup_address %p\n", address);
// 1, lookup for PTE
ptep = lookup_address((unsigned long)address, &level);
if (unlikely(ptep == NULL)) {
pr_err("lookup_address %p\n", ptep);
goto err;
}
if(level != PG_LEVEL_4K) {
pr_err("level not 4K %d\n", level);
goto err;
}
if (!pte_present(*ptep)) {
pr_err("level not 4K %d\n", level);
goto err;
}
// 2, set write protect flag
set_pte(ptep, pte_wrprotect(*ptep));
return 0;
err:
__free_page(page);
return -1;
}
static __exit void test_exit(void)
{
// clear wrprotect flag
// TODO ...
pr_info("test exit\n");
}
module_init(test_init);
module_exit(test_exit);
按照思路从业务中抽取功能代码,写了一个非常简单的测试用例,本以为万事大吉,万万没有想到,理想很丰满,现实很骨感,事情总是不按照我们的预期发展。多次执行insmod test.ko,得到以下结果:
[ 659.486243] lookup_address ffff8800692e4000
[ 659.486248] level not 4K 2
[ 660.142577] lookup_address ffff880046436000
[ 660.142582] level not 4K 2
[ 660.530890] lookup_address ffff8800461a0000
[ 660.530896] level not 4K 2
[ 660.873884] lookup_address ffff88012369a000
[ 660.873889] level not 4K 2
为什么level不是PG_LEVEL_4K?明明只申请了普通的4K页,映射层级却是PG_LEVEL_2M。如果直接修改这一级的页表项,会将整个2M的内存空间设置为只读状态。为了查清这个问题,我们不得不梳理内存管理的初始化流程:
start_kernel()
|---->setup_arch(&command_line);
|
|---->init_mem_mapping();
|
|---->memory_map_top_down();
|
|---->init_range_memory_mapping();
|
|---->init_memory_mapping();
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*
* The range [start, end) is first split into sub-ranges by
* split_mem_range(); each sub-range is then mapped with the page sizes
* recorded in its page_size_mask.
*
* Returns the value produced for the last sub-range, shifted right by
* PAGE_SHIFT (0 if no sub-range was produced).
*/
unsigned long __init_refok init_memory_mapping(unsigned long start,
unsigned long end)
{
struct map_range mr[NR_RANGE_MR];
unsigned long ret = 0;
int nr_range, i;
pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
start, end - 1);
/* Start from an all-zero table; split_mem_range() reports how many entries it filled. */
memset(mr, 0, sizeof(mr));
nr_range = split_mem_range(mr, 0, start, end);
/* Map the sub-ranges in order; ret keeps the result of the last call. */
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
/* Record the mapped range, expressed in page frame numbers. */
add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
return ret >> PAGE_SHIFT;
}
/*
* Excerpt of split_mem_range(): only the part that carves out the
* large-page sub-ranges is shown. Each sub-range found is appended to mr[]
* through save_mr(), which returns the updated entry count.
*
* NOTE(review): pfn, limit_pfn, start_pfn, end_pfn and page_size_mask are
* presumably declared and initialised in the omitted code -- confirm
* against the full kernel source.
*/
static int __meminit split_mem_range(struct map_range *mr, int nr_range,
unsigned long start,
unsigned long end)
{
...... // part of the code omitted
/* big page (2M) range */
/* Begin at the first PMD_SIZE-aligned pfn at or after the current position. */
start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
/*
* On 64-bit this first 2M range stops at the next PUD_SIZE boundary, but
* never beyond the last PMD_SIZE boundary below limit_pfn.
*/
end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif
if (start_pfn < end_pfn) {
/* Of the bits in page_size_mask, keep only the 2M one for this range. */
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
pfn = end_pfn;
}
#ifdef CONFIG_X86_64
/* big page (1G) range */
start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
if (start_pfn < end_pfn) {
/* Both the 2M and the 1G bit of page_size_mask are kept for this range. */
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
pfn = end_pfn;
}
/* tail is not big page (1G) alignment */
start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
pfn = end_pfn;
}
#endif
...... // part of the code omitted
}
从split_mem_range()可以看出,在建立物理内存的直接映射时,内核会尽可能使用huge page去映射,这就解释了为什么我们申请的内存所在的映射层级是PG_LEVEL_2M;理论上说也可能出现PG_LEVEL_1G的大页。问题原因找到了,该怎么解决这个问题呢?此时想到了BPF功能:它会将BPF字节码注入内核,为了安全,它也会将存放BPF程序的内存设置为只读,肯定也会遇到和我们同样的问题。RTFSC:
sys_bpf()
|
|---->bpf_prog_load()
|
|---->bpf_prog_select_runtime()
|
|---->bpf_int_jit_compile()
|
|---->set_memory_ro()
|
|---->change_page_attr_clear()
|
|---->__change_page_attr_set_clr()
|
|---->__change_page_attr()
|
|---->lookup_address_cpa()
|
|---->split_large_page() /* ! PG_LEVEL_4K */
从上面的代码流程可以看出,bpf()系统调用最终会调用split_large_page()来处理申请到的内存落在大页映射中的情况。x86平台已经封装了一系列函数,至此我们修改实现方式,改用set_memory_ro()。之前自作聪明地以为直接修改PTE属性就可以了,结果还是掉进了坑里。
/*
* The set_memory_* API can be used to change various attributes of a virtual
* address range. The attributes include:
* Cachability : UnCached, WriteCombining, WriteBack
* Executability : eXeutable, NoteXecutable
* Read/Write : ReadOnly, ReadWrite
* Presence : NotPresent
*/
// Each function takes a start address and a page count (numpages).
// Cachability: UnCached / WriteCombining / WriteBack
int set_memory_uc(unsigned long addr, int numpages);
int set_memory_wc(unsigned long addr, int numpages);
int set_memory_wb(unsigned long addr, int numpages);
// Executability: eXecutable / NoteXecutable
int set_memory_x(unsigned long addr, int numpages);
int set_memory_nx(unsigned long addr, int numpages);
// Read/Write: ReadOnly / ReadWrite
int set_memory_ro(unsigned long addr, int numpages);
int set_memory_rw(unsigned long addr, int numpages);
// Presence: NotPresent
int set_memory_np(unsigned long addr, int numpages);
// NOTE(review): not covered by the attribute list in the header comment;
// presumably forces the range to be mapped with 4K pages -- confirm.
int set_memory_4k(unsigned long addr, int numpages);
学习的道路,永无止境,特别是内核学习,RTFSC!!!!