一个历史遗留问题,引发的linux内存管理的‘血案’

最近处理一个骨灰级的历史遗留问题:内核模块DPI的内存数据被无故“光顾”(改写),导致系统panic。环境是Linux内核3.18、x86_64。由于我们要精简系统,许多调试工具已经被阉割,SLAB_DEBUG、KASAN都不支持。这部分数据主要用于查询,在初始化之后不会再被修改,所以想到一个办法:初始化完DPI后,将其使用的内存页设置为只读,这样一旦有人写入就会触发异常,再通过stack信息找到元凶。

按照以上的分析总共分为以下步骤:

  1. 查找 虚拟地址的PTE
  2. 设置PTE的属性为只读
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* License tag of this test module. */
MODULE_LICENSE("GPL");

/* Kernel virtual address of the pages allocated in test_init(). */
static void *address = NULL;

static __init int test_init(void)
{
        int level;
        pte_t *ptep;

        struct page *page = alloc_pages(GFP_KERNEL, 1);
        if (unlikely(page == NULL)) {
                pr_err("alloc page err %p\n", page);
                return -1;
        }
        address = page_address(page);
        pr_info("lookup_address %p\n", address);

        // 1, lookup for PTE
        ptep = lookup_address((unsigned long)address, &level);
        if (unlikely(ptep == NULL)) {
                pr_err("lookup_address %p\n", ptep);
                goto err;
        }
        if(level != PG_LEVEL_4K) {
                pr_err("level not 4K %d\n", level);
                goto err;
        }
        if (!pte_present(*ptep)) {
                pr_err("level not 4K %d\n", level);
                goto err;
        }

        // 2, set write protect flag
        set_pte(ptep, pte_wrprotect(*ptep));

        return 0;

err:
        __free_page(page);
        return -1;
}

static __exit void test_exit(void)
{
        // clear wrprotect flag
        // TODO ...

        pr_info("test exit\n");
}

/* Register test_init()/test_exit() as the module's load and unload hooks. */
module_init(test_init);
module_exit(test_exit);

按照思路从业务中抽取功能代码,写了非常简单的一个测试用例,以为万事大吉,万万没有想到,理想很丰满,现实很骨感,事情总是不按照我们的预期执行,多次执行insmod test.ko,得到以下结果

[  659.486243] lookup_address ffff8800692e4000
[  659.486248] level not 4K 2
[  660.142577] lookup_address ffff880046436000
[  660.142582] level not 4K 2
[  660.530890] lookup_address ffff8800461a0000
[  660.530896] level not 4K 2
[  660.873884] lookup_address ffff88012369a000
[  660.873889] level not 4K 2

为什么level不是PG_LEVEL_4K?明明只申请了很小的一块内存(alloc_pages的order为1,即两个4K页),level却是PG_LEVEL_2M(打印值为2)。如果直接修改这一项,就会把整个2M的内存空间都设置为只读。为了查清这个问题,我们不得不梳理内存管理的初始化流程:

start_kernel()
    |---->setup_arch(&command_line);
          |
          |---->init_mem_mapping();
                |
                |---->memory_map_top_down();
                      |
                      |---->init_range_memory_mapping();
                            |
                            |---->init_memory_mapping();

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 *
 * @start: first address of the range to map
 * @end:   end of the range; the log line prints it as inclusive end - 1
 *
 * Returns the value of the last kernel_physical_mapping_init() call,
 * shifted right by PAGE_SHIFT (i.e. expressed as a page frame number).
 */
unsigned long __init_refok init_memory_mapping(unsigned long start,
					       unsigned long end)
{
	struct map_range mr[NR_RANGE_MR];
	unsigned long ret = 0;
	int nr_range, i;

	pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
	       start, end - 1);

	/* Start from an empty table and let split_mem_range() fill it in. */
	memset(mr, 0, sizeof(mr));
	nr_range = split_mem_range(mr, 0, start, end);

	/*
	 * Map every sub-range with the page sizes recorded for it; 'ret'
	 * keeps only the result of the last call.
	 */
	for (i = 0; i < nr_range; i++)
		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
						   mr[i].page_size_mask);

	/* Record the pfn range [start >> PAGE_SHIFT, ret >> PAGE_SHIFT). */
	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);

	return ret >> PAGE_SHIFT;
}

/*
 * Split [start, end) into sub-ranges and record each one in 'mr' through
 * save_mr(), together with the page sizes allowed for it.
 *
 * This is an excerpt: the parts marked "......" are elided.
 * NOTE(review): 'pfn', 'limit_pfn', 'start_pfn', 'end_pfn' and
 * 'page_size_mask' are declared and initialised in the elided part --
 * confirm their exact meaning against the full source.
 */
static int __meminit split_mem_range(struct map_range *mr, int nr_range,
				     unsigned long start,
				     unsigned long end)
{
	...... // part of the code omitted

	/* big page (2M) range */
	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
	/*
	 * On 64-bit the 2M range only runs up to the next PUD_SIZE boundary,
	 * capped at the limit rounded down to PMD_SIZE.
	 */
	end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
	if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
		end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif

	/* Save the range only when it is non-empty, then continue after it. */
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask & (1<<PG_LEVEL_2M));
		pfn = end_pfn;
	}

#ifdef CONFIG_X86_64
	/* big page (1G) range */
	start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
	end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
	if (start_pfn < end_pfn) {
		/* Both the 2M and the 1G bits of the mask are kept for this range. */
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask &
				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
		pfn = end_pfn;
	}

	/* tail is not big page (1G) alignment */
	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask & (1<<PG_LEVEL_2M));
		pfn = end_pfn;
	}
#endif

	...... // part of the code omitted
}

从split_mem_range() 可以看出,在做物理内存直接映射的时候,尽可能使用huge page去映射,这就解释了为什么我们申请的内存是PG_LEVEL_2M,理论上说应该也会出现PG_LEVEL_1G的大页,问题原因找到了,该怎么解决这个问题呢?此时想到了BPF功能,会将BPF字节码注入内核,为了安全它也会做BPF字节码的内存设置只读权限,肯定也会遇到我们同样的问题,RTFSC

sys_bpf()
|
|---->bpf_prog_load()
     |
     |---->bpf_prog_select_runtime()
           |
           |---->bpf_int_jit_compile()
                 |
                 |---->set_memory_ro()
                       |
                       |---->change_page_attr_clear()
                             |
                             |---->__change_page_attr_set_clr()
                                   |
                                   |---->__change_page_attr()
                                         |
                                         |---->lookup_address_cpa()
                                         |
                                         |---->split_large_page() /* ! PG_LEVEL_4K */

从上面的代码流程可以看出,bpf()系统调用最终会调用split_large_page()来处理地址落在大页映射中的情况。x86平台已经封装了一系列set_memory_*函数,至此我们修改实现方式,改用set_memory_ro()。之前自作聪明地以为直接修改PTE属性就可以,结果还是掉进了坑里。

/*
 * The set_memory_* API can be used to change various attributes of a virtual
 * address range. The attributes include:
 * Cachability   : UnCached, WriteCombining, WriteBack
 * Executability : eXecutable, NoteXecutable
 * Read/Write    : ReadOnly, ReadWrite
 * Presence      : NotPresent
 */
/*
 * Prototypes of the set_memory_* family. Each one takes a virtual address
 * and a number of pages and returns an int.
 * NOTE(review): the suffixes presumably stand for uncached / write-combining /
 * write-back, executable / non-executable, read-only / read-write,
 * not-present and "force 4K mappings" -- confirm against the kernel source.
 */
int set_memory_uc(unsigned long addr, int numpages);
int set_memory_wc(unsigned long addr, int numpages);
int set_memory_wb(unsigned long addr, int numpages);
int set_memory_x(unsigned long addr, int numpages);
int set_memory_nx(unsigned long addr, int numpages);
int set_memory_ro(unsigned long addr, int numpages);
int set_memory_rw(unsigned long addr, int numpages);
int set_memory_np(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);

学习的道路,永无止境,特别是内核学习,RTFSC!!!!

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值