驱动蓝屏0x50 PAGE_FAULT_IN_NONPAGED_AREA

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/lixiangminghate/article/details/54347134

    想写个驱动,暴力搜索nt模块中的未导出函数,结果一直蓝屏:

// Brute-force scan of the ntoskrnl.exe image for the byte pattern CodeFeature.
//
// ldrDataTabEntry: loader data-table entry of the module to examine; this
//                  function reads its DllBase, SizeOfImage and BaseDllName.
// Returns 0x01 if the entry's base name equals "ntoskrnl.exe" (compared
// case-insensitively) and the pattern is found in the image, otherwise 0x00.
//
// NOTE(review): every offset inside the image is dereferenced without checking
// that the underlying page is mapped. This is the version that bugchecks with
// PAGE_FAULT_IN_NONPAGED_AREA (0x50) inside memcmp.
unsigned long MatchFunc(PKLDR_DATA_TABLE_ENTRY ldrDataTabEntry)
{
	// CodeFeatureLen is the pattern length in bytes; CodeFeature is defined outside this snippet.
	unsigned long found = 0x00UL, CodeFeatureLen = sizeof(CodeFeature) / sizeof(unsigned char), idx = 0;
	UNICODE_STRING matchModName;
	char* modBase = (char*)ldrDataTabEntry->DllBase;

	RtlInitUnicodeString(&matchModName, L"ntoskrnl.exe");
	// TRUE => case-insensitive comparison; only the kernel image is scanned.
	if (RtlCompareUnicodeString(&matchModName, &ldrDataTabEntry->BaseDllName, TRUE) == 0)
	{
		// Slide a CodeFeatureLen-byte window over the image, one byte at a time.
		// NOTE(review): assuming SizeOfImage is unsigned, the bound wraps around
		// if SizeOfImage < CodeFeatureLen -- confirm against the structure definition.
		for (; idx<(ldrDataTabEntry->SizeOfImage - CodeFeatureLen); idx++)
		{
			// Reads CodeFeatureLen bytes starting at modBase + idx with no validity check.
			if (memcmp(&modBase[idx], CodeFeature, CodeFeatureLen) == 0)
			{
				found = 0x01UL;
				break;
			}
		}

	}

	return found;
}

dump的内容大概如下:

PAGE_FAULT_IN_NONPAGED_AREA (50)
Invalid system memory was referenced.  This cannot be protected by try-except.
Typically the address is just plain bad or it is pointing at freed memory.
Arguments:
Arg1: 82d8a000, memory referenced.
Arg2: 00000000, value 0 = read operation, 1 = write operation.
Arg3: 93f3115e, If non-zero, the instruction address which referenced the bad memory
	address.
Arg4: 00000000, (reserved)

READ_ADDRESS:  82d8a000 

FAULTING_IP: 
enummod!memcmp+16 [d:\wbrtm\minkernel\crts\crtw32\string\i386\memcmp.c @ 70]
93f3115e 8b02            mov     eax,dword ptr [edx]


TRAP_FRAME:  807e3a08 -- (.trap 0xffffffff807e3a08)
ErrCode = 00000000
eax=82d89ffd ebx=00000000 ecx=00000019 edx=82d89ffd esi=93f33000 edi=00000004
eip=93f3115e esp=807e3a7c ebp=807e3a88 iopl=0         nv up ei pl nz na po nc
cs=0008  ss=0010  ds=0023  es=0023  fs=0030  gs=0000             efl=00010202
enummod!memcmp+0x16:
93f3115e 8b02            mov     eax,dword ptr [edx]  ds:0023:82d89ffd=????????
Resetting default scope

LAST_CONTROL_TRANSFER:  from 82aeae71 to 82a79394

STACK_TEXT:  
...
807e3968 82a938e3 00000050 82d8a000 00000000 nt!KeBugCheck2+0x68b
807e39f0 82a545f8 00000000 82d8a000 00000000 nt!MmAccessFault+0x106
807e39f0 93f3115e 00000000 82d8a000 00000000 nt!KiTrap0E+0xdc
807e3a88 93f31128 82d89ffd 93f33000 00000019 enummod!memcmp+0x16 [d:\wbrtm\minkernel\crts\crtw32\string\i386\memcmp.c @ 70]
807e3ab8 93f31093 84f42c98 00220020 93f311de enummod!MatchFunc+0x78 [c:\studio\enummod\enummod\enummod.c @ 49]
807e3ad8 82bb1728 850a0358 850a3000 00000000 enummod!DriverEntry+0x93 [c:\studio\enummod\enummod\enummod.c @ 164]
807e3cbc 82baf499 00000001 00000000 807e3ce4 nt!IopLoadDriver+0x7ed
807e3d00 82a7bf2b 8ce35cd0 00000000 85010020 nt!IopLoadUnloadDriver+0x70
...

FOLLOWUP_IP: 
enummod!memcmp+16 [d:\wbrtm\minkernel\crts\crtw32\string\i386\memcmp.c @ 70]
93f3115e 8b02            mov     eax,dword ptr [edx]
...

Followup:     MachineOwner
---------

kd> .trap 0xffffffff807e3a08
ErrCode = 00000000
eax=82d89ffd ebx=00000000 ecx=00000019 edx=82d89ffd esi=93f33000 edi=00000004
eip=93f3115e esp=807e3a7c ebp=807e3a88 iopl=0         nv up ei pl nz na po nc
cs=0008  ss=0010  ds=0023  es=0023  fs=0030  gs=0000             efl=00010202
enummod!memcmp+0x16:
93f3115e 8b02            mov     eax,dword ptr [edx]  ds:0023:82d89ffd=????????

kd> !pte 82d89ffd
                    VA 82d89ffd
PDE at C06020B0            PTE at C0416C48
contains 00000000001D2063  contains 0000000002D89963
pfn 1d2       ---DA--KWEV  pfn 2d89      -G-DA--KWEV

kd> !dd 2D89ffd L4
# 2d89ffc 00000000 00000000 00000000 00000000

乍一看错误解释,以为是在高IRQL上引用了分页内存,心里琢磨着什么时候DriverEntry的IRQL>=DISPATCH_LEVEL了?正巧同事经过,看了一眼我的代码说:在比较内存值前先判断一下页面是否有效,应该能解决问题~我那个激动啊,原来如此,三下五除二把代码修改为:

// Second attempt at the brute-force scan of ntoskrnl.exe for CodeFeature:
// the start address of each window is validated before it is compared.
//
// ldrDataTabEntry: loader data-table entry of the module to examine; this
//                  function reads its DllBase, SizeOfImage and BaseDllName.
// Returns 0x01 if the entry's base name equals "ntoskrnl.exe" (compared
// case-insensitively) and the pattern is found, otherwise 0x00.
//
// NOTE(review): only the FIRST byte of each window is validated, while memcmp
// reads CodeFeatureLen bytes. A window that starts in the last bytes of a valid
// page and ends in an unmapped page still faults.
unsigned long MatchFunc(PKLDR_DATA_TABLE_ENTRY ldrDataTabEntry)
{
	// CodeFeatureLen is the pattern length in bytes; CodeFeature is defined outside this snippet.
	unsigned long found = 0x00UL, CodeFeatureLen = sizeof(CodeFeature) / sizeof(unsigned char), idx = 0;
	UNICODE_STRING matchModName;
	char* modBase = (char*)ldrDataTabEntry->DllBase;

	RtlInitUnicodeString(&matchModName, L"ntoskrnl.exe");
	// TRUE => case-insensitive comparison; only the kernel image is scanned.
	if (RtlCompareUnicodeString(&matchModName, &ldrDataTabEntry->BaseDllName, TRUE) == 0)
	{
		for (; idx<(ldrDataTabEntry->SizeOfImage - CodeFeatureLen); idx++)
		{
			// Before comparing memory, check whether the address is valid.
			// NOTE(review): MmIsAddressValidAddr is not defined in this snippet;
			// presumably a function pointer to MmIsAddressValid -- confirm.
			if ((MmIsAddressValidAddr)(&modBase[idx]))
			{
				// Valid, so go on and compare.
				if (memcmp(&modBase[idx], CodeFeature, CodeFeatureLen) == 0)
				{
					found = 0x01UL;
					break;
				}
			}

		}

	}

	return found;
}
再次加载驱动,程序还是在相同的地方出错了。说不通啊,我已经判断过页面的有效性了,能进到最内层的if进行页面比较就说明内存有效性检测已经通过了,难道是MmIsAddressValid出错了?百度了一圈,发现求助的人不少,更有人说内核API MmIsAddressValid不稳定,还引用了ddk help的原话:"Even if MmIsAddressValid returns TRUE, accessing the address can cause page faults unless the memory has been locked down or the address is a valid nonpaged pool address."

带着这样的疑惑,我查看了wrk1.2 MmIsAddressValid的实现(我的机器是win7RTM32bit,所以只贴出源码中2级页表映射的内容):


//
// Reports whether VirtualAddress is currently backed by valid page-table
// entries, by inspecting the Valid bit of its PDE and then its PTE.
//
// VirtualAddress     - the address to test.
// UseForceIfPossible - not used by this variant (marked unreferenced).
//
// Returns TRUE when the PDE is valid and either maps a large page or leads to
// a valid PTE; FALSE otherwise.
//
// The result only reflects the page-table state at the moment of the call.
//
BOOLEAN
MiIsAddressValid (
    IN PVOID VirtualAddress,
    IN LOGICAL UseForceIfPossible
    )

{
    PMMPTE PointerPte;
    UNREFERENCED_PARAMETER (UseForceIfPossible);

    // Reject addresses whose reserved bits are not in canonical form.
    if (MI_RESERVED_BITS_CANONICAL(VirtualAddress) == FALSE) {
        return FALSE;
    }

    // First level: the page directory entry must be valid.
    PointerPte = MiGetPdeAddress (VirtualAddress);
    if (PointerPte->u.Hard.Valid == 0) {
        return FALSE;
    }

    // A valid PDE that maps a large page has no PTE level to inspect.
    if (MI_PDE_MAPS_LARGE_PAGE (PointerPte)) {
        return TRUE;
    }

    // Second level: the page table entry must be valid.
    PointerPte = MiGetPteAddress (VirtualAddress);
    if (PointerPte->u.Hard.Valid == 0) {
        return FALSE;
    }

    // The large-page test is applied to the PTE here; if it is set the
    // address is rejected.
    // NOTE(review): presumably this guards against a PTE whose large-page bit
    // position carries another meaning -- confirm against the WRK sources.
    if (MI_PDE_MAPS_LARGE_PAGE (PointerPte)) {
        return FALSE;
    }

    return TRUE;
}
wrk用比较简单的方式检测页面的有效性:检测pte页表项的有效位p位是否置1,置1则认为有效,返回TRUE。这样检测有一个问题:如果代码以下面的流程执行可能会出错:

// Pseudo-code: validity is checked BEFORE raising IRQL, leaving a window in
// which the page can be paged out.
{
  if(MmIsAddressValid(addr)) //check page validity while IRQL < DISPATCH_LEVEL
  {  //the check passed, but addr is paged out right afterwards
    KeRaiseIrql(DISPATCH_LEVEL); //the page can no longer be paged in
    do access addr //touches memory that has been paged out; may bugcheck
    KeLowerIrql();
  }
}

附:调换一下代码顺序可能就不会出错了(不过我还没有验证过):

// Pseudo-code: IRQL is raised BEFORE the validity check.
// NOTE(review): the author states this ordering was not verified.
{
  {  //raise IRQL first so that the paging thread stops running (author's assumption)
    KeRaiseIrql(DISPATCH_LEVEL); 
    if(MmIsAddressValid(addr)) //check page validity while IRQL >= DISPATCH_LEVEL
        do access addr //check passed; per the author the page cannot be paged out now, so it can be used
    KeLowerIrql();
  }
}

可是,我的代码并没有提升IRQL,就算访问了被换出的页面,OS也会负责将页面重新换入。在这样的背景下,通过了 if(MmIsAddressValid(addr)) 有效性检测之后不应该再访问到无效的页面。最后,我在一个群里得到了比较靠谱的答案:nt模块有些页面被标记为Init或者Discardable,如果程序访问了这样的页面也可能会触发缺页异常。我查看了一下nt模块的PE信息,的确发现有很多被标记为Discardable的节:

kd> !dh nt

SECTION HEADER #13
PAGEVRFD name
     B18 virtual size
  37B000 virtual address <-------------蓝屏时显示我正在访问的内存所在的节 mov eax,dword ptr [edx]  ds:0023:82d81ffd=????????
     C00 size of raw data
  330A00 file pointer to raw data
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
C0000040 flags
         Initialized Data
         (no align specified)
         Read Write

SECTION HEADER #14
    INIT name
   44638 virtual size
  37C000 virtual address
   44800 size of raw data
  331600 file pointer to raw data
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
E2000020 flags
         Code
         Discardable <-------------------------------
         (no align specified)
         Execute Read Write

SECTION HEADER #16
  .reloc name
   1976C virtual size
  3F6000 virtual address
   19800 size of raw data
  3AA400 file pointer to raw data
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
42000040 flags
         Initialized Data
         Discardable <-------------------------------
         (no align specified)
         Read Only

另外,我查看了虚拟地址0x82d81ffd处的pte,发现虽然调试器显示内存0x82d81ffd无法读取(????????),但它的pte是有效的,比较奇怪:

kd> dd 82d81ffd L1
82d81ffd  ????????
kd> !pte 82d81ffd
                    VA 82d81ffd
PDE at C06020B0            PTE at C0416C08
contains 00000000001D2063  contains 0000000002D81963
pfn 1d2       ---DA--KWEV  pfn 2d81      -G-DA--KWEV

了解了这个知识点,我再次对程序进行了修改:先判断是否能获得物理地址,如果返回值不为零,即可以获得物理地址,再进行内存比较。但这种方式不尽如人意,依然触发了异常,异常位置还是在memcmp处。很无奈,我只能再次分析dump文件,重新定位访问出错时CPU将要执行的指令:

mov     eax,dword ptr [edx]  ds:0023:82d81ffd=????????
指令从地址0x82d81ffd读取4字节,而这个地址正好位于两个节的交界处:再往后3字节,就会从Section Header #13进入Section Header #14,而Section Header #14的内存页面被标记为Discardable。由此,我猜想我在暴力搜索内存时只判断了搜索起始位置的有效性,并未对结束位置的有效性进行判断;如果搜索的起始点在页末最后几个字节,结束位置位于这种被标记为Discardable的页面中,就很可能出错。

kd> !dh nt

SECTION HEADER #13
PAGEVRFD name
     B18 virtual size
  37B000 virtual address <----页面起始位于82a06000+37B000,页面对齐
...
         Initialized Data
         (no align specified)
         Read Write

SECTION HEADER #14
    INIT name
   44638 virtual size
  37C000 virtual address <----页面起始位于82a06000+37C000,页面对齐
...
         Discardable <-------------------------------
         (no align specified)
         Execute Read Write

kd> !pte 82a06000+37B000 <----#13节的页面对应pte是有效的
                    VA 82d81000
PDE at C06020B0            PTE at C0416C08
contains 00000000001D2063  contains 0000000002D81963
pfn 1d2       ---DA--KWEV  pfn 2d81      -G-DA--KWEV

kd> !pte 82a06000+37C000 <-----#14节的页面对应pte是全0pte
                    VA 82d82000
PDE at C06020B0            PTE at C0416C10
contains 00000000001D2063  contains 0000000000000000
pfn 1d2       ---DA--KWEV  not valid

BugCheck 50, {82d82000, 0, 9453015e, 0} <-----BugCheck 0x50 Arg1 表明蓝屏时访问的内存地址 82d82000=82a06000+37C000 正好是#14节的开始
结合上面的分析,验证了我的猜想。这次我同时验证待比较内存区域的首尾地址是否有效,再次加载驱动,果然没有蓝屏发生。

展开阅读全文

没有更多推荐了,返回首页