内存管理源码剖析 Author:钱国正 为了深入内存管理的研究,我决定研究它的源码,但是版本过多,所以选择 0.12 内核源码,因为其简短,但是五脏俱全。 内存管理代码位于 mm 文件夹内。含有 Makefile, memory.c, page.s, swap.c 四个文件。 Makefile 是文件编译时用的,主要是管理源文件。 memory.c 进行内存分页管理。实现对主内存区内存页面的动态分配和回收操作。 swap.c 程序主要实现虚拟内存交换功能。 page.s 包括异常中断处理程序,主要分为两种情况处理:一是由于缺页引起的页异常中断,通过调用 do_no_page(error_code,address)来处理;二是由页写保护引起的页异常,此时调用页写保护处理函数 do_wp_page(error_code,address)进行处理。其中出错码 error_code 由 CPU 自动产生并压入堆栈,出现异常时访问的线性地址是从寄存器 CR2 中取得的。CR2 专门用来存放页出错时的线性地址。 get_free_page()和 free_page()这两个函数专门用于管理主内存中物理内存的占用和空闲情况,与每个进程的线性地址无关。 void free_page(unsigned long addr) //memory.c 中 { if (addr < LOW_MEM) return; if (addr >= HIGH_MEMORY) panic("trying to free nonexistent page"); addr -= LOW_MEM; addr >>= 12; if (mem_map[addr]--) return; mem_map[addr]=0; panic("trying to free free page"); } unsigned long get_free_page(void) { register unsigned long __res asm("ax"); //swap.c 中 repeat: __asm__("std ; repne ; scasb\n\t" "jne 1f\n\t" "movb $1,1(%%edi)\n\t" "sall $12,%%ecx\n\t" "addl %2,%%ecx\n\t" "movl %%ecx,%%edx\n\t" "movl $1024,%%ecx\n\t" "leal 4092(%%edx),%%edi\n\t" "rep ; stosl\n\t" "movl %%edx,%%eax\n" "1:" :"=a" (__res) :"0" (0),"i" (LOW_MEM),"c" (PAGING_PAGES), "D" (mem_map+PAGING_PAGES-1) :"di","cx","dx"); if (__res >= HIGH_MEMORY) goto repeat; if (!__res && swap_out()) goto repeat; return __res; } free_page_tables()和 copy_page_tables()这两个函数则以一个页表对应的物理内存块为单位,释放或复制指定线性地址和长度(页表个数)对应的物理内存页块。不仅对管理线性地址的页目录和页表中的对应项内容进行修改,而且也对每个页表中所有页表项对应的物理内存页进行释放或占用操作。 /* * This function frees a continuos block of page tables, as needed * by 'exit()'. As does copy_page_tables(), this handles only 4Mb blocks. 
*/ int free_page_tables(unsigned long from,unsigned long size) //memory.c 中 { unsigned long *pg_table; unsigned long * dir, nr; if (from & 0x3fffff) panic("free_page_tables called with wrong alignment"); if (!from) panic("Trying to free up swapper memory space"); size = (size + 0x3fffff) >> 22; dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */ for ( ; size-->0 ; dir++) { if (!(1 & *dir)) continue; pg_table = (unsigned long *) (0xfffff000 & *dir); for (nr=0 ; nr<1024 ; nr++) { if (*pg_table) { if (1 & *pg_table) free_page(0xfffff000 & *pg_table); else swap_free(*pg_table >> 1); *pg_table = 0; } pg_table++; } free_page(0xfffff000 & *dir); *dir = 0; } invalidate(); return 0; } /* * Well, here is one of the most complicated functions in mm. It * copies a range of linerar addresses by copying only the pages. * Let's hope this is bug-free, 'cause this one I don't want to debug :-) * * Note! We don't copy just any chunks of memory - addresses have to * be divisible by 4Mb (one page-directory entry), as this makes the * function easier. It's used only by fork anyway. * * NOTE 2!! When from==0 we are copying kernel space for the first * fork(). Then we DONT want to copy a full page-directory entry, as * that would lead to some serious memory waste - we just copy the * first 160 pages - 640kB. Even that is more than we need, but it * doesn't take any more memory - we don't copy-on-write in the low * 1 Mb-range, so the pages can be shared with the kernel. Thus the * special case for nr=xxxx. 
*/ int copy_page_tables(unsigned long from,unsigned long to,long size) { unsigned long * from_page_table; unsigned long * to_page_table; unsigned long this_page; unsigned long * from_dir, * to_dir; unsigned long new_page; unsigned long nr; if ((from&0x3fffff) || (to&0x3fffff)) panic("copy_page_tables called with wrong alignment"); from_dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */ to_dir = (unsigned long *) ((to>>20) & 0xffc); size = ((unsigned) (size+0x3fffff)) >> 22; for( ; size-->0 ; from_dir++,to_dir++) { if (1 & *to_dir) panic("copy_page_tables: already exist"); if (!(1 & *from_dir)) continue; from_page_table = (unsigned long *) (0xfffff000 & *from_dir); if (!(to_page_table = (unsigned long *) get_free_page())) return -1; /* Out of memory, see freeing */ *to_dir = ((unsigned long) to_page_table) | 7; nr = (from==0)?0xA0:1024; for ( ; nr-- > 0 ; from_page_table++,to_page_table++) { this_page = *from_page_table; if (!this_page) continue; if (!(1 & this_page)) { if (!(new_page = get_free_page())) return -1; read_swap_page(this_page>>1, (char *) new_page); *to_page_table = this_page; *from_page_table = new_page | (PAGE_DIRTY | 7); continue; } this_page &= ~2; *to_page_table = this_page; if (this_page > LOW_MEM) { *from_page_table = this_page; this_page -= LOW_MEM; this_page >>= 12; mem_map[this_page]++; } } } invalidate(); return 0; } put_page()用于将一指定的物理内存页面映射到指定的线性地址处。它首先判断指定的内存页面地址的有效性,应在 1MB 和系统最高端内存地址之间,否则发出警告。然后计算该指定线性地址在页目录中对应的目录项,此时若该目录项有效(P=1),则取其对应页表的地址,否则申请空闲页给页表使用,并设置该页表中对应页表项的属性。最后仍返回指定的物理内存页面地址。 /* * This function puts a page in memory at the wanted address. * It returns the physical address of the page gotten, 0 if * out of memory (either when trying to access page-table or * page.) */ static unsigned long put_page(unsigned long page,unsigned long address) { unsigned long tmp, *page_table; /* NOTE !!! 
This uses the fact that _pg_dir=0 */ if (page < LOW_MEM || page >= HIGH_MEMORY) printk("Trying to put page %p at %p\n",page,address); if (mem_map[(page-LOW_MEM)>>12] != 1) printk("mem_map disagrees with %p at %p\n",page,address); page_table = (unsigned long *) ((address>>20) & 0xffc); if ((*page_table)&1) page_table = (unsigned long *) (0xfffff000 & *page_table); else { if (!(tmp=get_free_page())) return 0; *page_table = tmp | 7; page_table = (unsigned long *) tmp; } page_table[(address>>12) & 0x3ff] = page | 7; /* no need for invalidate */ return page; } do_wp_page()是页异常中断过程中调用的页写保护处理函数。它首先判断地址是否在进程的代码区域,若是则终止程序(在所列代码中,该代码区检查位于 #if 0 内,尚未启用);然后执行写时复制页面的操作(copy on write)。 /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * * If it's in code space we exit with a segment error. */ void do_wp_page(unsigned long error_code,unsigned long address) { if (address < TASK_SIZE) printk("\n\rBAD! KERNEL MEMORY WP-ERR!\n\r"); if (address - current->start_code > TASK_SIZE) { printk("Bad things happen: page error in do_wp_page\n\r"); do_exit(SIGSEGV); } #if 0 /* we cannot do this yet: the estdio library writes to code space */ /* stupid, stupid. I really want the libc.a from GNU */ if (CODE_SPACE(address)) do_exit(SIGSEGV); #endif un_wp_page((unsigned long *) (((address>>10) & 0xffc) + (0xfffff000 & *((unsigned long *) ((address>>20) &0xffc))))); } do_no_page()是页异常中断过程中调用的缺页处理函数。 void do_no_page(unsigned long error_code,unsigned long address) { int nr[4]; unsigned long tmp; unsigned long page; int block,i; struct m_inode * inode; if (address < TASK_SIZE) printk("\n\rBAD!! 
KERNEL PAGE MISSING\n\r"); if (address - current->start_code > TASK_SIZE) { printk("Bad things happen: nonexistent page error in do_no_page\n\r"); do_exit(SIGSEGV); } page = *(unsigned long *) ((address >> 20) & 0xffc); if (page & 1) { page &= 0xfffff000; page += (address >> 10) & 0xffc; tmp = *(unsigned long *) page; if (tmp && !(1 & tmp)) { swap_in((unsigned long *) page); return; } } address &= 0xfffff000; tmp = address - current->start_code; if (tmp >= LIBRARY_OFFSET ) { inode = current->library; block = 1 + (tmp-LIBRARY_OFFSET) / BLOCK_SIZE; } else if (tmp < current->end_data) { inode = current->executable; block = 1 + tmp / BLOCK_SIZE; } else { inode = NULL; block = 0; } if (!inode) { get_empty_page(address); return; } if (share_page(inode,tmp)) return; if (!(page = get_free_page())) oom(); /* remember that 1 block is used for header */ for (i=0 ; i<4 ; block++,i++) nr[i] = bmap(inode,block); bread_page(page,inode->i_dev,nr); i = tmp + 4096 - current->end_data; if (i>4095) i = 0; tmp = page + 4096; while (i-- > 0) { tmp--; *(char *)tmp = 0; } if (put_page(page,address)) return; free_page(page); oom(); } get_empty_page()用于取得一页空闲物理内存并映射到指定线性地址处。主要使用了 get_free_page()和 put_page()函数来实现该功能。 void get_empty_page(unsigned long address) { unsigned long tmp; if (!(tmp=get_free_page()) || !put_page(tmp,address)) { free_page(tmp); /* 0 is ok - ignored */ oom(); } } 以上只列出几个重要函数,对于源码的详细分析可以看同济大学赵博士的书,再仔细分析估计是一本书了,呵呵。 本日志是最不负责任的一篇了,没有添加每行的注释,非常抱歉,不过我这周真的是精疲力竭了,没有太多时间研究内核,不能给大家好的分析结果,再次表示抱歉。