https://www.cnblogs.com/arnoldlu/p/8568330.html
1. General
1.1 /proc/meminfo
/proc/meminfo是了解Linux系统内存使用状况主要接口,也是free等命令的数据来源。
下面是cat /proc/meminfo的一个实例。
复制代码
MemTotal: 8054880 kB---------------------对应totalram_pages大小
MemFree: 4004312 kB---------------------对应vm_stat[NR_FREE_PAGES]大小
MemAvailable: 5678888 kB---------------------MemFree减去保留内存,加上部分pagecache和部分SReclaimable。
Buffers: 303016 kB---------------------块设备缓冲区大小
Cached: 2029616 kB---------------------主要是vm_stat[NR_FILE_PAGES],再减去swap出的大小和块设备缓冲区大小。
SwapCached: 0 kB
Active: 2123084 kB
Inactive: 1476268 kB
Active(anon): 1273544 kB
Inactive(anon): 547988 kB
Active(file): 849540 kB
Inactive(file): 928280 kB
Unevictable: 17152 kB
Mlocked: 17152 kB
SwapTotal: 7812092 kB
SwapFree: 7812092 kB
Dirty: 6796 kB
Writeback: 0 kB
AnonPages: 1283984 kB
Mapped: 455248 kB
Shmem: 550260 kB---------------------vm_stat[NR_SHMEM]
Slab: 268208 kB
SReclaimable: 206964 kB---------------------可回收的slab缓存vm_stat[NR_SLAB_RECLAIMABLE]
SUnreclaim: 61244 kB
KernelStack: 12736 kB
PageTables: 50376 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 11839532 kB
Committed_AS: 7934688 kB
VmallocTotal: 34359738367 kB
VmallocUsed: 0 kB
VmallocChunk: 0 kB
HardwareCorrupted: 0 kB
AnonHugePages: 0 kB
ShmemHugePages: 0 kB
ShmemPmdMapped: 0 kB
CmaTotal: 0 kB
CmaFree: 0 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 2048 kB
DirectMap4k: 226256 kB
DirectMap2M: 5953536 kB
DirectMap1G: 3145728 kB
复制代码
/proc/meminfo对应内核的核心函数是meminfo_proc_show(), 包括两个重要的填充sysinfo的函数si_meminfo()和si_swapinfo()。
MemTotal是系统从加电开始到引导完成,除去kernel本身要占用一些内存,最后剩下可供kernel支配的内存。
MemFree表示系统尚未使用的内存;MemAvailable表示系统可用内存,因为应用会根据系统可用内存大小动态调整申请内存大小,MemFree并不适用,因为有些内存是可以回收的,所以这部分内存要加上可回收内存。
PageTables用于将内存的虚拟地址翻译成物理地址,随着内存地址分配的越来越多,PageTable会增大。/proc/meminfo中的PageTables就是统计PageTable所占用内存大小。
KernelStack是常驻内存的,既不包括在LRU链表中,也不包括在进程RSS、PSS中,所以认为它是内核消耗的内存。
复制代码
static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
unsigned long committed;
long cached;
long available;
unsigned long pagecache;
unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
struct zone *zone;
int lru;
/*
* display in kilobytes.
*/
#define K(x) ((x) << (PAGE_SHIFT - 10))
si_meminfo(&i);
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;---------------------vm_stat[NR_FILE_PAGES]减去swap的页面和块设备缓存页面。
if (cached < 0)
cached = 0;
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);--------------遍历获取vm_stat中的5种LRU页面大小。
for_each_zone(zone)
wmark_low += zone->watermark[WMARK_LOW];
/*
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
*/
available = i.freeram - totalreserve_pages;--------------------------vm_stat[NR_FREE_PAGES]减去保留页面totalreserve_pages。
/*
* Not all the page cache can be freed, otherwise the system will
* start swapping. Assume at least half of the page cache, or the
* low watermark worth of cache, needs to stay.
*/
pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];------pagecache包括活跃和不活跃文件LRU页面两部分。
pagecache -= min(pagecache / 2, wmark_low);-------------------------保留min(pagecache/2, wmark_low)大小,确保不会被释放。
available += pagecache;---------------------------------------------可用页面增加可释放的pagecache部分。
/*
* Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
available += global_page_state(NR_SLAB_RECLAIMABLE) -
min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);--类似pagecache,可回收slab缓存保留一部分不可释放。其余部分给available。
if (available < 0)
available = 0;
/*
* Tagged format, for easy grepping and expansion.
*/
seq_printf(m,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
"MemAvailable: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
"Active: %8lu kB\n"
"Inactive: %8lu kB\n"
"Active(anon): %8lu kB\n"
"Inactive(anon): %8lu kB\n"
"Active(file): %8lu kB\n"
"Inactive(file): %8lu kB\n"
"Unevictable: %8lu kB\n"
"Mlocked: %8lu kB\n"
#ifdef CONFIG_HIGHMEM
"HighTotal: %8lu kB\n"
"HighFree: %8lu kB\n"
"LowTotal: %8lu kB\n"
"LowFree: %8lu kB\n"
#endif
#ifndef CONFIG_MMU
"MmapCopy: %8lu kB\n"
#endif
"SwapTotal: %8lu kB\n"
"SwapFree: %8lu kB\n"
"Dirty: %8lu kB\n"
"Writeback: %8lu kB\n"
"AnonPages: %8lu kB\n"
"Mapped: %8lu kB\n"
"Shmem: %8lu kB\n"
"Slab: %8lu kB\n"
"SReclaimable: %8lu kB\n"
"SUnreclaim: %8lu kB\n"
"KernelStack: %8lu kB\n"
"PageTables: %8lu kB\n"
#ifdef CONFIG_QUICKLIST
"Quicklists: %8lu kB\n"
#endif
"NFS_Unstable: %8lu kB\n"
"Bounce: %8lu kB\n"
"WritebackTmp: %8lu kB\n"
"CommitLimit: %8lu kB\n"
"Committed_AS: %8lu kB\n"
"VmallocTotal: %8lu kB\n"
"VmallocUsed: %8lu kB\n"
"VmallocChunk: %8lu kB\n"
#ifdef CONFIG_MEMORY_FAILURE
"HardwareCorrupted: %5lu kB\n"
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"AnonHugePages: %8lu kB\n"
#endif
#ifdef CONFIG_CMA
"CmaTotal: %8lu kB\n"
"CmaFree: %8lu kB\n"
#endif
,
K(i.totalram),-------------------------------------------------即totalram_pages大小
K(i.freeram),--------------------------------------------------即vm_stat[NR_FREE_PAGES]
K(available),--------------------------------------------------等于freeram减去保留totalreserve_pages,以及一部分pagecache和可回收slab缓存。
K(i.bufferram),------------------------------------------------通过nr_blockdev_pages()获取。
K(cached),-----------------------------------------------------vm_stat[NR_FILE_PAGES]减去swap部分以及块设备缓存。
K(total_swapcache_pages()),------------------------------------swap交换占用的页面大小。
K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),----------活跃页面大小
K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),--------不活跃页面大小
K(pages[LRU_ACTIVE_ANON]),
K(pages[LRU_INACTIVE_ANON]),
K(pages[LRU_ACTIVE_FILE]),
K(pages[LRU_INACTIVE_FILE]),
K(pages[LRU_UNEVICTABLE]),-------------------------------------不能被pageout/swapout的内存页面
K(global_page_state(NR_MLOCK)),
#ifdef CONFIG_HIGHMEM
K(i.totalhigh),
K(i.freehigh),
K(i.totalram-i.totalhigh),
K(i.freeram-i.freehigh),
#endif
#ifndef CONFIG_MMU
K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
#endif
K(i.totalswap),------------------------------------------------总swap空间大小
K(i.freeswap),-------------------------------------------------空闲swap空间大小
K(global_page_state(NR_FILE_DIRTY)),---------------------------等待被写回磁盘文件大小
K(global_page_state(NR_WRITEBACK)),----------------------------正在被回写文件的大小
K(global_page_state(NR_ANON_PAGES)),---------------------------映射的匿名页面
K(global_page_state(NR_FILE_MAPPED)),--------------------------映射的文件页面
K(i.sharedram),------------------------------------------------即vm_stat[NR_SHMEM]
K(global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE)),-------------slab缓存包括可回收和不可回收两部分,vm_stat[NR_SLAB_RECLAIMABLE]+vm_stat[NR_SLAB_UNRECLAIMABLE]。
K(global_page_state(NR_SLAB_RECLAIMABLE)),
K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,-------vm_stat[NR_KERNEL_STACK]大小
K(global_page_state(NR_PAGETABLE)),----------------------------pagetables所占大小
#ifdef CONFIG_QUICKLIST
K(quicklist_total_size()),
#endif
K(global_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
K(global_page_state(NR_WRITEBACK_TEMP)),
K(vm_commit_limit()),
K(committed),
(unsigned long)VMALLOC_TOTAL >> 10,----------------------------vmalloc虚拟空间的大小
0ul, // used to be vmalloc 'used'
0ul // used to be vmalloc 'largest_chunk'
#ifdef CONFIG_MEMORY_FAILURE
, atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
, K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
HPAGE_PMD_NR)
#endif
#ifdef CONFIG_CMA
, K(totalcma_pages)
, K(global_page_state(NR_FREE_CMA_PAGES))
#endif
);
hugetlb_report_meminfo(m);
arch_report_meminfo(m);
return 0;
#undef K
}
/*
 * si_meminfo - fill a sysinfo structure with RAM statistics.
 * All figures are counted in pages; mem_unit records the page size
 * in bytes so userspace can convert.
 */
void si_meminfo(struct sysinfo *val)
{
	val->mem_unit = PAGE_SIZE;

	/* total and free RAM */
	val->totalram = totalram_pages;
	val->freeram = global_page_state(NR_FREE_PAGES);

	/* shmem/tmpfs pages and block-device buffer pages */
	val->sharedram = global_page_state(NR_SHMEM);
	val->bufferram = nr_blockdev_pages();

	/* highmem portion (zero without CONFIG_HIGHMEM) */
	val->totalhigh = totalhigh_pages;
	val->freehigh = nr_free_highpages();
}
/*
 * si_swapinfo - fill in swap statistics (in pages) under swap_lock.
 * Pages belonging to swap areas that are in the middle of swapoff
 * (SWP_USED set but SWP_WRITEOK cleared) are added back so the totals
 * stay consistent during the transition.
 */
void si_swapinfo(struct sysinfo *val)
{
	unsigned int type;
	unsigned long nr_to_be_unused = 0;

	spin_lock(&swap_lock);
	/* Sum in-use pages of swap areas currently being disabled. */
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *si = swap_info[type];

		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
			nr_to_be_unused += si->inuse_pages;
	}
	/* free/total swap, including the swapoff-in-progress pages */
	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	spin_unlock(&swap_lock);
}
复制代码
参考文档:《/PROC/MEMINFO之谜》
1.2 free
free命令用来显示内存的使用情况。
free -s 2 -c 2 -w -t -h
含义为-s 每2秒显示一次,-c 共2次,-w buff/cache分开显示,-t 显示total,-h 可读性更高。
结果如下:
复制代码
total used free shared buffers cache available
Mem: 7.7G 1.4G 3.8G 534M 295M 2.1G 5.4G
Swap: 7.5G 0B 7.5G
Total: 15G 1.4G 11G
total used free shared buffers cache available
Mem: 7.7G 1.4G 3.8G 537M 295M 2.1G 5.4G
Swap: 7.5G 0B 7.5G
Total: 15G 1.4G 11G
复制代码
Mem一行指的是RAM的使用情况,Swap一行是交换分区的使用情况。
free命令是procps-ng包的一部分,主体在free.c中。这些参数的获取在meminfo()中进行。
复制代码
/*
 * main - entry point of free(1) from procps-ng (excerpt, elided with "...").
 * Repeatedly calls meminfo() to parse /proc/meminfo, then prints the
 * Mem:/Swap: (and optionally Total:) rows, honoring the -w/-t/-s/-c flags.
 */
int main(int argc, char **argv)
{
...
	do {
		/* refresh the kb_* globals from /proc/meminfo */
		meminfo();
		/* Translation Hint: You can use 9 character words in
		 * the header, and the words need to be right align to
		 * beginning of a number. */
		if (flags & FREE_WIDE) {
			/* -w: buffers and cache shown in separate columns */
			printf(_("       total        used        free      shared     buffers       cache   available"));
		} else {
			printf(_("       total        used        free      shared  buff/cache   available"));
		}
		printf("\n");
		printf("%-7s", _("Mem:"));
		printf(" %11s", scale_size(kb_main_total, flags, args));
		printf(" %11s", scale_size(kb_main_used, flags, args));
		printf(" %11s", scale_size(kb_main_free, flags, args));
		printf(" %11s", scale_size(kb_main_shared, flags, args));
		if (flags & FREE_WIDE) {
			printf(" %11s", scale_size(kb_main_buffers, flags, args));
			printf(" %11s", scale_size(kb_main_cached, flags, args));
		} else {
			printf(" %11s", scale_size(kb_main_buffers+kb_main_cached, flags, args));
		}
		printf(" %11s", scale_size(kb_main_available, flags, args));
		printf("\n");
...
		printf("%-7s", _("Swap:"));
		printf(" %11s", scale_size(kb_swap_total, flags, args));
		printf(" %11s", scale_size(kb_swap_used, flags, args));
		printf(" %11s", scale_size(kb_swap_free, flags, args));
		printf("\n");
		if (flags & FREE_TOTAL) {
			/* -t: extra row summing RAM and swap */
			printf("%-7s", _("Total:"));
			printf(" %11s", scale_size(kb_main_total + kb_swap_total, flags, args));
			printf(" %11s", scale_size(kb_main_used + kb_swap_used, flags, args));
			printf(" %11s", scale_size(kb_main_free + kb_swap_free, flags, args));
			printf("\n");
		}
		fflush(stdout);
		if (flags & FREE_REPEATCOUNT) {
			/* -c N: stop after N iterations */
			args.repeat_counter--;
			if (args.repeat_counter < 1)
				exit(EXIT_SUCCESS);
		}
		if (flags & FREE_REPEAT) {
			/* -s SECS: sleep between refreshes */
			printf("\n");
			usleep(args.repeat_interval);
		}
	} while ((flags & FREE_REPEAT));

	exit(EXIT_SUCCESS);
}
复制代码
解析部分在sysinfo.c中。通过解析/proc/meminfo信息,计算出free的各项值。
/proc/meminfo和free的对应关系如下:
free /proc/meminfo
total =MemTotal
used =MemTotal - MemFree - (Cached + SReclaimable) - Buffers
free =MemFree
shared =Shmem
buffers =Buffers
cache =Cached + SReclaimable
available =MemAvailable
复制代码
/*
 * meminfo - parse /proc/meminfo and derive the quantities free(1) prints.
 *
 * Each recognised "Name: value kB" row is stored through the slot pointer
 * in the (alphabetically sorted, bsearch'ed) mem_table[].  Afterwards the
 * derived values are computed:
 *   cache     = Cached + SReclaimable
 *   swap used = SwapTotal - SwapFree
 *   used      = MemTotal - MemFree - (Cached + SReclaimable) - Buffers
 *   available = MemAvailable, with a fallback estimate for old kernels.
 */
void meminfo(void){
	char namebuf[32]; /* big enough to hold any row name */
	int linux_version_code = procps_linux_version();
	mem_table_struct findme = { namebuf, NULL};
	mem_table_struct *found;
	char *head;
	char *tail;
	/* Must stay sorted by name: looked up with bsearch(). */
	static const mem_table_struct mem_table[] = {
	{"Active",        &kb_active},            // important
	{"Active(file)",  &kb_active_file},
	{"AnonPages",     &kb_anon_pages},
	{"Bounce",        &kb_bounce},
	{"Buffers",       &kb_main_buffers},      // important
	{"Cached",        &kb_page_cache},        // important
	{"CommitLimit",   &kb_commit_limit},
	{"Committed_AS",  &kb_committed_as},
	{"Dirty",         &kb_dirty},             // kB version of vmstat nr_dirty
	{"HighFree",      &kb_high_free},
	{"HighTotal",     &kb_high_total},
	{"Inact_clean",   &kb_inact_clean},
	{"Inact_dirty",   &kb_inact_dirty},
	{"Inact_laundry", &kb_inact_laundry},
	{"Inact_target",  &kb_inact_target},
	{"Inactive",      &kb_inactive},          // important
	{"Inactive(file)",&kb_inactive_file},
	{"LowFree",       &kb_low_free},
	{"LowTotal",      &kb_low_total},
	{"Mapped",        &kb_mapped},            // kB version of vmstat nr_mapped
	{"MemAvailable",  &kb_main_available},    // important
	{"MemFree",       &kb_main_free},         // important
	{"MemTotal",      &kb_main_total},        // important
	{"NFS_Unstable",  &kb_nfs_unstable},
	{"PageTables",    &kb_pagetables},        // kB version of vmstat nr_page_table_pages
	{"ReverseMaps",   &nr_reversemaps},       // same as vmstat nr_page_table_pages
	{"SReclaimable",  &kb_slab_reclaimable},  // "slab reclaimable" (dentry and inode structures)
	{"SUnreclaim",    &kb_slab_unreclaimable},
	{"Shmem",         &kb_main_shared},       // kernel 2.6.32 and later
	{"Slab",          &kb_slab},              // kB version of vmstat nr_slab
	{"SwapCached",    &kb_swap_cached},
	{"SwapFree",      &kb_swap_free},         // important
	{"SwapTotal",     &kb_swap_total},        // important
	{"VmallocChunk",  &kb_vmalloc_chunk},
	{"VmallocTotal",  &kb_vmalloc_total},
	{"VmallocUsed",   &kb_vmalloc_used},
	{"Writeback",     &kb_writeback},         // kB version of vmstat nr_writeback
	};
	const int mem_table_count = sizeof(mem_table)/sizeof(mem_table_struct);
	unsigned long watermark_low;
	signed long mem_available, mem_used;

	FILE_TO_BUF(MEMINFO_FILE,meminfo_fd);

	/* sentinels: detect whether the kernel provided these rows */
	kb_inactive = ~0UL;
	kb_low_total = kb_main_available = 0;

	/* Walk the buffer line by line: split each line at ':', look the
	 * name up in mem_table, and store the numeric value in its slot. */
	head = buf;
	for(;;){
		tail = strchr(head, ':');
		if(!tail) break;
		*tail = '\0';
		if(strlen(head) >= sizeof(namebuf)){
			head = tail+1;
			goto nextline;
		}
		strcpy(namebuf,head);
		found = bsearch(&findme, mem_table, mem_table_count,
			sizeof(mem_table_struct), compare_mem_table_structs
		);
		head = tail+1;
		if(!found) goto nextline;
		*(found->slot) = (unsigned long)strtoull(head,&tail,10);
nextline:
		tail = strchr(head, '\n');
		if(!tail) break;
		head = tail+1;
	}
	if(!kb_low_total){ /* low==main except with large-memory support */
		kb_low_total = kb_main_total;
		kb_low_free  = kb_main_free;
	}
	if(kb_inactive==~0UL){
		/* very old kernels: reconstruct Inactive from its parts */
		kb_inactive = kb_inact_dirty + kb_inact_clean + kb_inact_laundry;
	}
	kb_main_cached = kb_page_cache + kb_slab_reclaimable;
	kb_swap_used = kb_swap_total - kb_swap_free;

	/* if kb_main_available is greater than kb_main_total or our calculation of
	   mem_used overflows, that's symptomatic of running within a lxc container
	   where such values will be dramatically distorted over those of the host. */
	if (kb_main_available > kb_main_total)
		kb_main_available = kb_main_free;
	/* used = MemTotal - MemFree - (Cached + SReclaimable) - Buffers */
	mem_used = kb_main_total - kb_main_free - kb_main_cached - kb_main_buffers;
	if (mem_used < 0)
		mem_used = kb_main_total - kb_main_free;
	kb_main_used = (unsigned long)mem_used;

	/* zero? might need fallback for 2.6.27 <= kernel <? 3.14 */
	if (!kb_main_available) {
#ifdef __linux__
		if (linux_version_code < LINUX_VERSION(2, 6, 27))
			kb_main_available = kb_main_free;
		else {
			/* Emulate the kernel's MemAvailable estimate from
			 * min_free_kbytes, the file LRU and reclaimable slab. */
			FILE_TO_BUF(VM_MIN_FREE_FILE, vm_min_free_fd);
			kb_min_free = (unsigned long) strtoull(buf,&tail,10);

			watermark_low = kb_min_free * 5 / 4; /* should be equal to sum of all 'low' fields in /proc/zoneinfo */

			mem_available = (signed long)kb_main_free - watermark_low
			+ kb_inactive_file + kb_active_file - MIN((kb_inactive_file + kb_active_file) / 2, watermark_low)
			+ kb_slab_reclaimable - MIN(kb_slab_reclaimable / 2, watermark_low);

			if (mem_available < 0) mem_available = 0;
			kb_main_available = (unsigned long)mem_available;
		}
#else
		kb_main_available = kb_main_free;
#endif /* linux */
	}
}
复制代码
1.3 /proc/buddyinfo
/proc/buddyinfo显示Linux buddy系统空闲物理内存使用情况,行为内存节点不同zone,列为不同order。
Node 0, zone DMA 1 1 1 1 2 2 0 0 1 1 3
Node 0, zone DMA32 7 8 8 9 6 3 8 7 7 7 441
Node 0, zone Normal 141 168 320 174 81 66 39 13 27 17 782
buddyinfo中的Node0表示节点ID,而每个节点下的内存设备又可以划分为多个内存区域。每列的值表示当前节点当前zone中的空闲连续页面数量。
复制代码
/*
 * frag_show_print - print one /proc/buddyinfo row for a zone:
 * the number of free blocks in that zone's buddy lists at each order.
 */
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	/* one column per order: count of free blocks of that size */
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}
/*
* This walks the free areas for each zone.
*/
/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* visit every zone of this node, printing one buddyinfo row each */
	walk_zones_in_node(m, pgdat, frag_show_print);
	return 0;
}
复制代码
1.4 /proc/pagetypeinfo
pagetypeinfo比buddyinfo更加详细,更进一步将页面按照不同迁移类型划分。
pagetypeinfo分为三部分:pageblock的阶数和大小、不同节点不同zone中各迁移类型在各阶上的空闲页块数、各迁移类型的pageblock总数。
复制代码
Page block order: 9
Pages per block: 512-------------------------------------------------------------------------------------------------------------一个pageblock占用
多少个页面
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 ---------这个部分是空闲的连续个order介数页面数量
Node 0, zone DMA, type Unmovable 1 1 1 1 2 2 0 0 1 0 0
Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 1 3
Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type CMA 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA32, type Unmovable 0 1 1 1 0 0 1 0 1 1 0
Node 0, zone DMA32, type Movable 7 7 7 8 6 3 7 7 6 6 441
Node 0, zone DMA32, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA32, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA32, type CMA 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Unmovable 75 124 43 2 8 2 3 0 1 1 0
Node 0, zone Normal, type Movable 33 246 173 172 78 36 10 8 2 1 709
Node 0, zone Normal, type Reclaimable 239 370 231 33 45 23 12 8 5 12 1
Node 0, zone Normal, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone Normal, type CMA 0 0 0 0 0 0 0 0 0 0 0
Node 0, zone Normal, type Isolate 0 0 0 0 0 0 0 0 0 0 0
Number of blocks type Unmovable Movable Reclaimable HighAtomic CMA Isolate -----------------------------这里是pageblock的数目,pageblock的大小在第一部分确定。
Node 0, zone DMA 1 7 0 0 0 0
Node 0, zone DMA32 2 950 0 0 0 0
Node 0, zone Normal 140 2662 302 0 0 0
复制代码
第三部分减去第二部分就是被使用掉的页面数量。
下面是核心代码:
复制代码
/*
 * pagetypeinfo_show - seq_file show routine for /proc/pagetypeinfo.
 * Prints the pageblock order/size header, then delegates the three data
 * sections (free counts per migratetype, pageblock counts, mixed counts).
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);
	pagetypeinfo_showmixedcount(m, pgdat);

	return 0;
}
/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');
	/* walk every zone of this node; one row per zone per migratetype */
	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);

	return 0;
}
/*
 * pagetypeinfo_showfree_print - for one zone, print the free block count
 * at every order, one row per migratetype.
 */
static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	/* One row per migratetype: UNMOVABLE, MOVABLE, RECLAIMABLE,
	 * HIGHATOMIC, CMA and ISOLATE. */
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		/* Walk orders upward, counting blocks on this
		 * migratetype's free list at each order. */
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype])
				freecount++;
			seq_printf(m, "%6lu ", freecount);
		}
		seq_putc(m, '\n');
	}
}
/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	/* walk every zone of this node; one count row per zone */
	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);

	return 0;
}
/*
 * pagetypeinfo_showblockcount_print - for one zone, count how many
 * pageblocks belong to each migratetype and print one row of totals.
 */
static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };

	/* Step through the zone one pageblock at a time, tallying each
	 * block under its current migratetype. */
	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);

		/* Watch for unexpected holes punched in the memmap */
		if (!memmap_valid_within(pfn, page, zone))
			continue;

		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}
复制代码
1.4 /proc/vmstat
/proc/vmstat主要是导出vm_stat[]、vm_numa_stat[]、vm_node_stat[]的统计信息,对应的字符串信息在vmstat_text[]中;其他信息还包括writeback_stat_item和VM事件计数等统计项。
复制代码
nr_free_pages 1148275
nr_zone_inactive_anon 129283
nr_zone_active_anon 312361
nr_zone_inactive_file 207534
nr_zone_active_file 122432
nr_zone_unevictable 3743
nr_zone_write_pending 131
nr_mlock 3751
nr_page_table_pages 12230
nr_kernel_stack 12048
nr_bounce 0
nr_zspages 0
nr_free_cma 0
numa_hit 11496173
numa_miss 0
numa_foreign 0
numa_interleave 44278
numa_local 11496173
numa_other 0
...
复制代码
/proc/vmstat对应的文件操作函数为vmstat_file_operations。
在vmstat_start()中获取各参数到v[]中,里面的数值和vmstat_text[]里的字符一一对应。
然后在vmstat_show()中一条一条打印出来。
复制代码
/*
 * vmstat_text[] - names of the /proc/vmstat rows, in the exact order the
 * counters are packed into the snapshot buffer by vmstat_start()
 * (zone stats first, then NUMA, node, writeback and event counters).
 * Excerpt only; the full table continues past the "..." elision.
 */
const char * const vmstat_text[] = {
	/* enum zone_stat_item counters */
	"nr_free_pages",
	"nr_zone_inactive_anon",
	"nr_zone_active_anon",
	"nr_zone_inactive_file",
	"nr_zone_active_file",
	"nr_zone_unevictable",
	"nr_zone_write_pending",
	"nr_mlock",
	"nr_page_table_pages",
	"nr_kernel_stack",
	"nr_bounce",
...
};
/*
 * vmstat_start - seq_file start routine for /proc/vmstat.
 * Allocates one flat snapshot buffer and fills it with every counter
 * group back to back (zone, NUMA, node, writeback thresholds, events),
 * in the same order as the names in vmstat_text[].  Returns a pointer
 * to the entry at position *pos within that buffer.
 */
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
	int i, stat_items_size;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;
	/* total room needed for all counter groups */
	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
			NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) +
			NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
			NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);

#ifdef CONFIG_VM_EVENT_COUNTERS
	stat_items_size += sizeof(struct vm_event_state);
#endif

	v = kmalloc(stat_items_size, GFP_KERNEL);
	m->private = v;	/* freed by the matching seq stop routine */
	if (!v)
		return ERR_PTR(-ENOMEM);
	/* zone-wide counters */
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_zone_page_state(i);
	v += NR_VM_ZONE_STAT_ITEMS;

#ifdef CONFIG_NUMA
	/* per-NUMA-event counters */
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		v[i] = global_numa_state(i);
	v += NR_VM_NUMA_STAT_ITEMS;
#endif

	/* per-node counters */
	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		v[i] = global_node_page_state(i);
	v += NR_VM_NODE_STAT_ITEMS;

	/* dirty-throttling thresholds */
	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
			    v + NR_DIRTY_THRESHOLD);
	v += NR_VM_WRITEBACK_STAT_ITEMS;

#ifdef CONFIG_VM_EVENT_COUNTERS
	all_vm_events(v);
	v[PGPGIN] /= 2;		/* sectors -> kbytes */
	v[PGPGOUT] /= 2;
#endif
	return (unsigned long *)m->private + *pos;
}
/*
 * vmstat_show - print one "name value\n" line of /proc/vmstat.
 * The iterator value is a pointer into the snapshot buffer built by
 * vmstat_start(); its distance from the buffer base selects the
 * matching label in vmstat_text[].
 */
static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *entry = arg;
	unsigned long *base = (unsigned long *)m->private;
	unsigned long index = entry - base;

	seq_puts(m, vmstat_text[index]);
	seq_put_decimal_ull(m, " ", *entry);
	seq_putc(m, '\n');
	return 0;
}
/* seq_file iterator for /proc/vmstat */
static const struct seq_operations vmstat_op = {
	.start	= vmstat_start,	/* snapshot all counters, seek to *pos */
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,	/* one "name value" line per entry */
};

/* open handler: bind the seq iterator to the file */
static int vmstat_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &vmstat_op);
}

/* /proc/vmstat file operations: standard seq_file plumbing */
static const struct file_operations vmstat_file_operations = {
	.open		= vmstat_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
复制代码
1.5 /proc/vmallocinfo
提供vmalloc以及map区域相关信息,一块区域一行信息。
复制代码
0xffffaeec00000000-0xffffaeec00002000 8192 acpi_os_map_iomem+0x17c/0x1b0 phys=0x0000000077fe9000 ioremap
0xffffaeec00002000-0xffffaeec00004000 8192 acpi_os_map_iomem+0x17c/0x1b0 phys=0x0000000077faa000 ioremap
0xffffaeec00004000-0xffffaeec00006000 8192 acpi_os_map_iomem+0x17c/0x1b0 phys=0x0000000077ffd000 ioremap
...
0xffffaeec00043000-0xffffaeec00045000 8192 acpi_os_map_iomem+0x17c/0x1b0 phys=0x0000000077fcb000 ioremap
0xffffaeec00045000-0xffffaeec00047000 8192 acpi_os_map_iomem+0x17c/0x1b0 phys=0x0000000077fe4000 ioremap
0xffffaeec00047000-0xffffaeec00049000 8192 acpi_os_map_iomem+0x17c/0x1b0 phys=0x0000000077fee000 ioremap
0xffffaeec00049000-0xffffaeec0004b000 8192 pci_iomap_range+0x63/0x80 phys=0x000000009432d000 ioremap
0xffffaeec0004b000-0xffffaeec0004d000 8192 acpi_os_map_iomem+0x17c/0x1b0 phys=0x0000000077fc3000 ioremap
...
0xffffaeec00c65000-0xffffaeec00c86000 135168 alloc_large_system_hash+0x19c/0x259 pages=32 vmalloc N0=32
复制代码
/proc/vmallocinfo调用vmalloc_open()来遍历vmap_area_list,在s_show()中显示每个区域信息。
从下面的s_show()可知,第一列是区域虚拟地址起点终点,第二列是区域的大小,第三列是调用者,第四列是对应的页面数量(如果有的话),第五列是物理地址,第六列是区域类型,最后是各NUMA节点上的页面分布信息。
复制代码
/*
 * s_show - print one /proc/vmallocinfo line for a vmap area:
 * virtual range, size, caller, page count, physical address and the
 * mapping-type flags (ioremap/vmalloc/vmap/user/vpages), then NUMA info.
 */
static int s_show(struct seq_file *m, void *p)
{
	struct vmap_area *va = p;
	struct vm_struct *v;

	/*
	 * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
	 * behalf of vmap area is being tear down or vm_map_ram allocation.
	 */
	if (!(va->flags & VM_VM_AREA))
		return 0;

	v = va->vm;

	/* start-end range and size in bytes */
	seq_printf(m, "0x%pK-0x%pK %7ld",
		v->addr, v->addr + v->size, v->size);

	/* symbolic name of the function that created the mapping */
	if (v->caller)
		seq_printf(m, " %pS", v->caller);

	if (v->nr_pages)
		seq_printf(m, " pages=%d", v->nr_pages);

	if (v->phys_addr)
		seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);

	/* one tag per mapping type flag */
	if (v->flags & VM_IOREMAP)
		seq_puts(m, " ioremap");

	if (v->flags & VM_ALLOC)
		seq_puts(m, " vmalloc");

	if (v->flags & VM_MAP)
		seq_puts(m, " vmap");

	if (v->flags & VM_USERMAP)
		seq_puts(m, " user");

	if (v->flags & VM_VPAGES)
		seq_puts(m, " vpages");

	/* trailing per-node page counts (NUMA builds) */
	show_numa_info(m, v);
	seq_putc(m, '\n');
	return 0;
}
/* seq_file iterator for /proc/vmallocinfo (walks vmap_area_list) */
static const struct seq_operations vmalloc_op = {
	.start	= s_start,
	.next	= s_next,
	.stop	= s_stop,
	.show	= s_show,	/* one line per vmap area */
};

/*
 * vmalloc_open - open /proc/vmallocinfo.
 * On NUMA builds, allocate per-node scratch space (used by
 * show_numa_info()) as the seq_file's private data.
 */
static int vmalloc_open(struct inode *inode, struct file *file)
{
	if (IS_ENABLED(CONFIG_NUMA))
		return seq_open_private(file, &vmalloc_op,
					nr_node_ids * sizeof(unsigned int));
	else
		return seq_open(file, &vmalloc_op);
}
复制代码
1.6 /proc/self/statm、maps
1.6.1 /proc/self/statm
每个进程都有自己的statm,statm显示当前进程的内存使用情况,以page为单位。
3679 213 197 8 0 111 0
statm一共7项,分别解释如下:
size:进程虚拟地址空间的大小。
resident:应用程序占用的物理内存大小。
shared:共享页面大小。
text:代码段占用的大小。
lib:为0。
data:data_vm+stack_vm占用的大小。
dt:脏页,为0。
/proc/self/statm的核心函数是proc_pid_statm(),通过task_statm()获取相关参数,然后打印。
复制代码
/*
 * proc_pid_statm - emit /proc/<pid>/statm: seven space-separated fields,
 * all in pages: size resident shared text lib(0) data dt(0).
 * Values come from task_statm(); a task without an mm prints zeros.
 */
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
			struct pid *pid, struct task_struct *task)
{
	unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
	struct mm_struct *mm = get_task_mm(task);

	if (mm) {
		size = task_statm(mm, &shared, &text, &data, &resident);
		mmput(mm);
	}
	/*
	 * For quick read, open code by putting numbers directly
	 * expected format is
	 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
	 *               size, resident, shared, text, data);
	 */
	seq_put_decimal_ull(m, "", size);
	seq_put_decimal_ull(m, " ", resident);
	seq_put_decimal_ull(m, " ", shared);
	seq_put_decimal_ull(m, " ", text);
	seq_put_decimal_ull(m, " ", 0);		/* lib: always 0 */
	seq_put_decimal_ull(m, " ", data);
	seq_put_decimal_ull(m, " ", 0);		/* dt: always 0 */
	seq_putc(m, '\n');

	return 0;
}
/*
 * task_statm - compute the statm fields for an mm (all in pages).
 * @shared:   file-backed + shmem resident pages
 * @text:     size of the code segment
 * @data:     data_vm + stack_vm
 * @resident: shared plus resident anonymous pages
 * Returns the total virtual address space size (total_vm).
 */
unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter(mm, MM_FILEPAGES) +
			get_mm_counter(mm, MM_SHMEMPAGES);
	/* code segment length, rounded to whole pages */
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->data_vm + mm->stack_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
	return mm->total_vm;
}
复制代码
1.6.2 /proc/self/maps
maps显示当前进程各虚拟地址段的属性,包括虚拟地址段的起始终止地址、读写执行属性、vm_pgoff、主从设备号、i_ino、文件名。
复制代码
6212616d000-562126175000 r-xp 00000000 08:06 1569818 /bin/cat--------------------------只读、可执行,一般是代码段的位置。
562126374000-562126375000 r--p 00007000 08:06 1569818 /bin/cat-------------------------只读属性、不可执行。
562126375000-562126376000 rw-p 00008000 08:06 1569818 /bin/cat-------------------------读写、不可执行。
562126f5b000-562126f7c000 rw-p 00000000 00:00 0 [heap]
7fd5423d5000-7fd542da4000 r--p 00000000 08:06 922566 /usr/lib/locale/locale-archive
7fd542da4000-7fd542f8b000 r-xp 00000000 08:06 136724 /lib/x86_64-linux-gnu/libc-2.27.so
7fd542f8b000-7fd54318b000 ---p 001e7000 08:06 136724 /lib/x86_64-linux-gnu/libc-2.27.so
7fd54318b000-7fd54318f000 r--p 001e7000 08:06 136724 /lib/x86_64-linux-gnu/libc-2.27.so
7fd54318f000-7fd543191000 rw-p 001eb000 08:06 136724 /lib/x86_64-linux-gnu/libc-2.27.so
7fd543191000-7fd543195000 rw-p 00000000 00:00 0
7fd543195000-7fd5431bc000 r-xp 00000000 08:06 136696 /lib/x86_64-linux-gnu/ld-2.27.so
7fd54338d000-7fd54338f000 rw-p 00000000 00:00 0
7fd54339a000-7fd5433bc000 rw-p 00000000 00:00 0
7fd5433bc000-7fd5433bd000 r--p 00027000 08:06 136696 /lib/x86_64-linux-gnu/ld-2.27.so
7fd5433bd000-7fd5433be000 rw-p 00028000 08:06 136696 /lib/x86_64-linux-gnu/ld-2.27.so
7fd5433be000-7fd5433bf000 rw-p 00000000 00:00 0
7ffe3ab8a000-7ffe3abab000 rw-p 00000000 00:00 0 [stack]
7ffe3abd5000-7ffe3abd8000 r--p 00000000 00:00 0 [vvar]
7ffe3abd8000-7ffe3abda000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
复制代码
首先要遍历当前进程的所有vma,然后show_map_vma()显示每个vma的详细信息。
vdso的全称是虚拟动态共享库(virtual dynamic shared library),而vsyscall的全称是虚拟系统调用(virtual system call)。
复制代码
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
const char *name = NULL;
if (file) {
struct inode *inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;------------------------------是这个vma的第一页在地址空间里是第几页。
}
start = vma->vm_start;
end = vma->vm_end;
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
/*
* Print the dentry name for named mappings, and a
* special [heap] marker for the heap:
*/
if (file) {---------------------------------------------------------------------如果vm_file是文件,显示其路径。
seq_pad(m, ' ');
seq_file_path(m, file, "\n");
goto done;
}
if (vma->vm_ops && vma->vm_ops->name) {
name = vma->vm_ops->name(vma);
if (name)
goto done;
}
name = arch_vma_name(vma);
if (!name) {
if (!mm) {------------------------------------------------------------------不是文件但是,name和mm都不为空,名称为vdso。
name = "[vdso]";
goto done;
}
if (vma->vm_start <= mm->brk &&
vma->vm_end >= mm->start_brk) {
name = "[heap]";
goto done;
}
if (is_stack(vma))
name = "[stack]";
}
done:
if (name) {
seq_pad(m, ' ');
seq_puts(m, name);
}
seq_putc(m, '\n');
}
/*
 * show_vma_header_prefix - print the fixed-format prefix of a maps line:
 * "start-end rwxp offset major:minor inode ".
 * 's' vs 'p' in the fourth permission column marks shared vs private.
 */
static void show_vma_header_prefix(struct seq_file *m,
				   unsigned long start, unsigned long end,
				   vm_flags_t flags, unsigned long long pgoff,
				   dev_t dev, unsigned long ino)
{
	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
		   start,
		   end,
		   flags & VM_READ ? 'r' : '-',
		   flags & VM_WRITE ? 'w' : '-',
		   flags & VM_EXEC ? 'x' : '-',
		   flags & VM_MAYSHARE ? 's' : 'p',
		   pgoff,
		   MAJOR(dev), MINOR(dev), ino);
}
复制代码
2. vm参数
2.1 /proc/sys/vm/highmem_is_dirtyable
首先highmem_is_dirtyable只有在CONFIG_HIGHMEM定义的情况下,才有效。
默认为0,即在计算dirty_ratio和dirty_background_ratio的时候只考虑low mem。当打开之后才会将highmem也计算在内。
2.2 /proc/sys/vm/legacy_va_layout
默认为0,即使用新式的mmap布局;设置为非0则退回2.4内核的传统布局。
2.3 /proc/sys/vm/lowmem_reserve_ratio
lowmem_reserve_ratio是防止highmem内存在不充裕情况下,过度借用低端内存。
lowmem_reserve_ratio决定了每个zone保留多少数目的页面。
sysctl_lowmem_reserve_ratio中定义了不同zone的预留比例,值越大保留比例越小。如,DMA为1/256,NORMAL为1/32。
复制代码
/*
 * sysctl_lowmem_reserve_ratio[] - per-zone divisors for lowmem reserves
 * (tunable via /proc/sys/vm/lowmem_reserve_ratio).  A lower zone reserves
 * managed_pages/ratio pages against fallback allocations from higher
 * zones, so a LARGER ratio means a SMALLER reserve (DMA: 1/256,
 * NORMAL: 1/32).
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	256,	/* ZONE_DMA */
#endif
#ifdef CONFIG_ZONE_DMA32
	256,	/* ZONE_DMA32 */
#endif
#ifdef CONFIG_HIGHMEM
	32,	/* ZONE_HIGHMEM */
#endif
	32,	/* ZONE_NORMAL (or highest configured zone below MOVABLE) */
};
/*
 * setup_per_zone_lowmem_reserve - recompute every zone's lowmem_reserve[]
 * from sysctl_lowmem_reserve_ratio[].  Each lower zone keeps a reserve of
 * managed_pages/ratio pages against allocations falling back from higher
 * zones, which stops high zones from exhausting scarce low memory.
 */
static void setup_per_zone_lowmem_reserve(void)
{
	struct pglist_data *pgdat;
	enum zone_type j, idx;

	for_each_online_pgdat(pgdat) {
		/* visit every zone of this node */
		for (j = 0; j < MAX_NR_ZONES; j++) {
			struct zone *zone = pgdat->node_zones + j;
			/* pages managed by the buddy allocator in this zone */
			unsigned long managed_pages = zone->managed_pages;

			zone->lowmem_reserve[j] = 0;

			idx = j;
			/* Walk the zones below the current one; j is the
			 * higher (requesting) zone, idx the lower zone that
			 * must keep a reserve against it. */
			while (idx) {
				struct zone *lower_zone;

				idx--;

				/* Clamp the ratio to >= 1 so the reserve can
				 * never exceed the total managed memory. */
				if (sysctl_lowmem_reserve_ratio[idx] < 1)
					sysctl_lowmem_reserve_ratio[idx] = 1;

				lower_zone = pgdat->node_zones + idx;
				/* reserve of lower zone idx against zone j */
				lower_zone->lowmem_reserve[j] = managed_pages /
					sysctl_lowmem_reserve_ratio[idx];
				/* accumulate pages of all zones above idx */
				managed_pages += lower_zone->managed_pages;
			}
		}
	}

	/* update totalreserve_pages */
	calculate_totalreserve_pages();
}
复制代码
2.4 /proc/sys/vm/max_map_count 、/proc/sys/vm/mmap_min_addr
max_map_count规定了mmap区域的最大数目,默认值是65536。
mmap_min_addr规定了进程可以mmap的最低虚拟地址,默认是4096,用于防止空指针解引用类漏洞被利用。
2.5 /proc/sys/vm/min_free_kbytes
min_free_kbytes是强制系统lowmem保持最低限度的空闲内存大小,这个值用于计算WMARK_MIN水位。
如果设置过低,可能造成系统在高负荷下易死锁;如果设置过高,又容易触发OOM机制。
2.6 /proc/sys/vm/stat_interval
VM统计信息的采样周期,默认1秒。
2.7 /proc/sys/vm/vfs_cache_pressure
vfs_cache_pressure用于控制dentry/inode页面回收的倾向性,默认是为100。这里的倾向性是和pagecache/swapcache回收相对比的。
当vfs_cache_pressure=100,是对两者采取一个平衡的策略。
当vfs_cache_pressure小于100,更倾向于保留dentry/inode类型页面。
当vfs_cache_pressure大于100,更倾向于回收dentry/inode类型页面。
当vfs_cache_pressure为0时,内核不会回收dentry/inode类型页面。
当vfs_cache_pressure远高于100时,可能引起性能回退,因为内存回收会持有很多锁来查找可释放页面。
2.8 /proc/sys/vm/page-cluster
一次从swap分区读取的页面阶数,0表示1页,1表示2页。类似于pagecache的预读取功能。
主要用于提高从swap恢复的读性能。
2. swap
2.1 /proc/swaps
/proc/swaps文件操作函数在proc_swaps_operations。
swap_start()遍历swap_info[]所有swap文件,然后在swap_show()中显示每个swap文件的信息。
复制代码
/*
 * seq_file ->start callback for /proc/swaps: returns SEQ_START_TOKEN for
 * the header line, the (*pos)'th active swap_info entry, or NULL at EOF.
 */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
struct swap_info_struct *si;
int type;
loff_t l = *pos;
mutex_lock(&swapon_mutex); /* guards the swap_info[] walk; NOTE(review): presumably dropped in swap_stop() — standard seq_file pattern */
if (!l)
return SEQ_START_TOKEN; /* position 0 prints the column header */
for (type = 0; type < nr_swapfiles; type++) {
smp_rmb(); /* read nr_swapfiles before swap_info[type] */
si = swap_info[type];
if (!(si->flags & SWP_USED) || !si->swap_map)
continue; /* skip unused or not-fully-initialized slots */
if (!--l)
return si;
}
return NULL;
}
static int swap_show(struct seq_file *swap, void *v)
{
struct swap_info_struct *si = v;
struct file *file;
int len;
if (si == SEQ_START_TOKEN) {
seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
return 0;
}
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");-----------------根据file显示swap文件的名称。
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file_inode(file)->i_mode) ?-----------------判断swap文件类型是块设备分区还是一个文件
"partition" : "file\t",
si->pages << (PAGE_SHIFT - 10),---------------------以KB为单位的swap总大小
si->inuse_pages << (PAGE_SHIFT - 10),---------------以KB为单位的被使用部分大小
si->prio);------------------------------------------swap优先级
return 0;
}
/* seq_file iterator backing /proc/swaps. */
static const struct seq_operations swaps_op = {
.start = swap_start,
.next = swap_next,
.stop = swap_stop,
.show = swap_show
};
复制代码
示例如下:
Filename Type Size Used Priority
/dev/sda7 partition 7812092 0 -2
2.2 /proc/sys/vm/swappiness
3. zone
/proc/zoneinfo
4. slab
/proc/slab_allocators
/proc/slabinfo
slabinfo
5. KSM
/sys/kernel/mm/ksm
6. 页面迁移
/sys/kernel/debug/tracing/events/migrate
7. 内存规整
/proc/sys/vm/compact_memory、/proc/sys/vm/extfrag_threshold
echo 1到compact_memory触发内存规整,extfrag_threshold是内存规整碎片阈值。
两者详情见:compact_memory和extfrag_threshold。
/sys/kernel/debug/extfrag
/sys/kernel/debug/tracing/events/compaction
8. OOM
关于OOM的介绍Linux内存管理 (21)OOM。
/proc/sys/vm/panic_on_oom
当Kernel遇到OOM的时候,根据panic_on_oom采取行动,有两种:
panic_on_oom==2或者==1:产生内核Panic
panic_on_oom==0:启动OOM选择进程,杀死以释放内存
复制代码
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
/*
 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 */
void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
return;
/*
 * panic_on_oom == 1 only affects CONSTRAINT_NONE; the kernel does not
 * panic for cpuset, mempolicy, or memcg allocation failures. Only
 * panic_on_oom == 2 panics unconditionally.
 */
if (sysctl_panic_on_oom != 2 && constraint != CONSTRAINT_NONE)
return;
/* Do not panic for oom kills triggered by sysrq */
if (is_sysrq_oom(oc))
return;
dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
复制代码
/proc/sys/vm/oom_kill_allocating_task
在触发OOM的情况下,选择杀死哪个进程的策略是有个oom_kill_allocating_task来决定。
oom_kill_allocating_task==1:谁触发了OOM就杀死谁
oom_kill_allocating_task==0:在系统范围内选择最'bad'进程杀死
默认情况下该变量为0,如果配置了此值,则当内存被耗尽时,或者内存不足已满足需要分配的内存时,会把当前申请内存分配的进程杀掉。
复制代码
/*
 * Excerpt of out_of_memory(): victim selection. With the
 * oom_kill_allocating_task sysctl set, the allocating task itself is
 * killed (unless unkillable or oom_score_adj == OOM_SCORE_ADJ_MIN);
 * otherwise the "worst" task system-wide is chosen. (Inline annotations
 * converted to real C comments.)
 */
bool out_of_memory(struct oom_control *oc)
{
...
if (sysctl_oom_kill_allocating_task && current->mm && /* kill the task that triggered the OOM */
!oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
oom_kill_process(oc, current, 0, totalpages, NULL,
"Out of memory (oom_kill_allocating_task)");
return true;
}
p = select_bad_process(oc, &points, totalpages); /* otherwise pick the 'worst' process system-wide */
...
return true;
}
复制代码
/proc/sys/vm/oom_dump_tasks
决定OOM发生时是否调用dump_tasks打印各进程的内存使用信息,oom_dump_tasks==1则打印,否则不打印。
/proc/xxx/oom_score、/proc/xxx/oom_adj、/proc/xxx/oom_score_adj
这三个参数都是具体进程相关的,其中oom_score是只读的。
复制代码
/* Excerpt of the /proc/<pid>/task/<tid> entry table for the OOM tunables:
 * oom_score is read-only; oom_adj and oom_score_adj are owner-writable. */
static const struct pid_entry tid_base_stuff[] = {
...
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
...
}
复制代码
oom_score的结果来自于oom_badness,主要来自两部分,一是根据进程内存使用情况打分,另一部分来自于用户打分即oom_score_adj。
如果oom_score_adj为OOM_SCORE_ADJ_MIN的话,就禁止了OOM杀死进程。
oom_adj是一个旧接口参数,取值范围是[-16, 15]。oom_adj通过一定计算转换成oom_score_adj。
oom_score_adj通过用户空间直接写入进程的signal->oom_score_adj。
这三者之间关系简单概述:oom_adj映射到oom_score_adj;oom_score_adj作为一部分计算出oom_score;oom_score才是OOM机制选择’bad’进程的依据。
oom_score_adj和oom_adj的关系
内核首先根据内存使用情况计算出points得分,oom_score_adj的范围是[-1000, 1000],adj的值是将oom_score_adj归一化后乘以totalpages的结果。
如果oom_score_adj为0,则不计入oom_score_adj的影响。
如果oom_score_adj为负数,则最终得分会变小,进程降低被选中可能性。
如果oom_score_adj为正数,则加大被选为’bad’的可能性。
复制代码
/*
 * Excerpt of oom_badness(): how the user-supplied oom_score_adj is
 * folded into the memory-usage-based score.
 */
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
const nodemask_t *nodemask, unsigned long totalpages)
{
...
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000; /* each oom_score_adj unit is worth totalpages/1000 points */
points += adj;
...
}
复制代码
oom_adj和oom_score_adj的关系
可以看出oom_adj从区间[-16, 15]被映射到oom_score_adj区间[-1000, 1000]。
复制代码
/*
 * Excerpt of oom_adj_write(): mapping the legacy oom_adj range [-16, 15]
 * onto oom_score_adj's range [-1000, 1000]. (Inline annotations converted
 * to real C comments.)
 */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
...
/*
 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
 * value is always attainable.
 */
if (oom_adj == OOM_ADJUST_MAX) /* OOM_ADJUST_MAX maps exactly to OOM_SCORE_ADJ_MAX */
oom_adj = OOM_SCORE_ADJ_MAX;
else
oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; /* linear mapping into the oom_score_adj range */
if (oom_adj < task->signal->oom_score_adj &&
!capable(CAP_SYS_RESOURCE)) { /* lowering the score requires CAP_SYS_RESOURCE */
err = -EACCES;
goto err_sighand;
}
...
task->signal->oom_score_adj = oom_adj; /* store the converted value */
...
}
复制代码
/sys/kernel/debug/tracing/events/oom
参考文档:《Linux vm运行参数之(二):OOM相关的参数》
9. Overcommit
参考文档:《理解LINUX的MEMORY OVERCOMMIT》
当进程需要内存时,进程从内核获得的仅仅是一段虚拟地址的使用权,而不是实际的物理内存。
实际的物理内存只有当进程真的去访问时,产生缺页异常,从而进入分配实际物理内存的分配。
看起来虚拟内存和物理内存分配被分割开了,虚拟内存分配可以超过物理内存的限制,这种情况称为Overcommit。
相关参数初始化:
/* Overcommit tunables and their default values. */
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
unsigned long sysctl_overcommit_kbytes __read_mostly; /* 0 = use ratio instead of absolute kbytes */
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
9.1 /proc/sys/vm/overcommit_memory
关于Overcommit的策略有三种:
#define OVERCOMMIT_GUESS 0---------让内核根据自己当前状况进行判断。
#define OVERCOMMIT_ALWAYS 1-------不限制Overcommit,无论进程申请多少虚拟地址空间。
#define OVERCOMMIT_NEVER 2---------不允许Overcommit,会根据overcommit_ratio计算出一个overcommit阈值。
overcommit_memory ==0,系统默认设置,释放较少物理内存,使得oom-kill机制运作比较明显。
Heuristic overcommit handling. 这是缺省值,它允许overcommit,但过于明目张胆的overcommit会被拒绝,比如malloc一次性申请的内存大小就超过了系统总内存。
Heuristic的意思是“试探式的”,内核利用某种算法猜测你的内存申请是否合理,它认为不合理就会拒绝overcommit。
overcommit_memory == 1,会从buffer中释放较多物理内存,oom-kill也会继续起作用;
允许overcommit,对内存申请来者不拒。
overcommit_memory == 2,物理内存使用完后,打开任意一个程序均显示内存不足;
禁止overcommit。CommitLimit 就是overcommit的阈值,申请的内存总数超过CommitLimit的话就算是overcommit。
也就是说,如果overcommit_memory==2时,内存耗尽时,oom-kill是不会起作用的,系统不会再打开其他程序了,只有等待正在运行的进程释放内存。
复制代码
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
long free, allowed, reserve;
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-(s64)vm_committed_as_batch * num_online_cpus(),
"memory commitment underflow");
vm_acct_memory(pages);
/*
* Sometimes we want to use more memory than we have
*/
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)-----------------------------------OVERCOMMIT_ALWAYS不会对内存申请做限制。
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {----------------------------------OVERCOMMIT_GUESS情况下对内存申请处理。
free = global_page_state(NR_FREE_PAGES);
free += global_page_state(NR_FILE_PAGES);
/*
* shmem pages shouldn't be counted as free in this
* case, they can't be purged, only swapped out, and
* that won't affect the overall amount of available
* memory in the system.
*/
free -= global_page_state(NR_SHMEM);
free += get_nr_swap_pages();
/*
* Any slabs which are created with the
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (free <= totalreserve_pages)
goto error;
else
free -= totalreserve_pages;
/*
* Reserve some for root
*/
if (!cap_sys_admin)
free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
goto error;
}
allowed = vm_commit_limit();
/*
* Reserve some for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
/*
* Don't let a single process grow so big a user can't recover
*/
if (mm) {
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);
return -ENOMEM;
}
复制代码
9.2 /proc/sys/vm/overcommit_kbytes、/proc/sys/vm/overcommit_ratio
在overcommit_memory被设置为OVERCOMMIT_GUESS 和OVERCOMMIT_NEVER的情况下,计算Overcommit的允许量。
复制代码
/*
 * Commit limit used for OVERCOMMIT_GUESS/OVERCOMMIT_NEVER (reported as
 * CommitLimit in /proc/meminfo): either the absolute overcommit_kbytes
 * value or a percentage of non-hugepage RAM, plus total swap space.
 */
unsigned long vm_commit_limit(void)
{
unsigned long limit;
if (sysctl_overcommit_kbytes) {
/* absolute limit, converted from KB to pages */
limit = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
} else {
/* percentage of RAM, excluding huge pages */
limit = (totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100;
}
return limit + total_swap_pages;
}
复制代码
/proc/sys/vm/admin_reserve_kbytes、/proc/sys/vm/user_reserve_kbytes
分别为root用户和普通用户保留操作需要的的内存。
参考文档:《Linux vm运行参数之(一):overcommit相关的参数》
/sys/kernel/debug/memblock
/sys/kernel/debug/tracing/events/kmem
/sys/kernel/debug/tracing/events/pagemap
/sys/kernel/debug/tracing/events/skb
/sys/kernel/debug/tracing/events/vmscan
block_dump
10. 文件缓存回写
/proc/sys/vm/dirty_background_bytes
/proc/sys/vm/dirty_background_ratio
/proc/sys/vm/dirty_bytes
/proc/sys/vm/dirty_ratio
/proc/sys/vm/dirty_expire_centisecs
脏数据的超时时间,超过这个时间的脏数据将会马上放入回写队列,单位是百分之一秒,默认值是30秒。
复制代码
/*
 * The longest time for which data is allowed to remain dirty
 * (30 s; tuned via /proc/sys/vm/dirty_expire_centisecs).
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
复制代码
/proc/sys/vm/dirty_writeback_centisecs
回写线程的循环周期,默认5秒。
/*
 * The interval between `kupdate'-style writebacks
 * (5 s; tuned via /proc/sys/vm/dirty_writeback_centisecs).
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
/proc/sys/vm/dirtytime_expire_seconds
/proc/sys/vm/drop_caches
drop_caches会触发一系列页面回收操作,注意只丢弃clean caches,包括可回收slab对象(包括dentry/inode)和文件缓存页面。
echo 1 > /proc/sys/vm/drop_caches------------------释放pagecache页面
echo 2 > /proc/sys/vm/drop_caches------------------释放可回收slab对象,包括dentry和inode
echo 3 > /proc/sys/vm/drop_caches------------------释放前两者之和
由于drop_caches只释放clean caches,如果想释放更多内存,需要先执行sync进行文件系统同步。这样就会最小化脏页数量,并且创造了更多的可drop的clean caches。
操作drop_caches可能会造成性能问题,因为被丢弃的内容,可能会被立即需要,从而产生大量的I/O和CPU负荷。
联系方式:arnoldlu@qq.com