日期 | 内核版本 | 架构 | 作者 | GitHub | CSDN |
---|---|---|---|---|---|
2017-07-04 | Linux-4.12 | X86 | lwhuq | LinuxMemoryStudy | Linux内存管理 |
在NUMA多CPU架构下,每个CPU都挂载有自己的本地内存,CPU之间通过总线连接。每个CPU访问本地内存的速度都比访问远程内存快。Linux把每个CPU的本地内存资源用一个结点(node)表示。
1 pg_data_t结构
pg_data_t的定义在include/linux/mmzone.h#L601
/*
 * Per-node memory descriptor. The surrounding article describes one
 * instance as representing the local memory of one NUMA node.
 */
typedef struct pglist_data {
// Array holding the zone structure of every memory zone in this node
struct zone node_zones[MAX_NR_ZONES];
// Lists of fallback nodes and their zones, used to allocate from another
// node when the current node has no free space
struct zonelist node_zonelists[MAX_ZONELISTS];
// Number of zones
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
// Points to the struct page of the node's first page frame; that struct
// page lives somewhere inside the global mem_map
struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
struct page_ext *node_page_ext;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
// Boot-time memory allocator data
struct bootmem_data *bdata;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Must be held any time you expect node_start_pfn, node_present_pages
 * or node_spanned_pages stay constant. Holding this will also
 * guarantee that any pfn_valid() stays that way.
 *
 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
 *
 * Nests above zone->lock and zone->span_seqlock
 */
spinlock_t node_size_lock;
#endif
// First page frame number of the node
unsigned long node_start_pfn;
// Total number of page frames in the node (holes excluded)
unsigned long node_present_pages; /* total number of physical pages */
// Total number of page frames spanned by the node (holes included)
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
// Node id
int node_id;
// Wait queue of the swap daemon
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
// The swap daemon (kswapd) task of this node
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
int kswapd_order;
enum zone_type kswapd_classzone_idx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_classzone_idx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
#endif
#ifdef CONFIG_NUMA_BALANCING
/* Lock serializing the migrate rate limiting window */
spinlock_t numabalancing_migrate_lock;
/* Rate limiting time interval */
unsigned long numabalancing_migrate_next_window;
/* Number of pages migrated during the rate limiting time interval */
unsigned long numabalancing_migrate_nr_pages;
#endif
/*
 * This is a per-node reserve of pages that are not available
 * to userspace allocations.
 */
unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
/*
 * zone reclaim becomes active if more unmapped pages exist.
 */
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * If memory initialisation on large machines is deferred then this
 * is the first PFN that needs to be initialised.
 */
unsigned long first_deferred_pfn;
unsigned long static_init_size;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
spinlock_t split_queue_lock;
struct list_head split_queue;
unsigned long split_queue_len;
#endif
/* Fields commonly accessed by the page reclaim scanner */
struct lruvec lruvec;
/*
 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
 * this node's LRU. Maintained by the pageout code.
 */
unsigned int inactive_ratio;
unsigned long flags;
ZONE_PADDING(_pad2_)
/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
1.1 结点的内存域
结点管理的内存再细分成内存域。
typedef struct pglist_data {
//一个结构数组,包含了结点中各内存域的数据结构zone
struct zone node_zones[MAX_NR_ZONES];
//指定了备用结点及其内存域的列表,以便在当前结点没有可用空间时,在备用结点分配内存
struct zonelist node_zonelists[MAX_ZONELISTS];
//内存域的个数
int nr_zones;
}
- node_zones[MAX_NR_ZONES]管理着本地内存的最多MAX_NR_ZONES个内存域
- node_zonelists[MAX_ZONELISTS]指定了备用结点及内存域的列表。可以想象这些备用结点及内存域都是远程内存
- nr_zones结点内存域的个数
1.2 结点的内存页
/* Excerpt of pg_data_t: only the fields that describe the node's page frames. */
typedef struct pglist_data {
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map; // Points to the struct page of the node's first page frame; that struct page lives somewhere inside the global mem_map
#ifdef CONFIG_PAGE_EXTENSION
struct page_ext *node_page_ext;
#endif
#endif
// First page frame number of the node
unsigned long node_start_pfn;
// Total number of page frames in the node (holes excluded)
unsigned long node_present_pages; /* total number of physical pages */
// Total number of page frames spanned by the node (holes included)
unsigned long node_spanned_pages; /* total size of physical page range, including holes */
} pg_data_t;
在每个结点的结构pg_data_t内有一个指向页结构page的指针node_mem_map。pg_data_t->node_mem_map指向本结点管理的第一个物理页框所对应的page结构。
/* Excerpt of pg_data_t: only the node_mem_map field is shown. */
typedef struct pglist_data {
// Points to the struct page of the node's first page frame; that struct
// page lives somewhere inside the global mem_map
struct page *node_mem_map;
}
pg_data_t->node_mem_map的初始化在alloc_node_mem_map中完成,定义在
mm/page_alloc.c#L6096
/*
 * Abridged excerpt: sets pgdat->node_mem_map if it has not been set yet.
 * NOTE(review): the declarations of 'start' and 'offset' are not shown in
 * this excerpt; see the full function in mm/page_alloc.c.
 */
static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
/*
 * Mask off the low bits of the node's first pfn
 * (assumes MAX_ORDER_NR_PAGES is a power of two - TODO confirm).
 */
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
/* Number of pfns between the masked start and the node's real first pfn */
offset = pgdat->node_start_pfn - start;
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
unsigned long size, end;
struct page *map;
/*
 * The zone's endpoints aren't required to be MAX_ORDER
 * aligned but the node_mem_map endpoints must be in order
 * for the buddy allocator to function correctly.
 */
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
/* One struct page for every pfn in [start, end) */
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
/* Fall back to the memblock allocator when alloc_remap() gave nothing */
if (!map)
map = memblock_virt_alloc_node_nopanic(size,
pgdat->node_id);
/*
 * map[0] corresponds to pfn 'start', so step 'offset' entries forward
 * to reach the struct page of the node's first pfn.
 */
pgdat->node_mem_map = map + offset;
}
}
1.3 交换守护进程
/* Excerpt of pg_data_t: only the fields related to the swap daemon. */
typedef struct pglist_data {
// Wait queue of the swap daemon
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
// The swap daemon (kswapd) task of this node
struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */
int kswapd_order;
enum zone_type kswapd_classzone_idx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
} pg_data_t;
2 结点状态
当系统中有超过一个结点时,内核会维护一组位图node_states用以提供各个结点的状态信息,结点状态的枚举定义在 include/linux/nodemask.h#L381
enum node_states {
N_POSSIBLE, /* The node could become online at some point */
N_ONLINE, /* The node is online */
N_NORMAL_MEMORY, /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
N_HIGH_MEMORY, /* The node has regular or high memory */
#else
N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
#ifdef CONFIG_MOVABLE_NODE
N_MEMORY, /* The node has memory(regular, high, movable) */
#else
N_MEMORY = N_HIGH_MEMORY,
#endif
N_CPU, /* The node has one or more cpus */
NR_NODE_STATES
};
结点位图的实例node_states定义在
mm/page_alloc.c#L122。当某个node处于某种状态时,该状态对应的位图中这个node的位就会被置位。
/*
 * One node mask per entry of enum node_states, indexed by the state.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
/* The possible mask starts as NODE_MASK_ALL */
[N_POSSIBLE] = NODE_MASK_ALL,
/* The online mask starts with only bit 0 (node 0) set */
[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
/* Without CONFIG_NUMA the remaining masks also start with only bit 0 set */
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
[N_MEMORY] = { { [0] = 1UL } },
#endif
[N_CPU] = { { [0] = 1UL } },
#endif /* !CONFIG_NUMA */
};
EXPORT_SYMBOL(node_states);
- N_POSSIBLE, N_ONLINE和N_CPU用于CPU和内存的热插拔
- N_NORMAL_MEMORY, N_HIGH_MEMORY用于普通内存管理
- N_MEMORY表示有物理内存的结点
/*
 * Test whether @node is set in the node mask of @state.
 * Returns the result of node_isset() on node_states[state].
 */
static inline int node_state(int node, enum node_states state)
{
return node_isset(node, node_states[state]);
}
/* Mark @node in the node mask of @state via __node_set(). */
static inline void node_set_state(int node, enum node_states state)
{
__node_set(node, &node_states[state]);
}
/* Remove @node from the node mask of @state via __node_clear(). */
static inline void node_clear_state(int node, enum node_states state)
{
__node_clear(node, &node_states[state]);
}
/*
 * Returns nodes_weight() of the node mask of @state
 * (presumably the number of nodes currently in that state).
 */
static inline int num_node_state(enum node_states state)
{
return nodes_weight(node_states[state]);
}
3 查找内存结点
内存结点的实例为node_data[MAX_NUMNODES],定义在 arch/x86/mm/numa.c#L26
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
内存结点最大数目由MAX_NUMNODES决定,定义在include/linux/numa.h#L11
/*
 * NODES_SHIFT is taken from CONFIG_NODES_SHIFT when that option is set
 * and falls back to 0 otherwise.
 */
#ifdef CONFIG_NODES_SHIFT
#define NODES_SHIFT CONFIG_NODES_SHIFT
#else
#define NODES_SHIFT 0
#endif

/*
 * Maximum number of nodes. This must sit outside the conditional above so
 * that it is defined whether or not CONFIG_NODES_SHIFT is set.
 */
#define MAX_NUMNODES (1 << NODES_SHIFT)
宏NODE_DATA(nid)可以根据node id找到对应结点的pg_data_t结构实例(即node_data[nid]),定义在
arch/x86/include/asm/mmzone_32.h#L13和
arch/x86/include/asm/mmzone_64.h#L14
/* Look up the node_data[] entry for node id @nid. */
#define NODE_DATA(nid) (node_data[nid])
3.1 查找node id
宏first_online_node用于得到第一个online的node,定义在 include/linux/nodemask.h#L430
#define first_online_node first_node(node_states[N_ONLINE])
宏 first_memory_node得到第一个有memory的node,定义在
include/linux/nodemask.h#L431
/* first_node() applied to the N_MEMORY node mask. */
#define first_memory_node first_node(node_states[N_MEMORY])
宏next_node(n, src)得到结点位图src中位于n之后的下一个被置位的node id(若不存在则得到MAX_NUMNODES),定义在
include/linux/nodemask.h#L258
/* Convenience wrapper: passes the address of mask @src to __next_node(). */
#define next_node(n, src) __next_node((n), &(src))
/*
 * Search @srcp->bits with find_next_bit(), starting at bit n+1 and limited
 * to MAX_NUMNODES bits. The result is capped at MAX_NUMNODES by min_t().
 */
static inline int __next_node(int n, const nodemask_t *srcp)
{
return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}
函数next_online_node得到下一个online的node,定义在
include/linux/nodemask.h#L432
/* Returns next_node() of @nid within the N_ONLINE node mask. */
static inline int next_online_node(int nid)
{
return next_node(nid, node_states[N_ONLINE]);
}
函数next_memory_node得到下一个有memory的node,定义在
include/linux/nodemask.h#L436
/* Returns next_node() of @nid within the N_MEMORY node mask. */
static inline int next_memory_node(int nid)
{
return next_node(nid, node_states[N_MEMORY]);
}
3.2 node id的遍历
宏for_each_node_state(__node, __state)用来遍历处于特定状态的所有结点,定义在 include/linux/nodemask.h#L427
#define for_each_node_state(__node, __state) \
for_each_node_mask((__node), node_states[__state])
宏for_each_node(node)用来迭代处于N_POSSIBLE状态的所有结点,定义在
include/linux/nodemask.h#L507
/* Iterate @node with for_each_node_state() over the N_POSSIBLE state. */
#define for_each_node(node) for_each_node_state(node, N_POSSIBLE)
宏for_each_online_node(node)用来遍历处于N_ONLINE状态的所有结点,定义在
include/linux/nodemask.h#L508
/* Iterate @node with for_each_node_state() over the N_ONLINE state. */
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
3.3 查找pg_data_t结构
函数first_online_pgdat得到第一个online的pg_data_t结构的指针,定义在 mm/mmzone.c#L12
struct pglist_data *first_online_pgdat(void)
{
return NODE_DATA(first_online_node);
}
函数next_online_pgdat(pgdat)得到下一个online的pg_data结构的指针,定义在
mm/mmzone.c#L17
/*
 * Return the NODE_DATA() entry of the online node that follows
 * @pgdat->node_id, or NULL when next_online_node() reports MAX_NUMNODES.
 */
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
int nid = next_online_node(pgdat->node_id);
/* MAX_NUMNODES signals that no further node was found */
if (nid == MAX_NUMNODES)
return NULL;
return NODE_DATA(nid);
}
3.4 pg_data_t结构的遍历
宏for_each_online_pgdat(pgdat)用来遍历所有online的pg_data_t结构指针,定义在 include/linux/mmzone.h#L908
#define for_each_online_pgdat(pgdat) \
for (pgdat = first_online_pgdat(); \
pgdat; \
pgdat = next_online_pgdat(pgdat))