Linux tail queues: the DPDK EAL abstraction layer (Part 1)

overview

The EAL (Environment Abstraction Layer) is an abstraction over the underlying resources. It uses a run-to-completion model: all resources are allocated before execution starts. According to the official documentation, it provides the following services:

DPDK loading and launching

Support for multi-process and multi-thread execution types

Core affinity/assignment procedures

System memory allocation/de-allocation

Atomic/lock operations

Time reference

PCI bus access

Trace and debug functions

CPU feature identification

Interrupt handling

Alarm operations

Memory management (malloc)

Compared with run-to-completion, the pipeline model works like an assembly line: I/O-heavy work can be placed on one thread and CPU-heavy work on another, handled separately.

Run-to-completion instead keeps the whole job on a single thread, and scales horizontally across threads.
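
To make the run-to-completion model concrete, here is a minimal sketch of the usual application skeleton (lcore_worker is hypothetical; a real worker would poll its own rx queues in the loop):

```c
#include <stdio.h>
#include <rte_eal.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_debug.h>

/* run-to-completion: every lcore runs the whole rx->process->tx job itself */
static int
lcore_worker(void *arg)
{
	(void)arg;
	printf("worker running on lcore %u\n", rte_lcore_id());
	/* a real application would poll its queues here until told to stop */
	return 0;
}

int
main(int argc, char **argv)
{
	if (rte_eal_init(argc, argv) < 0)
		rte_panic("Cannot init EAL\n");

	/* launch the same function on every slave lcore, then wait */
	rte_eal_mp_remote_launch(lcore_worker, NULL, SKIP_MASTER);
	rte_eal_mp_wait_lcore();
	return 0;
}
```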

CPU info initialization


```c
int rte_eal_cpu_init(void) {
	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		/* in 1:1 mapping, record related cpu detected state */
		/* i.e. whether /sys/devices/system/cpu/cpu{$lcore_id}/topology/core_id exists */
		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);

		/* By default, lcore 1:1 map to cpu id */
		CPU_SET(lcore_id, &lcore_config[lcore_id].cpuset);

		/* By default, each detected core is enabled */
		config->lcore_role[lcore_id] = ROLE_RTE;

		/* the value of /sys/devices/system/cpu/cpu{$lcore_id}/topology/core_id */
		lcore_config[lcore_id].core_id = eal_cpu_core_id(lcore_id);

		/* walk /sys/devices/system/node/node{$nodeid}/ and check which node
		 * directory contains cpu{$lcore_id} */
		lcore_config[lcore_id].socket_id = eal_cpu_socket_id(lcore_id);
	}
}
```
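
The sysfs probing behind these helpers is tiny. A rough sketch of what eal_cpu_detected boils down to, using the path from the comment above (assumption: presence of the core_id file is what marks the CPU as detected):

```c
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

#define SYS_CPU_DIR  "/sys/devices/system/cpu/cpu%u"
#define CORE_ID_FILE "topology/core_id"

/* a CPU is treated as present if its core_id file exists in sysfs */
static int
cpu_detected(unsigned lcore_id)
{
	char path[PATH_MAX];

	snprintf(path, sizeof(path), SYS_CPU_DIR "/" CORE_ID_FILE, lcore_id);
	return access(path, F_OK) == 0;
}
```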

Hugepage initialization


```c
int eal_hugepage_info_init(void)
{
	dir = opendir("/sys/kernel/mm/hugepages");
	for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
		struct hugepage_info *hpi;

		if (strncmp(dirent->d_name, dirent_start_text,
				dirent_start_len) != 0)
			continue;

		if (num_sizes >= MAX_HUGEPAGE_SIZES) /* currently up to 3 hugepage sizes are supported */
			break;

		hpi = &internal_config.hugepage_info[num_sizes];
		/* derive the hugepage size from the directory name */
		hpi->hugepage_sz = rte_str_to_size(&dirent->d_name[dirent_start_len]);

		/* find the hugetlbfs mount point via /proc/mounts, e.g.
		 *   nodev /mnt/huge hugetlbfs rw,seclabel,relatime 0 0
		 * if the mount options specify a pagesize, use that; otherwise fall
		 * back to Hugepagesize from /proc/meminfo as the default size */
		hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz);

		/* first, check if we have a mountpoint */
		if (hpi->hugedir == NULL) {
			uint32_t num_pages;

			/* free_hugepages minus resv_hugepages under
			 * /sys/kernel/mm/hugepages/hugepages-2048kB/; if pages of this
			 * size remain unused but nothing is mounted, report it */
			num_pages = get_num_hugepages(dirent->d_name);
			if (num_pages > 0)
				RTE_LOG(NOTICE, EAL,
					"%" PRIu32 " hugepages of size "
					"%" PRIu64 " reserved, but no mounted "
					"hugetlbfs found for that size\n",
					num_pages, hpi->hugepage_sz);
			continue;
		}

		/* try to obtain a writelock on the mounted directory */
		hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);

		/* if blocking lock failed */
		if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
			RTE_LOG(CRIT, EAL,
				"Failed to lock hugepage directory!\n");
			break;
		}

		/* clear out the hugepages dir from unused pages */
		/* look for "*map_*" files under the mount point and try to take a
		 * write lock on each; if the lock succeeds the page is unused, so
		 * unlock and delete it */
		if (clear_hugedir(hpi->hugedir) == -1)
			break;

		/* for now, put all pages into socket 0,
		 * later they will be sorted */
		/* free_hugepages minus resv_hugepages */
		hpi->num_pages[0] = get_num_hugepages(dirent->d_name);

		num_sizes++;
	}
	internal_config.num_hugepage_sizes = num_sizes;

	/* sort the internal_config.hugepage_info entries by page size */
	qsort(&internal_config.hugepage_info[0], num_sizes,
	      sizeof(internal_config.hugepage_info[0]), compare_hpi);

	/* now we have all info, check we have at least one valid size */
	for (i = 0; i < num_sizes; i++)
		if (internal_config.hugepage_info[i].hugedir != NULL &&
		    internal_config.hugepage_info[i].num_pages[0] > 0)
			return 0;

	/* no valid hugepage mounts available, return error */
	return -1;
}
```
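
For reference, here is a simplified sketch of how a mount point like the one in the comment above can be matched from /proc/mounts (the real get_hugepage_dir also honors a pagesize= mount option and compares it against the requested size; this version only matches the filesystem type):

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* return a malloc'd mount point of the first hugetlbfs entry, or NULL */
static char *
find_hugetlbfs_mount(void)
{
	char dev[64], mountpt[256], fstype[64], line[1024];
	FILE *f = fopen("/proc/mounts", "r");

	if (f == NULL)
		return NULL;
	while (fgets(line, sizeof(line), f) != NULL) {
		/* each line: device mountpoint fstype options dump pass */
		if (sscanf(line, "%63s %255s %63s", dev, mountpt, fstype) == 3 &&
		    strcmp(fstype, "hugetlbfs") == 0) {
			fclose(f);
			return strdup(mountpt);
		}
	}
	fclose(f);
	return NULL;
}
```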

Multi-process and config file initialization

Since DPDK initializes and abstracts all the underlying details itself, it uses a primary/secondary model, with a config file in shared memory acting as the single point for resource allocation and synchronization.

The --proc-type option selects the type of the current process; with auto, the first process to take the write lock on the config file becomes the primary.

--no-shconf makes no modifications to the config file at all.
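
A hedged sketch of how that auto-detection can work (the real eal_proc_type_detect takes fcntl() record locks on the runtime config file; the flags and return convention below are illustrative):

```c
#include <fcntl.h>
#include <unistd.h>

/* whoever can take the write lock on .rte_config becomes the primary;
 * the fd is deliberately kept open so the lock is held for the whole
 * process lifetime */
static int
proc_type_detect(const char *config_path)
{
	struct flock wr_lock = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 0,		/* 0 = lock the whole file */
	};
	int fd = open(config_path, O_RDWR | O_CREAT, 0660);

	if (fd < 0)
		return -1;
	if (fcntl(fd, F_SETLK, &wr_lock) == 0)
		return 0;	/* got the write lock: primary */
	return 1;		/* lock already held: secondary */
}
```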


```c
static void
rte_config_init(void)
{
	rte_config.process_type = internal_config.process_type;

	switch (rte_config.process_type){
	case RTE_PROC_PRIMARY: /* the primary holds the write lock on /var/run/.rte_config */
		rte_eal_config_create(); /* truncate and (re)create .rte_config */
		break;
	case RTE_PROC_SECONDARY:
		rte_eal_config_attach();
		rte_eal_mcfg_wait_complete(rte_config.mem_config); /* there is a time window between the two processes; they must synchronize */
		rte_eal_config_reattach(); /* mmap .rte_config at the same virtual address as the primary */
		break;
	case RTE_PROC_AUTO:
	case RTE_PROC_INVALID:
		rte_panic("Invalid process type\n");
	}
}
```

The primary process

rte_eal_config_create creates a new config file, takes the write lock, mmaps the file, and stores the resulting virtual address in the config itself so that secondaries can later mmap at the same address and share it.

Consequently, if two different applications run as primary and secondary, the virtual address the primary mmapped may well already be in use inside the secondary, in which case the secondary's later mmap at that address fails to attach.


```c
static void
rte_eal_config_create(void)
{
	void *rte_mem_cfg_addr;
	int retval;

	const char *pathname = eal_runtime_config_path();

	if (internal_config.no_shconf)
		return;

	/* map the config before hugepage address so that we don't waste a page */
	if (internal_config.base_virtaddr != 0)
		rte_mem_cfg_addr = (void *)
			RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
			sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE));
	else
		rte_mem_cfg_addr = NULL;

	if (mem_cfg_fd < 0){
		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
	}

	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);

	rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);

	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
	rte_config.mem_config = (struct rte_mem_config *) rte_mem_cfg_addr;

	/* store address of the config in the config itself so that secondary
	 * processes could later map the config into this exact location */
	rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; /* save the virtual address so other processes can mmap at it */
}
```

The secondary process

Open and mmap the config file. Note that this requires the primary process to start first, even when both processes specify the auto type.


```c
/* attach to an existing shared memory config */
static void
rte_eal_config_attach(void)
{
	struct rte_mem_config *mem_config;

	const char *pathname = eal_runtime_config_path();

	if (internal_config.no_shconf)
		return;

	if (mem_cfg_fd < 0){
		mem_cfg_fd = open(pathname, O_RDWR);
		if (mem_cfg_fd < 0)
			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
	}

	/* map it as read-only first */
	mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
			PROT_READ, MAP_SHARED, mem_cfg_fd, 0);

	rte_config.mem_config = mem_config;
}
```

rte_eal_mcfg_wait_complete exists because there is a window between the primary's and the secondary's initialization: the secondary must wait for the primary to call rte_eal_mcfg_complete, which marks initialization as finished.

Finally, the secondary calls rte_eal_config_reattach to mmap the config file at the virtual address recorded by the primary.
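
The synchronization itself is just a flag in the shared config: the primary stamps a magic value once initialization is done, and secondaries spin until they see it. A minimal sketch of the idea (rte_mem_config does carry a magic field; treat the snippet as illustrative rather than the exact source):

```c
/* primary side: mark the shared config as fully initialized */
static void
mcfg_complete(struct rte_mem_config *mcfg)
{
	mcfg->magic = RTE_MAGIC;
}

/* secondary side: busy-wait until the primary is done */
static void
mcfg_wait_complete(struct rte_mem_config *mcfg)
{
	while (mcfg->magic != RTE_MAGIC)
		rte_pause();	/* cpu relax hint while spinning */
}
```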

PCI initialization

Walk /sys/bus/pci/devices.

The PCI address format is domain:bus:devid.function.

Follow the driver symlink to check whether the bound driver is one of the supported ones: igb_uio, uio_pci_generic or vfio-pci.

An rte_pci_device structure is initialized from the PCI information in each device directory, then inserted into the pci_device_list tail queue sorted by PCI address.

The resource file is also read: if a region's flags mark it as physical memory space, the corresponding mem_resource entry of rte_pci_device is initialized. The current code scans only the first 6 lines of that file.

rte_eal_pci_init->rte_eal_pci_scan->pci_scan_one
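
A small sketch of parsing the domain:bus:devid.function format out of a sysfs directory name (illustrative only; DPDK has its own parsing helpers for this):

```c
#include <stdint.h>
#include <stdio.h>

struct pci_addr {
	uint16_t domain;
	uint8_t  bus;
	uint8_t  devid;
	uint8_t  function;
};

/* parse a directory name like "0000:02:00.1" from /sys/bus/pci/devices */
static int
parse_pci_addr(const char *name, struct pci_addr *a)
{
	unsigned dom, bus, dev, func;

	if (sscanf(name, "%x:%x:%x.%x", &dom, &bus, &dev, &func) != 4)
		return -1;
	a->domain = dom;
	a->bus = bus;
	a->devid = dev;
	a->function = func;
	return 0;
}
```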

Memory initialization

After initialization the primary and secondary processes share the same memory abstraction: the memory they use is mapped at identical virtual addresses.


```c
/* init memory subsystem */
int rte_eal_memory_init(void)
{
	RTE_LOG(INFO, EAL, "Setting up physically contiguous memory...\n");

	const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
			rte_eal_hugepage_init() :  /* group hugepages into memsegs by address contiguity and socket */
			rte_eal_hugepage_attach(); /* mmap the hugepages described by .rte_hugepage_info at the contiguous memseg addresses */

	if (retval < 0)
		return -1;

	if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
		return -1;

	return 0;
}
```


```c
/*
 * Prepare physical memory mapping: fill configuration structure with
 * these infos, return 0 on success.
 * 1. map N huge pages in separate files in hugetlbfs
 * 2. find associated physical addr
 * 3. find associated NUMA socket ID
 * 4. sort all huge pages by physical address
 * 5. remap these N huge pages in the correct order
 * 6. unmap the first mapping
 * 7. fill memsegs in configuration with contiguous zones
 */
int rte_eal_hugepage_init(void){
	struct rte_mem_config *mcfg;
	struct hugepage_file *hugepage, *tmp_hp = NULL;
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	uint64_t memory[RTE_MAX_NUMA_NODES];
	unsigned hp_offset;
	int i, j, new_memseg;
	int nr_hugefiles, nr_hugepages = 0;
	void *addr;

	test_proc_pagemap_readable();

	memset(used_hp, 0, sizeof(used_hp));

	/* get pointer to global configuration */
	mcfg = rte_eal_get_configuration()->mem_config;

	/* calculate total number of hugepages available. at this point we haven't
	 * yet started sorting them so they all are on socket 0 */
	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
		/* meanwhile, also initialize used_hp hugepage sizes in used_hp */
		used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;

		nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
	}

	/*
	 * allocate a memory area for hugepage table.
	 * this isn't shared memory yet. due to the fact that we need some
	 * processing done on these pages, shared memory will be created
	 * at a later stage.
	 */
	tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));

	hp_offset = 0; /* where we start the current page size entries */

	/* map all hugepages and sort them */
	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
		struct hugepage_info *hpi;

		/*
		 * we don't yet mark hugepages as used at this stage, so
		 * we just map all hugepages available to the system
		 * all hugepages are still located on socket 0
		 */
		hpi = &internal_config.hugepage_info[i];

		if (hpi->num_pages[0] == 0)
			continue;

		/* map all hugepages available */
		/* first pass: create and mmap every hugepage file, store each
		 * virtual address in orig_va, take a shared flock */
		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		/* find physical addresses and sockets for each hugepage */
		if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0){ /* orig_va -> physaddr */
			RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){ /* find the NUMA node each hugepage lives on */
			RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		if (sort_by_physaddr(&tmp_hp[hp_offset], hpi) < 0) /* sort by physical address */
			goto fail;

		/* remap all hugepages */
		/* second pass: mmap physically contiguous hugepages at contiguous
		 * virtual addresses and set final_va */
		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
					(unsigned)(hpi->hugepage_sz / 0x100000));
			goto fail;
		}

		/* unmap original mappings */
		if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0) /* unmap orig_va */
			goto fail;

		/* we have processed a num of hugepages of this size, so inc offset */
		hp_offset += hpi->num_pages[0];
	}

	nr_hugefiles = nr_hugepages;

	/* clean out the numbers of pages */
	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
			internal_config.hugepage_info[i].num_pages[j] = 0;

	/* get hugepages for each socket */
	for (i = 0; i < nr_hugefiles; i++) {
		int socket = tmp_hp[i].socket_id;

		/* find a hugepage info with right size and increment num_pages */
		const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
				(int)internal_config.num_hugepage_sizes);
		for (j = 0; j < nb_hpsizes; j++) {
			if (tmp_hp[i].size ==
					internal_config.hugepage_info[j].hugepage_sz) {
				/* count how many pages of each size sit on each NUMA socket */
				internal_config.hugepage_info[j].num_pages[socket]++;
			}
		}
	}

	/* make a copy of socket_mem, needed for number of pages calculation */
	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
		memory[i] = internal_config.socket_mem[i];

	/* calculate final number of pages */
	/* spread the requested memory across sockets, then hand the remainder to
	 * the sockets that still have hugepages; returns the total page count
	 * across all hugepage sizes and sockets */
	nr_hugepages = calc_num_pages_per_socket(memory,
			internal_config.hugepage_info, used_hp,
			internal_config.num_hugepage_sizes);

	/* create shared memory */
	hugepage = create_shared_memory(eal_hugepage_info_path(), /* mmap /var/run/.rte_hugepage_info */
			nr_hugefiles * sizeof(struct hugepage_file));
	memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));

	/*
	 * unmap pages that we won't need (looks at used_hp).
	 * also, sets final_va to NULL on pages that were unmapped.
	 */
	if (unmap_unneeded_hugepages(tmp_hp, used_hp, /* unmap memory we do not need */
			internal_config.num_hugepage_sizes) < 0) {
		RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
		goto fail;
	}

	/*
	 * copy stuff from malloc'd hugepage* to the actual shared memory.
	 * this procedure only copies those hugepages that have final_va
	 * not NULL. has overflow protection.
	 */
	if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, /* copy the in-use hugepage info from the heap into shared memory */
			tmp_hp, nr_hugefiles) < 0) {
		RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
		goto fail;
	}

	/* free the temporary hugepage table */
	free(tmp_hp);
	tmp_hp = NULL;

	/* find earliest free memseg - this is needed because in case of IVSHMEM,
	 * segments might have already been initialized */
	for (j = 0; j < RTE_MAX_MEMSEG; j++)
		if (mcfg->memseg[j].addr == NULL) {
			/* move to previous segment and exit loop */
			j--;
			break;
		}

	/* split the hugepages into segments by contiguous virtual address,
	 * contiguous physical address and same socket, and record them in
	 * mcfg->memseg */
	for (i = 0; i < nr_hugefiles; i++) {
		new_memseg = 0;

		/* if this is a new section, create a new memseg */
		if (i == 0)
			new_memseg = 1;
		else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
			new_memseg = 1;
		else if (hugepage[i].size != hugepage[i-1].size)
			new_memseg = 1;
		else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
		    hugepage[i].size)
			new_memseg = 1;
		else if (((unsigned long)hugepage[i].final_va -
		    (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
			new_memseg = 1;

		if (new_memseg) {
			j += 1;
			if (j == RTE_MAX_MEMSEG)
				break;

			mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
			mcfg->memseg[j].addr = hugepage[i].final_va;
			mcfg->memseg[j].len = hugepage[i].size;
			mcfg->memseg[j].socket_id = hugepage[i].socket_id;
			mcfg->memseg[j].hugepage_sz = hugepage[i].size;
		}
		/* continuation of previous memseg */
		else {
			mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
		}
		hugepage[i].memseg_id = j;
	}

	return 0;

fail:
	return -1;
}
```


```c
/*
 * This creates the memory mappings in the secondary process to match that of
 * the server process. It goes through each memory segment in the DPDK runtime
 * configuration and finds the hugepages which form that segment, mapping them
 * in order to form a contiguous block in the virtual memory space
 */
int rte_eal_hugepage_attach(void) {
	const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	const struct hugepage_file *hp = NULL;
	unsigned num_hp = 0;
	unsigned i, s = 0; /* s used to track the segment number */
	off_t size;
	int fd, fd_zero = -1, fd_hugepage = -1;

	if (aslr_enabled() > 0) {
		RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
				"(ASLR) is enabled in the kernel.\n");
		RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory "
				"into secondary processes\n");
	}

	test_proc_pagemap_readable();

	fd_zero = open("/dev/zero", O_RDONLY);
	if (fd_zero < 0) {
		RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
		goto error;
	}
	fd_hugepage = open("/var/run/.rte_hugepage_info", O_RDONLY);

	/* map all segments into memory to make sure we get the addrs */
	for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
		void *base_addr;

		/*
		 * the first memory segment with len==0 is the one that
		 * follows the last valid segment.
		 */
		if (mcfg->memseg[s].len == 0)
			break;

		/*
		 * fd_zero is mmapped to get a contiguous block of virtual
		 * addresses of the appropriate memseg size.
		 * use mmap to get identical addresses as the primary process.
		 */
		/* first mmap /dev/zero at the memseg address to reserve a
		 * contiguous block of virtual addresses */
		base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
				 PROT_READ, MAP_PRIVATE, fd_zero, 0);
		if (base_addr == MAP_FAILED ||
		    base_addr != mcfg->memseg[s].addr) {
			RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
				"in /dev/zero to requested address [%p]: '%s'\n",
				(unsigned long long)mcfg->memseg[s].len,
				mcfg->memseg[s].addr, strerror(errno));
			if (aslr_enabled() > 0) {
				RTE_LOG(ERR, EAL, "It is recommended to "
					"disable ASLR in the kernel "
					"and retry running both primary "
					"and secondary processes\n");
			}
			goto error;
		}
	}

	size = getFileSize(fd_hugepage);
	hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); /* mmap the hugepage info table */
	num_hp = size / sizeof(struct hugepage_file);
	RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);

	s = 0;
	while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
		void *addr, *base_addr;
		uintptr_t offset = 0;
		size_t mapping_size;

		/*
		 * free previously mapped memory so we can map the
		 * hugepages into the space
		 */
		base_addr = mcfg->memseg[s].addr;
		munmap(base_addr, mcfg->memseg[s].len); /* drop the /dev/zero reservation, then try to mmap the hugepages at that address */

		/* find the hugepages for this segment and map them
		 * we don't need to worry about order, as the server sorted the
		 * entries before it did the second mmap of them */
		for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
			if (hp[i].memseg_id == (int)s){
				fd = open(hp[i].filepath, O_RDWR);
				mapping_size = hp[i].size;
				addr = mmap(RTE_PTR_ADD(base_addr, offset),
						mapping_size, PROT_READ | PROT_WRITE,
						MAP_SHARED, fd, 0);
				close(fd); /* close file both on success and on failure */
				if (addr == MAP_FAILED ||
				    addr != RTE_PTR_ADD(base_addr, offset)) {
					RTE_LOG(ERR, EAL, "Could not mmap %s\n",
						hp[i].filepath);
					goto error;
				}
				offset+=mapping_size;
			}
		}
		RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
				(unsigned long long)mcfg->memseg[s].len);
		s++;
	}

	/* unmap the hugepage config file, since we are done using it */
	munmap((void *)(uintptr_t)hp, size);
	close(fd_zero);
	close(fd_hugepage);
	return 0;

error:
	return -1;
}
```

map_all_hugepages

On the first pass (orig=1), map_all_hugepages walks the hugepage info pointed to by hpi and fills the hugepg_tbl table with hpi->num_pages[0] entries. For each entry it creates a file rtemap_{$file_id} under the matching hugetlbfs mount point, mmaps it, and saves the entry's virtual address in hugepg_tbl[i].orig_va.

On the second pass (orig=0), the physical address of every hugepage has already been obtained and the table is sorted by physical address.

Physically contiguous hugepages are then mmapped at contiguous virtual addresses, and final_va is set.


```c
/*
 * Mmap all hugepages of hugepage table: it first open a file in
 * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
 * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
 * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
 * map contiguous physical blocks in contiguous virtual blocks.
 */
static int map_all_hugepages(struct hugepage_file *hugepg_tbl,
		struct hugepage_info *hpi, int orig)
{
	int fd;
	unsigned i;
	void *virtaddr;
	void *vma_addr = NULL;
	size_t vma_len = 0;

	for (i = 0; i < hpi->num_pages[0]; i++) {
		uint64_t hugepage_sz = hpi->hugepage_sz;

		if (orig) {
			hugepg_tbl[i].file_id = i;
			hugepg_tbl[i].size = hugepage_sz;
			eal_get_hugefile_path(hugepg_tbl[i].filepath, /* hpi->hugedir/rtemap_$file_id */
					sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
					hugepg_tbl[i].file_id);
			hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
		}
		else if (vma_len == 0) {
			unsigned j, num_pages;

			/* reserve a virtual area for next contiguous
			 * physical block: count the number of
			 * contiguous physical pages. */
			for (j = i+1; j < hpi->num_pages[0] ; j++) {
				if (hugepg_tbl[j].physaddr !=
				    hugepg_tbl[j-1].physaddr + hugepage_sz)
					break;
			}
			num_pages = j - i;
			vma_len = num_pages * hugepage_sz;

			/* get the biggest virtual memory area up to
			 * vma_len. If it fails, vma_addr is NULL, so
			 * let the kernel provide the address. */
			/* mmaps /dev/zero to reserve contiguous virtual addresses;
			 * on failure the size is reduced and the attempt retried */
			vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
			if (vma_addr == NULL)
				vma_len = hugepage_sz;
		}

		/* try to create hugepage file */
		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
					strerror(errno));
			return -1;
		}

		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
				MAP_SHARED, fd, 0);

		if (orig) {
			hugepg_tbl[i].orig_va = virtaddr;
			memset(virtaddr, 0, hugepage_sz);
		}
		else {
			hugepg_tbl[i].final_va = virtaddr;
		}

		/* set shared flock on the file. */
		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
			return -1;
		}

		close(fd);

		vma_addr = (char *)vma_addr + hugepage_sz;
		vma_len -= hugepage_sz;
	}
	return 0;
}
```

find_physaddrs

For each hugepage's mmapped virtual address orig_va, find the corresponding physical address and store it in physaddr.

The actual code lives in rte_mem_virt2phy, which mainly reads /proc/self/pagemap.

That file holds one 64-bit value per virtual page, laid out as follows:

* Bits 0-54  page frame number (PFN) if present

* Bits 0-4   swap type if swapped

* Bits 5-54  swap offset if swapped

* Bits 55-60 page shift (page size = 1 << page shift)

* Bit 61     page is file-page or shared-anon

* Bit 62     page swapped

* Bit 63     page present


```c
phys_addr_t
rte_mem_virt2phy(const void *virtaddr) {
	/* standard page size */
	page_size = getpagesize();

	virt_pfn = (unsigned long)virtaddr / page_size;
	offset = sizeof(uint64_t) * virt_pfn;
	lseek(fd, offset, SEEK_SET);
	read(fd, &page, sizeof(uint64_t));

	/*
	 * the pfn (page frame number) are bits 0-54 (see
	 * pagemap.txt in linux Documentation)
	 */
	physaddr = ((page & 0x7fffffffffffffULL) * page_size)
		+ ((unsigned long)virtaddr % page_size);
	close(fd);
	return physaddr;
}
```

find_numasocket

Read /proc/self/numa_maps; each hugepage shows up as a line such as:

7ff494c00000 prefer:1 file=/dev/hugepages/rtemap_1 huge dirty=1 N1=1

and the node number is read from the N<node>=<pages> token.
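
A sketch of extracting the node from such a line (illustrative; the real find_numasocket first matches the file= path against each hugepage's filepath):

```c
#include <stdio.h>
#include <string.h>

/* given one /proc/self/numa_maps line, return the node from the first
 * " N<node>=<pages>" token, or -1 if none is found */
static int
numa_node_of_line(const char *line)
{
	const char *p = strstr(line, " N");
	int node, pages;

	while (p != NULL) {
		if (sscanf(p, " N%d=%d", &node, &pages) == 2)
			return node;
		p = strstr(p + 1, " N");
	}
	return -1;
}
```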

calc_num_pages_per_socket


```c
/*
 * This function is a NUMA-aware equivalent of calc_num_pages.
 * It takes in the list of hugepage sizes and the
 * number of pages thereof, and calculates the best number of
 * pages of each size to fulfill the request for ram
 */
static int
calc_num_pages_per_socket(uint64_t *memory,
		struct hugepage_info *hp_info,
		struct hugepage_info *hp_used,
		unsigned num_hp_info)
{
	int cpu_per_socket[RTE_MAX_NUMA_NODES];

	RTE_LCORE_FOREACH(lcore_id) {
		cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++; /* number of CPUs on each socket */
	}

	total_mem = internal_config.memory;
	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
		/* give each socket a share proportional to its lcore count */
		default_size = (internal_config.memory * cpu_per_socket[socket])
				/ rte_lcore_count();
		/* but no more than the total hugepage memory on that socket */
		default_size = RTE_MIN(default_size, get_socket_mem_size(socket));

		/* Update sizes */
		memory[socket] = default_size;
		total_mem -= default_size;
	}

	/* hand out the remaining memory to sockets that still have room */
	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
		default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
				total_mem);
		memory[socket] += default_size;
		total_mem -= default_size;
	}

	/* the above was an idealized even split; now convert it to whole pages */
	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
		/* skips if the memory on specific socket wasn't requested */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
			hp_used[i].hugedir = hp_info[i].hugedir;
			hp_used[i].num_pages[socket] = RTE_MIN(
					memory[socket] / hp_info[i].hugepage_sz,
					hp_info[i].num_pages[socket]);
			/* if the value taken is less than hp_info[i].num_pages[socket],
			 * pages of this size are left over: they may be given to smaller
			 * sizes, and the surplus gets unmapped later.
			 * if it equals hp_info[i].num_pages[socket], this size cannot
			 * satisfy the request, so move on to the next size */

			cur_mem = hp_used[i].num_pages[socket] *
					hp_used[i].hugepage_sz;

			memory[socket] -= cur_mem;
			total_mem -= cur_mem;

			total_num_pages += hp_used[i].num_pages[socket];

			/* check if we have met all memory requests */
			if (memory[socket] == 0)
				break;

			/* check if we have any more pages left at this size, if so
			 * move on to next size */
			if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
				continue;

			/* At this point we know that there are more pages available that are
			 * bigger than the memory we want, so lets see if we can get enough
			 * from other page sizes.
			 */
			remaining_mem = 0;
			for (j = i+1; j < num_hp_info; j++)
				remaining_mem += hp_info[j].hugepage_sz *
						hp_info[j].num_pages[socket];

			/* is there enough other memory, if not allocate another page and quit */
			if (remaining_mem < memory[socket]){
				cur_mem = RTE_MIN(memory[socket],
						hp_info[i].hugepage_sz);
				memory[socket] -= cur_mem;
				total_mem -= cur_mem;
				hp_used[i].num_pages[socket]++;
				total_num_pages++;
				break; /* we are done with this socket*/
			}
		}

		/* if we didn't satisfy all memory requirements per socket */
		if (memory[socket] > 0) {
			/* to prevent icc errors */
			requested = (unsigned) (internal_config.socket_mem[socket] /
					0x100000);
			available = requested -
					((unsigned) (memory[socket] / 0x100000));
			RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
					"Requested: %uMB, available: %uMB\n", socket,
					requested, available);
			return -1;
		}
	}

	/* if we didn't satisfy total memory requirements */
	if (total_mem > 0) {
		requested = (unsigned) (internal_config.memory / 0x100000);
		available = requested - (unsigned) (total_mem / 0x100000);
		RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB,"
				" available: %uMB\n", requested, available);
		return -1;
	}

	return total_num_pages;
}
```

rte_eal_memzone_init

rte_eal_memzone_init is where the primary process initializes the memzones in the config file; it mainly calls rte_eal_malloc_heap_init.

For each memseg it carves out the free space delimited by a start_elem and an end_elem (both struct malloc_elem), then inserts start_elem at the head of the free_head list of the heap that matches its size.


```c
int
rte_eal_malloc_heap_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned ms_cnt;
	struct rte_memseg *ms;

	for (ms = &mcfg->memseg[0], ms_cnt = 0;
			(ms_cnt < RTE_MAX_MEMSEG) && (ms->len > 0);
			ms_cnt++, ms++) {
		/* assign every memseg to the malloc_heap of its socket, and file it
		 * into the free_head list matching its size */
		malloc_heap_add_memseg(&mcfg->malloc_heaps[ms->socket_id], ms);
	}

	return 0;
}

/*
 * Expand the heap with a memseg.
 * This reserves the zone and sets a dummy malloc_elem header at the end
 * to prevent overflow. The rest of the zone is added to free list as a single
 * large free block
 */
static void
malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
{
	/* allocate the memory block headers, one at end, one at start */
	struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr;
	struct malloc_elem *end_elem = RTE_PTR_ADD(ms->addr,
			ms->len - MALLOC_ELEM_OVERHEAD); /* leave room at the end of the memseg for end_elem */
	end_elem = RTE_PTR_ALIGN_FLOOR(end_elem, RTE_CACHE_LINE_SIZE); /* align downwards */

	const size_t elem_size = (uintptr_t)end_elem - (uintptr_t)start_elem;
	malloc_elem_init(start_elem, heap, ms, elem_size);
	malloc_elem_mkend(end_elem, start_elem); /* end_elem->prev = start_elem */
	malloc_elem_free_list_insert(start_elem); /* insert start_elem at the head of the doubly linked free_head list for its size */

	heap->total_size += elem_size;
}

/*
 * Given an element size, compute its freelist index.
 * We free an element into the freelist containing similarly-sized elements.
 * We try to allocate elements starting with the freelist containing
 * similarly-sized elements, and if necessary, we search freelists
 * containing larger elements.
 *
 * Example element size ranges for a heap with five free lists:
 *   heap->free_head[0] - (0   , 2^8]
 *   heap->free_head[1] - (2^8 , 2^10]
 *   heap->free_head[2] - (2^10, 2^12]
 *   heap->free_head[3] - (2^12, 2^14]
 *   heap->free_head[4] - (2^14, MAX_SIZE]
 */
size_t
malloc_elem_free_list_index(size_t size)
{
#define MALLOC_MINSIZE_LOG2   8
#define MALLOC_LOG2_INCREMENT 2

	size_t log2;
	size_t index;

	if (size <= (1UL << MALLOC_MINSIZE_LOG2))
		return 0;

	/* Find next power of 2 >= size. */
	log2 = sizeof(size) * 8 - __builtin_clzl(size-1);

	/* Compute freelist index, based on log2(size). */
	index = (log2 - MALLOC_MINSIZE_LOG2 + MALLOC_LOG2_INCREMENT - 1) /
		MALLOC_LOG2_INCREMENT;

	return (index <= RTE_HEAP_NUM_FREELISTS-1?
		index: RTE_HEAP_NUM_FREELISTS-1);
}
```
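
As a worked example, size = 3000 lands in free_head[2]: the next power of two >= 3000 is 2^12, so log2 = 12 and index = (12 - 8 + 2 - 1) / 2 = 2, which matches the (2^10, 2^12] range in the comment above.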

rte_eal_tailqs_init

Initialization runs rte_eal_tailqs_init->rte_eal_tailq_update: the primary creates mcfg->tailq_head[i], while secondaries attach to the addresses recorded in the config file.

For example, the tail queue needed by rte_mempool_create is registered through rte_eal_tailq_register into the rte_tailq_elem_head list by a constructor, before main even runs.

rte_eal_tailqs_init then initializes the corresponding queue inside the config file.

The commonly used rte_ring_tailq works the same way.


```c
static struct rte_tailq_elem rte_mempool_tailq = {
	.name = "RTE_MEMPOOL",
};
EAL_REGISTER_TAILQ(rte_mempool_tailq)

#define EAL_REGISTER_TAILQ(t) \
void tailqinitfn_ ##t(void); \
void __attribute__((constructor, used)) tailqinitfn_ ##t(void) \
{ \
	if (rte_eal_tailq_register(&t) < 0) \
		rte_panic("Cannot initialize tailq: %s\n", t.name); \
}
```

rte_eal_pci_probe

rte_eal_pci_probe->pci_probe_all_drivers->rte_eal_pci_probe_one_driver->pci_map_device->rte_eth_dev_init->eth_ixgbe_dev_init

Multi-threading

Every process, primary or secondary, has a master thread and slave threads. Which threads exist is decided by the CPU mask passed on the rte_eal_init command line: one thread runs on each selected core, and master and slaves synchronize through pipes, as sketched below.
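
A rough sketch of that pipe handshake, modeled on what rte_eal_remote_launch and eal_thread_loop do (reduced to its skeleton; some_worker is a hypothetical function, and error handling plus the RUNNING state transition are omitted):

```c
#include <unistd.h>

/* master side: hand a function to one slave lcore and wait for the ack */
static void
launch_on_slave(unsigned lcore)
{
	char c = 'w';	/* single command byte */

	lcore_config[lcore].f = some_worker;	/* hypothetical worker function */
	lcore_config[lcore].arg = NULL;
	write(lcore_config[lcore].pipe_master2slave[1], &c, 1);
	read(lcore_config[lcore].pipe_slave2master[0], &c, 1);	/* slave acked */
}

/* slave side: the eal_thread_loop skeleton */
static void
slave_loop(unsigned lcore)
{
	char c;

	for (;;) {
		/* block until the master sends a command */
		read(lcore_config[lcore].pipe_master2slave[0], &c, 1);
		/* acknowledge before running the job */
		write(lcore_config[lcore].pipe_slave2master[1], &c, 1);
		lcore_config[lcore].ret = lcore_config[lcore].f(lcore_config[lcore].arg);
		lcore_config[lcore].state = FINISHED;
	}
}
```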

Overall initialization flow


```c
int rte_eal_init(int argc, char **argv) {
	if (!rte_atomic32_test_and_set(&run_once))
		return -1;

	if (rte_eal_cpu_init() < 0) /* gather system CPU info into lcore_config */
		rte_panic("Cannot detect lcores\n");

	/* internally calls eal_proc_type_detect to decide whether the current
	 * process is primary or secondary: if the config file does not exist or
	 * its write lock can be taken, this is the primary */
	fctret = eal_parse_args(argc, argv);
	if (fctret < 0)
		exit(1);

	if (internal_config.no_hugetlbfs == 0 &&
			internal_config.process_type != RTE_PROC_SECONDARY &&
			internal_config.xen_dom0_support == 0 &&
			eal_hugepage_info_init() < 0) /* gather hugepage info */
		rte_panic("Cannot get hugepage information\n");

	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
		if (internal_config.no_hugetlbfs)
			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
		else
			internal_config.memory = eal_get_hugepage_mem_size();
	}

	rte_srand(rte_rdtsc());

	rte_config_init(); /* mmap the .rte_config file */

	if (rte_eal_pci_init() < 0) /* scan PCI devices and chain them onto pci_device_list in PCI address order */
		rte_panic("Cannot init PCI\n");

	if (rte_eal_memory_init() < 0) /* mmap all hugepages in use, shared and synchronized between primary and secondary */
		rte_panic("Cannot init memory\n");

	/* the directories are locked during eal_hugepage_info_init */
	eal_hugedirs_unlock();

	if (rte_eal_memzone_init() < 0) /* build the malloc_heap free_head lists from the memsegs */
		rte_panic("Cannot init memzone\n");

	if (rte_eal_tailqs_init() < 0)
		rte_panic("Cannot init tail queues for objects\n");

	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0)
		rte_panic("Cannot init logs\n");

	if (rte_eal_alarm_init() < 0)
		rte_panic("Cannot init interrupt-handling thread\n");

	if (rte_eal_intr_init() < 0)
		rte_panic("Cannot init interrupt-handling thread\n");

	if (rte_eal_timer_init() < 0)
		rte_panic("Cannot init HPET or TSC timers\n");

	eal_check_mem_on_local_socket();

	rte_eal_mcfg_complete();

	TAILQ_FOREACH(solib, &solib_list, next) {
		RTE_LOG(DEBUG, EAL, "open shared lib %s\n", solib->name);
		solib->lib_handle = dlopen(solib->name, RTLD_NOW);
		if (solib->lib_handle == NULL)
			RTE_LOG(WARNING, EAL, "%s\n", dlerror());
	}

	eal_thread_init_master(rte_config.master_lcore);

	ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);

	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%x;cpuset=[%s%s])\n",
		rte_config.master_lcore, (int)thread_id, cpuset,
		ret == 0 ? "" : "...");

	if (rte_eal_dev_init() < 0)
		rte_panic("Cannot init pmd devices\n");

	RTE_LCORE_FOREACH_SLAVE(i) {
		/*
		 * create communication pipes between master thread
		 * and children
		 */
		if (pipe(lcore_config[i].pipe_master2slave) < 0)
			rte_panic("Cannot create pipe\n");
		if (pipe(lcore_config[i].pipe_slave2master) < 0)
			rte_panic("Cannot create pipe\n");

		lcore_config[i].state = WAIT;

		/* create a thread for each lcore */
		ret = pthread_create(&lcore_config[i].thread_id, NULL,
				     eal_thread_loop, NULL);
		if (ret != 0)
			rte_panic("Cannot create thread\n");
	}

	/*
	 * Launch a dummy function on all slave lcores, so that master lcore
	 * knows they are all ready when this function returns.
	 */
	rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
	rte_eal_mp_wait_lcore();

	/* Probe & Initialize PCI devices */
	if (rte_eal_pci_probe())
		rte_panic("Cannot probe PCI\n");

	return fctret;
}
```
