memkind是用户可扩展的堆管理器,构建在jemalloc之上,可以控制各种内存之间的堆分区。由于在各种系统、环境中都需要分配内存,并且没有统一的标准,memkind正是为了提供统一的内存分配接口而生:http://memkind.github.io/memkind/memkind_arch_20150318.pdf。memkind_create_pmem()采用tmpfile方式创建临时文件,该文件在目录中不可见,并且当程序退出后会被自动释放/删除。另外,调用memkind_create_pmem()并不会立即创建文件,而是在memkind_pmem_mmap()被调用后才会真正创建。当首次访问(更新)一个由memkind分配的地址时,会产生一次缺页中断,用于建立虚拟地址到物理地址的映射,这个缺页中断的代价较高。memkind_free在释放内存时可能会把对应的物理页归还给操作系统,使得之后再次分配并访问同一段地址时重新产生缺页中断。设置环境变量“export MEMKIND_HOG_MEMORY=1”可以让memkind保留已经建立的映射而不归还内存,从而避免重复产生缺页中断。至于如何通过预先访问来避免第一次访问时的缺页中断代价,目前仍在讨论中。
// jemalloc extent-deallocation hook used by memkind's pmem kind (excerpt from
// the memkind sources; the tail of the body is truncated with "..." here).
bool pmem_extent_dalloc(extent_hooks_t *extent_hooks,
void *addr,
size_t size,
bool committed,
unsigned arena_ind)
{
bool result = true;
// When MEMKIND_HOG_MEMORY is set, keep the mapping instead of releasing it,
// so touching the same range after a future re-allocation does not fault again.
if (memkind_get_hog_memory()) {
return result;
}
// if madvise fail, it means that addr isn't mapped shared (doesn't come from pmem)
// and it should be also unmapped to avoid space exhaustion when calling large number of
// operations like memkind_create_pmem and memkind_destroy_kind
errno = 0;
// MADV_REMOVE frees the backing store of the file-backed mapping in place.
int status = madvise(addr, size, MADV_REMOVE);
...
}
示例 5中所使用的memkind的核心代码如下:
1.int err = memkind_create_pmem(filename, PMEM_SIZE, &pmem_kind);
2.disk=(disk_t *) memkind_calloc(MEMKIND_DEFAULT,1,sizeof(disk_t));
3.page = (unsigned char *)memkind_malloc(pmem_kind, PAGE_SIZE);
4.pmem_memcpy_nodrain(page,content,PAGE_SIZE);
5.memkind_free(pmem_kind,page);
6.memkind_free(MEMKIND_DEFAULT,disk);
7.memkind_destroy_kind(pmem_kind);
示例 5 持久内存实现易失的页缓存
#include <iostream>
#include <chrono>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <libpmem.h>
#include <memkind.h>
#include <time.h>
#include <string.h>
#include <unistd.h>
#define PAGE_SIZE 4096
#define BLOCK_PAGE_NUM 1024
#define REQ_BLOCK 10
#define REQ_PAGE 0x3ff
#define DISK_BLOCK_NUM 1048576 //1024*1024 blocks
#define PMEM_SIZE 100*1024*1024*1024UL
/*****************************************
 Volatile page-cache layout.
 Cached 4K pages live in PMEM (allocated from the pmem kind in write_req);
 the metadata tables below live in DRAM (allocated from MEMKIND_DEFAULT).
*/
typedef struct block_page {
unsigned char * page; // 4K page in PMEM; NULL while the page is not cached
}block_page_t;
typedef struct block_data {
block_page_t pages[BLOCK_PAGE_NUM]; // 1024 page slots per block
uint32_t cached_pages_cnt; //if the count over one threshold, might need to write back to the SSD.
// NOTE(review): cached_pages_cnt is never updated anywhere in this listing — confirm intent.
}block_data_t;
typedef struct disk_s {
block_data_t blocks[DISK_BLOCK_NUM]; // 1024*1024 blocks modelling the cached "disk"
}disk_t;
disk_t * disk;             // global metadata table, allocated in cbs_init()
struct memkind *pmem_kind; // pmem-backed kind created by memkind_create_pmem()
//初始化,传入的是文件的名称,文件名称也可以定义在头文件中间。
int cbs_init(const char * filename)
{
int i;
int err = memkind_create_pmem(filename, PMEM_SIZE, &pmem_kind);
if (err) {
perror("memkind_create_pmem()");
return 1;
}
disk=(disk_t *) memkind_calloc(MEMKIND_DEFAULT,1,sizeof(disk_t));
if(disk == NULL) {
return 1;
}
return 0;
}
// Return the sum of every block's cached-page counter.
int get_cached_count()
{
    int total = 0;
    for (int blk = 0; blk < DISK_BLOCK_NUM; ++blk) {
        total += disk->blocks[blk].cached_pages_cnt;
    }
    return total;
}
// Write or update one 4K page. req_id selects the slot; content (PAGE_SIZE
// bytes) is copied into it. Returns 0 on success, -1 on bad id or failed
// allocation. Must only be called after cbs_init() succeeded.
int write_req(uint64_t req_id, unsigned char * content) {
    // req_id>>10 is the block id (1024 pages/block); req_id&0x3ff is the
    // page id inside that block.
    uint64_t block_id = req_id >> REQ_BLOCK;
    uint64_t req_page_id = req_id & REQ_PAGE;
    if (block_id >= DISK_BLOCK_NUM) {  // was '>': index DISK_BLOCK_NUM is one past the end
        printf(" write req_id is not valid and over the disk range\n");
        return -1;
    }
    unsigned char * page = disk->blocks[block_id].pages[req_page_id].page;
    if (page == NULL) {
        // First write to this slot: allocate the page from persistent memory.
        page = (unsigned char *) memkind_malloc(pmem_kind, PAGE_SIZE);
        if (page == NULL) {
            return -1;  // was silently returning 0 and dropping the write
        }
        disk->blocks[block_id].pages[req_page_id].page = page;
    }
    // nodrain: the cache is volatile by design, no persistence barrier needed.
    pmem_memcpy_nodrain(page, content, PAGE_SIZE);
    //memcpy(page,content,PAGE_SIZE);
    return 0;
}
// Look up the cached page for req_id. Returns a pointer to the 4K page in
// PMEM, or NULL when the id is out of range or the page was never written.
// Must only be called after cbs_init() succeeded.
void * read_req(uint64_t req_id) {
    uint64_t block_id = req_id >> REQ_BLOCK;
    uint64_t req_page_id = req_id & REQ_PAGE;
    if (block_id >= DISK_BLOCK_NUM) {  // was '>': index DISK_BLOCK_NUM is one past the end
        printf("read req_id is not valid and over the disk range\n");
        return NULL;
    }
    return disk->blocks[block_id].pages[req_page_id].page;
}
// Free the cached page for req_id (if any) back to the pmem kind and clear
// its slot. Out-of-range ids are reported and ignored.
// Must only be called after cbs_init() succeeded.
void delete_page(uint64_t req_id) {
    uint64_t block_id = req_id >> REQ_BLOCK;
    uint64_t req_page_id = req_id & REQ_PAGE;
    if (block_id >= DISK_BLOCK_NUM) {  // was '>': index DISK_BLOCK_NUM is one past the end
        printf(" delete req_id is not valid and over the disk range\n");
        return;
    }
    unsigned char * page = disk->blocks[block_id].pages[req_page_id].page;
    if (page != NULL) {
        memkind_free(pmem_kind, page);
        disk->blocks[block_id].pages[req_page_id].page = NULL;
    }
}
#define WRITE_COUNT 100000
#define OVERWRITE_COUNT 10000
int main()
{
// calculate the time
unsigned char * page_content=(unsigned char *)malloc(4096);
uint64_t i=0;
auto start=std::chrono::steady_clock::now();
auto stop=std::chrono::steady_clock::now();
std::chrono::duration<double> diff=stop-start;
char * read_content;
memset(page_content,0xab,4096);
start=std::chrono::steady_clock::now();
cbs_init("/mnt/pmem0");
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"cbs_init time "<<diff.count()<<std::endl;
std::cout<<"cached page count" << get_cached_count()<<std::endl;
start = std::chrono::steady_clock::now();
for(i=0;i<WRITE_COUNT;i++) {
write_req(i,page_content);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"write_req time "<<diff.count()/WRITE_COUNT<<std::endl;
memset(page_content,0xcd,4096);
start = std::chrono::steady_clock::now();
for(i=0;i<OVERWRITE_COUNT;i++) {
write_req(i,page_content);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"overwrite write_req update take time "<< diff.count()/OVERWRITE_COUNT<<std::endl;
start = std::chrono::steady_clock::now();
for(i=0;i<OVERWRITE_COUNT;i++) {
read_content=(char *)read_req(i);
memcpy(page_content,read_content,PAGE_SIZE);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"overwrite read_req take time "<<diff.count()/OVERWRITE_COUNT<<std::endl;
printf("the page should fill with paten 0xcd, 0x%x\n", read_content[0]);
start = std::chrono::steady_clock::now();
for(i=OVERWRITE_COUNT;i<WRITE_COUNT;i++) {
read_content=(char *)read_req(i);
memcpy(page_content,read_content,PAGE_SIZE);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"overwrite->write count read_req take time "<<diff.count()/(WRITE_COUNT-OVERWRITE_COUNT)<<std::endl;
printf("the page should fill with patern 0xab, 0x%x\n", read_content[0]);
//start = std::chrono::steady_clock::now();
//for(i=0;i<WRITE_COUNT;i++) {
// delete_page(i);
//}
//stop=std::chrono::steady_clock::now();
//diff=stop-start;
//std::cout<<"delete write count take time "<<diff.count()/WRITE_COUNT<<std::endl;
memkind_free(MEMKIND_DEFAULT,disk);
memkind_pmem_destroy (pmem_kind);
return 0;
}
使用“g++ cbs_req_memkind.cpp -o cbs_req_memkind -lmemkind -lpmem -O2”编译后运行“taskset -c 2 ./cbs_req_memkind”得到的性能,第一次分配和写入4K页需要5.25us,而第二次的更新写入只需要958 ns 不到1us:
➜ ~ taskset -c 2 ./cbs_req_memkind
cbs_init time 0.0954005
cached page count0
write_req time 5.25884e-06
overwrite write_req update take time 9.58645e-07
overwrite read_req take time 1.14597e-06
the page should fill with paten 0xcd, 0xcd
overwrite->write count read_req take time 1.08656e-06
the page should fill with patern 0xab, 0xab
memkind还可以管理numa节点上内存的分配,如果你有多个numa节点可以使用memkind来进行分配。其中持久内存可以配置成为系统中的numa节点。Kernel5.1以上会支持,这样在系统中可以使用MEMKIND_DAX_KMEM的静态类型(static kinds)来直接访问持久内存。性能应该和上述示例没有区别。如果你有兴趣可以尝试编写这样的示例。
由于有多个kind类型的内存使用memkind来进行分配,在释放这些内存时,我们很难判断这个内存来自于哪一个kind,所以在memkind中可以支持使用无类型的释放memkind_free(NULL,page); memkind会自动找出类型并去释放,这个会带来一定的性能代价。
Call For Action:
1.https://github.com/memkind/memkind
2.http://memkind.github.io/memkind/
Tips:
1.应用需要自己来决定哪些空间放在DRAM中,哪些放在PMEM中。
2.注意环境变量“export MEMKIND_HOG_MEMORY=1”的使用,如果page fault的影响非常大,可以使用export MEMKIND_HOG_MEMORY=1。
The baseline store on DRAM and SSD
为了让大家对持久内存编程方法和性能方面有一个更加直观的感受,我们将上述示例要求的4K页写入DRAM和写入普通的SSD。写入DRAM的性能我们可以参考最后的memkind示例即可,将pmem_kind改为MEMKIND_DEFAULT,或者将memkind的接口换成glibc的接口就可以了。
#include <iostream>
#include <chrono>
#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
//#include <libpmem.h>
#include <memkind.h>
#include <time.h>
#include <string.h>
#include <unistd.h>
#define PAGE_SIZE 4096
#define BLOCK_PAGE_NUM 1024
#define REQ_BLOCK 10
#define REQ_PAGE 0x3ff
#define DISK_BLOCK_NUM 1048576 //1024*1024 blocks
//#define PMEM_SIZE 100*1024*1024*1024UL
/*****************************************
 DRAM-only baseline of the page cache.
 In this variant the 4K pages are allocated from MEMKIND_DEFAULT (DRAM),
 not from a pmem kind (see write_req); the metadata layout is identical
 to the pmem example.
*/
typedef struct block_page {
unsigned char * page; // 4K page in DRAM; NULL while the page is not cached
}block_page_t;
typedef struct block_data {
block_page_t pages[BLOCK_PAGE_NUM]; // 1024 page slots per block
uint32_t cached_pages_cnt; //if the count over one threshold, might need to write back to the SSD.
// NOTE(review): cached_pages_cnt is never updated anywhere in this listing — confirm intent.
}block_data_t;
typedef struct disk_s {
block_data_t blocks[DISK_BLOCK_NUM]; // 1024*1024 blocks modelling the cached "disk"
}disk_t;
disk_t * disk; // global metadata table, allocated in cbs_init()
//struct memkind *pmem_kind;
// Initialize the DRAM-only baseline: allocate the disk metadata table from
// MEMKIND_DEFAULT. The pmem-kind creation of the original example is kept
// commented out for comparison; `filename` is unused in this variant.
// Returns 0 on success, 1 on failure.
int cbs_init(const char * filename)
{
    //int err = memkind_create_pmem(filename, PMEM_SIZE, &pmem_kind);
    //if (err) {
    // perror("memkind_create_pmem()");
    // return 1;
    //}
    (void) filename;  // silence unused-parameter warnings in this DRAM variant
    disk = (disk_t *) memkind_calloc(MEMKIND_DEFAULT, 1, sizeof(disk_t));
    if (disk == NULL) {
        return 1;
    }
    return 0;
}
// Walk every block of the disk table and accumulate its cached-page counter.
int get_cached_count()
{
    int accumulated = 0;
    int idx = 0;
    while (idx < DISK_BLOCK_NUM) {
        accumulated += disk->blocks[idx].cached_pages_cnt;
        ++idx;
    }
    return accumulated;
}
// Write or update one 4K page in the DRAM baseline. req_id selects the slot;
// content (PAGE_SIZE bytes) is copied into it. Returns 0 on success, -1 on
// bad id or failed allocation. Must only be called after cbs_init() succeeded.
int write_req(uint64_t req_id, unsigned char * content) {
    // req_id>>10 is the block id (1024 pages/block); req_id&0x3ff is the
    // page id inside that block.
    uint64_t block_id = req_id >> REQ_BLOCK;
    uint64_t req_page_id = req_id & REQ_PAGE;
    if (block_id >= DISK_BLOCK_NUM) {  // was '>': index DISK_BLOCK_NUM is one past the end
        printf(" write req_id is not valid and over the disk range\n");
        return -1;
    }
    unsigned char * page = disk->blocks[block_id].pages[req_page_id].page;
    if (page == NULL) {
        // First write to this slot: allocate the page from DRAM.
        page = (unsigned char *) memkind_malloc(MEMKIND_DEFAULT, PAGE_SIZE);
        if (page == NULL) {
            return -1;  // was silently returning 0 and dropping the write
        }
        disk->blocks[block_id].pages[req_page_id].page = page;
    }
    //pmem_memcpy_nodrain(page,content,PAGE_SIZE);
    memcpy(page, content, PAGE_SIZE);
    return 0;
}
// Look up the cached page for req_id. Returns a pointer to the 4K page in
// DRAM, or NULL when the id is out of range or the page was never written.
// Must only be called after cbs_init() succeeded.
void * read_req(uint64_t req_id) {
    uint64_t block_id = req_id >> REQ_BLOCK;
    uint64_t req_page_id = req_id & REQ_PAGE;
    if (block_id >= DISK_BLOCK_NUM) {  // was '>': index DISK_BLOCK_NUM is one past the end
        printf("read req_id is not valid and over the disk range\n");
        return NULL;
    }
    return disk->blocks[block_id].pages[req_page_id].page;
}
// Free the cached page for req_id (if any) back to MEMKIND_DEFAULT and clear
// its slot. Out-of-range ids are reported and ignored.
// Must only be called after cbs_init() succeeded.
void delete_page(uint64_t req_id) {
    uint64_t block_id = req_id >> REQ_BLOCK;
    uint64_t req_page_id = req_id & REQ_PAGE;
    if (block_id >= DISK_BLOCK_NUM) {  // was '>': index DISK_BLOCK_NUM is one past the end
        printf(" delete req_id is not valid and over the disk range\n");
        return;
    }
    unsigned char * page = disk->blocks[block_id].pages[req_page_id].page;
    if (page != NULL) {
        memkind_free(MEMKIND_DEFAULT, page);
        disk->blocks[block_id].pages[req_page_id].page = NULL;
    }
}
#define WRITE_COUNT 100000
#define OVERWRITE_COUNT 10000
int main()
{
// calculate the time
unsigned char * page_content=(unsigned char *)malloc(4096);
uint64_t i=0;
auto start=std::chrono::steady_clock::now();
auto stop=std::chrono::steady_clock::now();
std::chrono::duration<double> diff=stop-start;
char * read_content;
memset(page_content,0xab,4096);
start=std::chrono::steady_clock::now();
cbs_init("/mnt/pmem0");
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"cbs_init time "<<diff.count()<<std::endl;
std::cout<<"cached page count" << get_cached_count()<<std::endl;
start = std::chrono::steady_clock::now();
for(i=0;i<WRITE_COUNT;i++) {
write_req(i,page_content);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"write_req time "<<diff.count()/WRITE_COUNT<<std::endl;
memset(page_content,0xcd,4096);
start = std::chrono::steady_clock::now();
for(i=0;i<OVERWRITE_COUNT;i++) {
write_req(i,page_content);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"overwrite write_req update take time "<< diff.count()/OVERWRITE_COUNT<<std::endl;
start = std::chrono::steady_clock::now();
for(i=0;i<OVERWRITE_COUNT;i++) {
read_content=(char *)read_req(i);
memcpy(page_content,read_content,PAGE_SIZE);
assert(page_content[0]==0xcd);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"overwrite read_req take time "<<diff.count()/OVERWRITE_COUNT<<std::endl;
printf("the page should fill with paten 0xcd, 0x%x\n", page_content[0]);
start = std::chrono::steady_clock::now();
for(i=OVERWRITE_COUNT;i<WRITE_COUNT;i++) {
read_content=(char *)read_req(i);
memcpy(page_content,read_content,PAGE_SIZE);
assert(page_content[0]==0xab);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"overwrite->write count read_req take time "<<diff.count()/(WRITE_COUNT-OVERWRITE_COUNT)<<std::endl;
printf("the page should fill with patern 0xab, 0x%x\n", page_content[0]);
for(i=0;i<WRITE_COUNT;i++) {
delete_page(i);
}
sleep(10);
start = std::chrono::steady_clock::now();
for(i=0;i<WRITE_COUNT;i++) {
write_req(i,page_content);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"write_req time "<<diff.count()/WRITE_COUNT<<std::endl;
memset(page_content,0xcd,4096);
start = std::chrono::steady_clock::now();
for(i=0;i<OVERWRITE_COUNT;i++) {
write_req(i,page_content);
}
stop=std::chrono::steady_clock::now();
diff=stop-start;
std::cout<<"overwrite write_req update take time "<< diff.count()/OVERWRITE_COUNT<<std::endl;
//start = std::chrono::steady_clock::now();
//for(i=0;i<WRITE_COUNT;i++) {
// delete_page(i);
//}
//stop=std::chrono::steady_clock::now();
//diff=stop-start;
//std::cout<<"delete write count take time "<<diff.count()/WRITE_COUNT<<std::endl;
return 0;
}
使用memkind去分配DRAM,通过编译” g++ cbs_req_dram.cpp -o cbs_req_dram -lmemkind -O2”,然后测试得到的结果,第一次的写由于有page fault,所以需要1.5us,但是第二次只需要696ns:读的性能大概在253ns~331ns。
➜ ~ taskset -c 2 ./cbs_req_dram
cbs_init time 0.00107337
cached page count0
write_req time 1.50446e-06
overwrite write_req update take time 6.9639e-07
overwrite read_req take time 2.53707e-07
the page should fill with paten 0xcd, 0xcd
overwrite->write count read_req take time 3.31147e-07
the page should fill with patern 0xab, 0xab
write_req time 1.30782e-06
overwrite write_req update take time 6.96456e-07
而如果将4K页以及相应的meta data写到SSD,我们需要保证将4K页还有相应的meta data保存到SSD中。我们可以使用libpmem示例的方法来保存数据到SSD中并从SSD中恢复数据。libpmem示例的方法是使用pmem_memcpy将要持久化的数据写入PMEM,而我们可以使用pmem_msync的方法将数据写入到SSD。在示例 1的基础上做这些改动:
1.将pmem_map_file(filename,PMEM_SIZE,
PMEM_FILE_CREATE, 0666, &mapped_len, &is_pmem), filename指向在ssd上的文件,同时is_pmem将会是false,将is_pmem设置为全局变量。
2.在使用pmem_memcpy_persist;pmem_persist;pmem_memset_persistent的地方使用类似下面的操作:
if(is_pmem) {
pmem_memcpy_persist(PAGE_FROM_META(page_new), content, 4096);
} else {
memcpy(PAGE_FROM_META(page_new),content,4096);
pmem_msync(PAGE_FROM_META(page_new),4096);
}
这样就可以完成将数据写入SSD的过程,这个过程中也包含meta data的写入。为了避免page cache的影响,在测试之前我们会先将page cache drop掉“echo 3 > /proc/sys/vm/drop_caches”,这样第一次必须从SSD介质中去读数据,所以性能大概是在23us~30us;写的性能大概在430us;而第二次读由于已经在page cache中, 此时读是364ns~390ns,和DRAM的性能几乎相当。SSD可以保证一个扇区(sector)的断电原子性,所以在这个里面8字节应该是可以保证原子性的。
➜ ~ echo 3 > /proc/sys/vm/drop_caches
➜ ~ taskset -c 2 ./cbs_req_ssd
pmem_map_file mapped_len=107374182400, is_pmem=0
init done, pagecache_num=23588351,free page number=23588351
cbs_init time 107.153
cached page count0
overwrite read_req take time 7.109e-09
the page should fill with paten 0xcd, 0x20
overwrite->write count read_req take time 7.03638e-09
the page should fill with patern 0xab, 0x20
write_req time 0.000269097
overwrite write_req update take time 0.000282651
overwrite read_req take time 4.84718e-07
the page should fill with paten 0xcd, 0xcd
overwrite->write count read_req take time 5.37396e-07
the page should fill with patern 0xab, 0xab
➜ ~ echo 3 > /proc/sys/vm/drop_caches
➜ ~ taskset -c 2 ./cbs_req_ssd
pmem_map_file mapped_len=107374182400, is_pmem=0
init done, pagecache_num=23588351,free page number=23488351
cbs_init time 6.6092
cached page count100000
overwrite read_req take time 2.31789e-05
the page should fill with paten 0xcd, 0xcd
overwrite->write count read_req take time 3.03323e-05
the page should fill with patern 0xab, 0xab
write_req time 0.000439356
overwrite write_req update take time 0.000431005
overwrite read_req take time 3.64774e-07
the page should fill with paten 0xcd, 0xcd
overwrite->write count read_req take time 3.90219e-07
the page should fill with patern 0xab, 0xab
思考,这个地方可以如何优化?
1.是不是需要每次写操作都需要sync?
2.Metadata的设计可不可以简化,是否直接将req_id转成文件的offset并写入就可以了? 是否使用CRC来检查数据完整性和一致性?为了简化,我们只考虑将4K页数据写入SSD(当然SSD的型号,容量大小等等可能对性能有影响,我们用P4510为例),不考虑任何的meta data。