C++ | 标准库 string 的内存结构(前8字节是指针，接着8字节是字符个数，最后16字节可能是字符串本身)

biomooc

已于 2023-01-05 16:49:49 修改

阅读量1.2k

点赞数

分类专栏： C++ 文章标签： c++ 开发语言

于 2023-01-05 15:54:48 首次发布

本文链接：https://blog.csdn.net/wangjunliang/article/details/128561656

版权

C++ 专栏收录该内容

7 篇文章 1 订阅

订阅专栏

水平不够，不阅读源码的情况下，本文探索 c++ string 的内存占用情况。
环境: Ubuntu 20.04, g++ 9.4.0

1. 遇到的问题：memcpy 复制字符串为什么会报错？

#include<iostream>
#include<vector>
#include<cstring>

// A book record: a title and a price.
// The std::string member makes this type non-trivially-copyable.
struct Book
{
    std::string name;   // title text
    double price;       // price of the book
};
// Experiment: byte-copies a Book (which contains a std::string) out of a
// vector into another Book with memcpy, then prints the result.
// NOTE(review): Book is not trivially copyable, so the memcpy below is outside
// what the C++ standard defines; this function exists to show the failure.
void demo1(){
    Book book1, book2;
    std::vector<Book> shelf;
    // 1. Assign values to book1 (and to book2, so the change is visible later).
    book1.name="this is a book about C lang, including the main part of C and its std lib."; book1.price=40.8;
    book2.name="c++ lang"; book2.price=80.8;
    std::cout << "(1)book2: " << book2.name << ", " << book2.price << std::endl;
    // 2. Append a copy of book1 to the vector.
    shelf.push_back(book1);
    // 3. Copy vector[0] over book2 byte by byte.
    // Presumably book2.name and shelf[0].name then refer to the same character
    // buffer and both destructors release it — TODO confirm against the
    // library's string implementation.
    memcpy(&book2, &shelf[0], sizeof(Book));
    // Print book2 again after the byte copy.
    std::cout << "(2)book2: " << book2.name << ", " << book2.price << std::endl;
}

// Entry point: runs the demo1 experiment.
int main()
{
    demo1();
    // Reaching the end of main is equivalent to "return 0".
}

输出如下:

$ g++ a11_memcpy_vector2struct.cpp && ./a.out 
(1)book2: c++ lang, 80.8
(2)book2: this is a book about C lang, including the main part of C and its std lib., 40.8
free(): double free detected in tcache 2
Aborted (core dumped)

复制string后的报错

free(): double free detected in tcache 2
Aborted (core dumped)

像是同一个地址被释放了2次。
探究确切原因，就需要知道 string 在内存中的细节。

2. 查看内存的每个字节的2进制及16进制

详见 C语言 | 数字(int, float 为例)在内存中的二进制表示与查看

大小端：小端为例。

#include<iostream>
#include<vector>
#include<cstring>

// Helper: prints one byte located at p as
//   [address] 8 binary digits (most significant first) 0xHEX | raw character
void show_byte(void *p){
    printf("[%p] ", p);
    const unsigned char *byte_ptr = static_cast<const unsigned char *>(p);
    // Binary: slide a single-bit mask from bit 7 down to bit 0.
    for(unsigned mask = 0x80u; mask != 0; mask >>= 1){
        printf("%s", (*byte_ptr & mask) ? "1" : "0");
    }
    // Hexadecimal value, then the same byte printed as a character.
    printf(" 0x%X | %c  \n", *byte_ptr, *byte_ptr);
}

// Prints len consecutive bytes starting at p, one show_byte() line each,
// followed by an empty line.
void show_bytes(void *p, int len){
    unsigned char *base = static_cast<unsigned char *>(p);
    int offset = 0;
    while(offset < len){
        show_byte(base + offset);
        ++offset;
    }
    printf("\n");
}

// Shows the byte stored at the address of an int whose value is 1.
void demo2(){
    int value = 1;
    void *first_byte = &value;
    show_byte(first_byte);
}

// Entry point: runs the demo2 experiment.
int main()
{
    demo2();
    // Reaching the end of main is equivalent to "return 0".
}

输出:

$ g++ a11_memcpy_vector2struct.cpp && ./a.out 
[0x7ffc39431e04] 00000001 0x1 |

3. 探究string 的内存细节

#include<iostream>
#include<vector>
#include<cstring>

// A book record: a title and a price.
struct Book
{
    std::string name;   // title text
    double price;       // price of the book
};

// Helper: prints one byte located at p as
//   [address] 8 binary digits (most significant first) 0xHEX | raw character
void show_byte(void *p){
    printf("[%p] ", p);
    const unsigned char *byte_ptr = static_cast<const unsigned char *>(p);
    // Binary: slide a single-bit mask from bit 7 down to bit 0.
    for(unsigned mask = 0x80u; mask != 0; mask >>= 1){
        printf("%s", (*byte_ptr & mask) ? "1" : "0");
    }
    // Hexadecimal value, then the same byte printed as a character.
    printf(" 0x%X | %c  \n", *byte_ptr, *byte_ptr);
}

// Prints len consecutive bytes starting at p, one show_byte() line each,
// followed by an empty line.
void show_bytes(void *p, int len){
    unsigned char *base = static_cast<unsigned char *>(p);
    int offset = 0;
    while(offset < len){
        show_byte(base + offset);
        ++offset;
    }
    printf("\n");
}

// Shows the byte stored at the address of an int whose value is 1.
void demo2(){
    int value = 1;
    void *first_byte = &value;
    show_byte(first_byte);
}


// Exploratory experiment, kept as-is; a simpler version is written separately.
// Prints reference stack/heap addresses, the addresses of two std::string
// objects and of their character data, then overwrites s2 with the bytes of s1
// via memcpy while preserving the first pointer-sized word of s2.
void demo3(){
    // 1. Print a stack address and a heap address for reference.
    int x0=10;
    printf("stack addr: &x0=%p (x0=%d)\n", &x0, x0); // address of an automatic variable
    int *x1=(int *)malloc(sizeof(int));
    *x1=20;
    printf(" heap addr:  x1=%p (*x1=%d)\n", x1, *x1); // address of a heap allocation
    free(x1);
    int arr[3]={10,20,30};
    for(int i=0; i<3; i++){
        printf("%p | arr[%d]=%d\n", &arr[i], i, arr[i]); // neighbouring ints are sizeof(int) bytes apart
    }
    printf("\n");

    // 2. Get the addresses related to the strings.
    std::string s1="c lang-2nd", s2="";
    // Addresses of the string objects themselves.
    printf("1 addr: &s1=%p, s1=%s\n", &s1, s1.c_str()); // address of the string variable
    printf("1 addr: &s2=%p, s2=%s\n", &s2, s2.c_str()); // address of the string variable
    printf("2 addr: s1.c_str()=%p (val=%s)\n", s1.c_str(), s1.c_str());// address returned by c_str() (C-style character data)
    printf("2 addr: s2.c_str()=%p (val=%s)\n", s2.c_str(), s2.c_str());// address returned by c_str() (C-style character data)
    auto iter=s1.begin();
    printf("3 addr: &*iter=%p (*iter=%c)\n", &*iter, *iter); // address of the first character, obtained through an iterator

    // Print the address of every character.
    const char *pStr=s1.c_str(); // does not compile without const
    for(int i=0; i<strlen(pStr); i++){
        printf("    %p | pStr[%d]=%c\n", &pStr[i], i, pStr[i]);
    }
    printf("\n");

    // 3. What are the bytes between &s2 and s2.c_str()?
    //    (In the recorded run the two addresses differ by 0x10, i.e. 16 bytes.)
    /*
    printf("\nS1:\n");
    show_bytes(&s1, 20);
    printf("\nS2:\n");
    show_bytes(&s2, 20);
    printf("\n");
    */


    // 4. Copy a string with memcpy: done naively this fails when the strings are released.
    std::cout << "s1: " << s1 << std::endl;
    std::cout << "s2: " << s2 << std::endl;

    // Before copying, save the first pointer-sized word of s2.
    int a=-1, *pA=&a; // pointer variable pA, about to receive the first 8 bytes of s2
    //printf("===>1 pA=%p\n", pA);
    memcpy(&pA, &s2, sizeof(int*));
    //printf("===>2 pA=%p\n", pA); // confirms the first 8 bytes of s2 were copied into the pointer
    
    // Byte-copy the string object: s1 overwrites s2.
    // NOTE(review): std::string is not trivially copyable, so this memcpy is
    // outside what the C++ standard defines; it is the experiment itself.
    memcpy(&s2, &s1, sizeof(s1));

    // Restore the first 8 bytes of s2 that were saved above.
    memcpy(&s2, &pA, sizeof(int*));

    printf("\nAfter memcpy: \n");
    /* check addr
    printf("\nafter: S1:\n");
    show_bytes(&s1, 20);
    printf("\nafter: S2:\n");
    show_bytes(&s2, 20);
    printf("\n");

    s2.clear();
    s1.clear(); // clear() does not make the error go away either
    */
    std::cout << "s1: " << s1 << std::endl;
    std::cout << "s2: " << s2 << std::endl;
}



// Entry point: runs the demo3 experiment.
int main()
{
    demo3();
    // Reaching the end of main is equivalent to "return 0".
}

输出如下:

$ g++ a03.cpp && ./a.out 
stack addr: &x0=0x7ffeb23bd870 (x0=10)
 heap addr:  x1=0x56061d2682c0 (*x1=20)
0x7ffeb23bd8a4 | arr[0]=10
0x7ffeb23bd8a8 | arr[1]=20
0x7ffeb23bd8ac | arr[2]=30

1 addr: &s1=0x7ffeb23bd8b0, s1=c lang-2nd
1 addr: &s2=0x7ffeb23bd8d0, s2=
2 addr: s1.c_str()=0x7ffeb23bd8c0 (val=c lang-2nd)
2 addr: s2.c_str()=0x7ffeb23bd8e0 (val=)
3 addr: &*iter=0x7ffeb23bd8c0 (*iter=c)
    0x7ffeb23bd8c0 | pStr[0]=c
    0x7ffeb23bd8c1 | pStr[1]= 
    0x7ffeb23bd8c2 | pStr[2]=l
    0x7ffeb23bd8c3 | pStr[3]=a
    0x7ffeb23bd8c4 | pStr[4]=n
    0x7ffeb23bd8c5 | pStr[5]=g
    0x7ffeb23bd8c6 | pStr[6]=-
    0x7ffeb23bd8c7 | pStr[7]=2
    0x7ffeb23bd8c8 | pStr[8]=n
    0x7ffeb23bd8c9 | pStr[9]=d

s1: c lang-2nd
s2: 

After memcpy: 
s1: c lang-2nd
s2: c lang-2nd

问题：已知 string s2; 则 &s2 和 s2.c_str() 之间的 16 个字节(地址相差 0x10，即十进制的 16)是什么？

4. 精简版例子：如何使用 memcpy 复制 string 变量?

#include<iostream>
#include<vector>
#include<cstring>

// A book record: a title and a price.
struct Book
{
    std::string name;   // title text
    double price;       // price of the book
};

// Helper: prints one byte located at p as
//   [address] 8 binary digits (most significant first) 0xHEX | raw character
void show_byte(void *p){
    printf("[%p] ", p);
    const unsigned char *byte_ptr = static_cast<const unsigned char *>(p);
    // Binary: slide a single-bit mask from bit 7 down to bit 0.
    for(unsigned mask = 0x80u; mask != 0; mask >>= 1){
        printf("%s", (*byte_ptr & mask) ? "1" : "0");
    }
    // Hexadecimal value, then the same byte printed as a character.
    printf(" 0x%X | %c  \n", *byte_ptr, *byte_ptr);
}

// Prints len consecutive bytes starting at p, one show_byte() line each,
// followed by an empty line.
void show_bytes(void *p, int len){
    unsigned char *base = static_cast<unsigned char *>(p);
    int offset = 0;
    while(offset < len){
        show_byte(base + offset);
        ++offset;
    }
    printf("\n");
}

// Shows the byte stored at the address of an int whose value is 1.
void demo2(){
    int value = 1;
    void *first_byte = &value;
    show_byte(first_byte);
}

// Short version: how to copy a std::string variable with memcpy.
// NOTE(review): std::string is not trivially copyable, so this technique is
// outside what the C++ standard defines; treat it as an experiment only.
void demo4(){
    // Bitwise overwrite: with step 2 alone the program aborts with
    // "free(): invalid pointer Aborted (core dumped)"; with steps 1 and 3 added it runs normally.
    // Reason (author's explanation): the first 8 bytes of a string hold the address where its
    // characters are stored; after a bitwise copy that address would be released twice.
    // Workaround: save the first 8 bytes of the destination string, overwrite it bitwise,
    // then restore those first 8 bytes.
    // NOTE(review): presumably this only works because both strings here are short; if the
    // source kept its characters elsewhere, the restored pointer would still refer to the
    // destination's old characters while the copied length is the source's — verify.
    // The second 8 bytes of a string variable look like the string length //todo
    std::string s1="c lang-2nd", s2="";
    std::cout << "s1: " << s1 << std::endl;
    std::cout << "s2: " << s2 << std::endl;

    // step1. Before copying, save the first 8 bytes of the destination string.
    int a=-1, *pA=&a; // pointer variable pA, about to receive the first 8 bytes of s2
    memcpy(&pA, &s2, sizeof(int*));

    // step2. Byte-copy the string object.
    memcpy(&s2, &s1, sizeof(s1)); // the 2nd argument overwrites the 1st; the 3rd argument is the byte count

    // step3. Restore the first 8 bytes of the overwritten string s2.
    memcpy(&s2, &pA, sizeof(int*));

    std::cout << "s1: " << s1 << std::endl;
    std::cout << "s2: " << s2 << std::endl;
}


// Entry point: runs the demo4 experiment.
int main()
{
    demo4();
    // Reaching the end of main is equivalent to "return 0".
}

输出:

$ g++ a04.cpp && ./a.out 
s1: c lang-2nd
s2: 
s1: c lang-2nd
s2: c lang-2nd

5. 探究 string 的内存结构

#include<iostream>
#include<vector>
#include<cstring>

// A book record: a title and a price.
struct Book
{
    std::string name;   // title text
    double price;       // price of the book
};

// Helper: prints one byte located at p as
//   [address] 8 binary digits (most significant first) 0xHEX | raw character
void show_byte(void *p){
    printf("[%p] ", p);
    const unsigned char *byte_ptr = static_cast<const unsigned char *>(p);
    // Binary: slide a single-bit mask from bit 7 down to bit 0.
    for(unsigned mask = 0x80u; mask != 0; mask >>= 1){
        printf("%s", (*byte_ptr & mask) ? "1" : "0");
    }
    // Hexadecimal value, then the same byte printed as a character.
    printf(" 0x%X | %c  \n", *byte_ptr, *byte_ptr);
}

// Prints len consecutive bytes starting at p, one show_byte() line each,
// followed by an empty line.
void show_bytes(void *p, int len){
    unsigned char *base = static_cast<unsigned char *>(p);
    int offset = 0;
    while(offset < len){
        show_byte(base + offset);
        ++offset;
    }
    printf("\n");
}

// Shows the byte stored at the address of an int whose value is 1.
void demo2(){
    int value = 1;
    void *first_byte = &value;
    show_byte(first_byte);
}

// Test helpers for the claim that the second 8 bytes of a string object hold
// its length.
// show1 prints the address of the string object, the address returned by
// c_str(), the size in brackets, and the text itself.
void show1(const std::string &s){
    // %p requires a void pointer and size() returns std::size_t, hence the
    // casts and the %zu conversion.
    printf(" addr: &s=%p, s.c_str()=%p |[%zu] %s\n",
           static_cast<const void *>(&s),
           static_cast<const void *>(s.c_str()),
           s.size(),
           s.c_str());
}
// Prints the bytes at offset 8 of the string object, interpreted as a long.
// The bytes are copied out with memcpy instead of being read through a cast
// long pointer, which would violate the aliasing rules.
// NOTE(review): what these bytes mean depends on the library's string layout;
// the article observes that they match size() with g++ 9.4.
void show2(const std::string &s){
    // Guard against reading past the end of the object on other layouts.
    static_assert(sizeof(std::string) >= 8 + sizeof(long),
                  "std::string is too small to read a long at offset 8");
    long second_word = 0;
    std::memcpy(&second_word,
                reinterpret_cast<const unsigned char *>(&s) + 8,
                sizeof(second_word));
    printf(" read 2nd 8 bytes: %ld\n", second_word);
}

// Explores the in-memory layout of std::string using four strings: a long one,
// a short one, and two at the 15/16 character boundary.
// Prints capacity, sizeof, object and character-data addresses, the first
// pointer-sized word of the object, and (via show2) the word at offset 8.
void demo5(){
    std::string s1="this is a book about C lang, including the main part of C and its std lib.";
    std::string s2="c++";
    std::string s3="0123456789ABCDE";   // 15 characters
    std::string s4="0123456789ABCDEF";  // 16 characters
    std::cout << "s1: " << s1 << std::endl;
    std::cout << "s2: " << s2 << std::endl;
    std::cout << "s3: " << s3 << std::endl;
    std::cout << "s4: " << s4 << std::endl;

    // 2. Print addresses. Author's observation: strings of 0..15 characters have
    // c_str() exactly 16 bytes after &s, i.e. inside the object itself; strings of
    // 16 or more characters have c_str() pointing elsewhere (the heap).
    // capacity(): number of characters the current storage can hold before growing.
    printf("s1.capacity()=%zu\n", s1.capacity());
    // Observed object size: 32 bytes = 8 (pointer to the first character)
    // + 8 (length) + 16 (room for a short string).
    printf("sizeof: s1:%zu, s2:%zu\n", sizeof(s1), sizeof(s2));

    // Object address vs. character-data address for each string.
    printf("s1"); show1(s1);    printf("s2"); show1(s2);
    printf("s3"); show1(s3);    printf("s4"); show1(s4);

    // First pointer-sized word of the object: copy it into a pointer variable.
    char *p1 = reinterpret_cast<char *>(&s1);
    printf("\np1=%p\n", static_cast<void *>(p1));
    memcpy(&p1, &s1, sizeof(char*)); // the word can only be brought into p1 by copying bytes
    // NOTE(review): dereferencing p1 assumes that word really is the data pointer,
    // as the matching c_str() value in the recorded output suggests.
    printf("p1=%p(val=%c), s1.c_str()=%p\n",
           static_cast<void *>(p1), *p1, static_cast<const void *>(s1.c_str()));

    // Second 8 bytes: compared against size() for each string.
    printf("\nsizeof long: %zu\n", sizeof(long int));
    printf("  s1 size:%zu, ", s1.size()); show2(s1);
    printf("  s2 size:%zu, ", s2.size()); show2(s2);
    printf("  s3 size:%zu, ", s3.size()); show2(s3);
    printf("  s4 size:%zu, ", s4.size()); show2(s4);

    printf("\n");
}



// Entry point: runs the demo5 experiment.
int main()
{
    demo5();
    // Reaching the end of main is equivalent to "return 0".
}

输出:

$ g++ a05.cpp && ./a.out 
s1: this is a book about C lang, including the main part of C and its std lib.
s2: c++
s3: 0123456789ABCDE
s4: 0123456789ABCDEF
s1.capacity()=74
sizeof: s1:32, s2:32
s1 addr: &s=0x7ffda92c9c70, s.c_str()=0x5557f1621eb0 |[74] this is a book about C lang, including the main part of C and its std lib.
s2 addr: &s=0x7ffda92c9c90, s.c_str()=0x7ffda92c9ca0 |[3] c++
s3 addr: &s=0x7ffda92c9cb0, s.c_str()=0x7ffda92c9cc0 |[15] 0123456789ABCDE
s4 addr: &s=0x7ffda92c9cd0, s.c_str()=0x5557f1621f10 |[16] 0123456789ABCDEF

p1=0x7ffda92c9c70
p1=0x5557f1621eb0(val=t), s1.c_str()=0x5557f1621eb0

sizeof long: 8
  s1 size:74,  read 2nd 8 bytes: 74
  s2 size:3,  read 2nd 8 bytes: 3
  s3 size:15,  read 2nd 8 bytes: 15
  s4 size:16,  read 2nd 8 bytes: 16

我的机器上，一个 int 占用 4 字节，一个 int * 占用 8字节。

一个 string 是 32 字节。
string 的第一个8字节是一个指针，指向真正的字符串内容。
对比s3 和 s4: 如果字符串长度 <=15，则字符直接存储在 string 对象内部、紧随其后的 16 字节中(本例中 string 是局部变量，所以这块内存在栈上)；如果字符串长度>=16，则保存在新开辟的堆内存中。
复制出 s1的第一个8字节，就是 s1.c_str() 返回值。
string的第二个8字节，就是string 字符串的长度，不包括最后的'\0'，就是 s1.size() 返回值。
总结：一个string共32字节，前8字节是字符串指针，接着8字节是字符串长度，余下的 16 字节可能是字符串本身(如果字符串长度 <=15字节)。16 字节为什么最多只能放 15 个字符呢？剩下的那 1 字节保存的可能是什么呢？

$ cat a002.cpp 
#include<iostream>
using namespace std;
// Prints every character of "abc" together with the two bytes that follow it,
// to check whether a terminating '\0' is stored after the text.
int main(){
	string s1="abc";
	cout << "s1=" << s1 << endl;
	// Numeric value of the null character, for comparison with the loop output.
	printf("%d\n", '\0');
	
	const char * p1=s1.c_str();
	// The bound size()+2 is deliberate: index size() is the terminator and
	// index size()+1 lies beyond the string's contents.
	// NOTE(review): that last read is outside [c_str(), c_str()+size()], so its
	// value is not defined by the standard.
	for(int i=0; i< s1.size()+2; i++){
		printf("%d [%c] %d\n", i,  p1[i], p1[i]);
	}
	return 0;
}


$ g++ a002.cpp 

$ ./a.out 
s1=abc
0
0 [a] 97
1 [b] 98
2 [c] 99
3 [] 0
4 [] 25

$ ./a.out 
s1=abc
0
0 [a] 97
1 [b] 98
2 [c] 99
3 [] 0
4 [:] 58

可见，字符串abc后面确实跟着一个 '\0'，再往后的一个字节则会随机变化，已经不在字符串的范围内了。

6. 探究string的第二个8字节

#include<iostream>
#include<vector>
#include<cstring>

// A book record: a title and a price.
struct Book
{
    std::string name;   // title text
    double price;       // price of the book
};

// Helper: prints one byte located at p as
//   [address] 8 binary digits (most significant first) 0xHEX | raw character
void show_byte(void *p){
    printf("[%p] ", p);
    const unsigned char *byte_ptr = static_cast<const unsigned char *>(p);
    // Binary: slide a single-bit mask from bit 7 down to bit 0.
    for(unsigned mask = 0x80u; mask != 0; mask >>= 1){
        printf("%s", (*byte_ptr & mask) ? "1" : "0");
    }
    // Hexadecimal value, then the same byte printed as a character.
    printf(" 0x%X | %c  \n", *byte_ptr, *byte_ptr);
}

// Prints len consecutive bytes starting at p, one show_byte() line each,
// followed by an empty line.
void show_bytes(void *p, int len){
    unsigned char *base = static_cast<unsigned char *>(p);
    int offset = 0;
    while(offset < len){
        show_byte(base + offset);
        ++offset;
    }
    printf("\n");
}

// Shows the byte stored at the address of an int whose value is 1.
void demo2(){
    int value = 1;
    void *first_byte = &value;
    show_byte(first_byte);
}


// What exactly is stored in the second 8 bytes of a std::string object?
// Dumps the first 20 bytes of a short string object and prints the word at
// offset 8 of a short and of a long string next to their size().
void demo6(){
    // The dump and the offset-8 reads below must stay inside the objects.
    static_assert(sizeof(std::string) >= 20,
                  "std::string is too small for the 20-byte dump");
    static_assert(sizeof(std::string) >= 8 + sizeof(long),
                  "std::string is too small to read a long at offset 8");

    std::string s1="c++";
    std::string s2="this is a book about C lang, including the main part of C and its std lib.";
    std::cout << "s1: " << s1 << std::endl;
    std::cout << "s2: " << s2 << std::endl;
    // size() returns std::size_t (%zu); %p requires a void pointer.
    printf("s1.size()=%zu, s2.size()=%zu\n", s1.size(), s2.size());
    printf("addr: &s1=%p, s1.c_str()=%p\n",
           static_cast<const void *>(&s1), static_cast<const void *>(s1.c_str()));
    printf("addr: &s2=%p, s2.c_str()=%p\n",
           static_cast<const void *>(&s2), static_cast<const void *>(s2.c_str()));
    printf("\n");

    // 1. Print the first 20 bytes of s1 one by one, with a blank line after
    // each group of 8 (after byte 7 and after byte 15).
    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(&s1);
    for(int i=0; i<20; i++){
        printf("%p [%2d] 0x%x %d %c\n",
               static_cast<const void *>(bytes+i), i, bytes[i], bytes[i], bytes[i]);
        if(i==7 || i==15) printf("\n");
    }

    // 2. Print the second 8 bytes as a long and compare with size().
    // The bytes are copied out with memcpy rather than read through a cast
    // long pointer, which would violate the aliasing rules.
    printf("sizeof long: %zu\n", sizeof(long));
    long word_s1 = 0;
    std::memcpy(&word_s1, reinterpret_cast<const unsigned char *>(&s1) + 8, sizeof(word_s1));
    printf("2nd 8 bytes of s1: %ld\n", word_s1);

    long word_s2 = 0;
    std::memcpy(&word_s2, reinterpret_cast<const unsigned char *>(&s2) + 8, sizeof(word_s2));
    printf("2nd 8 bytes of s2: %ld\n", word_s2);
}


// Entry point: runs the demo6 experiment.
int main()
{
    demo6();
    // Reaching the end of main is equivalent to "return 0".
}


$ g++ a06.cpp && ./a.out 
s1: c++
s2: this is a book about C lang, including the main part of C and its std lib.
s1.size()=3, s2.size()=74
addr: &s1=0x7fff3a070370, s1.c_str()=0x7fff3a070380
addr: &s2=0x7fff3a070390, s2.c_str()=0x556b83db6eb0

0x7fff3a070370 [ 0] 0x80 128  
0x7fff3a070371 [ 1] 0x3 3  
0x7fff3a070372 [ 2] 0x7 7 
0x7fff3a070373 [ 3] 0x3a 58 :
0x7fff3a070374 [ 4] 0xff 255 ÿ
0x7fff3a070375 [ 5] 0x7f 127 
0x7fff3a070376 [ 6] 0x0 0 
0x7fff3a070377 [ 7] 0x0 0 

0x7fff3a070378 [ 8] 0x3 3  
0x7fff3a070379 [ 9] 0x0 0 
0x7fff3a07037a [10] 0x0 0 
0x7fff3a07037b [11] 0x0 0 
0x7fff3a07037c [12] 0x0 0 
0x7fff3a07037d [13] 0x0 0 
0x7fff3a07037e [14] 0x0 0 
0x7fff3a07037f [15] 0x0 0 

0x7fff3a070380 [16] 0x63 99 c
0x7fff3a070381 [17] 0x2b 43 +
0x7fff3a070382 [18] 0x2b 43 +
0x7fff3a070383 [19] 0x0 0 
sizeof long: 8
2nd 8 bytes of s1: 3
2nd 8 bytes of s2: 74

再次确认，string的第2个8字节还是字符串长度，不计算最后的'\0'。

7. 再次印证以上结论：string 字符串末尾预留一个字节的'\0'，但是统计长度的时候不统计该字节

#include<iostream>
#include<vector>
#include<cstring>

// Tests whether a std::string really stores a '\0' after its last character.
// Prints the object address and size, the first pointer-sized word of the
// object next to c_str(), the word at offset 8 next to size(), and finally
// each character followed by the bytes that come right after the text.
void demo7(){
    static_assert(sizeof(std::string) >= 8 + sizeof(long),
                  "std::string is too small to read a long at offset 8");

    // Author's observation: with at most 15 characters the text is kept in the
    // bytes right after the first 16 bytes of the object; 16 characters do not
    // fit, which suggests one byte is reserved for the terminating '\0'.
    std::string s1="0123456789ABCDE";
    //std::string s1="0123456789ABCDEF"; // with 16 or more characters new heap memory is used instead
    printf("(0) &s1=%p, sizeof(s1)=%zu\n", static_cast<void *>(&s1), sizeof(s1)); // observed sizeof(s1)=32: 8+8+16

    // 1. The first 8 bytes of s1 are the character pointer, i.e. the value
    // returned by s1.c_str(). Copy them into a pointer variable.
    unsigned char *pS1 = nullptr;
    std::memcpy(&pS1, &s1, sizeof(pS1));
    printf("(1) pS1=%p, s1.c_str=%p\n",
           static_cast<void *>(pS1), static_cast<const void *>(s1.c_str()));

    // Printing with %s works only if the text is terminated by '\0'.
    printf("pS1=[%s], s1.c_str()=[%s]\n", pS1, s1.c_str());


    // 2. The second 8 bytes of s1 are the length, not counting the final '\0'.
    // Copied out with memcpy instead of reading through a cast long pointer.
    long len_word = 0;
    std::memcpy(&len_word, reinterpret_cast<const unsigned char *>(&s1) + 8, sizeof(len_word));
    printf("(2) *pLen=%ld, s1.size()=%zu \n", len_word, s1.size());

    // 3. Look for the '\0' at the end: index size() is expected to be 0.
    // NOTE(review): the bound size()+3 also reads two bytes after the
    // terminator; if the text occupies the last 16 bytes of the 32-byte object,
    // as the printed addresses suggest, those two reads fall outside s1 and are
    // undefined. The bound is kept so the output matches the recorded run;
    // size()+1 is enough to show the terminator.
    const std::size_t probe_count = s1.size() + 3;
    for(std::size_t i=0; i<probe_count; i++){
        printf("(3) %p [%c] %d\n", static_cast<void *>(&pS1[i]), pS1[i], pS1[i]);
    }
}

// Entry point: runs the demo7 experiment.
int main()
{
    demo7();
    // Reaching the end of main is equivalent to "return 0".
}

输出:

$ g++ a07.cpp && ./a.out 
(0) &s1=0x7ffd8687c310, sizeof(s1)=32
(1) pS1=0x7ffd8687c320, s1.c_str=0x7ffd8687c320
pS1=[0123456789ABCDE], s1.c_str()=[0123456789ABCDE]
(2) *pLen=15, s1.size()=15 
(3) 0x7ffd8687c320 [0] 48
(3) 0x7ffd8687c321 [1] 49
(3) 0x7ffd8687c322 [2] 50
(3) 0x7ffd8687c323 [3] 51
(3) 0x7ffd8687c324 [4] 52
(3) 0x7ffd8687c325 [5] 53
(3) 0x7ffd8687c326 [6] 54
(3) 0x7ffd8687c327 [7] 55
(3) 0x7ffd8687c328 [8] 56
(3) 0x7ffd8687c329 [9] 57
(3) 0x7ffd8687c32a [A] 65
(3) 0x7ffd8687c32b [B] 66
(3) 0x7ffd8687c32c [C] 67
(3) 0x7ffd8687c32d [D] 68
(3) 0x7ffd8687c32e [E] 69
(3) 0x7ffd8687c32f [] 0
(3) 0x7ffd8687c330 [è] 232
(3) 0x7ffd8687c331 [R] 82

一个完整的 string 的实现可能是这样的:

// Sketch of the layout the article infers for a string object:
// a character pointer, a length, then a 16-byte character array.
// NOTE(review): the real library type may use the trailing 16 bytes for
// something else when the text is stored on the heap (e.g. the capacity) —
// confirm against the library source before relying on this.
struct String2
{
    char *_Ptr;     // pointer to the characters; the first 8 bytes
    long _Len;      // length of the string; the second 8 bytes
    char _arr[16];  // the following 16 bytes: character array
};

biomooc

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
C++ | 标准库 string 的内存结构(前8字节是指针，接着8字节是字符个数，最后16字节可能是字符串本身)

string 的内存结构：前8字节是指针，接着8字节是字符个数，最后16字节可能是字符串本身
复制链接

扫一扫

专栏目录