水平不够,不阅读源码的情况下,本文探索 c++ string 的内存占用情况。
环境: Ubuntu 20.04, g++ 9.4.0
1. 遇到的问题:memcpy 复制字符串为什么会报错?
#include<iostream>
#include<vector>
#include<cstring>
// One shelf entry: a title stored in a std::string plus a price.
struct Book {
    std::string name;  // title text
    double price;      // price of the book
};
// demo1: reproduces the crash this section asks about. A Book is appended to a
// vector, then the vector element is copied over book2 byte-for-byte with memcpy.
// In the recorded run both lines print as expected and the program then aborts
// with "free(): double free detected in tcache 2".
// NOTE(review): Book contains a std::string, so the raw byte copy presumably
// leaves two string objects referring to the same character buffer -- confirm
// against the layout experiments later in the article.
void demo1(){
Book book1, book2;
std::vector<Book> shelf;
//1. assign values to book1 (book2 gets a different, shorter title)
book1.name="this is a book about C lang, including the main part of C and its std lib."; book1.price=40.8;
book2.name="c++ lang"; book2.price=80.8;
std::cout << "(1)book2: " << book2.name << ", " << book2.price << std::endl;
//2. append book1 to the end of the vector
shelf.push_back(book1);
//3. copy vector[0] over book2 as raw memory
memcpy(&book2, &shelf[0], sizeof(Book));
// print book2
std::cout << "(2)book2: " << book2.name << ", " << book2.price << std::endl;
}
// Entry point of this listing: runs the memcpy crash reproduction.
int main()
{
    demo1();
    return 0;
}
输出如下:
$ g++ a11_memcpy_vector2struct.cpp && ./a.out
(1)book2: c++ lang, 80.8
(2)book2: this is a book about C lang, including the main part of C and its std lib., 40.8
free(): double free detected in tcache 2
Aborted (core dumped)
复制string后的报错
free(): double free detected in tcache 2
Aborted (core dumped)
像是同一个地址被释放了2次。
探究确切原因,就需要知道 string 在内存中的细节。
2. 查看内存的每个字节的2进制及16进制
详见 C语言 | 数字(int, float 为例)在内存中的二进制表示与查看
字节序:本文以小端(little-endian)机器为例。
#include<iostream>
#include<vector>
#include<cstring>
// Helper: prints a single byte on one line as
//   "[address] bbbbbbbb 0xHH | c "
// i.e. its address, its eight bits (most significant bit first), its value in
// upper-case hex without zero padding, and the raw byte written as a character.
void show_byte(void *p)
{
    printf("[%p] ", p);
    const unsigned char value = *static_cast<unsigned char *>(p);
    // Slide a one-bit mask from bit 7 down to bit 0.
    for (unsigned mask = 0x80u; mask != 0u; mask >>= 1) {
        putchar((value & mask) ? '1' : '0');
    }
    printf(" 0x%X | %c \n", value, value);
}
// Dumps `len` consecutive bytes starting at `p`, one show_byte() line per byte,
// then prints an empty line. With len <= 0 only the empty line is printed.
void show_bytes(void *p, int len)
{
    unsigned char *cursor = static_cast<unsigned char *>(p);
    for (int left = len; left > 0; --left) {
        show_byte(cursor++);
    }
    printf("\n");
}
// Shows the lowest-addressed byte of an int that holds the value 1.
void demo2()
{
    int one = 1;
    show_byte(&one);
}
// Entry point of this listing: runs the single-byte dump demo.
int main()
{
    demo2();
    return 0;
}
输出:
$ g++ a11_memcpy_vector2struct.cpp && ./a.out
[0x7ffc39431e04] 00000001 0x1 |
3. 探究string 的内存细节
#include<iostream>
#include<vector>
#include<cstring>
// One shelf entry: a title stored in a std::string plus a price.
// (Carried over from the first listing; the demo in this listing does not use it.)
struct Book {
    std::string name;  // title text
    double price;      // price of the book
};
// Helper: prints a single byte on one line as
//   "[address] bbbbbbbb 0xHH | c "
// i.e. its address, its eight bits (most significant bit first), its value in
// upper-case hex without zero padding, and the raw byte written as a character.
void show_byte(void *p)
{
    printf("[%p] ", p);
    const unsigned char value = *static_cast<unsigned char *>(p);
    // Slide a one-bit mask from bit 7 down to bit 0.
    for (unsigned mask = 0x80u; mask != 0u; mask >>= 1) {
        putchar((value & mask) ? '1' : '0');
    }
    printf(" 0x%X | %c \n", value, value);
}
// Dumps `len` consecutive bytes starting at `p`, one show_byte() line per byte,
// then prints an empty line. With len <= 0 only the empty line is printed.
void show_bytes(void *p, int len)
{
    unsigned char *cursor = static_cast<unsigned char *>(p);
    for (int left = len; left > 0; --left) {
        show_byte(cursor++);
    }
    printf("\n");
}
// Shows the lowest-addressed byte of an int that holds the value 1.
void demo2()
{
    int one = 1;
    show_byte(&one);
}
// Exploratory version, kept for reference; a shorter version is written separately.
// demo3 prints a stack address and a heap address for orientation, then the
// addresses of two std::string objects and of their character data, and finally
// overwrites s2 with the raw bytes of s1 via memcpy while preserving the first
// pointer-sized word of s2.
// NOTE(review): copying std::string objects with memcpy is outside what the
// language guarantees; the observations here describe the author's build only.
void demo3(){
// 1. print a stack address and a heap address for reference
int x0=10;
printf("stack addr: &x0=%p (x0=%d)\n", &x0, x0); // address of an automatic variable
int *x1=(int *)malloc(sizeof(int));
*x1=20;
printf(" heap addr: x1=%p (*x1=%d)\n", x1, *x1); // address of a heap allocation
free(x1);
int arr[3]={10,20,30};
for(int i=0; i<3; i++){
printf("%p | arr[%d]=%d\n", &arr[i], i, arr[i]); // in the recorded run consecutive ints are 4 bytes apart
}
printf("\n");
// 2. addresses related to the strings
std::string s1="c lang-2nd", s2="";
// addresses of the string objects themselves
printf("1 addr: &s1=%p, s1=%s\n", &s1, s1.c_str()); // address of the string object
printf("1 addr: &s2=%p, s2=%s\n", &s2, s2.c_str()); // address of the string object
printf("2 addr: s1.c_str()=%p (val=%s)\n", s1.c_str(), s1.c_str());// address returned by c_str() (C-style pointer)
printf("2 addr: s2.c_str()=%p (val=%s)\n", s2.c_str(), s2.c_str());// address returned by c_str() (C-style pointer)
auto iter=s1.begin();
printf("3 addr: &*iter=%p (*iter=%c)\n", &*iter, *iter); // address obtained through an iterator
// address of every character
const char *pStr=s1.c_str(); // does not compile without const
for(int i=0; i<strlen(pStr); i++){
printf(" %p | pStr[%d]=%c\n", &pStr[i], i, pStr[i]);
}
printf("\n");
//3. what are the 16 (0x10) bytes between &s2 and s2.c_str()?
/*
printf("\nS1:\n");
show_bytes(&s1, 20);
printf("\nS2:\n");
show_bytes(&s2, 20);
printf("\n");
*/
// 4. copy the string with memcpy (a bare memcpy fails when the strings are released)
std::cout << "s1: " << s1 << std::endl;
std::cout << "s2: " << s2 << std::endl;
// before copying, save the first pointer-sized word of the s2 object
int a=-1, *pA=&a; // pointer variable pA; it will receive the first sizeof(int*) bytes of s2
//printf("===>1 pA=%p\n", pA);
memcpy(&pA, &s2, sizeof(int*));
//printf("===>2 pA=%p\n", pA); // confirmed: pA now holds the first bytes of the s2 object
// copy the string object: the bytes of s1 overwrite s2
memcpy(&s2, &s1, sizeof(s1));
// restore the saved first word of s2
memcpy(&s2, &pA, sizeof(int*));
printf("\nAfter memcpy: \n");
/* check addr
printf("\nafter: S1:\n");
show_bytes(&s1, 20);
printf("\nafter: S2:\n");
show_bytes(&s2, 20);
printf("\n");
s2.clear();
s1.clear(); // clear() does not make the error go away either
*/
std::cout << "s1: " << s1 << std::endl;
std::cout << "s2: " << s2 << std::endl;
}
// Entry point of this listing: runs the string address exploration.
int main()
{
    demo3();
    return 0;
}
输出如下:
$ g++ a03.cpp && ./a.out
stack addr: &x0=0x7ffeb23bd870 (x0=10)
heap addr: x1=0x56061d2682c0 (*x1=20)
0x7ffeb23bd8a4 | arr[0]=10
0x7ffeb23bd8a8 | arr[1]=20
0x7ffeb23bd8ac | arr[2]=30
1 addr: &s1=0x7ffeb23bd8b0, s1=c lang-2nd
1 addr: &s2=0x7ffeb23bd8d0, s2=
2 addr: s1.c_str()=0x7ffeb23bd8c0 (val=c lang-2nd)
2 addr: s2.c_str()=0x7ffeb23bd8e0 (val=)
3 addr: &*iter=0x7ffeb23bd8c0 (*iter=c)
0x7ffeb23bd8c0 | pStr[0]=c
0x7ffeb23bd8c1 | pStr[1]=
0x7ffeb23bd8c2 | pStr[2]=l
0x7ffeb23bd8c3 | pStr[3]=a
0x7ffeb23bd8c4 | pStr[4]=n
0x7ffeb23bd8c5 | pStr[5]=g
0x7ffeb23bd8c6 | pStr[6]=-
0x7ffeb23bd8c7 | pStr[7]=2
0x7ffeb23bd8c8 | pStr[8]=n
0x7ffeb23bd8c9 | pStr[9]=d
s1: c lang-2nd
s2:
After memcpy:
s1: c lang-2nd
s2: c lang-2nd
问题:已知 string s2; 则 &s2 和 s2.c_str() 之间相差的 16 个字节(0x7ffeb23bd8e0 - 0x7ffeb23bd8d0 = 0x10)是什么?
4. 精简版例子:如何使用 memcpy 复制 string 变量?
#include<iostream>
#include<vector>
#include<cstring>
// One shelf entry: a title stored in a std::string plus a price.
// (Carried over from the first listing; the demo in this listing does not use it.)
struct Book {
    std::string name;  // title text
    double price;      // price of the book
};
// Helper: prints a single byte on one line as
//   "[address] bbbbbbbb 0xHH | c "
// i.e. its address, its eight bits (most significant bit first), its value in
// upper-case hex without zero padding, and the raw byte written as a character.
void show_byte(void *p)
{
    printf("[%p] ", p);
    const unsigned char value = *static_cast<unsigned char *>(p);
    // Slide a one-bit mask from bit 7 down to bit 0.
    for (unsigned mask = 0x80u; mask != 0u; mask >>= 1) {
        putchar((value & mask) ? '1' : '0');
    }
    printf(" 0x%X | %c \n", value, value);
}
// Dumps `len` consecutive bytes starting at `p`, one show_byte() line per byte,
// then prints an empty line. With len <= 0 only the empty line is printed.
void show_bytes(void *p, int len)
{
    unsigned char *cursor = static_cast<unsigned char *>(p);
    for (int left = len; left > 0; --left) {
        show_byte(cursor++);
    }
    printf("\n");
}
// Shows the lowest-addressed byte of an int that holds the value 1.
void demo2()
{
    int one = 1;
    show_byte(&one);
}
// Short version: how to copy a string variable with memcpy.
// demo4 prints s1 and s2, overwrites the s2 object with the bytes of s1 while
// keeping the first pointer-sized word of s2, then prints both again.
void demo4(){
// Byte-wise overwrite with step2 alone ends in: free(): invalid pointer Aborted (core dumped); with step1/step3 added it runs normally.
// Reason: the first 8 bytes of a string hold the address where its characters are stored; after a byte-wise copy that address would be released twice.
// Workaround: save the first 8 bytes of the string variable, overwrite byte-wise, then restore those 8 bytes.
// The second 8 bytes of a string variable look like the string length. //todo
// NOTE(review): this workaround is only exercised here with two short strings;
// it presumably does not hold once the characters live in a separate heap
// buffer -- confirm before relying on it. Plain assignment (s2 = s1) is the
// supported way to copy a string.
std::string s1="c lang-2nd", s2="";
std::cout << "s1: " << s1 << std::endl;
std::cout << "s2: " << s2 << std::endl;
//step1. before copying, save the first 8 bytes of the string object
int a=-1, *pA=&a; // pointer variable pA; it will receive the first sizeof(int*) bytes of s2
memcpy(&pA, &s2, sizeof(int*));
//step2. copy the string variable byte-wise
memcpy(&s2, &s1, sizeof(s1)); // the bytes at the 2nd argument overwrite the 1st; the 3rd argument is the length
//step3. restore the first 8 bytes of the overwritten string s2
memcpy(&s2, &pA, sizeof(int*));
std::cout << "s1: " << s1 << std::endl;
std::cout << "s2: " << s2 << std::endl;
}
// Entry point of this listing: runs the short memcpy-copy demo.
int main()
{
    demo4();
    return 0;
}
输出:
$ g++ a04.cpp && ./a.out
s1: c lang-2nd
s2:
s1: c lang-2nd
s2: c lang-2nd
5. 探究 string 的内存结构
#include<iostream>
#include<vector>
#include<cstring>
// One shelf entry: a title stored in a std::string plus a price.
// (Carried over from the first listing; the demo in this listing does not use it.)
struct Book {
    std::string name;  // title text
    double price;      // price of the book
};
// Helper: prints a single byte on one line as
//   "[address] bbbbbbbb 0xHH | c "
// i.e. its address, its eight bits (most significant bit first), its value in
// upper-case hex without zero padding, and the raw byte written as a character.
void show_byte(void *p)
{
    printf("[%p] ", p);
    const unsigned char value = *static_cast<unsigned char *>(p);
    // Slide a one-bit mask from bit 7 down to bit 0.
    for (unsigned mask = 0x80u; mask != 0u; mask >>= 1) {
        putchar((value & mask) ? '1' : '0');
    }
    printf(" 0x%X | %c \n", value, value);
}
// Dumps `len` consecutive bytes starting at `p`, one show_byte() line per byte,
// then prints an empty line. With len <= 0 only the empty line is printed.
void show_bytes(void *p, int len)
{
    unsigned char *cursor = static_cast<unsigned char *>(p);
    for (int left = len; left > 0; --left) {
        show_byte(cursor++);
    }
    printf("\n");
}
// Shows the lowest-addressed byte of an int that holds the value 1.
void demo2()
{
    int one = 1;
    show_byte(&one);
}
// Test: the second 8 bytes of a string variable hold its length (read as a long).
// show1 prints, on one line, the address of the string object, the address of
// its character data, its size in brackets, and its contents.
void show1(const std::string &s){
    // %p requires a void pointer and size() returns size_t, so cast the
    // pointers and use %zu instead of %ld.
    printf(" addr: &s=%p, s.c_str()=%p |[%zu] %s\n",
           static_cast<const void *>(&s),
           static_cast<const void *>(s.c_str()),
           s.size(),
           s.c_str());
}
// show2 prints the bytes stored at offset 8 inside the string object,
// interpreted as a long. This assumes the object is at least 8 + sizeof(long)
// bytes large, as on the author's build where sizeof(std::string) is 32.
void show2(const std::string &s){
    long second_word = 0;
    // Copy the bytes out rather than dereferencing a casted long*: the object
    // is not a long, so reading it through a long* breaks the aliasing rules.
    memcpy(&second_word, reinterpret_cast<const char *>(&s) + 8, sizeof(second_word));
    printf(" read 2nd 8 bytes: %ld\n", second_word);
}
// demo5 compares four strings of length 74, 3, 15 and 16. It prints their
// contents, the capacity of s1, sizeof of a string object, the object and
// character-data addresses (show1), the pointer stored in the first 8 bytes of
// s1, and the value stored in the second 8 bytes of each string (show2).
void demo5(){
    std::string s1="this is a book about C lang, including the main part of C and its std lib.",
                s2="c++",
                s3="0123456789ABCDE",
                s4="0123456789ABCDEF";
    std::cout << "s1: " << s1 << std::endl;
    std::cout << "s2: " << s2 << std::endl;
    std::cout << "s3: " << s3 << std::endl;
    std::cout << "s4: " << s4 << std::endl;

    // 2. Addresses: for lengths 0..15 the recorded run shows s.c_str() exactly
    //    16 bytes after &s, i.e. the characters sit inside the string object.
    //    For lengths >= 16 the characters are in a separate heap allocation, so
    //    the last 16 bytes of the object are not used for characters.
    // capacity(): how many characters the current storage can hold before a
    // new allocation is needed. It returns size_t, hence %zu.
    printf("s1.capacity()=%zu\n", s1.capacity());
    // Observed object size: 32 bytes = 8 (pointer to the first character)
    // + 8 (length) + 16 (storage for short strings).
    printf("sizeof: s1:%zu, s2:%zu\n", sizeof(s1), sizeof(s2));

    // print the addresses
    printf("s1"); show1(s1); printf("s2"); show1(s2);
    printf("s3"); show1(s3); printf("s4"); show1(s4);

    // First 8 bytes: pointer to the first character.
    char *p1=reinterpret_cast<char *>(&s1);
    printf("\np1=%p\n", static_cast<void *>(p1));
    // The stored pointer can only be obtained by copying the raw bytes of the
    // object into a pointer variable.
    memcpy(&p1, &s1, sizeof(char*));
    printf("p1=%p(val=%c), s1.c_str()=%p\n",
           static_cast<void *>(p1), *p1, static_cast<const void *>(s1.c_str()));

    // Second 8 bytes: the string length, read as a long int.
    printf("\nsizeof long: %zu\n", sizeof(long int));
    printf(" s1 size:%zu, ", s1.size()); show2(s1);
    printf(" s2 size:%zu, ", s2.size()); show2(s2);
    printf(" s3 size:%zu, ", s3.size()); show2(s3);
    printf(" s4 size:%zu, ", s4.size()); show2(s4);
    printf("\n");
}
// Entry point of this listing: runs the string layout measurements.
int main()
{
    demo5();
    return 0;
}
输出:
$ g++ a05.cpp && ./a.out
s1: this is a book about C lang, including the main part of C and its std lib.
s2: c++
s3: 0123456789ABCDE
s4: 0123456789ABCDEF
s1.capacity()=74
sizeof: s1:32, s2:32
s1 addr: &s=0x7ffda92c9c70, s.c_str()=0x5557f1621eb0 |[74] this is a book about C lang, including the main part of C and its std lib.
s2 addr: &s=0x7ffda92c9c90, s.c_str()=0x7ffda92c9ca0 |[3] c++
s3 addr: &s=0x7ffda92c9cb0, s.c_str()=0x7ffda92c9cc0 |[15] 0123456789ABCDE
s4 addr: &s=0x7ffda92c9cd0, s.c_str()=0x5557f1621f10 |[16] 0123456789ABCDEF
p1=0x7ffda92c9c70
p1=0x5557f1621eb0(val=t), s1.c_str()=0x5557f1621eb0
sizeof long: 8
s1 size:74, read 2nd 8 bytes: 74
s2 size:3, read 2nd 8 bytes: 3
s3 size:15, read 2nd 8 bytes: 15
s4 size:16, read 2nd 8 bytes: 16
我的机器上,一个 int 占用 4 字节,一个 int * 占用 8字节。
- 一个 string 是 32 字节。
- string 的第一个8字节是一个指针,指向真正的字符串内容。
- 对比 s3 和 s4: 如果字符串长度 <=15,字符就直接存储在 string 对象内部紧随其后的 16 字节中(本例的 string 对象是局部变量,所以这块内存在栈上);如果字符串长度 >=16,则保存在新开辟的堆内存中。
- 复制出 s1的第一个8字节,就是 s1.c_str() 返回值。
- string的第二个8字节,就是string 字符串的长度,不包括最后的 '\0',就是 s1.size() 返回值。
- 总结:一个string共32字节,前8字节是字符串指针,接着8字节是字符串长度,余下的 16 字节可能是字符串本身(如果字符串长度 <=15字节)。16 字节的空间为什么最多只放 15 个字符?少掉的那 1 字节保存的是什么呢?下面用一个小程序验证。
$ cat a002.cpp
#include<iostream>
using namespace std;
// Prints "abc", the numeric value of '\0', and then every byte from the start
// of the character data up to index size()+1. The last index lies past the
// terminating '\0' on purpose, to show that this byte is not part of the string.
int main()
{
    std::string text = "abc";
    std::cout << "s1=" << text << std::endl;
    printf("%d\n", '\0');

    const char *chars = text.c_str();
    const std::size_t count = text.size() + 2;
    for (std::size_t k = 0; k < count; ++k) {
        const int idx = static_cast<int>(k);
        printf("%d [%c] %d\n", idx, chars[idx], chars[idx]);
    }
    return 0;
}
$ g++ a002.cpp
$ ./a.out
s1=abc
0
0 [a] 97
1 [b] 98
2 [c] 99
3 [] 0
4 [] 25
$ ./a.out
s1=abc
0
0 [a] 97
1 [b] 98
2 [c] 99
3 [] 0
4 [:] 58
可见,字符串 abc 后面确实跟着一个 '\0';再往后的一个字节每次运行都可能不同,它已经不在字符串的范围内了。
6. 探究string的第二个8字节
#include<iostream>
#include<vector>
#include<cstring>
// One shelf entry: a title stored in a std::string plus a price.
// (Carried over from the first listing; the demo in this listing does not use it.)
struct Book {
    std::string name;  // title text
    double price;      // price of the book
};
// Helper: prints a single byte on one line as
//   "[address] bbbbbbbb 0xHH | c "
// i.e. its address, its eight bits (most significant bit first), its value in
// upper-case hex without zero padding, and the raw byte written as a character.
void show_byte(void *p)
{
    printf("[%p] ", p);
    const unsigned char value = *static_cast<unsigned char *>(p);
    // Slide a one-bit mask from bit 7 down to bit 0.
    for (unsigned mask = 0x80u; mask != 0u; mask >>= 1) {
        putchar((value & mask) ? '1' : '0');
    }
    printf(" 0x%X | %c \n", value, value);
}
// Dumps `len` consecutive bytes starting at `p`, one show_byte() line per byte,
// then prints an empty line. With len <= 0 only the empty line is printed.
void show_bytes(void *p, int len)
{
    unsigned char *cursor = static_cast<unsigned char *>(p);
    for (int left = len; left > 0; --left) {
        show_byte(cursor++);
    }
    printf("\n");
}
// Shows the lowest-addressed byte of an int that holds the value 1.
void demo2()
{
    int one = 1;
    show_byte(&one);
}
// What exactly is stored in the second 8 bytes?
// demo6 prints a short and a long string with their sizes and addresses, dumps
// the first 20 bytes of the short string's object one byte per line, and then
// prints the value found at offset 8 of each string object read as a long.
// Assumes a string object of at least 20 bytes (32 on the author's build).
void demo6(){
    std::string s1="c++", s2="this is a book about C lang, including the main part of C and its std lib.";
    std::cout << "s1: " << s1 << std::endl;
    std::cout << "s2: " << s2 << std::endl;
    // size() returns size_t (%zu) and %p requires a void pointer.
    printf("s1.size()=%zu, s2.size()=%zu\n", s1.size(), s2.size());
    printf("addr: &s1=%p, s1.c_str()=%p\n",
           static_cast<const void *>(&s1), static_cast<const void *>(s1.c_str()));
    printf("addr: &s2=%p, s2.c_str()=%p\n",
           static_cast<const void *>(&s2), static_cast<const void *>(s2.c_str()));
    printf("\n");

    // 1. Dump the first 20 bytes of the s1 object: the two 8-byte words plus
    //    the first 4 bytes that follow them. A blank line separates the groups.
    unsigned char *pChar=reinterpret_cast<unsigned char *>(&s1);
    for(int i=0; i<20; i++){
        printf("%p [%2d] 0x%x %d %c\n",
               static_cast<void *>(pChar+i), i, *(pChar+i), *(pChar+i), *(pChar+i) );
        if(i==7 || i==15) printf("\n");
    }

    // 2. Print the second 8 bytes: the string length s.size(), read as a long.
    //    The bytes are copied out with memcpy instead of being read through a
    //    casted long*, which would break the aliasing rules.
    auto second_word = [](const std::string &s) {
        long value = 0;
        memcpy(&value, reinterpret_cast<const char *>(&s) + 8, sizeof(value));
        return value;
    };
    printf("sizeof long: %zu\n", sizeof(long));
    printf("2nd 8 bytes of s1: %ld\n", second_word(s1));
    printf("2nd 8 bytes of s2: %ld\n", second_word(s2));
}
// Entry point of this listing: runs the "second 8 bytes" experiment.
int main()
{
    demo6();
    return 0;
}
$ g++ a06.cpp && ./a.out
s1: c++
s2: this is a book about C lang, including the main part of C and its std lib.
s1.size()=3, s2.size()=74
addr: &s1=0x7fff3a070370, s1.c_str()=0x7fff3a070380
addr: &s2=0x7fff3a070390, s2.c_str()=0x556b83db6eb0
0x7fff3a070370 [ 0] 0x80 128
0x7fff3a070371 [ 1] 0x3 3
0x7fff3a070372 [ 2] 0x7 7
0x7fff3a070373 [ 3] 0x3a 58 :
0x7fff3a070374 [ 4] 0xff 255 ÿ
0x7fff3a070375 [ 5] 0x7f 127
0x7fff3a070376 [ 6] 0x0 0
0x7fff3a070377 [ 7] 0x0 0
0x7fff3a070378 [ 8] 0x3 3
0x7fff3a070379 [ 9] 0x0 0
0x7fff3a07037a [10] 0x0 0
0x7fff3a07037b [11] 0x0 0
0x7fff3a07037c [12] 0x0 0
0x7fff3a07037d [13] 0x0 0
0x7fff3a07037e [14] 0x0 0
0x7fff3a07037f [15] 0x0 0
0x7fff3a070380 [16] 0x63 99 c
0x7fff3a070381 [17] 0x2b 43 +
0x7fff3a070382 [18] 0x2b 43 +
0x7fff3a070383 [19] 0x0 0
sizeof long: 8
2nd 8 bytes of s1: 3
2nd 8 bytes of s2: 74
再次确认,string的第2个8字节还是字符串长度,不计算最后的 '\0'。
7. 再次印证以上结论:string 字符串末尾预留一个字节的 '\0',但是统计长度的时候不统计该字节
#include<iostream>
#include<vector>
#include<cstring>
// Test whether a string really ends with '\0'.
// demo7 reads the pointer stored in the first 8 bytes of a 15-character string
// and the length stored in the second 8 bytes, prints both next to c_str() and
// size(), and then prints every byte from the first character up to index
// size()+2.
void demo7(){
    // With length <= 15 the characters are stored in the memory that directly
    // follows inside the object; 16 characters do not fit, which suggests one
    // position is reserved for the trailing '\0'.
    std::string s1="0123456789ABCDE";
    //std::string s1="0123456789ABCDEF"; // with length >= 16 new heap memory is allocated
    printf("(0) &s1=%p, sizeof(s1)=%zu\n", static_cast<void *>(&s1), sizeof(s1)); // sizeof(s1)=32: 8+8+16

    // 1. The first 8 bytes of s1 are the character pointer, i.e. the value
    //    returned by s1.c_str(). Copy them into a pointer variable.
    unsigned char *pS1=nullptr;
    memcpy(&pS1, &s1, sizeof(char *));
    printf("(1) pS1=%p, s1.c_str=%p\n",
           static_cast<void *>(pS1), static_cast<const void *>(s1.c_str()));
    // Printing as a C string works, so the data must end with '\0'; otherwise
    // the output would not stop at the right place.
    printf("pS1=[%s], s1.c_str()=[%s]\n", reinterpret_cast<const char *>(pS1), s1.c_str());

    // 2. The second 8 bytes of s1 are the length, not counting the trailing
    //    '\0'. Copy them out rather than reading through a casted long*.
    long len=0;
    memcpy(&len, reinterpret_cast<const char *>(&s1) + 8, sizeof(len));
    printf("(2) *pLen=%ld, s1.size()=%zu \n", len, s1.size());

    // 3. Check for the trailing '\0': the byte at index size() is always 0.
    //    The two indexes after it are read on purpose to show that they are not
    //    part of the string; their values are whatever happens to be there.
    const std::size_t count=s1.size()+3;
    for(std::size_t i=0; i<count; i++){
        printf("(3) %p [%c] %d\n", static_cast<void *>(&pS1[i]), pS1[i], pS1[i]);
    }
}
// Entry point of this listing: runs the trailing-'\0' check.
int main()
{
    demo7();
    return 0;
}
输出:
$ g++ a07.cpp && ./a.out
(0) &s1=0x7ffd8687c310, sizeof(s1)=32
(1) pS1=0x7ffd8687c320, s1.c_str=0x7ffd8687c320
pS1=[0123456789ABCDE], s1.c_str()=[0123456789ABCDE]
(2) *pLen=15, s1.size()=15
(3) 0x7ffd8687c320 [0] 48
(3) 0x7ffd8687c321 [1] 49
(3) 0x7ffd8687c322 [2] 50
(3) 0x7ffd8687c323 [3] 51
(3) 0x7ffd8687c324 [4] 52
(3) 0x7ffd8687c325 [5] 53
(3) 0x7ffd8687c326 [6] 54
(3) 0x7ffd8687c327 [7] 55
(3) 0x7ffd8687c328 [8] 56
(3) 0x7ffd8687c329 [9] 57
(3) 0x7ffd8687c32a [A] 65
(3) 0x7ffd8687c32b [B] 66
(3) 0x7ffd8687c32c [C] 67
(3) 0x7ffd8687c32d [D] 68
(3) 0x7ffd8687c32e [E] 69
(3) 0x7ffd8687c32f [] 0
(3) 0x7ffd8687c330 [è] 232
(3) 0x7ffd8687c331 [R] 82
根据以上实验,string 对象(g++ 9.4.0)的内存布局大致可以简化成这样:
// Sketch of a string object with three public fields: a pointer, a length,
// and a 16-byte character array.
struct String2 {
    char *_Ptr;     // pointer to the characters: the first 8 bytes
    long _Len;      // length of the string: the second 8 bytes
    char _arr[16];  // the following 16 bytes: a character array
};