编译环境:gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04)
例子参照《深度探索C++对象模型》(Inside the C++ Object Model)3.5节编写:对象成员的效率(object member efficiency)
// Number of loop iterations executed by each benchmark function below.
constexpr int ITERATION_COUNT = 10000001;
// Benchmark: runs the three-step update sequence on six independent local
// float variables, ITERATION_COUNT times.
// START_TIMING(0)/END_TIMING(0) are defined outside this section; presumably
// they measure and report the elapsed time of the enclosed loop (the recorded
// output shows a time value in front of each label) -- confirm against their
// definition.
void test_plain_float()
{
// pA_* are only read inside the loop.
float pA_x = 1.725f, pA_y = 0.875f, pA_z = 0.478f;
// pB_* are overwritten on every iteration; each step feeds the next one.
float pB_x = 0.315f, pB_y = 0.317f, pB_z = 0.838f;
START_TIMING(0);
for (int i = 0; i < ITERATION_COUNT; i++)
{
pB_x = pA_x - pB_z;
pB_y = pA_y + pB_x;
pB_z = pA_z + pB_y;
}
END_TIMING(0);
// Print the final pB_z after the loop.
cout << "-----plain:" << pB_z << endl;
}
// Benchmark: the same three-step update sequence as the other benchmarks,
// with the operands stored in two local float arrays indexed by enumerators.
// START_TIMING(0)/END_TIMING(0) are defined outside this section; presumably
// they measure and report the elapsed time of the enclosed loop -- confirm
// against their definition.
void test_plain_array()
{
// Symbolic indices: x == 0, y == 1, z == 2.
enum fussy
{
x,
y,
z
};
// z + 1 == 3 elements per array, one per coordinate.
float pA[z + 1], pB[z + 1];
// pA is only read inside the loop; pB is overwritten on every iteration.
pA[x] = 1.725f, pA[y] = 0.875f, pA[z] = 0.478f;
pB[x] = 0.315f, pB[y] = 0.317f, pB[z] = 0.838f;
START_TIMING(0);
for (int iters = 0; iters < ITERATION_COUNT; iters++)
{
pB[x] = pA[x] - pB[z];
pB[y] = pA[y] + pB[x];
pB[z] = pA[z] + pB[y];
}
END_TIMING(0);
// Print the final pB[z] after the loop.
cout << "-----array:" << pB[z] << endl;
}
// Plain aggregate with three public float coordinates, declared in the
// order _x, _y, _z. Used by the struct-based benchmarks.
struct Point3d
{
    float _x, _y, _z;
};
void test_c_struct()
{
Point3d pA, pB;
pA._x = 1.725f, pA._y = 0.875f, pA._z = 0.478f;
pB._x = 0.315f, pB._y = 0.317f, pB._z = 0.838f;
START_TIMING(0);
for (int iters = 0; iters < ITERATION_COUNT; iters++)
{
pB._x = pA._x - pB._z;
pB._y = pA._y + pB._x;
pB._z = pA._z + pB._y;
}
END_TIMING(0);
cout << "-----struct:" << pB._z << endl;
}
void test_c_struct_pointer()
{
Point3d *pA = new Point3d;
Point3d *pB = new Point3d;
pA->_x = 1.725f, pA->_y = 0.875f, pA->_z = 0.478f;
pB->_x = 0.315f, pB->_y = 0.317f, pB->_z = 0.838f;
START_TIMING(0);
for (int iters = 0; iters < ITERATION_COUNT; iters++)
{
pB->_x = pA->_x - pB->_z;
pB->_y = pA->_y + pB->_x;
pB->_z = pA->_z + pB->_y;
}
END_TIMING(0);
cout << "-----struct pointer:" << pB->_z << endl;
}
void test_c_struct_pointer_volatile()
{
volatile Point3d *pA = new Point3d;
volatile Point3d *pB = new Point3d;
pA->_x = 1.725f, pA->_y = 0.875f, pA->_z = 0.478f;
pB->_x = 0.315f, pB->_y = 0.317f, pB->_z = 0.838f;
START_TIMING(0);
for (int iters = 0; iters < ITERATION_COUNT; iters++)
{
pB->_x = pA->_x - pB->_z;
pB->_y = pA->_y + pB->_x;
pB->_z = pA->_z + pB->_y;
}
END_TIMING(0);
cout << "-----struct pointer volatile:" << pB->_z << endl;
}
void test_c_struct_member_pointer()
{
Point3d pA, pB;
pA._x = 1.725f, pA._y = 0.875f, pA._z = 0.478f;
pB._x = 0.315f, pB._y = 0.317f, pB._z = 0.838f;
float *ax = &pA._x;
float *ay = &pA._y;
float *az = &pA._z;
float *bx = &pB._x;
float *by = &pB._y;
float *bz = &pB._z;
START_TIMING(0);
for (int iters = 0; iters < ITERATION_COUNT; iters++)
{
*bx = *ax - *bz;
*by = *ay + *bx;
*bz = *az + *by;
}
END_TIMING(0);
cout << "-----struct_member_pointer:" << pB._z << endl;
}
// Point with three float coordinates that are reachable both as public data
// members and through virtual accessor functions, so the benchmarks can
// compare direct member access with access through member functions.
class Point3d_Virtual
{
public:
    // Every coordinate defaults to zero when no argument is given.
    Point3d_Virtual(float xx = 0.0f, float yy = 0.0f, float zz = 0.0f)
        : _x(xx), _y(yy), _z(zz) {}

    // The class has virtual functions and is instantiated with new, so the
    // destructor is virtual to keep deletion through a base pointer defined.
    virtual ~Point3d_Virtual() {}

    // Accessors returning a reference to the coordinate, usable for both
    // reading and assignment. Functions defined inside the class body are
    // implicitly inline, so no explicit `inline` is needed.
    // Author's measurement: calling these through a pointer at -O2 took about
    // 3 times as long as the non-virtual (inlined) case.
    virtual float &x() { return _x; }
    virtual float &y() { return _y; }
    virtual float &z() { return _z; }

    // Setter overloads taking the new coordinate value.
    virtual void x(float x) { _x = x; }
    virtual void y(float y) { _y = y; }
    virtual void z(float z) { _z = z; }

    // Kept public: the benchmarks also initialise and read these directly.
    float _x, _y, _z;
};
void test_access_by_virtual_function()
{
Point3d_Virtual*pA = new Point3d_Virtual, *pB = new Point3d_Virtual;
pA->_x = 1.725f, pA->_y = 0.875f, pA->_z = 0.478f;
pB->_x = 0.315f, pB->_y = 0.317f, pB->_z = 0.838f;
START_TIMING(0);
for (int iters = 0; iters < ITERATION_COUNT; iters++)
{
pB->x() = pA->x() - pB->z();
pB->y() = pA->y() + pB->x();
pB->z() = pA->z() + pB->y();
}
END_TIMING(0);
cout << "-----virtual func:" << pB->z() << endl;
}
void test_access_by_inline_function()
{
Point3d_Virtual pA, pB;
pA._x = 1.725f, pA._y = 0.875f, pA._z = 0.478f;
pB._x = 0.315f, pB._y = 0.317f, pB._z = 0.838f;
START_TIMING(0);
for (int iters = 0; iters < ITERATION_COUNT; iters++)
{
pB.x()=(pA.x() - pB.z());
pB.y()=(pA.y() + pB.x());
pB.z()=(pA.z() + pB.y());
}
END_TIMING(0);
cout << "-----inline func:" << pB.z() << endl;
}
void test_class_member_pointer()
{
Point3d_Virtual pA, pB;
pA._x = 1.725f, pA._y = 0.875f, pA._z = 0.478f;
pB._x = 0.315f, pB._y = 0.317f, pB._z = 0.838f;
float *ax = &pA._x;
float *ay = &pA._y;
float *az = &pA._z;
float *bx = &pB._x;
float *by = &pB._y;
float *bz = &pB._z;
START_TIMING(0);
for (int iters = 0; iters < ITERATION_COUNT; iters++)
{
*bx = *ax - *bz;
*by = *ay + *bx;
*bz = *az + *by;
}
END_TIMING(0);
cout << "-----class_member_pointer:" << pB._z << endl;
}
// Entry point of the suite: runs every member-access benchmark exactly once,
// in a fixed order.
void test_inside_cplusplus_object_model_performance()
{
    typedef void (*BenchmarkFn)();

    // Listed in execution order.
    const BenchmarkFn benchmarks[] = {
        test_plain_float,
        test_plain_array,
        test_c_struct,
        test_c_struct_pointer,
        test_c_struct_pointer_volatile,
        test_c_struct_member_pointer,
        test_access_by_virtual_function,
        test_access_by_inline_function,
        test_class_member_pointer,
    };

    for (BenchmarkFn run : benchmarks)
    {
        run();
    }
}
-O2编译,输出结果如下:
0.122494-----plain:2.24
0.123217-----array:2.24
0.122705-----struct:2.24
0.130809-----struct pointer:2.24
0.367345-----struct pointer volatile:2.24
0.122326-----struct_member_pointer:2.24
0.368905-----virtual func:2.24
0.123141-----inline func:2.24
0.122887-----class_member_pointer:2.24
-O0编译,输出结果如下:
0.365971-----plain:2.24
0.366328-----array:2.24
0.366668-----struct:2.24
0.366613-----struct pointer:2.24
0.369769-----struct pointer volatile:2.24
0.371624-----struct_member_pointer:2.24
0.957507-----virtual func:2.24
0.815733-----inline func:2.24
0.373669-----class_member_pointer:2.24
对比结果,结论:
1.-O2优化的情况,虚函数开销是inline函数的3倍
2.-O2优化的情况,内联函数与直接存取成员变量效率一致(注意这些存取函数同时声明为virtual;对象在栈上时,编译器应是将调用去虚化并内联);-O0不优化情况,内联不起作用,通过函数存取成员变量的耗时约为直接存取的2倍(0.816 对 0.366)
3.通过成员变量指针访问,与直接访问成员变量,效率一样
4.在一次开机后的测试中出现如下结果:各方法的汇编代码完全一样(见下),耗时却不同,起初不理解。
原因应该是数据加载问题:测试发现,哪种方法第一个运行,哪种方法耗时就长。打印可知,这6个变量的地址均是连续的,不存在内存访问的差异;clock_gettime调用的影响也可忽略。机器负载越高,数据预先加载的影响越小。(补充:从下面的汇编看,-O2下循环体完全在寄存器中运算,没有访存指令,因此首个运行的方法偏慢也可能与CPU升频等预热因素有关,有待进一步验证。)
0.061806-----plain:2.24
0.031168-----array:2.24
0.031260-----struct:2.24
.....
test_plain_float()循环部分的汇编代码:
cc0: e8 eb fd ff ff callq ab0 <clock_gettime@plt>
cc5: 83 f8 ff cmp $0xffffffff,%eax
cc8: 0f 84 db 01 00 00 je ea9 <_Z16test_plain_floatv+0x209>
cce: b8 81 96 98 00 mov $0x989681,%eax
float pB_x = 0.315f, pB_y = 0.317f, pB_z = 0.838f;
cd3: f3 0f 10 0d 99 10 00 movss 0x1099(%rip),%xmm1 # 1d74 <_IO_stdin_used+0xc4>
cda: 00
cdb: f3 0f 10 1d 95 10 00 movss 0x1095(%rip),%xmm3 # 1d78 <_IO_stdin_used+0xc8>
ce2: 00
ce3: f3 0f 10 15 91 10 00 movss 0x1091(%rip),%xmm2 # 1d7c <_IO_stdin_used+0xcc>
cea: 00
ceb: f3 0f 10 05 8d 10 00 movss 0x108d(%rip),%xmm0 # 1d80 <_IO_stdin_used+0xd0>
cf2: 00
cf3: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
pB_x = pA_x - pB_z;
cf8: 0f 28 e3 movaps %xmm3,%xmm4
for (int i = 0; i < ITERATION_COUNT; i++)
cfb: 83 e8 01 sub $0x1,%eax
pB_x = pA_x - pB_z;
cfe: f3 0f 5c e1 subss %xmm1,%xmm4
d02: 0f 28 cc movaps %xmm4,%xmm1
pB_y = pA_y + pB_x;
d05: f3 0f 58 ca addss %xmm2,%xmm1
pB_z = pA_z + pB_y;
d09: f3 0f 58 c8 addss %xmm0,%xmm1
for (int i = 0; i < ITERATION_COUNT; i++)
d0d: 75 e9 jne cf8 <_Z16test_plain_floatv+0x58>
d0f: 48 8d 74 24 20 lea 0x20(%rsp),%rsi
d14: bf 01 00 00 00 mov $0x1,%edi
d19: f3 0f 11 4c 24 0c movss %xmm1,0xc(%rsp)
d1f: e8 8c fd ff ff callq ab0 <clock_gettime@plt>
test_plain_array()循环部分的汇编代码:
ee0: e8 cb fb ff ff callq ab0 <clock_gettime@plt>
ee5: 83 f8 ff cmp $0xffffffff,%eax
ee8: 0f 84 db 01 00 00 je 10c9 <_Z16test_plain_arrayv+0x209>
eee: b8 81 96 98 00 mov $0x989681,%eax
pB[x] = 0.315f, pB[y] = 0.317f, pB[z] = 0.838f;
ef3: f3 0f 10 0d 79 0e 00 movss 0xe79(%rip),%xmm1 # 1d74 <_IO_stdin_used+0xc4>
efa: 00
efb: f3 0f 10 1d 75 0e 00 movss 0xe75(%rip),%xmm3 # 1d78 <_IO_stdin_used+0xc8>
f02: 00
f03: f3 0f 10 15 71 0e 00 movss 0xe71(%rip),%xmm2 # 1d7c <_IO_stdin_used+0xcc>
f0a: 00
f0b: f3 0f 10 05 6d 0e 00 movss 0xe6d(%rip),%xmm0 # 1d80 <_IO_stdin_used+0xd0>
f12: 00
f13: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
pB[x] = pA[x] - pB[z];
f18: 0f 28 e3 movaps %xmm3,%xmm4
for (int iters = 0; iters < ITERATION_COUNT; iters++)
f1b: 83 e8 01 sub $0x1,%eax
pB[x] = pA[x] - pB[z];
f1e: f3 0f 5c e1 subss %xmm1,%xmm4
f22: 0f 28 cc movaps %xmm4,%xmm1
pB[y] = pA[y] + pB[x];
f25: f3 0f 58 ca addss %xmm2,%xmm1
pB[z] = pA[z] + pB[y];
f29: f3 0f 58 c8 addss %xmm0,%xmm1
for (int iters = 0; iters < ITERATION_COUNT; iters++)
f2d: 75 e9 jne f18 <_Z16test_plain_arrayv+0x58>
f2f: 48 8d 74 24 20 lea 0x20(%rsp),%rsi
f34: bf 01 00 00 00 mov $0x1,%edi
f39: f3 0f 11 4c 24 0c movss %xmm1,0xc(%rsp)
f3f: e8 6c fb ff ff callq ab0 <clock_gettime@plt>