C++元编程与内联性能对比测试

腾昵猫

已于 2024-01-08 14:23:11 修改

阅读量424

点赞数 7

分类专栏：元编程学习实践文章标签： c++ java 开发语言

于 2024-01-03 10:51:40 首次发布

本文链接：https://blog.csdn.net/Dr_Jack/article/details/135357500

版权

元编程学习实践专栏收录该内容

23 篇文章 0 订阅

订阅专栏

一直有两个疑问：1、C++的模板元编程是否比函数内联来得快；2、带inline关键字的C++模板函数是否会被编译器内联展开。于是进行了一下测试，测试代码如下：

#include <benchmark/benchmark.h>
#include <type_traits>
#include <utility>

/// Fixed-size 10x10 matrix of doubles kept in one flat, row-major array.
///
/// The class declares no constructors and keeps its storage public, so it
/// remains an aggregate and can be brace-initialised with up to
/// row_num * col_num values.
class mat
{
public:
    static constexpr int row_num = 10;  ///< Number of rows.
    static constexpr int col_num = 10;  ///< Number of columns.

    /// Element storage; element (r, c) lives at index col_num * r + c.
    double  data[row_num * col_num];

    /// Run-time indexed element access.
    ///
    /// @param r Row index; not range-checked.
    /// @param c Column index; not range-checked.
    /// @return Reference to element (r, c).
    inline double& get(const int& r, const int& c)
    {
        return data[col_num * r + c];
    }

    /// Compile-time indexed element access.
    ///
    /// The indices are template arguments, so an out-of-range access is
    /// rejected at compile time instead of silently reading or writing
    /// outside the array.
    ///
    /// @tparam r Row index, must satisfy 0 <= r < row_num.
    /// @tparam c Column index, must satisfy 0 <= c < col_num.
    /// @return Reference to element (r, c).
    template<int r, int c>
    inline constexpr double& get()
    {
        static_assert(r >= 0 && r < row_num, "mat::get<r, c>: row index out of range");
        static_assert(c >= 0 && c < col_num, "mat::get<r, c>: column index out of range");
        return data[col_num * r + c];
    }
};

/// Sums m1(row, k) * m2(k, col) for k = N down to 0.
///
/// The loop over k is expressed as compile-time recursion: each instantiation
/// contributes the product for inner index N and delegates the remaining
/// indices N-1 .. 0 to the next instantiation, stopping at N == 0.
///
/// @tparam N   Highest inner index still to be processed.
/// @tparam row Row of m1 taking part in the product.
/// @tparam col Column of m2 taking part in the product.
/// @param m1   Left operand; must provide a member template get<r, c>().
/// @param m2   Right operand; must provide a member template get<r, c>().
/// @return The accumulated sum of products.
template<int N, int row, int col, typename mat_t1, typename mat_t2>
inline double mul_sum(mat_t1&& m1, mat_t2&& m2)
{
    // The two factors contributed by inner index N.
    const double& lhs = m1.template get<row, N>();
    const double& rhs = m2.template get<N, col>();

    if constexpr (N != 0)
    {
        // More indices remain: add the partial sum for N-1 .. 0.
        return lhs * rhs
             + mul_sum<N - 1, row, col>(std::forward<mat_t1>(m1), std::forward<mat_t2>(m2));
    }
    else
    {
        // Recursion anchor: only the product for index 0 is left.
        return lhs * rhs;
    }
}

/// Computes one cell of the matrix product m1 * m2.
///
/// The inner dimension is taken from the column count of m1's type, and the
/// summation itself is delegated to mul_sum, starting at the last inner index.
///
/// @tparam target_row Row of the result cell.
/// @tparam target_col Column of the result cell.
/// @param m1 Left operand; its type must expose a static col_num.
/// @param m2 Right operand.
/// @return Value of the result cell (target_row, target_col).
template<int target_row, int target_col, typename mat_t1, typename mat_t2>
inline double cell_proc(mat_t1&& m1, mat_t2&& m2)
{
    using lhs_type = std::remove_reference_t<mat_t1>;
    constexpr int last_inner = lhs_type::col_num - 1;
    return mul_sum<last_inner, target_row, target_col>(m1, m2);
}

/// Fills the result matrix mr with m1 * m2 using compile-time recursion.
///
/// Each instantiation writes cell (r, c) and then moves on to the previous
/// cell: one column to the left while c > 0, otherwise the last column of the
/// row above. The recursion ends after cell (0, 0) has been written.
///
/// @tparam r Row of the cell handled by this instantiation.
/// @tparam c Column of the cell handled by this instantiation.
/// @param m1 Left operand.
/// @param m2 Right operand; its type must expose a static col_num.
/// @param mr Result matrix; must provide a member template get<r, c>().
template<int r, int c,typename mat_t1, typename mat_t2, typename mat_tr>
inline void template_dot(mat_t1&& m1, mat_t2&& m2, mat_tr&& mr)
{
    // Compute the value first, then store it into the result cell.
    const double value = cell_proc<r, c>(m1, m2);
    mr.template get<r, c>() = value;

    if constexpr (c != 0)
    {
        // Same row, one column to the left.
        template_dot<r, c - 1>(std::forward<mat_t1>(m1),
                               std::forward<mat_t2>(m2),
                               std::forward<mat_tr>(mr));
    }
    else if constexpr (r != 0)
    {
        // Row finished: restart at the last column of the row above.
        constexpr int last_col = std::remove_reference_t<mat_t2>::col_num - 1;
        template_dot<r - 1, last_col>(std::forward<mat_t1>(m1),
                                      std::forward<mat_t2>(m2),
                                      std::forward<mat_tr>(mr));
    }
}

/// Computes ret = m1 * m2 with three plain nested run-time loops.
///
/// Every result cell is reset to zero and then accumulated in place, so the
/// running sum is written through to ret on each step.
///
/// @param m1  Left operand.
/// @param m2  Right operand.
/// @param ret Matrix receiving the product; previous contents are overwritten.
inline void inline_dot(mat& m1, mat& m2, mat& ret)
{
    for (int row = 0; row != mat::row_num; ++row)
    {
        for (int col = 0; col != mat::col_num; ++col)
        {
            // Accumulate directly in the destination cell.
            double& cell = ret.get(row, col);
            cell = 0;
            for (int k = 0; k != mat::col_num; ++k)
            {
                cell += m1.get(row, k) * m2.get(k, col);
            }
        }
    }
}

/// Benchmarks the recursive-template matrix product (template_dot).
///
/// Inputs: every row of m1 is 1, 2, ..., 10 and every row of m2 is
/// 2, 4, ..., 20. They are filled with loops so each row is guaranteed to hold
/// exactly mat::col_num values (the former literal list for m2 supplied only
/// nine values per row, which shifted all following rows).
///
/// @param state Google Benchmark iteration state.
static void BM_Template(benchmark::State& state)
{
    mat m1{};
    mat m2{};
    mat mr{};  // value-initialised: all elements start at zero
    for (int r = 0; r < mat::row_num; ++r)
    {
        for (int c = 0; c < mat::col_num; ++c)
        {
            m1.get(r, c) = c + 1;
            m2.get(r, c) = 2 * (c + 1);
        }
    }

    // Non-const pointer lvalues handed to DoNotOptimize so the matrices
    // "escape" and the optimiser has to treat their contents as observable.
    double* m1_data = m1.data;
    double* m2_data = m2.data;
    double* mr_data = mr.data;

    for (auto _: state)
    {
        // Without these barriers the inputs are compile-time constants and the
        // result is never read, so the compiler is free to fold or delete the
        // very work that is supposed to be measured.
        benchmark::DoNotOptimize(m1_data);
        benchmark::DoNotOptimize(m2_data);
        benchmark::DoNotOptimize(mr_data);
        template_dot<mat::row_num-1, mat::col_num-1>(m1, m2, mr);
        // Force the stores into mr to actually happen in every iteration.
        benchmark::ClobberMemory();
    }
}

/// Benchmarks the plain nested-loop matrix product (inline_dot).
///
/// Inputs: every row of m1 is 1, 2, ..., 10 and every row of m2 is
/// 2, 4, ..., 20. They are filled with loops so each row is guaranteed to hold
/// exactly mat::col_num values (the former literal list for m2 supplied only
/// nine values per row, which shifted all following rows).
///
/// @param state Google Benchmark iteration state.
static void BM_Inline(benchmark::State& state)
{
    mat m1{};
    mat m2{};
    mat mr{};  // value-initialised: all elements start at zero
    for (int r = 0; r < mat::row_num; ++r)
    {
        for (int c = 0; c < mat::col_num; ++c)
        {
            m1.get(r, c) = c + 1;
            m2.get(r, c) = 2 * (c + 1);
        }
    }

    // Non-const pointer lvalues handed to DoNotOptimize so the matrices
    // "escape" and the optimiser has to treat their contents as observable.
    double* m1_data = m1.data;
    double* m2_data = m2.data;
    double* mr_data = mr.data;

    for (auto _: state)
    {
        // Without these barriers the result is never read, so the whole
        // multiplication can be removed as dead code and the benchmark ends up
        // timing an (almost) empty loop.
        benchmark::DoNotOptimize(m1_data);
        benchmark::DoNotOptimize(m2_data);
        benchmark::DoNotOptimize(mr_data);
        inline_dot(m1, m2, mr);
        // Force the stores into mr to actually happen in every iteration.
        benchmark::ClobberMemory();
    }
}

// Register the two benchmark functions with Google Benchmark.
BENCHMARK(BM_Template);
BENCHMARK(BM_Inline);
// Provides the program entry point that runs the registered benchmarks.
BENCHMARK_MAIN();

稍微对以上代码进行一些讲解。模板和内联的测试共用了一个类class mat，这个类是对一个简单数组的封装，针对模板和内联分别定义了get函数。可以看到，模板版本get函数的下标都是编译期确定的值，因此如果编译器不进行优化，那么应该是模板的get函数更快一些。
模板元编程：实现了mul_sum模板函数，作用是计算两个矩阵对应行、列的元素乘积之和，这个对应关系也是在编译期就能确定的。缺点是需要用递归来实现循环，但是增加了inline关键字，以期望其能内联。然后是cell_proc模板函数，它的作用是针对指定的行列，调用mul_sum计算该位置的值。主测试函数template_dot调用cell_proc，先列后行反向遍历结果矩阵，计算每个元素的值。
内联函数：这个比较简单，就是遍历返回数组行列，并且计算其值。
可以看出，模板的元编程实现起来比较麻烦，其主要原因是模板没有for constexpr，因此循环需要依靠递归来实现，这种递归如果不能进行内联，那么函数入栈和退栈都是比较消耗时间的。
那么我们来看一下最后的性能情况：
编译语句：

gcc template_benchmark.cpp -std=c++17 -isystem benchmark/include -O3 -lbenchmark -lpthread -lstdc++ -lrt -lm -o template_benchmark

程序使用了if constexpr因此需要指明使用C++17的标准，使用了O3对两个程序进行优化。下面进行运行并统计：

./template_benchmark --benchmark_repetitions=10 --benchmark_report_aggregates_only=true

让google-benchmark运行10次并只显示统计信息，结果如下：
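（benchmark运行结果图）从这个图中可以看出，模板元编程实现同样的10*10矩阵点积运算竟然需要516ns，而内联的函数只需要0.747ns。由此可见：1、模板元编程并不比函数内联来得更快；2、模板元编程大概率没有对inline的函数进行内联，从而导致运行速度如此之慢。需要注意的是，10*10矩阵相乘需要1000次乘加运算，0.747ns远不足以完成这些计算，很可能是因为计算结果没有被使用，内联版本的循环被编译器整体优化掉了；要得到可信的对比，需要使用benchmark::DoNotOptimize和benchmark::ClobberMemory阻止这种优化之后重新测量。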
简化版本的矩阵相乘汇编代码：

# void template_dot<mat&, mat&, mat&, 2, 2>(mat&, mat&, mat&)
# Writes result cell (2, 2) and then calls the instantiation for cell (2, 1).
# NOTE(review): the arguments are spilled to the stack and even std::forward is
# reached through a real call, which looks like unoptimised (-O0 style) output
# rather than -O3 output; confirm the flags used to produce this listing.
_Z12template_dotIR3matS1_S1_Li2ELi2EEvOT_OT0_OT1_:
.LFB2310:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	pushq	%r12
	pushq	%rbx
	subq	$32, %rsp
	.cfi_offset 12, -24
	.cfi_offset 3, -32
	# Spill the three incoming arguments: m1 -> -24(%rbp), m2 -> -32(%rbp), mr -> -40(%rbp).
	movq	%rdi, -24(%rbp)
	movq	%rsi, -32(%rbp)
	movq	%rdx, -40(%rbp)
	# std::forward<mat&>(m2); the returned reference is kept in %rbx.
	movq	-32(%rbp), %rax
	movq	%rax, %rdi
	call	_ZSt7forwardIR3matEOT_RNSt16remove_referenceIS2_E4typeE
	movq	%rax, %rbx
	# std::forward<mat&>(m1).
	movq	-24(%rbp), %rax
	movq	%rax, %rdi
	call	_ZSt7forwardIR3matEOT_RNSt16remove_referenceIS2_E4typeE
	movq	%rbx, %rsi
	movq	%rax, %rdi
	# double mul_sum<1, 2, 2, mat&, mat&>(mat&, mat&); the result comes back in %xmm0.
	call	_Z7mul_sumILi1ELi2ELi2ER3matS1_EdOT2_OT3_
	movq	%xmm0, %rbx
	# double& mat::get<2, 2>() on mr, then store the computed value through it.
	movq	-40(%rbp), %rax
	movq	%rax, %rdi
	call	_ZN3mat3getILi2ELi2EEERdv
	movq	%rbx, (%rax)
	# Forward mr, m2 and m1 again (three more calls) for the recursive step.
	movq	-40(%rbp), %rax
	movq	%rax, %rdi
	call	_ZSt7forwardIR3matEOT_RNSt16remove_referenceIS2_E4typeE
	movq	%rax, %r12
	movq	-32(%rbp), %rax
	movq	%rax, %rdi
	call	_ZSt7forwardIR3matEOT_RNSt16remove_referenceIS2_E4typeE
	movq	%rax, %rbx
	movq	-24(%rbp), %rax
	movq	%rax, %rdi
	call	_ZSt7forwardIR3matEOT_RNSt16remove_referenceIS2_E4typeE
	movq	%r12, %rdx
	movq	%rbx, %rsi
	movq	%rax, %rdi
	# Recursive step: template_dot<mat&, mat&, mat&, 2, 1>, emitted as a call.
	call	_Z12template_dotIR3matS1_S1_Li2ELi1EEvOT_OT0_OT1_
	nop
	addq	$32, %rsp
	popq	%rbx
	popq	%r12
	popq	%rbp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE2310:
	.size	_Z12template_dotIR3matS1_S1_Li2ELi2EEvOT_OT0_OT1_, .-_Z12template_dotIR3matS1_S1_Li2ELi2EEvOT_OT0_OT1_
	.section	.text._ZN3mat3getILi2ELi2EEERdv,"axG",@progbits,_ZN3mat3getILi2ELi2EEERdv,comdat
	.align 2
	.weak	_ZN3mat3getILi2ELi2EEERdv
	.type	_ZN3mat3getILi2ELi2EEERdv, @function

可以看到这个函数中还是使用call的方式调用了递归的模板函数，也验证了inline的模板函数并没有进行展开。当然，也有可能是因为我的模板函数写得比较复杂，导致编译器认为其不适宜内联。另外需要注意，这段汇编中连std::forward都是通过call调用的，参数也全部先保存到了栈上，看起来更像是未开启优化时的输出，因此它未必能反映-O3下的实际内联情况。
各位如果有不同看法，或者我的测试程序有什么问题。欢迎提出来，大家一起讨论，共同进步。

腾昵猫

关注

7
点赞
踩
7

收藏

觉得还不错? 一键收藏
打赏
0
评论
C++元编程与内联性能对比测试

这个图中可以看出，模板元编程实现同样的10*10矩阵点积运算竟然需要516ns，而内联的函数只需要0.747ns。模板函数，作用是计算两个函数的行列对应元素相乘之和，这个也是在编译期能够确定对应关系的，缺点是调用了递归以实现循环，但是增加了inline关键字，以期望其能内联。，因此循环需要依靠递归来实现，这种递归如果不能进行内联，那么函数入栈和退栈都是比较消耗时间的。函数，由于都是编译期确定的值，如果编译器不进行优化，那么应该是模板的。模板函数，这个函数的作用是针对指定的行列，调用。函数，可以看到模板的。
复制链接

扫一扫