前言
为了分析C++程序的效率,一个非常重要的指标就是其算法的运行时间,C++11为我们提供了chrono
库专门进行时间的测量,具体使用方法如下
chrono
首先我们要说明chrono
库中有自己的chrono
命名空间,且这个命名空间包含在std
中,所以使用它时要先指定std
命名空间,再指定chrono
命名空间,比如我们获取现在的时间
#include <chrono>
std::chrono::high_resoulution_clock::now();
std::chrono
有3个clock可以选择,分别是system_clock
,steady_clock
,high_resolution_clock
system_clock
:主要是从unix时间开始计算(1970/1/1 00:00:00)到现在的系统时间,但是系统时间可以被管理员修改,所以我们一般不用它计算时间间隔(interval)
steady_clock
:用time tick精确地计算时间间隔
high_resolution_clock
:同上(注意:标准允许它只是system_clock或steady_clock的别名,具体取决于实现)
我们测算的基本思想是:先记录当前时间,在算法执行完毕后再记录一次当前时间,然后用chrono内置的转换函数duration_cast把两个时间点之差转换成我们想要的单位,最后输出,例子如下
#include <iostream>
#include <chrono>
using namespace std;
// Minimal timing skeleton: take a time point before and after the work and
// print the elapsed wall-clock time in milliseconds.
int main()
{
    // Time point taken before the work being measured.
    auto start = std::chrono::high_resolution_clock::now();
    // do something
    // Time point taken once the work has finished.
    auto end = std::chrono::high_resolution_clock::now();
    // Elapsed time is (end - start); the reversed order yields a negative duration.
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << ms.count() << std::endl;
    return 0;
}
具体例子如下,我们写一个矩阵相乘算法, 矩阵相乘算法具体放到device(GPU)上执行
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <math.h>
// Computes c = a * b for square n x n int matrices stored row-major, with each
// thread producing one output element.
//
// Launch layout: 2D grid of 2D blocks. The thread's global y index selects the
// output row and its global x index selects the output column. The grid must
// cover at least n threads in each dimension; surplus threads exit without
// touching memory.
//
// a, b : device-accessible pointers to the n*n input elements
// c    : device-accessible pointer to the n*n output elements
// n    : matrix dimension (rows == cols)
void
__global__
matrix_miltiple(int *a, int *b, int *c, int n){
    // threadIdx starts from 0 inside every block, so add the block offset.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads that fall outside the matrix must not read or write anything:
    // row * n + col would either run past the buffer or alias an element
    // that belongs to another row.
    if (row >= n || col >= n) return;
    // Dot product of row `row` of a with column `col` of b.
    int temp = 0;
    for (int i = 0; i < n; i++) temp += a[row * n + i] * b[i * n + col];
    c[row * n + col] = temp;
}
// Fills an n x n matrix with pseudo-random values in the range [0, 99].
//
// a : buffer with room for at least n*n ints; a null pointer is ignored
// n : matrix dimension (rows == cols); n <= 0 leaves the buffer untouched
void
init_matrix(int *a, int n){
    if (a == nullptr || n <= 0) return;
    // Widen before multiplying so the element count cannot overflow int.
    const size_t count = static_cast<size_t>(n) * static_cast<size_t>(n);
    for(size_t i = 0; i < count; i++){
        a[i] = rand() % 100;
    }
}
// Prints a readable message and terminates the program when a CUDA runtime
// call did not return cudaSuccess.
static void
check_cuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Multiplies two 1024 x 1024 int matrices on the GPU and prints the elapsed
// wall-clock time in milliseconds. The timed region covers the kernel launch,
// the device synchronization and the copy of the result back to the host.
int
main(){
    // matrix size of row or col (1024)
    const int n = 1 << 10;
    // size of one n * n matrix in bytes; widen before multiplying to avoid int overflow
    const size_t bytes = static_cast<size_t>(n) * n * sizeof(int);
    // host buffers
    int *h_a = (int*)malloc(bytes);
    int *h_b = (int*)malloc(bytes);
    int *h_c = (int*)malloc(bytes);
    if(h_a == nullptr || h_b == nullptr || h_c == nullptr){
        std::cerr << "host allocation failed" << std::endl;
        free(h_a);
        free(h_b);
        free(h_c);
        return 1;
    }
    // device buffers
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    check_cuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");
    // init matrix
    init_matrix(h_a, n);
    init_matrix(h_b, n);
    // copy inputs from host to device
    check_cuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");
    check_cuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");
    // threads per block in each dimension
    const int BLOCK_SIZE = 16;
    // blocks per grid dimension; integer ceil-div so a partial tile still gets a block
    const int GRID_SIZE = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // 2D launch configuration
    dim3 grid(GRID_SIZE, GRID_SIZE);
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    // start timing
    auto start = std::chrono::high_resolution_clock::now();
    matrix_miltiple<<<grid, threads>>>(d_a, d_b, d_c, n);
    // a bad launch configuration is only reported through cudaGetLastError
    check_cuda(cudaGetLastError(), "kernel launch");
    // wait for the kernel; errors raised during execution surface here
    check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    // copy the result back to the host
    check_cuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");
    // stop timing
    auto end = std::chrono::high_resolution_clock::now();
    // convert the elapsed duration to milliseconds
    auto ms = std::chrono::duration_cast< std::chrono::milliseconds>(end - start);
    std::cout << ms.count() << "ms" << std::endl;
    // release device and host buffers
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}