测试minpy 调用gpu加速矩阵相乘,已经写了几篇文章.前几篇文章得到的结果不太好,主要原因是跟想象中的结果并不是很相同.
主要有两点:一个是前几篇测试的加速效果并不是很好,矩阵要很大的时候才能看到明显的加速;另一个是根据我的先验经验,float32的加速效果应该明显好于float64的加速效果.但是我预期的这两点效果在前面的测试中并没有得到.
从这里就能感受到理论跟实验之间的差距.如果你相信理论,相信自己的先验经验知识,那就说明你的实验中存在局限性.但关键点是我们不知道实验的局限性在哪里,手里也没有准确的、可以对比的实验结果数据,也就是说我们不知道实验的哪个地方出了问题.我们尝试去找问题出现在哪里,但如果找不到呢? 是把自己的实验结果搁置,还是从心里去怀疑这个先验知识呢? 我们一直得不到与我们认为的理论预期相近的结果.
前几篇文章关于这个gpu加速主题,基本上就是围绕这个过程.但是依然没有得到我们想要的结果.
这篇文章的测试可能依然有局限性,也就是说结果也可能不太准确.
这里我们只是把这个过程记录下来.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#####################################
# File name : main.py
# Create date : 2019-01-10 16:39
# Modified date : 2019-01-11 14:52
# Author : DARREN
# Describe : not set
# Email : lzygzh@126.com
#####################################
from __future__ import division
from __future__ import print_function
import os
import minpy.numpy as np
import minpy.numpy.random as random
from minpy.context import cpu, gpu
import time
import matplotlib.pyplot as plt
def create_path(path):
    """Create the directory ``path`` (with parents) unless it already exists.

    Args:
        path: Directory path to create.
    """
    # Nothing to do when the directory is already present.
    if os.path.isdir(path):
        return
    os.makedirs(path)
def get_file_full_name(path, name):
    """Return ``name`` placed inside directory ``path``, creating ``path`` first.

    Args:
        path: Directory that should contain the file; created if missing.
        name: File name appended to ``path``.

    Returns:
        ``path`` and ``name`` joined by exactly one "/" when ``path`` does not
        already end with one.
    """
    create_path(path)
    # Only insert a separator when the directory does not already end in one.
    separator = "" if path[-1] == "/" else "/"
    return path + separator + name
def create_file(path, name, open_type='w'):
    """Open the file ``name`` inside directory ``path`` and return it.

    Args:
        path: Directory holding the file (handed to ``get_file_full_name``).
        name: File name inside ``path``.
        open_type: Mode string passed to ``open``; defaults to ``'w'``.

    Returns:
        The file object returned by ``open``; closing it is left to the caller.
    """
    return open(get_file_full_name(path, name), open_type)
def _plot_record(record, full_path):
    """Render every chart for ``record`` into the directory ``full_path``.

    Args:
        record: Benchmark results passed unchanged to each plotting helper.
        full_path: Output directory passed unchanged to each plotting helper.
    """
    # The timing chart is drawn first, then the acceleration chart.
    for draw_chart in (_plot_cpu_gpu_time, _plot_acceleration):
        draw_chart(record, full_path)
def _get_full_path(repeats, size_begin, size_end):
if not os.path.exists("./output"):
os.makedirs("./output")
path_str = "./output/%s_%s_%s" % (repeats, size_begin, size_end)
return path_str
def _plot_cpu_gpu_time(record, full_path):
    """Plot the recorded CPU and GPU timings and save them as ``cpu_gpu.jpg``.

    Args:
        record: Mapping whose keys are the benchmarked sizes and whose values
            are dicts holding "float32_cpu", "float64_cpu", "float32_gpu" and
            "float64_gpu".
        full_path: Directory the image is written into.
    """
    # Collection order; it is also the order in which the arrays are built.
    series_names = ("float32_cpu", "float64_cpu", "float32_gpu", "float64_gpu")
    timings = dict((series, []) for series in series_names)

    # Every key is wrapped in a one-element list, so the x data is a column.
    sizes = sorted([key] for key in record)
    for size in sizes:
        per_size = record[size[0]]
        for series in series_names:
            timings[series].append(per_size[series])

    # Build arrays through ``np`` and convert them back with asnumpy().
    curves = {}
    for series in series_names:
        curves[series] = np.array(timings[series]).asnumpy()

    # The x axis is the key multiplied by itself.
    x_axis = np.array(sizes)
    x_axis = (x_axis * x_axis).asnumpy()

    # Draw order: GPU curves first, then CPU curves.
    draw_order = (
        ("float32_gpu", "float32 gpu"),
        ("float64_gpu", "float64 gpu"),
        ("float32_cpu", "float32 cpu"),
        ("float64_cpu", "float64 cpu"),
    )
    handles = []
    captions = []
    for series, caption in draw_order:
        line, = plt.plot(x_axis, curves[series])
        handles.append(line)
        captions.append(caption)

    plt.legend(handles=handles, labels=tuple(captions), loc='best')
    plt.savefig("%s/cpu_gpu.jpg" % (full_path))
    plt.close()
def _plot_acceleration(record, full_path):
    """Plot the recorded acceleration ratios and save them as ``acceleration.jpg``.

    Args:
        record: Mapping whose keys are the benchmarked sizes and whose values
            are dicts holding "float64_acceleration" and
            "float32_acceleration".
        full_path: Directory the image is written into.
    """
    # Collection order; it is also the order in which the arrays are built.
    series_names = ("float64_acceleration", "float32_acceleration")
    ratios = dict((series, []) for series in series_names)

    # Every key is wrapped in a one-element list, so the x data is a column.
    sizes = sorted([key] for key in record)
    for size in sizes:
        per_size = record[size[0]]
        for series in series_names:
            ratios[series].append(per_size[series])

    # Build arrays through ``np`` and convert them back with asnumpy().
    curves = {}
    for series in series_names:
        curves[series] = np.array(ratios[series]).asnumpy()

    # The x axis is the key multiplied by itself.
    x_axis = np.array(sizes)
    x_axis = (x_axis * x_axis).asnumpy()

    # Draw order: float32 first, then float64.
    draw_order = (
        ("float32_acceleration", 'float32 acceleration'),
        ("float64_acceleration", 'float64 acceleration'),
    )
    handles = []
    captions = []
    for series, caption in draw_order:
        line, = plt.plot(x_axis, curves[series])
        handles.append(line)
        captions.append(caption)

    plt.legend(handles=handles, labels=tuple(captions), loc='best')
    plt.savefig("%s/acceleration.jpg" % (full_path))
    plt.close()
def _write_status(file_obj, i, time_lt):
float32_acceleration = time_lt[1] / time_lt[3]
float64_acceleration = time_lt[0] / time_lt[2]
float64_cpu_str = "i:%s float64 cpu:%s" % (i, time_lt[0])
float32_cpu_str = "i:%s float32 cpu:%s" % (i, time_lt[1])
float64_gpu_str = "i:%s float64 gpu:%s" % (i, time_lt[2])
float32_gpu_str = "i:%s float32 gpu:%s" % (i, time_lt[3])
float32_acceleration_str = "float32 acceleration:%s" % float32_acceleration
float64_acceleration_str = "float64 acceleration:%s" % float64_acceleration
file_obj.write("%s\n" % float64_cpu_str)
file_obj.write("%s\n" % float32_cpu_str)
file_obj.write("%s\n" % float64_gpu_str)
file_obj.write("%s\n" % float32_gpu_str)
file_obj.write("%s\n" % float32_acceleration_str)
file_obj.write("%s\n" % float64_acceleration_str)
print(float64_cpu_str)
print(float32_cpu_str)
print(float64_gpu_str)
print(float32_gpu_str)
print(float32_acceleration_str)
print(float64_acceleration_str)
def _record_status(record, i,time_lt):
dic = {}
dic["float64_cpu"] = time_lt[0]
dic["float32_cpu"] = time_lt[1]
dic["float64_gpu"] = time_lt[2]
dic["float32_gpu"] = time_lt[3]
dic["float64_acceleration"] = time_lt[0]/ time_lt[2]
dic["float32_acceleration"] = time_lt[1]/ time_lt[3]
record[i] = dic
def _randn(l, c):
    """Return the result of ``random.randn`` for dimensions ``l`` and ``c``.

    Args:
        l: First dimension passed to ``random.randn``.
        c: Second dimension passed to ``random.randn``.
    """
    samples = random.randn(l, c)
    return samples
def _get_take_time(s, repeats, data_type):
    """Return the average wall-clock time of one ``s`` x ``s`` matrix product.

    Args:
        s: Side length of the two square operands.
        repeats: How many products are timed; the total is divided by it.
        data_type: dtype the operands are converted to before timing.

    Returns:
        Elapsed seconds (``time.time`` difference) divided by ``repeats``.

    Raises:
        ZeroDivisionError: If ``repeats`` is zero.
    """
    # Operands are generated and converted outside the timed region.
    lhs_raw = _randn(s, s)
    rhs_raw = _randn(s, s)
    lhs = np.array(lhs_raw, dtype=data_type)
    rhs = np.array(rhs_raw, dtype=data_type)

    # NOTE(review): there is no warm-up iteration, so any first-call overhead
    # is part of the measurement.
    started = time.time()
    for _ in range(repeats):
        product = np.dot(lhs, rhs)
        # asnumpy() runs inside the timed loop, so its cost is measured too;
        # presumably it also forces the product to be computed - confirm.
        product.asnumpy()
    elapsed = time.time() - started

    return elapsed / repeats
def test_cpu_gpu(repeats, size_begin, size_end, step=1):
    """Benchmark matrix products on CPU and GPU over a range of sizes.

    For every size in ``range(size_begin, size_end, step)`` the float64 and
    float32 products are timed under ``cpu()`` and under ``gpu(0)``. Each
    step is logged to the "output" file and stored in a record, and the
    charts are drawn once all sizes are done.

    Args:
        repeats: Number of products averaged per measurement.
        size_begin: First matrix side length.
        size_end: End of the size range (exclusive, as in ``range``).
        step: Increment between sizes; defaults to 1.
    """
    record = {}
    full_path = _get_full_path(repeats, size_begin, size_end)

    # The ``with`` block closes the log even when a benchmark step raises,
    # so the handle no longer leaks on failure.
    with create_file(full_path, "output") as file_obj:
        for s in range(size_begin, size_end, step):
            with cpu():
                float64_cpu_time = _get_take_time(s, repeats, np.float64)
                float32_cpu_time = _get_take_time(s, repeats, np.float32)
            with gpu(0):
                float64_gpu_time = _get_take_time(s, repeats, np.float64)
                float32_gpu_time = _get_take_time(s, repeats, np.float32)

            # Order expected by _write_status and _record_status.
            time_lt = [
                float64_cpu_time,
                float32_cpu_time,
                float64_gpu_time,
                float32_gpu_time,
            ]
            _write_status(file_obj, s, time_lt)
            _record_status(record, s, time_lt)

    # Plot only after the log has been closed, as before.
    _plot_record(record, full_path)
def test_matmul(repeats, max_size, step):
    """Run the benchmark in consecutive segments, then once over the full range.

    Args:
        repeats: Number of products averaged per measurement.
        max_size: Upper bound of the matrix sizes.
        step: Width of each segment.
    """
    segment_count = int(max_size / step)
    for segment in range(segment_count):
        lower = 1 + segment * step
        upper = (segment + 1) * step
        test_cpu_gpu(repeats, lower, upper)

    # Final pass over the whole range in a single run.
    test_cpu_gpu(repeats, 1, max_size)
def test():
    """Run the currently selected benchmark configurations."""
    # Earlier configurations, kept for reference (disabled):
    #   test_matmul(repeats=500, max_size=1000, step=100)
    #   test_cpu_gpu(repeats=5, size_begin=1, size_end=3000)

    # Each entry is (repeats, size_begin, size_end, step).
    runs = (
        (1, 1, 10000, 50),
        (1, 10000, 20000, 100),
    )
    for repeats, size_begin, size_end, step in runs:
        test_cpu_gpu(repeats, size_begin, size_end, step)
# Module-level entry point: the benchmark starts as soon as this file is run.
test()
下面是我机器中的cpu和gpu型号
31.4 GiB
Intel® Core™ i7-8700K CPU @ 3.70GHz × 12
GeForce GTX 1080 Ti/PCIe/SSE2
64-bit
先看下整体的输出效果
运行500次 方阵大小1-1000,也就是元素数1-100万
500_1_1000
cpu 与gpu 运行时间对比图
这个结果跟我预想的还算比较接近:首先,在矩阵不是很大的时候就能看到加速效果;第二,float32与float64的加速效果明显不同.
下面是cpu与gpu的加速效果对比图
能够看到在矩阵比较小的时候,float32就有加速效果,float64的加速效果并不是很明显.
下面是一些局部的数据.
500_1_100
500_101_200
500_201_300
500_301_400
500_401_500
500_501_600
500_601_700
500_701_800
500_801_900
500_901_1000
5_1_3000
1_1_10000