# 高性能计算的矩阵乘法优化 - Python + OpenMP实现

### 0. 引言

OpenMP是一种面向C/C++/Fortran的共享内存并行编程标准。严谨地说，在Python解释器层面直接使用OpenMP是不可能的，因为Python本身是一门解释型语言。

• 在可交互级别的转化：一些第三方包pymp，pyopenmp，使用fork脱离GIL

• 在编译阶段级别的转化：使用Cython直接编写C code，这样写出来的module会被编译成C扩展，绕开解释器逐行执行的开销，并可以在C层面启用OpenMP并行优化。

import numpy as np
from functools import wraps
import time

def generate_example_matrix(h, w):
    """Build an h x w tridiagonal matrix (int32): 2 on the main diagonal,
    -1 on the sub- and super-diagonals, 0 elsewhere.

    Bug fix: the original computed each shifted-eye mask but never wrote
    the band values into ``example_data``, so it always returned a zero
    matrix.  The mask is now used to assign ``_vs[i]`` onto each band.
    """
    _vs = (-1, 2, -1)  # values for the sub-, main and super-diagonal
    _i = -1  # diagonal offset of the first band (the sub-diagonal)
    example_data = np.zeros([h, w], dtype=np.int32)
    for i in range(3):
        # Boolean mask selecting the diagonal at offset _i + i.
        example_data_eye_mask = np.eye(h, w, _i + i, dtype=np.bool_)
        example_data[example_data_eye_mask] = _vs[i]
    return example_data

def generate_example_vector(w):
    """Return a length-``w`` vector repeating the pattern 1, 2, 3.

    The vector is built from ``w // 3`` full [1, 2, 3] blocks followed by
    a partial tail (1 or 1, 2) when ``w`` is not a multiple of 3.
    """
    full_blocks, remainder = divmod(int(w), 3)
    example_vector = np.repeat([[1, 2, 3]], full_blocks, axis=0).ravel()
    if remainder > 0:
        tail_vec = np.array(list(range(1, remainder + 1)), dtype=np.int32)
        example_vector = np.concatenate([example_vector, tail_vec], axis=0)
    return example_vector



def naive_method(example_matrix, example_vector):
    """Single-threaded matrix-vector product using pure-Python loops.

    Each entry of the result is the dot product of one matrix row with
    ``example_vector``; this is the O(h*w) baseline the article times.
    """
    h, w = example_matrix.shape
    products = [
        sum(example_matrix[hi, wi] * example_vector[wi] for wi in range(w))
        for hi in range(h)
    ]
    return np.array(products)



from utils import generate_example_matrix, generate_example_vector
import pymp
import time
import numpy as np

def naive_method(example_matrix, example_vector):
    """Baseline serial matrix-vector multiply (pure-Python double loop)."""
    out = []
    n_rows, n_cols = example_matrix.shape
    for row_idx in range(n_rows):
        row = example_matrix[row_idx, :]
        acc = 0
        for col_idx in range(n_cols):
            acc += row[col_idx] * example_vector[col_idx]
        out.append(acc)
    return np.array(out)

def main():
    """Generate a 5000 x 5000 example problem and time the serial solver."""
    start_time = time.perf_counter()
    h = w = 5000
    print('--- Current matrix scale is: {} x {} ---'.format(h, w))
    matrix = generate_example_matrix(h, w)
    vector = generate_example_vector(w)
    _ = naive_method(matrix, vector)  # result unused; we only time it
    end_time = time.perf_counter()
    print('single method used time is: {:.2f}s\n'.format(end_time - start_time))


if __name__ == '__main__':
    main()


### 1. 在可交互级别的转化

# Serial baseline: fill a plain NumPy array one element at a time,
# reporting progress for every index.
ex_array = np.zeros((100,), dtype='uint8')

for index in range(100):
    ex_array[index] = 1
    print('Yay! {} done!'.format(index))



import pymp

# Shared-memory buffer: the fork()-based worker processes all write into
# the same storage (plain np.zeros would be copied per process).
ex_array = pymp.shared.array((100,), dtype='uint8')

# Fork 4 workers; p.range splits the iteration range among them,
# OpenMP "parallel for" style, so each index is handled exactly once.
with pymp.Parallel(4) as p:
    for index in p.range(0, 100):
        ex_array[index] = 1
        # The parallel print function takes care of asynchronous output.
        p.print('Yay! {} done!'.format(index))


#### 1.1 OpenMP Style的改写

from utils import generate_example_matrix, generate_example_vector
import pymp
import time
import numpy as np

# NOTE(review): this is an illustrative excerpt — it only makes sense inside
# `with pymp.Parallel(threads_num) as p:` with `example_matrix` and
# `example_vector` in scope; as written at module level, `p` and
# `example_matrix` are undefined.
# result = pymp.shared.list()
# NOTE(review): a plain list is per-process after fork; appends from workers
# are not visible to the parent — the shared list above is likely needed.
result = []
h, w = example_matrix.shape
for hi in p.range(0, h):
    colv = example_matrix[hi, :]
    temp = 0
    # NOTE(review): p.range on the inner loop splits each row's dot product
    # across workers too, so each worker's `temp` holds only a partial sum;
    # presumably only the outer (row) loop should be parallelized — confirm.
    for wi in p.range(0, w):
        temp += colv[wi] * example_vector[wi]
    result.append(temp)

def main():
    """Generate the example problem and time the pymp-parallel multiply.

    Bug fixes vs. the original:
    - the parallel computation now actually runs (and is timed) inside
      main(); before, only matrix/vector generation fell between the two
      perf_counter() calls and ``threads_num`` was never used;
    - the ``<- Threads num is: N ->`` line shown in the experiment logs
      is printed;
    - results are collected in a ``pymp.shared.list`` so the fork()-based
      workers' appends are visible to the parent process.
    """
    start_time = time.perf_counter()
    h = 50000
    w = 50000
    threads_num = 200  # 250, 200, 100, 50, 25, 16, 8
    print('--- Current matrix scale is: {} x {} ---'.format(h, w))
    print('<- Threads num is: {} ->'.format(threads_num))
    example_matrix = generate_example_matrix(h, w)
    example_vector = generate_example_vector(w)
    result = pymp.shared.list()
    with pymp.Parallel(threads_num) as p:
        # Parallelize over rows only: each worker computes complete dot
        # products, so there are no partial-sum races.  NOTE(review):
        # appends from different workers arrive in nondeterministic row
        # order — acceptable here since only the wall time is reported.
        for hi in p.range(0, h):
            colv = example_matrix[hi, :]
            temp = 0
            for wi in range(w):
                temp += colv[wi] * example_vector[wi]
            result.append(temp)
    end_time = time.perf_counter()
    print('multi-thread method used time is: {:.2f}s\n'.format(end_time - start_time))
    return np.array(result)


if __name__ == '__main__':
    main()


#### 1.2 实验对比：

11th Gen Intel® Core™ i5-11600K @ 3.90GHz，此外，我超频到了4.5GHz

--- Current matrix scale is: 50000 x 50000 ---
single method used time is: 475.64s


Baseline：475秒

8线程

--- Current matrix scale is: 50000 x 50000 ---
<- Threads num is: 8 ->
multi-thread method used time is: 19.46s


T8：19秒

16线程：13秒

--- Current matrix scale is: 50000 x 50000 ---
<- Threads num is: 16 ->
multi-thread method used time is: 13.28s


T16：13秒

25线程

--- Current matrix scale is: 50000 x 50000 ---
<- Threads num is: 25 ->
multi-thread method used time is: 11.12s


T25：11秒

50线程

--- Current matrix scale is: 50000 x 50000 ---
<- Threads num is: 50 ->
multi-thread method used time is: 9.18s


T50：9.18秒

100线程

--- Current matrix scale is: 50000 x 50000 ---
<- Threads num is: 100 ->
multi-thread method used time is: 8.27s


T100：8.27秒

200线程

--- Current matrix scale is: 50000 x 50000 ---
<- Threads num is: 200 ->
multi-thread method used time is: 8.23s


T200：8.23秒

250线程

--- Current matrix scale is: 50000 x 50000 ---
<- Threads num is: 250 ->
multi-thread method used time is: 8.47s


T250：8.47秒

• 0
点赞
• 12
收藏
觉得还不错? 一键收藏
• 7
评论
06-17
03-29

### “相关推荐”对你有帮助么？

• 非常没帮助
• 没帮助
• 一般
• 有帮助
• 非常有帮助

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。