python模拟对数计算函数_python – 快速对数计算

最新推荐文章于 2023-10-13 20:47:45 发布

weixin_39586683

最新推荐文章于 2023-10-13 20:47:45 发布

阅读量247

点赞数

文章标签： python模拟对数计算函数

注意：以下所有数据均为 float32，而不是双精度。

更新：

我已经完全放弃 gcc，改用英特尔的 icc。当性能至关重要、而你又没有时间去微调“编译器提示”来强制 gcc 进行向量化时（例如参见此处），它会带来天壤之别。

log_omp.c,

GCC：gcc -o log_omp.so -fopenmp log_omp.c -lm -O3 -fPIC -shared -std=c99

ICC：icc -o log_omp.so -openmp log_omp.c -lm -O3 -fPIC -shared -std=c99 -vec-report1 -xAVX -I/opt/intel/composer/mkl/include

#include <math.h>

#include "omp.h"

#include "mkl_vml.h"

#define restrict __restrict

inline void log_omp(int m, float * restrict a, float * restrict c);

void log_omp(int m, float * restrict a, float * restrict c)

{

int i;

#pragma omp parallel for default(none) shared(m,a,c) private(i)

for (i=0; i

a[i] = log(c[i]);

}

}

// VML / icc only:

/*
 * Element-wise natural logarithm of the m floats in c, written to a, using
 * the Intel MKL vector-math routine vsLn.
 *
 *   m  number of elements to process
 *   a  destination buffer (at least m floats)
 *   c  source buffer (at least m floats)
 *
 * The input is cut into split_to chunks of m / split_to elements each. The
 * OpenMP loop has one iteration per chunk and requests split_to threads.
 * The m % split_to leftover elements are processed by one extra call after
 * the parallel loop.
 */
void log_VML(int m, float * restrict a, float * restrict c)
{
    int i;
    int split_to = 14;
    int iter = m / split_to;
    int additional = m % split_to;

    /* When m < split_to the chunk size is 0: skip the loop entirely rather
     * than run a worksharing loop with a zero stride. */
    if (iter > 0) {
        #pragma omp parallel for default(none) shared(m,a,c, additional, iter) private(i) num_threads(split_to)
        for (i = 0; i < (m - additional); i += iter)
            /* vsLn (natural log) keeps this routine in agreement with
             * log_omp, which calls log(); vsLog10 would return base-10
             * values. vmsLn(..., VML_HA) is the variant that takes an
             * explicit accuracy mode. */
            vsLn(iter, c + i, a + i);
    }

    /* Tail that does not fill a whole chunk. */
    if (additional > 0)
        vsLn(additional, c + m - additional, a + m - additional);
}

在python：

from ctypes import CDLL, c_int, c_void_p

def log_omp(xs, out=None):
    """Compute the element-wise natural log of ``xs`` with the compiled C routine.

    Loads ``./log_omp.so`` and calls its ``log_omp(m, a, c)`` function, where
    ``a`` is the destination buffer and ``c`` is the source buffer.

    Args:
        xs: 1-D, C-contiguous ``float32`` NumPy array of inputs.
        out: Optional 1-D, C-contiguous ``float32`` array of the same length
            that receives the result. A new array is allocated when ``None``.

    Returns:
        The array holding the result (``out`` itself when it was supplied).

    Raises:
        ValueError: If ``out`` is given and its length differs from ``xs``.
        ctypes.ArgumentError: If an array does not match the declared
            dtype / rank / contiguity.
    """
    import numpy as np  # local import: the surrounding snippet only imports ctypes

    lib = CDLL('./log_omp.so')
    # Reject non-float32, non-1-D or non-contiguous arrays before they reach
    # C, where they would be read as a raw float buffer.
    float32_vector = np.ctypeslib.ndpointer(
        dtype=np.float32, ndim=1, flags='C_CONTIGUOUS'
    )
    lib.log_omp.argtypes = [c_int, float32_vector, float32_vector]
    # The C function returns void; ctypes expresses that as None.
    lib.log_omp.restype = None

    n = xs.shape[0]
    if out is None:
        out = np.empty(n, np.float32)
    elif out.shape[0] != n:
        # The C loop writes exactly n floats into the destination.
        raise ValueError(
            f"out has length {out.shape[0]}, expected {n} to match xs"
        )

    lib.log_omp(n, out, xs)
    return out

Cython 代码（在 IPython notebook 中运行，因此使用了 %% 魔法命令）：

%%cython --compile-args=-fopenmp --link-args=-fopenmp

import numpy as np

cimport numpy as np

from libc.math cimport log

from cython.parallel cimport prange

import cython

@cython.boundscheck(False)
def cylog(np.ndarray[np.float32_t, ndim=1] a not None,
          np.ndarray[np.float32_t, ndim=1] out=None):
    """Return the element-wise natural log of the float32 vector ``a``.

    The result is written into ``out`` when one is supplied; otherwise a new
    array with the length and dtype of ``a`` is allocated. The loop runs
    through ``prange`` with the GIL released.
    """
    cdef Py_ssize_t n = a.shape[0]
    cdef Py_ssize_t k

    if out is None:
        out = np.empty(n, dtype=a.dtype)

    # prange(..., nogil=True) releases the GIL for the duration of the loop.
    for k in prange(n, nogil=True):
        out[k] = log(a[k])

    return out

时序：

numexpr.detect_number_of_cores() // 2

28

%env OMP_NUM_THREADS=28

x = np.abs(np.random.randn(50000000).astype('float32'))

y = x.copy()

# GCC

%timeit log_omp(x, y)

10 loops, best of 3: 21.6 ms per loop

# ICC

%timeit log_omp(x, y)

100 loops, best of 3: 9.6 ms per loop

%timeit log_VML(x, y)

100 loops, best of 3: 10 ms per loop

%timeit cylog(x, out=y)

10 loops, best of 3: 21.7 ms per loop

numexpr.set_num_threads(28)

%timeit out = numexpr.evaluate('log(x)')

100 loops, best of 3: 13 ms per loop

所以,numexpr似乎比编译好的gcc代码做得更好,但是icc赢了.

以下是一些我觉得有用、并且厚着脸皮借用了其中代码的资源：

weixin_39586683

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。