python模拟对数计算函数_python – 快速对数计算

注意:以下所有数据均为 float32(单精度),而不是双精度。

更新:

我已经完全放弃 gcc,转而使用英特尔的 icc。当性能至关重要、而您又没有时间微调“编译器提示”来强制 gcc 进行向量化时(参见例如 here),这会带来天壤之别。

log_omp.c,

GCC:gcc -o log_omp.so -fopenmp log_omp.c -lm -O3 -fPIC -shared -std=c99

ICC:icc -o log_omp.so -openmp log_omp.c -lm -O3 -fPIC -shared -std=c99 -vec-report1 -xAVX -I/opt/intel/composer/mkl/include

#include <math.h>

#include "omp.h"

#include "mkl_vml.h"

#define restrict __restrict

inline void log_omp(int m, float * restrict a, float * restrict c);

void log_omp(int m, float * restrict a, float * restrict c)

{

int i;

#pragma omp parallel for default(none) shared(m,a,c) private(i)

for (i=0; i

a[i] = log(c[i]);

}

}

// VML / icc only:

/*
 * Element-wise natural logarithm using Intel MKL's vector math routine vsLn:
 * a[i] = ln(c[i]) for every i in [0, m).
 *
 *   m - number of elements to process
 *   a - output buffer, written at indices 0 .. m-1
 *   c - input buffer, read at indices 0 .. m-1
 *
 * The first (m - m % split_to) elements are cut into split_to chunks of
 * `iter` elements each; every chunk is handed to one vsLn call inside an
 * OpenMP parallel-for that requests split_to threads. The `additional`
 * leftover elements (m % split_to) are handled by one final call after the
 * parallel loop.
 *
 * vsLn (natural log) is used rather than vsLog10 so that the results agree
 * with log_omp, which calls log().
 */
void log_VML(int m, float * restrict a, float * restrict c)
{
    int i;
    int split_to = 14;
    int iter = m / split_to;
    int additional = m % split_to;

    /* When m < split_to, iter is 0 and m - additional is 0, so the loop
     * body never runs and everything falls through to the tail call. */
#pragma omp parallel for default(none) shared(m,a,c, additional, iter) private(i) num_threads(split_to)
    for (i = 0; i < (m - additional); i += iter) {
        vsLn(iter, c + i, a + i);
    }

    if (additional > 0) {
        vsLn(additional, c + m - additional, a + m - additional);
    }
}

在python:

from ctypes import CDLL, c_int, c_void_p

def log_omp(xs, out=None):
    """Compute the element-wise natural log of *xs* via ``./log_omp.so``.

    Args:
        xs: 1-D, C-contiguous ``float32`` NumPy array of inputs.
        out: Optional 1-D, C-contiguous ``float32`` array of the same shape
            as *xs* that receives the result. A new array is allocated
            when omitted.

    Returns:
        The array holding the results (``out`` itself when it was given).

    Raises:
        ValueError: If *out* is given but its shape differs from *xs*.
        ctypes.ArgumentError: If an array does not match the declared
            dtype / ndim / contiguity.
    """
    # Local import: the snippet's import line only pulls in ctypes names.
    import numpy as np

    # Require contiguous 1-D float32 buffers; the C loop indexes raw memory.
    float_vec = np.ctypeslib.ndpointer(
        dtype=np.float32, ndim=1, flags='C_CONTIGUOUS'
    )

    lib = CDLL('./log_omp.so')
    lib.log_omp.argtypes = [c_int, float_vec, float_vec]
    lib.log_omp.restype = None  # the C function is declared void

    n = xs.shape[0]
    if out is None:
        out = np.empty(n, np.float32)
    elif out.shape != xs.shape:
        # The C side writes n floats; a shorter buffer would be overrun.
        raise ValueError("out must have the same shape as xs")

    # C signature is log_omp(m, a, c): a is the output, c is the input.
    lib.log_omp(n, out, xs)
    return out

Cython代码(在ipython笔记本中,因此%%魔术):

%%cython --compile-args=-fopenmp --link-args=-fopenmp

import numpy as np

cimport numpy as np

from libc.math cimport log

from cython.parallel cimport prange

import cython

@cython.boundscheck(False)
def cylog(np.ndarray[np.float32_t, ndim=1] a not None,
          np.ndarray[np.float32_t, ndim=1] out=None):
    """Compute the element-wise natural log of *a* in parallel.

    Args:
        a: 1-D ``float32`` array of inputs.
        out: Optional 1-D ``float32`` array of the same length as *a* that
            receives the result. A new array is allocated when omitted.

    Returns:
        The array holding the results (``out`` itself when it was given).

    Raises:
        ValueError: If *out* is given but its length differs from *a*.
    """
    cdef Py_ssize_t i
    cdef Py_ssize_t n = a.shape[0]

    if out is None:
        out = np.empty(n, dtype=a.dtype)
    elif out.shape[0] != n:
        # Bounds checking is disabled above, so a shorter buffer would be
        # written past its end; reject it up front.
        raise ValueError("out must have the same length as a")

    # prange releases the GIL and splits the iterations across OpenMP threads.
    for i in prange(n, nogil=True):
        out[i] = log(a[i])

    return out

时序:

numexpr.detect_number_of_cores() // 2

28

%env OMP_NUM_THREADS=28

x = np.abs(np.random.randn(50000000).astype('float32'))

y = x.copy()

# GCC

%timeit log_omp(x, y)

10 loops, best of 3: 21.6 ms per loop

# ICC

%timeit log_omp(x, y)

100 loops, best of 3: 9.6 ms per loop

%timeit log_VML(x, y)

100 loops, best of 3: 10 ms per loop

%timeit cylog(x, out=y)

10 loops, best of 3: 21.7 ms per loop

numexpr.set_num_threads(28)

%timeit out = numexpr.evaluate('log(x)')

100 loops, best of 3: 13 ms per loop

所以,numexpr似乎比编译好的gcc代码做得更好,但是icc赢了.

以下是一些我觉得有用、并且厚着脸皮借用了其中代码的资源:

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值