你可以编写一个cython函数来加速计算,你要做的第一件事是获取fortran函数SPHJ的地址,这里是如何在Python中执行此操作:
from scipy import special as sp
sphj = sp.specfun.sphj
import ctypes
addr = ctypes.pythonapi.PyCObject_AsVoidPtr(ctypes.py_object(sphj._cpointer))
然后你可以直接在Cython中调用fortran函数,注意我使用prange()来使用multicore来加速计算:
%%cython -c-Ofast -c-fopenmp --link-args=-fopenmp
from cpython.mem cimport PyMem_Malloc,PyMem_Free
from cython.parallel import prange
import numpy as np
import cython
from cpython cimport PyCObject_AsVoidPtr
from scipy import special
ctypedef void (*sphj_ptr) (const int *n,const double *x,const int *nm,const double *sj,const double *dj) nogil
cdef sphj_ptr _sphj=PyCObject_AsVoidPtr(special.specfun.sphj._cpointer)
@cython.wraparound(False)
@cython.boundscheck(False)
def cython_sphj2(int n,double[::1] x):
cdef int count = x.shape[0]
cdef double * sj = PyMem_Malloc(count * sizeof(double) * (n + 1))
cdef double * dj = PyMem_Malloc(count * sizeof(double) * (n + 1))
cdef int * mn = PyMem_Malloc(count * sizeof(int))
cdef double[::1] res = np.empty(count)
cdef int i
if count < 100:
for i in range(x.shape[0]):
_sphj(&n,&x[i],mn + i,sj + i*(n+1),dj + i*(n+1))
res[i] = sj[i*(n+1) + n] #choose the element you want here
else:
for i in prange(count,nogil=True):
_sphj(&n,dj + i*(n+1))
res[i] = sj[i*(n+1) + n] #choose the element you want here
PyMem_Free(sj)
PyMem_Free(dj)
PyMem_Free(mn)
return res.base
比较一下,这是在forloop中调用sphj()的Python函数:
import numpy as np
def python_sphj(n,x):
sphj = special.specfun.sphj
res = np.array([sphj(n,v)[1][n] for v in x])
return res
以下是10个元素的%timit结果:
x = np.linspace(1,10)
r1 = cython_sphj2(4,x)
r2 = python_sphj(4,x)
assert np.allclose(r1,r2)
%timeit cython_sphj2(4,x)
%timeit python_sphj(4,x)
结果:
10000 loops,best of 3: 21.5 µs per loop
10000 loops,best of 3: 28.1 µs per loop
以下是100000个元素的结果:
x = np.linspace(1,100000)
r1 = cython_sphj2(4,x)
结果:
10 loops,best of 3: 44.7 ms per loop
1 loops,best of 3: 231 ms per loop