A solution using Numba
CodeSurgeon already gave an excellent answer using Cython. In this answer I wan't to show an alternative way using Numba.
I have created three versions. In naive_numba
I only have added an function decorator. In improved_Numba
I have manually combined the loops (every vectorized command is actually a loop). In improved_Numba_p
I have parallelized the function. Please note that there is obviously a Bug not allowing to define constant values when using the pararallel accelerator. It has also be noted that the parallelized version is only beneficial for larger input arrays. But you can also add a small wrapper which calls the single threaded or the parallelized version according to the input array size.
Code dtype=float64
import numba as nb
import numpy as np
import time
@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
delta = u-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u/IceI;
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile
#error_model='numpy' sets divison by 0 to NaN instead of throwing a exception, this allows vectorization
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
res=np.empty(u.shape[0],dtype=u.dtype)
for i in range(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]
return res
#there is obviously a bug in Numba (declaring const values in the function)
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
res=np.empty((u.shape[0]),dtype=u.dtype)
for i in nb.prange(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);
x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]
return res
u=np.array(np.random.rand(1000000),dtype=np.float32)
PorosityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityIceProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityDustProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
#don't measure compilation overhead on first call
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)
for i in range(1000):
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)
print(time.time()-t1)
print(time.time()-t1)
Performance
Arraysize np.random.rand(100)
Numpy 46.8μs
naive Numba 3.1μs
improved Numba: 1.62μs
improved_Numba_p: 17.45μs
#Arraysize np.random.rand(1000000)
Numpy 255.8ms
naive Numba 18.6ms
improved Numba: 6.13ms
improved_Numba_p: 3.54ms
Code dtype=np.float32
If np.float32 is sufficient you have to explicitly declare all constant values in the function to float32. Otherwise Numba will use float64.
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6)
res=np.empty(u.shape[0],dtype=u.dtype)
for i in range(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)
x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]
return res
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
res=np.empty((u.shape[0]),dtype=u.dtype)
DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6)
for i in nb.prange(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)
x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]
return res
Performance
Arraysize np.random.rand(100).astype(np.float32)
Numpy 29.3μs
improved Numba: 1.33μs
improved_Numba_p: 18μs
Arraysize np.random.rand(1000000).astype(np.float32)
Numpy 117ms
improved Numba: 2.46ms
improved_Numba_p: 1.56ms
The comparison to the Cython version provided by @CodeSurgeon isn't really fair because he didn't compile the function with enabled AVX2 and FMA3 instructions. Numba compiles by default with -march=native which enables AVX2 and FMA3 instructions on my Core i7-4xxx.
But this makes sence if you wan't to distribute a compiled Cython version of your code, because it won't run by default on pre Haswell processors (or all Pentium and Celerons) if that optimizations are enabled. Compiling multiple code paths should be possible, but is compiler dependend and more work.