I'm trying to understand how numpy
can be so fast, based on my shocking comparison with optimized C/C++ code which is still far from reproducing numpy's speed.
Consider the following example:
Given a 2D array with shape=(N, N)
and dtype=float32
, which represents a list of N vectors of N dimensions, I am computing the pairwise differences between every pair of vectors. Using numpy
broadcasting, this simply writes as:
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
Using timeit
I can measure the performance for N=512
: it takes 88 ms per call on my laptop.
Now, in C/C++ a naive implementation writes as:
#define X(i, j) _X[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
float* pairwise_sub_naive( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
res(i,j,k) = X(i,k) - X(j,k);
}
}
return _res;
}
Compiling using gcc
7.3.0 with -O3
flag, I get 195 ms per call for pairwise_sub_naive(X)
, which is not too bad given the simplicity of the code, but about 2 times slower than numpy
.
Now I start getting serious and add some small optimizations, by indexing the row vectors directly:
float* pairwise_sub_better( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
const float* xi = & X(i,0);
for (int j = 0; j < N; j++) {
const float* xj = & X(j,0);
float* r = &res(i,j,0);
for (int k = 0; k < N; k++)
r[k] = xi[k] - xj[k];
}
}
return _res;
}
The speed stays the same at 195 ms, which means that the compiler was able to figure that much. Let's now use SIMD vector instructions:
float* pairwise_sub_simd( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
This only yields a small boost (178 ms instead of 194 ms per function call).
Then I was wondering if a "block-wise" approach, like what is used to optimize dot-products, could be beneficials:
float* pairwise_sub_blocks( const float* _X, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
#define B 8
float cache1[B*B], cache2[B*B];
for (int bi = 0; bi < N; bi+=B)
for (int bj = 0; bj < N; bj+=B)
for (int bk = 0; bk < N; bk+=B) {
// load first 8x8 block in the cache
for (int i = 0; i < B; i++)
for (int k = 0; k < B; k++)
cache1[B*i + k] = X(bi+i, bk+k);
// load second 8x8 block in the cache
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
cache2[B*j + k] = X(bj+j, bk+k);
// compute local operations on the caches
for (int i = 0; i < B; i++)
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
}
return _res;
}
And surprisingly, this is the slowest method so far (258 ms per function call).
To summarize, despite some efforts with some optimized C++ code, I can't come anywhere close the 88 ms / call that numpy achieves effortlessly. Any idea why?
Note: By the way, I am disabling numpy
multi-threading and anyway, this kind of operation is not multi-threaded.
Edit: Exact code to benchmark the numpy code:
import numpy as np
def pairwise_sub_numpy( X ):
return X - X[:, None, :]
N = 512
X = np.random.rand(N,N).astype(np.float32)
import timeit
times = timeit.repeat('pairwise_sub_numpy( X )', globals=globals(), number=1, repeat=5)
print(f">> best of 5 = {1000*min(times):.3f} ms")
Full benchmark for C code:
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h> // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <time.h>
#define X(i, j) _x[(i)*N + (j)]
#define res(i, j, k) _res[((i)*N + (j))*N + (k)]
float* pairwise_sub_naive( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++)
res(i,j,k) = X(i,k) - X(j,k);
}
}
return _res;
}
float* pairwise_sub_better( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
for (int i = 0; i < N; i++) {
const float* xi = & X(i,0);
for (int j = 0; j < N; j++) {
const float* xj = & X(j,0);
float* r = &res(i,j,0);
for (int k = 0; k < N; k++)
r[k] = xi[k] - xj[k];
}
}
return _res;
}
float* pairwise_sub_simd( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
// create caches for row vectors which are memory-aligned
float* xi = (float*)aligned_alloc(32, N * sizeof(float));
float* xj = (float*)aligned_alloc(32, N * sizeof(float));
for (int i = 0; i < N; i++) {
memcpy(xi, & X(i,0), N*sizeof(float));
for (int j = 0; j < N; j++) {
memcpy(xj, & X(j,0), N*sizeof(float));
float* r = &res(i,j,0);
for (int k = 0; k < N; k += 256/sizeof(float)) {
const __m256 A = _mm256_load_ps(xi+k);
const __m256 B = _mm256_load_ps(xj+k);
_mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
}
}
}
free(xi);
free(xj);
return _res;
}
float* pairwise_sub_blocks( const float* _x, int N )
{
float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
#define B 8
float cache1[B*B], cache2[B*B];
for (int bi = 0; bi < N; bi+=B)
for (int bj = 0; bj < N; bj+=B)
for (int bk = 0; bk < N; bk+=B) {
// load first 8x8 block in the cache
for (int i = 0; i < B; i++)
for (int k = 0; k < B; k++)
cache1[B*i + k] = X(bi+i, bk+k);
// load second 8x8 block in the cache
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
cache2[B*j + k] = X(bj+j, bk+k);
// compute local operations on the caches
for (int i = 0; i < B; i++)
for (int j = 0; j < B; j++)
for (int k = 0; k < B; k++)
res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
}
return _res;
}
int main()
{
const int N = 512;
float* _x = (float*) malloc( N * N * sizeof(float) );
for( int i = 0; i < N; i++)
for( int j = 0; j < N; j++)
X(i,j) = ((i+j*j+17*i+101) % N) / float(N);
double best = 9e9;
for( int i = 0; i < 5; i++)
{
struct timespec start, stop;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
//float* res = pairwise_sub_naive( _x, N );
//float* res = pairwise_sub_better( _x, N );
//float* res = pairwise_sub_simd( _x, N );
float* res = pairwise_sub_blocks( _x, N );
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop);
double t = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3; // in microseconds
if (t < best) best = t;
free( res );
}
printf("Best of 5 = %f ms
", best / 1000);
free( _x );
return 0;
}
Compiled using gcc 7.3.0 gcc -Wall -O3 -mavx -msse4.1 -o test_simd test_simd.c
Summary of timings on my machine: