You can do this in CUBLAS as long as you use the "V2" API. The newer API includes a function cublasSetPointerMode
which you can use to set the API to assume that all routines which return a scalar value will be passed a device pointer rather than a host pointer. This is discussed in Section 2.4 of the latest CUBLAS documentation. For example:
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
int main(void)
{
const int nvals = 10;
const size_t sz = sizeof(double) * (size_t)nvals;
double x[nvals], y[nvals];
double *x_, *y_, *result_;
double result=0., resulth=0.;
for(int i=0; i<nvals; i++) {
x[i] = y[i] = (double)(i)/(double)(nvals);
resulth += x[i] * y[i];
}
cublasHandle_t h;
cublasCreate(&h);
cublasSetPointerMode(h, CUBLAS_POINTER_MODE_DEVICE);
cudaMalloc( (void **)(&x_), sz);
cudaMalloc( (void **)(&y_), sz);
cudaMalloc( (void **)(&result_), sizeof(double) );
cudaMemcpy(x_, x, sz, cudaMemcpyHostToDevice);
cudaMemcpy(y_, y, sz, cudaMemcpyHostToDevice);
cublasDdot(h, nvals, x_, 1, y_, 1, result_);
cudaMemcpy(&result, result_, sizeof(double), cudaMemcpyDeviceToHost);
printf("%f %f
", resulth, result);
cublasDestroy(h);
return 0;
}
Using CUBLAS_POINTER_MODE_DEVICE
makes cublasDdot
assume that result_
is a device pointer, and there is no attempt made to copy the result back to the host. Note that this makes routines like dot
asynchronous, so you might need to keep on eye on synchronization between device and host.
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…