本文整理汇总了C++中dim3函数的典型用法代码示例。如果您正苦于以下问题:C++ dim3函数的具体用法?C++ dim3怎么用?C++ dim3使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了dim3函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。
示例1: run_add
bool run_add() {
constexpr size_t N = 64;
std::vector<T> host_input(N);
std::vector<T> host_expected(N);
for (int i = 0; i < N; ++i) {
host_input[i] = (T)i;
host_expected[i] = host_input[i] + host_input[i];
}
T* input1;
hipMalloc(&input1, N * sizeof(T));
hipMemcpy(input1, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice);
T* input2;
hipMalloc(&input2, N * sizeof(T));
hipMemcpy(input2, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice);
constexpr unsigned int blocks = 1;
constexpr unsigned int threads_per_block = 1;
hipLaunchKernelGGL(add<T>, dim3(blocks), dim3(threads_per_block), 0, 0, input1, input2, N);
hipMemcpy(host_input.data(), input1, host_input.size()*sizeof(T), hipMemcpyDeviceToHost);
bool equal = true;
for (int i = 0; i < N; i++) {
equal &= (host_input[i] == host_expected[i]);
}
return equal;
}
开发者ID:scchan,项目名称:hcc_perf,代码行数:32,代码来源:dl_common.hpp
示例2: test_gl2
/**
 * Runs one full-size vector addition through the vectorADD2 kernel and
 * hands the result to HipTest::checkVectorADD for validation.
 *
 * @param N number of int elements in each vector.
 * @return always 0.
 */
int test_gl2(size_t N) {
    int* devA;
    int* devB;
    int* devC;
    int* hostA;
    int* hostB;
    int* hostC;
    HipTest::initArrays(&devA, &devB, &devC, &hostA, &hostB, &hostC, N);

    const size_t byteCount = N * sizeof(int);
    const unsigned gridSize = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);

    // Upload both operands in one chunk each.
    HIPCHECK(hipMemcpy(devA, hostA, byteCount, hipMemcpyHostToDevice));
    HIPCHECK(hipMemcpy(devB, hostB, byteCount, hipMemcpyHostToDevice));

    hipLaunchKernel(vectorADD2, dim3(gridSize), dim3(threadsPerBlock), 0, 0, devA, devB, devC, N);

    // Fetch the sum, then wait for the device before checking on the host.
    HIPCHECK(hipMemcpy(hostC, devC, byteCount, hipMemcpyDeviceToHost));
    HIPCHECK(hipDeviceSynchronize());

    HipTest::checkVectorADD(hostA, hostB, hostC, N);
    return 0;
}
开发者ID:GPUOpen-ProfessionalCompute-Tools,项目名称:HIP,代码行数:29,代码来源:hipGridLaunch.cpp
示例3: run_rint
/**
 * Host-side check of the test_rint device kernel.
 *
 * Fills N doubles with 1.345, runs the kernel with one block of N threads,
 * and compares every result against the host's round() of the same input.
 *
 * @return true when all N results match; otherwise the assert fires (or
 *         false is returned when asserts are compiled out).
 */
bool run_rint() {
    double* A = new double[N];
    double* B = new double[N];
    for (int i = 0; i < N; i++) {
        A[i] = 1.345;
    }

    double* Ad;
    double* Bd;
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);

    // Require every element to match. The previous loop passed as soon as a
    // single element matched and always scanned 512 entries regardless of N.
    int passed = 1;
    for (int i = 0; i < N; i++) {
        double x = round(A[i]);
        if (B[i] != x) {
            passed = 0;
        }
    }

    delete[] A;
    delete[] B;
    hipFree(Ad);
    hipFree(Bd);

    assert(passed == 1);
    return passed == 1;
}
开发者ID:ssahasra,项目名称:HIP,代码行数:32,代码来源:hipTestDeviceDouble.cpp
示例4: main
/**
 * Minimal round trip: uploads one host int to the device, runs the Iter
 * kernel on a single thread, and copies the value back.
 */
int main(){
    int A = 0;
    int* Ad;
    hipMalloc((void**)&Ad, SIZE);

    // Host -> device, kernel, device -> host.
    hipMemcpy(Ad, &A, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, 0, Ad);
    hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost);
}
开发者ID:GPUOpen-ProfessionalCompute-Tools,项目名称:HIP,代码行数:7,代码来源:hipC.cpp
示例5: main
int main(int argc, char *argv[])
{ int warpSize, pshift;
hipDeviceProp_t devProp;
hipDeviceGetProperties(&devProp, 0);
if(strncmp(devProp.name,"Fiji",1)==0) {warpSize =64; pshift =6;}
else {warpSize =32; pshift =5;}
unsigned int Num_Threads_per_Block = 512;
unsigned int Num_Blocks_per_Grid = 1;
unsigned int Num_Warps_per_Block = Num_Threads_per_Block/warpSize;
unsigned int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize;
unsigned int* host_ballot = (unsigned int*)malloc(Num_Warps_per_Grid*sizeof(unsigned int));
unsigned int* device_ballot;
HIP_ASSERT(hipMalloc((void**)&device_ballot, Num_Warps_per_Grid*sizeof(unsigned int)));
int divergent_count =0;
for (int i=0; i<Num_Warps_per_Grid; i++) host_ballot[i] = 0;
HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyHostToDevice));
hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_ballot,Num_Warps_per_Block,pshift);
HIP_ASSERT(hipMemcpy(host_ballot, device_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyDeviceToHost));
for (int i=0; i<Num_Warps_per_Grid; i++) {
if ((host_ballot[i] == 0)||(host_ballot[i]/warpSize == warpSize)) std::cout << "Warp " << i << " IS convergent- Predicate true for " << host_ballot[i]/warpSize << " threads\n";
else {std::cout << "Warp " << i << " IS divergent - Predicate true for " << host_ballot[i]/warpSize<< " threads\n";
divergent_count++;}
}
if (divergent_count==1) printf("PASSED\n"); else printf("FAILED\n");
return EXIT_SUCCESS;
}
开发者ID:codeaudit,项目名称:HIP,代码行数:35,代码来源:hip_ballot.cpp
示例6: run_lround
bool run_lround(){
double *A, *Ad;
long int *B, *Bd;
A = new double[N];
B = new long int[N];
for(int i=0;i<N;i++){
A[i] = 1.345;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, N*sizeof(long int));
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, N*sizeof(long int), hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
long int x = round(A[i]);
if(B[i] == x){
passed = 1;
}
}
free(A);
if(passed == 1){
return true;
}
assert(passed == 1);
return false;
}
开发者ID:GPUOpen-ProfessionalCompute-Tools,项目名称:HIP,代码行数:27,代码来源:hipTestDeviceDouble.cpp
示例7: run_rnorm
bool run_rnorm(){
double *A, *Ad, *B, *Bd;
A = new double[N];
B = new double[N];
double val = 0.0;
for(int i=0;i<N;i++){
A[i] = 1.0;
B[i] = 0.0;
val += 1.0;
}
val = 1/sqrt(val);
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
if(B[0] - val < 0.000001){
passed = 1;
}
}
free(A);
if(passed == 1){
return true;
}
assert(passed == 1);
return false;
}
开发者ID:GPUOpen-ProfessionalCompute-Tools,项目名称:HIP,代码行数:29,代码来源:hipTestDeviceDouble.cpp
示例8: run_rnorm3d
bool run_rnorm3d(){
double *A, *Ad, *B, *Bd, *C, *Cd, *D, *Dd;
A = new double[N];
B = new double[N];
C = new double[N];
D = new double[N];
double val = 0.0;
for(int i=0;i<N;i++){
A[i] = 1.0;
B[i] = 2.0;
C[i] = 3.0;
}
val = 1/sqrt(1.0 + 4.0 + 9.0);
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Cd, SIZE);
hipMalloc((void**)&Dd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
if(D[i] - val < 0.000001){
passed = 1;
}
}
free(A);
if(passed == 1){
return true;
}
assert(passed == 1);
return false;
}
开发者ID:GPUOpen-ProfessionalCompute-Tools,项目名称:HIP,代码行数:35,代码来源:hipTestDeviceDouble.cpp
示例9: run_erfinv
bool run_erfinv(){
double *A, *Ad, *B, *Bd;
A = new double[N];
B = new double[N];
for(int i=0;i<N;i++){
A[i] = -0.6;
B[i] = 0.0;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
if(B[i] - A[i] < 0.000001){
passed = 1;
}
}
free(A);
if(passed == 1){
return true;
}
assert(passed == 1);
return false;
}
开发者ID:GPUOpen-ProfessionalCompute-Tools,项目名称:HIP,代码行数:26,代码来源:hipTestDeviceDouble.cpp
示例10: run_sincos
bool run_sincos(){
double *A, *Ad, *B, *C, *Bd, *Cd;
A = new double[N];
B = new double[N];
C = new double[N];
for(int i=0;i<N;i++){
A[i] = 1.0;
}
hipMalloc((void**)&Ad, SIZE);
hipMalloc((void**)&Bd, SIZE);
hipMalloc((void**)&Cd, SIZE);
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
hipLaunchKernel(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
int passed = 0;
for(int i=0;i<512;i++){
if(B[i] == sin(1.0)){
passed = 1;
}
}
passed = 0;
for(int i=0;i<512;i++){
if(C[i] == cos(1.0)){
passed = 1;
}
}
free(A);
if(passed == 1){
return true;
}
assert(passed == 1);
return false;
}
开发者ID:GPUOpen-ProfessionalCompute-Tools,项目名称:HIP,代码行数:34,代码来源:hipTestDeviceDouble.cpp
示例11: run
void run(size_t size, hipStream_t stream1, hipStream_t stream2){
float *Ah, *Bh, *Cd, *Dd, *Eh;
float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh;
HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
HIPCHECK(hipMalloc(&Cd, size));
HIPCHECK(hipMalloc(&Dd, size));
HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));
HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault));
HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault));
HIPCHECK(hipMalloc(&Cdd, size));
HIPCHECK(hipMalloc(&Ddd, size));
HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault));
HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1));
HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2));
HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1));
HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2));
hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd);
hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd);
HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1));
HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2));
HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1));
HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2));
HIPCHECK(hipDeviceSynchronize());
HIPASSERT(Eh[10] = Ah[10] + 1.0f);
HIPASSERT(Ehh[10] = Ahh[10] + 1.0f);
}
开发者ID:GPUOpen-ProfessionalCompute-Tools,项目名称:HIP,代码行数:29,代码来源:hipMultiThreadStreams2.cpp
示例12: main
int main(int argc, char *argv[])
{ int warpSize, pshift;
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
if(strncmp(devProp.name,"Fiji",1)==0)
{ warpSize =64;
pshift =6;
}
else {warpSize =32; pshift=5;}
int anycount =0;
int allcount =0;
int Num_Threads_per_Block = 1024;
int Num_Blocks_per_Grid = 1;
int Num_Warps_per_Block = Num_Threads_per_Block/warpSize;
int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize;
int * host_any = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
int * host_all = ( int*)malloc(Num_Warps_per_Grid*sizeof(int));
int *device_any;
int *device_all;
HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int)));
HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int)));
for (int i=0; i<Num_Warps_per_Grid; i++)
{
host_any[i] = 0;
host_all[i] = 0;
}
HIP_ASSERT(hipMemcpy(device_any, host_any,sizeof(int), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(device_all, host_all,sizeof(int), hipMemcpyHostToDevice));
hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_any, device_all ,Num_Warps_per_Block,pshift);
HIP_ASSERT(hipMemcpy(host_any, device_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
HIP_ASSERT(hipMemcpy(host_all, device_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost));
for (int i=0; i<Num_Warps_per_Grid; i++) {
printf("warp no. %d __any = %d \n",i,host_any[i]);
printf("warp no. %d __all = %d \n",i,host_all[i]);
if (host_all[i]!=1) ++allcount;
#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT )
if (host_any[i]!=64) ++anycount;
#else
if (host_any[i]!=1) ++anycount;
#endif
}
#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT )
if (anycount == 1 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n");
#else
if (anycount == 0 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n");
#endif
return EXIT_SUCCESS;
}
开发者ID:GPUOpen-ProfessionalCompute-Tools,项目名称:HIP,代码行数:57,代码来源:hip_anyall.cpp
示例13: main
int main() {
hipLaunchKernelGGL(
compileDoublePrecisionMathOnDevice,
dim3(1, 1, 1),
dim3(1, 1, 1),
0,
0,
1);
passed();
}
开发者ID:ssahasra,项目名称:HIP,代码行数:10,代码来源:hipDoublePrecisionMathDevice.cpp
示例14: operator
/**
 * Chooses a launch configuration for an x * y * z problem.
 *
 * The block is 128x4x1 when y is at least 4 and 512x1x1 otherwise; the grid
 * is the problem extent divided by the block extent, rounded up, per axis.
 *
 * @param grid_dim  receives the grid dimensions.
 * @param block_dim receives the block dimensions.
 * @param x, y, z   problem extent along each axis.
 */
void operator()(dim3 *grid_dim, dim3 *block_dim, int x, int y, int z)
{
    // Spread the block over y only when there are enough rows to fill it.
    *block_dim = (y >= 4) ? dim3(128, 4, 1) : dim3(512, 1, 1);

    const dim3& chosen = *block_dim;
    grid_dim->x = divide_and_round_up(x, chosen.x);
    grid_dim->y = divide_and_round_up(y, chosen.y);
    grid_dim->z = divide_and_round_up(z, chosen.z);
}
开发者ID:shifty91,项目名称:libflatarray,代码行数:12,代码来源:generate_cuda_launch_config.hpp
示例15: ArrangePrefer3dLocality
/**
 * Picks an 8x8x8 thread block and the smallest grid of such blocks that
 * covers `volume_size` along every axis.
 *
 * @param grid        receives the grid dimensions; nothing is written if null.
 * @param block       receives the block dimensions; nothing is written if null.
 * @param volume_size extent of the volume to cover.
 */
void BlockArrangement::ArrangePrefer3dLocality(dim3* grid, dim3* block,
                                               const uint3& volume_size)
{
    if (grid && block) {
        const int edge = 8;  // block extent, identical on all three axes

        *block = dim3(edge, edge, edge);

        // Round each axis up so a partially filled block is still launched.
        *grid = dim3((volume_size.x + edge - 1) / edge,
                     (volume_size.y + edge - 1) / edge,
                     (volume_size.z + edge - 1) / edge);
    }
}
开发者ID:snowlesswinter,项目名称:fluid3d,代码行数:13,代码来源:block_arrangement.cpp
示例16: runTest
/**
 * Runs testKernel on device 0 and validates the result.
 *
 * An 11-int result buffer is zeroed on the host (entries 8 and 10 are set
 * to 0xff instead), uploaded, handed to the kernel launched as 64 blocks of
 * 256 threads, read back, and passed to computeGold together with the total
 * thread count. The verdict is stored in `testResult`.
 *
 * @param argc unused.
 * @param argv unused.
 */
void runTest(int argc, char **argv)
{
    const int dev = 0;

    hipDeviceProp_t deviceProp;
    deviceProp.major = 0;
    deviceProp.minor = 0;
    hipDeviceGetProperties(&deviceProp, dev);

    // Report basic device statistics.
    printf("> GPU device has %d Multi-Processors, "
           "SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    // Packed major/minor version; not used further in this function.
    int version = (deviceProp.major * 0x10 + deviceProp.minor);

    const unsigned int numThreads = 256;
    const unsigned int numBlocks = 64;
    const unsigned int numData = 11;
    const unsigned int memSize = sizeof(int) * numData;

    // Host-side result buffer, cleared before upload.
    int *hOData = (int *) malloc(memSize);
    for (unsigned int idx = 0; idx != numData; ++idx) {
        hOData[idx] = 0;
    }

    // Non-zero seeds so the AND and XOR tests produce something other than 0.
    hOData[10] = 0xff;
    hOData[8] = 0xff;

    // Device-side result buffer, initialised from the host copy.
    int *dOData;
    hipMalloc((void **) &dOData, memSize);
    hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice);

    hipLaunchKernel(testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);

    // Bring the results back and compare them with the reference solution.
    hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost);
    testResult = computeGold(hOData, numThreads * numBlocks);

    free(hOData);
    hipFree(dOData);
}
开发者ID:sutt0n,项目名称:HIP,代码行数:50,代码来源:hipSimpleAtomicsTest.cpp
示例17: run_rnorm4d
/**
 * Host-side check of the test_rnorm4d device kernel.
 *
 * Inputs are constant vectors (1.0, 2.0, 3.0, 4.0), so the reference value
 * is 1/sqrt(1 + 4 + 9 + 16). The kernel runs with one block of N threads,
 * and every element of the result is compared with the reference within 1e-6.
 *
 * @return true when all N results are within tolerance; otherwise the
 *         assert fires (or false is returned when asserts are compiled out).
 */
bool run_rnorm4d() {
    double* A = new double[N];
    double* B = new double[N];
    double* C = new double[N];
    double* D = new double[N];
    double* E = new double[N];
    for (int i = 0; i < N; i++) {
        A[i] = 1.0;
        B[i] = 2.0;
        C[i] = 3.0;
        D[i] = 4.0;
    }
    const double val = 1 / sqrt(1.0 + 4.0 + 9.0 + 16.0);

    double *Ad, *Bd, *Cd, *Dd, *Ed;
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMalloc((void**)&Dd, SIZE);
    hipMalloc((void**)&Ed, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
    hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);

    // Every element must be within a two-sided tolerance. The old check was
    // one-sided, passed on a single match, and scanned 512 entries instead of N.
    int passed = 1;
    for (int i = 0; i < N; i++) {
        const double diff = E[i] - val;
        if (!(diff < 0.000001 && diff > -0.000001)) {
            passed = 0;
        }
    }

    delete[] A;
    delete[] B;
    delete[] C;
    delete[] D;
    delete[] E;
    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);
    hipFree(Dd);
    hipFree(Ed);

    assert(passed == 1);
    return passed == 1;
}
开发者ID:ssahasra,项目名称:HIP,代码行数:50,代码来源:hipTestDeviceDouble.cpp
示例18: windowResize
/**
 * Reacts to a window resize: records the new image size, recomputes the
 * kernel launch geometry and recreates the OpenGL texture.
 *
 * @param width  new window width.
 * @param height new window height.
 */
void CUDARenderer::windowResize(int width, int height)
{
    skipStep = true;

    // Propagate the new size to the image info and the camera.
    h_info.setWidthHeight(width, height);
    camera.width = h_info.width;
    camera.height = h_info.height;
    setImageInfo(h_info);

    // One THREAD_DIM x THREAD_DIM block per tile; add one extra block along
    // an axis whenever the image size is not a multiple of the block size.
    dimBlock = dim3(THREAD_DIM, THREAD_DIM, 1);
    const unsigned int tilesX =
        h_info.width / dimBlock.x + (h_info.width % dimBlock.x > 0);
    const unsigned int tilesY =
        h_info.height / dimBlock.y + (h_info.height % dimBlock.y > 0);
    dimGrid = dim3(tilesX, tilesY, 1);

    // Replace the texture with a fresh one.
    deleteTexture();
    OGLtexture = Texture2D(GL_TEXTURE_2D);
    initTexture();
}
开发者ID:Aloalo,项目名称:RTRT,代码行数:14,代码来源:CUDARenderer.cpp
示例19: initCUDA
// Prepares the CUDA side of the renderer: selects device 0, uploads the
// scene, publishes the image info, sets the stack-size limit, computes the
// launch geometry and initialises the snapshot helper. Every CUDA call is
// wrapped in gpuErrchk.
void CUDARenderer::initCUDA()
{
// Use device 0 both for CUDA work and for OpenGL interop.
gpuErrchk(cudaSetDevice(0));
gpuErrchk(cudaGLSetGLDevice(0));
// Allocate device storage for one SlowScene and copy the host scene into it.
gpuErrchk(cudaMalloc((void **)&d_scene, sizeof(SlowScene)));
gpuErrchk(cudaMemcpy(d_scene, &h_scene, sizeof(SlowScene), cudaMemcpyHostToDevice));
setImageInfo(h_info);
// Request a stack-size limit of 1024 * 10 bytes.
// NOTE(review): cuCtxSetLimit is a driver-API call while the calls above are
// runtime-API calls; confirm that gpuErrchk accepts its return type.
gpuErrchk(cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 1024 * 10));
// THREAD_DIM x THREAD_DIM blocks; one extra block along an axis whenever the
// image size is not a multiple of the block size.
dimBlock = dim3(THREAD_DIM, THREAD_DIM, 1);
dimGrid = dim3(h_info.width / dimBlock.x + (h_info.width % dimBlock.x > 0), h_info.height / dimBlock.y + (h_info.height % dimBlock.y > 0), 1);
snapshot.init();
}
开发者ID:Aloalo,项目名称:RTRT,代码行数:15,代码来源:CUDARenderer.cpp
示例20: main
// Multi-device test of hipMemcpyDtoDAsync. With at least two devices it
// (1) runs vectorADD on device 0 and checks the result, then (2) copies the
// two input buffers into buffers allocated on device 1 through a stream,
// runs vectorADD there and checks that result as well. With fewer than two
// devices the body is skipped and passed() is still called.
int main() {
size_t Nbytes = N * sizeof(int);
int numDevices = 0;
int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
int *A_h, *B_h, *C_h;
hipStream_t s;
HIPCHECK(hipGetDeviceCount(&numDevices));
if (numDevices > 1) {
// Device 0 is current while HipTest::initArrays sets up A/B/C.
// NOTE(review): presumably initArrays allocates and fills both the host and
// device arrays; its definition is not visible here.
HIPCHECK(hipSetDevice(0));
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
// Device 1 is current while X/Y/Z are allocated.
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMalloc(&X_d, Nbytes));
HIPCHECK(hipMalloc(&Y_d, Nbytes));
HIPCHECK(hipMalloc(&Z_d, Nbytes));
// Phase 1: reference run with device 0 current.
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
// Phase 2: with device 1 current, copy the inputs A_d/B_d into X_d/Y_d on
// stream s.
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));
// The kernel is launched with stream argument 0, not on stream s.
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
// Read the device-1 result back into C_h and wait for both the stream and
// the device before checking on the host.
HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
HIPCHECK(hipStreamSynchronize(s));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
// Tear down the stream and every buffer created above.
HIPCHECK(hipStreamDestroy(s));
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
HIPCHECK(hipFree(Y_d));
HIPCHECK(hipFree(Z_d));
}
passed();
}
开发者ID:ssahasra,项目名称:HIP,代码行数:48,代码来源:hipMemcpyDtoDAsync.cpp
注:本文中的dim3函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论