I have this simple routine for dynamic global memory allocation:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// Per-block dynamic allocation demo using the in-kernel malloc()/free().
//
// Thread 0 of the block allocates one buffer of blockDim.x * 64 bytes and
// publishes the pointer to the rest of the block through a __shared__
// variable. Every thread then fills its own 64-byte share (interleaved by
// thread index), prints the shared pointer, and thread 0 frees the buffer.
//
// Launch layout: indexing uses threadIdx.x / blockDim.x only (1D blocks).
// Shared memory: one statically declared pointer.
__global__ void mallocTest()
{
    // Number of bytes of the block-wide buffer reserved for each thread.
    const size_t bytesPerThread = 64;

    // Written by thread 0 only, read by all threads after the barrier.
    __shared__ int* data;
    int t = threadIdx.x;

    // The first thread in the block does the allocation and then
    // shares the pointer with all other threads through shared memory.
    if (t == 0) {
        size_t size = blockDim.x * bytesPerThread;
        data = (int*)malloc(size);
    }
    // Make the pointer written by thread 0 visible to the whole block.
    __syncthreads();

    // Check for failure. `data` is shared, so every thread takes the same
    // branch here and no thread is left waiting at the later barrier.
    if (data == NULL)
        return;

    // Each thread owns bytesPerThread bytes, i.e. bytesPerThread / sizeof(int)
    // elements. Iterating 64 times (as an element count) would write
    // 64 * sizeof(int) bytes per thread and run past the end of the buffer.
    int* ptr = data;
    const int intsPerThread = (int)(bytesPerThread / sizeof(int));
    for (int i = 0; i < intsPerThread; ++i)
        ptr[i * blockDim.x + t] = t;  // consecutive threads hit consecutive ints
    printf("Thread %d got pointer: %p\n", t, ptr);

    // Ensure all threads have finished using the buffer before freeing it.
    __syncthreads();

    // Only one thread may free the memory!
    if (t == 0)
        free(data);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host entry point: sets the device malloc heap limit to 128 MiB, launches
// mallocTest with one block of 64 threads, and waits for it to finish.
// Returns 0 on success and EXIT_FAILURE if any CUDA call reports an error.
int main()
{
    if (!cudaCallSucceeded(
            cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128 * 1024 * 1024),
            "cudaDeviceSetLimit"))
        return EXIT_FAILURE;

    mallocTest<<<1, 64>>>();

    // A kernel launch returns nothing itself; query the launch status explicitly.
    if (!cudaCallSucceeded(cudaGetLastError(), "mallocTest launch"))
        return EXIT_FAILURE;

    // Errors raised while the kernel runs are reported by the synchronizing call.
    if (!cudaCallSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
        return EXIT_FAILURE;

    return 0;
}
It runs fine in both debug and release modes. But when I try to debug it with Nsight after setting some breakpoints, it fails: the monitor switches off for a few seconds and then recovers. Please see the following screenshot:
My system info: Windows 7, MSVS2010, CUDA 8.0, GT640 CC 3.0
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…