So i have a illegal memory access was encountered error when i try to launch some kernels in a loop like this:
for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
{
// Extract bits
extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
// Compute nOnesBefore
scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
int zero = 0;
CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));
// Compute rank
computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
// Swap d_in and d_out
uint32_t* temp = d_in;
d_in = d_out;
d_out = temp;
}
And i debug and found out this ‘extractBits’ function was calculate wrong. When I try to print to see problem, the error rarely happen. So i realize the more delay i add, the less chance error happen. I add something like this and it run fine without error:
for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
{
// Extract bits
extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
// Dummy copy to delay, do nothing.
CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));
// Compute nOnesBefore
scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
int zero = 0;
CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));
// Compute rank
computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
// Swap d_in and d_out
uint32_t* temp = d_in;
d_in = d_out;
d_out = temp;
}
Here is source code(it only can run with gpu like T4 on colab, P100 can't):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#define CHECK(call)
{
const cudaError_t error = call;
if (error != cudaSuccess)
{
fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);
fprintf(stderr, "code: %d, reason: %s
", error,
cudaGetErrorString(error));
exit(1);
}
}
struct GpuTimer
{
cudaEvent_t start;
cudaEvent_t stop;
GpuTimer()
{
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
~GpuTimer()
{
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
void Start()
{
cudaEventRecord(start, 0);
cudaEventSynchronize(start);
}
void Stop()
{
cudaEventRecord(stop, 0);
}
float Elapsed()
{
float elapsed;
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
return elapsed;
}
};
// Sequential Radix Sort
// "const uint32_t * in" means: the memory region pointed by "in" is read-only
void sortByHost(const uint32_t * in, int n,
uint32_t * out)
{
int * bits = (int *)malloc(n * sizeof(int));
int * nOnesBefore = (int *)malloc(n * sizeof(int));
uint32_t * src = (uint32_t *)malloc(n * sizeof(uint32_t));
uint32_t * originalSrc = src; // To free memory later
memcpy(src, in, n * sizeof(uint32_t));
uint32_t * dst = out;
// Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
// In each loop, sort elements according to the current bit from src to dst
// (using STABLE counting sort)
for (int bitIdx = 0; bitIdx < sizeof(uint32_t) * 8; bitIdx++)
{
// Extract bits
for (int i = 0; i < n; i++)
bits[i] = (src[i] >> bitIdx) & 1;
// Compute nOnesBefore
nOnesBefore[0] = 0;
for (int i = 1; i < n; i++)
nOnesBefore[i] = nOnesBefore[i-1] + bits[i-1];
// Compute rank and write to dst
int nZeros = n - nOnesBefore[n-1] - bits[n-1];
for (int i = 0; i < n; i++)
{
int rank;
if (bits[i] == 0)
rank = i - nOnesBefore[i];
else
rank = nZeros + nOnesBefore[i];
dst[rank] = src[i];
}
// Swap src and dst
uint32_t * temp = src;
src = dst;
dst = temp;
}
// Does out array contain results?
memcpy(out, src, n * sizeof(uint32_t));
// Free memory
free(originalSrc);
free(bits);
free(nOnesBefore);
}
__global__ void extractBits(uint32_t* in, int n, int* out, int bitId)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
out[i] = (in[i] >> bitId) & 1;
}
__device__ int bCount = 0;
volatile __device__ int bDoneCount = 0;
__global__ void scanKernel(int* in, int n, int* out, volatile int* bSums)
{
__shared__ int blockId;
if (threadIdx.x == 0)
{
blockId = atomicAdd(&bCount, 1);
}
__syncthreads();
// 1. Each block loads data from GMEM to SMEM
extern __shared__ int s_data[];
int i = blockId * blockDim.x + threadIdx.x;
if (i < n)
{
if (i == 0)
s_data[threadIdx.x] = 0;
else
s_data[threadIdx.x] = in[i - 1];
__syncthreads();
// 2. Each block does scan with data on SMEM
for (int stride = 1; stride < blockDim.x; stride *= 2)
{
int neededVal;
if (threadIdx.x >= stride)
neededVal = s_data[threadIdx.x - stride];
__syncthreads();
if (threadIdx.x >= stride)
s_data[threadIdx.x] += neededVal;
__syncthreads();
}
// 3. Each block write results from SMEM to GMEM
out[i] = s_data[threadIdx.x];
if (bSums != NULL)
{
if (threadIdx.x == 0)
{
bSums[blockId] = s_data[blockDim.x - 1];
if (blockId > 0)
{
while (bDoneCount < blockId) {}
bSums[blockId] += bSums[blockId - 1];
__threadfence();
}
bDoneCount += 1;
}
__syncthreads();
if (i + blockDim.x < n)
out[i + blockDim.x] += bSums[blockId];
}
}
}
__global__ void computeRank(uint32_t* in, int n, uint32_t* out, int* bits, int* nOnesBefore)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int nZeros = n - nOnesBefore[n - 1] - bits[n - 1];
if (i < n)
{
int rank;
if (bits[i] == 0)
rank = i - nOnesBefore[i];
else
rank = nZeros + nOnesBefore[i];
out[rank] = in[i];
}
}
// Parallel Radix Sort
void sortByDevice(const uint32_t * in, int n, uint32_t * out, int blockSize)
{
uint32_t* src = (uint32_t*)malloc(n * sizeof(uint32_t));
// TODO
int *d_bits, *d_nOnesBefore;
size_t nBytes = n * sizeof(int);
CHECK(cudaMalloc(&d_bits, nBytes));
CHECK(cudaMalloc(&d_nOnesBefore, nBytes));
uint32_t * d_in, * d_out;
nBytes = n * sizeof(uint32_t);
CHECK(cudaMalloc(&d_in, nBytes));
CHECK(cudaMalloc(&d_out, nBytes));
CHECK(cudaMemcpy(d_in, in, nBytes, cudaMemcpyHostToDevice));
int gridSize = (n - 1) / blockSize + 1;
int* d_bSums;
if (gridSize > 1)
{
CHECK(cudaMalloc(&d_bSums, gridSize * sizeof(int)));
}
else
{
d_bSums = NULL;
}
size_t smem = blockSize * sizeof(int);
// Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
// In each loop, sort elements according to the current bit from src to dst
// (using STABLE counting sort)
for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
{
// Extract bits
extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
cudaDeviceSynchronize();
CHECK(cudaGetLastError());
CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));
// Compute nOnesBefore
scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
cudaDeviceSynchronize();
CHECK(cudaGetLastError());
int zero = 0;
CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));
// Compute rank and write to d_out
computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
cudaDeviceSynchronize();
CHECK(cudaGetLastError());
// Swap d_in and d_out
uint32_t* temp = d_in;
d_in = d_out;
d_out = temp;
}
CHECK(cudaMemcpy(out, d_in, nBytes, cudaMemcpyDeviceToHost));
// Free memory
CHECK(cudaFree(d_bits));
CHECK(cudaFree(d_nOnesBefore));
CHECK(cudaFree(d_in));
CHECK(cudaFree(d_out));
if (gridSize > 1)
CHECK(cudaFree(d_bSums));
free(src);
}
// Radix Sort
void sort(const uint32_t * in, int n,
uint32_t * out,
bool useDevice=false, int blockSize=1)
{
GpuTimer timer;
timer.Start();
if (useDevice == false)
{
printf("
Radix Sort by host
");
sortByHost(in, n, out);
}
else // use device
{
printf("
Radix Sort by device
");
sortByDevice(in, n, out, blockSize);
}
timer.Stop();
printf("Time: %.3f ms
", timer.Elapsed());
}
void printDeviceInfo()
{
cudaDeviceProp devProv;
CHECK(cudaGetDeviceProperties(&devProv, 0));
printf("**********GPU info**********
");
printf("Name: %s
", devProv.name);
printf("Compute capability: %d.%d
", devProv.major, devProv.minor);
printf("Num SMs: %d
", devProv.multiProcessorCount);
printf("Max num threads per SM: %d
", devProv.maxThreadsPerMultiProcessor);
printf("Max num warps per SM: %d
", devProv.maxThreadsPerMultiProcessor / devProv.warpSize);
printf("GMEM: %zu byte
", devProv.totalGlobalMem);
printf("SMEM per SM: %zu byte
", devProv.sharedMemPerMultiprocessor);
printf("SMEM per block: %zu byte
", devProv.sharedMemPerBlock);
printf("****************************
");
}
void checkCorrectness(uint32_t * out, uint32_t * correctOut, int n)
{
for (int i = 0; i < n; i++)
{
if (out[i] != correctOut[i])
{
printf("INCORRECT :(
");
return;
}
}
printf("CORRECT :)
");
}
void printArray(uint32_t * a, int n)
{
for (int i = 0; i < n; i++)
printf("%i ", a[i]);
printf("
");
}
int main(int argc, char ** argv)
{
// PRINT OUT DEVICE INFO
printDeviceInfo();
// SET UP INPUT SIZE
//int n = 50; // For test by eye
int n = (1 << 24) + 1;
printf("
Input size: %d
", n);
// ALLOCATE MEMORIES
size_t bytes = n * sizeof(uint32_t);
uint32_t * in = (uint32_t *)malloc(bytes);