CUDA not waiting for kernel to finish

Question

Welcome To Ask or Share your Answers For Others

CUDA not waiting for kernel to finish

asked Oct 7, 2021 in Technique[技术] by 深蓝 (71.8m points)

CUDA not waiting for kernel to finish

So i have a illegal memory access was encountered error when i try to launch some kernels in a loop like this:

for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);

        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

And i debug and found out this ‘extractBits’ function was calculate wrong. When I try to print to see problem, the error rarely happen. So i realize the more delay i add, the less chance error happen. I add something like this and it run fine without error:

for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);

        // Dummy copy to delay, do nothing.
        CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);

        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

Here is source code(it only can run with gpu like T4 on colab, P100 can't):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#define CHECK(call)
{
    const cudaError_t error = call;
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);
        fprintf(stderr, "code: %d, reason: %s
", error,
                cudaGetErrorString(error));
        exit(1);
    }
}

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);
        cudaEventSynchronize(start);
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

// Sequential Radix Sort
// "const uint32_t * in" means: the memory region pointed by "in" is read-only
void sortByHost(const uint32_t * in, int n,
                uint32_t * out)
{
    int * bits = (int *)malloc(n * sizeof(int));
    int * nOnesBefore = (int *)malloc(n * sizeof(int));

    uint32_t * src = (uint32_t *)malloc(n * sizeof(uint32_t));
    uint32_t * originalSrc = src; // To free memory later
    memcpy(src, in, n * sizeof(uint32_t));
    uint32_t * dst = out;

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst 
    // (using STABLE counting sort)
    for (int bitIdx = 0; bitIdx < sizeof(uint32_t) * 8; bitIdx++)
    {
        // Extract bits
        for (int i = 0; i < n; i++)
            bits[i] = (src[i] >> bitIdx) & 1;

        // Compute nOnesBefore
        nOnesBefore[0] = 0;
        for (int i = 1; i < n; i++)
            nOnesBefore[i] = nOnesBefore[i-1] + bits[i-1];

        // Compute rank and write to dst
        int nZeros = n - nOnesBefore[n-1] - bits[n-1];
        for (int i = 0; i < n; i++)
        {
            int rank;
            if (bits[i] == 0)
                rank = i - nOnesBefore[i];
            else
                rank = nZeros + nOnesBefore[i];
            dst[rank] = src[i];
        }

        // Swap src and dst
        uint32_t * temp = src;
        src = dst;
        dst = temp;
    }

    // Does out array contain results?
    memcpy(out, src, n * sizeof(uint32_t));

    // Free memory
    free(originalSrc);
    free(bits);
    free(nOnesBefore);
}

__global__ void extractBits(uint32_t* in, int n, int* out, int bitId)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = (in[i] >> bitId) & 1;
}

__device__ int bCount = 0;
volatile __device__ int bDoneCount = 0;

__global__ void scanKernel(int* in, int n, int* out, volatile int* bSums)
{
    __shared__ int blockId;
    if (threadIdx.x == 0)
    {
        blockId = atomicAdd(&bCount, 1);
    }
    __syncthreads();

    // 1. Each block loads data from GMEM to SMEM
    extern __shared__ int s_data[];

    int i = blockId * blockDim.x + threadIdx.x;

    if (i < n)
    {
        if (i == 0)
            s_data[threadIdx.x] = 0;
        else
            s_data[threadIdx.x] = in[i - 1];
        __syncthreads();

        // 2. Each block does scan with data on SMEM
        for (int stride = 1; stride < blockDim.x; stride *= 2)
        {
            int neededVal;
            if (threadIdx.x >= stride)
                neededVal = s_data[threadIdx.x - stride];
            __syncthreads();
            if (threadIdx.x >= stride)
                s_data[threadIdx.x] += neededVal;
            __syncthreads();
        }

        // 3. Each block write results from SMEM to GMEM
        out[i] = s_data[threadIdx.x];

        if (bSums != NULL)
        {
            if (threadIdx.x == 0)
            {
                bSums[blockId] = s_data[blockDim.x - 1];

                if (blockId > 0)
                {
                    while (bDoneCount < blockId) {}
                    bSums[blockId] += bSums[blockId - 1];
                    __threadfence();
                }
                bDoneCount += 1;
            }
            __syncthreads();

            if (i + blockDim.x < n)
                out[i + blockDim.x] += bSums[blockId];
        }
    }
}

__global__ void computeRank(uint32_t* in, int n, uint32_t* out, int* bits, int* nOnesBefore)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int nZeros = n - nOnesBefore[n - 1] - bits[n - 1];

    if (i < n)
    {
        int rank;
        if (bits[i] == 0)
            rank = i - nOnesBefore[i];
        else
            rank = nZeros + nOnesBefore[i];
        out[rank] = in[i];
    }
}

// Parallel Radix Sort
void sortByDevice(const uint32_t * in, int n, uint32_t * out, int blockSize)
{
    uint32_t* src = (uint32_t*)malloc(n * sizeof(uint32_t));
    // TODO
    int *d_bits, *d_nOnesBefore;
    size_t nBytes = n * sizeof(int);
    CHECK(cudaMalloc(&d_bits, nBytes));
    CHECK(cudaMalloc(&d_nOnesBefore, nBytes));

    uint32_t * d_in, * d_out;
    nBytes = n * sizeof(uint32_t);
    CHECK(cudaMalloc(&d_in, nBytes));
    CHECK(cudaMalloc(&d_out, nBytes));

    CHECK(cudaMemcpy(d_in, in, nBytes, cudaMemcpyHostToDevice));

    int gridSize = (n - 1) / blockSize + 1;

    int* d_bSums;
    if (gridSize > 1)
    {
        CHECK(cudaMalloc(&d_bSums, gridSize * sizeof(int)));
    }
    else
    {
        d_bSums = NULL;
    }
    size_t smem = blockSize * sizeof(int);

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst 
    // (using STABLE counting sort)
    for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank and write to d_out
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());
        
        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

    CHECK(cudaMemcpy(out, d_in, nBytes, cudaMemcpyDeviceToHost));
    

    // Free memory  
    CHECK(cudaFree(d_bits));
    CHECK(cudaFree(d_nOnesBefore));
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));

    if (gridSize > 1)
        CHECK(cudaFree(d_bSums));

    free(src);
}

// Radix Sort
void sort(const uint32_t * in, int n, 
        uint32_t * out, 
        bool useDevice=false, int blockSize=1)
{
    GpuTimer timer; 
    timer.Start();

    if (useDevice == false)
    {
        printf("
Radix Sort by host
");
        sortByHost(in, n, out);
    }
    else // use device
    {
        printf("
Radix Sort by device
");
        sortByDevice(in, n, out, blockSize);
    }

    timer.Stop();
    printf("Time: %.3f ms
", timer.Elapsed());
}

void printDeviceInfo()
{
    cudaDeviceProp devProv;
    CHECK(cudaGetDeviceProperties(&devProv, 0));
    printf("**********GPU info**********
");
    printf("Name: %s
", devProv.name);
    printf("Compute capability: %d.%d
", devProv.major, devProv.minor);
    printf("Num SMs: %d
", devProv.multiProcessorCount);
    printf("Max num threads per SM: %d
", devProv.maxThreadsPerMultiProcessor); 
    printf("Max num warps per SM: %d
", devProv.maxThreadsPerMultiProcessor / devProv.warpSize);
    printf("GMEM: %zu byte
", devProv.totalGlobalMem);
    printf("SMEM per SM: %zu byte
", devProv.sharedMemPerMultiprocessor);
    printf("SMEM per block: %zu byte
", devProv.sharedMemPerBlock);
    printf("****************************
");
}

void checkCorrectness(uint32_t * out, uint32_t * correctOut, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (out[i] != correctOut[i])
        {
            printf("INCORRECT :(
");
            return;
        }
    }
    printf("CORRECT :)
");
}

void printArray(uint32_t * a, int n)
{
    for (int i = 0; i < n; i++)
        printf("%i ", a[i]);
    printf("
");
}

int main(int argc, char ** argv)
{
    // PRINT OUT DEVICE INFO
    printDeviceInfo();

    // SET UP INPUT SIZE
    //int n = 50; // For test by eye
    int n = (1 << 24) + 1;
    printf("
Input size: %d
", n);

    // ALLOCATE MEMORIES
    size_t bytes = n * sizeof(uint32_t);
    uint32_t * in = (uint32_t *)malloc(bytes);

与恶龙缠斗过久,自身亦成为恶龙；凝视深渊过久,深渊将回以凝视…

1 Answer

深蓝 · Answer 1 · 2021-10-06T18:48:00+0000

There are probably multiple issues with this code. I will list 3 and then give some additional commentary.

You have illegal use of __syncthreads():

 if (i < n)   <--------------------------------------
 {                                                   |
     if (i == 0)                                     |
         s_data[threadIdx.x] = 0;                    |
     else                                            |
         s_data[threadIdx.x] = in[i - 1];            |
     __syncthreads();  <-----------------------------

see here. I don't think this is a central issue, however.

I'm reasonably confident, for correctness, you need a __threadfence() here:

     if (threadIdx.x == 0)
     {
         bSums[blockId] = s_data[blockDim.x - 1];
         __threadfence();  // added
         if (blockId > 0)

to handle the interaction between block 0 and block 1.

The central issue, according to my testing is that you have a global memory race condition that you have not properly accounted for. Let's consider this section of code:

         // 3. Each block write results from SMEM to GMEM
         out[i] = s_data[threadIdx.x];  // line A

         if (bSums != NULL)
         {
             if (threadIdx.x == 0)
             {
                 bSums[blockId] = s_data[blockDim.x - 1];

                 if (blockId > 0)
                 {
                     while (bDoneCount < blockId) {}
                     bSums[blockId] += bSums[blockId - 1];
                     __threadfence();
                 }
                 bDoneCount += 1;
             }
             __syncthreads();

             if (i + blockDim.x < n)
                 out[i + blockDim.x] += bSums[blockId];  // line B
         }
     }
 }

where I have marked line A and line B. For correctness, this requires for any given block X, that the block X+1 must have executed line A before block X executes line B. I don't see anything that enforces that.

I've made the following test code primarily to test the assertion in item 3 above. The key addition is to require any block to not proceed to line B until the next higher numbered block has passed line A. Since you already have a bDoneCount counter that is available for this purpose, I reuse that for this kind of crude inter-block synchronization. A few caveats:

I don't recommend this programming approach at all.
I'm not suggesting anything I'm demonstrating here is correct. It's mostly your code, and my goal here is to lend credence to the assertion in item 3 above.
Yes, this additional block-to-block synchronization that I added causes your overall code to run much more slowly. As far as I am concerned, I don't care about that, because none of the work here is a sensible approach to writing a high-performance radix sort.
If you care about high performance, you should not be writing your own scan kernel, at least not the one you have here
If you care about high performance, you should be using a library implementation, either a library implementation of sort, or at least a library implementation of prefix sum. The prefix sum here is not a high performance realization.

With those caveats out of the way, here is a test code modification of your code, addressing some of the 3 items I listed. It still does not address the improper use of syncthreads. However that is a fairly mechanical fix, once you understand the underlying issue and requirements. The following code passed all the testing I cared to throw at it. You'll find additional modifications beyond what I described, mainly to facilitate my testing and problem visibility:

$ cat t109.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#define CHECK(call)
{
    const cudaError_t error = call;
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);
        fprintf(stderr, "code: %d, reason: %s
", error,
                cudaGetErrorString(error));
        exit(1);
    }
}

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);
        cudaEventSynchronize(start);
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

// Sequential Radix Sort
// "const uint32_t * in" means: the memory region pointed by "in" is read-only
void sortByHost(const uint32_t * in, int n,
                uint32_t * out)
{
    int * bits = (int *)malloc(n * sizeof(int));
    int * nOnesBefore = (int *)malloc(n * sizeof(int));

    uint32_t * src = (uint32_t *)malloc(n * sizeof(uint32_t));
    uint32_t * originalSrc = src; // To free memory later
    memcpy(src, in, n * sizeof(uint32_t));
    uint32_t * dst = out;

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst
    // (using STABLE counting sort)
    for (int bitIdx = 0; bitIdx < sizeof(uint32_t) * 8; bitIdx++)
    {
        // Extract bits
        for (int i = 0; i < n; i++)
            bits[i] = (src[i] >> bitIdx) & 1;

        // Compute nOnesBefore
        nOnesBefore[0] = 0;
        for (int i = 1; i < n; i++)
            nOnesBefore[i] = nOnesBefore[i-1] + bits[i-1];

        // Compute rank and write to dst
        int nZeros = n - nOnesBefore[n-1] - bits[n-1];
        for (int i = 0; i < n; i++)
        {
            int rank;
            if (bits[i] == 0)
                rank = i - nOnesBefore[i];
            else
                rank = nZeros + nOnesBefore[i];
            dst[rank] = src[i];
        }

        // Swap src and dst
        uint32_t * temp = src;
        src = dst;
        dst = temp;
    }

    // Does out array contain results?
    memcpy(out, src, n * sizeof(uint32_t));

    // Free memory
    free(originalSrc);
    free(bits);
    free(nOnesBefore);
}

__global__ void extractBits(uint32_t* in, int n, int* out, int bitId)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = (in[i] >> bitId) & 1;
}

__device__ int bCount = 0;
volatile __device__ int bDoneCount = 0;

__global__ void scanKernel(int* in, int n, int* out, volatile int* bSums)
{
    __shared__ int blockId;
    if (threadIdx.x == 0)
    {
        blockId = atomicAdd(&bCount, 1);
    }
    __syncthreads();

    // 1. Each block loads data from GMEM to SMEM
    extern __shared__ int s_data[];

    int i = blockId * blockDim.x + threadIdx.x;

    if (i < n)
    {
        if (i == 0)
            s_data[threadIdx.x] = 0;
        else
            s_data[threadIdx.x] = in[i - 1];
        __syncthreads();

        // 2. Each block does scan with data on SMEM
        for (int stride = 1; stride < blockDim.x; stride *= 2)
        {
            int neededVal;
            if (threadIdx.x >= stride)
                neededVal = s_data[threadIdx.x - stride];
            __syncthreads();
            if (threadIdx.x >= stride)
                s_data[threadIdx.x] += neededVal;
            __syncthreads();
        }

        // 3. Each block write results from SMEM to GMEM
        out[i] = s_data[threadIdx.x];

        if (bSums != NULL)
        {
            if (threadIdx.x == 0)
            {
                bSums[blockId] = s_data[blockDim.x - 1];
                __threadfence();

                if (blockId > 0)
                {
                    while (bDoneCount < blockId) {}
                    bSums[blockId] += bSums[blockId - 1];
                    __threadfence();
                }
                bDoneCount += 1;
            }
            if (blockId < (gridDim.x-1)) {while (bDoneCount < (blockId+2)){};} // ADDED SYNC
            __syncthreads();

            if (i + blockDim.x < n)
                out[i + blockDim.x] += bSums[blockId];
        }
    }
}
__global__ void computeRank(uint32_t* in, int n, uint32_t* out, int* bits, int* nOnesBefore)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int nZeros = n - nOnesBefore[n - 1] - bits[n - 1];

    if (i < n)
    {
        int rank;
        if (bits[i] == 0)
            rank = i - nOnesBefore[i];
        else
            rank = nZeros + nOnesBefore[i];
        out[rank] = in[i];
    }
}

// Parallel Radix Sort
void sortByDevice(const uint32_t * in, int n, uint32_t * out, int blockSize)
{
    uint32_t* src = (uint32_t*)malloc(n * sizeof(uint32_t));
    // TODO
    int *d_bits, *d_nOnesBefore;
    size_t nBytes = n * sizeof(int);
    CHECK(cudaMalloc(&d_bits, nBytes));
    CHECK(cudaMalloc(&d_nOnesBefore, nBytes));

    uint32_t * d_in, * d_out;
    nBytes = n * sizeof(uint32_t);
    CHECK(cudaMalloc(&d_in, nBytes));
    CHECK(cudaMalloc(&d_out, nBytes));

    CHECK(cudaMemcpy(d_in, in, nBytes, cudaMemcpyHostToDevice));

//    int gridSize = (n - 1) / blockSize + 1;
    int gridSize = (n+blockSize-1)/blockSize;
    int* d_bSums;
    if (gridSize > 1)
    {
        CHECK(cudaMalloc(&d_bSums, gridSize * sizeof(int)));
    }
    else
    {
        d_bSums = NULL;
    }
    size_t smem = blockSize * sizeof(int);

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst
    // (using STABLE counting sort)
    for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

    //    CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank and write to d_out
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

    CHECK(cudaMemcpy(out, d_in, nBytes, cudaMemcpyDeviceToHost));


    // Free memory
    CHECK(cudaFree(d_bits));
    CHECK(cudaFree(d_nOnesBefore));
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));

    if (gridSize > 1)
        CHECK(cudaFree(d_bSums));

    free(src);
}

// Radix Sort
void

Categories

CUDA not waiting for kernel to finish

CUDA not waiting for kernel to finish

Please log in or register to add a comment.

Please log in or register to answer this question.

1 Answer

Please log in or register to add a comment.

Just Browsing Browsing

Most popular tags