Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
82 views
in Technique[技术] by (71.8m points)

CUDA not waiting for kernel to finish

So i have a illegal memory access was encountered error when i try to launch some kernels in a loop like this:

for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);

        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

And i debug and found out this ‘extractBits’ function was calculate wrong. When I try to print to see problem, the error rarely happen. So i realize the more delay i add, the less chance error happen. I add something like this and it run fine without error:

for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);

        // Dummy copy to delay, do nothing.
        CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);

        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

Here is source code(it only can run with gpu like T4 on colab, P100 can't):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#define CHECK(call)
{
    const cudaError_t error = call;
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);
        fprintf(stderr, "code: %d, reason: %s
", error,
                cudaGetErrorString(error));
        exit(1);
    }
}

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);
        cudaEventSynchronize(start);
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

// Sequential Radix Sort
// "const uint32_t * in" means: the memory region pointed by "in" is read-only
void sortByHost(const uint32_t * in, int n,
                uint32_t * out)
{
    int * bits = (int *)malloc(n * sizeof(int));
    int * nOnesBefore = (int *)malloc(n * sizeof(int));

    uint32_t * src = (uint32_t *)malloc(n * sizeof(uint32_t));
    uint32_t * originalSrc = src; // To free memory later
    memcpy(src, in, n * sizeof(uint32_t));
    uint32_t * dst = out;

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst 
    // (using STABLE counting sort)
    for (int bitIdx = 0; bitIdx < sizeof(uint32_t) * 8; bitIdx++)
    {
        // Extract bits
        for (int i = 0; i < n; i++)
            bits[i] = (src[i] >> bitIdx) & 1;

        // Compute nOnesBefore
        nOnesBefore[0] = 0;
        for (int i = 1; i < n; i++)
            nOnesBefore[i] = nOnesBefore[i-1] + bits[i-1];

        // Compute rank and write to dst
        int nZeros = n - nOnesBefore[n-1] - bits[n-1];
        for (int i = 0; i < n; i++)
        {
            int rank;
            if (bits[i] == 0)
                rank = i - nOnesBefore[i];
            else
                rank = nZeros + nOnesBefore[i];
            dst[rank] = src[i];
        }

        // Swap src and dst
        uint32_t * temp = src;
        src = dst;
        dst = temp;
    }

    // Does out array contain results?
    memcpy(out, src, n * sizeof(uint32_t));

    // Free memory
    free(originalSrc);
    free(bits);
    free(nOnesBefore);
}

__global__ void extractBits(uint32_t* in, int n, int* out, int bitId)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = (in[i] >> bitId) & 1;
}

__device__ int bCount = 0;
volatile __device__ int bDoneCount = 0;

__global__ void scanKernel(int* in, int n, int* out, volatile int* bSums)
{
    __shared__ int blockId;
    if (threadIdx.x == 0)
    {
        blockId = atomicAdd(&bCount, 1);
    }
    __syncthreads();

    // 1. Each block loads data from GMEM to SMEM
    extern __shared__ int s_data[];

    int i = blockId * blockDim.x + threadIdx.x;

    if (i < n)
    {
        if (i == 0)
            s_data[threadIdx.x] = 0;
        else
            s_data[threadIdx.x] = in[i - 1];
        __syncthreads();

        // 2. Each block does scan with data on SMEM
        for (int stride = 1; stride < blockDim.x; stride *= 2)
        {
            int neededVal;
            if (threadIdx.x >= stride)
                neededVal = s_data[threadIdx.x - stride];
            __syncthreads();
            if (threadIdx.x >= stride)
                s_data[threadIdx.x] += neededVal;
            __syncthreads();
        }

        // 3. Each block write results from SMEM to GMEM
        out[i] = s_data[threadIdx.x];

        if (bSums != NULL)
        {
            if (threadIdx.x == 0)
            {
                bSums[blockId] = s_data[blockDim.x - 1];

                if (blockId > 0)
                {
                    while (bDoneCount < blockId) {}
                    bSums[blockId] += bSums[blockId - 1];
                    __threadfence();
                }
                bDoneCount += 1;
            }
            __syncthreads();

            if (i + blockDim.x < n)
                out[i + blockDim.x] += bSums[blockId];
        }
    }
}

__global__ void computeRank(uint32_t* in, int n, uint32_t* out, int* bits, int* nOnesBefore)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int nZeros = n - nOnesBefore[n - 1] - bits[n - 1];

    if (i < n)
    {
        int rank;
        if (bits[i] == 0)
            rank = i - nOnesBefore[i];
        else
            rank = nZeros + nOnesBefore[i];
        out[rank] = in[i];
    }
}

// Parallel Radix Sort
void sortByDevice(const uint32_t * in, int n, uint32_t * out, int blockSize)
{
    uint32_t* src = (uint32_t*)malloc(n * sizeof(uint32_t));
    // TODO
    int *d_bits, *d_nOnesBefore;
    size_t nBytes = n * sizeof(int);
    CHECK(cudaMalloc(&d_bits, nBytes));
    CHECK(cudaMalloc(&d_nOnesBefore, nBytes));

    uint32_t * d_in, * d_out;
    nBytes = n * sizeof(uint32_t);
    CHECK(cudaMalloc(&d_in, nBytes));
    CHECK(cudaMalloc(&d_out, nBytes));

    CHECK(cudaMemcpy(d_in, in, nBytes, cudaMemcpyHostToDevice));

    int gridSize = (n - 1) / blockSize + 1;

    int* d_bSums;
    if (gridSize > 1)
    {
        CHECK(cudaMalloc(&d_bSums, gridSize * sizeof(int)));
    }
    else
    {
        d_bSums = NULL;
    }
    size_t smem = blockSize * sizeof(int);

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst 
    // (using STABLE counting sort)
    for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank and write to d_out
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());
        
        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

    CHECK(cudaMemcpy(out, d_in, nBytes, cudaMemcpyDeviceToHost));
    

    // Free memory  
    CHECK(cudaFree(d_bits));
    CHECK(cudaFree(d_nOnesBefore));
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));

    if (gridSize > 1)
        CHECK(cudaFree(d_bSums));

    free(src);
}

// Radix Sort
void sort(const uint32_t * in, int n, 
        uint32_t * out, 
        bool useDevice=false, int blockSize=1)
{
    GpuTimer timer; 
    timer.Start();

    if (useDevice == false)
    {
        printf("
Radix Sort by host
");
        sortByHost(in, n, out);
    }
    else // use device
    {
        printf("
Radix Sort by device
");
        sortByDevice(in, n, out, blockSize);
    }

    timer.Stop();
    printf("Time: %.3f ms
", timer.Elapsed());
}

void printDeviceInfo()
{
    cudaDeviceProp devProv;
    CHECK(cudaGetDeviceProperties(&devProv, 0));
    printf("**********GPU info**********
");
    printf("Name: %s
", devProv.name);
    printf("Compute capability: %d.%d
", devProv.major, devProv.minor);
    printf("Num SMs: %d
", devProv.multiProcessorCount);
    printf("Max num threads per SM: %d
", devProv.maxThreadsPerMultiProcessor); 
    printf("Max num warps per SM: %d
", devProv.maxThreadsPerMultiProcessor / devProv.warpSize);
    printf("GMEM: %zu byte
", devProv.totalGlobalMem);
    printf("SMEM per SM: %zu byte
", devProv.sharedMemPerMultiprocessor);
    printf("SMEM per block: %zu byte
", devProv.sharedMemPerBlock);
    printf("****************************
");
}

void checkCorrectness(uint32_t * out, uint32_t * correctOut, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (out[i] != correctOut[i])
        {
            printf("INCORRECT :(
");
            return;
        }
    }
    printf("CORRECT :)
");
}

void printArray(uint32_t * a, int n)
{
    for (int i = 0; i < n; i++)
        printf("%i ", a[i]);
    printf("
");
}

int main(int argc, char ** argv)
{
    // PRINT OUT DEVICE INFO
    printDeviceInfo();

    // SET UP INPUT SIZE
    //int n = 50; // For test by eye
    int n = (1 << 24) + 1;
    printf("
Input size: %d
", n);

    // ALLOCATE MEMORIES
    size_t bytes = n * sizeof(uint32_t);
    uint32_t * in = (uint32_t *)malloc(bytes);

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Answer

0 votes
by (71.8m points)

There are probably multiple issues with this code. I will list 3 and then give some additional commentary.

  1. You have illegal use of __syncthreads():

     if (i < n)   <--------------------------------------
     {                                                   |
         if (i == 0)                                     |
             s_data[threadIdx.x] = 0;                    |
         else                                            |
             s_data[threadIdx.x] = in[i - 1];            |
         __syncthreads();  <-----------------------------
    

    see here. I don't think this is a central issue, however.

  2. I'm reasonably confident, for correctness, you need a __threadfence() here:

         if (threadIdx.x == 0)
         {
             bSums[blockId] = s_data[blockDim.x - 1];
             __threadfence();  // added
             if (blockId > 0)
    

    to handle the interaction between block 0 and block 1.

  3. The central issue, according to my testing is that you have a global memory race condition that you have not properly accounted for. Let's consider this section of code:

             // 3. Each block write results from SMEM to GMEM
             out[i] = s_data[threadIdx.x];  // line A
    
             if (bSums != NULL)
             {
                 if (threadIdx.x == 0)
                 {
                     bSums[blockId] = s_data[blockDim.x - 1];
    
                     if (blockId > 0)
                     {
                         while (bDoneCount < blockId) {}
                         bSums[blockId] += bSums[blockId - 1];
                         __threadfence();
                     }
                     bDoneCount += 1;
                 }
                 __syncthreads();
    
                 if (i + blockDim.x < n)
                     out[i + blockDim.x] += bSums[blockId];  // line B
             }
         }
     }
    

    where I have marked line A and line B. For correctness, this requires for any given block X, that the block X+1 must have executed line A before block X executes line B. I don't see anything that enforces that.

I've made the following test code primarily to test the assertion in item 3 above. The key addition is to require any block to not proceed to line B until the next higher numbered block has passed line A. Since you already have a bDoneCount counter that is available for this purpose, I reuse that for this kind of crude inter-block synchronization. A few caveats:

  • I don't recommend this programming approach at all.
  • I'm not suggesting anything I'm demonstrating here is correct. It's mostly your code, and my goal here is to lend credence to the assertion in item 3 above.
  • Yes, this additional block-to-block synchronization that I added causes your overall code to run much more slowly. As far as I am concerned, I don't care about that, because none of the work here is a sensible approach to writing a high-performance radix sort.
  • If you care about high performance, you should not be writing your own scan kernel, at least not the one you have here
  • If you care about high performance, you should be using a library implementation, either a library implementation of sort, or at least a library implementation of prefix sum. The prefix sum here is not a high performance realization.

With those caveats out of the way, here is a test code modification of your code, addressing some of the 3 items I listed. It still does not address the improper use of syncthreads. However that is a fairly mechanical fix, once you understand the underlying issue and requirements. The following code passed all the testing I cared to throw at it. You'll find additional modifications beyond what I described, mainly to facilitate my testing and problem visibility:

$ cat t109.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#define CHECK(call)
{
    const cudaError_t error = call;
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);
        fprintf(stderr, "code: %d, reason: %s
", error,
                cudaGetErrorString(error));
        exit(1);
    }
}

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);
        cudaEventSynchronize(start);
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

// Sequential Radix Sort
// "const uint32_t * in" means: the memory region pointed by "in" is read-only
void sortByHost(const uint32_t * in, int n,
                uint32_t * out)
{
    int * bits = (int *)malloc(n * sizeof(int));
    int * nOnesBefore = (int *)malloc(n * sizeof(int));

    uint32_t * src = (uint32_t *)malloc(n * sizeof(uint32_t));
    uint32_t * originalSrc = src; // To free memory later
    memcpy(src, in, n * sizeof(uint32_t));
    uint32_t * dst = out;

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst
    // (using STABLE counting sort)
    for (int bitIdx = 0; bitIdx < sizeof(uint32_t) * 8; bitIdx++)
    {
        // Extract bits
        for (int i = 0; i < n; i++)
            bits[i] = (src[i] >> bitIdx) & 1;

        // Compute nOnesBefore
        nOnesBefore[0] = 0;
        for (int i = 1; i < n; i++)
            nOnesBefore[i] = nOnesBefore[i-1] + bits[i-1];

        // Compute rank and write to dst
        int nZeros = n - nOnesBefore[n-1] - bits[n-1];
        for (int i = 0; i < n; i++)
        {
            int rank;
            if (bits[i] == 0)
                rank = i - nOnesBefore[i];
            else
                rank = nZeros + nOnesBefore[i];
            dst[rank] = src[i];
        }

        // Swap src and dst
        uint32_t * temp = src;
        src = dst;
        dst = temp;
    }

    // Does out array contain results?
    memcpy(out, src, n * sizeof(uint32_t));

    // Free memory
    free(originalSrc);
    free(bits);
    free(nOnesBefore);
}

__global__ void extractBits(uint32_t* in, int n, int* out, int bitId)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = (in[i] >> bitId) & 1;
}

__device__ int bCount = 0;
volatile __device__ int bDoneCount = 0;

__global__ void scanKernel(int* in, int n, int* out, volatile int* bSums)
{
    __shared__ int blockId;
    if (threadIdx.x == 0)
    {
        blockId = atomicAdd(&bCount, 1);
    }
    __syncthreads();

    // 1. Each block loads data from GMEM to SMEM
    extern __shared__ int s_data[];

    int i = blockId * blockDim.x + threadIdx.x;

    if (i < n)
    {
        if (i == 0)
            s_data[threadIdx.x] = 0;
        else
            s_data[threadIdx.x] = in[i - 1];
        __syncthreads();

        // 2. Each block does scan with data on SMEM
        for (int stride = 1; stride < blockDim.x; stride *= 2)
        {
            int neededVal;
            if (threadIdx.x >= stride)
                neededVal = s_data[threadIdx.x - stride];
            __syncthreads();
            if (threadIdx.x >= stride)
                s_data[threadIdx.x] += neededVal;
            __syncthreads();
        }

        // 3. Each block write results from SMEM to GMEM
        out[i] = s_data[threadIdx.x];

        if (bSums != NULL)
        {
            if (threadIdx.x == 0)
            {
                bSums[blockId] = s_data[blockDim.x - 1];
                __threadfence();

                if (blockId > 0)
                {
                    while (bDoneCount < blockId) {}
                    bSums[blockId] += bSums[blockId - 1];
                    __threadfence();
                }
                bDoneCount += 1;
            }
            if (blockId < (gridDim.x-1)) {while (bDoneCount < (blockId+2)){};} // ADDED SYNC
            __syncthreads();

            if (i + blockDim.x < n)
                out[i + blockDim.x] += bSums[blockId];
        }
    }
}
__global__ void computeRank(uint32_t* in, int n, uint32_t* out, int* bits, int* nOnesBefore)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int nZeros = n - nOnesBefore[n - 1] - bits[n - 1];

    if (i < n)
    {
        int rank;
        if (bits[i] == 0)
            rank = i - nOnesBefore[i];
        else
            rank = nZeros + nOnesBefore[i];
        out[rank] = in[i];
    }
}

// Parallel Radix Sort
void sortByDevice(const uint32_t * in, int n, uint32_t * out, int blockSize)
{
    uint32_t* src = (uint32_t*)malloc(n * sizeof(uint32_t));
    // TODO
    int *d_bits, *d_nOnesBefore;
    size_t nBytes = n * sizeof(int);
    CHECK(cudaMalloc(&d_bits, nBytes));
    CHECK(cudaMalloc(&d_nOnesBefore, nBytes));

    uint32_t * d_in, * d_out;
    nBytes = n * sizeof(uint32_t);
    CHECK(cudaMalloc(&d_in, nBytes));
    CHECK(cudaMalloc(&d_out, nBytes));

    CHECK(cudaMemcpy(d_in, in, nBytes, cudaMemcpyHostToDevice));

//    int gridSize = (n - 1) / blockSize + 1;
    int gridSize = (n+blockSize-1)/blockSize;
    int* d_bSums;
    if (gridSize > 1)
    {
        CHECK(cudaMalloc(&d_bSums, gridSize * sizeof(int)));
    }
    else
    {
        d_bSums = NULL;
    }
    size_t smem = blockSize * sizeof(int);

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst
    // (using STABLE counting sort)
    for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

    //    CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank and write to d_out
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

    CHECK(cudaMemcpy(out, d_in, nBytes, cudaMemcpyDeviceToHost));


    // Free memory
    CHECK(cudaFree(d_bits));
    CHECK(cudaFree(d_nOnesBefore));
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));

    if (gridSize > 1)
        CHECK(cudaFree(d_bSums));

    free(src);
}

// Radix Sort
void

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome to OStack Knowledge Sharing Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...