Of the following two ways of statically allocating shared memory, which one is correct, and why? I get the same results with both, but I am trying to understand the behavior in a little more detail.
Kernel 1:
// Edge length of the square tiles staged in shared memory.
static const int kShareTile = 3;

// Statically sized shared-memory tiles declared at file scope, so the names
// are visible to every kernel in this translation unit. Being __shared__,
// each thread block works on its own copy.
__shared__ int as[kShareTile][kShareTile], bs[kShareTile][kShareTile];

// Stages one element of `a` and one of `b` in shared memory per thread and
// writes their sum to `c`.
//
// Thread (tx, ty) reads a[tx + n*ty] and b[tx + n*ty] and writes the sum to
// c[tx*n + ty], i.e. the read and write positions are swapped in x/y.
//
// Launch layout: a 2D block of at most kShareTile x kShareTile threads.
// Only threadIdx is used for indexing, so every block in the grid touches
// the same elements. Threads outside the tile do nothing.
//
// NOTE(review): the caller must size a, b and c so that the offsets above
// are in range for the chosen n; this cannot be checked from inside the
// kernel.
__global__ void Sharesum(int* a,int* b,int* c,int n)
{
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Guard the fixed-size tiles: without this a block larger than the tile
    // would index past as/bs. Early return is safe because the kernel
    // contains no block-wide barrier.
    if (tx >= kShareTile || ty >= kShareTile) {
        return;
    }

    as[ty][tx] = a[tx + n * ty];
    bs[ty][tx] = b[tx + n * ty];

    // Each thread reads back only the slot it wrote itself, so no
    // __syncthreads() is needed between the stores and these loads.
    c[tx * n + ty] = as[ty][tx] + bs[ty][tx];
}
Kernel 2:
// Stages one element of `a` and one of `b` in shared memory per thread and
// writes their sum to `c`.
//
// Thread (tx, ty) reads a[tx + n*ty] and b[tx + n*ty] and writes the sum to
// c[tx*n + ty], i.e. the read and write positions are swapped in x/y.
//
// Launch layout: a 2D block of at most 3 x 3 threads (the tile size below).
// Only threadIdx is used for indexing, so every block in the grid touches
// the same elements. Threads outside the tile do nothing.
//
// NOTE(review): the caller must size a, b and c so that the offsets above
// are in range for the chosen n; this cannot be checked from inside the
// kernel.
__global__ void Sharesum(int* a,int* b,int* c,int n)
{
    // Edge length of the square shared-memory tiles.
    const int tileDim = 3;

    // Statically sized shared-memory tiles scoped to this kernel; being
    // __shared__, each thread block works on its own copy.
    __shared__ int as[tileDim][tileDim], bs[tileDim][tileDim];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Guard the fixed-size tiles: without this a block larger than the tile
    // would index past as/bs. Early return is safe because the kernel
    // contains no block-wide barrier.
    if (tx >= tileDim || ty >= tileDim) {
        return;
    }

    as[ty][tx] = a[tx + n * ty];
    bs[ty][tx] = b[tx + n * ty];

    // Each thread reads back only the slot it wrote itself, so no
    // __syncthreads() is needed between the stores and these loads.
    c[tx * n + ty] = as[ty][tx] + bs[ty][tx];
}
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…