gpu - Purposely causing bank conflicts for shared memory on a CUDA device
It is a mystery to me how shared memory on CUDA devices works. I was curious to count how many threads can access the same shared memory, so I wrote a simple program:
#include <cuda_runtime.h>
#include <stdio.h>

#define nblc 13
#define nthr 1024

//------------------------@device--------------------
__device__ int inwarpd[nblc];

__global__ void kernel(){
    __shared__ int mywarp;
    mywarp = 0;
    for (int i = 0; i < 5; i++) mywarp += (10000 * threadIdx.x + 1);
    __syncthreads();
    inwarpd[blockIdx.x] = mywarp;
}
//------------------------@host-----------------------
int main(int argc, char **argv){
    int inwarph[nblc];
    cudaSetDevice(2);
    kernel<<<nblc, nthr>>>();
    cudaMemcpyFromSymbol(inwarph, inwarpd, nblc * sizeof(int), 0, cudaMemcpyDeviceToHost);
    for (int i = 0; i < nblc; i++) printf("%i : %i\n", i, inwarph[i]);
    return 0;
}
and ran it on a K80 GPU. Since several threads have access to the same shared memory variable, I expected it to be updated 5*nthr times, albeit not in the same cycle because of bank conflicts. However, the output indicates that the mywarp shared variable was updated only 5 times, with a different thread in each block accomplishing the task:
0 : 35150005
1 : 38350005
2 : 44750005
3 : 38350005
4 : 51150005
5 : 38350005
6 : 38350005
7 : 38350005
8 : 51150005
9 : 44750005
10 : 51150005
11 : 38350005
12 : 38350005
Instead, I was expecting
523776*10000 + 5*1024 = 5237765120   (where 523776 = 0+1+...+1023, the sum of the thread indices)
for each block. Can someone kindly explain to me where my understanding of shared memory fails? I would like to know how it is possible for all threads in one block to access (update) the same shared variable. I know it is not possible in the same MP cycle, but serialisation is fine with me because it is going to be a rare event.
Let's walk through the PTX that the compiler generates (you can dump it with nvcc -ptx):
// declare registers
.reg .s32 %r<5>;
.reg .s64 %rd<4>;

// demoted variable
.shared .align 4 .u32 _Z6kernelv$__cuda_local_var_35411_30_non_const_mywarp;

// load tid into register r1
mov.u32 %r1, %tid.x;
// multiply tid by 50000, add 5, and store in r2
mad.lo.s32 %r2, %r1, 50000, 5;
// store the result in shared memory
st.shared.u32 [_Z6kernelv$__cuda_local_var_35411_30_non_const_mywarp], %r2;
// synchronize
bar.sync 0;
// load from shared memory and store in r3
ld.shared.u32 %r3, [_Z6kernelv$__cuda_local_var_35411_30_non_const_mywarp];
mov.u32 %r4, %ctaid.x;
mul.wide.u32 %rd1, %r4, 4;
mov.u64 %rd2, inwarpd;
add.s64 %rd3, %rd2, %rd1;
// store r3 in global memory
st.global.u32 [%rd3], %r3;
ret;
So

for (int i = 0; i < 5; i++) mywarp += (10000 * threadIdx.x + 1);

is being optimized down to

mywarp = 50000 * threadIdx.x + 5;

so you're not experiencing a bank conflict; you're experiencing a race condition. Every thread stores its own 50000*threadIdx.x + 5 to the same shared address, and whichever store lands last is the value that gets copied out to global memory.
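If the goal is for every thread's update to actually land (serialized, which the question says is acceptable), the read-modify-write itself has to be made atomic; a plain += on shared memory obliges neither the compiler nor the hardware to serialize anything. Below is a minimal sketch, not the original poster's code, of how the kernel could be rewritten with atomicAdd on shared memory (64-bit shared-memory atomics are supported on the K80, compute capability 3.7). It accumulates into an unsigned long long because the true per-block sum overflows a 32-bit int:

#include <cuda_runtime.h>
#include <stdio.h>

#define nblc 13
#define nthr 1024

__device__ unsigned long long inwarpd[nblc];

__global__ void kernel(){
    __shared__ unsigned long long mywarp;
    if (threadIdx.x == 0) mywarp = 0;   // one thread initializes, instead of 1024 racing stores
    __syncthreads();                    // make the zero visible to every thread
    for (int i = 0; i < 5; i++)
        atomicAdd(&mywarp, 10000ULL * threadIdx.x + 1);  // serialized read-modify-write
    __syncthreads();                    // wait until all updates have landed
    if (threadIdx.x == 0) inwarpd[blockIdx.x] = mywarp;
}

int main(){
    unsigned long long inwarph[nblc];
    kernel<<<nblc, nthr>>>();
    cudaMemcpyFromSymbol(inwarph, inwarpd, nblc * sizeof(unsigned long long));
    for (int i = 0; i < nblc; i++) printf("%i : %llu\n", i, inwarph[i]);
    return 0;
}

With the atomics in place, every block should report the same total, 5 * (10000*523776 + 1024) = 26188805120, which also shows why the original int accumulator could not have held the expected value. Note that the atomics are what serialize the conflicting updates; merely declaring mywarp volatile would keep the loop in the PTX but leave the race in place.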