gpu - purposely causing bank conflicts for shared memory on a CUDA device


It is a mystery to me how shared memory on CUDA devices works. I was curious how many threads can access the same shared memory, so I wrote this simple program:

    #include <cuda_runtime.h>
    #include <stdio.h>

    #define nblc 13
    #define nthr 1024

    //------------------------@device--------------------
    __device__ int inwarpd[nblc];

    __global__ void kernel(){
        __shared__ int mywarp;

        mywarp = 0;
        for (int i = 0; i < 5; i++)
            mywarp += (10000*threadIdx.x + 1);
        __syncthreads();

        inwarpd[blockIdx.x] = mywarp;
    }

    //------------------------@host-----------------------
    int main(int argc, char **argv){
        int inwarph[nblc];
        cudaSetDevice(2);

        kernel<<<nblc, nthr>>>();

        cudaMemcpyFromSymbol(inwarph, inwarpd, nblc*sizeof(int), 0, cudaMemcpyDeviceToHost);

        for (int i = 0; i < nblc; i++)
            printf("%i : %i\n", i, inwarph[i]);
    }

and ran it on a K80 GPU. Since several threads have access to the same shared memory variable, I was expecting the variable to be updated 5*nthr times (albeit not in the same cycle, because of bank conflicts). However, the output indicates that the mywarp shared variable was updated only 5 times, and in each block a different thread accomplished the task:

    0 : 35150005
    1 : 38350005
    2 : 44750005
    3 : 38350005
    4 : 51150005
    5 : 38350005
    6 : 38350005
    7 : 38350005
    8 : 51150005
    9 : 44750005
    10 : 51150005
    11 : 38350005
    12 : 38350005

Instead, since 0 + 1 + ... + 1023 = 523776 and each thread runs the loop 5 times, I was expecting

    5*(10000*523776) + 5*1024 = 26188805120

for each block. Can somebody kindly explain to me where my understanding of shared memory fails? I would like to know how it is possible for the threads in one block to access (update) the same shared variable. I know it is not possible in the same MP cycle, but serialisation is fine with me, because this is going to be a rare event.

Let's walk through the PTX that this generates:

    //declare registers
    .reg .s32       %r<5>;
    .reg .s64       %rd<4>;

    // demoted variable
    .shared .align 4 .u32 _Z6kernelv$__cuda_local_var_35411_30_non_const_mywarp;

    //load tid into register r1
    mov.u32         %r1, %tid.x;

    //multiply tid by 50000, add 5, and store in r2
    mad.lo.s32      %r2, %r1, 50000, 5;

    //store the result in shared memory
    st.shared.u32   [_Z6kernelv$__cuda_local_var_35411_30_non_const_mywarp], %r2;

    //synchronize
    bar.sync        0;

    //load the shared memory value into r3
    ld.shared.u32   %r3, [_Z6kernelv$__cuda_local_var_35411_30_non_const_mywarp];

    //compute the address of inwarpd[blockIdx.x]
    mov.u32         %r4, %ctaid.x;
    mul.wide.u32    %rd1, %r4, 4;
    mov.u64         %rd2, inwarpd;
    add.s64         %rd3, %rd2, %rd1;

    //store r3 in global memory
    st.global.u32   [%rd3], %r3;
    ret;
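(If you want to inspect the PTX yourself, nvcc can emit it directly; the source file name below is just a placeholder, and sm_37 matches the K80's compute capability:

    nvcc -arch=sm_37 -ptx prog.cu -o prog.ptx
)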

So

    for (int i = 0; i < 5; i++)
        mywarp += (10000*threadIdx.x + 1);

is being optimized down to

    mywarp = 50000*threadIdx.x + 5;

So you're not experiencing a bank conflict; you're experiencing a race condition. Every thread writes its own 50000*threadIdx.x + 5 into the same shared location, only one of the writes survives, and which thread's write lands last is undefined. That is exactly what your output shows: each block reports 50000*tid + 5 for some thread tid (for example, 38350005 = 50000*767 + 5, and 51150005 = 50000*1023 + 5).
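If what you actually want is for every thread in the block to contribute to one shared variable, with the serialisation you said you can live with, the usual tool is atomicAdd on shared memory (supported on your K80). Below is a minimal sketch of the kernel rewritten that way; it reuses the names from your code, but the structure (single-thread initialisation, atomic accumulation, single-thread write-out) is my suggestion, not something the original program contained:

    __global__ void kernel(){
        __shared__ int mywarp;

        // let one thread initialise the accumulator, then make everyone wait
        if (threadIdx.x == 0)
            mywarp = 0;
        __syncthreads();

        // every thread adds its contribution; the hardware serialises
        // the conflicting updates instead of losing them
        for (int i = 0; i < 5; i++)
            atomicAdd(&mywarp, 10000 * threadIdx.x + 1);
        __syncthreads();

        // one thread publishes the block's total
        if (threadIdx.x == 0)
            inwarpd[blockIdx.x] = mywarp;
    }

Note that with nthr = 1024 the full total (26188805120) overflows a 32-bit int, so for a real run you would want a 64-bit accumulator (64-bit shared-memory atomics are available on compute capability 2.x and later) or a smaller multiplier.

And since the title asks about purposely causing bank conflicts: a bank conflict is not several threads touching the same variable, it is the threads of one warp touching different words that live in the same bank. Here is a sketch of a kernel that should produce 32-way conflicts, assuming the usual layout of 32 banks of 4-byte words (the K80's default mode); the column-wise indexing and the names are mine, chosen purely for illustration:

    __global__ void conflicted(int *out){
        __shared__ int buf[32 * 32];

        int lane = threadIdx.x % 32;   // position inside the warp
        int warp = threadIdx.x / 32;

        // column-wise access: the 32 lanes of a warp hit words that are
        // 32 apart, i.e. all in the same bank -> a 32-way bank conflict.
        // buf[warp * 32 + lane] (row-wise) would be conflict-free.
        buf[lane * 32 + warp] = threadIdx.x;
        __syncthreads();
        out[threadIdx.x] = buf[lane * 32 + warp];
    }

Launched as a single block of up to 1024 threads (e.g. conflicted<<<1, 1024>>>(d_out)), every thread reads and writes a distinct word, so there is no race here; a profiler will simply report the shared-memory replays caused by the serialised conflicting accesses.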

