Converting a 2D Canny Edge image to 1D edge pixel array in CUDA - Strange behaviour -


i have cuda kernel takes edge image , processes create smaller, 1d array of edge pixels. here strange behaviour. every time run kernel , calculate number of edge pixels in "d_nlist" (see code near printf), greater pixel count each time, when use same image , stop program , re-run. therefore, each time run it, takes longer run, until eventually, throws un-caught exception.

my question is, how can stop happening can consistent results each time run kernel?

my device geforce 620.

constants:

threads_x = 32
threads_y = 4
pixels_per_thread = 4
max_queue_length = threads_x * threads_y * pixels_per_thread
img_width = 256
img_height = 256
img_size = img_width * img_height
blocks_x = img_width / (threads_x * pixels_per_thread)
blocks_y = img_height / threads_y

the kernel follows:

__global__ void convert2dedgeimageto1darray( unsigned char const * const image,  unsigned int* const list, int* const glob_index ) {  unsigned int const x = blockidx.x  * threads_x*pixels_per_thread + threadidx.x; unsigned int const y = blockidx.y  * threads_y + threadidx.y;  volatile int qindex = -1; volatile __shared__ int sh_qindex[threads_y]; volatile __shared__ int sh_qstart[threads_y]; sh_qindex[threadidx.y] = -1;  // start making array volatile __shared__ unsigned int sh_queue[max_queue_length];  // fill queue for(int i=0; i<pixels_per_thread; i++) {     int const xx = i*threads_x + x;      // read 1 image pixel global memory     unsigned char const pixel = image[y*img_width + xx];     unsigned int  const queue_val = (y << 16) + xx;      if(pixel)     {                    {             qindex++;             sh_qindex[threadidx.y] = qindex;             sh_queue[threadidx.y*threads_x*pixels_per_thread + qindex] = queue_val;         } while (sh_queue[threadidx.y*threads_x*pixels_per_thread + qindex] != queue_val);     }      // reload index smem (last thread write smem have updated it)     qindex = sh_qindex[threadidx.y]; }  // let thread 0 reserve space required in global list __syncthreads(); if(threadidx.x == 0 && threadidx.y == 0) {     // find how many items stored in each list     int total_index = 0;     #pragma unroll     for(int i=0; i<threads_y; i++)     {         sh_qstart[i] = total_index;         total_index += (sh_qindex[i] + 1u);     }      // calculate offset in global list     unsigned int global_offset = atomicadd(glob_index, total_index);     #pragma unroll     for(int i=0; i<threads_y; i++)     {         sh_qstart[i] += global_offset;     } } __syncthreads();  // copy local queues global queue for(int i=0; i<=qindex; i+=threads_x) {     if(i + threadidx.x > qindex)         break;      unsigned int qvalue = sh_queue[threadidx.y*threads_x*pixels_per_thread + + threadidx.x];     list[sh_qstart[threadidx.y] + + threadidx.x] = qvalue; } } 

the following method calls kernel:

void call2dto1dkernel(unsigned char const * const h_image) {     // device side allocation     unsigned char *d_image = null;     unsigned int *d_list = null;     int h_nlist, *d_nlist = null;     cudamalloc((void**)&d_image, sizeof(unsigned char)*img_size);     cudamalloc((void**)&d_list, sizeof(unsigned int)*img_size);     cudamalloc((void**)&d_nlist, sizeof(int));      // time measurement initialization     cudaevent_t start, stop, startio, stopio;     cudaeventcreate(&start);     cudaeventcreate(&stop);     cudaeventcreate(&startio);      cudaeventcreate(&stopio);      // start timer w/ io     cudaeventrecord(startio,0);      // copy image data device     cudamemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*img_size,    cudamemcpyhosttodevice);      // start timer     cudaeventrecord(start,0);      // kernel call     // phase 1 : convert 2d binary image 1d pixel array     dim3 dimblock1(threads_x, threads_y);     dim3 dimgrid1(blocks_x, blocks_y);     convert2dedgeimageto1darray<<<dimgrid1, dimblock1>>>(d_image, d_list, d_nlist);      // stop timer     cudaeventrecord(stop,0);     cudaeventsynchronize(stop);      // stop timer w/ io     cudaeventrecord(stopio,0);     cudaeventsynchronize(stopio);      // time measurement     cudaeventelapsedtime(&et,start,stop);     cudaeventelapsedtime(&etio,startio,stopio);      // time measurement deinitialization     cudaeventdestroy(start);      cudaeventdestroy(stop);     cudaeventdestroy(startio);      cudaeventdestroy(stopio);      // list size     cudamemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudamemcpydevicetohost);      // report on console     printf("%d pixels processed...\n", h_nlist);      // device side dealloc     cudafree(d_image);     cudafree(d_space);     cudafree(d_list);     cudafree(d_nlist); } 

thank in advance everyone.

as preamble, let me suggest troubleshooting steps useful:

  1. instrument code proper cuda error checking
  2. run code cuda-memcheck e.g. cuda-memcheck ./myapp

if above steps, you'll find kernel failing, , failures have global writes of size 4. focus attention on last segment of kernel, beginning comment // copy local queues global queue

regarding code, then, have @ least 2 problems:

  1. the addressing/indexing in final segment of kernel, writing individual queues out global memory, messed up. i'm not going try , debug you.
  2. you not initializing d_nlist variable zero. therefore when atomic add it, adding values junk value, tend increase repeat process.

here's code has problems removed, (i did not try sort out queue copy code) , error checking added. produces repeatable results me:

$ cat t216.cu #include <stdio.h> #include <stdlib.h>  #define threads_x 32 #define threads_y 4 #define pixels_per_thread 4 #define max_queue_length (threads_x*threads_y*pixels_per_thread) #define img_width 256 #define img_height 256 #define img_size (img_width*img_height) #define blocks_x (img_width/(threads_x*pixels_per_thread)) #define blocks_y (img_height/threads_y)  #define cudacheckerrors(msg) \     { \         cudaerror_t __err = cudagetlasterror(); \         if (__err != cudasuccess) { \             fprintf(stderr, "fatal error: %s (%s @ %s:%d)\n", \                 msg, cudageterrorstring(__err), \                 __file__, __line__); \             fprintf(stderr, "*** failed - aborting\n"); \             exit(1); \         } \     } while (0)  __global__ void convert2dedgeimageto1darray( unsigned char const * const image, unsigned int* const list, int* const glob_index ) {  unsigned int const x = blockidx.x  * threads_x*pixels_per_thread + threadidx.x; unsigned int const y = blockidx.y  * threads_y + threadidx.y;  volatile int qindex = -1; volatile __shared__ int sh_qindex[threads_y]; volatile __shared__ int sh_qstart[threads_y]; sh_qindex[threadidx.y] = -1;  // start making array volatile __shared__ unsigned int sh_queue[max_queue_length];  // fill queue for(int i=0; i<pixels_per_thread; i++) {     int const xx = i*threads_x + x;      // read 1 image pixel global memory     unsigned char const pixel = image[y*img_width + xx];     unsigned int  const queue_val = (y << 16) + xx;      if(pixel)     {         {             qindex++;             sh_qindex[threadidx.y] = qindex;             sh_queue[threadidx.y*threads_x*pixels_per_thread + qindex] = queue_val;         } while (sh_queue[threadidx.y*threads_x*pixels_per_thread + qindex] != queue_val);     }      // reload index smem (last thread write smem have updated it)     qindex = sh_qindex[threadidx.y]; }  // let thread 0 reserve space required in global list __syncthreads(); if(threadidx.x == 0 && threadidx.y == 0) {     // find how many items stored in each list     int total_index = 0;     #pragma unroll     for(int i=0; i<threads_y; i++)     {         sh_qstart[i] = total_index;         total_index += (sh_qindex[i] + 1u);     }      // calculate offset in global list     unsigned int global_offset = atomicadd(glob_index, total_index);     #pragma unroll     for(int i=0; i<threads_y; i++)     {         sh_qstart[i] += global_offset;     }  } __syncthreads();  // copy local queues global queue /* for(int i=0; i<=qindex; i+=threads_x) {     if(i + threadidx.x > qindex)         break;      unsigned int qvalue = sh_queue[threadidx.y*threads_x*pixels_per_thread + + threadidx.x];     list[sh_qstart[threadidx.y] + + threadidx.x] = qvalue; } */ }  void call2dto1dkernel(unsigned char const * const h_image) {     // device side allocation     unsigned char *d_image = null;     unsigned int *d_list = null;     int h_nlist=0, *d_nlist = null;     cudamalloc((void**)&d_image, sizeof(unsigned char)*img_size);     cudamalloc((void**)&d_list, sizeof(unsigned int)*img_size);     cudamalloc((void**)&d_nlist, sizeof(int));     cudacheckerrors("cudamalloc fail");      // time measurement initialization     cudaevent_t start, stop, startio, stopio;     cudaeventcreate(&start);     cudaeventcreate(&stop);     cudaeventcreate(&startio);     cudaeventcreate(&stopio);     float et, etio;      // start timer w/ io     cudaeventrecord(startio,0);     cudamemcpy(d_nlist, &h_nlist, sizeof(int), cudamemcpyhosttodevice);     // copy image data device     cudamemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*img_size,    cudamemcpyhosttodevice);     cudacheckerrors("cudamemcpy 1");     // start timer     cudaeventrecord(start,0);      // kernel call     // phase 1 : convert 2d binary image 1d pixel array     dim3 dimblock1(threads_x, threads_y);     dim3 dimgrid1(blocks_x, blocks_y);     convert2dedgeimageto1darray<<<dimgrid1, dimblock1>>>(d_image, d_list, d_nlist);     cudadevicesynchronize();     cudacheckerrors("kernel fail");     // stop timer     cudaeventrecord(stop,0);     cudaeventsynchronize(stop);      // stop timer w/ io     cudaeventrecord(stopio,0);     cudaeventsynchronize(stopio);      // time measurement     cudaeventelapsedtime(&et,start,stop);     cudaeventelapsedtime(&etio,startio,stopio);      // time measurement deinitialization     cudaeventdestroy(start);     cudaeventdestroy(stop);     cudaeventdestroy(startio);     cudaeventdestroy(stopio);      // list size     cudamemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudamemcpydevicetohost);     cudacheckerrors("cudamemcpy 2");     // report on console     printf("%d pixels processed...\n", h_nlist);      // device side dealloc     cudafree(d_image); //    cudafree(d_space);     cudafree(d_list);     cudafree(d_nlist); }  int main(){    unsigned char *image;    image = (unsigned char *)malloc(img_size * sizeof(unsigned char));   if (image == 0) {printf("malloc fail\n"); return 0;}    (int =0 ; i<img_size; i++)     image[i] = rand()%2;    call2dto1dkernel(image);   call2dto1dkernel(image);   call2dto1dkernel(image);   call2dto1dkernel(image);   call2dto1dkernel(image);   cudacheckerrors("some error");   return 0; }  $ nvcc -arch=sm_20 -o3 -o t216 t216.cu $ ./t216 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... $ ./t216 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... $ 

Comments

Popular posts from this blog

image - ClassNotFoundException when add a prebuilt apk into system.img in android -

I need to import mysql 5.1 to 5.5? -

Java, Hibernate, MySQL - store UTC date-time -