Converting a 2D Canny Edge image to 1D edge pixel array in CUDA - Strange behaviour -
i have cuda kernel takes edge image , processes create smaller, 1d array of edge pixels. here strange behaviour. every time run kernel , calculate number of edge pixels in "d_nlist" (see code near printf), greater pixel count each time, when use same image , stop program , re-run. therefore, each time run it, takes longer run, until eventually, throws un-caught exception.
my question is, how can stop happening can consistent results each time run kernel?
my device geforce 620.
constants:
threads_x = 32
threads_y = 4
pixels_per_thread = 4
max_queue_length = threads_x * threads_y * pixels_per_thread
img_width = 256
img_height = 256
img_size = img_width * img_height
blocks_x = img_width / (threads_x * pixels_per_thread)
blocks_y = img_height / threads_y
the kernel follows:
__global__ void convert2dedgeimageto1darray( unsigned char const * const image, unsigned int* const list, int* const glob_index ) { unsigned int const x = blockidx.x * threads_x*pixels_per_thread + threadidx.x; unsigned int const y = blockidx.y * threads_y + threadidx.y; volatile int qindex = -1; volatile __shared__ int sh_qindex[threads_y]; volatile __shared__ int sh_qstart[threads_y]; sh_qindex[threadidx.y] = -1; // start making array volatile __shared__ unsigned int sh_queue[max_queue_length]; // fill queue for(int i=0; i<pixels_per_thread; i++) { int const xx = i*threads_x + x; // read 1 image pixel global memory unsigned char const pixel = image[y*img_width + xx]; unsigned int const queue_val = (y << 16) + xx; if(pixel) { { qindex++; sh_qindex[threadidx.y] = qindex; sh_queue[threadidx.y*threads_x*pixels_per_thread + qindex] = queue_val; } while (sh_queue[threadidx.y*threads_x*pixels_per_thread + qindex] != queue_val); } // reload index smem (last thread write smem have updated it) qindex = sh_qindex[threadidx.y]; } // let thread 0 reserve space required in global list __syncthreads(); if(threadidx.x == 0 && threadidx.y == 0) { // find how many items stored in each list int total_index = 0; #pragma unroll for(int i=0; i<threads_y; i++) { sh_qstart[i] = total_index; total_index += (sh_qindex[i] + 1u); } // calculate offset in global list unsigned int global_offset = atomicadd(glob_index, total_index); #pragma unroll for(int i=0; i<threads_y; i++) { sh_qstart[i] += global_offset; } } __syncthreads(); // copy local queues global queue for(int i=0; i<=qindex; i+=threads_x) { if(i + threadidx.x > qindex) break; unsigned int qvalue = sh_queue[threadidx.y*threads_x*pixels_per_thread + + threadidx.x]; list[sh_qstart[threadidx.y] + + threadidx.x] = qvalue; } }
the following method calls kernel:
void call2dto1dkernel(unsigned char const * const h_image) { // device side allocation unsigned char *d_image = null; unsigned int *d_list = null; int h_nlist, *d_nlist = null; cudamalloc((void**)&d_image, sizeof(unsigned char)*img_size); cudamalloc((void**)&d_list, sizeof(unsigned int)*img_size); cudamalloc((void**)&d_nlist, sizeof(int)); // time measurement initialization cudaevent_t start, stop, startio, stopio; cudaeventcreate(&start); cudaeventcreate(&stop); cudaeventcreate(&startio); cudaeventcreate(&stopio); // start timer w/ io cudaeventrecord(startio,0); // copy image data device cudamemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*img_size, cudamemcpyhosttodevice); // start timer cudaeventrecord(start,0); // kernel call // phase 1 : convert 2d binary image 1d pixel array dim3 dimblock1(threads_x, threads_y); dim3 dimgrid1(blocks_x, blocks_y); convert2dedgeimageto1darray<<<dimgrid1, dimblock1>>>(d_image, d_list, d_nlist); // stop timer cudaeventrecord(stop,0); cudaeventsynchronize(stop); // stop timer w/ io cudaeventrecord(stopio,0); cudaeventsynchronize(stopio); // time measurement cudaeventelapsedtime(&et,start,stop); cudaeventelapsedtime(&etio,startio,stopio); // time measurement deinitialization cudaeventdestroy(start); cudaeventdestroy(stop); cudaeventdestroy(startio); cudaeventdestroy(stopio); // list size cudamemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudamemcpydevicetohost); // report on console printf("%d pixels processed...\n", h_nlist); // device side dealloc cudafree(d_image); cudafree(d_space); cudafree(d_list); cudafree(d_nlist); }
thank in advance everyone.
as preamble, let me suggest troubleshooting steps useful:
- instrument code proper cuda error checking
- run code
cuda-memcheck
e.g.cuda-memcheck ./myapp
if above steps, you'll find kernel failing, , failures have global writes of size 4. focus attention on last segment of kernel, beginning comment // copy local queues global queue
regarding code, then, have @ least 2 problems:
- the addressing/indexing in final segment of kernel, writing individual queues out global memory, messed up. i'm not going try , debug you.
- you not initializing
d_nlist
variable zero. therefore when atomic add it, adding values junk value, tend increase repeat process.
here's code has problems removed, (i did not try sort out queue copy code) , error checking added. produces repeatable results me:
$ cat t216.cu #include <stdio.h> #include <stdlib.h> #define threads_x 32 #define threads_y 4 #define pixels_per_thread 4 #define max_queue_length (threads_x*threads_y*pixels_per_thread) #define img_width 256 #define img_height 256 #define img_size (img_width*img_height) #define blocks_x (img_width/(threads_x*pixels_per_thread)) #define blocks_y (img_height/threads_y) #define cudacheckerrors(msg) \ { \ cudaerror_t __err = cudagetlasterror(); \ if (__err != cudasuccess) { \ fprintf(stderr, "fatal error: %s (%s @ %s:%d)\n", \ msg, cudageterrorstring(__err), \ __file__, __line__); \ fprintf(stderr, "*** failed - aborting\n"); \ exit(1); \ } \ } while (0) __global__ void convert2dedgeimageto1darray( unsigned char const * const image, unsigned int* const list, int* const glob_index ) { unsigned int const x = blockidx.x * threads_x*pixels_per_thread + threadidx.x; unsigned int const y = blockidx.y * threads_y + threadidx.y; volatile int qindex = -1; volatile __shared__ int sh_qindex[threads_y]; volatile __shared__ int sh_qstart[threads_y]; sh_qindex[threadidx.y] = -1; // start making array volatile __shared__ unsigned int sh_queue[max_queue_length]; // fill queue for(int i=0; i<pixels_per_thread; i++) { int const xx = i*threads_x + x; // read 1 image pixel global memory unsigned char const pixel = image[y*img_width + xx]; unsigned int const queue_val = (y << 16) + xx; if(pixel) { { qindex++; sh_qindex[threadidx.y] = qindex; sh_queue[threadidx.y*threads_x*pixels_per_thread + qindex] = queue_val; } while (sh_queue[threadidx.y*threads_x*pixels_per_thread + qindex] != queue_val); } // reload index smem (last thread write smem have updated it) qindex = sh_qindex[threadidx.y]; } // let thread 0 reserve space required in global list __syncthreads(); if(threadidx.x == 0 && threadidx.y == 0) { // find how many items stored in each list int total_index = 0; #pragma unroll for(int i=0; i<threads_y; i++) { sh_qstart[i] = total_index; total_index += (sh_qindex[i] + 1u); } // calculate offset in global list unsigned int global_offset = atomicadd(glob_index, total_index); #pragma unroll for(int i=0; i<threads_y; i++) { sh_qstart[i] += global_offset; } } __syncthreads(); // copy local queues global queue /* for(int i=0; i<=qindex; i+=threads_x) { if(i + threadidx.x > qindex) break; unsigned int qvalue = sh_queue[threadidx.y*threads_x*pixels_per_thread + + threadidx.x]; list[sh_qstart[threadidx.y] + + threadidx.x] = qvalue; } */ } void call2dto1dkernel(unsigned char const * const h_image) { // device side allocation unsigned char *d_image = null; unsigned int *d_list = null; int h_nlist=0, *d_nlist = null; cudamalloc((void**)&d_image, sizeof(unsigned char)*img_size); cudamalloc((void**)&d_list, sizeof(unsigned int)*img_size); cudamalloc((void**)&d_nlist, sizeof(int)); cudacheckerrors("cudamalloc fail"); // time measurement initialization cudaevent_t start, stop, startio, stopio; cudaeventcreate(&start); cudaeventcreate(&stop); cudaeventcreate(&startio); cudaeventcreate(&stopio); float et, etio; // start timer w/ io cudaeventrecord(startio,0); cudamemcpy(d_nlist, &h_nlist, sizeof(int), cudamemcpyhosttodevice); // copy image data device cudamemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*img_size, cudamemcpyhosttodevice); cudacheckerrors("cudamemcpy 1"); // start timer cudaeventrecord(start,0); // kernel call // phase 1 : convert 2d binary image 1d pixel array dim3 dimblock1(threads_x, threads_y); dim3 dimgrid1(blocks_x, blocks_y); convert2dedgeimageto1darray<<<dimgrid1, dimblock1>>>(d_image, d_list, d_nlist); cudadevicesynchronize(); cudacheckerrors("kernel fail"); // stop timer cudaeventrecord(stop,0); cudaeventsynchronize(stop); // stop timer w/ io cudaeventrecord(stopio,0); cudaeventsynchronize(stopio); // time measurement cudaeventelapsedtime(&et,start,stop); cudaeventelapsedtime(&etio,startio,stopio); // time measurement deinitialization cudaeventdestroy(start); cudaeventdestroy(stop); cudaeventdestroy(startio); cudaeventdestroy(stopio); // list size cudamemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudamemcpydevicetohost); cudacheckerrors("cudamemcpy 2"); // report on console printf("%d pixels processed...\n", h_nlist); // device side dealloc cudafree(d_image); // cudafree(d_space); cudafree(d_list); cudafree(d_nlist); } int main(){ unsigned char *image; image = (unsigned char *)malloc(img_size * sizeof(unsigned char)); if (image == 0) {printf("malloc fail\n"); return 0;} (int =0 ; i<img_size; i++) image[i] = rand()%2; call2dto1dkernel(image); call2dto1dkernel(image); call2dto1dkernel(image); call2dto1dkernel(image); call2dto1dkernel(image); cudacheckerrors("some error"); return 0; } $ nvcc -arch=sm_20 -o3 -o t216 t216.cu $ ./t216 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... $ ./t216 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... 32617 pixels processed... $
Comments
Post a Comment