Tell me more ×
Stack Overflow is a question and answer site for professional and enthusiast programmers. It's 100% free, no registration required.

This is part of an online parallel programming course. I am using the below CUDA code to find the minimum in an image. The code is getting executed on the Udacity server. Since I do not have a GPU I am not able to run the code locally and debug too. As part of problem I have to find the maximum element in an array too.(which is not the current focus). I am using the reduction algorithm to find the min element. However I get the following error .Can someone please help me out in finding out what the problem is ? As per my understanding an unspecified launch failure is equivalent to a CPU side segmentation fault. But I am just not able to figure out where its going wrong. Thanks for all the help in advance!

Error message:: CUDA error at: HW3.cu:254 unspecified launch failure cudaGetLastError() We are unable to execute your code. Did you set the grid and/or block size correctly?

Here is the code. The kernel is Reduce_Min. It is called from the function your_histogram_and_prefixsum. Also the pointer d_logLuminance which is the parameter to the function your_histogram_and_prefixsum is the input array.

include "reference_calc.cpp"
include "utils.h"
//kernel code for minimum
global void Reduce_Min(const float * d_in, float * d_out)
{
extern shared float sdata[];
int bx = blockIdx.x; 
int tx = threadIdx.x; 
int myId = (bx * blockDim.x) + tx;

sdata[tx] = d_in[myId];
__syncthreads();

for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (tx < s)
{
sdata[tx] > sdata[tx + s];
sdata[tx] = sdata[tx + s];
}
__syncthreads(); 
}

// only thread 0 writes result for this block back to global mem
if (tx == 0)
{
d_out[blockIdx.x] = sdata[0];
}
}

void your_histogram_and_prefixsum(const float const d_logLuminance,
unsigned int const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
float * d_intermediate , * d_out;
const int maxThreads = 1024;
const int ARRAY_BYTES = (((numRows * numCols) / maxThreads)* sizeof(float));
int nosBlocks = ((numRows * numCols) / maxThreads);

//device memory for min
cudaMalloc((void ) &d_out, sizeof(float));
cudaMalloc((void ) &d_intermediate, ARRAY_BYTES);

//parameters for kernel 
const dim3 gridSize = ( numCols *numRows) / maxThreads;
const dim3 blockSize = (maxThreads);

Reduce_Min<<< gridSize , blockSize, maxThreads * sizeof(float)>>>(d_logLuminance,    d_intermediate);
Reduce_Min<<< 1 , gridSize, numBlocks * sizeof(float)>>>(d_intermediate ,d_out);
cudaMemcpy(&min_logLum, d_out, sizeof(float), cudaMemcpyDeviceToHost);

}

I am editing to add the entire code Here is the entire code

Shown below is reference_main.cpp

#include <iostream>
#include "timer.h"
#include "utils.h"
#include <string>
#include <stdio.h>

void preProcess(float **d_luminance, unsigned int **d_cdf,
                size_t *numRows, size_t *numCols, unsigned int *numBins,
                const std::string& filename);

void postProcess(const std::string& output_file, size_t numRows, size_t numCols,
                 float min_logLum, float max_logLum);

void your_histogram_and_prefixsum(const float* const d_luminance,
                                  unsigned int* const d_cdf,
                                  float &min_logLum,
                                  float &max_logLum,
                                  const size_t numRows,
                                  const size_t numCols,
                                  const size_t numBins);

int main(int argc, char **argv) {
  float *d_luminance;
  unsigned int *d_cdf;

  size_t numRows, numCols;
  unsigned int numBins;

  std::string input_file;
  std::string output_file;
  if (argc == 3) {
    input_file  = std::string(argv[1]);
    output_file = std::string(argv[2]);
  }
  else {
    std::cerr << "Usage: ./hw input_file output_file" << std::endl;
    exit(1);
  }
  //load the image and give us our input and output pointers
  preProcess(&d_luminance, &d_cdf,
             &numRows, &numCols, &numBins, input_file);

  GpuTimer timer;
  float min_logLum, max_logLum;
  min_logLum = 0.f;
  max_logLum = 1.f;
  timer.Start();
  //call the students' code
  your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum,
                               numRows, numCols, numBins);
  timer.Stop();
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
  int err = printf("%f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  //check results and output the tone-mapped image
  postProcess(output_file, numRows, numCols, min_logLum, max_logLum);

  return 0;
}

Below is the second file reference_hw3.cu

#include "utils.h"
#include <string>
#include "loadSaveImage.h"
#include <thrust/extrema.h>

//chroma-LogLuminance Space
static float *d_x__;
static float *d_y__;
static float *d_logY__;

//memory for the cdf
static unsigned int *d_cdf__;

static const int numBins = 1024;

size_t numRows__;
size_t numCols__;

/* Copied from Mike's IPython notebook with some minor modifications
 * Mainly double precision constants to floats and log10 -> log10f
 * Also removed Luminance (Y) channel since it is never used       eke*/

__global__ void rgb_to_xyY(
    float* d_r,
    float* d_g,
    float* d_b,
    float* d_x,
    float* d_y,
    float* d_log_Y,
    float  delta,
    int    num_pixels_y,
    int    num_pixels_x )
{
  int  ny             = num_pixels_y;
  int  nx             = num_pixels_x;
  int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
  int  image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x;

  if ( image_index_2d.x < nx && image_index_2d.y < ny )
  {
    float r = d_r[ image_index_1d ];
    float g = d_g[ image_index_1d ];
    float b = d_b[ image_index_1d ];

    float X = ( r * 0.4124f ) + ( g * 0.3576f ) + ( b * 0.1805f );
    float Y = ( r * 0.2126f ) + ( g * 0.7152f ) + ( b * 0.0722f );
    float Z = ( r * 0.0193f ) + ( g * 0.1192f ) + ( b * 0.9505f );

    float L = X + Y + Z;
    float x = X / L;
    float y = Y / L;

    float log_Y = log10f( delta + Y );

    d_x[ image_index_1d ]     = x;
    d_y[ image_index_1d ]     = y;
    d_log_Y[ image_index_1d ] = log_Y;
  }
}

/* Copied from Mike's IPython notebook *
   Modified just by having threads read the 
   normalization constant directly from device memory
   instead of copying it back                          */


__global__ void normalize_cdf(
    unsigned int* d_input_cdf,
    float*        d_output_cdf,
    int           n
    )
{
  const float normalization_constant = 1.f / d_input_cdf[n - 1];

  int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;

  if ( global_index_1d < n )
  {
    unsigned int input_value  = d_input_cdf[ global_index_1d ];
    float        output_value = input_value * normalization_constant;

    d_output_cdf[ global_index_1d ] = output_value;
  }
}


/* Copied from Mike's IPython notebook *
   Modified double constants -> float  *
   Perform tone mapping based upon new *
   luminance scaling                   */

__global__ void tonemap(
    float* d_x,
    float* d_y,
    float* d_log_Y,
    float* d_cdf_norm,
    float* d_r_new,
    float* d_g_new,
    float* d_b_new,
    float  min_log_Y,
    float  max_log_Y,
    float  log_Y_range,
    int    num_bins,
    int    num_pixels_y,
    int    num_pixels_x )
{
  int  ny             = num_pixels_y;
  int  nx             = num_pixels_x;
  int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
  int  image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x;

  if ( image_index_2d.x < nx && image_index_2d.y < ny )
  {
    float x         = d_x[ image_index_1d ];
    float y         = d_y[ image_index_1d ];
    float log_Y     = d_log_Y[ image_index_1d ];
    int   bin_index = min( num_bins - 1, int( (num_bins * ( log_Y - min_log_Y ) ) / log_Y_range ) );
    float Y_new     = d_cdf_norm[ bin_index ];

    float X_new = x * ( Y_new / y );
    float Z_new = ( 1 - x - y ) * ( Y_new / y );

    float r_new = ( X_new *  3.2406f ) + ( Y_new * -1.5372f ) + ( Z_new * -0.4986f );
    float g_new = ( X_new * -0.9689f ) + ( Y_new *  1.8758f ) + ( Z_new *  0.0415f );
    float b_new = ( X_new *  0.0557f ) + ( Y_new * -0.2040f ) + ( Z_new *  1.0570f );

    d_r_new[ image_index_1d ] = r_new;
    d_g_new[ image_index_1d ] = g_new;
    d_b_new[ image_index_1d ] = b_new;
  }
}


//return types are void since any internal error will be handled by quitting
//no point in returning error codes...
void preProcess(float** d_luminance, unsigned int** d_cdf,
                size_t *numRows, size_t *numCols,
                unsigned int *numberOfBins,
                const std::string &filename) {
  //make sure the context initializes ok
  checkCudaErrors(cudaFree(0));

  float *imgPtr; //we will become responsible for this pointer
  loadImageHDR(filename, &imgPtr, &numRows__, &numCols__);
  *numRows = numRows__;
  *numCols = numCols__;

  //first thing to do is split incoming BGR float data into separate channels
  size_t numPixels = numRows__ * numCols__;
  float *red   = new float[numPixels];
  float *green = new float[numPixels];
  float *blue  = new float[numPixels];

  //Remeber image is loaded BGR
  for (size_t i = 0; i < numPixels; ++i) {
    blue[i]  = imgPtr[3 * i + 0];
    green[i] = imgPtr[3 * i + 1];
    red[i]   = imgPtr[3 * i + 2];
  }

  delete[] imgPtr; //being good citizens are releasing resources
                   //allocated in loadImageHDR

  float *d_red, *d_green, *d_blue;  //RGB space

  size_t channelSize = sizeof(float) * numPixels;

  checkCudaErrors(cudaMalloc(&d_red,    channelSize));
  checkCudaErrors(cudaMalloc(&d_green,  channelSize));
  checkCudaErrors(cudaMalloc(&d_blue,   channelSize));
  checkCudaErrors(cudaMalloc(&d_x__,    channelSize));
  checkCudaErrors(cudaMalloc(&d_y__,    channelSize));
  checkCudaErrors(cudaMalloc(&d_logY__, channelSize));

  checkCudaErrors(cudaMemcpy(d_red,   red,   channelSize, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_green, green, channelSize, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_blue,  blue,  channelSize, cudaMemcpyHostToDevice));

  //convert from RGB space to chrominance/luminance space xyY
  const dim3 blockSize(32, 16, 1);
  const dim3 gridSize( (numCols__ + blockSize.x - 1) / blockSize.x, 
                       (numRows__ + blockSize.y - 1) / blockSize.y, 1);
  rgb_to_xyY<<<gridSize, blockSize>>>(d_red, d_green, d_blue,
                                      d_x__, d_y__,   d_logY__,
                                      .0001f, numRows__, numCols__);

  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

  *d_luminance = d_logY__;

  //allocate memory for the cdf of the histogram
  *numberOfBins = numBins;
  checkCudaErrors(cudaMalloc(&d_cdf__, sizeof(unsigned int) * numBins));
  checkCudaErrors(cudaMemset(d_cdf__, 0, sizeof(unsigned int) * numBins));

  *d_cdf = d_cdf__;

  checkCudaErrors(cudaFree(d_red));
  checkCudaErrors(cudaFree(d_green));
  checkCudaErrors(cudaFree(d_blue));

  delete[] red;
  delete[] green;
  delete[] blue;
}

void postProcess(const std::string& output_file, 
                 size_t numRows, size_t numCols,
                 float min_log_Y, float max_log_Y) {
  const int numPixels = numRows__ * numCols__;

  const int numThreads = 192;

  float *d_cdf_normalized;

  checkCudaErrors(cudaMalloc(&d_cdf_normalized, sizeof(float) * numBins));

  //first normalize the cdf to a maximum value of 1
  //this is how we compress the range of the luminance channel
  normalize_cdf<<< (numBins + numThreads - 1) / numThreads,
                    numThreads>>>(d_cdf__,
                                  d_cdf_normalized,
                                  numBins);

  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

  //allocate memory for the output RGB channels
  float *h_red, *h_green, *h_blue;
  float *d_red, *d_green, *d_blue;

  h_red   = new float[numPixels];
  h_green = new float[numPixels];
  h_blue  = new float[numPixels];

  checkCudaErrors(cudaMalloc(&d_red,   sizeof(float) * numPixels));
  checkCudaErrors(cudaMalloc(&d_green, sizeof(float) * numPixels));
  checkCudaErrors(cudaMalloc(&d_blue,  sizeof(float) * numPixels));

  float log_Y_range = max_log_Y - min_log_Y;

  const dim3 blockSize(32, 16, 1);
  const dim3 gridSize( (numCols + blockSize.x - 1) / blockSize.x,
                       (numRows + blockSize.y - 1) / blockSize.y );
  //next perform the actual tone-mapping
  //we map each luminance value to its new value
  //and then transform back to RGB space
  tonemap<<<gridSize, blockSize>>>(d_x__, d_y__, d_logY__,
                                   d_cdf_normalized,
                                   d_red, d_green, d_blue,
                                   min_log_Y, max_log_Y,
                                   log_Y_range, numBins,
                                   numRows, numCols);

  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

  checkCudaErrors(cudaMemcpy(h_red,   d_red,   sizeof(float) * numPixels, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaMemcpy(h_green, d_green, sizeof(float) * numPixels, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaMemcpy(h_blue,  d_blue,  sizeof(float) * numPixels, cudaMemcpyDeviceToHost));

  //recombine the image channels
  float *imageHDR = new float[numPixels * 3];

  for (int i = 0; i < numPixels; ++i) {
    imageHDR[3 * i + 0] = h_blue[i];
    imageHDR[3 * i + 1] = h_green[i];
    imageHDR[3 * i + 2] = h_red[i];
  }

  saveImageHDR(imageHDR, numRows, numCols, output_file);

  delete[] imageHDR;
  delete[] h_red;
  delete[] h_green;
  delete[] h_blue;

  //cleanup
  checkCudaErrors(cudaFree(d_x__));
  checkCudaErrors(cudaFree(d_y__));
  checkCudaErrors(cudaFree(d_logY__));
  checkCudaErrors(cudaFree(d_cdf__));
  checkCudaErrors(cudaFree(d_cdf_normalized));
}

Shown below is the third file student_func.cu which contains the actual kernel code to find the minimum using reduction

include "reference_calc.cpp"
include "utils.h"
//kernel code for minimum
global void Reduce_Min(const float * d_in, float * d_out)
{
extern shared float sdata[];
int bx = blockIdx.x; 
int tx = threadIdx.x; 
int myId = (bx * blockDim.x) + tx;

sdata[tx] = d_in[myId];
__syncthreads();


for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
    if (tx < s)
    {
        sdata[tx] > sdata[tx + s];
        sdata[tx] = sdata[tx + s];
    }
    __syncthreads(); 
}

  // only thread 0 writes result for this block back to global mem
if (tx == 0)
{
    d_out[blockIdx.x] = sdata[0];
}
}

void your_histogram_and_prefixsum(const float const d_logLuminance,
unsigned int const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
float * d_intermediate , * d_out, d_out_max, d_intermediate_max;
const int maxThreads = 1024;
const int ARRAY_BYTES = (((numRows * numCols) / maxThreads)* sizeof(float));
int nosBlocks = ((numRows * numCols) / maxThreads);

//device memory for min
cudaMalloc((void **) &d_out, sizeof(float));
cudaMalloc((void **) &d_intermediate, ARRAY_BYTES);


//parameters for kernel 
const dim3 gridSize = ( numCols *numRows) / maxThreads;
const dim3 blockSize = (maxThreads);

Reduce_Min<<< gridSize , blockSize, maxThreads * sizeof(float)>>>(d_logLuminance, d_intermediate);
Reduce_Min<<< 1 , gridSize,  numBlocks * sizeof(float)>>>(d_intermediate ,d_out);
cudaMemcpy(&min_logLum, d_out, sizeof(float), cudaMemcpyDeviceToHost);

}
share|improve this question
Coud you provide the entire code so that we could copy, paste and compile it? – JackOLantern Jul 12 at 12:44
1  
try running your code with cuda-memcheck, such as cuda-memcheck ./mycode – Robert Crovella Jul 12 at 14:09
@JackOLantern Please find attached the entire code. Thanks for the help – user2273108 Jul 12 at 14:51
@user2273108 This code requires input files, right? So it is not possible to run it on this side. Follow Robert Crovella's suggestion. cuda-memcheck has been useful in a number of cases, see CUDA error message : unspecified launch failure, CUDA: “unspecified launch failure”(http://stackoverflow.com/questions/11820912/cuda-unspecified-launch-fai‌​lure) and [CUDA Error - unspecified launch failure. – JackOLantern Jul 12 at 21:57

Know someone who can answer? Share a link to this question via email, Google+, Twitter, or Facebook.

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Browse other questions tagged or ask your own question.