This is part of an online parallel programming course. I am using the below CUDA code to find the minimum in an image. The code is getting executed on the Udacity server. Since I do not have a GPU I am not able to run the code locally and debug too. As part of problem I have to find the maximum element in an array too.(which is not the current focus). I am using the reduction algorithm to find the min element. However I get the following error .Can someone please help me out in finding out what the problem is ? As per my understanding an unspecified launch failure is equivalent to a CPU side segmentation fault. But I am just not able to figure out where its going wrong. Thanks for all the help in advance!
Error message:: CUDA error at: HW3.cu:254 unspecified launch failure cudaGetLastError() We are unable to execute your code. Did you set the grid and/or block size correctly?
Here is the code. The kernel is Reduce_Min. It is called from the function your_histogram_and_prefixsum. Also the pointer d_logLuminance which is the parameter to the function your_histogram_and_prefixsum is the input array.
include "reference_calc.cpp"
include "utils.h"
//kernel code for minimum
global void Reduce_Min(const float * d_in, float * d_out)
{
extern shared float sdata[];
int bx = blockIdx.x;
int tx = threadIdx.x;
int myId = (bx * blockDim.x) + tx;
sdata[tx] = d_in[myId];
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (tx < s)
{
sdata[tx] > sdata[tx + s];
sdata[tx] = sdata[tx + s];
}
__syncthreads();
}
// only thread 0 writes result for this block back to global mem
if (tx == 0)
{
d_out[blockIdx.x] = sdata[0];
}
}
void your_histogram_and_prefixsum(const float const d_logLuminance,
unsigned int const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
float * d_intermediate , * d_out;
const int maxThreads = 1024;
const int ARRAY_BYTES = (((numRows * numCols) / maxThreads)* sizeof(float));
int nosBlocks = ((numRows * numCols) / maxThreads);
//device memory for min
cudaMalloc((void ) &d_out, sizeof(float));
cudaMalloc((void ) &d_intermediate, ARRAY_BYTES);
//parameters for kernel
const dim3 gridSize = ( numCols *numRows) / maxThreads;
const dim3 blockSize = (maxThreads);
Reduce_Min<<< gridSize , blockSize, maxThreads * sizeof(float)>>>(d_logLuminance, d_intermediate);
Reduce_Min<<< 1 , gridSize, numBlocks * sizeof(float)>>>(d_intermediate ,d_out);
cudaMemcpy(&min_logLum, d_out, sizeof(float), cudaMemcpyDeviceToHost);
}
I am editing to add the entire code Here is the entire code
Shown below is reference_main.cpp
#include <iostream>
#include "timer.h"
#include "utils.h"
#include <string>
#include <stdio.h>
void preProcess(float **d_luminance, unsigned int **d_cdf,
size_t *numRows, size_t *numCols, unsigned int *numBins,
const std::string& filename);
void postProcess(const std::string& output_file, size_t numRows, size_t numCols,
float min_logLum, float max_logLum);
void your_histogram_and_prefixsum(const float* const d_luminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins);
int main(int argc, char **argv) {
float *d_luminance;
unsigned int *d_cdf;
size_t numRows, numCols;
unsigned int numBins;
std::string input_file;
std::string output_file;
if (argc == 3) {
input_file = std::string(argv[1]);
output_file = std::string(argv[2]);
}
else {
std::cerr << "Usage: ./hw input_file output_file" << std::endl;
exit(1);
}
//load the image and give us our input and output pointers
preProcess(&d_luminance, &d_cdf,
&numRows, &numCols, &numBins, input_file);
GpuTimer timer;
float min_logLum, max_logLum;
min_logLum = 0.f;
max_logLum = 1.f;
timer.Start();
//call the students' code
your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum,
numRows, numCols, numBins);
timer.Stop();
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
int err = printf("%f msecs.\n", timer.Elapsed());
if (err < 0) {
//Couldn't print! Probably the student closed stdout - bad news
std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
exit(1);
}
//check results and output the tone-mapped image
postProcess(output_file, numRows, numCols, min_logLum, max_logLum);
return 0;
}
Below is the second file reference_hw3.cu
#include "utils.h"
#include <string>
#include "loadSaveImage.h"
#include <thrust/extrema.h>
//chroma-LogLuminance Space
static float *d_x__;
static float *d_y__;
static float *d_logY__;
//memory for the cdf
static unsigned int *d_cdf__;
static const int numBins = 1024;
size_t numRows__;
size_t numCols__;
/* Copied from Mike's IPython notebook with some minor modifications
* Mainly double precision constants to floats and log10 -> log10f
* Also removed Luminance (Y) channel since it is never used eke*/
__global__ void rgb_to_xyY(
float* d_r,
float* d_g,
float* d_b,
float* d_x,
float* d_y,
float* d_log_Y,
float delta,
int num_pixels_y,
int num_pixels_x )
{
int ny = num_pixels_y;
int nx = num_pixels_x;
int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
int image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x;
if ( image_index_2d.x < nx && image_index_2d.y < ny )
{
float r = d_r[ image_index_1d ];
float g = d_g[ image_index_1d ];
float b = d_b[ image_index_1d ];
float X = ( r * 0.4124f ) + ( g * 0.3576f ) + ( b * 0.1805f );
float Y = ( r * 0.2126f ) + ( g * 0.7152f ) + ( b * 0.0722f );
float Z = ( r * 0.0193f ) + ( g * 0.1192f ) + ( b * 0.9505f );
float L = X + Y + Z;
float x = X / L;
float y = Y / L;
float log_Y = log10f( delta + Y );
d_x[ image_index_1d ] = x;
d_y[ image_index_1d ] = y;
d_log_Y[ image_index_1d ] = log_Y;
}
}
/* Copied from Mike's IPython notebook *
Modified just by having threads read the
normalization constant directly from device memory
instead of copying it back */
__global__ void normalize_cdf(
unsigned int* d_input_cdf,
float* d_output_cdf,
int n
)
{
const float normalization_constant = 1.f / d_input_cdf[n - 1];
int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
if ( global_index_1d < n )
{
unsigned int input_value = d_input_cdf[ global_index_1d ];
float output_value = input_value * normalization_constant;
d_output_cdf[ global_index_1d ] = output_value;
}
}
/* Copied from Mike's IPython notebook *
Modified double constants -> float *
Perform tone mapping based upon new *
luminance scaling */
__global__ void tonemap(
float* d_x,
float* d_y,
float* d_log_Y,
float* d_cdf_norm,
float* d_r_new,
float* d_g_new,
float* d_b_new,
float min_log_Y,
float max_log_Y,
float log_Y_range,
int num_bins,
int num_pixels_y,
int num_pixels_x )
{
int ny = num_pixels_y;
int nx = num_pixels_x;
int2 image_index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
int image_index_1d = ( nx * image_index_2d.y ) + image_index_2d.x;
if ( image_index_2d.x < nx && image_index_2d.y < ny )
{
float x = d_x[ image_index_1d ];
float y = d_y[ image_index_1d ];
float log_Y = d_log_Y[ image_index_1d ];
int bin_index = min( num_bins - 1, int( (num_bins * ( log_Y - min_log_Y ) ) / log_Y_range ) );
float Y_new = d_cdf_norm[ bin_index ];
float X_new = x * ( Y_new / y );
float Z_new = ( 1 - x - y ) * ( Y_new / y );
float r_new = ( X_new * 3.2406f ) + ( Y_new * -1.5372f ) + ( Z_new * -0.4986f );
float g_new = ( X_new * -0.9689f ) + ( Y_new * 1.8758f ) + ( Z_new * 0.0415f );
float b_new = ( X_new * 0.0557f ) + ( Y_new * -0.2040f ) + ( Z_new * 1.0570f );
d_r_new[ image_index_1d ] = r_new;
d_g_new[ image_index_1d ] = g_new;
d_b_new[ image_index_1d ] = b_new;
}
}
//return types are void since any internal error will be handled by quitting
//no point in returning error codes...
void preProcess(float** d_luminance, unsigned int** d_cdf,
size_t *numRows, size_t *numCols,
unsigned int *numberOfBins,
const std::string &filename) {
//make sure the context initializes ok
checkCudaErrors(cudaFree(0));
float *imgPtr; //we will become responsible for this pointer
loadImageHDR(filename, &imgPtr, &numRows__, &numCols__);
*numRows = numRows__;
*numCols = numCols__;
//first thing to do is split incoming BGR float data into separate channels
size_t numPixels = numRows__ * numCols__;
float *red = new float[numPixels];
float *green = new float[numPixels];
float *blue = new float[numPixels];
//Remeber image is loaded BGR
for (size_t i = 0; i < numPixels; ++i) {
blue[i] = imgPtr[3 * i + 0];
green[i] = imgPtr[3 * i + 1];
red[i] = imgPtr[3 * i + 2];
}
delete[] imgPtr; //being good citizens are releasing resources
//allocated in loadImageHDR
float *d_red, *d_green, *d_blue; //RGB space
size_t channelSize = sizeof(float) * numPixels;
checkCudaErrors(cudaMalloc(&d_red, channelSize));
checkCudaErrors(cudaMalloc(&d_green, channelSize));
checkCudaErrors(cudaMalloc(&d_blue, channelSize));
checkCudaErrors(cudaMalloc(&d_x__, channelSize));
checkCudaErrors(cudaMalloc(&d_y__, channelSize));
checkCudaErrors(cudaMalloc(&d_logY__, channelSize));
checkCudaErrors(cudaMemcpy(d_red, red, channelSize, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_green, green, channelSize, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_blue, blue, channelSize, cudaMemcpyHostToDevice));
//convert from RGB space to chrominance/luminance space xyY
const dim3 blockSize(32, 16, 1);
const dim3 gridSize( (numCols__ + blockSize.x - 1) / blockSize.x,
(numRows__ + blockSize.y - 1) / blockSize.y, 1);
rgb_to_xyY<<<gridSize, blockSize>>>(d_red, d_green, d_blue,
d_x__, d_y__, d_logY__,
.0001f, numRows__, numCols__);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
*d_luminance = d_logY__;
//allocate memory for the cdf of the histogram
*numberOfBins = numBins;
checkCudaErrors(cudaMalloc(&d_cdf__, sizeof(unsigned int) * numBins));
checkCudaErrors(cudaMemset(d_cdf__, 0, sizeof(unsigned int) * numBins));
*d_cdf = d_cdf__;
checkCudaErrors(cudaFree(d_red));
checkCudaErrors(cudaFree(d_green));
checkCudaErrors(cudaFree(d_blue));
delete[] red;
delete[] green;
delete[] blue;
}
void postProcess(const std::string& output_file,
size_t numRows, size_t numCols,
float min_log_Y, float max_log_Y) {
const int numPixels = numRows__ * numCols__;
const int numThreads = 192;
float *d_cdf_normalized;
checkCudaErrors(cudaMalloc(&d_cdf_normalized, sizeof(float) * numBins));
//first normalize the cdf to a maximum value of 1
//this is how we compress the range of the luminance channel
normalize_cdf<<< (numBins + numThreads - 1) / numThreads,
numThreads>>>(d_cdf__,
d_cdf_normalized,
numBins);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
//allocate memory for the output RGB channels
float *h_red, *h_green, *h_blue;
float *d_red, *d_green, *d_blue;
h_red = new float[numPixels];
h_green = new float[numPixels];
h_blue = new float[numPixels];
checkCudaErrors(cudaMalloc(&d_red, sizeof(float) * numPixels));
checkCudaErrors(cudaMalloc(&d_green, sizeof(float) * numPixels));
checkCudaErrors(cudaMalloc(&d_blue, sizeof(float) * numPixels));
float log_Y_range = max_log_Y - min_log_Y;
const dim3 blockSize(32, 16, 1);
const dim3 gridSize( (numCols + blockSize.x - 1) / blockSize.x,
(numRows + blockSize.y - 1) / blockSize.y );
//next perform the actual tone-mapping
//we map each luminance value to its new value
//and then transform back to RGB space
tonemap<<<gridSize, blockSize>>>(d_x__, d_y__, d_logY__,
d_cdf_normalized,
d_red, d_green, d_blue,
min_log_Y, max_log_Y,
log_Y_range, numBins,
numRows, numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(h_red, d_red, sizeof(float) * numPixels, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_green, d_green, sizeof(float) * numPixels, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_blue, d_blue, sizeof(float) * numPixels, cudaMemcpyDeviceToHost));
//recombine the image channels
float *imageHDR = new float[numPixels * 3];
for (int i = 0; i < numPixels; ++i) {
imageHDR[3 * i + 0] = h_blue[i];
imageHDR[3 * i + 1] = h_green[i];
imageHDR[3 * i + 2] = h_red[i];
}
saveImageHDR(imageHDR, numRows, numCols, output_file);
delete[] imageHDR;
delete[] h_red;
delete[] h_green;
delete[] h_blue;
//cleanup
checkCudaErrors(cudaFree(d_x__));
checkCudaErrors(cudaFree(d_y__));
checkCudaErrors(cudaFree(d_logY__));
checkCudaErrors(cudaFree(d_cdf__));
checkCudaErrors(cudaFree(d_cdf_normalized));
}
Shown below is the third file student_func.cu which contains the actual kernel code to find the minimum using reduction
include "reference_calc.cpp"
include "utils.h"
//kernel code for minimum
global void Reduce_Min(const float * d_in, float * d_out)
{
extern shared float sdata[];
int bx = blockIdx.x;
int tx = threadIdx.x;
int myId = (bx * blockDim.x) + tx;
sdata[tx] = d_in[myId];
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (tx < s)
{
sdata[tx] > sdata[tx + s];
sdata[tx] = sdata[tx + s];
}
__syncthreads();
}
// only thread 0 writes result for this block back to global mem
if (tx == 0)
{
d_out[blockIdx.x] = sdata[0];
}
}
void your_histogram_and_prefixsum(const float const d_logLuminance,
unsigned int const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
float * d_intermediate , * d_out, d_out_max, d_intermediate_max;
const int maxThreads = 1024;
const int ARRAY_BYTES = (((numRows * numCols) / maxThreads)* sizeof(float));
int nosBlocks = ((numRows * numCols) / maxThreads);
//device memory for min
cudaMalloc((void **) &d_out, sizeof(float));
cudaMalloc((void **) &d_intermediate, ARRAY_BYTES);
//parameters for kernel
const dim3 gridSize = ( numCols *numRows) / maxThreads;
const dim3 blockSize = (maxThreads);
Reduce_Min<<< gridSize , blockSize, maxThreads * sizeof(float)>>>(d_logLuminance, d_intermediate);
Reduce_Min<<< 1 , gridSize, numBlocks * sizeof(float)>>>(d_intermediate ,d_out);
cudaMemcpy(&min_logLum, d_out, sizeof(float), cudaMemcpyDeviceToHost);
}
cuda-memcheck
, such ascuda-memcheck ./mycode
– Robert Crovella Jul 12 at 14:09cuda-memcheck
has been useful in a number of cases, see CUDA error message : unspecified launch failure, CUDA: “unspecified launch failure”(http://stackoverflow.com/questions/11820912/cuda-unspecified-launch-failure) and [CUDA Error - unspecified launch failure. – JackOLantern Jul 12 at 21:57