It's a little baffling to me why the CUDA code runs about twice as slow as the CPU version. I'm just counting all the primes from 0 to 512 * 512 * 512. The CPU version finished in about 97 seconds, whereas the GPU version took 182 seconds.
- CPU: Intel Core i7 @ 4 GHz
- GPU: NVIDIA GTX 960
Any ideas why?
```cpp
#include <cuda.h>
#include <iostream>
#include <cstdint>
#include <stdio.h>
#include <ctime>
#include <cmath>
#include <vector>
#include <cstdlib>
#include <climits>

using namespace std;

// Trial division up to sqrt(n); shared by the CPU and GPU versions.
__host__ __device__ bool is_prime(uint32_t n)
{
    if(n < 2)          // 0 and 1 are not prime
        return false;
    if(n == 2)
        return true;
    if(n % 2 == 0)
        return false;
    uint32_t sr = sqrtf(n);
    for(uint32_t i = 3; i <= sr; i += 2)
        if(n % i == 0)
            return false;
    return true;
}

// One thread per candidate n; each prime increments a single global counter.
__global__ void prime_sum(unsigned int* count)
{
    // flat index over a 512 x 512 grid of 512-thread blocks
    uint32_t n = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if(is_prime(n))
        atomicAdd(count, 1);
}
```
CPU version
```cpp
int main()
{
    time_t start = time(0);

    int pcount = 0;
    for(uint32_t i = 0; i < (512 * 512 * 512); i++)
    {
        if(is_prime(i)) pcount++;
    }

    start = time(0) - start;    // elapsed wall-clock seconds
    std::cout << pcount << "\t" << start << std::endl;
    return 0;
}
```
CUDA version
```cpp
int main()
{
    time_t start = time(0);

    unsigned int* sum_d;
    cudaMalloc(&sum_d, sizeof(unsigned int));
    cudaMemset(sum_d, 0, sizeof(unsigned int));

    // 512 x 512 blocks of 512 threads = 512^3 candidates, one per thread
    prime_sum<<< dim3(512, 512), 512 >>>(sum_d);

    // cudaMemcpy blocks until the kernel has finished
    unsigned int sum = 0;
    cudaMemcpy(&sum, sum_d, sizeof(unsigned int), cudaMemcpyDeviceToHost);

    start = time(0) - start;    // elapsed wall-clock seconds
    std::cout << sum << "\t" << start << std::endl;

    cudaFree(sum_d);
    return 0;
}
```
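
For what it's worth, here is a rough sketch (I haven't run this yet) of how I think just the kernel could be timed with CUDA events, to take the one-second resolution of time(0) and the CUDA context-creation cost out of the GPU measurement. It would replace the bare kernel launch in the CUDA main above:

```cpp
// Sketch: time only the prime_sum kernel with CUDA events.
// Assumes sum_d has already been allocated and zeroed as above.
cudaEvent_t start_ev, stop_ev;
cudaEventCreate(&start_ev);
cudaEventCreate(&stop_ev);

cudaEventRecord(start_ev);
prime_sum<<< dim3(512, 512), 512 >>>(sum_d);
cudaEventRecord(stop_ev);
cudaEventSynchronize(stop_ev);                  // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start_ev, stop_ev);   // elapsed time in milliseconds
std::cout << "kernel time: " << ms << " ms" << std::endl;

cudaEventDestroy(start_ev);
cudaEventDestroy(stop_ev);
```

I doubt setup overhead accounts for the whole 85-second gap, though, so any pointers on the kernel itself would also be appreciated.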