]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/blob - src/cuda_functions.cu
Work done so far on CUDA streams
[hercules2020/kcf.git] / src / cuda_functions.cu
1 #include "cuda_functions.cuh"
2
// Reduces per-channel correlation responses into a Gaussian kernel response.
//
// Expected launch layout (see cuda_gaussian_correlation):
//   grid  = (n_scales, rows*cols)  -> one block per (scale, pixel) pair
//   block = channels_per_scale/2 threads
//   dynamic shared memory = blockDim.x * sizeof(float)
//
// data_in  : one contiguous run of channels_per_scale floats per block; the
//            run for (scale s, pixel p) starts at
//            (p * gridDim.x + s) * channels_per_scale.
// data_out : written as data_out[scale * rows * cols + pixel].
// xf_sqr_norm : one squared norm per scale (indexed by blockIdx.x).
// yf_sqr_norm : single squared norm shared by all blocks (element 0 only).
//
// NOTE(review): assumes channels_per_scale is even and blockDim.x equals
// channels_per_scale/2; with an odd channel count the last channel would
// never be summed — confirm against callers.
__global__ void gaussian_correlation_kernel(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, int rows, int cols, int channels_per_scale, double sigma)
{
    extern __shared__ float sdata[];
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;
    // Per-block stride is blockDim.x + channels_per_scale/2 == channels_per_scale
    // (given the launch configuration above).
    int threadId = blockId * (blockDim.x + channels_per_scale / 2) + threadIdx.x;

    // First reduction step fused with the global load: each thread sums one
    // pair of channels. (The previous zero-initialization of sdata here was a
    // dead store — this assignment overwrites it unconditionally.)
    sdata[threadIdx.x] = data_in[threadId] + data_in[threadId + blockDim.x];
    __syncthreads();

    // Shared-memory tree reduction that also handles non-power-of-two widths:
    // old_s is the count of valid partial sums, s the rounded-up half width.
    // The barrier is outside the divergent branch and the trip count is
    // uniform across the block, so every thread reaches every __syncthreads().
    for (unsigned int s = (channels_per_scale / 2 + 1) / 2, old_s = channels_per_scale / 2; s > 0; s >>= 1) {

        if (old_s & 1) s += 1;

        if (threadIdx.x < s && threadIdx.x + s < old_s) {
            sdata[threadIdx.x] += sdata[threadIdx.x + s];
        }
        old_s = s;
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        // Cross-correlation term for this (scale, pixel), normalized by the
        // transform size (inverse-FFT scaling folded in here).
        float accumulate_res = sdata[0] / (rows * cols);

        // 1 / numel of the half-spectrum: (cols/2+1) * rows * channels.
        float numel_xf_inv = 1.f / ((cols / 2 + 1) * rows * (channels_per_scale));

        // ||x||^2 + ||y||^2 - 2*x.y, clamped at zero below.
        float tmp = (xf_sqr_norm[blockIdx.x] + yf_sqr_norm[0] - 2 * accumulate_res) * numel_xf_inv;

        // Keep the exponent math in single precision; sigma arrives as double
        // and would otherwise promote the whole expression.
        float inv_sigma_sq = 1.f / (float)(sigma * sigma);

        // exp(-max(0, tmp) / sigma^2); the clamped branch is exactly
        // exp(0) == 1, so skip the transcendental there.
        if (tmp > 0) {
            data_out[blockIdx.x * rows * cols + blockIdx.y] = expf(-inv_sigma_sq * tmp);
        } else {
            data_out[blockIdx.x * rows * cols + blockIdx.y] = 1.f;
        }
    }
}
38
// Host-side launcher for gaussian_correlation_kernel on the given stream:
// one block per (scale, pixel) pair, half as many threads as channels per
// scale (the kernel's load step sums channel pairs), and one float of
// dynamic shared memory per thread. Asynchronous with respect to the host;
// only the launch itself is error-checked here.
void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales, int rows, int cols, cudaStream_t stream)
{
    const int channels_per_scale = n_channels / n_scales;
    const int threads = channels_per_scale / 2;
    const size_t shared_bytes = threads * sizeof(float);

    dim3 block(threads);
    dim3 grid(n_scales, rows * cols);

    gaussian_correlation_kernel<<<grid, block, shared_bytes, stream>>>(
        data_in, data_out, xf_sqr_norm, yf_sqr_norm, rows, cols, channels_per_scale, sigma);
    CudaCheckError();
}