]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/commitdiff
Gaussian correlation for CUFFT version is now on GPU. Also corrected << operator...
authorShanigen <vkaraf@gmail.com>
Mon, 23 Apr 2018 12:59:36 +0000 (14:59 +0200)
committerShanigen <vkaraf@gmail.com>
Mon, 23 Apr 2018 12:59:36 +0000 (14:59 +0200)
14 files changed:
src/CMakeLists.txt
src/complexmat.cuh
src/cuda/cuda_error_check.cuh
src/cuda_functions.cu [new file with mode: 0644]
src/cuda_functions.cuh [new file with mode: 0644]
src/fft.h
src/fft_cufft.cpp
src/fft_cufft.h
src/fft_fftw.cpp
src/fft_fftw.h
src/fft_opencv.cpp
src/fft_opencv.h
src/kcf.cpp
src/kcf.h

index 840d46278d8e0f4fc176cb0cd54ec712e764670f..c358b710bb025832e3b5755e67d87a713a566e2d 100644 (file)
@@ -37,7 +37,7 @@ ELSEIF(FFT STREQUAL "cuFFTW")
   add_definitions(-DFFTW -DCUFFTW)
   set(use_cuda ON)
 ELSEIF(FFT STREQUAL "cuFFT")
-    list(APPEND KCF_LIB_SRC fft_cufft.cpp complexmat.cuh)
+    list(APPEND KCF_LIB_SRC fft_cufft.cpp complexmat.cuh cuda_functions.cuh)
     add_definitions(-DCUFFT)
     set(use_cuda ON)
     iF(CUDA_DEBUG)
@@ -71,6 +71,7 @@ IF(use_cuda)
   IF(FFT STREQUAL "cuFFT")
   add_subdirectory(cuda)
   cuda_add_library(complexmat complexmat.cu)
+  cuda_add_library(cuda_func cuda_functions.cu)
   ENDIF()
   
 ENDIF()
@@ -96,7 +97,7 @@ IF(FFT STREQUAL "cuFFTW")
 ENDIF() #cuFFTW
 
 IF(FFT STREQUAL "cuFFT")
-    target_link_libraries(kcf ${CUDA_cufft_LIBRARY} ${CUDA_LIBRARIES} complexmat)
+    target_link_libraries(kcf ${CUDA_cufft_LIBRARY} ${CUDA_LIBRARIES} complexmat cuda_func)
 ENDIF()
 
 IF(PROFILING)
index aef660060708d967799c20b34795afec303c9250..f91cab7c4077678b7fcaf5edff19fb9c391792da 100644 (file)
@@ -95,15 +95,18 @@ public:
     //text output
     friend std::ostream & operator<<(std::ostream & os, const ComplexMat & mat)
     {
+        float *data_cpu = (float*) malloc(mat.rows*mat.cols*mat.n_channels*sizeof(cufftComplex));
+        CudaSafeCall(cudaMemcpy(data_cpu, mat.p_data, mat.rows*mat.cols*mat.n_channels*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
         //for (int i = 0; i < mat.n_channels; ++i){
         for (int i = 0; i < 1; ++i){
             os << "Channel " << i << std::endl;
             for (int j = 0; j < mat.rows; ++j) {
                 for (int k = 0; k < 2*mat.cols-2; k+=2)
-                    os << "(" << mat.p_data[j*2*mat.cols + k] << "," << mat.p_data[j*2*mat.cols + (k+1)] << ")" << ", ";
-                os << "(" << mat.p_data[j*2*mat.cols + 2*mat.cols-2] << "," << mat.p_data[j*2*mat.cols + 2*mat.cols-1] << ")" <<  std::endl;
+                    os << "(" << data_cpu[j*2*mat.cols + k] << "," << data_cpu[j*2*mat.cols + (k+1)] << ")" << ", ";
+                os << "(" << data_cpu[j*2*mat.cols + 2*mat.cols-2] << "," << data_cpu[j*2*mat.cols + 2*mat.cols-1] << ")" <<  std::endl;
             }
         }
+        free(data_cpu);
         return os;
     }
     
index bdb58c080e6def79f3e90b58303a0a45b954fee9..55ddbacd6dd809bd04e68039c8f532e4f161a927 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef CUDA_ERROR_CHECK_H
 #define CUDA_ERROR_CHECK_H
 
+#include <iostream>
+
 #define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
 #define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
 
@@ -103,7 +105,6 @@ static const char *_cudaGetErrorEnum(cufftResult error)
 
     return "<unknown>";
 }
-#endif
 
 #define CufftErrorCheck(call) __cufftErrorCheck(call, __FILE__, __LINE__ )
 
@@ -118,5 +119,6 @@ inline void __cufftErrorCheck(cufftResult_t call, const char *file, const int li
 
     return;
 }
+#endif
 
 #endif
diff --git a/src/cuda_functions.cu b/src/cuda_functions.cu
new file mode 100644 (file)
index 0000000..44d41be
--- /dev/null
@@ -0,0 +1,56 @@
+#include "cuda_functions.cuh"
+
+__global__ void  gaussian_correlation_kernel(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, int rows, int cols, int channels_per_scale, double sigma)
+{
+        extern __shared__ float sdata[];
+        int blockId   = blockIdx.y * gridDim.x + blockIdx.x;                           
+        int threadId = blockId *( blockDim.x+channels_per_scale/2) + threadIdx.x; 
+        
+        sdata[threadIdx.x] = 0;
+        sdata[threadIdx.x] = data_in[threadId] + data_in[threadId+blockDim.x];
+        __syncthreads();
+
+        for (unsigned int s= (channels_per_scale/2+1)/2, old_s = channels_per_scale/2;s>0; s>>=1) {
+                  
+                  if(old_s&1) s+=1;
+
+                    if (threadIdx.x < s && threadIdx.x+s < old_s) {
+                          sdata[threadIdx.x] += sdata[threadIdx.x + s];
+                    }
+                  old_s = s;
+                  __syncthreads();
+        }
+          
+        if(threadIdx.x == 0){
+          float accumulate_res = sdata[0]/(rows*cols);
+
+          float numel_xf_inv = 1.f/((cols/2+1) * rows * (channels_per_scale));
+
+          float tmp = (xf_sqr_norm[blockIdx.x] + yf_sqr_norm[0] - 2 * accumulate_res) * numel_xf_inv;
+
+          if (tmp > 0) {
+              data_out[blockIdx.x*rows*cols+blockIdx.y] = expf(- 1.f / (sigma * sigma) * tmp);
+          } else {
+              data_out[blockIdx.x*rows*cols+blockIdx.y] = expf(0);
+          }
+        }
+}
+
+void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales,int rows, int cols)
+{
+    dim3 threadsPerBlock((n_channels/n_scales)/2);
+    dim3 numBlocks(n_scales, rows*cols);
+
+    gaussian_correlation_kernel<<<numBlocks, threadsPerBlock, ((n_channels/n_scales)/2)*sizeof(float)>>>(data_in, data_out, xf_sqr_norm, yf_sqr_norm, rows, cols, n_channels/n_scales,  sigma);
+    CudaCheckError();
+    
+//    float *data_cpu = (float*) malloc(rows*cols*n_scales*sizeof(float));
+//    CudaSafeCall(cudaMemcpy(data_cpu, data_out, rows*cols*n_scales*sizeof(float), cudaMemcpyDeviceToHost));
+//    for (int j = 0; j < rows*n_scales; ++j) {
+//                for (int k = 0; k < cols-1; ++k)
+//                   std::cout  << data_cpu[j*cols  + k]  << ", ";
+//                std::cout << data_cpu[j*cols + cols-1] <<  std::endl;
+//            }
+//    free(data_cpu);
+    return;
+}
diff --git a/src/cuda_functions.cuh b/src/cuda_functions.cuh
new file mode 100644 (file)
index 0000000..452313f
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef CUDA_FUNCTIONS_H
+#define CUDA_FUNCTIONS_H
+
+#include "cuda_runtime.h"
+#include "cuda/cuda_error_check.cuh"
+
+void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales, int rows, int cols);
+
+#endif
index 92f731bd76c3f46271db6cd175a24acd189a357e..c8ce998ad7d333f3de2490a183144c093bfe9740 100644 (file)
--- a/src/fft.h
+++ b/src/fft.h
@@ -17,7 +17,7 @@ public:
     virtual void init(unsigned width, unsigned height,unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) = 0;
     virtual void set_window(const cv::Mat &window) = 0;
     virtual ComplexMat forward(const cv::Mat &input) = 0;
-    virtual ComplexMat forward_raw(float *input) = 0;
+    virtual ComplexMat forward_raw(float *input, bool all_scales) = 0;
     virtual ComplexMat forward_window(const std::vector<cv::Mat> &input) = 0;
     virtual cv::Mat inverse(const ComplexMat &input) = 0;
     virtual float* inverse_raw(const ComplexMat &input) = 0;
index 213b6e2e4179e83b6cf36dba17848ec07a75687b..fe3b6f494ec65d5fedfe206b21c7941b4db4aebf 100644 (file)
@@ -172,10 +172,19 @@ ComplexMat cuFFT::forward(const cv::Mat &input)
     return complex_result;
 }
 
-ComplexMat cuFFT::forward_raw(float *input)
+ComplexMat cuFFT::forward_raw(float *input, bool all_scales)
 {
-    ComplexMat dummy;
-    return dummy;
+    ComplexMat complex_result;
+    if (all_scales){
+        complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
+        CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(input),
+                                complex_result.get_p_data()));
+    } else {
+        complex_result.create(m_height, m_width/ 2 + 1, 1);
+        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(input),
+                                complex_result.get_p_data()));
+    }
+    return complex_result;
 }
 
 ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
@@ -244,11 +253,26 @@ cv::Mat cuFFT::inverse(const ComplexMat &input)
 
 float* cuFFT::inverse_raw(const ComplexMat &input)
 {
+    int n_channels = input.n_channels;
     cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());
-    
-    CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
 
-    return data_i_features_all_scales;
+    if(n_channels == 1){
+        CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
+
+        return data_i_1ch_d;
+    } else if(n_channels == (int) m_num_of_scales){
+        CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
+
+        return data_i_1ch_all_scales_d;
+    } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
+        CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
+
+        return data_i_features_all_scales_d;
+    }
+
+    CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
+    
+    return data_i_features_d;
 }
 
 cuFFT::~cuFFT()
index 680259df10041d626da0299b7611a1fcc3913b2d..a71bf3442848b7d056cc6875c54c9287f1ba72b9 100644 (file)
@@ -22,7 +22,7 @@ public:
     void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
     void set_window(const cv::Mat &window) override;
     ComplexMat forward(const cv::Mat &input) override;
-    ComplexMat forward_raw(float *input) override;
+    ComplexMat forward_raw(float *input, bool all_scales) override;
     ComplexMat forward_window(const std::vector<cv::Mat> &input) override;
     cv::Mat inverse(const ComplexMat &input) override;
     float* inverse_raw(const ComplexMat &input) override;
index 3aba1d7edd5e662c04b68c15bc753ed371796259..8ba0d1516680d3909d015b0fb5ca39b6c80d9584 100644 (file)
@@ -205,7 +205,7 @@ ComplexMat Fftw::forward(const cv::Mat &input)
     return complex_result;
 }
 
-ComplexMat Ffftw::forward_raw(float *input)
+ComplexMat Fftw::forward_raw(float *input, bool all_scales)
 {
     ComplexMat dummy;
     return dummy;
index b728a31dd104ae5a4ebf24af689c464caa2ff78a..8c23cda25d00103b8359af1eb5f7157cd52ca5be 100644 (file)
@@ -22,7 +22,7 @@ public:
     void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
     void set_window(const cv::Mat &window) override;
     ComplexMat forward(const cv::Mat &input) override;
-    ComplexMat forward_raw(float *input) override;
+    ComplexMat forward_raw(float *input, bool all_scales) override;
     ComplexMat forward_window(const std::vector<cv::Mat> &input) override;
     cv::Mat inverse(const ComplexMat &input) override;
     float* inverse_raw(const ComplexMat &input) override;
index 4772b1c3b0255484edbeea2bf8a1b4939e68d86d..e8046ebc94042d4ed41d571cd51e9c713c2c72f7 100644 (file)
@@ -22,7 +22,7 @@ ComplexMat FftOpencv::forward(const cv::Mat &input)
     return ComplexMat(complex_result);
 }
 
-ComplexMat FftOpencv::forward_raw(float *input)
+ComplexMat FftOpencv::forward_raw(float *input, bool all_scales)
 {
     ComplexMat dummy;
     return dummy;
index 1f49f3d1e08a9f6a024fc814148095571bbd5bf7..7050b2e5b0f2e47f9e785e816d1408cd633bd953 100644 (file)
@@ -10,7 +10,7 @@ public:
     void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
     void set_window(const cv::Mat &window) override;
     ComplexMat forward(const cv::Mat &input) override;
-    ComplexMat forward_raw(float *input) override;
+    ComplexMat forward_raw(float *input, bool all_scales) override;
     ComplexMat forward_window(const std::vector<cv::Mat> &input) override;
     cv::Mat inverse(const ComplexMat &input) override;
     float* inverse_raw(const ComplexMat &input) override;
index fafad07cf6b64b1733ca69e33a82d793392331d9..40b6c9ba741a896e7cc9346651e200e035f6c7fc 100644 (file)
@@ -36,6 +36,7 @@ KCF_Tracker::~KCF_Tracker()
 #ifdef CUFFT
     CudaSafeCall(cudaFreeHost(xf_sqr_norm));
     CudaSafeCall(cudaFreeHost(yf_sqr_norm));
+    CudaSafeCall(cudaFree(gauss_corr_res));
 #else
     free(xf_sqr_norm);
     free(yf_sqr_norm);
@@ -111,12 +112,12 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox)
     cudaSetDeviceFlags(cudaDeviceMapHost);
     CudaSafeCall(cudaHostAlloc((void**)&xf_sqr_norm, p_num_scales*sizeof(float), cudaHostAllocMapped));
     CudaSafeCall(cudaHostGetDevicePointer((void**)&xf_sqr_norm_d, (void*)xf_sqr_norm, 0));
-    std::cout << &xf_sqr_norm << std::endl;
+
     CudaSafeCall(cudaHostAlloc((void**)&yf_sqr_norm, sizeof(float), cudaHostAllocMapped));
     CudaSafeCall(cudaHostGetDevicePointer((void**)&yf_sqr_norm_d, (void*)yf_sqr_norm, 0));
 #else
     xf_sqr_norm = (float*) malloc(p_num_scales*sizeof(float));
-    xf_sqr_norm = (float*) malloc(sizeof(float));
+    yf_sqr_norm = (float*) malloc(sizeof(float));
 #endif
 
     p_current_scale = 1.;
@@ -136,13 +137,16 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox)
     p_num_of_feats = 31;
     if(m_use_color) p_num_of_feats += 3;
     if(m_use_cnfeat) p_num_of_feats += 10;
-    p_poi_width = p_windows_size[0]/p_cell_size;
-    p_poi_height = p_windows_size[1]/p_cell_size;
+    p_roi_width = p_windows_size[0]/p_cell_size;
+    p_roi_height = p_windows_size[1]/p_cell_size;
 
     fft.init(p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size, p_num_of_feats, p_num_scales, m_use_big_batch);
     p_yf = fft.forward(gaussian_shaped_labels(p_output_sigma, p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size));
     fft.set_window(cosine_window_function(p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size));
 
+#ifdef CUFFT
+      CudaSafeCall(cudaMalloc((void**)&gauss_corr_res, (p_windows_size[0]/p_cell_size)*(p_windows_size[1]/p_cell_size)*p_num_scales*sizeof(float)));
+#endif
     //obtain a sub-window for training initial model
     std::vector<cv::Mat> path_feat = get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size[0], p_windows_size[1]);
     p_model_xf = fft.forward_window(path_feat);
@@ -257,7 +261,7 @@ void KCF_Tracker::track(cv::Mat &img)
             }
             scale_responses.push_back(max_val*weight);
         }
-    } else if(m_use_big_batch){
+    } else if (m_use_big_batch){
 #pragma omp parallel for ordered
         for (size_t i = 0; i < p_scales.size(); ++i) {
             std::vector<cv::Mat> tmp = get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size[0], p_windows_size[1], p_current_scale * p_scales[i]);
@@ -618,27 +622,33 @@ ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const Complex
 {
 #ifdef CUFFT
     xf.sqr_norm(xf_sqr_norm_d);
+    if (auto_correlation){
+        cudaDeviceSynchronize();
+        yf_sqr_norm[0] = xf_sqr_norm[0];
+    } else {
+        yf.sqr_norm(yf_sqr_norm_d);
+    }
 #else
     xf.sqr_norm(xf_sqr_norm);
-#endif
-    if(auto_correlation){
+    if (auto_correlation){
       yf_sqr_norm[0] = xf_sqr_norm[0];
     } else {
-#ifdef CUFFT
-       yf.sqr_norm(yf_sqr_norm_d);
-#else
        yf.sqr_norm(yf_sqr_norm);
-#endif
     }
-
+#endif
     ComplexMat xyf;
     xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
     DEBUG_PRINTM(xyf);
+#ifdef CUFFT
+    cuda_gaussian_correlation(fft.inverse_raw(xyf), gauss_corr_res, xf_sqr_norm_d, yf_sqr_norm_d, sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width);
 
+    return fft.forward_raw(gauss_corr_res, xf.n_scales==p_num_scales);
+#else
     //ifft2 and sum over 3rd dimension, we dont care about individual channels
     cv::Mat ifft2_res = fft.inverse(xyf);
+    DEBUG_PRINTM(ifft2_res);
     cv::Mat xy_sum;
-    if(xf.channels() != p_num_scales*p_num_of_feats)
+    if (xf.channels() != p_num_scales*p_num_of_feats)
         xy_sum.create(ifft2_res.size(), CV_32FC1);
     else
         xy_sum.create(ifft2_res.size(), CV_32FC(p_scales.size()));
@@ -648,12 +658,10 @@ ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const Complex
         float * row_ptr_sum = xy_sum.ptr<float>(y);
         for (int x = 0; x < ifft2_res.cols; ++x) {
             for (int sum_ch = 0; sum_ch < xy_sum.channels(); ++sum_ch) {
-                row_ptr_sum[(x*xy_sum.channels())+sum_ch] += std::accumulate(row_ptr + x*ifft2_res.channels() + sum_ch*(ifft2_res.channels()/xy_sum.channels()),
-                                                                             (row_ptr + x*ifft2_res.channels() + (sum_ch+1)*(ifft2_res.channels()/xy_sum.channels())), 0.f);
+                row_ptr_sum[(x*xy_sum.channels())+sum_ch] += std::accumulate(row_ptr + x*ifft2_res.channels() + sum_ch*(ifft2_res.channels()/xy_sum.channels()), (row_ptr + x*ifft2_res.channels() + (sum_ch+1)*(ifft2_res.channels()/xy_sum.channels())), 0.f);
             }
         }
     }
-    DEBUG_PRINTM(ifft2_res);
     DEBUG_PRINTM(xy_sum);
 
     std::vector<cv::Mat> scales;
@@ -661,7 +669,7 @@ ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const Complex
     cv::Mat in_all(scales[0].rows * xf.n_scales, scales[0].cols, CV_32F);
 
     float numel_xf_inv = 1.f/(xf.cols * xf.rows * (xf.channels()/xf.n_scales));
-    for(int i = 0; i < xf.n_scales; ++i){
+    for (int i = 0; i < xf.n_scales; ++i){
         cv::Mat in_roi(in_all, cv::Rect(0, i*scales[0].rows, scales[0].cols, scales[0].rows));
         cv::exp(- 1.f / (sigma * sigma) * cv::max((xf_sqr_norm[i] + yf_sqr_norm[0] - 2 * scales[i]) * numel_xf_inv, 0), in_roi);
         DEBUG_PRINTM(in_roi);
@@ -669,6 +677,7 @@ ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const Complex
 
     DEBUG_PRINTM(in_all);
     return fft.forward(in_all);
+#endif
 }
 
 float get_response_circular(cv::Point2i & pt, cv::Mat & response)
index 659ae54209aa1a71f263190e512b8535ff1cd21c..2501521fccff0c35eba7cfa650b0fa56a8a1a402 100644 (file)
--- a/src/kcf.h
+++ b/src/kcf.h
@@ -7,6 +7,7 @@
 
 #ifdef CUFFT
   #include "complexmat.cuh"
+  #include "cuda_functions.cuh"
   #include "cuda/cuda_error_check.cuh"
   #include <cuda_runtime.h>
 #else
@@ -108,10 +109,10 @@ private:
 
     //for big batch
     int p_num_of_feats;
-    int p_poi_height, p_poi_width;
+    int p_roi_height, p_roi_width;
     float *xf_sqr_norm = nullptr, *yf_sqr_norm = nullptr;
 #ifdef CUFFT
-    float *xf_sqr_norm_d = nullptr, *yf_sqr_norm_d = nullptr;
+    float *xf_sqr_norm_d = nullptr, *yf_sqr_norm_d = nullptr, *gauss_corr_res = nullptr;
 #endif
 
     //model