From 09ad2802d987433f9f5d1a7f5ab67c3dbb4bcbd9 Mon Sep 17 00:00:00 2001
From: Shanigen <vkaraf@gmail.com>
Date: Fri, 10 Aug 2018 13:20:02 +0200
Subject: [PATCH] CUDA streams works

This commit adds support for CUDA streams and also corrects issue
with CUFFTW version of the tracker. All version now works without
any problem. CUDA streams version currently does not support C++
async directive and only OpenMP.
TODO: Implement correct use of OpenMP in big batch mode.
---
 Makefile           |  3 ++-
 src/fft_cufft.cpp  | 20 ++++++++++++++++++--
 src/fft_fftw.cpp   |  2 +-
 src/kcf.cpp        | 13 ++++---------
 src/scale_vars.hpp | 14 ++++++--------
 5 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/Makefile b/Makefile
index 7228039..561af83 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile to build all the available variants
 
-BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-big cufft-big-openmp
+BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-openmp cufft-big cufft-big-openmp cufft-opemp
 
 all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot)
 
@@ -19,6 +19,7 @@ CMAKE_OTPS_cufftw            = -DFFT=cuFFTW
 CMAKE_OTPS_cufftw-big        = -DFFT=cuFFTW -DBIG_BATCH=ON
 CMAKE_OTPS_cufftw-big-openmp = -DFFT=cuFFTW -DBIG_BATCH=ON -DOPENMP=ON
 CMAKE_OTPS_cufft             = -DFFT=cuFFT
+CMAKE_OTPS_cufft-openmp			 = -DFFT=cuFFT -DOPENMP=ON
 CMAKE_OTPS_cufft-big         = -DFFT=cuFFT -DBIG_BATCH=ON
 CMAKE_OTPS_cufft-big-openmp  = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
 
diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp
index da43fbf..955c914 100644
--- a/src/fft_cufft.cpp
+++ b/src/fft_cufft.cpp
@@ -139,9 +139,12 @@ void cuFFT::forward(const cv::Mat & real_input, ComplexMat & complex_result, flo
         CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(real_input_arr),
                                 complex_result.get_p_data()));
     } else {
+#pragma omp critical
+        {
         CufftErrorCheck(cufftSetStream(plan_f, stream));
-        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr),
-                                complex_result.get_p_data()));
+        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data()));
+        cudaStreamSynchronize(stream);
+        }
     }
     return;
 }
@@ -161,8 +164,12 @@ void cuFFT::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & comple
             cv::Mat in_roi(fw_all, cv::Rect(0, int(i*m_height), int(m_width), int(m_height)));
             in_roi = patch_feats[i].mul(m_window);
         }
+#pragma omp critical
+        {
         CufftErrorCheck(cufftSetStream(plan_fw, stream));
         CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data()));
+        cudaStreamSynchronize(stream);
+        }
     }
     return;
 }
@@ -173,9 +180,12 @@ void cuFFT::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *r
     cufftComplex *in = reinterpret_cast<cufftComplex*>(complex_input.get_p_data());
 
     if(n_channels == 1){
+#pragma omp critical
+        {
         CufftErrorCheck(cufftSetStream(plan_i_1ch, stream));
         CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(real_result_arr)));
         cudaStreamSynchronize(stream);
+        }
         real_result = real_result/(m_width*m_height);
         return;
     } else if(n_channels == int(m_num_of_scales)){
@@ -188,8 +198,14 @@ void cuFFT::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *r
         CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(real_result_arr)));
         return;
     }
+#pragma omp critical
+    {
     CufftErrorCheck(cufftSetStream(plan_i_features, stream));
     CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(real_result_arr)));
+#if defined(OPENMP) && !defined(BIG_BATCH)
+    cudaStreamSynchronize(stream);
+#endif
+    }
     return;
 }
 
diff --git a/src/fft_fftw.cpp b/src/fft_fftw.cpp
index d171e4a..4dfd5a1 100644
--- a/src/fft_fftw.cpp
+++ b/src/fft_fftw.cpp
@@ -30,7 +30,7 @@ void Fftw::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned
     m_num_of_scales = num_of_scales;
     m_big_batch_mode = big_batch_mode;
 
-#if (!defined(ASYNC) && !defined(CUFFTW))|| defined(OPENMP)
+#if (!defined(ASYNC) && !defined(CUFFTW)) && defined(OPENMP)
     fftw_init_threads();
 #endif //OPENMP
 
diff --git a/src/kcf.cpp b/src/kcf.cpp
index 11687d2..fceccae 100644
--- a/src/kcf.cpp
+++ b/src/kcf.cpp
@@ -21,6 +21,8 @@
 
 #define DEBUG_PRINT(obj) if (m_debug) {std::cout << #obj << " @" << __LINE__ << std::endl << (obj) << std::endl;}
 #define DEBUG_PRINTM(obj) if (m_debug) {std::cout << #obj << " @" << __LINE__ << " " << (obj).size() << " CH: " << (obj).channels() << std::endl << (obj) << std::endl;}
+#define DEBUG_PRINTD(obj) {std::cout << #obj << " @" << __LINE__ << " " << (obj).size() << " CH: " << (obj).channels() << std::endl << (obj) << std::endl;}
+
 
 KCF_Tracker::KCF_Tracker(double padding, double kernel_sigma, double lambda, double interp_factor, double output_sigma_factor, int cell_size) :
     fft(*new FFT()),
@@ -157,15 +159,6 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
         else {
             p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1));
         }
-#ifdef CUFFT
-        std::cout << p_scale_vars.back()->zf.stream << std::endl;
-        std::cout << p_scale_vars.back()->kzf.stream << std::endl;
-        std::cout << p_scale_vars.back()->kf.stream << std::endl << std::endl;
-
-        std::cout << p_scale_vars.back()->zf.n_scales << std::endl;
-        std::cout << p_scale_vars.back()->kzf.n_scales << std::endl;
-        std::cout << p_scale_vars.back()->kf.n_scales << std::endl << std::endl;
-#endif
     }
 
     p_current_scale = 1.;
@@ -199,6 +192,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
     p_scale_vars.front()->model_xf = p_model_xf;
     p_scale_vars.front()->model_xf.set_stream(p_scale_vars.front()->stream);
     p_yf.set_stream(p_scale_vars.front()->stream);
+    p_model_xf.set_stream(p_scale_vars.front()->stream);
 #endif
 
     if (m_use_linearkernel) {
@@ -393,6 +387,7 @@ void KCF_Tracker::track(cv::Mat &img)
         p_current_scale = p_min_max_scale[0];
     if (p_current_scale > p_min_max_scale[1])
         p_current_scale = p_min_max_scale[1];
+
     //obtain a subwindow for training at newly estimated target position
     p_scale_vars.front()->patch_feats.clear();
     get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size[0], p_windows_size[1], *p_scale_vars.front(), p_current_scale);
diff --git a/src/scale_vars.hpp b/src/scale_vars.hpp
index 0aad3fb..9f755e5 100644
--- a/src/scale_vars.hpp
+++ b/src/scale_vars.hpp
@@ -5,8 +5,12 @@
   #include "complexmat.cuh"
 #else
   #include "complexmat.hpp"
+#ifndef CUFFTW
 //For compatibility reasons between CuFFT and FFTW, OpenCVfft versions.
   typedef int* cudaStream_t;
+#else
+    #include "cuda_runtime.h"
+#endif
 #endif
 
 struct Scale_vars
@@ -16,7 +20,6 @@ public:
                            ComplexMat *yf = nullptr,bool zero_index = false)
     {
         uint alloc_size;
-        std::cout << __PRETTY_FUNCTION__ << std::endl;
 #ifdef CUFFT
         if (zero_index) {
             cudaSetDeviceFlags(cudaDeviceMapHost);
@@ -27,6 +30,8 @@ public:
         CudaSafeCall(cudaStreamCreate(&this->stream));
 #endif
 
+        this->patch_feats.reserve(uint(num_of_feats));
+
         alloc_size = uint(windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_scales)*sizeof(cufftReal);
         CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->data_i_1ch), alloc_size, cudaHostAllocMapped));
         CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->data_i_1ch_d), reinterpret_cast<void*>(this->data_i_1ch), 0));
@@ -39,14 +44,8 @@ public:
         this->response = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC(num_of_scales), this->data_i_1ch);
 
         this->zf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, num_of_scales, this->stream);
-        std::cout << this->zf.stream << std::endl;
-        std::cout << this->zf.n_scales << std::endl;
         this->kzf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
-        std::cout << this->kzf.stream << std::endl;
-        std::cout << this->kzf.n_scales << std::endl;
         this->kf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
-        std::cout << this->kf.stream << std::endl;
-        std::cout << this->kf.n_scales << std::endl << std::endl;
 
         alloc_size = uint(num_of_scales);
 
@@ -126,7 +125,6 @@ public:
 
     ~Scale_vars() {
 #ifdef CUFFT
-        std::cout << __PRETTY_FUNCTION__ << std::endl;
         CudaSafeCall(cudaFreeHost(this->xf_sqr_norm));
         CudaSafeCall(cudaFreeHost(this->yf_sqr_norm));
         CudaSafeCall(cudaFreeHost(this->data_i_1ch));
-- 
2.39.2