]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/commitdiff
CUDA streams works
authorShanigen <vkaraf@gmail.com>
Fri, 10 Aug 2018 11:20:02 +0000 (13:20 +0200)
committerMichal Sojka <michal.sojka@cvut.cz>
Wed, 5 Sep 2018 06:38:52 +0000 (08:38 +0200)
This commit adds support for CUDA streams and also corrects issue
with CUFFTW version of the tracker. All version now works without
any problem. CUDA streams version currently does not support C++
async directive and only OpenMP.
TODO: Implement correct use of OpenMP in big batch mode.

Makefile
src/fft_cufft.cpp
src/fft_fftw.cpp
src/kcf.cpp
src/scale_vars.hpp

index 72280398a896c6246e300c7d54e7e70ab5d02e3f..561af832f9a1057657cdde2e2d14d85f1eea2ff6 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile to build all the available variants
 
-BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-big cufft-big-openmp
+BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-openmp cufft-big cufft-big-openmp cufft-opemp
 
 all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot)
 
@@ -19,6 +19,7 @@ CMAKE_OTPS_cufftw            = -DFFT=cuFFTW
 CMAKE_OTPS_cufftw-big        = -DFFT=cuFFTW -DBIG_BATCH=ON
 CMAKE_OTPS_cufftw-big-openmp = -DFFT=cuFFTW -DBIG_BATCH=ON -DOPENMP=ON
 CMAKE_OTPS_cufft             = -DFFT=cuFFT
+CMAKE_OTPS_cufft-openmp                         = -DFFT=cuFFT -DOPENMP=ON
 CMAKE_OTPS_cufft-big         = -DFFT=cuFFT -DBIG_BATCH=ON
 CMAKE_OTPS_cufft-big-openmp  = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
 
index da43fbf4bd06f92a606380d4dd91097212994e82..955c9148489afaf8a8dc29f91aef5821baed9368 100644 (file)
@@ -139,9 +139,12 @@ void cuFFT::forward(const cv::Mat & real_input, ComplexMat & complex_result, flo
         CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(real_input_arr),
                                 complex_result.get_p_data()));
     } else {
+#pragma omp critical
+        {
         CufftErrorCheck(cufftSetStream(plan_f, stream));
-        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr),
-                                complex_result.get_p_data()));
+        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data()));
+        cudaStreamSynchronize(stream);
+        }
     }
     return;
 }
@@ -161,8 +164,12 @@ void cuFFT::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & comple
             cv::Mat in_roi(fw_all, cv::Rect(0, int(i*m_height), int(m_width), int(m_height)));
             in_roi = patch_feats[i].mul(m_window);
         }
+#pragma omp critical
+        {
         CufftErrorCheck(cufftSetStream(plan_fw, stream));
         CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data()));
+        cudaStreamSynchronize(stream);
+        }
     }
     return;
 }
@@ -173,9 +180,12 @@ void cuFFT::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *r
     cufftComplex *in = reinterpret_cast<cufftComplex*>(complex_input.get_p_data());
 
     if(n_channels == 1){
+#pragma omp critical
+        {
         CufftErrorCheck(cufftSetStream(plan_i_1ch, stream));
         CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(real_result_arr)));
         cudaStreamSynchronize(stream);
+        }
         real_result = real_result/(m_width*m_height);
         return;
     } else if(n_channels == int(m_num_of_scales)){
@@ -188,8 +198,14 @@ void cuFFT::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *r
         CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(real_result_arr)));
         return;
     }
+#pragma omp critical
+    {
     CufftErrorCheck(cufftSetStream(plan_i_features, stream));
     CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(real_result_arr)));
+#if defined(OPENMP) && !defined(BIG_BATCH)
+    cudaStreamSynchronize(stream);
+#endif
+    }
     return;
 }
 
index d171e4a2a6beedccd86925575e26392d708ce36d..4dfd5a140b4bf81498451e019cd3aa1f4d089aa3 100644 (file)
@@ -30,7 +30,7 @@ void Fftw::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned
     m_num_of_scales = num_of_scales;
     m_big_batch_mode = big_batch_mode;
 
-#if (!defined(ASYNC) && !defined(CUFFTW))|| defined(OPENMP)
+#if (!defined(ASYNC) && !defined(CUFFTW)) && defined(OPENMP)
     fftw_init_threads();
 #endif //OPENMP
 
index 11687d261d3ee17ce6a7f5073869d9ae12c38957..fceccaecd6271434f08ef3ca491ef6db97796ac2 100644 (file)
@@ -21,6 +21,8 @@
 
 #define DEBUG_PRINT(obj) if (m_debug) {std::cout << #obj << " @" << __LINE__ << std::endl << (obj) << std::endl;}
 #define DEBUG_PRINTM(obj) if (m_debug) {std::cout << #obj << " @" << __LINE__ << " " << (obj).size() << " CH: " << (obj).channels() << std::endl << (obj) << std::endl;}
+#define DEBUG_PRINTD(obj) {std::cout << #obj << " @" << __LINE__ << " " << (obj).size() << " CH: " << (obj).channels() << std::endl << (obj) << std::endl;}
+
 
 KCF_Tracker::KCF_Tracker(double padding, double kernel_sigma, double lambda, double interp_factor, double output_sigma_factor, int cell_size) :
     fft(*new FFT()),
@@ -157,15 +159,6 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
         else {
             p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1));
         }
-#ifdef CUFFT
-        std::cout << p_scale_vars.back()->zf.stream << std::endl;
-        std::cout << p_scale_vars.back()->kzf.stream << std::endl;
-        std::cout << p_scale_vars.back()->kf.stream << std::endl << std::endl;
-
-        std::cout << p_scale_vars.back()->zf.n_scales << std::endl;
-        std::cout << p_scale_vars.back()->kzf.n_scales << std::endl;
-        std::cout << p_scale_vars.back()->kf.n_scales << std::endl << std::endl;
-#endif
     }
 
     p_current_scale = 1.;
@@ -199,6 +192,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
     p_scale_vars.front()->model_xf = p_model_xf;
     p_scale_vars.front()->model_xf.set_stream(p_scale_vars.front()->stream);
     p_yf.set_stream(p_scale_vars.front()->stream);
+    p_model_xf.set_stream(p_scale_vars.front()->stream);
 #endif
 
     if (m_use_linearkernel) {
@@ -393,6 +387,7 @@ void KCF_Tracker::track(cv::Mat &img)
         p_current_scale = p_min_max_scale[0];
     if (p_current_scale > p_min_max_scale[1])
         p_current_scale = p_min_max_scale[1];
+
     //obtain a subwindow for training at newly estimated target position
     p_scale_vars.front()->patch_feats.clear();
     get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size[0], p_windows_size[1], *p_scale_vars.front(), p_current_scale);
index 0aad3fb8a0f19f06a0c15d25340b980cc9a3b56f..9f755e5861f015c7c61c163c32f03b6436159935 100644 (file)
@@ -5,8 +5,12 @@
   #include "complexmat.cuh"
 #else
   #include "complexmat.hpp"
+#ifndef CUFFTW
 //For compatibility reasons between CuFFT and FFTW, OpenCVfft versions.
   typedef int* cudaStream_t;
+#else
+    #include "cuda_runtime.h"
+#endif
 #endif
 
 struct Scale_vars
@@ -16,7 +20,6 @@ public:
                            ComplexMat *yf = nullptr,bool zero_index = false)
     {
         uint alloc_size;
-        std::cout << __PRETTY_FUNCTION__ << std::endl;
 #ifdef CUFFT
         if (zero_index) {
             cudaSetDeviceFlags(cudaDeviceMapHost);
@@ -27,6 +30,8 @@ public:
         CudaSafeCall(cudaStreamCreate(&this->stream));
 #endif
 
+        this->patch_feats.reserve(uint(num_of_feats));
+
         alloc_size = uint(windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_scales)*sizeof(cufftReal);
         CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->data_i_1ch), alloc_size, cudaHostAllocMapped));
         CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->data_i_1ch_d), reinterpret_cast<void*>(this->data_i_1ch), 0));
@@ -39,14 +44,8 @@ public:
         this->response = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC(num_of_scales), this->data_i_1ch);
 
         this->zf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, num_of_scales, this->stream);
-        std::cout << this->zf.stream << std::endl;
-        std::cout << this->zf.n_scales << std::endl;
         this->kzf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
-        std::cout << this->kzf.stream << std::endl;
-        std::cout << this->kzf.n_scales << std::endl;
         this->kf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
-        std::cout << this->kf.stream << std::endl;
-        std::cout << this->kf.n_scales << std::endl << std::endl;
 
         alloc_size = uint(num_of_scales);
 
@@ -126,7 +125,6 @@ public:
 
     ~Scale_vars() {
 #ifdef CUFFT
-        std::cout << __PRETTY_FUNCTION__ << std::endl;
         CudaSafeCall(cudaFreeHost(this->xf_sqr_norm));
         CudaSafeCall(cudaFreeHost(this->yf_sqr_norm));
         CudaSafeCall(cudaFreeHost(this->data_i_1ch));