CUDA streams works

author Shanigen <vkaraf@gmail.com>

Fri, 10 Aug 2018 11:20:02 +0000 (13:20 +0200)

committer Michal Sojka <michal.sojka@cvut.cz>

Wed, 5 Sep 2018 06:38:52 +0000 (08:38 +0200)
author Shanigen <vkaraf@gmail.com>
Fri, 10 Aug 2018 11:20:02 +0000 (13:20 +0200)
committer Michal Sojka <michal.sojka@cvut.cz>
Wed, 5 Sep 2018 06:38:52 +0000 (08:38 +0200)
diff --git a/Makefile b/Makefile

index 72280398a896c6246e300c7d54e7e70ab5d02e3f..561af832f9a1057657cdde2e2d14d85f1eea2ff6 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
  # Makefile to build all the available variants
  
-BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-big cufft-big-openmp
+BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-openmp cufft-big cufft-big-openmp cufft-opemp
  
  all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot)
  
@@ -19,6 +19,7 @@ CMAKE_OTPS_cufftw            = -DFFT=cuFFTW
  CMAKE_OTPS_cufftw-big        = -DFFT=cuFFTW -DBIG_BATCH=ON
  CMAKE_OTPS_cufftw-big-openmp = -DFFT=cuFFTW -DBIG_BATCH=ON -DOPENMP=ON
  CMAKE_OTPS_cufft             = -DFFT=cuFFT
+CMAKE_OTPS_cufft-openmp                         = -DFFT=cuFFT -DOPENMP=ON
  CMAKE_OTPS_cufft-big         = -DFFT=cuFFT -DBIG_BATCH=ON
  CMAKE_OTPS_cufft-big-openmp  = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
  
diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp

index da43fbf4bd06f92a606380d4dd91097212994e82..955c9148489afaf8a8dc29f91aef5821baed9368 100644 (file)
--- a/src/fft_cufft.cpp
+++ b/src/fft_cufft.cpp
@@ -139,9 +139,12 @@ void cuFFT::forward(const cv::Mat & real_input, ComplexMat & complex_result, flo
          CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(real_input_arr),
                                  complex_result.get_p_data()));
      } else {
+#pragma omp critical
+        {
          CufftErrorCheck(cufftSetStream(plan_f, stream));
-        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr),
-                                complex_result.get_p_data()));
+        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data()));
+        cudaStreamSynchronize(stream);
+        }
      }
      return;
  }
@@ -161,8 +164,12 @@ void cuFFT::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & comple
              cv::Mat in_roi(fw_all, cv::Rect(0, int(i*m_height), int(m_width), int(m_height)));
              in_roi = patch_feats[i].mul(m_window);
          }
+#pragma omp critical
+        {
          CufftErrorCheck(cufftSetStream(plan_fw, stream));
          CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data()));
+        cudaStreamSynchronize(stream);
+        }
      }
      return;
  }
@@ -173,9 +180,12 @@ void cuFFT::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *r
      cufftComplex *in = reinterpret_cast<cufftComplex*>(complex_input.get_p_data());
  
      if(n_channels == 1){
+#pragma omp critical
+        {
          CufftErrorCheck(cufftSetStream(plan_i_1ch, stream));
          CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(real_result_arr)));
          cudaStreamSynchronize(stream);
+        }
          real_result = real_result/(m_width*m_height);
          return;
      } else if(n_channels == int(m_num_of_scales)){
@@ -188,8 +198,14 @@ void cuFFT::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *r
          CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(real_result_arr)));
          return;
      }
+#pragma omp critical
+    {
      CufftErrorCheck(cufftSetStream(plan_i_features, stream));
      CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(real_result_arr)));
+#if defined(OPENMP) && !defined(BIG_BATCH)
+    cudaStreamSynchronize(stream);
+#endif
+    }
      return;
  }
  
diff --git a/src/fft_fftw.cpp b/src/fft_fftw.cpp

index d171e4a2a6beedccd86925575e26392d708ce36d..4dfd5a140b4bf81498451e019cd3aa1f4d089aa3 100644 (file)
--- a/src/fft_fftw.cpp
+++ b/src/fft_fftw.cpp
@@ -30,7 +30,7 @@ void Fftw::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned
      m_num_of_scales = num_of_scales;
      m_big_batch_mode = big_batch_mode;
  
-#if (!defined(ASYNC) && !defined(CUFFTW))|| defined(OPENMP)
+#if (!defined(ASYNC) && !defined(CUFFTW)) && defined(OPENMP)
      fftw_init_threads();
  #endif //OPENMP
  
diff --git a/src/kcf.cpp b/src/kcf.cpp

index 11687d261d3ee17ce6a7f5073869d9ae12c38957..fceccaecd6271434f08ef3ca491ef6db97796ac2 100644 (file)
--- a/src/kcf.cpp
+++ b/src/kcf.cpp
@@ -21,6 +21,8 @@
  
  #define DEBUG_PRINT(obj) if (m_debug) {std::cout << #obj << " @" << __LINE__ << std::endl << (obj) << std::endl;}
  #define DEBUG_PRINTM(obj) if (m_debug) {std::cout << #obj << " @" << __LINE__ << " " << (obj).size() << " CH: " << (obj).channels() << std::endl << (obj) << std::endl;}
+#define DEBUG_PRINTD(obj) {std::cout << #obj << " @" << __LINE__ << " " << (obj).size() << " CH: " << (obj).channels() << std::endl << (obj) << std::endl;}
+
  
  KCF_Tracker::KCF_Tracker(double padding, double kernel_sigma, double lambda, double interp_factor, double output_sigma_factor, int cell_size) :
      fft(*new FFT()),
@@ -157,15 +159,6 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
          else {
              p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1));
          }
-#ifdef CUFFT
-        std::cout << p_scale_vars.back()->zf.stream << std::endl;
-        std::cout << p_scale_vars.back()->kzf.stream << std::endl;
-        std::cout << p_scale_vars.back()->kf.stream << std::endl << std::endl;
-
-        std::cout << p_scale_vars.back()->zf.n_scales << std::endl;
-        std::cout << p_scale_vars.back()->kzf.n_scales << std::endl;
-        std::cout << p_scale_vars.back()->kf.n_scales << std::endl << std::endl;
-#endif
      }
  
      p_current_scale = 1.;
@@ -199,6 +192,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
      p_scale_vars.front()->model_xf = p_model_xf;
      p_scale_vars.front()->model_xf.set_stream(p_scale_vars.front()->stream);
      p_yf.set_stream(p_scale_vars.front()->stream);
+    p_model_xf.set_stream(p_scale_vars.front()->stream);
  #endif
  
      if (m_use_linearkernel) {
@@ -393,6 +387,7 @@ void KCF_Tracker::track(cv::Mat &img)
          p_current_scale = p_min_max_scale[0];
      if (p_current_scale > p_min_max_scale[1])
          p_current_scale = p_min_max_scale[1];
+
      //obtain a subwindow for training at newly estimated target position
      p_scale_vars.front()->patch_feats.clear();
      get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size[0], p_windows_size[1], *p_scale_vars.front(), p_current_scale);
diff --git a/src/scale_vars.hpp b/src/scale_vars.hpp

index 0aad3fb8a0f19f06a0c15d25340b980cc9a3b56f..9f755e5861f015c7c61c163c32f03b6436159935 100644 (file)
--- a/src/scale_vars.hpp
+++ b/src/scale_vars.hpp
@@ -5,8 +5,12 @@
    #include "complexmat.cuh"
  #else
    #include "complexmat.hpp"
+#ifndef CUFFTW
  //For compatibility reasons between CuFFT and FFTW, OpenCVfft versions.
    typedef int* cudaStream_t;
+#else
+    #include "cuda_runtime.h"
+#endif
  #endif
  
  struct Scale_vars
@@ -16,7 +20,6 @@ public:
                             ComplexMat *yf = nullptr,bool zero_index = false)
      {
          uint alloc_size;
-        std::cout << __PRETTY_FUNCTION__ << std::endl;
  #ifdef CUFFT
          if (zero_index) {
              cudaSetDeviceFlags(cudaDeviceMapHost);
@@ -27,6 +30,8 @@ public:
          CudaSafeCall(cudaStreamCreate(&this->stream));
  #endif
  
+        this->patch_feats.reserve(uint(num_of_feats));
+
          alloc_size = uint(windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_scales)*sizeof(cufftReal);
          CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->data_i_1ch), alloc_size, cudaHostAllocMapped));
          CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->data_i_1ch_d), reinterpret_cast<void*>(this->data_i_1ch), 0));
@@ -39,14 +44,8 @@ public:
          this->response = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC(num_of_scales), this->data_i_1ch);
  
          this->zf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, num_of_scales, this->stream);
-        std::cout << this->zf.stream << std::endl;
-        std::cout << this->zf.n_scales << std::endl;
          this->kzf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
-        std::cout << this->kzf.stream << std::endl;
-        std::cout << this->kzf.n_scales << std::endl;
          this->kf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
-        std::cout << this->kf.stream << std::endl;
-        std::cout << this->kf.n_scales << std::endl << std::endl;
  
          alloc_size = uint(num_of_scales);
  
@@ -126,7 +125,6 @@ public:
  
      ~Scale_vars() {
  #ifdef CUFFT
-        std::cout << __PRETTY_FUNCTION__ << std::endl;
          CudaSafeCall(cudaFreeHost(this->xf_sqr_norm));
          CudaSafeCall(cudaFreeHost(this->yf_sqr_norm));
          CudaSafeCall(cudaFreeHost(this->data_i_1ch));
author	Shanigen <vkaraf@gmail.com>
	Fri, 10 Aug 2018 11:20:02 +0000 (13:20 +0200)
committer	Michal Sojka <michal.sojka@cvut.cz>
	Wed, 5 Sep 2018 06:38:52 +0000 (08:38 +0200)
Makefile		patch \| blob \| history
src/fft_cufft.cpp		patch \| blob \| history
src/fft_fftw.cpp		patch \| blob \| history
src/kcf.cpp		patch \| blob \| history
src/scale_vars.hpp		patch \| blob \| history