Start cleaning up cufft implementation

author Michal Sojka <michal.sojka@cvut.cz>

Tue, 18 Sep 2018 12:54:44 +0000 (14:54 +0200)

committer Michal Sojka <michal.sojka@cvut.cz>

Thu, 20 Sep 2018 13:51:03 +0000 (15:51 +0200)
author Michal Sojka <michal.sojka@cvut.cz>
Tue, 18 Sep 2018 12:54:44 +0000 (14:54 +0200)
committer Michal Sojka <michal.sojka@cvut.cz>
Thu, 20 Sep 2018 13:51:03 +0000 (15:51 +0200)
diff --git a/src/cuda/cuda_error_check.cuh b/src/cuda/cuda_error_check.cuh

index 13c2f7dada631e54e79bc0c8fff9ea848b5810d7..3c1381198f37b0f45f886492012abc27b5d2ac90 100644 (file)
--- a/src/cuda/cuda_error_check.cuh
+++ b/src/cuda/cuda_error_check.cuh
@@ -115,6 +115,19 @@ static inline void __cufftErrorCheck(cufftResult_t call, const char *file, const
  
      return;
  }
+
+#define CublasErrorCheck(call) __cublasErrorCheck(call, __FILE__, __LINE__ )
+
+static inline void __cublasErrorCheck(cublasStatus_t call, const char *file, const int line )
+{
+    if (call != CUBLAS_STATUS_SUCCESS) {
+        fprintf(stderr, "cuBLAS error %d at %s:%d\n", call, /* _cudaGetErrorEnum(call),*/ file, line);
+        exit(-1);
+    }
+
+    return;
+}
+
  #endif
  
  #endif
diff --git a/src/dynmem.hpp b/src/dynmem.hpp

index 4c5199e1273b88cc3740f9c7ba5275a0b9d74a75..ff1b03efe49a76b946edf261fcf9df484a7a457a 100644 (file)
--- a/src/dynmem.hpp
+++ b/src/dynmem.hpp
@@ -20,31 +20,29 @@ template <typename T> class DynMem_ {
  #endif
    public:
      typedef T type;
-    DynMem_() {}
      DynMem_(size_t size)
      {
  #ifdef CUFFT
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->ptr), size, cudaHostAllocMapped));
-        CudaSafeCall(
-            cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->ptr_d), reinterpret_cast<void *>(this->ptr), 0));
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&ptr_h), size, cudaHostAllocMapped));
+        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&ptr_d), reinterpret_cast<void *>(ptr_h), 0));
  #else
-        this->ptr_h = new float[size];
+        ptr_h = new float[size];
  #endif
      }
      DynMem_(DynMem_&& other) {
-        this->ptr_h = other.ptr_h;
+        ptr_h = other.ptr_h;
          other.ptr_h = nullptr;
  #ifdef CUFFT
-        this->ptr_d = other.ptr_d;
+        ptr_d = other.ptr_d;
          other.ptr_d = nullptr;
  #endif
      }
      ~DynMem_()
      {
  #ifdef CUFFT
-        CudaSafeCall(cudaFreeHost(this->ptr));
+        CudaSafeCall(cudaFreeHost(ptr_h));
  #else
-        delete[] this->ptr_h;
+        delete[] ptr_h;
  #endif
      }
      T *hostMem() { return ptr_h; }
@@ -53,10 +51,10 @@ template <typename T> class DynMem_ {
  #endif
      void operator=(DynMem_ &&rhs)
      {
-        this->ptr_h = rhs.ptr_h;
+        ptr_h = rhs.ptr_h;
          rhs.ptr_h = nullptr;
  #ifdef CUFFT
-        this->ptr_d = rhs.ptr_d;
+        ptr_d = rhs.ptr_d;
          rhs.ptr_d = nullptr;
  #endif
      }
@@ -65,14 +63,18 @@ template <typename T> class DynMem_ {
  typedef DynMem_<float> DynMem;
  
  
-class MatDynMem : protected DynMem, public cv::Mat {
+class MatDynMem : public DynMem, public cv::Mat {
    public:
      MatDynMem(cv::Size size, int type)
          : DynMem(size.area() * sizeof(DynMem::type) * CV_MAT_CN(type)), cv::Mat(size, type, hostMem())
      {
          assert((type & CV_MAT_DEPTH_MASK) == CV_32F);
      }
-    MatDynMem(int height, int width, int type) { MatDynMem(cv::Size(width, height), type); }
+    MatDynMem(int height, int width, int type)
+        : DynMem(width * height * sizeof(DynMem::type) * CV_MAT_CN(type)), cv::Mat(height, width, type, hostMem())
+    {
+        assert((type & CV_MAT_DEPTH_MASK) == CV_32F);
+    }
      MatDynMem(int ndims, const int *sizes, int type)
          : DynMem(volume(ndims, sizes) * sizeof(DynMem::type) * CV_MAT_CN(type)), cv::Mat(ndims, sizes, type, hostMem())
      {
diff --git a/src/fft.h b/src/fft.h

index b165ef466fec20cbe1fabd55fbb60e8a73ab8d8e..93dcea6145d3f848219afd855f8b6c5c0095bf02 100644 (file)
--- a/src/fft.h
+++ b/src/fft.h
@@ -23,7 +23,7 @@ class Fft
  public:
      virtual void init(unsigned width, unsigned height,unsigned num_of_feats, unsigned num_of_scales) = 0;
      virtual void set_window(const MatDynMem &window) = 0;
-    virtual void forward(const cv::Mat & real_input, ComplexMat & complex_result) = 0;
+    virtual void forward(MatDynMem & real_input, ComplexMat & complex_result) = 0;
      virtual void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) = 0;
      virtual void inverse(ComplexMat &  complex_input, MatDynMem & real_result) = 0;
      virtual ~Fft() = 0;
diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp

index cd53414bbf5921761bf725db9ae1b66a2e016224..63f2558621c58857f3bfd1532686f7ca0d0c98b3 100644 (file)
--- a/src/fft_cufft.cpp
+++ b/src/fft_cufft.cpp
@@ -1,4 +1,10 @@
  #include "fft_cufft.h"
+#include <cublas_v2.h>
+
+cuFFT::cuFFT()
+{
+    CublasErrorCheck(cublasCreate(&cublas));
+}
  
  void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
  {
@@ -119,84 +125,68 @@ void cuFFT::set_window(const MatDynMem &window)
      m_window = window;
  }
  
-void cuFFT::forward(const cv::Mat &real_input, ComplexMat &complex_result, float *real_input_arr)
+void cuFFT::forward(MatDynMem & real_input, ComplexMat & complex_result)
  {
      if (BIG_BATCH_MODE && real_input.rows == int(m_height * m_num_of_scales)) {
-        CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal *>(real_input_arr),
+        CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal *>(real_input.deviceMem()),
                                       complex_result.get_p_data()));
      } else {
          NORMAL_OMP_CRITICAL
          {
              CufftErrorCheck(
-                cufftExecR2C(plan_f, reinterpret_cast<cufftReal *>(real_input_arr), complex_result.get_p_data()));
+                cufftExecR2C(plan_f, reinterpret_cast<cufftReal *>(real_input.deviceMem()), complex_result.get_p_data()));
              cudaStreamSynchronize(cudaStreamPerThread);
          }
      }
      return;
  }
  
-void cuFFT::forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp)
+void cuFFT::forward_window(MatDynMem &feat, ComplexMat & complex_result, MatDynMem &temp)
  {
-    int n_channels = int(patch_feats.size());
+    uint n_channels = feat.size[0];
+    cufftReal *temp_data = temp.deviceMem();
  
-    if (n_channels > int(m_num_of_feats)) {
-        for (uint i = 0; i < uint(n_channels); ++i) {
-            cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height)));
-            in_roi = patch_feats[i].mul(m_window);
-        }
-        CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal *>(real_input_arr),
-                                     complex_result.get_p_data()));
-    } else {
-        for (uint i = 0; i < uint(n_channels); ++i) {
-            cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height)));
-            in_roi = patch_feats[i].mul(m_window);
-        }
-        NORMAL_OMP_CRITICAL
-        {
-            CufftErrorCheck(
-                cufftExecR2C(plan_fw, reinterpret_cast<cufftReal *>(real_input_arr), complex_result.get_p_data()));
-            cudaStreamSynchronize(cudaStreamPerThread);
-        }
+    assert(feat.dims == 3);
+    assert(n_channels == m_num_of_feats || n_channels == m_num_of_feats * m_num_of_scales);
+
+    for (uint i = 0; i < n_channels; ++i) {
+        cv::Mat feat_plane(feat.dims - 1, feat.size + 1, feat.cv::Mat::type(), feat.ptr<void>(i));
+        cv::Mat temp_plane(temp.dims - 1, temp.size + 1, temp.cv::Mat::type(), temp.ptr(i));
+        temp_plane = feat_plane.mul(m_window);
      }
-    return;
+    CufftErrorCheck(cufftExecR2C((n_channels == m_num_of_feats) ? plan_fw : plan_fw_all_scales,
+                                 temp_data, complex_result.get_p_data()));
  }
  
-void cuFFT::inverse(ComplexMat &  complex_input, MatDynMem & real_result)
+void cuFFT::inverse(ComplexMat &complex_input, MatDynMem &real_result)
  {
-    int n_channels = complex_input.n_channels;
+    uint n_channels = complex_input.n_channels;
      cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_p_data());
+    cufftReal *out = real_result.deviceMem();
+    float alpha = 1.0 / (m_width * m_height);
+    cufftHandle plan;
  
      if (n_channels == 1) {
-        NORMAL_OMP_CRITICAL
-        {
-            CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal *>(real_result_arr)));
-            cudaStreamSynchronize(cudaStreamPerThread);
-        }
-        real_result = real_result / (m_width * m_height);
+        CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, out));
+        CublasErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));
          return;
-    } else if (n_channels == int(m_num_of_scales)) {
-        CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal *>(real_result_arr)));
-        cudaStreamSynchronize(cudaStreamPerThread);
-
-        real_result = real_result / (m_width * m_height);
+    } else if (n_channels == m_num_of_scales) {
+        CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, out));
+        CublasErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));
          return;
-    } else if (n_channels == int(m_num_of_feats) * int(m_num_of_scales)) {
-        CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal *>(real_result_arr)));
+    } else if (n_channels == m_num_of_feats * m_num_of_scales) {
+        CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, out));
          cudaStreamSynchronize(cudaStreamPerThread);
          return;
      }
-    NORMAL_OMP_CRITICAL
-    {
-        CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal *>(real_result_arr)));
-#if defined(OPENMP) && !defined(BIG_BATCH)
-        CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
-#endif
-    }
+    CufftErrorCheck(cufftExecC2R(plan_i_features, in, out));
      return;
  }
  
  cuFFT::~cuFFT()
  {
+    CublasErrorCheck(cublasDestroy(cublas));
+
      CufftErrorCheck(cufftDestroy(plan_f));
      CufftErrorCheck(cufftDestroy(plan_fw));
      CufftErrorCheck(cufftDestroy(plan_i_1ch));
diff --git a/src/fft_cufft.h b/src/fft_cufft.h

index 9b2467f254c80f151572ba39af329c20368e697f..4dd0f1042e1d025ddb17f9118383a0bc10da6b5d 100644 (file)
--- a/src/fft_cufft.h
+++ b/src/fft_cufft.h
@@ -4,6 +4,7 @@
  
  #include <cufft.h>
  #include <cuda_runtime.h>
+#include <cublas_v2.h>
  
  #include "fft.h"
  #include "cuda/cuda_error_check.cuh"
@@ -14,9 +15,10 @@ struct ThreadCtx;
  class cuFFT : public Fft
  {
  public:
+    cuFFT();
      void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override;
      void set_window(const MatDynMem &window) override;
-    void forward(const cv::Mat & real_input, ComplexMat & complex_result) override;
+    void forward(MatDynMem & real_input, ComplexMat & complex_result) override;
      void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) override;
      void inverse(ComplexMat &  complex_input, MatDynMem & real_result) override;
      ~cuFFT() override;
@@ -25,6 +27,7 @@ private:
      unsigned m_width, m_height, m_num_of_feats, m_num_of_scales;
      cufftHandle plan_f, plan_f_all_scales, plan_fw, plan_fw_all_scales, plan_i_features,
       plan_i_features_all_scales, plan_i_1ch, plan_i_1ch_all_scales;
+    cublasHandle_t cublas;
  };
  
  #endif // FFT_CUDA_H
diff --git a/src/fft_fftw.cpp b/src/fft_fftw.cpp

index 973c894b8ce66f4b34ac8fab4b3da09d67c267a5..3754036e43b065bf0b487f213ab235c4a720e079 100644 (file)
--- a/src/fft_fftw.cpp
+++ b/src/fft_fftw.cpp
@@ -172,7 +172,7 @@ void Fftw::set_window(const MatDynMem &window)
      m_window = window;
  }
  
-void Fftw::forward(const cv::Mat & real_input, ComplexMat & complex_result)
+void Fftw::forward(MatDynMem & real_input, ComplexMat & complex_result)
  {
      if (BIG_BATCH_MODE && real_input.rows == int(m_height * m_num_of_scales)) {
          fftwf_execute_dft_r2c(plan_f_all_scales, reinterpret_cast<float *>(real_input.data),
diff --git a/src/fft_fftw.h b/src/fft_fftw.h

index 97641f883939876996bae0220175a9a9b89af396..cb4a901d41d2f4f8494f2ae63fb9f66daecd2edc 100644 (file)
--- a/src/fft_fftw.h
+++ b/src/fft_fftw.h
@@ -20,7 +20,7 @@ public:
      Fftw();
      void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override;
      void set_window(const MatDynMem &window) override;
-    void forward(const cv::Mat & real_input, ComplexMat & complex_result) override;
+    void forward(MatDynMem & real_input, ComplexMat & complex_result) override;
      void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) override;
      void inverse(ComplexMat &  complex_input, MatDynMem & real_result) override;
      ~Fftw() override;
diff --git a/src/fft_opencv.h b/src/fft_opencv.h

index 1239acdd3e2c62e687d39d89fddd3683c11d2cf5..5e016db3b39ecf66404ab611288a96b7f75a8fbb 100644 (file)
--- a/src/fft_opencv.h
+++ b/src/fft_opencv.h
@@ -9,7 +9,7 @@ class FftOpencv : public Fft
  public:
      void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override;
      void set_window(const MatDynMem &window) override;
-    void forward(const cv::Mat & real_input, ComplexMat & complex_result) override;
+    void forward(MatDynMem & real_input, ComplexMat & complex_result) override;
      void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) override;
      void inverse(ComplexMat &  complex_input, MatDynMem & real_result) override;
      ~FftOpencv() override;
diff --git a/src/kcf.cpp b/src/kcf.cpp

index 4585aa0c22fb6df5e2fd77393725c6045196dca7..5e56bdde564f66104196295fd97c239138a00bdc 100644 (file)
--- a/src/kcf.cpp
+++ b/src/kcf.cpp
@@ -209,8 +209,10 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f
      DEBUG_PRINTM(p_yf);
  
      // obtain a sub-window for training initial model
-    std::vector<cv::Mat> patch_feats = get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy,
-                                                    p_windows_size.width, p_windows_size.height);
+    int sizes[3] = {p_num_of_feats, p_windows_size.height, p_windows_size.width};
+    MatDynMem patch_feats(3, sizes, CV_32FC1);
+    MatDynMem temp(3, tmp, CV_32FC1);
+    get_features(features, input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size.width, p_windows_size.height);
      fft.forward_window(patch_feats, p_model_xf);
      DEBUG_PRINTM(p_model_xf);
  
@@ -638,7 +640,7 @@ cv::Mat KCF_Tracker::circshift(const cv::Mat &patch, int x_rot, int y_rot)
  }
  
  // hann window actually (Power-of-cosine windows)
-cv::Mat KCF_Tracker::cosine_window_function(int dim1, int dim2)
+MatDynMem KCF_Tracker::cosine_window_function(int dim1, int dim2)
  {
      cv::Mat m1(1, dim1, CV_32FC1), m2(dim2, 1, CV_32FC1);
      double N_inv = 1. / (static_cast<double>(dim1) - 1.);
@@ -647,7 +649,7 @@ cv::Mat KCF_Tracker::cosine_window_function(int dim1, int dim2)
      N_inv = 1. / (static_cast<double>(dim2) - 1.);
      for (int i = 0; i < dim2; ++i)
          m2.at<float>(i) = float(0.5 * (1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv)));
-    cv::Mat ret = m2 * m1;
+    MatDynMem ret = m2 * m1;
      return ret;
  }
  
diff --git a/src/kcf.h b/src/kcf.h

index f0d79c181297504f86eb15d71faf68ce586bca81..66cb0fad10655ed22854e0325dd0cd801f1c80c2 100644 (file)
--- a/src/kcf.h
+++ b/src/kcf.h
@@ -162,7 +162,7 @@ private:
      cv::Mat gaussian_shaped_labels(double sigma, int dim1, int dim2);
      std::unique_ptr<GaussianCorrelation> gaussian_correlation;
      cv::Mat circshift(const cv::Mat & patch, int x_rot, int y_rot);
-    cv::Mat cosine_window_function(int dim1, int dim2);
+    MatDynMem cosine_window_function(int dim1, int dim2);
      void get_features(MatDynMem &feat_3d, cv::Mat & input_rgb, cv::Mat & input_gray, int cx, int cy, int size_x, int size_y, double scale = 1.);
      cv::Point2f sub_pixel_peak(cv::Point & max_loc, cv::Mat & response);
      double sub_grid_scale(uint index);
author	Michal Sojka <michal.sojka@cvut.cz>
	Tue, 18 Sep 2018 12:54:44 +0000 (14:54 +0200)
committer	Michal Sojka <michal.sojka@cvut.cz>
	Thu, 20 Sep 2018 13:51:03 +0000 (15:51 +0200)
src/cuda/cuda_error_check.cuh		patch \| blob \| history
src/dynmem.hpp		patch \| blob \| history
src/fft.h		patch \| blob \| history
src/fft_cufft.cpp		patch \| blob \| history
src/fft_cufft.h		patch \| blob \| history
src/fft_fftw.cpp		patch \| blob \| history
src/fft_fftw.h		patch \| blob \| history
src/fft_opencv.h		patch \| blob \| history
src/kcf.cpp		patch \| blob \| history
src/kcf.h		patch \| blob \| history