Added DynMem class

author Shanigen <vkaraf@gmail.com>

Thu, 6 Sep 2018 10:42:16 +0000 (12:42 +0200)

committer Michal Sojka <michal.sojka@cvut.cz>

Mon, 10 Sep 2018 14:52:24 +0000 (16:52 +0200)
author Shanigen <vkaraf@gmail.com>
Thu, 6 Sep 2018 10:42:16 +0000 (12:42 +0200)
committer Michal Sojka <michal.sojka@cvut.cz>
Mon, 10 Sep 2018 14:52:24 +0000 (16:52 +0200)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

index 1cff1e540849bc83ba832a1cb8cd5f78acf3d16e..3ffa79b89bc8332561e7367d807364ee9e47ad0d 100644 (file)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
  cmake_minimum_required(VERSION 2.8)
  
-set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h)
+set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h dynmem.hpp)
  
  find_package(PkgConfig)
  
diff --git a/src/complexmat.hpp b/src/complexmat.hpp

index 2a595744a135721326a1daebdb8a57ba0e0e4424..7ba3f601960b1670abf57989358e25840013231f 100644 (file)
--- a/src/complexmat.hpp
+++ b/src/complexmat.hpp
@@ -27,7 +27,10 @@ template <typename T> class ComplexMat_ {
      }
  
      // assuming that mat has 2 channels (real, img)
-    ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1) { p_data = convert(mat); }
+    ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1)
+    {
+        p_data = convert(mat);
+    }
  
      void create(uint _rows, uint _cols, uint _n_channels)
      {
diff --git a/src/dynmem.hpp b/src/dynmem.hpp

new file mode 100644 (file)

index 0000000..3ddf312
--- /dev/null
+++ b/src/dynmem.hpp
@@ -0,0 +1,51 @@
+#ifndef DYNMEM_HPP
+#define DYNMEM_HPP
+
+#include <cstdlib>
+
+#if defined(CUFFT) || defined(CUFFTW)
+#include "cuda_runtime.h"
+#ifdef CUFFT
+#include "cuda/cuda_error_check.cuh"
+#endif
+#endif
+
+template <typename T> class DynMem_ {
+    T *ptr = nullptr;
+    T *ptr_d = nullptr;
+
+  public:
+    DynMem_()
+    {}
+    DynMem_(size_t size)
+    {
+#ifdef CUFFT
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->ptr), size, cudaHostAllocMapped));
+        CudaSafeCall(
+            cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->ptr_d), reinterpret_cast<void *>(this->ptr), 0));
+#else
+        this->ptr = new float[size];
+#endif
+    }
+    ~DynMem_()
+    {
+#ifdef CUFFT
+        CudaSafeCall(cudaFreeHost(this->ptr));
+#else
+        delete this->ptr;
+#endif
+    }
+    T *hostMem() { return ptr; }
+    T *deviceMem() { return ptr_d; }
+
+    void operator=(DynMem_ &&rhs)
+    {
+        this->ptr = rhs.ptr;
+        this->ptr_d = rhs.ptr_d;
+
+        rhs.ptr = nullptr;
+        rhs.ptr_d = nullptr;
+    }
+};
+typedef DynMem_<float> DynMem;
+#endif // DYNMEM_HPP
diff --git a/src/fft_cufft.h b/src/fft_cufft.h

index 3433bc3a90efe72189d3e328dbcde48c2f109d4a..ca0dc59754ecf6b2edc2cf175ff342cbf326b7cf 100644 (file)
--- a/src/fft_cufft.h
+++ b/src/fft_cufft.h
@@ -1,7 +1,10 @@
-
  #ifndef FFT_CUDA_H
  #define FFT_CUDA_H
  
+
+#include <cufft.h>
+#include <cuda_runtime.h>
+
  #include "fft.h"
  #include "cuda/cuda_error_check.cuh"
  #include "pragmas.h"
@@ -14,9 +17,6 @@
    #define CUDA cv::cuda
  #endif
  
-#include <cufft.h>
-#include <cuda_runtime.h>
-
  struct ThreadCtx;
  
  class cuFFT : public Fft
diff --git a/src/kcf.cpp b/src/kcf.cpp

index 629c6f3198a6f69252bc97938d10b157f448c0c0..663623c4868225f6bdf210dce9fc69d8d2ec5898 100644 (file)
--- a/src/kcf.cpp
+++ b/src/kcf.cpp
@@ -191,7 +191,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f
      // window weights, i.e. labels
      fft.forward(
          gaussian_shaped_labels(p_output_sigma, p_windows_size.width / p_cell_size, p_windows_size.height / p_cell_size), p_yf,
-        m_use_cuda ? p_scale_vars.front()->rot_labels_data_d : nullptr, p_scale_vars.front()->stream);
+        m_use_cuda ? p_scale_vars.front()->rot_labels_data.deviceMem() : nullptr, p_scale_vars.front()->stream);
      DEBUG_PRINTM(p_yf);
  
      // obtain a sub-window for training initial model
@@ -199,7 +199,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f
      get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size.width, p_windows_size.height,
                   *p_scale_vars.front());
      fft.forward_window(p_scale_vars.front()->patch_feats, p_model_xf, p_scale_vars.front()->fw_all,
-                       m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
+                       m_use_cuda ? p_scale_vars.front()->data_features.deviceMem() : nullptr, p_scale_vars.front()->stream);
      DEBUG_PRINTM(p_model_xf);
  #if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
      p_scale_vars.front()->model_xf = p_model_xf;
@@ -402,7 +402,7 @@ void KCF_Tracker::track(cv::Mat &img)
      get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size.width, p_windows_size.height,
                   *p_scale_vars.front(), p_current_scale);
      fft.forward_window(p_scale_vars.front()->patch_feats, p_scale_vars.front()->xf, p_scale_vars.front()->fw_all,
-                       m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
+                       m_use_cuda ? p_scale_vars.front()->data_features.deviceMem() : nullptr, p_scale_vars.front()->stream);
  
      // subsequent frames, interpolate model
      p_model_xf = p_model_xf * float((1. - p_interp_factor)) + p_scale_vars.front()->xf * float(p_interp_factor);
@@ -452,14 +452,14 @@ void KCF_Tracker::scale_track(ThreadCtx &vars, cv::Mat &input_rgb, cv::Mat &inpu
                       this->p_windows_size.height, vars, this->p_current_scale *scale);
      }
  
-    fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features_d : nullptr,
+    fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features.deviceMem() : nullptr,
                         vars.stream);
      DEBUG_PRINTM(vars.zf);
  
      if (m_use_linearkernel) {
          vars.kzf = m_use_big_batch ? (vars.zf.mul2(this->p_model_alphaf)).sum_over_channels()
                                     : (p_model_alphaf * vars.zf).sum_over_channels();
-        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
+        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream);
      } else {
  #if !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
          gaussian_correlation(vars, vars.zf, vars.model_xf, this->p_kernel_sigma);
@@ -470,7 +470,7 @@ void KCF_Tracker::scale_track(ThreadCtx &vars, cv::Mat &input_rgb, cv::Mat &inpu
          DEBUG_PRINTM(vars.kzf);
          vars.kzf = m_use_big_batch ? vars.kzf.mul(this->p_model_alphaf) : this->p_model_alphaf * vars.kzf;
  #endif
-        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
+        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream);
      }
  
      DEBUG_PRINTM(vars.response);
@@ -732,25 +732,25 @@ void KCF_Tracker::gaussian_correlation(struct ThreadCtx &vars, const ComplexMat
                                         double sigma, bool auto_correlation)
  {
  #ifdef CUFFT
-    xf.sqr_norm(vars.xf_sqr_norm_d);
-    if (!auto_correlation) yf.sqr_norm(vars.yf_sqr_norm_d);
+    xf.sqr_norm(vars.xf_sqr_norm.deviceMem());
+    if (!auto_correlation) yf.sqr_norm(vars.yf_sqr_norm.deviceMem());
  #else
-    xf.sqr_norm(vars.xf_sqr_norm);
+    xf.sqr_norm(vars.xf_sqr_norm.hostMem());
      if (auto_correlation) {
-        vars.yf_sqr_norm[0] = vars.xf_sqr_norm[0];
+        vars.yf_sqr_norm.hostMem()[0] = vars.xf_sqr_norm.hostMem()[0];
      } else {
-        yf.sqr_norm(vars.yf_sqr_norm);
+        yf.sqr_norm(vars.yf_sqr_norm.hostMem());
      }
  #endif
      vars.xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
      DEBUG_PRINTM(vars.xyf);
-    fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features_d : nullptr, vars.stream);
+    fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features.deviceMem() : nullptr, vars.stream);
  #ifdef CUFFT
      if (auto_correlation)
-        cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.xf_sqr_norm_d,
+        cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.xf_sqr_norm.deviceMem(),
                                    sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
      else
-        cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.yf_sqr_norm_d,
+        cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.yf_sqr_norm.deviceMem(),
                                    sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
  #else
      // ifft2 and sum over 3rd dimension, we dont care about individual channels
@@ -784,13 +784,13 @@ void KCF_Tracker::gaussian_correlation(struct ThreadCtx &vars, const ComplexMat
          cv::Mat in_roi(vars.in_all, cv::Rect(0, int(i) * scales[0].rows, scales[0].cols, scales[0].rows));
          cv::exp(
              -1. / (sigma * sigma) *
-                cv::max((double(vars.xf_sqr_norm[i] + vars.yf_sqr_norm[0]) - 2 * scales[i]) * double(numel_xf_inv), 0),
+                cv::max((double(vars.xf_sqr_norm.hostMem()[i] + vars.yf_sqr_norm.hostMem()[0]) - 2 * scales[i]) * double(numel_xf_inv), 0),
              in_roi);
          DEBUG_PRINTM(in_roi);
      }
  #endif
      DEBUG_PRINTM(vars.in_all);
-    fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res_d : nullptr,
+    fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res.deviceMem() : nullptr,
                  vars.stream);
      return;
  }
diff --git a/src/threadctx.hpp b/src/threadctx.hpp

index 1cf13378cb27d34d499a214e11c9d12ad59c5b5e..2316d9bae46d64a13b131ccffeb0f8da28eb773d 100644 (file)
--- a/src/threadctx.hpp
+++ b/src/threadctx.hpp
@@ -1,6 +1,8 @@
  #ifndef SCALE_VARS_HPP
  #define SCALE_VARS_HPP
  
+#include "dynmem.hpp"
+
  #ifdef CUFFT
  #include "complexmat.cuh"
  #else
@@ -8,8 +10,6 @@
  #ifndef CUFFTW
  // For compatibility reasons between CuFFT and FFTW, OpenCVfft versions.
  typedef int *cudaStream_t;
-#else
-#include "cuda_runtime.h"
  #endif
  #endif
  
@@ -29,24 +29,17 @@ struct ThreadCtx {
  #endif
  
          this->patch_feats.reserve(uint(num_of_feats));
-// Size of cufftReal == float
+        // Size of cufftReal == float
          uint cells_size =
              ((uint(windows_size.width) / cell_size) * (uint(windows_size.height) / cell_size)) * sizeof(float);
  
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->data_i_1ch), cells_size * num_of_scales,
-                                   cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->data_i_1ch_d),
-                                              reinterpret_cast<void *>(this->data_i_1ch), 0));
-
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->data_i_features), cells_size * num_of_feats,
-                                   cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->data_i_features_d),
-                                              reinterpret_cast<void *>(this->data_i_features), 0));
+        this->data_i_1ch = DynMem(cells_size * num_of_scales);
+        this->data_i_features = DynMem(cells_size * num_of_feats);
  
          this->ifft2_res = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
-                                  CV_32FC(int(num_of_feats)), this->data_i_features);
+                                  CV_32FC(int(num_of_feats)), this->data_i_features.hostMem());
          this->response = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
-                                 CV_32FC(int(num_of_scales)), this->data_i_1ch);
+                                 CV_32FC(int(num_of_scales)), this->data_i_1ch.hostMem());
  
          this->zf.create(uint(windows_size.height) / cell_size, (uint(windows_size.width) / cell_size) / 2 + 1,
                          num_of_feats, num_of_scales, this->stream);
@@ -55,40 +48,26 @@ struct ThreadCtx {
          this->kf.create(uint(windows_size.height) / cell_size, (uint(windows_size.width) / cell_size) / 2 + 1,
                          num_of_scales, this->stream);
  
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->xf_sqr_norm), num_of_scales * sizeof(float),
-                                   cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->xf_sqr_norm_d),
-                                              reinterpret_cast<void *>(this->xf_sqr_norm), 0));
-
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->yf_sqr_norm), sizeof(float), cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->yf_sqr_norm_d),
-                                              reinterpret_cast<void *>(this->yf_sqr_norm), 0));
+        this->xf_sqr_norm = DynMem(num_of_scales * sizeof(float));
+        this->yf_sqr_norm = DynMem(sizeof(float));
  
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->gauss_corr_res), cells_size * num_of_scales,
-                                   cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->gauss_corr_res_d),
-                                              reinterpret_cast<void *>(this->gauss_corr_res), 0));
+        this->gauss_corr_res = DynMem(cells_size * num_of_scales);
          this->in_all = cv::Mat(windows_size.height / int(cell_size) * int(num_of_scales),
-                               windows_size.width / int(cell_size), CV_32F, this->gauss_corr_res);
+                               windows_size.width / int(cell_size), CV_32F, this->gauss_corr_res.hostMem());
  
          if (zero_index) {
-            CudaSafeCall(
-                cudaHostAlloc(reinterpret_cast<void **>(&this->rot_labels_data), cells_size, cudaHostAllocMapped));
-            CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->rot_labels_data_d),
-                                                  reinterpret_cast<void *>(this->rot_labels_data), 0));
+            this->rot_labels_data = DynMem(cells_size);
              this->rot_labels = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
-                                       CV_32FC1, this->rot_labels_data);
+                                       CV_32FC1, this->rot_labels_data.hostMem());
          }
  
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->data_features), cells_size*num_of_feats, cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->data_features_d),
-                                              reinterpret_cast<void *>(this->data_features), 0));
+        this->data_features = DynMem(cells_size * num_of_feats);
          this->fw_all = cv::Mat((windows_size.height / int(cell_size)) * int(num_of_feats),
-                               windows_size.width / int(cell_size), CV_32F, this->data_features);
+                               windows_size.width / int(cell_size), CV_32F, this->data_features.hostMem());
  #else
  
-        this->xf_sqr_norm = reinterpret_cast<float *>(malloc(num_of_scales * sizeof(float)));
-        this->yf_sqr_norm = reinterpret_cast<float *>(malloc(sizeof(float)));
+        this->xf_sqr_norm = DynMem(num_of_scales * sizeof(float));
+        this->yf_sqr_norm = DynMem(sizeof (float));
  
          this->patch_feats.reserve(num_of_feats);
  
@@ -143,24 +122,12 @@ struct ThreadCtx {
  
      ~ThreadCtx()
      {
-#ifdef CUFFT
-        CudaSafeCall(cudaFreeHost(this->xf_sqr_norm));
-        CudaSafeCall(cudaFreeHost(this->yf_sqr_norm));
-        CudaSafeCall(cudaFreeHost(this->data_i_1ch));
-        CudaSafeCall(cudaFreeHost(this->data_i_features));
-        CudaSafeCall(cudaFreeHost(this->gauss_corr_res));
-        if (zero_index) CudaSafeCall(cudaFreeHost(this->rot_labels_data));
-        CudaSafeCall(cudaFreeHost(this->data_features));
-#if defined(ASYNC) || defined(OPENMP)
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
          CudaSafeCall(cudaStreamDestroy(this->stream));
-#endif
-#else
-        free(this->xf_sqr_norm);
-        free(this->yf_sqr_norm);
  #endif
      }
  
-    float *xf_sqr_norm = nullptr, *yf_sqr_norm = nullptr;
+    DynMem xf_sqr_norm, yf_sqr_norm;
      std::vector<cv::Mat> patch_feats;
  
      cv::Mat in_all, fw_all, ifft2_res, response;
@@ -168,11 +135,7 @@ struct ThreadCtx {
  
      // CuFFT variables
      cv::Mat rot_labels;
-    float *xf_sqr_norm_d = nullptr, *yf_sqr_norm_d = nullptr, *gauss_corr_res = nullptr, *gauss_corr_res_d = nullptr,
-          *rot_labels_data = nullptr, *rot_labels_data_d = nullptr, *data_features = nullptr,
-          *data_features_d = nullptr;
-    float *data_f = nullptr, *data_i_features = nullptr, *data_i_features_d = nullptr, *data_i_1ch = nullptr,
-          *data_i_1ch_d = nullptr;
+    DynMem gauss_corr_res, rot_labels_data, data_features, data_f, data_i_features, data_i_1ch;
  
      cudaStream_t stream = nullptr;
      ComplexMat model_alphaf, model_xf;
author	Shanigen <vkaraf@gmail.com>
	Thu, 6 Sep 2018 10:42:16 +0000 (12:42 +0200)
committer	Michal Sojka <michal.sojka@cvut.cz>
	Mon, 10 Sep 2018 14:52:24 +0000 (16:52 +0200)
src/CMakeLists.txt		patch \| blob \| history
src/complexmat.hpp		patch \| blob \| history
src/dynmem.hpp	[new file with mode: 0644]	patch \| blob
src/fft_cufft.h		patch \| blob \| history
src/kcf.cpp		patch \| blob \| history
src/threadctx.hpp		patch \| blob \| history