From: Shanigen <vkaraf@gmail.com>
Date: Thu, 6 Sep 2018 10:42:16 +0000 (+0200)
Subject: Added DynMem class
X-Git-Url: http://rtime.felk.cvut.cz/gitweb/hercules2020/kcf.git/commitdiff_plain/c3b542f1a98a98a0fa98dace5e342d766f9696ce

Added DynMem class

DynMem class streamlines dynamic memory allocation for both CUDA
and CPU version of the tracker. Also some minor changes in fft_cufft.hpp
were made, because there was a problem with order of header includes.
---

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1cff1e5..3ffa79b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 2.8)
 
-set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h)
+set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h dynmem.hpp)
 
 find_package(PkgConfig)
 
diff --git a/src/complexmat.hpp b/src/complexmat.hpp
index 2a59574..7ba3f60 100644
--- a/src/complexmat.hpp
+++ b/src/complexmat.hpp
@@ -27,7 +27,10 @@ template <typename T> class ComplexMat_ {
     }
 
     // assuming that mat has 2 channels (real, img)
-    ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1) { p_data = convert(mat); }
+    ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1)
+    {
+        p_data = convert(mat);
+    }
 
     void create(uint _rows, uint _cols, uint _n_channels)
     {
diff --git a/src/dynmem.hpp b/src/dynmem.hpp
new file mode 100644
index 0000000..3ddf312
--- /dev/null
+++ b/src/dynmem.hpp
@@ -0,0 +1,51 @@
+#ifndef DYNMEM_HPP
+#define DYNMEM_HPP
+
+#include <cstdlib>
+
+#if defined(CUFFT) || defined(CUFFTW)
+#include "cuda_runtime.h"
+#ifdef CUFFT
+#include "cuda/cuda_error_check.cuh"
+#endif
+#endif
+
+template <typename T> class DynMem_ {
+    T *ptr = nullptr;
+    T *ptr_d = nullptr;
+
+  public:
+    DynMem_()
+    {}
+    DynMem_(size_t size)
+    {
+#ifdef CUFFT
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->ptr), size, cudaHostAllocMapped));
+        CudaSafeCall(
+            cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->ptr_d), reinterpret_cast<void *>(this->ptr), 0));
+#else
+        this->ptr = new float[size];
+#endif
+    }
+    ~DynMem_()
+    {
+#ifdef CUFFT
+        CudaSafeCall(cudaFreeHost(this->ptr));
+#else
+        delete this->ptr;
+#endif
+    }
+    T *hostMem() { return ptr; }
+    T *deviceMem() { return ptr_d; }
+
+    void operator=(DynMem_ &&rhs)
+    {
+        this->ptr = rhs.ptr;
+        this->ptr_d = rhs.ptr_d;
+
+        rhs.ptr = nullptr;
+        rhs.ptr_d = nullptr;
+    }
+};
+typedef DynMem_<float> DynMem;
+#endif // DYNMEM_HPP
diff --git a/src/fft_cufft.h b/src/fft_cufft.h
index 3433bc3..ca0dc59 100644
--- a/src/fft_cufft.h
+++ b/src/fft_cufft.h
@@ -1,7 +1,10 @@
-
 #ifndef FFT_CUDA_H
 #define FFT_CUDA_H
 
+
+#include <cufft.h>
+#include <cuda_runtime.h>
+
 #include "fft.h"
 #include "cuda/cuda_error_check.cuh"
 #include "pragmas.h"
@@ -14,9 +17,6 @@
   #define CUDA cv::cuda
 #endif
 
-#include <cufft.h>
-#include <cuda_runtime.h>
-
 struct ThreadCtx;
 
 class cuFFT : public Fft
diff --git a/src/kcf.cpp b/src/kcf.cpp
index 629c6f3..663623c 100644
--- a/src/kcf.cpp
+++ b/src/kcf.cpp
@@ -191,7 +191,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f
     // window weights, i.e. labels
     fft.forward(
         gaussian_shaped_labels(p_output_sigma, p_windows_size.width / p_cell_size, p_windows_size.height / p_cell_size), p_yf,
-        m_use_cuda ? p_scale_vars.front()->rot_labels_data_d : nullptr, p_scale_vars.front()->stream);
+        m_use_cuda ? p_scale_vars.front()->rot_labels_data.deviceMem() : nullptr, p_scale_vars.front()->stream);
     DEBUG_PRINTM(p_yf);
 
     // obtain a sub-window for training initial model
@@ -199,7 +199,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f
     get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size.width, p_windows_size.height,
                  *p_scale_vars.front());
     fft.forward_window(p_scale_vars.front()->patch_feats, p_model_xf, p_scale_vars.front()->fw_all,
-                       m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
+                       m_use_cuda ? p_scale_vars.front()->data_features.deviceMem() : nullptr, p_scale_vars.front()->stream);
     DEBUG_PRINTM(p_model_xf);
 #if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
     p_scale_vars.front()->model_xf = p_model_xf;
@@ -402,7 +402,7 @@ void KCF_Tracker::track(cv::Mat &img)
     get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size.width, p_windows_size.height,
                  *p_scale_vars.front(), p_current_scale);
     fft.forward_window(p_scale_vars.front()->patch_feats, p_scale_vars.front()->xf, p_scale_vars.front()->fw_all,
-                       m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
+                       m_use_cuda ? p_scale_vars.front()->data_features.deviceMem() : nullptr, p_scale_vars.front()->stream);
 
     // subsequent frames, interpolate model
     p_model_xf = p_model_xf * float((1. - p_interp_factor)) + p_scale_vars.front()->xf * float(p_interp_factor);
@@ -452,14 +452,14 @@ void KCF_Tracker::scale_track(ThreadCtx &vars, cv::Mat &input_rgb, cv::Mat &inpu
                      this->p_windows_size.height, vars, this->p_current_scale *scale);
     }
 
-    fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features_d : nullptr,
+    fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features.deviceMem() : nullptr,
                        vars.stream);
     DEBUG_PRINTM(vars.zf);
 
     if (m_use_linearkernel) {
         vars.kzf = m_use_big_batch ? (vars.zf.mul2(this->p_model_alphaf)).sum_over_channels()
                                    : (p_model_alphaf * vars.zf).sum_over_channels();
-        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
+        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream);
     } else {
 #if !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
         gaussian_correlation(vars, vars.zf, vars.model_xf, this->p_kernel_sigma);
@@ -470,7 +470,7 @@ void KCF_Tracker::scale_track(ThreadCtx &vars, cv::Mat &input_rgb, cv::Mat &inpu
         DEBUG_PRINTM(vars.kzf);
         vars.kzf = m_use_big_batch ? vars.kzf.mul(this->p_model_alphaf) : this->p_model_alphaf * vars.kzf;
 #endif
-        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
+        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream);
     }
 
     DEBUG_PRINTM(vars.response);
@@ -732,25 +732,25 @@ void KCF_Tracker::gaussian_correlation(struct ThreadCtx &vars, const ComplexMat
                                        double sigma, bool auto_correlation)
 {
 #ifdef CUFFT
-    xf.sqr_norm(vars.xf_sqr_norm_d);
-    if (!auto_correlation) yf.sqr_norm(vars.yf_sqr_norm_d);
+    xf.sqr_norm(vars.xf_sqr_norm.deviceMem());
+    if (!auto_correlation) yf.sqr_norm(vars.yf_sqr_norm.deviceMem());
 #else
-    xf.sqr_norm(vars.xf_sqr_norm);
+    xf.sqr_norm(vars.xf_sqr_norm.hostMem());
     if (auto_correlation) {
-        vars.yf_sqr_norm[0] = vars.xf_sqr_norm[0];
+        vars.yf_sqr_norm.hostMem()[0] = vars.xf_sqr_norm.hostMem()[0];
     } else {
-        yf.sqr_norm(vars.yf_sqr_norm);
+        yf.sqr_norm(vars.yf_sqr_norm.hostMem());
     }
 #endif
     vars.xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
     DEBUG_PRINTM(vars.xyf);
-    fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features_d : nullptr, vars.stream);
+    fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features.deviceMem() : nullptr, vars.stream);
 #ifdef CUFFT
     if (auto_correlation)
-        cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.xf_sqr_norm_d,
+        cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.xf_sqr_norm.deviceMem(),
                                   sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
     else
-        cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.yf_sqr_norm_d,
+        cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.yf_sqr_norm.deviceMem(),
                                   sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
 #else
     // ifft2 and sum over 3rd dimension, we dont care about individual channels
@@ -784,13 +784,13 @@ void KCF_Tracker::gaussian_correlation(struct ThreadCtx &vars, const ComplexMat
         cv::Mat in_roi(vars.in_all, cv::Rect(0, int(i) * scales[0].rows, scales[0].cols, scales[0].rows));
         cv::exp(
             -1. / (sigma * sigma) *
-                cv::max((double(vars.xf_sqr_norm[i] + vars.yf_sqr_norm[0]) - 2 * scales[i]) * double(numel_xf_inv), 0),
+                cv::max((double(vars.xf_sqr_norm.hostMem()[i] + vars.yf_sqr_norm.hostMem()[0]) - 2 * scales[i]) * double(numel_xf_inv), 0),
             in_roi);
         DEBUG_PRINTM(in_roi);
     }
 #endif
     DEBUG_PRINTM(vars.in_all);
-    fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res_d : nullptr,
+    fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res.deviceMem() : nullptr,
                 vars.stream);
     return;
 }
diff --git a/src/threadctx.hpp b/src/threadctx.hpp
index 1cf1337..2316d9b 100644
--- a/src/threadctx.hpp
+++ b/src/threadctx.hpp
@@ -1,6 +1,8 @@
 #ifndef SCALE_VARS_HPP
 #define SCALE_VARS_HPP
 
+#include "dynmem.hpp"
+
 #ifdef CUFFT
 #include "complexmat.cuh"
 #else
@@ -8,8 +10,6 @@
 #ifndef CUFFTW
 // For compatibility reasons between CuFFT and FFTW, OpenCVfft versions.
 typedef int *cudaStream_t;
-#else
-#include "cuda_runtime.h"
 #endif
 #endif
 
@@ -29,24 +29,17 @@ struct ThreadCtx {
 #endif
 
         this->patch_feats.reserve(uint(num_of_feats));
-// Size of cufftReal == float
+        // Size of cufftReal == float
         uint cells_size =
             ((uint(windows_size.width) / cell_size) * (uint(windows_size.height) / cell_size)) * sizeof(float);
 
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->data_i_1ch), cells_size * num_of_scales,
-                                   cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->data_i_1ch_d),
-                                              reinterpret_cast<void *>(this->data_i_1ch), 0));
-
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->data_i_features), cells_size * num_of_feats,
-                                   cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->data_i_features_d),
-                                              reinterpret_cast<void *>(this->data_i_features), 0));
+        this->data_i_1ch = DynMem(cells_size * num_of_scales);
+        this->data_i_features = DynMem(cells_size * num_of_feats);
 
         this->ifft2_res = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
-                                  CV_32FC(int(num_of_feats)), this->data_i_features);
+                                  CV_32FC(int(num_of_feats)), this->data_i_features.hostMem());
         this->response = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
-                                 CV_32FC(int(num_of_scales)), this->data_i_1ch);
+                                 CV_32FC(int(num_of_scales)), this->data_i_1ch.hostMem());
 
         this->zf.create(uint(windows_size.height) / cell_size, (uint(windows_size.width) / cell_size) / 2 + 1,
                         num_of_feats, num_of_scales, this->stream);
@@ -55,40 +48,26 @@ struct ThreadCtx {
         this->kf.create(uint(windows_size.height) / cell_size, (uint(windows_size.width) / cell_size) / 2 + 1,
                         num_of_scales, this->stream);
 
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->xf_sqr_norm), num_of_scales * sizeof(float),
-                                   cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->xf_sqr_norm_d),
-                                              reinterpret_cast<void *>(this->xf_sqr_norm), 0));
-
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->yf_sqr_norm), sizeof(float), cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->yf_sqr_norm_d),
-                                              reinterpret_cast<void *>(this->yf_sqr_norm), 0));
+        this->xf_sqr_norm = DynMem(num_of_scales * sizeof(float));
+        this->yf_sqr_norm = DynMem(sizeof(float));
 
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->gauss_corr_res), cells_size * num_of_scales,
-                                   cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->gauss_corr_res_d),
-                                              reinterpret_cast<void *>(this->gauss_corr_res), 0));
+        this->gauss_corr_res = DynMem(cells_size * num_of_scales);
         this->in_all = cv::Mat(windows_size.height / int(cell_size) * int(num_of_scales),
-                               windows_size.width / int(cell_size), CV_32F, this->gauss_corr_res);
+                               windows_size.width / int(cell_size), CV_32F, this->gauss_corr_res.hostMem());
 
         if (zero_index) {
-            CudaSafeCall(
-                cudaHostAlloc(reinterpret_cast<void **>(&this->rot_labels_data), cells_size, cudaHostAllocMapped));
-            CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->rot_labels_data_d),
-                                                  reinterpret_cast<void *>(this->rot_labels_data), 0));
+            this->rot_labels_data = DynMem(cells_size);
             this->rot_labels = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
-                                       CV_32FC1, this->rot_labels_data);
+                                       CV_32FC1, this->rot_labels_data.hostMem());
         }
 
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->data_features), cells_size*num_of_feats, cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->data_features_d),
-                                              reinterpret_cast<void *>(this->data_features), 0));
+        this->data_features = DynMem(cells_size * num_of_feats);
         this->fw_all = cv::Mat((windows_size.height / int(cell_size)) * int(num_of_feats),
-                               windows_size.width / int(cell_size), CV_32F, this->data_features);
+                               windows_size.width / int(cell_size), CV_32F, this->data_features.hostMem());
 #else
 
-        this->xf_sqr_norm = reinterpret_cast<float *>(malloc(num_of_scales * sizeof(float)));
-        this->yf_sqr_norm = reinterpret_cast<float *>(malloc(sizeof(float)));
+        this->xf_sqr_norm = DynMem(num_of_scales * sizeof(float));
+        this->yf_sqr_norm = DynMem(sizeof (float));
 
         this->patch_feats.reserve(num_of_feats);
 
@@ -143,24 +122,12 @@ struct ThreadCtx {
 
     ~ThreadCtx()
     {
-#ifdef CUFFT
-        CudaSafeCall(cudaFreeHost(this->xf_sqr_norm));
-        CudaSafeCall(cudaFreeHost(this->yf_sqr_norm));
-        CudaSafeCall(cudaFreeHost(this->data_i_1ch));
-        CudaSafeCall(cudaFreeHost(this->data_i_features));
-        CudaSafeCall(cudaFreeHost(this->gauss_corr_res));
-        if (zero_index) CudaSafeCall(cudaFreeHost(this->rot_labels_data));
-        CudaSafeCall(cudaFreeHost(this->data_features));
-#if defined(ASYNC) || defined(OPENMP)
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
         CudaSafeCall(cudaStreamDestroy(this->stream));
-#endif
-#else
-        free(this->xf_sqr_norm);
-        free(this->yf_sqr_norm);
 #endif
     }
 
-    float *xf_sqr_norm = nullptr, *yf_sqr_norm = nullptr;
+    DynMem xf_sqr_norm, yf_sqr_norm;
     std::vector<cv::Mat> patch_feats;
 
     cv::Mat in_all, fw_all, ifft2_res, response;
@@ -168,11 +135,7 @@ struct ThreadCtx {
 
     // CuFFT variables
     cv::Mat rot_labels;
-    float *xf_sqr_norm_d = nullptr, *yf_sqr_norm_d = nullptr, *gauss_corr_res = nullptr, *gauss_corr_res_d = nullptr,
-          *rot_labels_data = nullptr, *rot_labels_data_d = nullptr, *data_features = nullptr,
-          *data_features_d = nullptr;
-    float *data_f = nullptr, *data_i_features = nullptr, *data_i_features_d = nullptr, *data_i_1ch = nullptr,
-          *data_i_1ch_d = nullptr;
+    DynMem gauss_corr_res, rot_labels_data, data_features, data_f, data_i_features, data_i_1ch;
 
     cudaStream_t stream = nullptr;
     ComplexMat model_alphaf, model_xf;