From: Shanigen Date: Thu, 6 Sep 2018 10:42:16 +0000 (+0200) Subject: Added DynMem class X-Git-Url: http://rtime.felk.cvut.cz/gitweb/hercules2020/kcf.git/commitdiff_plain/c3b542f1a98a98a0fa98dace5e342d766f9696ce Added DynMem class DynMem class streamlines dynamic memory allocation for both CUDA and CPU version of the tracker. Also some minor changes in fft_cufft.hpp were made, because there was a problem with order of header includes. --- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1cff1e5..3ffa79b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 2.8) -set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h) +set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h dynmem.hpp) find_package(PkgConfig) diff --git a/src/complexmat.hpp b/src/complexmat.hpp index 2a59574..7ba3f60 100644 --- a/src/complexmat.hpp +++ b/src/complexmat.hpp @@ -27,7 +27,10 @@ template class ComplexMat_ { } // assuming that mat has 2 channels (real, img) - ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1) { p_data = convert(mat); } + ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1) + { + p_data = convert(mat); + } void create(uint _rows, uint _cols, uint _n_channels) { diff --git a/src/dynmem.hpp b/src/dynmem.hpp new file mode 100644 index 0000000..3ddf312 --- /dev/null +++ b/src/dynmem.hpp @@ -0,0 +1,51 @@ +#ifndef DYNMEM_HPP +#define DYNMEM_HPP + +#include + +#if defined(CUFFT) || defined(CUFFTW) +#include "cuda_runtime.h" +#ifdef CUFFT +#include "cuda/cuda_error_check.cuh" +#endif +#endif + +template class DynMem_ { + T *ptr = nullptr; + T *ptr_d = nullptr; + + public: + DynMem_() + {} + DynMem_(size_t size) + { +#ifdef CUFFT + CudaSafeCall(cudaHostAlloc(reinterpret_cast(&this->ptr), size, cudaHostAllocMapped)); + CudaSafeCall( + cudaHostGetDevicePointer(reinterpret_cast(&this->ptr_d), reinterpret_cast(this->ptr), 0)); +#else + this->ptr = new float[size]; +#endif + } + ~DynMem_() + { +#ifdef CUFFT + CudaSafeCall(cudaFreeHost(this->ptr)); +#else + delete this->ptr; +#endif + } + T *hostMem() { return ptr; } + T *deviceMem() { return ptr_d; } + + void operator=(DynMem_ &&rhs) + { + this->ptr = rhs.ptr; + this->ptr_d = rhs.ptr_d; + + rhs.ptr = nullptr; + rhs.ptr_d = nullptr; + } +}; +typedef DynMem_ DynMem; +#endif // DYNMEM_HPP diff --git a/src/fft_cufft.h b/src/fft_cufft.h index 3433bc3..ca0dc59 100644 --- a/src/fft_cufft.h +++ b/src/fft_cufft.h @@ -1,7 +1,10 @@ - #ifndef FFT_CUDA_H #define FFT_CUDA_H + +#include +#include + #include "fft.h" #include "cuda/cuda_error_check.cuh" #include "pragmas.h" @@ -14,9 +17,6 @@ #define CUDA cv::cuda #endif -#include -#include - struct ThreadCtx; class cuFFT : public Fft diff --git a/src/kcf.cpp b/src/kcf.cpp index 629c6f3..663623c 100644 --- a/src/kcf.cpp +++ b/src/kcf.cpp @@ -191,7 +191,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f // window weights, i.e. labels fft.forward( gaussian_shaped_labels(p_output_sigma, p_windows_size.width / p_cell_size, p_windows_size.height / p_cell_size), p_yf, - m_use_cuda ? p_scale_vars.front()->rot_labels_data_d : nullptr, p_scale_vars.front()->stream); + m_use_cuda ? p_scale_vars.front()->rot_labels_data.deviceMem() : nullptr, p_scale_vars.front()->stream); DEBUG_PRINTM(p_yf); // obtain a sub-window for training initial model @@ -199,7 +199,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size.width, p_windows_size.height, *p_scale_vars.front()); fft.forward_window(p_scale_vars.front()->patch_feats, p_model_xf, p_scale_vars.front()->fw_all, - m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream); + m_use_cuda ? p_scale_vars.front()->data_features.deviceMem() : nullptr, p_scale_vars.front()->stream); DEBUG_PRINTM(p_model_xf); #if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP)) p_scale_vars.front()->model_xf = p_model_xf; @@ -402,7 +402,7 @@ void KCF_Tracker::track(cv::Mat &img) get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size.width, p_windows_size.height, *p_scale_vars.front(), p_current_scale); fft.forward_window(p_scale_vars.front()->patch_feats, p_scale_vars.front()->xf, p_scale_vars.front()->fw_all, - m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream); + m_use_cuda ? p_scale_vars.front()->data_features.deviceMem() : nullptr, p_scale_vars.front()->stream); // subsequent frames, interpolate model p_model_xf = p_model_xf * float((1. - p_interp_factor)) + p_scale_vars.front()->xf * float(p_interp_factor); @@ -452,14 +452,14 @@ void KCF_Tracker::scale_track(ThreadCtx &vars, cv::Mat &input_rgb, cv::Mat &inpu this->p_windows_size.height, vars, this->p_current_scale *scale); } - fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features_d : nullptr, + fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features.deviceMem() : nullptr, vars.stream); DEBUG_PRINTM(vars.zf); if (m_use_linearkernel) { vars.kzf = m_use_big_batch ? (vars.zf.mul2(this->p_model_alphaf)).sum_over_channels() : (p_model_alphaf * vars.zf).sum_over_channels(); - fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream); + fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream); } else { #if !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP)) gaussian_correlation(vars, vars.zf, vars.model_xf, this->p_kernel_sigma); @@ -470,7 +470,7 @@ void KCF_Tracker::scale_track(ThreadCtx &vars, cv::Mat &input_rgb, cv::Mat &inpu DEBUG_PRINTM(vars.kzf); vars.kzf = m_use_big_batch ? vars.kzf.mul(this->p_model_alphaf) : this->p_model_alphaf * vars.kzf; #endif - fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream); + fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream); } DEBUG_PRINTM(vars.response); @@ -732,25 +732,25 @@ void KCF_Tracker::gaussian_correlation(struct ThreadCtx &vars, const ComplexMat double sigma, bool auto_correlation) { #ifdef CUFFT - xf.sqr_norm(vars.xf_sqr_norm_d); - if (!auto_correlation) yf.sqr_norm(vars.yf_sqr_norm_d); + xf.sqr_norm(vars.xf_sqr_norm.deviceMem()); + if (!auto_correlation) yf.sqr_norm(vars.yf_sqr_norm.deviceMem()); #else - xf.sqr_norm(vars.xf_sqr_norm); + xf.sqr_norm(vars.xf_sqr_norm.hostMem()); if (auto_correlation) { - vars.yf_sqr_norm[0] = vars.xf_sqr_norm[0]; + vars.yf_sqr_norm.hostMem()[0] = vars.xf_sqr_norm.hostMem()[0]; } else { - yf.sqr_norm(vars.yf_sqr_norm); + yf.sqr_norm(vars.yf_sqr_norm.hostMem()); } #endif vars.xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj()); DEBUG_PRINTM(vars.xyf); - fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features_d : nullptr, vars.stream); + fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features.deviceMem() : nullptr, vars.stream); #ifdef CUFFT if (auto_correlation) - cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.xf_sqr_norm_d, + cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.xf_sqr_norm.deviceMem(), sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream); else - cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.yf_sqr_norm_d, + cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.yf_sqr_norm.deviceMem(), sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream); #else // ifft2 and sum over 3rd dimension, we dont care about individual channels @@ -784,13 +784,13 @@ void KCF_Tracker::gaussian_correlation(struct ThreadCtx &vars, const ComplexMat cv::Mat in_roi(vars.in_all, cv::Rect(0, int(i) * scales[0].rows, scales[0].cols, scales[0].rows)); cv::exp( -1. / (sigma * sigma) * - cv::max((double(vars.xf_sqr_norm[i] + vars.yf_sqr_norm[0]) - 2 * scales[i]) * double(numel_xf_inv), 0), + cv::max((double(vars.xf_sqr_norm.hostMem()[i] + vars.yf_sqr_norm.hostMem()[0]) - 2 * scales[i]) * double(numel_xf_inv), 0), in_roi); DEBUG_PRINTM(in_roi); } #endif DEBUG_PRINTM(vars.in_all); - fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res_d : nullptr, + fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res.deviceMem() : nullptr, vars.stream); return; } diff --git a/src/threadctx.hpp b/src/threadctx.hpp index 1cf1337..2316d9b 100644 --- a/src/threadctx.hpp +++ b/src/threadctx.hpp @@ -1,6 +1,8 @@ #ifndef SCALE_VARS_HPP #define SCALE_VARS_HPP +#include "dynmem.hpp" + #ifdef CUFFT #include "complexmat.cuh" #else @@ -8,8 +10,6 @@ #ifndef CUFFTW // For compatibility reasons between CuFFT and FFTW, OpenCVfft versions. typedef int *cudaStream_t; -#else -#include "cuda_runtime.h" #endif #endif @@ -29,24 +29,17 @@ struct ThreadCtx { #endif this->patch_feats.reserve(uint(num_of_feats)); -// Size of cufftReal == float + // Size of cufftReal == float uint cells_size = ((uint(windows_size.width) / cell_size) * (uint(windows_size.height) / cell_size)) * sizeof(float); - CudaSafeCall(cudaHostAlloc(reinterpret_cast(&this->data_i_1ch), cells_size * num_of_scales, - cudaHostAllocMapped)); - CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast(&this->data_i_1ch_d), - reinterpret_cast(this->data_i_1ch), 0)); - - CudaSafeCall(cudaHostAlloc(reinterpret_cast(&this->data_i_features), cells_size * num_of_feats, - cudaHostAllocMapped)); - CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast(&this->data_i_features_d), - reinterpret_cast(this->data_i_features), 0)); + this->data_i_1ch = DynMem(cells_size * num_of_scales); + this->data_i_features = DynMem(cells_size * num_of_feats); this->ifft2_res = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size), - CV_32FC(int(num_of_feats)), this->data_i_features); + CV_32FC(int(num_of_feats)), this->data_i_features.hostMem()); this->response = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size), - CV_32FC(int(num_of_scales)), this->data_i_1ch); + CV_32FC(int(num_of_scales)), this->data_i_1ch.hostMem()); this->zf.create(uint(windows_size.height) / cell_size, (uint(windows_size.width) / cell_size) / 2 + 1, num_of_feats, num_of_scales, this->stream); @@ -55,40 +48,26 @@ struct ThreadCtx { this->kf.create(uint(windows_size.height) / cell_size, (uint(windows_size.width) / cell_size) / 2 + 1, num_of_scales, this->stream); - CudaSafeCall(cudaHostAlloc(reinterpret_cast(&this->xf_sqr_norm), num_of_scales * sizeof(float), - cudaHostAllocMapped)); - CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast(&this->xf_sqr_norm_d), - reinterpret_cast(this->xf_sqr_norm), 0)); - - CudaSafeCall(cudaHostAlloc(reinterpret_cast(&this->yf_sqr_norm), sizeof(float), cudaHostAllocMapped)); - CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast(&this->yf_sqr_norm_d), - reinterpret_cast(this->yf_sqr_norm), 0)); + this->xf_sqr_norm = DynMem(num_of_scales * sizeof(float)); + this->yf_sqr_norm = DynMem(sizeof(float)); - CudaSafeCall(cudaHostAlloc(reinterpret_cast(&this->gauss_corr_res), cells_size * num_of_scales, - cudaHostAllocMapped)); - CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast(&this->gauss_corr_res_d), - reinterpret_cast(this->gauss_corr_res), 0)); + this->gauss_corr_res = DynMem(cells_size * num_of_scales); this->in_all = cv::Mat(windows_size.height / int(cell_size) * int(num_of_scales), - windows_size.width / int(cell_size), CV_32F, this->gauss_corr_res); + windows_size.width / int(cell_size), CV_32F, this->gauss_corr_res.hostMem()); if (zero_index) { - CudaSafeCall( - cudaHostAlloc(reinterpret_cast(&this->rot_labels_data), cells_size, cudaHostAllocMapped)); - CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast(&this->rot_labels_data_d), - reinterpret_cast(this->rot_labels_data), 0)); + this->rot_labels_data = DynMem(cells_size); this->rot_labels = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size), - CV_32FC1, this->rot_labels_data); + CV_32FC1, this->rot_labels_data.hostMem()); } - CudaSafeCall(cudaHostAlloc(reinterpret_cast(&this->data_features), cells_size*num_of_feats, cudaHostAllocMapped)); - CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast(&this->data_features_d), - reinterpret_cast(this->data_features), 0)); + this->data_features = DynMem(cells_size * num_of_feats); this->fw_all = cv::Mat((windows_size.height / int(cell_size)) * int(num_of_feats), - windows_size.width / int(cell_size), CV_32F, this->data_features); + windows_size.width / int(cell_size), CV_32F, this->data_features.hostMem()); #else - this->xf_sqr_norm = reinterpret_cast(malloc(num_of_scales * sizeof(float))); - this->yf_sqr_norm = reinterpret_cast(malloc(sizeof(float))); + this->xf_sqr_norm = DynMem(num_of_scales * sizeof(float)); + this->yf_sqr_norm = DynMem(sizeof (float)); this->patch_feats.reserve(num_of_feats); @@ -143,24 +122,12 @@ struct ThreadCtx { ~ThreadCtx() { -#ifdef CUFFT - CudaSafeCall(cudaFreeHost(this->xf_sqr_norm)); - CudaSafeCall(cudaFreeHost(this->yf_sqr_norm)); - CudaSafeCall(cudaFreeHost(this->data_i_1ch)); - CudaSafeCall(cudaFreeHost(this->data_i_features)); - CudaSafeCall(cudaFreeHost(this->gauss_corr_res)); - if (zero_index) CudaSafeCall(cudaFreeHost(this->rot_labels_data)); - CudaSafeCall(cudaFreeHost(this->data_features)); -#if defined(ASYNC) || defined(OPENMP) +#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP)) CudaSafeCall(cudaStreamDestroy(this->stream)); -#endif -#else - free(this->xf_sqr_norm); - free(this->yf_sqr_norm); #endif } - float *xf_sqr_norm = nullptr, *yf_sqr_norm = nullptr; + DynMem xf_sqr_norm, yf_sqr_norm; std::vector patch_feats; cv::Mat in_all, fw_all, ifft2_res, response; @@ -168,11 +135,7 @@ struct ThreadCtx { // CuFFT variables cv::Mat rot_labels; - float *xf_sqr_norm_d = nullptr, *yf_sqr_norm_d = nullptr, *gauss_corr_res = nullptr, *gauss_corr_res_d = nullptr, - *rot_labels_data = nullptr, *rot_labels_data_d = nullptr, *data_features = nullptr, - *data_features_d = nullptr; - float *data_f = nullptr, *data_i_features = nullptr, *data_i_features_d = nullptr, *data_i_1ch = nullptr, - *data_i_1ch_d = nullptr; + DynMem gauss_corr_res, rot_labels_data, data_features, data_f, data_i_features, data_i_1ch; cudaStream_t stream = nullptr; ComplexMat model_alphaf, model_xf;