cmake_minimum_required(VERSION 2.8)
-set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h)
+set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h dynmem.hpp)
find_package(PkgConfig)
}
// assuming that mat has 2 channels (real, img)
- ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1) { p_data = convert(mat); }
+ ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1)
+ {
+ p_data = convert(mat);
+ }
void create(uint _rows, uint _cols, uint _n_channels)
{
--- /dev/null
+#ifndef DYNMEM_HPP
+#define DYNMEM_HPP
+
+#include <cstdlib>
+
+#if defined(CUFFT) || defined(CUFFTW)
+#include "cuda_runtime.h"
+#ifdef CUFFT
+#include "cuda/cuda_error_check.cuh"
+#endif
+#endif
+
+template <typename T> class DynMem_ {
+ T *ptr = nullptr;
+ T *ptr_d = nullptr;
+
+ public:
+ DynMem_()
+ {}
+ DynMem_(size_t size)
+ {
+#ifdef CUFFT
+ CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->ptr), size, cudaHostAllocMapped));
+ CudaSafeCall(
+ cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->ptr_d), reinterpret_cast<void *>(this->ptr), 0));
+#else
+ this->ptr = new float[size];
+#endif
+ }
+ ~DynMem_()
+ {
+#ifdef CUFFT
+ CudaSafeCall(cudaFreeHost(this->ptr));
+#else
+ delete this->ptr;
+#endif
+ }
+ T *hostMem() { return ptr; }
+ T *deviceMem() { return ptr_d; }
+
+ void operator=(DynMem_ &&rhs)
+ {
+ this->ptr = rhs.ptr;
+ this->ptr_d = rhs.ptr_d;
+
+ rhs.ptr = nullptr;
+ rhs.ptr_d = nullptr;
+ }
+};
+typedef DynMem_<float> DynMem;
+#endif // DYNMEM_HPP
-
#ifndef FFT_CUDA_H
#define FFT_CUDA_H
+
+#include <cufft.h>
+#include <cuda_runtime.h>
+
#include "fft.h"
#include "cuda/cuda_error_check.cuh"
#include "pragmas.h"
#define CUDA cv::cuda
#endif
-#include <cufft.h>
-#include <cuda_runtime.h>
-
struct ThreadCtx;
class cuFFT : public Fft
// window weights, i.e. labels
fft.forward(
gaussian_shaped_labels(p_output_sigma, p_windows_size.width / p_cell_size, p_windows_size.height / p_cell_size), p_yf,
- m_use_cuda ? p_scale_vars.front()->rot_labels_data_d : nullptr, p_scale_vars.front()->stream);
+ m_use_cuda ? p_scale_vars.front()->rot_labels_data.deviceMem() : nullptr, p_scale_vars.front()->stream);
DEBUG_PRINTM(p_yf);
// obtain a sub-window for training initial model
get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size.width, p_windows_size.height,
*p_scale_vars.front());
fft.forward_window(p_scale_vars.front()->patch_feats, p_model_xf, p_scale_vars.front()->fw_all,
- m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
+ m_use_cuda ? p_scale_vars.front()->data_features.deviceMem() : nullptr, p_scale_vars.front()->stream);
DEBUG_PRINTM(p_model_xf);
#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
p_scale_vars.front()->model_xf = p_model_xf;
get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size.width, p_windows_size.height,
*p_scale_vars.front(), p_current_scale);
fft.forward_window(p_scale_vars.front()->patch_feats, p_scale_vars.front()->xf, p_scale_vars.front()->fw_all,
- m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
+ m_use_cuda ? p_scale_vars.front()->data_features.deviceMem() : nullptr, p_scale_vars.front()->stream);
// subsequent frames, interpolate model
p_model_xf = p_model_xf * float((1. - p_interp_factor)) + p_scale_vars.front()->xf * float(p_interp_factor);
this->p_windows_size.height, vars, this->p_current_scale *scale);
}
- fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features_d : nullptr,
+ fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features.deviceMem() : nullptr,
vars.stream);
DEBUG_PRINTM(vars.zf);
if (m_use_linearkernel) {
vars.kzf = m_use_big_batch ? (vars.zf.mul2(this->p_model_alphaf)).sum_over_channels()
: (p_model_alphaf * vars.zf).sum_over_channels();
- fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
+ fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream);
} else {
#if !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
gaussian_correlation(vars, vars.zf, vars.model_xf, this->p_kernel_sigma);
DEBUG_PRINTM(vars.kzf);
vars.kzf = m_use_big_batch ? vars.kzf.mul(this->p_model_alphaf) : this->p_model_alphaf * vars.kzf;
#endif
- fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
+ fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream);
}
DEBUG_PRINTM(vars.response);
double sigma, bool auto_correlation)
{
#ifdef CUFFT
- xf.sqr_norm(vars.xf_sqr_norm_d);
- if (!auto_correlation) yf.sqr_norm(vars.yf_sqr_norm_d);
+ xf.sqr_norm(vars.xf_sqr_norm.deviceMem());
+ if (!auto_correlation) yf.sqr_norm(vars.yf_sqr_norm.deviceMem());
#else
- xf.sqr_norm(vars.xf_sqr_norm);
+ xf.sqr_norm(vars.xf_sqr_norm.hostMem());
if (auto_correlation) {
- vars.yf_sqr_norm[0] = vars.xf_sqr_norm[0];
+ vars.yf_sqr_norm.hostMem()[0] = vars.xf_sqr_norm.hostMem()[0];
} else {
- yf.sqr_norm(vars.yf_sqr_norm);
+ yf.sqr_norm(vars.yf_sqr_norm.hostMem());
}
#endif
vars.xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
DEBUG_PRINTM(vars.xyf);
- fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features_d : nullptr, vars.stream);
+ fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features.deviceMem() : nullptr, vars.stream);
#ifdef CUFFT
if (auto_correlation)
- cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.xf_sqr_norm_d,
+ cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.xf_sqr_norm.deviceMem(),
sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
else
- cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.yf_sqr_norm_d,
+ cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.yf_sqr_norm.deviceMem(),
sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
#else
// ifft2 and sum over 3rd dimension, we dont care about individual channels
cv::Mat in_roi(vars.in_all, cv::Rect(0, int(i) * scales[0].rows, scales[0].cols, scales[0].rows));
cv::exp(
-1. / (sigma * sigma) *
- cv::max((double(vars.xf_sqr_norm[i] + vars.yf_sqr_norm[0]) - 2 * scales[i]) * double(numel_xf_inv), 0),
+ cv::max((double(vars.xf_sqr_norm.hostMem()[i] + vars.yf_sqr_norm.hostMem()[0]) - 2 * scales[i]) * double(numel_xf_inv), 0),
in_roi);
DEBUG_PRINTM(in_roi);
}
#endif
DEBUG_PRINTM(vars.in_all);
- fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res_d : nullptr,
+ fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res.deviceMem() : nullptr,
vars.stream);
return;
}
#ifndef SCALE_VARS_HPP
#define SCALE_VARS_HPP
+#include "dynmem.hpp"
+
#ifdef CUFFT
#include "complexmat.cuh"
#else
#ifndef CUFFTW
// For compatibility reasons between CuFFT and FFTW, OpenCVfft versions.
typedef int *cudaStream_t;
-#else
-#include "cuda_runtime.h"
#endif
#endif
#endif
this->patch_feats.reserve(uint(num_of_feats));
-// Size of cufftReal == float
+ // Size of cufftReal == float
uint cells_size =
((uint(windows_size.width) / cell_size) * (uint(windows_size.height) / cell_size)) * sizeof(float);
- CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->data_i_1ch), cells_size * num_of_scales,
- cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->data_i_1ch_d),
- reinterpret_cast<void *>(this->data_i_1ch), 0));
-
- CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->data_i_features), cells_size * num_of_feats,
- cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->data_i_features_d),
- reinterpret_cast<void *>(this->data_i_features), 0));
+ this->data_i_1ch = DynMem(cells_size * num_of_scales);
+ this->data_i_features = DynMem(cells_size * num_of_feats);
this->ifft2_res = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
- CV_32FC(int(num_of_feats)), this->data_i_features);
+ CV_32FC(int(num_of_feats)), this->data_i_features.hostMem());
this->response = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
- CV_32FC(int(num_of_scales)), this->data_i_1ch);
+ CV_32FC(int(num_of_scales)), this->data_i_1ch.hostMem());
this->zf.create(uint(windows_size.height) / cell_size, (uint(windows_size.width) / cell_size) / 2 + 1,
num_of_feats, num_of_scales, this->stream);
this->kf.create(uint(windows_size.height) / cell_size, (uint(windows_size.width) / cell_size) / 2 + 1,
num_of_scales, this->stream);
- CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->xf_sqr_norm), num_of_scales * sizeof(float),
- cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->xf_sqr_norm_d),
- reinterpret_cast<void *>(this->xf_sqr_norm), 0));
-
- CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->yf_sqr_norm), sizeof(float), cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->yf_sqr_norm_d),
- reinterpret_cast<void *>(this->yf_sqr_norm), 0));
+ this->xf_sqr_norm = DynMem(num_of_scales * sizeof(float));
+ this->yf_sqr_norm = DynMem(sizeof(float));
- CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->gauss_corr_res), cells_size * num_of_scales,
- cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->gauss_corr_res_d),
- reinterpret_cast<void *>(this->gauss_corr_res), 0));
+ this->gauss_corr_res = DynMem(cells_size * num_of_scales);
this->in_all = cv::Mat(windows_size.height / int(cell_size) * int(num_of_scales),
- windows_size.width / int(cell_size), CV_32F, this->gauss_corr_res);
+ windows_size.width / int(cell_size), CV_32F, this->gauss_corr_res.hostMem());
if (zero_index) {
- CudaSafeCall(
- cudaHostAlloc(reinterpret_cast<void **>(&this->rot_labels_data), cells_size, cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->rot_labels_data_d),
- reinterpret_cast<void *>(this->rot_labels_data), 0));
+ this->rot_labels_data = DynMem(cells_size);
this->rot_labels = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
- CV_32FC1, this->rot_labels_data);
+ CV_32FC1, this->rot_labels_data.hostMem());
}
- CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->data_features), cells_size*num_of_feats, cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->data_features_d),
- reinterpret_cast<void *>(this->data_features), 0));
+ this->data_features = DynMem(cells_size * num_of_feats);
this->fw_all = cv::Mat((windows_size.height / int(cell_size)) * int(num_of_feats),
- windows_size.width / int(cell_size), CV_32F, this->data_features);
+ windows_size.width / int(cell_size), CV_32F, this->data_features.hostMem());
#else
- this->xf_sqr_norm = reinterpret_cast<float *>(malloc(num_of_scales * sizeof(float)));
- this->yf_sqr_norm = reinterpret_cast<float *>(malloc(sizeof(float)));
+ this->xf_sqr_norm = DynMem(num_of_scales * sizeof(float));
+ this->yf_sqr_norm = DynMem(sizeof (float));
this->patch_feats.reserve(num_of_feats);
~ThreadCtx()
{
-#ifdef CUFFT
- CudaSafeCall(cudaFreeHost(this->xf_sqr_norm));
- CudaSafeCall(cudaFreeHost(this->yf_sqr_norm));
- CudaSafeCall(cudaFreeHost(this->data_i_1ch));
- CudaSafeCall(cudaFreeHost(this->data_i_features));
- CudaSafeCall(cudaFreeHost(this->gauss_corr_res));
- if (zero_index) CudaSafeCall(cudaFreeHost(this->rot_labels_data));
- CudaSafeCall(cudaFreeHost(this->data_features));
-#if defined(ASYNC) || defined(OPENMP)
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
CudaSafeCall(cudaStreamDestroy(this->stream));
-#endif
-#else
- free(this->xf_sqr_norm);
- free(this->yf_sqr_norm);
#endif
}
- float *xf_sqr_norm = nullptr, *yf_sqr_norm = nullptr;
+ DynMem xf_sqr_norm, yf_sqr_norm;
std::vector<cv::Mat> patch_feats;
cv::Mat in_all, fw_all, ifft2_res, response;
// CuFFT variables
cv::Mat rot_labels;
- float *xf_sqr_norm_d = nullptr, *yf_sqr_norm_d = nullptr, *gauss_corr_res = nullptr, *gauss_corr_res_d = nullptr,
- *rot_labels_data = nullptr, *rot_labels_data_d = nullptr, *data_features = nullptr,
- *data_features_d = nullptr;
- float *data_f = nullptr, *data_i_features = nullptr, *data_i_features_d = nullptr, *data_i_1ch = nullptr,
- *data_i_1ch_d = nullptr;
+ DynMem gauss_corr_res, rot_labels_data, data_features, data_f, data_i_features, data_i_1ch;
cudaStream_t stream = nullptr;
ComplexMat model_alphaf, model_xf;