return;
}
+
+#define CublasErrorCheck(call) __cublasErrorCheck(call, __FILE__, __LINE__ )
+
+static inline void __cublasErrorCheck(cublasStatus_t call, const char *file, const int line )
+{
+ if (call != CUBLAS_STATUS_SUCCESS) {
+ fprintf(stderr, "cuBLAS error %d at %s:%d\n", call, /* _cudaGetErrorEnum(call),*/ file, line);
+ exit(-1);
+ }
+
+ return;
+}
+
#endif
#endif
#endif
public:
typedef T type;
- DynMem_() {}
DynMem_(size_t size)
{
#ifdef CUFFT
- CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->ptr), size, cudaHostAllocMapped));
- CudaSafeCall(
- cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->ptr_d), reinterpret_cast<void *>(this->ptr), 0));
+ CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&ptr_h), size, cudaHostAllocMapped));
+ CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&ptr_d), reinterpret_cast<void *>(ptr_h), 0));
#else
- this->ptr_h = new float[size];
+ ptr_h = new float[size];
#endif
}
DynMem_(DynMem_&& other) {
- this->ptr_h = other.ptr_h;
+ ptr_h = other.ptr_h;
other.ptr_h = nullptr;
#ifdef CUFFT
- this->ptr_d = other.ptr_d;
+ ptr_d = other.ptr_d;
other.ptr_d = nullptr;
#endif
}
~DynMem_()
{
#ifdef CUFFT
- CudaSafeCall(cudaFreeHost(this->ptr));
+ CudaSafeCall(cudaFreeHost(ptr_h));
#else
- delete[] this->ptr_h;
+ delete[] ptr_h;
#endif
}
T *hostMem() { return ptr_h; }
#endif
void operator=(DynMem_ &&rhs)
{
- this->ptr_h = rhs.ptr_h;
+ ptr_h = rhs.ptr_h;
rhs.ptr_h = nullptr;
#ifdef CUFFT
- this->ptr_d = rhs.ptr_d;
+ ptr_d = rhs.ptr_d;
rhs.ptr_d = nullptr;
#endif
}
typedef DynMem_<float> DynMem;
-class MatDynMem : protected DynMem, public cv::Mat {
+class MatDynMem : public DynMem, public cv::Mat {
public:
MatDynMem(cv::Size size, int type)
: DynMem(size.area() * sizeof(DynMem::type) * CV_MAT_CN(type)), cv::Mat(size, type, hostMem())
{
assert((type & CV_MAT_DEPTH_MASK) == CV_32F);
}
- MatDynMem(int height, int width, int type) { MatDynMem(cv::Size(width, height), type); }
+ MatDynMem(int height, int width, int type)
+ : DynMem(width * height * sizeof(DynMem::type) * CV_MAT_CN(type)), cv::Mat(height, width, type, hostMem())
+ {
+ assert((type & CV_MAT_DEPTH_MASK) == CV_32F);
+ }
MatDynMem(int ndims, const int *sizes, int type)
: DynMem(volume(ndims, sizes) * sizeof(DynMem::type) * CV_MAT_CN(type)), cv::Mat(ndims, sizes, type, hostMem())
{
public:
virtual void init(unsigned width, unsigned height,unsigned num_of_feats, unsigned num_of_scales) = 0;
virtual void set_window(const MatDynMem &window) = 0;
- virtual void forward(const cv::Mat & real_input, ComplexMat & complex_result) = 0;
+ virtual void forward(MatDynMem & real_input, ComplexMat & complex_result) = 0;
virtual void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) = 0;
virtual void inverse(ComplexMat & complex_input, MatDynMem & real_result) = 0;
virtual ~Fft() = 0;
#include "fft_cufft.h"
+#include <cublas_v2.h>
+
+cuFFT::cuFFT()
+{
+ CublasErrorCheck(cublasCreate(&cublas));
+}
void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
{
m_window = window;
}
-void cuFFT::forward(const cv::Mat &real_input, ComplexMat &complex_result, float *real_input_arr)
+void cuFFT::forward(MatDynMem & real_input, ComplexMat & complex_result)
{
if (BIG_BATCH_MODE && real_input.rows == int(m_height * m_num_of_scales)) {
- CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal *>(real_input_arr),
+ CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal *>(real_input.deviceMem()),
complex_result.get_p_data()));
} else {
NORMAL_OMP_CRITICAL
{
CufftErrorCheck(
- cufftExecR2C(plan_f, reinterpret_cast<cufftReal *>(real_input_arr), complex_result.get_p_data()));
+ cufftExecR2C(plan_f, reinterpret_cast<cufftReal *>(real_input.deviceMem()), complex_result.get_p_data()));
cudaStreamSynchronize(cudaStreamPerThread);
}
}
return;
}
-void cuFFT::forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp)
+void cuFFT::forward_window(MatDynMem &feat, ComplexMat & complex_result, MatDynMem &temp)
{
- int n_channels = int(patch_feats.size());
+ uint n_channels = feat.size[0];
+ cufftReal *temp_data = temp.deviceMem();
- if (n_channels > int(m_num_of_feats)) {
- for (uint i = 0; i < uint(n_channels); ++i) {
- cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height)));
- in_roi = patch_feats[i].mul(m_window);
- }
- CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal *>(real_input_arr),
- complex_result.get_p_data()));
- } else {
- for (uint i = 0; i < uint(n_channels); ++i) {
- cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height)));
- in_roi = patch_feats[i].mul(m_window);
- }
- NORMAL_OMP_CRITICAL
- {
- CufftErrorCheck(
- cufftExecR2C(plan_fw, reinterpret_cast<cufftReal *>(real_input_arr), complex_result.get_p_data()));
- cudaStreamSynchronize(cudaStreamPerThread);
- }
+ assert(feat.dims == 3);
+ assert(n_channels == m_num_of_feats || n_channels == m_num_of_feats * m_num_of_scales);
+
+ for (uint i = 0; i < n_channels; ++i) {
+ cv::Mat feat_plane(feat.dims - 1, feat.size + 1, feat.cv::Mat::type(), feat.ptr<void>(i));
+ cv::Mat temp_plane(temp.dims - 1, temp.size + 1, temp.cv::Mat::type(), temp.ptr(i));
+ temp_plane = feat_plane.mul(m_window);
}
- return;
+ CufftErrorCheck(cufftExecR2C((n_channels == m_num_of_feats) ? plan_fw : plan_fw_all_scales,
+ temp_data, complex_result.get_p_data()));
}
-void cuFFT::inverse(ComplexMat & complex_input, MatDynMem & real_result)
+void cuFFT::inverse(ComplexMat &complex_input, MatDynMem &real_result)
{
- int n_channels = complex_input.n_channels;
+ uint n_channels = complex_input.n_channels;
cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_p_data());
+ cufftReal *out = real_result.deviceMem();
+ float alpha = 1.0 / (m_width * m_height);
+ cufftHandle plan;
if (n_channels == 1) {
- NORMAL_OMP_CRITICAL
- {
- CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal *>(real_result_arr)));
- cudaStreamSynchronize(cudaStreamPerThread);
- }
- real_result = real_result / (m_width * m_height);
+ CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, out));
+ CublasErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));
return;
- } else if (n_channels == int(m_num_of_scales)) {
- CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal *>(real_result_arr)));
- cudaStreamSynchronize(cudaStreamPerThread);
-
- real_result = real_result / (m_width * m_height);
+ } else if (n_channels == m_num_of_scales) {
+ CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, out));
+ CublasErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));
return;
- } else if (n_channels == int(m_num_of_feats) * int(m_num_of_scales)) {
- CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal *>(real_result_arr)));
+ } else if (n_channels == m_num_of_feats * m_num_of_scales) {
+ CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, out));
cudaStreamSynchronize(cudaStreamPerThread);
return;
}
- NORMAL_OMP_CRITICAL
- {
- CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal *>(real_result_arr)));
-#if defined(OPENMP) && !defined(BIG_BATCH)
- CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
-#endif
- }
+ CufftErrorCheck(cufftExecC2R(plan_i_features, in, out));
return;
}
cuFFT::~cuFFT()
{
+ CublasErrorCheck(cublasDestroy(cublas));
+
CufftErrorCheck(cufftDestroy(plan_f));
CufftErrorCheck(cufftDestroy(plan_fw));
CufftErrorCheck(cufftDestroy(plan_i_1ch));
#include <cufft.h>
#include <cuda_runtime.h>
+#include <cublas_v2.h>
#include "fft.h"
#include "cuda/cuda_error_check.cuh"
class cuFFT : public Fft
{
public:
+ cuFFT();
void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override;
void set_window(const MatDynMem &window) override;
- void forward(const cv::Mat & real_input, ComplexMat & complex_result) override;
+ void forward(MatDynMem & real_input, ComplexMat & complex_result) override;
void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) override;
void inverse(ComplexMat & complex_input, MatDynMem & real_result) override;
~cuFFT() override;
unsigned m_width, m_height, m_num_of_feats, m_num_of_scales;
cufftHandle plan_f, plan_f_all_scales, plan_fw, plan_fw_all_scales, plan_i_features,
plan_i_features_all_scales, plan_i_1ch, plan_i_1ch_all_scales;
+ cublasHandle_t cublas;
};
#endif // FFT_CUDA_H
m_window = window;
}
-void Fftw::forward(const cv::Mat & real_input, ComplexMat & complex_result)
+void Fftw::forward(MatDynMem & real_input, ComplexMat & complex_result)
{
if (BIG_BATCH_MODE && real_input.rows == int(m_height * m_num_of_scales)) {
fftwf_execute_dft_r2c(plan_f_all_scales, reinterpret_cast<float *>(real_input.data),
Fftw();
void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override;
void set_window(const MatDynMem &window) override;
- void forward(const cv::Mat & real_input, ComplexMat & complex_result) override;
+ void forward(MatDynMem & real_input, ComplexMat & complex_result) override;
void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) override;
void inverse(ComplexMat & complex_input, MatDynMem & real_result) override;
~Fftw() override;
public:
void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override;
void set_window(const MatDynMem &window) override;
- void forward(const cv::Mat & real_input, ComplexMat & complex_result) override;
+ void forward(MatDynMem & real_input, ComplexMat & complex_result) override;
void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) override;
void inverse(ComplexMat & complex_input, MatDynMem & real_result) override;
~FftOpencv() override;
DEBUG_PRINTM(p_yf);
// obtain a sub-window for training initial model
- std::vector<cv::Mat> patch_feats = get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy,
- p_windows_size.width, p_windows_size.height);
+ int sizes[3] = {p_num_of_feats, p_windows_size.height, p_windows_size.width};
+ MatDynMem patch_feats(3, sizes, CV_32FC1);
+ MatDynMem temp(3, tmp, CV_32FC1);
+ get_features(features, input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size.width, p_windows_size.height);
fft.forward_window(patch_feats, p_model_xf);
DEBUG_PRINTM(p_model_xf);
}
// hann window actually (Power-of-cosine windows)
-cv::Mat KCF_Tracker::cosine_window_function(int dim1, int dim2)
+MatDynMem KCF_Tracker::cosine_window_function(int dim1, int dim2)
{
cv::Mat m1(1, dim1, CV_32FC1), m2(dim2, 1, CV_32FC1);
double N_inv = 1. / (static_cast<double>(dim1) - 1.);
N_inv = 1. / (static_cast<double>(dim2) - 1.);
for (int i = 0; i < dim2; ++i)
m2.at<float>(i) = float(0.5 * (1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv)));
- cv::Mat ret = m2 * m1;
+ MatDynMem ret = m2 * m1;
return ret;
}
cv::Mat gaussian_shaped_labels(double sigma, int dim1, int dim2);
std::unique_ptr<GaussianCorrelation> gaussian_correlation;
cv::Mat circshift(const cv::Mat & patch, int x_rot, int y_rot);
- cv::Mat cosine_window_function(int dim1, int dim2);
+ MatDynMem cosine_window_function(int dim1, int dim2);
void get_features(MatDynMem &feat_3d, cv::Mat & input_rgb, cv::Mat & input_gray, int cx, int cy, int size_x, int size_y, double scale = 1.);
cv::Point2f sub_pixel_peak(cv::Point & max_loc, cv::Mat & response);
double sub_grid_scale(uint index);