From a46a489b26edaddf70c6750d54462d58dd95749b Mon Sep 17 00:00:00 2001 From: Michal Sojka Date: Tue, 18 Sep 2018 14:54:44 +0200 Subject: [PATCH] Start cleaning up cufft implementation - This simplifies the API by using MatDynMem, which unifies cv::Mat and GPU zero-copy memory. --- src/cuda/cuda_error_check.cuh | 13 ++++++ src/dynmem.hpp | 28 ++++++------ src/fft.h | 2 +- src/fft_cufft.cpp | 82 +++++++++++++++-------------------- src/fft_cufft.h | 5 ++- src/fft_fftw.cpp | 2 +- src/fft_fftw.h | 2 +- src/fft_opencv.h | 2 +- src/kcf.cpp | 10 +++-- src/kcf.h | 2 +- 10 files changed, 79 insertions(+), 69 deletions(-) diff --git a/src/cuda/cuda_error_check.cuh b/src/cuda/cuda_error_check.cuh index 13c2f7d..3c13811 100644 --- a/src/cuda/cuda_error_check.cuh +++ b/src/cuda/cuda_error_check.cuh @@ -115,6 +115,19 @@ static inline void __cufftErrorCheck(cufftResult_t call, const char *file, const return; } + +#define CublasErrorCheck(call) __cublasErrorCheck(call, __FILE__, __LINE__ ) + +static inline void __cublasErrorCheck(cublasStatus_t call, const char *file, const int line ) +{ + if (call != CUBLAS_STATUS_SUCCESS) { + fprintf(stderr, "cuBLAS error %d at %s:%d\n", call, /* _cudaGetErrorEnum(call),*/ file, line); + exit(-1); + } + + return; +} + #endif #endif diff --git a/src/dynmem.hpp b/src/dynmem.hpp index 4c5199e..ff1b03e 100644 --- a/src/dynmem.hpp +++ b/src/dynmem.hpp @@ -20,31 +20,29 @@ template class DynMem_ { #endif public: typedef T type; - DynMem_() {} DynMem_(size_t size) { #ifdef CUFFT - CudaSafeCall(cudaHostAlloc(reinterpret_cast(&this->ptr), size, cudaHostAllocMapped)); - CudaSafeCall( - cudaHostGetDevicePointer(reinterpret_cast(&this->ptr_d), reinterpret_cast(this->ptr), 0)); + CudaSafeCall(cudaHostAlloc(reinterpret_cast(&ptr_h), size, cudaHostAllocMapped)); + CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast(&ptr_d), reinterpret_cast(ptr_h), 0)); #else - this->ptr_h = new float[size]; + ptr_h = new float[size]; #endif } DynMem_(DynMem_&& other) { - this->ptr_h = other.ptr_h; + ptr_h = other.ptr_h; other.ptr_h = nullptr; #ifdef CUFFT - this->ptr_d = other.ptr_d; + ptr_d = other.ptr_d; other.ptr_d = nullptr; #endif } ~DynMem_() { #ifdef CUFFT - CudaSafeCall(cudaFreeHost(this->ptr)); + CudaSafeCall(cudaFreeHost(ptr_h)); #else - delete[] this->ptr_h; + delete[] ptr_h; #endif } T *hostMem() { return ptr_h; } @@ -53,10 +51,10 @@ template class DynMem_ { #endif void operator=(DynMem_ &&rhs) { - this->ptr_h = rhs.ptr_h; + ptr_h = rhs.ptr_h; rhs.ptr_h = nullptr; #ifdef CUFFT - this->ptr_d = rhs.ptr_d; + ptr_d = rhs.ptr_d; rhs.ptr_d = nullptr; #endif } @@ -65,14 +63,18 @@ template class DynMem_ { typedef DynMem_ DynMem; -class MatDynMem : protected DynMem, public cv::Mat { +class MatDynMem : public DynMem, public cv::Mat { public: MatDynMem(cv::Size size, int type) : DynMem(size.area() * sizeof(DynMem::type) * CV_MAT_CN(type)), cv::Mat(size, type, hostMem()) { assert((type & CV_MAT_DEPTH_MASK) == CV_32F); } - MatDynMem(int height, int width, int type) { MatDynMem(cv::Size(width, height), type); } + MatDynMem(int height, int width, int type) + : DynMem(width * height * sizeof(DynMem::type) * CV_MAT_CN(type)), cv::Mat(height, width, type, hostMem()) + { + assert((type & CV_MAT_DEPTH_MASK) == CV_32F); + } MatDynMem(int ndims, const int *sizes, int type) : DynMem(volume(ndims, sizes) * sizeof(DynMem::type) * CV_MAT_CN(type)), cv::Mat(ndims, sizes, type, hostMem()) { diff --git a/src/fft.h b/src/fft.h index b165ef4..93dcea6 100644 --- a/src/fft.h +++ b/src/fft.h @@ -23,7 +23,7 @@ class Fft public: virtual void init(unsigned width, unsigned height,unsigned num_of_feats, unsigned num_of_scales) = 0; virtual void set_window(const MatDynMem &window) = 0; - virtual void forward(const cv::Mat & real_input, ComplexMat & complex_result) = 0; + virtual void forward(MatDynMem & real_input, ComplexMat & complex_result) = 0; virtual void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) = 0; virtual void inverse(ComplexMat & complex_input, MatDynMem & real_result) = 0; virtual ~Fft() = 0; diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp index cd53414..63f2558 100644 --- a/src/fft_cufft.cpp +++ b/src/fft_cufft.cpp @@ -1,4 +1,10 @@ #include "fft_cufft.h" +#include + +cuFFT::cuFFT() +{ + CublasErrorCheck(cublasCreate(&cublas)); +} void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) { @@ -119,84 +125,68 @@ void cuFFT::set_window(const MatDynMem &window) m_window = window; } -void cuFFT::forward(const cv::Mat &real_input, ComplexMat &complex_result, float *real_input_arr) +void cuFFT::forward(MatDynMem & real_input, ComplexMat & complex_result) { if (BIG_BATCH_MODE && real_input.rows == int(m_height * m_num_of_scales)) { - CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast(real_input_arr), + CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast(real_input.deviceMem()), complex_result.get_p_data())); } else { NORMAL_OMP_CRITICAL { CufftErrorCheck( - cufftExecR2C(plan_f, reinterpret_cast(real_input_arr), complex_result.get_p_data())); + cufftExecR2C(plan_f, reinterpret_cast(real_input.deviceMem()), complex_result.get_p_data())); cudaStreamSynchronize(cudaStreamPerThread); } } return; } -void cuFFT::forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) +void cuFFT::forward_window(MatDynMem &feat, ComplexMat & complex_result, MatDynMem &temp) { - int n_channels = int(patch_feats.size()); + uint n_channels = feat.size[0]; + cufftReal *temp_data = temp.deviceMem(); - if (n_channels > int(m_num_of_feats)) { - for (uint i = 0; i < uint(n_channels); ++i) { - cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height))); - in_roi = patch_feats[i].mul(m_window); - } - CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast(real_input_arr), - complex_result.get_p_data())); - } else { - for (uint i = 0; i < uint(n_channels); ++i) { - cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height))); - in_roi = patch_feats[i].mul(m_window); - } - NORMAL_OMP_CRITICAL - { - CufftErrorCheck( - cufftExecR2C(plan_fw, reinterpret_cast(real_input_arr), complex_result.get_p_data())); - cudaStreamSynchronize(cudaStreamPerThread); - } + assert(feat.dims == 3); + assert(n_channels == m_num_of_feats || n_channels == m_num_of_feats * m_num_of_scales); + + for (uint i = 0; i < n_channels; ++i) { + cv::Mat feat_plane(feat.dims - 1, feat.size + 1, feat.cv::Mat::type(), feat.ptr(i)); + cv::Mat temp_plane(temp.dims - 1, temp.size + 1, temp.cv::Mat::type(), temp.ptr(i)); + temp_plane = feat_plane.mul(m_window); } - return; + CufftErrorCheck(cufftExecR2C((n_channels == m_num_of_feats) ? plan_fw : plan_fw_all_scales, + temp_data, complex_result.get_p_data())); } -void cuFFT::inverse(ComplexMat & complex_input, MatDynMem & real_result) +void cuFFT::inverse(ComplexMat &complex_input, MatDynMem &real_result) { - int n_channels = complex_input.n_channels; + uint n_channels = complex_input.n_channels; cufftComplex *in = reinterpret_cast(complex_input.get_p_data()); + cufftReal *out = real_result.deviceMem(); + float alpha = 1.0 / (m_width * m_height); + cufftHandle plan; if (n_channels == 1) { - NORMAL_OMP_CRITICAL - { - CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast(real_result_arr))); - cudaStreamSynchronize(cudaStreamPerThread); - } - real_result = real_result / (m_width * m_height); + CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, out)); + CublasErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1)); return; - } else if (n_channels == int(m_num_of_scales)) { - CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast(real_result_arr))); - cudaStreamSynchronize(cudaStreamPerThread); - - real_result = real_result / (m_width * m_height); + } else if (n_channels == m_num_of_scales) { + CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, out)); + CublasErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1)); return; - } else if (n_channels == int(m_num_of_feats) * int(m_num_of_scales)) { - CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast(real_result_arr))); + } else if (n_channels == m_num_of_feats * m_num_of_scales) { + CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, out)); cudaStreamSynchronize(cudaStreamPerThread); return; } - NORMAL_OMP_CRITICAL - { - CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast(real_result_arr))); -#if defined(OPENMP) && !defined(BIG_BATCH) - CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread)); -#endif - } + CufftErrorCheck(cufftExecC2R(plan_i_features, in, out)); return; } cuFFT::~cuFFT() { + CublasErrorCheck(cublasDestroy(cublas)); + CufftErrorCheck(cufftDestroy(plan_f)); CufftErrorCheck(cufftDestroy(plan_fw)); CufftErrorCheck(cufftDestroy(plan_i_1ch)); diff --git a/src/fft_cufft.h b/src/fft_cufft.h index 9b2467f..4dd0f10 100644 --- a/src/fft_cufft.h +++ b/src/fft_cufft.h @@ -4,6 +4,7 @@ #include #include +#include #include "fft.h" #include "cuda/cuda_error_check.cuh" @@ -14,9 +15,10 @@ struct ThreadCtx; class cuFFT : public Fft { public: + cuFFT(); void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override; void set_window(const MatDynMem &window) override; - void forward(const cv::Mat & real_input, ComplexMat & complex_result) override; + void forward(MatDynMem & real_input, ComplexMat & complex_result) override; void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) override; void inverse(ComplexMat & complex_input, MatDynMem & real_result) override; ~cuFFT() override; @@ -25,6 +27,7 @@ private: unsigned m_width, m_height, m_num_of_feats, m_num_of_scales; cufftHandle plan_f, plan_f_all_scales, plan_fw, plan_fw_all_scales, plan_i_features, plan_i_features_all_scales, plan_i_1ch, plan_i_1ch_all_scales; + cublasHandle_t cublas; }; #endif // FFT_CUDA_H diff --git a/src/fft_fftw.cpp b/src/fft_fftw.cpp index 973c894..3754036 100644 --- a/src/fft_fftw.cpp +++ b/src/fft_fftw.cpp @@ -172,7 +172,7 @@ void Fftw::set_window(const MatDynMem &window) m_window = window; } -void Fftw::forward(const cv::Mat & real_input, ComplexMat & complex_result) +void Fftw::forward(MatDynMem & real_input, ComplexMat & complex_result) { if (BIG_BATCH_MODE && real_input.rows == int(m_height * m_num_of_scales)) { fftwf_execute_dft_r2c(plan_f_all_scales, reinterpret_cast(real_input.data), diff --git a/src/fft_fftw.h b/src/fft_fftw.h index 97641f8..cb4a901 100644 --- a/src/fft_fftw.h +++ b/src/fft_fftw.h @@ -20,7 +20,7 @@ public: Fftw(); void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override; void set_window(const MatDynMem &window) override; - void forward(const cv::Mat & real_input, ComplexMat & complex_result) override; + void forward(MatDynMem & real_input, ComplexMat & complex_result) override; void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) override; void inverse(ComplexMat & complex_input, MatDynMem & real_result) override; ~Fftw() override; diff --git a/src/fft_opencv.h b/src/fft_opencv.h index 1239acd..5e016db 100644 --- a/src/fft_opencv.h +++ b/src/fft_opencv.h @@ -9,7 +9,7 @@ class FftOpencv : public Fft public: void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override; void set_window(const MatDynMem &window) override; - void forward(const cv::Mat & real_input, ComplexMat & complex_result) override; + void forward(MatDynMem & real_input, ComplexMat & complex_result) override; void forward_window(MatDynMem &patch_feats_in, ComplexMat & complex_result, MatDynMem &tmp) override; void inverse(ComplexMat & complex_input, MatDynMem & real_result) override; ~FftOpencv() override; diff --git a/src/kcf.cpp b/src/kcf.cpp index 4585aa0..5e56bdd 100644 --- a/src/kcf.cpp +++ b/src/kcf.cpp @@ -209,8 +209,10 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f DEBUG_PRINTM(p_yf); // obtain a sub-window for training initial model - std::vector patch_feats = get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, - p_windows_size.width, p_windows_size.height); + int sizes[3] = {p_num_of_feats, p_windows_size.height, p_windows_size.width}; + MatDynMem patch_feats(3, sizes, CV_32FC1); + MatDynMem temp(3, tmp, CV_32FC1); + get_features(features, input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size.width, p_windows_size.height); fft.forward_window(patch_feats, p_model_xf); DEBUG_PRINTM(p_model_xf); @@ -638,7 +640,7 @@ cv::Mat KCF_Tracker::circshift(const cv::Mat &patch, int x_rot, int y_rot) } // hann window actually (Power-of-cosine windows) -cv::Mat KCF_Tracker::cosine_window_function(int dim1, int dim2) +MatDynMem KCF_Tracker::cosine_window_function(int dim1, int dim2) { cv::Mat m1(1, dim1, CV_32FC1), m2(dim2, 1, CV_32FC1); double N_inv = 1. / (static_cast(dim1) - 1.); @@ -647,7 +649,7 @@ cv::Mat KCF_Tracker::cosine_window_function(int dim1, int dim2) N_inv = 1. / (static_cast(dim2) - 1.); for (int i = 0; i < dim2; ++i) m2.at(i) = float(0.5 * (1. - std::cos(2. * CV_PI * static_cast(i) * N_inv))); - cv::Mat ret = m2 * m1; + MatDynMem ret = m2 * m1; return ret; } diff --git a/src/kcf.h b/src/kcf.h index f0d79c1..66cb0fa 100644 --- a/src/kcf.h +++ b/src/kcf.h @@ -162,7 +162,7 @@ private: cv::Mat gaussian_shaped_labels(double sigma, int dim1, int dim2); std::unique_ptr gaussian_correlation; cv::Mat circshift(const cv::Mat & patch, int x_rot, int y_rot); - cv::Mat cosine_window_function(int dim1, int dim2); + MatDynMem cosine_window_function(int dim1, int dim2); void get_features(MatDynMem &feat_3d, cv::Mat & input_rgb, cv::Mat & input_gray, int cx, int cy, int size_x, int size_y, double scale = 1.); cv::Point2f sub_pixel_peak(cv::Point & max_loc, cv::Mat & response); double sub_grid_scale(uint index); -- 2.39.2