From 09ad2802d987433f9f5d1a7f5ab67c3dbb4bcbd9 Mon Sep 17 00:00:00 2001 From: Shanigen Date: Fri, 10 Aug 2018 13:20:02 +0200 Subject: [PATCH] CUDA streams works This commit adds support for CUDA streams and also corrects issue with CUFFTW version of the tracker. All version now works without any problem. CUDA streams version currently does not support C++ async directive and only OpenMP. TODO: Implement correct use of OpenMP in big batch mode. --- Makefile | 3 ++- src/fft_cufft.cpp | 20 ++++++++++++++++++-- src/fft_fftw.cpp | 2 +- src/kcf.cpp | 13 ++++--------- src/scale_vars.hpp | 14 ++++++-------- 5 files changed, 31 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 7228039..561af83 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile to build all the available variants -BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-big cufft-big-openmp +BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-openmp cufft-big cufft-big-openmp cufft-opemp all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot) @@ -19,6 +19,7 @@ CMAKE_OTPS_cufftw = -DFFT=cuFFTW CMAKE_OTPS_cufftw-big = -DFFT=cuFFTW -DBIG_BATCH=ON CMAKE_OTPS_cufftw-big-openmp = -DFFT=cuFFTW -DBIG_BATCH=ON -DOPENMP=ON CMAKE_OTPS_cufft = -DFFT=cuFFT +CMAKE_OTPS_cufft-openmp = -DFFT=cuFFT -DOPENMP=ON CMAKE_OTPS_cufft-big = -DFFT=cuFFT -DBIG_BATCH=ON CMAKE_OTPS_cufft-big-openmp = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp index da43fbf..955c914 100644 --- a/src/fft_cufft.cpp +++ b/src/fft_cufft.cpp @@ -139,9 +139,12 @@ void cuFFT::forward(const cv::Mat & real_input, ComplexMat & complex_result, flo CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast(real_input_arr), complex_result.get_p_data())); } else { +#pragma omp critical + { CufftErrorCheck(cufftSetStream(plan_f, stream)); - CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast(real_input_arr), - complex_result.get_p_data())); + CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast(real_input_arr), complex_result.get_p_data())); + cudaStreamSynchronize(stream); + } } return; } @@ -161,8 +164,12 @@ void cuFFT::forward_window(std::vector patch_feats, ComplexMat & comple cv::Mat in_roi(fw_all, cv::Rect(0, int(i*m_height), int(m_width), int(m_height))); in_roi = patch_feats[i].mul(m_window); } +#pragma omp critical + { CufftErrorCheck(cufftSetStream(plan_fw, stream)); CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast(real_input_arr), complex_result.get_p_data())); + cudaStreamSynchronize(stream); + } } return; } @@ -173,9 +180,12 @@ void cuFFT::inverse(ComplexMat & complex_input, cv::Mat & real_result, float *r cufftComplex *in = reinterpret_cast(complex_input.get_p_data()); if(n_channels == 1){ +#pragma omp critical + { CufftErrorCheck(cufftSetStream(plan_i_1ch, stream)); CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast(real_result_arr))); cudaStreamSynchronize(stream); + } real_result = real_result/(m_width*m_height); return; } else if(n_channels == int(m_num_of_scales)){ @@ -188,8 +198,14 @@ void cuFFT::inverse(ComplexMat & complex_input, cv::Mat & real_result, float *r CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast(real_result_arr))); return; } +#pragma omp critical + { CufftErrorCheck(cufftSetStream(plan_i_features, stream)); CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast(real_result_arr))); +#if defined(OPENMP) && !defined(BIG_BATCH) + cudaStreamSynchronize(stream); +#endif + } return; } diff --git a/src/fft_fftw.cpp b/src/fft_fftw.cpp index d171e4a..4dfd5a1 100644 --- a/src/fft_fftw.cpp +++ b/src/fft_fftw.cpp @@ -30,7 +30,7 @@ void Fftw::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned m_num_of_scales = num_of_scales; m_big_batch_mode = big_batch_mode; -#if (!defined(ASYNC) && !defined(CUFFTW))|| defined(OPENMP) +#if (!defined(ASYNC) && !defined(CUFFTW)) && defined(OPENMP) fftw_init_threads(); #endif //OPENMP diff --git a/src/kcf.cpp b/src/kcf.cpp index 11687d2..fceccae 100644 --- a/src/kcf.cpp +++ b/src/kcf.cpp @@ -21,6 +21,8 @@ #define DEBUG_PRINT(obj) if (m_debug) {std::cout << #obj << " @" << __LINE__ << std::endl << (obj) << std::endl;} #define DEBUG_PRINTM(obj) if (m_debug) {std::cout << #obj << " @" << __LINE__ << " " << (obj).size() << " CH: " << (obj).channels() << std::endl << (obj) << std::endl;} +#define DEBUG_PRINTD(obj) {std::cout << #obj << " @" << __LINE__ << " " << (obj).size() << " CH: " << (obj).channels() << std::endl << (obj) << std::endl;} + KCF_Tracker::KCF_Tracker(double padding, double kernel_sigma, double lambda, double interp_factor, double output_sigma_factor, int cell_size) : fft(*new FFT()), @@ -157,15 +159,6 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int else { p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1)); } -#ifdef CUFFT - std::cout << p_scale_vars.back()->zf.stream << std::endl; - std::cout << p_scale_vars.back()->kzf.stream << std::endl; - std::cout << p_scale_vars.back()->kf.stream << std::endl << std::endl; - - std::cout << p_scale_vars.back()->zf.n_scales << std::endl; - std::cout << p_scale_vars.back()->kzf.n_scales << std::endl; - std::cout << p_scale_vars.back()->kf.n_scales << std::endl << std::endl; -#endif } p_current_scale = 1.; @@ -199,6 +192,7 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int p_scale_vars.front()->model_xf = p_model_xf; p_scale_vars.front()->model_xf.set_stream(p_scale_vars.front()->stream); p_yf.set_stream(p_scale_vars.front()->stream); + p_model_xf.set_stream(p_scale_vars.front()->stream); #endif if (m_use_linearkernel) { @@ -393,6 +387,7 @@ void KCF_Tracker::track(cv::Mat &img) p_current_scale = p_min_max_scale[0]; if (p_current_scale > p_min_max_scale[1]) p_current_scale = p_min_max_scale[1]; + //obtain a subwindow for training at newly estimated target position p_scale_vars.front()->patch_feats.clear(); get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size[0], p_windows_size[1], *p_scale_vars.front(), p_current_scale); diff --git a/src/scale_vars.hpp b/src/scale_vars.hpp index 0aad3fb..9f755e5 100644 --- a/src/scale_vars.hpp +++ b/src/scale_vars.hpp @@ -5,8 +5,12 @@ #include "complexmat.cuh" #else #include "complexmat.hpp" +#ifndef CUFFTW //For compatibility reasons between CuFFT and FFTW, OpenCVfft versions. typedef int* cudaStream_t; +#else + #include "cuda_runtime.h" +#endif #endif struct Scale_vars @@ -16,7 +20,6 @@ public: ComplexMat *yf = nullptr,bool zero_index = false) { uint alloc_size; - std::cout << __PRETTY_FUNCTION__ << std::endl; #ifdef CUFFT if (zero_index) { cudaSetDeviceFlags(cudaDeviceMapHost); @@ -27,6 +30,8 @@ public: CudaSafeCall(cudaStreamCreate(&this->stream)); #endif + this->patch_feats.reserve(uint(num_of_feats)); + alloc_size = uint(windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_scales)*sizeof(cufftReal); CudaSafeCall(cudaHostAlloc(reinterpret_cast(&this->data_i_1ch), alloc_size, cudaHostAllocMapped)); CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast(&this->data_i_1ch_d), reinterpret_cast(this->data_i_1ch), 0)); @@ -39,14 +44,8 @@ public: this->response = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC(num_of_scales), this->data_i_1ch); this->zf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, num_of_scales, this->stream); - std::cout << this->zf.stream << std::endl; - std::cout << this->zf.n_scales << std::endl; this->kzf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream); - std::cout << this->kzf.stream << std::endl; - std::cout << this->kzf.n_scales << std::endl; this->kf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream); - std::cout << this->kf.stream << std::endl; - std::cout << this->kf.n_scales << std::endl << std::endl; alloc_size = uint(num_of_scales); @@ -126,7 +125,6 @@ public: ~Scale_vars() { #ifdef CUFFT - std::cout << __PRETTY_FUNCTION__ << std::endl; CudaSafeCall(cudaFreeHost(this->xf_sqr_norm)); CudaSafeCall(cudaFreeHost(this->yf_sqr_norm)); CudaSafeCall(cudaFreeHost(this->data_i_1ch)); -- 2.39.2