]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/commitdiff
Work done so far on CUDA streams
author Shanigen <vkaraf@gmail.com>
Thu, 9 Aug 2018 13:42:18 +0000 (15:42 +0200)
committer Michal Sojka <michal.sojka@cvut.cz>
Wed, 5 Sep 2018 06:38:52 +0000 (08:38 +0200)
This commit contains not only work done to implement CUDA streams,
but also some changes to correct warnings and make the naming a little
bit more consistent.

17 files changed:
Makefile
src/CMakeLists.txt
src/complexmat.cu
src/complexmat.cuh
src/cuda_functions.cu
src/cuda_functions.cuh
src/fft.h
src/fft_cufft.cpp
src/fft_cufft.h
src/fft_fftw.cpp
src/fft_fftw.h
src/fft_opencv.cpp
src/fft_opencv.h
src/kcf.cpp
src/kcf.h
src/piotr_fhog/gradientMex.cpp
src/scale_vars.hpp

index 796a1b44118a3a15b7f2f665e3815ec780d76ec7..72280398a896c6246e300c7d54e7e70ab5d02e3f 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile to build all the available variants
 
-BUILDS = opencvfft-st opencvfft-async fftw fftw-openmp fftw-async fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-big cufft-big-openmp
+BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-big cufft-big-openmp
 
 all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot)
 
@@ -9,6 +9,7 @@ CMAKE_OPTS += -G Ninja
 
 CMAKE_OTPS_opencvfft-st      = -DFFT=OpenCV
 CMAKE_OTPS_opencvfft-async   = -DFFT=OpenCV -DASYNC=ON
+CMAKE_OTPS_opencvfft-openmp  = -DFFT=OpenCV -DOPENMP=ON
 CMAKE_OTPS_fftw              = -DFFT=fftw
 CMAKE_OTPS_fftw-openmp       = -DFFT=fftw -DOPENMP=ON
 CMAKE_OTPS_fftw-async        = -DFFT=fftw -DASYNC=ON
index 80350481b52e79c4901f8bd3268a2954120b9b5b..da31a5ead2e2ddd22ae4960c10098790bdf290c5 100644 (file)
@@ -61,9 +61,12 @@ IF((FFT STREQUAL "cuFFT") AND (ASYNC))
   message(SEND_ERROR "cuFFT version does not support ASYNC and OpenMP only if used with big batch mode.")
 ENDIF()
 
-IF(ASYNC AND NOT OPENMP)
+IF(ASYNC)
   add_definitions(-DASYNC)
   MESSAGE(STATUS "ASYNC")
+ELSEIF(OPENMP)
+    add_definitions(-DOPENMP)
+    MESSAGE(STATUS "OPENMP")
 ENDIF() #ASYNC
 
 FIND_PACKAGE( OpenCV REQUIRED )
@@ -79,7 +82,7 @@ IF(use_cuda)
   set(CUDA_SEPARABLE_COMPILATION ON)
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
   set(CUDA_HOST_COMPILER /usr/bin/g++)
-  list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11 -default-stream per-thread)
+  list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11)
   find_cuda_helper_libs(cufftw)
   IF(FFT STREQUAL "cuFFT")
   add_subdirectory(cuda)
index f3a3e5be476d0af5614b0d5852a74055495ef0e7..9b8bfa41adef3cd09c257a20ec88b6743d995689 100644 (file)
@@ -29,12 +29,12 @@ __global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, floa
 
 void ComplexMat::sqr_norm(float *result) const
 {
-    CudaSafeCall(cudaMemset(result, 0, n_scales*sizeof(float)));
+    CudaSafeCall(cudaMemsetAsync(result, 0, n_scales*sizeof(float), this->stream));
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
     
-    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
+    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float), this->stream>>>(n_channels/n_scales, result, this->p_data, rows, cols);
     CudaCheckError();
         
     return;
@@ -51,11 +51,11 @@ __global__ void sqr_mag_kernel(float* data, float* result)
 
 ComplexMat ComplexMat::sqr_mag() const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
+    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, result.p_data);
     CudaCheckError();
     
     return result;
@@ -72,11 +72,11 @@ __global__ void conj_kernel(float* data, float* result)
 
 ComplexMat ComplexMat::conj() const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
+    conj_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, result.p_data);
     CudaCheckError();
 
     return result;
@@ -85,7 +85,7 @@ ComplexMat ComplexMat::conj() const
 ComplexMat ComplexMat::sum_over_channels() const
 {
 //     assert(p_data.size() > 1);
-    ComplexMat result(this->rows, this->cols, 1);
+    ComplexMat result(this->rows, this->cols, 1, this->stream);
     return result;
 }
 
@@ -108,11 +108,11 @@ ComplexMat ComplexMat::operator*(const ComplexMat & rhs) const
 {
     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
     
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
     CudaCheckError();
 
     return result;
@@ -133,11 +133,11 @@ ComplexMat ComplexMat::operator/(const ComplexMat & rhs) const
 {
     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
     CudaCheckError();
 
     return result;
@@ -156,11 +156,11 @@ ComplexMat ComplexMat::operator+(const ComplexMat & rhs) const
 {
     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
     CudaCheckError();
     
     return result;
@@ -177,11 +177,11 @@ __global__ void constant_mul_kernel(float* data_l, float constant, float* result
 
 ComplexMat ComplexMat::operator*(const float & rhs) const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
+    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs, result.p_data);
     CudaCheckError();
 
     return result;
@@ -198,11 +198,11 @@ __global__ void constant_add_kernel(float* data_l, float constant, float* result
 
 ComplexMat ComplexMat::operator+(const float & rhs) const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
+    constant_add_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs, result.p_data);
     CudaCheckError();
 
     return result;
@@ -223,11 +223,11 @@ ComplexMat ComplexMat::mul(const ComplexMat & rhs) const
 {
     assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
     CudaCheckError();
     
     return result;
@@ -248,11 +248,11 @@ ComplexMat ComplexMat::mul2(const ComplexMat & rhs) const
 {
     assert(rhs.n_channels == n_channels/n_scales && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
     CudaCheckError();
     
     return result;
@@ -264,6 +264,7 @@ void ComplexMat::operator=(ComplexMat & rhs)
     rows = rhs.rows;
     n_channels = rhs.n_channels;
     n_scales = rhs.n_scales;
+    stream = rhs.stream;
     foreign_data = true;
     
     p_data = rhs.p_data;
@@ -275,6 +276,7 @@ void ComplexMat::operator=(ComplexMat && rhs)
     rows = rhs.rows;
     n_channels = rhs.n_channels;
     n_scales = rhs.n_scales;
+    stream = rhs.stream;
     
     p_data = rhs.p_data;
     
index f91cab7c4077678b7fcaf5edff19fb9c391792da..37534b3afc1c4b598e9d196b8baae45dfa1ad586 100644 (file)
@@ -16,14 +16,16 @@ public:
     int n_channels;
     int n_scales = 1;
     bool foreign_data = false;
+    cudaStream_t stream = nullptr;
     
     ComplexMat() : cols(0), rows(0), n_channels(0) {}
-    ComplexMat(int _rows, int _cols, int _n_channels) : cols(_cols), rows(_rows), n_channels(_n_channels)
+    ComplexMat(int _rows, int _cols, int _n_channels, cudaStream_t _stream) : cols(_cols), rows(_rows), n_channels(_n_channels), stream(_stream)
     {
         CudaSafeCall(cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex)));
     }
     
-    ComplexMat(int _rows, int _cols, int _n_channels, int _n_scales) : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales)
+    ComplexMat(int _rows, int _cols, int _n_channels, int _n_scales, cudaStream_t _stream) : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales),
+        stream(_stream)
     {
         CudaSafeCall(cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex)));
     }
@@ -35,6 +37,7 @@ public:
         n_channels = other.n_channels;
         n_scales = other.n_scales;
         p_data = other.p_data;
+        stream = other.stream;
         
         other.p_data = nullptr;
     }
@@ -47,20 +50,22 @@ public:
         }
     }
 
-    void create(int _rows, int _cols, int _n_channels)
+    void create(int _rows, int _cols, int _n_channels, cudaStream_t _stream = nullptr)
     {
         rows = _rows;
         cols = _cols;
         n_channels = _n_channels;
+        stream = _stream;
         CudaSafeCall(cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex)));
     }
 
-    void create(int _rows, int _cols, int _n_channels, int _n_scales)
+    void create(int _rows, int _cols, int _n_channels, int _n_scales, cudaStream_t _stream = nullptr)
     {
         rows = _rows;
         cols = _cols;
         n_channels = _n_channels;
         n_scales = _n_scales;
+        stream = _stream;
         CudaSafeCall(cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex)));
     }
     // cv::Mat API compatibility
@@ -68,6 +73,12 @@ public:
     int channels() { return n_channels; }
     int channels() const { return n_channels; }
 
+    void set_stream(cudaStream_t _stream)
+    {
+        stream = _stream;
+        return;
+    }
+
     void sqr_norm(float *result) const;
     
     ComplexMat sqr_mag() const;
index 44d41be8f27391b4020c02b485a467b3858b1747..4eef14cdc6f68683c4602fcc93ec04bc4c482b6e 100644 (file)
@@ -36,12 +36,12 @@ __global__ void  gaussian_correlation_kernel(float *data_in, float *data_out, fl
         }
 }
 
-void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales,int rows, int cols)
+void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales,int rows, int cols, cudaStream_t stream)
 {
     dim3 threadsPerBlock((n_channels/n_scales)/2);
     dim3 numBlocks(n_scales, rows*cols);
 
-    gaussian_correlation_kernel<<<numBlocks, threadsPerBlock, ((n_channels/n_scales)/2)*sizeof(float)>>>(data_in, data_out, xf_sqr_norm, yf_sqr_norm, rows, cols, n_channels/n_scales,  sigma);
+    gaussian_correlation_kernel<<<numBlocks, threadsPerBlock, ((n_channels/n_scales)/2)*sizeof(float), stream>>>(data_in, data_out, xf_sqr_norm, yf_sqr_norm, rows, cols, n_channels/n_scales,  sigma);
     CudaCheckError();
     
 //    float *data_cpu = (float*) malloc(rows*cols*n_scales*sizeof(float));
index 452313f621613fe7a25438a255499ad54b08149c..e8381690b938d91bf58a8c8c11db7ee6f337e6a3 100644 (file)
@@ -4,6 +4,6 @@
 #include "cuda_runtime.h"
 #include "cuda/cuda_error_check.cuh"
 
-void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales, int rows, int cols);
+void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales, int rows, int cols, cudaStream_t stream);
 
 #endif
index 6f91413265ee41c94c7a5a122b001e6d2289fb84..379c31e86614ca23aa1a6f838984bb9da681c98e 100644 (file)
--- a/src/fft.h
+++ b/src/fft.h
@@ -19,9 +19,9 @@ class Fft
 public:
     virtual void init(unsigned width, unsigned height,unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) = 0;
     virtual void set_window(const cv::Mat & window) = 0;
-    virtual void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr) = 0;
-    virtual void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr) = 0;
-    virtual void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr) = 0;
+    virtual void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t  stream) = 0;
+    virtual void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) = 0;
+    virtual void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) = 0;
     virtual ~Fft() = 0;
 };
 
index 004f45c96c9e1f1e345f1a930bd000348cbe324f..da43fbf4bd06f92a606380d4dd91097212994e82 100644 (file)
@@ -12,7 +12,7 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
 
     //FFT forward one scale
     {
-       CufftErrorCheck(cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C));
+       CufftErrorCheck(cufftPlan2d(&plan_f, int(m_height), int(m_width), CUFFT_R2C));
     }
 #ifdef BIG_BATCH
     //FFT forward all scales
@@ -34,11 +34,11 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     //FFT forward window one scale
     {
         int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_feats;
-        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
+        int n[] = {int(m_height), int(m_width)};
+        int howmany = int(m_num_of_feats);
+        int idist = int(m_height*m_width), odist = int(m_height*(m_width/2+1));
         int istride = 1, ostride = 1;
-        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
+        int *inembed = n, onembed[] = {int(m_height), int(m_width/2+1)};
 
         CufftErrorCheck(cufftPlanMany(&plan_fw, rank, n,
                  inembed, istride, idist,
@@ -65,11 +65,11 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     //FFT inverse one scale
     {
         int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_feats;
-        int idist = m_height*(m_width/2+1), odist = 1;
-        int istride = 1, ostride = m_num_of_feats;
-        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+        int n[] = {int(m_height), int(m_width)};
+        int howmany = int(m_num_of_feats);
+        int idist = int(m_height*(m_width/2+1)), odist = 1;
+        int istride = 1, ostride = int(m_num_of_feats);
+        int inembed[] = {int(m_height), int(m_width/2+1)}, *onembed = n;
 
         CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n,
                  inembed, istride, idist,
@@ -96,11 +96,11 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     //FFT inverse one channel one scale
     {
         int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
+        int n[] = {int(m_height), int(m_width)};
         int howmany = 1;
-        int idist = m_height*(m_width/2+1), odist = 1;
+        int idist = int(m_height*(m_width/2+1)), odist = 1;
         int istride = 1, ostride = 1;
-        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+        int inembed[] = {int(m_height), int(m_width/2+1)}, *onembed = n;
 
         CufftErrorCheck(cufftPlanMany(&plan_i_1ch, rank, n,
                  inembed, istride, idist,
@@ -131,61 +131,64 @@ void cuFFT::set_window(const cv::Mat & window)
      m_window = window;
 }
 
-void cuFFT::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr)
+void cuFFT::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream)
 {
     (void) real_input;
 
-    if(m_big_batch_mode && real_input.rows == (int)(m_height*m_num_of_scales)){
+    if(m_big_batch_mode && real_input.rows == int(m_height*m_num_of_scales)){
         CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(real_input_arr),
                                 complex_result.get_p_data()));
     } else {
-                CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr),
+        CufftErrorCheck(cufftSetStream(plan_f, stream));
+        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr),
                                 complex_result.get_p_data()));
     }
     return;
 }
 
-void cuFFT::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr)
+void cuFFT::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream)
 {
-    int n_channels = patch_feats.size();
+    int n_channels = int(patch_feats.size());
 
-    if(n_channels > (int) m_num_of_feats){
-        for (int i = 0; i < n_channels; ++i) {
-            cv::Mat in_roi(fw_all, cv::Rect(0, i*m_height, m_width, m_height));
+    if(n_channels > int(m_num_of_feats)){
+        for (uint i = 0; i < uint(n_channels); ++i) {
+            cv::Mat in_roi(fw_all, cv::Rect(0, int(i*m_height), int(m_width), int(m_height)));
             in_roi = patch_feats[i].mul(m_window);
         }
-
         CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data()));
     } else {
-        for (int i = 0; i < n_channels; ++i) {
-            cv::Mat in_roi(fw_all, cv::Rect(0, i*m_height, m_width, m_height));
+        for (uint i = 0; i < uint(n_channels); ++i) {
+            cv::Mat in_roi(fw_all, cv::Rect(0, int(i*m_height), int(m_width), int(m_height)));
             in_roi = patch_feats[i].mul(m_window);
         }
+        CufftErrorCheck(cufftSetStream(plan_fw, stream));
         CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data()));
     }
     return;
 }
 
-void cuFFT::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr)
+void cuFFT::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream)
 {
     int n_channels = complex_input.n_channels;
     cufftComplex *in = reinterpret_cast<cufftComplex*>(complex_input.get_p_data());
 
     if(n_channels == 1){
+        CufftErrorCheck(cufftSetStream(plan_i_1ch, stream));
         CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(real_result_arr)));
-        cudaDeviceSynchronize();
+        cudaStreamSynchronize(stream);
         real_result = real_result/(m_width*m_height);
         return;
-    } else if(n_channels == (int) m_num_of_scales){
+    } else if(n_channels == int(m_num_of_scales)){
         CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(real_result_arr)));
-        cudaDeviceSynchronize();
+        cudaStreamSynchronize(stream);
 
         real_result = real_result/(m_width*m_height);
         return;
-    } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
+    } else if(n_channels == int(m_num_of_feats) * int(m_num_of_scales)){
         CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(real_result_arr)));
         return;
     }
+    CufftErrorCheck(cufftSetStream(plan_i_features, stream));
     CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(real_result_arr)));
     return;
 }
index 2d9d74f5daf0c6193687df51640c787eaa407b02..2cc4a6e531b7d2a60604e9d6f14120695b3246e6 100644 (file)
@@ -23,9 +23,9 @@ class cuFFT : public Fft
 public:
     void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
     void set_window(const cv::Mat & window) override;
-    void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr) override;
-    void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr) override;
-    void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr) override;
+    void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t  stream) override;
+    void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) override;
+    void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) override;
     ~cuFFT() override;
 private:
     cv::Mat m_window;
index 81d895f1c9906f73feb98aaf9d98e3dfa633e435..d171e4a2a6beedccd86925575e26392d708ce36d 100644 (file)
@@ -7,7 +7,7 @@
 #endif
 
 #if !defined(ASYNC) && !defined(OPENMP) && !defined(CUFFTW)
-#define FFTW_PLAN_WITH_THREADS() fftw_plan_with_nthreads(m_num_threads);
+#define FFTW_PLAN_WITH_THREADS() fftw_plan_with_nthreads(int(m_num_threads));
 #else
 #define FFTW_PLAN_WITH_THREADS()
 #endif
@@ -17,7 +17,7 @@ Fftw::Fftw()
 {
 }
 
-Fftw::Fftw(int num_threads)
+Fftw::Fftw(unsigned num_threads)
     : m_num_threads(num_threads)
 {
 }
@@ -39,11 +39,12 @@ void Fftw::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned
 #else
     std::cout << "FFT: cuFFTW" << std::endl;
 #endif
+    fftwf_cleanup();
     //FFT forward one scale
     {
-        cv::Mat in_f = cv::Mat::zeros(m_height, m_width, CV_32FC1);
-        ComplexMat out_f(m_height, m_width / 2 + 1, 1);
-        plan_f = fftwf_plan_dft_r2c_2d(m_height, m_width,
+        cv::Mat in_f = cv::Mat::zeros(int(m_height), int(m_width), CV_32FC1);
+        ComplexMat out_f(int(m_height), m_width / 2 + 1, 1);
+        plan_f = fftwf_plan_dft_r2c_2d(int(m_height), int(m_width),
                                        reinterpret_cast<float*>(in_f.data),
                                        reinterpret_cast<fftwf_complex*>(out_f.get_p_data()),
                                        FFTW_PATIENT);
@@ -71,16 +72,16 @@ void Fftw::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned
 #endif
     //FFT forward window one scale
     {
-        cv::Mat in_fw = cv::Mat::zeros(m_height * m_num_of_feats, m_width, CV_32F);
-        ComplexMat out_fw(m_height, m_width / 2 + 1, m_num_of_feats);
+        cv::Mat in_fw = cv::Mat::zeros(int(m_height * m_num_of_feats), int(m_width), CV_32F);
+        ComplexMat out_fw(int(m_height), m_width / 2 + 1, int(m_num_of_feats));
         float *in = reinterpret_cast<float*>(in_fw.data);
         fftwf_complex *out = reinterpret_cast<fftwf_complex*>(out_fw.get_p_data());
         int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_feats;
-        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
+        int n[] = {int(m_height), int(m_width)};
+        int howmany = int(m_num_of_feats);
+        int idist = int(m_height*m_width), odist = int(m_height*(m_width/2+1));
         int istride = 1, ostride = 1;
-        int *inembed = NULL, *onembed = NULL;
+        int *inembed = nullptr, *onembed = nullptr;
 
         FFTW_PLAN_WITH_THREADS();
         plan_fw = fftwf_plan_many_dft_r2c(rank, n, howmany,
@@ -111,16 +112,16 @@ void Fftw::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned
 #endif
     //FFT inverse one scale
     {
-        ComplexMat in_i(m_height,m_width,m_num_of_feats);
-        cv::Mat out_i = cv::Mat::zeros(m_height, m_width, CV_32FC(m_num_of_feats));
+        ComplexMat in_i(m_height, m_width, m_num_of_feats);
+        cv::Mat out_i = cv::Mat::zeros(int(m_height), int(m_width), CV_32FC(int(m_num_of_feats)));
         fftwf_complex *in = reinterpret_cast<fftwf_complex*>(in_i.get_p_data());
         float *out = reinterpret_cast<float*>(out_i.data);
         int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_feats;
-        int idist = m_height*(m_width/2+1), odist = 1;
-        int istride = 1, ostride = m_num_of_feats;
-        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+        int n[] = {int(m_height), int(m_width)};
+        int howmany = int(m_num_of_feats);
+        int idist = int(m_height*(m_width/2+1)), odist = 1;
+        int istride = 1, ostride = int(m_num_of_feats);
+        int inembed[] = {int(m_height), int(m_width/2+1)}, *onembed = n;
 
         FFTW_PLAN_WITH_THREADS();
         plan_i_features = fftwf_plan_many_dft_c2r(rank, n, howmany,
@@ -151,16 +152,16 @@ void Fftw::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned
 #endif
     //FFT inver one channel one scale
     {
-        ComplexMat in_i1(m_height,m_width,1);
-        cv::Mat out_i1 = cv::Mat::zeros(m_height, m_width, CV_32FC1);
+        ComplexMat in_i1(int(m_height),int(m_width),1);
+        cv::Mat out_i1 = cv::Mat::zeros(int(m_height), int(m_width), CV_32FC1);
         fftwf_complex *in = reinterpret_cast<fftwf_complex*>(in_i1.get_p_data());
         float *out = reinterpret_cast<float*>(out_i1.data);
         int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
+        int n[] = {int(m_height), int(m_width)};
         int howmany = 1;
-        int idist = m_height*(m_width/2+1), odist = 1;
+        int idist = int(m_height*(m_width/2+1)), odist = 1;
         int istride = 1, ostride = 1;
-        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+        int inembed[] = {int(m_height), int(m_width)/2+1}, *onembed = n;
 
         FFTW_PLAN_WITH_THREADS();
         plan_i_1ch = fftwf_plan_many_dft_c2r(rank, n, howmany,
@@ -196,11 +197,12 @@ void Fftw::set_window(const cv::Mat &window)
     m_window = window;
 }
 
-void Fftw::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr)
+void Fftw::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream)
 {
     (void) real_input_arr;
+    (void) stream;
 
-    if(m_big_batch_mode && real_input.rows == (int)(m_height*m_num_of_scales)){
+    if(m_big_batch_mode && real_input.rows == int(m_height*m_num_of_scales)){
         fftwf_execute_dft_r2c(plan_f_all_scales, reinterpret_cast<float*>(real_input.data),
                               reinterpret_cast<fftwf_complex*>(complex_result.get_p_data()));
     } else {
@@ -210,29 +212,31 @@ void Fftw::forward(const cv::Mat & real_input, ComplexMat & complex_result, floa
     return;
 }
 
-void Fftw::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr)
+void Fftw::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream)
 {
     (void) real_input_arr;
+    (void) stream;
 
-    int n_channels = patch_feats.size();
+    int n_channels = int(patch_feats.size());
     for (int i = 0; i < n_channels; ++i) {
-        cv::Mat in_roi(fw_all, cv::Rect(0, i*m_height, m_width, m_height));
-        in_roi = patch_feats[i].mul(m_window);
+        cv::Mat in_roi(fw_all, cv::Rect(0, i*int(m_height), int(m_width), int(m_height)));
+        in_roi = patch_feats[uint(i)].mul(m_window);
     }
 
     float *in = reinterpret_cast<float*>(fw_all.data);
     fftwf_complex *out = reinterpret_cast<fftwf_complex*>(complex_result.get_p_data());
 
-    if (n_channels <= (int) m_num_of_feats)
+    if (n_channels <= int(m_num_of_feats))
         fftwf_execute_dft_r2c(plan_fw, in, out);
     else
         fftwf_execute_dft_r2c(plan_fw_all_scales, in, out);
     return;
 }
 
-void Fftw::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr)
+void Fftw::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream)
 {
     (void) real_result_arr;
+    (void) stream;
 
     int n_channels = complex_input.n_channels;
     fftwf_complex *in = reinterpret_cast<fftwf_complex*>(complex_input.get_p_data());
@@ -240,9 +244,9 @@ void Fftw::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *re
 
     if(n_channels == 1)
         fftwf_execute_dft_c2r(plan_i_1ch, in, out);
-    else if(m_big_batch_mode && n_channels == (int) m_num_of_scales)
+    else if(m_big_batch_mode && n_channels == int(m_num_of_scales))
         fftwf_execute_dft_c2r(plan_i_1ch_all_scales, in, out);
-    else if(m_big_batch_mode && n_channels == (int) m_num_of_feats * (int) m_num_of_scales)
+    else if(m_big_batch_mode && n_channels == int(m_num_of_feats) * int(m_num_of_scales))
         fftwf_execute_dft_c2r(plan_i_features_all_scales, in, out);
     else
         fftwf_execute_dft_c2r(plan_i_features, in, out);
index afe2c974205b7ebdf6e893b5a5fdb0b78dc5e613..2e60e02927825a83dffcc127d01c09129fed6056 100644 (file)
@@ -20,12 +20,12 @@ class Fftw : public Fft
 {
 public:
     Fftw();
-    Fftw(int num_of_threads);
+    Fftw(unsigned num_of_threads);
     void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
     void set_window(const cv::Mat & window) override;
-    void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr) override;
-    void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr) override;
-    void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr) override;
+    void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t  stream) override;
+    void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) override;
+    void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) override;
     ~Fftw() override;
 private:
     unsigned m_num_threads = 6;
index 4b8d8f39da113981f3168cec3a55dbaa1c12cce0..46399a55d4bdb7ac1cdd8f00ad39a22faedb303c 100644 (file)
@@ -15,9 +15,10 @@ void FftOpencv::set_window(const cv::Mat & window)
      m_window = window;
 }
 
-void FftOpencv::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr)
+void FftOpencv::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream)
 {
     (void) real_input_arr;
+    (void) stream;
 
     cv::Mat tmp;
     cv::dft(real_input, tmp, cv::DFT_COMPLEX_OUTPUT);
@@ -25,30 +26,32 @@ void FftOpencv::forward(const cv::Mat & real_input, ComplexMat & complex_result,
     return;
 }
 
-void FftOpencv::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr)
+void FftOpencv::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream)
 {
     (void) real_input_arr;
     (void) fw_all;
+    (void) stream;
 
-    int n_channels = patch_feats.size();
-    for (int i = 0; i < n_channels; ++i) {
+    uint n_channels = uint(patch_feats.size());
+    for (uint i = 0; i < n_channels; ++i) {
         cv::Mat complex_res;
         cv::dft(patch_feats[i].mul(m_window), complex_res, cv::DFT_COMPLEX_OUTPUT);
-        complex_result.set_channel(i, complex_res);
+        complex_result.set_channel(int(i), complex_res);
     }
     return;
 }
 
-void FftOpencv::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr)
+void FftOpencv::inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream)
 {
     (void) real_result_arr;
+    (void) stream;
 
     if (complex_input.n_channels == 1) {
         cv::dft(complex_input.to_cv_mat(), real_result, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
     } else {
         std::vector<cv::Mat> mat_channels = complex_input.to_cv_mat_vector();
-        std::vector<cv::Mat> ifft_mats(complex_input.n_channels);
-        for (int i = 0; i < complex_input.n_channels; ++i) {
+        std::vector<cv::Mat> ifft_mats(ulong(complex_input.n_channels));
+        for (uint i = 0; i < uint(complex_input.n_channels); ++i) {
             cv::dft(mat_channels[i], ifft_mats[i], cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
         }
         cv::merge(ifft_mats, real_result);
index 061b7e8063882e7acd4f78b931f2b21317f4e762..2fe1e4a6c6cefa6c80f5ae8b6b086d7f0a6a315a 100644 (file)
@@ -11,9 +11,9 @@ class FftOpencv : public Fft
 public:
     void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
     void set_window(const cv::Mat & window) override;
-    void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr) override;
-    void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr) override;
-    void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr) override;
+    void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t  stream) override;
+    void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) override;
+    void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) override;
     ~FftOpencv() override;
 private:
     cv::Mat m_window;
index 7dbd2409cc30cae989fdcad72ad142aa369dd930..11687d261d3ee17ce6a7f5073869d9ae12c38957 100644 (file)
@@ -33,23 +33,6 @@ KCF_Tracker::KCF_Tracker()
 KCF_Tracker::~KCF_Tracker()
 {
     delete &fft;
-    int end = m_use_big_batch ? 2 : p_num_scales;
-#ifdef CUFFT
-    for (int i = 0;i < end;++i) {
-        CudaSafeCall(cudaFreeHost(p_scale_vars[i].xf_sqr_norm));
-        CudaSafeCall(cudaFreeHost(p_scale_vars[i].yf_sqr_norm));
-        CudaSafeCall(cudaFreeHost(p_scale_vars[i].data_i_1ch));
-        CudaSafeCall(cudaFreeHost(p_scale_vars[i].data_i_features));
-        CudaSafeCall(cudaFreeHost(p_scale_vars[i].gauss_corr_res));
-        CudaSafeCall(cudaFreeHost(p_scale_vars[i].rot_labels_data));
-        CudaSafeCall(cudaFreeHost(p_scale_vars[i].data_features));
-    }
-#else
-    for (int i = 0;i < end;++i) {
-        free(p_scale_vars[i].xf_sqr_norm);
-        free(p_scale_vars[i].yf_sqr_norm);
-    }
-#endif
 }
 
 void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int fit_size_y)
@@ -109,17 +92,18 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
             std::cerr << "Fit size does not fit to hog cell size. The dimensions have to be divisible by HOG cell size, which is: " << p_cell_size << std::endl;;
             std::exit(EXIT_FAILURE);
         }
-        double tmp;
-        if (( tmp = (p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size ) != fit_size_x)
+        double tmp = (p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size ;
+        if (fabs(tmp-fit_size_x) > p_floating_error)
             p_scale_factor_x = fit_size_x/tmp;
-        if (( tmp = (p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size ) != fit_size_y)
+        tmp = (p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size;
+        if (fabs(tmp-fit_size_y) > p_floating_error)
             p_scale_factor_y = fit_size_y/tmp;
         std::cout << "resizing image horizontaly by factor of " << p_scale_factor_x
                   << " and verticaly by factor of " << p_scale_factor_y << std::endl;
         p_fit_to_pw2 = true;
         p_pose.scale_x(p_scale_factor_x);
         p_pose.scale_y(p_scale_factor_y);
-        if (p_scale_factor_x != 1 && p_scale_factor_y != 1) {
+        if (fabs(p_scale_factor_x-1) > p_floating_error && fabs(p_scale_factor_y-1) > p_floating_error) {
             if (p_scale_factor_x < 1 && p_scale_factor_y < 1) {
                 cv::resize(input_gray, input_gray, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
                 cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
@@ -131,8 +115,8 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
     }
 
     //compute win size + fit to fhog cell size
-    p_windows_size[0] = round(p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size;
-    p_windows_size[1] = round(p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size;
+    p_windows_size[0] = int(round(p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size);
+    p_windows_size[1] = int(round(p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size);
 
     p_scales.clear();
     if (m_use_scale)
@@ -164,12 +148,24 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
 
     int max =m_use_big_batch ? 2: p_num_scales;
     for (int i = 0;i<max;++i) {
-        if (i == 0)
-            p_scale_vars.push_back(Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1, &p_model_xf, &p_yf, true));
-        else if (m_use_big_batch)
-            p_scale_vars.push_back(Scale_vars(p_windows_size, p_cell_size, p_num_of_feats*p_num_scales, p_num_scales));
-        else
-            p_scale_vars.push_back(Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1));
+        if (i == 0) {
+            p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1, &p_model_xf, &p_yf, true));
+        }
+        else if (m_use_big_batch) {
+            p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats*p_num_scales, p_num_scales));
+        }
+        else {
+            p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1));
+        }
+#ifdef CUFFT
+        std::cout << p_scale_vars.back()->zf.stream << std::endl;
+        std::cout << p_scale_vars.back()->kzf.stream << std::endl;
+        std::cout << p_scale_vars.back()->kf.stream << std::endl << std::endl;
+
+        std::cout << p_scale_vars.back()->zf.n_scales << std::endl;
+        std::cout << p_scale_vars.back()->kzf.n_scales << std::endl;
+        std::cout << p_scale_vars.back()->kf.n_scales << std::endl << std::endl;
+#endif
     }
 
     p_current_scale = 1.;
@@ -185,20 +181,25 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
 
     p_output_sigma = std::sqrt(p_pose.w*p_pose.h) * p_output_sigma_factor / static_cast<double>(p_cell_size);
 
-    fft.init(p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size, p_num_of_feats, p_num_scales, m_use_big_batch);
+    fft.init(uint(p_windows_size[0]/p_cell_size), uint(p_windows_size[1]/p_cell_size), uint(p_num_of_feats), uint(p_num_scales), m_use_big_batch);
     fft.set_window(cosine_window_function(p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size));
 
     //window weights, i.e. labels
      fft.forward(gaussian_shaped_labels(p_output_sigma, p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size), p_yf,
-                                                                                m_use_cuda ? p_scale_vars[0].rot_labels_data_d: nullptr);
+                                                                                m_use_cuda ? p_scale_vars.front()->rot_labels_data_d: nullptr, p_scale_vars.front()->stream);
     DEBUG_PRINTM(p_yf);
 
     //obtain a sub-window for training initial model
-    p_scale_vars[0].patch_feats.clear();
-    get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size[0], p_windows_size[1], p_scale_vars[0]);
-    fft.forward_window(p_scale_vars[0].patch_feats, p_model_xf, p_scale_vars[0].fw_all, m_use_cuda ? p_scale_vars[0].data_features_d : nullptr);
+    p_scale_vars.front()->patch_feats.clear();
+    get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size[0], p_windows_size[1], *p_scale_vars.front());
+    fft.forward_window(p_scale_vars.front()->patch_feats, p_model_xf, p_scale_vars.front()->fw_all,
+                                               m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
     DEBUG_PRINTM(p_model_xf);
-
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+    p_scale_vars.front()->model_xf = p_model_xf;
+    p_scale_vars.front()->model_xf.set_stream(p_scale_vars.front()->stream);
+    p_yf.set_stream(p_scale_vars.front()->stream);
+#endif
 
     if (m_use_linearkernel) {
         ComplexMat xfconj = p_model_xf.conj();
@@ -206,16 +207,29 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int
         p_model_alphaf_den = (p_model_xf * xfconj);
     } else {
         //Kernel Ridge Regression, calculate alphas (in Fourier domain)
-        gaussian_correlation(p_scale_vars[0], p_model_xf, p_model_xf, p_kernel_sigma, true);
-        DEBUG_PRINTM(p_scale_vars[0].kf);
-        p_model_alphaf_num = p_yf * p_scale_vars[0].kf;
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+        gaussian_correlation(*p_scale_vars.front(), p_scale_vars.front()->model_xf, p_scale_vars.front()->model_xf, p_kernel_sigma, true);
+#else
+        gaussian_correlation(*p_scale_vars.front(), p_model_xf, p_model_xf, p_kernel_sigma, true);
+#endif
+        DEBUG_PRINTM(p_scale_vars.front()->kf);
+        p_model_alphaf_num = p_yf * p_scale_vars.front()->kf;
         DEBUG_PRINTM(p_model_alphaf_num);
-        p_model_alphaf_den = p_scale_vars[0].kf * (p_scale_vars[0].kf + p_lambda);
+        p_model_alphaf_den = p_scale_vars.front()->kf * (p_scale_vars.front()->kf + float(p_lambda));
         DEBUG_PRINTM(p_model_alphaf_den);
     }
     p_model_alphaf = p_model_alphaf_num / p_model_alphaf_den;
     DEBUG_PRINTM(p_model_alphaf);
 //        p_model_alphaf = p_yf / (kf + p_lambda);   //equation for fast training
+
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+    for (auto it = p_scale_vars.begin();it != p_scale_vars.end();++it) {
+        (*it)->model_xf = p_model_xf;
+        (*it)->model_xf.set_stream((*it)->stream);
+        (*it)->model_alphaf = p_model_alphaf;
+        (*it)->model_alphaf.set_stream((*it)->stream);
+    }
+#endif
 }
 
 void KCF_Tracker::setTrackerPose(BBox_c &bbox, cv::Mat & img, int fit_size_x, int fit_size_y)
@@ -273,7 +287,7 @@ void KCF_Tracker::track(cv::Mat &img)
     if (p_resize_image) {
         cv::resize(input_gray, input_gray, cv::Size(0, 0), p_downscale_factor, p_downscale_factor, cv::INTER_AREA);
         cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_downscale_factor, p_downscale_factor, cv::INTER_AREA);
-    } else if (p_fit_to_pw2 && p_scale_factor_x != 1 && p_scale_factor_y != 1) {
+    } else if (p_fit_to_pw2 && fabs(p_scale_factor_x-1) > p_floating_error && fabs(p_scale_factor_y-1) > p_floating_error) {
         if (p_scale_factor_x < 1 && p_scale_factor_y < 1) {
             cv::resize(input_gray, input_gray, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
             cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
@@ -290,43 +304,47 @@ void KCF_Tracker::track(cv::Mat &img)
 
     if(m_use_multithreading) {
         std::vector<std::future<void>> async_res(p_scales.size());
-        for (size_t i = 0; i < p_scale_vars.size(); ++i) {
-            async_res[i] = std::async(std::launch::async,
-                                [this, &input_gray, &input_rgb, i]() -> void
-                                {return scale_track(this->p_scale_vars[i], input_rgb, input_gray, this->p_scales[i]);});
+        for (auto it = p_scale_vars.begin();it != p_scale_vars.end();++it) {
+            uint index = uint(std::distance(p_scale_vars.begin(), it));
+            async_res[index] = std::async(std::launch::async,
+                                [this, &input_gray, &input_rgb, index, it]() -> void
+                                {return scale_track(*(*it), input_rgb, input_gray, this->p_scales[index]);});
         }
-        for (size_t i = 0; i < p_scales.size(); ++i) {
-            async_res[i].wait();
-            if (this->p_scale_vars[i].max_response > max_response) {
-                max_response = this->p_scale_vars[i].max_response;
-                max_response_pt = & this->p_scale_vars[i].max_loc;
-                max_response_map = & this->p_scale_vars[i].response;
-                scale_index = i;
+        for (auto it = p_scale_vars.begin();it != p_scale_vars.end();++it) {
+            uint index = uint(std::distance(p_scale_vars.begin(), it));
+            async_res[index].wait();
+            if ((*it)->max_response > max_response) {
+                max_response = (*it)->max_response;
+                max_response_pt = & (*it)->max_loc;
+                max_response_map = & (*it)->response;
+                scale_index = int(index);
             }
         }
     } else {
-        int end =m_use_big_batch ? 2: p_num_scales;
-        int start = m_use_big_batch ? 1 : 0;
+        uint start = m_use_big_batch ? 1 : 0;
+        uint end = m_use_big_batch ? 2 : uint(p_num_scales);
 #pragma omp parallel for schedule(dynamic)
-        for (int i = start; i < end; ++i) {
-            scale_track(this->p_scale_vars[i], input_rgb, input_gray, this->p_scales[i]);
+        for (uint i = start; i < end; ++i) {
+            auto it = p_scale_vars.begin();
+            std::advance(it, i);
+            scale_track(*(*it), input_rgb, input_gray, this->p_scales[i]);
 
             if (m_use_big_batch) {
                 for (size_t j = 0;j<p_scales.size();++j) {
-                    if (this->p_scale_vars[i].max_responses[j] > max_response) {
-                        max_response = this->p_scale_vars[i].max_responses[j];
-                        max_response_pt = & this->p_scale_vars[i].max_locs[j];
-                        max_response_map = & this->p_scale_vars[i].response_maps[j];
-                        scale_index = j;
+                    if ((*it)->max_responses[j] > max_response) {
+                        max_response = (*it)->max_responses[j];
+                        max_response_pt = & (*it)->max_locs[j];
+                        max_response_map = & (*it)->response_maps[j];
+                        scale_index = int(j);
                     }
                 }
             } else {
 #pragma omp critical
                 {
-                    if (this->p_scale_vars[i].max_response > max_response) {
-                        max_response = this->p_scale_vars[i].max_response;
-                        max_response_pt = & this->p_scale_vars[i].max_loc;
-                        max_response_map = & this->p_scale_vars[i].response;
+                    if ((*it)->max_response > max_response) {
+                        max_response = (*it)->max_response;
+                        max_response_pt = & (*it)->max_loc;
+                        max_response_map = & (*it)->response;
                         scale_index = i;
                     }
                 }
@@ -350,8 +368,8 @@ void KCF_Tracker::track(cv::Mat &img)
         new_location = sub_pixel_peak(*max_response_pt, *max_response_map);
     DEBUG_PRINT(new_location);
 
-    p_pose.cx += p_current_scale*p_cell_size*new_location.x;
-    p_pose.cy += p_current_scale*p_cell_size*new_location.y;
+    p_pose.cx += p_current_scale*p_cell_size*double(new_location.x);
+    p_pose.cy += p_current_scale*p_cell_size*double(new_location.y);
     if (p_fit_to_pw2) {
         if (p_pose.cx < 0) p_pose.cx = 0;
         if (p_pose.cx > (img.cols*p_scale_factor_x)-1) p_pose.cx = (img.cols*p_scale_factor_x)-1;
@@ -365,7 +383,7 @@ void KCF_Tracker::track(cv::Mat &img)
     }
 
     //sub grid scale interpolation
-    double new_scale = p_scales[scale_index];
+    double new_scale = p_scales[uint(scale_index)];
     if (m_use_subgrid_scale)
         new_scale = sub_grid_scale(scale_index);
 
@@ -376,59 +394,75 @@ void KCF_Tracker::track(cv::Mat &img)
     if (p_current_scale > p_min_max_scale[1])
         p_current_scale = p_min_max_scale[1];
     //obtain a subwindow for training at newly estimated target position
-    p_scale_vars[0].patch_feats.clear();
-    get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size[0], p_windows_size[1], p_scale_vars[0], p_current_scale);
-    fft.forward_window(p_scale_vars[0].patch_feats, p_scale_vars[0].xf, p_scale_vars[0].fw_all, m_use_cuda ? p_scale_vars[0].data_features_d : nullptr);
+    p_scale_vars.front()->patch_feats.clear();
+    get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size[0], p_windows_size[1], *p_scale_vars.front(), p_current_scale);
+    fft.forward_window(p_scale_vars.front()->patch_feats, p_scale_vars.front()->xf, p_scale_vars.front()->fw_all,
+                                               m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
 
     //subsequent frames, interpolate model
-    p_model_xf = p_model_xf * (1. - p_interp_factor) + p_scale_vars[0].xf * p_interp_factor;
+    p_model_xf = p_model_xf *float((1. - p_interp_factor)) + p_scale_vars.front()->xf * float(p_interp_factor);
 
     ComplexMat alphaf_num, alphaf_den;
 
     if (m_use_linearkernel) {
-        ComplexMat xfconj = p_scale_vars[0].xf.conj();
+        ComplexMat xfconj = p_scale_vars.front()->xf.conj();
         alphaf_num = xfconj.mul(p_yf);
-        alphaf_den = (p_scale_vars[0].xf * xfconj);
+        alphaf_den = (p_scale_vars.front()->xf * xfconj);
     } else {
         //Kernel Ridge Regression, calculate alphas (in Fourier domain)
-        gaussian_correlation(p_scale_vars[0], p_scale_vars[0].xf, p_scale_vars[0].xf, p_kernel_sigma, true);
+        gaussian_correlation(*p_scale_vars.front(), p_scale_vars.front()->xf, p_scale_vars.front()->xf, p_kernel_sigma, true);
 //        ComplexMat alphaf = p_yf / (kf + p_lambda); //equation for fast training
 //        p_model_alphaf = p_model_alphaf * (1. - p_interp_factor) + alphaf * p_interp_factor;
-        alphaf_num = p_yf * p_scale_vars[0].kf;
-        alphaf_den = p_scale_vars[0].kf * (p_scale_vars[0].kf + p_lambda);
+        alphaf_num = p_yf * p_scale_vars.front()->kf;
+        alphaf_den = p_scale_vars.front()->kf * (p_scale_vars.front()->kf + float(p_lambda));
     }
 
-    p_model_alphaf_num = p_model_alphaf_num * (1. - p_interp_factor) + alphaf_num * p_interp_factor;
-    p_model_alphaf_den = p_model_alphaf_den * (1. - p_interp_factor) + alphaf_den * p_interp_factor;
+    p_model_alphaf_num = p_model_alphaf_num * float((1. - p_interp_factor)) + alphaf_num * float(p_interp_factor);
+    p_model_alphaf_den = p_model_alphaf_den * float((1. - p_interp_factor)) + alphaf_den * float(p_interp_factor);
     p_model_alphaf = p_model_alphaf_num / p_model_alphaf_den;
+
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+    for (auto it = p_scale_vars.begin(); it != p_scale_vars.end(); ++it) {
+        (*it)->model_xf = p_model_xf;
+        (*it)->model_xf.set_stream((*it)->stream);
+        (*it)->model_alphaf = p_model_alphaf;
+        (*it)->model_alphaf.set_stream((*it)->stream);
+    }
+#endif
 }
 
 void KCF_Tracker::scale_track(Scale_vars & vars, cv::Mat & input_rgb, cv::Mat & input_gray, double scale)
 {
     if (m_use_big_batch) {
         vars.patch_feats.clear();
-        for (int i = 0; i < p_num_scales; ++i) {
-            get_features(input_rgb, input_gray, this->p_pose.cx, this->p_pose.cy, this->p_windows_size[0], this->p_windows_size[1],
+        for (uint i = 0; i < uint(p_num_scales); ++i) {
+            get_features(input_rgb, input_gray, int(this->p_pose.cx), int(this->p_pose.cy), this->p_windows_size[0], this->p_windows_size[1],
                                         vars, this->p_current_scale * this->p_scales[i]);
         }
     } else {
         vars.patch_feats.clear();
-        get_features(input_rgb, input_gray, this->p_pose.cx, this->p_pose.cy, this->p_windows_size[0], this->p_windows_size[1],
+        get_features(input_rgb, input_gray, int(this->p_pose.cx), int(this->p_pose.cy), this->p_windows_size[0], this->p_windows_size[1],
                                     vars, this->p_current_scale * scale);
     }
 
-    fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features_d : nullptr);
+    fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features_d : nullptr, vars.stream);
     DEBUG_PRINTM(vars.zf);
 
     if (m_use_linearkernel) {
                 vars.kzf = m_use_big_batch ? (vars.zf.mul2(this->p_model_alphaf)).sum_over_channels() : (p_model_alphaf * vars.zf).sum_over_channels();
-                fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr);
+                fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
     } else {
+
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+        gaussian_correlation(vars, vars.zf, vars.model_xf, this->p_kernel_sigma);
+        vars.kzf = vars.model_alphaf * vars.kzf;
+#else
         gaussian_correlation(vars, vars.zf, this->p_model_xf, this->p_kernel_sigma);
         DEBUG_PRINTM(this->p_model_alphaf);
         DEBUG_PRINTM(vars.kzf);
         vars.kzf = m_use_big_batch ? vars.kzf.mul(this->p_model_alphaf) : this->p_model_alphaf * vars.kzf;
-        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr);
+#endif
+        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
     }
 
     DEBUG_PRINTM(vars.response);
@@ -466,8 +500,8 @@ void KCF_Tracker::scale_track(Scale_vars & vars, cv::Mat & input_rgb, cv::Mat &
 
 void KCF_Tracker::get_features(cv::Mat & input_rgb, cv::Mat & input_gray, int cx, int cy, int size_x, int size_y, Scale_vars &vars, double scale)
 {
-    int size_x_scaled = floor(size_x*scale);
-    int size_y_scaled = floor(size_y*scale);
+    int size_x_scaled = int(floor(size_x*scale));
+    int size_y_scaled = int(floor(size_y*scale));
 
     cv::Mat patch_gray = get_subwindow(input_gray, cx, cy, size_x_scaled, size_y_scaled);
     cv::Mat patch_rgb = get_subwindow(input_rgb, cx, cy, size_x_scaled, size_y_scaled);
@@ -511,7 +545,6 @@ void KCF_Tracker::get_features(cv::Mat & input_rgb, cv::Mat & input_gray, int cx
         std::vector<cv::Mat> cn_feat = CNFeat::extract(patch_rgb);
         color_feat.insert(color_feat.end(), cn_feat.begin(), cn_feat.end());
     }
-
     vars.patch_feats.insert(vars.patch_feats.end(), color_feat.begin(), color_feat.end());
     return;
 }
@@ -528,14 +561,14 @@ cv::Mat KCF_Tracker::gaussian_shaped_labels(double sigma, int dim1, int dim2)
         float * row_ptr = labels.ptr<float>(j);
         double y_s = y*y;
         for (int x = range_x[0], i = 0; x < range_x[1]; ++x, ++i){
-            row_ptr[i] = std::exp(-0.5 * (y_s + x*x) / sigma_s);//-1/2*e^((y^2+x^2)/sigma^2)
+            row_ptr[i] = float(std::exp(-0.5 * (y_s + x*x) / sigma_s));//-1/2*e^((y^2+x^2)/sigma^2)
         }
     }
 
     //rotate so that 1 is at top-left corner (see KCF paper for explanation)
 #ifdef CUFFT
     cv::Mat tmp = circshift(labels, range_x[0], range_y[0]);
-    tmp.copyTo(p_scale_vars[0].rot_labels);
+    tmp.copyTo(p_scale_vars.front()->rot_labels);
 
     assert(p_scale_vars[0].rot_labels.at<float>(0,0) >= 1.f - 1e-10f);
     return tmp;
@@ -618,10 +651,10 @@ cv::Mat KCF_Tracker::cosine_window_function(int dim1, int dim2)
     cv::Mat m1(1, dim1, CV_32FC1), m2(dim2, 1, CV_32FC1);
     double N_inv = 1./(static_cast<double>(dim1)-1.);
     for (int i = 0; i < dim1; ++i)
-        m1.at<float>(i) = 0.5*(1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv));
+        m1.at<float>(i) = float(0.5*(1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv)));
     N_inv = 1./(static_cast<double>(dim2)-1.);
     for (int i = 0; i < dim2; ++i)
-        m2.at<float>(i) = 0.5*(1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv));
+        m2.at<float>(i) = float(0.5*(1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv)));
     cv::Mat ret = m2*m1;
     return ret;
 }
@@ -641,7 +674,7 @@ cv::Mat KCF_Tracker::get_subwindow(const cv::Mat & input, int cx, int cy, int wi
     //out of image
     if (x1 >= input.cols || y1 >= input.rows || x2 < 0 || y2 < 0) {
         patch.create(height, width, input.type());
-        patch.setTo(0.f);
+        patch.setTo(double(0.f));
         return patch;
     }
 
@@ -699,12 +732,14 @@ void KCF_Tracker::gaussian_correlation(struct Scale_vars & vars, const ComplexMa
 #endif
     vars.xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
     DEBUG_PRINTM(vars.xyf);
-    fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features_d : nullptr);
+    fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features_d : nullptr, vars.stream);
 #ifdef CUFFT
     if(auto_correlation)
-        cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.xf_sqr_norm_d, sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width);
+        cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.xf_sqr_norm_d,
+                                                                sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
     else
-        cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.yf_sqr_norm_d, sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width);
+        cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.yf_sqr_norm_d,
+                                                                sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
 #else
     //ifft2 and sum over 3rd dimension, we dont care about individual channels
     DEBUG_PRINTM(vars.ifft2_res);
@@ -712,7 +747,7 @@ void KCF_Tracker::gaussian_correlation(struct Scale_vars & vars, const ComplexMa
     if (xf.channels() != p_num_scales*p_num_of_feats)
         xy_sum.create(vars.ifft2_res.size(), CV_32FC1);
     else
-        xy_sum.create(vars.ifft2_res.size(), CV_32FC(p_scales.size()));
+        xy_sum.create(vars.ifft2_res.size(), CV_32FC(int(p_scales.size())));
     xy_sum.setTo(0);
     for (int y = 0; y < vars.ifft2_res.rows; ++y) {
         float * row_ptr = vars.ifft2_res.ptr<float>(y);
@@ -730,14 +765,14 @@ void KCF_Tracker::gaussian_correlation(struct Scale_vars & vars, const ComplexMa
     cv::split(xy_sum,scales);
 
     float numel_xf_inv = 1.f/(xf.cols * xf.rows * (xf.channels()/xf.n_scales));
-    for (int i = 0; i < xf.n_scales; ++i){
-        cv::Mat in_roi(vars.in_all, cv::Rect(0, i*scales[0].rows, scales[0].cols, scales[0].rows));
-        cv::exp(- 1.f / (sigma * sigma) * cv::max((vars.xf_sqr_norm[i] + vars.yf_sqr_norm[0] - 2 * scales[i]) * numel_xf_inv, 0), in_roi);
+    for (uint i = 0; i < uint(xf.n_scales); ++i){
+        cv::Mat in_roi(vars.in_all, cv::Rect(0, int(i)*scales[0].rows, scales[0].cols, scales[0].rows));
+        cv::exp(- 1. / (sigma * sigma) * cv::max((double(vars.xf_sqr_norm[i] + vars.yf_sqr_norm[0]) - 2 * scales[i]) * double(numel_xf_inv), 0), in_roi);
         DEBUG_PRINTM(in_roi);
     }
 #endif
     DEBUG_PRINTM(vars.in_all);
-    fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res_d : nullptr);
+    fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res_d : nullptr, vars.stream);
     return;
 }
 
@@ -793,7 +828,7 @@ cv::Point2f KCF_Tracker::sub_pixel_peak(cv::Point & max_loc, cv::Mat & response)
     cv::Mat x;
     cv::solve(A, fval, x, cv::DECOMP_SVD);
 
-    double a = x.at<float>(0), b = x.at<float>(1), c = x.at<float>(2),
+    float a = x.at<float>(0), b = x.at<float>(1), c = x.at<float>(2),
            d = x.at<float>(3), e = x.at<float>(4);
 
     cv::Point2f sub_peak(max_loc.x, max_loc.y);
@@ -808,36 +843,44 @@ cv::Point2f KCF_Tracker::sub_pixel_peak(cv::Point & max_loc, cv::Mat & response)
 double KCF_Tracker::sub_grid_scale(int index)
 {
     cv::Mat A, fval;
-    if (index < 0 || index > (int)p_scales.size()-1) {
+    if (index < 0 || index > int(p_scales.size())-1) {
         // interpolate from all values
         // fit 1d quadratic function f(x) = a*x^2 + b*x + c
-        A.create(p_scales.size(), 3, CV_32FC1);
-        fval.create(p_scales.size(), 1, CV_32FC1);
-        for (size_t i = 0; i < p_scales.size(); ++i) {
-            A.at<float>(i, 0) = p_scales[i] * p_scales[i];
-            A.at<float>(i, 1) = p_scales[i];
-            A.at<float>(i, 2) = 1;
-            fval.at<float>(i) = m_use_big_batch ? p_scale_vars[1].max_responses[i] : p_scale_vars[i].max_response;
+        A.create(int(p_scales.size()), 3, CV_32FC1);
+        fval.create(int(p_scales.size()), 1, CV_32FC1);
+        for (auto it = p_scale_vars.begin(); it != p_scale_vars.end(); ++it) {
+            uint i = uint(std::distance(p_scale_vars.begin(), it));
+            int j = int(i);
+            A.at<float>(j, 0) = float(p_scales[i] * p_scales[i]);
+            A.at<float>(j, 1) = float(p_scales[i]);
+            A.at<float>(j, 2) = 1;
+            fval.at<float>(j) = m_use_big_batch ? float(p_scale_vars.back()->max_responses[i]) : float((*it)->max_response);
         }
     } else {
         //only from neighbours
-        if (index == 0 || index == (int)p_scales.size()-1)
-            return p_scales[index];
+        if (index == 0 || index == int(p_scales.size())-1)
+            return p_scales[uint(index)];
 
         A = (cv::Mat_<float>(3, 3) <<
-             p_scales[index-1] * p_scales[index-1], p_scales[index-1], 1,
-             p_scales[index] * p_scales[index], p_scales[index], 1,
-             p_scales[index+1] * p_scales[index+1], p_scales[index+1], 1);
-        fval = (cv::Mat_<float>(3, 1) << (m_use_big_batch ? p_scale_vars[1].max_responses[index-1] : p_scale_vars[index-1].max_response),
-                                                                        (m_use_big_batch ? p_scale_vars[1].max_responses[index] : p_scale_vars[index].max_response),
-                                                                        (m_use_big_batch ? p_scale_vars[1].max_responses[index+1] : p_scale_vars[index+1].max_response));
+             p_scales[uint(index)-1] * p_scales[uint(index)-1], p_scales[uint(index)-1], 1,
+             p_scales[uint(index)] * p_scales[uint(index)], p_scales[uint(index)], 1,
+             p_scales[uint(index)+1] * p_scales[uint(index)+1], p_scales[uint(index)+1], 1);
+        auto it1 = p_scale_vars.begin();
+        std::advance(it1, index-1);
+        auto it2 = p_scale_vars.begin();
+        std::advance(it2, index);
+        auto it3 = p_scale_vars.begin();
+        std::advance(it3, index+1);
+        fval = (cv::Mat_<float>(3, 1) << (m_use_big_batch ? p_scale_vars.back()->max_responses[uint(index)-1] : (*it1)->max_response),
+                                                                        (m_use_big_batch ? p_scale_vars.back()->max_responses[uint(index)] : (*it2)->max_response),
+                                                                        (m_use_big_batch ? p_scale_vars.back()->max_responses[uint(index)+1] : (*it3)->max_response));
     }
 
     cv::Mat x;
     cv::solve(A, fval, x, cv::DECOMP_SVD);
-    double a = x.at<float>(0), b = x.at<float>(1);
-    double scale = p_scales[index];
+    float a = x.at<float>(0), b = x.at<float>(1);
+    double scale = p_scales[uint(index)];
     if (a > 0 || a < 0)
-        scale = -b / (2 * a);
+        scale = double(-b / (2 * a));
     return scale;
 }
index 4ca5d6df75506306e6ed4ff5e44cd5458719169c..c61cecf65b9d40357dfd4eaa263d02304399c190 100644 (file)
--- a/src/kcf.h
+++ b/src/kcf.h
@@ -3,6 +3,7 @@
 
 #include <opencv2/opencv.hpp>
 #include <vector>
+#include <memory>
 #include "fhog.hpp"
 
 #ifdef CUFFT
@@ -44,7 +45,7 @@ struct BBox_c
 
     inline cv::Rect get_rect()
     {
-        return cv::Rect(cx-w/2., cy-h/2., w, h);
+        return cv::Rect(int(cx-w/2.), int(cy-h/2.), int(w), int(h));
     }
 
 };
@@ -108,6 +109,7 @@ private:
     const double p_downscale_factor = 0.5;
     double p_scale_factor_x = 1;
     double p_scale_factor_y = 1;
+    double p_floating_error = 0.0001;
 
     double p_padding = 1.5;
     double p_output_sigma_factor = 0.1;
@@ -127,7 +129,7 @@ private:
     int p_num_of_feats;
     int p_roi_height, p_roi_width;
 
-    std::vector<Scale_vars> p_scale_vars;
+    std::list<std::unique_ptr<Scale_vars>> p_scale_vars;
 
     //model
     ComplexMat p_yf;
index de976e24dcabb79c31b55ac538e5aa1dafe972d0..0e244e370786f872330cabc17e900a28bbe8a6f1 100644 (file)
@@ -289,6 +289,7 @@ void hog( float *M, float *O, float *H, int h, int w, int binSize,
   int nOrients, int softBin, bool full, float clip )
 {
   float *N, *R; const int hb=h/binSize, wb=w/binSize, nb=hb*wb;
+  (void) nb;
   // compute unnormalized gradient histograms
   R = (float*) wrCalloc(wb*hb*nOrients,sizeof(float));
   gradHist( M, O, R, h, w, binSize, nOrients, softBin, full );
index f070289a61625002f428e614fb792c388538335a..0aad3fb8a0f19f06a0c15d25340b980cc9a3b56f 100644 (file)
@@ -5,66 +5,80 @@
   #include "complexmat.cuh"
 #else
   #include "complexmat.hpp"
+// Placeholder cudaStream_t so that the FFTW and OpenCV FFT versions keep the same interface as the cuFFT version.
+  typedef int* cudaStream_t;
 #endif
 
 struct Scale_vars
 {
 public:
-    Scale_vars();
-    Scale_vars(int windows_size[2], int cell_size, int num_of_feats, int num_of_scales = 1,ComplexMat *model_xf = nullptr, ComplexMat *yf = nullptr,bool zero_index = false)
+    Scale_vars(int windows_size[2], int cell_size, int num_of_feats, int num_of_scales = 1,ComplexMat *model_xf = nullptr,
+                           ComplexMat *yf = nullptr,bool zero_index = false)
     {
-        double alloc_size;
-
+        uint alloc_size;
+        std::cout << __PRETTY_FUNCTION__ << std::endl;
 #ifdef CUFFT
-        if (zero_index)
+        if (zero_index) {
             cudaSetDeviceFlags(cudaDeviceMapHost);
+            this->zero_index = true;
+        }
 
-        alloc_size = windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_scales*sizeof(cufftReal);
-        CudaSafeCall(cudaHostAlloc((void**)&this->data_i_1ch, alloc_size, cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer((void**)&this->data_i_1ch_d, (void*)this->data_i_1ch, 0));
+#if defined(ASYNC) || defined(OPENMP)
+        CudaSafeCall(cudaStreamCreate(&this->stream));
+#endif
 
-        alloc_size = windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_feats*sizeof(cufftReal);
-        CudaSafeCall(cudaHostAlloc((void**)&this->data_i_features, alloc_size, cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer((void**)&this->data_i_features_d, (void*)this->data_i_features, 0));
+        alloc_size = uint(windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_scales)*sizeof(cufftReal);
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->data_i_1ch), alloc_size, cudaHostAllocMapped));
+        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->data_i_1ch_d), reinterpret_cast<void*>(this->data_i_1ch), 0));
+
+        alloc_size = uint(windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_feats)*sizeof(cufftReal);
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->data_i_features), alloc_size, cudaHostAllocMapped));
+        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->data_i_features_d), reinterpret_cast<void*>(this->data_i_features), 0));
 
         this->ifft2_res = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC(num_of_feats), this->data_i_features);
         this->response = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC(num_of_scales), this->data_i_1ch);
 
-        this->zf = ComplexMat(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, num_of_scales);
-        this->kzf = ComplexMat(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales);
-        this->kf = ComplexMat(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales);
+        this->zf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, num_of_scales, this->stream);
+        std::cout << this->zf.stream << std::endl;
+        std::cout << this->zf.n_scales << std::endl;
+        this->kzf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
+        std::cout << this->kzf.stream << std::endl;
+        std::cout << this->kzf.n_scales << std::endl;
+        this->kf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
+        std::cout << this->kf.stream << std::endl;
+        std::cout << this->kf.n_scales << std::endl << std::endl;
 
-        alloc_size = num_of_scales;
+        alloc_size = uint(num_of_scales);
 
-        CudaSafeCall(cudaHostAlloc((void**)&this->xf_sqr_norm, alloc_size*sizeof(float), cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer((void**)&this->xf_sqr_norm_d, (void*)this->xf_sqr_norm, 0));
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->xf_sqr_norm), alloc_size*sizeof(float), cudaHostAllocMapped));
+        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->xf_sqr_norm_d), reinterpret_cast<void*>(this->xf_sqr_norm), 0));
 
-        CudaSafeCall(cudaHostAlloc((void**)&this->yf_sqr_norm, sizeof(float), cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer((void**)&this->yf_sqr_norm_d, (void*)this->yf_sqr_norm, 0));
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->yf_sqr_norm), sizeof(float), cudaHostAllocMapped));
+        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->yf_sqr_norm_d), reinterpret_cast<void*>(this->yf_sqr_norm), 0));
 
-        alloc_size =(windows_size[0]/cell_size)*(windows_size[1]/cell_size)*alloc_size*sizeof(float);
-        CudaSafeCall(cudaHostAlloc((void**)&this->gauss_corr_res, alloc_size, cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer((void**)&this->gauss_corr_res_d, (void*)this->gauss_corr_res, 0));
+        alloc_size =uint((windows_size[0]/cell_size)*(windows_size[1]/cell_size))*alloc_size*sizeof(float);
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->gauss_corr_res), alloc_size, cudaHostAllocMapped));
+        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->gauss_corr_res_d), reinterpret_cast<void*>(this->gauss_corr_res), 0));
         this->in_all = cv::Mat(windows_size[1]/cell_size*num_of_scales, windows_size[0]/cell_size, CV_32F, this->gauss_corr_res_d);
 
         if (zero_index) {
-            alloc_size = (windows_size[0]/cell_size)*(windows_size[1]/cell_size)*sizeof(float);
-            CudaSafeCall(cudaHostAlloc((void**)&this->rot_labels_data, alloc_size, cudaHostAllocMapped));
-            CudaSafeCall(cudaHostGetDevicePointer((void**)&this->rot_labels_data_d, (void*)this->rot_labels_data, 0));
+            alloc_size = uint((windows_size[0]/cell_size)*(windows_size[1]/cell_size))*sizeof(float);
+            CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->rot_labels_data), alloc_size, cudaHostAllocMapped));
+            CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->rot_labels_data_d), reinterpret_cast<void*>(this->rot_labels_data), 0));
             this->rot_labels = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC1, this->rot_labels_data);
         }
 
-        alloc_size = (windows_size[0]/cell_size)*((windows_size[1]/cell_size)*num_of_feats)*sizeof(cufftReal);
-        CudaSafeCall(cudaHostAlloc((void**)&this->data_features, alloc_size, cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer((void**)&this->data_features_d, (void*)this->data_features, 0));
+        alloc_size = uint((windows_size[0]/cell_size)*((windows_size[1]/cell_size)*num_of_feats))*sizeof(cufftReal);
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->data_features), alloc_size, cudaHostAllocMapped));
+        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->data_features_d), reinterpret_cast<void*>(this->data_features), 0));
         this->fw_all = cv::Mat((windows_size[1]/cell_size)*num_of_feats, windows_size[0]/cell_size, CV_32F, this->data_features);
 #else
-        alloc_size = num_of_scales;
+        alloc_size = uint(num_of_scales);
 
-        this->xf_sqr_norm = (float*) malloc(alloc_size*sizeof(float));
-        this->yf_sqr_norm = (float*) malloc(sizeof(float));
+        this->xf_sqr_norm = reinterpret_cast<float*>(malloc(alloc_size*sizeof(float)));
+        this->yf_sqr_norm = reinterpret_cast<float*>(malloc(sizeof(float)));
 
-        this->patch_feats.reserve(num_of_feats);
+        this->patch_feats.reserve(uint(num_of_feats));
 
         int height = windows_size[1]/cell_size;
 #ifdef FFTW
@@ -91,11 +105,15 @@ public:
             model_xf->create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats);
             yf->create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, 1);
             //We use scale_vars[0] for updating the tracker, so we only allocate memory for  its xf only.
+#ifdef CUFFT
+            this->xf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, this->stream);
+#else
             this->xf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats);
+#endif
         } else if (num_of_scales > 1) {
-            this->max_responses.reserve(num_of_scales);
-            this->max_locs.reserve(num_of_scales);
-            this->response_maps.reserve(num_of_scales);
+            this->max_responses.reserve(uint(num_of_scales));
+            this->max_locs.reserve(uint(num_of_scales));
+            this->response_maps.reserve(uint(num_of_scales));
         }
 #else
         if (zero_index) {
@@ -106,6 +124,26 @@ public:
 #endif
     }
 
+    ~Scale_vars() {
+#ifdef CUFFT
+        std::cout << __PRETTY_FUNCTION__ << std::endl;
+        CudaSafeCall(cudaFreeHost(this->xf_sqr_norm));
+        CudaSafeCall(cudaFreeHost(this->yf_sqr_norm));
+        CudaSafeCall(cudaFreeHost(this->data_i_1ch));
+        CudaSafeCall(cudaFreeHost(this->data_i_features));
+        CudaSafeCall(cudaFreeHost(this->gauss_corr_res));
+        if (zero_index)
+            CudaSafeCall(cudaFreeHost(this->rot_labels_data));
+        CudaSafeCall(cudaFreeHost(this->data_features));
+#if defined(ASYNC) || defined(OPENMP)
+        CudaSafeCall(cudaStreamDestroy(this->stream));
+#endif
+#else
+        free(this->xf_sqr_norm);
+        free(this->yf_sqr_norm);
+#endif
+    }
+
     float *xf_sqr_norm = nullptr, *yf_sqr_norm = nullptr;
     std::vector<cv::Mat> patch_feats;
 
@@ -118,6 +156,9 @@ public:
               *rot_labels_data_d = nullptr, *data_features = nullptr, *data_features_d = nullptr;
     float *data_f = nullptr, *data_i_features = nullptr, *data_i_features_d = nullptr, *data_i_1ch = nullptr, *data_i_1ch_d = nullptr;
 
+    cudaStream_t stream = nullptr;
+    ComplexMat model_alphaf, model_xf;
+
     //Big batch variables
     cv::Point2i max_loc;
     double max_val, max_response;
@@ -125,6 +166,7 @@ public:
     std::vector<double> max_responses;
     std::vector<cv::Point2i> max_locs;
     std::vector<cv::Mat> response_maps;
+    bool zero_index = false;
 };
 
 #endif // SCALE_VARS_HPP