From: Shanigen <vkaraf@gmail.com>
Date: Tue, 10 Apr 2018 13:00:23 +0000 (+0200)
Subject: Removed zero copy for complexmat now it is using normal device memory. Added zero... 
X-Git-Url: https://rtime.felk.cvut.cz/gitweb/hercules2020/kcf.git/commitdiff_plain/45cef79cfa28703cd44eb96e95d816b06f46b4a6

Removed zero copy for complexmat now it is using normal device memory. Added zero copy memory to cufft and minimized memory copy from device to host and host to device.
---

diff --git a/Makefile b/Makefile
index 7ae150a..82d47c6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,25 +1,26 @@
 # Makefile to build all the available variants
 
-BUILDS = opencvfft-st opencvfft-async fftw cufftw fftw_openmp fftw_async fftw_big fftw_big_openmp cufft cufft_big cufft_big_openmp
+BUILDS = opencvfft-st opencvfft-async fftw fftw_openmp fftw_async fftw_big fftw_big_openmp cufftw cufftw_big cufftw_big_openmp cufft cufft_big cufft_big_openmp
 
 all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot)
 
 CMAKE_OPTS += -G Ninja
 #CMAKE_OPTS += = -DOpenCV_DIR=
 
-CMAKE_OTPS_opencvfft-st    = -DFFT=OpenCV
-CMAKE_OTPS_opencvfft-async = -DFFT=OpenCV -DASYNC=ON
+CMAKE_OTPS_opencvfft-st      = -DFFT=OpenCV
+CMAKE_OTPS_opencvfft-async   = -DFFT=OpenCV -DASYNC=ON
 #CMAKE_OTPS_opencv-cufft    = -DFFT=OpenCV_cuFFT
-CMAKE_OTPS_fftw            = -DFFT=fftw
-CMAKE_OTPS_cufftw          = -DFFT=cuFFTW
-CMAKE_OTPS_fftw_openmp     = -DFFT=fftw -DOPENMP=ON
-CMAKE_OTPS_fftw_async      = -DFFT=fftw -DASYNC=ON
-CMAKE_OTPS_fftw_big        = -DFFT=fftw -DBIG_BATCH=ON
-CMAKE_OTPS_fftw_big_openmp = -DFFT=fftw -DBIG_BATCH=ON -DOPENMP=ON
-CMAKE_OTPS_cufft           = -DFFT=cuFFT
-CMAKE_OTPS_cufft_big       = -DFFT=cuFFT -DBIG_BATCH=ON
-CMAKE_OTPS_cufft_big_openmp= = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
-
+CMAKE_OTPS_fftw              = -DFFT=fftw
+CMAKE_OTPS_fftw_openmp       = -DFFT=fftw -DOPENMP=ON
+CMAKE_OTPS_fftw_async        = -DFFT=fftw -DASYNC=ON
+CMAKE_OTPS_fftw_big          = -DFFT=fftw -DBIG_BATCH=ON
+CMAKE_OTPS_fftw_big_openmp   = -DFFT=fftw -DBIG_BATCH=ON -DOPENMP=ON
+CMAKE_OTPS_cufftw            = -DFFT=cuFFTW
+CMAKE_OTPS_cufftw_big        = -DFFT=cuFFTW -DBIG_BATCH=ON
+CMAKE_OTPS_cufftw_big_openmp = -DFFT=cuFFTW -DBIG_BATCH=ON -DOPENMP=ON
+CMAKE_OTPS_cufft             = -DFFT=cuFFT
+CMAKE_OTPS_cufft_big         = -DFFT=cuFFT -DBIG_BATCH=ON
+CMAKE_OTPS_cufft_big_openmp  = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
 
 build-%/kcf_vot: $(shell git ls-files)
 	mkdir -p $(@D)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 99ed56e..bbe1ece 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -62,7 +62,7 @@ IF(use_cuda)
   set(CUDA_SEPARABLE_COMPILATION ON)
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
   set(CUDA_HOST_COMPILER /usr/bin/g++)
-  list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11)
+  list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11 -default-stream per-thread)
   find_cuda_helper_libs(cufftw)
   IF(FFT STREQUAL "cuFFT")
   cuda_add_library(complexmat complexmat.cu)
diff --git a/src/complexmat.cu b/src/complexmat.cu
index 0209c3c..94b90a1 100644
--- a/src/complexmat.cu
+++ b/src/complexmat.cu
@@ -1,8 +1,8 @@
 #include "complexmat.cuh"
 
-__global__ void sqr_norm_kernel(int n, double* out, float* data, float rows, float cols)
+__global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, float cols)
 {
-    extern __shared__ double sdata[];
+    extern __shared__ float sdata[];
     int i = blockDim.x * threadIdx.y + threadIdx.x;
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -27,32 +27,16 @@ __global__ void sqr_norm_kernel(int n, double* out, float* data, float rows, flo
     }
 }
 
-float* ComplexMat::sqr_norm() const
+void ComplexMat::sqr_norm(float *result) const
 {
-    
-    double *sums_sqr_norms_from_d_d, *sums_sqr_norms_d_d;
-    float *sums_sqr_norms_from_d_f;
-    sums_sqr_norms_from_d_d = (double*) malloc(n_scales*sizeof(double));
-    sums_sqr_norms_from_d_f = (float*) malloc(n_scales*sizeof(float));
-    cudaMalloc(&sums_sqr_norms_d_d, n_scales*sizeof(double));
-    cudaMemset(sums_sqr_norms_d_d, 0, n_scales*sizeof(double));
+    cudaMemset(result, 0, n_scales*sizeof(float));
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
     
-    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(double)>>>(n_channels/n_scales, sums_sqr_norms_d_d, p_data_d, rows, cols);
-    
-    cudaError_t error = cudaGetLastError(); if(error != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); }
-    
-    cudaMemcpy(sums_sqr_norms_from_d_d, sums_sqr_norms_d_d, n_scales*sizeof(double), cudaMemcpyDeviceToHost);
-    cudaFree(sums_sqr_norms_d_d);
-    
-    for(int i = 0; i < n_scales; ++i){
-        sums_sqr_norms_from_d_f[i] = sums_sqr_norms_from_d_d[i];
-    }
-    free(sums_sqr_norms_from_d_d);
+    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
         
-    return sums_sqr_norms_from_d_f;
+    return;
 }
 
 __global__ void sqr_mag_kernel(float* data, float* result)
@@ -70,7 +54,7 @@ ComplexMat ComplexMat::sqr_mag() const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, result.p_data_d);
+    sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
     
     return result;
 }
@@ -90,7 +74,7 @@ ComplexMat ComplexMat::conj() const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, result.p_data_d);  
+    conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);  
     return result;
 }
 
@@ -103,7 +87,7 @@ ComplexMat ComplexMat::sum_over_channels() const
 
 cufftComplex* ComplexMat::get_p_data() const
 {
-    return (cufftComplex*) p_data_d;
+    return (cufftComplex*) p_data;
 }
 
 __global__ void same_num_channels_mul_kernel(float* data_l, float* data_r, float* result)
@@ -124,7 +108,7 @@ ComplexMat ComplexMat::operator*(const ComplexMat & rhs) const
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
 
     return result;
 }
@@ -148,7 +132,7 @@ ComplexMat ComplexMat::operator/(const ComplexMat & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
 
     return result;
 }
@@ -170,7 +154,7 @@ ComplexMat ComplexMat::operator+(const ComplexMat & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
     
     return result;
 }
@@ -190,7 +174,7 @@ ComplexMat ComplexMat::operator*(const float & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs, result.p_data_d);
+    constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
 
     return result;
 }
@@ -210,7 +194,7 @@ ComplexMat ComplexMat::operator+(const float & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs, result.p_data_d);
+    constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
 
     return result;
 }
@@ -234,7 +218,7 @@ ComplexMat ComplexMat::mul(const ComplexMat & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
     
     return result;
 }
@@ -258,7 +242,7 @@ ComplexMat ComplexMat::mul2(const ComplexMat & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
     
     return result;
 }
@@ -271,7 +255,6 @@ void ComplexMat::operator=(ComplexMat & rhs)
     n_scales = rhs.n_scales;
     
     p_data = rhs.p_data;
-    p_data_d = rhs.p_data_d;
     
 }
 
@@ -283,8 +266,6 @@ void ComplexMat::operator=(ComplexMat && rhs)
     n_scales = rhs.n_scales;
     
     p_data = rhs.p_data;
-    p_data_d = rhs.p_data_d;
     
     rhs.p_data = nullptr;
-    rhs.p_data_d = nullptr;
 }
diff --git a/src/complexmat.cuh b/src/complexmat.cuh
index 7ebbe43..e97209f 100644
--- a/src/complexmat.cuh
+++ b/src/complexmat.cuh
@@ -16,14 +16,12 @@ public:
     ComplexMat() : cols(0), rows(0), n_channels(0) {}
     ComplexMat(int _rows, int _cols, int _n_channels) : cols(_cols), rows(_rows), n_channels(_n_channels)
     {
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
     }
     
     ComplexMat(int _rows, int _cols, int _n_channels, int _n_scales) : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales)
     {
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
     }
     
     ComplexMat(ComplexMat &&other)
@@ -35,12 +33,11 @@ public:
         p_data = other.p_data;
         
         other.p_data = nullptr;
-        other.p_data_d = nullptr;
     }
     
     ~ComplexMat()
     {
-        if(p_data != nullptr) cudaFreeHost(p_data);
+        if(p_data != nullptr) cudaFree(p_data);
     }
 
     void create(int _rows, int _cols, int _n_channels)
@@ -48,8 +45,7 @@ public:
         rows = _rows;
         cols = _cols;
         n_channels = _n_channels;
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
     }
 
     void create(int _rows, int _cols, int _n_channels, int _n_scales)
@@ -58,15 +54,14 @@ public:
         cols = _cols;
         n_channels = _n_channels;
         n_scales = _n_scales;
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
     }
     // cv::Mat API compatibility
     cv::Size size() { return cv::Size(cols, rows); }
     int channels() { return n_channels; }
     int channels() const { return n_channels; }
 
-    float* sqr_norm() const;
+    void sqr_norm(float *result) const;
     
     ComplexMat sqr_mag() const;
 
@@ -110,5 +105,5 @@ public:
 
 
 private:
-    mutable float *p_data = nullptr, *p_data_d = nullptr;
+    mutable float *p_data = nullptr;
 };
\ No newline at end of file
diff --git a/src/complexmat.hpp b/src/complexmat.hpp
index 943e3f4..344c479 100644
--- a/src/complexmat.hpp
+++ b/src/complexmat.hpp
@@ -60,10 +60,8 @@ public:
     }
 
 
-    T* sqr_norm() const
+    void sqr_norm(T *sums_sqr_norms) const
     {
-        T* sums_sqr_norms;
-        sums_sqr_norms = (T*) malloc(n_scales*sizeof(T));
         int n_channels_per_scale = n_channels/n_scales;
         int scale_offset = n_channels_per_scale*rows*cols;
         T sum_sqr_norm;
@@ -74,7 +72,7 @@ public:
                     sum_sqr_norm += lhs->real()*lhs->real() + lhs->imag()*lhs->imag();
             sums_sqr_norms[scale] = sum_sqr_norm/static_cast<T>(cols*rows);
         }
-        return sums_sqr_norms;
+        return;
     }
 
     ComplexMat_<T> sqr_mag() const
diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp
index 7ca6957..c94d61f 100644
--- a/src/fft_cufft.cpp
+++ b/src/fft_cufft.cpp
@@ -72,9 +72,7 @@ static const char *_cudaGetErrorEnum(cufftResult error)
 }
 
 cuFFT::cuFFT(): m_num_of_streams(4)
-{
-    cudaSetDeviceFlags(cudaDeviceMapHost);
-}
+{}
 
 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
 {
@@ -85,8 +83,6 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     m_big_batch_mode = big_batch_mode;
 
     std::cout << "FFT: cuFFT" << std::endl;
-
-//     cudaSetDeviceFlags(cudaDeviceMapHost);
     
 
     for (unsigned i = 0; i < m_num_of_streams; i++) cudaStreamCreate(&streams[i]);
@@ -96,6 +92,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
         cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal));
         
         cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C);
+        
+        
     }
     //FFT forward all scales
     if(m_num_of_scales > 1 && m_big_batch_mode)
@@ -107,7 +105,7 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
 	int howmany = m_num_of_scales;
 	int idist = m_height*m_width, odist = m_height*(m_width/2+1);
 	int istride = 1, ostride = 1;
-	int *inembed = NULL, *onembed = NULL;
+	int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
 
 	CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_f_all_scales, rank, n,
 		      inembed, istride, idist,
@@ -116,14 +114,15 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     }
     //FFT forward window one scale
     {
-        cudaMalloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_fw_d, data_fw, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
         int howmany = m_num_of_feats;
         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
         int istride = 1, ostride = 1;
-        int *inembed = NULL, *onembed = NULL;
+        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
 
         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw, rank, n,
 		  inembed, istride, idist,
@@ -133,23 +132,27 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     //FFT forward window all scales all feats
     if(m_num_of_scales > 1 && m_big_batch_mode)
     {
-        cudaMalloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal));
-        
+        cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0);
+
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
         int howmany = m_num_of_scales*m_num_of_feats;
         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
         int istride = 1, ostride = 1;
-        int *inembed = NULL, *onembed = NULL;
+        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
 
         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw_all_scales, rank, n,
 		  inembed, istride, idist,
 		  onembed, ostride, odist,
 		  CUFFT_R2C, howmany));
+        
+        
     }
     //FFT inverse one scale
     {
-        cudaMalloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
@@ -166,7 +169,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     //FFT inverse all scales
     if(m_num_of_scales > 1)
     {
-        cudaMalloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
@@ -182,7 +186,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     }
     //FFT inverse one channel one scale
     {
-        cudaMalloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
@@ -199,7 +204,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     //FFT inverse one channel all scales
     if(m_num_of_scales > 1 && m_big_batch_mode)
     {
-        cudaMalloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
@@ -241,24 +247,27 @@ ComplexMat cuFFT::forward(const cv::Mat &input)
 ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
 {
     int n_channels = input.size();
-    cv::Mat in_all(m_height * n_channels, m_width, CV_32F);
-    for (int i = 0; i < n_channels; ++i) {
-        cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
-        in_roi = input[i].mul(m_window);
-    }
     ComplexMat result;
     if(n_channels > (int) m_num_of_feats){
+        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
+        for (int i = 0; i < n_channels; ++i) {
+            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
+            in_roi = input[i].mul(m_window);
+        }
+        
         result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
+        
+        CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
     } else {
+        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
+        for (int i = 0; i < n_channels; ++i) {
+            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
+            in_roi = input[i].mul(m_window);
+        }
+        
         result.create(m_height, m_width/2 + 1, n_channels);
-    }
-
-    if (n_channels <= (int) m_num_of_feats){
-        cudaMemcpy(data_fw, in_all.ptr<cufftReal>(), m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
-        CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw), result.get_p_data()));
-    } else {
-       cudaMemcpy(data_fw_all_scales, in_all.ptr<cufftReal>(), m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
-       CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales), result.get_p_data()));
+        
+        CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
     }
     return result;
 }
@@ -266,22 +275,36 @@ ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
 cv::Mat cuFFT::inverse(const ComplexMat &inputf)
 {
     int n_channels = inputf.n_channels;
-    cv::Mat real_result(m_height, m_width, CV_32FC(n_channels));
     cufftComplex *in = reinterpret_cast<cufftComplex*>(inputf.get_p_data());
-
+    
     if(n_channels == 1){
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+        cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
+        
+        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
+        cudaDeviceSynchronize();
+        
+        return real_result/(m_width*m_height);
     } else if(n_channels == (int) m_num_of_scales){
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_1ch_all_scales, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
+        
+        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
+        cudaDeviceSynchronize();
+        
+        return real_result/(m_width*m_height);
     } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_features_all_scales, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
-    } else {
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_features, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
+        
+        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
+        cudaDeviceSynchronize();
+        
+        return real_result/(m_width*m_height);
     }
+    
+    cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
+    
+    CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
+    cudaDeviceSynchronize();
+    
     return real_result/(m_width*m_height);
 }
 
@@ -301,12 +324,12 @@ cuFFT::~cuFFT()
   
   cudaFree(data_f);
   cudaFree(data_f_all_scales);
-  cudaFree(data_fw);
-  cudaFree(data_fw_all_scales);
-  cudaFree(data_i_1ch);
-  cudaFree(data_i_1ch_all_scales);
-  cudaFree(data_i_features);
-  cudaFree(data_i_features_all_scales);
+  cudaFreeHost(data_fw);
+  cudaFreeHost(data_fw_all_scales);
+  cudaFreeHost(data_i_1ch);
+  cudaFreeHost(data_i_1ch_all_scales);
+  cudaFreeHost(data_i_features);
+  cudaFreeHost(data_i_features_all_scales);
   
   cudaDeviceReset();
 }
diff --git a/src/fft_cufft.h b/src/fft_cufft.h
index df91b7f..a31285b 100644
--- a/src/fft_cufft.h
+++ b/src/fft_cufft.h
@@ -32,7 +32,8 @@ private:
     cudaStream_t streams[4];
     cufftHandle plan_f, plan_f_all_scales, plan_fw, plan_fw_all_scales, plan_i_features,
      plan_i_features_all_scales, plan_i_1ch, plan_i_1ch_all_scales;
-    float *data_f, *data_f_all_scales, *data_fw, *data_fw_all_scales, *data_i_features, *data_i_features_all_scales, *data_i_1ch, *data_i_1ch_all_scales;
+    float *data_f, *data_f_all_scales, *data_fw, *data_fw_d, *data_fw_all_scales, *data_fw_all_scales_d, *data_i_features, *data_i_features_d,
+          *data_i_features_all_scales, *data_i_features_all_scales_d, *data_i_1ch, *data_i_1ch_d, *data_i_1ch_all_scales, *data_i_1ch_all_scales_d;
 };
 
 #endif // FFT_CUDA_H
diff --git a/src/kcf.cpp b/src/kcf.cpp
index 94895be..710ed5b 100644
--- a/src/kcf.cpp
+++ b/src/kcf.cpp
@@ -33,6 +33,13 @@ KCF_Tracker::KCF_Tracker()
 KCF_Tracker::~KCF_Tracker()
 {
     delete &fft;
+#ifdef CUFFT
+    cudaFreeHost(xf_sqr_norm);
+    cudaFreeHost(yf_sqr_norm);
+#else
+    free(xf_sqr_norm);
+    free(yf_sqr_norm);
+#endif
 }
 
 void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox)
@@ -99,6 +106,18 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox)
             p_scales.push_back(std::pow(p_scale_step, i));
     else
         p_scales.push_back(1.);
+    
+#ifdef CUFFT
+    cudaSetDeviceFlags(cudaDeviceMapHost);
+    cudaHostAlloc((void**)&xf_sqr_norm, p_scales.size()*sizeof(float), cudaHostAllocMapped);
+    cudaHostGetDevicePointer((void**)&xf_sqr_norm_d, (void*)xf_sqr_norm, 0);
+    
+    cudaHostAlloc((void**)&yf_sqr_norm, sizeof(float), cudaHostAllocMapped);
+    cudaHostGetDevicePointer((void**)&yf_sqr_norm_d, (void*)yf_sqr_norm, 0);
+#else
+    xf_sqr_norm = (float*) malloc(p_scales.size()*sizeof(float));
+    xf_sqr_norm = (float*) malloc(sizeof(float));
+#endif
 
     p_current_scale = 1.;
 
@@ -594,8 +613,20 @@ cv::Mat KCF_Tracker::get_subwindow(const cv::Mat &input, int cx, int cy, int wid
 
 ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const ComplexMat &yf, double sigma, bool auto_correlation)
 {
-    float* xf_sqr_norm = xf.sqr_norm();
-    float* yf_sqr_norm = auto_correlation ? xf_sqr_norm : yf.sqr_norm();
+#ifdef CUFFT
+    xf.sqr_norm(xf_sqr_norm_d);
+#else
+    xf.sqr_norm(xf_sqr_norm);
+#endif
+    if(auto_correlation){
+        yf_sqr_norm = xf_sqr_norm;
+    } else {
+#ifdef CUFFT
+        yf.sqr_norm(yf_sqr_norm_d);
+#else
+        yf.sqr_norm(yf_sqr_norm);
+#endif
+    }
 
     ComplexMat xyf;
     xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
@@ -633,9 +664,6 @@ ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const Complex
     DEBUG_PRINTM(in_roi);
     }
 
-    free(xf_sqr_norm);
-    if(!auto_correlation)free(yf_sqr_norm);
-
     DEBUG_PRINTM(in_all);
     return fft.forward(in_all);
 }
diff --git a/src/kcf.h b/src/kcf.h
index 104de2d..4e374be 100644
--- a/src/kcf.h
+++ b/src/kcf.h
@@ -107,6 +107,10 @@ private:
 
     //for big batch
     int num_of_feats;
+    float *xf_sqr_norm, *yf_sqr_norm;
+#ifdef CUFFT
+    float *xf_sqr_norm_d, *yf_sqr_norm_d;
+#endif
 
     //model
     ComplexMat p_yf;