]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/commitdiff
Removed zero copy for complexmat now it is using normal device memory. Added zero...
authorShanigen <vkaraf@gmail.com>
Tue, 10 Apr 2018 13:00:23 +0000 (15:00 +0200)
committerShanigen <vkaraf@gmail.com>
Tue, 10 Apr 2018 13:00:23 +0000 (15:00 +0200)
Makefile
src/CMakeLists.txt
src/complexmat.cu
src/complexmat.cuh
src/complexmat.hpp
src/fft_cufft.cpp
src/fft_cufft.h
src/kcf.cpp
src/kcf.h

index 7ae150a8cf34745f64d3085398ea3c74fd89b1c4..82d47c6b52e69c57102b35d14e8dab5924f5b52b 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,25 +1,26 @@
 # Makefile to build all the available variants
 
-BUILDS = opencvfft-st opencvfft-async fftw cufftw fftw_openmp fftw_async fftw_big fftw_big_openmp cufft cufft_big cufft_big_openmp
+BUILDS = opencvfft-st opencvfft-async fftw fftw_openmp fftw_async fftw_big fftw_big_openmp cufftw cufftw_big cufftw_big_openmp cufft cufft_big cufft_big_openmp
 
 all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot)
 
 CMAKE_OPTS += -G Ninja
 #CMAKE_OPTS += = -DOpenCV_DIR=
 
-CMAKE_OTPS_opencvfft-st    = -DFFT=OpenCV
-CMAKE_OTPS_opencvfft-async = -DFFT=OpenCV -DASYNC=ON
+CMAKE_OTPS_opencvfft-st      = -DFFT=OpenCV
+CMAKE_OTPS_opencvfft-async   = -DFFT=OpenCV -DASYNC=ON
 #CMAKE_OTPS_opencv-cufft    = -DFFT=OpenCV_cuFFT
-CMAKE_OTPS_fftw            = -DFFT=fftw
-CMAKE_OTPS_cufftw          = -DFFT=cuFFTW
-CMAKE_OTPS_fftw_openmp     = -DFFT=fftw -DOPENMP=ON
-CMAKE_OTPS_fftw_async      = -DFFT=fftw -DASYNC=ON
-CMAKE_OTPS_fftw_big        = -DFFT=fftw -DBIG_BATCH=ON
-CMAKE_OTPS_fftw_big_openmp = -DFFT=fftw -DBIG_BATCH=ON -DOPENMP=ON
-CMAKE_OTPS_cufft           = -DFFT=cuFFT
-CMAKE_OTPS_cufft_big       = -DFFT=cuFFT -DBIG_BATCH=ON
-CMAKE_OTPS_cufft_big_openmp= = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
-
+CMAKE_OTPS_fftw              = -DFFT=fftw
+CMAKE_OTPS_fftw_openmp       = -DFFT=fftw -DOPENMP=ON
+CMAKE_OTPS_fftw_async        = -DFFT=fftw -DASYNC=ON
+CMAKE_OTPS_fftw_big          = -DFFT=fftw -DBIG_BATCH=ON
+CMAKE_OTPS_fftw_big_openmp   = -DFFT=fftw -DBIG_BATCH=ON -DOPENMP=ON
+CMAKE_OTPS_cufftw            = -DFFT=cuFFTW
+CMAKE_OTPS_cufftw_big        = -DFFT=cuFFTW -DBIG_BATCH=ON
+CMAKE_OTPS_cufftw_big_openmp = -DFFT=cuFFTW -DBIG_BATCH=ON -DOPENMP=ON
+CMAKE_OTPS_cufft             = -DFFT=cuFFT
+CMAKE_OTPS_cufft_big         = -DFFT=cuFFT -DBIG_BATCH=ON
+CMAKE_OTPS_cufft_big_openmp  = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
 
 build-%/kcf_vot: $(shell git ls-files)
        mkdir -p $(@D)
index 99ed56ee7f7e90a1cd7bf1ddd54d1832fd8e663c..bbe1ece6bcbca0396e644e6505f4296f7c65546d 100644 (file)
@@ -62,7 +62,7 @@ IF(use_cuda)
   set(CUDA_SEPARABLE_COMPILATION ON)
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
   set(CUDA_HOST_COMPILER /usr/bin/g++)
-  list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11)
+  list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11 -default-stream per-thread)
   find_cuda_helper_libs(cufftw)
   IF(FFT STREQUAL "cuFFT")
   cuda_add_library(complexmat complexmat.cu)
index 0209c3c6d313778e019acae0aa5d9c46aae4b942..94b90a16ef8fcb001692d1a2102d8c9b0a9ff497 100644 (file)
@@ -1,8 +1,8 @@
 #include "complexmat.cuh"
 
-__global__ void sqr_norm_kernel(int n, double* out, float* data, float rows, float cols)
+__global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, float cols)
 {
-    extern __shared__ double sdata[];
+    extern __shared__ float sdata[];
     int i = blockDim.x * threadIdx.y + threadIdx.x;
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -27,32 +27,16 @@ __global__ void sqr_norm_kernel(int n, double* out, float* data, float rows, flo
     }
 }
 
-float* ComplexMat::sqr_norm() const
+void ComplexMat::sqr_norm(float *result) const
 {
-    
-    double *sums_sqr_norms_from_d_d, *sums_sqr_norms_d_d;
-    float *sums_sqr_norms_from_d_f;
-    sums_sqr_norms_from_d_d = (double*) malloc(n_scales*sizeof(double));
-    sums_sqr_norms_from_d_f = (float*) malloc(n_scales*sizeof(float));
-    cudaMalloc(&sums_sqr_norms_d_d, n_scales*sizeof(double));
-    cudaMemset(sums_sqr_norms_d_d, 0, n_scales*sizeof(double));
+    cudaMemset(result, 0, n_scales*sizeof(float));
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
     
-    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(double)>>>(n_channels/n_scales, sums_sqr_norms_d_d, p_data_d, rows, cols);
-    
-    cudaError_t error = cudaGetLastError(); if(error != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); }
-    
-    cudaMemcpy(sums_sqr_norms_from_d_d, sums_sqr_norms_d_d, n_scales*sizeof(double), cudaMemcpyDeviceToHost);
-    cudaFree(sums_sqr_norms_d_d);
-    
-    for(int i = 0; i < n_scales; ++i){
-        sums_sqr_norms_from_d_f[i] = sums_sqr_norms_from_d_d[i];
-    }
-    free(sums_sqr_norms_from_d_d);
+    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
         
-    return sums_sqr_norms_from_d_f;
+    return;
 }
 
 __global__ void sqr_mag_kernel(float* data, float* result)
@@ -70,7 +54,7 @@ ComplexMat ComplexMat::sqr_mag() const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, result.p_data_d);
+    sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
     
     return result;
 }
@@ -90,7 +74,7 @@ ComplexMat ComplexMat::conj() const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, result.p_data_d);  
+    conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);  
     return result;
 }
 
@@ -103,7 +87,7 @@ ComplexMat ComplexMat::sum_over_channels() const
 
 cufftComplex* ComplexMat::get_p_data() const
 {
-    return (cufftComplex*) p_data_d;
+    return (cufftComplex*) p_data;
 }
 
 __global__ void same_num_channels_mul_kernel(float* data_l, float* data_r, float* result)
@@ -124,7 +108,7 @@ ComplexMat ComplexMat::operator*(const ComplexMat & rhs) const
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
 
     return result;
 }
@@ -148,7 +132,7 @@ ComplexMat ComplexMat::operator/(const ComplexMat & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
 
     return result;
 }
@@ -170,7 +154,7 @@ ComplexMat ComplexMat::operator+(const ComplexMat & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
     
     return result;
 }
@@ -190,7 +174,7 @@ ComplexMat ComplexMat::operator*(const float & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs, result.p_data_d);
+    constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
 
     return result;
 }
@@ -210,7 +194,7 @@ ComplexMat ComplexMat::operator+(const float & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs, result.p_data_d);
+    constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
 
     return result;
 }
@@ -234,7 +218,7 @@ ComplexMat ComplexMat::mul(const ComplexMat & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
     
     return result;
 }
@@ -258,7 +242,7 @@ ComplexMat ComplexMat::mul2(const ComplexMat & rhs) const
     
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels/n_scales, n_scales);
-    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
     
     return result;
 }
@@ -271,7 +255,6 @@ void ComplexMat::operator=(ComplexMat & rhs)
     n_scales = rhs.n_scales;
     
     p_data = rhs.p_data;
-    p_data_d = rhs.p_data_d;
     
 }
 
@@ -283,8 +266,6 @@ void ComplexMat::operator=(ComplexMat && rhs)
     n_scales = rhs.n_scales;
     
     p_data = rhs.p_data;
-    p_data_d = rhs.p_data_d;
     
     rhs.p_data = nullptr;
-    rhs.p_data_d = nullptr;
 }
index 7ebbe43c1b069f3119ab1eca0e4249ae0d35e2c3..e97209f6fdc74cdab17dd380250f6c4bcfaa4245 100644 (file)
@@ -16,14 +16,12 @@ public:
     ComplexMat() : cols(0), rows(0), n_channels(0) {}
     ComplexMat(int _rows, int _cols, int _n_channels) : cols(_cols), rows(_rows), n_channels(_n_channels)
     {
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
     }
     
     ComplexMat(int _rows, int _cols, int _n_channels, int _n_scales) : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales)
     {
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
     }
     
     ComplexMat(ComplexMat &&other)
@@ -35,12 +33,11 @@ public:
         p_data = other.p_data;
         
         other.p_data = nullptr;
-        other.p_data_d = nullptr;
     }
     
     ~ComplexMat()
     {
-        if(p_data != nullptr) cudaFreeHost(p_data);
+        if(p_data != nullptr) cudaFree(p_data);
     }
 
     void create(int _rows, int _cols, int _n_channels)
@@ -48,8 +45,7 @@ public:
         rows = _rows;
         cols = _cols;
         n_channels = _n_channels;
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
     }
 
     void create(int _rows, int _cols, int _n_channels, int _n_scales)
@@ -58,15 +54,14 @@ public:
         cols = _cols;
         n_channels = _n_channels;
         n_scales = _n_scales;
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
     }
     // cv::Mat API compatibility
     cv::Size size() { return cv::Size(cols, rows); }
     int channels() { return n_channels; }
     int channels() const { return n_channels; }
 
-    float* sqr_norm() const;
+    void sqr_norm(float *result) const;
     
     ComplexMat sqr_mag() const;
 
@@ -110,5 +105,5 @@ public:
 
 
 private:
-    mutable float *p_data = nullptr, *p_data_d = nullptr;
+    mutable float *p_data = nullptr;
 };
\ No newline at end of file
index 943e3f4b82783fb9c2ee87a9c802fcc897b37cad..344c479ec6ab1d0402bc3e5b56d25817f4ce7903 100644 (file)
@@ -60,10 +60,8 @@ public:
     }
 
 
-    T* sqr_norm() const
+    void sqr_norm(T *sums_sqr_norms) const
     {
-        T* sums_sqr_norms;
-        sums_sqr_norms = (T*) malloc(n_scales*sizeof(T));
         int n_channels_per_scale = n_channels/n_scales;
         int scale_offset = n_channels_per_scale*rows*cols;
         T sum_sqr_norm;
@@ -74,7 +72,7 @@ public:
                     sum_sqr_norm += lhs->real()*lhs->real() + lhs->imag()*lhs->imag();
             sums_sqr_norms[scale] = sum_sqr_norm/static_cast<T>(cols*rows);
         }
-        return sums_sqr_norms;
+        return;
     }
 
     ComplexMat_<T> sqr_mag() const
index 7ca695733bbd06bc0e609630c2cbcd7dc46bc081..c94d61f14cdd6c3a790a38a7e836042447a60b2f 100644 (file)
@@ -72,9 +72,7 @@ static const char *_cudaGetErrorEnum(cufftResult error)
 }
 
 cuFFT::cuFFT(): m_num_of_streams(4)
-{
-    cudaSetDeviceFlags(cudaDeviceMapHost);
-}
+{}
 
 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
 {
@@ -85,8 +83,6 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     m_big_batch_mode = big_batch_mode;
 
     std::cout << "FFT: cuFFT" << std::endl;
-
-//     cudaSetDeviceFlags(cudaDeviceMapHost);
     
 
     for (unsigned i = 0; i < m_num_of_streams; i++) cudaStreamCreate(&streams[i]);
@@ -96,6 +92,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
         cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal));
         
         cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C);
+        
+        
     }
     //FFT forward all scales
     if(m_num_of_scales > 1 && m_big_batch_mode)
@@ -107,7 +105,7 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
        int howmany = m_num_of_scales;
        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
        int istride = 1, ostride = 1;
-       int *inembed = NULL, *onembed = NULL;
+       int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
 
        CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_f_all_scales, rank, n,
                      inembed, istride, idist,
@@ -116,14 +114,15 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     }
     //FFT forward window one scale
     {
-        cudaMalloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_fw_d, data_fw, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
         int howmany = m_num_of_feats;
         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
         int istride = 1, ostride = 1;
-        int *inembed = NULL, *onembed = NULL;
+        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
 
         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw, rank, n,
                  inembed, istride, idist,
@@ -133,23 +132,27 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     //FFT forward window all scales all feats
     if(m_num_of_scales > 1 && m_big_batch_mode)
     {
-        cudaMalloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal));
-        
+        cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0);
+
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
         int howmany = m_num_of_scales*m_num_of_feats;
         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
         int istride = 1, ostride = 1;
-        int *inembed = NULL, *onembed = NULL;
+        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
 
         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw_all_scales, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_R2C, howmany));
+        
+        
     }
     //FFT inverse one scale
     {
-        cudaMalloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
@@ -166,7 +169,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     //FFT inverse all scales
     if(m_num_of_scales > 1)
     {
-        cudaMalloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
@@ -182,7 +186,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     }
     //FFT inverse one channel one scale
     {
-        cudaMalloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
@@ -199,7 +204,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
     //FFT inverse one channel all scales
     if(m_num_of_scales > 1 && m_big_batch_mode)
     {
-        cudaMalloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0);
         
         int rank = 2;
         int n[] = {(int)m_height, (int)m_width};
@@ -241,24 +247,27 @@ ComplexMat cuFFT::forward(const cv::Mat &input)
 ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
 {
     int n_channels = input.size();
-    cv::Mat in_all(m_height * n_channels, m_width, CV_32F);
-    for (int i = 0; i < n_channels; ++i) {
-        cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
-        in_roi = input[i].mul(m_window);
-    }
     ComplexMat result;
     if(n_channels > (int) m_num_of_feats){
+        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
+        for (int i = 0; i < n_channels; ++i) {
+            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
+            in_roi = input[i].mul(m_window);
+        }
+        
         result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
+        
+        CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
     } else {
+        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
+        for (int i = 0; i < n_channels; ++i) {
+            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
+            in_roi = input[i].mul(m_window);
+        }
+        
         result.create(m_height, m_width/2 + 1, n_channels);
-    }
-
-    if (n_channels <= (int) m_num_of_feats){
-        cudaMemcpy(data_fw, in_all.ptr<cufftReal>(), m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
-        CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw), result.get_p_data()));
-    } else {
-       cudaMemcpy(data_fw_all_scales, in_all.ptr<cufftReal>(), m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
-       CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales), result.get_p_data()));
+        
+        CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
     }
     return result;
 }
@@ -266,22 +275,36 @@ ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
 cv::Mat cuFFT::inverse(const ComplexMat &inputf)
 {
     int n_channels = inputf.n_channels;
-    cv::Mat real_result(m_height, m_width, CV_32FC(n_channels));
     cufftComplex *in = reinterpret_cast<cufftComplex*>(inputf.get_p_data());
-
+    
     if(n_channels == 1){
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+        cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
+        
+        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
+        cudaDeviceSynchronize();
+        
+        return real_result/(m_width*m_height);
     } else if(n_channels == (int) m_num_of_scales){
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_1ch_all_scales, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
+        
+        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
+        cudaDeviceSynchronize();
+        
+        return real_result/(m_width*m_height);
     } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_features_all_scales, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
-    } else {
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_features, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
+        
+        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
+        cudaDeviceSynchronize();
+        
+        return real_result/(m_width*m_height);
     }
+    
+    cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
+    
+    CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
+    cudaDeviceSynchronize();
+    
     return real_result/(m_width*m_height);
 }
 
@@ -301,12 +324,12 @@ cuFFT::~cuFFT()
   
   cudaFree(data_f);
   cudaFree(data_f_all_scales);
-  cudaFree(data_fw);
-  cudaFree(data_fw_all_scales);
-  cudaFree(data_i_1ch);
-  cudaFree(data_i_1ch_all_scales);
-  cudaFree(data_i_features);
-  cudaFree(data_i_features_all_scales);
+  cudaFreeHost(data_fw);
+  cudaFreeHost(data_fw_all_scales);
+  cudaFreeHost(data_i_1ch);
+  cudaFreeHost(data_i_1ch_all_scales);
+  cudaFreeHost(data_i_features);
+  cudaFreeHost(data_i_features_all_scales);
   
   cudaDeviceReset();
 }
index df91b7f61e5a7ec06f0c76f8bafff28299d34dfa..a31285b434cdca33a2ac91e96e67a7ce8ecef35b 100644 (file)
@@ -32,7 +32,8 @@ private:
     cudaStream_t streams[4];
     cufftHandle plan_f, plan_f_all_scales, plan_fw, plan_fw_all_scales, plan_i_features,
      plan_i_features_all_scales, plan_i_1ch, plan_i_1ch_all_scales;
-    float *data_f, *data_f_all_scales, *data_fw, *data_fw_all_scales, *data_i_features, *data_i_features_all_scales, *data_i_1ch, *data_i_1ch_all_scales;
+    float *data_f, *data_f_all_scales, *data_fw, *data_fw_d, *data_fw_all_scales, *data_fw_all_scales_d, *data_i_features, *data_i_features_d,
+          *data_i_features_all_scales, *data_i_features_all_scales_d, *data_i_1ch, *data_i_1ch_d, *data_i_1ch_all_scales, *data_i_1ch_all_scales_d;
 };
 
 #endif // FFT_CUDA_H
index 94895be54926fa4682583418ce7610e8adb62a83..710ed5b9fc809a339beb39fcfc5dcfa2442ae80a 100644 (file)
@@ -33,6 +33,13 @@ KCF_Tracker::KCF_Tracker()
 KCF_Tracker::~KCF_Tracker()
 {
     delete &fft;
+#ifdef CUFFT
+    cudaFreeHost(xf_sqr_norm);
+    cudaFreeHost(yf_sqr_norm);
+#else
+    free(xf_sqr_norm);
+    free(yf_sqr_norm);
+#endif
 }
 
 void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox)
@@ -99,6 +106,18 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox)
             p_scales.push_back(std::pow(p_scale_step, i));
     else
         p_scales.push_back(1.);
+    
+#ifdef CUFFT
+    cudaSetDeviceFlags(cudaDeviceMapHost);
+    cudaHostAlloc((void**)&xf_sqr_norm, p_scales.size()*sizeof(float), cudaHostAllocMapped);
+    cudaHostGetDevicePointer((void**)&xf_sqr_norm_d, (void*)xf_sqr_norm, 0);
+    
+    cudaHostAlloc((void**)&yf_sqr_norm, sizeof(float), cudaHostAllocMapped);
+    cudaHostGetDevicePointer((void**)&yf_sqr_norm_d, (void*)yf_sqr_norm, 0);
+#else
+    xf_sqr_norm = (float*) malloc(p_scales.size()*sizeof(float));
+    xf_sqr_norm = (float*) malloc(sizeof(float));
+#endif
 
     p_current_scale = 1.;
 
@@ -594,8 +613,20 @@ cv::Mat KCF_Tracker::get_subwindow(const cv::Mat &input, int cx, int cy, int wid
 
 ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const ComplexMat &yf, double sigma, bool auto_correlation)
 {
-    float* xf_sqr_norm = xf.sqr_norm();
-    float* yf_sqr_norm = auto_correlation ? xf_sqr_norm : yf.sqr_norm();
+#ifdef CUFFT
+    xf.sqr_norm(xf_sqr_norm_d);
+#else
+    xf.sqr_norm(xf_sqr_norm);
+#endif
+    if(auto_correlation){
+        yf_sqr_norm = xf_sqr_norm;
+    } else {
+#ifdef CUFFT
+        yf.sqr_norm(yf_sqr_norm_d);
+#else
+        yf.sqr_norm(yf_sqr_norm);
+#endif
+    }
 
     ComplexMat xyf;
     xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
@@ -633,9 +664,6 @@ ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const Complex
     DEBUG_PRINTM(in_roi);
     }
 
-    free(xf_sqr_norm);
-    if(!auto_correlation)free(yf_sqr_norm);
-
     DEBUG_PRINTM(in_all);
     return fft.forward(in_all);
 }
index 104de2dce65a00c40c62257b463bfed94e85ee6c..4e374be198739ed0261f6d5299a947e953d9ffb1 100644 (file)
--- a/src/kcf.h
+++ b/src/kcf.h
@@ -107,6 +107,10 @@ private:
 
     //for big batch
     int num_of_feats;
+    float *xf_sqr_norm, *yf_sqr_norm;
+#ifdef CUFFT
+    float *xf_sqr_norm_d, *yf_sqr_norm_d;
+#endif
 
     //model
     ComplexMat p_yf;