Removed zero copy for complexmat now it is using normal device memory. Added zero...

author Shanigen <vkaraf@gmail.com>

Tue, 10 Apr 2018 13:00:23 +0000 (15:00 +0200)

committer Shanigen <vkaraf@gmail.com>

Tue, 10 Apr 2018 13:00:23 +0000 (15:00 +0200)
author Shanigen <vkaraf@gmail.com>
Tue, 10 Apr 2018 13:00:23 +0000 (15:00 +0200)
committer Shanigen <vkaraf@gmail.com>
Tue, 10 Apr 2018 13:00:23 +0000 (15:00 +0200)
diff --git a/Makefile b/Makefile

index 7ae150a8cf34745f64d3085398ea3c74fd89b1c4..82d47c6b52e69c57102b35d14e8dab5924f5b52b 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,25 +1,26 @@
  # Makefile to build all the available variants
  
-BUILDS = opencvfft-st opencvfft-async fftw cufftw fftw_openmp fftw_async fftw_big fftw_big_openmp cufft cufft_big cufft_big_openmp
+BUILDS = opencvfft-st opencvfft-async fftw fftw_openmp fftw_async fftw_big fftw_big_openmp cufftw cufftw_big cufftw_big_openmp cufft cufft_big cufft_big_openmp
  
  all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot)
  
  CMAKE_OPTS += -G Ninja
  #CMAKE_OPTS += = -DOpenCV_DIR=
  
-CMAKE_OTPS_opencvfft-st    = -DFFT=OpenCV
-CMAKE_OTPS_opencvfft-async = -DFFT=OpenCV -DASYNC=ON
+CMAKE_OTPS_opencvfft-st      = -DFFT=OpenCV
+CMAKE_OTPS_opencvfft-async   = -DFFT=OpenCV -DASYNC=ON
  #CMAKE_OTPS_opencv-cufft    = -DFFT=OpenCV_cuFFT
-CMAKE_OTPS_fftw            = -DFFT=fftw
-CMAKE_OTPS_cufftw          = -DFFT=cuFFTW
-CMAKE_OTPS_fftw_openmp     = -DFFT=fftw -DOPENMP=ON
-CMAKE_OTPS_fftw_async      = -DFFT=fftw -DASYNC=ON
-CMAKE_OTPS_fftw_big        = -DFFT=fftw -DBIG_BATCH=ON
-CMAKE_OTPS_fftw_big_openmp = -DFFT=fftw -DBIG_BATCH=ON -DOPENMP=ON
-CMAKE_OTPS_cufft           = -DFFT=cuFFT
-CMAKE_OTPS_cufft_big       = -DFFT=cuFFT -DBIG_BATCH=ON
-CMAKE_OTPS_cufft_big_openmp= = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
-
+CMAKE_OTPS_fftw              = -DFFT=fftw
+CMAKE_OTPS_fftw_openmp       = -DFFT=fftw -DOPENMP=ON
+CMAKE_OTPS_fftw_async        = -DFFT=fftw -DASYNC=ON
+CMAKE_OTPS_fftw_big          = -DFFT=fftw -DBIG_BATCH=ON
+CMAKE_OTPS_fftw_big_openmp   = -DFFT=fftw -DBIG_BATCH=ON -DOPENMP=ON
+CMAKE_OTPS_cufftw            = -DFFT=cuFFTW
+CMAKE_OTPS_cufftw_big        = -DFFT=cuFFTW -DBIG_BATCH=ON
+CMAKE_OTPS_cufftw_big_openmp = -DFFT=cuFFTW -DBIG_BATCH=ON -DOPENMP=ON
+CMAKE_OTPS_cufft             = -DFFT=cuFFT
+CMAKE_OTPS_cufft_big         = -DFFT=cuFFT -DBIG_BATCH=ON
+CMAKE_OTPS_cufft_big_openmp  = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
  
  build-%/kcf_vot: $(shell git ls-files)
         mkdir -p $(@D)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

index 99ed56ee7f7e90a1cd7bf1ddd54d1832fd8e663c..bbe1ece6bcbca0396e644e6505f4296f7c65546d 100644 (file)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -62,7 +62,7 @@ IF(use_cuda)
    set(CUDA_SEPARABLE_COMPILATION ON)
    set(CUDA_PROPAGATE_HOST_FLAGS OFF)
    set(CUDA_HOST_COMPILER /usr/bin/g++)
-  list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11)
+  list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11 -default-stream per-thread)
    find_cuda_helper_libs(cufftw)
    IF(FFT STREQUAL "cuFFT")
    cuda_add_library(complexmat complexmat.cu)
diff --git a/src/complexmat.cu b/src/complexmat.cu

index 0209c3c6d313778e019acae0aa5d9c46aae4b942..94b90a16ef8fcb001692d1a2102d8c9b0a9ff497 100644 (file)
--- a/src/complexmat.cu
+++ b/src/complexmat.cu
@@ -1,8 +1,8 @@
  #include "complexmat.cuh"
  
-__global__ void sqr_norm_kernel(int n, double* out, float* data, float rows, float cols)
+__global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, float cols)
  {
-    extern __shared__ double sdata[];
+    extern __shared__ float sdata[];
      int i = blockDim.x * threadIdx.y + threadIdx.x;
      int blockId = blockIdx.x + blockIdx.y * gridDim.x;
      int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -27,32 +27,16 @@ __global__ void sqr_norm_kernel(int n, double* out, float* data, float rows, flo
      }
  }
  
-float* ComplexMat::sqr_norm() const
+void ComplexMat::sqr_norm(float *result) const
  {
-    
-    double *sums_sqr_norms_from_d_d, *sums_sqr_norms_d_d;
-    float *sums_sqr_norms_from_d_f;
-    sums_sqr_norms_from_d_d = (double*) malloc(n_scales*sizeof(double));
-    sums_sqr_norms_from_d_f = (float*) malloc(n_scales*sizeof(float));
-    cudaMalloc(&sums_sqr_norms_d_d, n_scales*sizeof(double));
-    cudaMemset(sums_sqr_norms_d_d, 0, n_scales*sizeof(double));
+    cudaMemset(result, 0, n_scales*sizeof(float));
      
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
      
-    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(double)>>>(n_channels/n_scales, sums_sqr_norms_d_d, p_data_d, rows, cols);
-    
-    cudaError_t error = cudaGetLastError(); if(error != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); }
-    
-    cudaMemcpy(sums_sqr_norms_from_d_d, sums_sqr_norms_d_d, n_scales*sizeof(double), cudaMemcpyDeviceToHost);
-    cudaFree(sums_sqr_norms_d_d);
-    
-    for(int i = 0; i < n_scales; ++i){
-        sums_sqr_norms_from_d_f[i] = sums_sqr_norms_from_d_d[i];
-    }
-    free(sums_sqr_norms_from_d_d);
+    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
          
-    return sums_sqr_norms_from_d_f;
+    return;
  }
  
  __global__ void sqr_mag_kernel(float* data, float* result)
@@ -70,7 +54,7 @@ ComplexMat ComplexMat::sqr_mag() const
      
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
-    sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, result.p_data_d);
+    sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
      
      return result;
  }
@@ -90,7 +74,7 @@ ComplexMat ComplexMat::conj() const
      
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
-    conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, result.p_data_d);  
+    conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);  
      return result;
  }
  
@@ -103,7 +87,7 @@ ComplexMat ComplexMat::sum_over_channels() const
  
  cufftComplex* ComplexMat::get_p_data() const
  {
-    return (cufftComplex*) p_data_d;
+    return (cufftComplex*) p_data;
  }
  
  __global__ void same_num_channels_mul_kernel(float* data_l, float* data_r, float* result)
@@ -124,7 +108,7 @@ ComplexMat ComplexMat::operator*(const ComplexMat & rhs) const
  
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
  
      return result;
  }
@@ -148,7 +132,7 @@ ComplexMat ComplexMat::operator/(const ComplexMat & rhs) const
      
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
  
      return result;
  }
@@ -170,7 +154,7 @@ ComplexMat ComplexMat::operator+(const ComplexMat & rhs) const
      
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
-    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
      
      return result;
  }
@@ -190,7 +174,7 @@ ComplexMat ComplexMat::operator*(const float & rhs) const
      
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
-    constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs, result.p_data_d);
+    constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
  
      return result;
  }
@@ -210,7 +194,7 @@ ComplexMat ComplexMat::operator+(const float & rhs) const
      
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
-    constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs, result.p_data_d);
+    constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
  
      return result;
  }
@@ -234,7 +218,7 @@ ComplexMat ComplexMat::mul(const ComplexMat & rhs) const
      
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
-    one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
      
      return result;
  }
@@ -258,7 +242,7 @@ ComplexMat ComplexMat::mul2(const ComplexMat & rhs) const
      
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels/n_scales, n_scales);
-    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
      
      return result;
  }
@@ -271,7 +255,6 @@ void ComplexMat::operator=(ComplexMat & rhs)
      n_scales = rhs.n_scales;
      
      p_data = rhs.p_data;
-    p_data_d = rhs.p_data_d;
      
  }
  
@@ -283,8 +266,6 @@ void ComplexMat::operator=(ComplexMat && rhs)
      n_scales = rhs.n_scales;
      
      p_data = rhs.p_data;
-    p_data_d = rhs.p_data_d;
      
      rhs.p_data = nullptr;
-    rhs.p_data_d = nullptr;
  }
diff --git a/src/complexmat.cuh b/src/complexmat.cuh

index 7ebbe43c1b069f3119ab1eca0e4249ae0d35e2c3..e97209f6fdc74cdab17dd380250f6c4bcfaa4245 100644 (file)
--- a/src/complexmat.cuh
+++ b/src/complexmat.cuh
@@ -16,14 +16,12 @@ public:
      ComplexMat() : cols(0), rows(0), n_channels(0) {}
      ComplexMat(int _rows, int _cols, int _n_channels) : cols(_cols), rows(_rows), n_channels(_n_channels)
      {
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
      }
      
      ComplexMat(int _rows, int _cols, int _n_channels, int _n_scales) : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales)
      {
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
      }
      
      ComplexMat(ComplexMat &&other)
@@ -35,12 +33,11 @@ public:
          p_data = other.p_data;
          
          other.p_data = nullptr;
-        other.p_data_d = nullptr;
      }
      
      ~ComplexMat()
      {
-        if(p_data != nullptr) cudaFreeHost(p_data);
+        if(p_data != nullptr) cudaFree(p_data);
      }
  
      void create(int _rows, int _cols, int _n_channels)
@@ -48,8 +45,7 @@ public:
          rows = _rows;
          cols = _cols;
          n_channels = _n_channels;
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
      }
  
      void create(int _rows, int _cols, int _n_channels, int _n_scales)
@@ -58,15 +54,14 @@ public:
          cols = _cols;
          n_channels = _n_channels;
          n_scales = _n_scales;
-        cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
-        cudaHostGetDevicePointer((void **)&p_data_d,  (void *) p_data , 0);
+        cudaMalloc(&p_data,  n_channels*cols*rows*sizeof(cufftComplex));
      }
      // cv::Mat API compatibility
      cv::Size size() { return cv::Size(cols, rows); }
      int channels() { return n_channels; }
      int channels() const { return n_channels; }
  
-    float* sqr_norm() const;
+    void sqr_norm(float *result) const;
      
      ComplexMat sqr_mag() const;
  
@@ -110,5 +105,5 @@ public:
  
  
  private:
-    mutable float *p_data = nullptr, *p_data_d = nullptr;
+    mutable float *p_data = nullptr;
  };
 \ No newline at end of file
diff --git a/src/complexmat.hpp b/src/complexmat.hpp

index 943e3f4b82783fb9c2ee87a9c802fcc897b37cad..344c479ec6ab1d0402bc3e5b56d25817f4ce7903 100644 (file)
--- a/src/complexmat.hpp
+++ b/src/complexmat.hpp
@@ -60,10 +60,8 @@ public:
      }
  
  
-    T* sqr_norm() const
+    void sqr_norm(T *sums_sqr_norms) const
      {
-        T* sums_sqr_norms;
-        sums_sqr_norms = (T*) malloc(n_scales*sizeof(T));
          int n_channels_per_scale = n_channels/n_scales;
          int scale_offset = n_channels_per_scale*rows*cols;
          T sum_sqr_norm;
@@ -74,7 +72,7 @@ public:
                      sum_sqr_norm += lhs->real()*lhs->real() + lhs->imag()*lhs->imag();
              sums_sqr_norms[scale] = sum_sqr_norm/static_cast<T>(cols*rows);
          }
-        return sums_sqr_norms;
+        return;
      }
  
      ComplexMat_<T> sqr_mag() const
diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp

index 7ca695733bbd06bc0e609630c2cbcd7dc46bc081..c94d61f14cdd6c3a790a38a7e836042447a60b2f 100644 (file)
--- a/src/fft_cufft.cpp
+++ b/src/fft_cufft.cpp
@@ -72,9 +72,7 @@ static const char *_cudaGetErrorEnum(cufftResult error)
  }
  
  cuFFT::cuFFT(): m_num_of_streams(4)
-{
-    cudaSetDeviceFlags(cudaDeviceMapHost);
-}
+{}
  
  void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
  {
@@ -85,8 +83,6 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
      m_big_batch_mode = big_batch_mode;
  
      std::cout << "FFT: cuFFT" << std::endl;
-
-//     cudaSetDeviceFlags(cudaDeviceMapHost);
      
  
      for (unsigned i = 0; i < m_num_of_streams; i++) cudaStreamCreate(&streams[i]);
@@ -96,6 +92,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
          cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal));
          
          cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C);
+        
+        
      }
      //FFT forward all scales
      if(m_num_of_scales > 1 && m_big_batch_mode)
@@ -107,7 +105,7 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
         int howmany = m_num_of_scales;
         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
         int istride = 1, ostride = 1;
-       int *inembed = NULL, *onembed = NULL;
+       int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
  
         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_f_all_scales, rank, n,
                       inembed, istride, idist,
@@ -116,14 +114,15 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
      }
      //FFT forward window one scale
      {
-        cudaMalloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_fw_d, data_fw, 0);
          
          int rank = 2;
          int n[] = {(int)m_height, (int)m_width};
          int howmany = m_num_of_feats;
          int idist = m_height*m_width, odist = m_height*(m_width/2+1);
          int istride = 1, ostride = 1;
-        int *inembed = NULL, *onembed = NULL;
+        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
  
          CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw, rank, n,
                   inembed, istride, idist,
@@ -133,23 +132,27 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
      //FFT forward window all scales all feats
      if(m_num_of_scales > 1 && m_big_batch_mode)
      {
-        cudaMalloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal));
-        
+        cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0);
+
          int rank = 2;
          int n[] = {(int)m_height, (int)m_width};
          int howmany = m_num_of_scales*m_num_of_feats;
          int idist = m_height*m_width, odist = m_height*(m_width/2+1);
          int istride = 1, ostride = 1;
-        int *inembed = NULL, *onembed = NULL;
+        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
  
          CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw_all_scales, rank, n,
                   inembed, istride, idist,
                   onembed, ostride, odist,
                   CUFFT_R2C, howmany));
+        
+        
      }
      //FFT inverse one scale
      {
-        cudaMalloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0);
          
          int rank = 2;
          int n[] = {(int)m_height, (int)m_width};
@@ -166,7 +169,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
      //FFT inverse all scales
      if(m_num_of_scales > 1)
      {
-        cudaMalloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0);
          
          int rank = 2;
          int n[] = {(int)m_height, (int)m_width};
@@ -182,7 +186,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
      }
      //FFT inverse one channel one scale
      {
-        cudaMalloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0);
          
          int rank = 2;
          int n[] = {(int)m_height, (int)m_width};
@@ -199,7 +204,8 @@ void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigne
      //FFT inverse one channel all scales
      if(m_num_of_scales > 1 && m_big_batch_mode)
      {
-        cudaMalloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal));
+        cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+        cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0);
          
          int rank = 2;
          int n[] = {(int)m_height, (int)m_width};
@@ -241,24 +247,27 @@ ComplexMat cuFFT::forward(const cv::Mat &input)
  ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
  {
      int n_channels = input.size();
-    cv::Mat in_all(m_height * n_channels, m_width, CV_32F);
-    for (int i = 0; i < n_channels; ++i) {
-        cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
-        in_roi = input[i].mul(m_window);
-    }
      ComplexMat result;
      if(n_channels > (int) m_num_of_feats){
+        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
+        for (int i = 0; i < n_channels; ++i) {
+            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
+            in_roi = input[i].mul(m_window);
+        }
+        
          result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
+        
+        CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
      } else {
+        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
+        for (int i = 0; i < n_channels; ++i) {
+            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
+            in_roi = input[i].mul(m_window);
+        }
+        
          result.create(m_height, m_width/2 + 1, n_channels);
-    }
-
-    if (n_channels <= (int) m_num_of_feats){
-        cudaMemcpy(data_fw, in_all.ptr<cufftReal>(), m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
-        CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw), result.get_p_data()));
-    } else {
-       cudaMemcpy(data_fw_all_scales, in_all.ptr<cufftReal>(), m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
-       CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales), result.get_p_data()));
+        
+        CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
      }
      return result;
  }
@@ -266,22 +275,36 @@ ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
  cv::Mat cuFFT::inverse(const ComplexMat &inputf)
  {
      int n_channels = inputf.n_channels;
-    cv::Mat real_result(m_height, m_width, CV_32FC(n_channels));
      cufftComplex *in = reinterpret_cast<cufftComplex*>(inputf.get_p_data());
-
+    
      if(n_channels == 1){
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+        cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
+        
+        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
+        cudaDeviceSynchronize();
+        
+        return real_result/(m_width*m_height);
      } else if(n_channels == (int) m_num_of_scales){
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_1ch_all_scales, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
+        
+        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
+        cudaDeviceSynchronize();
+        
+        return real_result/(m_width*m_height);
      } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_features_all_scales, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
-    } else {
-        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features)));
-        cudaMemcpy(real_result.ptr<cufftReal>(), data_i_features, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
+        
+        CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
+        cudaDeviceSynchronize();
+        
+        return real_result/(m_width*m_height);
      }
+    
+    cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
+    
+    CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
+    cudaDeviceSynchronize();
+    
      return real_result/(m_width*m_height);
  }
  
@@ -301,12 +324,12 @@ cuFFT::~cuFFT()
    
    cudaFree(data_f);
    cudaFree(data_f_all_scales);
-  cudaFree(data_fw);
-  cudaFree(data_fw_all_scales);
-  cudaFree(data_i_1ch);
-  cudaFree(data_i_1ch_all_scales);
-  cudaFree(data_i_features);
-  cudaFree(data_i_features_all_scales);
+  cudaFreeHost(data_fw);
+  cudaFreeHost(data_fw_all_scales);
+  cudaFreeHost(data_i_1ch);
+  cudaFreeHost(data_i_1ch_all_scales);
+  cudaFreeHost(data_i_features);
+  cudaFreeHost(data_i_features_all_scales);
    
    cudaDeviceReset();
  }
diff --git a/src/fft_cufft.h b/src/fft_cufft.h

index df91b7f61e5a7ec06f0c76f8bafff28299d34dfa..a31285b434cdca33a2ac91e96e67a7ce8ecef35b 100644 (file)
--- a/src/fft_cufft.h
+++ b/src/fft_cufft.h
@@ -32,7 +32,8 @@ private:
      cudaStream_t streams[4];
      cufftHandle plan_f, plan_f_all_scales, plan_fw, plan_fw_all_scales, plan_i_features,
       plan_i_features_all_scales, plan_i_1ch, plan_i_1ch_all_scales;
-    float *data_f, *data_f_all_scales, *data_fw, *data_fw_all_scales, *data_i_features, *data_i_features_all_scales, *data_i_1ch, *data_i_1ch_all_scales;
+    float *data_f, *data_f_all_scales, *data_fw, *data_fw_d, *data_fw_all_scales, *data_fw_all_scales_d, *data_i_features, *data_i_features_d,
+          *data_i_features_all_scales, *data_i_features_all_scales_d, *data_i_1ch, *data_i_1ch_d, *data_i_1ch_all_scales, *data_i_1ch_all_scales_d;
  };
  
  #endif // FFT_CUDA_H
diff --git a/src/kcf.cpp b/src/kcf.cpp

index 94895be54926fa4682583418ce7610e8adb62a83..710ed5b9fc809a339beb39fcfc5dcfa2442ae80a 100644 (file)
--- a/src/kcf.cpp
+++ b/src/kcf.cpp
@@ -33,6 +33,13 @@ KCF_Tracker::KCF_Tracker()
  KCF_Tracker::~KCF_Tracker()
  {
      delete &fft;
+#ifdef CUFFT
+    cudaFreeHost(xf_sqr_norm);
+    cudaFreeHost(yf_sqr_norm);
+#else
+    free(xf_sqr_norm);
+    free(yf_sqr_norm);
+#endif
  }
  
  void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox)
@@ -99,6 +106,18 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox)
              p_scales.push_back(std::pow(p_scale_step, i));
      else
          p_scales.push_back(1.);
+    
+#ifdef CUFFT
+    cudaSetDeviceFlags(cudaDeviceMapHost);
+    cudaHostAlloc((void**)&xf_sqr_norm, p_scales.size()*sizeof(float), cudaHostAllocMapped);
+    cudaHostGetDevicePointer((void**)&xf_sqr_norm_d, (void*)xf_sqr_norm, 0);
+    
+    cudaHostAlloc((void**)&yf_sqr_norm, sizeof(float), cudaHostAllocMapped);
+    cudaHostGetDevicePointer((void**)&yf_sqr_norm_d, (void*)yf_sqr_norm, 0);
+#else
+    xf_sqr_norm = (float*) malloc(p_scales.size()*sizeof(float));
+    xf_sqr_norm = (float*) malloc(sizeof(float));
+#endif
  
      p_current_scale = 1.;
  
@@ -594,8 +613,20 @@ cv::Mat KCF_Tracker::get_subwindow(const cv::Mat &input, int cx, int cy, int wid
  
  ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const ComplexMat &yf, double sigma, bool auto_correlation)
  {
-    float* xf_sqr_norm = xf.sqr_norm();
-    float* yf_sqr_norm = auto_correlation ? xf_sqr_norm : yf.sqr_norm();
+#ifdef CUFFT
+    xf.sqr_norm(xf_sqr_norm_d);
+#else
+    xf.sqr_norm(xf_sqr_norm);
+#endif
+    if(auto_correlation){
+        yf_sqr_norm = xf_sqr_norm;
+    } else {
+#ifdef CUFFT
+        yf.sqr_norm(yf_sqr_norm_d);
+#else
+        yf.sqr_norm(yf_sqr_norm);
+#endif
+    }
  
      ComplexMat xyf;
      xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
@@ -633,9 +664,6 @@ ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const Complex
      DEBUG_PRINTM(in_roi);
      }
  
-    free(xf_sqr_norm);
-    if(!auto_correlation)free(yf_sqr_norm);
-
      DEBUG_PRINTM(in_all);
      return fft.forward(in_all);
  }
diff --git a/src/kcf.h b/src/kcf.h

index 104de2dce65a00c40c62257b463bfed94e85ee6c..4e374be198739ed0261f6d5299a947e953d9ffb1 100644 (file)
--- a/src/kcf.h
+++ b/src/kcf.h
@@ -107,6 +107,10 @@ private:
  
      //for big batch
      int num_of_feats;
+    float *xf_sqr_norm, *yf_sqr_norm;
+#ifdef CUFFT
+    float *xf_sqr_norm_d, *yf_sqr_norm_d;
+#endif
  
      //model
      ComplexMat p_yf;
author	Shanigen <vkaraf@gmail.com>
	Tue, 10 Apr 2018 13:00:23 +0000 (15:00 +0200)
committer	Shanigen <vkaraf@gmail.com>
	Tue, 10 Apr 2018 13:00:23 +0000 (15:00 +0200)
Makefile		patch \| blob \| history
src/CMakeLists.txt		patch \| blob \| history
src/complexmat.cu		patch \| blob \| history
src/complexmat.cuh		patch \| blob \| history
src/complexmat.hpp		patch \| blob \| history
src/fft_cufft.cpp		patch \| blob \| history
src/fft_cufft.h		patch \| blob \| history
src/kcf.cpp		patch \| blob \| history
src/kcf.h		patch \| blob \| history