Remove unnecessary calls to cudaStreamSynchronize()

[hercules2020/kcf.git] / src / fft_cufft.cpp
diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp

index 9c92a69bfb7049f8c2702e052bc9a49546e33095..5c0da667c6916ac78582b5dd9a4578b5cfc85a6d 100644 (file)
--- a/src/fft_cufft.cpp
+++ b/src/fft_cufft.cpp
@@ -4,6 +4,7 @@ cuFFT::cuFFT()
  {
      CudaSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
      cudaErrorCheck(cublasCreate(&cublas));
+    cudaErrorCheck(cublasSetStream(cublas, cudaStreamPerThread));
  }
  
  cufftHandle cuFFT::create_plan_fwd(uint howmany) const
@@ -64,10 +65,10 @@ void cuFFT::forward(const MatScales &real_input, ComplexMat &complex_result)
      auto in = static_cast<cufftReal *>(const_cast<MatScales&>(real_input).deviceMem());
  
      if (real_input.size[0] == 1)
-        cudaErrorCheck(cufftExecR2C(plan_f, in, complex_result.get_p_data()));
+        cudaErrorCheck(cufftExecR2C(plan_f, in, complex_result.get_dev_data()));
  #ifdef BIG_BATCH
      else
-        cudaErrorCheck(cufftExecR2C(plan_f_all_scales, in, complex_result.get_p_data()));
+        cudaErrorCheck(cufftExecR2C(plan_f_all_scales, in, complex_result.get_dev_data()));
  #endif
  }
  
@@ -87,10 +88,10 @@ void cuFFT::forward_window(MatScaleFeats &feat, ComplexMat &complex_result, MatS
      }
  
      if (n_scales == 1)
-        cudaErrorCheck(cufftExecR2C(plan_fw, temp_data, complex_result.get_p_data()));
+        cudaErrorCheck(cufftExecR2C(plan_fw, temp_data, complex_result.get_dev_data()));
  #ifdef BIG_BATCH
      else
-        cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_p_data()));
+        cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_dev_data()));
  #endif
  }
  
@@ -99,17 +100,20 @@ void cuFFT::inverse(ComplexMat &complex_input, MatScales &real_result)
      Fft::inverse(complex_input, real_result);
  
      uint n_channels = complex_input.n_channels;
-    cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_p_data());
+    cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_dev_data());
      cufftReal *out = real_result.deviceMem();
      float alpha = 1.0 / (m_width * m_height);
  
      if (n_channels == 1)
          cudaErrorCheck(cufftExecC2R(plan_i_1ch, in, out));
  #ifdef BIG_BATCH
+    else
          cudaErrorCheck(cufftExecC2R(plan_i_all_scales, in, out));
  #endif
-    // TODO: Investigate whether this scalling is needed or not
      cudaErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));
+    // The result is a cv::Mat, which will be accessed by the CPU, so we
+    // must synchronize with the GPU here
+    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
  }
  
  cuFFT::~cuFFT()