From 4b936bb4c69d7ccd07d29f9453602153c6170cc4 Mon Sep 17 00:00:00 2001 From: Michal Sojka Date: Fri, 12 Oct 2018 15:13:58 +0200 Subject: [PATCH] Remove unnecessary calls to cudaStreamSynchronize() --- src/complexmat.cu | 2 +- src/debug.h | 3 --- src/fft_cufft.cpp | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/complexmat.cu b/src/complexmat.cu index 1ebd883..c8430ed 100644 --- a/src/complexmat.cu +++ b/src/complexmat.cu @@ -24,6 +24,7 @@ __global__ void sqr_norm_kernel(const float *in, float *block_res, int total) void ComplexMat_::sqr_norm(DynMem &result) const { + assert(result.num_elem == n_scales); const uint total = n_channels / n_scales * rows * cols; @@ -127,7 +128,6 @@ ComplexMat_ ComplexMat_::sum_over_channels() const reinterpret_cast(p_data.deviceMem() + scale * n_channels_per_scale * rows * cols), n_channels_per_scale, total); } - CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread)); return result; } diff --git a/src/debug.h b/src/debug.h index 07c6ee7..cce8327 100644 --- a/src/debug.h +++ b/src/debug.h @@ -83,9 +83,6 @@ class DbgTracer { { (void)line; if (debug || always) { -#ifdef CUFFT - CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread)); -#endif IOSave s(std::cerr); std::cerr << std::setprecision(precision); std::cerr << indent() << name /*<< " @" << line */ << " " << print(obj) << std::endl; diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp index c7a9e2f..5c0da66 100644 --- a/src/fft_cufft.cpp +++ b/src/fft_cufft.cpp @@ -93,7 +93,6 @@ void cuFFT::forward_window(MatScaleFeats &feat, ComplexMat &complex_result, MatS else cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_dev_data())); #endif - CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread)); } void cuFFT::inverse(ComplexMat &complex_input, MatScales &real_result) @@ -111,8 +110,9 @@ void cuFFT::inverse(ComplexMat &complex_input, MatScales &real_result) else cudaErrorCheck(cufftExecC2R(plan_i_all_scales, in, out)); #endif - // 
TODO: Investigate whether this scalling is needed or not cudaErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1)); + // The result is a cv::Mat, which will be accessed by the CPU, so we + // must synchronize with the GPU here CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread)); } -- 2.39.2