X-Git-Url: http://rtime.felk.cvut.cz/gitweb/hercules2020/kcf.git/blobdiff_plain/6d8eea0517252bdbf4832d454a679b6f82d24007..4b936bb4c69d7ccd07d29f9453602153c6170cc4:/src/fft_cufft.cpp diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp index 9c92a69..5c0da66 100644 --- a/src/fft_cufft.cpp +++ b/src/fft_cufft.cpp @@ -4,6 +4,7 @@ cuFFT::cuFFT() { CudaSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost)); cudaErrorCheck(cublasCreate(&cublas)); + cudaErrorCheck(cublasSetStream(cublas, cudaStreamPerThread)); } cufftHandle cuFFT::create_plan_fwd(uint howmany) const @@ -64,10 +65,10 @@ void cuFFT::forward(const MatScales &real_input, ComplexMat &complex_result) auto in = static_cast(const_cast(real_input).deviceMem()); if (real_input.size[0] == 1) - cudaErrorCheck(cufftExecR2C(plan_f, in, complex_result.get_p_data())); + cudaErrorCheck(cufftExecR2C(plan_f, in, complex_result.get_dev_data())); #ifdef BIG_BATCH else - cudaErrorCheck(cufftExecR2C(plan_f_all_scales, in, complex_result.get_p_data())); + cudaErrorCheck(cufftExecR2C(plan_f_all_scales, in, complex_result.get_dev_data())); #endif } @@ -87,10 +88,10 @@ void cuFFT::forward_window(MatScaleFeats &feat, ComplexMat &complex_result, MatS } if (n_scales == 1) - cudaErrorCheck(cufftExecR2C(plan_fw, temp_data, complex_result.get_p_data())); + cudaErrorCheck(cufftExecR2C(plan_fw, temp_data, complex_result.get_dev_data())); #ifdef BIG_BATCH else - cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_p_data())); + cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_dev_data())); #endif } @@ -99,17 +100,20 @@ void cuFFT::inverse(ComplexMat &complex_input, MatScales &real_result) Fft::inverse(complex_input, real_result); uint n_channels = complex_input.n_channels; - cufftComplex *in = reinterpret_cast(complex_input.get_p_data()); + cufftComplex *in = reinterpret_cast(complex_input.get_dev_data()); cufftReal *out = real_result.deviceMem(); float alpha = 1.0 / (m_width * m_height); if 
(n_channels == 1) cudaErrorCheck(cufftExecC2R(plan_i_1ch, in, out)); #ifdef BIG_BATCH + else cudaErrorCheck(cufftExecC2R(plan_i_all_scales, in, out)); #endif - // TODO: Investigate whether this scalling is needed or not cudaErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1)); + // The result is a cv::Mat, which will be accessed by the CPU, so we + // must synchronize with the GPU here + CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread)); } cuFFT::~cuFFT()