]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/blobdiff - src/fft_cufft.cpp
Fix CUDA bugs that manifest themselves in BIG_BATCH mode
[hercules2020/kcf.git] / src / fft_cufft.cpp
index 4ad0c21fb25ef6e43f5faa8371ca1efc9302c441..963c8b7084ff42d3ed24971cb627d2181897bdf9 100644 (file)
 #include "fft_cufft.h"
 
-void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
+cuFFT::cuFFT()
 {
-    m_width = width;
-    m_height = height;
-    m_num_of_feats = num_of_feats;
-    m_num_of_scales = num_of_scales;
-    m_big_batch_mode = big_batch_mode;
-
-    std::cout << "FFT: cuFFT" << std::endl;
-
-    //FFT forward one scale
-    {
-        CudaSafeCall(cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal)));
-
-       CufftErrorCheck(cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C));
-
-
-    }
-    //FFT forward all scales
-    if(m_num_of_scales > 1 && m_big_batch_mode)
-    {
-        CudaSafeCall(cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal)));
-
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_scales;
-        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
-        int istride = 1, ostride = 1;
-        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
-
-        CufftErrorCheck(cufftPlanMany(&plan_f_all_scales, rank, n,
-                 inembed, istride, idist,
-                 onembed, ostride, odist,
-                 CUFFT_R2C, howmany));
-    }
-    //FFT forward window one scale
-    {
-        CudaSafeCall(cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(&data_fw_d, data_fw, 0));
-
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_feats;
-        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
-        int istride = 1, ostride = 1;
-        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
-
-        CufftErrorCheck(cufftPlanMany(&plan_fw, rank, n,
-                 inembed, istride, idist,
-                 onembed, ostride, odist,
-                 CUFFT_R2C, howmany));
-    }
-    //FFT forward window all scales all feats
-    if(m_num_of_scales > 1 && m_big_batch_mode)
-    {
-        CudaSafeCall(cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0));
-
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_scales*m_num_of_feats;
-        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
-        int istride = 1, ostride = 1;
-        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
+    CudaSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
+    cudaErrorCheck(cublasCreate(&cublas));
+}
 
-        CufftErrorCheck(cufftPlanMany(&plan_fw_all_scales, rank, n,
-                 inembed, istride, idist,
-                 onembed, ostride, odist,
-                 CUFFT_R2C, howmany));
+cufftHandle cuFFT::create_plan_fwd(uint howmany) const
+{
+    int rank = 2;
+    int n[] = {(int)m_height, (int)m_width};
+    int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
+    int istride = 1, ostride = 1;
+    int *inembed = n, onembed[] = {(int)m_height, (int)m_width / 2 + 1};
+
+    cufftHandle plan;
+    cudaErrorCheck(cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, howmany));
+    cudaErrorCheck(cufftSetStream(plan, cudaStreamPerThread));
+    return plan;
+}
 
+cufftHandle cuFFT::create_plan_inv(uint howmany) const
+{
+    int rank = 2;
+    int n[] = {(int)m_height, (int)m_width};
+    int idist = m_height * (m_width / 2 + 1), odist = m_height * m_width;
+    int istride = 1, ostride = 1;
+    int inembed[] = {(int)m_height, (int)m_width / 2 + 1}, *onembed = n;
+
+    cufftHandle plan;
+    cudaErrorCheck(cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2R, howmany));
+    cudaErrorCheck(cufftSetStream(plan, cudaStreamPerThread));
+    return plan;
+}
 
-    }
-    //FFT inverse one scale
-    {
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_feats;
-        int idist = m_height*(m_width/2+1), odist = 1;
-        int istride = 1, ostride = m_num_of_feats;
-        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
 
-        CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n,
-                 inembed, istride, idist,
-                 onembed, ostride, odist,
-                 CUFFT_C2R, howmany));
-    }
-    //FFT inverse all scales
-#ifdef BIG_BATCH
-    if(m_num_of_scales > 1)
-    {
-        CudaSafeCall(cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0));
+void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
+{
+    Fft::init(width, height, num_of_feats, num_of_scales);
 
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_feats*m_num_of_scales;
-        int idist = m_height*(m_width/2+1), odist = 1;
-        int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
-        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+    std::cout << "FFT: cuFFT" << std::endl;
 
-        CufftErrorCheck(cufftPlanMany(&plan_i_features_all_scales, rank, n,
-                 inembed, istride, idist,
-                 onembed, ostride, odist,
-                 CUFFT_C2R, howmany));
-    }
-#endif
-    //FFT inverse one channel one scale
-    {
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = 1;
-        int idist = m_height*(m_width/2+1), odist = 1;
-        int istride = 1, ostride = 1;
-        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+    plan_f = create_plan_fwd(1);
+    plan_fw = create_plan_fwd(m_num_of_feats);
+    plan_i_1ch = create_plan_inv(1);
 
-        CufftErrorCheck(cufftPlanMany(&plan_i_1ch, rank, n,
-                 inembed, istride, idist,
-                 onembed, ostride, odist,
-                 CUFFT_C2R, howmany));
-    }
 #ifdef BIG_BATCH
-    //FFT inverse one channel all scales
-    if(m_num_of_scales > 1 && m_big_batch_mode)
-    {
-        CudaSafeCall(cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
-        CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0));
-
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_scales;
-        int idist = m_height*(m_width/2+1), odist = 1;
-        int istride = 1, ostride = m_num_of_scales;
-        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
-
-        CufftErrorCheck(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
-                 inembed, istride, idist,
-                 onembed, ostride, odist,
-                 CUFFT_C2R, howmany));
-    }
+    plan_f_all_scales = create_plan_fwd(m_num_of_scales);
+    plan_fw_all_scales = create_plan_fwd(m_num_of_scales * m_num_of_feats);
+    plan_i_all_scales = create_plan_inv(m_num_of_scales);
 #endif
 }
 
-void cuFFT::set_window(const cv::Mat & window)
+void cuFFT::set_window(const MatDynMem &window)
 {
-     m_window = window;
+    Fft::set_window(window);
+    m_window = window;
 }
 
-void cuFFT::forward(Scale_vars & vars)
+void cuFFT::forward(const MatScales &real_input, ComplexMat &complex_result)
 {
-    ComplexMat *complex_result = vars.flag & Tracker_flags::TRACKER_INIT ? vars.p_yf_ptr :
-                                                  vars.flag & Tracker_flags::AUTO_CORRELATION ? & vars.kf : & vars.kzf;
-    cv::Mat *input = vars.flag & Tracker_flags::TRACKER_INIT ? & vars.rot_labels : & vars.in_all;
+    Fft::forward(real_input, complex_result);
+    auto in = static_cast<cufftReal *>(const_cast<MatScales&>(real_input).deviceMem());
 
-    if(m_big_batch_mode && vars.in_all.rows == (int)(m_height*m_num_of_scales)){
-        CudaSafeCall(cudaMemcpy(data_f_all_scales, input->ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
-        CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
-                                complex_result->get_p_data()));
-    } else {
-        CudaSafeCall(cudaMemcpy(data_f, input->ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
-        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
-                                complex_result->get_p_data()));
-    }
-    return;
+    if (real_input.size[0] == 1)
+        cudaErrorCheck(cufftExecR2C(plan_f, in, complex_result.get_dev_data()));
+#ifdef BIG_BATCH
+    else
+        cudaErrorCheck(cufftExecR2C(plan_f_all_scales, in, complex_result.get_dev_data()));
+#endif
 }
 
-void cuFFT::forward_window(Scale_vars & vars)
+void cuFFT::forward_window(MatScaleFeats &feat, ComplexMat &complex_result, MatScaleFeats &temp)
 {
-    int n_channels = vars.patch_feats.size();
+    Fft::forward_window(feat, complex_result, temp);
 
-    ComplexMat *result = vars.flag & Tracker_flags::TRACKER_INIT ? vars.p_model_xf_ptr :
-                                                  vars.flag & Tracker_flags::TRACKER_UPDATE ? & vars.xf : & vars.zf;
+    cufftReal *temp_data = temp.deviceMem();
+    uint n_scales = feat.size[0];
 
-    if(n_channels > (int) m_num_of_feats){
-        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
-        for (int i = 0; i < n_channels; ++i) {
-            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
-            in_roi = vars.patch_feats[i].mul(m_window);
+    for (uint s = 0; s < n_scales; ++s) {
+        for (uint ch = 0; ch < uint(feat.size[1]); ++ch) {
+            cv::Mat feat_plane = feat.plane(s, ch);
+            cv::Mat temp_plane = temp.plane(s, ch);
+            temp_plane = feat_plane.mul(m_window);
         }
-
-        CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result->get_p_data()));
-    } else {
-        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
-        for (int i = 0; i < n_channels; ++i) {
-            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
-            in_roi = vars.patch_feats[i].mul(m_window);
-        }
-
-        CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result->get_p_data()));
     }
-    return;
+
+    if (n_scales == 1)
+        cudaErrorCheck(cufftExecR2C(plan_fw, temp_data, complex_result.get_dev_data()));
+#ifdef BIG_BATCH
+    else
+        cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_dev_data()));
+#endif
+    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
-void cuFFT::inverse(Scale_vars & vars)
+void cuFFT::inverse(ComplexMat &complex_input, MatScales &real_result)
 {
-    ComplexMat *input = vars.flag & Tracker_flags::RESPONSE ? & vars.kzf : &  vars.xyf;
-    cv::Mat *real_result = vars.flag & Tracker_flags::RESPONSE ? & vars.response : & vars.ifft2_res;
+    Fft::inverse(complex_input, real_result);
 
-    int n_channels = input->n_channels;
-    cufftComplex *in = reinterpret_cast<cufftComplex*>(input->get_p_data());
+    uint n_channels = complex_input.n_channels;
+    cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_dev_data());
+    cufftReal *out = real_result.deviceMem();
+    float alpha = 1.0 / (m_width * m_height);
 
-    if(n_channels == 1){
-
-        CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(vars.data_i_1ch_d)));
-        cudaDeviceSynchronize();
-        *real_result = *real_result/(m_width*m_height);
-        return;
-    }
+    if (n_channels == 1)
+        cudaErrorCheck(cufftExecC2R(plan_i_1ch, in, out));
 #ifdef BIG_BATCH
-    else if(n_channels == (int) m_num_of_scales){
-        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), vars.data_i_1ch_all_scales);
-
-        CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(vars.data_i_1ch_all_scales_d)));
-        cudaDeviceSynchronize();
-
-        return real_result/(m_width*m_height);
-    } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
-        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
-
-        CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(vars.data_i_features_all_scales_d)));
-        cudaDeviceSynchronize();
-
-        return real_result/(m_width*m_height);
-    }
+    else
+        cudaErrorCheck(cufftExecC2R(plan_i_all_scales, in, out));
 #endif
-
-    CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(vars.data_i_features_d)));
-
-    if (vars.cuda_gauss)
-        return;
-    else {
-        cudaDeviceSynchronize();
-        *real_result = *real_result/(m_width*m_height);
-    }
-    return;
+    // TODO: Investigate whether this scalling is needed or not
+    cudaErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));
+    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
 cuFFT::~cuFFT()
 {
-  CufftErrorCheck(cufftDestroy(plan_f));
-  CufftErrorCheck(cufftDestroy(plan_fw));
-  CufftErrorCheck(cufftDestroy(plan_i_1ch));
-  CufftErrorCheck(cufftDestroy(plan_i_features));
+    cudaErrorCheck(cublasDestroy(cublas));
 
-  CudaSafeCall(cudaFree(data_f));
-  CudaSafeCall(cudaFreeHost(data_fw));
-  
-  if (m_big_batch_mode) {
-      CufftErrorCheck(cufftDestroy(plan_f_all_scales));
-      CufftErrorCheck(cufftDestroy(plan_fw_all_scales));
-      CufftErrorCheck(cufftDestroy(plan_i_1ch_all_scales));
-      CufftErrorCheck(cufftDestroy(plan_i_features_all_scales));
-      
-      CudaSafeCall(cudaFree(data_f_all_scales));
-      CudaSafeCall(cudaFreeHost(data_fw_all_scales));
-      CudaSafeCall(cudaFreeHost(data_i_1ch_all_scales));
-      CudaSafeCall(cudaFreeHost(data_i_features_all_scales));
-  }
+    cudaErrorCheck(cufftDestroy(plan_f));
+    cudaErrorCheck(cufftDestroy(plan_fw));
+    cudaErrorCheck(cufftDestroy(plan_i_1ch));
+
+#ifdef BIG_BATCH
+    cudaErrorCheck(cufftDestroy(plan_f_all_scales));
+    cudaErrorCheck(cufftDestroy(plan_fw_all_scales));
+    cudaErrorCheck(cufftDestroy(plan_i_all_scales));
+#endif
 }