src/fft_cufft.cpp

   1 #include "fft_cufft.h"
   2
   3 cuFFT::cuFFT()
   4 {
   5     CudaSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
   6     cudaErrorCheck(cublasCreate(&cublas));
   7     cudaErrorCheck(cublasSetStream(cublas, cudaStreamPerThread));
   8 }
   9
  10 cufftHandle cuFFT::create_plan_fwd(uint howmany) const
  11 {
  12     int rank = 2;
  13     int n[] = {(int)m_height, (int)m_width};
  14     int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
  15     int istride = 1, ostride = 1;
  16     int *inembed = n, onembed[] = {(int)m_height, (int)m_width / 2 + 1};
  17
  18     cufftHandle plan;
  19     cudaErrorCheck(cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, howmany));
  20     cudaErrorCheck(cufftSetStream(plan, cudaStreamPerThread));
  21     return plan;
  22 }
  23
  24 cufftHandle cuFFT::create_plan_inv(uint howmany) const
  25 {
  26     int rank = 2;
  27     int n[] = {(int)m_height, (int)m_width};
  28     int idist = m_height * (m_width / 2 + 1), odist = m_height * m_width;
  29     int istride = 1, ostride = 1;
  30     int inembed[] = {(int)m_height, (int)m_width / 2 + 1}, *onembed = n;
  31
  32     cufftHandle plan;
  33     cudaErrorCheck(cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2R, howmany));
  34     cudaErrorCheck(cufftSetStream(plan, cudaStreamPerThread));
  35     return plan;
  36 }
  37
  38
  39 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
  40 {
  41     Fft::init(width, height, num_of_feats, num_of_scales);
  42
  43     std::cout << "FFT: cuFFT" << std::endl;
  44
  45     plan_f = create_plan_fwd(1);
  46     plan_fw = create_plan_fwd(m_num_of_feats);
  47     plan_i_1ch = create_plan_inv(1);
  48
  49 #ifdef BIG_BATCH
  50     plan_f_all_scales = create_plan_fwd(m_num_of_scales);
  51     plan_fw_all_scales = create_plan_fwd(m_num_of_scales * m_num_of_feats);
  52     plan_i_all_scales = create_plan_inv(m_num_of_scales);
  53 #endif
  54 }
  55
  56 void cuFFT::set_window(const MatDynMem &window)
  57 {
  58     Fft::set_window(window);
  59     m_window = window;
  60 }
  61
  62 void cuFFT::forward(const MatScales &real_input, ComplexMat &complex_result)
  63 {
  64     Fft::forward(real_input, complex_result);
  65     auto in = static_cast<cufftReal *>(const_cast<MatScales&>(real_input).deviceMem());
  66
  67     if (real_input.size[0] == 1)
  68         cudaErrorCheck(cufftExecR2C(plan_f, in, complex_result.get_dev_data()));
  69 #ifdef BIG_BATCH
  70     else
  71         cudaErrorCheck(cufftExecR2C(plan_f_all_scales, in, complex_result.get_dev_data()));
  72 #endif
  73 }
  74
  75 void cuFFT::forward_window(MatScaleFeats &feat, ComplexMat &complex_result, MatScaleFeats &temp)
  76 {
  77     Fft::forward_window(feat, complex_result, temp);
  78
  79     cufftReal *temp_data = temp.deviceMem();
  80     uint n_scales = feat.size[0];
  81
  82     for (uint s = 0; s < n_scales; ++s) {
  83         for (uint ch = 0; ch < uint(feat.size[1]); ++ch) {
  84             cv::Mat feat_plane = feat.plane(s, ch);
  85             cv::Mat temp_plane = temp.plane(s, ch);
  86             temp_plane = feat_plane.mul(m_window);
  87         }
  88     }
  89
  90     if (n_scales == 1)
  91         cudaErrorCheck(cufftExecR2C(plan_fw, temp_data, complex_result.get_dev_data()));
  92 #ifdef BIG_BATCH
  93     else
  94         cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_dev_data()));
  95 #endif
  96 }
  97
  98 void cuFFT::inverse(ComplexMat &complex_input, MatScales &real_result)
  99 {
 100     Fft::inverse(complex_input, real_result);
 101
 102     uint n_channels = complex_input.n_channels;
 103     cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_dev_data());
 104     cufftReal *out = real_result.deviceMem();
 105     float alpha = 1.0 / (m_width * m_height);
 106
 107     if (n_channels == 1)
 108         cudaErrorCheck(cufftExecC2R(plan_i_1ch, in, out));
 109 #ifdef BIG_BATCH
 110     else
 111         cudaErrorCheck(cufftExecC2R(plan_i_all_scales, in, out));
 112 #endif
 113     cudaErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));
 114     // The result is a cv::Mat, which will be accesses by CPU, so we
 115     // must synchronize with the GPU here
 116     CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
 117 }
 118
 119 cuFFT::~cuFFT()
 120 {
 121     cudaErrorCheck(cublasDestroy(cublas));
 122
 123     cudaErrorCheck(cufftDestroy(plan_f));
 124     cudaErrorCheck(cufftDestroy(plan_fw));
 125     cudaErrorCheck(cufftDestroy(plan_i_1ch));
 126
 127 #ifdef BIG_BATCH
 128     cudaErrorCheck(cufftDestroy(plan_f_all_scales));
 129     cudaErrorCheck(cufftDestroy(plan_fw_all_scales));
 130     cudaErrorCheck(cufftDestroy(plan_i_all_scales));
 131 #endif
 132 }