#include "fft_cufft.h"

void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
{
    m_width = width;
    m_height = height;
    m_num_of_feats = num_of_feats;
    m_num_of_scales = num_of_scales;
    m_big_batch_mode = big_batch_mode;

    std::cout << "FFT: cuFFT" << std::endl;

    // The custom CUDA kernels appear to use one thread per frequency-domain
    // element within a single block, so the transformed plane may not exceed
    // 1024 elements (the maximum number of threads per block).
    if(m_height*(m_width/2+1) > 1024){
        std::cerr << "Image dimensions after forward FFT are too big for CUDA kernels." << std::endl;
        std::exit(EXIT_FAILURE);
    }

    //FFT forward one scale
    {
        CudaSafeCall(cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal)));

        CufftErrorCheck(cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C));
    }
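
    // The batched plans below are created with cufftPlanMany(). For every plan:
    //   n               - size of one 2D transform, {height, width}
    //   howmany         - number of transforms executed in one batch
    //   idist/odist     - distance (in elements) between consecutive input/output signals
    //   istride/ostride - stride between successive elements of one signal
    //   inembed/onembed - storage dimensions of the input/output arrays
    // An R2C transform produces width/2+1 complex columns per row.
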
    //FFT forward all scales
    if(m_num_of_scales > 1 && m_big_batch_mode)
    {
        CudaSafeCall(cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal)));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_scales;
        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
        int istride = 1, ostride = 1;
        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};

        CufftErrorCheck(cufftPlanMany(&plan_f_all_scales, rank, n,
                      inembed, istride, idist,
                      onembed, ostride, odist,
                      CUFFT_R2C, howmany));
    }
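
    // The remaining buffers are allocated with cudaHostAllocMapped and accessed
    // on the device through cudaHostGetDevicePointer(): the host writes (or reads)
    // the data via a cv::Mat wrapper while cuFFT uses the mapped (zero-copy)
    // device pointer, so no explicit cudaMemcpy is needed for these paths.
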
    //FFT forward window one scale
    {
        CudaSafeCall(cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_fw_d, data_fw, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_feats;
        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
        int istride = 1, ostride = 1;
        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};

        CufftErrorCheck(cufftPlanMany(&plan_fw, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_R2C, howmany));
    }
    //FFT forward window all scales all feats
    if(m_num_of_scales > 1 && m_big_batch_mode)
    {
        CudaSafeCall(cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_scales*m_num_of_feats;
        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
        int istride = 1, ostride = 1;
        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};

        CufftErrorCheck(cufftPlanMany(&plan_fw_all_scales, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_R2C, howmany));
    }
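
    // The inverse (C2R) plans below write their outputs interleaved
    // (odist = 1, ostride = number of transforms), so the result buffer can be
    // wrapped directly as a multi-channel cv::Mat (CV_32FC(n)) in inverse().
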
    //FFT inverse one scale
    {
        CudaSafeCall(cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_feats;
        int idist = m_height*(m_width/2+1), odist = 1;
        int istride = 1, ostride = m_num_of_feats;
        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;

        CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_C2R, howmany));
    }
    //FFT inverse all scales
    if(m_num_of_scales > 1)
    {
        CudaSafeCall(cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_feats*m_num_of_scales;
        int idist = m_height*(m_width/2+1), odist = 1;
        int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;

        CufftErrorCheck(cufftPlanMany(&plan_i_features_all_scales, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_C2R, howmany));
    }
    //FFT inverse one channel one scale
    {
        CudaSafeCall(cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = 1;
        int idist = m_height*(m_width/2+1), odist = 1;
        int istride = 1, ostride = 1;
        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;

        CufftErrorCheck(cufftPlanMany(&plan_i_1ch, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_C2R, howmany));
    }
    //FFT inverse one channel all scales
    if(m_num_of_scales > 1 && m_big_batch_mode)
    {
        CudaSafeCall(cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_scales;
        int idist = m_height*(m_width/2+1), odist = 1;
        int istride = 1, ostride = m_num_of_scales;
        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;

        CufftErrorCheck(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_C2R, howmany));
    }
}
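
// Rough usage sketch (the caller lives elsewhere in the tracker; the names
// below are illustrative only):
//   cuFFT fft;
//   fft.init(width, height, num_feats, num_scales, big_batch);
//   fft.set_window(cos_window);                   // window of one scale's size, e.g. a cosine window
//   ComplexMat xf = fft.forward_window(features); // windowed, per-channel R2C FFT
//   cv::Mat resp  = fft.inverse(response_freq);   // C2R FFT, normalized by 1/(width*height)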

void cuFFT::set_window(const cv::Mat &window)
{
    m_window = window;
}

ComplexMat cuFFT::forward(const cv::Mat &input)
{
    ComplexMat complex_result;
    if(m_big_batch_mode && input.rows == (int)(m_height*m_num_of_scales)){
        CudaSafeCall(cudaMemcpy(data_f_all_scales, input.ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
        complex_result.create(m_height, m_width/2 + 1, m_num_of_scales);
        CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
                                complex_result.get_p_data()));
    } else {
        CudaSafeCall(cudaMemcpy(data_f, input.ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
        complex_result.create(m_height, m_width/2 + 1, 1);
        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
                                complex_result.get_p_data()));
    }

    return complex_result;
}

ComplexMat cuFFT::forward_raw(float *input)
{
    // Placeholder: the fully on-GPU forward path is not implemented yet
    // (added in preparation for full CUDA support).
    (void)input;
    ComplexMat dummy;
    return dummy;
}

ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
{
    int n_channels = input.size();
    ComplexMat result;
    if(n_channels > (int) m_num_of_feats){
        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
        for (int i = 0; i < n_channels; ++i) {
            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
            in_roi = input[i].mul(m_window);
        }

        result.create(m_height, m_width/2 + 1, n_channels, m_num_of_scales);

        CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
    } else {
        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
        for (int i = 0; i < n_channels; ++i) {
            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
            in_roi = input[i].mul(m_window);
        }

        result.create(m_height, m_width/2 + 1, n_channels);

        CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
    }
    return result;
}

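// cuFFT transforms are unnormalized, so each inverse below divides the C2R
// result by width*height to restore the original scale.
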
cv::Mat cuFFT::inverse(const ComplexMat &input)
{
    int n_channels = input.n_channels;
    cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());

    if(n_channels == 1){
        cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);

        CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
        cudaDeviceSynchronize();

        return real_result/(m_width*m_height);
    } else if(n_channels == (int) m_num_of_scales){
        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);

        CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
        cudaDeviceSynchronize();

        return real_result/(m_width*m_height);
    } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);

        CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
        cudaDeviceSynchronize();

        return real_result/(m_width*m_height);
    }

    cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);

    CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
    cudaDeviceSynchronize();

    return real_result/(m_width*m_height);
}

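// Unlike inverse(), inverse_raw() returns the mapped buffer directly: it neither
// synchronizes with the device nor normalizes the result, so the caller is
// responsible for both.
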
float* cuFFT::inverse_raw(const ComplexMat &input)
{
    cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());

    CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));

    return data_i_features_all_scales;
}

cuFFT::~cuFFT()
{
    CufftErrorCheck(cufftDestroy(plan_f));
    CufftErrorCheck(cufftDestroy(plan_f_all_scales));
    CufftErrorCheck(cufftDestroy(plan_fw));
    CufftErrorCheck(cufftDestroy(plan_fw_all_scales));
    CufftErrorCheck(cufftDestroy(plan_i_1ch));
    CufftErrorCheck(cufftDestroy(plan_i_1ch_all_scales));
    CufftErrorCheck(cufftDestroy(plan_i_features));
    CufftErrorCheck(cufftDestroy(plan_i_features_all_scales));

    CudaSafeCall(cudaFree(data_f));
    CudaSafeCall(cudaFree(data_f_all_scales));
    CudaSafeCall(cudaFreeHost(data_fw));
    CudaSafeCall(cudaFreeHost(data_fw_all_scales));
    CudaSafeCall(cudaFreeHost(data_i_1ch));
    CudaSafeCall(cudaFreeHost(data_i_1ch_all_scales));
    CudaSafeCall(cudaFreeHost(data_i_features));
    CudaSafeCall(cudaFreeHost(data_i_features_all_scales));
}