]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/blob - src/fft_cufft.cpp
Merge branch 'master' of github.com:Shanigen/kcf
[hercules2020/kcf.git] / src / fft_cufft.cpp
1 #include "fft_cufft.h"
2
#ifdef _CUFFT_H_
// cuFFT API errors
// Map a cufftResult status code to its enumerator name. The stringification
// macro guarantees the returned text always matches the enumerator exactly.
static const char *_cudaGetErrorEnum(cufftResult error)
{
#define KCF_CUFFT_ERR_CASE(code) \
    case code:                   \
        return #code;

    switch (error)
    {
        KCF_CUFFT_ERR_CASE(CUFFT_SUCCESS)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_PLAN)
        KCF_CUFFT_ERR_CASE(CUFFT_ALLOC_FAILED)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_TYPE)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_VALUE)
        KCF_CUFFT_ERR_CASE(CUFFT_INTERNAL_ERROR)
        KCF_CUFFT_ERR_CASE(CUFFT_EXEC_FAILED)
        KCF_CUFFT_ERR_CASE(CUFFT_SETUP_FAILED)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_SIZE)
        KCF_CUFFT_ERR_CASE(CUFFT_UNALIGNED_DATA)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_DEVICE)
        KCF_CUFFT_ERR_CASE(CUFFT_PARSE_ERROR)
        KCF_CUFFT_ERR_CASE(CUFFT_NO_WORKSPACE)
        KCF_CUFFT_ERR_CASE(CUFFT_NOT_IMPLEMENTED)
        KCF_CUFFT_ERR_CASE(CUFFT_LICENSE_ERROR)
        KCF_CUFFT_ERR_CASE(CUFFT_NOT_SUPPORTED)
        KCF_CUFFT_ERR_CASE(CUFFT_INCOMPLETE_PARAMETER_LIST)
    }

#undef KCF_CUFFT_ERR_CASE

    // Unlisted / future status codes fall through to a generic marker.
    return "<unknown>";
}
#endif
64
/**
 * Check the result of a cuFFT API call: on failure, print the numeric and
 * symbolic error code plus the call site, then terminate the process.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement and
 * is safe in unbraced if/else bodies (the original bare { } block would break
 * an `if (x) CHECK_CUFFT_ERRORS(call); else ...` with a dangling semicolon).
 */
#define CHECK_CUFFT_ERRORS(call) do { \
    cufftResult_t err; \
    if ((err = (call)) != CUFFT_SUCCESS) { \
        fprintf(stderr, "cuFFT error %d:%s at %s:%d\n", err, _cudaGetErrorEnum(err), \
                __FILE__, __LINE__); \
        exit(1); \
    } \
} while (0)
73
// Construct the cuFFT backend; all allocation and plan creation is deferred
// to init(). m_num_of_streams fixes how many CUDA streams init() creates.
cuFFT::cuFFT(): m_num_of_streams(4)
{}
76
77 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
78 {
79     m_width = width;
80     m_height = height;
81     m_num_of_feats = num_of_feats;
82     m_num_of_scales = num_of_scales;
83     m_big_batch_mode = big_batch_mode;
84
85     std::cout << "FFT: cuFFT" << std::endl;
86     
87     if(m_height*(m_width/2+1) > 1024){
88         std::cerr << "Image dimension after forward FFT are too big for CUDA kernels." << std::endl;
89         std::exit(EXIT_FAILURE);
90     }
91
92     for (unsigned i = 0; i < m_num_of_streams; i++) cudaStreamCreate(&streams[i]);
93     
94     //FFT forward one scale
95     {
96         cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal));
97         
98         cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C);
99         
100         
101     }
102     //FFT forward all scales
103     if(m_num_of_scales > 1 && m_big_batch_mode)
104     {
105         cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal));
106         
107         int rank = 2;
108         int n[] = {(int)m_height, (int)m_width};
109         int howmany = m_num_of_scales;
110         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
111         int istride = 1, ostride = 1;
112         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
113
114         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_f_all_scales, rank, n,
115                       inembed, istride, idist,
116                       onembed, ostride, odist,
117                       CUFFT_R2C, howmany));
118     }
119     //FFT forward window one scale
120     {
121         cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
122         cudaHostGetDevicePointer(&data_fw_d, data_fw, 0);
123         
124         int rank = 2;
125         int n[] = {(int)m_height, (int)m_width};
126         int howmany = m_num_of_feats;
127         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
128         int istride = 1, ostride = 1;
129         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
130
131         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw, rank, n,
132                   inembed, istride, idist,
133                   onembed, ostride, odist,
134                   CUFFT_R2C, howmany));
135     }
136     //FFT forward window all scales all feats
137     if(m_num_of_scales > 1 && m_big_batch_mode)
138     {
139         cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
140         cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0);
141
142         int rank = 2;
143         int n[] = {(int)m_height, (int)m_width};
144         int howmany = m_num_of_scales*m_num_of_feats;
145         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
146         int istride = 1, ostride = 1;
147         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
148
149         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw_all_scales, rank, n,
150                   inembed, istride, idist,
151                   onembed, ostride, odist,
152                   CUFFT_R2C, howmany));
153         
154         
155     }
156     //FFT inverse one scale
157     {
158         cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
159         cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0);
160         
161         int rank = 2;
162         int n[] = {(int)m_height, (int)m_width};
163         int howmany = m_num_of_feats;
164         int idist = m_height*(m_width/2+1), odist = 1;
165         int istride = 1, ostride = m_num_of_feats;
166         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
167
168         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_features, rank, n,
169                   inembed, istride, idist,
170                   onembed, ostride, odist,
171                   CUFFT_C2R, howmany));
172     }
173     //FFT inverse all scales
174     if(m_num_of_scales > 1)
175     {
176         cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
177         cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0);
178         
179         int rank = 2;
180         int n[] = {(int)m_height, (int)m_width};
181         int howmany = m_num_of_feats*m_num_of_scales;
182         int idist = m_height*(m_width/2+1), odist = 1;
183         int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
184         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
185
186         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_features_all_scales, rank, n,
187                   inembed, istride, idist,
188                   onembed, ostride, odist,
189                   CUFFT_C2R, howmany));
190     }
191     //FFT inverse one channel one scale
192     {
193         cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped);
194         cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0);
195         
196         int rank = 2;
197         int n[] = {(int)m_height, (int)m_width};
198         int howmany = 1;
199         int idist = m_height*(m_width/2+1), odist = 1;
200         int istride = 1, ostride = 1;
201         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
202
203         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_1ch, rank, n,
204                   inembed, istride, idist,
205                   onembed, ostride, odist,
206                   CUFFT_C2R, howmany));
207     }
208     //FFT inverse one channel all scales
209     if(m_num_of_scales > 1 && m_big_batch_mode)
210     {
211         cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
212         cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0);
213         
214         int rank = 2;
215         int n[] = {(int)m_height, (int)m_width};
216         int howmany = m_num_of_scales;
217         int idist = m_height*(m_width/2+1), odist = 1;
218         int istride = 1, ostride = m_num_of_scales;
219         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
220
221         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
222                   inembed, istride, idist,
223                   onembed, ostride, odist,
224                   CUFFT_C2R, howmany));
225     }
226 }
227
// Store the window matrix; forward_window() multiplies it element-wise with
// every input channel before the forward transform.
void cuFFT::set_window(const cv::Mat &window)
{
     m_window = window;
}
232
233 ComplexMat cuFFT::forward(const cv::Mat &input)
234 {
235     ComplexMat complex_result;
236     if(m_big_batch_mode && input.rows == (int)(m_height*m_num_of_scales)){
237         cudaMemcpy(data_f_all_scales, input.ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
238         complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
239         CHECK_CUFFT_ERRORS(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
240                                 complex_result.get_p_data()));
241     } else {
242         cudaMemcpy(data_f, input.ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
243         complex_result.create(m_height, m_width/ 2 + 1, 1);
244         CHECK_CUFFT_ERRORS(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
245                                 complex_result.get_p_data()));
246     }
247     
248     return complex_result;
249 }
250
251 ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
252 {
253     int n_channels = input.size();
254     ComplexMat result;
255     if(n_channels > (int) m_num_of_feats){
256         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
257         for (int i = 0; i < n_channels; ++i) {
258             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
259             in_roi = input[i].mul(m_window);
260         }
261         
262         result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
263         
264         CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
265     } else {
266         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
267         for (int i = 0; i < n_channels; ++i) {
268             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
269             in_roi = input[i].mul(m_window);
270         }
271         
272         result.create(m_height, m_width/2 + 1, n_channels);
273         
274         CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
275     }
276     return result;
277 }
278
279 cv::Mat cuFFT::inverse(const ComplexMat &inputf)
280 {
281     int n_channels = inputf.n_channels;
282     cufftComplex *in = reinterpret_cast<cufftComplex*>(inputf.get_p_data());
283     
284     if(n_channels == 1){
285         cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
286         
287         CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
288         cudaDeviceSynchronize();
289         
290         return real_result/(m_width*m_height);
291     } else if(n_channels == (int) m_num_of_scales){
292         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
293         
294         CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
295         cudaDeviceSynchronize();
296         
297         return real_result/(m_width*m_height);
298     } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
299         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
300         
301         CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
302         cudaDeviceSynchronize();
303         
304         return real_result/(m_width*m_height);
305     }
306     
307     cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
308     
309     CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
310     cudaDeviceSynchronize();
311     
312     return real_result/(m_width*m_height);
313 }
314
// Tear down all CUDA resources: streams, then cuFFT plans, then buffers,
// and finally reset the device. cudaDeviceReset() must stay last.
cuFFT::~cuFFT()
{

  for(unsigned i = 0; i < m_num_of_streams; i++) cudaStreamDestroy(streams[i]);
  
  // NOTE(review): the *_all_scales plans are destroyed unconditionally even
  // though init() only creates them when big-batch mode is on and
  // m_num_of_scales > 1 -- cufftDestroy then runs on an uninitialized
  // handle; verify this is benign.
  cufftDestroy(plan_f);
  cufftDestroy(plan_f_all_scales);
  cufftDestroy(plan_fw);
  cufftDestroy(plan_fw_all_scales);
  cufftDestroy(plan_i_1ch);
  cufftDestroy(plan_i_1ch_all_scales);
  cufftDestroy(plan_i_features);
  cufftDestroy(plan_i_features_all_scales);
  
  // data_f / data_f_all_scales came from cudaMalloc; every other buffer came
  // from cudaHostAlloc (mapped host memory), hence cudaFree vs cudaFreeHost.
  cudaFree(data_f);
  cudaFree(data_f_all_scales);
  cudaFreeHost(data_fw);
  cudaFreeHost(data_fw_all_scales);
  cudaFreeHost(data_i_1ch);
  cudaFreeHost(data_i_1ch_all_scales);
  cudaFreeHost(data_i_features);
  cudaFreeHost(data_i_features_all_scales);
  
  cudaDeviceReset();
}