// Translate a cufftResult status code into its enumerator name.
//
// Used by CHECK_CUFFT_ERRORS to produce readable diagnostics. Returns a
// pointer to a static string literal, so the result is always valid to keep.
// A default branch is provided so the function returns a defined value even
// for status codes added in newer cuFFT releases (the visible original had
// no default return path).
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error) {
    case CUFFT_SUCCESS:
        return "CUFFT_SUCCESS";
    case CUFFT_INVALID_PLAN:
        return "CUFFT_INVALID_PLAN";
    case CUFFT_ALLOC_FAILED:
        return "CUFFT_ALLOC_FAILED";
    case CUFFT_INVALID_TYPE:
        return "CUFFT_INVALID_TYPE";
    case CUFFT_INVALID_VALUE:
        return "CUFFT_INVALID_VALUE";
    case CUFFT_INTERNAL_ERROR:
        return "CUFFT_INTERNAL_ERROR";
    case CUFFT_EXEC_FAILED:
        return "CUFFT_EXEC_FAILED";
    case CUFFT_SETUP_FAILED:
        return "CUFFT_SETUP_FAILED";
    case CUFFT_INVALID_SIZE:
        return "CUFFT_INVALID_SIZE";
    case CUFFT_UNALIGNED_DATA:
        return "CUFFT_UNALIGNED_DATA";
    case CUFFT_INVALID_DEVICE:
        return "CUFFT_INVALID_DEVICE";
    case CUFFT_PARSE_ERROR:
        return "CUFFT_PARSE_ERROR";
    case CUFFT_NO_WORKSPACE:
        return "CUFFT_NO_WORKSPACE";
    case CUFFT_NOT_IMPLEMENTED:
        return "CUFFT_NOT_IMPLEMENTED";
    case CUFFT_LICENSE_ERROR:
        return "CUFFT_LICENSE_ERROR";
    case CUFFT_NOT_SUPPORTED:
        return "CUFFT_NOT_SUPPORTED";
    case CUFFT_INCOMPLETE_PARAMETER_LIST:
        return "CUFFT_INCOMPLETE_PARAMETER_LIST";
    default:
        return "<unknown cufftResult>";
    }
}
// Evaluate a cuFFT call exactly once; on failure print the numeric status,
// its enumerator name and the call site, then terminate.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// is safe inside unbraced if/else at call sites (the original brace-block
// form has the classic dangling-else hazard). Uses std::exit(EXIT_FAILURE)
// consistent with the fatal-error path in cuFFT::init.
#define CHECK_CUFFT_ERRORS(call)                                              \
    do {                                                                      \
        cufftResult err = (call);                                             \
        if (err != CUFFT_SUCCESS) {                                           \
            fprintf(stderr, "cuFFT error %d:%s at %s:%d\n", err,              \
                    _cudaGetErrorEnum(err), __FILE__, __LINE__);              \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)
// Construct the cuFFT wrapper with a fixed pool of 4 CUDA streams.
// (The constructor body is on lines not visible in this chunk.)
74 cuFFT::cuFFT(): m_num_of_streams(4)
// Initialise the cuFFT backend: record geometry/mode, allocate device and
// mapped (zero-copy, page-locked) host buffers, and build every cuFFT plan
// used by forward(), forward_window() and inverse().
// NOTE(review): several original lines are missing from this chunk (e.g. the
// m_width/m_height assignments and the braces scoping each plan-setup
// section), so the code below is not self-contained as shown.
77 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
81 m_num_of_feats = num_of_feats;
82 m_num_of_scales = num_of_scales;
83 m_big_batch_mode = big_batch_mode;
85 std::cout << "FFT: cuFFT" << std::endl;
// The CUDA kernels consuming the forward-FFT output cannot handle a
// frequency-domain area larger than 1024 elements; fail fast here.
87 if(m_height*(m_width/2+1) > 1024){
88 std::cerr << "Image dimension after forward FFT are too big for CUDA kernels." << std::endl;
89 std::exit(EXIT_FAILURE);
// NOTE(review): cudaStreamCreate return codes are not checked.
92 for (unsigned i = 0; i < m_num_of_streams; i++) cudaStreamCreate(&streams[i]);
94 //FFT forward one scale
// Plain device buffer for the single-scale real input image.
// NOTE(review): cudaMalloc return codes are unchecked throughout init().
96 cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal));
// NOTE(review): unlike every cufftPlanMany call below, this cufftPlan2d is
// not wrapped in CHECK_CUFFT_ERRORS — inconsistent error handling.
98 cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C);
102 //FFT forward all scales
103 if(m_num_of_scales > 1 && m_big_batch_mode)
105 cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal));
// Batched R2C plan: one m_height x m_width transform per scale. Real inputs
// are packed back to back (idist = h*w); each complex output occupies
// h*(w/2+1) elements (Hermitian symmetry halves the last axis).
108 int n[] = {(int)m_height, (int)m_width};
109 int howmany = m_num_of_scales;
110 int idist = m_height*m_width, odist = m_height*(m_width/2+1);
111 int istride = 1, ostride = 1;
112 int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
114 CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_f_all_scales, rank, n,
115 inembed, istride, idist,
116 onembed, ostride, odist,
117 CUFFT_R2C, howmany));
119 //FFT forward window one scale
// Mapped pinned host buffer: the host writes windowed features here and
// cuFFT reads the same memory through the aliased device pointer (zero-copy).
121 cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
122 cudaHostGetDevicePointer(&data_fw_d, data_fw, 0);
// One transform per feature channel, same contiguous packing as above.
125 int n[] = {(int)m_height, (int)m_width};
126 int howmany = m_num_of_feats;
127 int idist = m_height*m_width, odist = m_height*(m_width/2+1);
128 int istride = 1, ostride = 1;
129 int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
131 CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw, rank, n,
132 inembed, istride, idist,
133 onembed, ostride, odist,
134 CUFFT_R2C, howmany));
136 //FFT forward window all scales all feats
137 if(m_num_of_scales > 1 && m_big_batch_mode)
139 cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
140 cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0);
// Batched over scales * feature channels.
143 int n[] = {(int)m_height, (int)m_width};
144 int howmany = m_num_of_scales*m_num_of_feats;
145 int idist = m_height*m_width, odist = m_height*(m_width/2+1);
146 int istride = 1, ostride = 1;
147 int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
149 CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw_all_scales, rank, n,
150 inembed, istride, idist,
151 onembed, ostride, odist,
152 CUFFT_R2C, howmany));
156 //FFT inverse one scale
158 cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
159 cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0);
// Inverse C2R plan: odist = 1 with ostride = m_num_of_feats interleaves the
// per-channel outputs pixel by pixel, matching the CV_32FC(n) cv::Mat that
// inverse() wraps around this buffer.
162 int n[] = {(int)m_height, (int)m_width};
163 int howmany = m_num_of_feats;
164 int idist = m_height*(m_width/2+1), odist = 1;
165 int istride = 1, ostride = m_num_of_feats;
166 int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
168 CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_features, rank, n,
169 inembed, istride, idist,
170 onembed, ostride, odist,
171 CUFFT_C2R, howmany));
173 //FFT inverse all scales
174 if(m_num_of_scales > 1)
176 cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
177 cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0);
// Same interleaved-output layout, batched over feats * scales.
180 int n[] = {(int)m_height, (int)m_width};
181 int howmany = m_num_of_feats*m_num_of_scales;
182 int idist = m_height*(m_width/2+1), odist = 1;
183 int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
184 int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
186 CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_features_all_scales, rank, n,
187 inembed, istride, idist,
188 onembed, ostride, odist,
189 CUFFT_C2R, howmany));
191 //FFT inverse one channel one scale
193 cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped);
194 cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0);
// NOTE(review): the declaration of `howmany` (presumably = 1) for this plan
// is on a line missing from this chunk — confirm in the full file.
197 int n[] = {(int)m_height, (int)m_width};
199 int idist = m_height*(m_width/2+1), odist = 1;
200 int istride = 1, ostride = 1;
201 int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
203 CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_1ch, rank, n,
204 inembed, istride, idist,
205 onembed, ostride, odist,
206 CUFFT_C2R, howmany));
208 //FFT inverse one channel all scales
209 if(m_num_of_scales > 1 && m_big_batch_mode)
211 cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
212 cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0);
// One channel per scale, outputs interleaved across scales (ostride).
215 int n[] = {(int)m_height, (int)m_width};
216 int howmany = m_num_of_scales;
217 int idist = m_height*(m_width/2+1), odist = 1;
218 int istride = 1, ostride = m_num_of_scales;
219 int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
221 CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
222 inembed, istride, idist,
223 onembed, ostride, odist,
224 CUFFT_C2R, howmany));
// Store the window matrix that forward_window() multiplies element-wise
// with each input feature channel (m_window). Body is on lines not visible
// in this chunk.
228 void cuFFT::set_window(const cv::Mat &window)
// Forward 2-D real-to-complex FFT of `input`.
// In big-batch mode, an input stacking all scales vertically
// (rows == m_height*m_num_of_scales) runs the batched all-scales plan;
// otherwise a single-scale transform is performed.
// NOTE(review): cudaMemcpy return codes are unchecked; the else branch and
// closing braces fall on lines missing from this chunk.
233 ComplexMat cuFFT::forward(const cv::Mat &input)
235 ComplexMat complex_result;
236 if(m_big_batch_mode && input.rows == (int)(m_height*m_num_of_scales)){
237 cudaMemcpy(data_f_all_scales, input.ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
238 complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
239 CHECK_CUFFT_ERRORS(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
240 complex_result.get_p_data()));
// Single-scale path: copy one image and run the 2-D plan.
242 cudaMemcpy(data_f, input.ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
243 complex_result.create(m_height, m_width/ 2 + 1, 1);
244 CHECK_CUFFT_ERRORS(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
245 complex_result.get_p_data()));
248 return complex_result;
// Multiply each input channel by the stored window and run the forward R2C
// FFT over all channels in one batched call.
// Channels are written (windowed) into the mapped pinned host buffer via
// cv::Mat ROIs; cuFFT then reads the same memory through the device-side
// alias obtained from cudaHostGetDevicePointer in init() (zero-copy).
// NOTE(review): the declaration of `result` and the final `return result;`
// are on lines missing from this chunk.
251 ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
253 int n_channels = input.size();
// More channels than one scale's feature count => all-scales batch path.
255 if(n_channels > (int) m_num_of_feats){
256 cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
257 for (int i = 0; i < n_channels; ++i) {
// ROI view of channel i's rows inside the packed buffer.
258 cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
259 in_roi = input[i].mul(m_window);
262 result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
264 CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
// Single-scale path: same scheme with the per-scale buffer and plan.
266 cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
267 for (int i = 0; i < n_channels; ++i) {
268 cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
269 in_roi = input[i].mul(m_window);
272 result.create(m_height, m_width/2 + 1, n_channels);
274 CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
// Inverse 2-D complex-to-real FFT, dispatched on the channel count to the
// matching plan/buffer pair (1 channel, all scales, feats*scales, or feats).
// Each path synchronises with cudaDeviceSynchronize() so the mapped host
// buffer is fully written before being read via the cv::Mat wrapper, and
// divides by m_width*m_height because cuFFT transforms are unnormalised.
// NOTE(review): the branch braces for the 1-channel case fall on lines
// missing from this chunk.
279 cv::Mat cuFFT::inverse(const ComplexMat &inputf)
281 int n_channels = inputf.n_channels;
282 cufftComplex *in = reinterpret_cast<cufftComplex*>(inputf.get_p_data());
// Single channel, single scale.
285 cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
287 CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
288 cudaDeviceSynchronize();
290 return real_result/(m_width*m_height);
291 } else if(n_channels == (int) m_num_of_scales){
// One channel per scale; outputs interleaved per pixel (see init()).
292 cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
294 CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
295 cudaDeviceSynchronize();
297 return real_result/(m_width*m_height);
298 } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
// All feature channels for all scales at once.
299 cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
301 CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
302 cudaDeviceSynchronize();
304 return real_result/(m_width*m_height);
// Fallback: one scale's worth of feature channels.
307 cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
309 CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
310 cudaDeviceSynchronize();
312 return real_result/(m_width*m_height);
// cuFFT teardown (the destructor's header line is on a line missing from
// this chunk): destroy the stream pool, destroy all FFT plans, then release
// device and pinned host buffers.
// NOTE(review): cudaFree(data_f) is not among the visible lines — verify the
// single-scale forward buffer is freed in the full file.
// NOTE(review): the *_all_scales plans/buffers are torn down unconditionally
// although init() creates them only for multi-scale / big-batch runs —
// verify the handles are zero-initialised or guarded in the full file.
318 for(unsigned i = 0; i < m_num_of_streams; i++) cudaStreamDestroy(streams[i]);
320 cufftDestroy(plan_f);
321 cufftDestroy(plan_f_all_scales);
322 cufftDestroy(plan_fw);
323 cufftDestroy(plan_fw_all_scales);
324 cufftDestroy(plan_i_1ch);
325 cufftDestroy(plan_i_1ch_all_scales);
326 cufftDestroy(plan_i_features);
327 cufftDestroy(plan_i_features_all_scales);
330 cudaFree(data_f_all_scales);
// Pinned (cudaHostAlloc) buffers must be released with cudaFreeHost.
331 cudaFreeHost(data_fw);
332 cudaFreeHost(data_fw_all_scales);
333 cudaFreeHost(data_i_1ch);
334 cudaFreeHost(data_i_1ch_all_scales);
335 cudaFreeHost(data_i_features);
336 cudaFreeHost(data_i_features_all_scales);