// Initialize every cuFFT plan and staging buffer the tracker needs:
// forward R2C (plain and window-multiplied) and inverse C2R transforms,
// each in a single-scale flavor and — when big-batch mode is enabled —
// an all-scales batched flavor built with cufftPlanMany.
// Host-visible buffers use pinned, mapped (zero-copy) allocations so the
// device can read/write them directly via their device aliases (*_d).
3 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
7 m_num_of_feats = num_of_feats;
8 m_num_of_scales = num_of_scales;
9 m_big_batch_mode = big_batch_mode;
11 std::cout << "FFT: cuFFT" << std::endl;
// The half-spectrum after a forward R2C transform holds
// m_height*(m_width/2+1) complex elements.
// NOTE(review): the 1024 cap presumably matches the custom CUDA kernels'
// launch configuration (one element per thread of a single block, 1024 =
// max threads per block) — confirm against the kernel sources.
13 if(m_height*(m_width/2+1) > 1024){
14 std::cerr << "Image dimension after forward FFT are too big for CUDA kernels." << std::endl;
15 std::exit(EXIT_FAILURE);
18 //FFT forward one scale
// Device-only input buffer plus a plain 2D R2C plan for one real image.
20 CudaSafeCall(cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal)));
22 CufftErrorCheck(cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C));
26 //FFT forward all scales
27 if(m_num_of_scales > 1 && m_big_batch_mode)
29 CudaSafeCall(cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal)));
// Batched 2D R2C: one m_height x m_width transform per scale.  Inputs are
// stacked contiguously (idist = full real image), outputs likewise
// (odist = half-spectrum size); unit strides keep each image dense.
32 int n[] = {(int)m_height, (int)m_width};
33 int howmany = m_num_of_scales;
34 int idist = m_height*m_width, odist = m_height*(m_width/2+1);
35 int istride = 1, ostride = 1;
36 int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
38 CufftErrorCheck(cufftPlanMany(&plan_f_all_scales, rank, n,
39 inembed, istride, idist,
40 onembed, ostride, odist,
43 //FFT forward window one scale
// Pinned + mapped (zero-copy) host buffer: the host writes the windowed
// features into data_fw, the plan reads them through the device alias
// data_fw_d without an explicit memcpy.
45 CudaSafeCall(cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
46 CudaSafeCall(cudaHostGetDevicePointer(&data_fw_d, data_fw, 0));
// Batched R2C over the feature channels of a single scale; same dense
// layout as the all-scales forward plan above.
49 int n[] = {(int)m_height, (int)m_width};
50 int howmany = m_num_of_feats;
51 int idist = m_height*m_width, odist = m_height*(m_width/2+1);
52 int istride = 1, ostride = 1;
53 int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
55 CufftErrorCheck(cufftPlanMany(&plan_fw, rank, n,
56 inembed, istride, idist,
57 onembed, ostride, odist,
60 //FFT forward window all scales all feats
61 if(m_num_of_scales > 1 && m_big_batch_mode)
63 CudaSafeCall(cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
64 CudaSafeCall(cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0));
// Same as plan_fw but batching feats x scales transforms at once.
67 int n[] = {(int)m_height, (int)m_width};
68 int howmany = m_num_of_scales*m_num_of_feats;
69 int idist = m_height*m_width, odist = m_height*(m_width/2+1);
70 int istride = 1, ostride = 1;
71 int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
73 CufftErrorCheck(cufftPlanMany(&plan_fw_all_scales, rank, n,
74 inembed, istride, idist,
75 onembed, ostride, odist,
80 //FFT inverse one scale
82 CudaSafeCall(cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
83 CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0));
// Batched C2R whose OUTPUT is channel-interleaved: ostride = num_of_feats
// with odist = 1 writes sample s of channel c at [s*num_of_feats + c],
// i.e. the packed pixel layout OpenCV expects for a CV_32FC(n) Mat.
86 int n[] = {(int)m_height, (int)m_width};
87 int howmany = m_num_of_feats;
88 int idist = m_height*(m_width/2+1), odist = 1;
89 int istride = 1, ostride = m_num_of_feats;
90 int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
92 CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n,
93 inembed, istride, idist,
94 onembed, ostride, odist,
97 //FFT inverse all scales
98 if(m_num_of_scales > 1)
100 CudaSafeCall(cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
101 CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0));
// Interleaved output across all feats x scales channels (see plan above).
// Note: guarded only by m_num_of_scales > 1, not by big-batch mode.
104 int n[] = {(int)m_height, (int)m_width};
105 int howmany = m_num_of_feats*m_num_of_scales;
106 int idist = m_height*(m_width/2+1), odist = 1;
107 int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
108 int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
110 CufftErrorCheck(cufftPlanMany(&plan_i_features_all_scales, rank, n,
111 inembed, istride, idist,
112 onembed, ostride, odist,
113 CUFFT_C2R, howmany));
115 //FFT inverse one channel one scale
117 CudaSafeCall(cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped));
118 CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0));
// Single C2R transform; dense output (ostride 1 via howmany = 1 path).
121 int n[] = {(int)m_height, (int)m_width};
123 int idist = m_height*(m_width/2+1), odist = 1;
124 int istride = 1, ostride = 1;
125 int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
127 CufftErrorCheck(cufftPlanMany(&plan_i_1ch, rank, n,
128 inembed, istride, idist,
129 onembed, ostride, odist,
130 CUFFT_C2R, howmany));
132 //FFT inverse one channel all scales
133 if(m_num_of_scales > 1 && m_big_batch_mode)
135 CudaSafeCall(cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
136 CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0));
// One channel per scale, scales interleaved in the output (ostride =
// num_of_scales) to match a CV_32FC(num_of_scales) Mat.
139 int n[] = {(int)m_height, (int)m_width};
140 int howmany = m_num_of_scales;
141 int idist = m_height*(m_width/2+1), odist = 1;
142 int istride = 1, ostride = m_num_of_scales;
143 int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
145 CufftErrorCheck(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
146 inembed, istride, idist,
147 onembed, ostride, odist,
148 CUFFT_C2R, howmany));
// Store the window applied to features before the windowed forward FFT
// (used by forward_window via input[i].mul(m_window)).
152 void cuFFT::set_window(const cv::Mat &window)
// Forward R2C FFT of a real single-channel image.  In big-batch mode an
// input whose row count equals m_height*m_num_of_scales is treated as all
// scales stacked vertically and transformed with the batched plan;
// otherwise a single m_height x m_width transform is run.
// Returns the half-spectrum (width/2+1 columns) as a ComplexMat.
157 ComplexMat cuFFT::forward(const cv::Mat &input)
159 ComplexMat complex_result;
160 if(m_big_batch_mode && input.rows == (int)(m_height*m_num_of_scales)){
// Copy all stacked scales to the device buffer, then batch-transform.
161 CudaSafeCall(cudaMemcpy(data_f_all_scales, input.ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
162 complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
163 CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
164 complex_result.get_p_data()));
// Single-scale path: one H2D copy, one 2D transform.
166 CudaSafeCall(cudaMemcpy(data_f, input.ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
167 complex_result.create(m_height, m_width/ 2 + 1, 1);
168 CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
169 complex_result.get_p_data()));
172 return complex_result;
// Forward FFT taking an already-device-resident float buffer
// (body not shown in this view).
175 ComplexMat cuFFT::forward_raw(float *input)
// Window each feature plane (element-wise multiply by m_window) and run the
// batched forward R2C FFT over all planes.  The windowed planes are written
// into pinned, mapped host memory (data_fw / data_fw_all_scales); the plan
// reads them zero-copy through the corresponding device alias (*_d), so no
// explicit host-to-device memcpy is needed.
181 ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
183 int n_channels = input.size();
// More channels than one scale's feature count => big-batch layout
// (all scales' feature planes stacked vertically).
185 if(n_channels > (int) m_num_of_feats){
// in_all aliases the mapped host buffer; each ROI write lands directly
// in FFT input memory.
186 cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
187 for (int i = 0; i < n_channels; ++i) {
188 cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
189 in_roi = input[i].mul(m_window);
192 result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
194 CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
// Single-scale path: same windowing into the one-scale mapped buffer.
196 cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
197 for (int i = 0; i < n_channels; ++i) {
198 cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
199 in_roi = input[i].mul(m_window);
202 result.create(m_height, m_width/2 + 1, n_channels);
204 CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
// Inverse C2R FFT.  The plan is selected by the input's channel count:
// 1 channel, num_of_scales, num_of_feats*num_of_scales, or (fallback)
// num_of_feats.  Output lands in pinned mapped host memory that the
// returned cv::Mat aliases, so cudaDeviceSynchronize() must complete
// before the host reads it.  cuFFT's inverse is unnormalized, hence the
// final division by m_width*m_height.
209 cv::Mat cuFFT::inverse(const ComplexMat &input)
211 int n_channels = input.n_channels;
212 cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());
// Single channel, single scale.
215 cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
217 CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
// Wait for the transform so the mapped host buffer is valid to read.
218 cudaDeviceSynchronize();
220 return real_result/(m_width*m_height);
// One channel per scale (big-batch response maps).
221 } else if(n_channels == (int) m_num_of_scales){
222 cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
224 CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
225 cudaDeviceSynchronize();
227 return real_result/(m_width*m_height);
// All feature channels across all scales.
228 } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
229 cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
231 CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
232 cudaDeviceSynchronize();
234 return real_result/(m_width*m_height);
// Fallback: feature channels of a single scale.
237 cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
239 CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
240 cudaDeviceSynchronize();
242 return real_result/(m_width*m_height);
// Inverse C2R FFT for the all-scales/all-feats case, returning the raw
// mapped host buffer instead of a cv::Mat.  Unlike inverse(), the result
// is NOT divided by m_width*m_height (cuFFT's inverse is unnormalized).
// NOTE(review): no cudaDeviceSynchronize() before returning the mapped
// host pointer — the caller must synchronize before reading on the host;
// verify all call sites do so.
245 float* cuFFT::inverse_raw(const ComplexMat &input)
247 cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());
249 CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
251 return data_i_features_all_scales;
// Destructor body: destroy every cuFFT plan, then release the device
// buffers (cudaFree) and the pinned mapped host buffers (cudaFreeHost).
// NOTE(review): the *_all_scales plans/buffers are only created in init()
// when big-batch mode (or num_of_scales > 1) applies, yet they are
// destroyed unconditionally here — confirm the members are safely
// zero/default-initialized so cufftDestroy/cudaFree do not fault on
// never-created handles.
256 CufftErrorCheck(cufftDestroy(plan_f));
257 CufftErrorCheck(cufftDestroy(plan_f_all_scales));
258 CufftErrorCheck(cufftDestroy(plan_fw));
259 CufftErrorCheck(cufftDestroy(plan_fw_all_scales));
260 CufftErrorCheck(cufftDestroy(plan_i_1ch));
261 CufftErrorCheck(cufftDestroy(plan_i_1ch_all_scales));
262 CufftErrorCheck(cufftDestroy(plan_i_features));
263 CufftErrorCheck(cufftDestroy(plan_i_features_all_scales));
265 CudaSafeCall(cudaFree(data_f));
266 CudaSafeCall(cudaFree(data_f_all_scales));
267 CudaSafeCall(cudaFreeHost(data_fw));
268 CudaSafeCall(cudaFreeHost(data_fw_all_scales));
269 CudaSafeCall(cudaFreeHost(data_i_1ch));
270 CudaSafeCall(cudaFreeHost(data_i_1ch_all_scales));
271 CudaSafeCall(cudaFreeHost(data_i_features));
272 CudaSafeCall(cudaFreeHost(data_i_features_all_scales));