]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/blob - src/fft_cufft.cpp
Added CUFFT support
[hercules2020/kcf.git] / src / fft_cufft.cpp
1 #include "fft_cufft.h"
2
3 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
4 {
5     m_width = width;
6     m_height = height;
7     m_num_of_feats = num_of_feats;
8     m_num_of_scales = num_of_scales;
9     m_big_batch_mode = big_batch_mode;
10
11     std::cout << "FFT: cuFFT" << std::endl;
12
13     //FFT forward one scale
14     {
15         CudaSafeCall(cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal)));
16
17        CufftErrorCheck(cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C));
18
19
20     }
21     //FFT forward all scales
22     if(m_num_of_scales > 1 && m_big_batch_mode)
23     {
24         CudaSafeCall(cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal)));
25
26         int rank = 2;
27         int n[] = {(int)m_height, (int)m_width};
28         int howmany = m_num_of_scales;
29         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
30         int istride = 1, ostride = 1;
31         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
32
33         CufftErrorCheck(cufftPlanMany(&plan_f_all_scales, rank, n,
34                   inembed, istride, idist,
35                   onembed, ostride, odist,
36                   CUFFT_R2C, howmany));
37     }
38     //FFT forward window one scale
39     {
40         CudaSafeCall(cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
41         CudaSafeCall(cudaHostGetDevicePointer(&data_fw_d, data_fw, 0));
42
43         int rank = 2;
44         int n[] = {(int)m_height, (int)m_width};
45         int howmany = m_num_of_feats;
46         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
47         int istride = 1, ostride = 1;
48         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
49
50         CufftErrorCheck(cufftPlanMany(&plan_fw, rank, n,
51                   inembed, istride, idist,
52                   onembed, ostride, odist,
53                   CUFFT_R2C, howmany));
54     }
55     //FFT forward window all scales all feats
56     if(m_num_of_scales > 1 && m_big_batch_mode)
57     {
58         CudaSafeCall(cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
59         CudaSafeCall(cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0));
60
61         int rank = 2;
62         int n[] = {(int)m_height, (int)m_width};
63         int howmany = m_num_of_scales*m_num_of_feats;
64         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
65         int istride = 1, ostride = 1;
66         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
67
68         CufftErrorCheck(cufftPlanMany(&plan_fw_all_scales, rank, n,
69                   inembed, istride, idist,
70                   onembed, ostride, odist,
71                   CUFFT_R2C, howmany));
72
73
74     }
75     //FFT inverse one scale
76     {
77         CudaSafeCall(cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
78         CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0));
79
80         int rank = 2;
81         int n[] = {(int)m_height, (int)m_width};
82         int howmany = m_num_of_feats;
83         int idist = m_height*(m_width/2+1), odist = 1;
84         int istride = 1, ostride = m_num_of_feats;
85         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
86
87         CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n,
88                   inembed, istride, idist,
89                   onembed, ostride, odist,
90                   CUFFT_C2R, howmany));
91     }
92     //FFT inverse all scales
93     if(m_num_of_scales > 1)
94     {
95         CudaSafeCall(cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
96         CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0));
97
98         int rank = 2;
99         int n[] = {(int)m_height, (int)m_width};
100         int howmany = m_num_of_feats*m_num_of_scales;
101         int idist = m_height*(m_width/2+1), odist = 1;
102         int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
103         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
104
105         CufftErrorCheck(cufftPlanMany(&plan_i_features_all_scales, rank, n,
106                   inembed, istride, idist,
107                   onembed, ostride, odist,
108                   CUFFT_C2R, howmany));
109     }
110     //FFT inverse one channel one scale
111     {
112         CudaSafeCall(cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped));
113         CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0));
114
115         int rank = 2;
116         int n[] = {(int)m_height, (int)m_width};
117         int howmany = 1;
118         int idist = m_height*(m_width/2+1), odist = 1;
119         int istride = 1, ostride = 1;
120         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
121
122         CufftErrorCheck(cufftPlanMany(&plan_i_1ch, rank, n,
123                   inembed, istride, idist,
124                   onembed, ostride, odist,
125                   CUFFT_C2R, howmany));
126     }
127     //FFT inverse one channel all scales
128     if(m_num_of_scales > 1 && m_big_batch_mode)
129     {
130         CudaSafeCall(cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
131         CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0));
132
133         int rank = 2;
134         int n[] = {(int)m_height, (int)m_width};
135         int howmany = m_num_of_scales;
136         int idist = m_height*(m_width/2+1), odist = 1;
137         int istride = 1, ostride = m_num_of_scales;
138         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
139
140         CufftErrorCheck(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
141                   inembed, istride, idist,
142                   onembed, ostride, odist,
143                   CUFFT_C2R, howmany));
144     }
145 }
146
147 void cuFFT::set_window(const cv::Mat & window)
148 {
149      m_window = window;
150 }
151
152 ComplexMat cuFFT::forward(const cv::Mat & input)
153 {
154     ComplexMat complex_result;
155     if(m_big_batch_mode && input.rows == (int)(m_height*m_num_of_scales)){
156         CudaSafeCall(cudaMemcpy(data_f_all_scales, input.ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
157         complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
158         CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
159                                 complex_result.get_p_data()));
160     } else {
161         CudaSafeCall(cudaMemcpy(data_f, input.ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
162         complex_result.create(m_height, m_width/ 2 + 1, 1);
163         CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
164                                 complex_result.get_p_data()));
165     }
166
167     return complex_result;
168 }
169
170 void cuFFT::forward(Scale_vars & vars)
171 {
172     return;
173 }
174
175 void cuFFT::forward_raw(Scale_vars & vars, bool all_scales)
176 {
177     ComplexMat *result = vars.flag & Track_flags::AUTO_CORRELATION ? & vars.kf : & vars.kzf;
178     if (all_scales){
179         CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(vars.gauss_corr_res),
180                                 result->get_p_data()));
181     } else {
182         CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(vars.gauss_corr_res),
183                                 result->get_p_data()));
184     }
185     return;
186 }
187
188 ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> & input)
189 {
190     int n_channels = input.size();
191     ComplexMat result;
192     if(n_channels > (int) m_num_of_feats){
193         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
194         for (int i = 0; i < n_channels; ++i) {
195             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
196             in_roi = input[i].mul(m_window);
197         }
198
199         result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
200
201         CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
202     } else {
203         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
204         for (int i = 0; i < n_channels; ++i) {
205             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
206             in_roi = input[i].mul(m_window);
207         }
208
209         result.create(m_height, m_width/2 + 1, n_channels);
210
211         CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
212     }
213     return result;
214 }
215
216 void cuFFT::forward_window(Scale_vars & vars)
217 {
218     int n_channels = vars.patch_feats.size();
219     ComplexMat *result = vars.flag & Track_flags::TRACKER_UPDATE ? & vars.xf : & vars.zf;
220     if(n_channels > (int) m_num_of_feats){
221         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
222         for (int i = 0; i < n_channels; ++i) {
223             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
224             in_roi = vars.patch_feats[i].mul(m_window);
225         }
226
227         CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result->get_p_data()));
228     } else {
229         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
230         for (int i = 0; i < n_channels; ++i) {
231             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
232             in_roi = vars.patch_feats[i].mul(m_window);
233         }
234
235         CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result->get_p_data()));
236     }
237     return;
238 }
239
240 cv::Mat cuFFT::inverse(const ComplexMat & input)
241 {
242     int n_channels = input.n_channels;
243     cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());
244
245     if(n_channels == 1){
246         cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
247
248         CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
249         cudaDeviceSynchronize();
250
251         return real_result/(m_width*m_height);
252     } else if(n_channels == (int) m_num_of_scales){
253         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
254
255         CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
256         cudaDeviceSynchronize();
257
258         return real_result/(m_width*m_height);
259     } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
260         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
261
262         CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
263         cudaDeviceSynchronize();
264
265         return real_result/(m_width*m_height);
266     }
267
268     cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
269
270     CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
271     cudaDeviceSynchronize();
272
273     return real_result/(m_width*m_height);
274 }
275
276 void cuFFT::inverse(Scale_vars & vars)
277 {
278     return;
279 }
280
281 float* cuFFT::inverse_raw(const ComplexMat & input)
282 {
283     int n_channels = input.n_channels;
284     cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());
285
286     if(n_channels == 1){
287         CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
288
289         return data_i_1ch_d;
290     } else if(n_channels == (int) m_num_of_scales){
291         CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
292
293         return data_i_1ch_all_scales_d;
294     } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
295         CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
296
297         return data_i_features_all_scales_d;
298     }
299
300     CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
301
302     return data_i_features_d;
303 }
304
305 cuFFT::~cuFFT()
306 {
307   CufftErrorCheck(cufftDestroy(plan_f));
308   CufftErrorCheck(cufftDestroy(plan_fw));
309   CufftErrorCheck(cufftDestroy(plan_i_1ch));
310   CufftErrorCheck(cufftDestroy(plan_i_features));
311
312   CudaSafeCall(cudaFree(data_f));
313   CudaSafeCall(cudaFreeHost(data_fw));
314   CudaSafeCall(cudaFreeHost(data_i_1ch));
315   CudaSafeCall(cudaFreeHost(data_i_features));
316   
317   if (m_big_batch_mode) {
318       CufftErrorCheck(cufftDestroy(plan_f_all_scales));
319       CufftErrorCheck(cufftDestroy(plan_fw_all_scales));
320       CufftErrorCheck(cufftDestroy(plan_i_1ch_all_scales));
321       CufftErrorCheck(cufftDestroy(plan_i_features_all_scales));
322       
323       CudaSafeCall(cudaFree(data_f_all_scales));
324       CudaSafeCall(cudaFreeHost(data_fw_all_scales));
325       CudaSafeCall(cudaFreeHost(data_i_1ch_all_scales));
326       CudaSafeCall(cudaFreeHost(data_i_features_all_scales));
327   }
328 }