]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/blob - src/fft_cufft.cpp
Gaussian correlation for CUFFT version is now on GPU. Also corrected << operator...
[hercules2020/kcf.git] / src / fft_cufft.cpp
1 #include "fft_cufft.h"
2
3 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
4 {
5     m_width = width;
6     m_height = height;
7     m_num_of_feats = num_of_feats;
8     m_num_of_scales = num_of_scales;
9     m_big_batch_mode = big_batch_mode;
10
11     std::cout << "FFT: cuFFT" << std::endl;
12     
13     if(m_height*(m_width/2+1) > 1024){
14         std::cerr << "Image dimension after forward FFT are too big for CUDA kernels." << std::endl;
15         std::exit(EXIT_FAILURE);
16     }
17     
18     //FFT forward one scale
19     {
20         CudaSafeCall(cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal)));
21         
22        CufftErrorCheck(cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C));
23         
24         
25     }
26     //FFT forward all scales
27     if(m_num_of_scales > 1 && m_big_batch_mode)
28     {
29         CudaSafeCall(cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal)));
30         
31         int rank = 2;
32         int n[] = {(int)m_height, (int)m_width};
33         int howmany = m_num_of_scales;
34         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
35         int istride = 1, ostride = 1;
36         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
37
38         CufftErrorCheck(cufftPlanMany(&plan_f_all_scales, rank, n,
39                       inembed, istride, idist,
40                       onembed, ostride, odist,
41                       CUFFT_R2C, howmany));
42     }
43     //FFT forward window one scale
44     {
45         CudaSafeCall(cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
46         CudaSafeCall(cudaHostGetDevicePointer(&data_fw_d, data_fw, 0));
47         
48         int rank = 2;
49         int n[] = {(int)m_height, (int)m_width};
50         int howmany = m_num_of_feats;
51         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
52         int istride = 1, ostride = 1;
53         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
54
55         CufftErrorCheck(cufftPlanMany(&plan_fw, rank, n,
56                   inembed, istride, idist,
57                   onembed, ostride, odist,
58                   CUFFT_R2C, howmany));
59     }
60     //FFT forward window all scales all feats
61     if(m_num_of_scales > 1 && m_big_batch_mode)
62     {
63         CudaSafeCall(cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
64         CudaSafeCall(cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0));
65
66         int rank = 2;
67         int n[] = {(int)m_height, (int)m_width};
68         int howmany = m_num_of_scales*m_num_of_feats;
69         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
70         int istride = 1, ostride = 1;
71         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
72
73         CufftErrorCheck(cufftPlanMany(&plan_fw_all_scales, rank, n,
74                   inembed, istride, idist,
75                   onembed, ostride, odist,
76                   CUFFT_R2C, howmany));
77         
78         
79     }
80     //FFT inverse one scale
81     {
82         CudaSafeCall(cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
83         CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0));
84         
85         int rank = 2;
86         int n[] = {(int)m_height, (int)m_width};
87         int howmany = m_num_of_feats;
88         int idist = m_height*(m_width/2+1), odist = 1;
89         int istride = 1, ostride = m_num_of_feats;
90         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
91
92         CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n,
93                   inembed, istride, idist,
94                   onembed, ostride, odist,
95                   CUFFT_C2R, howmany));
96     }
97     //FFT inverse all scales
98     if(m_num_of_scales > 1)
99     {
100         CudaSafeCall(cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
101         CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0));
102         
103         int rank = 2;
104         int n[] = {(int)m_height, (int)m_width};
105         int howmany = m_num_of_feats*m_num_of_scales;
106         int idist = m_height*(m_width/2+1), odist = 1;
107         int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
108         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
109
110         CufftErrorCheck(cufftPlanMany(&plan_i_features_all_scales, rank, n,
111                   inembed, istride, idist,
112                   onembed, ostride, odist,
113                   CUFFT_C2R, howmany));
114     }
115     //FFT inverse one channel one scale
116     {
117         CudaSafeCall(cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped));
118         CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0));
119         
120         int rank = 2;
121         int n[] = {(int)m_height, (int)m_width};
122         int howmany = 1;
123         int idist = m_height*(m_width/2+1), odist = 1;
124         int istride = 1, ostride = 1;
125         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
126
127         CufftErrorCheck(cufftPlanMany(&plan_i_1ch, rank, n,
128                   inembed, istride, idist,
129                   onembed, ostride, odist,
130                   CUFFT_C2R, howmany));
131     }
132     //FFT inverse one channel all scales
133     if(m_num_of_scales > 1 && m_big_batch_mode)
134     {
135         CudaSafeCall(cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
136         CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0));
137         
138         int rank = 2;
139         int n[] = {(int)m_height, (int)m_width};
140         int howmany = m_num_of_scales;
141         int idist = m_height*(m_width/2+1), odist = 1;
142         int istride = 1, ostride = m_num_of_scales;
143         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
144
145         CufftErrorCheck(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
146                   inembed, istride, idist,
147                   onembed, ostride, odist,
148                   CUFFT_C2R, howmany));
149     }
150 }
151
152 void cuFFT::set_window(const cv::Mat &window)
153 {
154      m_window = window;
155 }
156
157 ComplexMat cuFFT::forward(const cv::Mat &input)
158 {
159     ComplexMat complex_result;
160     if(m_big_batch_mode && input.rows == (int)(m_height*m_num_of_scales)){
161         CudaSafeCall(cudaMemcpy(data_f_all_scales, input.ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
162         complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
163         CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
164                                 complex_result.get_p_data()));
165     } else {
166         CudaSafeCall(cudaMemcpy(data_f, input.ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
167         complex_result.create(m_height, m_width/ 2 + 1, 1);
168         CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
169                                 complex_result.get_p_data()));
170     }
171     
172     return complex_result;
173 }
174
175 ComplexMat cuFFT::forward_raw(float *input, bool all_scales)
176 {
177     ComplexMat complex_result;
178     if (all_scales){
179         complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
180         CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(input),
181                                 complex_result.get_p_data()));
182     } else {
183         complex_result.create(m_height, m_width/ 2 + 1, 1);
184         CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(input),
185                                 complex_result.get_p_data()));
186     }
187     return complex_result;
188 }
189
190 ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
191 {
192     int n_channels = input.size();
193     ComplexMat result;
194     if(n_channels > (int) m_num_of_feats){
195         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
196         for (int i = 0; i < n_channels; ++i) {
197             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
198             in_roi = input[i].mul(m_window);
199         }
200         
201         result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
202         
203         CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
204     } else {
205         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
206         for (int i = 0; i < n_channels; ++i) {
207             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
208             in_roi = input[i].mul(m_window);
209         }
210         
211         result.create(m_height, m_width/2 + 1, n_channels);
212         
213         CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
214     }
215     return result;
216 }
217
218 cv::Mat cuFFT::inverse(const ComplexMat &input)
219 {
220     int n_channels = input.n_channels;
221     cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());
222     
223     if(n_channels == 1){
224         cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
225         
226         CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
227         cudaDeviceSynchronize();
228         
229         return real_result/(m_width*m_height);
230     } else if(n_channels == (int) m_num_of_scales){
231         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
232         
233         CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
234         cudaDeviceSynchronize();
235         
236         return real_result/(m_width*m_height);
237     } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
238         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
239         
240         CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
241         cudaDeviceSynchronize();
242         
243         return real_result/(m_width*m_height);
244     }
245     
246     cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
247     
248     CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
249     cudaDeviceSynchronize();
250     
251     return real_result/(m_width*m_height);
252 }
253
254 float* cuFFT::inverse_raw(const ComplexMat &input)
255 {
256     int n_channels = input.n_channels;
257     cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());
258
259     if(n_channels == 1){
260         CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
261
262         return data_i_1ch_d;
263     } else if(n_channels == (int) m_num_of_scales){
264         CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
265
266         return data_i_1ch_all_scales_d;
267     } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
268         CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
269
270         return data_i_features_all_scales_d;
271     }
272
273     CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
274     
275     return data_i_features_d;
276 }
277
278 cuFFT::~cuFFT()
279 {
280   CufftErrorCheck(cufftDestroy(plan_f));
281   CufftErrorCheck(cufftDestroy(plan_f_all_scales));
282   CufftErrorCheck(cufftDestroy(plan_fw));
283   CufftErrorCheck(cufftDestroy(plan_fw_all_scales));
284   CufftErrorCheck(cufftDestroy(plan_i_1ch));
285   CufftErrorCheck(cufftDestroy(plan_i_1ch_all_scales));
286   CufftErrorCheck(cufftDestroy(plan_i_features));
287   CufftErrorCheck(cufftDestroy(plan_i_features_all_scales));
288   
289   CudaSafeCall(cudaFree(data_f));
290   CudaSafeCall(cudaFree(data_f_all_scales));
291   CudaSafeCall(cudaFreeHost(data_fw));
292   CudaSafeCall(cudaFreeHost(data_fw_all_scales));
293   CudaSafeCall(cudaFreeHost(data_i_1ch));
294   CudaSafeCall(cudaFreeHost(data_i_1ch_all_scales));
295   CudaSafeCall(cudaFreeHost(data_i_features));
296   CudaSafeCall(cudaFreeHost(data_i_features_all_scales));
297 }