]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/blob - src/fft_cufft.cpp
Merge branch 'master' of github.com:Shanigen/kcf
[hercules2020/kcf.git] / src / fft_cufft.cpp
1 #include "fft_cufft.h"
2
#ifdef _CUFFT_H_
// cuFFT API errors
// Map a cufftResult status code to its enumerator name. The stringification
// macro guarantees the returned text always matches the enumerator exactly.
static const char *_cudaGetErrorEnum(cufftResult error)
{
#define KCF_CUFFT_ERR_CASE(code) \
    case code:                   \
        return #code;

    switch (error)
    {
        KCF_CUFFT_ERR_CASE(CUFFT_SUCCESS)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_PLAN)
        KCF_CUFFT_ERR_CASE(CUFFT_ALLOC_FAILED)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_TYPE)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_VALUE)
        KCF_CUFFT_ERR_CASE(CUFFT_INTERNAL_ERROR)
        KCF_CUFFT_ERR_CASE(CUFFT_EXEC_FAILED)
        KCF_CUFFT_ERR_CASE(CUFFT_SETUP_FAILED)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_SIZE)
        KCF_CUFFT_ERR_CASE(CUFFT_UNALIGNED_DATA)
        KCF_CUFFT_ERR_CASE(CUFFT_INVALID_DEVICE)
        KCF_CUFFT_ERR_CASE(CUFFT_PARSE_ERROR)
        KCF_CUFFT_ERR_CASE(CUFFT_NO_WORKSPACE)
        KCF_CUFFT_ERR_CASE(CUFFT_NOT_IMPLEMENTED)
        KCF_CUFFT_ERR_CASE(CUFFT_LICENSE_ERROR)
        KCF_CUFFT_ERR_CASE(CUFFT_NOT_SUPPORTED)
        KCF_CUFFT_ERR_CASE(CUFFT_INCOMPLETE_PARAMETER_LIST)
    }

#undef KCF_CUFFT_ERR_CASE

    // Unlisted / future status codes fall through to a generic marker.
    return "<unknown>";
}
#endif
64
/**
 * Check the result of a cuFFT API call: on failure, print the numeric and
 * symbolic error code plus the call site, then terminate the process.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement and
 * is safe in unbraced if/else bodies (the original bare { } block would break
 * an `if (x) CHECK_CUFFT_ERRORS(call); else ...` with a dangling semicolon).
 */
#define CHECK_CUFFT_ERRORS(call) do { \
    cufftResult_t err; \
    if ((err = (call)) != CUFFT_SUCCESS) { \
        fprintf(stderr, "cuFFT error %d:%s at %s:%d\n", err, _cudaGetErrorEnum(err), \
                __FILE__, __LINE__); \
        exit(1); \
    } \
} while (0)
73
// Construct the cuFFT backend; all allocation and plan creation is deferred
// to init(). m_num_of_streams fixes how many CUDA streams init() creates.
cuFFT::cuFFT(): m_num_of_streams(4)
{}
76
77 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
78 {
79     m_width = width;
80     m_height = height;
81     m_num_of_feats = num_of_feats;
82     m_num_of_scales = num_of_scales;
83     m_big_batch_mode = big_batch_mode;
84
85     std::cout << "FFT: cuFFT" << std::endl;
86     
87     if(m_height*(m_width/2+1) > 1024){
88         std::cerr << "Image dimension after forward FFT are too big for CUDA kernels." << std::endl;
89         std::exit(EXIT_FAILURE);
90     }
91
92     for (unsigned i = 0; i < m_num_of_streams; i++) cudaStreamCreate(&streams[i]);
93     
94     //FFT forward one scale
95     {
96         cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal));
97         
98         cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C);
99         
100         
101     }
102     //FFT forward all scales
103     if(m_num_of_scales > 1 && m_big_batch_mode)
104     {
105         cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal));
106         
107         int rank = 2;
108         int n[] = {(int)m_height, (int)m_width};
109         int howmany = m_num_of_scales;
110         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
111         int istride = 1, ostride = 1;
112         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
113
114         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_f_all_scales, rank, n,
115                       inembed, istride, idist,
116                       onembed, ostride, odist,
117                       CUFFT_R2C, howmany));
118     }
119     //FFT forward window one scale
120     {
121         cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
122         cudaHostGetDevicePointer(&data_fw_d, data_fw, 0);
123         
124         int rank = 2;
125         int n[] = {(int)m_height, (int)m_width};
126         int howmany = m_num_of_feats;
127         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
128         int istride = 1, ostride = 1;
129         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
130
131         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw, rank, n,
132                   inembed, istride, idist,
133                   onembed, ostride, odist,
134                   CUFFT_R2C, howmany));
135     }
136     //FFT forward window all scales all feats
137     if(m_num_of_scales > 1 && m_big_batch_mode)
138     {
139         cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
140         cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0);
141
142         int rank = 2;
143         int n[] = {(int)m_height, (int)m_width};
144         int howmany = m_num_of_scales*m_num_of_feats;
145         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
146         int istride = 1, ostride = 1;
147         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
148
149         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw_all_scales, rank, n,
150                   inembed, istride, idist,
151                   onembed, ostride, odist,
152                   CUFFT_R2C, howmany));
153         
154         
155     }
156     //FFT inverse one scale
157     {
158         cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
159         cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0);
160         
161         int rank = 2;
162         int n[] = {(int)m_height, (int)m_width};
163         int howmany = m_num_of_feats;
164         int idist = m_height*(m_width/2+1), odist = 1;
165         int istride = 1, ostride = m_num_of_feats;
166         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
167
168         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_features, rank, n,
169                   inembed, istride, idist,
170                   onembed, ostride, odist,
171                   CUFFT_C2R, howmany));
172     }
173     //FFT inverse all scales
174     if(m_num_of_scales > 1)
175     {
176         cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
177         cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0);
178         
179         int rank = 2;
180         int n[] = {(int)m_height, (int)m_width};
181         int howmany = m_num_of_feats*m_num_of_scales;
182         int idist = m_height*(m_width/2+1), odist = 1;
183         int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
184         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
185
186         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_features_all_scales, rank, n,
187                   inembed, istride, idist,
188                   onembed, ostride, odist,
189                   CUFFT_C2R, howmany));
190     }
191     //FFT inverse one channel one scale
192     {
193         cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped);
194         cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0);
195         
196         int rank = 2;
197         int n[] = {(int)m_height, (int)m_width};
198         int howmany = 1;
199         int idist = m_height*(m_width/2+1), odist = 1;
200         int istride = 1, ostride = 1;
201         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
202
203         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_1ch, rank, n,
204                   inembed, istride, idist,
205                   onembed, ostride, odist,
206                   CUFFT_C2R, howmany));
207     }
208     //FFT inverse one channel all scales
209     if(m_num_of_scales > 1 && m_big_batch_mode)
210     {
211         cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
212         cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0);
213         
214         int rank = 2;
215         int n[] = {(int)m_height, (int)m_width};
216         int howmany = m_num_of_scales;
217         int idist = m_height*(m_width/2+1), odist = 1;
218         int istride = 1, ostride = m_num_of_scales;
219         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
220
221         CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
222                   inembed, istride, idist,
223                   onembed, ostride, odist,
224                   CUFFT_C2R, howmany));
225     }
226 }
227
// Store the window matrix; forward_window() multiplies it element-wise with
// every input channel before the forward transform.
void cuFFT::set_window(const cv::Mat &window)
{
     m_window = window;
}
232
233 ComplexMat cuFFT::forward(const cv::Mat &input)
234 {
235     ComplexMat complex_result;
236     if(m_big_batch_mode && input.rows == (int)(m_height*m_num_of_scales)){
237         cudaMemcpy(data_f_all_scales, input.ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
238         complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
239         CHECK_CUFFT_ERRORS(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
240                                 complex_result.get_p_data()));
241     } else {
242         cudaMemcpy(data_f, input.ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
243         complex_result.create(m_height, m_width/ 2 + 1, 1);
244         CHECK_CUFFT_ERRORS(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
245                                 complex_result.get_p_data()));
246     }
247     
248     return complex_result;
249 }
250
251 ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
252 {
253     int n_channels = input.size();
254     ComplexMat result;
255     if(n_channels > (int) m_num_of_feats){
256         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
257         for (int i = 0; i < n_channels; ++i) {
258             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
259             in_roi = input[i].mul(m_window);
260         }
261         
262         result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
263         
264         CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
265     } else {
266         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
267         for (int i = 0; i < n_channels; ++i) {
268             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
269             in_roi = input[i].mul(m_window);
270         }
271         
272         result.create(m_height, m_width/2 + 1, n_channels);
273         
274         CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
275     }
276     return result;
277 }
278
279 cv::Mat cuFFT::inverse(const ComplexMat &inputf)
280 {
281     int n_channels = inputf.n_channels;
282     cufftComplex *in = reinterpret_cast<cufftComplex*>(inputf.get_p_data());
283     
284     if(n_channels == 1){
285         cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
286         
287         CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
288         cudaDeviceSynchronize();
289         
290         return real_result/(m_width*m_height);
291     } else if(n_channels == (int) m_num_of_scales){
292         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
293         
294         CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
295         cudaDeviceSynchronize();
296         
297         return real_result/(m_width*m_height);
298     } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
299         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
300         
301         CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
302         cudaDeviceSynchronize();
303         
304         return real_result/(m_width*m_height);
305     }
306     
307     cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
308     
309     CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
310     cudaDeviceSynchronize();
311     
312     return real_result/(m_width*m_height);
313 }
314
// Tear down all CUDA resources: streams, then cuFFT plans, then buffers,
// and finally reset the device. cudaDeviceReset() must stay last.
cuFFT::~cuFFT()
{

  for(unsigned i = 0; i < m_num_of_streams; i++) cudaStreamDestroy(streams[i]);
  
  // NOTE(review): the *_all_scales plans are destroyed unconditionally even
  // though init() only creates them when big-batch mode is on and
  // m_num_of_scales > 1 -- cufftDestroy then runs on an uninitialized
  // handle; verify this is benign.
  cufftDestroy(plan_f);
  cufftDestroy(plan_f_all_scales);
  cufftDestroy(plan_fw);
  cufftDestroy(plan_fw_all_scales);
  cufftDestroy(plan_i_1ch);
  cufftDestroy(plan_i_1ch_all_scales);
  cufftDestroy(plan_i_features);
  cufftDestroy(plan_i_features_all_scales);
  
  // data_f / data_f_all_scales came from cudaMalloc; every other buffer came
  // from cudaHostAlloc (mapped host memory), hence cudaFree vs cudaFreeHost.
  cudaFree(data_f);
  cudaFree(data_f_all_scales);
  cudaFreeHost(data_fw);
  cudaFreeHost(data_fw_all_scales);
  cudaFreeHost(data_i_1ch);
  cudaFreeHost(data_i_1ch_all_scales);
  cudaFreeHost(data_i_features);
  cudaFreeHost(data_i_features_all_scales);
  
  cudaDeviceReset();
}