]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/blob - src/fft_cufft.cpp
Smaller FFT API
[hercules2020/kcf.git] / src / fft_cufft.cpp
1 #include "fft_cufft.h"
2
3 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
4 {
5     m_width = width;
6     m_height = height;
7     m_num_of_feats = num_of_feats;
8     m_num_of_scales = num_of_scales;
9     m_big_batch_mode = big_batch_mode;
10
11     std::cout << "FFT: cuFFT" << std::endl;
12
13     //FFT forward one scale
14     {
15         CudaSafeCall(cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal)));
16
17        CufftErrorCheck(cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C));
18
19
20     }
21     //FFT forward all scales
22     if(m_num_of_scales > 1 && m_big_batch_mode)
23     {
24         CudaSafeCall(cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal)));
25
26         int rank = 2;
27         int n[] = {(int)m_height, (int)m_width};
28         int howmany = m_num_of_scales;
29         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
30         int istride = 1, ostride = 1;
31         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
32
33         CufftErrorCheck(cufftPlanMany(&plan_f_all_scales, rank, n,
34                   inembed, istride, idist,
35                   onembed, ostride, odist,
36                   CUFFT_R2C, howmany));
37     }
38     //FFT forward window one scale
39     {
40         CudaSafeCall(cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
41         CudaSafeCall(cudaHostGetDevicePointer(&data_fw_d, data_fw, 0));
42
43         int rank = 2;
44         int n[] = {(int)m_height, (int)m_width};
45         int howmany = m_num_of_feats;
46         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
47         int istride = 1, ostride = 1;
48         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
49
50         CufftErrorCheck(cufftPlanMany(&plan_fw, rank, n,
51                   inembed, istride, idist,
52                   onembed, ostride, odist,
53                   CUFFT_R2C, howmany));
54     }
55     //FFT forward window all scales all feats
56     if(m_num_of_scales > 1 && m_big_batch_mode)
57     {
58         CudaSafeCall(cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
59         CudaSafeCall(cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0));
60
61         int rank = 2;
62         int n[] = {(int)m_height, (int)m_width};
63         int howmany = m_num_of_scales*m_num_of_feats;
64         int idist = m_height*m_width, odist = m_height*(m_width/2+1);
65         int istride = 1, ostride = 1;
66         int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
67
68         CufftErrorCheck(cufftPlanMany(&plan_fw_all_scales, rank, n,
69                   inembed, istride, idist,
70                   onembed, ostride, odist,
71                   CUFFT_R2C, howmany));
72
73
74     }
75     //FFT inverse one scale
76     {
77         int rank = 2;
78         int n[] = {(int)m_height, (int)m_width};
79         int howmany = m_num_of_feats;
80         int idist = m_height*(m_width/2+1), odist = 1;
81         int istride = 1, ostride = m_num_of_feats;
82         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
83
84         CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n,
85                   inembed, istride, idist,
86                   onembed, ostride, odist,
87                   CUFFT_C2R, howmany));
88     }
89     //FFT inverse all scales
90 #ifdef BIG_BATCH
91     if(m_num_of_scales > 1)
92     {
93         CudaSafeCall(cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
94         CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0));
95
96         int rank = 2;
97         int n[] = {(int)m_height, (int)m_width};
98         int howmany = m_num_of_feats*m_num_of_scales;
99         int idist = m_height*(m_width/2+1), odist = 1;
100         int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
101         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
102
103         CufftErrorCheck(cufftPlanMany(&plan_i_features_all_scales, rank, n,
104                   inembed, istride, idist,
105                   onembed, ostride, odist,
106                   CUFFT_C2R, howmany));
107     }
108 #endif
109     //FFT inverse one channel one scale
110     {
111         int rank = 2;
112         int n[] = {(int)m_height, (int)m_width};
113         int howmany = 1;
114         int idist = m_height*(m_width/2+1), odist = 1;
115         int istride = 1, ostride = 1;
116         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
117
118         CufftErrorCheck(cufftPlanMany(&plan_i_1ch, rank, n,
119                   inembed, istride, idist,
120                   onembed, ostride, odist,
121                   CUFFT_C2R, howmany));
122     }
123 #ifdef BIG_BATCH
124     //FFT inverse one channel all scales
125     if(m_num_of_scales > 1 && m_big_batch_mode)
126     {
127         CudaSafeCall(cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
128         CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0));
129
130         int rank = 2;
131         int n[] = {(int)m_height, (int)m_width};
132         int howmany = m_num_of_scales;
133         int idist = m_height*(m_width/2+1), odist = 1;
134         int istride = 1, ostride = m_num_of_scales;
135         int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
136
137         CufftErrorCheck(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
138                   inembed, istride, idist,
139                   onembed, ostride, odist,
140                   CUFFT_C2R, howmany));
141     }
142 #endif
143 }
144
145 void cuFFT::set_window(const cv::Mat & window)
146 {
147      m_window = window;
148 }
149
150 void cuFFT::forward(Scale_vars & vars)
151 {
152     ComplexMat *complex_result = vars.flag & Tracker_flags::TRACKER_INIT ? vars.p_yf_ptr :
153                                                   vars.flag & Tracker_flags::AUTO_CORRELATION ? & vars.kf : & vars.kzf;
154     cv::Mat *input = vars.flag & Tracker_flags::TRACKER_INIT ? & vars.rot_labels : & vars.in_all;
155
156     if(m_big_batch_mode && vars.in_all.rows == (int)(m_height*m_num_of_scales)){
157         CudaSafeCall(cudaMemcpy(data_f_all_scales, input->ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
158         CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
159                                 complex_result->get_p_data()));
160     } else {
161         CudaSafeCall(cudaMemcpy(data_f, input->ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
162         CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
163                                 complex_result->get_p_data()));
164     }
165     return;
166 }
167
168 void cuFFT::forward_window(Scale_vars & vars)
169 {
170     int n_channels = vars.patch_feats.size();
171
172     ComplexMat *result = vars.flag & Tracker_flags::TRACKER_INIT ? vars.p_model_xf_ptr :
173                                                   vars.flag & Tracker_flags::TRACKER_UPDATE ? & vars.xf : & vars.zf;
174
175     if(n_channels > (int) m_num_of_feats){
176         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
177         for (int i = 0; i < n_channels; ++i) {
178             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
179             in_roi = vars.patch_feats[i].mul(m_window);
180         }
181
182         CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result->get_p_data()));
183     } else {
184         cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
185         for (int i = 0; i < n_channels; ++i) {
186             cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
187             in_roi = vars.patch_feats[i].mul(m_window);
188         }
189
190         CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result->get_p_data()));
191     }
192     return;
193 }
194
195 void cuFFT::inverse(Scale_vars & vars)
196 {
197     ComplexMat *input = vars.flag & Tracker_flags::RESPONSE ? & vars.kzf : &  vars.xyf;
198     cv::Mat *real_result = vars.flag & Tracker_flags::RESPONSE ? & vars.response : & vars.ifft2_res;
199
200     int n_channels = input->n_channels;
201     cufftComplex *in = reinterpret_cast<cufftComplex*>(input->get_p_data());
202
203     if(n_channels == 1){
204
205         CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(vars.data_i_1ch_d)));
206         cudaDeviceSynchronize();
207         *real_result = *real_result/(m_width*m_height);
208         return;
209     }
210 #ifdef BIG_BATCH
211     else if(n_channels == (int) m_num_of_scales){
212         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), vars.data_i_1ch_all_scales);
213
214         CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(vars.data_i_1ch_all_scales_d)));
215         cudaDeviceSynchronize();
216
217         return real_result/(m_width*m_height);
218     } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
219         cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
220
221         CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(vars.data_i_features_all_scales_d)));
222         cudaDeviceSynchronize();
223
224         return real_result/(m_width*m_height);
225     }
226 #endif
227
228     CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(vars.data_i_features_d)));
229
230     if (vars.cuda_gauss)
231         return;
232     else {
233         cudaDeviceSynchronize();
234         *real_result = *real_result/(m_width*m_height);
235     }
236     return;
237 }
238
239 cuFFT::~cuFFT()
240 {
241   CufftErrorCheck(cufftDestroy(plan_f));
242   CufftErrorCheck(cufftDestroy(plan_fw));
243   CufftErrorCheck(cufftDestroy(plan_i_1ch));
244   CufftErrorCheck(cufftDestroy(plan_i_features));
245
246   CudaSafeCall(cudaFree(data_f));
247   CudaSafeCall(cudaFreeHost(data_fw));
248   
249   if (m_big_batch_mode) {
250       CufftErrorCheck(cufftDestroy(plan_f_all_scales));
251       CufftErrorCheck(cufftDestroy(plan_fw_all_scales));
252       CufftErrorCheck(cufftDestroy(plan_i_1ch_all_scales));
253       CufftErrorCheck(cufftDestroy(plan_i_features_all_scales));
254       
255       CudaSafeCall(cudaFree(data_f_all_scales));
256       CudaSafeCall(cudaFreeHost(data_fw_all_scales));
257       CudaSafeCall(cudaFreeHost(data_i_1ch_all_scales));
258       CudaSafeCall(cudaFreeHost(data_i_features_all_scales));
259   }
260 }