#include "fft_cufft.h"

void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
{
    m_width = width;
    m_height = height;
    m_num_of_feats = num_of_feats;
    m_num_of_scales = num_of_scales;
    m_big_batch_mode = big_batch_mode;

    std::cout << "FFT: cuFFT" << std::endl;

    // The custom CUDA kernels appear to use one thread per frequency-domain
    // element within a single block, so the transformed plane may not exceed
    // 1024 elements (the maximum number of threads per block).
    if(m_height*(m_width/2+1) > 1024){
        std::cerr << "Image dimensions after forward FFT are too big for CUDA kernels." << std::endl;
        std::exit(EXIT_FAILURE);
    }

    //FFT forward one scale
    {
        CudaSafeCall(cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal)));

        CufftErrorCheck(cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C));
    }
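
    // The batched plans below are created with cufftPlanMany(). For every plan:
    //   n               - size of one 2D transform, {height, width}
    //   howmany         - number of transforms executed in one batch
    //   idist/odist     - distance (in elements) between consecutive input/output signals
    //   istride/ostride - stride between successive elements of one signal
    //   inembed/onembed - storage dimensions of the input/output arrays
    // An R2C transform produces width/2+1 complex columns per row.
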
    //FFT forward all scales
    if(m_num_of_scales > 1 && m_big_batch_mode)
    {
        CudaSafeCall(cudaMalloc(&data_f_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal)));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_scales;
        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
        int istride = 1, ostride = 1;
        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};

        CufftErrorCheck(cufftPlanMany(&plan_f_all_scales, rank, n,
                      inembed, istride, idist,
                      onembed, ostride, odist,
                      CUFFT_R2C, howmany));
    }
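
    // The remaining buffers are allocated with cudaHostAllocMapped and accessed
    // on the device through cudaHostGetDevicePointer(): the host writes (or reads)
    // the data via a cv::Mat wrapper while cuFFT uses the mapped (zero-copy)
    // device pointer, so no explicit cudaMemcpy is needed for these paths.
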
    //FFT forward window one scale
    {
        CudaSafeCall(cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_fw_d, data_fw, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_feats;
        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
        int istride = 1, ostride = 1;
        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};

        CufftErrorCheck(cufftPlanMany(&plan_fw, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_R2C, howmany));
    }
    //FFT forward window all scales all feats
    if(m_num_of_scales > 1 && m_big_batch_mode)
    {
        CudaSafeCall(cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_scales*m_num_of_feats;
        int idist = m_height*m_width, odist = m_height*(m_width/2+1);
        int istride = 1, ostride = 1;
        int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};

        CufftErrorCheck(cufftPlanMany(&plan_fw_all_scales, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_R2C, howmany));
    }
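
    // The inverse (C2R) plans below write their outputs interleaved
    // (odist = 1, ostride = number of transforms), so the result buffer can be
    // wrapped directly as a multi-channel cv::Mat (CV_32FC(n)) in inverse().
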
    //FFT inverse one scale
    {
        CudaSafeCall(cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_feats;
        int idist = m_height*(m_width/2+1), odist = 1;
        int istride = 1, ostride = m_num_of_feats;
        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;

        CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_C2R, howmany));
    }
    //FFT inverse all scales
    if(m_num_of_scales > 1)
    {
        CudaSafeCall(cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_feats*m_num_of_scales;
        int idist = m_height*(m_width/2+1), odist = 1;
        int istride = 1, ostride = m_num_of_feats*m_num_of_scales;
        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;

        CufftErrorCheck(cufftPlanMany(&plan_i_features_all_scales, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_C2R, howmany));
    }
    //FFT inverse one channel one scale
    {
        CudaSafeCall(cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = 1;
        int idist = m_height*(m_width/2+1), odist = 1;
        int istride = 1, ostride = 1;
        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;

        CufftErrorCheck(cufftPlanMany(&plan_i_1ch, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_C2R, howmany));
    }
    //FFT inverse one channel all scales
    if(m_num_of_scales > 1 && m_big_batch_mode)
    {
        CudaSafeCall(cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped));
        CudaSafeCall(cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0));

        int rank = 2;
        int n[] = {(int)m_height, (int)m_width};
        int howmany = m_num_of_scales;
        int idist = m_height*(m_width/2+1), odist = 1;
        int istride = 1, ostride = m_num_of_scales;
        int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;

        CufftErrorCheck(cufftPlanMany(&plan_i_1ch_all_scales, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist,
                  CUFFT_C2R, howmany));
    }
}
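
// Rough usage sketch (the caller lives elsewhere in the tracker; the names
// below are illustrative only):
//   cuFFT fft;
//   fft.init(width, height, num_feats, num_scales, big_batch);
//   fft.set_window(cos_window);                   // window of one scale's size, e.g. a cosine window
//   ComplexMat xf = fft.forward_window(features); // windowed, per-channel R2C FFT
//   cv::Mat resp  = fft.inverse(response_freq);   // C2R FFT, normalized by 1/(width*height)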

void cuFFT::set_window(const cv::Mat &window)
{
    m_window = window;
}

ComplexMat cuFFT::forward(const cv::Mat &input)
{
    ComplexMat complex_result;
    if(m_big_batch_mode && input.rows == (int)(m_height*m_num_of_scales)){
        CudaSafeCall(cudaMemcpy(data_f_all_scales, input.ptr<cufftReal>(), m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
        complex_result.create(m_height, m_width/2 + 1, m_num_of_scales);
        CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(data_f_all_scales),
                                complex_result.get_p_data()));
    } else {
        CudaSafeCall(cudaMemcpy(data_f, input.ptr<cufftReal>(), m_height*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice));
        complex_result.create(m_height, m_width/2 + 1, 1);
        CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(data_f),
                                complex_result.get_p_data()));
    }

    return complex_result;
}

ComplexMat cuFFT::forward_raw(float *input)
{
    // Placeholder: the fully on-GPU forward path is not implemented yet
    // (added in preparation for full CUDA support).
    (void)input;
    ComplexMat dummy;
    return dummy;
}

ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
{
    int n_channels = input.size();
    ComplexMat result;
    if(n_channels > (int) m_num_of_feats){
        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
        for (int i = 0; i < n_channels; ++i) {
            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
            in_roi = input[i].mul(m_window);
        }

        result.create(m_height, m_width/2 + 1, n_channels, m_num_of_scales);

        CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
    } else {
        cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
        for (int i = 0; i < n_channels; ++i) {
            cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
            in_roi = input[i].mul(m_window);
        }

        result.create(m_height, m_width/2 + 1, n_channels);

        CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
    }
    return result;
}

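// cuFFT transforms are unnormalized, so each inverse below divides the C2R
// result by width*height to restore the original scale.
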
cv::Mat cuFFT::inverse(const ComplexMat &input)
{
    int n_channels = input.n_channels;
    cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());

    if(n_channels == 1){
        cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);

        CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
        cudaDeviceSynchronize();

        return real_result/(m_width*m_height);
    } else if(n_channels == (int) m_num_of_scales){
        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);

        CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
        cudaDeviceSynchronize();

        return real_result/(m_width*m_height);
    } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
        cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);

        CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
        cudaDeviceSynchronize();

        return real_result/(m_width*m_height);
    }

    cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);

    CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
    cudaDeviceSynchronize();

    return real_result/(m_width*m_height);
}

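// Unlike inverse(), inverse_raw() returns the mapped buffer directly: it neither
// synchronizes with the device nor normalizes the result, so the caller is
// responsible for both.
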
float* cuFFT::inverse_raw(const ComplexMat &input)
{
    cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());

    CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));

    return data_i_features_all_scales;
}

cuFFT::~cuFFT()
{
    CufftErrorCheck(cufftDestroy(plan_f));
    CufftErrorCheck(cufftDestroy(plan_f_all_scales));
    CufftErrorCheck(cufftDestroy(plan_fw));
    CufftErrorCheck(cufftDestroy(plan_fw_all_scales));
    CufftErrorCheck(cufftDestroy(plan_i_1ch));
    CufftErrorCheck(cufftDestroy(plan_i_1ch_all_scales));
    CufftErrorCheck(cufftDestroy(plan_i_features));
    CufftErrorCheck(cufftDestroy(plan_i_features_all_scales));

    CudaSafeCall(cudaFree(data_f));
    CudaSafeCall(cudaFree(data_f_all_scales));
    CudaSafeCall(cudaFreeHost(data_fw));
    CudaSafeCall(cudaFreeHost(data_fw_all_scales));
    CudaSafeCall(cudaFreeHost(data_i_1ch));
    CudaSafeCall(cudaFreeHost(data_i_1ch_all_scales));
    CudaSafeCall(cudaFreeHost(data_i_features));
    CudaSafeCall(cudaFreeHost(data_i_features_all_scales));
}