]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/blob - src/fft_cufft.cpp
CUDA: Use per-thread default streams rather than explicit streams
[hercules2020/kcf.git] / src / fft_cufft.cpp
1 #include "fft_cufft.h"
2
3 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
4 {
5     m_width = width;
6     m_height = height;
7     m_num_of_feats = num_of_feats;
8     m_num_of_scales = num_of_scales;
9
10     std::cout << "FFT: cuFFT" << std::endl;
11
12     // FFT forward one scale
13     {
14         CufftErrorCheck(cufftPlan2d(&plan_f, int(m_height), int(m_width), CUFFT_R2C));
15     }
16 #ifdef BIG_BATCH
17     // FFT forward all scales
18     if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
19         int rank = 2;
20         int n[] = {(int)m_height, (int)m_width};
21         int howmany = m_num_of_scales;
22         int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
23         int istride = 1, ostride = 1;
24         int *inembed = n, onembed[] = {(int)m_height, (int)m_width / 2 + 1};
25
26         CufftErrorCheck(cufftPlanMany(&plan_f_all_scales, rank, n, inembed, istride, idist, onembed, ostride, odist,
27                                       CUFFT_R2C, howmany));
28         CufftErrorCheck(cufftSetStream(plan_f_all_scales, cudaStreamPerThread));
29     }
30 #endif
31     // FFT forward window one scale
32     {
33         int rank = 2;
34         int n[] = {int(m_height), int(m_width)};
35         int howmany = int(m_num_of_feats);
36         int idist = int(m_height * m_width), odist = int(m_height * (m_width / 2 + 1));
37         int istride = 1, ostride = 1;
38         int *inembed = n, onembed[] = {int(m_height), int(m_width / 2 + 1)};
39
40         CufftErrorCheck(
41             cufftPlanMany(&plan_fw, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, howmany));
42         CufftErrorCheck(cufftSetStream(plan_fw, cudaStreamPerThread));
43     }
44 #ifdef BIG_BATCH
45     // FFT forward window all scales all feats
46     if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
47         int rank = 2;
48         int n[] = {(int)m_height, (int)m_width};
49         int howmany = m_num_of_scales * m_num_of_feats;
50         int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
51         int istride = 1, ostride = 1;
52         int *inembed = n, onembed[] = {(int)m_height, (int)m_width / 2 + 1};
53
54         CufftErrorCheck(cufftPlanMany(&plan_fw_all_scales, rank, n, inembed, istride, idist, onembed, ostride, odist,
55                                       CUFFT_R2C, howmany));
56         CufftErrorCheck(cufftSetStream(plan_fw_all_scales, cudaStreamPerThread));
57     }
58 #endif
59     // FFT inverse one scale
60     {
61         int rank = 2;
62         int n[] = {int(m_height), int(m_width)};
63         int howmany = int(m_num_of_feats);
64         int idist = int(m_height * (m_width / 2 + 1)), odist = 1;
65         int istride = 1, ostride = int(m_num_of_feats);
66         int inembed[] = {int(m_height), int(m_width / 2 + 1)}, *onembed = n;
67
68         CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n, inembed, istride, idist, onembed, ostride, odist,
69                                       CUFFT_C2R, howmany));
70         CufftErrorCheck(cufftSetStream(plan_i_features, cudaStreamPerThread));
71     }
72     // FFT inverse all scales
73 #ifdef BIG_BATCH
74     if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
75         int rank = 2;
76         int n[] = {(int)m_height, (int)m_width};
77         int howmany = m_num_of_feats * m_num_of_scales;
78         int idist = m_height * (m_width / 2 + 1), odist = 1;
79         int istride = 1, ostride = m_num_of_feats * m_num_of_scales;
80         int inembed[] = {(int)m_height, (int)m_width / 2 + 1}, *onembed = n;
81
82         CufftErrorCheck(cufftPlanMany(&plan_i_features_all_scales, rank, n, inembed, istride, idist, onembed, ostride,
83                                       odist, CUFFT_C2R, howmany));
84         CufftErrorCheck(cufftSetStream(plan_i_features_all_scales, cudaStreamPerThread));
85     }
86 #endif
87     // FFT inverse one channel one scale
88     {
89         int rank = 2;
90         int n[] = {int(m_height), int(m_width)};
91         int howmany = 1;
92         int idist = int(m_height * (m_width / 2 + 1)), odist = 1;
93         int istride = 1, ostride = 1;
94         int inembed[] = {int(m_height), int(m_width / 2 + 1)}, *onembed = n;
95
96         CufftErrorCheck(
97             cufftPlanMany(&plan_i_1ch, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2R, howmany));
98         CufftErrorCheck(cufftSetStream(plan_i_1ch, cudaStreamPerThread));
99     }
100 #ifdef BIG_BATCH
101     // FFT inverse one channel all scales
102     if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
103         int rank = 2;
104         int n[] = {(int)m_height, (int)m_width};
105         int howmany = m_num_of_scales;
106         int idist = m_height * (m_width / 2 + 1), odist = 1;
107         int istride = 1, ostride = m_num_of_scales;
108         int inembed[] = {(int)m_height, (int)m_width / 2 + 1}, *onembed = n;
109
110         CufftErrorCheck(cufftPlanMany(&plan_i_1ch_all_scales, rank, n, inembed, istride, idist, onembed, ostride, odist,
111                                       CUFFT_C2R, howmany));
112         CufftErrorCheck(cufftSetStream(plan_i_1ch_all_scales, cudaStreamPerThread));
113     }
114 #endif
115 }
116
117 void cuFFT::set_window(const cv::Mat &window)
118 {
119     m_window = window;
120 }
121
122 void cuFFT::forward(const cv::Mat &real_input, ComplexMat &complex_result, float *real_input_arr)
123 {
124     if (BIG_BATCH_MODE && real_input.rows == int(m_height * m_num_of_scales)) {
125         CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal *>(real_input_arr),
126                                      complex_result.get_p_data()));
127     } else {
128         NORMAL_OMP_CRITICAL
129         {
130             CufftErrorCheck(
131                 cufftExecR2C(plan_f, reinterpret_cast<cufftReal *>(real_input_arr), complex_result.get_p_data()));
132             cudaStreamSynchronize(cudaStreamPerThread);
133         }
134     }
135     return;
136 }
137
138 void cuFFT::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat &complex_result, cv::Mat &fw_all,
139                            float *real_input_arr)
140 {
141     int n_channels = int(patch_feats.size());
142
143     if (n_channels > int(m_num_of_feats)) {
144         for (uint i = 0; i < uint(n_channels); ++i) {
145             cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height)));
146             in_roi = patch_feats[i].mul(m_window);
147         }
148         CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal *>(real_input_arr),
149                                      complex_result.get_p_data()));
150     } else {
151         for (uint i = 0; i < uint(n_channels); ++i) {
152             cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height)));
153             in_roi = patch_feats[i].mul(m_window);
154         }
155         NORMAL_OMP_CRITICAL
156         {
157             CufftErrorCheck(
158                 cufftExecR2C(plan_fw, reinterpret_cast<cufftReal *>(real_input_arr), complex_result.get_p_data()));
159             cudaStreamSynchronize(cudaStreamPerThread);
160         }
161     }
162     return;
163 }
164
165 void cuFFT::inverse(ComplexMat &complex_input, cv::Mat &real_result, float *real_result_arr)
166 {
167     int n_channels = complex_input.n_channels;
168     cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_p_data());
169
170     if (n_channels == 1) {
171         NORMAL_OMP_CRITICAL
172         {
173             CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal *>(real_result_arr)));
174             cudaStreamSynchronize(cudaStreamPerThread);
175         }
176         real_result = real_result / (m_width * m_height);
177         return;
178     } else if (n_channels == int(m_num_of_scales)) {
179         CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal *>(real_result_arr)));
180         cudaStreamSynchronize(cudaStreamPerThread);
181
182         real_result = real_result / (m_width * m_height);
183         return;
184     } else if (n_channels == int(m_num_of_feats) * int(m_num_of_scales)) {
185         CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal *>(real_result_arr)));
186         cudaStreamSynchronize(cudaStreamPerThread);
187         return;
188     }
189     NORMAL_OMP_CRITICAL
190     {
191         CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal *>(real_result_arr)));
192 #if defined(OPENMP) && !defined(BIG_BATCH)
193         CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
194 #endif
195     }
196     return;
197 }
198
199 cuFFT::~cuFFT()
200 {
201     CufftErrorCheck(cufftDestroy(plan_f));
202     CufftErrorCheck(cufftDestroy(plan_fw));
203     CufftErrorCheck(cufftDestroy(plan_i_1ch));
204     CufftErrorCheck(cufftDestroy(plan_i_features));
205
206     if (BIG_BATCH_MODE) {
207         CufftErrorCheck(cufftDestroy(plan_f_all_scales));
208         CufftErrorCheck(cufftDestroy(plan_fw_all_scales));
209         CufftErrorCheck(cufftDestroy(plan_i_1ch_all_scales));
210         CufftErrorCheck(cufftDestroy(plan_i_features_all_scales));
211     }
212 }