#include "complexmat.hpp"


// Per-block partial sums of squared magnitudes. Each thread loads one complex
// element (two consecutive floats), squares it into shared memory, and a tree
// reduction leaves the block's sum in sdata[0]. Assumes blockDim.x is a power
// of two.
__global__ void sqr_norm_kernel(const float *in, float *block_res, int total)
{
    extern __shared__ float sdata[];
    int in_idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int i = threadIdx.x;

    if (in_idx >= total * 2)
        sdata[i] = 0;
    else
        sdata[i] = in[in_idx] * in[in_idx] + in[in_idx + 1] * in[in_idx + 1];

    for (unsigned s = (blockDim.x + 1) / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (i < s)
            sdata[i] += sdata[i + s];
    }

    if (i == 0)
        block_res[blockIdx.x] = sdata[0];
}

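// A host-side reference for the kernel above (a sketch for testing only, not
// part of the tracker; assumes <complex> is available):
//
// static float sqr_norm_reference(const std::complex<float> *in, int total)
// {
//     float sum = 0;
//     for (int i = 0; i < total; ++i)
//         sum += in[i].real() * in[i].real() + in[i].imag() * in[i].imag();
//     return sum; // must equal the sum of all block_res entries
// }
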
void ComplexMat_::sqr_norm(DynMem &result) const
{
    assert(n_scales == 1);

    const uint total = n_channels * rows * cols;
    const dim3 threads(1024);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    DynMem block_res(blocks.x);

    sqr_norm_kernel<<<blocks, threads, threads.x * sizeof(float)>>>((const float*)p_data.deviceMem(),
                                                                    block_res.deviceMem(), total);
    CudaCheckError();
    // The kernel must finish before block_res is read on the host below.
    cudaSync();

    T res = 0;
    for (uint i = 0; i < blocks.x; i++)
        res += block_res[i];
    result.hostMem()[0] = res / static_cast<T>(cols * rows);
}

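// Typical call site (a sketch; `xf` and the one-element result buffer are
// hypothetical, but match the single-scale requirement asserted above):
//
// DynMem norm(1);
// xf.sqr_norm(norm);
// float n = norm.hostMem()[0]; // sum of |z|^2 over all channels, / (rows * cols)
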
// Squared magnitude per element: (re, im) -> (re^2 + im^2, 0).
__global__ void sqr_mag_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data[idx] * data[idx] + data[idx + 1] * data[idx + 1];
        result[idx + 1] = 0;
    }
}

ComplexMat_ ComplexMat_::sqr_mag() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    sqr_mag_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                           (float*)result.p_data.deviceMem(),
                                           total);
    CudaCheckError();

    return result;
}

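// sqr_mag() is equivalent to z * conj(z) element-wise; a quick consistency
// check (a sketch; `a` is a hypothetical matrix):
//
// ComplexMat_ b = a.sqr_mag();
// ComplexMat_ c = a * a.conj(); // b and c should be equal (imaginary parts zero)
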
// Complex conjugate per element: (re, im) -> (re, -im).
__global__ void conj_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data[idx];
        result[idx + 1] = -data[idx + 1];
    }
}

ComplexMat_ ComplexMat_::conj() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    conj_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem(), total);
    CudaCheckError();

    return result;
}

// Sums `channels` planes of `num_channel_elem` floats each into `dest`. The
// data is treated as raw floats, so real and imaginary parts are summed alike.
__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= num_channel_elem)
        return;

    float acc = 0;
    for (uint i = 0; i < channels; ++i)
        acc += src[idx + i * num_channel_elem];
    dest[idx] = acc;
}

ComplexMat_ ComplexMat_::sum_over_channels() const
{
    assert(p_data.num_elem == n_channels * rows * cols);

    uint n_channels_per_scale = n_channels / n_scales;
    uint scale_offset = n_channels_per_scale * rows * cols;

    ComplexMat_ result(this->rows, this->cols, 1, n_scales);

    const uint total = rows * cols * 2;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint scale = 0; scale < n_scales; ++scale) {
        // The result has a single channel per scale, so it advances by
        // rows * cols elements per scale, while the source advances by the
        // full multi-channel scale_offset.
        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
                                          n_channels_per_scale, total);
    }
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
    return result;
}

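// Layout assumed above: channels of one scale are contiguous and scales follow
// one another. E.g. with n_scales = 2 and n_channels_per_scale = 3 the source
// is [s0c0 s0c1 s0c2 s1c0 s1c1 s1c2] and the result is [sum(s0c*) sum(s1c*)].
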
// Complex product per element: (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * data_r[idx] - data_l[idx + 1] * data_r[idx + 1];
        result[idx + 1] = data_l[idx] * data_r[idx + 1] + data_l[idx + 1] * data_r[idx];
    }
}

// element-wise per-channel multiplication, division and addition
ComplexMat_ ComplexMat_::operator*(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(),
                                                         total);
    CudaCheckError();

    return result;
}

// Complex quotient per element:
// (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2).
__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result, unsigned total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = (data_l[idx] * data_r[idx] + data_l[idx + 1] * data_r[idx + 1]) /
               (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
        result[idx + 1] = (data_l[idx + 1] * data_r[idx] - data_l[idx] * data_r[idx + 1]) /
               (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
    }
}

ComplexMat_ ComplexMat_::operator/(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_div_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(), total);
    CudaCheckError();

    return result;
}

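// The operators compose like ordinary arithmetic. For example, the KCF-style
// frequency-domain ridge-regression solution could be written as (a sketch;
// yf, kf and lambda are hypothetical names, not defined in this file):
//
// ComplexMat_ alphaf = yf / (kf + lambda);
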
__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] + data_r[idx];
        result[idx + 1] = data_l[idx + 1] + data_r[idx + 1];
    }
}

ComplexMat_ ComplexMat_::operator+(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(),
                                                         total);
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * constant;
        result[idx + 1] = data_l[idx + 1] * constant;
    }
}

ComplexMat_ ComplexMat_::operator*(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

// Adds the constant to the real part only: (a + bi) + k = (a + k) + bi.
__global__ void constant_add_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] + constant;
        result[idx + 1] = data_l[idx + 1];
    }
}

ComplexMat_ ComplexMat_::operator+(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

// Broadcast multiply: the single-channel rhs is reused for every lhs channel
// by wrapping the index modulo one channel's size (in floats).
__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result,
                                       int channel_total, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int one_ch_idx = idx % (2 * channel_total);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * data_r[one_ch_idx] - data_l[idx + 1] * data_r[one_ch_idx + 1];
        result[idx + 1] = data_l[idx] * data_r[one_ch_idx + 1] + data_l[idx + 1] * data_r[one_ch_idx];
    }
}

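// Worked indexing example for the broadcast above: with rows * cols = 4
// (channel_total = 4, i.e. 8 floats per channel), the thread with idx = 10
// lies in the second lhs channel and maps to one_ch_idx = 10 % 8 = 2, i.e.
// rhs complex element 1, as intended.
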
// element-wise multiplication of a multi-channel matrix by a one-channel
// matrix (the rhs channel is broadcast across all lhs channels)
ComplexMat_ ComplexMat_::mul(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    one_channel_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                   (float*)rhs.p_data.deviceMem(),
                                                   (float*)result.p_data.deviceMem(),
                                                   rows * cols, total);
    CudaCheckError();

    return result;
}

// __global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
// {
//     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
//     int idx = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
//     int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

//     result[idx] = data_l[idx] * data_r[one_ch_index] - data_l[idx + 1] * data_r[one_ch_index + 1];
//     result[idx + 1] = data_l[idx] * data_r[one_ch_index + 1] + data_l[idx + 1] * data_r[one_ch_index];
// }

// element-wise multiplication of a multi-scale matrix by a matrix holding one
// scale's worth of channels (rhs channels are broadcast across scales)
// ComplexMat_ ComplexMat_::mul2(const ComplexMat_ &rhs) const
// {
//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);

//     ComplexMat_ result(this->rows, this->cols, this->channels(), this->n_scales);

//     dim3 threadsPerBlock(rows, cols);
//     dim3 numBlocks(n_channels / n_scales, n_scales);
//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
//     CudaCheckError();

//     return result;
// }

// void ComplexMat_::operator=(ComplexMat_ &&rhs)
// {
//     cols = rhs.cols;
//     rows = rhs.rows;
//     n_channels = rhs.n_channels;
//     n_scales = rhs.n_scales;

//     p_data = rhs.p_data;

//     rhs.p_data = nullptr;
// }

void ComplexMat_::cudaSync() const
{
    // All kernels in this file run on the per-thread default stream;
    // synchronizing it makes their results visible to the host.
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
}
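
// Callers that read hostMem() after launching one of the kernels above should
// call cudaSync() first; see sqr_norm() for the pattern.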