// src/complexmat.cu
#include "complexmat.hpp"

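// Reduce |z|^2 = re^2 + im^2 over all complex elements of one scale:
// each thread loads one (re, im) pair, then a shared-memory tree
// reduction leaves one partial sum per block in block_res.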
__global__ void sqr_norm_kernel(const float *in, float *block_res, int total)
{
    extern __shared__ float sdata[];
    int in_idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int i = threadIdx.x;

    if (in_idx >= total * 2)
        sdata[i] = 0;
    else
        sdata[i] = in[in_idx] * in[in_idx] + in[in_idx + 1] * in[in_idx + 1];

    for (unsigned s = (blockDim.x + 1) / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (i < s)
            sdata[i] += sdata[i + s];
    }

    if (i == 0)
        block_res[blockIdx.x] = sdata[0];
}

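// Launch one reduction per scale, then finish the per-block partial
// sums on the host and normalize by the number of elements per channel.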
void ComplexMat_::sqr_norm(DynMem &result) const
{
    assert(result.num_elem == n_scales);

    const uint total = n_channels / n_scales * rows * cols;
    const dim3 threads(1024);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    DynMem block_res(blocks.x * n_scales);

    for (uint s = 0; s < n_scales; ++s) {
        sqr_norm_kernel<<<blocks, threads, threads.x * sizeof(float)>>>((const float*)(p_data.deviceMem() + s * total),
                                                                        block_res.deviceMem() + s * blocks.x, total);
        CudaCheckError();
    }
    cudaSync();

    for (uint s = 0; s < n_scales; ++s) {
        T res = 0;
        for (uint i = 0; i < blocks.x; i++)
            res += block_res[s * blocks.x + i];
        result.hostMem()[s] = res / static_cast<T>(cols * rows);
    }
}

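// Per-element squared magnitude: the result keeps the complex layout,
// with |z|^2 in the real slot and 0 in the imaginary slot.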
__global__ void sqr_mag_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data[idx] * data[idx] + data[idx + 1] * data[idx + 1];
        result[idx + 1] = 0;
    }
}

ComplexMat_ ComplexMat_::sqr_mag() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    sqr_mag_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                           (float*)result.p_data.deviceMem(),
                                           total);
    CudaCheckError();

    return result;
}

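// Complex conjugate: copy the real part, negate the imaginary part.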
__global__ void conj_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data[idx];
        result[idx + 1] = -data[idx + 1];
    }
}

ComplexMat_ ComplexMat_::conj() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    conj_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem(), total);
    CudaCheckError();

    return result;
}

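// One thread per (re or im) float of a single channel; each thread
// accumulates that position across all channels of one scale.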
__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= num_channel_elem)
        return;

    float acc = 0;
    for (uint i = 0; i < channels; ++i)
        acc += src[idx + i * num_channel_elem];
    dest[idx] = acc;
}

ComplexMat_ ComplexMat_::sum_over_channels() const
{
    assert(p_data.num_elem == n_channels * rows * cols);

    uint n_channels_per_scale = n_channels / n_scales;

    ComplexMat_ result(this->rows, this->cols, 1, n_scales);

    const uint total = rows * cols * 2;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint scale = 0; scale < n_scales; ++scale) {
        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * n_channels_per_scale * rows * cols),
                                          n_channels_per_scale, total);
        CudaCheckError();
    }
    return result;
}

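// Element-wise complex product: (a + bi)(c + di) = (ac - bd) + (ad + bc)i.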
__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * data_r[idx] - data_l[idx + 1] * data_r[idx + 1];
        result[idx + 1] = data_l[idx] * data_r[idx + 1] + data_l[idx + 1] * data_r[idx];
    }
}

// element-wise per-channel multiplication, division and addition
ComplexMat_ ComplexMat_::operator*(const ComplexMat_ &rhs) const
{
    assert(n_channels == n_scales * rhs.n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels / n_scales * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint s = 0; s < n_scales; ++s) {
        same_num_channels_mul_kernel<<<blocks, threads, 0>>>((float*)(this->p_data.deviceMem() + s * total),
                                                             (float*)rhs.p_data.deviceMem(),
                                                             (float*)(result.p_data.deviceMem() + s * total),
                                                             total);
        CudaCheckError();
    }

    return result;
}

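// Element-wise complex quotient:
// (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2).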
__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        const float denom = data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1];
        result[idx] = (data_l[idx] * data_r[idx] + data_l[idx + 1] * data_r[idx + 1]) / denom;
        result[idx + 1] = (data_l[idx + 1] * data_r[idx] - data_l[idx] * data_r[idx + 1]) / denom;
    }
}

ComplexMat_ ComplexMat_::operator/(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_div_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(), total);
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] + data_r[idx];
        result[idx + 1] = data_l[idx + 1] + data_r[idx + 1];
    }
}

ComplexMat_ ComplexMat_::operator+(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(),
                                                         total);
    CudaCheckError();

    return result;
}

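// Scale both components by a real constant.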
__global__ void constant_mul_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * constant;
        result[idx + 1] = data_l[idx + 1] * constant;
    }
}

ComplexMat_ ComplexMat_::operator*(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

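// Add a real constant: only the real component changes.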
__global__ void constant_add_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] + constant;
        result[idx + 1] = data_l[idx + 1];
    }
}

ComplexMat_ ComplexMat_::operator+(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

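// Broadcast multiply: rhs has a single channel, so its index wraps
// modulo one channel's worth of floats (2 * channel_total).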
__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result,
                                       int channel_total, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int one_ch_idx = idx % (2 * channel_total);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * data_r[one_ch_idx] - data_l[idx + 1] * data_r[one_ch_idx + 1];
        result[idx + 1] = data_l[idx] * data_r[one_ch_idx + 1] + data_l[idx + 1] * data_r[one_ch_idx];
    }
}

// element-wise multiplication of a multi-channel matrix by a single-channel matrix
ComplexMat_ ComplexMat_::mul(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    one_channel_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                   (float*)rhs.p_data.deviceMem(),
                                                   (float*)result.p_data.deviceMem(),
                                                   rows * cols, total);
    CudaCheckError();

    return result;
}

// __global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
// {
//     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
//     int idx = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
//     int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

//     result[idx] = data_l[idx] * data_r[one_ch_index] - data_l[idx + 1] * data_r[one_ch_index + 1];
//     result[idx + 1] = data_l[idx] * data_r[one_ch_index + 1] + data_l[idx + 1] * data_r[one_ch_index];
// }

// element-wise multiplication of a multi-scale matrix by a single-scale, multi-channel matrix
// ComplexMat_ ComplexMat_::mul2(const ComplexMat_ &rhs) const
// {
//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);

//     ComplexMat_ result(this->rows, this->cols, this->channels(), this->n_scales);

//     dim3 threadsPerBlock(rows, cols);
//     dim3 numBlocks(n_channels / n_scales, n_scales);
//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
//     CudaCheckError();

//     return result;
// }

// void ComplexMat_::operator=(ComplexMat_ &&rhs)
// {
//     cols = rhs.cols;
//     rows = rhs.rows;
//     n_channels = rhs.n_channels;
//     n_scales = rhs.n_scales;

//     p_data = rhs.p_data;

//     rhs.p_data = nullptr;
// }

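// Block the calling host thread until all work queued on the
// per-thread default stream has finished.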
void ComplexMat_::cudaSync() const
{
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
}