// src/complexmat.cu -- hercules2020/kcf.git
#include "complexmat.hpp"

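// Data layout: p_data stores interleaved complex floats, so element k occupies
// data[2*k] (real part) and data[2*k + 1] (imaginary part). The element-wise
// kernels below therefore index by idx = 2 * thread_id and guard with
// idx / 2 < total, where total counts complex elements, not floats.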
__global__ void sqr_norm_kernel(const float *in, float *block_res, int total)
{
    extern __shared__ float sdata[];
    int in_idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int i = threadIdx.x;

    if (in_idx >= total * 2)
        sdata[i] = 0;
    else
        sdata[i] = in[in_idx] * in[in_idx] + in[in_idx + 1] * in[in_idx + 1];

    for (unsigned s = (blockDim.x + 1) / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (i < s)
            sdata[i] += sdata[i + s];
    }

    if (i == 0)
        block_res[blockIdx.x] = sdata[0];
}

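// Per-scale squared L2 norm, normalized by the number of pixels. Each block
// writes one partial sum into block_res (laid out as [scale][block]); the
// partial sums are reduced on the host after cudaSync().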
void ComplexMat_::sqr_norm(DynMem &result) const
{
    assert(result.num_elem == n_scales);

    const uint total = n_channels / n_scales * rows * cols;
    const dim3 threads(1024);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    DynMem block_res(blocks.x * n_scales);

    for (uint s = 0; s < n_scales; ++s) {
        sqr_norm_kernel<<<blocks, threads, threads.x * sizeof(float)>>>((const float*)(p_data.deviceMem() + s * total),
                                                                        block_res.deviceMem() + s * blocks.x, total);
        CudaCheckError();
    }
    cudaSync();

    for (uint s = 0; s < n_scales; ++s) {
        T res = 0;
        for (uint i = 0; i < blocks.x; i++)
            res += block_res[s * blocks.x + i];
        result.hostMem()[s] = res / static_cast<T>(cols * rows);
    }
}

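// Squared magnitude: |z|^2 is stored in the real slot, the imaginary slot is zeroed.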
__global__ void sqr_mag_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data[idx] * data[idx] + data[idx + 1] * data[idx + 1];
        result[idx + 1] = 0;
    }
}

ComplexMat_ ComplexMat_::sqr_mag() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    sqr_mag_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                           (float*)result.p_data.deviceMem(),
                                           total);
    CudaCheckError();

    return result;
}

__global__ void conj_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data[idx];
        result[idx + 1] = -data[idx + 1];
    }
}

ComplexMat_ ComplexMat_::conj() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    conj_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem(), total);
    CudaCheckError();

    return result;
}

__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= num_channel_elem)
        return;

    float acc = 0;
    for (uint i = 0; i < channels; ++i)
        acc += src[idx + i * num_channel_elem];
    dest[idx] = acc;
}

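// Collapses the channels of each scale into a single channel. Note the
// asymmetric strides: the source advances by n_channels_per_scale * rows * cols
// complex elements per scale, while the one-channel result advances by only
// rows * cols.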
ComplexMat_ ComplexMat_::sum_over_channels() const
{
    assert(p_data.num_elem == n_channels * rows * cols);

    uint n_channels_per_scale = n_channels / n_scales;
    uint scale_offset = n_channels_per_scale * rows * cols;

    ComplexMat_ result(this->rows, this->cols, 1, n_scales);

    const uint total = rows * cols * 2;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint scale = 0; scale < n_scales; ++scale) {
        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
                                          n_channels_per_scale, total);
    }
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
    return result;
}

__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * data_r[idx] - data_l[idx + 1] * data_r[idx + 1];
        result[idx + 1] = data_l[idx] * data_r[idx + 1] + data_l[idx + 1] * data_r[idx];
    }
}

// element-wise per channel multiplication, division and addition
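// In BIG_BATCH mode operator* additionally broadcasts across scales: the lhs
// may carry n_scales * rhs.n_channels channels, and the same rhs block is
// multiplied into every scale (hence the assert and per-scale launches below).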
ComplexMat_ ComplexMat_::operator*(const ComplexMat_ &rhs) const
{
    assert(n_channels == n_scales * rhs.n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels / n_scales * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint s = 0; s < n_scales; ++s) {
        same_num_channels_mul_kernel<<<blocks, threads, 0>>>((float*)(this->p_data.deviceMem() + s * total),
                                                             (float*)rhs.p_data.deviceMem(),
                                                             (float*)(result.p_data.deviceMem() + s * total),
                                                             total);
        CudaCheckError();
    }

    return result;
}

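// Complex division: (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2).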
__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result, unsigned total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = (data_l[idx] * data_r[idx] + data_l[idx + 1] * data_r[idx + 1]) /
               (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
        result[idx + 1] = (data_l[idx + 1] * data_r[idx] - data_l[idx] * data_r[idx + 1]) /
               (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
    }
}

ComplexMat_ ComplexMat_::operator/(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_div_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(), total);
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] + data_r[idx];
        result[idx + 1] = data_l[idx + 1] + data_r[idx + 1];
    }
}

ComplexMat_ ComplexMat_::operator+(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(),
                                                         total);
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * constant;
        result[idx + 1] = data_l[idx + 1] * constant;
    }
}

ComplexMat_ ComplexMat_::operator*(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

__global__ void constant_add_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] + constant;
        result[idx + 1] = data_l[idx + 1];
    }
}

ComplexMat_ ComplexMat_::operator+(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

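// Broadcast multiply: rhs holds a single channel, so its index wraps with
// modulo 2 * channel_total while the lhs index runs over all channels.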
__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result,
                                       int channel_total, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int one_ch_idx = idx % (2 * channel_total);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * data_r[one_ch_idx] - data_l[idx + 1] * data_r[one_ch_idx + 1];
        result[idx + 1] = data_l[idx] * data_r[one_ch_idx + 1] + data_l[idx + 1] * data_r[one_ch_idx];
    }
}

// element-wise multiplication of a multi-channel mat by a one-channel mat (rhs has a single channel)
ComplexMat_ ComplexMat_::mul(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    one_channel_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                   (float*)rhs.p_data.deviceMem(),
                                                   (float*)result.p_data.deviceMem(),
                                                   rows * cols, total);
    CudaCheckError();

    return result;
}

// __global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
// {
//     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
//     int idx = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
//     int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

//     result[idx] = data_l[idx] * data_r[one_ch_index] - data_l[idx + 1] * data_r[one_ch_index + 1];
//     result[idx + 1] = data_l[idx] * data_r[one_ch_index + 1] + data_l[idx + 1] * data_r[one_ch_index];
// }

// element-wise multiplication of a multi-channel mat by a mat with n_channels / n_scales channels
// ComplexMat_ ComplexMat_::mul2(const ComplexMat_ &rhs) const
// {
//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);

//     ComplexMat_ result(this->rows, this->cols, this->channels(), this->n_scales);

//     dim3 threadsPerBlock(rows, cols);
//     dim3 numBlocks(n_channels / n_scales, n_scales);
//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
//     CudaCheckError();

//     return result;
// }

// void ComplexMat_::operator=(ComplexMat_ &&rhs)
// {
//     cols = rhs.cols;
//     rows = rhs.rows;
//     n_channels = rhs.n_channels;
//     n_scales = rhs.n_scales;

//     p_data = rhs.p_data;

//     rhs.p_data = nullptr;
// }

void ComplexMat_::cudaSync() const
{
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
}
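
// Usage sketch (illustrative only; assumes n_scales is accessible from the
// calling code and that DynMem is sized in floats, as in sqr_norm above):
//
//     ComplexMat_ xf = ...;      // frequency-domain mat with n_scales scales
//     DynMem norms(n_scales);    // one result slot per scale
//     xf.sqr_norm(norms);        // norms.hostMem()[s] = |xf_s|^2 / (rows * cols)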