// src/complexmat.cu (hercules2020/kcf.git)
#include "complexmat.hpp"


__global__ void sqr_norm_kernel(const float *in, float *block_res, int total)
{
    extern __shared__ float sdata[];
    int in_idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int i = threadIdx.x;

    if (in_idx >= total * 2)
        sdata[i] = 0;
    else
        sdata[i] = in[in_idx] * in[in_idx] + in[in_idx + 1] * in[in_idx + 1];

    for (unsigned s = (blockDim.x + 1) / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (i < s)
            sdata[i] += sdata[i + s];
    }

    if (i == 0)
        block_res[blockIdx.x] = sdata[0];
}
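
// Worked example of the tree reduction above (illustration only): with
// blockDim.x = 8, s takes the values 4, 2, 1. After the s = 4 pass,
// sdata[0..3] each hold a pair-sum; after s = 2, sdata[0..1] hold four
// elements each; after s = 1, sdata[0] holds the whole block. The bound
// (blockDim.x + 1) / 2 assumes blockDim.x is a power of two (it is 1024
// below); other block sizes would leave some elements out of the sum.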

void ComplexMat_::sqr_norm(DynMem &result) const
{
    assert(n_scales == 1);

    const uint total = n_channels * rows * cols;
    const dim3 threads(1024);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    DynMem block_res(blocks.x);

    sqr_norm_kernel<<<blocks, threads, threads.x * sizeof(float)>>>((const float*)p_data.deviceMem(),
                                                                    block_res.deviceMem(), total);
    CudaCheckError();
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));

    T res = 0;
    for (uint i = 0; i < blocks.x; i++)
        res += block_res[i];
    result.hostMem()[0] = res / static_cast<T>(cols * rows);
}
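
// Usage sketch (hypothetical caller; a minimal illustration based on the
// interfaces used above, not code from this repository):
//
//     DynMem norm(1);              // room for one float result
//     mat.sqr_norm(norm);          // mat: a single-scale ComplexMat_
//     float n = norm.hostMem()[0]; // sum of |z|^2, divided by rows * cols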

__global__ void sqr_mag_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    // tail guard: total need not be a multiple of the block size
    if (idx / 2 >= total)
        return;

    result[idx] = data[idx] * data[idx] + data[idx + 1] * data[idx + 1];
    result[idx + 1] = 0;
}

ComplexMat_ ComplexMat_::sqr_mag() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    sqr_mag_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem(),
                                           total);
    CudaCheckError();

    return result;
}

__global__ void conj_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 >= total)
        return;

    result[idx] = data[idx];
    result[idx + 1] = -data[idx + 1];
}

ComplexMat_ ComplexMat_::conj() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    conj_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem(), total);
    CudaCheckError();

    return result;
}

__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= num_channel_elem)
        return;

    float acc = 0;
    for (uint i = 0; i < channels; ++i)
        acc += src[idx + i * num_channel_elem];
    dest[idx] = acc;
}
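
// Note: sum_channels() sums raw floats, so when it is pointed at interleaved
// complex data (as in sum_over_channels() below) the real and imaginary parts
// accumulate independently, which is exactly the element-wise complex sum
// over channels.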

ComplexMat_ ComplexMat_::sum_over_channels() const
{
    assert(p_data.num_elem == n_channels * rows * cols);

    uint n_channels_per_scale = n_channels / n_scales;
    uint scale_offset = n_channels_per_scale * rows * cols;

    ComplexMat_ result(this->rows, this->cols, 1, n_scales);

    const uint total = rows * cols * 2;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint scale = 0; scale < n_scales; ++scale) {
        // The result holds a single channel per scale, so its per-scale
        // stride is rows * cols, not scale_offset.
        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
                                          n_channels_per_scale, total);
    }
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
    return result;
}

__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 >= total)
        return;

    result[idx] = data_l[idx] * data_r[idx] - data_l[idx + 1] * data_r[idx + 1];
    result[idx + 1] = data_l[idx] * data_r[idx + 1] + data_l[idx + 1] * data_r[idx];
}
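
// The kernel above computes the textbook complex product
//     (a + bi)(c + di) = (ac - bd) + (ad + bc)i
// with a = data_l[idx], b = data_l[idx + 1], c = data_r[idx] and
// d = data_r[idx + 1].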

// element-wise per channel multiplication, division and addition
ComplexMat_ ComplexMat_::operator*(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(),
                                                         total);
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 >= total)
        return;

    result[idx] = (data_l[idx] * data_r[idx] + data_l[idx + 1] * data_r[idx + 1]) /
                  (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
    result[idx + 1] = (data_l[idx + 1] * data_r[idx] - data_l[idx] * data_r[idx + 1]) /
                      (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
}
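
// The kernel above divides by multiplying with the conjugate of the
// denominator:
//     (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2)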

ComplexMat_ ComplexMat_::operator/(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_div_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(),
                                                         total);
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 >= total)
        return;

    result[idx] = data_l[idx] + data_r[idx];
    result[idx + 1] = data_l[idx + 1] + data_r[idx + 1];
}

ComplexMat_ ComplexMat_::operator+(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(),
                                                         total);
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 >= total)
        return;

    result[idx] = data_l[idx] * constant;
    result[idx + 1] = data_l[idx + 1] * constant;
}

ComplexMat_ ComplexMat_::operator*(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

__global__ void constant_add_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 >= total)
        return;

    result[idx] = data_l[idx] + constant;
    result[idx + 1] = data_l[idx + 1];
}

ComplexMat_ ComplexMat_::operator+(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result,
                                       int channel_total, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int one_ch_idx = idx % (2 * channel_total);

    if (idx / 2 >= total)
        return;

    result[idx] = data_l[idx] * data_r[one_ch_idx] - data_l[idx + 1] * data_r[one_ch_idx + 1];
    result[idx + 1] = data_l[idx] * data_r[one_ch_idx + 1] + data_l[idx + 1] * data_r[one_ch_idx];
}
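
// Index example for the broadcast above (illustration only): with
// channel_total = rows * cols = 4, the thread handling complex element 5
// (idx = 10, i.e. channel 1, element 1) reads the single-channel rhs at
// one_ch_idx = 10 % 8 = 2, i.e. its element 1, so every lhs channel is
// multiplied by the same rhs plane.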

// multiplying element-wise multichannel by one channel mats (rhs mat has one channel)
ComplexMat_ ComplexMat_::mul(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    one_channel_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                   (float*)rhs.p_data.deviceMem(),
                                                   (float*)result.p_data.deviceMem(),
                                                   rows * cols, total);
    CudaCheckError();

    return result;
}

// __global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
// {
//     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
//     int idx = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
//     int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

//     result[idx] = data_l[idx] * data_r[one_ch_index] - data_l[idx + 1] * data_r[one_ch_index + 1];
//     result[idx + 1] = data_l[idx] * data_r[one_ch_index + 1] + data_l[idx + 1] * data_r[one_ch_index];
// }

// multiplying element-wise multichannel mats by a per-scale one channel mat (rhs has n_channels / n_scales channels)
// ComplexMat_ ComplexMat_::mul2(const ComplexMat_ &rhs) const
// {
//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);

//     ComplexMat_ result(this->rows, this->cols, this->channels(), this->n_scales);

//     dim3 threadsPerBlock(rows, cols);
//     dim3 numBlocks(n_channels / n_scales, n_scales);
//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
//     CudaCheckError();

//     return result;
// }

// void ComplexMat_::operator=(ComplexMat_ &&rhs)
// {
//     cols = rhs.cols;
//     rows = rhs.rows;
//     n_channels = rhs.n_channels;
//     n_scales = rhs.n_scales;

//     p_data = rhs.p_data;

//     rhs.p_data = nullptr;
// }