// src/complexmat.cu
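// Data layout note (inferred from the kernels below): p_data stores n_channels
// planes (counted across all scales) of rows * cols complex values as
// interleaved float pairs, so floats at indices 2k and 2k + 1 form one
// complex element.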
#include "complexmat.hpp"


__global__ void sqr_norm_kernel(const float *in, float *block_res, int total)
{
    extern __shared__ float sdata[];
    int in_idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int i = threadIdx.x;

    // Each thread loads one complex element and stores its squared magnitude
    // in shared memory; threads past the end of the input contribute zero.
    if (in_idx >= total * 2)
        sdata[i] = 0;
    else
        sdata[i] = in[in_idx] * in[in_idx] + in[in_idx + 1] * in[in_idx + 1];

    // Shared-memory tree reduction. Correct for power-of-two block sizes (the
    // launch below uses 1024); for other sizes some partial sums would be
    // dropped as s is halved.
    for (unsigned s = (blockDim.x + 1) / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (i < s)
            sdata[i] += sdata[i + s];
    }

    // Thread 0 publishes this block's partial sum.
    if (i == 0)
        block_res[blockIdx.x] = sdata[0];
}

void ComplexMat_::sqr_norm(DynMem &result) const
{
    assert(n_scales == 1);

    const uint total = n_channels * rows * cols;
    const dim3 threads(1024);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    DynMem block_res(blocks.x);

    sqr_norm_kernel<<<blocks, threads, threads.x * sizeof(float)>>>((const float*)p_data.deviceMem(),
                                                                    block_res.deviceMem(), total);
    CudaCheckError();
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));

    // Second reduction stage: sum the per-block partial results on the host
    // and normalize by the number of elements per channel.
    T res = 0;
    for (uint i = 0; i < blocks.x; i++)
        res += block_res[i];
    result.hostMem()[0] = res / static_cast<T>(cols * rows);
}
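// Typical call pattern (a sketch only; `xf` is a placeholder name and T is
// assumed to be float, as the casts above suggest):
//
//     DynMem norm(1);
//     xf.sqr_norm(norm);            // xf: a single-scale ComplexMat
//     float n = norm.hostMem()[0];  // = sum of |z|^2 / (rows * cols)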

__global__ void sqr_mag_kernel(const float *data, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Store |z|^2 in the real part and zero out the imaginary part.
    result[threadId] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
    result[threadId + 1] = 0;
}

ComplexMat ComplexMat::sqr_mag() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    // One thread per complex element; rows * cols must not exceed the
    // per-block thread limit (1024 on current GPUs).
    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void conj_kernel(const float *data, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Copy the real part, negate the imaginary part.
    result[threadId] = data[threadId];
    result[threadId + 1] = -data[threadId + 1];
}

ComplexMat ComplexMat::conj() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    conj_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= num_channel_elem)
        return;

    // Each thread accumulates one float position across all channels; since
    // real and imaginary parts are interleaved, both are summed independently.
    float acc = 0;
    for (uint i = 0; i < channels; ++i)
        acc += src[idx + i * num_channel_elem];
    dest[idx] = acc;
}

ComplexMat ComplexMat::sum_over_channels() const
{
    assert(p_data.num_elem == n_channels * rows * cols);

    uint n_channels_per_scale = n_channels / n_scales;
    uint scale_offset = n_channels_per_scale * rows * cols;

    ComplexMat_ result(this->rows, this->cols, 1, n_scales);

    // One thread per float (two floats per complex element).
    const uint total = rows * cols * 2;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint scale = 0; scale < n_scales; ++scale) {
        // The result has a single channel per scale, so its per-scale stride
        // is rows * cols, not scale_offset.
        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
                                          n_channels_per_scale, total);
    }
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
    return result;
}

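// Complex multiplication: (a + bi)(c + di) = (ac - bd) + (ad + bc)i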
__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * data_r[threadId] - data_l[threadId + 1] * data_r[threadId + 1];
    result[threadId + 1] = data_l[threadId] * data_r[threadId + 1] + data_l[threadId + 1] * data_r[threadId];
}

// Element-wise per-channel multiplication, division and addition
ComplexMat ComplexMat::operator*(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

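// Complex division: (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2)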
__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Note: there is no guard against a zero denominator.
    result[threadId] = (data_l[threadId] * data_r[threadId] + data_l[threadId + 1] * data_r[threadId + 1]) /
                       (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
    result[threadId + 1] = (data_l[threadId + 1] * data_r[threadId] - data_l[threadId] * data_r[threadId + 1]) /
                           (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
}

ComplexMat ComplexMat::operator/(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] + data_r[threadId];
    result[threadId + 1] = data_l[threadId + 1] + data_r[threadId + 1];
}

ComplexMat ComplexMat::operator+(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(const float *data_l, float constant, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * constant;
    result[threadId + 1] = data_l[threadId + 1] * constant;
}

ComplexMat ComplexMat::operator*(const float &rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                           rhs,
                                                           (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void constant_add_kernel(const float *data_l, float constant, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Adding a real constant affects only the real part.
    result[threadId] = data_l[threadId] + constant;
    result[threadId + 1] = data_l[threadId + 1];
}

ComplexMat ComplexMat::operator+(const float &rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    constant_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                           rhs,
                                                           (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    // data_r has a single channel, so the same (row, col) element is reused
    // for every channel of data_l.
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
}

// Element-wise multiplication of a multi-channel matrix by a single-channel
// matrix (rhs has one channel, broadcast over all channels of lhs)
ComplexMat ComplexMat::mul(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                              (float*)rhs.p_data.deviceMem(),
                                                              (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    // data_r holds one set of channels shared by all scales; index it by
    // channel (blockIdx.x) and in-channel position only, ignoring the scale.
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
}

// Element-wise multiplication of a multi-scale matrix by a matrix whose
// channels are shared across scales (rhs has n_channels / n_scales channels).
// Only referenced by the commented-out mul2 below.
// ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
// {
//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);

//     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

//     dim3 threadsPerBlock(rows, cols);
//     dim3 numBlocks(n_channels / n_scales, n_scales);
//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
//     CudaCheckError();

//     return result;
// }

// void ComplexMat::operator=(ComplexMat &&rhs)
// {
//     cols = rhs.cols;
//     rows = rhs.rows;
//     n_channels = rhs.n_channels;
//     n_scales = rhs.n_scales;

//     p_data = rhs.p_data;

//     rhs.p_data = nullptr;
// }