#include "complexmat.hpp"

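// Per-block reduction computing the squared norm of each scale. Each block
// processes one channel: every thread loads the squared magnitude of one
// complex element into shared memory, the block reduces the values, and the
// block result is atomically added to the output slot of the owning scale,
// normalized by the number of elements (rows * cols).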
__global__ void sqr_norm_kernel(int n, float *out, const float *data, float rows, float cols)
{
    extern __shared__ float sdata[];
    int i = blockDim.x * threadIdx.y + threadIdx.x;
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    sdata[i] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
    __syncthreads();

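    // Tree reduction that also handles block sizes that are not a power of
    // two: when the remaining length old_s is odd, the stride s is rounded
    // up and the i + s < old_s guard keeps the extra threads in bounds.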
    for (unsigned int s = (blockDim.x * blockDim.y + 1) / 2, old_s = blockDim.x * blockDim.y; s > 0; s >>= 1) {

        if (old_s & 1) s += 1;

        if (i < s && i + s < old_s) {
            sdata[i] += sdata[i + s];
        }
        old_s = s;
        __syncthreads();
    }

    if (i == 0) {
        atomicAdd(&out[blockId / n], sdata[0] / (rows * cols));
    }
}

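// Launch configuration used throughout this file: one thread per complex
// element (threadsPerBlock = rows x cols) and one block per channel
// (numBlocks = channels-per-scale x scales). This assumes rows * cols does
// not exceed the device limit of 1024 threads per block.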
void ComplexMat::sqr_norm(DynMem &result) const
{
    CudaSafeCall(cudaMemsetAsync(result.deviceMem(), 0, n_scales * sizeof(float)));

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);

    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows * cols * sizeof(float)>>>(
        n_channels / n_scales, result.deviceMem(), (float*)this->p_data.deviceMem(), rows, cols);
    CudaCheckError();
}

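// Element-wise squared magnitude: the real part of the result receives
// re * re + im * im and the imaginary part is zeroed.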
__global__ void sqr_mag_kernel(const float *data, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
    result[threadId + 1] = 0;
}

ComplexMat ComplexMat::sqr_mag() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

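// Element-wise complex conjugate: copies the real part and negates the
// imaginary part.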
__global__ void conj_kernel(const float *data, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data[threadId];
    result[threadId + 1] = -data[threadId + 1];
}

ComplexMat ComplexMat::conj() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    conj_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

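// Sums 'channels' consecutive channels into one. The kernel works on the raw
// float layout (interleaved re/im pairs), so num_channel_elem counts floats
// (2 * rows * cols per channel) and each thread accumulates one float offset
// across all channels; summation is linear, so no complex arithmetic is
// needed.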
__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= num_channel_elem)
        return;

    float acc = 0;
    for (uint i = 0; i < channels; ++i)
        acc += src[idx + i * num_channel_elem];
    dest[idx] = acc;
}

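// One kernel launch per scale: the source pointer advances by the number of
// complex elements in one scale (scale_offset), the destination by one
// channel's worth (rows * cols complex elements), since the result holds a
// single channel per scale.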
ComplexMat ComplexMat::sum_over_channels() const
{
    assert(p_data.num_elem == n_channels * rows * cols);

    uint n_channels_per_scale = n_channels / n_scales;
    uint scale_offset = n_channels_per_scale * rows * cols;

    ComplexMat result(this->rows, this->cols, 1, n_scales);

    const uint total = rows * cols * 2;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint scale = 0; scale < n_scales; ++scale) {
        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
                                          n_channels_per_scale, total);
    }
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
    return result;
}

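// Element-wise complex multiplication:
// (a + bi)(c + di) = (ac - bd) + (ad + bc)i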
__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * data_r[threadId] - data_l[threadId + 1] * data_r[threadId + 1];
    result[threadId + 1] = data_l[threadId] * data_r[threadId + 1] + data_l[threadId + 1] * data_r[threadId];
}

// element-wise per channel multiplication, division and addition
ComplexMat ComplexMat::operator*(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

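// Element-wise complex division:
// (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c*c + d*d)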
__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = (data_l[threadId] * data_r[threadId] + data_l[threadId + 1] * data_r[threadId + 1]) /
                       (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
    result[threadId + 1] = (data_l[threadId + 1] * data_r[threadId] - data_l[threadId] * data_r[threadId + 1]) /
                           (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
}

ComplexMat ComplexMat::operator/(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] + data_r[threadId];
    result[threadId + 1] = data_l[threadId + 1] + data_r[threadId + 1];
}

ComplexMat ComplexMat::operator+(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(const float *data_l, float constant, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * constant;
    result[threadId + 1] = data_l[threadId + 1] * constant;
}

ComplexMat ComplexMat::operator*(const float &rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                           rhs,
                                                           (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

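// Adding a real constant affects only the real component; the imaginary
// component is copied unchanged.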
__global__ void constant_add_kernel(const float *data_l, float constant, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] + constant;
    result[threadId + 1] = data_l[threadId + 1];
}

ComplexMat ComplexMat::operator+(const float &rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    constant_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                           rhs,
                                                           (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

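// Complex multiplication with a single-channel rhs: one_ch_index depends
// only on the position inside a channel, so the same rhs element is
// broadcast to every channel of the lhs.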
__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
}

// element-wise multiplication of a multi-channel mat by a single-channel mat
ComplexMat ComplexMat::mul(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                              (float*)rhs.p_data.deviceMem(),
                                                              (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

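// Variant for an rhs holding one set of channels shared by all scales:
// one_ch_index includes the channel index (blockIdx.x) but not the scale
// (blockIdx.y), so each scale of the lhs is multiplied by the corresponding
// channel of the rhs.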
__global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
}

// element-wise multiplication of a multi-channel mat by a mat with one set
// of channels shared across all scales
// ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
// {
//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);

//     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

//     dim3 threadsPerBlock(rows, cols);
//     dim3 numBlocks(n_channels / n_scales, n_scales);
//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
//     CudaCheckError();

//     return result;
// }

// void ComplexMat::operator=(ComplexMat &&rhs)
// {
//     cols = rhs.cols;
//     rows = rhs.rows;
//     n_channels = rhs.n_channels;
//     n_scales = rhs.n_scales;

//     p_data = rhs.p_data;

//     rhs.p_data = nullptr;
// }