#include "complexmat.cuh"

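// Data layout: p_data stores each complex element as an interleaved (re, im) pair of
// floats (the cufftComplex layout); the kernels below read the real part at [threadId]
// and the imaginary part at [threadId + 1].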
__global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, float cols)
{
    extern __shared__ float sdata[];
    int i = blockDim.x * threadIdx.y + threadIdx.x;
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Each thread loads the squared magnitude of one complex element into shared memory.
    sdata[i] = data[threadId]*data[threadId] + data[threadId+1]*data[threadId+1];
    __syncthreads();

    // Tree reduction in shared memory; old_s tracks the previous active size so that
    // odd-sized partitions do not drop their last element.
    for (unsigned int s = (blockDim.x*blockDim.y + 1)/2, old_s = blockDim.x*blockDim.y; s > 0; s >>= 1) {
        if (old_s & 1) s += 1;

        if (i < s && i + s < old_s) {
            sdata[i] += sdata[i + s];
        }
        old_s = s;
        __syncthreads();
    }

    // The first thread of each block adds the normalized block sum to the per-scale output.
    if (i == 0) {
        atomicAdd(&out[blockId/n], sdata[0]/(rows*cols));
    }
}

float ComplexMat::sqr_norm() const
{
    // Accumulate the norm in a temporary device buffer and copy it back to the host.
    float result = 0.f;
    float *d_result;
    CudaSafeCall(cudaMalloc(&d_result, n_scales*sizeof(float)));
    CudaSafeCall(cudaMemset(d_result, 0, n_scales*sizeof(float)));

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels, 1);

    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels, d_result, p_data, rows, cols);
    CudaCheckError();

    CudaSafeCall(cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost));
    CudaSafeCall(cudaFree(d_result));

    return result;
}

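// Per-scale squared norm. `result` must point to device-accessible memory (device or
// managed) with room for n_scales floats: it is cleared with cudaMemset and then
// accumulated into by the kernel via atomicAdd.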
void ComplexMat::sqr_norm(float *result) const
{
    CudaSafeCall(cudaMemset(result, 0, n_scales*sizeof(float)));

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);

    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
    CudaCheckError();
}

__global__ void sqr_mag_kernel(float* data, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data[threadId]*data[threadId] + data[threadId+1]*data[threadId+1];
    result[threadId+1] = 0;
}

ComplexMat ComplexMat::sqr_mag() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void conj_kernel(float* data, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data[threadId];
    result[threadId+1] = -data[threadId+1];
}

ComplexMat ComplexMat::conj() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
    CudaCheckError();

    return result;
}

ComplexMat ComplexMat::sum_over_channels() const
{
    // TODO: not implemented yet -- returns a single-channel matrix whose data is never filled.
    ComplexMat result(this->rows, this->cols, 1);
    return result;
}

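// A minimal sketch (not part of the current implementation) of how the channel sum
// could be computed, assuming the n_channels planes of rows*cols complex elements are
// laid out back to back in p_data. The kernel name and launch shape are illustrative.
//
//   __global__ void sum_channels_kernel(const float* data, float* result, int n_channels, int plane_size)
//   {
//       int idx = blockIdx.x * blockDim.x + threadIdx.x;   // complex element index within one plane
//       if (idx >= plane_size) return;
//
//       float re = 0.f, im = 0.f;
//       for (int ch = 0; ch < n_channels; ++ch) {
//           re += data[2*(ch*plane_size + idx)];
//           im += data[2*(ch*plane_size + idx) + 1];
//       }
//       result[2*idx]     = re;
//       result[2*idx + 1] = im;
//   }
//
// launched with roughly (rows*cols + 255)/256 blocks of 256 threads and writing into
// result.p_data before sum_over_channels() returns.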
cufftComplex* ComplexMat::get_p_data() const
{
    return (cufftComplex*) p_data;
}

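// Complex multiplication, (a + bi)(c + di) = (ac - bd) + (ad + bc)i, applied
// element-wise by the multiplication kernels below.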
__global__ void same_num_channels_mul_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId]*data_r[threadId] - data_l[threadId+1]*data_r[threadId+1];
    result[threadId+1] = data_l[threadId]*data_r[threadId+1] + data_l[threadId+1]*data_r[threadId];
}

// Element-wise per-channel multiplication, division and addition
ComplexMat ComplexMat::operator*(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

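// Complex division: (a + bi)/(c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2),
// computed element-wise by the kernel below.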
__global__ void same_num_channels_div_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = (data_l[threadId]*data_r[threadId] + data_l[threadId+1]*data_r[threadId+1]) /
                       (data_r[threadId]*data_r[threadId] + data_r[threadId+1]*data_r[threadId+1]);
    result[threadId+1] = (data_l[threadId+1]*data_r[threadId] - data_l[threadId]*data_r[threadId+1]) /
                         (data_r[threadId]*data_r[threadId] + data_r[threadId+1]*data_r[threadId+1]);
}

ComplexMat ComplexMat::operator/(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_add_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] + data_r[threadId];
    result[threadId+1] = data_l[threadId+1] + data_r[threadId+1];
}

ComplexMat ComplexMat::operator+(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(float* data_l, float constant, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId]*constant;
    result[threadId+1] = data_l[threadId+1]*constant;
}

ComplexMat ComplexMat::operator*(const float & rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void constant_add_kernel(float* data_l, float constant, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // The constant is added to the real part only.
    result[threadId] = data_l[threadId] + constant;
    result[threadId+1] = data_l[threadId+1];
}

ComplexMat ComplexMat::operator+(const float & rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void one_channel_mul_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    // data_r has a single channel, so it is indexed by the in-block position only.
    int one_ch_index = 2*((threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId]*data_r[one_ch_index] - data_l[threadId+1]*data_r[one_ch_index+1];
    result[threadId+1] = data_l[threadId]*data_r[one_ch_index+1] + data_l[threadId+1]*data_r[one_ch_index];
}

// Element-wise multiplication of a multi-channel matrix by a single-channel matrix
// (rhs has one channel, which is reused for every channel of this matrix).
ComplexMat ComplexMat::mul(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void scales_channel_mul_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    // data_r holds one set of channels (blockIdx.x selects the channel), shared by all scales.
    int one_ch_index = 2*((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x*blockDim.x*blockDim.y);

    result[threadId] = data_l[threadId]*data_r[one_ch_index] - data_l[threadId+1]*data_r[one_ch_index+1];
    result[threadId+1] = data_l[threadId]*data_r[one_ch_index+1] + data_l[threadId+1]*data_r[one_ch_index];
}

// Element-wise multiplication across scales: rhs has n_channels/n_scales channels,
// and the same set of channels is reused for every scale of this matrix.
ComplexMat ComplexMat::mul2(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels/n_scales && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

// Shallow copy: alias rhs's buffer; foreign_data marks that this object does not own it.
void ComplexMat::operator=(ComplexMat & rhs)
{
    cols = rhs.cols;
    rows = rhs.rows;
    n_channels = rhs.n_channels;
    n_scales = rhs.n_scales;
    foreign_data = true;

    p_data = rhs.p_data;
}

// Move assignment: take over rhs's buffer and leave rhs empty.
void ComplexMat::operator=(ComplexMat && rhs)
{
    cols = rhs.cols;
    rows = rhs.rows;
    n_channels = rhs.n_channels;
    n_scales = rhs.n_scales;

    p_data = rhs.p_data;

    rhs.p_data = nullptr;
}