// src/complexmat.cu — CUDA implementation of ComplexMat element-wise
// operations (complex matrices stored as interleaved re/im float pairs,
// channel-major). Part of the KCF tracker.
1 #include "complexmat.cuh"
2
// Accumulates the normalized squared L2 norm of each scale into out[scale].
// Expected launch (see ComplexMat::sqr_norm): grid = (n_channels/n_scales,
// n_scales) blocks, block = (rows, cols) threads, dynamic shared memory of
// rows*cols floats. `n` is the number of channels per scale (== gridDim.x),
// so blockId/n maps each block onto its scale's slot in `out`.
// `out` must be zeroed before launch (the host wrapper memsets it) because
// every block contributes via atomicAdd.
__global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, float cols)
{
    extern __shared__ float sdata[];
    // Flat thread index within the block: 0 .. blockDim.x*blockDim.y - 1.
    int i = blockDim.x * threadIdx.y + threadIdx.x;
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    // One complex element per thread; *2 converts the element index into a
    // float index in the interleaved re/im layout.
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    sdata[i] = 0;
    // Squared magnitude of this thread's element: re^2 + im^2.
    sdata[i] = data[threadId]*data[threadId] + data[threadId+1]*data[threadId+1];
    __syncthreads();

    // Shared-memory tree reduction that also handles non-power-of-two block
    // sizes: `old_s` is the count of valid partial sums entering the step,
    // `s` the pairing stride. When `old_s` is odd, `s` is bumped so the
    // unpaired element is still folded in; the `i+s < old_s` guard keeps
    // reads inside the valid range.
    for (unsigned int s=(blockDim.x*blockDim.y+1)/2, old_s = blockDim.x*blockDim.y;s>0; s>>=1) {
    
    if(old_s&1) s+=1;

       if (i < s && i+s < old_s) {
            sdata[i] += sdata[i + s];
       }
    old_s = s;
    // Barrier is safe inside the loop: the trip count depends only on
    // blockDim, so all threads of the block iterate identically.
    __syncthreads();
    }
    
    // One atomic per block; division by rows*cols normalizes by the number
    // of elements per channel.
    if(i == 0){
       atomicAdd(&out[blockId/n], sdata[0]/(rows*cols));
    }
}
29
// Computes the normalized squared L2 norm per scale into `result`
// (device array of n_scales floats). One block per channel, one thread per
// complex element; requires rows*cols <= max threads per block.
void ComplexMat::sqr_norm(float *result) const
{
    // The kernel accumulates with atomicAdd, so the output must start at zero.
    CudaSafeCall(cudaMemset(result, 0, n_scales * sizeof(float)));

    const int chan_per_scale = n_channels / n_scales;
    dim3 block(rows, cols);
    dim3 grid(chan_per_scale, n_scales);
    const size_t shmem_bytes = rows * cols * sizeof(float);

    sqr_norm_kernel<<<grid, block, shmem_bytes>>>(chan_per_scale, result, p_data, rows, cols);
    CudaCheckError();
}
42
// Replaces each complex element with its squared magnitude: (|z|^2, 0).
// One thread per complex element; the factor of 2 converts the element
// index into the interleaved re/im float index.
__global__ void sqr_mag_kernel(float* data, float* result)
{
    int block = blockIdx.x + blockIdx.y * gridDim.x;
    int idx = 2 * (block * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x);

    float re = data[idx];
    float im = data[idx + 1];
    result[idx] = re * re + im * im;
    result[idx + 1] = 0;
}
51
// Returns a new matrix holding the element-wise squared magnitudes.
ComplexMat ComplexMat::sqr_mag() const
{
    ComplexMat out(rows, cols, channels(), n_scales);

    dim3 block(rows, cols);
    dim3 grid(n_channels / n_scales, n_scales);
    sqr_mag_kernel<<<grid, block>>>(p_data, out.p_data);
    CudaCheckError();

    return out;
}
63
// Writes the complex conjugate of every element: real part copied,
// imaginary part negated. One thread per complex element.
__global__ void conj_kernel(float* data, float* result)
{
    int block = blockIdx.x + blockIdx.y * gridDim.x;
    int idx = 2 * (block * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x);

    result[idx] = data[idx];
    result[idx + 1] = -data[idx + 1];
}
72
// Returns a new matrix containing the element-wise complex conjugate.
ComplexMat ComplexMat::conj() const
{
    ComplexMat out(rows, cols, channels(), n_scales);

    dim3 block(rows, cols);
    dim3 grid(n_channels / n_scales, n_scales);
    conj_kernel<<<grid, block>>>(p_data, out.p_data);
    CudaCheckError();

    return out;
}
84
// Sums one complex output element across all channels. Channels are laid
// out contiguously (channel-major, interleaved re/im), `channel_stride` is
// the per-channel stride in floats (2*rows*cols). Launch: a single block of
// rows x cols threads, one thread per complex element.
__global__ void sum_channels_kernel(float* data, float* result, int n_channels, int channel_stride)
{
    int idx = 2 * (threadIdx.y * blockDim.x + threadIdx.x);
    float sum_re = 0;
    float sum_im = 0;
    for (int ch = 0; ch < n_channels; ++ch) {
        sum_re += data[ch * channel_stride + idx];
        sum_im += data[ch * channel_stride + idx + 1];
    }
    result[idx] = sum_re;
    result[idx + 1] = sum_im;
}

// Reduces the matrix over its channel dimension into a single-channel
// matrix. Fix: this used to be a stub that returned the freshly constructed
// (uninitialized) result without performing any summation.
// Requires rows*cols <= max threads per block, as elsewhere in this file.
ComplexMat ComplexMat::sum_over_channels() const
{
    assert(n_channels >= 1);
    ComplexMat result(this->rows, this->cols, 1);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(1, 1);
    sum_channels_kernel<<<numBlocks, threadsPerBlock>>>(p_data, result.p_data, n_channels, 2 * rows * cols);
    CudaCheckError();

    return result;
}
91
// Exposes the interleaved re/im float buffer as cuFFT's complex type
// (cufftComplex is a {float x, y} pair with the same layout).
cufftComplex* ComplexMat::get_p_data() const
{
    return reinterpret_cast<cufftComplex*>(p_data);
}
96
// Element-wise complex multiplication: result = l * r.
// Fix: operands are loaded into registers before any store. The original
// wrote result[threadId] and then re-read data_l[threadId] on the next
// line — a read-after-write hazard if result aliases an operand (in-place
// use), and redundant global loads otherwise.
__global__ void same_num_channels_mul_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    float l_re = data_l[threadId], l_im = data_l[threadId + 1];
    float r_re = data_r[threadId], r_im = data_r[threadId + 1];

    // (a+bi)(c+di) = (ac - bd) + (ad + bc)i
    result[threadId]     = l_re * r_re - l_im * r_im;
    result[threadId + 1] = l_re * r_im + l_im * r_re;
}
105
106 //element-wise per channel multiplication, division and addition
// Element-wise complex product with a matrix of identical shape and
// channel count.
ComplexMat ComplexMat::operator*(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat out(rows, cols, channels(), n_scales);

    dim3 block(rows, cols);
    dim3 grid(n_channels / n_scales, n_scales);
    same_num_channels_mul_kernel<<<grid, block>>>(p_data, rhs.p_data, out.p_data);
    CudaCheckError();

    return out;
}
120
// Element-wise complex division: result = l / r, via the conjugate form
// (l * conj(r)) / |r|^2.
// Fixes: the denominator |r|^2 was computed twice, and operands were
// re-read from global memory after result[threadId] had been written —
// wrong if result aliases an operand. All values now go through registers.
// Note: no guard against |r|^2 == 0 (matches original behavior: inf/nan).
__global__ void same_num_channels_div_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    float l_re = data_l[threadId], l_im = data_l[threadId + 1];
    float r_re = data_r[threadId], r_im = data_r[threadId + 1];
    float denom = r_re * r_re + r_im * r_im;

    result[threadId]     = (l_re * r_re + l_im * r_im) / denom;
    result[threadId + 1] = (l_im * r_re - l_re * r_im) / denom;
}
131
// Element-wise complex division by a matrix of identical shape and
// channel count.
ComplexMat ComplexMat::operator/(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat out(rows, cols, channels(), n_scales);

    dim3 block(rows, cols);
    dim3 grid(n_channels / n_scales, n_scales);
    same_num_channels_div_kernel<<<grid, block>>>(p_data, rhs.p_data, out.p_data);
    CudaCheckError();

    return out;
}
145
// Element-wise complex addition: real and imaginary parts added
// independently. One thread per complex element.
__global__ void same_num_channels_add_kernel(float* data_l, float* data_r, float* result)
{
    int block = blockIdx.x + blockIdx.y * gridDim.x;
    int idx = 2 * (block * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x);

    result[idx] = data_l[idx] + data_r[idx];
    result[idx + 1] = data_l[idx + 1] + data_r[idx + 1];
}
154
// Element-wise sum with a matrix of identical shape and channel count.
ComplexMat ComplexMat::operator+(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat out(rows, cols, channels(), n_scales);

    dim3 block(rows, cols);
    dim3 grid(n_channels / n_scales, n_scales);
    same_num_channels_add_kernel<<<grid, block>>>(p_data, rhs.p_data, out.p_data);
    CudaCheckError();

    return out;
}
168
// Scales every complex element by a real constant (both components
// multiplied). One thread per complex element.
__global__ void constant_mul_kernel(float* data_l, float constant, float* result)
{
    int block = blockIdx.x + blockIdx.y * gridDim.x;
    int idx = 2 * (block * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x);

    result[idx] = constant * data_l[idx];
    result[idx + 1] = constant * data_l[idx + 1];
}
177
// Multiplies every element by a real scalar.
ComplexMat ComplexMat::operator*(const float & rhs) const
{
    ComplexMat out(rows, cols, channels(), n_scales);

    dim3 block(rows, cols);
    dim3 grid(n_channels / n_scales, n_scales);
    constant_mul_kernel<<<grid, block>>>(p_data, rhs, out.p_data);
    CudaCheckError();

    return out;
}
189
// Adds a real constant to every complex element: only the real component
// changes, the imaginary component is copied through unchanged.
__global__ void constant_add_kernel(float* data_l, float constant, float* result)
{
    int block = blockIdx.x + blockIdx.y * gridDim.x;
    int idx = 2 * (block * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x);

    result[idx] = constant + data_l[idx];
    result[idx + 1] = data_l[idx + 1];
}
198
// Adds a real scalar to every element (real component only).
ComplexMat ComplexMat::operator+(const float & rhs) const
{
    ComplexMat out(rows, cols, channels(), n_scales);

    dim3 block(rows, cols);
    dim3 grid(n_channels / n_scales, n_scales);
    constant_add_kernel<<<grid, block>>>(p_data, rhs, out.p_data);
    CudaCheckError();

    return out;
}
210
// Complex multiply of a multi-channel matrix by a single-channel matrix:
// every channel of `data_l` is multiplied by the same rhs element at the
// corresponding pixel (index within the block only, no block offset).
// Fix: operands cached in registers before the store — the original
// re-read data_l[threadId] after writing result[threadId], which breaks
// in-place use and costs extra global loads.
__global__ void one_channel_mul_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x);

    float l_re = data_l[threadId], l_im = data_l[threadId + 1];
    float r_re = data_r[one_ch_index], r_im = data_r[one_ch_index + 1];

    result[threadId]     = l_re * r_re - l_im * r_im;
    result[threadId + 1] = l_re * r_im + l_im * r_re;
}
220
221 //multiplying element-wise multichannel by one channel mats (rhs mat is with one channel)
// Element-wise complex product with a single-channel matrix: rhs is
// broadcast across every channel of *this.
ComplexMat ComplexMat::mul(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat out(rows, cols, channels(), n_scales);

    dim3 block(rows, cols);
    dim3 grid(n_channels / n_scales, n_scales);
    one_channel_mul_kernel<<<grid, block>>>(p_data, rhs.p_data, out.p_data);
    CudaCheckError();

    return out;
}
235
// Complex multiply of an n_scales-scale matrix by a one-scale matrix:
// the rhs element is selected by channel (blockIdx.x) and pixel, so the
// same rhs channel is reused by every scale (blockIdx.y ignored for rhs).
// Fix: operands cached in registers before the store — the original
// re-read data_l[threadId] after writing result[threadId], which breaks
// in-place use and costs extra global loads.
__global__ void scales_channel_mul_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

    float l_re = data_l[threadId], l_im = data_l[threadId + 1];
    float r_re = data_r[one_ch_index], r_im = data_r[one_ch_index + 1];

    result[threadId]     = l_re * r_re - l_im * r_im;
    result[threadId + 1] = l_re * r_im + l_im * r_re;
}
245
246 //multiplying element-wise multichannel by one channel mats (rhs mat is with multiple channel)
// Element-wise complex product with a one-scale matrix: rhs (which has
// n_channels/n_scales channels) is broadcast across every scale of *this.
ComplexMat ComplexMat::mul2(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels/n_scales && rhs.cols == cols && rhs.rows == rows);

    ComplexMat out(rows, cols, channels(), n_scales);

    dim3 block(rows, cols);
    dim3 grid(n_channels / n_scales, n_scales);
    scales_channel_mul_kernel<<<grid, block>>>(p_data, rhs.p_data, out.p_data);
    CudaCheckError();

    return out;
}
260
// Shallow copy-assignment: this object becomes a non-owning alias of rhs's
// device buffer (no data is copied). foreign_data = true marks the buffer
// as borrowed so this object will not free the shared pointer — preventing
// a double free of the same device allocation.
// NOTE(review): any buffer previously owned by *this is NOT released here;
// if *this already held its own allocation this leaks it. Confirm against
// the destructor/ownership logic in complexmat.cuh.
void ComplexMat::operator=(ComplexMat & rhs)
{
    cols = rhs.cols;
    rows = rhs.rows;
    n_channels = rhs.n_channels;
    n_scales = rhs.n_scales;
    foreign_data = true;
    
    p_data = rhs.p_data;
}
271
// Move-assignment: steals rhs's device buffer and nulls rhs.p_data so rhs's
// destructor cannot free the transferred allocation.
// NOTE(review): the buffer previously held by *this is NOT released here
// (potential leak if *this owned data), and foreign_data is left unchanged —
// moving into an object that was previously a foreign-data alias may keep a
// stale ownership flag. Confirm against complexmat.cuh.
void ComplexMat::operator=(ComplexMat && rhs)
{
    cols = rhs.cols;
    rows = rhs.rows;
    n_channels = rhs.n_channels;
    n_scales = rhs.n_scales;
    
    p_data = rhs.p_data;
    
    rhs.p_data = nullptr;
}