]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/blob - src/complexmat.cu
Removed zero copy for complexmat now it is using normal device memory. Added zero...
[hercules2020/kcf.git] / src / complexmat.cu
1 #include "complexmat.cuh"
2
3 __global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, float cols)
4 {
5     extern __shared__ float sdata[];
6     int i = blockDim.x * threadIdx.y + threadIdx.x;
7     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
8     int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
9
10     sdata[i] = 0;
11     sdata[i] = data[threadId]*data[threadId] + data[threadId+1]*data[threadId+1];
12     __syncthreads();
13
14     for (unsigned int s=(blockDim.x*blockDim.y+1)/2, old_s = blockDim.x*blockDim.y;s>0; s>>=1) {
15     
16     if(old_s&1) s+=1;
17
18        if (i < s && i+s < old_s) {
19             sdata[i] += sdata[i + s];
20        }
21     old_s = s;
22     __syncthreads();
23     }
24     
25     if(i == 0){
26        atomicAdd(&out[blockId/n], sdata[0]/(rows*cols));
27     }
28 }
29
30 void ComplexMat::sqr_norm(float *result) const
31 {
32     cudaMemset(result, 0, n_scales*sizeof(float));
33     
34     dim3 threadsPerBlock(rows, cols);
35     dim3 numBlocks(n_channels/n_scales, n_scales);
36     
37     sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
38         
39     return;
40 }
41
42 __global__ void sqr_mag_kernel(float* data, float* result)
43 {
44         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
45         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
46
47         result[threadId] = data[threadId]*data[threadId] + data[threadId+1]*data[threadId+1];
48         result[threadId+1] = 0;
49 }
50
51 ComplexMat ComplexMat::sqr_mag() const
52 {
53     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
54     
55     dim3 threadsPerBlock(rows, cols);
56     dim3 numBlocks(n_channels/n_scales, n_scales);
57     sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
58     
59     return result;
60 }
61
62 __global__ void conj_kernel(float* data, float* result)
63 {
64         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
65         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
66         
67         result[threadId] =   data[threadId];
68         result[threadId+1] =  -data[threadId+1];
69 }
70
71 ComplexMat ComplexMat::conj() const
72 {
73     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
74     
75     dim3 threadsPerBlock(rows, cols);
76     dim3 numBlocks(n_channels/n_scales, n_scales);
77     conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);  
78     return result;
79 }
80
81 ComplexMat ComplexMat::sum_over_channels() const
82 {
83 //     assert(p_data.size() > 1);
84     ComplexMat result(this->rows, this->cols, 1);
85     return result;
86 }
87
88 cufftComplex* ComplexMat::get_p_data() const
89 {
90     return (cufftComplex*) p_data;
91 }
92
93 __global__ void same_num_channels_mul_kernel(float* data_l, float* data_r, float* result)
94 {
95         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
96         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
97         
98         result[threadId] =  data_l[threadId]*data_r[threadId] - data_l[threadId+1]*data_r[threadId+1];
99         result[threadId+1] = data_l[threadId]*data_r[threadId+1] + data_l[threadId+1]*data_r[threadId];
100 }
101
102 //element-wise per channel multiplication, division and addition
103 ComplexMat ComplexMat::operator*(const ComplexMat & rhs) const
104 {
105     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
106     
107     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
108
109     dim3 threadsPerBlock(rows, cols);
110     dim3 numBlocks(n_channels/n_scales, n_scales);
111     same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
112
113     return result;
114 }
115
116 __global__ void same_num_channels_div_kernel(float* data_l, float* data_r, float* result)
117 {
118         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
119         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
120         
121         result[threadId] =  (data_l[threadId]*data_r[threadId] + data_l[threadId+1]*data_r[threadId+1])/
122                             (data_r[threadId]*data_r[threadId] + data_r[threadId+1]*data_r[threadId+1]);
123         result[threadId+1] = (data_l[threadId+1]*data_r[threadId] - data_l[threadId]*data_r[threadId+1])/
124                             (data_r[threadId]*data_r[threadId] + data_r[threadId+1]*data_r[threadId+1]);
125 }
126
127 ComplexMat ComplexMat::operator/(const ComplexMat & rhs) const
128 {
129     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
130
131     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
132     
133     dim3 threadsPerBlock(rows, cols);
134     dim3 numBlocks(n_channels/n_scales, n_scales);
135     same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
136
137     return result;
138 }
139
140 __global__ void same_num_channels_add_kernel(float* data_l, float* data_r, float* result)
141 {
142         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
143         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
144         
145         result[threadId] =  data_l[threadId]+data_r[threadId];
146         result[threadId+1] = data_l[threadId+1]+data_r[threadId+1];
147 }
148
149 ComplexMat ComplexMat::operator+(const ComplexMat & rhs) const
150 {
151     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
152
153     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
154     
155     dim3 threadsPerBlock(rows, cols);
156     dim3 numBlocks(n_channels/n_scales, n_scales);
157     same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
158     
159     return result;
160 }
161
162 __global__ void constant_mul_kernel(float* data_l, float constant, float* result)
163 {
164         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
165         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
166         
167         result[threadId] =  data_l[threadId]*constant;
168         result[threadId+1] = data_l[threadId+1]*constant;
169 }
170
171 ComplexMat ComplexMat::operator*(const float & rhs) const
172 {
173     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
174     
175     dim3 threadsPerBlock(rows, cols);
176     dim3 numBlocks(n_channels/n_scales, n_scales);
177     constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
178
179     return result;
180 }
181
182 __global__ void constant_add_kernel(float* data_l, float constant, float* result)
183 {
184         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
185         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
186         
187         result[threadId] =  data_l[threadId]+constant;
188         result[threadId+1] = data_l[threadId+1];
189 }
190
191 ComplexMat ComplexMat::operator+(const float & rhs) const
192 {
193     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
194     
195     dim3 threadsPerBlock(rows, cols);
196     dim3 numBlocks(n_channels/n_scales, n_scales);
197     constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
198
199     return result;
200 }
201
202 __global__ void one_channel_mul_kernel(float* data_l, float* data_r, float* result)
203 {
204         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
205         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
206         int one_ch_index = 2*((threadIdx.y * blockDim.x) + threadIdx.x);
207         
208         result[threadId] =  data_l[threadId]*data_r[one_ch_index] - data_l[threadId+1]*data_r[one_ch_index+1];
209         result[threadId+1] = data_l[threadId]*data_r[one_ch_index+1] + data_l[threadId+1]*data_r[one_ch_index];
210 }
211
212 //multiplying element-wise multichannel by one channel mats (rhs mat is with one channel)
213 ComplexMat ComplexMat::mul(const ComplexMat & rhs) const
214 {
215     assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);
216
217     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
218     
219     dim3 threadsPerBlock(rows, cols);
220     dim3 numBlocks(n_channels/n_scales, n_scales);
221     one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
222     
223     return result;
224 }
225
226 __global__ void scales_channel_mul_kernel(float* data_l, float* data_r, float* result)
227 {
228         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
229         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
230         int one_ch_index = 2*((threadIdx.y * blockDim.x) + threadIdx.x+blockIdx.x*blockDim.x*blockDim.y);
231         
232         result[threadId] =  data_l[threadId]*data_r[one_ch_index] - data_l[threadId+1]*data_r[one_ch_index+1];
233         result[threadId+1] = data_l[threadId]*data_r[one_ch_index+1] + data_l[threadId+1]*data_r[one_ch_index];
234 }
235
236 //multiplying element-wise multichannel by one channel mats (rhs mat is with multiple channel)
237 ComplexMat ComplexMat::mul2(const ComplexMat & rhs) const
238 {
239     assert(rhs.n_channels == n_channels/n_scales && rhs.cols == cols && rhs.rows == rows);
240
241     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
242     
243     dim3 threadsPerBlock(rows, cols);
244     dim3 numBlocks(n_channels/n_scales, n_scales);
245     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
246     
247     return result;
248 }
249
250 void ComplexMat::operator=(ComplexMat & rhs)
251 {
252     cols = rhs.cols;
253     rows = rhs.rows;
254     n_channels = rhs.n_channels;
255     n_scales = rhs.n_scales;
256     
257     p_data = rhs.p_data;
258     
259 }
260
261 void ComplexMat::operator=(ComplexMat && rhs)
262 {
263     cols = rhs.cols;
264     rows = rhs.rows;
265     n_channels = rhs.n_channels;
266     n_scales = rhs.n_scales;
267     
268     p_data = rhs.p_data;
269     
270     rhs.p_data = nullptr;
271 }