#include "complexmat.hpp"


// Per-block partial sums of squared magnitudes. Each thread loads one complex
// element (two consecutive floats), squares it into shared memory, and a tree
// reduction leaves the block's sum in sdata[0]. Assumes blockDim.x is a power
// of two.
__global__ void sqr_norm_kernel(const float *in, float *block_res, int total)
{
    extern __shared__ float sdata[];
    int in_idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int i = threadIdx.x;

    if (in_idx >= total * 2)
        sdata[i] = 0;
    else
        sdata[i] = in[in_idx] * in[in_idx] + in[in_idx + 1] * in[in_idx + 1];

    for (unsigned s = (blockDim.x + 1) / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (i < s)
            sdata[i] += sdata[i + s];
    }

    if (i == 0)
        block_res[blockIdx.x] = sdata[0];
}

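// A host-side reference for the kernel above (a sketch for testing only, not
// part of the tracker; assumes <complex> is available):
//
// static float sqr_norm_reference(const std::complex<float> *in, int total)
// {
//     float sum = 0;
//     for (int i = 0; i < total; ++i)
//         sum += in[i].real() * in[i].real() + in[i].imag() * in[i].imag();
//     return sum; // must equal the sum of all block_res entries
// }
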
void ComplexMat_::sqr_norm(DynMem &result) const
{
    assert(n_scales == 1);

    const uint total = n_channels * rows * cols;
    const dim3 threads(1024);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    DynMem block_res(blocks.x);

    sqr_norm_kernel<<<blocks, threads, threads.x * sizeof(float)>>>((const float*)p_data.deviceMem(),
                                                                    block_res.deviceMem(), total);
    CudaCheckError();
    // The kernel must finish before block_res is read on the host below.
    cudaSync();

    T res = 0;
    for (uint i = 0; i < blocks.x; i++)
        res += block_res[i];
    result.hostMem()[0] = res / static_cast<T>(cols * rows);
}

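// Typical call site (a sketch; `xf` and the one-element result buffer are
// hypothetical, but match the single-scale requirement asserted above):
//
// DynMem norm(1);
// xf.sqr_norm(norm);
// float n = norm.hostMem()[0]; // sum of |z|^2 over all channels, / (rows * cols)
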
// Squared magnitude per element: (re, im) -> (re^2 + im^2, 0).
__global__ void sqr_mag_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data[idx] * data[idx] + data[idx + 1] * data[idx + 1];
        result[idx + 1] = 0;
    }
}

ComplexMat_ ComplexMat_::sqr_mag() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    sqr_mag_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                           (float*)result.p_data.deviceMem(),
                                           total);
    CudaCheckError();

    return result;
}

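// sqr_mag() is equivalent to z * conj(z) element-wise; a quick consistency
// check (a sketch; `a` is a hypothetical matrix):
//
// ComplexMat_ b = a.sqr_mag();
// ComplexMat_ c = a * a.conj(); // b and c should be equal (imaginary parts zero)
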
// Complex conjugate per element: (re, im) -> (re, -im).
__global__ void conj_kernel(const float *data, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data[idx];
        result[idx + 1] = -data[idx + 1];
    }
}

ComplexMat_ ComplexMat_::conj() const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    conj_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem(), total);
    CudaCheckError();

    return result;
}

// Sums `channels` planes of `num_channel_elem` floats each into `dest`. The
// data is treated as raw floats, so real and imaginary parts are summed alike.
__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= num_channel_elem)
        return;

    float acc = 0;
    for (uint i = 0; i < channels; ++i)
        acc += src[idx + i * num_channel_elem];
    dest[idx] = acc;
}

ComplexMat_ ComplexMat_::sum_over_channels() const
{
    assert(p_data.num_elem == n_channels * rows * cols);

    uint n_channels_per_scale = n_channels / n_scales;
    uint scale_offset = n_channels_per_scale * rows * cols;

    ComplexMat_ result(this->rows, this->cols, 1, n_scales);

    const uint total = rows * cols * 2;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint scale = 0; scale < n_scales; ++scale) {
        // The result has a single channel per scale, so it advances by
        // rows * cols elements per scale, while the source advances by the
        // full multi-channel scale_offset.
        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
                                          n_channels_per_scale, total);
    }
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
    return result;
}

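// Layout assumed above: channels of one scale are contiguous and scales follow
// one another. E.g. with n_scales = 2 and n_channels_per_scale = 3 the source
// is [s0c0 s0c1 s0c2 s1c0 s1c1 s1c2] and the result is [sum(s0c*) sum(s1c*)].
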
// Complex product per element: (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * data_r[idx] - data_l[idx + 1] * data_r[idx + 1];
        result[idx + 1] = data_l[idx] * data_r[idx + 1] + data_l[idx + 1] * data_r[idx];
    }
}

// element-wise per-channel multiplication, division and addition
ComplexMat_ ComplexMat_::operator*(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(),
                                                         total);
    CudaCheckError();

    return result;
}

// Complex quotient per element:
// (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2).
__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result, unsigned total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = (data_l[idx] * data_r[idx] + data_l[idx + 1] * data_r[idx + 1]) /
               (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
        result[idx + 1] = (data_l[idx + 1] * data_r[idx] - data_l[idx] * data_r[idx + 1]) /
               (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
    }
}

ComplexMat_ ComplexMat_::operator/(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_div_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(), total);
    CudaCheckError();

    return result;
}

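// The operators compose like ordinary arithmetic. For example, the KCF-style
// frequency-domain ridge-regression solution could be written as (a sketch;
// yf, kf and lambda are hypothetical names, not defined in this file):
//
// ComplexMat_ alphaf = yf / (kf + lambda);
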
__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] + data_r[idx];
        result[idx + 1] = data_l[idx + 1] + data_r[idx + 1];
    }
}

ComplexMat_ ComplexMat_::operator+(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    same_num_channels_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                         (float*)rhs.p_data.deviceMem(),
                                                         (float*)result.p_data.deviceMem(),
                                                         total);
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * constant;
        result[idx + 1] = data_l[idx + 1] * constant;
    }
}

ComplexMat_ ComplexMat_::operator*(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

// Adds the constant to the real part only: (a + bi) + k = (a + k) + bi.
__global__ void constant_add_kernel(const float *data_l, float constant, float *result, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] + constant;
        result[idx + 1] = data_l[idx + 1];
    }
}

ComplexMat_ ComplexMat_::operator+(const float &rhs) const
{
    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    constant_add_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                rhs,
                                                (float*)result.p_data.deviceMem(),
                                                total);
    CudaCheckError();

    return result;
}

// Broadcast multiply: the single-channel rhs is reused for every lhs channel
// by wrapping the index modulo one channel's size (in floats).
__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result,
                                       int channel_total, int total)
{
    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int one_ch_idx = idx % (2 * channel_total);

    if (idx / 2 < total) {
        result[idx] = data_l[idx] * data_r[one_ch_idx] - data_l[idx + 1] * data_r[one_ch_idx + 1];
        result[idx + 1] = data_l[idx] * data_r[one_ch_idx + 1] + data_l[idx + 1] * data_r[one_ch_idx];
    }
}

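// Worked indexing example for the broadcast above: with rows * cols = 4
// (channel_total = 4, i.e. 8 floats per channel), the thread with idx = 10
// lies in the second lhs channel and maps to one_ch_idx = 10 % 8 = 2, i.e.
// rhs complex element 1, as intended.
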
// element-wise multiplication of a multi-channel matrix by a one-channel
// matrix (the rhs channel is broadcast across all lhs channels)
ComplexMat_ ComplexMat_::mul(const ComplexMat_ &rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat_ result = ComplexMat_::same_size(*this);

    const uint total = n_channels * rows * cols;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    one_channel_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
                                                   (float*)rhs.p_data.deviceMem(),
                                                   (float*)result.p_data.deviceMem(),
                                                   rows * cols, total);
    CudaCheckError();

    return result;
}

// __global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
// {
//     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
//     int idx = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
//     int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

//     result[idx] = data_l[idx] * data_r[one_ch_index] - data_l[idx + 1] * data_r[one_ch_index + 1];
//     result[idx + 1] = data_l[idx] * data_r[one_ch_index + 1] + data_l[idx + 1] * data_r[one_ch_index];
// }

// element-wise multiplication of a multi-scale matrix by a matrix holding one
// scale's worth of channels (rhs channels are broadcast across scales)
// ComplexMat_ ComplexMat_::mul2(const ComplexMat_ &rhs) const
// {
//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);

//     ComplexMat_ result(this->rows, this->cols, this->channels(), this->n_scales);

//     dim3 threadsPerBlock(rows, cols);
//     dim3 numBlocks(n_channels / n_scales, n_scales);
//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
//     CudaCheckError();

//     return result;
// }

// void ComplexMat_::operator=(ComplexMat_ &&rhs)
// {
//     cols = rhs.cols;
//     rows = rhs.rows;
//     n_channels = rhs.n_channels;
//     n_scales = rhs.n_scales;

//     p_data = rhs.p_data;

//     rhs.p_data = nullptr;
// }

void ComplexMat_::cudaSync() const
{
    // All kernels in this file run on the per-thread default stream;
    // synchronizing it makes their results visible to the host.
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
}
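
// Callers that read hostMem() after launching one of the kernels above should
// call cudaSync() first; see sqr_norm() for the pattern.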