// src/complexmat.cu
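// Data layout note (inferred from the kernels below): p_data stores n_channels
// planes (counted across all scales) of rows * cols complex values as
// interleaved float pairs, so floats at indices 2k and 2k + 1 form one
// complex element.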
#include "complexmat.hpp"


__global__ void sqr_norm_kernel(const float *in, float *block_res, int total)
{
    extern __shared__ float sdata[];
    int in_idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    int i = threadIdx.x;

    // Each thread loads one complex element and stores its squared magnitude
    // in shared memory; threads past the end of the input contribute zero.
    if (in_idx >= total * 2)
        sdata[i] = 0;
    else
        sdata[i] = in[in_idx] * in[in_idx] + in[in_idx + 1] * in[in_idx + 1];

    // Shared-memory tree reduction. Correct for power-of-two block sizes (the
    // launch below uses 1024); for other sizes some partial sums would be
    // dropped as s is halved.
    for (unsigned s = (blockDim.x + 1) / 2; s > 0; s >>= 1) {
        __syncthreads();
        if (i < s)
            sdata[i] += sdata[i + s];
    }

    // Thread 0 publishes this block's partial sum.
    if (i == 0)
        block_res[blockIdx.x] = sdata[0];
}

void ComplexMat_::sqr_norm(DynMem &result) const
{
    assert(n_scales == 1);

    const uint total = n_channels * rows * cols;
    const dim3 threads(1024);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    DynMem block_res(blocks.x);

    sqr_norm_kernel<<<blocks, threads, threads.x * sizeof(float)>>>((const float*)p_data.deviceMem(),
                                                                    block_res.deviceMem(), total);
    CudaCheckError();
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));

    // Second reduction stage: sum the per-block partial results on the host
    // and normalize by the number of elements per channel.
    T res = 0;
    for (uint i = 0; i < blocks.x; i++)
        res += block_res[i];
    result.hostMem()[0] = res / static_cast<T>(cols * rows);
}
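// Typical call pattern (a sketch only; `xf` is a placeholder name and T is
// assumed to be float, as the casts above suggest):
//
//     DynMem norm(1);
//     xf.sqr_norm(norm);            // xf: a single-scale ComplexMat
//     float n = norm.hostMem()[0];  // = sum of |z|^2 / (rows * cols)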

__global__ void sqr_mag_kernel(const float *data, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Store |z|^2 in the real part and zero out the imaginary part.
    result[threadId] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
    result[threadId + 1] = 0;
}

ComplexMat ComplexMat::sqr_mag() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    // One thread per complex element; rows * cols must not exceed the
    // per-block thread limit (1024 on current GPUs).
    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void conj_kernel(const float *data, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Copy the real part, negate the imaginary part.
    result[threadId] = data[threadId];
    result[threadId + 1] = -data[threadId + 1];
}

ComplexMat ComplexMat::conj() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    conj_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= num_channel_elem)
        return;

    // Each thread accumulates one float position across all channels; since
    // real and imaginary parts are interleaved, both are summed independently.
    float acc = 0;
    for (uint i = 0; i < channels; ++i)
        acc += src[idx + i * num_channel_elem];
    dest[idx] = acc;
}

ComplexMat ComplexMat::sum_over_channels() const
{
    assert(p_data.num_elem == n_channels * rows * cols);

    uint n_channels_per_scale = n_channels / n_scales;
    uint scale_offset = n_channels_per_scale * rows * cols;

    ComplexMat_ result(this->rows, this->cols, 1, n_scales);

    // One thread per float (two floats per complex element).
    const uint total = rows * cols * 2;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint scale = 0; scale < n_scales; ++scale) {
        // The result has a single channel per scale, so its per-scale stride
        // is rows * cols, not scale_offset.
        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
                                          n_channels_per_scale, total);
    }
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
    return result;
}

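// Complex multiplication: (a + bi)(c + di) = (ac - bd) + (ad + bc)i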
__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * data_r[threadId] - data_l[threadId + 1] * data_r[threadId + 1];
    result[threadId + 1] = data_l[threadId] * data_r[threadId + 1] + data_l[threadId + 1] * data_r[threadId];
}

// Element-wise per-channel multiplication, division and addition
ComplexMat ComplexMat::operator*(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

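// Complex division: (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2)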
__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Note: there is no guard against a zero denominator.
    result[threadId] = (data_l[threadId] * data_r[threadId] + data_l[threadId + 1] * data_r[threadId + 1]) /
                       (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
    result[threadId + 1] = (data_l[threadId + 1] * data_r[threadId] - data_l[threadId] * data_r[threadId + 1]) /
                           (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
}

ComplexMat ComplexMat::operator/(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] + data_r[threadId];
    result[threadId + 1] = data_l[threadId + 1] + data_r[threadId + 1];
}

ComplexMat ComplexMat::operator+(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(const float *data_l, float constant, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * constant;
    result[threadId + 1] = data_l[threadId + 1] * constant;
}

ComplexMat ComplexMat::operator*(const float &rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                           rhs,
                                                           (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void constant_add_kernel(const float *data_l, float constant, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Adding a real constant affects only the real part.
    result[threadId] = data_l[threadId] + constant;
    result[threadId + 1] = data_l[threadId + 1];
}

ComplexMat ComplexMat::operator+(const float &rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    constant_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                           rhs,
                                                           (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    // data_r has a single channel, so the same (row, col) element is reused
    // for every channel of data_l.
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
}

// Element-wise multiplication of a multi-channel matrix by a single-channel
// matrix (rhs has one channel, broadcast over all channels of lhs)
ComplexMat ComplexMat::mul(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                              (float*)rhs.p_data.deviceMem(),
                                                              (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    // data_r holds one set of channels shared by all scales; index it by
    // channel (blockIdx.x) and in-channel position only, ignoring the scale.
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
}

// Element-wise multiplication of a multi-scale matrix by a matrix whose
// channels are shared across scales (rhs has n_channels / n_scales channels).
// Only referenced by the commented-out mul2 below.
// ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
// {
//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);

//     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

//     dim3 threadsPerBlock(rows, cols);
//     dim3 numBlocks(n_channels / n_scales, n_scales);
//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
//     CudaCheckError();

//     return result;
// }

// void ComplexMat::operator=(ComplexMat &&rhs)
// {
//     cols = rhs.cols;
//     rows = rhs.rows;
//     n_channels = rhs.n_channels;
//     n_scales = rhs.n_scales;

//     p_data = rhs.p_data;

//     rhs.p_data = nullptr;
// }