#include "complexmat.hpp"

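// Per-block reduction computing the squared norm of each scale. Each block
// processes one channel: every thread loads the squared magnitude of one
// complex element into shared memory, the block reduces the values, and the
// block result is atomically added to the output slot of the owning scale,
// normalized by the number of elements (rows * cols).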
__global__ void sqr_norm_kernel(int n, float *out, const float *data, float rows, float cols)
{
    extern __shared__ float sdata[];
    int i = blockDim.x * threadIdx.y + threadIdx.x;
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    sdata[i] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
    __syncthreads();

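    // Tree reduction that also handles block sizes that are not a power of
    // two: when the remaining length old_s is odd, the stride s is rounded
    // up and the i + s < old_s guard keeps the extra threads in bounds.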
    for (unsigned int s = (blockDim.x * blockDim.y + 1) / 2, old_s = blockDim.x * blockDim.y; s > 0; s >>= 1) {

        if (old_s & 1) s += 1;

        if (i < s && i + s < old_s) {
            sdata[i] += sdata[i + s];
        }
        old_s = s;
        __syncthreads();
    }

    if (i == 0) {
        atomicAdd(&out[blockId / n], sdata[0] / (rows * cols));
    }
}

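// Launch configuration used throughout this file: one thread per complex
// element (threadsPerBlock = rows x cols) and one block per channel
// (numBlocks = channels-per-scale x scales). This assumes rows * cols does
// not exceed the device limit of 1024 threads per block.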
void ComplexMat::sqr_norm(DynMem &result) const
{
    CudaSafeCall(cudaMemsetAsync(result.deviceMem(), 0, n_scales * sizeof(float)));

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);

    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows * cols * sizeof(float)>>>(
        n_channels / n_scales, result.deviceMem(), (float*)this->p_data.deviceMem(), rows, cols);
    CudaCheckError();
}

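// Element-wise squared magnitude: the real part of the result receives
// re * re + im * im and the imaginary part is zeroed.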
__global__ void sqr_mag_kernel(const float *data, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
    result[threadId + 1] = 0;
}

ComplexMat ComplexMat::sqr_mag() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

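// Element-wise complex conjugate: copies the real part and negates the
// imaginary part.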
__global__ void conj_kernel(const float *data, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data[threadId];
    result[threadId + 1] = -data[threadId + 1];
}

ComplexMat ComplexMat::conj() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    conj_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

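// Sums 'channels' consecutive channels into one. The kernel works on the raw
// float layout (interleaved re/im pairs), so num_channel_elem counts floats
// (2 * rows * cols per channel) and each thread accumulates one float offset
// across all channels; summation is linear, so no complex arithmetic is
// needed.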
__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= num_channel_elem)
        return;

    float acc = 0;
    for (uint i = 0; i < channels; ++i)
        acc += src[idx + i * num_channel_elem];
    dest[idx] = acc;
}

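// One kernel launch per scale: the source pointer advances by the number of
// complex elements in one scale (scale_offset), the destination by one
// channel's worth (rows * cols complex elements), since the result holds a
// single channel per scale.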
ComplexMat ComplexMat::sum_over_channels() const
{
    assert(p_data.num_elem == n_channels * rows * cols);

    uint n_channels_per_scale = n_channels / n_scales;
    uint scale_offset = n_channels_per_scale * rows * cols;

    ComplexMat result(this->rows, this->cols, 1, n_scales);

    const uint total = rows * cols * 2;
    const dim3 threads(256);
    const dim3 blocks((total + threads.x - 1) / threads.x);

    for (uint scale = 0; scale < n_scales; ++scale) {
        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
                                          n_channels_per_scale, total);
    }
    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
    return result;
}

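// Element-wise complex multiplication:
// (a + bi)(c + di) = (ac - bd) + (ad + bc)i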
__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * data_r[threadId] - data_l[threadId + 1] * data_r[threadId + 1];
    result[threadId + 1] = data_l[threadId] * data_r[threadId + 1] + data_l[threadId + 1] * data_r[threadId];
}

// element-wise per channel multiplication, division and addition
ComplexMat ComplexMat::operator*(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

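// Element-wise complex division:
// (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c*c + d*d)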
__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = (data_l[threadId] * data_r[threadId] + data_l[threadId + 1] * data_r[threadId + 1]) /
                       (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
    result[threadId + 1] = (data_l[threadId + 1] * data_r[threadId] - data_l[threadId] * data_r[threadId + 1]) /
                           (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
}

ComplexMat ComplexMat::operator/(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] + data_r[threadId];
    result[threadId + 1] = data_l[threadId + 1] + data_r[threadId + 1];
}

ComplexMat ComplexMat::operator+(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                                    (float*)rhs.p_data.deviceMem(),
                                                                    (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(const float *data_l, float constant, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * constant;
    result[threadId + 1] = data_l[threadId + 1] * constant;
}

ComplexMat ComplexMat::operator*(const float &rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                           rhs,
                                                           (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

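// Adding a real constant affects only the real component; the imaginary
// component is copied unchanged.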
__global__ void constant_add_kernel(const float *data_l, float constant, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] + constant;
    result[threadId + 1] = data_l[threadId + 1];
}

ComplexMat ComplexMat::operator+(const float &rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    constant_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                           rhs,
                                                           (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

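// Complex multiplication with a single-channel rhs: one_ch_index depends
// only on the position inside a channel, so the same rhs element is
// broadcast to every channel of the lhs.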
__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
}

// element-wise multiplication of a multi-channel mat by a single-channel mat
ComplexMat ComplexMat::mul(const ComplexMat &rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels / n_scales, n_scales);
    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
                                                              (float*)rhs.p_data.deviceMem(),
                                                              (float*)result.p_data.deviceMem());
    CudaCheckError();

    return result;
}

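// Variant for an rhs holding one set of channels shared by all scales:
// one_ch_index includes the channel index (blockIdx.x) but not the scale
// (blockIdx.y), so each scale of the lhs is multiplied by the corresponding
// channel of the rhs.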
__global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);

    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
}

// element-wise multiplication of a multi-channel mat by a mat with one set
// of channels shared across all scales
// ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
// {
//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);

//     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

//     dim3 threadsPerBlock(rows, cols);
//     dim3 numBlocks(n_channels / n_scales, n_scales);
//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
//     CudaCheckError();

//     return result;
// }

// void ComplexMat::operator=(ComplexMat &&rhs)
// {
//     cols = rhs.cols;
//     rows = rhs.rows;
//     n_channels = rhs.n_channels;
//     n_scales = rhs.n_scales;

//     p_data = rhs.p_data;

//     rhs.p_data = nullptr;
// }