src/complexmat.cu

   1 #include "complexmat.cuh"
   2
   3 __global__ void sqr_norm_kernel(int n, float *out, float *data, float rows, float cols)
   4 {
   5     extern __shared__ float sdata[];
   6     int i = blockDim.x * threadIdx.y + threadIdx.x;
   7     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
   8     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
   9
  10     sdata[i] = 0;
  11     sdata[i] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
  12     __syncthreads();
  13
  14     for (unsigned int s = (blockDim.x * blockDim.y + 1) / 2, old_s = blockDim.x * blockDim.y; s > 0; s >>= 1) {
  15
  16         if (old_s & 1) s += 1;
  17
  18         if (i < s && i + s < old_s) {
  19             sdata[i] += sdata[i + s];
  20         }
  21         old_s = s;
  22         __syncthreads();
  23     }
  24
  25     if (i == 0) {
  26         atomicAdd(&out[blockId / n], sdata[0] / (rows * cols));
  27     }
  28 }
  29
  30 void ComplexMat::sqr_norm(DynMem &result) const
  31 {
  32     CudaSafeCall(cudaMemsetAsync(result.deviceMem(), 0, n_scales * sizeof(float)));
  33
  34     dim3 threadsPerBlock(rows, cols);
  35     dim3 numBlocks(n_channels / n_scales, n_scales);
  36
  37     sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows * cols * sizeof(float)>>>(
  38         n_channels / n_scales, result.deviceMem(), this->p_data, rows, cols);
  39     CudaCheckError();
  40
  41     return;
  42 }
  43
  44 __global__ void sqr_mag_kernel(float *data, float *result)
  45 {
  46     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  47     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
  48
  49     result[threadId] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
  50     result[threadId + 1] = 0;
  51 }
  52
  53 ComplexMat ComplexMat::sqr_mag() const
  54 {
  55     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
  56
  57     dim3 threadsPerBlock(rows, cols);
  58     dim3 numBlocks(n_channels / n_scales, n_scales);
  59     sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, result.p_data);
  60     CudaCheckError();
  61
  62     return result;
  63 }
  64
  65 __global__ void conj_kernel(float *data, float *result)
  66 {
  67     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  68     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
  69
  70     result[threadId] = data[threadId];
  71     result[threadId + 1] = -data[threadId + 1];
  72 }
  73
  74 ComplexMat ComplexMat::conj() const
  75 {
  76     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
  77
  78     dim3 threadsPerBlock(rows, cols);
  79     dim3 numBlocks(n_channels / n_scales, n_scales);
  80     conj_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, result.p_data);
  81     CudaCheckError();
  82
  83     return result;
  84 }
  85
  86 ComplexMat ComplexMat::sum_over_channels() const
  87 {
  88     //     assert(p_data.size() > 1);
  89     ComplexMat result(this->rows, this->cols, 1);
  90     return result;
  91 }
  92
  93 cufftComplex *ComplexMat::get_p_data() const
  94 {
  95     return (cufftComplex *)p_data;
  96 }
  97
  98 __global__ void same_num_channels_mul_kernel(float *data_l, float *data_r, float *result)
  99 {
 100     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 101     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 102
 103     result[threadId] = data_l[threadId] * data_r[threadId] - data_l[threadId + 1] * data_r[threadId + 1];
 104     result[threadId + 1] = data_l[threadId] * data_r[threadId + 1] + data_l[threadId + 1] * data_r[threadId];
 105 }
 106
 107 // element-wise per channel multiplication, division and addition
 108 ComplexMat ComplexMat::operator*(const ComplexMat &rhs) const
 109 {
 110     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 111
 112     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 113
 114     dim3 threadsPerBlock(rows, cols);
 115     dim3 numBlocks(n_channels / n_scales, n_scales);
 116     same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data,
 117                                                                                   result.p_data);
 118     CudaCheckError();
 119
 120     return result;
 121 }
 122
 123 __global__ void same_num_channels_div_kernel(float *data_l, float *data_r, float *result)
 124 {
 125     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 126     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 127
 128     result[threadId] = (data_l[threadId] * data_r[threadId] + data_l[threadId + 1] * data_r[threadId + 1]) /
 129                        (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
 130     result[threadId + 1] = (data_l[threadId + 1] * data_r[threadId] - data_l[threadId] * data_r[threadId + 1]) /
 131                            (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
 132 }
 133
 134 ComplexMat ComplexMat::operator/(const ComplexMat &rhs) const
 135 {
 136     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 137
 138     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 139
 140     dim3 threadsPerBlock(rows, cols);
 141     dim3 numBlocks(n_channels / n_scales, n_scales);
 142     same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data,
 143                                                                                   result.p_data);
 144     CudaCheckError();
 145
 146     return result;
 147 }
 148
 149 __global__ void same_num_channels_add_kernel(float *data_l, float *data_r, float *result)
 150 {
 151     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 152     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 153
 154     result[threadId] = data_l[threadId] + data_r[threadId];
 155     result[threadId + 1] = data_l[threadId + 1] + data_r[threadId + 1];
 156 }
 157
 158 ComplexMat ComplexMat::operator+(const ComplexMat &rhs) const
 159 {
 160     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 161
 162     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 163
 164     dim3 threadsPerBlock(rows, cols);
 165     dim3 numBlocks(n_channels / n_scales, n_scales);
 166     same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data,
 167                                                                                   result.p_data);
 168     CudaCheckError();
 169
 170     return result;
 171 }
 172
 173 __global__ void constant_mul_kernel(float *data_l, float constant, float *result)
 174 {
 175     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 176     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 177
 178     result[threadId] = data_l[threadId] * constant;
 179     result[threadId + 1] = data_l[threadId + 1] * constant;
 180 }
 181
 182 ComplexMat ComplexMat::operator*(const float &rhs) const
 183 {
 184     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 185
 186     dim3 threadsPerBlock(rows, cols);
 187     dim3 numBlocks(n_channels / n_scales, n_scales);
 188     constant_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs, result.p_data);
 189     CudaCheckError();
 190
 191     return result;
 192 }
 193
 194 __global__ void constant_add_kernel(float *data_l, float constant, float *result)
 195 {
 196     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 197     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 198
 199     result[threadId] = data_l[threadId] + constant;
 200     result[threadId + 1] = data_l[threadId + 1];
 201 }
 202
 203 ComplexMat ComplexMat::operator+(const float &rhs) const
 204 {
 205     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 206
 207     dim3 threadsPerBlock(rows, cols);
 208     dim3 numBlocks(n_channels / n_scales, n_scales);
 209     constant_add_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs, result.p_data);
 210     CudaCheckError();
 211
 212     return result;
 213 }
 214
 215 __global__ void one_channel_mul_kernel(float *data_l, float *data_r, float *result)
 216 {
 217     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 218     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 219     int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x);
 220
 221     result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
 222     result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
 223 }
 224
 225 // multiplying element-wise multichannel by one channel mats (rhs mat is with one channel)
 226 ComplexMat ComplexMat::mul(const ComplexMat &rhs) const
 227 {
 228     assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);
 229
 230     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 231
 232     dim3 threadsPerBlock(rows, cols);
 233     dim3 numBlocks(n_channels / n_scales, n_scales);
 234     one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
 235     CudaCheckError();
 236
 237     return result;
 238 }
 239
 240 __global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
 241 {
 242     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 243     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 244     int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);
 245
 246     result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
 247     result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
 248 }
 249
 250 // multiplying element-wise multichannel by one channel mats (rhs mat is with multiple channel)
 251 ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
 252 {
 253     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);
 254
 255     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 256
 257     dim3 threadsPerBlock(rows, cols);
 258     dim3 numBlocks(n_channels / n_scales, n_scales);
 259     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
 260     CudaCheckError();
 261
 262     return result;
 263 }
 264
 265 void ComplexMat::operator=(ComplexMat &rhs)
 266 {
 267     cols = rhs.cols;
 268     rows = rhs.rows;
 269     n_channels = rhs.n_channels;
 270     n_scales = rhs.n_scales;
 271
 272     p_data = rhs.p_data;
 273 }
 274
 275 void ComplexMat::operator=(ComplexMat &&rhs)
 276 {
 277     cols = rhs.cols;
 278     rows = rhs.rows;
 279     n_channels = rhs.n_channels;
 280     n_scales = rhs.n_scales;
 281
 282     p_data = rhs.p_data;
 283
 284     rhs.p_data = nullptr;
 285 }