#include "complexmat.cuh"

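// Data layout: p_data stores each complex element as an interleaved (re, im) pair of
// floats (the cufftComplex layout); the kernels below read the real part at [threadId]
// and the imaginary part at [threadId + 1].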
__global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, float cols)
{
    extern __shared__ float sdata[];
    int i = blockDim.x * threadIdx.y + threadIdx.x;
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // Each thread loads the squared magnitude of one complex element into shared memory.
    sdata[i] = data[threadId]*data[threadId] + data[threadId+1]*data[threadId+1];
    __syncthreads();

    // Tree reduction in shared memory; old_s tracks the previous active size so that
    // odd-sized partitions do not drop their last element.
    for (unsigned int s = (blockDim.x*blockDim.y + 1)/2, old_s = blockDim.x*blockDim.y; s > 0; s >>= 1) {
        if (old_s & 1) s += 1;

        if (i < s && i + s < old_s) {
            sdata[i] += sdata[i + s];
        }
        old_s = s;
        __syncthreads();
    }

    // The first thread of each block adds the normalized block sum to the per-scale output.
    if (i == 0) {
        atomicAdd(&out[blockId/n], sdata[0]/(rows*cols));
    }
}

float ComplexMat::sqr_norm() const
{
    // Accumulate the norm in a temporary device buffer and copy it back to the host.
    float result = 0.f;
    float *d_result;
    CudaSafeCall(cudaMalloc(&d_result, n_scales*sizeof(float)));
    CudaSafeCall(cudaMemset(d_result, 0, n_scales*sizeof(float)));

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels, 1);

    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels, d_result, p_data, rows, cols);
    CudaCheckError();

    CudaSafeCall(cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost));
    CudaSafeCall(cudaFree(d_result));

    return result;
}

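// Per-scale squared norm. `result` must point to device-accessible memory (device or
// managed) with room for n_scales floats: it is cleared with cudaMemset and then
// accumulated into by the kernel via atomicAdd.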
void ComplexMat::sqr_norm(float *result) const
{
    CudaSafeCall(cudaMemset(result, 0, n_scales*sizeof(float)));

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);

    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
    CudaCheckError();
}

__global__ void sqr_mag_kernel(float* data, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data[threadId]*data[threadId] + data[threadId+1]*data[threadId+1];
    result[threadId+1] = 0;
}

ComplexMat ComplexMat::sqr_mag() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void conj_kernel(float* data, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data[threadId];
    result[threadId+1] = -data[threadId+1];
}

ComplexMat ComplexMat::conj() const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
    CudaCheckError();

    return result;
}

ComplexMat ComplexMat::sum_over_channels() const
{
    // TODO: not implemented yet -- returns a single-channel matrix whose data is never filled.
    ComplexMat result(this->rows, this->cols, 1);
    return result;
}

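// A minimal sketch (not part of the current implementation) of how the channel sum
// could be computed, assuming the n_channels planes of rows*cols complex elements are
// laid out back to back in p_data. The kernel name and launch shape are illustrative.
//
//   __global__ void sum_channels_kernel(const float* data, float* result, int n_channels, int plane_size)
//   {
//       int idx = blockIdx.x * blockDim.x + threadIdx.x;   // complex element index within one plane
//       if (idx >= plane_size) return;
//
//       float re = 0.f, im = 0.f;
//       for (int ch = 0; ch < n_channels; ++ch) {
//           re += data[2*(ch*plane_size + idx)];
//           im += data[2*(ch*plane_size + idx) + 1];
//       }
//       result[2*idx]     = re;
//       result[2*idx + 1] = im;
//   }
//
// launched with roughly (rows*cols + 255)/256 blocks of 256 threads and writing into
// result.p_data before sum_over_channels() returns.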
cufftComplex* ComplexMat::get_p_data() const
{
    return (cufftComplex*) p_data;
}

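// Complex multiplication, (a + bi)(c + di) = (ac - bd) + (ad + bc)i, applied
// element-wise by the multiplication kernels below.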
__global__ void same_num_channels_mul_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId]*data_r[threadId] - data_l[threadId+1]*data_r[threadId+1];
    result[threadId+1] = data_l[threadId]*data_r[threadId+1] + data_l[threadId+1]*data_r[threadId];
}

// Element-wise per-channel multiplication, division and addition
ComplexMat ComplexMat::operator*(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

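// Complex division: (a + bi)/(c + di) = ((ac + bd) + (bc - ad)i) / (c^2 + d^2),
// computed element-wise by the kernel below.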
__global__ void same_num_channels_div_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = (data_l[threadId]*data_r[threadId] + data_l[threadId+1]*data_r[threadId+1]) /
                       (data_r[threadId]*data_r[threadId] + data_r[threadId+1]*data_r[threadId+1]);
    result[threadId+1] = (data_l[threadId+1]*data_r[threadId] - data_l[threadId]*data_r[threadId+1]) /
                         (data_r[threadId]*data_r[threadId] + data_r[threadId+1]*data_r[threadId+1]);
}

ComplexMat ComplexMat::operator/(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void same_num_channels_add_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId] + data_r[threadId];
    result[threadId+1] = data_l[threadId+1] + data_r[threadId+1];
}

ComplexMat ComplexMat::operator+(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void constant_mul_kernel(float* data_l, float constant, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId]*constant;
    result[threadId+1] = data_l[threadId+1]*constant;
}

ComplexMat ComplexMat::operator*(const float & rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void constant_add_kernel(float* data_l, float constant, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);

    // The constant is added to the real part only.
    result[threadId] = data_l[threadId] + constant;
    result[threadId+1] = data_l[threadId+1];
}

ComplexMat ComplexMat::operator+(const float & rhs) const
{
    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void one_channel_mul_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    // data_r has a single channel, so it is indexed by the in-block position only.
    int one_ch_index = 2*((threadIdx.y * blockDim.x) + threadIdx.x);

    result[threadId] = data_l[threadId]*data_r[one_ch_index] - data_l[threadId+1]*data_r[one_ch_index+1];
    result[threadId+1] = data_l[threadId]*data_r[one_ch_index+1] + data_l[threadId+1]*data_r[one_ch_index];
}

// Element-wise multiplication of a multi-channel matrix by a single-channel matrix
// (rhs has one channel, which is reused for every channel of this matrix).
ComplexMat ComplexMat::mul(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

__global__ void scales_channel_mul_kernel(float* data_l, float* data_r, float* result)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
    // data_r holds one set of channels (blockIdx.x selects the channel), shared by all scales.
    int one_ch_index = 2*((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x*blockDim.x*blockDim.y);

    result[threadId] = data_l[threadId]*data_r[one_ch_index] - data_l[threadId+1]*data_r[one_ch_index+1];
    result[threadId+1] = data_l[threadId]*data_r[one_ch_index+1] + data_l[threadId+1]*data_r[one_ch_index];
}

// Element-wise multiplication across scales: rhs has n_channels/n_scales channels,
// and the same set of channels is reused for every scale of this matrix.
ComplexMat ComplexMat::mul2(const ComplexMat & rhs) const
{
    assert(rhs.n_channels == n_channels/n_scales && rhs.cols == cols && rhs.rows == rows);

    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);

    dim3 threadsPerBlock(rows, cols);
    dim3 numBlocks(n_channels/n_scales, n_scales);
    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
    CudaCheckError();

    return result;
}

// Shallow copy: alias rhs's buffer; foreign_data marks that this object does not own it.
void ComplexMat::operator=(ComplexMat & rhs)
{
    cols = rhs.cols;
    rows = rhs.rows;
    n_channels = rhs.n_channels;
    n_scales = rhs.n_scales;
    foreign_data = true;

    p_data = rhs.p_data;
}

// Move assignment: take over rhs's buffer and leave rhs empty.
void ComplexMat::operator=(ComplexMat && rhs)
{
    cols = rhs.cols;
    rows = rhs.rows;
    n_channels = rhs.n_channels;
    n_scales = rhs.n_scales;

    p_data = rhs.p_data;

    rhs.p_data = nullptr;
}