src/complexmat.cu

   1 #include "complexmat.cuh"
   2
   3 __global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, float cols)
   4 {
   5     extern __shared__ float sdata[];
   6     int i = blockDim.x * threadIdx.y + threadIdx.x;
   7     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
   8     int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
   9
  10     sdata[i] = 0;
  11     sdata[i] = data[threadId]*data[threadId] + data[threadId+1]*data[threadId+1];
  12     __syncthreads();
  13
  14     for (unsigned int s=(blockDim.x*blockDim.y+1)/2, old_s = blockDim.x*blockDim.y;s>0; s>>=1) {
  15
  16     if(old_s&1) s+=1;
  17
  18        if (i < s && i+s < old_s) {
  19             sdata[i] += sdata[i + s];
  20        }
  21     old_s = s;
  22     __syncthreads();
  23     }
  24
  25     if(i == 0){
  26        atomicAdd(&out[blockId/n], sdata[0]/(rows*cols));
  27     }
  28 }
  29
  30 void ComplexMat::sqr_norm(float *result) const
  31 {
  32     cudaMemset(result, 0, n_scales*sizeof(float));
  33
  34     dim3 threadsPerBlock(rows, cols);
  35     dim3 numBlocks(n_channels/n_scales, n_scales);
  36
  37     sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
  38
  39     return;
  40 }
  41
  42 __global__ void sqr_mag_kernel(float* data, float* result)
  43 {
  44         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  45         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
  46
  47         result[threadId] = data[threadId]*data[threadId] + data[threadId+1]*data[threadId+1];
  48         result[threadId+1] = 0;
  49 }
  50
  51 ComplexMat ComplexMat::sqr_mag() const
  52 {
  53     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
  54
  55     dim3 threadsPerBlock(rows, cols);
  56     dim3 numBlocks(n_channels/n_scales, n_scales);
  57     sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
  58
  59     return result;
  60 }
  61
  62 __global__ void conj_kernel(float* data, float* result)
  63 {
  64         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  65         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
  66
  67         result[threadId] =   data[threadId];
  68         result[threadId+1] =  -data[threadId+1];
  69 }
  70
  71 ComplexMat ComplexMat::conj() const
  72 {
  73     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
  74
  75     dim3 threadsPerBlock(rows, cols);
  76     dim3 numBlocks(n_channels/n_scales, n_scales);
  77     conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
  78     return result;
  79 }
  80
  81 ComplexMat ComplexMat::sum_over_channels() const
  82 {
  83 //     assert(p_data.size() > 1);
  84     ComplexMat result(this->rows, this->cols, 1);
  85     return result;
  86 }
  87
  88 cufftComplex* ComplexMat::get_p_data() const
  89 {
  90     return (cufftComplex*) p_data;
  91 }
  92
  93 __global__ void same_num_channels_mul_kernel(float* data_l, float* data_r, float* result)
  94 {
  95         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  96         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
  97
  98         result[threadId] =  data_l[threadId]*data_r[threadId] - data_l[threadId+1]*data_r[threadId+1];
  99         result[threadId+1] = data_l[threadId]*data_r[threadId+1] + data_l[threadId+1]*data_r[threadId];
 100 }
 101
 102 //element-wise per channel multiplication, division and addition
 103 ComplexMat ComplexMat::operator*(const ComplexMat & rhs) const
 104 {
 105     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 106
 107     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 108
 109     dim3 threadsPerBlock(rows, cols);
 110     dim3 numBlocks(n_channels/n_scales, n_scales);
 111     same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
 112
 113     return result;
 114 }
 115
 116 __global__ void same_num_channels_div_kernel(float* data_l, float* data_r, float* result)
 117 {
 118         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 119         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 120
 121         result[threadId] =  (data_l[threadId]*data_r[threadId] + data_l[threadId+1]*data_r[threadId+1])/
 122                             (data_r[threadId]*data_r[threadId] + data_r[threadId+1]*data_r[threadId+1]);
 123         result[threadId+1] = (data_l[threadId+1]*data_r[threadId] - data_l[threadId]*data_r[threadId+1])/
 124                             (data_r[threadId]*data_r[threadId] + data_r[threadId+1]*data_r[threadId+1]);
 125 }
 126
 127 ComplexMat ComplexMat::operator/(const ComplexMat & rhs) const
 128 {
 129     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 130
 131     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 132
 133     dim3 threadsPerBlock(rows, cols);
 134     dim3 numBlocks(n_channels/n_scales, n_scales);
 135     same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
 136
 137     return result;
 138 }
 139
 140 __global__ void same_num_channels_add_kernel(float* data_l, float* data_r, float* result)
 141 {
 142         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 143         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 144
 145         result[threadId] =  data_l[threadId]+data_r[threadId];
 146         result[threadId+1] = data_l[threadId+1]+data_r[threadId+1];
 147 }
 148
 149 ComplexMat ComplexMat::operator+(const ComplexMat & rhs) const
 150 {
 151     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 152
 153     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 154
 155     dim3 threadsPerBlock(rows, cols);
 156     dim3 numBlocks(n_channels/n_scales, n_scales);
 157     same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
 158
 159     return result;
 160 }
 161
 162 __global__ void constant_mul_kernel(float* data_l, float constant, float* result)
 163 {
 164         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 165         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 166
 167         result[threadId] =  data_l[threadId]*constant;
 168         result[threadId+1] = data_l[threadId+1]*constant;
 169 }
 170
 171 ComplexMat ComplexMat::operator*(const float & rhs) const
 172 {
 173     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 174
 175     dim3 threadsPerBlock(rows, cols);
 176     dim3 numBlocks(n_channels/n_scales, n_scales);
 177     constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
 178
 179     return result;
 180 }
 181
 182 __global__ void constant_add_kernel(float* data_l, float constant, float* result)
 183 {
 184         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 185         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 186
 187         result[threadId] =  data_l[threadId]+constant;
 188         result[threadId+1] = data_l[threadId+1];
 189 }
 190
 191 ComplexMat ComplexMat::operator+(const float & rhs) const
 192 {
 193     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 194
 195     dim3 threadsPerBlock(rows, cols);
 196     dim3 numBlocks(n_channels/n_scales, n_scales);
 197     constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
 198
 199     return result;
 200 }
 201
 202 __global__ void one_channel_mul_kernel(float* data_l, float* data_r, float* result)
 203 {
 204         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 205         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 206         int one_ch_index = 2*((threadIdx.y * blockDim.x) + threadIdx.x);
 207
 208         result[threadId] =  data_l[threadId]*data_r[one_ch_index] - data_l[threadId+1]*data_r[one_ch_index+1];
 209         result[threadId+1] = data_l[threadId]*data_r[one_ch_index+1] + data_l[threadId+1]*data_r[one_ch_index];
 210 }
 211
 212 //multiplying element-wise multichannel by one channel mats (rhs mat is with one channel)
 213 ComplexMat ComplexMat::mul(const ComplexMat & rhs) const
 214 {
 215     assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);
 216
 217     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 218
 219     dim3 threadsPerBlock(rows, cols);
 220     dim3 numBlocks(n_channels/n_scales, n_scales);
 221     one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
 222
 223     return result;
 224 }
 225
 226 __global__ void scales_channel_mul_kernel(float* data_l, float* data_r, float* result)
 227 {
 228         int blockId = blockIdx.x + blockIdx.y * gridDim.x;
 229         int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
 230         int one_ch_index = 2*((threadIdx.y * blockDim.x) + threadIdx.x+blockIdx.x*blockDim.x*blockDim.y);
 231
 232         result[threadId] =  data_l[threadId]*data_r[one_ch_index] - data_l[threadId+1]*data_r[one_ch_index+1];
 233         result[threadId+1] = data_l[threadId]*data_r[one_ch_index+1] + data_l[threadId+1]*data_r[one_ch_index];
 234 }
 235
 236 //multiplying element-wise multichannel by one channel mats (rhs mat is with multiple channel)
 237 ComplexMat ComplexMat::mul2(const ComplexMat & rhs) const
 238 {
 239     assert(rhs.n_channels == n_channels/n_scales && rhs.cols == cols && rhs.rows == rows);
 240
 241     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 242
 243     dim3 threadsPerBlock(rows, cols);
 244     dim3 numBlocks(n_channels/n_scales, n_scales);
 245     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
 246
 247     return result;
 248 }
 249
 250 void ComplexMat::operator=(ComplexMat & rhs)
 251 {
 252     cols = rhs.cols;
 253     rows = rhs.rows;
 254     n_channels = rhs.n_channels;
 255     n_scales = rhs.n_scales;
 256
 257     p_data = rhs.p_data;
 258
 259 }
 260
 261 void ComplexMat::operator=(ComplexMat && rhs)
 262 {
 263     cols = rhs.cols;
 264     rows = rhs.rows;
 265     n_channels = rhs.n_channels;
 266     n_scales = rhs.n_scales;
 267
 268     p_data = rhs.p_data;
 269
 270     rhs.p_data = nullptr;
 271 }