Unify CPU and GPU implementations of ComplexMat
[hercules2020/kcf.git] / src / complexmat.cu
index 7d4a43dbceece696d6dd03fd923f6f9d035ff6f9..6ed8628f815d3b14568d2d8d1ae42fc6544f524c 100644 (file)
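This commit drops the GPU-only plumbing (the cufftComplex *get_p_data() accessor and the explicit cudaStream_t passed to constructors and kernel launches) in favour of the DynMem-backed storage already used by the CPU implementation, so one ComplexMat interface serves both back ends. A minimal caller-side sketch, assuming DynMem(n) allocates n floats and exposes hostMem()/deviceMem() as elsewhere in this repository (everything outside this diff is illustrative only):

    ComplexMat xf(rows, cols, n_channels, n_scales);   // backing store is now a DynMem
    DynMem norms(n_scales);                             // one accumulator per scale
    xf.sqr_norm(norms);                                 // kernel writes into norms.deviceMem()
    CudaSafeCall(cudaDeviceSynchronize());              // work now runs on the default stream
    float n0 = norms.hostMem()[0];                      // read back the first scale's norm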
@@ -1,6 +1,6 @@
-#include "complexmat.cuh"
+#include "complexmat.hpp"
 
-__global__ void sqr_norm_kernel(int n, float *out, float *data, float rows, float cols)
+__global__ void sqr_norm_kernel(int n, float *out, const float *data, float rows, float cols)
 {
     extern __shared__ float sdata[];
     int i = blockDim.x * threadIdx.y + threadIdx.x;
@@ -27,21 +27,21 @@ __global__ void sqr_norm_kernel(int n, float *out, float *data, float rows, float cols)
     }
 }
 
-void ComplexMat::sqr_norm(float *result) const
+void ComplexMat::sqr_norm(DynMem &result) const
 {
-    CudaSafeCall(cudaMemsetAsync(result, 0, n_scales * sizeof(float), this->stream));
+    CudaSafeCall(cudaMemsetAsync(result.deviceMem(), 0, n_scales * sizeof(float)));
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels / n_scales, n_scales);
 
-    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows * cols * sizeof(float), this->stream>>>(
-        n_channels / n_scales, result, this->p_data, rows, cols);
+    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows * cols * sizeof(float)>>>(
+        n_channels / n_scales, result.deviceMem(), (float*)this->p_data.deviceMem(), rows, cols);
     CudaCheckError();
 
     return;
 }
 
-__global__ void sqr_mag_kernel(float *data, float *result)
+__global__ void sqr_mag_kernel(const float *data, float *result)
 {
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -52,17 +52,17 @@ __global__ void sqr_mag_kernel(float *data, float *result)
 
 ComplexMat ComplexMat::sqr_mag() const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels / n_scales, n_scales);
-    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, result.p_data);
+    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
     CudaCheckError();
 
     return result;
 }
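The kernels treat p_data as interleaved float pairs: threadId (note the factor of 2 above) addresses the real part of one complex element and threadId + 1 its imaginary part. The body of sqr_mag_kernel falls outside the shown hunks; a hedged per-element sketch of the squared magnitude |z|^2 = re^2 + im^2 it is expected to compute (illustrative only, not the elided code):

    // data and result are interleaved (re, im) pairs; threadId is the real slot
    result[threadId]     = data[threadId] * data[threadId]
                         + data[threadId + 1] * data[threadId + 1];
    result[threadId + 1] = 0.0f;   // a squared magnitude is purely real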
 
-__global__ void conj_kernel(float *data, float *result)
+__global__ void conj_kernel(const float *data, float *result)
 {
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -73,11 +73,11 @@ __global__ void conj_kernel(float *data, float *result)
 
 ComplexMat ComplexMat::conj() const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels / n_scales, n_scales);
-    conj_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, result.p_data);
+    conj_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
     CudaCheckError();
 
     return result;
@@ -86,16 +86,11 @@ ComplexMat ComplexMat::conj() const
 ComplexMat ComplexMat::sum_over_channels() const
 {
     //     assert(p_data.size() > 1);
-    ComplexMat result(this->rows, this->cols, 1, this->stream);
+    ComplexMat result(this->rows, this->cols, 1);
     return result;
 }
 
-cufftComplex *ComplexMat::get_p_data() const
-{
-    return (cufftComplex *)p_data;
-}
-
-__global__ void same_num_channels_mul_kernel(float *data_l, float *data_r, float *result)
+__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result)
 {
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -109,18 +104,19 @@ ComplexMat ComplexMat::operator*(const ComplexMat &rhs) const
 {
     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels / n_scales, n_scales);
-    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data,
-                                                                                  result.p_data);
+    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                                    (float*)rhs.p_data.deviceMem(),
+                                                                    (float*)result.p_data.deviceMem());
     CudaCheckError();
 
     return result;
 }
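same_num_channels_mul_kernel's body is also outside the shown hunks; element-wise complex multiplication follows (a + bi)(c + di) = (ac - bd) + (ad + bc)i. A hedged per-element sketch in the same interleaved layout (illustrative only):

    // one complex product per thread; operands and result are (re, im) pairs
    float a = data_l[threadId], b = data_l[threadId + 1];
    float c = data_r[threadId], d = data_r[threadId + 1];
    result[threadId]     = a * c - b * d;   // real part
    result[threadId + 1] = a * d + b * c;   // imaginary part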
 
-__global__ void same_num_channels_div_kernel(float *data_l, float *data_r, float *result)
+__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result)
 {
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -135,18 +131,19 @@ ComplexMat ComplexMat::operator/(const ComplexMat &rhs) const
 {
     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels / n_scales, n_scales);
-    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data,
-                                                                                  result.p_data);
+    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                                    (float*)rhs.p_data.deviceMem(),
+                                                                    (float*)result.p_data.deviceMem());
     CudaCheckError();
 
     return result;
 }
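Element-wise complex division scales by the squared magnitude of the denominator: (a + bi) / (c + di) = ((a*c + b*d) + (b*c - a*d)i) / (c^2 + d^2). A per-element sketch under the same layout assumptions (illustrative only, with no guard against a zero denominator):

    float a = data_l[threadId], b = data_l[threadId + 1];
    float c = data_r[threadId], d = data_r[threadId + 1];
    float inv = 1.0f / (c * c + d * d);        // |rhs|^2
    result[threadId]     = (a * c + b * d) * inv;
    result[threadId + 1] = (b * c - a * d) * inv;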
 
-__global__ void same_num_channels_add_kernel(float *data_l, float *data_r, float *result)
+__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result)
 {
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -159,18 +156,19 @@ ComplexMat ComplexMat::operator+(const ComplexMat &rhs) const
 {
     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels / n_scales, n_scales);
-    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data,
-                                                                                  result.p_data);
+    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                                    (float*)rhs.p_data.deviceMem(),
+                                                                    (float*)result.p_data.deviceMem());
     CudaCheckError();
 
     return result;
 }
 
-__global__ void constant_mul_kernel(float *data_l, float constant, float *result)
+__global__ void constant_mul_kernel(const float *data_l, float constant, float *result)
 {
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -181,17 +179,19 @@ __global__ void constant_mul_kernel(float *data_l, float constant, float *result)
 
 ComplexMat ComplexMat::operator*(const float &rhs) const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels / n_scales, n_scales);
-    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs, result.p_data);
+    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                           rhs,
+                                                           (float*)result.p_data.deviceMem());
     CudaCheckError();
 
     return result;
 }
 
-__global__ void constant_add_kernel(float *data_l, float constant, float *result)
+__global__ void constant_add_kernel(const float *data_l, float constant, float *result)
 {
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -202,17 +202,19 @@ __global__ void constant_add_kernel(float *data_l, float constant, float *result)
 
 ComplexMat ComplexMat::operator+(const float &rhs) const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels / n_scales, n_scales);
-    constant_add_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs, result.p_data);
+    constant_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                           rhs,
+                                                           (float*)result.p_data.deviceMem());
     CudaCheckError();
 
     return result;
 }
 
-__global__ void one_channel_mul_kernel(float *data_l, float *data_r, float *result)
+__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result)
 {
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -227,11 +229,13 @@ ComplexMat ComplexMat::mul(const ComplexMat &rhs) const
 {
     assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 
     dim3 threadsPerBlock(rows, cols);
     dim3 numBlocks(n_channels / n_scales, n_scales);
-    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
+    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                              (float*)rhs.p_data.deviceMem(),
+                                                              (float*)result.p_data.deviceMem());
     CudaCheckError();
 
     return result;
@@ -248,41 +252,28 @@ __global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
 }
 
 // element-wise multiplication of a multi-scale, multi-channel mat by a mat whose channels are shared across all scales (rhs has n_channels / n_scales channels)
-ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
-{
-    assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);
+// ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
+// {
+//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
-
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
-    CudaCheckError();
+//     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
 
-    return result;
-}
+//     dim3 threadsPerBlock(rows, cols);
+//     dim3 numBlocks(n_channels / n_scales, n_scales);
+//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
+//     CudaCheckError();
 
-void ComplexMat::operator=(ComplexMat &rhs)
-{
-    cols = rhs.cols;
-    rows = rhs.rows;
-    n_channels = rhs.n_channels;
-    n_scales = rhs.n_scales;
-    stream = rhs.stream;
-    foreign_data = true;
-
-    p_data = rhs.p_data;
-}
+//     return result;
+// }
 
-void ComplexMat::operator=(ComplexMat &&rhs)
-{
-    cols = rhs.cols;
-    rows = rhs.rows;
-    n_channels = rhs.n_channels;
-    n_scales = rhs.n_scales;
-    stream = rhs.stream;
+// void ComplexMat::operator=(ComplexMat &&rhs)
+// {
+//     cols = rhs.cols;
+//     rows = rhs.rows;
+//     n_channels = rhs.n_channels;
+//     n_scales = rhs.n_scales;
 
-    p_data = rhs.p_data;
+//     p_data = rhs.p_data;
 
-    rhs.p_data = nullptr;
-}
+//     rhs.p_data = nullptr;
+// }