Unify CPU and GPU implementations of ComplexMat

author Michal Sojka <michal.sojka@cvut.cz>

Tue, 2 Oct 2018 13:50:49 +0000 (15:50 +0200)

committer Michal Sojka <michal.sojka@cvut.cz>

Tue, 2 Oct 2018 13:51:12 +0000 (15:51 +0200)
author Michal Sojka <michal.sojka@cvut.cz>
Tue, 2 Oct 2018 13:50:49 +0000 (15:50 +0200)
committer Michal Sojka <michal.sojka@cvut.cz>
Tue, 2 Oct 2018 13:51:12 +0000 (15:51 +0200)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

index 0f734a08052aa6314631a8e960c60d7e8229aa0a..ae11abfd5a0d55893fd014641d4e0325b7341bc3 100644 (file)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
  cmake_minimum_required(VERSION 2.8)
  
-set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h dynmem.hpp debug.cpp)
+set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h dynmem.hpp debug.cpp complexmat.hpp)
  
  find_package(PkgConfig)
  
@@ -30,17 +30,17 @@ ENDIF()
  SET(use_cuda OFF)
  
  IF(FFT STREQUAL "OpenCV")
-  list(APPEND KCF_LIB_SRC fft_opencv.cpp complexmat.hpp)
+  list(APPEND KCF_LIB_SRC fft_opencv.cpp)
  ELSEIF(FFT STREQUAL "fftw")
-  list(APPEND KCF_LIB_SRC fft_fftw.cpp complexmat.hpp)
+  list(APPEND KCF_LIB_SRC fft_fftw.cpp)
    add_definitions(-DFFTW)
    pkg_check_modules(FFTW REQUIRED fftw3f)
  ELSEIF(FFT STREQUAL "cuFFTW")
-  list(APPEND KCF_LIB_SRC fft_fftw.cpp complexmat.hpp)
+  list(APPEND KCF_LIB_SRC fft_fftw.cpp)
    add_definitions(-DFFTW -DCUFFTW)
    set(use_cuda ON)
  ELSEIF(FFT STREQUAL "cuFFT")
-    list(APPEND KCF_LIB_SRC fft_cufft.cpp complexmat.cuh cuda_functions.cuh complexmat.cu cuda_functions.cu)
+    list(APPEND KCF_LIB_SRC fft_cufft.cpp cuda_functions.h cuda_functions.cu)
      add_definitions(-DCUFFT)
      set(use_cuda ON)
      iF(CUDA_DEBUG)
@@ -51,6 +51,12 @@ ELSE()
    MESSAGE(FATAL_ERROR "Invalid FFT implementation selected")
  ENDIF()
  
+IF(FFT STREQUAL "cuFFT")
+  list(APPEND KCF_LIB_SRC complexmat.cu)
+ELSE()
+  list(APPEND KCF_LIB_SRC complexmat.cpp)
+ENDIF()
+
  IF((FFT STREQUAL "OpenCV") AND BIG_BATCH)
    message(SEND_ERROR "OpenCV version does not support big batch mode.")
  ENDIF()
diff --git a/src/complexmat.cpp b/src/complexmat.cpp

new file mode 100644 (file)

index 0000000..18d5064
--- /dev/null
+++ b/src/complexmat.cpp
@@ -0,0 +1,89 @@
+#include "complexmat.hpp"
+
+ComplexMat_::T ComplexMat_::sqr_norm() const
+{
+    assert(n_scales == 1);
+
+    int n_channels_per_scale = n_channels / n_scales;
+    T sum_sqr_norm = 0;
+    for (int i = 0; i < n_channels_per_scale; ++i) {
+        for (auto lhs = p_data.hostMem() + i * rows * cols; lhs != p_data.hostMem() + (i + 1) * rows * cols; ++lhs)
+            sum_sqr_norm += lhs->real() * lhs->real() + lhs->imag() * lhs->imag();
+    }
+    sum_sqr_norm = sum_sqr_norm / static_cast<T>(cols * rows);
+    return sum_sqr_norm;
+}
+
+void ComplexMat_::sqr_norm(DynMem_<ComplexMat_::T> &result) const
+{
+    int n_channels_per_scale = n_channels / n_scales;
+    int scale_offset = n_channels_per_scale * rows * cols;
+    for (uint scale = 0; scale < n_scales; ++scale) {
+        T sum_sqr_norm = 0;
+        for (int i = 0; i < n_channels_per_scale; ++i)
+            for (auto lhs = p_data.hostMem() + i * rows * cols + scale * scale_offset;
+                 lhs != p_data.hostMem() + (i + 1) * rows * cols + scale * scale_offset; ++lhs)
+                sum_sqr_norm += lhs->real() * lhs->real() + lhs->imag() * lhs->imag();
+        result.hostMem()[scale] = sum_sqr_norm / static_cast<T>(cols * rows);
+    }
+    return;
+}
+
+ComplexMat_ ComplexMat_::sqr_mag() const
+{
+    return mat_const_operator([](std::complex<T> &c) { c = c.real() * c.real() + c.imag() * c.imag(); });
+}
+
+ComplexMat_ ComplexMat_::conj() const
+{
+    return mat_const_operator([](std::complex<T> &c) { c = std::complex<T>(c.real(), -c.imag()); });
+}
+
+ComplexMat_ ComplexMat_::sum_over_channels() const
+{
+    assert(p_data.num_elem == n_channels * rows * cols);
+
+    uint n_channels_per_scale = n_channels / n_scales;
+    uint scale_offset = n_channels_per_scale * rows * cols;
+
+    ComplexMat_ result(this->rows, this->cols, 1, n_scales);
+    for (uint scale = 0; scale < n_scales; ++scale) {
+        for (uint i = 0; i < rows * cols; ++i) {
+            std::complex<T> acc = 0;
+            for (uint ch = 0; ch < n_channels_per_scale; ++ch)
+                acc +=  p_data[scale * scale_offset + i + ch * rows * cols];
+            result.p_data.hostMem()[scale * rows * cols + i] = acc;
+        }
+    }
+    return result;
+}
+
+ComplexMat_ ComplexMat_::operator/(const ComplexMat_ &rhs) const
+{
+    return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs /= c_rhs; }, rhs);
+}
+
+ComplexMat_ ComplexMat_::operator+(const ComplexMat_ &rhs) const
+{
+    return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs += c_rhs; }, rhs);
+}
+
+ComplexMat_ ComplexMat_::operator*(const ComplexMat_::T &rhs) const
+{
+    return mat_const_operator([&rhs](std::complex<T> &c) { c *= rhs; });
+}
+
+ComplexMat_ ComplexMat_::mul(const ComplexMat_ &rhs) const
+{
+    return matn_mat1_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
+}
+
+ComplexMat_ ComplexMat_::operator+(const ComplexMat_::T &rhs) const
+{
+    return mat_const_operator([&rhs](std::complex<T> &c) { c += rhs; });
+}
+
+ComplexMat_ ComplexMat_::operator*(const ComplexMat_ &rhs) const
+{
+    return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
+}
diff --git a/src/complexmat.cu b/src/complexmat.cu

index d06e66fa4cc1dcdc7ef90d6c30473026393a9cdf..6ed8628f815d3b14568d2d8d1ae42fc6544f524c 100644 (file)
--- a/src/complexmat.cu
+++ b/src/complexmat.cu
@@ -1,6 +1,6 @@
-#include "complexmat.cuh"
+#include "complexmat.hpp"
  
-__global__ void sqr_norm_kernel(int n, float *out, float *data, float rows, float cols)
+__global__ void sqr_norm_kernel(int n, float *out, const float *data, float rows, float cols)
  {
      extern __shared__ float sdata[];
      int i = blockDim.x * threadIdx.y + threadIdx.x;
@@ -35,13 +35,13 @@ void ComplexMat::sqr_norm(DynMem &result) const
      dim3 numBlocks(n_channels / n_scales, n_scales);
  
      sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows * cols * sizeof(float)>>>(
-        n_channels / n_scales, result.deviceMem(), this->p_data, rows, cols);
+        n_channels / n_scales, result.deviceMem(), (float*)this->p_data.deviceMem(), rows, cols);
      CudaCheckError();
  
      return;
  }
  
-__global__ void sqr_mag_kernel(float *data, float *result)
+__global__ void sqr_mag_kernel(const float *data, float *result)
  {
      int blockId = blockIdx.x + blockIdx.y * gridDim.x;
      int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -56,13 +56,13 @@ ComplexMat ComplexMat::sqr_mag() const
  
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels / n_scales, n_scales);
-    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, result.p_data);
+    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
      CudaCheckError();
  
      return result;
  }
  
-__global__ void conj_kernel(float *data, float *result)
+__global__ void conj_kernel(const float *data, float *result)
  {
      int blockId = blockIdx.x + blockIdx.y * gridDim.x;
      int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -77,7 +77,7 @@ ComplexMat ComplexMat::conj() const
  
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels / n_scales, n_scales);
-    conj_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, result.p_data);
+    conj_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem());
      CudaCheckError();
  
      return result;
@@ -90,12 +90,7 @@ ComplexMat ComplexMat::sum_over_channels() const
      return result;
  }
  
-cufftComplex *ComplexMat::get_p_data() const
-{
-    return (cufftComplex *)p_data;
-}
-
-__global__ void same_num_channels_mul_kernel(float *data_l, float *data_r, float *result)
+__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result)
  {
      int blockId = blockIdx.x + blockIdx.y * gridDim.x;
      int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -113,14 +108,15 @@ ComplexMat ComplexMat::operator*(const ComplexMat &rhs) const
  
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels / n_scales, n_scales);
-    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data,
-                                                                                  result.p_data);
+    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                                    (float*)rhs.p_data.deviceMem(),
+                                                                    (float*)result.p_data.deviceMem());
      CudaCheckError();
  
      return result;
  }
  
-__global__ void same_num_channels_div_kernel(float *data_l, float *data_r, float *result)
+__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result)
  {
      int blockId = blockIdx.x + blockIdx.y * gridDim.x;
      int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -139,14 +135,15 @@ ComplexMat ComplexMat::operator/(const ComplexMat &rhs) const
  
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels / n_scales, n_scales);
-    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data,
-                                                                                  result.p_data);
+    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                                    (float*)rhs.p_data.deviceMem(),
+                                                                    (float*)result.p_data.deviceMem());
      CudaCheckError();
  
      return result;
  }
  
-__global__ void same_num_channels_add_kernel(float *data_l, float *data_r, float *result)
+__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result)
  {
      int blockId = blockIdx.x + blockIdx.y * gridDim.x;
      int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -163,14 +160,15 @@ ComplexMat ComplexMat::operator+(const ComplexMat &rhs) const
  
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels / n_scales, n_scales);
-    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data,
-                                                                                  result.p_data);
+    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                                    (float*)rhs.p_data.deviceMem(),
+                                                                    (float*)result.p_data.deviceMem());
      CudaCheckError();
  
      return result;
  }
  
-__global__ void constant_mul_kernel(float *data_l, float constant, float *result)
+__global__ void constant_mul_kernel(const float *data_l, float constant, float *result)
  {
      int blockId = blockIdx.x + blockIdx.y * gridDim.x;
      int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -185,13 +183,15 @@ ComplexMat ComplexMat::operator*(const float &rhs) const
  
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels / n_scales, n_scales);
-    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs, result.p_data);
+    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                           rhs,
+                                                           (float*)result.p_data.deviceMem());
      CudaCheckError();
  
      return result;
  }
  
-__global__ void constant_add_kernel(float *data_l, float constant, float *result)
+__global__ void constant_add_kernel(const float *data_l, float constant, float *result)
  {
      int blockId = blockIdx.x + blockIdx.y * gridDim.x;
      int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -206,13 +206,15 @@ ComplexMat ComplexMat::operator+(const float &rhs) const
  
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels / n_scales, n_scales);
-    constant_add_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs, result.p_data);
+    constant_add_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                           rhs,
+                                                           (float*)result.p_data.deviceMem());
      CudaCheckError();
  
      return result;
  }
  
-__global__ void one_channel_mul_kernel(float *data_l, float *data_r, float *result)
+__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result)
  {
      int blockId = blockIdx.x + blockIdx.y * gridDim.x;
      int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
@@ -231,7 +233,9 @@ ComplexMat ComplexMat::mul(const ComplexMat &rhs) const
  
      dim3 threadsPerBlock(rows, cols);
      dim3 numBlocks(n_channels / n_scales, n_scales);
-    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
+    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>((float*)this->p_data.deviceMem(),
+                                                              (float*)rhs.p_data.deviceMem(),
+                                                              (float*)result.p_data.deviceMem());
      CudaCheckError();
  
      return result;
@@ -248,38 +252,28 @@ __global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *r
  }
  
  // multiplying element-wise multichannel by one channel mats (rhs mat is with multiple channel)
-ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
-{
-    assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);
-
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
-
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
-    CudaCheckError();
+// ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
+// {
+//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);
  
-    return result;
-}
+//     ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
  
-void ComplexMat::operator=(ComplexMat &rhs)
-{
-    cols = rhs.cols;
-    rows = rhs.rows;
-    n_channels = rhs.n_channels;
-    n_scales = rhs.n_scales;
+//     dim3 threadsPerBlock(rows, cols);
+//     dim3 numBlocks(n_channels / n_scales, n_scales);
+//     scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0>>>(this->p_data, rhs.p_data, result.p_data);
+//     CudaCheckError();
  
-    p_data = rhs.p_data;
-}
+//     return result;
+// }
  
-void ComplexMat::operator=(ComplexMat &&rhs)
-{
-    cols = rhs.cols;
-    rows = rhs.rows;
-    n_channels = rhs.n_channels;
-    n_scales = rhs.n_scales;
+// void ComplexMat::operator=(ComplexMat &&rhs)
+// {
+//     cols = rhs.cols;
+//     rows = rhs.rows;
+//     n_channels = rhs.n_channels;
+//     n_scales = rhs.n_scales;
  
-    p_data = rhs.p_data;
+//     p_data = rhs.p_data;
  
-    rhs.p_data = nullptr;
-}
+//     rhs.p_data = nullptr;
+// }
diff --git a/src/complexmat.cuh b/src/complexmat.cuh

deleted file mode 100644 (file)

index cf2e6ca..0000000
--- a/src/complexmat.cuh
+++ /dev/null
@@ -1,124 +0,0 @@
-#ifndef COMPLEXMAT_H
-#define COMPLEXMAT_H
-
-#include <opencv2/opencv.hpp>
-
-#include "dynmem.hpp"
-#include "cuda_runtime.h"
-#include "cufft.h"
-
-#include "cuda_error_check.hpp"
-
-class ComplexMat {
-  public:
-    uint cols;
-    uint rows;
-    uint n_channels;
-    uint n_scales = 1;
-
-    ComplexMat() : cols(0), rows(0), n_channels(0) {}
-
-    ComplexMat(uint _rows, uint _cols, uint _n_channels, uint _n_scales = 1)
-        : cols(_cols), rows(_rows), n_channels(_n_channels * _n_scales), n_scales(_n_scales)
-    {
-        CudaSafeCall(cudaMalloc(&p_data, n_channels * cols * rows * sizeof(cufftComplex)));
-    }
-
-    ComplexMat(cv::Size size, uint _n_channels, uint _n_scales = 1)
-        : cols(size.width), rows(size.height), n_channels(_n_channels * _n_channels), n_scales(_n_scales)
-    {
-        CudaSafeCall(cudaMalloc(&p_data, n_channels * cols * rows * sizeof(cufftComplex)));
-    }
-
-    ComplexMat(ComplexMat &&other)
-    {
-        cols = other.cols;
-        rows = other.rows;
-        n_channels = other.n_channels;
-        n_scales = other.n_scales;
-        p_data = other.p_data;
-
-        other.p_data = nullptr;
-    }
-
-    ~ComplexMat()
-    {
-        if (p_data != nullptr) {
-            CudaSafeCall(cudaFree(p_data));
-            p_data = nullptr;
-        }
-    }
-
-    void create(uint _rows, uint _cols, uint _n_channels)
-    {
-        rows = _rows;
-        cols = _cols;
-        n_channels = _n_channels;
-        CudaSafeCall(cudaMalloc(&p_data, n_channels * cols * rows * sizeof(cufftComplex)));
-    }
-
-    void create(uint _rows, uint _cols, uint _n_channels, uint _n_scales)
-    {
-        rows = _rows;
-        cols = _cols;
-        n_channels = _n_channels;
-        n_scales = _n_scales;
-        CudaSafeCall(cudaMalloc(&p_data, n_channels * cols * rows * sizeof(cufftComplex)));
-    }
-    // cv::Mat API compatibility
-    cv::Size size() const { return cv::Size(cols, rows); }
-    uint channels() const { return n_channels; }
-
-    void sqr_norm(DynMem &result) const;
-
-    ComplexMat sqr_mag() const;
-
-    ComplexMat conj() const;
-
-    ComplexMat sum_over_channels() const;
-
-    cufftComplex *get_p_data() const;
-
-    // element-wise per channel multiplication, division and addition
-    ComplexMat operator*(const ComplexMat &rhs) const;
-    ComplexMat operator/(const ComplexMat &rhs) const;
-    ComplexMat operator+(const ComplexMat &rhs) const;
-
-    // multiplying or adding constant
-    ComplexMat operator*(const float &rhs) const;
-    ComplexMat operator+(const float &rhs) const;
-
-    // multiplying element-wise multichannel by one channel mats (rhs mat is with one channel)
-    ComplexMat mul(const ComplexMat &rhs) const;
-
-    // multiplying element-wise multichannel by one channel mats (rhs mat is with multiple channel)
-    ComplexMat mul2(const ComplexMat &rhs) const;
-    // text output
-    friend std::ostream &operator<<(std::ostream &os, const ComplexMat &mat)
-    {
-        float *data_cpu = reinterpret_cast<float*>(malloc(mat.rows * mat.cols * mat.n_channels * sizeof(cufftComplex)));
-        CudaSafeCall(cudaMemcpy(data_cpu, mat.p_data, mat.rows * mat.cols * mat.n_channels * sizeof(cufftComplex),
-                                cudaMemcpyDeviceToHost));
-        // for (int i = 0; i < mat.n_channels; ++i){
-        for (int i = 0; i < 1; ++i) {
-            os << "Channel " << i << std::endl;
-            for (uint j = 0; j < mat.rows; ++j) {
-                for (uint k = 0; k < 2 * mat.cols - 2; k += 2)
-                    os << "(" << data_cpu[j * 2 * mat.cols + k] << "," << data_cpu[j * 2 * mat.cols + (k + 1)] << ")"
-                       << ", ";
-                os << "(" << data_cpu[j * 2 * mat.cols + 2 * mat.cols - 2] << ","
-                   << data_cpu[j * 2 * mat.cols + 2 * mat.cols - 1] << ")" << std::endl;
-            }
-        }
-        free(data_cpu);
-        return os;
-    }
-
-    void operator=(ComplexMat &rhs);
-    void operator=(ComplexMat &&rhs);
-
-  private:
-    mutable float *p_data = nullptr;
-};
-
-#endif // COMPLEXMAT_H
diff --git a/src/complexmat.hpp b/src/complexmat.hpp

index 1a0c1d508efbd1f2b4fd891adb2bfcc720e896f2..60c8255912b490a2dd90cd33e81dcdf28031c5a4 100644 (file)
--- a/src/complexmat.hpp
+++ b/src/complexmat.hpp
@@ -7,6 +7,10 @@
  #include <functional>
  #include "dynmem.hpp"
  
+#ifdef CUFFT
+#include <cufft.h>
+#endif
+
  class ComplexMat_ {
    public:
      typedef float T;
@@ -50,63 +54,15 @@ class ComplexMat_ {
          }
      }
  
-    T sqr_norm() const
-    {
-        assert(n_scales == 1);
+    T sqr_norm() const;
  
-        int n_channels_per_scale = n_channels / n_scales;
-        T sum_sqr_norm = 0;
-        for (int i = 0; i < n_channels_per_scale; ++i) {
-            for (auto lhs = p_data.hostMem() + i * rows * cols; lhs != p_data.hostMem() + (i + 1) * rows * cols; ++lhs)
-                sum_sqr_norm += lhs->real() * lhs->real() + lhs->imag() * lhs->imag();
-        }
-        sum_sqr_norm = sum_sqr_norm / static_cast<T>(cols * rows);
-        return sum_sqr_norm;
-    }
+    void sqr_norm(DynMem_<T> &result) const;
  
-    void sqr_norm(DynMem_<T> &result) const
-    {
-        int n_channels_per_scale = n_channels / n_scales;
-        int scale_offset = n_channels_per_scale * rows * cols;
-        for (uint scale = 0; scale < n_scales; ++scale) {
-            T sum_sqr_norm = 0;
-            for (int i = 0; i < n_channels_per_scale; ++i)
-                for (auto lhs = p_data.hostMem() + i * rows * cols + scale * scale_offset;
-                     lhs != p_data.hostMem() + (i + 1) * rows * cols + scale * scale_offset; ++lhs)
-                    sum_sqr_norm += lhs->real() * lhs->real() + lhs->imag() * lhs->imag();
-            result.hostMem()[scale] = sum_sqr_norm / static_cast<T>(cols * rows);
-        }
-        return;
-    }
-
-    ComplexMat_ sqr_mag() const
-    {
-        return mat_const_operator([](std::complex<T> &c) { c = c.real() * c.real() + c.imag() * c.imag(); });
-    }
+    ComplexMat_ sqr_mag() const;
  
-    ComplexMat_ conj() const
-    {
-        return mat_const_operator([](std::complex<T> &c) { c = std::complex<T>(c.real(), -c.imag()); });
-    }
+    ComplexMat_ conj() const;
  
-    ComplexMat_ sum_over_channels() const
-    {
-        assert(p_data.num_elem == n_channels * rows * cols);
-
-        uint n_channels_per_scale = n_channels / n_scales;
-        uint scale_offset = n_channels_per_scale * rows * cols;
-
-        ComplexMat_ result(this->rows, this->cols, 1, n_scales);
-        for (uint scale = 0; scale < n_scales; ++scale) {
-            for (uint i = 0; i < rows * cols; ++i) {
-                std::complex<T> acc = 0;
-                for (uint ch = 0; ch < n_channels_per_scale; ++ch)
-                    acc +=  p_data[scale * scale_offset + i + ch * rows * cols];
-                result.p_data.hostMem()[scale * rows * cols + i] = acc;
-            }
-        }
-        return result;
-    }
+    ComplexMat_ sum_over_channels() const;
  
      // return 2 channels (real, imag) for first complex channel
      cv::Mat to_cv_mat() const
@@ -129,35 +85,22 @@ class ComplexMat_ {
      std::complex<T> *get_p_data() { return p_data.hostMem(); }
      const std::complex<T> *get_p_data() const { return p_data.hostMem(); }
  
+#ifdef CUFFT
+    cufftComplex *get_dev_data() { return (cufftComplex*)p_data.deviceMem(); }
+    const cufftComplex *get_dev_data() const { return (cufftComplex*)p_data.deviceMem(); }
+#endif
+
      // element-wise per channel multiplication, division and addition
-    ComplexMat_ operator*(const ComplexMat_ &rhs) const
-    {
-        return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
-    }
-    ComplexMat_ operator/(const ComplexMat_ &rhs) const
-    {
-        return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs /= c_rhs; }, rhs);
-    }
-    ComplexMat_ operator+(const ComplexMat_ &rhs) const
-    {
-        return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs += c_rhs; }, rhs);
-    }
+    ComplexMat_ operator*(const ComplexMat_ &rhs) const;
+    ComplexMat_ operator/(const ComplexMat_ &rhs) const;
+    ComplexMat_ operator+(const ComplexMat_ &rhs) const;
  
      // multiplying or adding constant
-    ComplexMat_ operator*(const T &rhs) const
-    {
-        return mat_const_operator([&rhs](std::complex<T> &c) { c *= rhs; });
-    }
-    ComplexMat_ operator+(const T &rhs) const
-    {
-        return mat_const_operator([&rhs](std::complex<T> &c) { c += rhs; });
-    }
+    ComplexMat_ operator*(const T &rhs) const;
+    ComplexMat_ operator+(const T &rhs) const;
  
      // multiplying element-wise multichannel by one channel mats (rhs mat is with one channel)
-    ComplexMat_ mul(const ComplexMat_ &rhs) const
-    {
-        return matn_mat1_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
-    }
+    ComplexMat_ mul(const ComplexMat_ &rhs) const;
  
      // multiplying element-wise multichannel mats - same as operator*(ComplexMat), but without allocating memory for the result
      ComplexMat_ muln(const ComplexMat_ &rhs) const
diff --git a/src/cuda_functions.cu b/src/cuda_functions.cu

index 2b8ef3018806b1f5e15856865ddc80b50cef0643..508ebe0c701c29ddb8f80360b22f0bca2951a513 100644 (file)
--- a/src/cuda_functions.cu
+++ b/src/cuda_functions.cu
@@ -1,4 +1,4 @@
-#include "cuda_functions.cuh"
+#include "cuda_functions.h"
  
  __global__ void gaussian_correlation_kernel(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm,
                                              int rows, int cols, int channels_per_scale, double sigma)
diff --git a/src/cuda_functions.cuh b/src/cuda_functions.h

similarity index 100%

rename from src/cuda_functions.cuh

rename to src/cuda_functions.h
diff --git a/src/debug.h b/src/debug.h

index 85a4890bbad684dd61e61b6a1b93ae05ea28e503..5335de0b045c5f50627d420f21623adaa6a2b0c6 100644 (file)
--- a/src/debug.h
+++ b/src/debug.h
@@ -7,10 +7,10 @@
  #include <stdio.h>
  #include <opencv2/opencv.hpp>
  #include "dynmem.hpp"
-#ifdef CUFFT
-#include "complexmat.cuh"
-#else
  #include "complexmat.hpp"
+
+#ifdef CUFFT
+#include <cufft.h>
  #endif
  
  
diff --git a/src/dynmem.hpp b/src/dynmem.hpp

index 236794d6a7b458acf6005ea5508204d3c90a2245..f45609419b4f4883960e3b8496f90a64e037915f 100644 (file)
--- a/src/dynmem.hpp
+++ b/src/dynmem.hpp
@@ -53,6 +53,7 @@ template <typename T> class DynMem_ {
      const T *hostMem() const { return ptr_h; }
  #ifdef CUFFT
      T *deviceMem() { return ptr_d; }
+    const T *deviceMem() const { return ptr_d; }
  #endif
      void operator=(DynMem_ &rhs) {
          assert(num_elem == rhs.num_elem);
diff --git a/src/fft.h b/src/fft.h

index c6a8d76d83ae572c74e50e743a204ec2aee55f65..f242a265d925df7040ec375bc750e440878a0654 100644 (file)
--- a/src/fft.h
+++ b/src/fft.h
@@ -5,12 +5,7 @@
  #include <opencv2/opencv.hpp>
  #include <vector>
  #include <cassert>
-
-#ifdef CUFFT
-    #include "complexmat.cuh"
-#else
-    #include "complexmat.hpp"
-#endif
+#include "complexmat.hpp"
  
  #ifdef BIG_BATCH
  #define BIG_BATCH_MODE 1
diff --git a/src/fft_cufft.cpp b/src/fft_cufft.cpp

index 9c92a69bfb7049f8c2702e052bc9a49546e33095..61bf1c9550bdc829f7eedeeb54540838bb2adb79 100644 (file)
--- a/src/fft_cufft.cpp
+++ b/src/fft_cufft.cpp
@@ -64,10 +64,10 @@ void cuFFT::forward(const MatScales &real_input, ComplexMat &complex_result)
      auto in = static_cast<cufftReal *>(const_cast<MatScales&>(real_input).deviceMem());
  
      if (real_input.size[0] == 1)
-        cudaErrorCheck(cufftExecR2C(plan_f, in, complex_result.get_p_data()));
+        cudaErrorCheck(cufftExecR2C(plan_f, in, complex_result.get_dev_data()));
  #ifdef BIG_BATCH
      else
-        cudaErrorCheck(cufftExecR2C(plan_f_all_scales, in, complex_result.get_p_data()));
+        cudaErrorCheck(cufftExecR2C(plan_f_all_scales, in, complex_result.get_dev_data()));
  #endif
  }
  
@@ -87,10 +87,10 @@ void cuFFT::forward_window(MatScaleFeats &feat, ComplexMat &complex_result, MatS
      }
  
      if (n_scales == 1)
-        cudaErrorCheck(cufftExecR2C(plan_fw, temp_data, complex_result.get_p_data()));
+        cudaErrorCheck(cufftExecR2C(plan_fw, temp_data, complex_result.get_dev_data()));
  #ifdef BIG_BATCH
      else
-        cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_p_data()));
+        cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_dev_data()));
  #endif
  }
  
diff --git a/src/kcf.h b/src/kcf.h

index 10b189a678c27a9b87d5950b59a0a1a7602ef704..f537ccad0dc74611860534bd1b16ee48450a1113 100644 (file)
--- a/src/kcf.h
+++ b/src/kcf.h
@@ -6,13 +6,11 @@
  #include <memory>
  #include "fhog.hpp"
  
+#include "complexmat.hpp"
  #ifdef CUFFT
-#include "complexmat.cuh"
-#include "cuda_functions.cuh"
+#include "cuda_functions.h"
  #include "cuda_error_check.hpp"
  #include <cuda_runtime.h>
-#else
-#include "complexmat.hpp"
  #endif
  
  #include "cnfeat.hpp"
diff --git a/src/threadctx.hpp b/src/threadctx.hpp

index 2faceb1e1d5805bef0dfed6ba392b8a56c85b81d..982905b96699b5928d39eb2ba158fc6920c2120f 100644 (file)
--- a/src/threadctx.hpp
+++ b/src/threadctx.hpp
@@ -4,12 +4,7 @@
  #include <future>
  #include "dynmem.hpp"
  #include "kcf.h"
-
-#ifdef CUFFT
-#include "complexmat.cuh"
-#else
  #include "complexmat.hpp"
-#endif
  
  class KCF_Tracker;
author	Michal Sojka <michal.sojka@cvut.cz>
	Tue, 2 Oct 2018 13:50:49 +0000 (15:50 +0200)
committer	Michal Sojka <michal.sojka@cvut.cz>
	Tue, 2 Oct 2018 13:51:12 +0000 (15:51 +0200)
src/CMakeLists.txt		patch \| blob \| history
src/complexmat.cpp	[new file with mode: 0644]	patch \| blob
src/complexmat.cu		patch \| blob \| history
src/complexmat.cuh	[deleted file]	patch \| blob \| history
src/complexmat.hpp		patch \| blob \| history
src/cuda_functions.cu		patch \| blob \| history
src/cuda_functions.h	[moved from src/cuda_functions.cuh with 100% similarity]	patch \| blob \| history
src/debug.h		patch \| blob \| history
src/dynmem.hpp		patch \| blob \| history
src/fft.h		patch \| blob \| history
src/fft_cufft.cpp		patch \| blob \| history
src/kcf.h		patch \| blob \| history
src/threadctx.hpp		patch \| blob \| history