#include <algorithm>
#include <functional>
// When built with cuFFT support, pull in the CUDA managed-memory allocator and
// define managed_vector: a std::vector whose backing store is CUDA unified
// memory, addressable from both host and device.
+#ifdef CUFFT
+ #include "managed_allocator.h"
+
+ template<class T>
+ using managed_vector = std::vector<T,managed_allocator<T>>;
+#endif
+
template<typename T> class ComplexMat_
{
public:
private:
// NOTE(review): diff hunk — with CUFFT enabled the element storage switches to
// a managed_vector so the cuFFT path can hand the same buffer to the GPU
// without explicit copies; otherwise it stays a plain std::vector.
+#ifdef CUFFT
+ mutable managed_vector<std::complex<T>> p_data;
+#else
mutable std::vector<std::complex<T>> p_data;
-
+#endif
// Convert a 2-channel cv::Mat (channel 0 = real, channel 1 = imag) into a
// flat vector of std::complex, row by row. Body truncated in this hunk.
std::vector<std::complex<T>> convert(const cv::Mat & mat)
{
#include "fft_cufft.h"
-#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
-inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
-{
- if (code != cudaSuccess)
- {
- fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
- if (abort) exit(code);
- }
-}
-
// Construct the cuFFT wrapper with a fixed pool size of 4 CUDA streams;
// the streams themselves are created later, in init().
cuFFT::cuFFT(): m_num_of_streams(4)
{}
+
// One-time setup: record the FFT geometry and create the CUDA stream pool.
// Hunk is truncated here — the plan-creation part of the body is not visible.
void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
{
m_width = width;
std::cout << "FFT: cuFFT" << std::endl;
// NOTE(review): this hunk deletes the gpuErrchk wrapper, so the return codes
// of cudaSetDeviceFlags and cudaStreamCreate below are now silently ignored.
// A failure here will only surface later as a mysterious error from an
// unrelated call — recommend keeping the error checks.
-    for (unsigned i = 0; i < m_num_of_streams; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
// cudaDeviceMapHost enables mapped pinned host allocations for this device;
// presumably needed by the zero-copy buffers used elsewhere — TODO confirm.
+    cudaSetDeviceFlags(cudaDeviceMapHost);
+
+    for (unsigned i = 0; i < m_num_of_streams; i++) cudaStreamCreate(&streams[i]);
//FFT forward one scale
{
// Forward FFT of one input image (and, judging by the fused hunks below, a
// windowed multi-channel variant as well).
// NOTE(review): this span fuses fragments of at least two diff hunks — `i`,
// `in_all` and `n_channels` are used without visible declarations, and neither
// function body is closed in view. Comments below are hedged accordingly.
ComplexMat cuFFT::forward(const cv::Mat &input)
{
// NOTE(review): "CUDA::GpuMat" — confirm the namespace; OpenCV's GPU matrix
// type lives in cv::cuda::GpuMat. Also note the upload's completion is not
// synchronized with anything visible here.
+    CUDA::GpuMat input_d(input);
ComplexMat complex_result;
// Batched path: input stacked as m_num_of_scales tiles of height m_height.
if(input.rows == (int)(m_height*m_num_of_scales)){
complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
// Apply the cosine window to each scale's ROI before transforming —
// presumably part of forward_window(); verify against the full file.
cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
in_roi = input[i].mul(m_window);
}
+    CUDA::GpuMat in_all_d(in_all);
ComplexMat result;
if(n_channels > (int) m_num_of_feats)
result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
// Tear down the CUDA stream pool created in init() and release the device.
cuFFT::~cuFFT()
{
    // The patch dropped gpuErrchk here, silently ignoring failures. Keep
    // checking the return codes, but only report — never exit() from a
    // destructor, since it may run during stack unwinding.
    for (unsigned i = 0; i < m_num_of_streams; i++) {
        cudaError_t err = cudaStreamDestroy(streams[i]);
        if (err != cudaSuccess)
            fprintf(stderr, "cuFFT::~cuFFT: cudaStreamDestroy(%u) failed: %s\n",
                    i, cudaGetErrorString(err));
    }
    // Destroy the device context (flushes profiling data, frees allocations).
    cudaDeviceReset();
}
--- /dev/null
+// https://github.com/jaredhoberock/managed_allocator
+#ifndef MANAGED_ALLOCATOR_H
+#define MANAGED_ALLOCATOR_H
+
+#include <cuda_runtime.h>
+#include <thrust/system_error.h>
+#include <thrust/system/cuda/error.h>
+
+template<class T>
+class managed_allocator
+{
+public:
+ using value_type = T;
+
+ managed_allocator() {}
+
+ template<class U>
+ managed_allocator(const managed_allocator<U>&) {}
+
+ value_type* allocate(size_t n)
+ {
+ value_type* result = nullptr;
+
+ cudaError_t error = cudaMallocManaged(&result, n*sizeof(T), cudaMemAttachGlobal);
+
+ if(error != cudaSuccess)
+ {
+ throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::allocate(): cudaMallocManaged");
+ }
+
+ return result;
+ }
+
+ void deallocate(value_type* ptr, size_t)
+ {
+ cudaError_t error = cudaFree(ptr);
+
+ if(error != cudaSuccess)
+ {
+ throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::deallocate(): cudaFree");
+ }
+ }
+};
+ template<class T1, class T2>
+ bool operator ==(const managed_allocator<T1>&, const managed_allocator<T2>&)
+ {
+ return true;
+ }
+
+ template<class T1, class T2>
+ bool operator!=(const managed_allocator<T1>& lhs, const managed_allocator<T2>& rhs)
+ {
+ return !(lhs == rhs);
+ }
+
+#endif // MANAGED_ALLOCATOR_H