From: Michal Sojka <michal.sojka@cvut.cz>
Date: Mon, 21 Jan 2019 23:59:32 +0000 (+0100)
Subject: Add simple CUDA memory manager
X-Git-Url: http://rtime.felk.cvut.cz/gitweb/hercules2020/kcf.git/commitdiff_plain/93502b823a0d73c5dd36f31261d4dd377ad09e08

Add simple CUDA memory manager

Thanks to this, we avoid allocating CUDA host-mapped memory at runtime,
which acts as an implicit sync point and slows down execution.
---

diff --git a/src/dynmem.hpp b/src/dynmem.hpp
index f456094..2d62352 100644
--- a/src/dynmem.hpp
+++ b/src/dynmem.hpp
@@ -5,6 +5,8 @@
 #include <opencv2/opencv.hpp>
 #include <cassert>
 #include <numeric>
+#include <mutex>
+#include <stack>
 
 #if defined(CUFFT) || defined(CUFFTW)
 #include "cuda_runtime.h"
@@ -13,11 +15,33 @@
 #endif
 #endif
 
+class MemoryManager { // Mutex-protected pool of reusable buffers, bucketed by the size key given to put()/get()
+    std::mutex mutex; // serializes every access to `map`
+    std::map<size_t, std::stack<void*> > map; // size key -> LIFO stack of idle buffers stored under that key
+
+public:
+    void *get(size_t size) { // Pops an idle buffer stored under `size`; returns nullptr when none is cached
+        std::lock_guard<std::mutex> guard(mutex);
+        auto &stack = map[size]; // operator[] default-inserts an empty stack for a key not seen before
+        void *ptr = nullptr;
+        if (!stack.empty()) {
+            ptr = stack.top(); // the most recently returned buffer is handed out first
+            stack.pop();
+        }
+        return ptr; // still nullptr on a miss; the caller is expected to allocate in that case
+    }
+    void put(void *ptr, size_t size) { // Stores `ptr` under `size` for later reuse; this class never frees it
+        std::lock_guard<std::mutex> guard(mutex);
+        map[size].push(ptr); // no validation of ptr or size; get() looks buffers up by exact key
+    }
+};
+
 template <typename T> class DynMem_ {
   private:
     T *ptr_h = nullptr;
 #ifdef CUFFT
     T *ptr_d = nullptr;
+    static MemoryManager mmng;
 #endif
   public:
     typedef T value_type;
@@ -26,7 +50,11 @@ template <typename T> class DynMem_ {
     DynMem_(size_t num_elem) : num_elem(num_elem)
     {
 #ifdef CUFFT
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&ptr_h), num_elem * sizeof(T), cudaHostAllocMapped));
+        ptr_h = reinterpret_cast<T*>(mmng.get(num_elem));
+        if (!ptr_h) {
+            printf("malloc(%zu)\n", num_elem);
+            CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&ptr_h), num_elem * sizeof(T), cudaHostAllocMapped));
+	}
         CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&ptr_d), reinterpret_cast<void *>(ptr_h), 0));
 #else
         ptr_h = new T[num_elem];
@@ -75,13 +103,20 @@ private:
     void release()
     {
 #ifdef CUFFT
-        CudaSafeCall(cudaFreeHost(ptr_h));
+        if (ptr_h) // nothing to recycle when no host buffer is held
+            mmng.put(ptr_h, num_elem); // park the buffer in the pool, keyed by element count, instead of freeing it
+        //CudaSafeCall(cudaFreeHost(ptr_h)); // disabled: the buffer is kept in mmng for reuse rather than freed
 #else
         delete[] ptr_h;
 #endif
     }
 };
 
+#ifdef CUFFT
+template <typename T>
+MemoryManager DynMem_<T>::mmng; // out-of-class definition of the static pool; one instance per DynMem_<T> specialization
+#endif
+
 typedef DynMem_<float> DynMem;