]> rtime.felk.cvut.cz Git - hercules2020/kcf.git/commitdiff
Merge rotation branch
authorMichal Sojka <michal.sojka@cvut.cz>
Fri, 19 Oct 2018 07:16:17 +0000 (09:16 +0200)
committerMichal Sojka <michal.sojka@cvut.cz>
Fri, 19 Oct 2018 07:16:17 +0000 (09:16 +0200)
This code from that branch is not used directly. Instead, its
functionality was reimplemented in a clean way in the previous commits
of the master branch.

This merge is just to close the pull request and to not loos the
original implementation if it is needed sometimes later.

32 files changed:
.clang-format
.gitignore
.travis.yml
CMakeLists.txt
Makefile
README.md
graphGen.sh [new file with mode: 0755]
main_vot.cpp
print-test-results
src/CMakeLists.txt
src/complexmat.cpp [new file with mode: 0644]
src/complexmat.cu
src/complexmat.cuh [deleted file]
src/complexmat.hpp
src/cuda/CMakeLists.txt [deleted file]
src/cuda_error_check.hpp [moved from src/cuda/cuda_error_check.cuh with 65% similarity]
src/cuda_functions.cu [deleted file]
src/cuda_functions.cuh [deleted file]
src/debug.cpp [new file with mode: 0644]
src/debug.h [new file with mode: 0644]
src/dynmem.hpp
src/fft.cpp
src/fft.h
src/fft_cufft.cpp
src/fft_cufft.h
src/fft_fftw.cpp
src/fft_fftw.h
src/fft_opencv.cpp
src/fft_opencv.h
src/kcf.cpp
src/kcf.h
src/threadctx.hpp

index fa73b86780d1b874341209b58fe2e2ff9df406bd..539e1590d303f962eebacf66d58785b286565905 100644 (file)
@@ -17,7 +17,7 @@ AllowShortLoopsOnASingleLine: false
 AlwaysBreakAfterDefinitionReturnType: None
 AlwaysBreakAfterReturnType: None
 AlwaysBreakBeforeMultilineStrings: false
-AlwaysBreakTemplateDeclarations: false
+AlwaysBreakTemplateDeclarations: true
 BinPackArguments: true
 BinPackParameters: true
 BraceWrapping:   
index fc8c279688ddadf602a7cca9f67cc26e46189eb6..9ec1ef9e62d92a8182a555553ed21b14a1a1cd61 100644 (file)
@@ -5,3 +5,4 @@ src/complexmat_cv.hpp
 *TODO*
 /vot2016/
 *.kdev4
+/.test.file
index 4d3a00d1f1e871d8c61268a5e7a2c057a1696ea8..44f51e4938164dea9157cbf105a2f3fe27155e42 100644 (file)
@@ -11,10 +11,9 @@ addons:
       - ninja-build
       - libopencv-dev
       - libfftw3-dev
-      - cuda-command-line-tools-8-0 cuda-cufft-dev-8-0
 
 env:
-  - CUDA_BIN_PATH=/usr/local/cuda-8.0 CXXFLAGS=-Werror CUDA_ARCH_LIST=6.2
+  - CUDA_BIN_PATH=/usr/local/cuda-8.0 CXXFLAGS=-Werror CUDA_ARCH_LIST=6.2 NINJA_STATUS="[%f/%t] "
 
 script: make
 
@@ -27,6 +26,7 @@ matrix:
           packages:
             - *common_packages
             - g++-5
+            - cuda-command-line-tools-8-0 cuda-cufft-dev-8-0 cuda-cublas-dev-8-0
       script: make CC=gcc-5 CXX=g++-5
     - compiler: clang-3.8
       addons:
@@ -35,18 +35,16 @@ matrix:
           packages:
             - *common_packages
             - clang-3.8
-      script: make CC=clang-3.8 CXX=clang++-3.8 $(grep -v openmp .kcf_builds)
+            - cuda-command-line-tools-8-0 cuda-cufft-dev-8-0 cuda-cublas-dev-8-0
+      script: make CC=clang-3.8 CXX=clang++-3.8 $(make print_BUILDS|grep -v openmp)
     - compiler: clang
       name: clang & test
       script:
-        - make BUILDS="$(echo $(grep -v cufft .kcf_builds))"
-        - ninja test
-      after_script: find vot2016 -name output.txt -o -name core -print0 | xargs -0 rm -f
+        - make BUILDS="$(make print_BUILDS|grep -v cufft|paste -s)"
+        - LD_LIBRARY_PATH=/usr/local/clang-5.0.0/lib/ LD_PRELOAD=libSegFault.so SEGFAULT_SIGNALS=all ninja test
       addons:
         apt: { sources: *common_sources, packages: [*common_packages, unzip] }
 
-before_script:
-    - make -qp|grep "^BUILDS ="|grep -o '[a-z-]*' | tee .kcf_builds
-
+before_cache: rm -f vot2016/*/{output.txt,core}
 cache:
   directories: [ vot2016 ]
index c5f0c7e0337e04762ad89a190d37d9a1eee845a4..28062bd5b359f926eae1b523508de166473baa94 100644 (file)
@@ -15,18 +15,8 @@ endif()
 
 FIND_PACKAGE( OpenCV REQUIRED )
 link_directories ( ${OpenCV_LIB_DIR} )
-MESSAGE(STATUS "OpenCV_LIB_DIR: ${OpenCV_LIB_DIR} ")
-
-IF ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-  IF(NOT OPENMP)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -Wno-unknown-pragmas")
-  ELSE()
-    MESSAGE(STATUS "OpenMP")
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-  ENDIF() #ASYNC
-ENDIF ()
-
 include_directories ( ${OpenCV_INCLUDE_DIRS} )
+MESSAGE(STATUS "OpenCV_LIB_DIR: ${OpenCV_LIB_DIR} ")
 MESSAGE(STATUS "OpenCV_INCLUDE_DIRS: ${OpenCV_INCLUDE_DIRS}")
 
 INCLUDE_DIRECTORIES( ${CMAKE_BINARY_DIR}/)
index 094c5bb0e42f644e2c239d36d5f6a27715b67117..5ca40584a8c95041fa53ecb6bf807add51b0ece9 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,14 @@
 # Makefile to build all the available variants
 
 BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-openmp cufft-big cufft-big-openmp
-TESTSEQ = bag ball1 car1 book
-TESTFLAGS = default fit128
+TESTSEQ = bmx ball1 crossing racing book
+TESTFLAGS = default fit
 
 all: $(BUILDS)
 
+print_%:
+       @$(foreach v,$($*),echo $(v);)
+
 ninja: build.ninja
        ninja
 
@@ -16,8 +19,6 @@ $(BUILDS): build.ninja
 clean: build.ninja
        ninja $@
 
-CMAKE_OPTS += -G Ninja
-
 ## Useful setting - uncomment and modify as needed
 # CMAKE_OPTS += -DOpenCV_DIR=~/opt/opencv-2.4/share/OpenCV
 # CMAKE_OPTS += -DCUDA_VERBOSE_BUILD=ON -DCUDA_NVCC_FLAGS="--verbose;--save-temps"
@@ -56,7 +57,7 @@ vot2016 $(TESTSEQ:%=vot2016/%): vot2016.zip
 .INTERMEDIATE: vot2016.zip
 .SECONDARY:    vot2016.zip
 vot2016.zip:
-       wget -O $@ http://data.votchallenge.net/vot2016/vot2016.zip
+       wget --progress=dot:giga -O $@ http://data.votchallenge.net/vot2016/vot2016.zip
 
 ###################
 # Ninja generator #
@@ -67,19 +68,24 @@ vot2016.zip:
 # compiles all variants in the same ways as this makefile, but faster.
 # The down side is that the build needs about 10 GB of memory.
 
-define nl
 
+# Define echo depending on whether make supports the $(file) function.
+$(file >.test.file)
+ifneq ($(wildcard .test.file),)
+  echo = $(file $(1),$(2))
+else
+  define nl
 
-endef
 
-define echo
-echo $(1) '$(subst $(nl),\n,$(subst \,\\,$(2)))';
-endef
+  endef
+  echo = echo $(1) '$(subst $(nl),\n,$(subst \,\\,$(2)))';
+endif
 
 # Ninja generator - to have faster parallel builds and tests
 .PHONY: build.ninja
 
-build.ninja: Makefile
+build.ninja:: $(MAKEFILE_LIST)
+       @echo "Generating $@"
        @$(call echo,>$@,$(ninja-rule))
        @$(foreach build,$(BUILDS),\
                $(call echo,>>$@,$(call ninja-build,$(build),$(CMAKE_OTPS_$(build)))))
@@ -88,27 +94,33 @@ build.ninja: Makefile
        @$(call echo,>>$@,build test: PRINT_RESULTS $(foreach build,$(BUILDS),$(foreach seq,$(TESTSEQ),$(foreach f,$(TESTFLAGS),$(call ninja-test,$(build),$(seq),$(f))))) | print-test-results)
        @$(foreach build,$(BUILDS),$(call echo,>>$@,build test-$(build): PRINT_RESULTS $(foreach seq,$(TESTSEQ),$(foreach f,$(TESTFLAGS),$(call ninja-test,$(build),$(seq),$(f)))) | print-test-results))
        @$(foreach seq,$(TESTSEQ),$(call echo,>>$@,build test-$(seq): PRINT_RESULTS $(foreach build,$(BUILDS),$(foreach f,$(TESTFLAGS),$(call ninja-test,$(build),$(seq),$(f)))) | print-test-results))
+       @$(call echo,>>$@,build plot: PLOT_RESULTS $(foreach build,$(BUILDS),$(foreach seq,$(TESTSEQ),$(foreach f,$(TESTFLAGS),$(call ninja-test,$(build),$(seq),$(f))))) | graphGen.sh)
+       @$(foreach build,$(BUILDS),$(call echo,>>$@,build plot-$(build): PLOT_RESULTS $(foreach seq,$(TESTSEQ),$(foreach f,$(TESTFLAGS),$(call ninja-test,$(build),$(seq),$(f)))) | graphGen.sh))
+       @$(foreach seq,$(TESTSEQ),$(call echo,>>$@,build plot-$(seq): PLOT_RESULTS $(foreach build,$(BUILDS),$(foreach f,$(TESTFLAGS),$(call ninja-test,$(build),$(seq),$(f)))) | graphGen.sh))
        @$(foreach seq,$(TESTSEQ),$(call echo,>>$@,build vot2016/$(seq): MAKE))
 
 ninja-test = build-$(1)/kcf_vot-$(2)-$(3).log
 
 define ninja-rule
 rule REGENERATE
-  command = make $$out BUILDS="$(BUILDS)" TESTSEQ="$(TESTSEQ)" TESTFLAGS="$(TESTFLAGS)"
+  command = MAKEFLAGS='$(MAKEFLAGS)' $(MAKE) $$out
+  description = Regenerating $$out
   generator = 1
 rule CMAKE
-  command = cd $$$$(dirname $$out) && cmake $(CMAKE_OPTS) $$opts ..
+  command = cd $$subdir && cmake -G Ninja $(CMAKE_OPTS) $$opts ..
 rule NINJA
   # Absolute path in -C allows Emacs to properly jump to error message locations
-  command = ninja -C $(CURDIR)/$$$$(dirname $$out)
-  description = ninja $$out
+  command = ninja -C $(CURDIR)/$$subdir
   restat = 1
 rule TEST_SEQ
   # Errors are ignored - they will be reported by PRINT_RESULTS
-  command = build-$$build/kcf_vot $$flags $$seq > $$out || :
+  command = build-$$build/kcf_vot $$flags $$seq $(if $(TRAVIS),2>&1) >$$out $(if $(TRAVIS),| grep -v libdc1394); true
 rule PRINT_RESULTS
   description = Print results
-  command = ./wvtool -w125 -v run ./print-test-results $$in
+  command = ./wvtool -w120 -v run ./print-test-results $$in
+rule PLOT_RESULTS
+  description = Plot results
+  command = ./graphGen.sh -f -s $$in
 rule MAKE
   command = make $$out
   pool = make
@@ -119,7 +131,7 @@ rule CLEAN
   description = Cleaning all built files...
   command = rm -rf $(BUILDS:%=build-%)
 build clean: CLEAN
-build build.ninja: REGENERATE Makefile
+build build.ninja: REGENERATE $(MAKEFILE_LIST)
 endef
 
 GIT_LS_FILES := $(shell git ls-files)
@@ -127,7 +139,9 @@ GIT_LS_FILES := $(shell git ls-files)
 define ninja-build
 build build-$(1)/build.ninja: CMAKE
   opts = $(2)
+  subdir = build-$(1)
 build build-$(1)/kcf_vot: NINJA build-$(1)/build.ninja $(GIT_LS_FILES)
+  subdir = build-$(1)
 default build-$(1)/kcf_vot
 endef
 
@@ -136,5 +150,5 @@ define ninja-testcase
 build build-$(1)/kcf_vot-$(2)-$(3).log: TEST_SEQ build-$(1)/kcf_vot $(filter-out %/output.txt,$(wildcard vot2016/$(2)/*)) vot2016/$(2)
   build = $(1)
   seq = vot2016/$(2)
-  flags = $(if $(3:fit128=),,--fit=128)
+  flags = $(if $(3:fit128=),,--fit=128)$(if $(3:fit=),,--fit)
 endef
index 8cf2c6388337a4cdbdbf481e84e20672400e7e48..26603100d409b06fc4328bee987fa8228db2cea2 100644 (file)
--- a/README.md
+++ b/README.md
@@ -7,19 +7,37 @@ of the algorithm including execution on the GPU. The aim is also to
 modify the code according to the PRedictable Execution Model (PREM).
 
 Stable version of the tracker is available from a [CTU server][2],
-development happens at GitHub [here][wsh] and [here][3].
+development happens at [GitHub][iig].
 
 [1]: http://hercules2020.eu/
 [2]: http://rtime.felk.cvut.cz/gitweb/hercules2020/kcf.git
-[wsh]: https://github.com/wentasah/kcf
+[iig]: https://github.com/CTU-IIG/kcf
 [3]: https://github.com/Shanigen/kcf
 
+<!-- markdown-toc start - Don't edit this section. Run M-x markdown-toc-refresh-toc -->
+**Table of Contents**
+
+- [Prerequisites](#prerequisites)
+- [Compilation](#compilation)
+    - [Compile all supported versions](#compile-all-supported-versions)
+    - [Using cmake gui](#using-cmake-gui)
+    - [Command line](#command-line)
+- [Running](#running)
+    - [Options](#options)
+- [Automated testing](#automated-testing)
+- [Authors](#authors)
+- [References](#references)
+- [License](#license)
+
+<!-- markdown-toc end -->
+
+
 ## Prerequisites
 
-The code depends on OpenCV 2.4 library
-and [CMake][13] (optionally with [Ninja][8]) is used for building.
-Depending on the version to be compiled you need to have development
-packages for [FFTW][4], [CUDA][5] or [OpenMP][6] installed.
+The code depends on OpenCV (version 2.4 or 3.x) library. [CMake][13]
+(optionally with [Ninja][8]) is used for building. Depending on the
+version to be compiled you need to have development packages for
+[FFTW][4], [CUDA][5] or [OpenMP][6] installed.
 
 On TX2, the following command should install what's needed:
 ``` shellsession
@@ -101,27 +119,19 @@ With all of these FFT version additional options can be added:
 
 |Option| Description |
 | --- | --- |
-| `-DASYNC=ON` | Use C++ `std::async` to run computations for different scales in parallel. This doesn't work with `BIG_BATCH` mode.|
-| `-DBIG_BATCH=ON` | Concatenate matrices of different scales to one big matrix and perform all computations on this matrix. This mode doesn't work with `OpenCV` FFT.|
-| `-DOPENMP=ON` | Parallelize certain operation with OpenMP. This can only be used with `OpenCV` or `fftw` FFT implementations. By default it runs computations for differenct scales in parallel. With `-DBIG_BATCH=ON` it parallelizes the feature extraction and the search for maximal response for differenct scales. With `fftw`, Ffftw's plans will execute in parallel.|
+| `-DBIG_BATCH=ON` | Concatenate matrices of different scales to one big matrix and perform all computations on this matrix. This improves performance of GPU FFT offloading. |
+| `-DOPENMP=ON` | Parallelize certain operation with OpenMP. With `-DBIG_BATCH=OFF` it runs computations for differenct scales in parallel, with `-DBIG_BATCH=ON` it parallelizes the feature extraction, which runs on the CPU. With `fftw`, Ffftw's plans will execute in parallel.|
 | `-DCUDA_DEBUG=ON` | Adds calls cudaDeviceSynchronize after every CUDA function and kernel call.|
 | `-DOpenCV_DIR=/opt/opencv-3.3/share/OpenCV` | Compile against a custom OpenCV version. |
+| `-DASYNC=ON` | Use C++ `std::async` to run computations for different scales in parallel. This mode of parallelization was present in the original implementation. Here, it is superseeded with -DOPENMP. This doesn't work with `BIG_BATCH` mode.|
 
-
-### Compilation for non-TX2 CUDA
-
-The CuFFT version is set up to run on NVIDIA Jetson TX2. If you want
-to run it on different architecture, change the `--gpu-architecture
-sm_62` NVCC flag in **/src/CMakeLists.txt** to your architecture of
-NVIDIA GPU. To find what SM variation you architecture has look
-[here][9].
-
-[9]: http://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+See also the top-level `Makefile` for other useful cmake parameters
+such as extra compiler flags etc.
 
 ## Running
 
-No matter which method is used to compile the code, the results will
-be `kcf_vot` binary.
+No matter which method is used to compile the code, the result will be
+a `kcf_vot` binary.
 
 It operates on an image sequence created according to [VOT 2014
 methodology][10]. You can find some image sequences in [vot2016
@@ -162,17 +172,41 @@ top_left_y, width, height".
 
 | Options | Description |
 | ------- | ----------- |
+| --fit, -f[W[xH]] | Specifies the dimension to which the extracted patches should be scaled. Best performance is achieved for powers of two; the smaller number the higher performance but worse accuracy. No dimension or zero rounds the dimensions to the nearest smaller power of 2, a single dimension `W` will result in patch size of `W`×`W`. The numbers should be divisible by 4. |
 | --visualize, -v[delay_ms] | Visualize the output, optionally with specified delay. If the delay is 0 the program will wait for a key press. |
 | --output, -o <output.txt>     | Specify name of output file. |
 | --debug, -d                           | Generate debug output. |
-| --fit, -f[W[xH]] | Specifies the dimension to which the extracted patch should be scaled. It should be divisible by 4. No dimension is the same as `128x128`, a single dimension `W` will result in patch size of `W`×`W`. |
+| --visual_debug, -p[p|r] | Show graphical window with debugging information (either **p**atch or filter **r**esponse). |
+
+## Automated testing
+
+The tracker comes with a test suite based on [vot2016 datatset][11].
+You can run the test suite as follows:
+
+    make vot2016  # This download the datased (about 1GB of data)
+       make test
+
+The above command run all tests in parallel and displays the results
+in a table. If you want to measure performance, do not run multiple
+tests together. This can be achieved by:
+
+       make build.ninja
+       ninja -j1 test
+
+You can test only a subset of builds or image sequences by setting
+BUILDS, TESTSEQ or TESTFLAGS make variables. For instance:
+
+       make build.ninja BUILDS="cufft cufft-big fftw" TESTSEQ="bmx ball1"
+       ninja test
+
+
 
 
 ## Authors
 * Vít Karafiát, Michal Sojka
 
-Original C++ implementation of KCF tracker was written by Tomas Vojir
-[here][12] and is reimplementation of algorithm presented in
+[Original C++ implementation of the KCF tracker][12] was written by
+Tomas Vojir and is reimplementation of the algorithm presented in
 "High-Speed Tracking with Kernelized Correlation Filters" paper \[1].
 
 [12]: https://github.com/vojirt/kcf/blob/master/README.md
@@ -200,3 +234,7 @@ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+<!-- Local Variables: -->
+<!-- markdown-toc-user-toc-structure-manipulation-fn: cdr -->
+<!-- End: -->
diff --git a/graphGen.sh b/graphGen.sh
new file mode 100755 (executable)
index 0000000..6652f41
--- /dev/null
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+USE_FPS=0
+SORT=0
+
+while getopts "fs" opt
+do
+    case $opt in
+        f)
+            USE_FPS=1
+            ;;
+        s)
+            SORT=1
+            ;;
+        \?)
+            echo "Invalid option -$OPTARG" >&2
+            exit 1
+            ;;
+    esac
+done
+
+shift $((OPTIND-1))
+
+for log in "$@"
+do
+
+    [[ "$log" =~ build-(.*)/kcf_vot-(.*)-(.*).log ]]
+    tracker_version=${BASH_REMATCH[1]}
+    arguments=${BASH_REMATCH[3]}
+    dataset=${BASH_REMATCH[2]}
+
+    data_file=${log%.log}.dat
+
+    (echo ${tracker_version}-${arguments}-${dataset}; grep -e '->' $log | grep -o '[0-9.]*ms' ) > $data_file
+done
+
+getavg() { grep Average $1 | grep -o '[0-9.]*ms'; }
+set -- $(for i in $@; do avg=$(getavg $i); test "$avg" && echo $i $avg; done \
+       | if (($SORT == 1)); then sort -n -k2; else cat; fi \
+       | cut -f1 -d' ')
+
+
+paste ${@//.log/.dat} > all
+
+gnuplot -persist << EOFMarker
+        file = 'all'
+        header = system('head -1 '.file)
+        N = words(header)
+
+        if ($USE_FPS == 1) {
+           set ylabel "FPS"
+        } else {
+          set ylabel "Time [ms]"
+        }
+        set xtics rotate
+        set xtics ('' 1)
+        set for [i=1:N] xtics add (word(header, i) i)
+
+        set style data boxplot
+        set style boxplot nooutliers
+        set grid
+        unset key
+        if ($USE_FPS == 1) {
+           plot [][0:] for [i=1:N] file using (i):(1000/column(i))
+        } else {
+          plot [][0:] for [i=1:N] file using (i):i
+        }
+EOFMarker
+
+rm all
index 711a00045065ae9060ca9e9a55135ef315777246..fe5fb78364c1b4bc4407310236d4654439650ac4 100644 (file)
@@ -34,62 +34,77 @@ double calcAccuracy(std::string line, cv::Rect bb_rect, cv::Rect &groundtruth_re
 
 int main(int argc, char *argv[])
 {
-    // load region, images and prepare for output
-    std::string region, images, output;
+    //load region, images and prepare for output
+    std::string region, images, output, video_out;
     int visualize_delay = -1, fit_size_x = -1, fit_size_y = -1;
     KCF_Tracker tracker;
+    cv::VideoWriter videoWriter;
 
     while (1) {
         int option_index = 0;
-        static struct option long_options[] = {{"debug", no_argument, 0, 'd'},
-                                               {"visualDebug", no_argument, 0, 'p'},
-                                               {"help", no_argument, 0, 'h'},
-                                               {"output", required_argument, 0, 'o'},
-                                               {"visualize", optional_argument, 0, 'v'},
-                                               {"fit", optional_argument, 0, 'f'},
-                                               {0, 0, 0, 0}};
-
-        int c = getopt_long(argc, argv, "dphv::f::o:", long_options, &option_index);
-        if (c == -1) break;
+        static struct option long_options[] = {
+            {"debug",     no_argument,       0,  'd' },
+            {"visual_debug", optional_argument,    0, 'p'},
+            {"help",      no_argument,       0,  'h' },
+            {"output",    required_argument, 0,  'o' },
+            {"video_out", optional_argument, 0,  'O' },
+            {"visualize", optional_argument, 0,  'v' },
+            {"fit",       optional_argument, 0,  'f' },
+            {0,           0,                 0,  0 }
+        };
+
+        int c = getopt_long(argc, argv, "dp::hv::f::o:O::", long_options, &option_index);
+        if (c == -1)
+            break;
 
         switch (c) {
         case 'd':
             tracker.m_debug = true;
             break;
         case 'p':
-            tracker.m_visual_debug = true;
-            visualize_delay = 500;
+            if (!optarg || *optarg == 'p')
+                tracker.m_visual_debug = KCF_Tracker::vd::PATCH;
+            else if (optarg && *optarg == 'r')
+                tracker.m_visual_debug = KCF_Tracker::vd::RESPONSE;
+            else {
+                fprintf(stderr, "Unknown visual debug mode: %c", *optarg);
+                return 1;
+            }
             break;
         case 'h':
-            std::cerr
-                << "Usage: \n"
-                << argv[0] << " [options]\n"
-                << argv[0] << " [options] <directory>\n"
-                << argv[0]
-                << " [options] <path/to/region.txt or groundtruth.txt> <path/to/images.txt> [path/to/output.txt]\n"
-                << "Options:\n"
-                << " --visualize | -v[delay_ms]\n"
-                << " --output    | -o <output.txt>\n"
-                << " --debug     | -d\n"
-                << " --visualDebug | -p\n"
-                << " --fit       | -f[WxH]\n";
+            std::cerr << "Usage: \n"
+                      << argv[0] << " [options]\n"
+                      << argv[0] << " [options] <directory>\n"
+                      << argv[0] << " [options] <path/to/region.txt or groundtruth.txt> <path/to/images.txt> [path/to/output.txt]\n"
+                      << "Options:\n"
+                      << " --visualize    | -v[delay_ms]\n"
+                      << " --output       | -o <output.txt>\n"
+                      << " --fit          | -f[W[xH]]\n"
+                      << " --debug        | -d\n"
+                      << " --visual_debug | -p [p|r]\n";
             exit(0);
             break;
         case 'o':
             output = optarg;
             break;
+        case 'O':
+            video_out = optarg ? optarg : "./output.avi";
+            break;
         case 'v':
             visualize_delay = optarg ? atol(optarg) : 1;
             break;
         case 'f':
-            std::string sizes = optarg ? optarg : "128x128";
-            std::string delimiter = "x";
-            size_t pos = sizes.find(delimiter);
-            std::string first_argument = sizes.substr(0, pos);
-            sizes.erase(0, pos + delimiter.length());
-
-            fit_size_x = stol(first_argument);
-            fit_size_y = stol(sizes);
+            if (!optarg) {
+                fit_size_x = fit_size_y = 0;
+            } else {
+                char tail;
+                if (sscanf(optarg, "%d%c", &fit_size_x, &tail) == 1) {
+                    fit_size_y = fit_size_x;
+                } else if (sscanf(optarg, "%dx%d%c", &fit_size_x, &fit_size_y, &tail) != 2) {
+                    fprintf(stderr, "Cannot parse -f argument: %s\n", optarg);
+                    return 1;
+                }
+            }
             break;
         }
     }
@@ -104,7 +119,8 @@ int main(int argc, char *argv[])
     case 0:
         region = access("groundtruth.txt", F_OK) == 0 ? "groundtruth.txt" : "region.txt";
         images = "images.txt";
-        if (output.empty()) output = "output.txt";
+        if (output.empty())
+            output = "output.txt";
         break;
     case 2:
         // Fall through
@@ -134,13 +150,20 @@ int main(int argc, char *argv[])
 
     cv::Mat image;
 
-    // img = firts frame, initPos = initial position in the first frame
+    //img = firts frame, initPos = initial position in the first frame
     cv::Rect init_rect = vot_io.getInitRectangle();
     vot_io.outputBoundingBox(init_rect);
     vot_io.getNextImage(image);
 
+    if (!video_out.empty()) {
+        int codec = CV_FOURCC('M', 'J', 'P', 'G');  // select desired codec (must be available at runtime)
+        double fps = 25.0;                          // framerate of the created video stream
+        videoWriter.open(video_out, codec, fps, image.size(), true);
+    }
+
     tracker.init(image, init_rect, fit_size_x, fit_size_y);
 
+
     BBox_c bb;
     cv::Rect bb_rect;
     double avg_time = 0., sum_accuracy = 0.;
@@ -158,7 +181,7 @@ int main(int argc, char *argv[])
         frames++;
 
         bb = tracker.getBBox();
-        bb_rect = cv::Rect(bb.cx - bb.w / 2., bb.cy - bb.h / 2., bb.w, bb.h);
+        bb_rect = cv::Rect(bb.cx - bb.w/2., bb.cy - bb.h/2., bb.w, bb.h);
         vot_io.outputBoundingBox(bb_rect);
 
         if (groundtruth_stream.is_open()) {
@@ -167,14 +190,15 @@ int main(int argc, char *argv[])
 
             cv::Rect groundtruthRect;
             double accuracy = calcAccuracy(line, bb_rect, groundtruthRect);
-            if (visualize_delay >= 0) cv::rectangle(image, groundtruthRect, CV_RGB(255, 0, 0), 1);
+            if (visualize_delay >= 0)
+                cv::rectangle(image, groundtruthRect, CV_RGB(255, 0,0), 1);
             std::cout << ", accuracy: " << accuracy;
             sum_accuracy += accuracy;
         }
 
         std::cout << std::endl;
 
-        if (visualize_delay >= 0) {
+        if (visualize_delay >= 0 || !video_out.empty()) {
             cv::Point pt(bb.cx, bb.cy);
             cv::Size size(bb.w, bb.h);
             cv::RotatedRect rotatedRectangle(pt, size, bb.a);
@@ -184,37 +208,36 @@ int main(int argc, char *argv[])
 
             for (int i = 0; i < 4; i++)
                 cv::line(image, vertices[i], vertices[(i + 1) % 4], cv::Scalar(0, 255, 0), 2);
-            //             cv::rectangle(image, cv::Rect(bb.cx - bb.w/2., bb.cy - bb.h/2., bb.w, bb.h), CV_RGB(0,255,0),
-            //             2);
-            std::string angle = std::to_string(bb.a);
-            angle.erase(angle.find_last_not_of('0') + 1, std::string::npos);
-            angle.erase(angle.find_last_not_of('.') + 1, std::string::npos);
-            cv::putText(image, "Frame: " + std::to_string(frames) + " " + angle + " angle",
-                        cv::Point(0, image.rows - 1), cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 255, 0), 2);
-            cv::imshow("output", image);
-            int ret = cv::waitKey(visualize_delay);
-            if (visualize_delay > 0 && ret != -1 && ret != 255) break;
+            if (visualize_delay >= 0) {
+                cv::imshow("KCF output", image);
+                int ret = cv::waitKey(visualize_delay);
+                if ((visualize_delay > 0 && ret != -1 && ret < 128) ||
+                        (visualize_delay == 0 && (ret == 27 /*esc*/ || ret == 'q')))
+                    break;
+            }
+            if (!video_out.empty())
+                videoWriter << image;
         }
 
-        //        std::stringstream s;
-        //        std::string ss;
-        //        int countTmp = frames;
-        //        s << "imgs" << "/img" << (countTmp/10000);
-        //        countTmp = countTmp%10000;
-        //        s << (countTmp/1000);
-        //        countTmp = countTmp%1000;
-        //        s << (countTmp/100);
-        //        countTmp = countTmp%100;
-        //        s << (countTmp/10);
-        //        countTmp = countTmp%10;
-        //        s << (countTmp);
-        //        s << ".jpg";
-        //        s >> ss;
-        //        //set image output parameters
-        //        std::vector<int> compression_params;
-        //        compression_params.push_back(CV_IMWRITE_JPEG_QUALITY);
-        //        compression_params.push_back(90);
-        //        cv::imwrite(ss.c_str(), image, compression_params);
+//        std::stringstream s;
+//        std::string ss;
+//        int countTmp = frames;
+//        s << "imgs" << "/img" << (countTmp/10000);
+//        countTmp = countTmp%10000;
+//        s << (countTmp/1000);
+//        countTmp = countTmp%1000;
+//        s << (countTmp/100);
+//        countTmp = countTmp%100;
+//        s << (countTmp/10);
+//        countTmp = countTmp%10;
+//        s << (countTmp);
+//        s << ".jpg";
+//        s >> ss;
+//        //set image output parameters
+//        std::vector<int> compression_params;
+//        compression_params.push_back(CV_IMWRITE_JPEG_QUALITY);
+//        compression_params.push_back(90);
+//        cv::imwrite(ss.c_str(), image, compression_params);
     }
 
     std::cout << "Average processing speed: " << avg_time / frames << "ms (" << 1. / (avg_time / frames) * 1000 << " fps)";
@@ -222,6 +245,8 @@ int main(int argc, char *argv[])
         std::cout << "; Average accuracy: " << sum_accuracy/frames << std::endl;
         groundtruth_stream.close();
     }
+    if (!video_out.empty())
+       videoWriter.release();
     std::cout << std::endl;
 
     return EXIT_SUCCESS;
index 50027de80754b53ffeab100aa434105917ebeb51..b3062404fe97ae58da34b40c38ee6ee3aca491a2 100755 (executable)
@@ -1,7 +1,10 @@
 #!/bin/bash
 set -e
 
-expected_accuracy=([bag]=0.53 [ball1]=0.70 [car1]=0.35 [book]=0.19)
+declare -A expected_accuracy
+
+expected_accuracy=([bag]=0.53 [ball1-correct]=0.65 [ball1]=0.27 [car1]=0.35 [book]=0.19
+                  [bmx]=0.38 [crossing]=0.48 [racing]=0.52)
 
 for i in "$@"; do
     [[ "$i" =~ build-(.*)/kcf_vot-(.*)-(.*).log ]]
@@ -9,15 +12,16 @@ for i in "$@"; do
     flags=${BASH_REMATCH[3]}
     seq=${BASH_REMATCH[2]}
 
-    result=$(grep 'Average accuracy:' $i || :)
-    if [[ "$result" =~ Average\ accuracy:\ ([0-9.]+) ]]; then
-       if [[ $(echo "${BASH_REMATCH[1]} >= ${expected_accuracy[$seq]}"|bc) -eq 1 ]]; then
+    result=$(grep 'Average accuracy:' $i | sed -e 's/Average/Avg./g' -e 's/processing //' || :)
+    if [[ "$result" =~ accuracy:\ ([0-9.]+) ]]; then
+       result+=" >= ${expected_accuracy[$seq]}"
+       if [[ $(echo "${BASH_REMATCH[1]} >= ${expected_accuracy[$seq]:-0}"|bc) -eq 1 ]]; then
            status=ok
        else
-           status=BAD
+           status=ACCURACY
        fi
     else
        status=FAILED
     fi
     echo ! "$seq;$flags;$build;$result;$status"
-done | sort | column -t -s";"
+done | sort -t";" $SORT_FLAGS | column -t -s";"
index 3e0aee471c678ef511a3ec344a0a03126d6198d2..7ed9f6051359f99ee430142e87749e1d294da7e7 100644 (file)
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 2.8)
 
-set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h dynmem.hpp)
+set(KCF_LIB_SRC kcf.cpp kcf.h fft.cpp threadctx.hpp pragmas.h dynmem.hpp debug.cpp complexmat.hpp)
 
 find_package(PkgConfig)
 
@@ -8,10 +8,10 @@ SET(FFT "OpenCV" CACHE STRING "Select FFT implementation")
 SET_PROPERTY(CACHE FFT PROPERTY STRINGS OpenCV fftw cuFFTW cuFFT)
 MESSAGE(STATUS "FFT implementation: ${FFT}")
 
-option(OPENMP "Use OpenMP library. Works with FFTW and OpenCV implementation." OFF)
-option(ASYNC "Works only if OPENCV_CUFFT is not ON. Will enable C++ async directive." OFF)
+option(OPENMP "Use OpenMP to paralelize certain portions of code." OFF)
+option(ASYNC "Use C++ std::async to paralelize certain portions of code." OFF)
 option(CUDA_DEBUG "Enables error cheking for cuda and cufft. " OFF)
-option(BIG_BATCH "Enable transforming all features from all scales together." OFF)
+option(BIG_BATCH "Execute all FFT calculation in a single batch. This can improve paralelism and reduce GPU offloading overhead." OFF)
 
 IF(PROFILING)
   add_definitions(-DPROFILING )
@@ -23,20 +23,24 @@ IF(BIG_BATCH)
   MESSAGE(STATUS "Big_batch mode")
 ENDIF()
 
+IF (("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") AND NOT OPENMP)
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas")
+ENDIF()
+
 SET(use_cuda OFF)
 
 IF(FFT STREQUAL "OpenCV")
-  list(APPEND KCF_LIB_SRC fft_opencv.cpp complexmat.hpp)
+  list(APPEND KCF_LIB_SRC fft_opencv.cpp)
 ELSEIF(FFT STREQUAL "fftw")
-  list(APPEND KCF_LIB_SRC fft_fftw.cpp complexmat.hpp)
+  list(APPEND KCF_LIB_SRC fft_fftw.cpp)
   add_definitions(-DFFTW)
   pkg_check_modules(FFTW REQUIRED fftw3f)
 ELSEIF(FFT STREQUAL "cuFFTW")
-  list(APPEND KCF_LIB_SRC fft_fftw.cpp complexmat.hpp)
+  list(APPEND KCF_LIB_SRC fft_fftw.cpp)
   add_definitions(-DFFTW -DCUFFTW)
   set(use_cuda ON)
 ELSEIF(FFT STREQUAL "cuFFT")
-    list(APPEND KCF_LIB_SRC fft_cufft.cpp complexmat.cuh cuda_functions.cuh)
+    list(APPEND KCF_LIB_SRC fft_cufft.cpp)
     add_definitions(-DCUFFT)
     set(use_cuda ON)
     iF(CUDA_DEBUG)
@@ -47,6 +51,12 @@ ELSE()
   MESSAGE(FATAL_ERROR "Invalid FFT implementation selected")
 ENDIF()
 
+IF(FFT STREQUAL "cuFFT")
+  list(APPEND KCF_LIB_SRC complexmat.cu)
+ELSE()
+  list(APPEND KCF_LIB_SRC complexmat.cpp)
+ENDIF()
+
 IF((FFT STREQUAL "OpenCV") AND BIG_BATCH)
   message(SEND_ERROR "OpenCV version does not support big batch mode.")
 ENDIF()
@@ -57,10 +67,11 @@ ENDIF()
 
 IF(ASYNC)
   add_definitions(-DASYNC)
+  find_package(Threads REQUIRED)
   MESSAGE(STATUS "ASYNC")
 ELSEIF(OPENMP)
-    add_definitions(-DOPENMP)
-    MESSAGE(STATUS "OPENMP")
+  add_definitions(-DOPENMP)
+  FIND_PACKAGE(OpenMP REQUIRED CXX)
 ENDIF() #ASYNC
 
 FIND_PACKAGE( OpenCV REQUIRED )
@@ -78,29 +89,34 @@ IF(use_cuda)
 
   set(CUDA_ARCH_LIST "Auto" CACHE STRING "CUDA GPU architecture for building the code")
   CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS ${CUDA_ARCH_LIST})
-  list( APPEND CUDA_NVCC_FLAGS -O3 -std=c++11 ${ARCH_FLAGS}) # --gpu-architecture sm_62 )
+  list( APPEND CUDA_NVCC_FLAGS -O3 -std=c++11 ${ARCH_FLAGS} --default-stream per-thread) # --gpu-architecture sm_62 )
   find_cuda_helper_libs(cufftw)
-  IF(FFT STREQUAL "cuFFT")
-  add_subdirectory(cuda)
-  cuda_add_library(complexmat complexmat.cu)
-  cuda_add_library(cuda_func cuda_functions.cu)
-  ENDIF()
-
+  find_cuda_helper_libs(nvToolsExt)
 ENDIF()
 
 add_subdirectory(piotr_fhog)
 add_subdirectory(cn)
 
-add_library(kcf STATIC ${KCF_LIB_SRC})
+if(FFT STREQUAL "cuFFT")
+  cuda_add_library(kcf STATIC ${KCF_LIB_SRC})
+else()
+  add_library(kcf STATIC ${KCF_LIB_SRC})
+endif()
+
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  target_compile_options(kcf PRIVATE -Wno-gnu-zero-variadic-macro-arguments)
+endif()
+
+
 target_link_libraries(kcf fhog cndata ${OpenCV_LIBS})
 set_target_properties(kcf PROPERTIES VERSION 1.0.0 SOVERSION 1)
 
 IF(FFT STREQUAL "fftw")
   target_link_libraries(kcf ${FFTW_LDFLAGS})
   IF(OPENMP)
-    target_link_libraries(kcf fftw3f_omp)
-  ELSEIF(NOT ASYNC)
-    target_link_libraries(kcf fftw3f_threads)
+    target_link_libraries(kcf fftw3_omp)
+  ELSE()
+    target_link_libraries(kcf fftw3_threads)
   ENDIF()
 ENDIF() #FFTW
 
@@ -109,9 +125,26 @@ IF(FFT STREQUAL "cuFFTW")
 ENDIF() #cuFFTW
 
 IF(FFT STREQUAL "cuFFT")
-    target_link_libraries(kcf ${CUDA_cufft_LIBRARY} ${CUDA_LIBRARIES} complexmat cuda_func)
+  target_link_libraries(kcf ${CUDA_cufft_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_nvToolsExt_LIBRARY})
 ENDIF()
 
 IF(PROFILING)
   target_link_libraries(kcf pfm)
 ENDIF()
+
+IF(OPENMP)
+  target_compile_options(kcf PUBLIC ${OpenMP_CXX_FLAGS})
+  target_link_libraries(kcf ${OpenMP_CXX_LIB_NAMES})
+  target_link_libraries(kcf ${OpenMP_omp_LIBRARY})
+  if (("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") AND NOT OpenMP_CXX_LIB_NAMES)
+    # Older cmake does not set OpenMP_CXX_LIB_NAMES so hardcode it here
+    target_link_libraries(kcf gomp)
+  endif()
+ENDIF()
+
+if(THREADS_HAVE_PTHREAD_ARG)
+  target_compile_options(kcf PUBLIC "-pthread")
+endif()
+if(CMAKE_THREAD_LIBS_INIT)
+  target_link_libraries(kcf "${CMAKE_THREAD_LIBS_INIT}")
+endif()
diff --git a/src/complexmat.cpp b/src/complexmat.cpp
new file mode 100644 (file)
index 0000000..97e1ca7
--- /dev/null
@@ -0,0 +1,149 @@
+#include "complexmat.hpp"
+
+ComplexMat_::T ComplexMat_::sqr_norm() const
+{
+    assert(n_scales == 1);
+
+    int n_channels_per_scale = n_channels / n_scales;
+    T sum_sqr_norm = 0;
+    for (int i = 0; i < n_channels_per_scale; ++i) {
+        for (auto lhs = p_data.hostMem() + i * rows * cols; lhs != p_data.hostMem() + (i + 1) * rows * cols; ++lhs)
+            sum_sqr_norm += lhs->real() * lhs->real() + lhs->imag() * lhs->imag();
+    }
+    sum_sqr_norm = sum_sqr_norm / static_cast<T>(cols * rows);
+    return sum_sqr_norm;
+}
+
+void ComplexMat_::sqr_norm(DynMem_<ComplexMat_::T> &result) const
+{
+    int n_channels_per_scale = n_channels / n_scales;
+    int scale_offset = n_channels_per_scale * rows * cols;
+    for (uint scale = 0; scale < n_scales; ++scale) {
+        T sum_sqr_norm = 0;
+        for (int i = 0; i < n_channels_per_scale; ++i)
+            for (auto lhs = p_data.hostMem() + i * rows * cols + scale * scale_offset;
+                 lhs != p_data.hostMem() + (i + 1) * rows * cols + scale * scale_offset; ++lhs)
+                sum_sqr_norm += lhs->real() * lhs->real() + lhs->imag() * lhs->imag();
+        result.hostMem()[scale] = sum_sqr_norm / static_cast<T>(cols * rows);
+    }
+    return;
+}
+
+ComplexMat_ ComplexMat_::sqr_mag() const
+{
+    return mat_const_operator([](std::complex<T> &c) { c = c.real() * c.real() + c.imag() * c.imag(); });
+}
+
+ComplexMat_ ComplexMat_::conj() const
+{
+    return mat_const_operator([](std::complex<T> &c) { c = std::complex<T>(c.real(), -c.imag()); });
+}
+
+ComplexMat_ ComplexMat_::sum_over_channels() const
+{
+    assert(p_data.num_elem == n_channels * rows * cols);
+
+    uint n_channels_per_scale = n_channels / n_scales;
+    uint scale_offset = n_channels_per_scale * rows * cols;
+
+    ComplexMat_ result(this->rows, this->cols, 1, n_scales);
+    for (uint scale = 0; scale < n_scales; ++scale) {
+        for (uint i = 0; i < rows * cols; ++i) {
+            std::complex<T> acc = 0;
+            for (uint ch = 0; ch < n_channels_per_scale; ++ch)
+                acc +=  p_data[scale * scale_offset + i + ch * rows * cols];
+            result.p_data.hostMem()[scale * rows * cols + i] = acc;
+        }
+    }
+    return result;
+}
+
+ComplexMat_ ComplexMat_::operator/(const ComplexMat_ &rhs) const
+{
+    return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs /= c_rhs; }, rhs);
+}
+
+ComplexMat_ ComplexMat_::operator+(const ComplexMat_ &rhs) const
+{
+    return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs += c_rhs; }, rhs);
+}
+
+ComplexMat_ ComplexMat_::operator*(const ComplexMat_::T &rhs) const
+{
+    return mat_const_operator([&rhs](std::complex<T> &c) { c *= rhs; });
+}
+
+ComplexMat_ ComplexMat_::mul(const ComplexMat_ &rhs) const
+{
+    return matn_mat1_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
+}
+
+ComplexMat_ ComplexMat_::operator+(const ComplexMat_::T &rhs) const
+{
+    return mat_const_operator([&rhs](std::complex<T> &c) { c += rhs; });
+}
+
+ComplexMat_ ComplexMat_::operator*(const ComplexMat_ &rhs) const
+{
+    return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
+}
+
+ComplexMat_ ComplexMat_::mat_mat_operator(void (*op)(std::complex<ComplexMat_::T> &, const std::complex<ComplexMat_::T> &), const ComplexMat_ &mat_rhs) const
+{
+    assert(mat_rhs.n_channels == n_channels/n_scales && mat_rhs.cols == cols && mat_rhs.rows == rows);
+
+    ComplexMat_ result = *this;
+    for (uint s = 0; s < n_scales; ++s) {
+        auto lhs = result.p_data.hostMem() + (s * n_channels/n_scales * rows * cols);
+        auto rhs = mat_rhs.p_data.hostMem();
+        for (uint i = 0; i < n_channels/n_scales * rows * cols; ++i)
+            op(*(lhs + i), *(rhs + i));
+    }
+
+    return result;
+}
+
+ComplexMat_ ComplexMat_::matn_mat1_operator(void (*op)(std::complex<ComplexMat_::T> &, const std::complex<ComplexMat_::T> &), const ComplexMat_ &mat_rhs) const
+{
+    assert(mat_rhs.n_channels == 1 && mat_rhs.cols == cols && mat_rhs.rows == rows);
+
+    ComplexMat_ result = *this;
+    for (uint i = 0; i < n_channels; ++i) {
+        auto lhs = result.p_data.hostMem() + i * rows * cols;
+        auto rhs = mat_rhs.p_data.hostMem();
+        for (; lhs != result.p_data.hostMem() + (i + 1) * rows * cols; ++lhs, ++rhs)
+            op(*lhs, *rhs);
+    }
+
+    return result;
+}
+
+ComplexMat_ ComplexMat_::matn_mat2_operator(void (*op)(std::complex<ComplexMat_::T> &, const std::complex<ComplexMat_::T> &), const ComplexMat_ &mat_rhs) const
+{
+    assert(mat_rhs.n_channels == n_channels / n_scales && mat_rhs.cols == cols && mat_rhs.rows == rows);
+
+    int n_channels_per_scale = n_channels / n_scales;
+    int scale_offset = n_channels_per_scale * rows * cols;
+    ComplexMat_ result = *this;
+    for (uint i = 0; i < n_scales; ++i) {
+        for (int j = 0; j < n_channels_per_scale; ++j) {
+            auto lhs = result.p_data.hostMem() + (j * rows * cols) + (i * scale_offset);
+            auto rhs = mat_rhs.p_data.hostMem() + (j * rows * cols);
+            for (; lhs != result.p_data.hostMem() + ((j + 1) * rows * cols) + (i * scale_offset); ++lhs, ++rhs)
+                op(*lhs, *rhs);
+        }
+    }
+
+    return result;
+}
+
+ComplexMat_ ComplexMat_::mat_const_operator(const std::function<void (std::complex<ComplexMat_::T> &)> &op) const
+{
+    ComplexMat_ result = *this;
+    for (uint i = 0; i < n_channels; ++i) {
+        for (auto lhs = result.p_data.hostMem() + i * rows * cols;
+             lhs != result.p_data.hostMem() + (i + 1) * rows * cols; ++lhs)
+            op(*lhs);
+    }
+    return result;
+}
index cbf9076f6b09c7cc7014962a26498d324b346586..c8430ed9f6314c9311efe05d556c2613abec502f 100644 (file)
-#include "complexmat.cuh"
+#include "complexmat.hpp"
 
-__global__ void sqr_norm_kernel(int n, float *out, float *data, float rows, float cols)
+
+__global__ void sqr_norm_kernel(const float *in, float *block_res, int total)
 {
     extern __shared__ float sdata[];
-    int i = blockDim.x * threadIdx.y + threadIdx.x;
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
-
-    sdata[i] = 0;
-    sdata[i] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
-    __syncthreads();
+    int in_idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
+    int i = threadIdx.x;
 
-    for (unsigned int s = (blockDim.x * blockDim.y + 1) / 2, old_s = blockDim.x * blockDim.y; s > 0; s >>= 1) {
+    if (in_idx >= total * 2)
+        sdata[i] = 0;
+    else
+        sdata[i] = in[in_idx] * in[in_idx] + in[in_idx + 1] * in[in_idx + 1];
 
-        if (old_s & 1) s += 1;
-
-        if (i < s && i + s < old_s) {
-            sdata[i] += sdata[i + s];
-        }
-        old_s = s;
+    for (unsigned s = (blockDim.x + 1) / 2; s > 0; s >>= 1) {
         __syncthreads();
+        if (i < s)
+            sdata[i] += sdata[i + s];
     }
 
-    if (i == 0) {
-        atomicAdd(&out[blockId / n], sdata[0] / (rows * cols));
-    }
+    if (i == 0)
+        block_res[blockIdx.x] = sdata[0];
 }
 
-void ComplexMat::sqr_norm(DynMem &result) const
+void ComplexMat_::sqr_norm(DynMem &result) const
 {
-    CudaSafeCall(cudaMemsetAsync(result.deviceMem(), 0, n_scales * sizeof(float), this->stream));
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
+    assert(result.num_elem == n_scales);
 
-    sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows * cols * sizeof(float), this->stream>>>(
-        n_channels / n_scales, result.deviceMem(), this->p_data, rows, cols);
-    CudaCheckError();
+    const uint total = n_channels / n_scales * rows * cols;
+    const dim3 threads(1024);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
 
-    return;
+    DynMem block_res(blocks.x * n_scales);
+
+    for (uint s = 0; s < n_scales; ++s) {
+        sqr_norm_kernel<<<blocks, threads, threads.x * sizeof(float)>>>((const float*)(p_data.deviceMem() + s * total),
+                                                                        block_res.deviceMem() + s * blocks.x, total);
+        CudaCheckError();
+    }
+    cudaSync();
+
+    for (uint s = 0; s < n_scales; ++s) {
+        T res = 0;
+        for (int i = 0; i < blocks.x; i++)
+            res += block_res[s * blocks.x + i];
+        result.hostMem()[s] = res / static_cast<T>(cols * rows);
+    }
 }
 
-__global__ void sqr_mag_kernel(float *data, float *result)
+__global__ void sqr_mag_kernel(const float *data, float *result, int total)
 {
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
+    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
 
-    result[threadId] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1];
-    result[threadId + 1] = 0;
+    if (idx / 2 < total) {
+        result[idx] = data[idx] * data[idx] + data[idx + 1] * data[idx + 1];
+        result[idx + 1] = 0;
+    }
 }
 
-ComplexMat ComplexMat::sqr_mag() const
+ComplexMat_ ComplexMat_::sqr_mag() const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat_ result = ComplexMat_::same_size(*this);
+
+    const uint total = n_channels * rows * cols;
+    const dim3 threads(256);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, result.p_data);
+    sqr_mag_kernel<<<threads, blocks, 0>>>((float*)this->p_data.deviceMem(),
+                                           (float*)result.p_data.deviceMem(),
+                                           total);
     CudaCheckError();
 
     return result;
 }
 
-__global__ void conj_kernel(float *data, float *result)
+__global__ void conj_kernel(const float *data, float *result, int total)
 {
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
+    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
 
-    result[threadId] = data[threadId];
-    result[threadId + 1] = -data[threadId + 1];
+    if (idx / 2 < total) {
+        result[idx] = data[idx];
+        result[idx + 1] = -data[idx + 1];
+    }
 }
 
-ComplexMat ComplexMat::conj() const
+ComplexMat_ ComplexMat_::conj() const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat_ result = ComplexMat_::same_size(*this);
+
+    const uint total = n_channels * rows * cols;
+    const dim3 threads(256);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    conj_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, result.p_data);
+    conj_kernel<<<threads, blocks, 0>>>((float*)this->p_data.deviceMem(), (float*)result.p_data.deviceMem(), total);
     CudaCheckError();
 
     return result;
 }
 
-ComplexMat ComplexMat::sum_over_channels() const
+__global__ static void sum_channels(float *dest, const float *src, uint channels, uint num_channel_elem)
 {
-    //     assert(p_data.size() > 1);
-    ComplexMat result(this->rows, this->cols, 1, this->stream);
-    return result;
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx >= num_channel_elem)
+        return;
+
+    float acc = 0;
+    for (uint i = 0; i < channels; ++i)
+        acc += src[idx + i * num_channel_elem];
+    dest[idx] = acc;
 }
 
-cufftComplex *ComplexMat::get_p_data() const
+ComplexMat_ ComplexMat_::sum_over_channels() const
 {
-    return (cufftComplex *)p_data;
+    assert(p_data.num_elem == n_channels * rows * cols);
+
+    uint n_channels_per_scale = n_channels / n_scales;
+
+    ComplexMat_ result(this->rows, this->cols, 1, n_scales);
+
+    const uint total = rows * cols * 2;
+    const dim3 threads(256);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
+
+    for (uint scale = 0; scale < n_scales; ++scale) {
+        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
+                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * n_channels_per_scale * rows * cols),
+                                          n_channels_per_scale, total);
+    }
+    return result;
 }
 
-__global__ void same_num_channels_mul_kernel(float *data_l, float *data_r, float *result)
+__global__ void same_num_channels_mul_kernel(const float *data_l, const float *data_r, float *result, int total)
 {
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
+    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
 
-    result[threadId] = data_l[threadId] * data_r[threadId] - data_l[threadId + 1] * data_r[threadId + 1];
-    result[threadId + 1] = data_l[threadId] * data_r[threadId + 1] + data_l[threadId + 1] * data_r[threadId];
+    if (idx / 2 < total) {
+        result[idx] = data_l[idx] * data_r[idx] - data_l[idx + 1] * data_r[idx + 1];
+        result[idx + 1] = data_l[idx] * data_r[idx + 1] + data_l[idx + 1] * data_r[idx];
+    }
 }
 
 // element-wise per channel multiplication, division and addition
-ComplexMat ComplexMat::operator*(const ComplexMat &rhs) const
+ComplexMat_ ComplexMat_::operator*(const ComplexMat_ &rhs) const
 {
-    assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
+    assert(n_channels == n_scales * rhs.n_channels && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat_ result = ComplexMat_::same_size(*this);
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data,
-                                                                                  result.p_data);
-    CudaCheckError();
+    const uint total = n_channels / n_scales * rows * cols;
+    const dim3 threads(256);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
+
+    for (uint s = 0; s < n_scales; ++s) {
+        same_num_channels_mul_kernel<<<blocks, threads, 0>>>((float*)(this->p_data.deviceMem() + s * total),
+                                                             (float*)rhs.p_data.deviceMem(),
+                                                             (float*)(result.p_data.deviceMem() + s * total),
+                                                             total);
+        CudaCheckError();
+    }
 
     return result;
 }
 
-__global__ void same_num_channels_div_kernel(float *data_l, float *data_r, float *result)
+__global__ void same_num_channels_div_kernel(const float *data_l, const float *data_r, float *result, unsigned total)
 {
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
+    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
 
-    result[threadId] = (data_l[threadId] * data_r[threadId] + data_l[threadId + 1] * data_r[threadId + 1]) /
-                       (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
-    result[threadId + 1] = (data_l[threadId + 1] * data_r[threadId] - data_l[threadId] * data_r[threadId + 1]) /
-                           (data_r[threadId] * data_r[threadId] + data_r[threadId + 1] * data_r[threadId + 1]);
+    if (idx / 2 < total) {
+        result[idx] = (data_l[idx] * data_r[idx] + data_l[idx + 1] * data_r[idx + 1]) /
+               (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
+        result[idx + 1] = (data_l[idx + 1] * data_r[idx] - data_l[idx] * data_r[idx + 1]) /
+               (data_r[idx] * data_r[idx] + data_r[idx + 1] * data_r[idx + 1]);
+    }
 }
 
-ComplexMat ComplexMat::operator/(const ComplexMat &rhs) const
+ComplexMat_ ComplexMat_::operator/(const ComplexMat_ &rhs) const
 {
     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat_ result = ComplexMat_::same_size(*this);
+
+    const uint total = n_channels * rows * cols;
+    const dim3 threads(256);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data,
-                                                                                  result.p_data);
+    same_num_channels_div_kernel<<<threads, blocks, 0>>>((float*)this->p_data.deviceMem(),
+                                                         (float*)rhs.p_data.deviceMem(),
+                                                         (float*)result.p_data.deviceMem(), total);
     CudaCheckError();
 
     return result;
 }
 
-__global__ void same_num_channels_add_kernel(float *data_l, float *data_r, float *result)
+__global__ void same_num_channels_add_kernel(const float *data_l, const float *data_r, float *result, int total)
 {
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
+    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
 
-    result[threadId] = data_l[threadId] + data_r[threadId];
-    result[threadId + 1] = data_l[threadId + 1] + data_r[threadId + 1];
+    if (idx / 2 < total) {
+        result[idx] = data_l[idx] + data_r[idx];
+        result[idx + 1] = data_l[idx + 1] + data_r[idx + 1];
+    }
 }
 
-ComplexMat ComplexMat::operator+(const ComplexMat &rhs) const
+ComplexMat_ ComplexMat_::operator+(const ComplexMat_ &rhs) const
 {
     assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat_ result = ComplexMat_::same_size(*this);
+
+    const uint total = n_channels * rows * cols;
+    const dim3 threads(256);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data,
-                                                                                  result.p_data);
+    same_num_channels_add_kernel<<<threads, blocks, 0>>>((float*)this->p_data.deviceMem(),
+                                                         (float*)rhs.p_data.deviceMem(),
+                                                         (float*)result.p_data.deviceMem(),
+                                                         total);
     CudaCheckError();
 
     return result;
 }
 
-__global__ void constant_mul_kernel(float *data_l, float constant, float *result)
+__global__ void constant_mul_kernel(const float *data_l, float constant, float *result, int total)
 {
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
+    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
 
-    result[threadId] = data_l[threadId] * constant;
-    result[threadId + 1] = data_l[threadId + 1] * constant;
+    if (idx / 2 < total) {
+        result[idx] = data_l[idx] * constant;
+        result[idx + 1] = data_l[idx + 1] * constant;
+    }
 }
 
-ComplexMat ComplexMat::operator*(const float &rhs) const
+ComplexMat_ ComplexMat_::operator*(const float &rhs) const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat_ result = ComplexMat_::same_size(*this);
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    constant_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs, result.p_data);
+    const uint total = n_channels * rows * cols;
+    const dim3 threads(256);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
+
+    constant_mul_kernel<<<threads, blocks, 0>>>((float*)this->p_data.deviceMem(),
+                                                rhs,
+                                                (float*)result.p_data.deviceMem(),
+                                                total);
     CudaCheckError();
 
     return result;
 }
 
-__global__ void constant_add_kernel(float *data_l, float constant, float *result)
+__global__ void constant_add_kernel(const float *data_l, float constant, float *result, int total)
 {
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
+    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
 
-    result[threadId] = data_l[threadId] + constant;
-    result[threadId + 1] = data_l[threadId + 1];
+    if (idx / 2 < total) {
+        result[idx] = data_l[idx] + constant;
+        result[idx + 1] = data_l[idx + 1];
+    }
 }
 
-ComplexMat ComplexMat::operator+(const float &rhs) const
+ComplexMat_ ComplexMat_::operator+(const float &rhs) const
 {
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat_ result = ComplexMat_::same_size(*this);
+
+    const uint total = n_channels * rows * cols;
+    const dim3 threads(256);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    constant_add_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs, result.p_data);
+    constant_add_kernel<<<threads, blocks, 0>>>((float*)this->p_data.deviceMem(),
+                                                rhs,
+                                                (float*)result.p_data.deviceMem(),
+                                                total);
     CudaCheckError();
 
     return result;
 }
 
-__global__ void one_channel_mul_kernel(float *data_l, float *data_r, float *result)
+__global__ void one_channel_mul_kernel(const float *data_l, const float *data_r, float *result,
+                                       int channel_total, int total)
 {
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
-    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x);
+    int idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
+    int one_ch_idx = idx  % (2 * channel_total);
 
-    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
-    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
+    if (idx / 2 < total) {
+        result[idx] = data_l[idx] * data_r[one_ch_idx] - data_l[idx + 1] * data_r[one_ch_idx + 1];
+        result[idx + 1] = data_l[idx] * data_r[one_ch_idx + 1] + data_l[idx + 1] * data_r[one_ch_idx];
+    }
 }
 
 // multiplying element-wise multichannel by one channel mats (rhs mat is with one channel)
-ComplexMat ComplexMat::mul(const ComplexMat &rhs) const
+ComplexMat_ ComplexMat_::mul(const ComplexMat_ &rhs) const
 {
     assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+    ComplexMat_ result = ComplexMat_::same_size(*this);
+
+    const uint total = n_channels * rows * cols;
+    const dim3 threads(256);
+    const dim3 blocks((total + threads.x - 1) / threads.x);
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
+    one_channel_mul_kernel<<<threads, blocks, 0>>>((float*)this->p_data.deviceMem(),
+                                                   (float*)rhs.p_data.deviceMem(),
+                                                   (float*)result.p_data.deviceMem(),
+                                                   rows * cols, total);
     CudaCheckError();
 
     return result;
 }
 
-__global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
-{
-    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
-    int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
-    int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);
+// __global__ void scales_channel_mul_kernel(float *data_l, float *data_r, float *result)
+// {
+//     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
+//     int idx = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
+//     int one_ch_index = 2 * ((threadIdx.y * blockDim.x) + threadIdx.x + blockIdx.x * blockDim.x * blockDim.y);
 
-    result[threadId] = data_l[threadId] * data_r[one_ch_index] - data_l[threadId + 1] * data_r[one_ch_index + 1];
-    result[threadId + 1] = data_l[threadId] * data_r[one_ch_index + 1] + data_l[threadId + 1] * data_r[one_ch_index];
-}
+//     result[idx] = data_l[idx] * data_r[one_ch_index] - data_l[idx + 1] * data_r[one_ch_index + 1];
+//     result[idx + 1] = data_l[idx] * data_r[one_ch_index + 1] + data_l[idx + 1] * data_r[one_ch_index];
+// }
 
 // multiplying element-wise multichannel by one channel mats (rhs mat is with multiple channel)
-ComplexMat ComplexMat::mul2(const ComplexMat &rhs) const
-{
-    assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);
+// ComplexMat_ ComplexMat_::mul2(const ComplexMat_ &rhs) const
+// {
+//     assert(rhs.n_channels == n_channels / n_scales && rhs.cols == cols && rhs.rows == rows);
 
-    ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
+//     ComplexMat_ result(this->rows, this->cols, this->channels(), this->n_scales);
 
-    dim3 threadsPerBlock(rows, cols);
-    dim3 numBlocks(n_channels / n_scales, n_scales);
-    scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
-    CudaCheckError();
+//     dim3 threadsPerBlock(rows, cols);
+//     dim3 numBlocks(n_channels / n_scales, n_scales);
+//     scales_channel_mul_kernel<<<threads, blocks, 0>>>(this->p_data, rhs.p_data, result.p_data);
+//     CudaCheckError();
 
-    return result;
-}
+//     return result;
+// }
 
-void ComplexMat::operator=(ComplexMat &rhs)
-{
-    cols = rhs.cols;
-    rows = rhs.rows;
-    n_channels = rhs.n_channels;
-    n_scales = rhs.n_scales;
-    stream = rhs.stream;
-    foreign_data = true;
-
-    p_data = rhs.p_data;
-}
+// void ComplexMat_::operator=(ComplexMat_ &&rhs)
+// {
+//     cols = rhs.cols;
+//     rows = rhs.rows;
+//     n_channels = rhs.n_channels;
+//     n_scales = rhs.n_scales;
 
-void ComplexMat::operator=(ComplexMat &&rhs)
-{
-    cols = rhs.cols;
-    rows = rhs.rows;
-    n_channels = rhs.n_channels;
-    n_scales = rhs.n_scales;
-    stream = rhs.stream;
+//     p_data = rhs.p_data;
 
-    p_data = rhs.p_data;
+//     rhs.p_data = nullptr;
+// }
 
-    rhs.p_data = nullptr;
+void ComplexMat_::cudaSync() const
+{
+    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
 }
diff --git a/src/complexmat.cuh b/src/complexmat.cuh
deleted file mode 100644 (file)
index 7f19771..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-#ifndef COMPLEXMAT_H
-#define COMPLEXMAT_H
-
-#include <opencv2/opencv.hpp>
-
-#include "dynmem.hpp"
-#include "cuda_runtime.h"
-#include "cufft.h"
-
-#include "cuda/cuda_error_check.cuh"
-
-class ComplexMat {
-  public:
-    uint cols;
-    uint rows;
-    uint n_channels;
-    uint n_scales = 1;
-    bool foreign_data = false;
-    cudaStream_t stream = nullptr;
-
-    ComplexMat() : cols(0), rows(0), n_channels(0) {}
-    ComplexMat(uint _rows, uint _cols, uint _n_channels, cudaStream_t _stream)
-        : cols(_cols), rows(_rows), n_channels(_n_channels), stream(_stream)
-    {
-        CudaSafeCall(cudaMalloc(&p_data, n_channels * cols * rows * sizeof(cufftComplex)));
-    }
-
-    ComplexMat(uint _rows, uint _cols, uint _n_channels, uint _n_scales, cudaStream_t _stream)
-        : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales), stream(_stream)
-    {
-        CudaSafeCall(cudaMalloc(&p_data, n_channels * cols * rows * sizeof(cufftComplex)));
-    }
-
-    ComplexMat(ComplexMat &&other)
-    {
-        cols = other.cols;
-        rows = other.rows;
-        n_channels = other.n_channels;
-        n_scales = other.n_scales;
-        p_data = other.p_data;
-        stream = other.stream;
-
-        other.p_data = nullptr;
-    }
-
-    ~ComplexMat()
-    {
-        if (p_data != nullptr && !foreign_data) {
-            CudaSafeCall(cudaFree(p_data));
-            p_data = nullptr;
-        }
-    }
-
-    void create(uint _rows, uint _cols, uint _n_channels, cudaStream_t _stream = nullptr)
-    {
-        rows = _rows;
-        cols = _cols;
-        n_channels = _n_channels;
-        stream = _stream;
-        CudaSafeCall(cudaMalloc(&p_data, n_channels * cols * rows * sizeof(cufftComplex)));
-    }
-
-    void create(uint _rows, uint _cols, uint _n_channels, uint _n_scales, cudaStream_t _stream = nullptr)
-    {
-        rows = _rows;
-        cols = _cols;
-        n_channels = _n_channels;
-        n_scales = _n_scales;
-        stream = _stream;
-        CudaSafeCall(cudaMalloc(&p_data, n_channels * cols * rows * sizeof(cufftComplex)));
-    }
-    // cv::Mat API compatibility
-    cv::Size size() { return cv::Size(cols, rows); }
-    int channels() { return n_channels; }
-    int channels() const { return n_channels; }
-
-    void set_stream(cudaStream_t _stream)
-    {
-        stream = _stream;
-        return;
-    }
-
-    void sqr_norm(DynMem &result) const;
-
-    ComplexMat sqr_mag() const;
-
-    ComplexMat conj() const;
-
-    ComplexMat sum_over_channels() const;
-
-    cufftComplex *get_p_data() const;
-
-    // element-wise per channel multiplication, division and addition
-    ComplexMat operator*(const ComplexMat &rhs) const;
-    ComplexMat operator/(const ComplexMat &rhs) const;
-    ComplexMat operator+(const ComplexMat &rhs) const;
-
-    // multiplying or adding constant
-    ComplexMat operator*(const float &rhs) const;
-    ComplexMat operator+(const float &rhs) const;
-
-    // multiplying element-wise multichannel by one channel mats (rhs mat is with one channel)
-    ComplexMat mul(const ComplexMat &rhs) const;
-
-    // multiplying element-wise multichannel by one channel mats (rhs mat is with multiple channel)
-    ComplexMat mul2(const ComplexMat &rhs) const;
-    // text output
-    friend std::ostream &operator<<(std::ostream &os, const ComplexMat &mat)
-    {
-        float *data_cpu = reinterpret_cast<float*>(malloc(mat.rows * mat.cols * mat.n_channels * sizeof(cufftComplex)));
-        CudaSafeCall(cudaMemcpy(data_cpu, mat.p_data, mat.rows * mat.cols * mat.n_channels * sizeof(cufftComplex),
-                                cudaMemcpyDeviceToHost));
-        // for (int i = 0; i < mat.n_channels; ++i){
-        for (int i = 0; i < 1; ++i) {
-            os << "Channel " << i << std::endl;
-            for (uint j = 0; j < mat.rows; ++j) {
-                for (uint k = 0; k < 2 * mat.cols - 2; k += 2)
-                    os << "(" << data_cpu[j * 2 * mat.cols + k] << "," << data_cpu[j * 2 * mat.cols + (k + 1)] << ")"
-                       << ", ";
-                os << "(" << data_cpu[j * 2 * mat.cols + 2 * mat.cols - 2] << ","
-                   << data_cpu[j * 2 * mat.cols + 2 * mat.cols - 1] << ")" << std::endl;
-            }
-        }
-        free(data_cpu);
-        return os;
-    }
-
-    void operator=(ComplexMat &rhs);
-    void operator=(ComplexMat &&rhs);
-
-  private:
-    mutable float *p_data = nullptr;
-};
-
-#endif // COMPLEXMAT_H
index 724ffaa61ef1faddb9d89c929e58bf46ddbf3f29..f6aaef265985e41a0f0bdb46eb6b2176c61843bc 100644 (file)
 #include <algorithm>
 #include <functional>
 #include "dynmem.hpp"
+#include "pragmas.h"
 
-template <typename T> class ComplexMat_ {
+#ifdef CUFFT
+#include <cufft.h>
+#endif
+
+class ComplexMat_ {
   public:
+    typedef float T;
+
     uint cols;
     uint rows;
     uint n_channels;
-    uint n_scales = 1;
+    uint n_scales;
 
-    ComplexMat_() : cols(0), rows(0), n_channels(0) {}
-    ComplexMat_(uint _rows, uint _cols, uint _n_channels) : cols(_cols), rows(_rows), n_channels(_n_channels)
-    {
-        p_data.resize(n_channels * cols * rows);
-    }
-
-    ComplexMat_(uint _rows, uint _cols, uint _n_channels, uint _n_scales)
-        : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales)
-    {
-        p_data.resize(n_channels * cols * rows);
-    }
+    ComplexMat_(uint _rows, uint _cols, uint _n_channels, uint _n_scales = 1)
+        : cols(_cols), rows(_rows), n_channels(_n_channels * _n_scales), n_scales(_n_scales),
+          p_data(n_channels * cols * rows) {}
+    ComplexMat_(cv::Size size, uint _n_channels, uint _n_scales = 1)
+        : cols(size.width), rows(size.height), n_channels(_n_channels * _n_scales), n_scales(_n_scales)
+        , p_data(n_channels * cols * rows) {}
 
     // assuming that mat has 2 channels (real, img)
-    ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1)
+    ComplexMat_(const cv::Mat &mat) : cols(uint(mat.cols)), rows(uint(mat.rows)), n_channels(1), n_scales(1)
+                                    , p_data(n_channels * cols * rows)
     {
-        p_data = convert(mat);
+        cudaSync();
+        memcpy(p_data.hostMem(), mat.ptr<std::complex<T>>(), mat.total() * mat.elemSize());
     }
 
-    void create(uint _rows, uint _cols, uint _n_channels)
+    static ComplexMat_ same_size(const ComplexMat_ &o)
     {
-        rows = _rows;
-        cols = _cols;
-        n_channels = _n_channels;
-        p_data.resize(n_channels * cols * rows);
+        return ComplexMat_(o.rows, o.cols, o.n_channels / o.n_scales, o.n_scales);
     }
 
-    void create(uint _rows, uint _cols, uint _n_channels, uint _n_scales)
-    {
-        rows = _rows;
-        cols = _cols;
-        n_channels = _n_channels;
-        n_scales = _n_scales;
-        p_data.resize(n_channels * cols * rows);
-    }
     // cv::Mat API compatibility
-    cv::Size size() { return cv::Size(cols, rows); }
+    cv::Size size() const { return cv::Size(cols, rows); }
     uint channels() const { return n_channels; }
 
     // assuming that mat has 2 channels (real, imag)
     void set_channel(uint idx, const cv::Mat &mat)
     {
-        assert(idx >= 0 && idx < n_channels);
+        assert(idx < n_channels);
+        cudaSync();
         for (uint i = 0; i < rows; ++i) {
             const std::complex<T> *row = mat.ptr<std::complex<T>>(i);
             for (uint j = 0; j < cols; ++j)
-                p_data[idx * rows * cols + i * cols + j] = row[j];
-        }
-    }
-
-    T sqr_norm() const
-    {
-        int n_channels_per_scale = n_channels / n_scales;
-        T sum_sqr_norm = 0;
-        for (int i = 0; i < n_channels_per_scale; ++i) {
-            for (auto lhs = p_data.begin() + i * rows * cols; lhs != p_data.begin() + (i + 1) * rows * cols; ++lhs)
-                sum_sqr_norm += lhs->real() * lhs->real() + lhs->imag() * lhs->imag();
-        }
-        sum_sqr_norm = sum_sqr_norm / static_cast<T>(cols * rows);
-        return sum_sqr_norm;
-    }
-
-    void sqr_norm(DynMem_<T> &result) const
-    {
-        int n_channels_per_scale = n_channels / n_scales;
-        int scale_offset = n_channels_per_scale * rows * cols;
-        for (uint scale = 0; scale < n_scales; ++scale) {
-            T sum_sqr_norm = 0;
-            for (int i = 0; i < n_channels_per_scale; ++i)
-                for (auto lhs = p_data.begin() + i * rows * cols + scale * scale_offset;
-                     lhs != p_data.begin() + (i + 1) * rows * cols + scale * scale_offset; ++lhs)
-                    sum_sqr_norm += lhs->real() * lhs->real() + lhs->imag() * lhs->imag();
-            result.hostMem()[scale] = sum_sqr_norm / static_cast<T>(cols * rows);
+                p_data.hostMem()[idx * rows * cols + i * cols + j] = row[j];
         }
-        return;
     }
 
-    ComplexMat_<T> sqr_mag() const
-    {
-        return mat_const_operator([](std::complex<T> &c) { c = c.real() * c.real() + c.imag() * c.imag(); });
-    }
+    T sqr_norm() const;
 
-    ComplexMat_<T> conj() const
-    {
-        return mat_const_operator([](std::complex<T> &c) { c = std::complex<T>(c.real(), -c.imag()); });
-    }
+    void sqr_norm(DynMem_<T> &result) const;
 
-    ComplexMat_<T> sum_over_channels() const
-    {
-        assert(p_data.size() > 1);
+    ComplexMat_ sqr_mag() const;
 
-        int n_channels_per_scale = n_channels / n_scales;
-        int scale_offset = n_channels_per_scale * rows * cols;
+    ComplexMat_ conj() const;
 
-        ComplexMat_<T> result(this->rows, this->cols, n_scales);
-        for (uint scale = 0; scale < n_scales; ++scale) {
-            std::copy(p_data.begin() + scale * scale_offset, p_data.begin() + rows * cols + scale * scale_offset,
-                      result.p_data.begin() + scale * rows * cols);
-            for (int i = 1; i < n_channels_per_scale; ++i) {
-                std::transform(result.p_data.begin() + scale * rows * cols,
-                               result.p_data.begin() + (scale + 1) * rows * cols,
-                               p_data.begin() + i * rows * cols + scale * scale_offset,
-                               result.p_data.begin() + scale * rows * cols, std::plus<std::complex<T>>());
-            }
-        }
-        return result;
-    }
+    ComplexMat_ sum_over_channels() const;
 
     // return 2 channels (real, imag) for first complex channel
     cv::Mat to_cv_mat() const
     {
-        assert(p_data.size() >= 1);
+        assert(p_data.num_elem >= 1);
         return channel_to_cv_mat(0);
     }
     // return a vector of 2 channels (real, imag) per one complex channel
@@ -139,46 +85,40 @@ template <typename T> class ComplexMat_ {
         return result;
     }
 
-    std::complex<T> *get_p_data() const { return p_data.data(); }
-
-    // element-wise per channel multiplication, division and addition
-    ComplexMat_<T> operator*(const ComplexMat_<T> &rhs) const
-    {
-        return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
+    std::complex<T> *get_p_data() {
+        cudaSync();
+        return p_data.hostMem();
     }
-    ComplexMat_<T> operator/(const ComplexMat_<T> &rhs) const
-    {
-        return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs /= c_rhs; }, rhs);
-    }
-    ComplexMat_<T> operator+(const ComplexMat_<T> &rhs) const
-    {
-        return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs += c_rhs; }, rhs);
+    const std::complex<T> *get_p_data() const {
+        cudaSync();
+        return p_data.hostMem();
     }
 
+#ifdef CUFFT
+    cufftComplex *get_dev_data() { return (cufftComplex*)p_data.deviceMem(); }
+    const cufftComplex *get_dev_data() const { return (cufftComplex*)p_data.deviceMem(); }
+#endif
+
+    // element-wise per channel multiplication, division and addition
+    ComplexMat_ operator*(const ComplexMat_ &rhs) const;
+    ComplexMat_ operator/(const ComplexMat_ &rhs) const;
+    ComplexMat_ operator+(const ComplexMat_ &rhs) const;
+
     // multiplying or adding constant
-    ComplexMat_<T> operator*(const T &rhs) const
-    {
-        return mat_const_operator([&rhs](std::complex<T> &c) { c *= rhs; });
-    }
-    ComplexMat_<T> operator+(const T &rhs) const
-    {
-        return mat_const_operator([&rhs](std::complex<T> &c) { c += rhs; });
-    }
+    ComplexMat_ operator*(const T &rhs) const;
+    ComplexMat_ operator+(const T &rhs) const;
 
     // multiplying element-wise multichannel by one channel mats (rhs mat is with one channel)
-    ComplexMat_<T> mul(const ComplexMat_<T> &rhs) const
-    {
-        return matn_mat1_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
-    }
+    ComplexMat_ mul(const ComplexMat_ &rhs) const;
 
-    // multiplying element-wise multichannel by one channel mats (rhs mat is with multiple channel)
-    ComplexMat_<T> mul2(const ComplexMat_<T> &rhs) const
+    // multiplying element-wise multichannel mats - same as operator*(ComplexMat), but without allocating memory for the result
+    ComplexMat_ muln(const ComplexMat_ &rhs) const
     {
-        return matn_mat2_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
+        return mat_mat_operator([](std::complex<T> &c_lhs, const std::complex<T> &c_rhs) { c_lhs *= c_rhs; }, rhs);
     }
 
     // text output
-    friend std::ostream &operator<<(std::ostream &os, const ComplexMat_<T> &mat)
+    friend std::ostream &operator<<(std::ostream &os, const ComplexMat_ &mat)
     {
         // for (int i = 0; i < mat.n_channels; ++i){
         for (int i = 0; i < 1; ++i) {
@@ -193,7 +133,7 @@ template <typename T> class ComplexMat_ {
     }
 
   private:
-    mutable std::vector<std::complex<T>> p_data;
+    DynMem_<std::complex<T>> p_data;
 
     // convert 2 channel mat (real, imag) to vector row-by-row
     std::vector<std::complex<T>> convert(const cv::Mat &mat)
@@ -209,64 +149,13 @@ template <typename T> class ComplexMat_ {
         return result;
     }
 
-    ComplexMat_<T> mat_mat_operator(void (*op)(std::complex<T> &c_lhs, const std::complex<T> &c_rhs),
-                                    const ComplexMat_<T> &mat_rhs) const
-    {
-        assert(mat_rhs.n_channels == n_channels && mat_rhs.cols == cols && mat_rhs.rows == rows);
-
-        ComplexMat_<T> result = *this;
-        for (uint i = 0; i < n_channels; ++i) {
-            auto lhs = result.p_data.begin() + i * rows * cols;
-            auto rhs = mat_rhs.p_data.begin() + i * rows * cols;
-            for (; lhs != result.p_data.begin() + (i + 1) * rows * cols; ++lhs, ++rhs)
-                op(*lhs, *rhs);
-        }
-
-        return result;
-    }
-    ComplexMat_<T> matn_mat1_operator(void (*op)(std::complex<T> &c_lhs, const std::complex<T> &c_rhs),
-                                      const ComplexMat_<T> &mat_rhs) const
-    {
-        assert(mat_rhs.n_channels == 1 && mat_rhs.cols == cols && mat_rhs.rows == rows);
-
-        ComplexMat_<T> result = *this;
-        for (uint i = 0; i < n_channels; ++i) {
-            auto lhs = result.p_data.begin() + i * rows * cols;
-            auto rhs = mat_rhs.p_data.begin();
-            for (; lhs != result.p_data.begin() + (i + 1) * rows * cols; ++lhs, ++rhs)
-                op(*lhs, *rhs);
-        }
-
-        return result;
-    }
-    ComplexMat_<T> matn_mat2_operator(void (*op)(std::complex<T> &c_lhs, const std::complex<T> &c_rhs),
-                                      const ComplexMat_<T> &mat_rhs) const
-    {
-        assert(mat_rhs.n_channels == n_channels / n_scales && mat_rhs.cols == cols && mat_rhs.rows == rows);
-
-        int n_channels_per_scale = n_channels / n_scales;
-        int scale_offset = n_channels_per_scale * rows * cols;
-        ComplexMat_<T> result = *this;
-        for (uint i = 0; i < n_scales; ++i) {
-            for (int j = 0; j < n_channels_per_scale; ++j) {
-                auto lhs = result.p_data.begin() + (j * rows * cols) + (i * scale_offset);
-                auto rhs = mat_rhs.p_data.begin() + (j * rows * cols);
-                for (; lhs != result.p_data.begin() + ((j + 1) * rows * cols) + (i * scale_offset); ++lhs, ++rhs)
-                    op(*lhs, *rhs);
-            }
-        }
-
-        return result;
-    }
-    ComplexMat_<T> mat_const_operator(const std::function<void(std::complex<T> &c_rhs)> &op) const
-    {
-        ComplexMat_<T> result = *this;
-        for (uint i = 0; i < n_channels; ++i)
-            for (auto lhs = result.p_data.begin() + i * rows * cols;
-                 lhs != result.p_data.begin() + (i + 1) * rows * cols; ++lhs)
-                op(*lhs);
-        return result;
-    }
+    ComplexMat_ mat_mat_operator(void (*op)(std::complex<T> &c_lhs, const std::complex<T> &c_rhs),
+                                 const ComplexMat_ &mat_rhs) const;
+    ComplexMat_ matn_mat1_operator(void (*op)(std::complex<T> &c_lhs, const std::complex<T> &c_rhs),
+                                   const ComplexMat_ &mat_rhs) const;
+    ComplexMat_ matn_mat2_operator(void (*op)(std::complex<T> &c_lhs, const std::complex<T> &c_rhs),
+                                   const ComplexMat_ &mat_rhs) const;
+    ComplexMat_ mat_const_operator(const std::function<void(std::complex<T> &c_rhs)> &op) const;
 
     cv::Mat channel_to_cv_mat(int channel_id) const
     {
@@ -279,8 +168,14 @@ template <typename T> class ComplexMat_ {
         }
         return result;
     }
+
+#ifdef CUFFT
+    void cudaSync() const;
+#else
+    void cudaSync() const {}
+#endif
 };
 
-typedef ComplexMat_<float> ComplexMat;
+typedef ComplexMat_ ComplexMat;
 
 #endif // COMPLEX_MAT_HPP_213123048309482094
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
deleted file mode 100644 (file)
index 7441c91..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-cmake_minimum_required(VERSION 2.8)
-
-set(CUDA_ERROR_CHECK cuda_error_check.cuh)
similarity index 65%
rename from src/cuda/cuda_error_check.cuh
rename to src/cuda_error_check.hpp
index 13c2f7dada631e54e79bc0c8fff9ea848b5810d7..5b61c65dca1887702332a41afe5389bb8785923b 100644 (file)
@@ -4,15 +4,15 @@
 //Code taken from https://codeyarns.com/2011/03/02/how-to-do-error-checking-in-cuda/
 #include <iostream>
 
-#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
+#define CudaSafeCall( err ) __cudaSafeCall( err, #err, __FILE__, __LINE__ )
 #define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
 
-static inline void __cudaSafeCall( cudaError err, const char *file, const int line )
+static inline void __cudaSafeCall( cudaError err, const char *text, const char *file, const int line )
 {
     if ( cudaSuccess != err )
     {
-        fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
-                 file, line, cudaGetErrorString( err ) );
+        fprintf( stderr, "%s:%i: %s: %s\n",
+                 file, line, text, cudaGetErrorString( err ) );
         exit( -1 );
     }
 
@@ -104,9 +104,10 @@ static inline const char *_cudaGetErrorEnum(cufftResult error)
     return "<unknown>";
 }
 
-#define CufftErrorCheck(call) __cufftErrorCheck(call, __FILE__, __LINE__ )
+// This uses C++ overload resolution to print correct error message based on call type
+#define cudaErrorCheck(call) __cudaErrorCheck(call, __FILE__, __LINE__ )
 
-static inline void __cufftErrorCheck(cufftResult_t call, const char *file, const int line )
+static inline void __cudaErrorCheck(cufftResult_t call, const char *file, const int line )
 {
     if (call != CUFFT_SUCCESS) {
         fprintf(stderr, "cuFFT error %d:%s at %s:%d\n", call, _cudaGetErrorEnum(call), file, line);
@@ -115,6 +116,34 @@ static inline void __cufftErrorCheck(cufftResult_t call, const char *file, const
 
     return;
 }
+
+static inline void __cudaErrorCheck(cublasStatus_t call, const char *file, const int line )
+{
+    if (call != CUBLAS_STATUS_SUCCESS) {
+        const char *status_str;
+        switch (call) {
+#define CUBLAS_STATUS_STR(SYM) case CUBLAS_STATUS_ ## SYM: status_str = #SYM; break
+            CUBLAS_STATUS_STR(SUCCESS);
+            CUBLAS_STATUS_STR(NOT_INITIALIZED);
+            CUBLAS_STATUS_STR(ALLOC_FAILED);
+            CUBLAS_STATUS_STR(INVALID_VALUE);
+            CUBLAS_STATUS_STR(ARCH_MISMATCH);
+            CUBLAS_STATUS_STR(MAPPING_ERROR);
+            CUBLAS_STATUS_STR(EXECUTION_FAILED);
+            CUBLAS_STATUS_STR(INTERNAL_ERROR);
+            CUBLAS_STATUS_STR(NOT_SUPPORTED);
+            CUBLAS_STATUS_STR(LICENSE_ERROR);
+        default:
+            status_str = "???";
+#undef CUBLAS_STATUS_STR
+        }
+        fprintf(stderr, "cuBLAS error %d (%s) at %s:%d\n", call, status_str, file, line);
+        exit(-1);
+    }
+
+    return;
+}
+
 #endif
 
 #endif
diff --git a/src/cuda_functions.cu b/src/cuda_functions.cu
deleted file mode 100644 (file)
index 0ec819d..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-#include "cuda_functions.cuh"
-
-__global__ void gaussian_correlation_kernel(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm,
-                                            int rows, int cols, int channels_per_scale, double sigma)
-{
-    extern __shared__ float sdata[];
-    int blockId = blockIdx.y * gridDim.x + blockIdx.x;
-    int threadId = blockId * (blockDim.x + channels_per_scale / 2) + threadIdx.x;
-
-    sdata[threadIdx.x] = 0;
-    sdata[threadIdx.x] = data_in[threadId] + data_in[threadId + blockDim.x];
-    __syncthreads();
-
-    for (unsigned int s = (channels_per_scale / 2 + 1) / 2, old_s = channels_per_scale / 2; s > 0; s >>= 1) {
-
-        if (old_s & 1) s += 1;
-
-        if (threadIdx.x < s && threadIdx.x + s < old_s) {
-            sdata[threadIdx.x] += sdata[threadIdx.x + s];
-        }
-        old_s = s;
-        __syncthreads();
-    }
-
-    if (threadIdx.x == 0) {
-        float accumulate_res = sdata[0] / (rows * cols);
-
-        float numel_xf_inv = 1.f / ((cols / 2 + 1) * rows * (channels_per_scale));
-
-        float tmp = (xf_sqr_norm[blockIdx.x] + yf_sqr_norm[0] - 2 * accumulate_res) * numel_xf_inv;
-
-        if (tmp > 0) {
-            data_out[blockIdx.x * rows * cols + blockIdx.y] = expf(-1.f / (sigma * sigma) * tmp);
-        } else {
-            data_out[blockIdx.x * rows * cols + blockIdx.y] = expf(0);
-        }
-    }
-}
-
-void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma,
-                               int n_channels, int n_scales, int rows, int cols, cudaStream_t stream)
-{
-    dim3 threadsPerBlock((n_channels / n_scales) / 2);
-    dim3 numBlocks(n_scales, rows * cols);
-
-    gaussian_correlation_kernel<<<numBlocks, threadsPerBlock, ((n_channels / n_scales) / 2) * sizeof(float), stream>>>(
-        data_in, data_out, xf_sqr_norm, yf_sqr_norm, rows, cols, n_channels / n_scales, sigma);
-    CudaCheckError();
-
-    //    float *data_cpu = (float*) malloc(rows*cols*n_scales*sizeof(float));
-    //    CudaSafeCall(cudaMemcpy(data_cpu, data_out, rows*cols*n_scales*sizeof(float), cudaMemcpyDeviceToHost));
-    //    for (int j = 0; j < rows*n_scales; ++j) {
-    //                for (int k = 0; k < cols-1; ++k)
-    //                   std::cout  << data_cpu[j*cols  + k]  << ", ";
-    //                std::cout << data_cpu[j*cols + cols-1] <<  std::endl;
-    //            }
-    //    std::cout << std::endl << std::endl;
-    //    free(data_cpu);
-    return;
-}
diff --git a/src/cuda_functions.cuh b/src/cuda_functions.cuh
deleted file mode 100644 (file)
index 000ab57..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef CUDA_FUNCTIONS_H
-#define CUDA_FUNCTIONS_H
-
-#include "cuda_runtime.h"
-#include "cuda/cuda_error_check.cuh"
-
-void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma,
-                               int n_channels, int n_scales, int rows, int cols, cudaStream_t stream);
-
-#endif
diff --git a/src/debug.cpp b/src/debug.cpp
new file mode 100644 (file)
index 0000000..4f21a39
--- /dev/null
@@ -0,0 +1,31 @@
+#include "debug.h"
+#include <string>
+
+std::ostream &operator<<(std::ostream &os, const DbgTracer::Printer<cv::Mat> &p)
+{
+    IOSave s(os);
+    os << std::setprecision(DbgTracer::precision);
+    os << p.obj.size << " " << p.obj.channels() << "ch ";// << static_cast<const void *>(p.obj.data);
+    os << " = [ ";
+    const size_t num = 10; //p.obj.total();
+    for (size_t i = 0; i < std::min(num, p.obj.total()); ++i)
+        os << p.obj.ptr<float>()[i] << ", ";
+    os << (num < p.obj.total() ? "... ]" : "]");
+    return os;
+}
+
+std::ostream &operator<<(std::ostream &os, const DbgTracer::Printer<ComplexMat> &p)
+{
+    IOSave s(os);
+    os << std::setprecision(DbgTracer::precision);
+    os << "<cplx> " << p.obj.size() << " " << p.obj.channels() << "ch "; // << p.obj.get_p_data();
+    const int num = 10; //p.obj.rows * p.obj.cols * p.obj.n_channels / p.obj.n_scales;
+    for (uint s = 0; s < p.obj.n_scales; ++s) {
+        uint ofs = s * p.obj.rows * p.obj.cols * p.obj.n_channels / p.obj.n_scales;
+        os << " = [ ";
+        for (int i = 0; i < std::min(num, p.obj.size().area()); ++i)
+            os << p.obj.get_p_data()[ofs + i] << ", ";
+        os << (num < p.obj.size().area() ? "... ]" : "]");
+    }
+    return os;
+}
diff --git a/src/debug.h b/src/debug.h
new file mode 100644 (file)
index 0000000..cce8327
--- /dev/null
@@ -0,0 +1,143 @@
+#ifndef DEBUG_H
+#define DEBUG_H
+
+#include <ios>
+#include <iomanip>
+#include <stdarg.h>
+#include <stdio.h>
+#include <opencv2/opencv.hpp>
+#include "dynmem.hpp"
+#include "complexmat.hpp"
+
+#ifdef CUFFT
+#include <cufft.h>
+#include "nvToolsExt.h"
+#endif
+
+
+class IOSave
+{
+    std::ios&           stream;
+    std::ios::fmtflags  flags;
+    std::streamsize     precision;
+    char                fill;
+public:
+    IOSave( std::ios& userStream )
+        : stream( userStream )
+        , flags( userStream.flags() )
+        , precision( userStream.precision() )
+        , fill( userStream.fill() )
+    {
+    }
+    ~IOSave()
+    {
+        stream.flags( flags );
+        stream.precision( precision );
+        stream.fill( fill );
+    }
+};
+
+class DbgTracer {
+    int indentLvl = 0;
+
+  public:
+    bool debug = false;
+    static constexpr int precision = 2;
+
+    std::string indent() { return std::string(indentLvl * 4, ' '); }
+
+    class FTrace {
+        DbgTracer &t;
+        const char *funcName;
+
+      public:
+        FTrace(DbgTracer &dt, const char *fn, const char *format, ...) : t(dt), funcName(fn)
+        {
+#ifdef CUFFT
+            nvtxRangePushA(fn);
+#endif
+            if (!t.debug) return;
+            char *arg;
+            va_list vl;
+            va_start(vl, format);
+            if (-1 == vasprintf(&arg, format, vl))
+                throw std::runtime_error("vasprintf error");
+            va_end(vl);
+
+            std::cerr << t.indent() << funcName << "(" << arg << ") {" << std::endl;
+            dt.indentLvl++;
+        }
+        ~FTrace()
+        {
+#ifdef CUFFT
+            nvtxRangePop();
+#endif
+            if (!t.debug) return;
+            t.indentLvl--;
+            std::cerr << t.indent() << "}" << std::endl;
+        }
+    };
+
+    template <typename T>
+    void traceVal(const char *name, const T& obj, int line, bool always = false)
+    {
+        (void)line;
+        if (debug || always) {
+            IOSave s(std::cerr);
+            std::cerr << std::setprecision(precision);
+            std::cerr << indent() << name /*<< " @" << line */ << " " << print(obj) << std::endl;
+        }
+    }
+
+    template <typename T> struct Printer {
+        const T &obj;
+        Printer(const T &_obj) : obj(_obj) {}
+    };
+
+    template <typename T> Printer<T> print(const T& obj) { return Printer<T>(obj); }
+    Printer<cv::Mat> print(const MatScales& obj) { return Printer<cv::Mat>(obj); }
+    Printer<cv::Mat> print(const MatFeats& obj) { return Printer<cv::Mat>(obj); }
+    Printer<cv::Mat> print(const MatScaleFeats& obj) { return Printer<cv::Mat>(obj); }
+};
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const DbgTracer::Printer<T> &p)
+{
+    os << p.obj;
+    return os;
+}
+
+#if CV_MAJOR_VERSION == 3 && CV_MINOR_VERSION < 3
+static inline std::ostream &operator<<(std::ostream &out, const cv::MatSize &msize)
+{
+    int i, dims = msize.p[-1];
+    for (i = 0; i < dims; i++) {
+        out << msize.p[i];
+        if (i < dims - 1)
+            out << " x ";
+    }
+    return out;
+}
+#endif
+
+std::ostream &operator<<(std::ostream &os, const DbgTracer::Printer<cv::Mat> &p);
+
+#if defined(CUFFT)
+static inline std::ostream &operator<<(std::ostream &os, const cufftComplex &p)
+{
+    (void)p; // TODO
+    return os;
+}
+#endif
+
+std::ostream &operator<<(std::ostream &os, const DbgTracer::Printer<ComplexMat> &p);
+
+extern DbgTracer __dbgTracer;
+
+#define TRACE(...) const DbgTracer::FTrace __tracer(__dbgTracer, __PRETTY_FUNCTION__, ##__VA_ARGS__)
+
+#define DEBUG_PRINT(obj) __dbgTracer.traceVal(#obj, (obj), __LINE__)
+#define DEBUG_PRINTM(obj) DEBUG_PRINT(obj)
+#define PRINT(obj) __dbgTracer.traceVal(#obj, (obj), __LINE__, true)
+
+#endif // DEBUG_H
index a99cba74336416845cbc05b417e5a731761187c5..f45609419b4f4883960e3b8496f90a64e037915f 100644 (file)
 #define DYNMEM_HPP
 
 #include <cstdlib>
+#include <opencv2/opencv.hpp>
+#include <cassert>
+#include <numeric>
 
 #if defined(CUFFT) || defined(CUFFTW)
 #include "cuda_runtime.h"
 #ifdef CUFFT
-#include "cuda/cuda_error_check.cuh"
+#include "cuda_error_check.hpp"
 #endif
 #endif
 
 template <typename T> class DynMem_ {
-    T *ptr = nullptr;
+  private:
+    T *ptr_h = nullptr;
+#ifdef CUFFT
     T *ptr_d = nullptr;
-
+#endif
   public:
-    DynMem_()
-    {}
-    DynMem_(size_t size)
+    typedef T value_type;
+    const size_t num_elem;
+
+    DynMem_(size_t num_elem) : num_elem(num_elem)
     {
 #ifdef CUFFT
-        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&this->ptr), size, cudaHostAllocMapped));
-        CudaSafeCall(
-            cudaHostGetDevicePointer(reinterpret_cast<void **>(&this->ptr_d), reinterpret_cast<void *>(this->ptr), 0));
+        CudaSafeCall(cudaHostAlloc(reinterpret_cast<void **>(&ptr_h), num_elem * sizeof(T), cudaHostAllocMapped));
+        CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void **>(&ptr_d), reinterpret_cast<void *>(ptr_h), 0));
 #else
-        this->ptr = new float[size];
+        ptr_h = new T[num_elem];
 #endif
     }
-    DynMem_(DynMem_&& other) {
-        this->ptr = other.ptr;
-        this->ptr_d = other.ptr_d;
-
-        other.ptr = nullptr;
+    DynMem_(const DynMem_ &other) : DynMem_(other.num_elem)
+    {
+        memcpy(ptr_h, other.ptr_h, num_elem * sizeof(T));
+    }
+    DynMem_(DynMem_ &&other) : num_elem(other.num_elem)
+    {
+        ptr_h = other.ptr_h;
+        other.ptr_h = nullptr;
+#ifdef CUFFT
+        ptr_d = other.ptr_d;
         other.ptr_d = nullptr;
+#endif
     }
     ~DynMem_()
     {
+        release();
+    }
+    T *hostMem() { return ptr_h; }
+    const T *hostMem() const { return ptr_h; }
 #ifdef CUFFT
-        CudaSafeCall(cudaFreeHost(this->ptr));
-#else
-        delete this->ptr;
+    T *deviceMem() { return ptr_d; }
+    const T *deviceMem() const { return ptr_d; }
 #endif
+    void operator=(DynMem_ &rhs) {
+        assert(num_elem == rhs.num_elem);
+        memcpy(ptr_h, rhs.ptr_h, num_elem * sizeof(T));
     }
-    T *hostMem() { return ptr; }
-    T *deviceMem() { return ptr_d; }
-
     void operator=(DynMem_ &&rhs)
     {
-        this->ptr = rhs.ptr;
-        this->ptr_d = rhs.ptr_d;
-
-        rhs.ptr = nullptr;
+        assert(num_elem == rhs.num_elem);
+        release();
+        ptr_h = rhs.ptr_h;
+        rhs.ptr_h = nullptr;
+#ifdef CUFFT
+        ptr_d = rhs.ptr_d;
         rhs.ptr_d = nullptr;
+#endif
+    }
+    T operator[](uint i) const { return ptr_h[i]; }
+private:
+    void release()
+    {
+#ifdef CUFFT
+        CudaSafeCall(cudaFreeHost(ptr_h));
+#else
+        delete[] ptr_h;
+#endif
     }
 };
+
 typedef DynMem_<float> DynMem;
+
+
+class MatDynMem : public DynMem, public cv::Mat {
+  public:
+    MatDynMem(cv::Size size, int type)
+        : DynMem(size.area() * CV_MAT_CN(type)), cv::Mat(size, type, hostMem())
+    {
+        assert((type & CV_MAT_DEPTH_MASK) == CV_32F);
+    }
+    MatDynMem(int height, int width, int type)
+        : DynMem(width * height * CV_MAT_CN(type)), cv::Mat(height, width, type, hostMem())
+    {
+        assert((type & CV_MAT_DEPTH_MASK) == CV_32F);
+    }
+    MatDynMem(int ndims, const int *sizes, int type)
+        : DynMem(volume(ndims, sizes) * CV_MAT_CN(type)), cv::Mat(ndims, sizes, type, hostMem())
+    {
+        assert((type & CV_MAT_DEPTH_MASK) == CV_32F);
+    }
+    MatDynMem(std::vector<int> size, int type)
+        : DynMem(std::accumulate(size.begin(), size.end(), 1, std::multiplies<int>()))
+        , cv::Mat(size.size(), size.data(), type, hostMem()) {}
+    MatDynMem(MatDynMem &&other) = default;
+    MatDynMem(const cv::Mat &other)
+        : DynMem(other.total()) , cv::Mat(other) {}
+
+    void operator=(const cv::MatExpr &expr) {
+        static_cast<cv::Mat>(*this) = expr;
+    }
+
+  private:
+    static int volume(int ndims, const int *sizes)
+    {
+        int vol = 1;
+        for (int i = 0; i < ndims; i++)
+            vol *= sizes[i];
+        return vol;
+    }
+
+    using cv::Mat::create;
+};
+
+class Mat3d : public MatDynMem
+{
+public:
+    Mat3d(uint dim0, cv::Size size) : MatDynMem({{int(dim0), size.height, size.width}}, CV_32F) {}
+
+    cv::Mat plane(uint idx) {
+        assert(dims == 3);
+        assert(int(idx) < size[0]);
+        return cv::Mat(size[1], size[2], cv::Mat::type(), ptr(idx));
+    }
+    const cv::Mat plane(uint idx) const {
+        assert(dims == 3);
+        assert(int(idx) < size[0]);
+        return cv::Mat(size[1], size[2], cv::Mat::type(), const_cast<uchar*>(ptr(idx)));
+    }
+
+};
+
+class MatFeats : public Mat3d
+{
+public:
+    MatFeats(uint num_features, cv::Size size) : Mat3d(num_features, size) {}
+};
+class MatScales : public Mat3d
+{
+public:
+    MatScales(uint num_scales, cv::Size size) : Mat3d(num_scales, size) {}
+};
+
+class MatScaleFeats : public MatDynMem
+{
+public:
+    MatScaleFeats(uint num_scales, uint num_features, cv::Size size)
+        : MatDynMem({{int(num_scales), int(num_features), size.height, size.width}}, CV_32F) {}
+
+    cv::Mat plane(uint scale, uint feature) {
+        assert(dims == 4);
+        assert(int(scale) < size[0]);
+        assert(int(feature) < size[1]);
+        return cv::Mat(size[2], size[3], cv::Mat::type(), ptr(scale, feature));
+    }
+    cv::Mat scale(uint scale) {
+        assert(dims == 4);
+        assert(int(scale) < size[0]);
+        return cv::Mat(3, std::vector<int>({size[1], size[2], size[3]}).data(), cv::Mat::type(), ptr(scale));
+    }
+};
+
 #endif // DYNMEM_HPP
index 570e5fd1c160e06132e52234a21977e680fc19da..1fe90d05c4bd0273151ebad1030237375cd1bab6 100644 (file)
@@ -1,7 +1,98 @@
 
 #include "fft.h"
+#include <cassert>
+#include "debug.h"
 
 Fft::~Fft()
 {
 
 }
+
+void Fft::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
+{
+    m_width = width;
+    m_height = height;
+    m_num_of_feats = num_of_feats;
+#ifdef BIG_BATCH
+    m_num_of_scales = num_of_scales;
+#else
+    (void)num_of_scales;
+#endif
+}
+
+void Fft::set_window(const MatDynMem &window)
+{
+    assert(window.dims == 2);
+    assert(window.size().width == int(m_width));
+    assert(window.size().height == int(m_height));
+    (void)window;
+}
+
+void Fft::forward(const MatScales &real_input, ComplexMat &complex_result)
+{
+    TRACE("");
+    DEBUG_PRINT(real_input);
+    assert(real_input.dims == 3);
+#ifdef BIG_BATCH
+    assert(real_input.size[0] == 1 || real_input.size[0] == int(m_num_of_scales));
+#else
+    assert(real_input.size[0] == 1);
+#endif
+    assert(real_input.size[1] == int(m_height));
+    assert(real_input.size[2] == int(m_width));
+
+    assert(int(complex_result.cols) == freq_size(cv::Size(m_width, m_height)).width);
+    assert(int(complex_result.rows) == freq_size(cv::Size(m_width, m_height)).height);
+    assert(complex_result.channels() == uint(real_input.size[0]));
+
+    (void)real_input;
+    (void)complex_result;
+}
+
+void Fft::forward_window(MatScaleFeats &patch_feats, ComplexMat &complex_result, MatScaleFeats &tmp)
+{
+        assert(patch_feats.dims == 4);
+#ifdef BIG_BATCH
+        assert(patch_feats.size[0] == 1 || patch_feats.size[0] ==  int(m_num_of_scales));
+#else
+        assert(patch_feats.size[0] == 1);
+#endif
+        assert(patch_feats.size[1] == int(m_num_of_feats));
+        assert(patch_feats.size[2] == int(m_height));
+        assert(patch_feats.size[3] == int(m_width));
+
+        assert(tmp.dims == patch_feats.dims);
+        assert(tmp.size[0] == patch_feats.size[0]);
+        assert(tmp.size[1] == patch_feats.size[1]);
+        assert(tmp.size[2] == patch_feats.size[2]);
+        assert(tmp.size[3] == patch_feats.size[3]);
+
+        assert(int(complex_result.cols) == freq_size(cv::Size(m_width, m_height)).width);
+        assert(int(complex_result.rows) == freq_size(cv::Size(m_width, m_height)).height);
+        assert(complex_result.channels() == uint(patch_feats.size[0] * patch_feats.size[1]));
+
+        (void)patch_feats;
+        (void)complex_result;
+        (void)tmp;
+}
+
+void Fft::inverse(ComplexMat &complex_input, MatScales &real_result)
+{
+    TRACE("");
+    DEBUG_PRINT(complex_input);
+    assert(real_result.dims == 3);
+#ifdef BIG_BATCH
+        assert(real_result.size[0] == 1 || real_result.size[0] ==  int(m_num_of_scales));
+#else
+        assert(real_result.size[0] == 1);
+#endif
+    assert(real_result.size[1] == int(m_height));
+    assert(real_result.size[2] == int(m_width));
+
+    assert(int(complex_input.cols) == freq_size(cv::Size(m_width, m_height)).width);
+    assert(int(complex_input.rows) == freq_size(cv::Size(m_width, m_height)).height);
+    assert(complex_input.channels() == uint(real_result.size[0]));
+
+    (void)complex_input;
+    (void)real_result;
+}
index 9f2c6c6491b14c51f46d2d70f24f13e43a40e10d..f242a265d925df7040ec375bc750e440878a0654 100644 (file)
--- a/src/fft.h
+++ b/src/fft.h
@@ -4,29 +4,41 @@
 
 #include <opencv2/opencv.hpp>
 #include <vector>
-#include "threadctx.hpp"
-
-#ifdef CUFFT
-    #include "complexmat.cuh"
-#else
-    #include "complexmat.hpp"
-#endif
+#include <cassert>
+#include "complexmat.hpp"
 
 #ifdef BIG_BATCH
 #define BIG_BATCH_MODE 1
+#define IF_BIG_BATCH(true, false) true
 #else
 #define BIG_BATCH_MODE 0
+#define IF_BIG_BATCH(true, false) false
 #endif
 
 class Fft
 {
 public:
-    virtual void init(unsigned width, unsigned height,unsigned num_of_feats, unsigned num_of_scales) = 0;
-    virtual void set_window(const cv::Mat & window) = 0;
-    virtual void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t  stream) = 0;
-    virtual void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) = 0;
-    virtual void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) = 0;
+    virtual void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales);
+    virtual void set_window(const MatDynMem &window);
+    virtual void forward(const MatScales &real_input, ComplexMat &complex_result);
+    virtual void forward_window(MatScaleFeats &patch_feats_in, ComplexMat &complex_result, MatScaleFeats &tmp);
+    virtual void inverse(ComplexMat &complex_input, MatScales &real_result);
     virtual ~Fft() = 0;
+
+    static cv::Size freq_size(cv::Size space_size)
+    {
+        cv::Size ret(space_size);
+#if defined(CUFFT) || defined(FFTW)
+        ret.width = space_size.width / 2 + 1;
+#endif
+        return ret;
+    }
+
+protected:
+    unsigned m_width, m_height, m_num_of_feats;
+#ifdef BIG_BATCH
+    unsigned m_num_of_scales;
+#endif
 };
 
 #endif // FFT_H
index 9200586c2d6bbc948ee46c897e6cddfc3a133cd9..5c0da667c6916ac78582b5dd9a4578b5cfc85a6d 100644 (file)
 #include "fft_cufft.h"
 
+cuFFT::cuFFT()
+{
+    CudaSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
+    cudaErrorCheck(cublasCreate(&cublas));
+    cudaErrorCheck(cublasSetStream(cublas, cudaStreamPerThread));
+}
+
+cufftHandle cuFFT::create_plan_fwd(uint howmany) const
+{
+    int rank = 2;
+    int n[] = {(int)m_height, (int)m_width};
+    int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
+    int istride = 1, ostride = 1;
+    int *inembed = n, onembed[] = {(int)m_height, (int)m_width / 2 + 1};
+
+    cufftHandle plan;
+    cudaErrorCheck(cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, howmany));
+    cudaErrorCheck(cufftSetStream(plan, cudaStreamPerThread));
+    return plan;
+}
+
+cufftHandle cuFFT::create_plan_inv(uint howmany) const
+{
+    int rank = 2;
+    int n[] = {(int)m_height, (int)m_width};
+    int idist = m_height * (m_width / 2 + 1), odist = m_height * m_width;
+    int istride = 1, ostride = 1;
+    int inembed[] = {(int)m_height, (int)m_width / 2 + 1}, *onembed = n;
+
+    cufftHandle plan;
+    cudaErrorCheck(cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2R, howmany));
+    cudaErrorCheck(cufftSetStream(plan, cudaStreamPerThread));
+    return plan;
+}
+
+
 void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
 {
-    m_width = width;
-    m_height = height;
-    m_num_of_feats = num_of_feats;
-    m_num_of_scales = num_of_scales;
+    Fft::init(width, height, num_of_feats, num_of_scales);
 
     std::cout << "FFT: cuFFT" << std::endl;
 
-    // FFT forward one scale
-    {
-        CufftErrorCheck(cufftPlan2d(&plan_f, int(m_height), int(m_width), CUFFT_R2C));
-    }
-#ifdef BIG_BATCH
-    // FFT forward all scales
-    if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_scales;
-        int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
-        int istride = 1, ostride = 1;
-        int *inembed = n, onembed[] = {(int)m_height, (int)m_width / 2 + 1};
-
-        CufftErrorCheck(cufftPlanMany(&plan_f_all_scales, rank, n, inembed, istride, idist, onembed, ostride, odist,
-                                      CUFFT_R2C, howmany));
-    }
-#endif
-    // FFT forward window one scale
-    {
-        int rank = 2;
-        int n[] = {int(m_height), int(m_width)};
-        int howmany = int(m_num_of_feats);
-        int idist = int(m_height * m_width), odist = int(m_height * (m_width / 2 + 1));
-        int istride = 1, ostride = 1;
-        int *inembed = n, onembed[] = {int(m_height), int(m_width / 2 + 1)};
-
-        CufftErrorCheck(
-            cufftPlanMany(&plan_fw, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, howmany));
-    }
-#ifdef BIG_BATCH
-    // FFT forward window all scales all feats
-    if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_scales * m_num_of_feats;
-        int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
-        int istride = 1, ostride = 1;
-        int *inembed = n, onembed[] = {(int)m_height, (int)m_width / 2 + 1};
-
-        CufftErrorCheck(cufftPlanMany(&plan_fw_all_scales, rank, n, inembed, istride, idist, onembed, ostride, odist,
-                                      CUFFT_R2C, howmany));
-    }
-#endif
-    // FFT inverse one scale
-    {
-        int rank = 2;
-        int n[] = {int(m_height), int(m_width)};
-        int howmany = int(m_num_of_feats);
-        int idist = int(m_height * (m_width / 2 + 1)), odist = 1;
-        int istride = 1, ostride = int(m_num_of_feats);
-        int inembed[] = {int(m_height), int(m_width / 2 + 1)}, *onembed = n;
-
-        CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n, inembed, istride, idist, onembed, ostride, odist,
-                                      CUFFT_C2R, howmany));
-    }
-    // FFT inverse all scales
+    plan_f = create_plan_fwd(1);
+    plan_fw = create_plan_fwd(m_num_of_feats);
+    plan_i_1ch = create_plan_inv(1);
+
 #ifdef BIG_BATCH
-    if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_feats * m_num_of_scales;
-        int idist = m_height * (m_width / 2 + 1), odist = 1;
-        int istride = 1, ostride = m_num_of_feats * m_num_of_scales;
-        int inembed[] = {(int)m_height, (int)m_width / 2 + 1}, *onembed = n;
-
-        CufftErrorCheck(cufftPlanMany(&plan_i_features_all_scales, rank, n, inembed, istride, idist, onembed, ostride,
-                                      odist, CUFFT_C2R, howmany));
-    }
-#endif
-    // FFT inverse one channel one scale
-    {
-        int rank = 2;
-        int n[] = {int(m_height), int(m_width)};
-        int howmany = 1;
-        int idist = int(m_height * (m_width / 2 + 1)), odist = 1;
-        int istride = 1, ostride = 1;
-        int inembed[] = {int(m_height), int(m_width / 2 + 1)}, *onembed = n;
-
-        CufftErrorCheck(
-            cufftPlanMany(&plan_i_1ch, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2R, howmany));
-    }
-#ifdef BIG_BATCH
-    // FFT inverse one channel all scales
-    if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_scales;
-        int idist = m_height * (m_width / 2 + 1), odist = 1;
-        int istride = 1, ostride = m_num_of_scales;
-        int inembed[] = {(int)m_height, (int)m_width / 2 + 1}, *onembed = n;
-
-        CufftErrorCheck(cufftPlanMany(&plan_i_1ch_all_scales, rank, n, inembed, istride, idist, onembed, ostride, odist,
-                                      CUFFT_C2R, howmany));
-    }
+    plan_f_all_scales = create_plan_fwd(m_num_of_scales);
+    plan_fw_all_scales = create_plan_fwd(m_num_of_scales * m_num_of_feats);
+    plan_i_all_scales = create_plan_inv(m_num_of_scales);
 #endif
 }
 
-void cuFFT::set_window(const cv::Mat &window)
+void cuFFT::set_window(const MatDynMem &window)
 {
+    Fft::set_window(window);
     m_window = window;
 }
 
-void cuFFT::forward(const cv::Mat &real_input, ComplexMat &complex_result, float *real_input_arr, cudaStream_t stream)
+void cuFFT::forward(const MatScales &real_input, ComplexMat &complex_result)
 {
-    if (BIG_BATCH_MODE && real_input.rows == int(m_height * m_num_of_scales)) {
-        CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal *>(real_input_arr),
-                                     complex_result.get_p_data()));
-    } else {
-        NORMAL_OMP_CRITICAL
-        {
-            CufftErrorCheck(cufftSetStream(plan_f, stream));
-            CufftErrorCheck(
-                cufftExecR2C(plan_f, reinterpret_cast<cufftReal *>(real_input_arr), complex_result.get_p_data()));
-            cudaStreamSynchronize(stream);
-        }
-    }
-    return;
+    Fft::forward(real_input, complex_result);
+    auto in = static_cast<cufftReal *>(const_cast<MatScales&>(real_input).deviceMem());
+
+    if (real_input.size[0] == 1)
+        cudaErrorCheck(cufftExecR2C(plan_f, in, complex_result.get_dev_data()));
+#ifdef BIG_BATCH
+    else
+        cudaErrorCheck(cufftExecR2C(plan_f_all_scales, in, complex_result.get_dev_data()));
+#endif
 }
 
-void cuFFT::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat &complex_result, cv::Mat &fw_all,
-                           float *real_input_arr, cudaStream_t stream)
+void cuFFT::forward_window(MatScaleFeats &feat, ComplexMat &complex_result, MatScaleFeats &temp)
 {
-    int n_channels = int(patch_feats.size());
+    Fft::forward_window(feat, complex_result, temp);
 
-    if (n_channels > int(m_num_of_feats)) {
-        for (uint i = 0; i < uint(n_channels); ++i) {
-            cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height)));
-            in_roi = patch_feats[i].mul(m_window);
-        }
-        CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal *>(real_input_arr),
-                                     complex_result.get_p_data()));
-    } else {
-        for (uint i = 0; i < uint(n_channels); ++i) {
-            cv::Mat in_roi(fw_all, cv::Rect(0, int(i * m_height), int(m_width), int(m_height)));
-            in_roi = patch_feats[i].mul(m_window);
-        }
-        NORMAL_OMP_CRITICAL
-        {
-            CufftErrorCheck(cufftSetStream(plan_fw, stream));
-            CufftErrorCheck(
-                cufftExecR2C(plan_fw, reinterpret_cast<cufftReal *>(real_input_arr), complex_result.get_p_data()));
-            cudaStreamSynchronize(stream);
+    cufftReal *temp_data = temp.deviceMem();
+    uint n_scales = feat.size[0];
+
+    for (uint s = 0; s < n_scales; ++s) {
+        for (uint ch = 0; ch < uint(feat.size[1]); ++ch) {
+            cv::Mat feat_plane = feat.plane(s, ch);
+            cv::Mat temp_plane = temp.plane(s, ch);
+            temp_plane = feat_plane.mul(m_window);
         }
     }
-    return;
+
+    if (n_scales == 1)
+        cudaErrorCheck(cufftExecR2C(plan_fw, temp_data, complex_result.get_dev_data()));
+#ifdef BIG_BATCH
+    else
+        cudaErrorCheck(cufftExecR2C(plan_fw_all_scales, temp_data, complex_result.get_dev_data()));
+#endif
 }
 
-void cuFFT::inverse(ComplexMat &complex_input, cv::Mat &real_result, float *real_result_arr, cudaStream_t stream)
+void cuFFT::inverse(ComplexMat &complex_input, MatScales &real_result)
 {
-    int n_channels = complex_input.n_channels;
-    cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_p_data());
-
-    if (n_channels == 1) {
-        NORMAL_OMP_CRITICAL
-        {
-            CufftErrorCheck(cufftSetStream(plan_i_1ch, stream));
-            CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal *>(real_result_arr)));
-            cudaStreamSynchronize(stream);
-        }
-        real_result = real_result / (m_width * m_height);
-        return;
-    } else if (n_channels == int(m_num_of_scales)) {
-        CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal *>(real_result_arr)));
-        cudaStreamSynchronize(stream);
-
-        real_result = real_result / (m_width * m_height);
-        return;
-    } else if (n_channels == int(m_num_of_feats) * int(m_num_of_scales)) {
-        CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal *>(real_result_arr)));
-        return;
-    }
-    NORMAL_OMP_CRITICAL
-    {
-        CufftErrorCheck(cufftSetStream(plan_i_features, stream));
-        CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal *>(real_result_arr)));
-#if defined(OPENMP) && !defined(BIG_BATCH)
-        CudaSafeCall(cudaStreamSynchronize(stream));
+    Fft::inverse(complex_input, real_result);
+
+    uint n_channels = complex_input.n_channels;
+    cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_dev_data());
+    cufftReal *out = real_result.deviceMem();
+    float alpha = 1.0 / (m_width * m_height);
+
+    if (n_channels == 1)
+        cudaErrorCheck(cufftExecC2R(plan_i_1ch, in, out));
+#ifdef BIG_BATCH
+    else
+        cudaErrorCheck(cufftExecC2R(plan_i_all_scales, in, out));
 #endif
-    }
-    return;
+    cudaErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));
+    // The result is a cv::Mat, which will be accesses by CPU, so we
+    // must synchronize with the GPU here
+    CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
 cuFFT::~cuFFT()
 {
-    CufftErrorCheck(cufftDestroy(plan_f));
-    CufftErrorCheck(cufftDestroy(plan_fw));
-    CufftErrorCheck(cufftDestroy(plan_i_1ch));
-    CufftErrorCheck(cufftDestroy(plan_i_features));
-
-    if (BIG_BATCH_MODE) {
-        CufftErrorCheck(cufftDestroy(plan_f_all_scales));
-        CufftErrorCheck(cufftDestroy(plan_fw_all_scales));
-        CufftErrorCheck(cufftDestroy(plan_i_1ch_all_scales));
-        CufftErrorCheck(cufftDestroy(plan_i_features_all_scales));
-    }
+    cudaErrorCheck(cublasDestroy(cublas));
+
+    cudaErrorCheck(cufftDestroy(plan_f));
+    cudaErrorCheck(cufftDestroy(plan_fw));
+    cudaErrorCheck(cufftDestroy(plan_i_1ch));
+
+#ifdef BIG_BATCH
+    cudaErrorCheck(cufftDestroy(plan_f_all_scales));
+    cudaErrorCheck(cufftDestroy(plan_fw_all_scales));
+    cudaErrorCheck(cufftDestroy(plan_i_all_scales));
+#endif
 }
index 5004c624f532a92f3e34444487ce9702c186aa94..9a927aa3a40c204c8aa947a54012e182200d3f92 100644 (file)
@@ -1,12 +1,12 @@
 #ifndef FFT_CUDA_H
 #define FFT_CUDA_H
 
-
 #include <cufft.h>
 #include <cuda_runtime.h>
+#include <cublas_v2.h>
 
 #include "fft.h"
-#include "cuda/cuda_error_check.cuh"
+#include "cuda_error_check.hpp"
 #include "pragmas.h"
 
 struct ThreadCtx;
@@ -14,17 +14,25 @@ struct ThreadCtx;
 class cuFFT : public Fft
 {
 public:
+    cuFFT();
     void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override;
-    void set_window(const cv::Mat & window) override;
-    void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t  stream) override;
-    void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) override;
-    void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) override;
+    void set_window(const MatDynMem &window) override;
+    void forward(const MatScales &real_input, ComplexMat &complex_result) override;
+    void forward_window(MatScaleFeats &patch_feats_in, ComplexMat &complex_result, MatScaleFeats &tmp) override;
+    void inverse(ComplexMat &complex_input, MatScales &real_result) override;
     ~cuFFT() override;
+
+protected:
+    cufftHandle create_plan_fwd(uint howmany) const;
+    cufftHandle create_plan_inv(uint howmany) const;
+
 private:
     cv::Mat m_window;
-    unsigned m_width, m_height, m_num_of_feats, m_num_of_scales;
-    cufftHandle plan_f, plan_f_all_scales, plan_fw, plan_fw_all_scales, plan_i_features,
-     plan_i_features_all_scales, plan_i_1ch, plan_i_1ch_all_scales;
+    cufftHandle plan_f, plan_fw, plan_i_1ch;
+#ifdef BIG_BATCH
+    cufftHandle plan_f_all_scales, plan_fw_all_scales, plan_i_all_scales;
+#endif
+    cublasHandle_t cublas;
 };
 
 #endif // FFT_CUDA_H
index 2215e20318a898119f53324d4a3c10b12a285863..1d26269baaaa3a59dcfdb66e863bd9ef49dda9db 100644 (file)
 #include "fft_fftw.h"
-
-#include "fft.h"
+#include <unistd.h>
 
 #ifdef OPENMP
 #include <omp.h>
 #endif
 
-#if (defined(BIG_BATCH) && !defined(CUFFTW)) || (!defined(ASYNC) && !defined(OPENMP) && !defined(CUFFTW))
-#define FFTW_PLAN_WITH_THREADS() fftwf_plan_with_nthreads(4);
-#define FFTW_INIT_THREAD() fftwf_init_threads();
-#define FFTW_CLEAN_THREADS() fftwf_cleanup_threads();
-#else
-#define FFTW_PLAN_WITH_THREADS()
-#define FFTW_INIT_THREAD()
-#define FFTW_CLEAN_THREADS()
-#endif
+Fftw::Fftw(){}
 
-Fftw::Fftw() {}
+fftwf_plan Fftw::create_plan_fwd(uint howmany) const
+{
+    cv::Mat mat_in = cv::Mat::zeros(howmany * m_height, m_width, CV_32F);
+    ComplexMat mat_out(m_height, m_width / 2 + 1, howmany);
+    float *in = reinterpret_cast<float *>(mat_in.data);
+    fftwf_complex *out = reinterpret_cast<fftwf_complex *>(mat_out.get_p_data());
+
+    int rank = 2;
+    int n[] = {(int)m_height, (int)m_width};
+    int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
+    int istride = 1, ostride = 1;
+    int *inembed = NULL, *onembed = NULL;
+
+    return fftwf_plan_many_dft_r2c(rank, n, howmany, in, inembed, istride, idist, out, onembed, ostride, odist, FFTW_PATIENT);
+}
+
+fftwf_plan Fftw::create_plan_inv(uint howmany) const
+{
+    ComplexMat mat_in(m_height, m_width / 2 + 1, howmany);
+    cv::Mat mat_out = cv::Mat::zeros(howmany * m_height, m_width, CV_32F);
+    fftwf_complex *in = reinterpret_cast<fftwf_complex *>(mat_in.get_p_data());
+    float *out = reinterpret_cast<float *>(mat_out.data);
+
+    int rank = 2;
+    int n[] = {(int)m_height, (int)m_width};
+    int idist = m_height * (m_width / 2 + 1), odist = m_height * m_width;
+    int istride = 1, ostride = 1;
+    int *inembed = nullptr, *onembed = nullptr;
+
+    return fftwf_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist, out, onembed, ostride, odist, FFTW_PATIENT);
+}
 
 void Fftw::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
 {
-    m_width = width;
-    m_height = height;
-    m_num_of_feats = num_of_feats;
-    m_num_of_scales = num_of_scales;
+    Fft::init(width, height, num_of_feats, num_of_scales);
+
+#if !defined(CUFFTW) && defined(BIG_BATCH)
+    fftw_init_threads();
+  #if defined(OPENMP)
+    fftw_plan_with_nthreads(omp_get_max_threads());
+  #else
+    int np = sysconf(_SC_NPROCESSORS_ONLN);
+    fftw_plan_with_nthreads(np);
+  #endif
+#endif
 
 #ifndef CUFFTW
     std::cout << "FFT: FFTW" << std::endl;
 #else
     std::cout << "FFT: cuFFTW" << std::endl;
 #endif
+    fftwf_cleanup();
 
-     FFTW_INIT_THREAD();
+    plan_f = create_plan_fwd(1);
+    plan_fw = create_plan_fwd(m_num_of_feats);
+    plan_i_1ch = create_plan_inv(1);
 
-    // FFT forward one scale
-    {
-        cv::Mat in_f = cv::Mat::zeros(int(m_height), int(m_width), CV_32FC1);
-        ComplexMat out_f(int(m_height), m_width / 2 + 1, 1);
-
-        FFTW_PLAN_WITH_THREADS();
-        plan_f = fftwf_plan_dft_r2c_2d(int(m_height), int(m_width), reinterpret_cast<float *>(in_f.data),
-                                       reinterpret_cast<fftwf_complex *>(out_f.get_p_data()), FFTW_PATIENT);
-    }
-#ifdef BIG_BATCH
-    // FFT forward all scales
-    if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
-        cv::Mat in_f_all = cv::Mat::zeros(m_height * m_num_of_scales, m_width, CV_32F);
-        ComplexMat out_f_all(m_height, m_width / 2 + 1, m_num_of_scales);
-        float *in = reinterpret_cast<float *>(in_f_all.data);
-        fftwf_complex *out = reinterpret_cast<fftwf_complex *>(out_f_all.get_p_data());
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_scales;
-        int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
-        int istride = 1, ostride = 1;
-        int *inembed = NULL, *onembed = NULL;
-
-        FFTW_PLAN_WITH_THREADS();
-        plan_f_all_scales = fftwf_plan_many_dft_r2c(rank, n, howmany, in, inembed, istride, idist, out, onembed,
-                                                    ostride, odist, FFTW_PATIENT);
-    }
-#endif
-    // FFT forward window one scale
-    {
-        cv::Mat in_fw = cv::Mat::zeros(int(m_height * m_num_of_feats), int(m_width), CV_32F);
-        ComplexMat out_fw(int(m_height), m_width / 2 + 1, int(m_num_of_feats));
-        float *in = reinterpret_cast<float *>(in_fw.data);
-        fftwf_complex *out = reinterpret_cast<fftwf_complex *>(out_fw.get_p_data());
-        int rank = 2;
-        int n[] = {int(m_height), int(m_width)};
-        int howmany = int(m_num_of_feats);
-        int idist = int(m_height * m_width), odist = int(m_height * (m_width / 2 + 1));
-        int istride = 1, ostride = 1;
-        int *inembed = nullptr, *onembed = nullptr;
-
-        FFTW_PLAN_WITH_THREADS();
-        plan_fw = fftwf_plan_many_dft_r2c(rank, n, howmany, in, inembed, istride, idist, out, onembed, ostride, odist,
-                                          FFTW_PATIENT);
-    }
-#ifdef BIG_BATCH
-    // FFT forward window all scales all feats
-    if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
-        cv::Mat in_all = cv::Mat::zeros(m_height * (m_num_of_scales * m_num_of_feats), m_width, CV_32F);
-        ComplexMat out_all(m_height, m_width / 2 + 1, m_num_of_scales * m_num_of_feats);
-        float *in = reinterpret_cast<float *>(in_all.data);
-        fftwf_complex *out = reinterpret_cast<fftwf_complex *>(out_all.get_p_data());
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_scales * m_num_of_feats;
-        int idist = m_height * m_width, odist = m_height * (m_width / 2 + 1);
-        int istride = 1, ostride = 1;
-        int *inembed = NULL, *onembed = NULL;
-
-        FFTW_PLAN_WITH_THREADS();
-        plan_fw_all_scales = fftwf_plan_many_dft_r2c(rank, n, howmany, in, inembed, istride, idist, out, onembed,
-                                                     ostride, odist, FFTW_PATIENT);
-    }
-#endif
-    // FFT inverse one scale
-    {
-        ComplexMat in_i(m_height, m_width, m_num_of_feats);
-        cv::Mat out_i = cv::Mat::zeros(int(m_height), int(m_width), CV_32FC(int(m_num_of_feats)));
-        fftwf_complex *in = reinterpret_cast<fftwf_complex *>(in_i.get_p_data());
-        float *out = reinterpret_cast<float *>(out_i.data);
-        int rank = 2;
-        int n[] = {int(m_height), int(m_width)};
-        int howmany = int(m_num_of_feats);
-        int idist = int(m_height * (m_width / 2 + 1)), odist = 1;
-        int istride = 1, ostride = int(m_num_of_feats);
-        int inembed[] = {int(m_height), int(m_width / 2 + 1)}, *onembed = n;
-
-        FFTW_PLAN_WITH_THREADS();
-        plan_i_features = fftwf_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist, out, onembed, ostride,
-                                                  odist, FFTW_PATIENT);
-    }
-    // FFT inverse all scales
-#ifdef BIG_BATCH
-    if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
-        ComplexMat in_i_all(m_height, m_width, m_num_of_feats * m_num_of_scales);
-        cv::Mat out_i_all = cv::Mat::zeros(m_height, m_width, CV_32FC(m_num_of_feats * m_num_of_scales));
-        fftwf_complex *in = reinterpret_cast<fftwf_complex *>(in_i_all.get_p_data());
-        float *out = reinterpret_cast<float *>(out_i_all.data);
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_feats * m_num_of_scales;
-        int idist = m_height * (m_width / 2 + 1), odist = 1;
-        int istride = 1, ostride = m_num_of_feats * m_num_of_scales;
-        int inembed[] = {(int)m_height, (int)m_width / 2 + 1}, *onembed = n;
-
-        FFTW_PLAN_WITH_THREADS();
-        plan_i_features_all_scales = fftwf_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist, out,
-                                                             onembed, ostride, odist, FFTW_PATIENT);
-    }
-#endif
-    // FFT inver one channel one scale
-    {
-        ComplexMat in_i1(int(m_height), int(m_width), 1);
-        cv::Mat out_i1 = cv::Mat::zeros(int(m_height), int(m_width), CV_32FC1);
-        fftwf_complex *in = reinterpret_cast<fftwf_complex *>(in_i1.get_p_data());
-        float *out = reinterpret_cast<float *>(out_i1.data);
-        int rank = 2;
-        int n[] = {int(m_height), int(m_width)};
-        int howmany = 1;
-        int idist = int(m_height * (m_width / 2 + 1)), odist = 1;
-        int istride = 1, ostride = 1;
-        int inembed[] = {int(m_height), int(m_width) / 2 + 1}, *onembed = n;
-
-        FFTW_PLAN_WITH_THREADS();
-        plan_i_1ch = fftwf_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist, out, onembed, ostride,
-                                             odist, FFTW_PATIENT);
-    }
 #ifdef BIG_BATCH
-    // FFT inver one channel all scales
-    if (m_num_of_scales > 1 && BIG_BATCH_MODE) {
-        ComplexMat in_i1_all(m_height, m_width, m_num_of_scales);
-        cv::Mat out_i1_all = cv::Mat::zeros(m_height, m_width, CV_32FC(m_num_of_scales));
-        fftwf_complex *in = reinterpret_cast<fftwf_complex *>(in_i1_all.get_p_data());
-        float *out = reinterpret_cast<float *>(out_i1_all.data);
-        int rank = 2;
-        int n[] = {(int)m_height, (int)m_width};
-        int howmany = m_num_of_scales;
-        int idist = m_height * (m_width / 2 + 1), odist = 1;
-        int istride = 1, ostride = m_num_of_scales;
-        int inembed[] = {(int)m_height, (int)m_width / 2 + 1}, *onembed = n;
-
-        FFTW_PLAN_WITH_THREADS();
-        plan_i_1ch_all_scales = fftwf_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist, out, onembed,
-                                                        ostride, odist, FFTW_PATIENT);
-    }
+    plan_f_all_scales = create_plan_fwd(m_num_of_scales);
+    plan_fw_all_scales = create_plan_fwd(m_num_of_scales * m_num_of_feats);
+    plan_i_all_scales = create_plan_inv(m_num_of_scales);
 #endif
 }
 
-void Fftw::set_window(const cv::Mat &window)
+void Fftw::set_window(const MatDynMem &window)
 {
+    Fft::set_window(window);
     m_window = window;
 }
 
-void Fftw::forward(const cv::Mat &real_input, ComplexMat &complex_result, float *real_input_arr, cudaStream_t stream)
+void Fftw::forward(const MatScales &real_input, ComplexMat &complex_result)
 {
-    (void)real_input_arr;
-    (void)stream;
+    Fft::forward(real_input, complex_result);
 
-    if (BIG_BATCH_MODE && real_input.rows == int(m_height * m_num_of_scales)) {
-        fftwf_execute_dft_r2c(plan_f_all_scales, reinterpret_cast<float *>(real_input.data),
-                              reinterpret_cast<fftwf_complex *>(complex_result.get_p_data()));
-    } else {
+    if (real_input.size[0] == 1)
         fftwf_execute_dft_r2c(plan_f, reinterpret_cast<float *>(real_input.data),
                               reinterpret_cast<fftwf_complex *>(complex_result.get_p_data()));
-    }
-    return;
+#ifdef BIG_BATCH
+    else
+        fftwf_execute_dft_r2c(plan_f_all_scales, reinterpret_cast<float *>(real_input.data),
+                              reinterpret_cast<fftwf_complex *>(complex_result.get_p_data()));
+#endif
 }
 
-void Fftw::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat &complex_result, cv::Mat &fw_all,
-                          float *real_input_arr, cudaStream_t stream)
+void Fftw::forward_window(MatScaleFeats  &feat, ComplexMat & complex_result, MatScaleFeats &temp)
 {
-    (void)real_input_arr;
-    (void)stream;
-
-    int n_channels = int(patch_feats.size());
-    for (int i = 0; i < n_channels; ++i) {
-        cv::Mat in_roi(fw_all, cv::Rect(0, i * int(m_height), int(m_width), int(m_height)));
-        in_roi = patch_feats[uint(i)].mul(m_window);
+    Fft::forward_window(feat, complex_result, temp);
+
+    uint n_scales = feat.size[0];
+    for (uint s = 0; s < n_scales; ++s) {
+        for (uint ch = 0; ch < uint(feat.size[1]); ++ch) {
+            cv::Mat feat_plane = feat.plane(s, ch);
+            cv::Mat temp_plane = temp.plane(s, ch);
+            temp_plane = feat_plane.mul(m_window);
+        }
     }
 
-    float *in = reinterpret_cast<float *>(fw_all.data);
+    float *in = temp.ptr<float>();
     fftwf_complex *out = reinterpret_cast<fftwf_complex *>(complex_result.get_p_data());
 
-    if (n_channels <= int(m_num_of_feats))
+    if (n_scales == 1)
         fftwf_execute_dft_r2c(plan_fw, in, out);
+#ifdef BIG_BATCH
     else
         fftwf_execute_dft_r2c(plan_fw_all_scales, in, out);
-    return;
+#endif
 }
 
-void Fftw::inverse(ComplexMat &complex_input, cv::Mat &real_result, float *real_result_arr, cudaStream_t stream)
+void Fftw::inverse(ComplexMat &complex_input, MatScales &real_result)
 {
-    (void)real_result_arr;
-    (void)stream;
+    Fft::inverse(complex_input, real_result);
 
     int n_channels = complex_input.n_channels;
     fftwf_complex *in = reinterpret_cast<fftwf_complex *>(complex_input.get_p_data());
-    float *out = reinterpret_cast<float *>(real_result.data);
+    float *out = real_result.ptr<float>();
 
     if (n_channels == 1)
         fftwf_execute_dft_c2r(plan_i_1ch, in, out);
-    else if (BIG_BATCH_MODE && n_channels == int(m_num_of_scales))
-        fftwf_execute_dft_c2r(plan_i_1ch_all_scales, in, out);
-    else if (BIG_BATCH_MODE && n_channels == int(m_num_of_feats) * int(m_num_of_scales))
-        fftwf_execute_dft_c2r(plan_i_features_all_scales, in, out);
+#ifdef BIG_BATCH
     else
-        fftwf_execute_dft_c2r(plan_i_features, in, out);
-
-    real_result = real_result / (m_width * m_height);
-    return;
+        fftwf_execute_dft_c2r(plan_i_all_scales, in, out);
+#endif
+    real_result *= 1.0 / (m_width * m_height);
 }
 
 Fftw::~Fftw()
 {
-    fftwf_destroy_plan(plan_f);
-    fftwf_destroy_plan(plan_fw);
-    fftwf_destroy_plan(plan_i_features);
-    fftwf_destroy_plan(plan_i_1ch);
-
-    if (BIG_BATCH_MODE) {
-        fftwf_destroy_plan(plan_f_all_scales);
-        fftwf_destroy_plan(plan_i_features_all_scales);
-        fftwf_destroy_plan(plan_fw_all_scales);
-        fftwf_destroy_plan(plan_i_1ch_all_scales);
-    }
-    FFTW_CLEAN_THREADS();
+    if (plan_f) fftwf_destroy_plan(plan_f);
+    if (plan_fw) fftwf_destroy_plan(plan_fw);
+    if (plan_i_1ch) fftwf_destroy_plan(plan_i_1ch);
+
+#ifdef BIG_BATCH
+    if (plan_f_all_scales) fftwf_destroy_plan(plan_f_all_scales);
+    if (plan_fw_all_scales) fftwf_destroy_plan(plan_fw_all_scales);
+    if (plan_i_all_scales) fftwf_destroy_plan(plan_i_all_scales);
+#endif
 }
index 3de42adb113caff2af6569fcef5ec6945cbcf641..7274f514c80a0c3e725ce9ca1b12e3a6a8eccf88 100644 (file)
@@ -1,13 +1,8 @@
-
 #ifndef FFT_FFTW_H
 #define FFT_FFTW_H
 
 #include "fft.h"
 
-#if defined(ASYNC)
-#include <mutex>
-#endif
-
 #ifndef CUFFTW
   #include <fftw3.h>
 #else
 
 class Fftw : public Fft
 {
-public:
+  public:
     Fftw();
     void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override;
-    void set_window(const cv::Mat & window) override;
-    void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t  stream) override;
-    void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) override;
-    void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) override;
+    void set_window(const MatDynMem &window) override;
+    void forward(const MatScales &real_input, ComplexMat &complex_result) override;
+    void forward_window(MatScaleFeats &patch_feats_in, ComplexMat &complex_result, MatScaleFeats &tmp) override;
+    void inverse(ComplexMat &complex_input, MatScales &real_result) override;
     ~Fftw() override;
+
+protected:
+    fftwf_plan create_plan_fwd(uint howmany) const;
+    fftwf_plan create_plan_inv(uint howmany) const;
+
 private:
-    unsigned m_width, m_height, m_num_of_feats, m_num_of_scales;
     cv::Mat m_window;
-    fftwf_plan plan_f, plan_f_all_scales, plan_fw, plan_fw_all_scales, plan_i_features,
-       plan_i_features_all_scales, plan_i_1ch, plan_i_1ch_all_scales;
+    fftwf_plan plan_f = 0, plan_fw = 0, plan_i_1ch = 0;
+#ifdef BIG_BATCH
+    fftwf_plan plan_f_all_scales = 0, plan_fw_all_scales = 0, plan_i_all_scales = 0;
+#endif
 };
 
 #endif // FFT_FFTW_H
index ce6dfc844179b500641d6a1dd7c2b2517af80b6b..a41412a37bea4864a1f4fe17706082c36117cf90 100644 (file)
@@ -2,62 +2,46 @@
 
 void FftOpencv::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales)
 {
-    (void)width;
-    (void)height;
-    (void)num_of_feats;
-    (void)num_of_scales;
+    Fft::init(width, height, num_of_feats, num_of_scales);
     std::cout << "FFT: OpenCV" << std::endl;
 }
 
-void FftOpencv::set_window(const cv::Mat &window)
+void FftOpencv::set_window(const MatDynMem &window)
 {
     m_window = window;
 }
 
-void FftOpencv::forward(const cv::Mat &real_input, ComplexMat &complex_result, float *real_input_arr,
-                        cudaStream_t stream)
+void FftOpencv::forward(const MatScales &real_input, ComplexMat &complex_result)
 {
-    (void)real_input_arr;
-    (void)stream;
+    Fft::forward(real_input, complex_result);
 
     cv::Mat tmp;
-    cv::dft(real_input, tmp, cv::DFT_COMPLEX_OUTPUT);
+    cv::dft(real_input.plane(0), tmp, cv::DFT_COMPLEX_OUTPUT);
     complex_result = ComplexMat(tmp);
-    return;
 }
 
-void FftOpencv::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat &complex_result, cv::Mat &fw_all,
-                               float *real_input_arr, cudaStream_t stream)
+void FftOpencv::forward_window(MatScaleFeats &feat, ComplexMat &complex_result, MatScaleFeats &temp)
 {
-    (void)real_input_arr;
-    (void)fw_all;
-    (void)stream;
-
-    uint n_channels = uint(patch_feats.size());
-    for (uint i = 0; i < n_channels; ++i) {
-        cv::Mat complex_res;
-        cv::dft(patch_feats[i].mul(m_window), complex_res, cv::DFT_COMPLEX_OUTPUT);
-        complex_result.set_channel(int(i), complex_res);
+    Fft::forward_window(feat, complex_result, temp);
+
+    for (uint i = 0; i < uint(feat.size[0]); ++i) {
+        for (uint j = 0; j < uint(feat.size[1]); ++j) {
+            cv::Mat complex_res;
+            cv::Mat channel = feat.plane(i, j);
+            cv::dft(channel.mul(m_window), complex_res, cv::DFT_COMPLEX_OUTPUT);
+            complex_result.set_channel(int(j), complex_res);
+        }
     }
-    return;
 }
 
-void FftOpencv::inverse(ComplexMat &complex_input, cv::Mat &real_result, float *real_result_arr, cudaStream_t stream)
+void FftOpencv::inverse(ComplexMat &  complex_input, MatScales & real_result)
 {
-    (void)real_result_arr;
-    (void)stream;
-
-    if (complex_input.n_channels == 1) {
-        cv::dft(complex_input.to_cv_mat(), real_result, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
-    } else {
-        std::vector<cv::Mat> mat_channels = complex_input.to_cv_mat_vector();
-        std::vector<cv::Mat> ifft_mats(ulong(complex_input.n_channels));
-        for (uint i = 0; i < uint(complex_input.n_channels); ++i) {
-            cv::dft(mat_channels[i], ifft_mats[i], cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
-        }
-        cv::merge(ifft_mats, real_result);
+    Fft::inverse(complex_input, real_result);
+
+    std::vector<cv::Mat> mat_channels = complex_input.to_cv_mat_vector();
+    for (uint i = 0; i < uint(complex_input.n_channels); ++i) {
+        cv::dft(mat_channels[i], real_result.plane(i), cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
     }
-    return;
 }
 
 FftOpencv::~FftOpencv() {}
index fe4d3861f832294cd8120c71c0301af6de8d000b..23711968419b896e12d46300c05a79f6478bdf09 100644 (file)
@@ -8,10 +8,10 @@ class FftOpencv : public Fft
 {
 public:
     void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales) override;
-    void set_window(const cv::Mat & window) override;
-    void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t  stream) override;
-    void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) override;
-    void inverse(ComplexMat &  complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) override;
+    void set_window(const MatDynMem &window) override;
+    void forward(const MatScales &real_input, ComplexMat &complex_result) override;
+    void forward_window(MatScaleFeats &patch_feats_in, ComplexMat &complex_result, MatScaleFeats &tmp) override;
+    void inverse(ComplexMat &complex_input, MatScales &real_result) override;
     ~FftOpencv() override;
 private:
     cv::Mat m_window;
index 3195bab37fdeb0c7e44adc44c68e9f71845735e7..d46e2c6f60aca6c4ed4f9e524d6c63b0e66d41de 100644 (file)
@@ -2,6 +2,9 @@
 #include <numeric>
 #include <thread>
 #include <algorithm>
+#include "threadctx.hpp"
+#include "debug.h"
+#include <limits>
 
 #ifdef FFTW
 #include "fft_fftw.h"
 #include <omp.h>
 #endif // OPENMP
 
-#define DEBUG_PRINT(obj)                                                                                               \
-    if (m_debug || m_visual_debug) {                                                                                   \
-        std::cout << #obj << " @" << __LINE__ << std::endl << (obj) << std::endl;                                      \
-    }
-#define DEBUG_PRINTM(obj)                                                                                              \
-    if (m_debug) {                                                                                                     \
-        std::cout << #obj << " @" << __LINE__ << " " << (obj).size() << " CH: " << (obj).channels() << std::endl       \
-                  << (obj) << std::endl;                                                                               \
-    }
+DbgTracer __dbgTracer;
 
 template <typename T>
 T clamp(const T& n, const T& lower, const T& upper)
@@ -40,10 +35,38 @@ void clamp2(T& n, const T& lower, const T& upper)
     n = std::max(lower, std::min(n, upper));
 }
 
+#if CV_MAJOR_VERSION < 3
+template<typename _Tp> static inline
+cv::Size_<_Tp> operator / (const cv::Size_<_Tp>& a, _Tp b)
+{
+    return cv::Size_<_Tp>(a.width / b, a.height / b);
+}
+
+template<typename _Tp> static inline
+cv::Point_<_Tp> operator / (const cv::Point_<_Tp>& a, double b)
+{
+    return cv::Point_<_Tp>(a.x / b, a.y / b);
+}
+
+#endif
+
+class Kcf_Tracker_Private {
+    friend KCF_Tracker;
+
+    Kcf_Tracker_Private(const KCF_Tracker &kcf) : kcf(kcf) {}
+
+    const KCF_Tracker &kcf;
+#ifdef BIG_BATCH
+    std::vector<ThreadCtx> threadctxs;
+#else
+    ScaleRotVector<ThreadCtx> threadctxs{kcf.p_scales, kcf.p_angles};
+#endif
+};
+
 KCF_Tracker::KCF_Tracker(double padding, double kernel_sigma, double lambda, double interp_factor,
                          double output_sigma_factor, int cell_size)
-    : fft(*new FFT()), p_padding(padding), p_output_sigma_factor(output_sigma_factor), p_kernel_sigma(kernel_sigma),
-      p_lambda(lambda), p_interp_factor(interp_factor), p_cell_size(cell_size)
+    : p_cell_size(cell_size), fft(*new FFT()), p_padding(padding), p_output_sigma_factor(output_sigma_factor), p_kernel_sigma(kernel_sigma),
+      p_lambda(lambda), p_interp_factor(interp_factor)
 {
 }
 
@@ -54,8 +77,52 @@ KCF_Tracker::~KCF_Tracker()
     delete &fft;
 }
 
+void KCF_Tracker::train(cv::Mat input_rgb, cv::Mat input_gray, double interp_factor)
+{
+    TRACE("");
+
+    // obtain a sub-window for training
+    get_features(input_rgb, input_gray, nullptr, p_current_center.x, p_current_center.y,
+                 p_windows_size.width, p_windows_size.height,
+                 p_current_scale, p_current_angle).copyTo(model->patch_feats.scale(0));
+    DEBUG_PRINT(model->patch_feats);
+    fft.forward_window(model->patch_feats, model->xf, model->temp);
+    DEBUG_PRINTM(model->xf);
+    model->model_xf = model->model_xf * (1. - interp_factor) + model->xf * interp_factor;
+    DEBUG_PRINTM(model->model_xf);
+
+    if (m_use_linearkernel) {
+        ComplexMat xfconj = model->xf.conj();
+        model->model_alphaf_num = xfconj.mul(model->yf);
+        model->model_alphaf_den = (model->xf * xfconj);
+    } else {
+        // Kernel Ridge Regression, calculate alphas (in Fourier domain)
+        cv::Size sz(Fft::freq_size(feature_size));
+        ComplexMat kf(sz.height, sz.width, 1);
+        (*gaussian_correlation)(kf, model->model_xf, model->model_xf, p_kernel_sigma, true, *this);
+        DEBUG_PRINTM(kf);
+        model->model_alphaf_num = model->yf * kf;
+        model->model_alphaf_den = kf * (kf + p_lambda);
+    }
+    model->model_alphaf = model->model_alphaf_num / model->model_alphaf_den;
+    DEBUG_PRINTM(model->model_alphaf);
+    //        p_model_alphaf = p_yf / (kf + p_lambda);   //equation for fast training
+}
+
+static int round_pw2_down(int x)
+{
+        for (int i = 1; i < 32; i <<= 1)
+            x |= x >> i;
+        x++;
+        return x >> 1;
+}
+
+
 void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int fit_size_y)
 {
+    __dbgTracer.debug = m_debug;
+    TRACE("");
+
     // check boundary, enforce min size
     double x1 = bbox.x, x2 = bbox.x + bbox.width, y1 = bbox.y, y2 = bbox.y + bbox.height;
     if (x1 < 0) x1 = 0.;
@@ -86,10 +153,10 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f
         }
     }
 
-    p_pose.w = x2 - x1;
-    p_pose.h = y2 - y1;
-    p_pose.cx = x1 + p_pose.w / 2.;
-    p_pose.cy = y1 + p_pose.h / 2.;
+    p_init_pose.w = x2 - x1;
+    p_init_pose.h = y2 - y1;
+    p_init_pose.cx = x1 + p_init_pose.w / 2.;
+    p_init_pose.cy = y1 + p_init_pose.h / 2.;
 
     cv::Mat input_gray, input_rgb = img.clone();
     if (img.channels() == 3) {
@@ -99,101 +166,59 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f
         img.convertTo(input_gray, CV_32FC1);
 
     // don't need too large image
-    if (p_pose.w * p_pose.h > 100. * 100. && (fit_size_x == -1 || fit_size_y == -1)) {
+    if (p_init_pose.w * p_init_pose.h > 100. * 100.) {
         std::cout << "resizing image by factor of " << 1 / p_downscale_factor << std::endl;
         p_resize_image = true;
-        p_pose.scale(p_downscale_factor);
+        p_init_pose.scale(p_downscale_factor);
         cv::resize(input_gray, input_gray, cv::Size(0, 0), p_downscale_factor, p_downscale_factor, cv::INTER_AREA);
         cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_downscale_factor, p_downscale_factor, cv::INTER_AREA);
-    } else if (!(fit_size_x == -1 && fit_size_y == -1)) {
-        if (fit_size_x % p_cell_size != 0 || fit_size_y % p_cell_size != 0) {
-            std::cerr << "Error: Fit size is not multiple of HOG cell size (" << p_cell_size << ")" << std::endl;
-            std::exit(EXIT_FAILURE);
-        }
-        p_scale_factor_x = (double)fit_size_x / round(p_pose.w * (1. + p_padding));
-        p_scale_factor_y = (double)fit_size_y / round(p_pose.h * (1. + p_padding));
-        std::cout << "resizing image horizontaly by factor of " << p_scale_factor_x << " and verticaly by factor of "
-                  << p_scale_factor_y << std::endl;
-        p_fit_to_pw2 = true;
-        p_pose.scale_x(p_scale_factor_x);
-        p_pose.scale_y(p_scale_factor_y);
-        if (fabs(p_scale_factor_x - 1) > p_floating_error || fabs(p_scale_factor_y - 1) > p_floating_error) {
-            if (p_scale_factor_x < 1 && p_scale_factor_y < 1) {
-                cv::resize(input_gray, input_gray, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
-                cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
-            } else {
-                cv::resize(input_gray, input_gray, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_LINEAR);
-                cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_LINEAR);
-            }
-        }
     }
 
     // compute win size + fit to fhog cell size
-    p_windows_size.width = round(p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size;
-    p_windows_size.height = round(p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size;
-    p_roi.width = p_windows_size.width / p_cell_size;
-    p_roi.height = p_windows_size.height / p_cell_size;
-
-    p_scales.clear();
-    if (m_use_scale) {
-        for (int i = -int(p_num_scales) / 2; i <= int(p_num_scales) / 2; ++i)
-            p_scales.push_back(std::pow(p_scale_step, i));
+    p_windows_size.width = round(p_init_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size;
+    p_windows_size.height = round(p_init_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size;
+
+    if (fit_size_x == 0 || fit_size_y == 0) {
+        // Round down to the next highest power of 2
+        fit_size = cv::Size(round_pw2_down(p_windows_size.width),
+                            round_pw2_down(p_windows_size.height));
+    } else if (fit_size_x == -1 || fit_size_y == -1) {
+        fit_size =  p_windows_size;
     } else {
-        p_scales.push_back(1.);
-        p_num_scales = 1;
+        fit_size = cv::Size(fit_size_x, fit_size_y);
     }
 
-    if (m_use_angle) {
-        for (int i = p_angle_min; i <= p_angle_max; i += p_angle_step)
-            p_angles.push_back(i);
-    } else {
-        p_angles.push_back(0);
-        p_num_angles = 1;
-    }
+    feature_size = fit_size / p_cell_size;
 
-#ifdef CUFFT
-    if (p_roi.height * (p_roi.width / 2 + 1) > 1024) {
-        std::cerr << "Window after forward FFT is too big for CUDA kernels. Plese use -f to set "
-                     "the window dimensions so its size is less or equal to "
-                  << 1024 * p_cell_size * p_cell_size * 2 + 1
-                  << " pixels . Currently the size of the window is: " << p_windows_size.width << "x"
-                  << p_windows_size.height << " which is  " << p_windows_size.width * p_windows_size.height
-                  << " pixels. " << std::endl;
-        std::exit(EXIT_FAILURE);
-    }
+    p_scales.clear();
+    for (int i = -int(p_num_scales - 1) / 2; i <= int(p_num_scales) / 2; ++i)
+        p_scales.push_back(std::pow(p_scale_step, i));
+
+    p_angles.clear();
+    for (int i = -int(p_num_angles - 1) / 2; i <= int(p_num_angles) / 2; ++i)
+        p_angles.push_back(i * p_angle_step);
 
+#ifdef CUFFT
     if (m_use_linearkernel) {
         std::cerr << "cuFFT supports only Gaussian kernel." << std::endl;
         std::exit(EXIT_FAILURE);
     }
-
-    CudaSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
-
-    p_rot_labels_data = DynMem(p_roi.width * p_roi.height * sizeof(float));
-    p_rot_labels = cv::Mat(p_roi, CV_32FC1, p_rot_labels_data.hostMem());
 #endif
 
-#if defined(CUFFT) || defined(FFTW)
-    uint width = p_roi.width / 2 + 1;
+    model.reset(new Model(feature_size, p_num_of_feats));
+    d.reset(new Kcf_Tracker_Private(*this));
+
+#ifndef BIG_BATCH
+    for (auto scale: p_scales)
+        for (auto angle : p_angles)
+            d->threadctxs.emplace_back(feature_size, p_num_of_feats, scale, angle);
 #else
-    uint width = p_roi.width;
+    d->threadctxs.emplace_back(feature_size, p_num_of_feats, p_scales, p_angles);
 #endif
-    p_model_xf.create(p_roi.height, width, p_num_of_feats);
-    p_yf.create(p_roi.height, width, 1);
-    p_xf.create(p_roi.height, width, p_num_of_feats);
-
-    int max1 = BIG_BATCH_MODE ? 2 : p_num_scales;
-    int max2 = BIG_BATCH_MODE ? 1 : p_num_angles;
-    for (int i = 0; i < max1; ++i) {
-        for (int j = 0; j < max2; ++j) {
-            if (BIG_BATCH_MODE && i == 1)
-                p_threadctxs.emplace_back(p_roi, p_num_of_feats * p_num_scales * p_num_angles, 1, 0, p_num_scales,
-                                          p_num_angles);
-            else
-                p_threadctxs.emplace_back(p_roi, p_num_of_feats, p_scales[i], p_angles[j]);
-        }
-    }
 
+    gaussian_correlation.reset(new GaussianCorrelation(1, p_num_of_feats, feature_size));
+
+    p_current_center = p_init_pose.center();
     p_current_scale = 1.;
 
     double min_size_ratio = std::max(5. * p_cell_size / p_windows_size.width, 5. * p_cell_size / p_windows_size.height);
@@ -203,78 +228,28 @@ void KCF_Tracker::init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int f
     p_min_max_scale[0] = std::pow(p_scale_step, std::ceil(std::log(min_size_ratio) / log(p_scale_step)));
     p_min_max_scale[1] = std::pow(p_scale_step, std::floor(std::log(max_size_ratio) / log(p_scale_step)));
 
-    std::cout << "init: img size " << img.cols << "x" << img.rows << std::endl;
-    std::cout << "init: win size " << p_windows_size.width << "x" << p_windows_size.height << std::endl;
-    std::cout << "init: FFT size " << p_roi.width << "x" << p_roi.height << std::endl;
+    std::cout << "init: img size " << img.size() << std::endl;
+    std::cout << "init: win size " << p_windows_size;
+    if (p_windows_size != fit_size)
+        std::cout << " resized to " << fit_size;
+    std::cout << std::endl;
+    std::cout << "init: FFT size " << feature_size << std::endl;
     std::cout << "init: min max scales factors: " << p_min_max_scale[0] << " " << p_min_max_scale[1] << std::endl;
 
-    p_output_sigma = std::sqrt(p_pose.w * p_pose.h) * p_output_sigma_factor / p_cell_size;
+    p_output_sigma = std::sqrt(p_init_pose.w * p_init_pose.h * double(fit_size.area()) / p_windows_size.area())
+           * p_output_sigma_factor / p_cell_size;
 
-    fft.init(p_roi.width, p_roi.height, p_num_of_feats, p_num_scales);
-    fft.set_window(cosine_window_function(p_roi.width, p_roi.height));
+    fft.init(feature_size.width, feature_size.height, p_num_of_feats, p_num_scales * p_num_angles);
+    fft.set_window(MatDynMem(cosine_window_function(feature_size.width, feature_size.height)));
 
     // window weights, i.e. labels
-    fft.forward(gaussian_shaped_labels(p_output_sigma, p_roi.width, p_roi.height), p_yf,
-                m_use_cuda ? p_rot_labels_data.deviceMem() : nullptr, p_threadctxs.front().stream);
-    DEBUG_PRINTM(p_yf);
-
-    // obtain a sub-window for training initial model
-    int size_x_scaled = floor(p_windows_size.width);
-    int size_y_scaled = floor(p_windows_size.height);
-
-    cv::Mat patch_gray = get_subwindow(input_gray, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-    geometric_transformations(patch_gray, p_windows_size.width, p_windows_size.height, 0, false);
-
-    cv::Mat patch_rgb;
-    if ((m_use_color || m_use_cnfeat) && input_rgb.channels() == 3) {
-        patch_rgb = get_subwindow(input_rgb, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-        geometric_transformations(patch_rgb, p_windows_size.width, p_windows_size.height, 0, false);
-    }
+    MatScales gsl(1, feature_size);
+    gaussian_shaped_labels(p_output_sigma, feature_size.width, feature_size.height).copyTo(gsl.plane(0));
+    fft.forward(gsl, model->yf);
+    DEBUG_PRINTM(model->yf);
 
-    std::vector<cv::Mat> patch_feats = get_features(patch_rgb, patch_gray);
-    fft.forward_window(patch_feats, p_model_xf, p_threadctxs.front().fw_all,
-                       m_use_cuda ? p_threadctxs.front().data_features.deviceMem() : nullptr,
-                       p_threadctxs.front().stream);
-    DEBUG_PRINTM(p_model_xf);
-
-#if !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
-    p_threadctxs.front().model_xf = p_model_xf;
-    p_threadctxs.front().model_xf.set_stream(p_threadctxs.front().stream);
-    p_yf.set_stream(p_threadctxs.front().stream);
-    p_model_xf.set_stream(p_threadctxs.front().stream);
-    p_xf.set_stream(p_threadctxs.front().stream);
-#endif
-
-    if (m_use_linearkernel) {
-        ComplexMat xfconj = p_model_xf.conj();
-        p_model_alphaf_num = xfconj.mul(p_yf);
-        p_model_alphaf_den = (p_model_xf * xfconj);
-    } else {
-        // Kernel Ridge Regression, calculate alphas (in Fourier domain)
-#if  !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
-        gaussian_correlation(p_threadctxs.front(), p_threadctxs.front().model_xf, p_threadctxs.front().model_xf,
-                             p_kernel_sigma, true);
-#else
-        gaussian_correlation(p_threadctxs.front(), p_model_xf, p_model_xf, p_kernel_sigma, true);
-#endif
-        DEBUG_PRINTM(p_threadctxs.front().kf);
-        p_model_alphaf_num = p_yf * p_threadctxs.front().kf;
-        DEBUG_PRINTM(p_model_alphaf_num);
-        p_model_alphaf_den = p_threadctxs.front().kf * (p_threadctxs.front().kf + float(p_lambda));
-        DEBUG_PRINTM(p_model_alphaf_den);
-    }
-    p_model_alphaf = p_model_alphaf_num / p_model_alphaf_den;
-    DEBUG_PRINTM(p_model_alphaf);
-    //        p_model_alphaf = p_yf / (kf + p_lambda);   //equation for fast training
-
-#if  !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
-    for (auto it = p_threadctxs.begin(); it != p_threadctxs.end(); ++it) {
-        it->model_xf = p_model_xf;
-        it->model_xf.set_stream(it->stream);
-        it->model_alphaf = p_model_alphaf;
-        it->model_alphaf.set_stream(it->stream);
-    }
-#endif
+    // train initial model
+    train(input_rgb, input_gray, 1.0);
 }
 
 void KCF_Tracker::setTrackerPose(BBox_c &bbox, cv::Mat &img, int fit_size_x, int fit_size_y)
@@ -284,35 +259,24 @@ void KCF_Tracker::setTrackerPose(BBox_c &bbox, cv::Mat &img, int fit_size_x, int
 
 void KCF_Tracker::updateTrackerPosition(BBox_c &bbox)
 {
+    BBox_c tmp = bbox;
     if (p_resize_image) {
-        BBox_c tmp = bbox;
         tmp.scale(p_downscale_factor);
-        p_pose.cx = tmp.cx;
-        p_pose.cy = tmp.cy;
-    } else if (p_fit_to_pw2) {
-        BBox_c tmp = bbox;
-        tmp.scale_x(p_scale_factor_x);
-        tmp.scale_y(p_scale_factor_y);
-        p_pose.cx = tmp.cx;
-        p_pose.cy = tmp.cy;
-    } else {
-        p_pose.cx = bbox.cx;
-        p_pose.cy = bbox.cy;
     }
+    p_current_center = tmp.center();
 }
 
 BBox_c KCF_Tracker::getBBox()
 {
-    BBox_c tmp = p_pose;
-    tmp.w *= p_current_scale;
-    tmp.h *= p_current_scale;
+    BBox_c tmp;
+    tmp.cx = p_current_center.x;
+    tmp.cy = p_current_center.y;
+    tmp.w = p_init_pose.w * p_current_scale;
+    tmp.h = p_init_pose.h * p_current_scale;
     tmp.a = p_current_angle;
 
-    if (p_resize_image) tmp.scale(1 / p_downscale_factor);
-    if (p_fit_to_pw2) {
-        tmp.scale_x(1 / p_scale_factor_x);
-        tmp.scale_y(1 / p_scale_factor_y);
-    }
+    if (p_resize_image)
+        tmp.scale(1 / p_downscale_factor);
 
     return tmp;
 }
@@ -322,297 +286,268 @@ double KCF_Tracker::getFilterResponse() const
     return this->max_response;
 }
 
-void KCF_Tracker::track(cv::Mat &img)
+void KCF_Tracker::resizeImgs(cv::Mat &input_rgb, cv::Mat &input_gray)
 {
-    if (m_debug || m_visual_debug) std::cout << "\nNEW FRAME" << std::endl;
-    cv::Mat input_gray, input_rgb = img.clone();
-    if (img.channels() == 3) {
-        cv::cvtColor(img, input_gray, CV_BGR2GRAY);
-        input_gray.convertTo(input_gray, CV_32FC1);
-    } else
-        img.convertTo(input_gray, CV_32FC1);
-
-    // don't need too large image
     if (p_resize_image) {
         cv::resize(input_gray, input_gray, cv::Size(0, 0), p_downscale_factor, p_downscale_factor, cv::INTER_AREA);
         cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_downscale_factor, p_downscale_factor, cv::INTER_AREA);
-    } else if (p_fit_to_pw2 && fabs(p_scale_factor_x - 1) > p_floating_error &&
-               fabs(p_scale_factor_y - 1) > p_floating_error) {
-        if (p_scale_factor_x < 1 && p_scale_factor_y < 1) {
-            cv::resize(input_gray, input_gray, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
-            cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
-        } else {
-            cv::resize(input_gray, input_gray, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_LINEAR);
-            cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_LINEAR);
-        }
     }
-    max_response = -1.;
-    ThreadCtx *max = nullptr;
-    cv::Point2i *max_response_pt = nullptr;
-    cv::Mat *max_response_map = nullptr;
+}
 
-#ifdef ASYNC
-    for (auto &it : p_threadctxs)
-        it.async_res = std::async(std::launch::async, [this, &input_gray, &input_rgb, &it]() -> void {
-            scale_track(it, input_rgb, input_gray);
-        });
-    for (auto const &it : p_threadctxs)
-        it.async_res.wait();
-#else  // !ASYNC
-    NORMAL_OMP_PARALLEL_FOR
-    for (uint i =  BIG_BATCH_MODE ? 1 : 0; i < p_threadctxs.size(); ++i)
-        scale_track(p_threadctxs[i], input_rgb, input_gray);
-#endif
+static void drawCross(cv::Mat &img, cv::Point center, bool green)
+{
+    cv::Scalar col = green ? cv::Scalar(0, 1, 0) : cv::Scalar(0, 0, 1);
+    cv::line(img, cv::Point(center.x, 0), cv::Point(center.x, img.size().height), col);
+    cv::line(img, cv::Point(0, center.y), cv::Point(img.size().height, center.y), col);
+}
+
+static cv::Point2d wrapAroundFreq(cv::Point2d pt, cv::Mat &resp_map)
+{
+    if (pt.y > resp_map.rows / 2) // wrap around to negative half-space of vertical axis
+        pt.y = pt.y - resp_map.rows;
+    if (pt.x > resp_map.cols / 2) // same for horizontal axis
+        pt.x = pt.x - resp_map.cols;
+    return pt;
+}
+
+double KCF_Tracker::findMaxReponse(uint &max_idx, cv::Point2d &new_location) const
+{
+    double max;
+    const auto &vec = IF_BIG_BATCH(d->threadctxs[0].max, d->threadctxs);
 
 #ifndef BIG_BATCH
-    for (auto &it : p_threadctxs) {
-        if (it.max_response > max_response) {
-            max_response = it.max_response;
-            max_response_pt = &it.max_loc;
-            max_response_map = &it.response;
-            max = &it;
-        }
-    }
+    auto max_it = std::max_element(vec.begin(), vec.end(),
+                                   [](const ThreadCtx &a, const ThreadCtx &b)
+                                   { return a.max.response < b.max.response; });
 #else
-    for (uint j = 0; j < p_num_scales; ++j) {
-        for (uint k = 0; k < p_num_angles; ++k) {
-            if (p_threadctxs.back().max_responses[j + k] > max_response) {
-                max_response = p_threadctxs.back().max_responses[j + k];
-                max_response_pt = &p_threadctxs.back().max_locs[j + k];
-                max_response_map = &p_threadctxs.back().response_maps[j + k];
-            }
-        }
-    }
-    max = &p_threadctxs.back();
+    auto max_it = std::max_element(vec.begin(), vec.end(),
+                                   [](const ThreadCtx::Max &a, const ThreadCtx::Max &b)
+                                   { return a.response < b.response; });
 #endif
-    if (m_visual_debug) {
-        cv::Mat all_responses(cv::Size(p_num_angles* p_debug_image_size, p_num_scales * p_debug_image_size),
-                              p_debug_scale_responses[0].type(), cv::Scalar::all(0));
-        cv::Mat all_subwindows(cv::Size(p_num_angles* p_debug_image_size, p_num_scales* p_debug_image_size),
-                               p_debug_subwindows[0].type(), cv::Scalar::all(0));
-        for (size_t i = 0; i < p_num_scales; ++i) {
-            for (size_t j = 0; j < p_num_angles; ++j) {
-                cv::Mat in_roi(all_responses, cv::Rect(j * p_debug_image_size, i * p_debug_image_size,
-                                                       p_debug_image_size, p_debug_image_size));
-                p_debug_scale_responses[5 * i + j].copyTo(in_roi);
-                in_roi = all_subwindows(
-                    cv::Rect(j * p_debug_image_size, i * p_debug_image_size, p_debug_image_size, p_debug_image_size));
-                p_debug_subwindows[5 * i + j].copyTo(in_roi);
-            }
-        }
-        cv::namedWindow("All subwindows", CV_WINDOW_AUTOSIZE);
-        cv::imshow("All subwindows", all_subwindows);
-        cv::namedWindow("All responses", CV_WINDOW_AUTOSIZE);
-        cv::imshow("All responses", all_responses);
-        cv::waitKey();
-        p_debug_scale_responses.clear();
-        p_debug_subwindows.clear();
-    }
-
-    DEBUG_PRINTM(*max_response_map);
-    DEBUG_PRINT(*max_response_pt);
-
-    // sub pixel quadratic interpolation from neighbours
-    if (max_response_pt->y > max_response_map->rows / 2) // wrap around to negative half-space of vertical axis
-        max_response_pt->y = max_response_pt->y - max_response_map->rows;
-    if (max_response_pt->x > max_response_map->cols / 2) // same for horizontal axis
-        max_response_pt->x = max_response_pt->x - max_response_map->cols;
-
-    cv::Point2f new_location(max_response_pt->x, max_response_pt->y);
-    DEBUG_PRINT(new_location);
+    assert(max_it != vec.end());
+    max = max_it->IF_BIG_BATCH(response, max.response);
 
-    if (m_use_subpixel_localization)
-        new_location = sub_pixel_peak(*max_response_pt, *max_response_map);
-    DEBUG_PRINT(new_location);
+    max_idx = std::distance(vec.begin(), max_it);
 
-    if (m_visual_debug) std::cout << "Old p_pose, cx: " << p_pose.cx << " cy: " << p_pose.cy << std::endl;
+    cv::Point2i max_response_pt = IF_BIG_BATCH(max_it->loc, max_it->max.loc);
+    cv::Mat max_response_map    = IF_BIG_BATCH(d->threadctxs[0].response.plane(max_idx),
+                                               max_it->response.plane(0));
 
-    p_pose.cx += p_current_scale * p_cell_size * double(new_location.x);
-    p_pose.cy += p_current_scale * p_cell_size * double(new_location.y);
+    DEBUG_PRINTM(max_response_map);
+    DEBUG_PRINT(max_response_pt);
 
-    if (m_visual_debug) std::cout << "New p_pose, cx: " << p_pose.cx << " cy: " << p_pose.cy << std::endl;
+    max_response_pt = wrapAroundFreq(max_response_pt, max_response_map);
 
-    if (p_fit_to_pw2) {
-        clamp2(p_pose.cx, 0.0, (img.cols * p_scale_factor_x) - 1);
-        clamp2(p_pose.cy, 0.0, (img.rows * p_scale_factor_y) - 1);
+    // sub pixel quadratic interpolation from neighbours
+    if (m_use_subpixel_localization) {
+        new_location = sub_pixel_peak(max_response_pt, max_response_map);
     } else {
-        clamp2(p_pose.cx, 0.0, img.cols - 1.0);
-        clamp2(p_pose.cy, 0.0, img.rows - 1.0);
+        new_location = max_response_pt;
     }
+    DEBUG_PRINT(new_location);
 
-    // sub grid scale interpolation
-    if (m_use_subgrid_scale) {
-        auto it = std::find_if(p_threadctxs.begin(), p_threadctxs.end(), [max](ThreadCtx &ctx) { return &ctx == max; });
-        p_current_scale *= sub_grid_scale(std::distance(p_threadctxs.begin(), it));
-    } else {
-        p_current_scale *= max->scale;
+    if (m_visual_debug != vd::NONE) {
+        const bool fit = 1;
+        int w = fit ? 100 : (m_visual_debug == vd::PATCH ? fit_size.width  : feature_size.width);
+        int h = fit ? 100 : (m_visual_debug == vd::PATCH ? fit_size.height : feature_size.height);
+        cv::Mat all_responses((h + 1) * p_num_scales - 1,
+                              (w + 1) * p_num_angles - 1, CV_32FC3, cv::Scalar::all(0));
+        for (size_t i = 0; i < p_num_scales; ++i) {
+            for (size_t j = 0; j < p_num_angles; ++j) {
+                auto &threadctx = d->IF_BIG_BATCH(threadctxs[0], threadctxs(i, j));
+                cv::Mat tmp;
+                cv::Point2d cross = threadctx.IF_BIG_BATCH(max(i, j), max).loc;
+                cross = wrapAroundFreq(cross, max_response_map);
+                if (m_visual_debug == vd::PATCH ) {
+                    threadctx.dbg_patch IF_BIG_BATCH((i, j),)
+                            .convertTo(tmp, all_responses.type(), 1.0 / 255);
+                    cross.x = cross.x / fit_size.width  * tmp.cols + tmp.cols / 2;
+                    cross.y = cross.y / fit_size.height * tmp.rows + tmp.rows / 2;
+                } else {
+                    cv::cvtColor(threadctx.response.plane(IF_BIG_BATCH(threadctx.max.getIdx(i, j), 0)),
+                            tmp, cv::COLOR_GRAY2BGR);
+                    tmp /= max; // Normalize to 1
+                    cross += cv::Point2d(tmp.size())/2;
+                    tmp = circshift(tmp, -tmp.cols/2, -tmp.rows/2);
+                    //drawCross(tmp, cross, false);
+                }
+                bool green = false;
+                if (&*max_it == &IF_BIG_BATCH(threadctx.max(i, j), threadctx)) {
+                    // Show the green cross at position of sub-pixel interpolation (if enabled)
+                    cross = new_location + cv::Point2d(tmp.size())/2;
+                    green = true;
+                }
+                // Move to the center of pixes (if scaling up) and scale
+                cross.x = (cross.x + 0.5) * double(w)/tmp.cols;
+                cross.y = (cross.y + 0.5) * double(h)/tmp.rows;
+                cv::resize(tmp, tmp, cv::Size(w, h)); //, 0, 0, cv::INTER_NEAREST);
+                drawCross(tmp, cross, green);
+                cv::Mat resp_roi(all_responses, cv::Rect(j * (w+1), i * (h+1), w, h));
+                tmp.copyTo(resp_roi);
+            }
+        }
+        cv::namedWindow("KCF visual debug", CV_WINDOW_AUTOSIZE);
+        cv::imshow("KCF visual debug", all_responses);
     }
 
-    clamp2(p_current_scale, p_min_max_scale[0], p_min_max_scale[1]);
+    return max;
+}
+
+void KCF_Tracker::track(cv::Mat &img)
+{
+    __dbgTracer.debug = m_debug;
+    TRACE("");
+
+    cv::Mat input_gray, input_rgb = img.clone();
+    if (img.channels() == 3) {
+        cv::cvtColor(img, input_gray, CV_BGR2GRAY);
+        input_gray.convertTo(input_gray, CV_32FC1);
+    } else
+        img.convertTo(input_gray, CV_32FC1);
+
+    // don't need too large image
+    resizeImgs(input_rgb, input_gray);
 
-    if (p_current_scale < p_min_max_scale[0]) p_current_scale = p_min_max_scale[0];
-    if (p_current_scale > p_min_max_scale[1]) p_current_scale = p_min_max_scale[1];
+#ifdef ASYNC
+    for (auto &it : d->threadctxs)
+        it.async_res = std::async(std::launch::async, [this, &input_gray, &input_rgb, &it]() -> void {
+            it.track(*this, input_rgb, input_gray);
+        });
+    for (auto const &it : d->threadctxs)
+        it.async_res.wait();
 
-    p_current_angle = (p_current_angle + max->angle) < 0
-                          ? -std::abs(p_current_angle + max->angle) % 360
-                          : (p_current_angle + max->angle) % 360;
+#else  // !ASYNC
+    NORMAL_OMP_PARALLEL_FOR
+    for (uint i = 0; i < d->threadctxs.size(); ++i)
+        d->threadctxs[i].track(*this, input_rgb, input_gray);
+#endif
 
-    // obtain a subwindow for training at newly estimated target position
-    int size_x_scaled = floor(p_windows_size.width * p_current_scale);
-    int size_y_scaled = floor(p_windows_size.height * p_current_scale);
+    cv::Point2d new_location;
+    uint max_idx;
+    max_response = findMaxReponse(max_idx, new_location);
 
-    cv::Mat patch_gray = get_subwindow(input_gray, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-    geometric_transformations(patch_gray, p_windows_size.width, p_windows_size.height, p_current_angle, false);
+    double angle_change = d->IF_BIG_BATCH(threadctxs[0].max, threadctxs).angle(max_idx);
+    p_current_angle += angle_change;
 
-    cv::Mat patch_rgb = cv::Mat::zeros(size_y_scaled, size_x_scaled, CV_32F);
-    if ((m_use_color || m_use_cnfeat) && input_rgb.channels() == 3) {
-        patch_rgb = get_subwindow(input_rgb, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-        geometric_transformations(patch_rgb, p_windows_size.width, p_windows_size.height, p_current_angle, false);
-    }
+    new_location.x = new_location.x * cos(-p_current_angle/180*M_PI) + new_location.y * sin(-p_current_angle/180*M_PI);
+    new_location.y = new_location.y * cos(-p_current_angle/180*M_PI) - new_location.x * sin(-p_current_angle/180*M_PI);
 
-    ThreadCtx &ctx = p_threadctxs.front();
-    std::vector<cv::Mat> patch_feats = get_features(patch_rgb, patch_gray);
-    fft.forward_window(patch_feats, p_xf, ctx.fw_all, m_use_cuda ? ctx.data_features.deviceMem() : nullptr, ctx.stream);
+    new_location.x *= double(p_windows_size.width) / fit_size.width;
+    new_location.y *= double(p_windows_size.height) / fit_size.height;
 
-    // subsequent frames, interpolate model
-    p_model_xf = p_model_xf * float((1. - p_interp_factor)) + p_xf * float(p_interp_factor);
+    p_current_center += p_current_scale * p_cell_size * new_location;
 
-    ComplexMat alphaf_num, alphaf_den;
+    clamp2(p_current_center.x, 0.0, img.cols - 1.0);
+    clamp2(p_current_center.y, 0.0, img.rows - 1.0);
 
-    if (m_use_linearkernel) {
-        ComplexMat xfconj = p_xf.conj();
-        alphaf_num = xfconj.mul(p_yf);
-        alphaf_den = (p_xf * xfconj);
+    // sub grid scale interpolation
+    if (m_use_subgrid_scale) {
+        p_current_scale *= sub_grid_scale(max_idx);
     } else {
-        // Kernel Ridge Regression, calculate alphas (in Fourier domain)
-        gaussian_correlation(ctx, p_xf, p_xf, p_kernel_sigma,
-                             true);
-        //        ComplexMat alphaf = p_yf / (kf + p_lambda); //equation for fast training
-        //        p_model_alphaf = p_model_alphaf * (1. - p_interp_factor) + alphaf * p_interp_factor;
-        alphaf_num = p_yf * ctx.kf;
-        alphaf_den = ctx.kf * (ctx.kf + float(p_lambda));
+        p_current_scale *= d->IF_BIG_BATCH(threadctxs[0].max, threadctxs).scale(max_idx);
     }
 
-    p_model_alphaf_num = p_model_alphaf_num * float((1. - p_interp_factor)) + alphaf_num * float(p_interp_factor);
-    p_model_alphaf_den = p_model_alphaf_den * float((1. - p_interp_factor)) + alphaf_den * float(p_interp_factor);
-    p_model_alphaf = p_model_alphaf_num / p_model_alphaf_den;
+    clamp2(p_current_scale, p_min_max_scale[0], p_min_max_scale[1]);
 
-#if  !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
-    for (auto it = p_threadctxs.begin(); it != p_threadctxs.end(); ++it) {
-        it->model_xf = p_model_xf;
-        it->model_xf.set_stream(it->stream);
-        it->model_alphaf = p_model_alphaf;
-        it->model_alphaf.set_stream(it->stream);
-    }
-#endif
+
+    // train at newly estimated target position
+    train(input_rgb, input_gray, p_interp_factor);
 }
 
-void KCF_Tracker::scale_track(ThreadCtx &vars, cv::Mat &input_rgb, cv::Mat &input_gray)
+void ThreadCtx::track(const KCF_Tracker &kcf, cv::Mat &input_rgb, cv::Mat &input_gray)
 {
-    std::vector<cv::Mat> patch_feats;
-    if (BIG_BATCH_MODE) {
-        BIG_BATCH_OMP_PARALLEL_FOR
-        for (uint i = 0; i < this->p_scales.size(); ++i) {
-            for (uint j = 0; j < this->p_angles.size(); ++j) {
-                int size_x_scaled = floor(this->p_windows_size.width * this->p_current_scale * this->p_scales[i]);
-                int size_y_scaled = floor(this->p_windows_size.height * this->p_current_scale * this->p_scales[i]);
-
-                cv::Mat patch_gray =
-                    get_subwindow(input_gray, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-                geometric_transformations(patch_gray, p_windows_size.width, p_windows_size.height,
-                                          p_current_scale * this->p_scales[i], p_current_angle + this->p_angles[j]);
-
-                cv::Mat patch_rgb;
-                if ((m_use_color || m_use_cnfeat) && input_rgb.channels() == 3) {
-                    patch_rgb =
-                        get_subwindow(input_rgb, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-                    geometric_transformations(patch_rgb, p_windows_size.width, p_windows_size.height,
-                                              p_current_scale * this->p_scales[i], p_current_angle + this->p_angles[j]);
-                }
-                std::vector<cv::Mat> tmp = get_features(patch_rgb, patch_gray);
-                BIG_BATCH_OMP_ORDERED
-                patch_feats.insert(patch_feats.end(), tmp.begin(), tmp.end());
-            }
-        }
-    } else {
-        int size_x_scaled = floor(this->p_windows_size.width * this->p_current_scale * vars.scale);
-        int size_y_scaled = floor(this->p_windows_size.height * this->p_current_scale * vars.scale);
+    TRACE("");
 
-        cv::Mat patch_gray = get_subwindow(input_gray, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-        geometric_transformations(patch_gray, p_windows_size.width, p_windows_size.height, p_current_scale * vars.scale);
-
-        cv::Mat patch_rgb;
-        if ((m_use_color || m_use_cnfeat) && input_rgb.channels() == 3) {
-            patch_rgb = get_subwindow(input_rgb, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-            geometric_transformations(patch_rgb, p_windows_size.width, p_windows_size.height, p_current_scale * vars.scale,
-                                      p_current_angle + vars.angle);
-        }
-        patch_feats = get_features(patch_rgb, patch_gray);
+    BIG_BATCH_OMP_PARALLEL_FOR
+    for (uint i = 0; i < IF_BIG_BATCH(max.size(), 1); ++i)
+    {
+        kcf.get_features(input_rgb, input_gray, &dbg_patch IF_BIG_BATCH([i],),
+                         kcf.p_current_center.x, kcf.p_current_center.y,
+                         kcf.p_windows_size.width, kcf.p_windows_size.height,
+                         kcf.p_current_scale * IF_BIG_BATCH(max.scale(i), scale),
+                         kcf.p_current_angle + IF_BIG_BATCH(max.angle(i), angle))
+                .copyTo(patch_feats.scale(i));
+        DEBUG_PRINT(patch_feats.scale(i));
     }
 
-    fft.forward_window(patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features.deviceMem() : nullptr,
-                       vars.stream);
-    DEBUG_PRINTM(vars.zf);
+    kcf.fft.forward_window(patch_feats, zf, temp);
+    DEBUG_PRINTM(zf);
 
-    if (m_use_linearkernel) {
-        vars.kzf = BIG_BATCH_MODE ? (vars.zf.mul2(this->p_model_alphaf)).sum_over_channels()
-                                   : (p_model_alphaf * vars.zf).sum_over_channels();
-        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream);
+    if (kcf.m_use_linearkernel) {
+        kzf = zf.mul(kcf.model->model_alphaf).sum_over_channels();
     } else {
-#if !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
-        gaussian_correlation(vars, vars.zf, vars.model_xf, this->p_kernel_sigma);
-        vars.kzf = vars.model_alphaf * vars.kzf;
-#else
-        gaussian_correlation(vars, vars.zf, this->p_model_xf, this->p_kernel_sigma);
-        DEBUG_PRINTM(this->p_model_alphaf);
-        DEBUG_PRINTM(vars.kzf);
-        vars.kzf = BIG_BATCH_MODE ? vars.kzf.mul(this->p_model_alphaf) : this->p_model_alphaf * vars.kzf;
-#endif
-        fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch.deviceMem() : nullptr, vars.stream);
+        gaussian_correlation(kzf, zf, kcf.model->model_xf, kcf.p_kernel_sigma, false, kcf);
+        DEBUG_PRINTM(kzf);
+        kzf = kzf.mul(kcf.model->model_alphaf);
     }
+    kcf.fft.inverse(kzf, response);
 
-    DEBUG_PRINTM(vars.response);
+    DEBUG_PRINTM(response);
 
     /* target location is at the maximum response. we must take into
     account the fact that, if the target doesn't move, the peak
     will appear at the top-left corner, not at the center (this is
     discussed in the paper). the responses wrap around cyclically. */
+    double min_val, max_val;
+    cv::Point2i min_loc, max_loc;
 #ifdef BIG_BATCH
-    cv::split(vars.response, vars.response_maps);
-
-    for (size_t i = 0; i < p_scales.size(); ++i) {
-        double min_val, max_val;
-        cv::Point2i min_loc, max_loc;
-        cv::minMaxLoc(vars.response_maps[i], &min_val, &max_val, &min_loc, &max_loc);
+    for (size_t i = 0; i < max.size(); ++i) {
+        cv::minMaxLoc(response.plane(i), &min_val, &max_val, &min_loc, &max_loc);
         DEBUG_PRINT(max_loc);
-        double weight = p_scales[i] < 1. ? p_scales[i] : 1. / p_scales[i];
-        vars.max_responses[i] = max_val * weight;
-        vars.max_locs[i] = max_loc;
+        double weight = max.scale(i) < 1. ? max.scale(i) : 1. / max.scale(i);
+        max[i].response = max_val * weight;
+        max[i].loc = max_loc;
     }
 #else
-    double min_val;
-    cv::Point2i min_loc;
-    cv::minMaxLoc(vars.response, &min_val, &vars.max_val, &min_loc, &vars.max_loc);
+    cv::minMaxLoc(response.plane(0), &min_val, &max_val, &min_loc, &max_loc);
 
-    DEBUG_PRINT(vars.max_loc);
+    DEBUG_PRINT(max_loc);
+    DEBUG_PRINT(max_val);
 
-    double weight = vars.scale < 1. ? vars.scale : 1. / vars.scale;
-    vars.max_response = vars.max_val * weight;
+    double weight = scale < 1. ? scale : 1. / scale;
+    max.response = max_val * weight;
+    max.loc = max_loc;
 #endif
-    return;
 }
 
 // ****************************************************************************
 
-std::vector<cv::Mat> KCF_Tracker::get_features(cv::Mat &patch_rgb, cv::Mat &patch_gray)
+cv::Mat KCF_Tracker::get_features(cv::Mat &input_rgb, cv::Mat &input_gray, cv::Mat *dbg_patch,
+                                  int cx, int cy, int size_x, int size_y, double scale, double angle) const
 {
+    cv::Size scaled = cv::Size(floor(size_x * scale), floor(size_y * scale));
+
+    cv::Mat patch_gray = get_subwindow(input_gray, cx, cy, scaled.width, scaled.height, angle);
+    cv::Mat patch_rgb = get_subwindow(input_rgb, cx, cy, scaled.width, scaled.height, angle);
+
+    if (dbg_patch)
+        patch_rgb.copyTo(*dbg_patch);
+
+    // resize to default size
+    if (scaled.area() > fit_size.area()) {
+        // if we downsample use  INTER_AREA interpolation
+        // note: this is just a guess - we may downsample in X and upsample in Y (or vice versa)
+        cv::resize(patch_gray, patch_gray, fit_size, 0., 0., cv::INTER_AREA);
+    } else {
+        cv::resize(patch_gray, patch_gray, fit_size, 0., 0., cv::INTER_LINEAR);
+    }
+
     // get hog(Histogram of Oriented Gradients) features
     std::vector<cv::Mat> hog_feat = FHoG::extract(patch_gray, 2, p_cell_size, 9);
 
     // get color rgb features (simple r,g,b channels)
     std::vector<cv::Mat> color_feat;
+    if ((m_use_color || m_use_cnfeat) && input_rgb.channels() == 3) {
+        // resize to default size
+        if (scaled.area() > (fit_size / p_cell_size).area()) {
+            // if we downsample use  INTER_AREA interpolation
+            cv::resize(patch_rgb, patch_rgb, fit_size / p_cell_size, 0., 0., cv::INTER_AREA);
+        } else {
+            cv::resize(patch_rgb, patch_rgb, fit_size / p_cell_size, 0., 0., cv::INTER_LINEAR);
+        }
+    }
 
-    if (m_use_color && patch_rgb.channels() == 3) {
+    if (m_use_color && input_rgb.channels() == 3) {
         // use rgb color space
         cv::Mat patch_rgb_norm;
         patch_rgb.convertTo(patch_rgb_norm, CV_32F, 1. / 255., -0.5);
@@ -624,13 +559,19 @@ std::vector<cv::Mat> KCF_Tracker::get_features(cv::Mat &patch_rgb, cv::Mat &patc
         color_feat.insert(color_feat.end(), rgb.begin(), rgb.end());
     }
 
-    if (m_use_cnfeat && patch_rgb.channels() == 3) {
+    if (m_use_cnfeat && input_rgb.channels() == 3) {
         std::vector<cv::Mat> cn_feat = CNFeat::extract(patch_rgb);
         color_feat.insert(color_feat.end(), cn_feat.begin(), cn_feat.end());
     }
 
     hog_feat.insert(hog_feat.end(), color_feat.begin(), color_feat.end());
-    return hog_feat;
+
+    int size[] = {p_num_of_feats, feature_size.height, feature_size.width};
+    cv::Mat result(3, size, CV_32F);
+    for (uint i = 0; i < hog_feat.size(); ++i)
+        hog_feat[i].copyTo(cv::Mat(size[1], size[2], CV_32FC1, result.ptr(i)));
+
+    return result;
 }
 
 cv::Mat KCF_Tracker::gaussian_shaped_labels(double sigma, int dim1, int dim2)
@@ -650,25 +591,17 @@ cv::Mat KCF_Tracker::gaussian_shaped_labels(double sigma, int dim1, int dim2)
     }
 
     // rotate so that 1 is at top-left corner (see KCF paper for explanation)
-#ifdef CUFFT
-    cv::Mat tmp = circshift(labels, range_x[0], range_y[0]);
-    tmp.copyTo(p_rot_labels);
-
-    assert(p_rot_labels.at<float>(0, 0) >= 1.f - 1e-10f);
-    return tmp;
-#else
-    cv::Mat rot_labels = circshift(labels, range_x[0], range_y[0]);
+    MatDynMem rot_labels = circshift(labels, range_x[0], range_y[0]);
     // sanity check, 1 at top left corner
     assert(rot_labels.at<float>(0, 0) >= 1.f - 1e-10f);
 
     return rot_labels;
-#endif
 }
 
-cv::Mat KCF_Tracker::circshift(const cv::Mat &patch, int x_rot, int y_rot)
+cv::Mat KCF_Tracker::circshift(const cv::Mat &patch, int x_rot, int y_rot) const
 {
-    cv::Mat rot_patch(patch.size(), CV_32FC1);
-    cv::Mat tmp_x_rot(patch.size(), CV_32FC1);
+    cv::Mat rot_patch(patch.size(), patch.type());
+    cv::Mat tmp_x_rot(patch.size(), patch.type());
 
     // circular rotate x-axis
     if (x_rot < 0) {
@@ -746,14 +679,18 @@ cv::Mat KCF_Tracker::cosine_window_function(int dim1, int dim2)
 // Returns sub-window of image input centered at [cx, cy] coordinates),
 // with size [width, height]. If any pixels are outside of the image,
 // they will replicate the values at the borders.
-cv::Mat KCF_Tracker::get_subwindow(const cv::Mat &input, int cx, int cy, int width, int height)
+cv::Mat KCF_Tracker::get_subwindow(const cv::Mat &input, int cx, int cy, int width, int height, double angle) const
 {
     cv::Mat patch;
 
-    int x1 = cx - width / 2;
-    int y1 = cy - height / 2;
-    int x2 = cx + width / 2;
-    int y2 = cy + height / 2;
+    cv::Size sz(width, height);
+    cv::RotatedRect rr(cv::Point2f(cx, cy), sz, angle);
+    cv::Rect bb = rr.boundingRect();
+
+    int x1 = bb.tl().x;
+    int y1 = bb.tl().y;
+    int x2 = bb.br().x;
+    int y2 = bb.br().y;
 
     // out of image
     if (x1 >= input.cols || y1 >= input.rows || x2 < 0 || y2 < 0) {
@@ -787,9 +724,18 @@ cv::Mat KCF_Tracker::get_subwindow(const cv::Mat &input, int cx, int cy, int wid
 
     if (x2 - x1 == 0 || y2 - y1 == 0)
         patch = cv::Mat::zeros(height, width, CV_32FC1);
-    else
+    else {
         cv::copyMakeBorder(input(cv::Range(y1, y2), cv::Range(x1, x2)), patch, top, bottom, left, right,
                            cv::BORDER_REPLICATE);
+        //      imshow( "copyMakeBorder", patch);
+        //      cv::waitKey();
+    }
+
+    cv::Point2f src_pts[4];
+    cv::RotatedRect(cv::Point2f(patch.size()) / 2.0, sz, angle).points(src_pts);
+    cv::Point2f dst_pts[3] = { cv::Point2f(0, height), cv::Point2f(0, 0),  cv::Point2f(width, 0)};
+    auto rot = cv::getAffineTransform(src_pts, dst_pts);
+    cv::warpAffine(patch, patch, rot, sz);
 
     // sanity check
     assert(patch.cols == width && patch.rows == height);
@@ -797,108 +743,49 @@ cv::Mat KCF_Tracker::get_subwindow(const cv::Mat &input, int cx, int cy, int wid
     return patch;
 }
 
-void KCF_Tracker::geometric_transformations(cv::Mat &patch, int size_x, int size_y, int angle, bool allow_debug)
-{
-    if (m_use_angle) {
-        cv::Point2f center((patch.cols - 1) / 2., (patch.rows - 1) / 2.);
-        cv::Mat r = cv::getRotationMatrix2D(center, angle, 1.0);
-
-        cv::warpAffine(patch, patch, r, cv::Size(patch.cols, patch.rows), cv::INTER_LINEAR, cv::BORDER_REPLICATE);
-    }
-
-    // resize to default size
-    if (patch.channels() != 3) {
-        if (patch.cols / size_x > 1.) {
-            // if we downsample use  INTER_AREA interpolation
-            cv::resize(patch, patch, cv::Size(size_x, size_y), 0., 0., cv::INTER_AREA);
-        } else {
-            cv::resize(patch, patch, cv::Size(size_x, size_y), 0., 0., cv::INTER_LINEAR);
-        }
-    } else {
-        if (patch.cols / size_x > 1.) {
-            // if we downsample use  INTER_AREA interpolation
-            cv::resize(patch, patch, cv::Size(size_x / p_cell_size, size_y / p_cell_size), 0., 0., cv::INTER_AREA);
-        } else {
-            cv::resize(patch, patch, cv::Size(size_x / p_cell_size, size_y / p_cell_size), 0., 0., cv::INTER_LINEAR);
-        }
-        if (m_visual_debug && allow_debug) {
-            cv::Mat input_clone = patch.clone();
-            cv::resize(input_clone, input_clone, cv::Size(p_debug_image_size, p_debug_image_size), 0., 0.,
-                       cv::INTER_LINEAR);
-
-            std::string angle_string = std::to_string(p_current_angle + angle);
-
-            cv::putText(input_clone, angle_string, cv::Point(1, input_clone.rows - 5), cv::FONT_HERSHEY_COMPLEX_SMALL,
-                        0.5, cv::Scalar(0, 255, 0), 1);
-
-            p_debug_subwindows.push_back(input_clone);
-        }
-    }
-}
-
-void KCF_Tracker::gaussian_correlation(struct ThreadCtx &vars, const ComplexMat &xf, const ComplexMat &yf, double sigma,
-                                       bool auto_correlation)
+void KCF_Tracker::GaussianCorrelation::operator()(ComplexMat &result, const ComplexMat &xf, const ComplexMat &yf,
+                                                  double sigma, bool auto_correlation, const KCF_Tracker &kcf)
 {
-    xf.sqr_norm(vars.xf_sqr_norm);
+    TRACE("");
+    DEBUG_PRINTM(xf);
+    DEBUG_PRINT(xf_sqr_norm.num_elem);
+    xf.sqr_norm(xf_sqr_norm);
+    for (uint s = 0; s < xf.n_scales; ++s)
+        DEBUG_PRINT(xf_sqr_norm[s]);
     if (auto_correlation) {
-        vars.yf_sqr_norm.hostMem()[0] = vars.xf_sqr_norm.hostMem()[0];
+        yf_sqr_norm = xf_sqr_norm;
     } else {
-        yf.sqr_norm(vars.yf_sqr_norm);
+        DEBUG_PRINTM(yf);
+        yf.sqr_norm(yf_sqr_norm);
     }
-    vars.xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
-    DEBUG_PRINTM(vars.xyf);
-    fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features.deviceMem() : nullptr, vars.stream);
-#ifdef CUFFT
-    cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(),
-                              vars.xf_sqr_norm.deviceMem(), vars.xf_sqr_norm.deviceMem(), sigma, xf.n_channels,
-                              xf.n_scales, p_roi.height, p_roi.width, vars.stream);
-#else
-    // ifft2 and sum over 3rd dimension, we dont care about individual channels
-    DEBUG_PRINTM(vars.ifft2_res);
-    cv::Mat xy_sum;
-    if (xf.channels() != p_num_scales * p_num_of_feats)
-        xy_sum.create(vars.ifft2_res.size(), CV_32FC1);
-    else
-        xy_sum.create(vars.ifft2_res.size(), CV_32FC(p_scales.size()));
-    xy_sum.setTo(0);
-    for (int y = 0; y < vars.ifft2_res.rows; ++y) {
-        float *row_ptr = vars.ifft2_res.ptr<float>(y);
-        float *row_ptr_sum = xy_sum.ptr<float>(y);
-        for (int x = 0; x < vars.ifft2_res.cols; ++x) {
-            for (int sum_ch = 0; sum_ch < xy_sum.channels(); ++sum_ch) {
-                row_ptr_sum[(x * xy_sum.channels()) + sum_ch] += std::accumulate(
-                    row_ptr + x * vars.ifft2_res.channels() + sum_ch * (vars.ifft2_res.channels() / xy_sum.channels()),
-                    (row_ptr + x * vars.ifft2_res.channels() +
-                     (sum_ch + 1) * (vars.ifft2_res.channels() / xy_sum.channels())),
-                    0.f);
-            }
-        }
-    }
-    DEBUG_PRINTM(xy_sum);
+    for (uint s = 0; s < yf.n_scales; ++s)
+        DEBUG_PRINTM(yf_sqr_norm[s]);
+    xyf = auto_correlation ? xf.sqr_mag() : xf * yf.conj(); // xf.muln(yf.conj());
+    DEBUG_PRINTM(xyf);
 
-    std::vector<cv::Mat> scales;
-    cv::split(xy_sum, scales);
+    // ifft2 and sum over 3rd dimension, we dont care about individual channels
+    ComplexMat xyf_sum = xyf.sum_over_channels();
+    DEBUG_PRINTM(xyf_sum);
+    kcf.fft.inverse(xyf_sum, ifft_res);
+    DEBUG_PRINTM(ifft_res);
 
     float numel_xf_inv = 1.f / (xf.cols * xf.rows * (xf.channels() / xf.n_scales));
     for (uint i = 0; i < xf.n_scales; ++i) {
-        cv::Mat in_roi(vars.in_all, cv::Rect(0, i * scales[0].rows, scales[0].cols, scales[0].rows));
-        cv::exp(
-            -1. / (sigma * sigma) *
-                cv::max((double(vars.xf_sqr_norm.hostMem()[i] + vars.yf_sqr_norm.hostMem()[0]) - 2 * scales[i]) * double(numel_xf_inv), 0),
-            in_roi);
-        DEBUG_PRINTM(in_roi);
+        cv::Mat plane = ifft_res.plane(i);
+        DEBUG_PRINT(ifft_res.plane(i));
+        cv::exp(-1. / (sigma * sigma) * cv::max((xf_sqr_norm[i] + yf_sqr_norm[0] - 2 * ifft_res.plane(i))
+                * numel_xf_inv, 0), plane);
+        DEBUG_PRINTM(plane);
     }
-#endif
-    DEBUG_PRINTM(vars.in_all);
-    fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res.deviceMem() : nullptr,
-                vars.stream);
-    return;
+
+    kcf.fft.forward(ifft_res, result);
 }
 
 float get_response_circular(cv::Point2i &pt, cv::Mat &response)
 {
     int x = pt.x;
     int y = pt.y;
+    assert(response.dims == 2); // ensure .cols and .rows are valid
     if (x < 0) x = response.cols + x;
     if (y < 0) y = response.rows + y;
     if (x >= response.cols) x = x - response.cols;
@@ -907,7 +794,7 @@ float get_response_circular(cv::Point2i &pt, cv::Mat &response)
     return response.at<float>(y, x);
 }
 
-cv::Point2f KCF_Tracker::sub_pixel_peak(cv::Point &max_loc, cv::Mat &response)
+cv::Point2f KCF_Tracker::sub_pixel_peak(cv::Point &max_loc, cv::Mat &response) const
 {
     // find neighbourhood of max_loc (response is circular)
     // 1 2 3
@@ -946,18 +833,25 @@ cv::Point2f KCF_Tracker::sub_pixel_peak(cv::Point &max_loc, cv::Mat &response)
     float a = x.at<float>(0), b = x.at<float>(1), c = x.at<float>(2), d = x.at<float>(3), e = x.at<float>(4);
 
     cv::Point2f sub_peak(max_loc.x, max_loc.y);
-    if (b > 0 || b < 0) {
+    if (4 * a * c - b * b > p_floating_error) {
         sub_peak.y = ((2.f * a * e) / b - d) / (b - (4 * a * c) / b);
         sub_peak.x = (-2 * c * sub_peak.y - e) / b;
+        if (fabs(sub_peak.x - max_loc.x) > 1 ||
+            fabs(sub_peak.y - max_loc.y) > 1)
+            sub_peak = max_loc;
     }
 
     return sub_peak;
 }
 
-double KCF_Tracker::sub_grid_scale(uint index)
+double KCF_Tracker::sub_grid_scale(uint max_index)
 {
     cv::Mat A, fval;
-    if (index >= p_scales.size()) {
+    const auto &vec = d->IF_BIG_BATCH(threadctxs[0].max, threadctxs);
+    uint index = vec.getScaleIdx(max_index);
+    uint angle_idx = vec.getAngleIdx(max_index);
+
+    if (index >= vec.size()) {
         // interpolate from all values
         // fit 1d quadratic function f(x) = a*x^2 + b*x + c
         A.create(p_scales.size(), 3, CV_32FC1);
@@ -966,11 +860,7 @@ double KCF_Tracker::sub_grid_scale(uint index)
             A.at<float>(i, 0) = float(p_scales[i] * p_scales[i]);
             A.at<float>(i, 1) = float(p_scales[i]);
             A.at<float>(i, 2) = 1;
-#ifdef BIG_BATCH
-            fval.at<float>(i) = p_threadctxs.back().max_responses[i];
-#else
-            fval.at<float>(i) = p_threadctxs[i].max_response;
-#endif
+            fval.at<float>(i) = d->IF_BIG_BATCH(threadctxs[0].max[i].response, threadctxs(i, angle_idx).max.response);
         }
     } else {
         // only from neighbours
@@ -983,14 +873,14 @@ double KCF_Tracker::sub_grid_scale(uint index)
              p_scales[index + 1] * p_scales[index + 1], p_scales[index + 1], 1);
 #ifdef BIG_BATCH
         fval = (cv::Mat_<float>(3, 1) <<
-                p_threadctxs.back().max_responses[index - 1],
-                p_threadctxs.back().max_responses[index + 0],
-                p_threadctxs.back().max_responses[index + 1]);
+                d->threadctxs[0].max(index - 1, angle_idx).response,
+                d->threadctxs[0].max(index + 0, angle_idx).response,
+                d->threadctxs[0].max(index + 1, angle_idx).response);
 #else
         fval = (cv::Mat_<float>(3, 1) <<
-                p_threadctxs[index - 1].max_response,
-                p_threadctxs[index + 0].max_response,
-                p_threadctxs[index + 1].max_response);
+                d->threadctxs(index - 1, angle_idx).max.response,
+                d->threadctxs(index + 0, angle_idx).max.response,
+                d->threadctxs(index + 1, angle_idx).max.response);
 #endif
     }
 
index 7600e40b423fa4e23836317e3e8a839c3c06fde7..4a7f9f0e499894d77ed1b0d70e4acc3034bc78cb 100644 (file)
--- a/src/kcf.h
+++ b/src/kcf.h
@@ -6,67 +6,54 @@
 #include <memory>
 #include "fhog.hpp"
 
+#include "complexmat.hpp"
 #ifdef CUFFT
-#include "complexmat.cuh"
-#include "cuda_functions.cuh"
-#include "cuda/cuda_error_check.cuh"
+#include "cuda_error_check.hpp"
 #include <cuda_runtime.h>
-#else
-#include "complexmat.hpp"
 #endif
 
 #include "cnfeat.hpp"
 #include "fft.h"
-#include "threadctx.hpp"
 #include "pragmas.h"
 
-struct BBox_c {
+class Kcf_Tracker_Private;
+struct ThreadCtx;
+
+struct BBox_c
+{
     double cx, cy, w, h, a;
 
+    inline cv::Point2d center() const { return cv::Point2d(cx, cy); }
+
     inline void scale(double factor)
     {
         cx *= factor;
         cy *= factor;
-        w *= factor;
-        h *= factor;
+        w  *= factor;
+        h  *= factor;
     }
 
-    inline void scale_x(double factor)
+    inline cv::Rect get_rect()
     {
-        cx *= factor;
-        w *= factor;
-    }
-
-    inline void scale_y(double factor)
-    {
-        cy *= factor;
-        h *= factor;
+        return cv::Rect(int(cx-w/2.), int(cy-h/2.), int(w), int(h));
     }
 
-    inline cv::Rect get_rect() { return cv::Rect(int(cx - w / 2.), int(cy - h / 2.), int(w), int(h)); }
 };
 
-class KCF_Tracker {
-  public:
-    bool m_debug{false};
-    bool m_visual_debug{false};
-    bool m_use_scale{true};
-    bool m_use_angle{false}; // Doesn't work with FFTW-BIG version
-    bool m_use_color{true};
-#ifdef ASYNC
-    bool m_use_multithreading{true};
-#else
-    bool m_use_multithreading {false};
-#endif //ASYNC
-    bool m_use_subpixel_localization {true};
-    bool m_use_subgrid_scale {true};
-    bool m_use_cnfeat {true};
-    bool m_use_linearkernel {false};
-#ifdef CUFFT
-    bool m_use_cuda{true};
-#else
-    bool m_use_cuda{false};
-#endif
+class KCF_Tracker
+{
+    friend ThreadCtx;
+    friend Kcf_Tracker_Private;
+public:
+    bool m_debug {false};
+    enum class vd {NONE, PATCH, RESPONSE} m_visual_debug {vd::NONE};
+    const bool m_use_scale {true};
+    const bool m_use_color {true};
+    const bool m_use_subpixel_localization {true};
+    const bool m_use_subgrid_scale {true};
+    const bool m_use_cnfeat {true};
+    const bool m_use_linearkernel {false};
+    const int p_cell_size = 4;            //4 for hog (= bin_size)
 
     /*
     padding             ... extra area surrounding the target           (1.5)
@@ -76,88 +63,119 @@ class KCF_Tracker {
     output_sigma_factor ... spatial bandwidth (proportional to target)  (0.1)
     cell_size           ... hog cell size                               (4)
     */
-    KCF_Tracker(double padding, double kernel_sigma, double lambda, double interp_factor, double output_sigma_factor,
-                int cell_size);
+    KCF_Tracker(double padding, double kernel_sigma, double lambda, double interp_factor, double output_sigma_factor, int cell_size);
     KCF_Tracker();
     ~KCF_Tracker();
 
     // Init/re-init methods
-    void init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int fit_size_y);
-    void setTrackerPose(BBox_c &bbox, cv::Mat &img, int fit_size_x, int fit_size_y);
-    void updateTrackerPosition(BBox_c &bbox);
+    void init(cv::Mat & img, const cv::Rect & bbox, int fit_size_x = -1, int fit_size_y = -1);
+    void setTrackerPose(BBox_c & bbox, cv::Mat & img, int fit_size_x = -1, int fit_size_y = -1);
+    void updateTrackerPosition(BBox_c & bbox);
 
     // frame-to-frame object tracking
-    void track(cv::Mat &img);
+    void track(cv::Mat & img);
     BBox_c getBBox();
     double getFilterResponse() const; // Measure of tracking accuracy
 
-  private:
+private:
     Fft &fft;
 
-    BBox_c p_pose;
+    // Initial pose of tracked object in internal image coordinates
+    // (scaled by p_downscale_factor if p_resize_image)
+    BBox_c p_init_pose;
+
+    // Information to calculate current pose of the tracked object
+    cv::Point2d p_current_center;
+    double p_current_scale = 1.;
+    double p_current_angle = 0.;
+
     double max_response = -1.;
 
     bool p_resize_image = false;
-    bool p_fit_to_pw2 = false;
 
     const double p_downscale_factor = 0.5;
-    double p_scale_factor_x = 1;
-    double p_scale_factor_y = 1;
     const double p_floating_error = 0.0001;
 
-    double p_padding = 1.5;
-    double p_output_sigma_factor = 0.1;
+    const double p_padding = 1.5;
+    const double p_output_sigma_factor = 0.1;
     double p_output_sigma;
-    double p_kernel_sigma = 0.5;   // def = 0.5
-    double p_lambda = 1e-4;        // regularization in learning step
-    double p_interp_factor = 0.02; // def = 0.02, linear interpolation factor for adaptation
-    int p_cell_size = 4;           // 4 for hog (= bin_size)
-    cv::Size p_windows_size;
-    uint p_num_scales {7};
-    double p_scale_step = 1.02;
-    double p_current_scale = 1.;
+    const double p_kernel_sigma = 0.5;    //def = 0.5
+    const double p_lambda = 1e-4;         //regularization in learning step
+    const double p_interp_factor = 0.02;  //def = 0.02, linear interpolation factor for adaptation
+    cv::Size p_windows_size;              // size of the patch to find the tracked object in
+    cv::Size fit_size;                    // size to which rescale the patch for better FFT performance
+
+    const uint p_num_scales = m_use_scale ? 7 : 1;
+    const double p_scale_step = 1.02;
     double p_min_max_scale[2];
     std::vector<double> p_scales;
-    int p_current_angle = 0;
-    uint p_num_angles {5};
-    int p_angle_min = -20, p_angle_max = 20;
-    int p_angle_step = 10;
-    std::vector<int> p_angles;
-
-    // for visual debug
-    int p_debug_image_size = 100;
-    std::vector<cv::Mat> p_debug_scale_responses;
-    std::vector<cv::Mat> p_debug_subwindows;
-
-    //for big batch
-    int p_num_of_feats = 31 + (m_use_color ? 3 : 0) + (m_use_cnfeat ? 10 : 0);
-    cv::Size p_roi;
-
-    std::vector<ThreadCtx> p_threadctxs;
-
-    //CUDA compability
-    cv::Mat p_rot_labels;
-    DynMem p_rot_labels_data;
-
-    // model
-    ComplexMat p_yf;
-    ComplexMat p_model_alphaf;
-    ComplexMat p_model_alphaf_num;
-    ComplexMat p_model_alphaf_den;
-    ComplexMat p_model_xf;
-    ComplexMat p_xf;
+
+    const uint p_num_angles = 3;
+    const int p_angle_step = 10;
+    std::vector<double> p_angles;
+
+    const int p_num_of_feats = 31 + (m_use_color ? 3 : 0) + (m_use_cnfeat ? 10 : 0);
+    cv::Size feature_size;
+
+    std::unique_ptr<Kcf_Tracker_Private> d;
+
+    class Model {
+        cv::Size feature_size;
+        uint height, width, n_feats;
+    public:
+        ComplexMat yf {height, width, 1};
+        ComplexMat model_alphaf {height, width, 1};
+        ComplexMat model_alphaf_num {height, width, 1};
+        ComplexMat model_alphaf_den {height, width, 1};
+        ComplexMat model_xf {height, width, n_feats};
+        ComplexMat xf {height, width, n_feats};
+
+        // Temporary variables for trainig
+        MatScaleFeats patch_feats{1, n_feats, feature_size};
+        MatScaleFeats temp{1, n_feats, feature_size};
+
+
+
+        Model(cv::Size feature_size, uint _n_feats)
+            : feature_size(feature_size)
+            , height(Fft::freq_size(feature_size).height)
+            , width(Fft::freq_size(feature_size).width)
+            , n_feats(_n_feats) {}
+    };
+
+    std::unique_ptr<Model> model;
+
+    class GaussianCorrelation {
+      public:
+        GaussianCorrelation(uint num_scales, uint num_feats, cv::Size size)
+            : xf_sqr_norm(num_scales)
+            , xyf(Fft::freq_size(size), num_feats, num_scales)
+            , ifft_res(num_scales, size)
+            , k(num_scales, size)
+        {}
+        void operator()(ComplexMat &result, const ComplexMat &xf, const ComplexMat &yf, double sigma, bool auto_correlation, const KCF_Tracker &kcf);
+
+      private:
+        DynMem xf_sqr_norm;
+        DynMem yf_sqr_norm{1};
+        ComplexMat xyf;
+        MatScales ifft_res;
+        MatScales k;
+    };
+
     //helping functions
-    void scale_track(ThreadCtx & vars, cv::Mat & input_rgb, cv::Mat & input_gray);
-    cv::Mat get_subwindow(const cv::Mat & input, int cx, int cy, int size_x, int size_y);
+    void scale_track(ThreadCtx &vars, cv::Mat &input_rgb, cv::Mat &input_gray);
+    cv::Mat get_subwindow(const cv::Mat &input, int cx, int cy, int size_x, int size_y, double angle) const;
     cv::Mat gaussian_shaped_labels(double sigma, int dim1, int dim2);
-    void gaussian_correlation(struct ThreadCtx &vars, const ComplexMat &xf, const ComplexMat &yf, double sigma,
-                              bool auto_correlation = false);
-    cv::Mat circshift(const cv::Mat &patch, int x_rot, int y_rot);
+    std::unique_ptr<GaussianCorrelation> gaussian_correlation;
+    cv::Mat circshift(const cv::Mat &patch, int x_rot, int y_rot) const;
     cv::Mat cosine_window_function(int dim1, int dim2);
-    std::vector<cv::Mat> get_features(cv::Mat &patch_rgb, cv::Mat &patch_gray);
-    void geometric_transformations(cv::Mat &patch, int size_x, int size_y, int angle = 0, bool allow_debug = true);
-    cv::Point2f sub_pixel_peak(cv::Point &max_loc, cv::Mat &response);
+    cv::Mat get_features(cv::Mat &input_rgb, cv::Mat &input_gray, cv::Mat *dbg_patch, int cx, int cy, int size_x, int size_y, double scale, double angle) const;
+    cv::Point2f sub_pixel_peak(cv::Point &max_loc, cv::Mat &response) const;
     double sub_grid_scale(uint index);
+    void resizeImgs(cv::Mat &input_rgb, cv::Mat &input_gray);
+    void train(cv::Mat input_rgb, cv::Mat input_gray, double interp_factor);
+    double findMaxReponse(uint &max_idx, cv::Point2d &new_location) const;
 };
 
-#endif // KCF_HEADER_6565467831231
+#endif //KCF_HEADER_6565467831231
index 92dd54e2a15fb87de0a87b1523352f3e73d3939d..0b23707172cbbc1033e5de02980c076041f8834f 100644 (file)
 
 #include <future>
 #include "dynmem.hpp"
-
-#ifdef CUFFT
-#include "complexmat.cuh"
-#else
+#include "kcf.h"
 #include "complexmat.hpp"
-#ifndef CUFFTW
-// For compatibility reasons between CuFFT and FFTW, OpenCVfft versions.
-typedef int *cudaStream_t;
-#endif
-#endif
+#include <vector>
+
+class KCF_Tracker;
+
+template <typename T>
+class ScaleRotVector : public std::vector<T> {
+public:
+    ScaleRotVector(const std::vector<double> &scales, const std::vector<double> &angles)
+        : scales(scales)
+        , angles(angles)
+    {}
+
+    uint getIdx(uint scale_idx, uint angle_idx) const { return angles.size() * scale_idx + angle_idx; }
+    uint getScaleIdx(uint idx) const { return idx / angles.size(); }
+    uint getAngleIdx(uint idx) const { return idx % angles.size(); }
+    T& operator()(uint scale_idx, uint angle_idx) { return std::vector<T>::at(getIdx(scale_idx, angle_idx)); }
+    double scale(uint idx) const { return scales[getScaleIdx(idx)]; }
+    double angle(uint idx) const { return angles[getAngleIdx(idx)]; }
+private:
+    const std::vector<double> scales, angles;
+};
 
 struct ThreadCtx {
   public:
-    ThreadCtx(cv::Size roi, uint num_of_feats, double scale, int angle, uint num_of_scales = 1, uint num_of_angles = 1)
-        : scale(scale), angle(angle)
-    {
-        this->xf_sqr_norm = DynMem(num_of_scales * num_of_angles * sizeof(float));
-        this->yf_sqr_norm = DynMem(sizeof(float));
-
-        uint cells_size = roi.width * roi.height * sizeof(float);
-
-#if  !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
-        CudaSafeCall(cudaStreamCreate(&this->stream));
+    ThreadCtx(cv::Size roi, uint num_features
+#ifdef BIG_BATCH
+              , const std::vector<double> &scales
+              , const std::vector<double> &angles
+#else
+              , double scale
+              , double angle
+#endif
+             )
+        : roi(roi)
+        , num_features(num_features)
+        , num_scales(IF_BIG_BATCH(scales.size(), 1))
+        , num_angles(IF_BIG_BATCH(angles.size(), 1))
+#ifdef BIG_BATCH
+        , max(scales, angles)
+        , dbg_patch(scales, angles)
+        {
+            max.resize(scales.size() * angles.size());
+            dbg_patch.resize(scales.size() * angles.size());
+        }
+#else
+        , scale(scale)
+        , angle(angle)
+        {}
 #endif
 
-#if defined(CUFFT) || defined(FFTW)
-        this->gauss_corr_res = DynMem(cells_size * num_of_scales * num_of_angles);
-        this->data_features = DynMem(cells_size * num_of_feats);
-
-        uint width_freq = roi.width / 2 + 1;
 
-        this->in_all = cv::Mat(roi.height * num_of_scales * num_of_angles, roi.width, CV_32F, this->gauss_corr_res.hostMem());
-        this->fw_all = cv::Mat(roi.height * num_of_feats, roi.width, CV_32F, this->data_features.hostMem());
-#else
-        uint width_freq = roi.width;
+    ThreadCtx(ThreadCtx &&) = default;
 
-        this->in_all = cv::Mat(roi, CV_32F);
-#endif
+    void track(const KCF_Tracker &kcf, cv::Mat &input_rgb, cv::Mat &input_gray);
+private:
+    cv::Size roi;
+    uint num_features;
+    uint num_scales;
+    uint num_angles;
+    cv::Size freq_size = Fft::freq_size(roi);
 
-        this->data_i_features = DynMem(cells_size * num_of_feats);
-        this->data_i_1ch = DynMem(cells_size * num_of_scales * num_of_angles);
+    MatScaleFeats patch_feats{num_scales * num_angles, num_features, roi};
+    MatScaleFeats temp{num_scales * num_angles, num_features, roi};
 
-        this->ifft2_res = cv::Mat(roi, CV_32FC(num_of_feats), this->data_i_features.hostMem());
-        this->response = cv::Mat(roi, CV_32FC(num_of_scales * num_of_angles), this->data_i_1ch.hostMem());
+    KCF_Tracker::GaussianCorrelation gaussian_correlation{num_scales * num_angles, num_features, roi};
 
-#ifdef CUFFT
-        this->zf.create(roi.height, width_freq, num_of_feats, num_of_scales * num_of_angles, this->stream);
-        this->kzf.create(roi.height, width_freq, num_of_scales * num_of_angles, this->stream);
-        this->kf.create(roi.height, width_freq, num_of_scales * num_of_angles, this->stream);
-#else
-        this->zf.create(roi.height, width_freq, num_of_feats, num_of_scales * num_of_angles);
-        this->kzf.create(roi.height, width_freq, num_of_scales * num_of_angles);
-        this->kf.create(roi.height, width_freq, num_of_scales * num_of_angles);
-#endif
+    MatScales ifft2_res{num_scales * num_angles, roi};
 
-#ifdef BIG_BATCH
-        if (num_of_scales > 1) {
-            this->max_responses.reserve(num_of_scales * num_of_angles);
-            this->max_locs.reserve(num_of_scales * num_of_angles);
-            this->response_maps.reserve(num_of_scales * num_of_angles);
-        }
-#endif
-    }
-    ThreadCtx(ThreadCtx &&) = default;
-    ~ThreadCtx()
-    {
-#if  !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
-        CudaSafeCall(cudaStreamDestroy(this->stream));
-#endif
-    }
+    ComplexMat zf{uint(freq_size.height), uint(freq_size.width), num_features, num_scales * num_angles};
+    ComplexMat kzf{uint(freq_size.height), uint(freq_size.width), 1, num_scales * num_angles};
 
-    const double scale;
-    const int angle;
+public:
 #ifdef ASYNC
     std::future<void> async_res;
 #endif
 
-    DynMem xf_sqr_norm, yf_sqr_norm;
-
-    cv::Mat in_all, fw_all, ifft2_res, response;
-    ComplexMat zf, kzf, kf, xyf;
+    MatScales response{num_scales * num_angles, roi};
 
-    DynMem data_i_features, data_i_1ch;
-    // CuFFT and FFTW variables
-    DynMem gauss_corr_res, data_features;
-
-    // CuFFT variables
-    cudaStream_t stream = nullptr;
-    ComplexMat model_alphaf, model_xf;
-
-    // Variables used during non big batch mode and in big batch mode with ThreadCtx in p_threadctxs in kcf  on zero index.
-    cv::Point2i max_loc;
-    double max_val, max_response;
+    struct Max {
+        cv::Point2i loc;
+        double response;
+    };
 
 #ifdef BIG_BATCH
-    // Stores value of responses, location of maximal response and response maps for each scale
-    std::vector<double> max_responses;
-    std::vector<cv::Point2i> max_locs;
-    std::vector<cv::Mat> response_maps;
+    ScaleRotVector<Max> max;
+    ScaleRotVector<cv::Mat> dbg_patch; // images for visual debugging
+#else
+    Max max;
+    const double scale, angle;
+    cv::Mat dbg_patch; // image for visual debugging
 #endif
 };