rtime.felk.cvut.cz Git - hercules2020/kcf.git/commitdiff
Merge remote-tracking branch 'upstream/master' into rotation
authorShanigen <vkaraf@gmail.com>
Thu, 13 Sep 2018 18:38:06 +0000 (20:38 +0200)
committerShanigen <vkaraf@gmail.com>
Thu, 13 Sep 2018 18:38:06 +0000 (20:38 +0200)
Big batch mode does not work.

main_vot.cpp
src/kcf.cpp
src/kcf.h
src/threadctx.hpp

diff --cc main_vot.cpp
index cf378445f84b5ec57ab0d2802eb95b4a40e7e6e0,4a8c99e38adffad8b9041712e7548339e37c4288..528d263e678a3fc3110856a1ae5bd53c13cc1530
@@@ -170,54 -169,40 +171,54 @@@ int main(int argc, char *argv[]
          std::cout << std::endl;
  
          if (visualize_delay >= 0) {
 -            cv::rectangle(image, bb_rect, CV_RGB(0,255,0), 2);
 +            cv::Point pt(bb.cx, bb.cy);
 +            cv::Size size(bb.w, bb.h);
 +            cv::RotatedRect rotatedRectangle(pt, size, bb.a);
 +
 +            cv::Point2f vertices[4];
 +            rotatedRectangle.points(vertices);
 +
 +            for (int i = 0; i < 4; i++)
 +                cv::line(image, vertices[i], vertices[(i + 1) % 4], cv::Scalar(0, 255, 0), 2);
 +            //             cv::rectangle(image, cv::Rect(bb.cx - bb.w/2., bb.cy - bb.h/2., bb.w, bb.h), CV_RGB(0,255,0),
 +            //             2);
 +            std::string angle = std::to_string(bb.a);
 +            angle.erase(angle.find_last_not_of('0') + 1, std::string::npos);
 +            angle.erase(angle.find_last_not_of('.') + 1, std::string::npos);
 +            cv::putText(image, "Frame: " + std::to_string(frames) + " " + angle + " angle",
 +                        cv::Point(0, image.rows - 1), cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 255, 0), 2);
              cv::imshow("output", image);
              int ret = cv::waitKey(visualize_delay);
 -            if (visualize_delay > 0 && ret != -1 && ret != 255)
 -                break;
 +            if (visualize_delay > 0 && ret != -1 && ret != 255) break;
          }
  
 -//        std::stringstream s;
 -//        std::string ss;
 -//        int countTmp = frames;
 -//        s << "imgs" << "/img" << (countTmp/10000);
 -//        countTmp = countTmp%10000;
 -//        s << (countTmp/1000);
 -//        countTmp = countTmp%1000;
 -//        s << (countTmp/100);
 -//        countTmp = countTmp%100;
 -//        s << (countTmp/10);
 -//        countTmp = countTmp%10;
 -//        s << (countTmp);
 -//        s << ".jpg";
 -//        s >> ss;
 -//        //set image output parameters
 -//        std::vector<int> compression_params;
 -//        compression_params.push_back(CV_IMWRITE_JPEG_QUALITY);
 -//        compression_params.push_back(90);
 -//        cv::imwrite(ss.c_str(), image, compression_params);
 +        //        std::stringstream s;
 +        //        std::string ss;
 +        //        int countTmp = frames;
 +        //        s << "imgs" << "/img" << (countTmp/10000);
 +        //        countTmp = countTmp%10000;
 +        //        s << (countTmp/1000);
 +        //        countTmp = countTmp%1000;
 +        //        s << (countTmp/100);
 +        //        countTmp = countTmp%100;
 +        //        s << (countTmp/10);
 +        //        countTmp = countTmp%10;
 +        //        s << (countTmp);
 +        //        s << ".jpg";
 +        //        s >> ss;
 +        //        //set image output parameters
 +        //        std::vector<int> compression_params;
 +        //        compression_params.push_back(CV_IMWRITE_JPEG_QUALITY);
 +        //        compression_params.push_back(90);
 +        //        cv::imwrite(ss.c_str(), image, compression_params);
      }
  
-     std::cout << "Average processing speed " << avg_time / frames << "ms. (" << 1. / (avg_time / frames) * 1000
-               << " fps)" << std::endl;
+     std::cout << "Average processing speed: " << avg_time/frames <<  "ms (" << 1./(avg_time/frames)*1000 << " fps)";
      if (groundtruth_stream.is_open()) {
-         std::cout << "Average accuracy: " << sum_accuracy / frames << std::endl;
+         std::cout << "; Average accuracy: " << sum_accuracy/frames << std::endl;
          groundtruth_stream.close();
      }
+     std::cout << std::endl;
  
      return EXIT_SUCCESS;
  }
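
For readers skimming the main_vot.cpp hunk: the new visualisation replaces the axis-aligned cv::rectangle call with a rotated box drawn edge by edge via cv::RotatedRect. A minimal standalone sketch of the same idea follows; BoundingBox and draw_rotated_bbox are illustrative names, with cx/cy/w/h/a assumed to mirror the tracker's BBox_c fields.

// Minimal sketch of the rotated-bounding-box drawing above (illustrative names,
// not part of the commit).
#include <opencv2/opencv.hpp>
#include <string>

struct BoundingBox { double cx, cy, w, h, a; }; // centre, size, angle in degrees

void draw_rotated_bbox(cv::Mat &image, const BoundingBox &bb, int frame)
{
    // Build the rotated rectangle from centre, size and angle.
    cv::RotatedRect rect(cv::Point2f(bb.cx, bb.cy), cv::Size2f(bb.w, bb.h), bb.a);

    // points() fills the four corners; connect consecutive corners with lines.
    cv::Point2f vertices[4];
    rect.points(vertices);
    for (int i = 0; i < 4; i++)
        cv::line(image, vertices[i], vertices[(i + 1) % 4], cv::Scalar(0, 255, 0), 2);

    // Overlay the frame number and current angle, as the visualisation loop does.
    cv::putText(image, "Frame: " + std::to_string(frame) + " " + std::to_string(bb.a) + " angle",
                cv::Point(0, image.rows - 1), cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 255, 0), 2);
}
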
diff --cc src/kcf.cpp
index c037137c474b310864c12bfec5a6ce9423a87899,c6b5f649932b552879909366597a77b8723400e1..b32532766901e76e536e9d7d7ca5f08e10518c03
@@@ -124,25 -117,24 +117,30 @@@ void KCF_Tracker::init(cv::Mat &img, co
      }
  
      // compute win size + fit to fhog cell size
-     p_windows_size.width = int(round(p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size);
-     p_windows_size.height = int(round(p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size);
+     p_windows_size.width = round(p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size;
+     p_windows_size.height = round(p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size;
+     p_roi.width = p_windows_size.width / p_cell_size;
+     p_roi.height = p_windows_size.height / p_cell_size;
  
 -    p_num_of_feats = 31;
 -    if (m_use_color) p_num_of_feats += 3;
 -    if (m_use_cnfeat) p_num_of_feats += 10;
 -
      p_scales.clear();
--    if (m_use_scale)
-         for (int i = -p_num_scales / 2; i <= p_num_scales / 2; ++i)
++    if (m_use_scale) {
+         for (int i = -int(p_num_scales) / 2; i <= int(p_num_scales) / 2; ++i)
              p_scales.push_back(std::pow(p_scale_step, i));
--    else
++    } else {
          p_scales.push_back(1.);
++        p_num_scales = 1;
++    }
 +
 +    if (m_use_angle) {
 +        for (int i = p_angle_min; i <= p_angle_max; i += p_angle_step)
 +            p_angles.push_back(i);
 +    } else {
 +        p_angles.push_back(0);
++        p_num_angles = 1;
 +    }
  
  #ifdef CUFFT
-     if (p_windows_size.height / p_cell_size * (p_windows_size.width / p_cell_size / 2 + 1) > 1024) {
+     if (p_roi.height * (p_roi.width / 2 + 1) > 1024) {
          std::cerr << "Window after forward FFT is too big for CUDA kernels. Please use -f to set "
                       "the window dimensions so its size is less than or equal to "
                    << 1024 * p_cell_size * p_cell_size * 2 + 1
          std::cerr << "cuFFT supports only Gaussian kernel." << std::endl;
          std::exit(EXIT_FAILURE);
      }
-     p_roi_width = p_windows_size.width / p_cell_size;
-     p_roi_height = p_windows_size.height / p_cell_size;
 +
      CudaSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
-     p_rot_labels_data = DynMem(
-         ((uint(p_windows_size.width) / p_cell_size) * (uint(p_windows_size.height) / p_cell_size)) * sizeof(float));
-     p_rot_labels = cv::Mat(p_windows_size.height / int(p_cell_size), p_windows_size.width / int(p_cell_size), CV_32FC1,
-                            p_rot_labels_data.hostMem());
- #else
-     p_xf.create(uint(p_windows_size.height / p_cell_size), (uint(p_windows_size.height / p_cell_size)) / 2 + 1,
-                 p_num_of_feats);
 +
 -#else
 -    p_xf.create(p_roi.height, p_roi.height / 2 + 1, p_num_of_feats);
+     p_rot_labels_data = DynMem(p_roi.width * p_roi.height * sizeof(float));
+     p_rot_labels = cv::Mat(p_roi, CV_32FC1, p_rot_labels_data.hostMem());
  #endif
  
  #if defined(CUFFT) || defined(FFTW)
-     p_model_xf.create(uint(p_windows_size.height / p_cell_size), (uint(p_windows_size.width / p_cell_size)) / 2 + 1,
-                       uint(p_num_of_feats));
-     p_yf.create(uint(p_windows_size.height / p_cell_size), (uint(p_windows_size.width / p_cell_size)) / 2 + 1, 1);
-     p_xf.create(uint(p_windows_size.height) / p_cell_size, (uint(p_windows_size.width) / p_cell_size) / 2 + 1,
-                 p_num_of_feats);
+     uint width = p_roi.width / 2 + 1;
  #else
-     p_model_xf.create(uint(p_windows_size.height / p_cell_size), (uint(p_windows_size.width / p_cell_size)),
-                       uint(p_num_of_feats));
-     p_yf.create(uint(p_windows_size.height / p_cell_size), (uint(p_windows_size.width / p_cell_size)), 1);
-     p_xf.create(uint(p_windows_size.height) / p_cell_size, (uint(p_windows_size.width) / p_cell_size), p_num_of_feats);
+     uint width = p_roi.width;
  #endif
-     int max = m_use_big_batch ? 2 : p_num_scales;
-     for (int i = 0; i < max; ++i) {
-         if (m_use_big_batch && i == 1) {
-             p_threadctxs.emplace_back(new ThreadCtx(p_windows_size, p_cell_size,
-                                                     p_num_of_feats * p_scales.size() * p_angles.size(), p_scales.size(),
-                                                     p_angles.size()));
-         } else {
-             p_threadctxs.emplace_back(new ThreadCtx(p_windows_size, p_cell_size, p_num_of_feats));
+     p_model_xf.create(p_roi.height, width, p_num_of_feats);
+     p_yf.create(p_roi.height, width, 1);
+     p_xf.create(p_roi.height, width, p_num_of_feats);
 -    int max = m_use_big_batch ? 2 : p_num_scales;
 -    for (int i = 0; i < max; ++i) {
 -        if (m_use_big_batch && i == 1)
 -            p_threadctxs.emplace_back(p_roi, p_num_of_feats * p_num_scales, 1, p_num_scales);
 -        else
 -            p_threadctxs.emplace_back(p_roi, p_num_of_feats, p_scales[i], 1);
++    int max1 = m_use_big_batch ? 2 : p_num_scales;
++    int max2 = m_use_big_batch ? 1 : p_num_angles;
++    for (int i = 0; i < max1; ++i) {
++        for (int j = 0; j < max2; ++j) {
++            if (m_use_big_batch && i == 1)
++                p_threadctxs.emplace_back(p_roi, p_num_of_feats * p_num_scales * p_num_angles, 1, 0, p_num_scales,
++                                          p_num_angles);
++            else
++                p_threadctxs.emplace_back(p_roi, p_num_of_feats, p_scales[i], p_angles[j]);
 +        }
      }
  
      p_current_scale = 1.;
  
      p_output_sigma = std::sqrt(p_pose.w * p_pose.h) * p_output_sigma_factor / static_cast<double>(p_cell_size);
  
-     fft.init(uint(p_windows_size.width / p_cell_size), uint(p_windows_size.height / p_cell_size), uint(p_num_of_feats),
-              uint(p_scales.size() * p_angles.size()), m_use_big_batch);
-     fft.set_window(cosine_window_function(p_windows_size.width / p_cell_size, p_windows_size.height / p_cell_size));
 -    fft.init(p_roi.width, p_roi.height, p_num_of_feats, p_num_scales, m_use_big_batch);
++    fft.init(p_roi.width, p_roi.height, p_num_of_feats, p_num_scales * p_num_angles, m_use_big_batch);
+     fft.set_window(cosine_window_function(p_roi.width, p_roi.height));
  
      // window weights, i.e. labels
--    fft.forward(
-         gaussian_shaped_labels(p_output_sigma, p_windows_size.width / p_cell_size, p_windows_size.height / p_cell_size),
-         p_yf, m_use_cuda ? p_rot_labels_data.deviceMem() : nullptr, p_threadctxs.front()->stream);
 -        gaussian_shaped_labels(p_output_sigma, p_roi.width, p_roi.height), p_yf,
 -        m_use_cuda ? p_rot_labels_data.deviceMem() : nullptr, p_threadctxs.front().stream);
++    fft.forward(gaussian_shaped_labels(p_output_sigma, p_roi.width, p_roi.height), p_yf,
++                m_use_cuda ? p_rot_labels_data.deviceMem() : nullptr, p_threadctxs.front().stream);
      DEBUG_PRINTM(p_yf);
  
      // obtain a sub-window for training initial model
-     p_threadctxs.front()->patch_feats.clear();
+     p_threadctxs.front().patch_feats.clear();
 -    get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size.width, p_windows_size.height,
 -                 p_threadctxs.front());
 +
 +    int size_x_scaled = floor(p_windows_size.width);
 +    int size_y_scaled = floor(p_windows_size.height);
 +
 +    cv::Mat patch_gray = get_subwindow(input_gray, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
 +    geometric_transformations(patch_gray, p_windows_size.width, p_windows_size.height, 0, false);
 +
 +    cv::Mat patch_rgb = cv::Mat::zeros(size_y_scaled, size_x_scaled, CV_32F);
 +    if ((m_use_color || m_use_cnfeat) && input_rgb.channels() == 3) {
 +        patch_rgb = get_subwindow(input_rgb, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
 +        geometric_transformations(patch_rgb, p_windows_size.width, p_windows_size.height, 0, false);
 +    }
 +
-     get_features(patch_rgb, patch_gray, *p_threadctxs.front());
-     fft.forward_window(p_threadctxs.front()->patch_feats, p_model_xf, p_threadctxs.front()->fw_all,
-                        m_use_cuda ? p_threadctxs.front()->data_features.deviceMem() : nullptr,
-                        p_threadctxs.front()->stream);
++    get_features(patch_rgb, patch_gray, p_threadctxs.front());
+     fft.forward_window(p_threadctxs.front().patch_feats, p_model_xf, p_threadctxs.front().fw_all,
+                        m_use_cuda ? p_threadctxs.front().data_features.deviceMem() : nullptr,
+                        p_threadctxs.front().stream);
      DEBUG_PRINTM(p_model_xf);
 +
  #if !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
-     p_threadctxs.front()->model_xf = p_model_xf;
-     p_threadctxs.front()->model_xf.set_stream(p_threadctxs.front()->stream);
-     p_yf.set_stream(p_threadctxs.front()->stream);
-     p_model_xf.set_stream(p_threadctxs.front()->stream);
-     p_xf.set_stream(p_threadctxs.front()->stream);
+     p_threadctxs.front().model_xf = p_model_xf;
+     p_threadctxs.front().model_xf.set_stream(p_threadctxs.front().stream);
+     p_yf.set_stream(p_threadctxs.front().stream);
+     p_model_xf.set_stream(p_threadctxs.front().stream);
+     p_xf.set_stream(p_threadctxs.front().stream);
  #endif
  
      if (m_use_linearkernel) {
@@@ -321,9 -282,14 +306,14 @@@ BBox_c KCF_Tracker::getBBox(
      return tmp;
  }
  
+ double KCF_Tracker::getFilterResponse() const
+ {
+     return this->max_response;
+ }
  void KCF_Tracker::track(cv::Mat &img)
  {
 -    if (m_debug) std::cout << "NEW FRAME" << '\n';
 +    if (m_debug || m_visual_debug) std::cout << "\nNEW FRAME" << std::endl;
      cv::Mat input_gray, input_rgb = img.clone();
      if (img.channels() == 3) {
          cv::cvtColor(img, input_gray, CV_BGR2GRAY);
              cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_LINEAR);
          }
      }
--
-     double max_response = -1.;
-     uint scale_index = 0;
-     uint angle_index = 0;
+     max_response = -1.;
+     ThreadCtx *max = nullptr;
      cv::Point2i *max_response_pt = nullptr;
      cv::Mat *max_response_map = nullptr;
  
-     if (m_use_multithreading) {
-         std::vector<std::future<void>> async_res(p_scales.size());
-         for (auto it = p_threadctxs.begin(); it != p_threadctxs.end(); ++it) {
-             uint index = uint(std::distance(p_threadctxs.begin(), it));
-             async_res[index] = std::async(std::launch::async, [this, &input_gray, &input_rgb, index, it]() -> void {
-                 return scale_track(*(*it), input_rgb, input_gray, this->p_scales[index]);
-             });
-         }
-         for (auto it = p_threadctxs.begin(); it != p_threadctxs.end(); ++it) {
-             uint index = uint(std::distance(p_threadctxs.begin(), it));
-             async_res[index].wait();
-             if ((*it)->max_response > max_response) {
-                 max_response = (*it)->max_response;
-                 max_response_pt = &(*it)->max_loc;
-                 max_response_map = &(*it)->response;
-                 scale_index = index;
-             }
+ #ifdef ASYNC
+     for (auto &it : p_threadctxs)
+         it.async_res = std::async(std::launch::async, [this, &input_gray, &input_rgb, &it]() -> void {
+             scale_track(it, input_rgb, input_gray);
+         });
+     for (auto const &it : p_threadctxs)
+         it.async_res.wait();
 -
+ #else  // !ASYNC
 -    // FIXME: Iterate correctly in big batch mode - perhaps have only one element in the list
+     NORMAL_OMP_PARALLEL_FOR
 -    for (uint i = 0; i < p_threadctxs.size(); ++i)
++    for (uint i = m_use_big_batch ? 1 : 0; i < p_threadctxs.size(); ++i)
+         scale_track(p_threadctxs[i], input_rgb, input_gray);
+ #endif
+ #ifndef BIG_BATCH
+     for (auto &it : p_threadctxs) {
+         if (it.max_response > max_response) {
+             max_response = it.max_response;
+             max_response_pt = &it.max_loc;
+             max_response_map = &it.response;
+             max = &it;
          }
-     } else {
-         uint start = m_use_big_batch ? 1 : 0;
-         uint end1 = m_use_big_batch ? 2 : uint(p_scales.size());
-         uint end2 = m_use_big_batch ? 1 : uint(p_angles.size());
-         NORMAL_OMP_PARALLEL_FOR
-         for (uint i = start; i < end1; ++i) {
-             auto it = p_threadctxs.begin();
-             std::advance(it, i);
-             for (size_t j = 0; j < end2; ++j) {
-                 scale_track(*(*it), input_rgb, input_gray, this->p_scales[i], this->p_angles[j]);
-                 if (m_use_big_batch) {
-                     for (uint x = 0; x < p_scales.size(); ++x) {
-                         for (uint k = 0; k < p_angles.size(); ++k) {
-                             if ((*it)->max_responses[x + k] > max_response) {
-                                 max_response = (*it)->max_responses[x + k];
-                                 max_response_pt = &(*it)->max_locs[x + k];
-                                 max_response_map = &(*it)->response_maps[x + k];
-                                 scale_index = x;
-                                 angle_index = k;
-                             }
-                         }
-                     }
-                 } else {
-                     NORMAL_OMP_CRITICAL
-                     {
-                         if ((*it)->max_response > max_response) {
-                             max_response = (*it)->max_response;
-                             max_response_pt = &(*it)->max_loc;
-                             max_response_map = &(*it)->response;
-                             scale_index = i;
-                             angle_index = j;
-                         }
-                     }
-                 }
+     }
+ #else
 -    // FIXME: Iterate correctly in big batch mode - perhaps have only one element in the list
 -    for (uint j = 0; j < p_scales.size(); ++j) {
 -        if (p_threadctxs[0].max_responses[j] > max_response) {
 -            max_response = p_threadctxs[0].max_responses[j];
 -            max_response_pt = &p_threadctxs[0].max_locs[j];
 -            max_response_map = &p_threadctxs[0].response_maps[j];
 -            max = &p_threadctxs[0];
++    for (uint j = 0; j < p_num_scales; ++j) {
++        for (uint k = 0; k < p_num_angles; ++k) {
++            if (p_threadctxs.back().max_responses[j + k] > max_response) {
++                max_response = p_threadctxs.back().max_responses[j + k];
++                max_response_pt = &p_threadctxs.back().max_locs[j + k];
++                max_response_map = &p_threadctxs.back().response_maps[j + k];
 +            }
          }
-         if (m_visual_debug) {
-             cv::Mat all_responses(cv::Size(p_angles.size() * p_debug_image_size, p_scales.size() * p_debug_image_size),
-                                   p_debug_scale_responses[0].type(), cv::Scalar::all(0));
-             cv::Mat all_subwindows(cv::Size(p_angles.size() * p_debug_image_size, p_scales.size() * p_debug_image_size),
-                                    p_debug_subwindows[0].type(), cv::Scalar::all(0));
-             for (size_t i = 0; i < p_scales.size(); ++i) {
-                 for (size_t j = 0; j < p_angles.size(); ++j) {
-                     cv::Mat in_roi(all_responses, cv::Rect(j * p_debug_image_size, i * p_debug_image_size,
-                                                            p_debug_image_size, p_debug_image_size));
-                     p_debug_scale_responses[5 * i + j].copyTo(in_roi);
-                     in_roi = all_subwindows(cv::Rect(j * p_debug_image_size, i * p_debug_image_size, p_debug_image_size,
-                                                      p_debug_image_size));
-                     p_debug_subwindows[5 * i + j].copyTo(in_roi);
-                 }
+     }
++    max = &p_threadctxs.back();
+ #endif
++    if (m_visual_debug) {
++        cv::Mat all_responses(cv::Size(p_num_angles* p_debug_image_size, p_num_scales * p_debug_image_size),
++                              p_debug_scale_responses[0].type(), cv::Scalar::all(0));
++        cv::Mat all_subwindows(cv::Size(p_num_angles* p_debug_image_size, p_num_scales* p_debug_image_size),
++                               p_debug_subwindows[0].type(), cv::Scalar::all(0));
++        for (size_t i = 0; i < p_num_scales; ++i) {
++            for (size_t j = 0; j < p_num_angles; ++j) {
++                cv::Mat in_roi(all_responses, cv::Rect(j * p_debug_image_size, i * p_debug_image_size,
++                                                       p_debug_image_size, p_debug_image_size));
++                p_debug_scale_responses[5 * i + j].copyTo(in_roi);
++                in_roi = all_subwindows(
++                    cv::Rect(j * p_debug_image_size, i * p_debug_image_size, p_debug_image_size, p_debug_image_size));
++                p_debug_subwindows[5 * i + j].copyTo(in_roi);
 +            }
-             cv::namedWindow("All subwindows", CV_WINDOW_AUTOSIZE);
-             cv::imshow("All subwindows", all_subwindows);
-             cv::namedWindow("All responses", CV_WINDOW_AUTOSIZE);
-             cv::imshow("All responses", all_responses);
-             cv::waitKey();
-             p_debug_scale_responses.clear();
-             p_debug_subwindows.clear();
 +        }
++        cv::namedWindow("All subwindows", CV_WINDOW_AUTOSIZE);
++        cv::imshow("All subwindows", all_subwindows);
++        cv::namedWindow("All responses", CV_WINDOW_AUTOSIZE);
++        cv::imshow("All responses", all_responses);
++        cv::waitKey();
++        p_debug_scale_responses.clear();
++        p_debug_subwindows.clear();
 +    }
  
      DEBUG_PRINTM(*max_response_map);
      DEBUG_PRINT(*max_response_pt);
      cv::Point2f new_location(max_response_pt->x, max_response_pt->y);
      DEBUG_PRINT(new_location);
  
-     if (m_use_subpixel_localization) new_location = sub_pixel_peak(*max_response_pt, *max_response_map);
+     if (m_use_subpixel_localization)
+         new_location = sub_pixel_peak(*max_response_pt, *max_response_map);
      DEBUG_PRINT(new_location);
  
 +    if (m_visual_debug) std::cout << "Old p_pose, cx: " << p_pose.cx << " cy: " << p_pose.cy << std::endl;
 +
      p_pose.cx += p_current_scale * p_cell_size * double(new_location.x);
      p_pose.cy += p_current_scale * p_cell_size * double(new_location.y);
 +
 +    if (m_visual_debug) std::cout << "New p_pose, cx: " << p_pose.cx << " cy: " << p_pose.cy << std::endl;
 +
      if (p_fit_to_pw2) {
          if (p_pose.cx < 0) p_pose.cx = 0;
          if (p_pose.cx > (img.cols * p_scale_factor_x) - 1) p_pose.cx = (img.cols * p_scale_factor_x) - 1;
      if (p_current_scale < p_min_max_scale[0]) p_current_scale = p_min_max_scale[0];
      if (p_current_scale > p_min_max_scale[1]) p_current_scale = p_min_max_scale[1];
  
-     p_current_angle = (p_current_angle + p_angles[angle_index]) < 0
-                           ? -std::abs(p_current_angle + p_angles[angle_index]) % 360
-                           : (p_current_angle + p_angles[angle_index]) % 360;
++    p_current_angle = (p_current_angle + max->angle) < 0
++                          ? -std::abs(p_current_angle + max->angle) % 360
++                          : (p_current_angle + max->angle) % 360;
 +
      // obtain a subwindow for training at newly estimated target position
-     p_threadctxs.front()->patch_feats.clear();
 +    int size_x_scaled = floor(p_windows_size.width * p_current_scale);
 +    int size_y_scaled = floor(p_windows_size.height * p_current_scale);
 +
 +    cv::Mat patch_gray = get_subwindow(input_gray, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
 +    geometric_transformations(patch_gray, p_windows_size.width, p_windows_size.height, p_current_angle, false);
 +
 +    cv::Mat patch_rgb = cv::Mat::zeros(size_y_scaled, size_x_scaled, CV_32F);
 +    if ((m_use_color || m_use_cnfeat) && input_rgb.channels() == 3) {
 +        patch_rgb = get_subwindow(input_rgb, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
 +        geometric_transformations(patch_rgb, p_windows_size.width, p_windows_size.height, p_current_angle, false);
 +    }
 +
-     get_features(patch_rgb, patch_gray, *p_threadctxs.front());
-     fft.forward_window(p_threadctxs.front()->patch_feats, p_xf, p_threadctxs.front()->fw_all,
-                        m_use_cuda ? p_threadctxs.front()->data_features.deviceMem() : nullptr,
-                        p_threadctxs.front()->stream);
+     p_threadctxs.front().patch_feats.clear();
 -    get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size.width, p_windows_size.height,
 -                 p_threadctxs.front(), p_current_scale);
++    get_features(patch_rgb, patch_gray, p_threadctxs.front());
+     fft.forward_window(p_threadctxs.front().patch_feats, p_xf, p_threadctxs.front().fw_all,
+                        m_use_cuda ? p_threadctxs.front().data_features.deviceMem() : nullptr, p_threadctxs.front().stream);
  
      // subsequent frames, interpolate model
      p_model_xf = p_model_xf * float((1. - p_interp_factor)) + p_xf * float(p_interp_factor);
@@@ -535,43 -439,14 +504,42 @@@ void KCF_Tracker::scale_track(ThreadCt
  {
      if (m_use_big_batch) {
          vars.patch_feats.clear();
-         std::cout << "WE ARE HERE BOIS" << std::endl;
          BIG_BATCH_OMP_PARALLEL_FOR
 -        for (uint i = 0; i < p_num_scales; ++i) {
 -            get_features(input_rgb, input_gray, this->p_pose.cx, this->p_pose.cy, this->p_windows_size.width,
 -                         this->p_windows_size.height, vars, this->p_current_scale * this->p_scales[i]);
 +        for (uint i = 0; i < this->p_scales.size(); ++i) {
 +            for (uint j = 0; j < this->p_angles.size(); ++j) {
 +                int size_x_scaled = floor(this->p_windows_size.width * this->p_current_scale * this->p_scales[i]);
 +                int size_y_scaled = floor(this->p_windows_size.height * this->p_current_scale * this->p_scales[i]);
 +
 +                cv::Mat patch_gray =
 +                    get_subwindow(input_gray, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
 +                geometric_transformations(patch_gray, p_windows_size.width, p_windows_size.height,
 +                                          p_current_scale * this->p_scales[i], p_current_angle + this->p_angles[j]);
 +
 +                cv::Mat patch_rgb = cv::Mat::zeros(size_y_scaled, size_x_scaled, CV_32F);
 +                if ((m_use_color || m_use_cnfeat) && input_rgb.channels() == 3) {
 +                    patch_rgb =
 +                        get_subwindow(input_rgb, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
 +                    geometric_transformations(patch_rgb, p_windows_size.width, p_windows_size.height,
 +                                              p_current_scale * this->p_scales[i], p_current_angle + this->p_angles[j]);
 +                }
 +                get_features(patch_rgb, patch_gray, vars);
 +            }
          }
      } else {
-         int size_x_scaled = floor(this->p_windows_size.width * this->p_current_scale * scale);
-         int size_y_scaled = floor(this->p_windows_size.height * this->p_current_scale * scale);
 -        get_features(input_rgb, input_gray, this->p_pose.cx, this->p_pose.cy, this->p_windows_size.width,
 -                     this->p_windows_size.height, vars, this->p_current_scale * vars.scale);
++        int size_x_scaled = floor(this->p_windows_size.width * this->p_current_scale * vars.scale);
++        int size_y_scaled = floor(this->p_windows_size.height * this->p_current_scale * vars.scale);
 +
 +        cv::Mat patch_gray = get_subwindow(input_gray, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-         geometric_transformations(patch_gray, p_windows_size.width, p_windows_size.height, p_current_scale * scale);
++        geometric_transformations(patch_gray, p_windows_size.width, p_windows_size.height, p_current_scale * vars.scale);
 +
 +        cv::Mat patch_rgb = cv::Mat::zeros(size_y_scaled, size_x_scaled, CV_32F);
 +        if ((m_use_color || m_use_cnfeat) && input_rgb.channels() == 3) {
 +            patch_rgb = get_subwindow(input_rgb, this->p_pose.cx, this->p_pose.cy, size_x_scaled, size_y_scaled);
-             geometric_transformations(patch_rgb, p_windows_size.width, p_windows_size.height, p_current_scale * scale,
-                                       p_current_angle + angle);
++            geometric_transformations(patch_rgb, p_windows_size.width, p_windows_size.height, p_current_scale * vars.scale,
++                                      p_current_angle + vars.angle);
 +        }
 +        vars.patch_feats.clear();
 +        get_features(patch_rgb, patch_gray, vars);
      }
  
      fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features.deviceMem() : nullptr,
  
  // ****************************************************************************
  
 -void KCF_Tracker::get_features(cv::Mat &input_rgb, cv::Mat &input_gray, int cx, int cy, int size_x, int size_y,
 -                               ThreadCtx &vars, double scale)
 +void KCF_Tracker::get_features(cv::Mat &patch_rgb, cv::Mat &patch_gray, ThreadCtx &vars)
  {
 -    int size_x_scaled = floor(size_x * scale);
 -    int size_y_scaled = floor(size_y * scale);
 -
 -    cv::Mat patch_gray = get_subwindow(input_gray, cx, cy, size_x_scaled, size_y_scaled);
 -    cv::Mat patch_rgb = get_subwindow(input_rgb, cx, cy, size_x_scaled, size_y_scaled);
 -
 -    // resize to default size
 -    if (scale > 1.) {
 -        // if we downsample use  INTER_AREA interpolation
 -        cv::resize(patch_gray, patch_gray, cv::Size(size_x, size_y), 0., 0., cv::INTER_AREA);
 -    } else {
 -        cv::resize(patch_gray, patch_gray, cv::Size(size_x, size_y), 0., 0., cv::INTER_LINEAR);
 -    }
--
      // get hog(Histogram of Oriented Gradients) features
-     FHoG::extract(patch_gray, vars, 2, p_cell_size, 9);
+     vars.patch_feats = FHoG::extract(patch_gray, 2, p_cell_size, 9);
  
      // get color rgb features (simple r,g,b channels)
      std::vector<cv::Mat> color_feat;
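
Taken together, the init() and track() hunks implement a grid search over scale and rotation: p_scales holds powers of p_scale_step for i in [-p_num_scales/2, p_num_scales/2], p_angles holds p_angle_min..p_angle_max in p_angle_step increments, each (scale, angle) pair gets its own ThreadCtx, and the pair with the highest filter response wins. A hedged sketch of that control flow, with evaluate() as a hypothetical stand-in for scale_track() plus the ThreadCtx::max_response lookup:

// Illustrative sketch of the scale x angle search loop; evaluate() is assumed,
// not part of the tracker.
#include <cmath>
#include <functional>
#include <vector>

struct Best { double response = -1.; double scale = 1.; int angle = 0; };

Best search_grid(double scale_step, int num_scales,
                 int angle_min, int angle_max, int angle_step,
                 const std::function<double(double, int)> &evaluate)
{
    std::vector<double> scales;
    for (int i = -num_scales / 2; i <= num_scales / 2; ++i)
        scales.push_back(std::pow(scale_step, i));   // e.g. 1.02^-3 .. 1.02^3

    std::vector<int> angles;
    for (int a = angle_min; a <= angle_max; a += angle_step)
        angles.push_back(a);                         // e.g. -20, -10, 0, 10, 20

    Best best;
    for (double s : scales)
        for (int a : angles) {
            double r = evaluate(s, a);               // one ThreadCtx per (s, a) in the tracker
            if (r > best.response)
                best = Best{r, s, a};
        }
    return best;
}
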
diff --cc src/kcf.h
index 62fcdec624277f5dd046c0951204bdb817902851,ec0ab45ff57e2b37f60f2e092e032e8fee3f2d9c..825cfd6d983a080ae6876b64dcf823073dad6591
+++ b/src/kcf.h
@@@ -40,37 -41,40 +40,37 @@@ struct BBox_c 
      inline void scale_y(double factor)
      {
          cy *= factor;
 -        h  *= factor;
 -    }
 -
 -    inline cv::Rect get_rect()
 -    {
 -        return cv::Rect(int(cx-w/2.), int(cy-h/2.), int(w), int(h));
 +        h *= factor;
      }
  
 +    inline cv::Rect get_rect() { return cv::Rect(int(cx - w / 2.), int(cy - h / 2.), int(w), int(h)); }
  };
  
 -class KCF_Tracker
 -{
 -public:
 -    bool m_debug     {false};
 -    bool m_use_scale {true};
 -    bool m_use_color {true};
 +class KCF_Tracker {
 +  public:
 +    bool m_debug{false};
 +    bool m_visual_debug{false};
 +    bool m_use_scale{true};
-     bool m_use_angle{true}; // Doesn't work with FFTW-BIG version
++    bool m_use_angle{false}; // Doesn't work with FFTW-BIG version
 +    bool m_use_color{true};
  #ifdef ASYNC
 -    bool m_use_multithreading {true};
 +    bool m_use_multithreading{true};
  #else
 -    bool m_use_multithreading {false};
 -#endif //ASYNC
 -    bool m_use_subpixel_localization {true};
 -    bool m_use_subgrid_scale {true};
 -    bool m_use_cnfeat {true};
 -    bool m_use_linearkernel {false};
 +    bool m_use_multithreading{false};
 +#endif // ASYNC
 +    bool m_use_subpixel_localization{true};
 +    bool m_use_subgrid_scale{true};
 +    bool m_use_cnfeat{true};
 +    bool m_use_linearkernel{false};
  #ifdef BIG_BATCH
 -    bool m_use_big_batch {true};
 +    bool m_use_big_batch{true};
  #else
 -    bool m_use_big_batch {false};
 +    bool m_use_big_batch{false};
  #endif
  #ifdef CUFFT
 -    bool m_use_cuda {true};
 +    bool m_use_cuda{true};
  #else
 -    bool m_use_cuda {false};
 +    bool m_use_cuda{false};
  #endif
  
      /*
      ~KCF_Tracker();
  
      // Init/re-init methods
 -    void init(cv::Mat & img, const cv::Rect & bbox, int fit_size_x, int fit_size_y);
 -    void setTrackerPose(BBox_c & bbox, cv::Mat & img, int fit_size_x, int fit_size_y);
 -    void updateTrackerPosition(BBox_c & bbox);
 +    void init(cv::Mat &img, const cv::Rect &bbox, int fit_size_x, int fit_size_y);
 +    void setTrackerPose(BBox_c &bbox, cv::Mat &img, int fit_size_x, int fit_size_y);
 +    void updateTrackerPosition(BBox_c &bbox);
  
      // frame-to-frame object tracking
 -    void track(cv::Mat & img);
 +    void track(cv::Mat &img);
      BBox_c getBBox();
+     double getFilterResponse() const; // Measure of tracking accuracy
  
 -private:
 +  private:
      Fft &fft;
  
      BBox_c p_pose;
      double p_padding = 1.5;
      double p_output_sigma_factor = 0.1;
      double p_output_sigma;
 -    double p_kernel_sigma = 0.5;    //def = 0.5
 -    double p_lambda = 1e-4;         //regularization in learning step
 -    double p_interp_factor = 0.02;  //def = 0.02, linear interpolation factor for adaptation
 -    int p_cell_size = 4;            //4 for hog (= bin_size)
 +    double p_kernel_sigma = 0.5;   // def = 0.5
 +    double p_lambda = 1e-4;        // regularization in learning step
 +    double p_interp_factor = 0.02; // def = 0.02, linear interpolation factor for adaptation
 +    int p_cell_size = 4;           // 4 for hog (= bin_size)
      cv::Size p_windows_size;
-     int p_num_scales{7};
+     uint p_num_scales {7};
      double p_scale_step = 1.02;
      double p_current_scale = 1.;
      double p_min_max_scale[2];
      std::vector<double> p_scales;
 +    int p_current_angle = 0;
++    uint p_num_angles {5};
 +    int p_angle_min = -20, p_angle_max = 20;
 +    int p_angle_step = 10;
 +    std::vector<int> p_angles;
 +
 +    // for visual debug
 +    int p_debug_image_size = 100;
 +    int p_count = 0;
 +    std::vector<cv::Mat> p_debug_scale_responses;
 +    std::vector<cv::Mat> p_debug_subwindows;
  
-     // for big batch
+     //for big batch
 -    int p_num_of_feats;
 +    int p_num_of_feats = 31 + (m_use_color ? 3 : 0) + (m_use_cnfeat ? 10 : 0);
+     cv::Size p_roi;
  
-     // for CUDA
-     int p_roi_height, p_roi_width;
-     std::list<std::unique_ptr<ThreadCtx>> p_threadctxs;
+     std::vector<ThreadCtx> p_threadctxs;
  
-     // CUDA compatibility
+     //CUDA compatibility
      cv::Mat p_rot_labels;
      DynMem p_rot_labels_data;
  
      ComplexMat p_model_alphaf_den;
      ComplexMat p_model_xf;
      ComplexMat p_xf;
-     // helping functions
-     void scale_track(ThreadCtx &vars, cv::Mat &input_rgb, cv::Mat &input_gray, double scale, int angle = 0);
-     cv::Mat get_subwindow(const cv::Mat &input, int cx, int cy, int size_x, int size_y);
+     //helping functions
+     void scale_track(ThreadCtx & vars, cv::Mat & input_rgb, cv::Mat & input_gray);
+     cv::Mat get_subwindow(const cv::Mat & input, int cx, int cy, int size_x, int size_y);
      cv::Mat gaussian_shaped_labels(double sigma, int dim1, int dim2);
 -    void gaussian_correlation(struct ThreadCtx &vars, const ComplexMat & xf, const ComplexMat & yf, double sigma, bool auto_correlation = false);
 -    cv::Mat circshift(const cv::Mat & patch, int x_rot, int y_rot);
 +    void gaussian_correlation(struct ThreadCtx &vars, const ComplexMat &xf, const ComplexMat &yf, double sigma,
 +                              bool auto_correlation = false);
 +    cv::Mat circshift(const cv::Mat &patch, int x_rot, int y_rot);
      cv::Mat cosine_window_function(int dim1, int dim2);
 -    void get_features(cv::Mat & input_rgb, cv::Mat & input_gray, int cx, int cy, int size_x, int size_y, ThreadCtx & vars, double scale = 1.);
 -    cv::Point2f sub_pixel_peak(cv::Point & max_loc, cv::Mat & response);
 +    void get_features(cv::Mat &patch_rgb, cv::Mat &patch_gray, ThreadCtx &vars);
 +    void geometric_transformations(cv::Mat &patch, int size_x, int size_y, int angle = 0, bool allow_debug = true);
 +    cv::Point2f sub_pixel_peak(cv::Point &max_loc, cv::Mat &response);
-     double sub_grid_scale(int index = -1);
+     double sub_grid_scale(uint index);
 -
  };
  
 -#endif //KCF_HEADER_6565467831231
 +#endif // KCF_HEADER_6565467831231
diff --cc src/threadctx.hpp
index 26ddef8e72d77c268b02eafe4c49c84c125d024f,62aebec1b459775aaa8d87c6e0d6d0f4fd4f7afb..d6a2cbef807af3f92da76dae55e99512542abc51
@@@ -15,9 -16,10 +16,10 @@@ typedef int *cudaStream_t
  
  struct ThreadCtx {
    public:
-     ThreadCtx(cv::Size windows_size, uint cell_size, uint num_of_feats, uint num_of_scales = 1, uint num_of_angles = 1)
 -    ThreadCtx(cv::Size roi, uint num_of_feats, double scale, uint num_of_scales)
 -        : scale(scale)
++    ThreadCtx(cv::Size roi, uint num_of_feats, double scale, int angle, uint num_of_scales = 1, uint num_of_angles = 1)
++        : scale(scale), angle(angle)
      {
 -        this->xf_sqr_norm = DynMem(num_of_scales * sizeof(float));
 +        this->xf_sqr_norm = DynMem(num_of_scales * num_of_angles * sizeof(float));
          this->yf_sqr_norm = DynMem(sizeof(float));
          this->patch_feats.reserve(uint(num_of_feats));
  
  #endif
  
  #if defined(CUFFT) || defined(FFTW)
 -        this->gauss_corr_res = DynMem(cells_size * num_of_scales);
 +        this->gauss_corr_res = DynMem(cells_size * num_of_scales * num_of_angles);
          this->data_features = DynMem(cells_size * num_of_feats);
  
-         uint width_freq = (uint(windows_size.width) / cell_size) / 2 + 1;
-         this->in_all = cv::Mat(windows_size.height / int(cell_size) * int(num_of_scales) * int(num_of_angles),
-                                windows_size.width / int(cell_size), CV_32F, this->gauss_corr_res.hostMem());
+         uint width_freq = roi.width / 2 + 1;
  
-         this->fw_all = cv::Mat((windows_size.height / int(cell_size)) * int(num_of_feats),
-                                windows_size.width / int(cell_size), CV_32F, this->data_features.hostMem());
 -        this->in_all = cv::Mat(roi.height * num_of_scales, roi.width, CV_32F, this->gauss_corr_res.hostMem());
++        this->in_all = cv::Mat(roi.height * num_of_scales * num_of_angles, roi.width, CV_32F, this->gauss_corr_res.hostMem());
+         this->fw_all = cv::Mat(roi.height * num_of_feats, roi.width, CV_32F, this->data_features.hostMem());
  #else
-         uint width_freq = uint(windows_size.width) / cell_size;
+         uint width_freq = roi.width;
  
-         this->in_all = cv::Mat((windows_size.height / int(cell_size)), windows_size.width / int(cell_size), CV_32F);
+         this->in_all = cv::Mat(roi, CV_32F);
  #endif
  
          this->data_i_features = DynMem(cells_size * num_of_feats);
 -        this->data_i_1ch = DynMem(cells_size * num_of_scales);
 +        this->data_i_1ch = DynMem(cells_size * num_of_scales * num_of_angles);
  
-         this->ifft2_res = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
-                                   CV_32FC(int(num_of_feats)), this->data_i_features.hostMem());
-         this->response = cv::Mat(windows_size.height / int(cell_size), windows_size.width / int(cell_size),
-                                  CV_32FC(int(num_of_scales * num_of_angles)), this->data_i_1ch.hostMem());
+         this->ifft2_res = cv::Mat(roi, CV_32FC(num_of_feats), this->data_i_features.hostMem());
 -        this->response = cv::Mat(roi, CV_32FC(num_of_scales), this->data_i_1ch.hostMem());
++        this->response = cv::Mat(roi, CV_32FC(num_of_scales * num_of_angles), this->data_i_1ch.hostMem());
  
          this->patch_feats.reserve(num_of_feats);
  
  #ifdef CUFFT
-         this->zf.create(uint(windows_size.height) / cell_size, width_freq, num_of_feats, num_of_scales * num_of_angles,
-                         this->stream);
-         this->kzf.create(uint(windows_size.height) / cell_size, width_freq, num_of_scales * num_of_angles, this->stream);
-         this->kf.create(uint(windows_size.height) / cell_size, width_freq, num_of_scales * num_of_angles, this->stream);
 -        this->zf.create(roi.height, width_freq, num_of_feats, num_of_scales, this->stream);
 -        this->kzf.create(roi.height, width_freq, num_of_scales, this->stream);
 -        this->kf.create(roi.height, width_freq, num_of_scales, this->stream);
++        this->zf.create(roi.height, width_freq, num_of_feats, num_of_scales * num_of_angles, this->stream);
++        this->kzf.create(roi.height, width_freq, num_of_scales * num_of_angles, this->stream);
++        this->kf.create(roi.height, width_freq, num_of_scales * num_of_angles, this->stream);
  #else
-         this->zf.create(uint(windows_size.height) / cell_size, width_freq, num_of_feats, num_of_scales * num_of_angles);
-         this->kzf.create(uint(windows_size.height) / cell_size, width_freq, num_of_scales * num_of_angles);
-         this->kf.create(uint(windows_size.height) / cell_size, width_freq, num_of_scales * num_of_angles);
 -        this->zf.create(roi.height, width_freq, num_of_feats, num_of_scales);
 -        this->kzf.create(roi.height, width_freq, num_of_scales);
 -        this->kf.create(roi.height, width_freq, num_of_scales);
++        this->zf.create(roi.height, width_freq, num_of_feats, num_of_scales * num_of_angles);
++        this->kzf.create(roi.height, width_freq, num_of_scales * num_of_angles);
++        this->kf.create(roi.height, width_freq, num_of_scales * num_of_angles);
  #endif
  
          if (num_of_scales > 1) {
-             this->max_responses.reserve(uint(num_of_scales * num_of_angles));
-             this->max_locs.reserve(uint(num_of_scales * num_of_angles));
-             this->response_maps.reserve(uint(num_of_scales * num_of_angles));
 -            this->max_responses.reserve(num_of_scales);
 -            this->max_locs.reserve(num_of_scales);
 -            this->response_maps.reserve(num_of_scales);
++            this->max_responses.reserve(num_of_scales * num_of_angles);
++            this->max_locs.reserve(num_of_scales * num_of_angles);
++            this->response_maps.reserve(num_of_scales * num_of_angles);
          }
      }
+     ThreadCtx(ThreadCtx &&) = default;
      ~ThreadCtx()
      {
  #if  !defined(BIG_BATCH) && defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
  #endif
      }
  
+     const double scale;
++    const int angle;
+ #ifdef ASYNC
+     std::future<void> async_res;
+ #endif
      DynMem xf_sqr_norm, yf_sqr_norm;
      std::vector<cv::Mat> patch_feats;
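
A consequence of the num_of_scales * num_of_angles sizing above is that the big-batch context's buffers grow with the whole search grid, which is the mode the commit message flags as broken. A back-of-the-envelope sketch of that growth (the ROI is assumed; the feature count matches 31 FHoG + 3 colour + 10 CN channels, and 7 scales x 5 angles match the kcf.h defaults):

// Illustrative buffer-size arithmetic for a big-batch ThreadCtx (assumed ROI).
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t roi_w = 50, roi_h = 50;        // assumed window size / cell size
    const std::size_t num_of_feats = 31 + 3 + 10;    // FHoG + colour + CN channels
    const std::size_t num_of_scales = 7, num_of_angles = 5;

    const std::size_t cells = roi_w * roi_h;
    const std::size_t response_floats = cells * num_of_scales * num_of_angles;                // gauss_corr_res / response
    const std::size_t feature_floats  = cells * num_of_feats * num_of_scales * num_of_angles; // big-batch feature stack

    std::printf("response stack: %zu floats (%.1f KiB)\n", response_floats,
                response_floats * sizeof(float) / 1024.0);
    std::printf("feature stack : %zu floats (%.1f MiB)\n", feature_floats,
                feature_floats * sizeof(float) / (1024.0 * 1024.0));
    return 0;
}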