# Makefile to build all the available variants
-BUILDS = opencvfft-st opencvfft-async fftw fftw-openmp fftw-async fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-big cufft-big-openmp
+BUILDS = opencvfft-st opencvfft-async opencvfft-openmp fftw fftw-async fftw-openmp fftw-big fftw-big-openmp cufftw cufftw-big cufftw-big-openmp cufft cufft-big cufft-big-openmp
all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot)
CMAKE_OTPS_opencvfft-st = -DFFT=OpenCV
CMAKE_OTPS_opencvfft-async = -DFFT=OpenCV -DASYNC=ON
+CMAKE_OTPS_opencvfft-openmp = -DFFT=OpenCV -DOPENMP=ON
CMAKE_OTPS_fftw = -DFFT=fftw
CMAKE_OTPS_fftw-openmp = -DFFT=fftw -DOPENMP=ON
CMAKE_OTPS_fftw-async = -DFFT=fftw -DASYNC=ON
message(SEND_ERROR "cuFFT version does not support ASYNC and OpenMP only if used with big batch mode.")
ENDIF()
-IF(ASYNC AND NOT OPENMP)
+IF(ASYNC)
add_definitions(-DASYNC)
MESSAGE(STATUS "ASYNC")
+ELSEIF(OPENMP)
+ add_definitions(-DOPENMP)
+ MESSAGE(STATUS "OPENMP")
ENDIF() #ASYNC
FIND_PACKAGE( OpenCV REQUIRED )
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
set(CUDA_HOST_COMPILER /usr/bin/g++)
- list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11 -default-stream per-thread)
+ list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11)
find_cuda_helper_libs(cufftw)
IF(FFT STREQUAL "cuFFT")
add_subdirectory(cuda)
void ComplexMat::sqr_norm(float *result) const
{
- CudaSafeCall(cudaMemset(result, 0, n_scales*sizeof(float)));
+ CudaSafeCall(cudaMemsetAsync(result, 0, n_scales*sizeof(float), this->stream));
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
+ sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float), this->stream>>>(n_channels/n_scales, result, this->p_data, rows, cols);
CudaCheckError();
return;
ComplexMat ComplexMat::sqr_mag() const
{
- ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+ ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
+ sqr_mag_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, result.p_data);
CudaCheckError();
return result;
ComplexMat ComplexMat::conj() const
{
- ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+ ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
+ conj_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, result.p_data);
CudaCheckError();
return result;
ComplexMat ComplexMat::sum_over_channels() const
{
// assert(p_data.size() > 1);
- ComplexMat result(this->rows, this->cols, 1);
+ ComplexMat result(this->rows, this->cols, 1, this->stream);
return result;
}
{
assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
- ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+ ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+ same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
CudaCheckError();
return result;
{
assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
- ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+ ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+ same_num_channels_div_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
CudaCheckError();
return result;
{
assert(rhs.n_channels == n_channels && rhs.cols == cols && rhs.rows == rows);
- ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+ ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+ same_num_channels_add_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
CudaCheckError();
return result;
ComplexMat ComplexMat::operator*(const float & rhs) const
{
- ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+ ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
+ constant_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs, result.p_data);
CudaCheckError();
return result;
ComplexMat ComplexMat::operator+(const float & rhs) const
{
- ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+ ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
+ constant_add_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs, result.p_data);
CudaCheckError();
return result;
{
assert(rhs.n_channels == 1 && rhs.cols == cols && rhs.rows == rows);
- ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+ ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+ one_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
CudaCheckError();
return result;
{
assert(rhs.n_channels == n_channels/n_scales && rhs.cols == cols && rhs.rows == rows);
- ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales);
+ ComplexMat result(this->rows, this->cols, this->channels(), this->n_scales, this->stream);
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
+ scales_channel_mul_kernel<<<numBlocks, threadsPerBlock, 0, this->stream>>>(this->p_data, rhs.p_data, result.p_data);
CudaCheckError();
return result;
rows = rhs.rows;
n_channels = rhs.n_channels;
n_scales = rhs.n_scales;
+ stream = rhs.stream;
foreign_data = true;
p_data = rhs.p_data;
rows = rhs.rows;
n_channels = rhs.n_channels;
n_scales = rhs.n_scales;
+ stream = rhs.stream;
p_data = rhs.p_data;
int n_channels;
int n_scales = 1;
bool foreign_data = false;
+ cudaStream_t stream = nullptr;
ComplexMat() : cols(0), rows(0), n_channels(0) {}
- ComplexMat(int _rows, int _cols, int _n_channels) : cols(_cols), rows(_rows), n_channels(_n_channels)
+ ComplexMat(int _rows, int _cols, int _n_channels, cudaStream_t _stream) : cols(_cols), rows(_rows), n_channels(_n_channels), stream(_stream) // Stores the dimensions and the stream handle, then allocates device storage; n_scales keeps its in-class default.
{
CudaSafeCall(cudaMalloc(&p_data, n_channels*cols*rows*sizeof(cufftComplex))); // one cufftComplex per element of every channel
}
- ComplexMat(int _rows, int _cols, int _n_channels, int _n_scales) : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales)
+ ComplexMat(int _rows, int _cols, int _n_channels, int _n_scales, cudaStream_t _stream) : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales),
+ stream(_stream) // Same as the three-dimension constructor, but also records the number of scales.
{
CudaSafeCall(cudaMalloc(&p_data, n_channels*cols*rows*sizeof(cufftComplex))); // allocation size is driven by n_channels, not by n_scales
}
n_channels = other.n_channels;
n_scales = other.n_scales;
p_data = other.p_data;
+ stream = other.stream;
other.p_data = nullptr;
}
}
}
- void create(int _rows, int _cols, int _n_channels)
+ void create(int _rows, int _cols, int _n_channels, cudaStream_t _stream = nullptr) // Sets the dimensions and stream, then allocates device storage; _stream defaults to nullptr so three-argument callers keep compiling.
{
rows = _rows;
cols = _cols;
n_channels = _n_channels; // n_scales is not modified by this overload
+ stream = _stream;
CudaSafeCall(cudaMalloc(&p_data, n_channels*cols*rows*sizeof(cufftComplex))); // NOTE(review): p_data is overwritten without freeing a previous allocation - confirm create() is only called on empty matrices
}
- void create(int _rows, int _cols, int _n_channels, int _n_scales)
+ void create(int _rows, int _cols, int _n_channels, int _n_scales, cudaStream_t _stream = nullptr) // Sets the dimensions, scale count and stream, then allocates device storage; _stream defaults to nullptr so four-argument callers keep compiling.
{
rows = _rows;
cols = _cols;
n_channels = _n_channels;
n_scales = _n_scales;
+ stream = _stream;
CudaSafeCall(cudaMalloc(&p_data, n_channels*cols*rows*sizeof(cufftComplex))); // NOTE(review): p_data is overwritten without freeing a previous allocation - confirm create() is only called on empty matrices
}
// cv::Mat API compatibility
int channels() { return n_channels; }
int channels() const { return n_channels; }
+ void set_stream(cudaStream_t _stream) // Replaces the stream handle stored in this matrix; the ComplexMat kernel launches in this change use the stored stream.
+ {
+ stream = _stream; // only the handle is stored; no synchronization happens here
+ return;
+ }
+
void sqr_norm(float *result) const;
ComplexMat sqr_mag() const;
}
}
-void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales,int rows, int cols)
+void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales,int rows, int cols, cudaStream_t stream)
{
dim3 threadsPerBlock((n_channels/n_scales)/2);
dim3 numBlocks(n_scales, rows*cols);
- gaussian_correlation_kernel<<<numBlocks, threadsPerBlock, ((n_channels/n_scales)/2)*sizeof(float)>>>(data_in, data_out, xf_sqr_norm, yf_sqr_norm, rows, cols, n_channels/n_scales, sigma);
+ gaussian_correlation_kernel<<<numBlocks, threadsPerBlock, ((n_channels/n_scales)/2)*sizeof(float), stream>>>(data_in, data_out, xf_sqr_norm, yf_sqr_norm, rows, cols, n_channels/n_scales, sigma);
CudaCheckError();
// float *data_cpu = (float*) malloc(rows*cols*n_scales*sizeof(float));
#include "cuda_runtime.h"
#include "cuda/cuda_error_check.cuh"
-void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales, int rows, int cols);
+void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales, int rows, int cols, cudaStream_t stream);
#endif
public:
virtual void init(unsigned width, unsigned height,unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) = 0;
virtual void set_window(const cv::Mat & window) = 0;
- virtual void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr) = 0;
- virtual void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr) = 0;
- virtual void inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr) = 0;
+ virtual void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream) = 0;
+ virtual void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) = 0;
+ virtual void inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) = 0;
virtual ~Fft() = 0;
};
//FFT forward one scale
{
- CufftErrorCheck(cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C));
+ CufftErrorCheck(cufftPlan2d(&plan_f, int(m_height), int(m_width), CUFFT_R2C));
}
#ifdef BIG_BATCH
//FFT forward all scales
//FFT forward window one scale
{
int rank = 2;
- int n[] = {(int)m_height, (int)m_width};
- int howmany = m_num_of_feats;
- int idist = m_height*m_width, odist = m_height*(m_width/2+1);
+ int n[] = {int(m_height), int(m_width)};
+ int howmany = int(m_num_of_feats);
+ int idist = int(m_height*m_width), odist = int(m_height*(m_width/2+1));
int istride = 1, ostride = 1;
- int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
+ int *inembed = n, onembed[] = {int(m_height), int(m_width/2+1)};
CufftErrorCheck(cufftPlanMany(&plan_fw, rank, n,
inembed, istride, idist,
//FFT inverse one scale
{
int rank = 2;
- int n[] = {(int)m_height, (int)m_width};
- int howmany = m_num_of_feats;
- int idist = m_height*(m_width/2+1), odist = 1;
- int istride = 1, ostride = m_num_of_feats;
- int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+ int n[] = {int(m_height), int(m_width)};
+ int howmany = int(m_num_of_feats);
+ int idist = int(m_height*(m_width/2+1)), odist = 1;
+ int istride = 1, ostride = int(m_num_of_feats);
+ int inembed[] = {int(m_height), int(m_width/2+1)}, *onembed = n;
CufftErrorCheck(cufftPlanMany(&plan_i_features, rank, n,
inembed, istride, idist,
//FFT inverse one channel one scale
{
int rank = 2;
- int n[] = {(int)m_height, (int)m_width};
+ int n[] = {int(m_height), int(m_width)};
int howmany = 1;
- int idist = m_height*(m_width/2+1), odist = 1;
+ int idist = int(m_height*(m_width/2+1)), odist = 1;
int istride = 1, ostride = 1;
- int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+ int inembed[] = {int(m_height), int(m_width/2+1)}, *onembed = n;
CufftErrorCheck(cufftPlanMany(&plan_i_1ch, rank, n,
inembed, istride, idist,
m_window = window;
}
-void cuFFT::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr)
+void cuFFT::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream) // Runs a real-to-complex FFT from real_input_arr into complex_result; no synchronization is performed in this function.
{
(void) real_input;
- if(m_big_batch_mode && real_input.rows == (int)(m_height*m_num_of_scales)){
+ if(m_big_batch_mode && real_input.rows == int(m_height*m_num_of_scales)){ // row count equals m_height*m_num_of_scales -> use the all-scales plan
CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(real_input_arr), // NOTE(review): plan_f_all_scales is not bound to `stream` in this function - confirm that is intended
complex_result.get_p_data()));
} else {
- CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr),
+ CufftErrorCheck(cufftSetStream(plan_f, stream)); // associate the single-scale plan with the caller's stream before executing it
+ CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(real_input_arr),
complex_result.get_p_data()));
}
return;
}
-void cuFFT::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr)
+void cuFFT::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) // Multiplies every feature channel by m_window into a row band of fw_all, then runs a batched R2C FFT from real_input_arr into complex_result.
{
- int n_channels = patch_feats.size();
+ int n_channels = int(patch_feats.size());
- if(n_channels > (int) m_num_of_feats){
- for (int i = 0; i < n_channels; ++i) {
- cv::Mat in_roi(fw_all, cv::Rect(0, i*m_height, m_width, m_height));
+ if(n_channels > int(m_num_of_feats)){ // more channels than m_num_of_feats -> use the all-scales plan
+ for (uint i = 0; i < uint(n_channels); ++i) {
+ cv::Mat in_roi(fw_all, cv::Rect(0, int(i*m_height), int(m_width), int(m_height))); // channel i maps to the m_height rows of fw_all starting at row i*m_height
in_roi = patch_feats[i].mul(m_window);
}
-
CufftErrorCheck(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data())); // NOTE(review): plan_fw_all_scales is not bound to `stream` here - confirm; presumably real_input_arr is the device view of fw_all's buffer
} else {
- for (int i = 0; i < n_channels; ++i) {
- cv::Mat in_roi(fw_all, cv::Rect(0, i*m_height, m_width, m_height));
+ for (uint i = 0; i < uint(n_channels); ++i) {
+ cv::Mat in_roi(fw_all, cv::Rect(0, int(i*m_height), int(m_width), int(m_height))); // same row-band layout as the all-scales branch
in_roi = patch_feats[i].mul(m_window);
}
+ CufftErrorCheck(cufftSetStream(plan_fw, stream)); // associate the single-scale plan with the caller's stream before executing it
CufftErrorCheck(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(real_input_arr), complex_result.get_p_data()));
}
return;
}
-void cuFFT::inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr)
+// Runs the complex-to-real inverse FFT of complex_input into real_result_arr,
+// choosing the plan from complex_input.n_channels.
+// The one-channel and one-channel-per-scale branches wait for the transform on
+// `stream` and then divide real_result by m_width*m_height on the host
+// (presumably real_result is the host view of real_result_arr - confirm).
+// The multi-feature branches return without synchronizing or normalising.
+void cuFFT::inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream)
{
int n_channels = complex_input.n_channels;
cufftComplex *in = reinterpret_cast<cufftComplex*>(complex_input.get_p_data());
if(n_channels == 1){
+ CufftErrorCheck(cufftSetStream(plan_i_1ch, stream));
CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(real_result_arr)));
- cudaDeviceSynchronize();
+ // Wait for the transform before the host-side division; checking the result reports an asynchronous failure here.
+ CudaSafeCall(cudaStreamSynchronize(stream));
real_result = real_result/(m_width*m_height);
return;
- } else if(n_channels == (int) m_num_of_scales){
+ } else if(n_channels == int(m_num_of_scales)){
+ // Bind the plan to `stream` so the stream synchronization below covers this transform;
+ // without it the wait would not include work issued on the plan's own stream.
+ CufftErrorCheck(cufftSetStream(plan_i_1ch_all_scales, stream));
CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(real_result_arr)));
- cudaDeviceSynchronize();
+ CudaSafeCall(cudaStreamSynchronize(stream));
real_result = real_result/(m_width*m_height);
return;
- } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
+ } else if(n_channels == int(m_num_of_feats) * int(m_num_of_scales)){
CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(real_result_arr))); // NOTE(review): this plan is not bound to `stream` and nothing waits for it here - confirm the caller orders later work correctly
return;
}
+ CufftErrorCheck(cufftSetStream(plan_i_features, stream));
CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(real_result_arr)));
return;
}
public:
void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
void set_window(const cv::Mat & window) override;
- void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr) override;
- void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr) override;
- void inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr) override;
+ void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream) override;
+ void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) override;
+ void inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) override;
~cuFFT() override;
private:
cv::Mat m_window;
#endif
#if !defined(ASYNC) && !defined(OPENMP) && !defined(CUFFTW)
-#define FFTW_PLAN_WITH_THREADS() fftw_plan_with_nthreads(m_num_threads);
+#define FFTW_PLAN_WITH_THREADS() fftw_plan_with_nthreads(int(m_num_threads));
#else
#define FFTW_PLAN_WITH_THREADS()
#endif
{
}
-Fftw::Fftw(int num_threads)
+Fftw::Fftw(unsigned num_threads) // Stores the requested thread count; the parameter is unsigned in the new signature.
: m_num_threads(num_threads)
{
}
#else
std::cout << "FFT: cuFFTW" << std::endl;
#endif
+ fftwf_cleanup();
//FFT forward one scale
{
- cv::Mat in_f = cv::Mat::zeros(m_height, m_width, CV_32FC1);
- ComplexMat out_f(m_height, m_width / 2 + 1, 1);
- plan_f = fftwf_plan_dft_r2c_2d(m_height, m_width,
+ cv::Mat in_f = cv::Mat::zeros(int(m_height), int(m_width), CV_32FC1);
+ ComplexMat out_f(int(m_height), m_width / 2 + 1, 1);
+ plan_f = fftwf_plan_dft_r2c_2d(int(m_height), int(m_width),
reinterpret_cast<float*>(in_f.data),
reinterpret_cast<fftwf_complex*>(out_f.get_p_data()),
FFTW_PATIENT);
#endif
//FFT forward window one scale
{
- cv::Mat in_fw = cv::Mat::zeros(m_height * m_num_of_feats, m_width, CV_32F);
- ComplexMat out_fw(m_height, m_width / 2 + 1, m_num_of_feats);
+ cv::Mat in_fw = cv::Mat::zeros(int(m_height * m_num_of_feats), int(m_width), CV_32F);
+ ComplexMat out_fw(int(m_height), m_width / 2 + 1, int(m_num_of_feats));
float *in = reinterpret_cast<float*>(in_fw.data);
fftwf_complex *out = reinterpret_cast<fftwf_complex*>(out_fw.get_p_data());
int rank = 2;
- int n[] = {(int)m_height, (int)m_width};
- int howmany = m_num_of_feats;
- int idist = m_height*m_width, odist = m_height*(m_width/2+1);
+ int n[] = {int(m_height), int(m_width)};
+ int howmany = int(m_num_of_feats);
+ int idist = int(m_height*m_width), odist = int(m_height*(m_width/2+1));
int istride = 1, ostride = 1;
- int *inembed = NULL, *onembed = NULL;
+ int *inembed = nullptr, *onembed = nullptr;
FFTW_PLAN_WITH_THREADS();
plan_fw = fftwf_plan_many_dft_r2c(rank, n, howmany,
#endif
//FFT inverse one scale
{
- ComplexMat in_i(m_height,m_width,m_num_of_feats);
- cv::Mat out_i = cv::Mat::zeros(m_height, m_width, CV_32FC(m_num_of_feats));
+ ComplexMat in_i(m_height, m_width, m_num_of_feats);
+ cv::Mat out_i = cv::Mat::zeros(int(m_height), int(m_width), CV_32FC(int(m_num_of_feats)));
fftwf_complex *in = reinterpret_cast<fftwf_complex*>(in_i.get_p_data());
float *out = reinterpret_cast<float*>(out_i.data);
int rank = 2;
- int n[] = {(int)m_height, (int)m_width};
- int howmany = m_num_of_feats;
- int idist = m_height*(m_width/2+1), odist = 1;
- int istride = 1, ostride = m_num_of_feats;
- int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+ int n[] = {int(m_height), int(m_width)};
+ int howmany = int(m_num_of_feats);
+ int idist = int(m_height*(m_width/2+1)), odist = 1;
+ int istride = 1, ostride = int(m_num_of_feats);
+ int inembed[] = {int(m_height), int(m_width/2+1)}, *onembed = n;
FFTW_PLAN_WITH_THREADS();
plan_i_features = fftwf_plan_many_dft_c2r(rank, n, howmany,
#endif
//FFT inverse one channel one scale
{
- ComplexMat in_i1(m_height,m_width,1);
- cv::Mat out_i1 = cv::Mat::zeros(m_height, m_width, CV_32FC1);
+ ComplexMat in_i1(int(m_height),int(m_width),1);
+ cv::Mat out_i1 = cv::Mat::zeros(int(m_height), int(m_width), CV_32FC1);
fftwf_complex *in = reinterpret_cast<fftwf_complex*>(in_i1.get_p_data());
float *out = reinterpret_cast<float*>(out_i1.data);
int rank = 2;
- int n[] = {(int)m_height, (int)m_width};
+ int n[] = {int(m_height), int(m_width)};
int howmany = 1;
- int idist = m_height*(m_width/2+1), odist = 1;
+ int idist = int(m_height*(m_width/2+1)), odist = 1;
int istride = 1, ostride = 1;
- int inembed[] = {(int)m_height, (int)m_width/2+1}, *onembed = n;
+ int inembed[] = {int(m_height), int(m_width)/2+1}, *onembed = n;
FFTW_PLAN_WITH_THREADS();
plan_i_1ch = fftwf_plan_many_dft_c2r(rank, n, howmany,
m_window = window;
}
-void Fftw::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr)
+void Fftw::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream)
{
(void) real_input_arr;
+ (void) stream;
- if(m_big_batch_mode && real_input.rows == (int)(m_height*m_num_of_scales)){
+ if(m_big_batch_mode && real_input.rows == int(m_height*m_num_of_scales)){
fftwf_execute_dft_r2c(plan_f_all_scales, reinterpret_cast<float*>(real_input.data),
reinterpret_cast<fftwf_complex*>(complex_result.get_p_data()));
} else {
return;
}
-void Fftw::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr)
+void Fftw::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) // Multiplies every feature channel by m_window into a row band of fw_all, then runs the batched R2C plan from fw_all into complex_result.
{
(void) real_input_arr;
+ (void) stream; // unused in this implementation; the cast marks the parameter as intentionally ignored
- int n_channels = patch_feats.size();
+ int n_channels = int(patch_feats.size());
for (int i = 0; i < n_channels; ++i) {
- cv::Mat in_roi(fw_all, cv::Rect(0, i*m_height, m_width, m_height));
- in_roi = patch_feats[i].mul(m_window);
+ cv::Mat in_roi(fw_all, cv::Rect(0, i*int(m_height), int(m_width), int(m_height))); // channel i maps to the m_height rows of fw_all starting at row i*m_height
+ in_roi = patch_feats[uint(i)].mul(m_window);
}
float *in = reinterpret_cast<float*>(fw_all.data);
fftwf_complex *out = reinterpret_cast<fftwf_complex*>(complex_result.get_p_data());
- if (n_channels <= (int) m_num_of_feats)
+ if (n_channels <= int(m_num_of_feats)) // at most m_num_of_feats channels -> single-scale plan, otherwise the all-scales plan
fftwf_execute_dft_r2c(plan_fw, in, out);
else
fftwf_execute_dft_r2c(plan_fw_all_scales, in, out);
return;
}
-void Fftw::inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr)
+void Fftw::inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream)
{
(void) real_result_arr;
+ (void) stream;
int n_channels = complex_input.n_channels;
fftwf_complex *in = reinterpret_cast<fftwf_complex*>(complex_input.get_p_data());
if(n_channels == 1)
fftwf_execute_dft_c2r(plan_i_1ch, in, out);
- else if(m_big_batch_mode && n_channels == (int) m_num_of_scales)
+ else if(m_big_batch_mode && n_channels == int(m_num_of_scales))
fftwf_execute_dft_c2r(plan_i_1ch_all_scales, in, out);
- else if(m_big_batch_mode && n_channels == (int) m_num_of_feats * (int) m_num_of_scales)
+ else if(m_big_batch_mode && n_channels == int(m_num_of_feats) * int(m_num_of_scales))
fftwf_execute_dft_c2r(plan_i_features_all_scales, in, out);
else
fftwf_execute_dft_c2r(plan_i_features, in, out);
{
public:
Fftw();
- Fftw(int num_of_threads);
+ Fftw(unsigned num_of_threads);
void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
void set_window(const cv::Mat & window) override;
- void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr) override;
- void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr) override;
- void inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr) override;
+ void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream) override;
+ void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) override;
+ void inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) override;
~Fftw() override;
private:
unsigned m_num_threads = 6;
m_window = window;
}
-void FftOpencv::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr)
+void FftOpencv::forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream)
{
(void) real_input_arr;
+ (void) stream;
cv::Mat tmp;
cv::dft(real_input, tmp, cv::DFT_COMPLEX_OUTPUT);
return;
}
-void FftOpencv::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr)
+void FftOpencv::forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) // Computes the DFT of each windowed feature channel and stores it as the matching channel of complex_result.
{
(void) real_input_arr;
(void) fw_all;
+ (void) stream; // unused in this implementation; the cast marks the parameter as intentionally ignored
- int n_channels = patch_feats.size();
- for (int i = 0; i < n_channels; ++i) {
+ uint n_channels = uint(patch_feats.size());
+ for (uint i = 0; i < n_channels; ++i) {
cv::Mat complex_res;
cv::dft(patch_feats[i].mul(m_window), complex_res, cv::DFT_COMPLEX_OUTPUT); // window is applied element-wise before the transform
- complex_result.set_channel(i, complex_res);
+ complex_result.set_channel(int(i), complex_res);
}
return;
}
-void FftOpencv::inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr)
+void FftOpencv::inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream)
{
(void) real_result_arr;
+ (void) stream;
if (complex_input.n_channels == 1) {
cv::dft(complex_input.to_cv_mat(), real_result, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
} else {
std::vector<cv::Mat> mat_channels = complex_input.to_cv_mat_vector();
- std::vector<cv::Mat> ifft_mats(complex_input.n_channels);
- for (int i = 0; i < complex_input.n_channels; ++i) {
+ std::vector<cv::Mat> ifft_mats(ulong(complex_input.n_channels));
+ for (uint i = 0; i < uint(complex_input.n_channels); ++i) {
cv::dft(mat_channels[i], ifft_mats[i], cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
}
cv::merge(ifft_mats, real_result);
public:
void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
void set_window(const cv::Mat & window) override;
- void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr) override;
- void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr) override;
- void inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr) override;
+ void forward(const cv::Mat & real_input, ComplexMat & complex_result, float *real_input_arr, cudaStream_t stream) override;
+ void forward_window(std::vector<cv::Mat> patch_feats, ComplexMat & complex_result, cv::Mat & fw_all, float *real_input_arr, cudaStream_t stream) override;
+ void inverse(ComplexMat & complex_input, cv::Mat & real_result, float *real_result_arr, cudaStream_t stream) override;
~FftOpencv() override;
private:
cv::Mat m_window;
KCF_Tracker::~KCF_Tracker()
{
delete &fft; // releases the FFT back-end through its address; presumably fft refers to a heap-allocated object - confirm
- int end = m_use_big_batch ? 2 : p_num_scales;
-#ifdef CUFFT
- for (int i = 0;i < end;++i) {
- CudaSafeCall(cudaFreeHost(p_scale_vars[i].xf_sqr_norm));
- CudaSafeCall(cudaFreeHost(p_scale_vars[i].yf_sqr_norm));
- CudaSafeCall(cudaFreeHost(p_scale_vars[i].data_i_1ch));
- CudaSafeCall(cudaFreeHost(p_scale_vars[i].data_i_features));
- CudaSafeCall(cudaFreeHost(p_scale_vars[i].gauss_corr_res));
- CudaSafeCall(cudaFreeHost(p_scale_vars[i].rot_labels_data));
- CudaSafeCall(cudaFreeHost(p_scale_vars[i].data_features));
- }
-#else
- for (int i = 0;i < end;++i) {
- free(p_scale_vars[i].xf_sqr_norm);
- free(p_scale_vars[i].yf_sqr_norm);
- }
-#endif
} // NOTE(review): the per-scale buffer frees no longer happen in this destructor - confirm the Scale_vars objects release them when p_scale_vars is destroyed
void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox, int fit_size_x, int fit_size_y)
std::cerr << "Fit size does not fit to hog cell size. The dimensions have to be divisible by HOG cell size, which is: " << p_cell_size << std::endl;;
std::exit(EXIT_FAILURE);
}
- double tmp;
- if (( tmp = (p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size ) != fit_size_x)
+ double tmp = (p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size ;
+ if (fabs(tmp-fit_size_x) > p_floating_error)
p_scale_factor_x = fit_size_x/tmp;
- if (( tmp = (p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size ) != fit_size_y)
+ tmp = (p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size;
+ if (fabs(tmp-fit_size_y) > p_floating_error)
p_scale_factor_y = fit_size_y/tmp;
std::cout << "resizing image horizontaly by factor of " << p_scale_factor_x
<< " and verticaly by factor of " << p_scale_factor_y << std::endl;
p_fit_to_pw2 = true;
p_pose.scale_x(p_scale_factor_x);
p_pose.scale_y(p_scale_factor_y);
- if (p_scale_factor_x != 1 && p_scale_factor_y != 1) {
+ if (fabs(p_scale_factor_x-1) > p_floating_error && fabs(p_scale_factor_y-1) > p_floating_error) {
if (p_scale_factor_x < 1 && p_scale_factor_y < 1) {
cv::resize(input_gray, input_gray, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
}
//compute win size + fit to fhog cell size
- p_windows_size[0] = round(p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size;
- p_windows_size[1] = round(p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size;
+ p_windows_size[0] = int(round(p_pose.w * (1. + p_padding) / p_cell_size) * p_cell_size);
+ p_windows_size[1] = int(round(p_pose.h * (1. + p_padding) / p_cell_size) * p_cell_size);
p_scales.clear();
if (m_use_scale)
int max =m_use_big_batch ? 2: p_num_scales;
for (int i = 0;i<max;++i) {
- if (i == 0)
- p_scale_vars.push_back(Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1, &p_model_xf, &p_yf, true));
- else if (m_use_big_batch)
- p_scale_vars.push_back(Scale_vars(p_windows_size, p_cell_size, p_num_of_feats*p_num_scales, p_num_scales));
- else
- p_scale_vars.push_back(Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1));
+ if (i == 0) {
+ p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1, &p_model_xf, &p_yf, true));
+ }
+ else if (m_use_big_batch) {
+ p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats*p_num_scales, p_num_scales));
+ }
+ else {
+ p_scale_vars.emplace_back(new Scale_vars(p_windows_size, p_cell_size, p_num_of_feats, 1));
+ }
+#ifdef CUFFT
+ std::cout << p_scale_vars.back()->zf.stream << std::endl;
+ std::cout << p_scale_vars.back()->kzf.stream << std::endl;
+ std::cout << p_scale_vars.back()->kf.stream << std::endl << std::endl;
+
+ std::cout << p_scale_vars.back()->zf.n_scales << std::endl;
+ std::cout << p_scale_vars.back()->kzf.n_scales << std::endl;
+ std::cout << p_scale_vars.back()->kf.n_scales << std::endl << std::endl;
+#endif
}
p_current_scale = 1.;
p_output_sigma = std::sqrt(p_pose.w*p_pose.h) * p_output_sigma_factor / static_cast<double>(p_cell_size);
- fft.init(p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size, p_num_of_feats, p_num_scales, m_use_big_batch);
+ fft.init(uint(p_windows_size[0]/p_cell_size), uint(p_windows_size[1]/p_cell_size), uint(p_num_of_feats), uint(p_num_scales), m_use_big_batch);
fft.set_window(cosine_window_function(p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size));
//window weights, i.e. labels
fft.forward(gaussian_shaped_labels(p_output_sigma, p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size), p_yf,
- m_use_cuda ? p_scale_vars[0].rot_labels_data_d: nullptr);
+ m_use_cuda ? p_scale_vars.front()->rot_labels_data_d: nullptr, p_scale_vars.front()->stream);
DEBUG_PRINTM(p_yf);
//obtain a sub-window for training initial model
- p_scale_vars[0].patch_feats.clear();
- get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size[0], p_windows_size[1], p_scale_vars[0]);
- fft.forward_window(p_scale_vars[0].patch_feats, p_model_xf, p_scale_vars[0].fw_all, m_use_cuda ? p_scale_vars[0].data_features_d : nullptr);
+ p_scale_vars.front()->patch_feats.clear();
+ get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size[0], p_windows_size[1], *p_scale_vars.front());
+ fft.forward_window(p_scale_vars.front()->patch_feats, p_model_xf, p_scale_vars.front()->fw_all,
+ m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
DEBUG_PRINTM(p_model_xf);
-
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+ p_scale_vars.front()->model_xf = p_model_xf;
+ p_scale_vars.front()->model_xf.set_stream(p_scale_vars.front()->stream);
+ p_yf.set_stream(p_scale_vars.front()->stream);
+#endif
if (m_use_linearkernel) {
ComplexMat xfconj = p_model_xf.conj();
p_model_alphaf_den = (p_model_xf * xfconj);
} else {
//Kernel Ridge Regression, calculate alphas (in Fourier domain)
- gaussian_correlation(p_scale_vars[0], p_model_xf, p_model_xf, p_kernel_sigma, true);
- DEBUG_PRINTM(p_scale_vars[0].kf);
- p_model_alphaf_num = p_yf * p_scale_vars[0].kf;
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+ gaussian_correlation(*p_scale_vars.front(), p_scale_vars.front()->model_xf, p_scale_vars.front()->model_xf, p_kernel_sigma, true);
+#else
+ gaussian_correlation(*p_scale_vars.front(), p_model_xf, p_model_xf, p_kernel_sigma, true);
+#endif
+ DEBUG_PRINTM(p_scale_vars.front()->kf);
+ p_model_alphaf_num = p_yf * p_scale_vars.front()->kf;
DEBUG_PRINTM(p_model_alphaf_num);
- p_model_alphaf_den = p_scale_vars[0].kf * (p_scale_vars[0].kf + p_lambda);
+ p_model_alphaf_den = p_scale_vars.front()->kf * (p_scale_vars.front()->kf + float(p_lambda));
DEBUG_PRINTM(p_model_alphaf_den);
}
p_model_alphaf = p_model_alphaf_num / p_model_alphaf_den;
DEBUG_PRINTM(p_model_alphaf);
// p_model_alphaf = p_yf / (kf + p_lambda); //equation for fast training
+
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+ for (auto it = p_scale_vars.begin();it != p_scale_vars.end();++it) {
+ (*it)->model_xf = p_model_xf;
+ (*it)->model_xf.set_stream((*it)->stream);
+ (*it)->model_alphaf = p_model_alphaf;
+ (*it)->model_alphaf.set_stream((*it)->stream);
+ }
+#endif
}
void KCF_Tracker::setTrackerPose(BBox_c &bbox, cv::Mat & img, int fit_size_x, int fit_size_y)
if (p_resize_image) {
cv::resize(input_gray, input_gray, cv::Size(0, 0), p_downscale_factor, p_downscale_factor, cv::INTER_AREA);
cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_downscale_factor, p_downscale_factor, cv::INTER_AREA);
- } else if (p_fit_to_pw2 && p_scale_factor_x != 1 && p_scale_factor_y != 1) {
+ } else if (p_fit_to_pw2 && fabs(p_scale_factor_x-1) > p_floating_error && fabs(p_scale_factor_y-1) > p_floating_error) {
if (p_scale_factor_x < 1 && p_scale_factor_y < 1) {
cv::resize(input_gray, input_gray, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
cv::resize(input_rgb, input_rgb, cv::Size(0, 0), p_scale_factor_x, p_scale_factor_y, cv::INTER_AREA);
if(m_use_multithreading) {
std::vector<std::future<void>> async_res(p_scales.size());
- for (size_t i = 0; i < p_scale_vars.size(); ++i) {
- async_res[i] = std::async(std::launch::async,
- [this, &input_gray, &input_rgb, i]() -> void
- {return scale_track(this->p_scale_vars[i], input_rgb, input_gray, this->p_scales[i]);});
+ for (auto it = p_scale_vars.begin();it != p_scale_vars.end();++it) {
+ uint index = uint(std::distance(p_scale_vars.begin(), it));
+ async_res[index] = std::async(std::launch::async,
+ [this, &input_gray, &input_rgb, index, it]() -> void
+ {return scale_track(*(*it), input_rgb, input_gray, this->p_scales[index]);});
}
- for (size_t i = 0; i < p_scales.size(); ++i) {
- async_res[i].wait();
- if (this->p_scale_vars[i].max_response > max_response) {
- max_response = this->p_scale_vars[i].max_response;
- max_response_pt = & this->p_scale_vars[i].max_loc;
- max_response_map = & this->p_scale_vars[i].response;
- scale_index = i;
+ for (auto it = p_scale_vars.begin();it != p_scale_vars.end();++it) {
+ uint index = uint(std::distance(p_scale_vars.begin(), it));
+ async_res[index].wait();
+ if ((*it)->max_response > max_response) {
+ max_response = (*it)->max_response;
+ max_response_pt = & (*it)->max_loc;
+ max_response_map = & (*it)->response;
+ scale_index = int(index);
}
}
} else {
- int end =m_use_big_batch ? 2: p_num_scales;
- int start = m_use_big_batch ? 1 : 0;
+ uint start = m_use_big_batch ? 1 : 0;
+ uint end = m_use_big_batch ? 2 : uint(p_num_scales);
#pragma omp parallel for schedule(dynamic)
- for (int i = start; i < end; ++i) {
- scale_track(this->p_scale_vars[i], input_rgb, input_gray, this->p_scales[i]);
+ for (uint i = start; i < end; ++i) {
+ auto it = p_scale_vars.begin();
+ std::advance(it, i);
+ scale_track(*(*it), input_rgb, input_gray, this->p_scales[i]);
if (m_use_big_batch) {
for (size_t j = 0;j<p_scales.size();++j) {
- if (this->p_scale_vars[i].max_responses[j] > max_response) {
- max_response = this->p_scale_vars[i].max_responses[j];
- max_response_pt = & this->p_scale_vars[i].max_locs[j];
- max_response_map = & this->p_scale_vars[i].response_maps[j];
- scale_index = j;
+ if ((*it)->max_responses[j] > max_response) {
+ max_response = (*it)->max_responses[j];
+ max_response_pt = & (*it)->max_locs[j];
+ max_response_map = & (*it)->response_maps[j];
+ scale_index = int(j);
}
}
} else {
#pragma omp critical
{
- if (this->p_scale_vars[i].max_response > max_response) {
- max_response = this->p_scale_vars[i].max_response;
- max_response_pt = & this->p_scale_vars[i].max_loc;
- max_response_map = & this->p_scale_vars[i].response;
+ if ((*it)->max_response > max_response) {
+ max_response = (*it)->max_response;
+ max_response_pt = & (*it)->max_loc;
+ max_response_map = & (*it)->response;
scale_index = i;
}
}
new_location = sub_pixel_peak(*max_response_pt, *max_response_map);
DEBUG_PRINT(new_location);
- p_pose.cx += p_current_scale*p_cell_size*new_location.x;
- p_pose.cy += p_current_scale*p_cell_size*new_location.y;
+ p_pose.cx += p_current_scale*p_cell_size*double(new_location.x);
+ p_pose.cy += p_current_scale*p_cell_size*double(new_location.y);
if (p_fit_to_pw2) {
if (p_pose.cx < 0) p_pose.cx = 0;
if (p_pose.cx > (img.cols*p_scale_factor_x)-1) p_pose.cx = (img.cols*p_scale_factor_x)-1;
}
//sub grid scale interpolation
- double new_scale = p_scales[scale_index];
+ double new_scale = p_scales[uint(scale_index)];
if (m_use_subgrid_scale)
new_scale = sub_grid_scale(scale_index);
if (p_current_scale > p_min_max_scale[1])
p_current_scale = p_min_max_scale[1];
//obtain a subwindow for training at newly estimated target position
- p_scale_vars[0].patch_feats.clear();
- get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size[0], p_windows_size[1], p_scale_vars[0], p_current_scale);
- fft.forward_window(p_scale_vars[0].patch_feats, p_scale_vars[0].xf, p_scale_vars[0].fw_all, m_use_cuda ? p_scale_vars[0].data_features_d : nullptr);
+ p_scale_vars.front()->patch_feats.clear();
+ get_features(input_rgb, input_gray, int(p_pose.cx), int(p_pose.cy), p_windows_size[0], p_windows_size[1], *p_scale_vars.front(), p_current_scale);
+ fft.forward_window(p_scale_vars.front()->patch_feats, p_scale_vars.front()->xf, p_scale_vars.front()->fw_all,
+ m_use_cuda ? p_scale_vars.front()->data_features_d : nullptr, p_scale_vars.front()->stream);
//subsequent frames, interpolate model
- p_model_xf = p_model_xf * (1. - p_interp_factor) + p_scale_vars[0].xf * p_interp_factor;
+ p_model_xf = p_model_xf *float((1. - p_interp_factor)) + p_scale_vars.front()->xf * float(p_interp_factor);
ComplexMat alphaf_num, alphaf_den;
if (m_use_linearkernel) {
- ComplexMat xfconj = p_scale_vars[0].xf.conj();
+ ComplexMat xfconj = p_scale_vars.front()->xf.conj();
alphaf_num = xfconj.mul(p_yf);
- alphaf_den = (p_scale_vars[0].xf * xfconj);
+ alphaf_den = (p_scale_vars.front()->xf * xfconj);
} else {
//Kernel Ridge Regression, calculate alphas (in Fourier domain)
- gaussian_correlation(p_scale_vars[0], p_scale_vars[0].xf, p_scale_vars[0].xf, p_kernel_sigma, true);
+ gaussian_correlation(*p_scale_vars.front(), p_scale_vars.front()->xf, p_scale_vars.front()->xf, p_kernel_sigma, true);
// ComplexMat alphaf = p_yf / (kf + p_lambda); //equation for fast training
// p_model_alphaf = p_model_alphaf * (1. - p_interp_factor) + alphaf * p_interp_factor;
- alphaf_num = p_yf * p_scale_vars[0].kf;
- alphaf_den = p_scale_vars[0].kf * (p_scale_vars[0].kf + p_lambda);
+ alphaf_num = p_yf * p_scale_vars.front()->kf;
+ alphaf_den = p_scale_vars.front()->kf * (p_scale_vars.front()->kf + float(p_lambda));
}
- p_model_alphaf_num = p_model_alphaf_num * (1. - p_interp_factor) + alphaf_num * p_interp_factor;
- p_model_alphaf_den = p_model_alphaf_den * (1. - p_interp_factor) + alphaf_den * p_interp_factor;
+ p_model_alphaf_num = p_model_alphaf_num * float((1. - p_interp_factor)) + alphaf_num * float(p_interp_factor);
+ p_model_alphaf_den = p_model_alphaf_den * float((1. - p_interp_factor)) + alphaf_den * float(p_interp_factor);
p_model_alphaf = p_model_alphaf_num / p_model_alphaf_den;
+
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+ for (auto it = p_scale_vars.begin(); it != p_scale_vars.end(); ++it) {
+ (*it)->model_xf = p_model_xf;
+ (*it)->model_xf.set_stream((*it)->stream);
+ (*it)->model_alphaf = p_model_alphaf;
+ (*it)->model_alphaf.set_stream((*it)->stream);
+ }
+#endif
}
void KCF_Tracker::scale_track(Scale_vars & vars, cv::Mat & input_rgb, cv::Mat & input_gray, double scale)
{
if (m_use_big_batch) {
vars.patch_feats.clear();
- for (int i = 0; i < p_num_scales; ++i) {
- get_features(input_rgb, input_gray, this->p_pose.cx, this->p_pose.cy, this->p_windows_size[0], this->p_windows_size[1],
+ for (uint i = 0; i < uint(p_num_scales); ++i) {
+ get_features(input_rgb, input_gray, int(this->p_pose.cx), int(this->p_pose.cy), this->p_windows_size[0], this->p_windows_size[1],
vars, this->p_current_scale * this->p_scales[i]);
}
} else {
vars.patch_feats.clear();
- get_features(input_rgb, input_gray, this->p_pose.cx, this->p_pose.cy, this->p_windows_size[0], this->p_windows_size[1],
+ get_features(input_rgb, input_gray, int(this->p_pose.cx), int(this->p_pose.cy), this->p_windows_size[0], this->p_windows_size[1],
vars, this->p_current_scale * scale);
}
- fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features_d : nullptr);
+ fft.forward_window(vars.patch_feats, vars.zf, vars.fw_all, m_use_cuda ? vars.data_features_d : nullptr, vars.stream);
DEBUG_PRINTM(vars.zf);
if (m_use_linearkernel) {
vars.kzf = m_use_big_batch ? (vars.zf.mul2(this->p_model_alphaf)).sum_over_channels() : (p_model_alphaf * vars.zf).sum_over_channels();
- fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr);
+ fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
} else {
+
+#if defined(CUFFT) && (defined(ASYNC) || defined(OPENMP))
+ gaussian_correlation(vars, vars.zf, vars.model_xf, this->p_kernel_sigma);
+ vars.kzf = vars.model_alphaf * vars.kzf;
+#else
gaussian_correlation(vars, vars.zf, this->p_model_xf, this->p_kernel_sigma);
DEBUG_PRINTM(this->p_model_alphaf);
DEBUG_PRINTM(vars.kzf);
vars.kzf = m_use_big_batch ? vars.kzf.mul(this->p_model_alphaf) : this->p_model_alphaf * vars.kzf;
- fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr);
+#endif
+ fft.inverse(vars.kzf, vars.response, m_use_cuda ? vars.data_i_1ch_d : nullptr, vars.stream);
}
DEBUG_PRINTM(vars.response);
void KCF_Tracker::get_features(cv::Mat & input_rgb, cv::Mat & input_gray, int cx, int cy, int size_x, int size_y, Scale_vars &vars, double scale)
{
- int size_x_scaled = floor(size_x*scale);
- int size_y_scaled = floor(size_y*scale);
+ int size_x_scaled = int(floor(size_x*scale));
+ int size_y_scaled = int(floor(size_y*scale));
cv::Mat patch_gray = get_subwindow(input_gray, cx, cy, size_x_scaled, size_y_scaled);
cv::Mat patch_rgb = get_subwindow(input_rgb, cx, cy, size_x_scaled, size_y_scaled);
std::vector<cv::Mat> cn_feat = CNFeat::extract(patch_rgb);
color_feat.insert(color_feat.end(), cn_feat.begin(), cn_feat.end());
}
-
vars.patch_feats.insert(vars.patch_feats.end(), color_feat.begin(), color_feat.end());
return;
}
float * row_ptr = labels.ptr<float>(j);
double y_s = y*y;
for (int x = range_x[0], i = 0; x < range_x[1]; ++x, ++i){
- row_ptr[i] = std::exp(-0.5 * (y_s + x*x) / sigma_s);//-1/2*e^((y^2+x^2)/sigma^2)
+ row_ptr[i] = float(std::exp(-0.5 * (y_s + x*x) / sigma_s));//-1/2*e^((y^2+x^2)/sigma^2)
}
}
//rotate so that 1 is at top-left corner (see KCF paper for explanation)
#ifdef CUFFT
cv::Mat tmp = circshift(labels, range_x[0], range_y[0]);
- tmp.copyTo(p_scale_vars[0].rot_labels);
+ tmp.copyTo(p_scale_vars.front()->rot_labels);
- assert(p_scale_vars[0].rot_labels.at<float>(0,0) >= 1.f - 1e-10f);
+ // p_scale_vars is a std::list of unique_ptr, which has no operator[]; use front() so the
+ // sanity check still compiles when assertions are enabled.
+ assert(p_scale_vars.front()->rot_labels.at<float>(0,0) >= 1.f - 1e-10f);
return tmp;
cv::Mat m1(1, dim1, CV_32FC1), m2(dim2, 1, CV_32FC1);
double N_inv = 1./(static_cast<double>(dim1)-1.);
for (int i = 0; i < dim1; ++i)
- m1.at<float>(i) = 0.5*(1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv));
+ m1.at<float>(i) = float(0.5*(1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv)));
N_inv = 1./(static_cast<double>(dim2)-1.);
for (int i = 0; i < dim2; ++i)
- m2.at<float>(i) = 0.5*(1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv));
+ m2.at<float>(i) = float(0.5*(1. - std::cos(2. * CV_PI * static_cast<double>(i) * N_inv)));
cv::Mat ret = m2*m1;
return ret;
}
//out of image
if (x1 >= input.cols || y1 >= input.rows || x2 < 0 || y2 < 0) {
patch.create(height, width, input.type());
- patch.setTo(0.f);
+ patch.setTo(double(0.f));
return patch;
}
#endif
vars.xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
DEBUG_PRINTM(vars.xyf);
- fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features_d : nullptr);
+ fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features_d : nullptr, vars.stream);
#ifdef CUFFT
if(auto_correlation)
- cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.xf_sqr_norm_d, sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width);
+ cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.xf_sqr_norm_d,
+ sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
else
- cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.yf_sqr_norm_d, sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width);
+ cuda_gaussian_correlation(vars.data_i_features, vars.gauss_corr_res_d, vars.xf_sqr_norm_d, vars.yf_sqr_norm_d,
+ sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width, vars.stream);
#else
//ifft2 and sum over 3rd dimension, we dont care about individual channels
DEBUG_PRINTM(vars.ifft2_res);
if (xf.channels() != p_num_scales*p_num_of_feats)
xy_sum.create(vars.ifft2_res.size(), CV_32FC1);
else
- xy_sum.create(vars.ifft2_res.size(), CV_32FC(p_scales.size()));
+ xy_sum.create(vars.ifft2_res.size(), CV_32FC(int(p_scales.size())));
xy_sum.setTo(0);
for (int y = 0; y < vars.ifft2_res.rows; ++y) {
float * row_ptr = vars.ifft2_res.ptr<float>(y);
cv::split(xy_sum,scales);
float numel_xf_inv = 1.f/(xf.cols * xf.rows * (xf.channels()/xf.n_scales));
- for (int i = 0; i < xf.n_scales; ++i){
- cv::Mat in_roi(vars.in_all, cv::Rect(0, i*scales[0].rows, scales[0].cols, scales[0].rows));
- cv::exp(- 1.f / (sigma * sigma) * cv::max((vars.xf_sqr_norm[i] + vars.yf_sqr_norm[0] - 2 * scales[i]) * numel_xf_inv, 0), in_roi);
+ for (uint i = 0; i < uint(xf.n_scales); ++i){
+ cv::Mat in_roi(vars.in_all, cv::Rect(0, int(i)*scales[0].rows, scales[0].cols, scales[0].rows));
+ cv::exp(- 1. / (sigma * sigma) * cv::max((double(vars.xf_sqr_norm[i] + vars.yf_sqr_norm[0]) - 2 * scales[i]) * double(numel_xf_inv), 0), in_roi);
DEBUG_PRINTM(in_roi);
}
#endif
DEBUG_PRINTM(vars.in_all);
- fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res_d : nullptr);
+ fft.forward(vars.in_all, auto_correlation ? vars.kf : vars.kzf, m_use_cuda ? vars.gauss_corr_res_d : nullptr, vars.stream);
return;
}
cv::Mat x;
cv::solve(A, fval, x, cv::DECOMP_SVD);
- double a = x.at<float>(0), b = x.at<float>(1), c = x.at<float>(2),
+ float a = x.at<float>(0), b = x.at<float>(1), c = x.at<float>(2),
d = x.at<float>(3), e = x.at<float>(4);
cv::Point2f sub_peak(max_loc.x, max_loc.y);
+// Refines the scale estimate by fitting a 1d quadratic f(x) = a*x^2 + b*x + c to
+// (scale, max response) samples with cv::solve and returning -b / (2a).
+// index outside [0, p_scales.size()-1]: fit over every scale.
+// index at either end of p_scales: return p_scales[index] unchanged.
+// otherwise: fit over the scale at index and its two neighbours.
+// In big-batch mode all responses are read from p_scale_vars.back()->max_responses.
double KCF_Tracker::sub_grid_scale(int index)
{
cv::Mat A, fval;
- if (index < 0 || index > (int)p_scales.size()-1) {
+ if (index < 0 || index > int(p_scales.size())-1) {
// interpolate from all values
// fit 1d quadratic function f(x) = a*x^2 + b*x + c
- A.create(p_scales.size(), 3, CV_32FC1);
- fval.create(p_scales.size(), 1, CV_32FC1);
- for (size_t i = 0; i < p_scales.size(); ++i) {
- A.at<float>(i, 0) = p_scales[i] * p_scales[i];
- A.at<float>(i, 1) = p_scales[i];
- A.at<float>(i, 2) = 1;
- fval.at<float>(i) = m_use_big_batch ? p_scale_vars[1].max_responses[i] : p_scale_vars[i].max_response;
+ A.create(int(p_scales.size()), 3, CV_32FC1);
+ fval.create(int(p_scales.size()), 1, CV_32FC1);
+ // Drive the loop by p_scales, not by p_scale_vars: in big-batch mode p_scale_vars
+ // holds fewer entries than there are scales, which would leave rows of A and fval unset.
+ auto it = p_scale_vars.begin();
+ for (size_t i = 0; i < p_scales.size(); ++i) {
+ int j = int(i);
+ A.at<float>(j, 0) = float(p_scales[i] * p_scales[i]);
+ A.at<float>(j, 1) = float(p_scales[i]);
+ A.at<float>(j, 2) = 1;
+ if (m_use_big_batch) {
+ fval.at<float>(j) = float(p_scale_vars.back()->max_responses[i]);
+ } else {
+ // one Scale_vars per scale: walk the list in step with i
+ fval.at<float>(j) = float((*it)->max_response);
+ ++it;
+ }
}
} else {
//only from neighbours
- if (index == 0 || index == (int)p_scales.size()-1)
- return p_scales[index];
+ if (index == 0 || index == int(p_scales.size())-1)
+ return p_scales[uint(index)];
A = (cv::Mat_<float>(3, 3) <<
- p_scales[index-1] * p_scales[index-1], p_scales[index-1], 1,
- p_scales[index] * p_scales[index], p_scales[index], 1,
- p_scales[index+1] * p_scales[index+1], p_scales[index+1], 1);
- fval = (cv::Mat_<float>(3, 1) << (m_use_big_batch ? p_scale_vars[1].max_responses[index-1] : p_scale_vars[index-1].max_response),
- (m_use_big_batch ? p_scale_vars[1].max_responses[index] : p_scale_vars[index].max_response),
- (m_use_big_batch ? p_scale_vars[1].max_responses[index+1] : p_scale_vars[index+1].max_response));
+ p_scales[uint(index)-1] * p_scales[uint(index)-1], p_scales[uint(index)-1], 1,
+ p_scales[uint(index)] * p_scales[uint(index)], p_scales[uint(index)], 1,
+ p_scales[uint(index)+1] * p_scales[uint(index)+1], p_scales[uint(index)+1], 1);
+ // Collect the three neighbouring responses before building fval. The list iterator is
+ // only advanced when there is one Scale_vars per scale; in big-batch mode the list is
+ // shorter than p_scales and advancing by index would run past its end.
+ double responses[3];
+ if (m_use_big_batch) {
+ for (uint k = 0; k < 3; ++k)
+ responses[k] = p_scale_vars.back()->max_responses[uint(index)-1+k];
+ } else {
+ auto it = p_scale_vars.begin();
+ std::advance(it, index-1);
+ for (uint k = 0; k < 3; ++k, ++it)
+ responses[k] = (*it)->max_response;
+ }
+ fval = (cv::Mat_<float>(3, 1) << responses[0], responses[1], responses[2]);
}
cv::Mat x;
cv::solve(A, fval, x, cv::DECOMP_SVD);
- double a = x.at<float>(0), b = x.at<float>(1);
- double scale = p_scales[index];
+ float a = x.at<float>(0), b = x.at<float>(1);
+ // NOTE(review): on the "all values" path index is out of range, so this read of p_scales
+ // is out of bounds (it was already so before this change) - confirm the intended fallback.
+ double scale = p_scales[uint(index)];
if (a > 0 || a < 0)
- scale = -b / (2 * a);
+ scale = double(-b / (2 * a));
return scale;
}
#include <opencv2/opencv.hpp>
#include <vector>
+#include <memory>
#include "fhog.hpp"
#ifdef CUFFT
inline cv::Rect get_rect()
{
- return cv::Rect(cx-w/2., cy-h/2., w, h);
+ // Build the rectangle from its top-left corner and extent; every component is
+ // converted to int explicitly.
+ const int left = int(cx-w/2.);
+ const int top = int(cy-h/2.);
+ const int width = int(w);
+ const int height = int(h);
+ return cv::Rect(left, top, width, height);
}
};
const double p_downscale_factor = 0.5;
double p_scale_factor_x = 1;
double p_scale_factor_y = 1;
+ double p_floating_error = 0.0001;
double p_padding = 1.5;
double p_output_sigma_factor = 0.1;
int p_num_of_feats;
int p_roi_height, p_roi_width;
- std::vector<Scale_vars> p_scale_vars;
+ std::list<std::unique_ptr<Scale_vars>> p_scale_vars;
//model
ComplexMat p_yf;
int nOrients, int softBin, bool full, float clip )
{
float *N, *R; const int hb=h/binSize, wb=w/binSize, nb=hb*wb;
+ (void) nb;
// compute unnormalized gradient histograms
R = (float*) wrCalloc(wb*hb*nOrients,sizeof(float));
gradHist( M, O, R, h, w, binSize, nOrients, softBin, full );
#include "complexmat.cuh"
#else
#include "complexmat.hpp"
+//For compatibility reasons between CuFFT and FFTW, OpenCVfft versions.
+ typedef int* cudaStream_t;
#endif
struct Scale_vars
{
public:
- Scale_vars();
- Scale_vars(int windows_size[2], int cell_size, int num_of_feats, int num_of_scales = 1,ComplexMat *model_xf = nullptr, ComplexMat *yf = nullptr,bool zero_index = false)
+ Scale_vars(int windows_size[2], int cell_size, int num_of_feats, int num_of_scales = 1,ComplexMat *model_xf = nullptr,
+ ComplexMat *yf = nullptr,bool zero_index = false)
{
- double alloc_size;
-
+ uint alloc_size;
+ std::cout << __PRETTY_FUNCTION__ << std::endl;
#ifdef CUFFT
- if (zero_index)
+ if (zero_index) {
cudaSetDeviceFlags(cudaDeviceMapHost);
+ this->zero_index = true;
+ }
- alloc_size = windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_scales*sizeof(cufftReal);
- CudaSafeCall(cudaHostAlloc((void**)&this->data_i_1ch, alloc_size, cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer((void**)&this->data_i_1ch_d, (void*)this->data_i_1ch, 0));
+#if defined(ASYNC) || defined(OPENMP)
+ CudaSafeCall(cudaStreamCreate(&this->stream));
+#endif
- alloc_size = windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_feats*sizeof(cufftReal);
- CudaSafeCall(cudaHostAlloc((void**)&this->data_i_features, alloc_size, cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer((void**)&this->data_i_features_d, (void*)this->data_i_features, 0));
+ alloc_size = uint(windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_scales)*sizeof(cufftReal);
+ CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->data_i_1ch), alloc_size, cudaHostAllocMapped));
+ CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->data_i_1ch_d), reinterpret_cast<void*>(this->data_i_1ch), 0));
+
+ alloc_size = uint(windows_size[0]/cell_size*windows_size[1]/cell_size*num_of_feats)*sizeof(cufftReal);
+ CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->data_i_features), alloc_size, cudaHostAllocMapped));
+ CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->data_i_features_d), reinterpret_cast<void*>(this->data_i_features), 0));
this->ifft2_res = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC(num_of_feats), this->data_i_features);
this->response = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC(num_of_scales), this->data_i_1ch);
- this->zf = ComplexMat(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, num_of_scales);
- this->kzf = ComplexMat(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales);
- this->kf = ComplexMat(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales);
+ this->zf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, num_of_scales, this->stream);
+ std::cout << this->zf.stream << std::endl;
+ std::cout << this->zf.n_scales << std::endl;
+ this->kzf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
+ std::cout << this->kzf.stream << std::endl;
+ std::cout << this->kzf.n_scales << std::endl;
+ this->kf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_scales, this->stream);
+ std::cout << this->kf.stream << std::endl;
+ std::cout << this->kf.n_scales << std::endl << std::endl;
- alloc_size = num_of_scales;
+ alloc_size = uint(num_of_scales);
- CudaSafeCall(cudaHostAlloc((void**)&this->xf_sqr_norm, alloc_size*sizeof(float), cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer((void**)&this->xf_sqr_norm_d, (void*)this->xf_sqr_norm, 0));
+ CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->xf_sqr_norm), alloc_size*sizeof(float), cudaHostAllocMapped));
+ CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->xf_sqr_norm_d), reinterpret_cast<void*>(this->xf_sqr_norm), 0));
- CudaSafeCall(cudaHostAlloc((void**)&this->yf_sqr_norm, sizeof(float), cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer((void**)&this->yf_sqr_norm_d, (void*)this->yf_sqr_norm, 0));
+ CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->yf_sqr_norm), sizeof(float), cudaHostAllocMapped));
+ CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->yf_sqr_norm_d), reinterpret_cast<void*>(this->yf_sqr_norm), 0));
- alloc_size =(windows_size[0]/cell_size)*(windows_size[1]/cell_size)*alloc_size*sizeof(float);
- CudaSafeCall(cudaHostAlloc((void**)&this->gauss_corr_res, alloc_size, cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer((void**)&this->gauss_corr_res_d, (void*)this->gauss_corr_res, 0));
+ alloc_size =uint((windows_size[0]/cell_size)*(windows_size[1]/cell_size))*alloc_size*sizeof(float);
+ CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->gauss_corr_res), alloc_size, cudaHostAllocMapped));
+ CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->gauss_corr_res_d), reinterpret_cast<void*>(this->gauss_corr_res), 0));
this->in_all = cv::Mat(windows_size[1]/cell_size*num_of_scales, windows_size[0]/cell_size, CV_32F, this->gauss_corr_res_d);
if (zero_index) {
- alloc_size = (windows_size[0]/cell_size)*(windows_size[1]/cell_size)*sizeof(float);
- CudaSafeCall(cudaHostAlloc((void**)&this->rot_labels_data, alloc_size, cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer((void**)&this->rot_labels_data_d, (void*)this->rot_labels_data, 0));
+ alloc_size = uint((windows_size[0]/cell_size)*(windows_size[1]/cell_size))*sizeof(float);
+ CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->rot_labels_data), alloc_size, cudaHostAllocMapped));
+ CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->rot_labels_data_d), reinterpret_cast<void*>(this->rot_labels_data), 0));
this->rot_labels = cv::Mat(windows_size[1]/cell_size, windows_size[0]/cell_size, CV_32FC1, this->rot_labels_data);
}
- alloc_size = (windows_size[0]/cell_size)*((windows_size[1]/cell_size)*num_of_feats)*sizeof(cufftReal);
- CudaSafeCall(cudaHostAlloc((void**)&this->data_features, alloc_size, cudaHostAllocMapped));
- CudaSafeCall(cudaHostGetDevicePointer((void**)&this->data_features_d, (void*)this->data_features, 0));
+ alloc_size = uint((windows_size[0]/cell_size)*((windows_size[1]/cell_size)*num_of_feats))*sizeof(cufftReal);
+ CudaSafeCall(cudaHostAlloc(reinterpret_cast<void**>(&this->data_features), alloc_size, cudaHostAllocMapped));
+ CudaSafeCall(cudaHostGetDevicePointer(reinterpret_cast<void**>(&this->data_features_d), reinterpret_cast<void*>(this->data_features), 0));
this->fw_all = cv::Mat((windows_size[1]/cell_size)*num_of_feats, windows_size[0]/cell_size, CV_32F, this->data_features);
#else
- alloc_size = num_of_scales;
+ alloc_size = uint(num_of_scales);
- this->xf_sqr_norm = (float*) malloc(alloc_size*sizeof(float));
- this->yf_sqr_norm = (float*) malloc(sizeof(float));
+ this->xf_sqr_norm = reinterpret_cast<float*>(malloc(alloc_size*sizeof(float)));
+ this->yf_sqr_norm = reinterpret_cast<float*>(malloc(sizeof(float)));
- this->patch_feats.reserve(num_of_feats);
+ this->patch_feats.reserve(uint(num_of_feats));
int height = windows_size[1]/cell_size;
#ifdef FFTW
model_xf->create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats);
yf->create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, 1);
//We use scale_vars[0] for updating the tracker, so we only allocate memory for its xf only.
+#ifdef CUFFT
+ this->xf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats, this->stream);
+#else
this->xf.create(windows_size[1]/cell_size, (windows_size[0]/cell_size)/2+1, num_of_feats);
+#endif
} else if (num_of_scales > 1) {
- this->max_responses.reserve(num_of_scales);
- this->max_locs.reserve(num_of_scales);
- this->response_maps.reserve(num_of_scales);
+ this->max_responses.reserve(uint(num_of_scales));
+ this->max_locs.reserve(uint(num_of_scales));
+ this->response_maps.reserve(uint(num_of_scales));
}
#else
if (zero_index) {
#endif
}
+ // Scale_vars owns the raw buffers (and, in ASYNC/OPENMP CUFFT builds, the stream) released
+ // by the destructor below; a copy would release them a second time, so copying is disabled.
+ Scale_vars(const Scale_vars &) = delete;
+ Scale_vars & operator=(const Scale_vars &) = delete;
+
+ // Releases everything the constructor allocated: the cudaHostAlloc'ed buffers and the
+ // stream in CUFFT builds, the malloc'ed norm buffers otherwise.
+ ~Scale_vars() {
+#ifdef CUFFT
+ std::cout << __PRETTY_FUNCTION__ << std::endl;
+ CudaSafeCall(cudaFreeHost(this->xf_sqr_norm));
+ CudaSafeCall(cudaFreeHost(this->yf_sqr_norm));
+ CudaSafeCall(cudaFreeHost(this->data_i_1ch));
+ CudaSafeCall(cudaFreeHost(this->data_i_features));
+ CudaSafeCall(cudaFreeHost(this->gauss_corr_res));
+ // rot_labels_data is only allocated for the zero-index instance
+ if (zero_index)
+ CudaSafeCall(cudaFreeHost(this->rot_labels_data));
+ CudaSafeCall(cudaFreeHost(this->data_features));
+#if defined(ASYNC) || defined(OPENMP)
+ // the stream is only created under the same ASYNC/OPENMP guard in the constructor
+ CudaSafeCall(cudaStreamDestroy(this->stream));
+#endif
+#else
+ free(this->xf_sqr_norm);
+ free(this->yf_sqr_norm);
+#endif
+ }
+
float *xf_sqr_norm = nullptr, *yf_sqr_norm = nullptr;
std::vector<cv::Mat> patch_feats;
*rot_labels_data_d = nullptr, *data_features = nullptr, *data_features_d = nullptr;
float *data_f = nullptr, *data_i_features = nullptr, *data_i_features_d = nullptr, *data_i_1ch = nullptr, *data_i_1ch_d = nullptr;
+ cudaStream_t stream = nullptr;
+ ComplexMat model_alphaf, model_xf;
+
//Big batch variables
cv::Point2i max_loc;
double max_val, max_response;
std::vector<double> max_responses;
std::vector<cv::Point2i> max_locs;
std::vector<cv::Mat> response_maps;
+ bool zero_index = false;
};
#endif // SCALE_VARS_HPP