# Makefile to build all the available variants
-BUILDS = opencvfft-st opencvfft-async fftw cufftw fftw_openmp fftw_async fftw_big fftw_big_openmp cufft cufft_big cufft_big_openmp
+BUILDS = opencvfft-st opencvfft-async fftw fftw_openmp fftw_async fftw_big fftw_big_openmp cufftw cufftw_big cufftw_big_openmp cufft cufft_big cufft_big_openmp
all: $(foreach build,$(BUILDS),build-$(build)/kcf_vot)
CMAKE_OPTS += -G Ninja
#CMAKE_OPTS += = -DOpenCV_DIR=
-CMAKE_OTPS_opencvfft-st = -DFFT=OpenCV
-CMAKE_OTPS_opencvfft-async = -DFFT=OpenCV -DASYNC=ON
+CMAKE_OTPS_opencvfft-st = -DFFT=OpenCV
+CMAKE_OTPS_opencvfft-async = -DFFT=OpenCV -DASYNC=ON
#CMAKE_OTPS_opencv-cufft = -DFFT=OpenCV_cuFFT
-CMAKE_OTPS_fftw = -DFFT=fftw
-CMAKE_OTPS_cufftw = -DFFT=cuFFTW
-CMAKE_OTPS_fftw_openmp = -DFFT=fftw -DOPENMP=ON
-CMAKE_OTPS_fftw_async = -DFFT=fftw -DASYNC=ON
-CMAKE_OTPS_fftw_big = -DFFT=fftw -DBIG_BATCH=ON
-CMAKE_OTPS_fftw_big_openmp = -DFFT=fftw -DBIG_BATCH=ON -DOPENMP=ON
-CMAKE_OTPS_cufft = -DFFT=cuFFT
-CMAKE_OTPS_cufft_big = -DFFT=cuFFT -DBIG_BATCH=ON
-CMAKE_OTPS_cufft_big_openmp= = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
-
+CMAKE_OTPS_fftw = -DFFT=fftw
+CMAKE_OTPS_fftw_openmp = -DFFT=fftw -DOPENMP=ON
+CMAKE_OTPS_fftw_async = -DFFT=fftw -DASYNC=ON
+CMAKE_OTPS_fftw_big = -DFFT=fftw -DBIG_BATCH=ON
+CMAKE_OTPS_fftw_big_openmp = -DFFT=fftw -DBIG_BATCH=ON -DOPENMP=ON
+CMAKE_OTPS_cufftw = -DFFT=cuFFTW
+CMAKE_OTPS_cufftw_big = -DFFT=cuFFTW -DBIG_BATCH=ON
+CMAKE_OTPS_cufftw_big_openmp = -DFFT=cuFFTW -DBIG_BATCH=ON -DOPENMP=ON
+CMAKE_OTPS_cufft = -DFFT=cuFFT
+CMAKE_OTPS_cufft_big = -DFFT=cuFFT -DBIG_BATCH=ON
+CMAKE_OTPS_cufft_big_openmp = -DFFT=cuFFT -DBIG_BATCH=ON -DOPENMP=ON
build-%/kcf_vot: $(shell git ls-files)
mkdir -p $(@D)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
set(CUDA_HOST_COMPILER /usr/bin/g++)
- list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11)
+ list( APPEND CUDA_NVCC_FLAGS -O3 --gpu-architecture sm_62 -std=c++11 -default-stream per-thread)
find_cuda_helper_libs(cufftw)
IF(FFT STREQUAL "cuFFT")
cuda_add_library(complexmat complexmat.cu)
#include "complexmat.cuh"
-__global__ void sqr_norm_kernel(int n, double* out, float* data, float rows, float cols)
+__global__ void sqr_norm_kernel(int n, float* out, float* data, float rows, float cols)
{
- extern __shared__ double sdata[];
+ extern __shared__ float sdata[];
int i = blockDim.x * threadIdx.y + threadIdx.x;
int blockId = blockIdx.x + blockIdx.y * gridDim.x;
int threadId = 2*(blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x);
}
}
-float* ComplexMat::sqr_norm() const
+void ComplexMat::sqr_norm(float *result) const
{
-
- double *sums_sqr_norms_from_d_d, *sums_sqr_norms_d_d;
- float *sums_sqr_norms_from_d_f;
- sums_sqr_norms_from_d_d = (double*) malloc(n_scales*sizeof(double));
- sums_sqr_norms_from_d_f = (float*) malloc(n_scales*sizeof(float));
- cudaMalloc(&sums_sqr_norms_d_d, n_scales*sizeof(double));
- cudaMemset(sums_sqr_norms_d_d, 0, n_scales*sizeof(double));
+ cudaMemset(result, 0, n_scales*sizeof(float));
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(double)>>>(n_channels/n_scales, sums_sqr_norms_d_d, p_data_d, rows, cols);
-
- cudaError_t error = cudaGetLastError(); if(error != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); }
-
- cudaMemcpy(sums_sqr_norms_from_d_d, sums_sqr_norms_d_d, n_scales*sizeof(double), cudaMemcpyDeviceToHost);
- cudaFree(sums_sqr_norms_d_d);
-
- for(int i = 0; i < n_scales; ++i){
- sums_sqr_norms_from_d_f[i] = sums_sqr_norms_from_d_d[i];
- }
- free(sums_sqr_norms_from_d_d);
+ sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows*cols*sizeof(float)>>>(n_channels/n_scales, result, p_data, rows, cols);
- return sums_sqr_norms_from_d_f;
+ return;
}
__global__ void sqr_mag_kernel(float* data, float* result)
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, result.p_data_d);
+ sqr_mag_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
return result;
}
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, result.p_data_d);
+ conj_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, result.p_data);
return result;
}
cufftComplex* ComplexMat::get_p_data() const
{
- return (cufftComplex*) p_data_d;
+ return (cufftComplex*) p_data;
}
__global__ void same_num_channels_mul_kernel(float* data_l, float* data_r, float* result)
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+ same_num_channels_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
return result;
}
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+ same_num_channels_div_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
return result;
}
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+ same_num_channels_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
return result;
}
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs, result.p_data_d);
+ constant_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
return result;
}
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs, result.p_data_d);
+ constant_add_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs, result.p_data);
return result;
}
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+ one_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
return result;
}
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels/n_scales, n_scales);
- scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data_d, rhs.p_data_d, result.p_data_d);
+ scales_channel_mul_kernel<<<numBlocks, threadsPerBlock>>>(this->p_data, rhs.p_data, result.p_data);
return result;
}
n_scales = rhs.n_scales;
p_data = rhs.p_data;
- p_data_d = rhs.p_data_d;
}
n_scales = rhs.n_scales;
p_data = rhs.p_data;
- p_data_d = rhs.p_data_d;
rhs.p_data = nullptr;
- rhs.p_data_d = nullptr;
}
ComplexMat() : cols(0), rows(0), n_channels(0) {}
ComplexMat(int _rows, int _cols, int _n_channels) : cols(_cols), rows(_rows), n_channels(_n_channels)
{
- cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
- cudaHostGetDevicePointer((void **)&p_data_d, (void *) p_data , 0);
+ cudaMalloc(&p_data, n_channels*cols*rows*sizeof(cufftComplex));
}
ComplexMat(int _rows, int _cols, int _n_channels, int _n_scales) : cols(_cols), rows(_rows), n_channels(_n_channels), n_scales(_n_scales)
{
- cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
- cudaHostGetDevicePointer((void **)&p_data_d, (void *) p_data , 0);
+ cudaMalloc(&p_data, n_channels*cols*rows*sizeof(cufftComplex));
}
ComplexMat(ComplexMat &&other)
p_data = other.p_data;
other.p_data = nullptr;
- other.p_data_d = nullptr;
}
~ComplexMat()
{
- if(p_data != nullptr) cudaFreeHost(p_data);
+ if(p_data != nullptr) cudaFree(p_data);
}
void create(int _rows, int _cols, int _n_channels)
rows = _rows;
cols = _cols;
n_channels = _n_channels;
- cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
- cudaHostGetDevicePointer((void **)&p_data_d, (void *) p_data , 0);
+ cudaMalloc(&p_data, n_channels*cols*rows*sizeof(cufftComplex));
}
void create(int _rows, int _cols, int _n_channels, int _n_scales)
cols = _cols;
n_channels = _n_channels;
n_scales = _n_scales;
- cudaHostAlloc((void **)&p_data, n_channels*cols*rows*sizeof(cufftComplex), cudaHostAllocMapped);
- cudaHostGetDevicePointer((void **)&p_data_d, (void *) p_data , 0);
+ cudaMalloc(&p_data, n_channels*cols*rows*sizeof(cufftComplex));
}
// cv::Mat API compatibility
cv::Size size() { return cv::Size(cols, rows); }
int channels() { return n_channels; }
int channels() const { return n_channels; }
- float* sqr_norm() const;
+ void sqr_norm(float *result) const;
ComplexMat sqr_mag() const;
private:
- mutable float *p_data = nullptr, *p_data_d = nullptr;
+ mutable float *p_data = nullptr;
};
\ No newline at end of file
}
- T* sqr_norm() const
+ void sqr_norm(T *sums_sqr_norms) const
{
- T* sums_sqr_norms;
- sums_sqr_norms = (T*) malloc(n_scales*sizeof(T));
int n_channels_per_scale = n_channels/n_scales;
int scale_offset = n_channels_per_scale*rows*cols;
T sum_sqr_norm;
sum_sqr_norm += lhs->real()*lhs->real() + lhs->imag()*lhs->imag();
sums_sqr_norms[scale] = sum_sqr_norm/static_cast<T>(cols*rows);
}
- return sums_sqr_norms;
+ return;
}
ComplexMat_<T> sqr_mag() const
}
cuFFT::cuFFT(): m_num_of_streams(4)
-{
- cudaSetDeviceFlags(cudaDeviceMapHost);
-}
+{}
void cuFFT::init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode)
{
m_big_batch_mode = big_batch_mode;
std::cout << "FFT: cuFFT" << std::endl;
-
-// cudaSetDeviceFlags(cudaDeviceMapHost);
for (unsigned i = 0; i < m_num_of_streams; i++) cudaStreamCreate(&streams[i]);
cudaMalloc(&data_f, m_height*m_width*sizeof(cufftReal));
cufftPlan2d(&plan_f, m_height, m_width, CUFFT_R2C);
+
+
}
//FFT forward all scales
if(m_num_of_scales > 1 && m_big_batch_mode)
int howmany = m_num_of_scales;
int idist = m_height*m_width, odist = m_height*(m_width/2+1);
int istride = 1, ostride = 1;
- int *inembed = NULL, *onembed = NULL;
+ int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_f_all_scales, rank, n,
inembed, istride, idist,
}
//FFT forward window one scale
{
- cudaMalloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal));
+ cudaHostAlloc(&data_fw, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+ cudaHostGetDevicePointer(&data_fw_d, data_fw, 0);
int rank = 2;
int n[] = {(int)m_height, (int)m_width};
int howmany = m_num_of_feats;
int idist = m_height*m_width, odist = m_height*(m_width/2+1);
int istride = 1, ostride = 1;
- int *inembed = NULL, *onembed = NULL;
+ int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw, rank, n,
inembed, istride, idist,
//FFT forward window all scales all feats
if(m_num_of_scales > 1 && m_big_batch_mode)
{
- cudaMalloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal));
-
+ cudaHostAlloc(&data_fw_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+ cudaHostGetDevicePointer(&data_fw_all_scales_d, data_fw_all_scales, 0);
+
int rank = 2;
int n[] = {(int)m_height, (int)m_width};
int howmany = m_num_of_scales*m_num_of_feats;
int idist = m_height*m_width, odist = m_height*(m_width/2+1);
int istride = 1, ostride = 1;
- int *inembed = NULL, *onembed = NULL;
+ int *inembed = n, onembed[] = {(int)m_height, (int)m_width/2+1};
CHECK_CUFFT_ERRORS(cufftPlanMany(&plan_fw_all_scales, rank, n,
inembed, istride, idist,
onembed, ostride, odist,
CUFFT_R2C, howmany));
+
+
}
//FFT inverse one scale
{
- cudaMalloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal));
+ cudaHostAlloc(&data_i_features, m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+ cudaHostGetDevicePointer(&data_i_features_d, data_i_features, 0);
int rank = 2;
int n[] = {(int)m_height, (int)m_width};
//FFT inverse all scales
if(m_num_of_scales > 1)
{
- cudaMalloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal));
+ cudaHostAlloc(&data_i_features_all_scales, m_height*m_num_of_feats*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+ cudaHostGetDevicePointer(&data_i_features_all_scales_d, data_i_features_all_scales, 0);
int rank = 2;
int n[] = {(int)m_height, (int)m_width};
}
//FFT inverse one channel one scale
{
- cudaMalloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal));
+ cudaHostAlloc(&data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+ cudaHostGetDevicePointer(&data_i_1ch_d, data_i_1ch, 0);
int rank = 2;
int n[] = {(int)m_height, (int)m_width};
//FFT inverse one channel all scales
if(m_num_of_scales > 1 && m_big_batch_mode)
{
- cudaMalloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal));
+ cudaHostAlloc(&data_i_1ch_all_scales, m_height*m_num_of_scales*m_width*sizeof(cufftReal), cudaHostAllocMapped);
+ cudaHostGetDevicePointer(&data_i_1ch_all_scales_d, data_i_1ch_all_scales, 0);
int rank = 2;
int n[] = {(int)m_height, (int)m_width};
ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
{
int n_channels = input.size();
- cv::Mat in_all(m_height * n_channels, m_width, CV_32F);
- for (int i = 0; i < n_channels; ++i) {
- cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
- in_roi = input[i].mul(m_window);
- }
ComplexMat result;
if(n_channels > (int) m_num_of_feats){
+ cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw_all_scales);
+ for (int i = 0; i < n_channels; ++i) {
+ cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
+ in_roi = input[i].mul(m_window);
+ }
+
result.create(m_height, m_width/2 + 1, n_channels,m_num_of_scales);
+
+ CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales_d), result.get_p_data()));
} else {
+ cv::Mat in_all(m_height * n_channels, m_width, CV_32F, data_fw);
+ for (int i = 0; i < n_channels; ++i) {
+ cv::Mat in_roi(in_all, cv::Rect(0, i*m_height, m_width, m_height));
+ in_roi = input[i].mul(m_window);
+ }
+
result.create(m_height, m_width/2 + 1, n_channels);
- }
-
- if (n_channels <= (int) m_num_of_feats){
- cudaMemcpy(data_fw, in_all.ptr<cufftReal>(), m_height*m_num_of_feats*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
- CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw), result.get_p_data()));
- } else {
- cudaMemcpy(data_fw_all_scales, in_all.ptr<cufftReal>(), m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyHostToDevice);
- CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw_all_scales, reinterpret_cast<cufftReal*>(data_fw_all_scales), result.get_p_data()));
+
+ CHECK_CUFFT_ERRORS(cufftExecR2C(plan_fw, reinterpret_cast<cufftReal*>(data_fw_d), result.get_p_data()));
}
return result;
}
cv::Mat cuFFT::inverse(const ComplexMat &inputf)
{
int n_channels = inputf.n_channels;
- cv::Mat real_result(m_height, m_width, CV_32FC(n_channels));
cufftComplex *in = reinterpret_cast<cufftComplex*>(inputf.get_p_data());
-
+
if(n_channels == 1){
- CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch)));
- cudaMemcpy(real_result.ptr<cufftReal>(), data_i_1ch, m_height*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+ cv::Mat real_result(m_height, m_width, CV_32FC1, data_i_1ch);
+
+ CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
+ cudaDeviceSynchronize();
+
+ return real_result/(m_width*m_height);
} else if(n_channels == (int) m_num_of_scales){
- CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales)));
- cudaMemcpy(real_result.ptr<cufftReal>(), data_i_1ch_all_scales, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+ cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_1ch_all_scales);
+
+ CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
+ cudaDeviceSynchronize();
+
+ return real_result/(m_width*m_height);
} else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
- CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales)));
- cudaMemcpy(real_result.ptr<cufftReal>(), data_i_features_all_scales, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
- } else {
- CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features)));
- cudaMemcpy(real_result.ptr<cufftReal>(), data_i_features, m_height*n_channels*m_width*sizeof(cufftReal), cudaMemcpyDeviceToHost);
+ cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features_all_scales);
+
+ CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
+ cudaDeviceSynchronize();
+
+ return real_result/(m_width*m_height);
}
+
+ cv::Mat real_result(m_height, m_width, CV_32FC(n_channels), data_i_features);
+
+ CHECK_CUFFT_ERRORS(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
+ cudaDeviceSynchronize();
+
return real_result/(m_width*m_height);
}
cudaFree(data_f);
cudaFree(data_f_all_scales);
- cudaFree(data_fw);
- cudaFree(data_fw_all_scales);
- cudaFree(data_i_1ch);
- cudaFree(data_i_1ch_all_scales);
- cudaFree(data_i_features);
- cudaFree(data_i_features_all_scales);
+ cudaFreeHost(data_fw);
+ cudaFreeHost(data_fw_all_scales);
+ cudaFreeHost(data_i_1ch);
+ cudaFreeHost(data_i_1ch_all_scales);
+ cudaFreeHost(data_i_features);
+ cudaFreeHost(data_i_features_all_scales);
cudaDeviceReset();
}
cudaStream_t streams[4];
cufftHandle plan_f, plan_f_all_scales, plan_fw, plan_fw_all_scales, plan_i_features,
plan_i_features_all_scales, plan_i_1ch, plan_i_1ch_all_scales;
- float *data_f, *data_f_all_scales, *data_fw, *data_fw_all_scales, *data_i_features, *data_i_features_all_scales, *data_i_1ch, *data_i_1ch_all_scales;
+ float *data_f, *data_f_all_scales, *data_fw, *data_fw_d, *data_fw_all_scales, *data_fw_all_scales_d, *data_i_features, *data_i_features_d,
+ *data_i_features_all_scales, *data_i_features_all_scales_d, *data_i_1ch, *data_i_1ch_d, *data_i_1ch_all_scales, *data_i_1ch_all_scales_d;
};
#endif // FFT_CUDA_H
KCF_Tracker::~KCF_Tracker()
{
delete &fft;
+#ifdef CUFFT
+ cudaFreeHost(xf_sqr_norm);
+ cudaFreeHost(yf_sqr_norm);
+#else
+ free(xf_sqr_norm);
+ free(yf_sqr_norm);
+#endif
}
void KCF_Tracker::init(cv::Mat &img, const cv::Rect & bbox)
p_scales.push_back(std::pow(p_scale_step, i));
else
p_scales.push_back(1.);
+
+#ifdef CUFFT
+ cudaSetDeviceFlags(cudaDeviceMapHost);
+ cudaHostAlloc((void**)&xf_sqr_norm, p_scales.size()*sizeof(float), cudaHostAllocMapped);
+ cudaHostGetDevicePointer((void**)&xf_sqr_norm_d, (void*)xf_sqr_norm, 0);
+
+ cudaHostAlloc((void**)&yf_sqr_norm, sizeof(float), cudaHostAllocMapped);
+ cudaHostGetDevicePointer((void**)&yf_sqr_norm_d, (void*)yf_sqr_norm, 0);
+#else
+ xf_sqr_norm = (float*) malloc(p_scales.size()*sizeof(float));
+    yf_sqr_norm = (float*) malloc(sizeof(float));
+#endif
p_current_scale = 1.;
ComplexMat KCF_Tracker::gaussian_correlation(const ComplexMat &xf, const ComplexMat &yf, double sigma, bool auto_correlation)
{
- float* xf_sqr_norm = xf.sqr_norm();
- float* yf_sqr_norm = auto_correlation ? xf_sqr_norm : yf.sqr_norm();
+#ifdef CUFFT
+ xf.sqr_norm(xf_sqr_norm_d);
+#else
+ xf.sqr_norm(xf_sqr_norm);
+#endif
+    if(auto_correlation){
+        // NOTE(review): this aliases the two owned buffers — the destructor
+        // frees both xf_sqr_norm and yf_sqr_norm, so after any
+        // auto-correlation call that free is a double-free (and the original
+        // yf_sqr_norm allocation leaks). Confirm ownership, or copy the
+        // value after the pending sqr_norm kernel has synchronized.
+        yf_sqr_norm = xf_sqr_norm;
+ } else {
+#ifdef CUFFT
+ yf.sqr_norm(yf_sqr_norm_d);
+#else
+ yf.sqr_norm(yf_sqr_norm);
+#endif
+ }
ComplexMat xyf;
xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
DEBUG_PRINTM(in_roi);
}
- free(xf_sqr_norm);
- if(!auto_correlation)free(yf_sqr_norm);
-
DEBUG_PRINTM(in_all);
return fft.forward(in_all);
}
//for big batch
int num_of_feats;
+ float *xf_sqr_norm, *yf_sqr_norm;
+#ifdef CUFFT
+ float *xf_sqr_norm_d, *yf_sqr_norm_d;
+#endif
//model
ComplexMat p_yf;