add_definitions(-DFFTW -DCUFFTW)
set(use_cuda ON)
ELSEIF(FFT STREQUAL "cuFFT")
- list(APPEND KCF_LIB_SRC fft_cufft.cpp complexmat.cuh)
+ list(APPEND KCF_LIB_SRC fft_cufft.cpp complexmat.cuh cuda_functions.cuh)
add_definitions(-DCUFFT)
set(use_cuda ON)
iF(CUDA_DEBUG)
IF(FFT STREQUAL "cuFFT")
add_subdirectory(cuda)
cuda_add_library(complexmat complexmat.cu)
+ cuda_add_library(cuda_func cuda_functions.cu)
ENDIF()
ENDIF()
ENDIF() #cuFFTW
IF(FFT STREQUAL "cuFFT")
- target_link_libraries(kcf ${CUDA_cufft_LIBRARY} ${CUDA_LIBRARIES} complexmat)
+ target_link_libraries(kcf ${CUDA_cufft_LIBRARY} ${CUDA_LIBRARIES} complexmat cuda_func)
ENDIF()
IF(PROFILING)
//text output
// Debug stream-print of a GPU-resident ComplexMat. p_data lives in device
// memory here, so it is staged into a host buffer before printing; values are
// stored as interleaved (re, im) float pairs, 2*cols floats per row.
friend std::ostream & operator<<(std::ostream & os, const ComplexMat & mat)
{
+ // Host staging copy of the whole device buffer (freed at the end).
+ float *data_cpu = (float*) malloc(mat.rows*mat.cols*mat.n_channels*sizeof(cufftComplex));
+ CudaSafeCall(cudaMemcpy(data_cpu, mat.p_data, mat.rows*mat.cols*mat.n_channels*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
//for (int i = 0; i < mat.n_channels; ++i){
// NOTE(review): only channel 0 is printed; the all-channels loop above is
// deliberately commented out to keep debug output short.
for (int i = 0; i < 1; ++i){
os << "Channel " << i << std::endl;
for (int j = 0; j < mat.rows; ++j) {
// Print cols-1 complex values followed by the last one without a
// trailing comma; k steps by 2 over the interleaved floats.
for (int k = 0; k < 2*mat.cols-2; k+=2)
- os << "(" << mat.p_data[j*2*mat.cols + k] << "," << mat.p_data[j*2*mat.cols + (k+1)] << ")" << ", ";
- os << "(" << mat.p_data[j*2*mat.cols + 2*mat.cols-2] << "," << mat.p_data[j*2*mat.cols + 2*mat.cols-1] << ")" << std::endl;
+ os << "(" << data_cpu[j*2*mat.cols + k] << "," << data_cpu[j*2*mat.cols + (k+1)] << ")" << ", ";
+ os << "(" << data_cpu[j*2*mat.cols + 2*mat.cols-2] << "," << data_cpu[j*2*mat.cols + 2*mat.cols-1] << ")" << std::endl;
}
}
+ free(data_cpu);
return os;
}
#ifndef CUDA_ERROR_CHECK_H
#define CUDA_ERROR_CHECK_H
+#include <iostream>
+
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
return "<unknown>";
}
-#endif
#define CufftErrorCheck(call) __cufftErrorCheck(call, __FILE__, __LINE__ )
return;
}
+#endif
#endif
--- /dev/null
+#include "cuda_functions.cuh"
+
+// Gaussian-correlation response kernel for the KCF tracker.
+// Launch layout (see cuda_gaussian_correlation): gridDim.x = n_scales,
+// gridDim.y = rows*cols (one block per (scale, pixel) pair),
+// blockDim.x = channels_per_scale/2, with blockDim.x floats of dynamic
+// shared memory. Each block sums the per-channel ifft products for its
+// pixel, then thread 0 writes exp(-(|xf|^2 + |yf|^2 - 2*sum)/(sigma^2*N)).
+// NOTE(review): the paired load below assumes channels_per_scale is even —
+// confirm with callers.
+__global__ void gaussian_correlation_kernel(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, int rows, int cols, int channels_per_scale, double sigma)
+{
+ extern __shared__ float sdata[];
+ int blockId = blockIdx.y * gridDim.x + blockIdx.x;
+ int threadId = blockId *( blockDim.x+channels_per_scale/2) + threadIdx.x;
+
+ // First reduction step is folded into the load: each thread sums a pair of
+ // channel values, so shared memory only needs channels_per_scale/2 slots.
+ sdata[threadIdx.x] = 0;
+ sdata[threadIdx.x] = data_in[threadId] + data_in[threadId+blockDim.x];
+ __syncthreads();
+
+ // Shared-memory tree reduction. Odd element counts are handled by
+ // rounding s up (s+=1) so the trailing element is not dropped; the
+ // `threadIdx.x+s < old_s` guard prevents reading past the live range.
+ for (unsigned int s= (channels_per_scale/2+1)/2, old_s = channels_per_scale/2;s>0; s>>=1) {
+
+ if(old_s&1) s+=1;
+
+ if (threadIdx.x < s && threadIdx.x+s < old_s) {
+ sdata[threadIdx.x] += sdata[threadIdx.x + s];
+ }
+ old_s = s;
+ __syncthreads();
+ }
+
+ if(threadIdx.x == 0){
+ // Spatial-average of the accumulated cross-correlation for this pixel.
+ float accumulate_res = sdata[0]/(rows*cols);
+
+ // 1 / element count of the half-spectrum (R2C) frequency representation.
+ float numel_xf_inv = 1.f/((cols/2+1) * rows * (channels_per_scale));
+
+ float tmp = (xf_sqr_norm[blockIdx.x] + yf_sqr_norm[0] - 2 * accumulate_res) * numel_xf_inv;
+
+ // Clamp the argument at 0 before exponentiating (expf(0) == 1),
+ // mirroring cv::max(..., 0) in the CPU path.
+ if (tmp > 0) {
+ data_out[blockIdx.x*rows*cols+blockIdx.y] = expf(- 1.f / (sigma * sigma) * tmp);
+ } else {
+ data_out[blockIdx.x*rows*cols+blockIdx.y] = expf(0);
+ }
+ }
+}
+
+// Host-side launcher for gaussian_correlation_kernel.
+// Grid: (n_scales, rows*cols) blocks; block: (n_channels/n_scales)/2 threads;
+// dynamic shared memory sized to one float per thread. All pointer arguments
+// must be device-accessible (xf/yf_sqr_norm may be mapped host memory —
+// presumably allocated with cudaHostAllocMapped; verify against caller).
+// Kernel-launch errors are surfaced through CudaCheckError().
+void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales,int rows, int cols)
+{
+ dim3 threadsPerBlock((n_channels/n_scales)/2);
+ dim3 numBlocks(n_scales, rows*cols);
+
+ gaussian_correlation_kernel<<<numBlocks, threadsPerBlock, ((n_channels/n_scales)/2)*sizeof(float)>>>(data_in, data_out, xf_sqr_norm, yf_sqr_norm, rows, cols, n_channels/n_scales, sigma);
+ CudaCheckError();
+
+// Debug dump of the device result, kept for reference:
+// float *data_cpu = (float*) malloc(rows*cols*n_scales*sizeof(float));
+// CudaSafeCall(cudaMemcpy(data_cpu, data_out, rows*cols*n_scales*sizeof(float), cudaMemcpyDeviceToHost));
+// for (int j = 0; j < rows*n_scales; ++j) {
+// for (int k = 0; k < cols-1; ++k)
+// std::cout << data_cpu[j*cols + k] << ", ";
+// std::cout << data_cpu[j*cols + cols-1] << std::endl;
+// }
+// free(data_cpu);
+ return;
+}
--- /dev/null
+#ifndef CUDA_FUNCTIONS_H
+#define CUDA_FUNCTIONS_H
+
+#include "cuda_runtime.h"
+#include "cuda/cuda_error_check.cuh"
+
+// Computes the KCF Gaussian-correlation response entirely on the GPU:
+// reduces the per-channel ifft products in data_in and writes one response
+// value per (scale, pixel) into data_out. All pointers are device pointers
+// (the *_sqr_norm buffers may be zero-copy mapped host memory).
+// Implemented in cuda_functions.cu.
+void cuda_gaussian_correlation(float *data_in, float *data_out, float *xf_sqr_norm, float *yf_sqr_norm, double sigma, int n_channels, int n_scales, int rows, int cols);
+
+#endif
virtual void init(unsigned width, unsigned height,unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) = 0;
virtual void set_window(const cv::Mat &window) = 0;
virtual ComplexMat forward(const cv::Mat &input) = 0;
- virtual ComplexMat forward_raw(float *input) = 0;
+ virtual ComplexMat forward_raw(float *input, bool all_scales) = 0;
virtual ComplexMat forward_window(const std::vector<cv::Mat> &input) = 0;
virtual cv::Mat inverse(const ComplexMat &input) = 0;
virtual float* inverse_raw(const ComplexMat &input) = 0;
return complex_result;
}
-ComplexMat cuFFT::forward_raw(float *input)
+// Forward real-to-complex FFT of a raw device buffer (no OpenCV wrapping).
+// all_scales selects the batched plan covering m_num_of_scales channels;
+// otherwise the single-channel plan is used. The result holds the Hermitian
+// half-spectrum, i.e. m_width/2 + 1 columns per row.
+ComplexMat cuFFT::forward_raw(float *input, bool all_scales)
{
- ComplexMat dummy;
- return dummy;
+ ComplexMat complex_result;
+ if (all_scales){
+ complex_result.create(m_height, m_width / 2 + 1, m_num_of_scales);
+ CufftErrorCheck(cufftExecR2C(plan_f_all_scales, reinterpret_cast<cufftReal*>(input),
+ complex_result.get_p_data()));
+ } else {
+ complex_result.create(m_height, m_width/ 2 + 1, 1);
+ CufftErrorCheck(cufftExecR2C(plan_f, reinterpret_cast<cufftReal*>(input),
+ complex_result.get_p_data()));
+ }
+ return complex_result;
}
ComplexMat cuFFT::forward_window(const std::vector<cv::Mat> &input)
// Inverse complex-to-real FFT returning a raw device pointer.
// Dispatches on the input's channel count to the matching pre-created cufft
// plan and its pre-allocated output buffer:
//   1 channel                      -> plan_i_1ch
//   m_num_of_scales channels       -> plan_i_1ch_all_scales
//   feats * scales channels        -> plan_i_features_all_scales
//   anything else (per-feature)    -> plan_i_features
// NOTE(review): the returned buffer is owned by cuFFT and reused on the next
// call — callers must consume it before invoking inverse_raw again.
float* cuFFT::inverse_raw(const ComplexMat &input)
{
+ int n_channels = input.n_channels;
cufftComplex *in = reinterpret_cast<cufftComplex*>(input.get_p_data());
-
- CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
- return data_i_features_all_scales;
+ if(n_channels == 1){
+ CufftErrorCheck(cufftExecC2R(plan_i_1ch, in, reinterpret_cast<cufftReal*>(data_i_1ch_d)));
+
+ return data_i_1ch_d;
+ } else if(n_channels == (int) m_num_of_scales){
+ CufftErrorCheck(cufftExecC2R(plan_i_1ch_all_scales, in, reinterpret_cast<cufftReal*>(data_i_1ch_all_scales_d)));
+
+ return data_i_1ch_all_scales_d;
+ } else if(n_channels == (int) m_num_of_feats * (int) m_num_of_scales){
+ CufftErrorCheck(cufftExecC2R(plan_i_features_all_scales, in, reinterpret_cast<cufftReal*>(data_i_features_all_scales_d)));
+
+ return data_i_features_all_scales_d;
+ }
+
+ CufftErrorCheck(cufftExecC2R(plan_i_features, in, reinterpret_cast<cufftReal*>(data_i_features_d)));
+
+ return data_i_features_d;
}
cuFFT::~cuFFT()
void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
void set_window(const cv::Mat &window) override;
ComplexMat forward(const cv::Mat &input) override;
- ComplexMat forward_raw(float *input) override;
+ ComplexMat forward_raw(float *input, bool all_scales) override;
ComplexMat forward_window(const std::vector<cv::Mat> &input) override;
cv::Mat inverse(const ComplexMat &input) override;
float* inverse_raw(const ComplexMat &input) override;
return complex_result;
}
-ComplexMat Ffftw::forward_raw(float *input)
+ComplexMat Fftw::forward_raw(float *input, bool all_scales)
{
ComplexMat dummy;
return dummy;
void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
void set_window(const cv::Mat &window) override;
ComplexMat forward(const cv::Mat &input) override;
- ComplexMat forward_raw(float *input) override;
+ ComplexMat forward_raw(float *input, bool all_scales) override;
ComplexMat forward_window(const std::vector<cv::Mat> &input) override;
cv::Mat inverse(const ComplexMat &input) override;
float* inverse_raw(const ComplexMat &input) override;
return ComplexMat(complex_result);
}
-ComplexMat FftOpencv::forward_raw(float *input)
+ComplexMat FftOpencv::forward_raw(float *input, bool all_scales)
{
ComplexMat dummy;
return dummy;
void init(unsigned width, unsigned height, unsigned num_of_feats, unsigned num_of_scales, bool big_batch_mode) override;
void set_window(const cv::Mat &window) override;
ComplexMat forward(const cv::Mat &input) override;
- ComplexMat forward_raw(float *input) override;
+ ComplexMat forward_raw(float *input, bool all_scales) override;
ComplexMat forward_window(const std::vector<cv::Mat> &input) override;
cv::Mat inverse(const ComplexMat &input) override;
float* inverse_raw(const ComplexMat &input) override;
#ifdef CUFFT
CudaSafeCall(cudaFreeHost(xf_sqr_norm));
CudaSafeCall(cudaFreeHost(yf_sqr_norm));
+ CudaSafeCall(cudaFree(gauss_corr_res));
#else
free(xf_sqr_norm);
free(yf_sqr_norm);
cudaSetDeviceFlags(cudaDeviceMapHost);
CudaSafeCall(cudaHostAlloc((void**)&xf_sqr_norm, p_num_scales*sizeof(float), cudaHostAllocMapped));
CudaSafeCall(cudaHostGetDevicePointer((void**)&xf_sqr_norm_d, (void*)xf_sqr_norm, 0));
- std::cout << &xf_sqr_norm << std::endl;
+
CudaSafeCall(cudaHostAlloc((void**)&yf_sqr_norm, sizeof(float), cudaHostAllocMapped));
CudaSafeCall(cudaHostGetDevicePointer((void**)&yf_sqr_norm_d, (void*)yf_sqr_norm, 0));
#else
xf_sqr_norm = (float*) malloc(p_num_scales*sizeof(float));
- xf_sqr_norm = (float*) malloc(sizeof(float));
+ yf_sqr_norm = (float*) malloc(sizeof(float));
#endif
p_current_scale = 1.;
p_num_of_feats = 31;
if(m_use_color) p_num_of_feats += 3;
if(m_use_cnfeat) p_num_of_feats += 10;
- p_poi_width = p_windows_size[0]/p_cell_size;
- p_poi_height = p_windows_size[1]/p_cell_size;
+ p_roi_width = p_windows_size[0]/p_cell_size;
+ p_roi_height = p_windows_size[1]/p_cell_size;
fft.init(p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size, p_num_of_feats, p_num_scales, m_use_big_batch);
p_yf = fft.forward(gaussian_shaped_labels(p_output_sigma, p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size));
fft.set_window(cosine_window_function(p_windows_size[0]/p_cell_size, p_windows_size[1]/p_cell_size));
+#ifdef CUFFT
+ CudaSafeCall(cudaMalloc((void**)&gauss_corr_res, (p_windows_size[0]/p_cell_size)*(p_windows_size[1]/p_cell_size)*p_num_scales*sizeof(float)));
+#endif
//obtain a sub-window for training initial model
std::vector<cv::Mat> path_feat = get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size[0], p_windows_size[1]);
p_model_xf = fft.forward_window(path_feat);
}
scale_responses.push_back(max_val*weight);
}
- } else if(m_use_big_batch){
+ } else if (m_use_big_batch){
#pragma omp parallel for ordered
for (size_t i = 0; i < p_scales.size(); ++i) {
std::vector<cv::Mat> tmp = get_features(input_rgb, input_gray, p_pose.cx, p_pose.cy, p_windows_size[0], p_windows_size[1], p_current_scale * p_scales[i]);
{
#ifdef CUFFT
xf.sqr_norm(xf_sqr_norm_d);
+ if (auto_correlation){
+ cudaDeviceSynchronize();
+ yf_sqr_norm[0] = xf_sqr_norm[0];
+ } else {
+ yf.sqr_norm(yf_sqr_norm_d);
+ }
#else
xf.sqr_norm(xf_sqr_norm);
-#endif
- if(auto_correlation){
+ if (auto_correlation){
yf_sqr_norm[0] = xf_sqr_norm[0];
} else {
-#ifdef CUFFT
- yf.sqr_norm(yf_sqr_norm_d);
-#else
yf.sqr_norm(yf_sqr_norm);
-#endif
}
-
+#endif
ComplexMat xyf;
xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
DEBUG_PRINTM(xyf);
+#ifdef CUFFT
+ cuda_gaussian_correlation(fft.inverse_raw(xyf), gauss_corr_res, xf_sqr_norm_d, yf_sqr_norm_d, sigma, xf.n_channels, xf.n_scales, p_roi_height, p_roi_width);
+ return fft.forward_raw(gauss_corr_res, xf.n_scales==p_num_scales);
+#else
//ifft2 and sum over 3rd dimension, we dont care about individual channels
cv::Mat ifft2_res = fft.inverse(xyf);
+ DEBUG_PRINTM(ifft2_res);
cv::Mat xy_sum;
- if(xf.channels() != p_num_scales*p_num_of_feats)
+ if (xf.channels() != p_num_scales*p_num_of_feats)
xy_sum.create(ifft2_res.size(), CV_32FC1);
else
xy_sum.create(ifft2_res.size(), CV_32FC(p_scales.size()));
float * row_ptr_sum = xy_sum.ptr<float>(y);
for (int x = 0; x < ifft2_res.cols; ++x) {
for (int sum_ch = 0; sum_ch < xy_sum.channels(); ++sum_ch) {
- row_ptr_sum[(x*xy_sum.channels())+sum_ch] += std::accumulate(row_ptr + x*ifft2_res.channels() + sum_ch*(ifft2_res.channels()/xy_sum.channels()),
- (row_ptr + x*ifft2_res.channels() + (sum_ch+1)*(ifft2_res.channels()/xy_sum.channels())), 0.f);
+ row_ptr_sum[(x*xy_sum.channels())+sum_ch] += std::accumulate(row_ptr + x*ifft2_res.channels() + sum_ch*(ifft2_res.channels()/xy_sum.channels()), (row_ptr + x*ifft2_res.channels() + (sum_ch+1)*(ifft2_res.channels()/xy_sum.channels())), 0.f);
}
}
}
- DEBUG_PRINTM(ifft2_res);
DEBUG_PRINTM(xy_sum);
std::vector<cv::Mat> scales;
cv::Mat in_all(scales[0].rows * xf.n_scales, scales[0].cols, CV_32F);
float numel_xf_inv = 1.f/(xf.cols * xf.rows * (xf.channels()/xf.n_scales));
- for(int i = 0; i < xf.n_scales; ++i){
+ for (int i = 0; i < xf.n_scales; ++i){
cv::Mat in_roi(in_all, cv::Rect(0, i*scales[0].rows, scales[0].cols, scales[0].rows));
cv::exp(- 1.f / (sigma * sigma) * cv::max((xf_sqr_norm[i] + yf_sqr_norm[0] - 2 * scales[i]) * numel_xf_inv, 0), in_roi);
DEBUG_PRINTM(in_roi);
DEBUG_PRINTM(in_all);
return fft.forward(in_all);
+#endif
}
float get_response_circular(cv::Point2i & pt, cv::Mat & response)
#ifdef CUFFT
#include "complexmat.cuh"
+ #include "cuda_functions.cuh"
#include "cuda/cuda_error_check.cuh"
#include <cuda_runtime.h>
#else
//for big batch
int p_num_of_feats;
- int p_poi_height, p_poi_width;
+ int p_roi_height, p_roi_width;
float *xf_sqr_norm = nullptr, *yf_sqr_norm = nullptr;
#ifdef CUFFT
- float *xf_sqr_norm_d = nullptr, *yf_sqr_norm_d = nullptr;
+ float *xf_sqr_norm_d = nullptr, *yf_sqr_norm_d = nullptr, *gauss_corr_res = nullptr;
#endif
//model