}
}
-void ComplexMat::sqr_norm(float *result) const
+void ComplexMat::sqr_norm(DynMem &result) const
{
- CudaSafeCall(cudaMemsetAsync(result, 0, n_scales * sizeof(float), this->stream));
+ CudaSafeCall(cudaMemsetAsync(result.deviceMem(), 0, n_scales * sizeof(float), this->stream));
dim3 threadsPerBlock(rows, cols);
dim3 numBlocks(n_channels / n_scales, n_scales);
sqr_norm_kernel<<<numBlocks, threadsPerBlock, rows * cols * sizeof(float), this->stream>>>(
- n_channels / n_scales, result, this->p_data, rows, cols);
+ n_channels / n_scales, result.deviceMem(), this->p_data, rows, cols);
CudaCheckError();
return;
#include <vector>
#include <algorithm>
#include <functional>
+#include "dynmem.hpp"
template <typename T> class ComplexMat_ {
public:
return sum_sqr_norm;
}
- void sqr_norm(T *sums_sqr_norms) const
+ void sqr_norm(DynMem_<T> &result) const
{
// Computes, for every scale, the sum of squared magnitudes (re^2 + im^2) of
// all elements belonging to that scale, divided by the number of elements
// in one channel (cols * rows). One value per scale is written to the host
// side of `result` (result.hostMem()[scale]).
//
// This change replaces the raw output pointer with a DynMem_<T> reference;
// the function itself picks the host pointer via hostMem().
// NOTE(review): assumes `result` holds at least n_scales elements -- this is
// not checked here; confirm against the callers.

// Channels are split evenly across scales (integer division).
int n_channels_per_scale = n_channels / n_scales;
// Number of elements occupied by all channels of a single scale in p_data.
int scale_offset = n_channels_per_scale * rows * cols;
- T sum_sqr_norm;
for (uint scale = 0; scale < n_scales; ++scale) {
- sum_sqr_norm = 0;
// The accumulator is now declared per scale, so it starts from zero for
// each scale without a separate reset statement.
+ T sum_sqr_norm = 0;
// Walk every channel i of this scale; each channel covers rows * cols
// consecutive elements starting at i * rows * cols + scale * scale_offset.
for (int i = 0; i < n_channels_per_scale; ++i)
for (auto lhs = p_data.begin() + i * rows * cols + scale * scale_offset;
lhs != p_data.begin() + (i + 1) * rows * cols + scale * scale_offset; ++lhs)
sum_sqr_norm += lhs->real() * lhs->real() + lhs->imag() * lhs->imag();
- sums_sqr_norms[scale] = sum_sqr_norm / static_cast<T>(cols * rows);
// Normalize by the element count of one channel and store the per-scale result.
+ result.hostMem()[scale] = sum_sqr_norm / static_cast<T>(cols * rows);
}
return;
}
void KCF_Tracker::gaussian_correlation(struct ThreadCtx &vars, const ComplexMat &xf, const ComplexMat &yf,
double sigma, bool auto_correlation)
{
-#ifdef CUFFT
- xf.sqr_norm(vars.xf_sqr_norm.deviceMem());
- if (!auto_correlation) yf.sqr_norm(vars.yf_sqr_norm.deviceMem());
-#else
- xf.sqr_norm(vars.xf_sqr_norm.hostMem());
+ xf.sqr_norm(vars.xf_sqr_norm);
if (auto_correlation) {
vars.yf_sqr_norm.hostMem()[0] = vars.xf_sqr_norm.hostMem()[0];
} else {
- yf.sqr_norm(vars.yf_sqr_norm.hostMem());
+ yf.sqr_norm(vars.yf_sqr_norm);
}
-#endif
vars.xyf = auto_correlation ? xf.sqr_mag() : xf.mul2(yf.conj());
DEBUG_PRINTM(vars.xyf);
fft.inverse(vars.xyf, vars.ifft2_res, m_use_cuda ? vars.data_i_features.deviceMem() : nullptr, vars.stream);
#ifdef CUFFT
- if (auto_correlation)
- cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.xf_sqr_norm.deviceMem(),
- sigma, xf.n_channels, xf.n_scales, p_roi.height, p_roi.width, vars.stream);
- else
- cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(), vars.xf_sqr_norm.deviceMem(), vars.yf_sqr_norm.deviceMem(),
- sigma, xf.n_channels, xf.n_scales, p_roi.height, p_roi.width, vars.stream);
+ cuda_gaussian_correlation(vars.data_i_features.deviceMem(), vars.gauss_corr_res.deviceMem(),
+ vars.xf_sqr_norm.deviceMem(), vars.xf_sqr_norm.deviceMem(), sigma, xf.n_channels,
+ xf.n_scales, p_roi.height, p_roi.width, vars.stream);
#else
// ifft2 and sum over 3rd dimension, we dont care about individual channels
DEBUG_PRINTM(vars.ifft2_res);