From: Michal Sojka Date: Thu, 4 Oct 2018 10:46:43 +0000 (+0200) Subject: Fix and simplify CUDA version of ComplexMat::sqr_norm X-Git-Url: https://rtime.felk.cvut.cz/gitweb/hercules2020/kcf.git/commitdiff_plain/7d93420cda54a987471b8358d5833766cc6bb9c8 Fix and simplify CUDA version of ComplexMat::sqr_norm --- diff --git a/src/complexmat.cu b/src/complexmat.cu index 13bd704..b5846fc 100644 --- a/src/complexmat.cu +++ b/src/complexmat.cu @@ -1,44 +1,46 @@ #include "complexmat.hpp" -__global__ void sqr_norm_kernel(int n, float *out, const float *data, float rows, float cols) + +__global__ void sqr_norm_kernel(const float *in, float *block_res, int total) { extern __shared__ float sdata[]; - int i = blockDim.x * threadIdx.y + threadIdx.x; - int blockId = blockIdx.x + blockIdx.y * gridDim.x; - int threadId = 2 * (blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x); - - sdata[i] = 0; - sdata[i] = data[threadId] * data[threadId] + data[threadId + 1] * data[threadId + 1]; - __syncthreads(); + int in_idx = 2 * (blockIdx.x * blockDim.x + threadIdx.x); + int i = threadIdx.x; - for (unsigned int s = (blockDim.x * blockDim.y + 1) / 2, old_s = blockDim.x * blockDim.y; s > 0; s >>= 1) { + if (in_idx >= total * 2) + sdata[i] = 0; + else + sdata[i] = in[in_idx] * in[in_idx] + in[in_idx + 1] * in[in_idx + 1]; - if (old_s & 1) s += 1; - - if (i < s && i + s < old_s) { - sdata[i] += sdata[i + s]; - } - old_s = s; + for (unsigned s = (blockDim.x + 1) / 2; s > 0; s >>= 1) { __syncthreads(); + if (i < s) + sdata[i] += sdata[i + s]; } - if (i == 0) { - atomicAdd(&out[blockId / n], sdata[0] / (rows * cols)); - } + if (i == 0) + block_res[blockIdx.x] = sdata[0]; } -void ComplexMat::sqr_norm(DynMem &result) const +void ComplexMat_::sqr_norm(DynMem &result) const { - CudaSafeCall(cudaMemsetAsync(result.deviceMem(), 0, n_scales * sizeof(float))); + assert(n_scales == 1); - dim3 threadsPerBlock(rows, cols); - dim3 numBlocks(n_channels / n_scales, n_scales); + const uint total = n_channels * rows * cols; + const dim3 threads(1024); + const dim3 blocks((total + threads.x - 1) / threads.x); + + DynMem block_res(blocks.x); - sqr_norm_kernel<<>>( - n_channels / n_scales, result.deviceMem(), (float*)this->p_data.deviceMem(), rows, cols); + sqr_norm_kernel<<>>((const float*)p_data.deviceMem(), + block_res.deviceMem(), total); CudaCheckError(); + CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread)); - return; + T res = 0; + for (int i = 0; i < blocks.x; i++) + res += block_res[i]; + result.hostMem()[0] = res / static_cast(cols * rows); } __global__ void sqr_mag_kernel(const float *data, float *result) diff --git a/src/kcf.cpp b/src/kcf.cpp index 60db47b..d6e1c30 100644 --- a/src/kcf.cpp +++ b/src/kcf.cpp @@ -680,11 +680,13 @@ void KCF_Tracker::GaussianCorrelation::operator()(ComplexMat &result, const Comp { TRACE(""); xf.sqr_norm(xf_sqr_norm); + DEBUG_PRINTM(xf_sqr_norm[0]); if (auto_correlation) { yf_sqr_norm = xf_sqr_norm; } else { yf.sqr_norm(yf_sqr_norm); } + DEBUG_PRINTM(yf_sqr_norm[0]); xyf = auto_correlation ? xf.sqr_mag() : xf * yf.conj(); // xf.muln(yf.conj()); DEBUG_PRINTM(xyf);