rtime.felk.cvut.cz Git - hercules2020/kcf.git/commitdiff
Fix CUDA bugs that manifest themselves in BIG_BATCH mode
author Michal Sojka <michal.sojka@cvut.cz>
Mon, 8 Oct 2018 12:04:47 +0000 (14:04 +0200)
committer Michal Sojka <michal.sojka@cvut.cz>
Mon, 8 Oct 2018 12:04:47 +0000 (14:04 +0200)
src/complexmat.cu
src/fft_cufft.cpp

index 15806732ec789b7abe5baafd8dac495fb2fb0246..1ebd883993581f583952ef333dc124bb068442e0 100644 (file)
@@ -115,7 +115,6 @@ ComplexMat_ ComplexMat_::sum_over_channels() const
     assert(p_data.num_elem == n_channels * rows * cols);
 
     uint n_channels_per_scale = n_channels / n_scales;
-    uint scale_offset = n_channels_per_scale * rows * cols;
 
     ComplexMat_ result(this->rows, this->cols, 1, n_scales);
 
@@ -124,8 +123,8 @@ ComplexMat_ ComplexMat_::sum_over_channels() const
     const dim3 blocks((total + threads.x - 1) / threads.x);
 
     for (uint scale = 0; scale < n_scales; ++scale) {
-        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * scale_offset),
-                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
+        sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
+                                          reinterpret_cast<const float*>(p_data.deviceMem() + scale * n_channels_per_scale * rows * cols),
                                           n_channels_per_scale, total);
     }
     CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
index e551eaa41726690a1bce8d42b1f415931e691ff6..963c8b7084ff42d3ed24971cb627d2181897bdf9 100644 (file)
@@ -100,13 +100,14 @@ void cuFFT::inverse(ComplexMat &complex_input, MatScales &real_result)
     Fft::inverse(complex_input, real_result);
 
     uint n_channels = complex_input.n_channels;
-    cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_p_data());
+    cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_dev_data());
     cufftReal *out = real_result.deviceMem();
     float alpha = 1.0 / (m_width * m_height);
 
     if (n_channels == 1)
         cudaErrorCheck(cufftExecC2R(plan_i_1ch, in, out));
 #ifdef BIG_BATCH
+    else
         cudaErrorCheck(cufftExecC2R(plan_i_all_scales, in, out));
 #endif
     // TODO: Investigate whether this scalling is needed or not