assert(p_data.num_elem == n_channels * rows * cols);
uint n_channels_per_scale = n_channels / n_scales;
- uint scale_offset = n_channels_per_scale * rows * cols;
ComplexMat_ result(this->rows, this->cols, 1, n_scales);
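+ // Round the block count up so the grid covers all `total` elements.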
const dim3 blocks((total + threads.x - 1) / threads.x);
for (uint scale = 0; scale < n_scales; ++scale) {
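+ // The result holds one channel per scale, while the input holds
+ // n_channels_per_scale channels per scale, so the two offsets must differ: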
- sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * scale_offset),
- reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
+ sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
+ reinterpret_cast<const float*>(p_data.deviceMem() + scale * n_channels_per_scale * rows * cols),
n_channels_per_scale, total);
}
CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
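// For reference, the launches above presuppose a reduction kernel of roughly
// the following shape. This is a hypothetical sketch, not the repository's
// kernel: the name and parameter order are taken from the call site, the body
// is assumed. Each thread owns one interleaved float of the single-channel
// output (`total` floats per channel, real and imaginary parts alternating)
// and sums it across the channels of its scale, which sit back-to-back in
// memory.
__global__ void sum_channels(float *dst, const float *src, uint channels, uint total)
{
    const uint idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= total)
        return;

    float acc = 0.0f;
    for (uint i = 0; i < channels; ++i)
        acc += src[i * total + idx]; // channel i's copy of element idx
    dst[idx] = acc;
}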
Fft::inverse(complex_input, real_result);
uint n_channels = complex_input.n_channels;
- cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_p_data());
+ cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_dev_data());
cufftReal *out = real_result.deviceMem();
float alpha = 1.0 / (m_width * m_height);
if (n_channels == 1)
cudaErrorCheck(cufftExecC2R(plan_i_1ch, in, out));
#ifdef BIG_BATCH
+ else
cudaErrorCheck(cufftExecC2R(plan_i_all_scales, in, out));
#endif
// TODO: Investigate whether this scaling is needed
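// cuFFT's C2R transform is unnormalized, so the alpha = 1 / (m_width * m_height)
// factor computed above is what would restore unit scaling after the inverse
// transform. A minimal sketch of applying it in place with cuBLAS, assuming a
// cublasHandle_t named cublas and an element-count accessor real_result.total(),
// with cublasErrorCheck as a wrapper analogous to cudaErrorCheck (all three are
// assumptions, not shown in this diff):
cublasErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));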