assert(p_data.num_elem == n_channels * rows * cols);
uint n_channels_per_scale = n_channels / n_scales;
- uint scale_offset = n_channels_per_scale * rows * cols;
ComplexMat_ result(this->rows, this->cols, 1, n_scales);
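+ // Round the block count up so the grid covers all `total` elements.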
const dim3 blocks((total + threads.x - 1) / threads.x);
for (uint scale = 0; scale < n_scales; ++scale) {
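+ // The result holds one channel per scale, while the input holds
+ // n_channels_per_scale channels per scale, so the two offsets must differ: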
- sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * scale_offset),
- reinterpret_cast<const float*>(p_data.deviceMem() + scale * scale_offset),
+ sum_channels<<<blocks, threads>>>(reinterpret_cast<float*>(result.p_data.deviceMem() + scale * rows * cols),
+ reinterpret_cast<const float*>(p_data.deviceMem() + scale * n_channels_per_scale * rows * cols),
n_channels_per_scale, total);
}
CudaSafeCall(cudaStreamSynchronize(cudaStreamPerThread));
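// For reference, the launches above presuppose a reduction kernel of roughly
// the following shape. This is a hypothetical sketch, not the repository's
// kernel: the name and parameter order are taken from the call site, the body
// is assumed. Each thread owns one interleaved float of the single-channel
// output (`total` floats per channel, real and imaginary parts alternating)
// and sums it across the channels of its scale, which sit back-to-back in
// memory.
__global__ void sum_channels(float *dst, const float *src, uint channels, uint total)
{
    const uint idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= total)
        return;

    float acc = 0.0f;
    for (uint i = 0; i < channels; ++i)
        acc += src[i * total + idx]; // channel i's copy of element idx
    dst[idx] = acc;
}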
Fft::inverse(complex_input, real_result);
uint n_channels = complex_input.n_channels;
- cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_p_data());
+ cufftComplex *in = reinterpret_cast<cufftComplex *>(complex_input.get_dev_data());
cufftReal *out = real_result.deviceMem();
float alpha = 1.0 / (m_width * m_height);
if (n_channels == 1)
cudaErrorCheck(cufftExecC2R(plan_i_1ch, in, out));
#ifdef BIG_BATCH
+ else
cudaErrorCheck(cufftExecC2R(plan_i_all_scales, in, out));
#endif
// TODO: Investigate whether this scaling is needed
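// cuFFT's C2R transform is unnormalized, so the alpha = 1 / (m_width * m_height)
// factor computed above is what would restore unit scaling after the inverse
// transform. A minimal sketch of applying it in place with cuBLAS, assuming a
// cublasHandle_t named cublas and an element-count accessor real_result.total(),
// with cublasErrorCheck as a wrapper analogous to cudaErrorCheck (all three are
// assumptions, not shown in this diff):
cublasErrorCheck(cublasSscal(cublas, real_result.total(), &alpha, out, 1));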