- same_num_channels_mul_kernel<<<blocks, threads, 0>>>((float*)this->p_data.deviceMem(),
- (float*)rhs.p_data.deviceMem(),
- (float*)result.p_data.deviceMem(),
- total);
- CudaCheckError();
+ for (uint s = 0; s < n_scales; ++s) {
+ same_num_channels_mul_kernel<<<blocks, threads, 0>>>((float*)(this->p_data.deviceMem() + s * total),
+ (float*)rhs.p_data.deviceMem(),
+ (float*)(result.p_data.deviceMem() + s * total),
+ total);
+ CudaCheckError();
+ }