blender/intern/cycles/device/cuda/queue.cpp
Commit df00463764 by Brecht Van Lommel
Cycles: add shadow path compaction for GPU rendering
Similar to main path compaction that happens before adding work tiles, this
compacts shadow paths before launching kernels that may add shadow paths.

Only do it when more than 50% of space is wasted.

It's not a clear win in all scenes; some are up to 1.5% slower, likely because
the different kernel scheduling order has an unpredictable performance impact.
Still, compaction seems like the right thing to do, as it avoids cases where a
few shadow paths can hold up a lot of main paths.

Differential Revision: https://developer.blender.org/D12944
2021-10-21 15:38:03 +02:00
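
A minimal sketch of the threshold check described above, using hypothetical names
(num_active_paths for the number of live shadow paths, next_path_index for the
highest state slot handed out so far) rather than the actual Cycles call sites:
compaction only pays off once more than half of the occupied slots are wasted.

/* Hypothetical sketch: compact shadow path state only when more than 50% of the
 * occupied range is wasted, i.e. the highest used slot index is more than twice
 * the number of active shadow paths. */
static bool should_compact_shadow_paths(const int num_active_paths, const int next_path_index)
{
  return num_active_paths > 0 && next_path_index > 2 * num_active_paths;
}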


/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_CUDA

#  include "device/cuda/queue.h"

#  include "device/cuda/device_impl.h"
#  include "device/cuda/graphics_interop.h"
#  include "device/cuda/kernel.h"

CCL_NAMESPACE_BEGIN

/* CUDADeviceQueue */

CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
    : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
{
  const CUDAContextScope scope(cuda_device_);
  cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
}

CUDADeviceQueue::~CUDADeviceQueue()
{
  const CUDAContextScope scope(cuda_device_);
  cuStreamDestroy(cuda_stream_);
}

int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
{
  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                              cuda_device_->get_max_num_threads_per_multiprocessor();
  int num_states = max(max_num_threads, 65536) * 16;

  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
  if (factor_str) {
    const float factor = (float)atof(factor_str);
    if (factor != 0.0f) {
      num_states = max((int)(num_states * factor), 1024);
    }
    else {
      VLOG(3) << "CYCLES_CONCURRENT_STATES_FACTOR evaluated to 0";
    }
  }

  VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
          << string_human_readable_size(num_states * state_size);

  return num_states;
}
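
/* Example with illustrative numbers (not taken from this file): on a GPU with
 * 68 multiprocessors and 2048 resident threads per multiprocessor,
 * max_num_threads = 139264, so num_states = 139264 * 16 = 2228224. With a
 * hypothetical 512-byte path state that is roughly 1.1 GB of state memory;
 * setting CYCLES_CONCURRENT_STATES_FACTOR=0.5 would halve both figures. */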

int CUDADeviceQueue::num_concurrent_busy_states() const
{
  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                              cuda_device_->get_max_num_threads_per_multiprocessor();

  if (max_num_threads == 0) {
    return 65536;
  }

  return 4 * max_num_threads;
}

void CUDADeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  CUDAContextScope scope(cuda_device_);
  cuda_device_->load_texture_info();
  cuda_device_assert(cuda_device_, cuCtxSynchronize());

  debug_init_execution();
}

bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
{
  return cuda_device_->kernels.available(kernel);
}

bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
{
  if (cuda_device_->have_error()) {
    return false;
  }

  debug_enqueue(kernel, work_size);

  const CUDAContextScope scope(cuda_device_);
  const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);

  /* Compute kernel launch parameters. */
  const int num_threads_per_block = cuda_kernel.num_threads_per_block;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int shared_mem_bytes = 0;

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
      /* See parallel_active_index.h for why this amount of shared memory is needed. */
      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
      break;
    default:
      break;
  }

  /* Launch kernel. */
  assert_success(cuLaunchKernel(cuda_kernel.function,
                                num_blocks,
                                1,
                                1,
                                num_threads_per_block,
                                1,
                                1,
                                shared_mem_bytes,
                                cuda_stream_,
                                args,
                                0),
                 "enqueue");

  return !(cuda_device_->have_error());
}
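
/* Usage sketch (hypothetical argument names): cuLaunchKernel takes an array of
 * pointers to the kernel arguments, so a caller passes the addresses of its
 * local variables, for example:
 *
 *   void *args[] = {&d_path_index, &d_render_buffer, &work_size};
 *   queue->enqueue(DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, work_size, args);
 */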

bool CUDADeviceQueue::synchronize()
{
  if (cuda_device_->have_error()) {
    return false;
  }

  const CUDAContextScope scope(cuda_device_);
  assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");
  debug_synchronize();

  return !(cuda_device_->have_error());
}

void CUDADeviceQueue::zero_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    cuda_device_->mem_alloc(mem);
  }

  /* Zero memory on device. */
  assert(mem.device_pointer != 0);

  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
      "zero_to_device");
}

void CUDADeviceQueue::copy_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    cuda_device_->mem_alloc(mem);
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory to device. */
  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemcpyHtoDAsync(
          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
      "copy_to_device");
}

void CUDADeviceQueue::copy_from_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory from device. */
  const CUDAContextScope scope(cuda_device_);
  assert_success(
      cuMemcpyDtoHAsync(
          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
      "copy_from_device");
}
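
/* Usage sketch (hypothetical buffers): the memory operations above are all
 * asynchronous on cuda_stream_, so host-visible results are only guaranteed
 * after synchronize():
 *
 *   queue->copy_to_device(input);             // async host-to-device copy
 *   queue->enqueue(kernel, work_size, args);  // kernel launch on the stream
 *   queue->copy_from_device(output);          // async device-to-host copy
 *   queue->synchronize();                     // wait for the stream to finish
 */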

void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
{
  if (result != CUDA_SUCCESS) {
    const char *name = cuewErrorString(result);
    cuda_device_->set_error(string_printf(
        "%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
  }
}

unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
{
  return make_unique<CUDADeviceGraphicsInterop>(this);
}

CCL_NAMESPACE_END

#endif /* WITH_CUDA */