blender/intern/cycles/device/hip/queue.cpp

/*
 * Copyright 2011-2021 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_HIP

#  include "device/hip/queue.h"

#  include "device/hip/device_impl.h"
#  include "device/hip/graphics_interop.h"
#  include "device/hip/kernel.h"

CCL_NAMESPACE_BEGIN

/* HIPDeviceQueue */

HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device)
    : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr)
{
  const HIPContextScope scope(hip_device_);
  hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking));
}

HIPDeviceQueue::~HIPDeviceQueue()
{
  const HIPContextScope scope(hip_device_);
  hipStreamDestroy(hip_stream_);
}

int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
{
  /* TODO: compute automatically. */
  /* TODO: must have at least num_threads_per_block. */
  return 14416128;
}

int HIPDeviceQueue::num_concurrent_busy_states() const
{
  const int max_num_threads = hip_device_->get_num_multiprocessors() *
                              hip_device_->get_max_num_threads_per_multiprocessor();

  if (max_num_threads == 0) {
    return 65536;
  }

  return 4 * max_num_threads;
}

void HIPDeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  HIPContextScope scope(hip_device_);
  hip_device_->load_texture_info();
  hip_device_assert(hip_device_, hipDeviceSynchronize());

  debug_init_execution();
}

bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const
{
  return hip_device_->kernels.available(kernel);
}

bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
{
  if (hip_device_->have_error()) {
    return false;
  }

  debug_enqueue(kernel, work_size);

  const HIPContextScope scope(hip_device_);
  const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);

  /* Compute kernel launch parameters. */
  const int num_threads_per_block = hip_kernel.num_threads_per_block;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int shared_mem_bytes = 0;

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
      /* See parall_active_index.h for why this amount of shared memory is needed. */
      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
      break;
    default:
      break;
  }

  /* Launch kernel. */
  hip_device_assert(hip_device_,
                    hipModuleLaunchKernel(hip_kernel.function,
                                          num_blocks,
                                          1,
                                          1,
                                          num_threads_per_block,
                                          1,
                                          1,
                                          shared_mem_bytes,
                                          hip_stream_,
                                          args,
                                          0));
  return !(hip_device_->have_error());
}

bool HIPDeviceQueue::synchronize()
{
  if (hip_device_->have_error()) {
    return false;
  }

  const HIPContextScope scope(hip_device_);
  hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
  debug_synchronize();

  return !(hip_device_->have_error());
}

void HIPDeviceQueue::zero_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    hip_device_->mem_alloc(mem);
  }

  /* Zero memory on device. */
  assert(mem.device_pointer != 0);

  const HIPContextScope scope(hip_device_);
  hip_device_assert(
      hip_device_,
      hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
}

void HIPDeviceQueue::copy_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    hip_device_->mem_alloc(mem);
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory to device. */
  const HIPContextScope scope(hip_device_);
  hip_device_assert(
      hip_device_,
      hipMemcpyHtoDAsync(
          (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
}

void HIPDeviceQueue::copy_from_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory from device. */
  const HIPContextScope scope(hip_device_);
  hip_device_assert(
      hip_device_,
      hipMemcpyDtoHAsync(
          mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
}

// TODO : (Arya) Enable this after stabilizing dev branch
unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
{
  return make_unique<HIPDeviceGraphicsInterop>(this);
}

CCL_NAMESPACE_END

#endif /* WITH_HIP */
Cycles: add HIP device support for AMD GPUs NOTE: this feature is not ready for user testing, and not yet enabled in daily builds. It is being merged now for easier collaboration on development. HIP is a heterogenous compute interface allowing C++ code to be executed on GPUs similar to CUDA. It is intended to bring back AMD GPU rendering support on Windows and Linux. https://github.com/ROCm-Developer-Tools/HIP. As of the time of writing, it should compile and run on Linux with existing HIP compilers and driver runtimes. Publicly available compilers and drivers for Windows will come later. See task T91571 for more details on the current status and work remaining to be done. Credits: Sayak Biswas (AMD) Arya Rafii (AMD) Brian Savery (AMD) Differential Revision: https://developer.blender.org/D12578 2021-09-28 16:51:14 +02:00			`/*`
			`* Copyright 2011-2021 Blender Foundation`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`

			`#ifdef WITH_HIP`

			`# include "device/hip/queue.h"`

			`# include "device/hip/device_impl.h"`
			`# include "device/hip/graphics_interop.h"`
			`# include "device/hip/kernel.h"`

			`CCL_NAMESPACE_BEGIN`

			`/* HIPDeviceQueue */`

			`HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device)`
			`: DeviceQueue(device), hip_device_(device), hip_stream_(nullptr)`
			`{`
			`const HIPContextScope scope(hip_device_);`
			`hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking));`
			`}`

			`HIPDeviceQueue::~HIPDeviceQueue()`
			`{`
			`const HIPContextScope scope(hip_device_);`
			`hipStreamDestroy(hip_stream_);`
			`}`

			`int HIPDeviceQueue::num_concurrent_states(const size_t /state_size/) const`
			`{`
			`/* TODO: compute automatically. */`
			`/* TODO: must have at least num_threads_per_block. */`
			`return 14416128;`
			`}`

			`int HIPDeviceQueue::num_concurrent_busy_states() const`
			`{`
			`const int max_num_threads = hip_device_->get_num_multiprocessors() *`
			`hip_device_->get_max_num_threads_per_multiprocessor();`

			`if (max_num_threads == 0) {`
			`return 65536;`
			`}`

			`return 4 * max_num_threads;`
			`}`

			`void HIPDeviceQueue::init_execution()`
			`{`
			`/* Synchronize all textures and memory copies before executing task. */`
			`HIPContextScope scope(hip_device_);`
			`hip_device_->load_texture_info();`
			`hip_device_assert(hip_device_, hipDeviceSynchronize());`

			`debug_init_execution();`
			`}`

			`bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const`
			`{`
			`return hip_device_->kernels.available(kernel);`
			`}`

			`bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])`
			`{`
			`if (hip_device_->have_error()) {`
			`return false;`
			`}`

			`debug_enqueue(kernel, work_size);`

			`const HIPContextScope scope(hip_device_);`
			`const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);`

			`/* Compute kernel launch parameters. */`
			`const int num_threads_per_block = hip_kernel.num_threads_per_block;`
			`const int num_blocks = divide_up(work_size, num_threads_per_block);`

			`int shared_mem_bytes = 0;`

			`switch (kernel) {`
			`case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:`
			`case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:`
			`case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:`
			`case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:`
			`case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:`
			`case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:`
			`/* See parall_active_index.h for why this amount of shared memory is needed. */`
			`shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);`
			`break;`
			`default:`
			`break;`
			`}`

			`/* Launch kernel. */`
			`hip_device_assert(hip_device_,`
			`hipModuleLaunchKernel(hip_kernel.function,`
			`num_blocks,`
			`1,`
			`1,`
			`num_threads_per_block,`
			`1,`
			`1,`
			`shared_mem_bytes,`
			`hip_stream_,`
			`args,`
			`0));`
			`return !(hip_device_->have_error());`
			`}`

			`bool HIPDeviceQueue::synchronize()`
			`{`
			`if (hip_device_->have_error()) {`
			`return false;`
			`}`

			`const HIPContextScope scope(hip_device_);`
			`hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));`
			`debug_synchronize();`

			`return !(hip_device_->have_error());`
			`}`

			`void HIPDeviceQueue::zero_to_device(device_memory &mem)`
			`{`
			`assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);`

			`if (mem.memory_size() == 0) {`
			`return;`
			`}`

			`/* Allocate on demand. */`
			`if (mem.device_pointer == 0) {`
			`hip_device_->mem_alloc(mem);`
			`}`

			`/* Zero memory on device. */`
			`assert(mem.device_pointer != 0);`

			`const HIPContextScope scope(hip_device_);`
			`hip_device_assert(`
			`hip_device_,`
			`hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));`
			`}`

			`void HIPDeviceQueue::copy_to_device(device_memory &mem)`
			`{`
			`assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);`

			`if (mem.memory_size() == 0) {`
			`return;`
			`}`

			`/* Allocate on demand. */`
			`if (mem.device_pointer == 0) {`
			`hip_device_->mem_alloc(mem);`
			`}`

			`assert(mem.device_pointer != 0);`
			`assert(mem.host_pointer != nullptr);`

			`/* Copy memory to device. */`
			`const HIPContextScope scope(hip_device_);`
			`hip_device_assert(`
			`hip_device_,`
			`hipMemcpyHtoDAsync(`
			`(hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));`
			`}`

			`void HIPDeviceQueue::copy_from_device(device_memory &mem)`
			`{`
			`assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);`

			`if (mem.memory_size() == 0) {`
			`return;`
			`}`

			`assert(mem.device_pointer != 0);`
			`assert(mem.host_pointer != nullptr);`

			`/* Copy memory from device. */`
			`const HIPContextScope scope(hip_device_);`
			`hip_device_assert(`
			`hip_device_,`
			`hipMemcpyDtoHAsync(`
			`mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));`
			`}`

			`// TODO : (Arya) Enable this after stabilizing dev branch`
			`unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()`
			`{`
			`return make_unique<HIPDeviceGraphicsInterop>(this);`
			`}`

			`CCL_NAMESPACE_END`

			`#endif /* WITH_HIP */`