blender/intern/cycles/device/device_optix.cpp

/*
 * Copyright 2019, NVIDIA Corporation.
 * Copyright 2019, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_OPTIX

#  include "device/device.h"
#  include "device/device_intern.h"
#  include "device/device_denoising.h"
#  include "bvh/bvh.h"
#  include "render/scene.h"
#  include "render/mesh.h"
#  include "render/object.h"
#  include "render/buffers.h"
#  include "util/util_md5.h"
#  include "util/util_path.h"
#  include "util/util_time.h"
#  include "util/util_debug.h"
#  include "util/util_logging.h"

#  undef _WIN32_WINNT  // Need minimum API support for Windows 7
#  define _WIN32_WINNT _WIN32_WINNT_WIN7

#  ifdef WITH_CUDA_DYNLOAD
#    include <cuew.h>
// Do not use CUDA SDK headers when using CUEW
#    define OPTIX_DONT_INCLUDE_CUDA
#  endif
#  include <optix_stubs.h>
#  include <optix_function_table_definition.h>

CCL_NAMESPACE_BEGIN

/* Make sure this stays in sync with kernel_globals.h */
struct ShaderParams {
  uint4 *input;
  float4 *output;
  int type;
  int filter;
  int sx;
  int offset;
  int sample;
};
struct KernelParams {
  WorkTile tile;
  KernelData data;
  ShaderParams shader;
#  define KERNEL_TEX(type, name) const type *name;
#  include "kernel/kernel_textures.h"
#  undef KERNEL_TEX
};

#  define check_result_cuda(stmt) \
    { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \
        return; \
      } \
    } \
    (void)0
#  define check_result_cuda_ret(stmt) \
    { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \
        return false; \
      } \
    } \
    (void)0

#  define check_result_optix(stmt) \
    { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \
        return; \
      } \
    } \
    (void)0
#  define check_result_optix_ret(stmt) \
    { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \
        return false; \
      } \
    } \
    (void)0

class OptiXDevice : public Device {

  // List of OptiX program groups
  enum {
    PG_RGEN,
    PG_MISS,
    PG_HITD,  // Default hit group
    PG_HITL,  // __BVH_LOCAL__ hit group
    PG_HITS,  // __SHADOW_RECORD_ALL__ hit group
#  ifdef WITH_CYCLES_DEBUG
    PG_EXCP,
#  endif
    PG_BAKE,  // kernel_bake_evaluate
    PG_DISP,  // kernel_displace_evaluate
    PG_BACK,  // kernel_background_evaluate
    NUM_PROGRAM_GROUPS
  };

  // List of OptiX pipelines
  enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };

  // A single shader binding table entry
  struct SbtRecord {
    char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  };

  // Information stored about CUDA memory allocations
  struct CUDAMem {
    bool free_map_host = false;
    CUarray array = NULL;
    CUtexObject texobject = 0;
    void *map_host_pointer = nullptr;
  };

  // Helper class to manage current CUDA context
  struct CUDAContextScope {
    CUDAContextScope(CUcontext ctx)
    {
      cuCtxPushCurrent(ctx);
    }
    ~CUDAContextScope()
    {
      cuCtxPopCurrent(NULL);
    }
  };

  // Use a pool with multiple threads to support launches with multiple CUDA streams
  TaskPool task_pool;

  // CUDA/OptiX context handles
  CUdevice cuda_device = 0;
  CUcontext cuda_context = NULL;
  vector<CUstream> cuda_stream;
  OptixDeviceContext context = NULL;

  // Need CUDA kernel module for some utility functions
  CUmodule cuda_module = NULL;
  CUmodule cuda_filter_module = NULL;
  // All necessary OptiX kernels are in one module
  OptixModule optix_module = NULL;
  OptixPipeline pipelines[NUM_PIPELINES] = {};

  bool motion_blur = false;
  bool need_texture_info = false;
  device_vector<SbtRecord> sbt_data;
  device_vector<TextureInfo> texture_info;
  device_only_memory<KernelParams> launch_params;
  vector<device_only_memory<uint8_t>> blas;
  OptixTraversableHandle tlas_handle = 0;

  // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
  int can_map_host = 0;
  size_t map_host_used = 0;
  size_t map_host_limit = 0;
  size_t device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
  size_t device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
  map<device_memory *, CUDAMem> cuda_mem_map;
  bool move_texture_to_host = false;

 public:
  OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
      : Device(info_, stats_, profiler_, background_),
        sbt_data(this, "__sbt", MEM_READ_ONLY),
        texture_info(this, "__texture_info", MEM_TEXTURE),
        launch_params(this, "__params")
  {
    // Store number of CUDA streams in device info
    info.cpu_threads = DebugFlags().optix.cuda_streams;

    // Initialize CUDA driver API
    check_result_cuda(cuInit(0));

    // Retrieve the primary CUDA context for this device
    check_result_cuda(cuDeviceGet(&cuda_device, info.num));
    check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device));

    // Make that CUDA context current
    const CUDAContextScope scope(cuda_context);

    // Limit amount of host mapped memory (see init_host_memory in device_cuda.cpp)
    size_t default_limit = 4 * 1024 * 1024 * 1024LL;
    size_t system_ram = system_physical_ram();
    if (system_ram > 0) {
      if (system_ram / 2 > default_limit) {
        map_host_limit = system_ram - default_limit;
      }
      else {
        map_host_limit = system_ram / 2;
      }
    }
    else {
      VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
    }

    // Check device support for pinned host memory
    check_result_cuda(
        cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuda_device));

    // Create OptiX context for this device
    OptixDeviceContextOptions options = {};
#  ifdef WITH_CYCLES_LOGGING
    options.logCallbackLevel = 4;  // Fatal = 1, Error = 2, Warning = 3, Print = 4
    options.logCallbackFunction =
        [](unsigned int level, const char *, const char *message, void *) {
          switch (level) {
            case 1:
              LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
              break;
            case 2:
              LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
              break;
            case 3:
              LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
              break;
            case 4:
              LOG_IF(INFO, VLOG_IS_ON(1)) << message;
              break;
          }
        };
#  endif
    check_result_optix(optixDeviceContextCreate(cuda_context, &options, &context));
#  ifdef WITH_CYCLES_LOGGING
    check_result_optix(optixDeviceContextSetLogCallback(
        context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
#  endif

    // Create launch streams
    cuda_stream.resize(info.cpu_threads);
    for (int i = 0; i < info.cpu_threads; ++i)
      check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));

    // Fix weird compiler bug that assigns wrong size
    launch_params.data_elements = sizeof(KernelParams);
    // Allocate launch parameter buffer memory on device
    launch_params.alloc_to_device(info.cpu_threads);
  }
  ~OptiXDevice()
  {
    // Stop processing any more tasks
    task_pool.stop();

    // Clean up all memory before destroying context
    blas.clear();

    sbt_data.free();
    texture_info.free();
    launch_params.free();

    // Make CUDA context current
    const CUDAContextScope scope(cuda_context);

    // Unload modules
    if (cuda_module != NULL)
      cuModuleUnload(cuda_module);
    if (cuda_filter_module != NULL)
      cuModuleUnload(cuda_filter_module);
    if (optix_module != NULL)
      optixModuleDestroy(optix_module);
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
      if (pipelines[i] != NULL)
        optixPipelineDestroy(pipelines[i]);

    // Destroy launch streams
    for (int i = 0; i < info.cpu_threads; ++i)
      cuStreamDestroy(cuda_stream[i]);

    // Destroy OptiX and CUDA context
    optixDeviceContextDestroy(context);
    cuDevicePrimaryCtxRelease(cuda_device);
  }

 private:
  bool show_samples() const override
  {
    // Only show samples if not rendering multiple tiles in parallel
    return info.cpu_threads == 1;
  }

  BVHLayoutMask get_bvh_layout_mask() const override
  {
    // OptiX has its own internal acceleration structure format
    return BVH_LAYOUT_OPTIX;
  }

  bool load_kernels(const DeviceRequestedFeatures &requested_features) override
  {
    if (have_error())
      return false;  // Abort early if context creation failed already

    // Disable baking for now, since its kernel is not well-suited for inlining and is very slow
    if (requested_features.use_baking) {
      set_error("OptiX implementation does not support baking yet");
      return false;
    }
    // Disable shader raytracing support for now, since continuation callables are slow
    if (requested_features.use_shader_raytrace) {
      set_error("OptiX implementation does not support shader raytracing yet");
      return false;
    }

    const CUDAContextScope scope(cuda_context);

    // Unload any existing modules first
    if (cuda_module != NULL)
      cuModuleUnload(cuda_module);
    if (cuda_filter_module != NULL)
      cuModuleUnload(cuda_filter_module);
    if (optix_module != NULL)
      optixModuleDestroy(optix_module);
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
      if (pipelines[i] != NULL)
        optixPipelineDestroy(pipelines[i]);

    OptixModuleCompileOptions module_options;
    module_options.maxRegisterCount = 0;  // Do not set an explicit register limit
#  ifdef WITH_CYCLES_DEBUG
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif
    OptixPipelineCompileOptions pipeline_options;
    // Default to no motion blur and two-level graph, since it is the fastest option
    pipeline_options.usesMotionBlur = false;
    pipeline_options.traversableGraphFlags =
        OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
    pipeline_options.numPayloadValues = 6;
    pipeline_options.numAttributeValues = 2;  // u, v
#  ifdef WITH_CYCLES_DEBUG
    pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW |
                                      OPTIX_EXCEPTION_FLAG_TRACE_DEPTH;
#  else
    pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
#  endif
    pipeline_options.pipelineLaunchParamsVariableName = "__params";  // See kernel_globals.h

    // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
    // This is necessary since objects may be reported to have motion if the Vector pass is
    // active, but may still need to be rendered without motion blur if that isn't active as well
    motion_blur = requested_features.use_object_motion;

    if (motion_blur) {
      pipeline_options.usesMotionBlur = true;
      // Motion blur can insert motion transforms into the traversal graph
      // It is no longer a two-level graph then, so need to set flags to allow any configuration
      pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
    }

    {  // Load and compile PTX module with OptiX kernels
      string ptx_data;
      const string ptx_filename = "lib/kernel_optix.ptx";
      if (!path_read_text(path_get(ptx_filename), ptx_data)) {
        set_error("Failed loading OptiX kernel " + ptx_filename + ".");
        return false;
      }

      check_result_optix_ret(optixModuleCreateFromPTX(context,
                                                      &module_options,
                                                      &pipeline_options,
                                                      ptx_data.data(),
                                                      ptx_data.size(),
                                                      nullptr,
                                                      0,
                                                      &optix_module));
    }

    {  // Load CUDA modules because we need some of the utility kernels
      int major, minor;
      cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
      cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, info.num);

      string cubin_data;
      const string cubin_filename = string_printf("lib/kernel_sm_%d%d.cubin", major, minor);
      if (!path_read_text(path_get(cubin_filename), cubin_data)) {
        set_error("Failed loading pre-compiled CUDA kernel " + cubin_filename + ".");
        return false;
      }

      check_result_cuda_ret(cuModuleLoadData(&cuda_module, cubin_data.data()));

      if (requested_features.use_denoising) {
        string filter_data;
        const string filter_filename = string_printf("lib/filter_sm_%d%d.cubin", major, minor);
        if (!path_read_text(path_get(filter_filename), filter_data)) {
          set_error("Failed loading pre-compiled CUDA filter kernel " + filter_filename + ".");
          return false;
        }

        check_result_cuda_ret(cuModuleLoadData(&cuda_filter_module, filter_data.data()));
      }
    }

    // Create program groups
    OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupOptions group_options = {};  // There are no options currently
    group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN].raygen.module = optix_module;
    // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
    group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
    group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
    group_descs[PG_MISS].miss.module = optix_module;
    group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
    group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
    group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
    group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";

    if (requested_features.use_hair) {
      // Add curve intersection programs
      group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve";
      group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve";
    }

    if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
      // Add hit group for local intersections
      group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
      group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
      group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
    }

#  ifdef WITH_CYCLES_DEBUG
    group_descs[PG_EXCP].kind = OPTIX_PROGRAM_GROUP_KIND_EXCEPTION;
    group_descs[PG_EXCP].exception.module = optix_module;
    group_descs[PG_EXCP].exception.entryFunctionName = "__exception__kernel_optix_exception";
#  endif

    if (requested_features.use_baking) {
      group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BAKE].raygen.module = optix_module;
      group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
    }

    if (requested_features.use_true_displacement) {
      group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_DISP].raygen.module = optix_module;
      group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
    }

    if (requested_features.use_background_light) {
      group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BACK].raygen.module = optix_module;
      group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
    }

    check_result_optix_ret(optixProgramGroupCreate(
        context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));

    // Get program stack sizes
    OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
    // Set up SBT, which in this case is used only to select between different programs
    sbt_data.alloc(NUM_PROGRAM_GROUPS);
    memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
      check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
    }
    sbt_data.copy_to_device();  // Upload SBT to device

    // Calculate maximum trace continuation stack size
    unsigned int trace_css = stack_size[PG_HITD].cssCH;
    // This is based on the maximum of closest-hit and any-hit/intersection programs
    trace_css = max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
    trace_css = max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
    trace_css = max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);

    OptixPipelineLinkOptions link_options;
    link_options.maxTraceDepth = 1;
#  ifdef WITH_CYCLES_DEBUG
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif
    link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur;

    {  // Create path tracing pipeline
      OptixProgramGroup pipeline_groups[] = {
          groups[PG_RGEN],
          groups[PG_MISS],
          groups[PG_HITD],
          groups[PG_HITS],
          groups[PG_HITL],
#  ifdef WITH_CYCLES_DEBUG
          groups[PG_EXCP],
#  endif
      };
      check_result_optix_ret(
          optixPipelineCreate(context,
                              &pipeline_options,
                              &link_options,
                              pipeline_groups,
                              (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
                              nullptr,
                              0,
                              &pipelines[PIP_PATH_TRACE]));

      // Combine ray generation and trace continuation stack size
      const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;

      // Set stack size depending on pipeline options
      check_result_optix_ret(optixPipelineSetStackSize(
          pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
    }

    // Only need to create shader evaluation pipeline if one of these features is used:
    const bool use_shader_eval_pipeline = requested_features.use_baking ||
                                          requested_features.use_background_light ||
                                          requested_features.use_true_displacement;

    if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
      OptixProgramGroup pipeline_groups[] = {
          groups[PG_BAKE],
          groups[PG_DISP],
          groups[PG_BACK],
          groups[PG_MISS],
          groups[PG_HITD],
          groups[PG_HITS],
          groups[PG_HITL],
#  ifdef WITH_CYCLES_DEBUG
          groups[PG_EXCP],
#  endif
      };
      check_result_optix_ret(
          optixPipelineCreate(context,
                              &pipeline_options,
                              &link_options,
                              pipeline_groups,
                              (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])),
                              nullptr,
                              0,
                              &pipelines[PIP_SHADER_EVAL]));

      // Calculate continuation stack size based on the maximum of all ray generation stack sizes
      const unsigned int css = max(stack_size[PG_BAKE].cssRG,
                                   max(stack_size[PG_DISP].cssRG, stack_size[PG_BACK].cssRG)) +
                               link_options.maxTraceDepth * trace_css;

      check_result_optix_ret(optixPipelineSetStackSize(
          pipelines[PIP_SHADER_EVAL], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2)));
    }

    // Clean up program group objects
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      optixProgramGroupDestroy(groups[i]);
    }

    return true;
  }

  void thread_run(DeviceTask &task, int thread_index)  // Main task entry point
  {
    if (have_error())
      return;  // Abort early if there was an error previously

    if (task.type == DeviceTask::RENDER) {
      RenderTile tile;
      while (task.acquire_tile(this, tile)) {
        if (tile.task == RenderTile::PATH_TRACE)
          launch_render(task, tile, thread_index);
        else if (tile.task == RenderTile::DENOISE)
          launch_denoise(task, tile, thread_index);
        task.release_tile(tile);
        if (task.get_cancel() && !task.need_finish_queue)
          break;  // User requested cancellation
        else if (have_error())
          break;  // Abort rendering when encountering an error
      }
    }
    else if (task.type == DeviceTask::SHADER) {
      launch_shader_eval(task, thread_index);
    }
    else if (task.type == DeviceTask::FILM_CONVERT) {
      launch_film_convert(task, thread_index);
    }
  }

  void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
  {
    assert(thread_index < launch_params.data_size);

    // Keep track of total render time of this tile
    const scoped_timer timer(&rtile.buffers->render_time);

    WorkTile wtile;
    wtile.x = rtile.x;
    wtile.y = rtile.y;
    wtile.w = rtile.w;
    wtile.h = rtile.h;
    wtile.offset = rtile.offset;
    wtile.stride = rtile.stride;
    wtile.buffer = (float *)rtile.buffer;

    const int end_sample = rtile.start_sample + rtile.num_samples;
    // Keep this number reasonable to avoid running into TDRs
    const int step_samples = (info.display_device ? 8 : 32);
    // Offset into launch params buffer so that streams use separate data
    device_ptr launch_params_ptr = launch_params.device_pointer +
                                   thread_index * launch_params.data_elements;

    const CUDAContextScope scope(cuda_context);

    for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) {
      // Copy work tile information to device
      wtile.num_samples = min(step_samples, end_sample - sample);
      wtile.start_sample = sample;
      check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile),
                                          &wtile,
                                          sizeof(wtile),
                                          cuda_stream[thread_index]));

      OptixShaderBindingTable sbt_params = {};
      sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
#  ifdef WITH_CYCLES_DEBUG
      sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord);
#  endif
      sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
      sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS

      // Launch the ray generation program
      check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
                                     cuda_stream[thread_index],
                                     launch_params_ptr,
                                     launch_params.data_elements,
                                     &sbt_params,
                                     // Launch with samples close to each other for better locality
                                     wtile.w * wtile.num_samples,
                                     wtile.h,
                                     1));

      // Wait for launch to finish
      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

      // Update current sample, so it is displayed correctly
      rtile.sample = wtile.start_sample + wtile.num_samples;
      // Update task progress after the kernel completed rendering
      task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);

      if (task.get_cancel() && !task.need_finish_queue)
        return;  // Cancel rendering
    }
  }

  void launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index)
  {
    const CUDAContextScope scope(cuda_context);

    // Run CUDA denoising kernels
    DenoisingTask denoising(this, task);
    denoising.functions.construct_transform = function_bind(
        &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index);
    denoising.functions.accumulate = function_bind(
        &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index);
    denoising.functions.solve = function_bind(
        &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index);
    denoising.functions.divide_shadow = function_bind(
        &OptiXDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising, thread_index);
    denoising.functions.non_local_means = function_bind(
        &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index);
    denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves,
                                                       this,
                                                       _1,
                                                       _2,
                                                       _3,
                                                       _4,
                                                       _5,
                                                       _6,
                                                       &denoising,
                                                       thread_index);
    denoising.functions.get_feature = function_bind(
        &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index);
    denoising.functions.write_feature = function_bind(
        &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index);
    denoising.functions.detect_outliers = function_bind(
        &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index);

    denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
    denoising.render_buffer.samples = rtile.sample = rtile.start_sample + rtile.num_samples;
    denoising.buffer.gpu_temporary_mem = true;

    denoising.run_denoising(&rtile);

    task.update_progress(&rtile, rtile.w * rtile.h);
  }

  void launch_shader_eval(DeviceTask &task, int thread_index)
  {
    unsigned int rgen_index = PG_BACK;
    if (task.shader_eval_type >= SHADER_EVAL_BAKE)
      rgen_index = PG_BAKE;
    if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
      rgen_index = PG_DISP;

    const CUDAContextScope scope(cuda_context);

    device_ptr launch_params_ptr = launch_params.device_pointer +
                                   thread_index * launch_params.data_elements;

    for (int sample = 0; sample < task.num_samples; ++sample) {
      ShaderParams params;
      params.input = (uint4 *)task.shader_input;
      params.output = (float4 *)task.shader_output;
      params.type = task.shader_eval_type;
      params.filter = task.shader_filter;
      params.sx = task.shader_x;
      params.offset = task.offset;
      params.sample = sample;

      check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
                                          &params,
                                          sizeof(params),
                                          cuda_stream[thread_index]));

      OptixShaderBindingTable sbt_params = {};
      sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
#  ifdef WITH_CYCLES_DEBUG
      sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord);
#  endif
      sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
      sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITL, PG_HITS

      check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
                                     cuda_stream[thread_index],
                                     launch_params_ptr,
                                     launch_params.data_elements,
                                     &sbt_params,
                                     task.shader_w,
                                     1,
                                     1));

      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

      task.update_progress(NULL);
    }
  }

  void launch_film_convert(DeviceTask &task, int thread_index)
  {
    const CUDAContextScope scope(cuda_context);

    CUfunction film_convert_func;
    check_result_cuda(cuModuleGetFunction(&film_convert_func,
                                          cuda_module,
                                          task.rgba_byte ? "kernel_cuda_convert_to_byte" :
                                                           "kernel_cuda_convert_to_half_float"));

    float sample_scale = 1.0f / (task.sample + 1);
    CUdeviceptr rgba = (task.rgba_byte ? task.rgba_byte : task.rgba_half);

    void *args[] = {&rgba,
                    &task.buffer,
                    &sample_scale,
                    &task.x,
                    &task.y,
                    &task.w,
                    &task.h,
                    &task.offset,
                    &task.stride};

    int threads_per_block;
    check_result_cuda(cuFuncGetAttribute(
        &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, film_convert_func));

    const int num_threads_x = (int)sqrt(threads_per_block);
    const int num_blocks_x = (task.w + num_threads_x - 1) / num_threads_x;
    const int num_threads_y = (int)sqrt(threads_per_block);
    const int num_blocks_y = (task.h + num_threads_y - 1) / num_threads_y;

    check_result_cuda(cuLaunchKernel(film_convert_func,
                                     num_blocks_x,
                                     num_blocks_y,
                                     1, /* blocks */
                                     num_threads_x,
                                     num_threads_y,
                                     1, /* threads */
                                     0,
                                     cuda_stream[thread_index],
                                     args,
                                     0));

    check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

    task.update_progress(NULL);
  }

  bool build_optix_bvh(const OptixBuildInput &build_input,
                       uint16_t num_motion_steps,
                       device_memory &out_data,
                       OptixTraversableHandle &out_handle)
  {
    out_handle = 0;

    const CUDAContextScope scope(cuda_context);

    // Compute memory usage
    OptixAccelBufferSizes sizes = {};
    OptixAccelBuildOptions options;
    options.operation = OPTIX_BUILD_OPERATION_BUILD;
    options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
    options.motionOptions.numKeys = num_motion_steps;
    options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
    options.motionOptions.timeBegin = 0.0f;
    options.motionOptions.timeEnd = 1.0f;

    check_result_optix_ret(
        optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));

    // Allocate required output buffers
    device_only_memory<char> temp_mem(this, "temp_build_mem");
    temp_mem.alloc_to_device(sizes.tempSizeInBytes);

    out_data.type = MEM_DEVICE_ONLY;
    out_data.data_type = TYPE_UNKNOWN;
    out_data.data_elements = 1;
    out_data.data_size = sizes.outputSizeInBytes;
    mem_alloc(out_data);

    // Finally build the acceleration structure
    check_result_optix_ret(optixAccelBuild(context,
                                           NULL,
                                           &options,
                                           &build_input,
                                           1,
                                           temp_mem.device_pointer,
                                           sizes.tempSizeInBytes,
                                           out_data.device_pointer,
                                           sizes.outputSizeInBytes,
                                           &out_handle,
                                           NULL,
                                           0));

    // Wait for all operations to finish
    check_result_cuda_ret(cuStreamSynchronize(NULL));

    return true;
  }

  bool build_optix_bvh(BVH *bvh, device_memory &out_data) override
  {
    assert(bvh->params.top_level);

    unsigned int num_instances = 0;
    unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes;

    // Clear all previous AS
    blas.clear();

    // Build bottom level acceleration structures (BLAS)
    // Note: Always keep this logic in sync with bvh_optix.cpp!
    for (Object *ob : bvh->objects) {
      // Skip meshes for which acceleration structure already exists
      if (meshes.find(ob->mesh) != meshes.end())
        continue;

      Mesh *const mesh = ob->mesh;
      vector<OptixTraversableHandle> handles;

      // Build BLAS for curve primitives
      if (bvh->params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) {
        const size_t num_curves = mesh->num_curves();
        const size_t num_segments = mesh->num_segments();

        size_t num_motion_steps = 1;
        Attribute *motion_keys = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
        if (motion_blur && mesh->use_motion_blur && motion_keys) {
          num_motion_steps = mesh->motion_steps;
        }

        device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
        aabb_data.alloc(num_segments * num_motion_steps);

        // Get AABBs for each motion step
        for (size_t step = 0; step < num_motion_steps; ++step) {
          // The center step for motion vertices is not stored in the attribute
          const float3 *keys = mesh->curve_keys.data();
          size_t center_step = (num_motion_steps - 1) / 2;
          if (step != center_step) {
            size_t attr_offset = (step > center_step) ? step - 1 : step;
            // Technically this is a float4 array, but sizeof(float3) is the same as sizeof(float4)
            keys = motion_keys->data_float3() + attr_offset * mesh->curve_keys.size();
          }

          size_t i = step * num_segments;
          for (size_t j = 0; j < num_curves; ++j) {
            const Mesh::Curve c = mesh->get_curve(j);

            for (size_t k = 0; k < c.num_segments(); ++i, ++k) {
              BoundBox bounds = BoundBox::empty;
              c.bounds_grow(k, keys, mesh->curve_radius.data(), bounds);

              aabb_data[i].minX = bounds.min.x;
              aabb_data[i].minY = bounds.min.y;
              aabb_data[i].minZ = bounds.min.z;
              aabb_data[i].maxX = bounds.max.x;
              aabb_data[i].maxY = bounds.max.y;
              aabb_data[i].maxZ = bounds.max.z;
            }
          }
        }

        // Upload AABB data to GPU
        aabb_data.copy_to_device();

        vector<device_ptr> aabb_ptrs;
        aabb_ptrs.reserve(num_motion_steps);
        for (size_t step = 0; step < num_motion_steps; ++step) {
          aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
        }

        // Disable visibility test anyhit program, since it is already checked during intersection
        // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT
        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
        OptixBuildInput build_input = {};
        build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
        build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
        build_input.aabbArray.numPrimitives = num_segments;
        build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
        build_input.aabbArray.flags = &build_flags;
        build_input.aabbArray.numSbtRecords = 1;
        build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset;

        // Allocate memory for new BLAS and build it
        blas.emplace_back(this, "blas");
        handles.emplace_back();
        if (!build_optix_bvh(build_input, num_motion_steps, blas.back(), handles.back()))
          return false;
      }

      // Build BLAS for triangle primitives
      if (bvh->params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) {
        const size_t num_verts = mesh->verts.size();

        size_t num_motion_steps = 1;
        Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
        if (motion_blur && mesh->use_motion_blur && motion_keys) {
          num_motion_steps = mesh->motion_steps;
        }

        device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
        index_data.alloc(mesh->triangles.size());
        memcpy(index_data.data(), mesh->triangles.data(), mesh->triangles.size() * sizeof(int));
        device_vector<float3> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
        vertex_data.alloc(num_verts * num_motion_steps);

        for (size_t step = 0; step < num_motion_steps; ++step) {
          const float3 *verts = mesh->verts.data();

          size_t center_step = (num_motion_steps - 1) / 2;
          // The center step for motion vertices is not stored in the attribute
          if (step != center_step) {
            verts = motion_keys->data_float3() +
                    (step > center_step ? step - 1 : step) * num_verts;
          }

          memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
        }

        // Upload triangle data to GPU
        index_data.copy_to_device();
        vertex_data.copy_to_device();

        vector<device_ptr> vertex_ptrs;
        vertex_ptrs.reserve(num_motion_steps);
        for (size_t step = 0; step < num_motion_steps; ++step) {
          vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
        }

        // No special build flags for triangle primitives
        unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE;
        OptixBuildInput build_input = {};
        build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
        build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
        build_input.triangleArray.numVertices = num_verts;
        build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
        build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
        build_input.triangleArray.indexBuffer = index_data.device_pointer;
        build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
        build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
        build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
        build_input.triangleArray.flags = &build_flags;
        // The SBT does not store per primitive data since Cycles already allocates separate
        // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
        // one and rely on that having the same meaning in this case.
        build_input.triangleArray.numSbtRecords = 1;
        // Triangle primitives are packed right after the curve primitives of this mesh
        build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments();

        // Allocate memory for new BLAS and build it
        blas.emplace_back(this, "blas");
        handles.emplace_back();
        if (!build_optix_bvh(build_input, num_motion_steps, blas.back(), handles.back()))
          return false;
      }

      meshes.insert({mesh, handles});
    }

    // Fill instance descriptions
    device_vector<OptixAabb> aabbs(this, "tlas_aabbs", MEM_READ_ONLY);
    aabbs.alloc(bvh->objects.size() * 2);
    device_vector<OptixInstance> instances(this, "tlas_instances", MEM_READ_ONLY);
    instances.alloc(bvh->objects.size() * 2);

    for (Object *ob : bvh->objects) {
      // Skip non-traceable objects
      if (!ob->is_traceable())
        continue;
      // Create separate instance for triangle/curve meshes of an object
      for (OptixTraversableHandle handle : meshes[ob->mesh]) {
        OptixAabb &aabb = aabbs[num_instances];
        aabb.minX = ob->bounds.min.x;
        aabb.minY = ob->bounds.min.y;
        aabb.minZ = ob->bounds.min.z;
        aabb.maxX = ob->bounds.max.x;
        aabb.maxY = ob->bounds.max.y;
        aabb.maxZ = ob->bounds.max.z;

        OptixInstance &instance = instances[num_instances++];
        memset(&instance, 0, sizeof(instance));

        // Clear transform to identity matrix
        instance.transform[0] = 1.0f;
        instance.transform[5] = 1.0f;
        instance.transform[10] = 1.0f;

        // Set user instance ID to object index
        instance.instanceId = ob->get_device_index();

        // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
        // See 'scene_intersect_volume' in bvh.h
        instance.visibilityMask = (ob->mesh->has_volume ? 3 : 1);

        // Insert motion traversable if object has motion
        if (motion_blur && ob->use_motion()) {
          blas.emplace_back(this, "motion_transform");
          device_only_memory<uint8_t> &motion_transform_gpu = blas.back();
          motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) +
                                               (max(ob->motion.size(), 2) - 2) *
                                                   sizeof(OptixSRTData));

          // Allocate host side memory for motion transform and fill it with transform data
          OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
              motion_transform_gpu.host_pointer = new uint8_t[motion_transform_gpu.memory_size()]);
          motion_transform.child = handle;
          motion_transform.motionOptions.numKeys = ob->motion.size();
          motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
          motion_transform.motionOptions.timeBegin = 0.0f;
          motion_transform.motionOptions.timeEnd = 1.0f;

          OptixSRTData *const srt_data = motion_transform.srtData;
          array<DecomposedTransform> decomp(ob->motion.size());
          transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());

          for (size_t i = 0; i < ob->motion.size(); ++i) {
            // scaling
            srt_data[i].a = decomp[i].z.x;   // scale.x.y
            srt_data[i].b = decomp[i].z.y;   // scale.x.z
            srt_data[i].c = decomp[i].w.x;   // scale.y.z
            srt_data[i].sx = decomp[i].y.w;  // scale.x.x
            srt_data[i].sy = decomp[i].z.w;  // scale.y.y
            srt_data[i].sz = decomp[i].w.w;  // scale.z.z
            srt_data[i].pvx = 0;
            srt_data[i].pvy = 0;
            srt_data[i].pvz = 0;
            // rotation
            srt_data[i].qx = decomp[i].x.x;
            srt_data[i].qy = decomp[i].x.y;
            srt_data[i].qz = decomp[i].x.z;
            srt_data[i].qw = decomp[i].x.w;
            // transform
            srt_data[i].tx = decomp[i].y.x;
            srt_data[i].ty = decomp[i].y.y;
            srt_data[i].tz = decomp[i].y.z;
          }

          // Upload motion transform to GPU
          mem_copy_to(motion_transform_gpu);
          delete[] reinterpret_cast<uint8_t *>(motion_transform_gpu.host_pointer);
          motion_transform_gpu.host_pointer = 0;

          // Disable instance transform if object uses motion transform already
          instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;

          // Get traversable handle to motion transform
          optixConvertPointerToTraversableHandle(context,
                                                 motion_transform_gpu.device_pointer,
                                                 OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
                                                 &instance.traversableHandle);
        }
        else {
          instance.traversableHandle = handle;

          if (ob->mesh->is_instanced()) {
            // Set transform matrix
            memcpy(instance.transform, &ob->tfm, sizeof(instance.transform));
          }
          else {
            // Disable instance transform if mesh already has it applied to vertex data
            instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
            // Non-instanced objects read ID from prim_object, so
            // distinguish them from instanced objects with high bit set
            instance.instanceId |= 0x800000;
          }
        }
      }
    }

    // Upload instance descriptions
    aabbs.resize(num_instances);
    aabbs.copy_to_device();
    instances.resize(num_instances);
    instances.copy_to_device();

    // Build top-level acceleration structure
    OptixBuildInput build_input = {};
    build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
    build_input.instanceArray.instances = instances.device_pointer;
    build_input.instanceArray.numInstances = num_instances;
    build_input.instanceArray.aabbs = aabbs.device_pointer;
    build_input.instanceArray.numAabbs = num_instances;

    return build_optix_bvh(build_input, 0 /* TLAS has no motion itself */, out_data, tlas_handle);
  }

  void update_texture_info()
  {
    if (need_texture_info) {
      texture_info.copy_to_device();
      need_texture_info = false;
    }
  }

  void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
  {
    const CUDAContextScope scope(cuda_context);

    for (int i = 0; i < info.cpu_threads; ++i)
      check_result_cuda(
          cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
                       data,
                       data_size));

    // Set constant memory for CUDA module
    // TODO(pmours): This is only used for tonemapping (see 'launch_film_convert').
    //               Could be removed by moving those functions to filter CUDA module.
    size_t bytes = 0;
    CUdeviceptr mem = 0;
    check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name));
    assert(mem != NULL && bytes == data_size);
    check_result_cuda(cuMemcpyHtoD(mem, data, data_size));
  }

  void mem_alloc(device_memory &mem) override
  {
    if (mem.type == MEM_PIXELS && !background) {
      assert(!"mem_alloc not supported for pixels.");
    }
    else if (mem.type == MEM_TEXTURE) {
      assert(!"mem_alloc not supported for textures.");
    }
    else {
      generic_alloc(mem);
    }
  }

  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
  {
    CUDAContextScope scope(cuda_context);

    CUdeviceptr device_pointer = 0;
    size_t size = mem.memory_size() + pitch_padding;

    CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
    const char *status = "";

    /* First try allocating in device memory, respecting headroom. We make
     * an exception for texture info. It is small and frequently accessed,
     * so treat it as working memory.
     *
     * If there is not enough room for working memory, we will try to move
     * textures to host memory, assuming the performance impact would have
     * been worse for working memory. */
    bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
    bool is_image = is_texture && (mem.data_height > 1);

    size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;

    size_t total = 0, free = 0;
    cuMemGetInfo(&free, &total);

    /* Move textures to host memory if needed. */
    if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
      move_textures_to_host(size + headroom - free, is_texture);
      cuMemGetInfo(&free, &total);
    }

    /* Allocate in device memory. */
    if (!move_texture_to_host && (size + headroom) < free) {
      mem_alloc_result = cuMemAlloc(&device_pointer, size);
      if (mem_alloc_result == CUDA_SUCCESS) {
        status = " in device memory";
      }
    }

    /* Fall back to mapped host memory if needed and possible. */
    void *map_host_pointer = 0;
    bool free_map_host = false;

    if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
        map_host_used + size < map_host_limit) {
      if (mem.shared_pointer) {
        /* Another device already allocated host memory. */
        mem_alloc_result = CUDA_SUCCESS;
        map_host_pointer = mem.shared_pointer;
      }
      else {
        /* Allocate host memory ourselves. */
        mem_alloc_result = cuMemHostAlloc(
            &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
        mem.shared_pointer = map_host_pointer;
        free_map_host = true;
      }

      if (mem_alloc_result == CUDA_SUCCESS) {
        cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0);
        map_host_used += size;
        status = " in host memory";

        /* Replace host pointer with our host allocation. Only works if
         * CUDA memory layout is the same and has no pitch padding. Also
         * does not work if we move textures to host during a render,
         * since other devices might be using the memory. */
        if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
            mem.host_pointer != mem.shared_pointer) {
          memcpy(mem.shared_pointer, mem.host_pointer, size);
          mem.host_free();
          mem.host_pointer = mem.shared_pointer;
        }
      }
      else {
        status = " failed, out of host memory";
      }
    }
    else if (mem_alloc_result != CUDA_SUCCESS) {
      status = " failed, out of device and host memory";
    }

    if (mem.name) {
      VLOG(1) << "Buffer allocate: " << mem.name << ", "
              << string_human_readable_number(mem.memory_size()) << " bytes. ("
              << string_human_readable_size(mem.memory_size()) << ")" << status;
    }

    if (mem_alloc_result != CUDA_SUCCESS) {
      set_error(string_printf("Buffer allocate %s", status));
      return NULL;
    }

    mem.device_pointer = (device_ptr)device_pointer;
    mem.device_size = size;
    stats.mem_alloc(size);

    if (!mem.device_pointer) {
      return NULL;
    }

    /* Insert into map of allocations. */
    CUDAMem *cmem = &cuda_mem_map[&mem];
    cmem->map_host_pointer = map_host_pointer;
    cmem->free_map_host = free_map_host;
    return cmem;
  }

  void tex_alloc(device_memory &mem)
  {
    CUDAContextScope scope(cuda_context);

    /* General variables for both architectures */
    string bind_name = mem.name;
    size_t dsize = datatype_size(mem.data_type);
    size_t size = mem.memory_size();

    CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
    switch (mem.extension) {
      case EXTENSION_REPEAT:
        address_mode = CU_TR_ADDRESS_MODE_WRAP;
        break;
      case EXTENSION_EXTEND:
        address_mode = CU_TR_ADDRESS_MODE_CLAMP;
        break;
      case EXTENSION_CLIP:
        address_mode = CU_TR_ADDRESS_MODE_BORDER;
        break;
      default:
        assert(0);
        break;
    }

    CUfilter_mode filter_mode;
    if (mem.interpolation == INTERPOLATION_CLOSEST) {
      filter_mode = CU_TR_FILTER_MODE_POINT;
    }
    else {
      filter_mode = CU_TR_FILTER_MODE_LINEAR;
    }

    /* Data Storage */
    if (mem.interpolation == INTERPOLATION_NONE) {
      generic_alloc(mem);
      generic_copy_to(mem);

      // Update data storage pointers in launch parameters
#  define KERNEL_TEX(data_type, tex_name) \
    if (strcmp(mem.name, #tex_name) == 0) \
      update_launch_params( \
          mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr));
#  include "kernel/kernel_textures.h"
#  undef KERNEL_TEX
      return;
    }

    /* Image Texture Storage */
    CUarray_format_enum format;
    switch (mem.data_type) {
      case TYPE_UCHAR:
        format = CU_AD_FORMAT_UNSIGNED_INT8;
        break;
      case TYPE_UINT16:
        format = CU_AD_FORMAT_UNSIGNED_INT16;
        break;
      case TYPE_UINT:
        format = CU_AD_FORMAT_UNSIGNED_INT32;
        break;
      case TYPE_INT:
        format = CU_AD_FORMAT_SIGNED_INT32;
        break;
      case TYPE_FLOAT:
        format = CU_AD_FORMAT_FLOAT;
        break;
      case TYPE_HALF:
        format = CU_AD_FORMAT_HALF;
        break;
      default:
        assert(0);
        return;
    }

    CUDAMem *cmem = NULL;
    CUarray array_3d = NULL;
    size_t src_pitch = mem.data_width * dsize * mem.data_elements;
    size_t dst_pitch = src_pitch;

    if (mem.data_depth > 1) {
      /* 3D texture using array, there is no API for linear memory. */
      CUDA_ARRAY3D_DESCRIPTOR desc;

      desc.Width = mem.data_width;
      desc.Height = mem.data_height;
      desc.Depth = mem.data_depth;
      desc.Format = format;
      desc.NumChannels = mem.data_elements;
      desc.Flags = 0;

      VLOG(1) << "Array 3D allocate: " << mem.name << ", "
              << string_human_readable_number(mem.memory_size()) << " bytes. ("
              << string_human_readable_size(mem.memory_size()) << ")";

      check_result_cuda(cuArray3DCreate(&array_3d, &desc));

      if (!array_3d) {
        return;
      }

      CUDA_MEMCPY3D param;
      memset(&param, 0, sizeof(param));
      param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
      param.dstArray = array_3d;
      param.srcMemoryType = CU_MEMORYTYPE_HOST;
      param.srcHost = mem.host_pointer;
      param.srcPitch = src_pitch;
      param.WidthInBytes = param.srcPitch;
      param.Height = mem.data_height;
      param.Depth = mem.data_depth;

      check_result_cuda(cuMemcpy3D(&param));

      mem.device_pointer = (device_ptr)array_3d;
      mem.device_size = size;
      stats.mem_alloc(size);

      cmem = &cuda_mem_map[&mem];
      cmem->texobject = 0;
      cmem->array = array_3d;
    }
    else if (mem.data_height > 0) {
      /* 2D texture, using pitch aligned linear memory. */
      int alignment = 0;
      check_result_cuda(cuDeviceGetAttribute(
          &alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuda_device));
      dst_pitch = align_up(src_pitch, alignment);
      size_t dst_size = dst_pitch * mem.data_height;

      cmem = generic_alloc(mem, dst_size - mem.memory_size());
      if (!cmem) {
        return;
      }

      CUDA_MEMCPY2D param;
      memset(&param, 0, sizeof(param));
      param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      param.dstDevice = mem.device_pointer;
      param.dstPitch = dst_pitch;
      param.srcMemoryType = CU_MEMORYTYPE_HOST;
      param.srcHost = mem.host_pointer;
      param.srcPitch = src_pitch;
      param.WidthInBytes = param.srcPitch;
      param.Height = mem.data_height;

      check_result_cuda(cuMemcpy2DUnaligned(&param));
    }
    else {
      /* 1D texture, using linear memory. */
      cmem = generic_alloc(mem);
      if (!cmem) {
        return;
      }

      check_result_cuda(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
    }

    /* Kepler+, bindless textures. */
    int flat_slot = 0;
    if (string_startswith(mem.name, "__tex_image")) {
      int pos = string(mem.name).rfind("_");
      flat_slot = atoi(mem.name + pos + 1);
    }
    else {
      assert(0);
    }

    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

    if (array_3d) {
      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
      resDesc.res.array.hArray = array_3d;
      resDesc.flags = 0;
    }
    else if (mem.data_height > 0) {
      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
      resDesc.res.pitch2D.devPtr = mem.device_pointer;
      resDesc.res.pitch2D.format = format;
      resDesc.res.pitch2D.numChannels = mem.data_elements;
      resDesc.res.pitch2D.height = mem.data_height;
      resDesc.res.pitch2D.width = mem.data_width;
      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
    }
    else {
      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
      resDesc.res.linear.devPtr = mem.device_pointer;
      resDesc.res.linear.format = format;
      resDesc.res.linear.numChannels = mem.data_elements;
      resDesc.res.linear.sizeInBytes = mem.device_size;
    }

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = address_mode;
    texDesc.addressMode[1] = address_mode;
    texDesc.addressMode[2] = address_mode;
    texDesc.filterMode = filter_mode;
    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

    check_result_cuda(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

    /* Resize once */
    if (flat_slot >= texture_info.size()) {
      /* Allocate some slots in advance, to reduce amount
       * of re-allocations. */
      texture_info.resize(flat_slot + 128);
    }

    /* Set Mapping and tag that we need to (re-)upload to device */
    TextureInfo &info = texture_info[flat_slot];
    info.data = (uint64_t)cmem->texobject;
    info.cl_buffer = 0;
    info.interpolation = mem.interpolation;
    info.extension = mem.extension;
    info.width = mem.data_width;
    info.height = mem.data_height;
    info.depth = mem.data_depth;
    need_texture_info = true;
  }

  void mem_copy_to(device_memory &mem) override
  {
    if (mem.type == MEM_PIXELS) {
      assert(!"mem_copy_to not supported for pixels.");
    }
    else if (mem.type == MEM_TEXTURE) {
      tex_free(mem);
      tex_alloc(mem);
    }
    else {
      if (!mem.device_pointer) {
        generic_alloc(mem);
      }

      generic_copy_to(mem);
    }
  }

  void generic_copy_to(device_memory &mem)
  {
    if (mem.host_pointer && mem.device_pointer) {
      CUDAContextScope scope(cuda_context);

      if (mem.host_pointer != mem.shared_pointer) {
        check_result_cuda(
            cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
      }
    }
  }

  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
  {
    if (mem.type == MEM_PIXELS && !background) {
      assert(!"mem_copy_from not supported for pixels.");
    }
    else if (mem.type == MEM_TEXTURE) {
      assert(!"mem_copy_from not supported for textures.");
    }
    else {
      // Calculate linear memory offset and size
      const size_t size = elem * w * h;
      const size_t offset = elem * y * w;

      if (mem.host_pointer && mem.device_pointer) {
        const CUDAContextScope scope(cuda_context);
        check_result_cuda(cuMemcpyDtoH(
            (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
      }
      else if (mem.host_pointer) {
        memset((char *)mem.host_pointer + offset, 0, size);
      }
    }
  }

  void mem_zero(device_memory &mem) override
  {
    if (mem.host_pointer)
      memset(mem.host_pointer, 0, mem.memory_size());
    if (mem.host_pointer && mem.host_pointer == mem.shared_pointer)
      return;  // This is shared host memory, so no device memory to update

    if (!mem.device_pointer)
      mem_alloc(mem);  // Need to allocate memory first if it does not exist yet

    const CUDAContextScope scope(cuda_context);
    check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
  }

  void mem_free(device_memory &mem) override
  {
    if (mem.type == MEM_PIXELS && !background) {
      assert(!"mem_free not supported for pixels.");
    }
    else if (mem.type == MEM_TEXTURE) {
      tex_free(mem);
    }
    else {
      generic_free(mem);
    }
  }

  void generic_free(device_memory &mem)
  {
    if (mem.device_pointer) {
      CUDAContextScope scope(cuda_context);
      const CUDAMem &cmem = cuda_mem_map[&mem];

      if (cmem.map_host_pointer) {
        /* Free host memory. */
        if (cmem.free_map_host) {
          cuMemFreeHost(cmem.map_host_pointer);
          if (mem.host_pointer == mem.shared_pointer) {
            mem.host_pointer = 0;
          }
          mem.shared_pointer = 0;
        }

        map_host_used -= mem.device_size;
      }
      else {
        /* Free device memory. */
        cuMemFree(mem.device_pointer);
      }

      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

      cuda_mem_map.erase(cuda_mem_map.find(&mem));
    }
  }

  void tex_free(device_memory &mem)
  {
    if (mem.device_pointer) {
      CUDAContextScope scope(cuda_context);
      const CUDAMem &cmem = cuda_mem_map[&mem];

      if (cmem.texobject) {
        /* Free bindless texture. */
        cuTexObjectDestroy(cmem.texobject);
      }

      if (cmem.array) {
        /* Free array. */
        cuArrayDestroy(cmem.array);
        stats.mem_free(mem.device_size);
        mem.device_pointer = 0;
        mem.device_size = 0;

        cuda_mem_map.erase(cuda_mem_map.find(&mem));
      }
      else {
        generic_free(mem);
      }
    }
  }

  void move_textures_to_host(size_t size, bool for_texture)
  {
    /* Signal to reallocate textures in host memory only. */
    move_texture_to_host = true;

    while (size > 0) {
      /* Find suitable memory allocation to move. */
      device_memory *max_mem = NULL;
      size_t max_size = 0;
      bool max_is_image = false;

      foreach (auto &pair, cuda_mem_map) {
        device_memory &mem = *pair.first;
        CUDAMem *cmem = &pair.second;

        bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
        bool is_image = is_texture && (mem.data_height > 1);

        /* Can't move this type of memory. */
        if (!is_texture || cmem->array) {
          continue;
        }

        /* Already in host memory. */
        if (cmem->map_host_pointer) {
          continue;
        }

        /* For other textures, only move image textures. */
        if (for_texture && !is_image) {
          continue;
        }

        /* Try to move largest allocation, prefer moving images. */
        if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
          max_is_image = is_image;
          max_size = mem.device_size;
          max_mem = &mem;
        }
      }

      /* Move to host memory. This part is mutex protected since
       * multiple CUDA devices could be moving the memory. The
       * first one will do it, and the rest will adopt the pointer. */
      if (max_mem) {
        VLOG(1) << "Move memory from device to host: " << max_mem->name;

        static thread_mutex move_mutex;
        thread_scoped_lock lock(move_mutex);

        /* Preserve the original device pointer, in case of multi device
         * we can't change it because the pointer mapping would break. */
        device_ptr prev_pointer = max_mem->device_pointer;
        size_t prev_size = max_mem->device_size;

        tex_free(*max_mem);
        tex_alloc(*max_mem);
        size = (max_size >= size) ? 0 : size - max_size;

        max_mem->device_pointer = prev_pointer;
        max_mem->device_size = prev_size;
      }
      else {
        break;
      }
    }

    /* Update texture info array with new pointers. */
    update_texture_info();

    move_texture_to_host = false;
  }

  void const_copy_to(const char *name, void *host, size_t size) override
  {
    if (strcmp(name, "__data") == 0) {
      assert(size <= sizeof(KernelData));

      // Fix traversable handle on multi devices
      KernelData *const data = (KernelData *)host;
      *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;

      update_launch_params(name, offsetof(KernelParams, data), host, size);
    }
  }

  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
  {
    return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
  }

  void task_add(DeviceTask &task) override
  {
    // Upload texture information to device if it has changed since last launch
    update_texture_info();

    // Split task into smaller ones
    list<DeviceTask> tasks;
    task.split(tasks, info.cpu_threads);

    // Queue tasks in internal task pool
    struct OptiXDeviceTask : public DeviceTask {
      OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task)
      {
        // Using task index parameter instead of thread index, since number of CUDA streams may
        // differ from number of threads
        run = function_bind(&OptiXDevice::thread_run, device, *this, task_index);
      }
    };

    int task_index = 0;
    for (DeviceTask &task : tasks)
      task_pool.push(new OptiXDeviceTask(this, task, task_index++));
  }

  void task_wait() override
  {
    // Wait for all queued tasks to finish
    task_pool.wait_work();
  }

  void task_cancel() override
  {
    // Cancel any remaining tasks in the internal pool
    task_pool.cancel();
  }

#  define CUDA_GET_BLOCKSIZE(func, w, h) \
    int threads; \
    check_result_cuda_ret( \
        cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
    threads = (int)sqrt((float)threads); \
    int xblocks = ((w) + threads - 1) / threads; \
    int yblocks = ((h) + threads - 1) / threads;

#  define CUDA_LAUNCH_KERNEL(func, args) \
    check_result_cuda_ret(cuLaunchKernel( \
        func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0));

  /* Similar as above, but for 1-dimensional blocks. */
#  define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
    int threads; \
    check_result_cuda_ret( \
        cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
    int xblocks = ((w) + threads - 1) / threads; \
    int yblocks = h;

#  define CUDA_LAUNCH_KERNEL_1D(func, args) \
    check_result_cuda_ret(cuLaunchKernel( \
        func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0));

  bool denoising_non_local_means(device_ptr image_ptr,
                                 device_ptr guide_ptr,
                                 device_ptr variance_ptr,
                                 device_ptr out_ptr,
                                 DenoisingTask *task,
                                 int thread_index)
  {
    if (have_error())
      return false;

    int stride = task->buffer.stride;
    int w = task->buffer.width;
    int h = task->buffer.h;
    int r = task->nlm_state.r;
    int f = task->nlm_state.f;
    float a = task->nlm_state.a;
    float k_2 = task->nlm_state.k_2;

    int pass_stride = task->buffer.pass_stride;
    int num_shifts = (2 * r + 1) * (2 * r + 1);
    int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
    int frame_offset = 0;

    CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
    CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
    CUdeviceptr scale_ptr = 0;

    check_result_cuda_ret(
        cuMemsetD8Async(weightAccum, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
    check_result_cuda_ret(
        cuMemsetD8Async(out_ptr, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));

    {
      CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
      check_result_cuda_ret(cuModuleGetFunction(
          &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
      check_result_cuda_ret(
          cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
      check_result_cuda_ret(cuModuleGetFunction(
          &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
      check_result_cuda_ret(cuModuleGetFunction(
          &cuNLMUpdateOutput, cuda_filter_module, "kernel_cuda_filter_nlm_update_output"));

      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));

      CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);

      void *calc_difference_args[] = {&guide_ptr,
                                      &variance_ptr,
                                      &scale_ptr,
                                      &difference,
                                      &w,
                                      &h,
                                      &stride,
                                      &pass_stride,
                                      &r,
                                      &channel_offset,
                                      &frame_offset,
                                      &a,
                                      &k_2};
      void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
      void *calc_weight_args[] = {
          &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
      void *update_output_args[] = {&blurDifference,
                                    &image_ptr,
                                    &out_ptr,
                                    &weightAccum,
                                    &w,
                                    &h,
                                    &stride,
                                    &pass_stride,
                                    &channel_offset,
                                    &r,
                                    &f};

      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
      CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
    }

    {
      CUfunction cuNLMNormalize;
      check_result_cuda_ret(cuModuleGetFunction(
          &cuNLMNormalize, cuda_filter_module, "kernel_cuda_filter_nlm_normalize"));
      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
      void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
      CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
      CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
      check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
    }

    return !have_error();
  }

  bool denoising_construct_transform(DenoisingTask *task, int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterConstructTransform;
    check_result_cuda_ret(cuModuleGetFunction(&cuFilterConstructTransform,
                                              cuda_filter_module,
                                              "kernel_cuda_filter_construct_transform"));
    check_result_cuda_ret(
        cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
    CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);

    void *args[] = {&task->buffer.mem.device_pointer,
                    &task->tile_info_mem.device_pointer,
                    &task->storage.transform.device_pointer,
                    &task->storage.rank.device_pointer,
                    &task->filter_area,
                    &task->rect,
                    &task->radius,
                    &task->pca_threshold,
                    &task->buffer.pass_stride,
                    &task->buffer.frame_stride,
                    &task->buffer.use_time};
    CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
    check_result_cuda_ret(cuCtxSynchronize());

    return !have_error();
  }

  bool denoising_accumulate(device_ptr color_ptr,
                            device_ptr color_variance_ptr,
                            device_ptr scale_ptr,
                            int frame,
                            DenoisingTask *task,
                            int thread_index)
  {
    if (have_error())
      return false;

    int r = task->radius;
    int f = 4;
    float a = 1.0f;
    float k_2 = task->nlm_k_2;

    int w = task->reconstruction_state.source_w;
    int h = task->reconstruction_state.source_h;
    int stride = task->buffer.stride;
    int frame_offset = frame * task->buffer.frame_stride;
    int t = task->tile_info->frames[frame];

    int pass_stride = task->buffer.pass_stride;
    int num_shifts = (2 * r + 1) * (2 * r + 1);

    CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;

    CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
    check_result_cuda_ret(
        cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
    check_result_cuda_ret(cuModuleGetFunction(
        &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
    check_result_cuda_ret(cuModuleGetFunction(
        &cuNLMConstructGramian, cuda_filter_module, "kernel_cuda_filter_nlm_construct_gramian"));

    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
    check_result_cuda_ret(
        cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));

    CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
                          task->reconstruction_state.source_w *
                              task->reconstruction_state.source_h,
                          num_shifts);

    void *calc_difference_args[] = {&color_ptr,
                                    &color_variance_ptr,
                                    &scale_ptr,
                                    &difference,
                                    &w,
                                    &h,
                                    &stride,
                                    &pass_stride,
                                    &r,
                                    &pass_stride,
                                    &frame_offset,
                                    &a,
                                    &k_2};
    void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
    void *calc_weight_args[] = {
        &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
    void *construct_gramian_args[] = {&t,
                                      &blurDifference,
                                      &task->buffer.mem.device_pointer,
                                      &task->storage.transform.device_pointer,
                                      &task->storage.rank.device_pointer,
                                      &task->storage.XtWX.device_pointer,
                                      &task->storage.XtWY.device_pointer,
                                      &task->reconstruction_state.filter_window,
                                      &w,
                                      &h,
                                      &stride,
                                      &pass_stride,
                                      &r,
                                      &f,
                                      &frame_offset,
                                      &task->buffer.use_time};

    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
    CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
    check_result_cuda_ret(cuCtxSynchronize());

    return !have_error();
  }

  bool denoising_solve(device_ptr output_ptr, DenoisingTask *task, int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFinalize;
    check_result_cuda_ret(
        cuModuleGetFunction(&cuFinalize, cuda_filter_module, "kernel_cuda_filter_finalize"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
    void *finalize_args[] = {&output_ptr,
                             &task->storage.rank.device_pointer,
                             &task->storage.XtWX.device_pointer,
                             &task->storage.XtWY.device_pointer,
                             &task->filter_area,
                             &task->reconstruction_state.buffer_params.x,
                             &task->render_buffer.samples};
    CUDA_GET_BLOCKSIZE(
        cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
    CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_combine_halves(device_ptr a_ptr,
                                device_ptr b_ptr,
                                device_ptr mean_ptr,
                                device_ptr variance_ptr,
                                int r,
                                int4 rect,
                                DenoisingTask *task,
                                int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterCombineHalves;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterCombineHalves, cuda_filter_module, "kernel_cuda_filter_combine_halves"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(
        cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

    void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
    CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_divide_shadow(device_ptr a_ptr,
                               device_ptr b_ptr,
                               device_ptr sample_variance_ptr,
                               device_ptr sv_variance_ptr,
                               device_ptr buffer_variance_ptr,
                               DenoisingTask *task,
                               int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterDivideShadow;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterDivideShadow, cuda_filter_module, "kernel_cuda_filter_divide_shadow"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(
        cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

    void *args[] = {&task->render_buffer.samples,
                    &task->tile_info_mem.device_pointer,
                    &a_ptr,
                    &b_ptr,
                    &sample_variance_ptr,
                    &sv_variance_ptr,
                    &buffer_variance_ptr,
                    &task->rect,
                    &task->render_buffer.pass_stride,
                    &task->render_buffer.offset};
    CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_get_feature(int mean_offset,
                             int variance_offset,
                             device_ptr mean_ptr,
                             device_ptr variance_ptr,
                             float scale,
                             DenoisingTask *task,
                             int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterGetFeature;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterGetFeature, cuda_filter_module, "kernel_cuda_filter_get_feature"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(
        cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

    void *args[] = {&task->render_buffer.samples,
                    &task->tile_info_mem.device_pointer,
                    &mean_offset,
                    &variance_offset,
                    &mean_ptr,
                    &variance_ptr,
                    &scale,
                    &task->rect,
                    &task->render_buffer.pass_stride,
                    &task->render_buffer.offset};
    CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_write_feature(int out_offset,
                               device_ptr from_ptr,
                               device_ptr buffer_ptr,
                               DenoisingTask *task,
                               int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterWriteFeature;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterWriteFeature, cuda_filter_module, "kernel_cuda_filter_write_feature"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);

    void *args[] = {&task->render_buffer.samples,
                    &task->reconstruction_state.buffer_params,
                    &task->filter_area,
                    &from_ptr,
                    &buffer_ptr,
                    &out_offset,
                    &task->rect};
    CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }

  bool denoising_detect_outliers(device_ptr image_ptr,
                                 device_ptr variance_ptr,
                                 device_ptr depth_ptr,
                                 device_ptr output_ptr,
                                 DenoisingTask *task,
                                 int thread_index)
  {
    if (have_error())
      return false;

    CUfunction cuFilterDetectOutliers;
    check_result_cuda_ret(cuModuleGetFunction(
        &cuFilterDetectOutliers, cuda_filter_module, "kernel_cuda_filter_detect_outliers"));
    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
    CUDA_GET_BLOCKSIZE(
        cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

    void *args[] = {&image_ptr,
                    &variance_ptr,
                    &depth_ptr,
                    &output_ptr,
                    &task->rect,
                    &task->buffer.pass_stride};

    CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));

    return !have_error();
  }
};

bool device_optix_init()
{
  if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
    return true;  // Already initialized function table

  // Need to initialize CUDA as well
  if (!device_cuda_init())
    return false;

#  ifdef WITH_CUDA_DYNLOAD
  // Load NVRTC function pointers for adaptive kernel compilation
  if (DebugFlags().cuda.adaptive_compile && cuewInit(CUEW_INIT_NVRTC) != CUEW_SUCCESS) {
    VLOG(1)
        << "CUEW initialization failed for NVRTC. Adaptive kernel compilation won't be available.";
  }
#  endif

  const OptixResult result = optixInit();

  if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
    VLOG(1)
        << "OptiX initialization failed because the installed driver does not support ABI version "
        << OPTIX_ABI_VERSION;
    return false;
  }
  else if (result != OPTIX_SUCCESS) {
    VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
    return false;
  }

  // Loaded OptiX successfully!
  return true;
}

void device_optix_info(vector<DeviceInfo> &devices)
{
  // Simply add all supported CUDA devices as OptiX devices again
  vector<DeviceInfo> cuda_devices;
  device_cuda_info(cuda_devices);

  for (auto it = cuda_devices.begin(); it != cuda_devices.end();) {
    DeviceInfo &info = *it;
    assert(info.type == DEVICE_CUDA);
    info.type = DEVICE_OPTIX;
    info.id += "_OptiX";

    // Figure out RTX support
    CUdevice cuda_device = 0;
    CUcontext cuda_context = NULL;
    unsigned int rtcore_version = 0;
    if (cuDeviceGet(&cuda_device, info.num) == CUDA_SUCCESS &&
        cuDevicePrimaryCtxRetain(&cuda_context, cuda_device) == CUDA_SUCCESS) {
      OptixDeviceContext optix_context = NULL;
      if (optixDeviceContextCreate(cuda_context, nullptr, &optix_context) == OPTIX_SUCCESS) {
        optixDeviceContextGetProperty(optix_context,
                                      OPTIX_DEVICE_PROPERTY_RTCORE_VERSION,
                                      &rtcore_version,
                                      sizeof(rtcore_version));
        optixDeviceContextDestroy(optix_context);
      }
      cuDevicePrimaryCtxRelease(cuda_device);
    }

    // Only add devices with RTX support
    if (rtcore_version == 0)
      it = cuda_devices.erase(it);
    else
      ++it;
  }

  devices.insert(devices.end(), cuda_devices.begin(), cuda_devices.end());
}

Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
  return new OptiXDevice(info, stats, profiler, background);
}

CCL_NAMESPACE_END

#endif