Cycles: Add OptiX AI denoiser support
This patch adds support for the OptiX denoiser as an alternative to the existing NLM denoiser in Cycles. It's re-using the same denoising architecture based on tiles and therefore implicitly also works with multiple GPUs. Reviewed By: sergey Differential Revision: https://developer.blender.org/D6395
This commit is contained in:
@@ -197,6 +197,12 @@ enum_aov_types = (
|
|||||||
('COLOR', "Color", "Write a Color pass", 1),
|
('COLOR', "Color", "Write a Color pass", 1),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
enum_denoising_optix_input_passes= (
|
||||||
|
('RGB', "Color", "Use only color as input", 1),
|
||||||
|
('RGB_ALBEDO', "Color + Albedo", "Use color and albedo data as input", 2),
|
||||||
|
('RGB_ALBEDO_NORMAL', "Color + Albedo + Normal", "Use color, albedo and normal data as input", 3),
|
||||||
|
)
|
||||||
|
|
||||||
class CyclesRenderSettings(bpy.types.PropertyGroup):
|
class CyclesRenderSettings(bpy.types.PropertyGroup):
|
||||||
|
|
||||||
device: EnumProperty(
|
device: EnumProperty(
|
||||||
@@ -1279,6 +1285,7 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
|
|||||||
default=False,
|
default=False,
|
||||||
update=update_render_passes,
|
update=update_render_passes,
|
||||||
)
|
)
|
||||||
|
|
||||||
use_pass_volume_direct: BoolProperty(
|
use_pass_volume_direct: BoolProperty(
|
||||||
name="Volume Direct",
|
name="Volume Direct",
|
||||||
description="Deliver direct volumetric scattering pass",
|
description="Deliver direct volumetric scattering pass",
|
||||||
@@ -1298,6 +1305,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
|
|||||||
default=False,
|
default=False,
|
||||||
update=update_render_passes,
|
update=update_render_passes,
|
||||||
)
|
)
|
||||||
|
use_optix_denoising: BoolProperty(
|
||||||
|
name="Use OptiX AI Denoising",
|
||||||
|
description="Denoise the rendered image with the OptiX AI denoiser",
|
||||||
|
default=False,
|
||||||
|
update=update_render_passes,
|
||||||
|
)
|
||||||
denoising_diffuse_direct: BoolProperty(
|
denoising_diffuse_direct: BoolProperty(
|
||||||
name="Diffuse Direct",
|
name="Diffuse Direct",
|
||||||
description="Denoise the direct diffuse lighting",
|
description="Denoise the direct diffuse lighting",
|
||||||
@@ -1374,6 +1387,13 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
|
|||||||
min=0, max=7,
|
min=0, max=7,
|
||||||
default=0,
|
default=0,
|
||||||
)
|
)
|
||||||
|
denoising_optix_input_passes: EnumProperty(
|
||||||
|
name="Input Passes",
|
||||||
|
description="Controls which passes the OptiX AI denoiser should use as input, which can have different effects on the denoised image",
|
||||||
|
items=enum_denoising_optix_input_passes,
|
||||||
|
default='RGB',
|
||||||
|
)
|
||||||
|
|
||||||
use_pass_crypto_object: BoolProperty(
|
use_pass_crypto_object: BoolProperty(
|
||||||
name="Cryptomatte Object",
|
name="Cryptomatte Object",
|
||||||
description="Render cryptomatte object pass, for isolating objects in compositing",
|
description="Render cryptomatte object pass, for isolating objects in compositing",
|
||||||
|
@@ -979,11 +979,21 @@ class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
|
|||||||
split = layout.split()
|
split = layout.split()
|
||||||
split.active = cycles_view_layer.use_denoising
|
split.active = cycles_view_layer.use_denoising
|
||||||
|
|
||||||
layout = layout.column(align=True)
|
col = split.column(align=True)
|
||||||
layout.prop(cycles_view_layer, "denoising_radius", text="Radius")
|
|
||||||
layout.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength")
|
if use_optix(context):
|
||||||
layout.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength")
|
col.prop(cycles_view_layer, "use_optix_denoising", text="OptiX AI Denoising")
|
||||||
layout.prop(cycles_view_layer, "denoising_relative_pca")
|
|
||||||
|
if cycles_view_layer.use_optix_denoising:
|
||||||
|
col.prop(cycles_view_layer, "denoising_optix_input_passes")
|
||||||
|
return
|
||||||
|
|
||||||
|
col.separator(factor=2.0)
|
||||||
|
|
||||||
|
col.prop(cycles_view_layer, "denoising_radius", text="Radius")
|
||||||
|
col.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength")
|
||||||
|
col.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength")
|
||||||
|
col.prop(cycles_view_layer, "denoising_relative_pca")
|
||||||
|
|
||||||
layout.separator()
|
layout.separator()
|
||||||
|
|
||||||
|
@@ -478,23 +478,24 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
|
|||||||
buffer_params.passes = passes;
|
buffer_params.passes = passes;
|
||||||
|
|
||||||
PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
|
PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
|
||||||
bool full_denoising = get_boolean(crl, "use_denoising");
|
bool use_denoising = get_boolean(crl, "use_denoising");
|
||||||
|
bool use_optix_denoising = get_boolean(crl, "use_optix_denoising");
|
||||||
bool write_denoising_passes = get_boolean(crl, "denoising_store_passes");
|
bool write_denoising_passes = get_boolean(crl, "denoising_store_passes");
|
||||||
|
|
||||||
bool run_denoising = full_denoising || write_denoising_passes;
|
buffer_params.denoising_data_pass = use_denoising || write_denoising_passes;
|
||||||
|
|
||||||
session->tile_manager.schedule_denoising = run_denoising;
|
|
||||||
buffer_params.denoising_data_pass = run_denoising;
|
|
||||||
buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
|
buffer_params.denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
|
||||||
buffer_params.denoising_prefiltered_pass = write_denoising_passes;
|
buffer_params.denoising_prefiltered_pass = write_denoising_passes && !use_optix_denoising;
|
||||||
|
|
||||||
session->params.run_denoising = run_denoising;
|
session->params.run_denoising = use_denoising || write_denoising_passes;
|
||||||
session->params.full_denoising = full_denoising;
|
session->params.full_denoising = use_denoising && !use_optix_denoising;
|
||||||
session->params.write_denoising_passes = write_denoising_passes;
|
session->params.optix_denoising = use_denoising && use_optix_denoising;
|
||||||
|
session->params.write_denoising_passes = write_denoising_passes && !use_optix_denoising;
|
||||||
session->params.denoising.radius = get_int(crl, "denoising_radius");
|
session->params.denoising.radius = get_int(crl, "denoising_radius");
|
||||||
session->params.denoising.strength = get_float(crl, "denoising_strength");
|
session->params.denoising.strength = get_float(crl, "denoising_strength");
|
||||||
session->params.denoising.feature_strength = get_float(crl, "denoising_feature_strength");
|
session->params.denoising.feature_strength = get_float(crl, "denoising_feature_strength");
|
||||||
session->params.denoising.relative_pca = get_boolean(crl, "denoising_relative_pca");
|
session->params.denoising.relative_pca = get_boolean(crl, "denoising_relative_pca");
|
||||||
|
session->params.denoising.optix_input_passes = get_enum(crl, "denoising_optix_input_passes");
|
||||||
|
session->tile_manager.schedule_denoising = session->params.run_denoising;
|
||||||
|
|
||||||
scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
|
scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
|
||||||
scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass;
|
scene->film->denoising_clean_pass = buffer_params.denoising_clean_pass;
|
||||||
|
@@ -535,23 +535,26 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
|
|||||||
}
|
}
|
||||||
|
|
||||||
PointerRNA crp = RNA_pointer_get(&b_view_layer.ptr, "cycles");
|
PointerRNA crp = RNA_pointer_get(&b_view_layer.ptr, "cycles");
|
||||||
bool full_denoising = get_boolean(crp, "use_denoising");
|
bool use_denoising = get_boolean(crp, "use_denoising");
|
||||||
|
bool use_optix_denoising = get_boolean(crp, "use_optix_denoising");
|
||||||
bool write_denoising_passes = get_boolean(crp, "denoising_store_passes");
|
bool write_denoising_passes = get_boolean(crp, "denoising_store_passes");
|
||||||
|
|
||||||
scene->film->denoising_flags = 0;
|
scene->film->denoising_flags = 0;
|
||||||
if (full_denoising || write_denoising_passes) {
|
if (use_denoising || write_denoising_passes) {
|
||||||
|
if (!use_optix_denoising) {
|
||||||
#define MAP_OPTION(name, flag) \
|
#define MAP_OPTION(name, flag) \
|
||||||
if (!get_boolean(crp, name)) \
|
if (!get_boolean(crp, name)) \
|
||||||
scene->film->denoising_flags |= flag;
|
scene->film->denoising_flags |= flag;
|
||||||
MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
|
MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
|
||||||
MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
|
MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
|
||||||
MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR);
|
MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR);
|
||||||
MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND);
|
MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND);
|
||||||
MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR);
|
MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR);
|
||||||
MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND);
|
MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND);
|
||||||
MAP_OPTION("denoising_subsurface_direct", DENOISING_CLEAN_SUBSURFACE_DIR);
|
MAP_OPTION("denoising_subsurface_direct", DENOISING_CLEAN_SUBSURFACE_DIR);
|
||||||
MAP_OPTION("denoising_subsurface_indirect", DENOISING_CLEAN_SUBSURFACE_IND);
|
MAP_OPTION("denoising_subsurface_indirect", DENOISING_CLEAN_SUBSURFACE_IND);
|
||||||
#undef MAP_OPTION
|
#undef MAP_OPTION
|
||||||
|
}
|
||||||
b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
|
b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -559,14 +562,17 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
|
|||||||
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
|
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
|
||||||
b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
|
b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
|
||||||
b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str());
|
b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str());
|
||||||
b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str());
|
if (!use_optix_denoising) {
|
||||||
b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str());
|
b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str());
|
||||||
b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str());
|
b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str());
|
||||||
|
b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str());
|
||||||
|
}
|
||||||
|
|
||||||
if (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) {
|
if (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES) {
|
||||||
b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str());
|
b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __KERNEL_DEBUG__
|
#ifdef __KERNEL_DEBUG__
|
||||||
if (get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
|
if (get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
|
||||||
b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_view_layer.name().c_str());
|
b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_view_layer.name().c_str());
|
||||||
|
@@ -42,6 +42,9 @@
|
|||||||
# include <optix_stubs.h>
|
# include <optix_stubs.h>
|
||||||
# include <optix_function_table_definition.h>
|
# include <optix_function_table_definition.h>
|
||||||
|
|
||||||
|
// TODO(pmours): Disable this once drivers have native support
|
||||||
|
# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
|
||||||
|
|
||||||
CCL_NAMESPACE_BEGIN
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
/* Make sure this stays in sync with kernel_globals.h */
|
/* Make sure this stays in sync with kernel_globals.h */
|
||||||
@@ -107,6 +110,30 @@ struct KernelParams {
|
|||||||
} \
|
} \
|
||||||
(void)0
|
(void)0
|
||||||
|
|
||||||
|
# define CUDA_GET_BLOCKSIZE(func, w, h) \
|
||||||
|
int threads; \
|
||||||
|
check_result_cuda_ret( \
|
||||||
|
cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
|
||||||
|
threads = (int)sqrt((float)threads); \
|
||||||
|
int xblocks = ((w) + threads - 1) / threads; \
|
||||||
|
int yblocks = ((h) + threads - 1) / threads;
|
||||||
|
|
||||||
|
# define CUDA_LAUNCH_KERNEL(func, args) \
|
||||||
|
check_result_cuda_ret(cuLaunchKernel( \
|
||||||
|
func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0));
|
||||||
|
|
||||||
|
/* Similar as above, but for 1-dimensional blocks. */
|
||||||
|
# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
|
||||||
|
int threads; \
|
||||||
|
check_result_cuda_ret( \
|
||||||
|
cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
|
||||||
|
int xblocks = ((w) + threads - 1) / threads; \
|
||||||
|
int yblocks = h;
|
||||||
|
|
||||||
|
# define CUDA_LAUNCH_KERNEL_1D(func, args) \
|
||||||
|
check_result_cuda_ret(cuLaunchKernel( \
|
||||||
|
func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0));
|
||||||
|
|
||||||
class OptiXDevice : public Device {
|
class OptiXDevice : public Device {
|
||||||
|
|
||||||
// List of OptiX program groups
|
// List of OptiX program groups
|
||||||
@@ -186,6 +213,9 @@ class OptiXDevice : public Device {
|
|||||||
map<device_memory *, CUDAMem> cuda_mem_map;
|
map<device_memory *, CUDAMem> cuda_mem_map;
|
||||||
bool move_texture_to_host = false;
|
bool move_texture_to_host = false;
|
||||||
|
|
||||||
|
OptixDenoiser denoiser = NULL;
|
||||||
|
vector<pair<int2, CUdeviceptr>> denoiser_state;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
|
OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
|
||||||
: Device(info_, stats_, profiler_, background_),
|
: Device(info_, stats_, profiler_, background_),
|
||||||
@@ -262,6 +292,9 @@ class OptiXDevice : public Device {
|
|||||||
launch_params.data_elements = sizeof(KernelParams);
|
launch_params.data_elements = sizeof(KernelParams);
|
||||||
// Allocate launch parameter buffer memory on device
|
// Allocate launch parameter buffer memory on device
|
||||||
launch_params.alloc_to_device(info.cpu_threads);
|
launch_params.alloc_to_device(info.cpu_threads);
|
||||||
|
|
||||||
|
// Create denoiser state entries for all threads (but do not allocate yet)
|
||||||
|
denoiser_state.resize(info.cpu_threads);
|
||||||
}
|
}
|
||||||
~OptiXDevice()
|
~OptiXDevice()
|
||||||
{
|
{
|
||||||
@@ -272,7 +305,11 @@ class OptiXDevice : public Device {
|
|||||||
for (CUdeviceptr mem : as_mem) {
|
for (CUdeviceptr mem : as_mem) {
|
||||||
cuMemFree(mem);
|
cuMemFree(mem);
|
||||||
}
|
}
|
||||||
as_mem.clear();
|
|
||||||
|
// Free denoiser state for all threads
|
||||||
|
for (const pair<int2, CUdeviceptr> &state : denoiser_state) {
|
||||||
|
cuMemFree(state.second);
|
||||||
|
}
|
||||||
|
|
||||||
sbt_data.free();
|
sbt_data.free();
|
||||||
texture_info.free();
|
texture_info.free();
|
||||||
@@ -296,6 +333,9 @@ class OptiXDevice : public Device {
|
|||||||
for (CUstream stream : cuda_stream)
|
for (CUstream stream : cuda_stream)
|
||||||
cuStreamDestroy(stream);
|
cuStreamDestroy(stream);
|
||||||
|
|
||||||
|
if (denoiser != NULL)
|
||||||
|
optixDenoiserDestroy(denoiser);
|
||||||
|
|
||||||
// Destroy OptiX and CUDA context
|
// Destroy OptiX and CUDA context
|
||||||
optixDeviceContextDestroy(context);
|
optixDeviceContextDestroy(context);
|
||||||
cuDevicePrimaryCtxRelease(cuda_device);
|
cuDevicePrimaryCtxRelease(cuda_device);
|
||||||
@@ -686,46 +726,298 @@ class OptiXDevice : public Device {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index)
|
bool launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index)
|
||||||
{
|
{
|
||||||
|
int total_samples = rtile.start_sample + rtile.num_samples;
|
||||||
|
|
||||||
const CUDAContextScope scope(cuda_context);
|
const CUDAContextScope scope(cuda_context);
|
||||||
|
|
||||||
// Run CUDA denoising kernels
|
// Choose between OptiX and NLM denoising
|
||||||
DenoisingTask denoising(this, task);
|
if (task.denoising_use_optix) {
|
||||||
denoising.functions.construct_transform = function_bind(
|
// Map neighboring tiles onto this device, indices are as following:
|
||||||
&OptiXDevice::denoising_construct_transform, this, &denoising, thread_index);
|
// Where index 4 is the center tile and index 9 is the target for the result.
|
||||||
denoising.functions.accumulate = function_bind(
|
// 0 1 2
|
||||||
&OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index);
|
// 3 4 5
|
||||||
denoising.functions.solve = function_bind(
|
// 6 7 8 9
|
||||||
&OptiXDevice::denoising_solve, this, _1, &denoising, thread_index);
|
RenderTile rtiles[10];
|
||||||
denoising.functions.divide_shadow = function_bind(
|
rtiles[4] = rtile;
|
||||||
&OptiXDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising, thread_index);
|
task.map_neighbor_tiles(rtiles, this);
|
||||||
denoising.functions.non_local_means = function_bind(
|
|
||||||
&OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index);
|
|
||||||
denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves,
|
|
||||||
this,
|
|
||||||
_1,
|
|
||||||
_2,
|
|
||||||
_3,
|
|
||||||
_4,
|
|
||||||
_5,
|
|
||||||
_6,
|
|
||||||
&denoising,
|
|
||||||
thread_index);
|
|
||||||
denoising.functions.get_feature = function_bind(
|
|
||||||
&OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index);
|
|
||||||
denoising.functions.write_feature = function_bind(
|
|
||||||
&OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index);
|
|
||||||
denoising.functions.detect_outliers = function_bind(
|
|
||||||
&OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index);
|
|
||||||
|
|
||||||
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
|
// Calculate size of the tile to denoise (including overlap)
|
||||||
denoising.render_buffer.samples = rtile.sample = rtile.start_sample + rtile.num_samples;
|
int4 rect = make_int4(
|
||||||
denoising.buffer.gpu_temporary_mem = true;
|
rtiles[4].x, rtiles[4].y, rtiles[4].x + rtiles[4].w, rtiles[4].y + rtiles[4].h);
|
||||||
|
// Overlap between tiles has to be at least 64 pixels
|
||||||
|
// TODO(pmours): Query this value from OptiX
|
||||||
|
rect = rect_expand(rect, 64);
|
||||||
|
int4 clip_rect = make_int4(
|
||||||
|
rtiles[3].x, rtiles[1].y, rtiles[5].x + rtiles[5].w, rtiles[7].y + rtiles[7].h);
|
||||||
|
rect = rect_clip(rect, clip_rect);
|
||||||
|
int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
|
||||||
|
int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
|
||||||
|
|
||||||
denoising.run_denoising(&rtile);
|
// Calculate byte offsets and strides
|
||||||
|
int pixel_stride = task.pass_stride * (int)sizeof(float);
|
||||||
|
int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
|
||||||
|
const int pass_offset[3] = {
|
||||||
|
(task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
|
||||||
|
(task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
|
||||||
|
(task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
|
||||||
|
|
||||||
|
// Start with the current tile pointer offset
|
||||||
|
int input_stride = pixel_stride;
|
||||||
|
device_ptr input_ptr = rtile.buffer + pixel_offset;
|
||||||
|
|
||||||
|
// Copy tile data into a common buffer if necessary
|
||||||
|
device_only_memory<float> input(this, "denoiser input");
|
||||||
|
device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE);
|
||||||
|
|
||||||
|
if ((!rtiles[0].buffer || rtiles[0].buffer == rtile.buffer) &&
|
||||||
|
(!rtiles[1].buffer || rtiles[1].buffer == rtile.buffer) &&
|
||||||
|
(!rtiles[2].buffer || rtiles[2].buffer == rtile.buffer) &&
|
||||||
|
(!rtiles[3].buffer || rtiles[3].buffer == rtile.buffer) &&
|
||||||
|
(!rtiles[5].buffer || rtiles[5].buffer == rtile.buffer) &&
|
||||||
|
(!rtiles[6].buffer || rtiles[6].buffer == rtile.buffer) &&
|
||||||
|
(!rtiles[7].buffer || rtiles[7].buffer == rtile.buffer) &&
|
||||||
|
(!rtiles[8].buffer || rtiles[8].buffer == rtile.buffer)) {
|
||||||
|
// Tiles are in continous memory, so can just subtract overlap offset
|
||||||
|
input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
|
||||||
|
// Stride covers the whole width of the image and not just a single tile
|
||||||
|
input_stride *= rtile.stride;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Adjacent tiles are in separate memory regions, so need to copy them into a single one
|
||||||
|
input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
|
||||||
|
// Start with the new input buffer
|
||||||
|
input_ptr = input.device_pointer;
|
||||||
|
// Stride covers the width of the new input buffer, which includes tile width and overlap
|
||||||
|
input_stride *= rect_size.x;
|
||||||
|
|
||||||
|
TileInfo *tile_info = tile_info_mem.alloc(1);
|
||||||
|
for (int i = 0; i < 9; i++) {
|
||||||
|
tile_info->offsets[i] = rtiles[i].offset;
|
||||||
|
tile_info->strides[i] = rtiles[i].stride;
|
||||||
|
tile_info->buffers[i] = rtiles[i].buffer;
|
||||||
|
}
|
||||||
|
tile_info->x[0] = rtiles[3].x;
|
||||||
|
tile_info->x[1] = rtiles[4].x;
|
||||||
|
tile_info->x[2] = rtiles[5].x;
|
||||||
|
tile_info->x[3] = rtiles[5].x + rtiles[5].w;
|
||||||
|
tile_info->y[0] = rtiles[1].y;
|
||||||
|
tile_info->y[1] = rtiles[4].y;
|
||||||
|
tile_info->y[2] = rtiles[7].y;
|
||||||
|
tile_info->y[3] = rtiles[7].y + rtiles[7].h;
|
||||||
|
tile_info_mem.copy_to_device();
|
||||||
|
|
||||||
|
CUfunction filter_copy_func;
|
||||||
|
check_result_cuda_ret(cuModuleGetFunction(
|
||||||
|
&filter_copy_func, cuda_filter_module, "kernel_cuda_filter_copy_input"));
|
||||||
|
check_result_cuda_ret(cuFuncSetCacheConfig(filter_copy_func, CU_FUNC_CACHE_PREFER_L1));
|
||||||
|
|
||||||
|
void *args[] = {
|
||||||
|
&input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
|
||||||
|
CUDA_GET_BLOCKSIZE(filter_copy_func, rect_size.x, rect_size.y);
|
||||||
|
CUDA_LAUNCH_KERNEL(filter_copy_func, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
||||||
|
device_only_memory<float> input_rgb(this, "denoiser input rgb");
|
||||||
|
{
|
||||||
|
input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 *
|
||||||
|
task.denoising.optix_input_passes);
|
||||||
|
|
||||||
|
CUfunction convert_to_rgb_func;
|
||||||
|
check_result_cuda_ret(cuModuleGetFunction(
|
||||||
|
&convert_to_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_to_rgb"));
|
||||||
|
check_result_cuda_ret(cuFuncSetCacheConfig(convert_to_rgb_func, CU_FUNC_CACHE_PREFER_L1));
|
||||||
|
|
||||||
|
void *args[] = {&input_rgb.device_pointer,
|
||||||
|
&input_ptr,
|
||||||
|
&rect_size.x,
|
||||||
|
&rect_size.y,
|
||||||
|
&input_stride,
|
||||||
|
&task.pass_stride,
|
||||||
|
const_cast<int *>(pass_offset),
|
||||||
|
&task.denoising.optix_input_passes,
|
||||||
|
&total_samples};
|
||||||
|
CUDA_GET_BLOCKSIZE(convert_to_rgb_func, rect_size.x, rect_size.y);
|
||||||
|
CUDA_LAUNCH_KERNEL(convert_to_rgb_func, args);
|
||||||
|
|
||||||
|
input_ptr = input_rgb.device_pointer;
|
||||||
|
pixel_stride = 3 * sizeof(float);
|
||||||
|
input_stride = rect_size.x * pixel_stride;
|
||||||
|
}
|
||||||
|
# endif
|
||||||
|
|
||||||
|
if (denoiser == NULL) {
|
||||||
|
// Create OptiX denoiser handle on demand when it is first used
|
||||||
|
OptixDenoiserOptions denoiser_options;
|
||||||
|
assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3);
|
||||||
|
denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
|
||||||
|
OPTIX_DENOISER_INPUT_RGB + (task.denoising.optix_input_passes - 1));
|
||||||
|
denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
|
||||||
|
check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
|
||||||
|
check_result_optix_ret(
|
||||||
|
optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
OptixDenoiserSizes sizes = {};
|
||||||
|
check_result_optix_ret(
|
||||||
|
optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
|
||||||
|
|
||||||
|
auto &state = denoiser_state[thread_index].second;
|
||||||
|
auto &state_size = denoiser_state[thread_index].first;
|
||||||
|
const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
|
||||||
|
const size_t scratch_offset = sizes.stateSizeInBytes;
|
||||||
|
|
||||||
|
// Allocate denoiser state if tile size has changed since last setup
|
||||||
|
if (state_size.x != rect_size.x || state_size.y != rect_size.y) {
|
||||||
|
if (state) {
|
||||||
|
cuMemFree(state);
|
||||||
|
state = 0;
|
||||||
|
}
|
||||||
|
check_result_cuda_ret(cuMemAlloc(&state, scratch_offset + scratch_size));
|
||||||
|
|
||||||
|
check_result_optix_ret(optixDenoiserSetup(denoiser,
|
||||||
|
cuda_stream[thread_index],
|
||||||
|
rect_size.x,
|
||||||
|
rect_size.y,
|
||||||
|
state,
|
||||||
|
scratch_offset,
|
||||||
|
state + scratch_offset,
|
||||||
|
scratch_size));
|
||||||
|
|
||||||
|
state_size = rect_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set up input and output layer information
|
||||||
|
OptixImage2D input_layers[3] = {};
|
||||||
|
OptixImage2D output_layers[1] = {};
|
||||||
|
|
||||||
|
for (int i = 0; i < 3; ++i) {
|
||||||
|
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
||||||
|
input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
|
||||||
|
# else
|
||||||
|
input_layers[i].data = input_ptr + pass_offset[i];
|
||||||
|
# endif
|
||||||
|
input_layers[i].width = rect_size.x;
|
||||||
|
input_layers[i].height = rect_size.y;
|
||||||
|
input_layers[i].rowStrideInBytes = input_stride;
|
||||||
|
input_layers[i].pixelStrideInBytes = pixel_stride;
|
||||||
|
input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
|
||||||
|
}
|
||||||
|
|
||||||
|
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
||||||
|
output_layers[0].data = input_ptr;
|
||||||
|
output_layers[0].width = rect_size.x;
|
||||||
|
output_layers[0].height = rect_size.y;
|
||||||
|
output_layers[0].rowStrideInBytes = input_stride;
|
||||||
|
output_layers[0].pixelStrideInBytes = pixel_stride;
|
||||||
|
int2 output_offset = overlap_offset;
|
||||||
|
overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually
|
||||||
|
# else
|
||||||
|
output_layers[0].data = rtiles[9].buffer + pixel_offset;
|
||||||
|
output_layers[0].width = rtiles[9].w;
|
||||||
|
output_layers[0].height = rtiles[9].h;
|
||||||
|
output_layers[0].rowStrideInBytes = rtiles[9].stride * pixel_stride;
|
||||||
|
output_layers[0].pixelStrideInBytes = pixel_stride;
|
||||||
|
# endif
|
||||||
|
output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
|
||||||
|
|
||||||
|
// Finally run denonising
|
||||||
|
OptixDenoiserParams params = {}; // All parameters are disabled/zero
|
||||||
|
check_result_optix_ret(optixDenoiserInvoke(denoiser,
|
||||||
|
cuda_stream[thread_index],
|
||||||
|
¶ms,
|
||||||
|
state,
|
||||||
|
scratch_offset,
|
||||||
|
input_layers,
|
||||||
|
task.denoising.optix_input_passes,
|
||||||
|
overlap_offset.x,
|
||||||
|
overlap_offset.y,
|
||||||
|
output_layers,
|
||||||
|
state + scratch_offset,
|
||||||
|
scratch_size));
|
||||||
|
|
||||||
|
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
|
||||||
|
{
|
||||||
|
CUfunction convert_from_rgb_func;
|
||||||
|
check_result_cuda_ret(cuModuleGetFunction(
|
||||||
|
&convert_from_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_from_rgb"));
|
||||||
|
check_result_cuda_ret(
|
||||||
|
cuFuncSetCacheConfig(convert_from_rgb_func, CU_FUNC_CACHE_PREFER_L1));
|
||||||
|
|
||||||
|
void *args[] = {&input_ptr,
|
||||||
|
&rtiles[9].buffer,
|
||||||
|
&output_offset.x,
|
||||||
|
&output_offset.y,
|
||||||
|
&rect_size.x,
|
||||||
|
&rect_size.y,
|
||||||
|
&rtiles[9].x,
|
||||||
|
&rtiles[9].y,
|
||||||
|
&rtiles[9].w,
|
||||||
|
&rtiles[9].h,
|
||||||
|
&rtiles[9].offset,
|
||||||
|
&rtiles[9].stride,
|
||||||
|
&task.pass_stride};
|
||||||
|
CUDA_GET_BLOCKSIZE(convert_from_rgb_func, rtiles[9].w, rtiles[9].h);
|
||||||
|
CUDA_LAUNCH_KERNEL(convert_from_rgb_func, args);
|
||||||
|
}
|
||||||
|
# endif
|
||||||
|
|
||||||
|
check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
|
||||||
|
|
||||||
|
task.unmap_neighbor_tiles(rtiles, this);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Run CUDA denoising kernels
|
||||||
|
DenoisingTask denoising(this, task);
|
||||||
|
denoising.functions.construct_transform = function_bind(
|
||||||
|
&OptiXDevice::denoising_construct_transform, this, &denoising, thread_index);
|
||||||
|
denoising.functions.accumulate = function_bind(
|
||||||
|
&OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index);
|
||||||
|
denoising.functions.solve = function_bind(
|
||||||
|
&OptiXDevice::denoising_solve, this, _1, &denoising, thread_index);
|
||||||
|
denoising.functions.divide_shadow = function_bind(&OptiXDevice::denoising_divide_shadow,
|
||||||
|
this,
|
||||||
|
_1,
|
||||||
|
_2,
|
||||||
|
_3,
|
||||||
|
_4,
|
||||||
|
_5,
|
||||||
|
&denoising,
|
||||||
|
thread_index);
|
||||||
|
denoising.functions.non_local_means = function_bind(
|
||||||
|
&OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index);
|
||||||
|
denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves,
|
||||||
|
this,
|
||||||
|
_1,
|
||||||
|
_2,
|
||||||
|
_3,
|
||||||
|
_4,
|
||||||
|
_5,
|
||||||
|
_6,
|
||||||
|
&denoising,
|
||||||
|
thread_index);
|
||||||
|
denoising.functions.get_feature = function_bind(
|
||||||
|
&OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index);
|
||||||
|
denoising.functions.write_feature = function_bind(
|
||||||
|
&OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index);
|
||||||
|
denoising.functions.detect_outliers = function_bind(
|
||||||
|
&OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index);
|
||||||
|
|
||||||
|
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
|
||||||
|
denoising.render_buffer.samples = total_samples;
|
||||||
|
denoising.buffer.gpu_temporary_mem = true;
|
||||||
|
|
||||||
|
denoising.run_denoising(&rtile);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update current sample, so it is displayed correctly
|
||||||
|
rtile.sample = total_samples;
|
||||||
|
// Update task progress after the denoiser completed processing
|
||||||
task.update_progress(&rtile, rtile.w * rtile.h);
|
task.update_progress(&rtile, rtile.w * rtile.h);
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void launch_shader_eval(DeviceTask &task, int thread_index)
|
void launch_shader_eval(DeviceTask &task, int thread_index)
|
||||||
@@ -1899,30 +2191,6 @@ class OptiXDevice : public Device {
|
|||||||
task_pool.cancel();
|
task_pool.cancel();
|
||||||
}
|
}
|
||||||
|
|
||||||
# define CUDA_GET_BLOCKSIZE(func, w, h) \
|
|
||||||
int threads; \
|
|
||||||
check_result_cuda_ret( \
|
|
||||||
cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
|
|
||||||
threads = (int)sqrt((float)threads); \
|
|
||||||
int xblocks = ((w) + threads - 1) / threads; \
|
|
||||||
int yblocks = ((h) + threads - 1) / threads;
|
|
||||||
|
|
||||||
# define CUDA_LAUNCH_KERNEL(func, args) \
|
|
||||||
check_result_cuda_ret(cuLaunchKernel( \
|
|
||||||
func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0));
|
|
||||||
|
|
||||||
/* Similar as above, but for 1-dimensional blocks. */
|
|
||||||
# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
|
|
||||||
int threads; \
|
|
||||||
check_result_cuda_ret( \
|
|
||||||
cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
|
|
||||||
int xblocks = ((w) + threads - 1) / threads; \
|
|
||||||
int yblocks = h;
|
|
||||||
|
|
||||||
# define CUDA_LAUNCH_KERNEL_1D(func, args) \
|
|
||||||
check_result_cuda_ret(cuLaunchKernel( \
|
|
||||||
func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0));
|
|
||||||
|
|
||||||
bool denoising_non_local_means(device_ptr image_ptr,
|
bool denoising_non_local_means(device_ptr image_ptr,
|
||||||
device_ptr guide_ptr,
|
device_ptr guide_ptr,
|
||||||
device_ptr variance_ptr,
|
device_ptr variance_ptr,
|
||||||
@@ -2341,9 +2609,8 @@ bool device_optix_init()
|
|||||||
const OptixResult result = optixInit();
|
const OptixResult result = optixInit();
|
||||||
|
|
||||||
if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
|
if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
|
||||||
VLOG(1)
|
VLOG(1) << "OptiX initialization failed because driver does not support ABI version "
|
||||||
<< "OptiX initialization failed because the installed driver does not support ABI version "
|
<< OPTIX_ABI_VERSION;
|
||||||
<< OPTIX_ABI_VERSION;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else if (result != OPTIX_SUCCESS) {
|
else if (result != OPTIX_SUCCESS) {
|
||||||
|
@@ -47,6 +47,8 @@ class DenoiseParams {
|
|||||||
int neighbor_frames;
|
int neighbor_frames;
|
||||||
/* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
|
/* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
|
||||||
bool clamp_input;
|
bool clamp_input;
|
||||||
|
/* Controls which passes the OptiX AI denoiser should use as input. */
|
||||||
|
int optix_input_passes;
|
||||||
|
|
||||||
DenoiseParams()
|
DenoiseParams()
|
||||||
{
|
{
|
||||||
@@ -56,6 +58,7 @@ class DenoiseParams {
|
|||||||
relative_pca = false;
|
relative_pca = false;
|
||||||
neighbor_frames = 2;
|
neighbor_frames = 2;
|
||||||
clamp_input = true;
|
clamp_input = true;
|
||||||
|
optix_input_passes = 1;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -100,6 +103,7 @@ class DeviceTask : public Task {
|
|||||||
vector<int> denoising_frames;
|
vector<int> denoising_frames;
|
||||||
|
|
||||||
bool denoising_do_filter;
|
bool denoising_do_filter;
|
||||||
|
bool denoising_use_optix;
|
||||||
bool denoising_write_passes;
|
bool denoising_write_passes;
|
||||||
|
|
||||||
int pass_stride;
|
int pass_stride;
|
||||||
|
@@ -91,6 +91,10 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
|
|||||||
if (sum_weight != 0.0f) {
|
if (sum_weight != 0.0f) {
|
||||||
normal /= sum_weight;
|
normal /= sum_weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Transform normal into camera space. */
|
||||||
|
normal = transform_direction(&kernel_data.cam.worldtocamera, normal);
|
||||||
|
|
||||||
L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
|
L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
|
||||||
L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
|
L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
|
||||||
|
|
||||||
|
@@ -26,6 +26,74 @@
|
|||||||
|
|
||||||
/* kernels */
|
/* kernels */
|
||||||
|
|
||||||
|
extern "C" __global__ void
|
||||||
|
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||||
|
kernel_cuda_filter_copy_input(float *buffer,
|
||||||
|
CCL_FILTER_TILE_INFO,
|
||||||
|
int4 prefilter_rect,
|
||||||
|
int buffer_pass_stride)
|
||||||
|
{
|
||||||
|
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
|
||||||
|
if(x < prefilter_rect.z && y < prefilter_rect.w) {
|
||||||
|
int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
|
||||||
|
int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
|
||||||
|
int itile = ytile * 3 + xtile;
|
||||||
|
float *const in = ((float *)ccl_get_tile_buffer(itile)) +
|
||||||
|
(tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride;
|
||||||
|
buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride;
|
||||||
|
for (int i = 0; i < buffer_pass_stride; ++i)
|
||||||
|
buffer[i] = in[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" __global__ void
|
||||||
|
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||||
|
kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples)
|
||||||
|
{
|
||||||
|
int x = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
int y = blockDim.y*blockIdx.y + threadIdx.y;
|
||||||
|
if(x < sw && y < sh) {
|
||||||
|
if (num_inputs > 0) {
|
||||||
|
float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
|
||||||
|
float *out = rgb + (x + y * sw) * 3;
|
||||||
|
out[0] = in[0];
|
||||||
|
out[1] = in[1];
|
||||||
|
out[2] = in[2];
|
||||||
|
}
|
||||||
|
if (num_inputs > 1) {
|
||||||
|
float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
|
||||||
|
float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3;
|
||||||
|
out[0] = in[0] / num_samples;
|
||||||
|
out[1] = in[1] / num_samples;
|
||||||
|
out[2] = in[2] / num_samples;
|
||||||
|
}
|
||||||
|
if (num_inputs > 2) {
|
||||||
|
float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float);
|
||||||
|
float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3;
|
||||||
|
out[0] = in[0] / num_samples;
|
||||||
|
out[1] = in[1] / num_samples;
|
||||||
|
out[2] = in[2] / num_samples;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" __global__ void
|
||||||
|
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||||
|
kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride)
|
||||||
|
{
|
||||||
|
int x = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
int y = blockDim.y*blockIdx.y + threadIdx.y;
|
||||||
|
if(x < sw && y < sh) {
|
||||||
|
float *in = rgb + ((ix + x) + (iy + y) * iw) * 3;
|
||||||
|
float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride;
|
||||||
|
out[0] = in[0];
|
||||||
|
out[1] = in[1];
|
||||||
|
out[2] = in[2];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
extern "C" __global__ void
|
extern "C" __global__ void
|
||||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||||
kernel_cuda_filter_divide_shadow(int sample,
|
kernel_cuda_filter_divide_shadow(int sample,
|
||||||
@@ -97,14 +165,14 @@ kernel_cuda_filter_write_feature(int sample,
|
|||||||
int x = blockDim.x*blockIdx.x + threadIdx.x;
|
int x = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
int y = blockDim.y*blockIdx.y + threadIdx.y;
|
int y = blockDim.y*blockIdx.y + threadIdx.y;
|
||||||
if(x < filter_area.z && y < filter_area.w) {
|
if(x < filter_area.z && y < filter_area.w) {
|
||||||
kernel_filter_write_feature(sample,
|
kernel_filter_write_feature(sample,
|
||||||
x + filter_area.x,
|
x + filter_area.x,
|
||||||
y + filter_area.y,
|
y + filter_area.y,
|
||||||
buffer_params,
|
buffer_params,
|
||||||
from,
|
from,
|
||||||
buffer,
|
buffer,
|
||||||
out_offset,
|
out_offset,
|
||||||
prefilter_rect);
|
prefilter_rect);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -55,7 +55,10 @@ bool BufferParams::modified(const BufferParams ¶ms)
|
|||||||
{
|
{
|
||||||
return !(full_x == params.full_x && full_y == params.full_y && width == params.width &&
|
return !(full_x == params.full_x && full_y == params.full_y && width == params.width &&
|
||||||
height == params.height && full_width == params.full_width &&
|
height == params.height && full_width == params.full_width &&
|
||||||
full_height == params.full_height && Pass::equals(passes, params.passes));
|
full_height == params.full_height && Pass::equals(passes, params.passes) &&
|
||||||
|
denoising_data_pass == params.denoising_data_pass &&
|
||||||
|
denoising_clean_pass == params.denoising_clean_pass &&
|
||||||
|
denoising_prefiltered_pass == params.denoising_prefiltered_pass);
|
||||||
}
|
}
|
||||||
|
|
||||||
int BufferParams::get_passes_size()
|
int BufferParams::get_passes_size()
|
||||||
@@ -183,13 +186,28 @@ bool RenderBuffers::get_denoising_pass_rect(
|
|||||||
offset = type + params.get_denoising_offset();
|
offset = type + params.get_denoising_offset();
|
||||||
scale /= sample;
|
scale /= sample;
|
||||||
}
|
}
|
||||||
else if (type == DENOISING_PASS_PREFILTERED_COLOR && !params.denoising_prefiltered_pass) {
|
else if (params.denoising_prefiltered_pass) {
|
||||||
/* If we're not saving the prefiltering result, return the original noisy pass. */
|
offset = type + params.get_denoising_prefiltered_offset();
|
||||||
offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
|
|
||||||
scale /= sample;
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
offset = type + params.get_denoising_prefiltered_offset();
|
switch (type) {
|
||||||
|
case DENOISING_PASS_PREFILTERED_DEPTH:
|
||||||
|
offset = params.get_denoising_offset() + DENOISING_PASS_DEPTH;
|
||||||
|
break;
|
||||||
|
case DENOISING_PASS_PREFILTERED_NORMAL:
|
||||||
|
offset = params.get_denoising_offset() + DENOISING_PASS_NORMAL;
|
||||||
|
break;
|
||||||
|
case DENOISING_PASS_PREFILTERED_ALBEDO:
|
||||||
|
offset = params.get_denoising_offset() + DENOISING_PASS_ALBEDO;
|
||||||
|
break;
|
||||||
|
case DENOISING_PASS_PREFILTERED_COLOR:
|
||||||
|
/* If we're not saving the prefiltering result, return the original noisy pass. */
|
||||||
|
offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
scale /= sample;
|
||||||
}
|
}
|
||||||
|
|
||||||
int pass_stride = params.get_passes_size();
|
int pass_stride = params.get_passes_size();
|
||||||
|
@@ -285,9 +285,7 @@ void Session::run_gpu()
|
|||||||
|
|
||||||
if (progress.get_cancel())
|
if (progress.get_cancel())
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
|
|
||||||
if (!no_tiles) {
|
|
||||||
/* buffers mutex is locked entirely while rendering each
|
/* buffers mutex is locked entirely while rendering each
|
||||||
* sample, and released/reacquired on each iteration to allow
|
* sample, and released/reacquired on each iteration to allow
|
||||||
* reset and draw in between */
|
* reset and draw in between */
|
||||||
@@ -978,7 +976,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
|
|||||||
*/
|
*/
|
||||||
substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
|
substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
|
||||||
}
|
}
|
||||||
if (params.full_denoising) {
|
if (params.full_denoising || params.optix_denoising) {
|
||||||
substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
|
substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
|
||||||
}
|
}
|
||||||
else if (params.run_denoising) {
|
else if (params.run_denoising) {
|
||||||
@@ -1038,6 +1036,7 @@ void Session::render()
|
|||||||
|
|
||||||
task.denoising_from_render = true;
|
task.denoising_from_render = true;
|
||||||
task.denoising_do_filter = params.full_denoising;
|
task.denoising_do_filter = params.full_denoising;
|
||||||
|
task.denoising_use_optix = params.optix_denoising;
|
||||||
task.denoising_write_passes = params.write_denoising_passes;
|
task.denoising_write_passes = params.write_denoising_passes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -63,6 +63,7 @@ class SessionParams {
|
|||||||
bool run_denoising;
|
bool run_denoising;
|
||||||
bool write_denoising_passes;
|
bool write_denoising_passes;
|
||||||
bool full_denoising;
|
bool full_denoising;
|
||||||
|
bool optix_denoising;
|
||||||
DenoiseParams denoising;
|
DenoiseParams denoising;
|
||||||
|
|
||||||
double cancel_timeout;
|
double cancel_timeout;
|
||||||
@@ -92,6 +93,7 @@ class SessionParams {
|
|||||||
run_denoising = false;
|
run_denoising = false;
|
||||||
write_denoising_passes = false;
|
write_denoising_passes = false;
|
||||||
full_denoising = false;
|
full_denoising = false;
|
||||||
|
optix_denoising = false;
|
||||||
|
|
||||||
display_buffer_linear = false;
|
display_buffer_linear = false;
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user