Cycles: Implement denoising option for reducing noise in the rendered image

This commit contains the first part of the new Cycles denoising option,
which filters the resulting image using information gathered during rendering
to get rid of noise while preserving visual features as well as possible.

To use the option, enable it in the render layer options. The default settings
fit a wide range of scenes, but individual parameters can be tweaked to control
the tradeoff between noise reduction, preservation of image detail, and
calculation time.
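
For scripted setups, the option can also be toggled through the render layer's
RNA properties. A minimal sketch, assuming the 2.79-era Python API and the
property names added by this patch (use_denoising, denoising_radius):

    import bpy

    # Enable denoising on the scene's first render layer.
    rl = bpy.context.scene.render.layers[0]
    rl.cycles.use_denoising = True
    rl.cycles.denoising_radius = 8  # filter half-window in pixels (assumed name)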

Note that the denoiser may still change in the future and that some features
are not implemented yet. The most important missing feature is animation
denoising, which uses information from multiple frames at once to produce a
flicker-free and smoother result; support for it will be added in the future.

Finally, thanks to all the people who supported this project:

- Google (through the GSoC) and Theory Studios for sponsoring the development
- The authors of the papers I used for implementing the denoiser (more details
  on them will be included in the technical docs)
- The other Cycles devs for feedback on the code, especially Sergey for
  mentoring the GSoC project and Brecht for the code review!
- And of course the users who helped with testing, reported bugs, and pointed
  out things that could and/or should work better!
Author: Lukas Stockner
Date:   2017-05-07 14:40:58 +02:00
Parent: bca6978347
Commit: 43b374e8c5

117 changed files with 6448 additions and 1149 deletions

View File

@@ -10,7 +10,23 @@ set(INC_SYS
set(SRC
kernels/cpu/kernel.cpp
kernels/cpu/kernel_sse2.cpp
kernels/cpu/kernel_sse3.cpp
kernels/cpu/kernel_sse41.cpp
kernels/cpu/kernel_avx.cpp
kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split.cpp
kernels/cpu/kernel_split_sse2.cpp
kernels/cpu/kernel_split_sse3.cpp
kernels/cpu/kernel_split_sse41.cpp
kernels/cpu/kernel_split_avx.cpp
kernels/cpu/kernel_split_avx2.cpp
kernels/cpu/filter.cpp
kernels/cpu/filter_sse2.cpp
kernels/cpu/filter_sse3.cpp
kernels/cpu/filter_sse41.cpp
kernels/cpu/filter_avx.cpp
kernels/cpu/filter_avx2.cpp
kernels/opencl/kernel.cl
kernels/opencl/kernel_state_buffer_size.cl
kernels/opencl/kernel_split.cl
@@ -32,8 +48,10 @@ set(SRC
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_indirect_subsurface.cl
kernels/opencl/kernel_buffer_update.cl
kernels/opencl/filter.cl
kernels/cuda/kernel.cu
kernels/cuda/kernel_split.cu
kernels/cuda/filter.cu
)
set(SRC_BVH_HEADERS
@@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu.h
kernels/cpu/kernel_cpu_impl.h
kernels/cpu/kernel_cpu_image.h
kernels/cpu/filter_cpu.h
kernels/cpu/filter_cpu_impl.h
)
set(SRC_KERNELS_CUDA_HEADERS
@@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS
geom/geom_volume.h
)
set(SRC_FILTER_HEADERS
filter/filter.h
filter/filter_defines.h
filter/filter_features.h
filter/filter_features_sse.h
filter/filter_kernel.h
filter/filter_nlm_cpu.h
filter/filter_nlm_gpu.h
filter/filter_prefilter.h
filter/filter_reconstruction.h
filter/filter_transform.h
filter/filter_transform_gpu.h
filter/filter_transform_sse.h
)
set(SRC_UTIL_HEADERS
../util/util_atomic.h
../util/util_color.h
@@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS
../util/util_math_int2.h
../util/util_math_int3.h
../util/util_math_int4.h
../util/util_math_matrix.h
../util/util_static_assert.h
../util/util_transform.h
../util/util_texture.h
@@ -295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
set(cuda_filter_sources kernels/cuda/filter.cu
${SRC_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
${SRC_FILTER_HEADERS}
${SRC_UTIL_HEADERS}
)
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
if(${split})
set(cuda_extra_flags "-D__SPLIT__")
set(cuda_cubin kernel_split)
else()
set(cuda_extra_flags "")
set(cuda_cubin kernel)
endif()
macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
if(${experimental})
set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
set(cuda_cubin ${cuda_cubin}_experimental)
set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
set(name ${name}_experimental)
endif()
set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
set(cuda_cubin ${name}_${arch}.cubin)
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
@@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
if(split)
set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
else()
set(cuda_kernel_src "/kernels/cuda/kernel.cu")
endif()
set(cuda_kernel_src "/kernels/cuda/${name}.cu")
add_custom_command(
OUTPUT ${cuda_cubin}
@@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
${cuda_arch_flags}
${cuda_version_flags}
${cuda_math_flags}
${cuda_extra_flags}
${flags}
${cuda_debug_flags}
-I${CMAKE_CURRENT_SOURCE_DIR}/..
-DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END=
-DNVCC
DEPENDS ${cuda_sources})
DEPENDS ${sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_cubin})
@@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
# Compile split kernel
CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
endif()
endforeach()
@@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS})
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
list(APPEND SRC
kernels/cpu/kernel_sse2.cpp
kernels/cpu/kernel_sse3.cpp
kernels/cpu/kernel_sse41.cpp
kernels/cpu/kernel_split_sse2.cpp
kernels/cpu/kernel_split_sse3.cpp
kernels/cpu/kernel_split_sse41.cpp
)
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
list(APPEND SRC
kernels/cpu/kernel_avx.cpp
kernels/cpu/kernel_split_avx.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split_avx2.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
add_library(cycles_kernel
@@ -432,6 +452,7 @@ add_library(cycles_kernel
${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
${SRC_SPLIT_HEADERS}
@@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)

View File

@@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
#endif
}
/* Classifies a closure as diffuse-like or specular-like.
* This is needed for generating the denoising feature passes,
* which are written on the first bounce where more than 25%
* of the sampling weight belongs to diffuse-like closures. */
ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
{
if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
return true;
}
if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
}
return false;
}
CCL_NAMESPACE_END

View File

@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
float sigma;
float invsigma2;
float3 N;
} VelvetBsdf;
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)

View File

@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseBsdf {
SHADER_CLOSURE_BASE;
float3 N;
} DiffuseBsdf;
/* DIFFUSE */

View File

@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseRampBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float3 *colors;
} DiffuseRampBsdf;

View File

@@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
float alpha_x, alpha_y, ior;
MicrofacetExtra *extra;
float3 T;
float3 N;
} MicrofacetBsdf;
/* Beckmann and GGX microfacet importance sampling. */

View File

@@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
{
if(cosI > 0.9999f || cosI < 1e-6f) {
if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
const float phi = M_2PI_F * randU.y;
return make_float2(r*cosf(phi), r*sinf(phi));

View File

@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct OrenNayarBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float roughness;
float a;
float b;

View File

@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PhongRampBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float exponent;
float3 *colors;
} PhongRampBsdf;

View File

@@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf {
SHADER_CLOSURE_BASE;
float roughness;
float3 N;
} PrincipledDiffuseBsdf;
ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,

View File

@@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
SHADER_CLOSURE_BASE;
float3 N;
} PrincipledSheenBsdf;
ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,

View File

@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct ToonBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float size;
float smooth;
} ToonBsdf;

View File

@@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf {
float texture_blur;
float albedo;
float roughness;
float3 N;
} Bssrdf;
/* Planar Truncated Gaussian

View File

@@ -0,0 +1,52 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __FILTER_H__
#define __FILTER_H__
/* CPU Filter Kernel Interface */
#include "util/util_types.h"
#include "kernel/filter/filter_defines.h"
CCL_NAMESPACE_BEGIN
#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
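/* For example, with KERNEL_ARCH defined as cpu_sse2 below,
 * KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow) expands to
 * kernel_cpu_sse2_filter_divide_shadow. */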
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/filter_cpu.h"
CCL_NAMESPACE_END
#endif /* __FILTER_H__ */

View File

@@ -0,0 +1,38 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __FILTER_DEFINES_H__
#define __FILTER_DEFINES_H__
#define DENOISE_FEATURES 10
#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
#define XTWY_SIZE (DENOISE_FEATURES+1)
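/* With DENOISE_FEATURES = 10, each design row has 11 entries (a constant term
 * plus one entry per feature), so XTWX_SIZE = (11*12)/2 = 66 floats store the
 * packed triangular normal matrix and XTWY_SIZE = 11 holds the right-hand side. */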
typedef struct TilesInfo {
int offsets[9];
int strides[9];
int x[4];
int y[4];
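/* x[] and y[] hold tile boundary coordinates: [0]/[3] span the whole 3x3
 * neighborhood, [1]/[2] delimit the center tile (see the tile lookup in
 * kernel_filter_divide_shadow). */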
/* TODO(lukas): CUDA doesn't have uint64_t... */
#ifdef __KERNEL_OPENCL__
ccl_global float *buffers[9];
#else
long long int buffers[9];
#endif
} TilesInfo;
#endif /* __FILTER_DEFINES_H__*/

View File

@@ -0,0 +1,120 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
* pixel_buffer always points to the current pixel in the first pass. */
#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
#define END_FOR_PIXEL_WINDOW } \
pixel_buffer += buffer_w - (high.x - low.x); \
}
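/* Typical usage, as in kernel_filter_construct_transform():
 *   FOR_PIXEL_WINDOW {
 *     filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
 *   } END_FOR_PIXEL_WINDOW
 */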
ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
{
features[0] = pixel.x;
features[1] = pixel.y;
features[2] = ccl_get_feature(buffer, 0);
features[3] = ccl_get_feature(buffer, 1);
features[4] = ccl_get_feature(buffer, 2);
features[5] = ccl_get_feature(buffer, 3);
features[6] = ccl_get_feature(buffer, 4);
features[7] = ccl_get_feature(buffer, 5);
features[8] = ccl_get_feature(buffer, 6);
features[9] = ccl_get_feature(buffer, 7);
if(mean) {
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] -= mean[i];
}
}
ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
{
scales[0] = fabsf(pixel.x - mean[0]);
scales[1] = fabsf(pixel.y - mean[1]);
scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
ccl_get_feature(buffer, 2) - mean[4],
ccl_get_feature(buffer, 3) - mean[5]));
scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
ccl_get_feature(buffer, 6) - mean[8],
ccl_get_feature(buffer, 7) - mean[9]));
}
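/* Expand the 6 compacted scales from filter_get_feature_scales() into the 10
 * per-feature scales in-place. The order matters: compacted entries 4 and 5
 * are consumed before slots 3-5 are overwritten with the vector feature scale. */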
ccl_device_inline void filter_calculate_scale(float *scale)
{
scale[0] = 1.0f/max(scale[0], 0.01f);
scale[1] = 1.0f/max(scale[1], 0.01f);
scale[2] = 1.0f/max(scale[2], 0.01f);
scale[6] = 1.0f/max(scale[4], 0.01f);
scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
}
ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
{
return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
}
ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
{
return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
}
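/* The design row used for the local least-squares fit is (1, T*(f_q - f_p)):
 * a constant term followed by the rank-dimensional projection of the feature
 * difference between pixels q and p. */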
ccl_device_inline void design_row_add(float *design_row,
int rank,
ccl_global float ccl_restrict_ptr transform,
int stride,
int row,
float feature)
{
for(int i = 0; i < rank; i++) {
design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
}
}
/* Fill the design row. */
ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
ccl_global float ccl_restrict_ptr p_buffer,
int2 q_pixel,
ccl_global float ccl_restrict_ptr q_buffer,
int pass_stride,
int rank,
float *design_row,
ccl_global float ccl_restrict_ptr transform,
int stride)
{
design_row[0] = 1.0f;
math_vector_zero(design_row+1, rank);
design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,95 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
* pixel_buffer always points to the first of the 4 current pixels in the first pass.
* x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
__m128 y4 = _mm_set1_ps(pixel.y); \
for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
__m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
__m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
#define END_FOR_PIXEL_WINDOW_SSE } \
pixel_buffer += buffer_w - (pixel.x - low.x); \
}
ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
{
features[0] = x;
features[1] = y;
features[2] = ccl_get_feature_sse(0);
features[3] = ccl_get_feature_sse(1);
features[4] = ccl_get_feature_sse(2);
features[5] = ccl_get_feature_sse(3);
features[6] = ccl_get_feature_sse(4);
features[7] = ccl_get_feature_sse(5);
features[8] = ccl_get_feature_sse(6);
features[9] = ccl_get_feature_sse(7);
if(mean) {
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] = _mm_sub_ps(features[i], mean[i]);
}
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] = _mm_mask_ps(features[i], active_pixels);
}
ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
{
scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
__m128 diff, scale;
diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
scale = _mm_mul_ps(diff, diff);
diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
scales[3] = _mm_mask_ps(scale, active_pixels);
scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
scale = _mm_mul_ps(diff, diff);
diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
scales[5] = _mm_mask_ps(scale, active_pixels);
}
ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
{
scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,50 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "util/util_color.h"
#include "util/util_math.h"
#include "util/util_math_fast.h"
#include "util/util_texture.h"
#include "util/util_atomic.h"
#include "util/util_math_matrix.h"
#include "kernel/filter/filter_defines.h"
#include "kernel/filter/filter_features.h"
#ifdef __KERNEL_SSE3__
# include "kernel/filter/filter_features_sse.h"
#endif
#include "kernel/filter/filter_prefilter.h"
#ifdef __KERNEL_GPU__
# include "kernel/filter/filter_transform_gpu.h"
#else
# ifdef __KERNEL_SSE3__
# include "kernel/filter/filter_transform_sse.h"
# else
# include "kernel/filter/filter_transform.h"
# endif
#endif
#include "kernel/filter/filter_reconstruction.h"
#ifdef __KERNEL_CPU__
# include "kernel/filter/filter_nlm_cpu.h"
#else
# include "kernel/filter/filter_nlm_gpu.h"
#endif

View File

@@ -0,0 +1,163 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
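/* Per-pixel patch distance for variance-based NL-means weighting:
 * diff = (delta^2 - a*(var_p + min(var_p, var_q))) / (eps + k^2*(var_p + var_q)),
 * averaged over the channels, where delta is the pixel difference and var_p,
 * var_q are the corresponding variance estimates. */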
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
float diff = 0.0f;
int numChannels = channel_offset? 3 : 1;
for(int c = 0; c < numChannels; c++) {
float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
float pvar = varianceImage[c*channel_offset + y*w+x];
float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
}
if(numChannels > 1) {
diff *= 1.0f/numChannels;
}
differenceImage[y*w+x] = diff;
}
}
}
ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
#ifdef __KERNEL_SSE3__
int aligned_lowx = (rect.x & ~(3));
int aligned_highx = ((rect.z + 3) & ~(3));
#endif
for(int y = rect.y; y < rect.w; y++) {
const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1);
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] = 0.0f;
}
for(int y1 = low; y1 < high; y1++) {
#ifdef __KERNEL_SSE3__
for(int x = aligned_lowx; x < aligned_highx; x+=4) {
_mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x)));
}
#else
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] += differenceImage[y1*w+x];
}
#endif
}
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] *= 1.0f/(high - low);
}
}
}
ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] = 0.0f;
}
}
for(int dx = -f; dx <= f; dx++) {
int pos_dx = max(0, dx);
int neg_dx = min(0, dx);
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
outImage[y*w+x] += differenceImage[y*w+dx+x];
}
}
}
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f));
}
}
}
ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
float weight = sum * (1.0f/(high - low));
accumImage[y*w+x] += weight;
outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
}
}
}
ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
float ccl_restrict_ptr differenceImage,
float ccl_restrict_ptr buffer,
float *color_pass,
float *variance_pass,
float *transform,
int *rank,
float *XtWX,
float3 *XtWY,
int4 rect,
int4 filter_rect,
int w, int h, int f,
int pass_stride)
{
/* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
int y = fy + filter_rect.y;
for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
int x = fx + filter_rect.x;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
float weight = sum * (1.0f/(high - low));
int storage_ofs = fy*filter_rect.z + fx;
float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
int *l_rank = rank + storage_ofs;
kernel_filter_construct_gramian(x, y, 1,
dx, dy, w, h,
pass_stride,
buffer,
color_pass, variance_pass,
l_transform, l_rank,
weight, l_XtWX, l_XtWY, 0);
}
}
}
ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] /= accumImage[y*w+x];
}
}
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,147 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
int dx, int dy,
ccl_global float ccl_restrict_ptr weightImage,
ccl_global float ccl_restrict_ptr varianceImage,
ccl_global float *differenceImage,
int4 rect, int w,
int channel_offset,
float a, float k_2)
{
float diff = 0.0f;
int numChannels = channel_offset? 3 : 1;
for(int c = 0; c < numChannels; c++) {
float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
float pvar = varianceImage[c*channel_offset + y*w+x];
float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
}
if(numChannels > 1) {
diff *= 1.0f/numChannels;
}
differenceImage[y*w+x] = diff;
}
ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1);
for(int y1 = low; y1 < high; y1++) {
sum += differenceImage[y1*w+x];
}
sum *= 1.0f/(high-low);
outImage[y*w+x] = sum;
}
ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
sum *= 1.0f/(high-low);
outImage[y*w+x] = expf(-max(sum, 0.0f));
}
ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
int dx, int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr image,
ccl_global float *outImage,
ccl_global float *accumImage,
int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
sum *= 1.0f/(high-low);
if(outImage) {
accumImage[y*w+x] += sum;
outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
}
else {
accumImage[y*w+x] = sum;
}
}
ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
int dx, int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr buffer,
ccl_global float *color_pass,
ccl_global float *variance_pass,
ccl_global float ccl_restrict_ptr transform,
ccl_global int *rank,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 rect,
int4 filter_rect,
int w, int h, int f,
int pass_stride,
int localIdx)
{
int y = fy + filter_rect.y;
int x = fx + filter_rect.x;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
float weight = sum * (1.0f/(high - low));
int storage_ofs = fy*filter_rect.z + fx;
transform += storage_ofs;
rank += storage_ofs;
XtWX += storage_ofs;
XtWY += storage_ofs;
kernel_filter_construct_gramian(x, y,
filter_rect.z*filter_rect.w,
dx, dy, w, h,
pass_stride,
buffer,
color_pass, variance_pass,
transform, rank,
weight, XtWX, XtWY,
localIdx);
}
ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
ccl_global float *outImage,
ccl_global float ccl_restrict_ptr accumImage,
int4 rect, int w)
{
outImage[y*w+x] /= accumImage[y*w+x];
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,145 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
/* First step of the shadow prefiltering, performs the shadow division and stores all data
* in a nice and easy rectangular array that can be passed to the NLM filter.
*
* Calculates:
* unfiltered: Contains the two half images of the shadow feature pass
* sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
* sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
* bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
*/
ccl_device void kernel_filter_divide_shadow(int sample,
ccl_global TilesInfo *tiles,
int x, int y,
ccl_global float *unfilteredA,
ccl_global float *unfilteredB,
ccl_global float *sampleVariance,
ccl_global float *sampleVarianceV,
ccl_global float *bufferVariance,
int4 rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
int tile = ytile*3+xtile;
int offset = tiles->offsets[tile];
int stride = tiles->strides[tile];
ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
center_buffer += (y*stride + x + offset)*buffer_pass_stride;
center_buffer += buffer_denoising_offset + 14;
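/* +14 skips the normal (3+3), albedo (3+3) and depth (1+1) mean/variance
 * passes, so center_buffer now points at the first shadow pass. The A/B half
 * buffers each hold about half the samples: ceil(sample/2) and floor(sample/2). */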
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
float varA = center_buffer[2];
float varB = center_buffer[5];
int odd_sample = (sample+1)/2;
int even_sample = sample/2;
if(use_split_variance) {
varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
}
varA /= (odd_sample - 1);
varB /= (even_sample - 1);
sampleVariance[idx] = 0.5f*(varA + varB) / sample;
sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
}
/* Load a regular feature from the render buffers into the denoise buffer.
* Parameters:
* - sample: The sample amount in the buffer, used to normalize the buffer.
* - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
* - x, y: Current pixel
* - mean, variance: Target denoise buffers.
* - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
*/
ccl_device void kernel_filter_get_feature(int sample,
ccl_global TilesInfo *tiles,
int m_offset, int v_offset,
int x, int y,
ccl_global float *mean,
ccl_global float *variance,
int4 rect, int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
int tile = ytile*3+xtile;
ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
mean[idx] = center_buffer[m_offset] / sample;
if(use_split_variance) {
variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
}
else {
variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
}
}
/* Combine A/B buffers.
* Calculates the combined mean and the buffer variance. */
ccl_device void kernel_filter_combine_halves(int x, int y,
ccl_global float *mean,
ccl_global float *variance,
ccl_global float *a,
ccl_global float *b,
int4 rect, int r)
{
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
if(variance) {
if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
else {
variance[idx] = 0.0f;
float values[25];
int numValues = 0;
for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
int pidx = (py-rect.y)*buffer_w + (px-rect.x);
values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
}
}
/* Insertion-sort the variances (fast enough for 25 elements). */
for(int i = 1; i < numValues; i++) {
float v = values[i];
int j;
for(j = i-1; j >= 0 && values[j] > v; j--)
values[j+1] = values[j];
values[j+1] = v;
}
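/* Pick the 7/8 quantile as a conservative estimate of the local variance. */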
variance[idx] = values[(7*numValues)/8];
}
}
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,103 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
int storage_stride,
int dx, int dy,
int w, int h,
int pass_stride,
ccl_global float ccl_restrict_ptr buffer,
ccl_global float *color_pass,
ccl_global float *variance_pass,
ccl_global float ccl_restrict_ptr transform,
ccl_global int *rank,
float weight,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int localIdx)
{
int p_offset = y *w + x;
int q_offset = (y+dy)*w + (x+dx);
#ifdef __KERNEL_CPU__
const int stride = 1;
(void)storage_stride;
(void)localIdx;
float design_row[DENOISE_FEATURES+1];
#elif defined(__KERNEL_CUDA__)
const int stride = storage_stride;
ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
#else
const int stride = storage_stride;
float design_row[DENOISE_FEATURES+1];
#endif
float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
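/* Cheap outlier rejection: skip pixels whose color differs by more than
 * three standard deviations, they would only pollute the fit. */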
if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
return;
}
filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
make_int2(x+dx, y+dy), buffer + q_offset,
pass_stride, *rank, design_row, transform, stride);
math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
}
ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
ccl_global float *buffer,
ccl_global int *rank,
int storage_stride,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 buffer_params,
int sample)
{
#ifdef __KERNEL_CPU__
const int stride = 1;
(void)storage_stride;
#else
const int stride = storage_stride;
#endif
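/* Solve the normal equations XtWX * x = XtWY of the weighted least-squares
 * fit; XtWY[0] then holds the reconstructed color of the center pixel. */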
math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
float3 final_color = XtWY[0];
ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
final_color *= sample;
if(buffer_params.w) {
final_color.x += combined_buffer[buffer_params.w+0];
final_color.y += combined_buffer[buffer_params.w+1];
final_color.z += combined_buffer[buffer_params.w+2];
}
combined_buffer[0] = final_color.x;
combined_buffer[1] = final_color.y;
combined_buffer[2] = final_color.z;
}
#undef STORAGE_TYPE
CCL_NAMESPACE_END

View File

@@ -0,0 +1,113 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
int x, int y, int4 rect,
int pass_stride,
float *transform, int *rank,
int radius, float pca_threshold)
{
int buffer_w = align_up(rect.z - rect.x, 4);
float features[DENOISE_FEATURES];
/* Temporary storage, used in different steps of the algorithm. */
float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
float tempvector[2*DENOISE_FEATURES];
float ccl_restrict_ptr pixel_buffer;
int2 pixel;
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
math_vector_zero(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
math_vector_add(feature_means, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
float *feature_scale = tempvector;
math_vector_zero(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_max(feature_scale, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
filter_calculate_scale(feature_scale);
/* === Generate the feature transformation. ===
* This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float* feature_matrix = tempmatrix;
math_matrix_zero(feature_matrix, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
} END_FOR_PIXEL_WINDOW
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
*rank = 0;
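/* Select the rank of the transform: a negative pca_threshold keeps the
 * leading eigenvalues until their energy reaches (1 + pca_threshold) of the
 * total, a positive one drops features with sqrt(eigenvalue) below the
 * threshold. At least two dimensions are always kept. */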
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++) {
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
float reduced_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
float s = feature_matrix[i*DENOISE_FEATURES+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
}
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < (*rank); i++) {
math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
}
math_matrix_transpose(transform, DENOISE_FEATURES, 1);
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,117 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
int x, int y, int4 rect,
int pass_stride,
ccl_global float *transform,
ccl_global int *rank,
int radius, float pca_threshold,
int transform_stride, int localIdx)
{
int buffer_w = align_up(rect.z - rect.x, 4);
#ifdef __KERNEL_CUDA__
ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
#else
float features[DENOISE_FEATURES];
#endif
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
ccl_global float ccl_restrict_ptr pixel_buffer;
int2 pixel;
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
math_vector_zero(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
math_vector_add(feature_means, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
float feature_scale[DENOISE_FEATURES];
math_vector_zero(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_max(feature_scale, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
filter_calculate_scale(feature_scale);
/* === Generate the feature transformation. ===
* This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_zero(feature_matrix, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
} END_FOR_PIXEL_WINDOW
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++) {
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
float reduced_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
float s = feature_matrix[i*DENOISE_FEATURES+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
}
math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < DENOISE_FEATURES; i++) {
for(int j = 0; j < (*rank); j++) {
transform[i*DENOISE_FEATURES + j] *= feature_scale[i];
}
}
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,110 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
int x, int y, int4 rect,
int pass_stride,
float *transform, int *rank,
int radius, float pca_threshold)
{
int buffer_w = align_up(rect.z - rect.x, 4);
__m128 features[DENOISE_FEATURES];
float ccl_restrict_ptr pixel_buffer;
int2 pixel;
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
__m128 feature_means[DENOISE_FEATURES];
math_vector_zero_sse(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
} END_FOR_PIXEL_WINDOW_SSE
__m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x)));
for(int i = 0; i < DENOISE_FEATURES; i++) {
feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
}
__m128 feature_scale[DENOISE_FEATURES];
math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW_SSE
filter_calculate_scale_sse(feature_scale);
__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
} END_FOR_PIXEL_WINDOW_SSE
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++) {
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
float reduced_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
float s = feature_matrix[i*DENOISE_FEATURES+i];
reduced_energy += s;
/* Bake the feature scaling into the transformation matrix. */
for(int j = 0; j < DENOISE_FEATURES; j++) {
transform[(*rank)*DENOISE_FEATURES + j] *= _mm_cvtss_f32(feature_scale[j]);
}
}
}
else {
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
/* Bake the feature scaling into the transformation matrix. */
for(int j = 0; j < DENOISE_FEATURES; j++) {
transform[(*rank)*DENOISE_FEATURES + j] *= _mm_cvtss_f32(feature_scale[j]);
}
}
}
math_matrix_transpose(transform, DENOISE_FEATURES, 1);
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < DENOISE_FEATURES; i++) {
math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
}
}
CCL_NAMESPACE_END

View File

@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Interpolate smooth vertex normal from vertices */
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
return is_zero(N)? Ng: N;
}
/* Ray differentials on triangle */

View File

@@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_ARCH cpu_sse2
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_ARCH cpu_sse3
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_ARCH cpu_sse41
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_ARCH cpu_avx
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_ARCH cpu_avx2
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/kernel_cpu.h"
CCL_NAMESPACE_END

View File

@@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
#endif
#ifdef __DENOISING_FEATURES__
L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
L->denoising_depth = 0.0f;
#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
}
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
float3 alpha,
float3 bsdf,
float3 ao,
int bounce)
float3 ao)
{
#ifdef __PASSES__
if(L->use_light_pass) {
if(bounce == 0) {
if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf*ao;
L->ao += alpha*throughput*ao;
@@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
float3 light = throughput * bsdf;
L->path_total += light;
L->path_total_shaded += ao * light;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
float3 light = throughput * bsdf;
L->path_total += light;
L->path_total_shaded += ao * light;
}
#endif
}
ccl_device_inline void path_radiance_accum_total_ao(
PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
float3 bsdf)
{
#ifdef __SHADOW_TRICKS__
L->path_total += throughput * bsdf;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
L->path_total += throughput * bsdf;
}
#else
(void) L;
(void) state;
(void) throughput;
(void) bsdf;
#endif
}
ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
BsdfEval *bsdf_eval,
float3 shadow,
float shadow_fac,
bool is_lamp)
{
#ifdef __PASSES__
if(L->use_light_pass) {
if(bounce == 0) {
if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
}
#ifdef __SHADOW_TRICKS__
float3 light = throughput * bsdf_eval->sum_no_mis;
L->path_total += light;
L->path_total_shaded += shadow * light;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
float3 light = throughput * bsdf_eval->sum_no_mis;
L->path_total += light;
L->path_total_shaded += shadow * light;
}
#endif
}
ccl_device_inline void path_radiance_accum_total_light(
PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
const BsdfEval *bsdf_eval)
{
#ifdef __SHADOW_TRICKS__
L->path_total += throughput * bsdf_eval->sum_no_mis;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
L->path_total += throughput * bsdf_eval->sum_no_mis;
}
#else
(void) L;
(void) state;
(void) throughput;
(void) bsdf_eval;
#endif
@@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
L->path_total += throughput * value;
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
L->path_total_shaded += throughput * value;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
L->path_total += throughput * value;
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
L->path_total_shaded += throughput * value;
}
}
#endif
#ifdef __DENOISING_FEATURES__
L->denoising_albedo += state->denoising_feature_weight * value;
#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
return L_sum;
}
ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
{
#ifdef __PASSES__
kernel_assert(L->use_light_pass);
*clean = L->emission + L->background;
*noisy = L->direct_scatter + L->indirect_scatter;
# define ADD_COMPONENT(flag, component) \
if(kernel_data.film.denoising_flags & flag) \
*clean += component; \
else \
*noisy += component;
ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
# undef ADD_COMPONENT
#else
*noisy = L->emission;
*clean = make_float3(0.0f, 0.0f, 0.0f);
#endif
*noisy = ensure_finite3(*noisy);
*clean = ensure_finite3(*clean);
}
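The purpose of the split is easiest to see from the assumed downstream recombination, which happens in the denoiser rather than in this kernel; the sketch below is illustrative only.

/* Assumed recombination step, for illustration:
 *   final = denoise(noisy) + clean
 * Each lighting component goes into exactly one of the two buffers, so
 * noisy + clean always equals the full combined pass, and components
 * flagged as clean are never blurred by the filter. */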
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
{
float fac = 1.0f/num_samples;
@@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
/* Calculate final light sum and transparency for shadow catcher object. */
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
const PathRadiance *L,
ccl_addr_space float* L_transparent)
float* alpha)
{
const float shadow = path_radiance_sum_shadow(L);
float3 L_sum;
if(kernel_data.background.transparent) {
*L_transparent = shadow;
*alpha = 1.0f-shadow;
L_sum = make_float3(0.0f, 0.0f, 0.0f);
}
else {

View File

@@ -42,6 +42,8 @@
#include "util/util_types.h"
#include "util/util_texture.h"
#define ccl_restrict_ptr const * __restrict
#define ccl_addr_space
#define ccl_local_id(d) 0

View File

@@ -55,6 +55,10 @@
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
#define ccl_restrict_ptr const * __restrict__
#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
/* No assert supported for CUDA */
#define kernel_assert(cond)

View File

@@ -50,6 +50,8 @@
# define ccl_addr_space
#endif
#define ccl_restrict_ptr const * __restrict__
#define ccl_local_id(d) get_local_id(d)
#define ccl_global_id(d) get_global_id(d)

View File

@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
cu = clamp(cu, -1.0f, 1.0f);
/* Compute xu. */
float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
xu = clamp(xu, x0, x1);
/* Compute yv. */
float z0sq = z0 * z0;
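The clamp added above guards against cu approaching ±1, where 1 - cu*cu underflows to zero. A small standalone illustration of the failure mode, with hypothetical values:

#include <math.h>
#include <stdio.h>

int main(void)
{
	float cu = 1.0f, z0 = -0.5f;
	/* Old expression: division by zero yields +/-inf (or NaN when the
	 * numerator is also zero), which then propagates through xu. */
	float bad = -(cu * z0) / sqrtf(1.0f - cu * cu);
	/* Fixed expression: the denominator is clamped, so the result stays
	 * finite and the later clamp to [x0, x1] is well-defined. */
	float good = -(cu * z0) / fmaxf(sqrtf(1.0f - cu * cu), 1e-7f);
	printf("%f %f\n", bad, good);
	return 0;
}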

View File

@@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
#endif /* __SPLIT_KERNEL__ */
}
#ifdef __DENOISING_FEATURES__
ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
{
kernel_write_pass_float(buffer, sample, value);
/* The online one-pass (Welford-style) variance update that's used for the megakernel can't easily
 * be implemented with atomics, so for the split kernel the sum(x^2) - (sum(x))^2 / N fallback is used. */
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+1, sample, value*value);
# else
if(sample == 0) {
kernel_write_pass_float(buffer+1, sample, 0.0f);
}
else {
float new_mean = buffer[0] * (1.0f / (sample + 1));
float old_mean = (buffer[0] - value) * (1.0f / sample);
kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
}
# endif
}
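To make the comment above concrete, here is a host-side sketch (illustrative, not kernel code) showing that both accumulation schemes recover the same M2 = sum((x - mean)^2): the megakernel keeps a Welford-style running update, while the split kernel stores sum(x^2) and derives M2 afterwards.

#include <stdio.h>

int main(void)
{
	const float x[4] = {0.2f, 0.7f, 0.4f, 0.9f};
	float sum = 0.0f, m2 = 0.0f, sum_sq = 0.0f;
	for(int n = 0; n < 4; n++) {
		sum += x[n];
		if(n > 0) {
			/* Welford update, mirroring the megakernel branch above. */
			float new_mean = sum / (n + 1);
			float old_mean = (sum - x[n]) / n;
			m2 += (x[n] - new_mean) * (x[n] - old_mean);
		}
		/* Split-kernel fallback just accumulates the squares. */
		sum_sq += x[n] * x[n];
	}
	/* Both prints agree up to rounding: 0.29. */
	printf("welford M2:  %f\n", m2);
	printf("fallback M2: %f\n", sum_sq - sum * sum / 4.0f);
	return 0;
}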
# if defined(__SPLIT_KERNEL__)
# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
# else
ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
{
buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
}
# endif
ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
{
kernel_write_pass_float3_unaligned(buffer, sample, value);
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
# else
if(sample == 0) {
kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
}
else {
float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
float3 new_mean = sum * (1.0f / (sample + 1));
float3 old_mean = (sum - value) * (1.0f / sample);
kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
}
# endif
}
ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
int sample, float path_total, float path_total_shaded)
{
if(kernel_data.film.pass_denoising_data == 0)
return;
buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
path_total = ensure_finite(path_total);
path_total_shaded = ensure_finite(path_total_shaded);
kernel_write_pass_float(buffer, sample/2, path_total);
kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
float value = path_total_shaded / max(path_total, 1e-7f);
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+2, sample/2, value*value);
# else
if(sample < 2) {
kernel_write_pass_float(buffer+2, sample/2, 0.0f);
}
else {
float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
float new_value = buffer[1] / max(buffer[0], 1e-7f);
kernel_write_pass_float(buffer+2, sample/2, (value - new_value) * (value - old_value));
}
# endif
}
#endif /* __DENOISING_FEATURES__ */
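For reference, the even/odd interleaving above fills two independent half buffers, each indexed by its own local sample count; comparing the two halves later gives the denoiser an estimate of how noisy the shadow feature is. A tiny illustration of the indexing, not kernel code:

#include <stdio.h>

int main(void)
{
	/* With 8 samples, half A accumulates samples 0,2,4,6 as its local
	 * samples 0..3 and half B accumulates 1,3,5,7 likewise. */
	for(int sample = 0; sample < 8; sample++)
		printf("sample %d -> half %c, local index %d\n",
		       sample, (sample & 1)? 'B': 'A', sample/2);
	return 0;
}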
ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
ShaderData *sd,
ccl_global PathState *state,
PathRadiance *L)
{
#ifdef __DENOISING_FEATURES__
if(state->denoising_feature_weight == 0.0f) {
return;
}
L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
float3 normal = make_float3(0.0f, 0.0f, 0.0f);
float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
for(int i = 0; i < sd->num_closure; i++) {
ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
continue;
/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
normal += sc->N * sc->sample_weight;
sum_weight += sc->sample_weight;
if(!bsdf_is_specular_like(sc)) {
albedo += sc->weight;
sum_nonspecular_weight += sc->sample_weight;
}
}
/* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
if(sum_weight != 0.0f) {
normal /= sum_weight;
}
L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
state->denoising_feature_weight = 0.0f;
}
#else
(void) kg;
(void) sd;
(void) state;
(void) L;
#endif /* __DENOISING_FEATURES__ */
}
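A worked example of the postponing rule above, with a hypothetical path:

/* Worked example (hypothetical closures):
 *
 *   bounce 0: pure glass          -> sum_nonspecular_weight == 0, nothing is
 *                                    written, denoising_feature_weight stays 1.0
 *   bounce 1: half mirror-like    -> the diffuse share (50%) exceeds 25%, so
 *             glossy, half diffuse   the normal/albedo of this surface are
 *                                    accumulated and the weight is set to 0.0
 *
 * For mirror-like paths the features therefore describe the first
 * sufficiently diffuse surface seen through the reflection, which helps the
 * filter preserve reflected detail instead of blurring it. */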
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
{
@@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
#endif
}
ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
{
if(L) {
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(is_shadow_catcher) {
L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
}
else
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, L);
}
kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __DENOISING_FEATURES__
if(kernel_data.film.pass_denoising_data) {
# ifdef __SHADOW_TRICKS__
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded));
# else
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
# endif
if(kernel_data.film.pass_denoising_clean) {
float3 noisy, clean;
path_radiance_split_denoising(kg, L, &noisy, &clean);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, noisy);
kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
sample, clean);
}
else {
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, L_sum);
}
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
sample, L->denoising_normal);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
sample, L->denoising_albedo);
kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
sample, L->denoising_depth);
}
#endif /* __DENOISING_FEATURES__ */
}
else {
kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
#ifdef __DENOISING_FEATURES__
if(kernel_data.film.pass_denoising_data) {
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, make_float3(0.0f, 0.0f, 0.0f));
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
sample, make_float3(0.0f, 0.0f, 0.0f));
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
sample, make_float3(0.0f, 0.0f, 0.0f));
kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
sample, 0.0f);
if(kernel_data.film.pass_denoising_clean) {
kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
sample, make_float3(0.0f, 0.0f, 0.0f));
}
}
#endif /* __DENOISING_FEATURES__ */
}
}
CCL_NAMESPACE_END

View File

@@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
}
else {
path_radiance_accum_total_ao(L, throughput, ao_bsdf);
path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
}
}
}
@@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
throughput /= probability;
}
kernel_update_denoising_features(kg, sd, state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
@@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
RNG *rng,
int sample,
Ray ray,
ccl_global float *buffer)
ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
RNG *rng,
int sample,
Ray ray,
ccl_global float *buffer,
PathRadiance *L,
bool *is_shadow_catcher)
{
/* initialize */
PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
path_radiance_init(&L, kernel_data.film.use_light_pass);
path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
float3 emission;
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __LAMP_MIS__ */
@@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* emission */
if(volume_segment.closure_flag & SD_EMISSION)
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
/* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &sd,
&emission_sd, throughput, &state, &L, all,
&emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect sample. if we use distance sampling and take just
@@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) {
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* indirect light bounce */
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
path_radiance_accum_background(&L, &state, throughput, L_background);
path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
/* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */
@@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(sd.flag & SD_EMISSION) {
/* todo: is isect.t wrong here for transparent surfaces? */
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
throughput /= probability;
}
kernel_update_denoising_features(kg, &sd, &state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
}
#endif /* __AO__ */
@@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(kernel_path_subsurface_scatter(kg,
&sd,
&emission_sd,
&L,
L,
&state,
rng,
&ray,
@@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __SUBSURFACE__ */
/* direct lighting */
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* compute direct lighting and next bounce */
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
break;
}
#ifdef __SUBSURFACE__
kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
kernel_path_subsurface_accum_indirect(&ss_indirect, L);
/* Trace indirect subsurface rays by restarting the loop. this uses less
* stack memory than invoking kernel_path_indirect.
@@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
&ss_indirect,
&state,
&ray,
&L,
L,
&throughput);
}
else {
@@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
}
else
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, &L);
}
kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
return 1.0f - L_transparent;
}
ccl_device void kernel_path_trace(KernelGlobals *kg,
@@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
float4 L;
PathRadiance L;
bool is_shadow_catcher;
if(ray.t != 0.0f)
L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
else
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* accumulate result in output buffer */
kernel_write_pass_float4(buffer, sample, L);
if(ray.t != 0.0f) {
float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
}
else {
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
}
path_rng_end(kg, rng_state, rng);
}

View File

@@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
}
else {
path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
}
}
}
@@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
{
float sum_sample_weight = 0.0f;
#ifdef __DENOISING_FEATURES__
if(state->denoising_feature_weight > 0.0f) {
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
/* transparency is not handled here, but in outer loop */
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
}
sum_sample_weight += sc->sample_weight;
}
}
else {
sum_sample_weight = 1.0f;
}
#endif /* __DENOISING_FEATURES__ */
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSDF(sc->type))
continue;
/* transparency is not handled here, but in outer loop */
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
}
int num_samples;
@@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
&tp,
&ps,
L,
&bsdf_ray))
&bsdf_ray,
sum_sample_weight))
{
continue;
}
@@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
RNG *rng,
int sample,
Ray ray,
ccl_global float *buffer,
PathRadiance *L,
bool *is_shadow_catcher)
{
/* initialize */
PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
path_radiance_init(&L, kernel_data.film.use_light_pass);
path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = kernel_data.integrator.sample_all_lights_direct;
kernel_branched_path_volume_connect_light(kg, rng, &sd,
&emission_sd, throughput, &state, &L, all,
&emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect light sampling */
@@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
&sd,
&tp,
&ps,
&L,
L,
&pray))
{
kernel_path_indirect(kg,
@@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp*num_samples_inv,
num_samples,
&ps,
&L);
L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(&L);
path_radiance_reset_indirect(&L);
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
}
}
}
/* emission and transmittance */
if(volume_segment.closure_flag & SD_EMISSION)
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
throughput *= volume_segment.accum_transmittance;
/* free cached steps */
@@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
path_state_branch(&ps, j, num_samples);
VolumeIntegrateResult result = kernel_volume_integrate(
kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
#ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* todo: support equiangular, MIS and all light sampling.
* alternatively get decoupled ray marching working on the GPU */
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
if(kernel_path_volume_bounce(kg,
rng,
&sd,
&tp,
&ps,
&L,
L,
&pray))
{
kernel_path_indirect(kg,
@@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp,
num_samples,
&ps,
&L);
L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(&L);
path_radiance_reset_indirect(&L);
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
}
}
#endif /* __VOLUME_SCATTER__ */
@@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
path_radiance_accum_background(&L, &state, throughput, L_background);
path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
#ifdef __EMISSION__
/* emission */
if(sd.flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
}
}
kernel_update_denoising_features(kg, &sd, &state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
}
#endif /* __AO__ */
@@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
&L, &state, rng, &ray, throughput);
L, &state, rng, &ray, throughput);
}
#endif /* __SUBSURFACE__ */
@@ -563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg, rng,
&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
}
#endif /* __EMISSION__ */
/* indirect light */
kernel_branched_path_surface_indirect_light(kg, rng,
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
/* continue in case of transparency */
throughput *= shader_bsdf_transparency(kg, &sd);
@@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __VOLUME__ */
}
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
}
else
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, &L);
}
kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
return 1.0f - L_transparent;
}
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
float4 L;
PathRadiance L;
bool is_shadow_catcher;
if(ray.t != 0.0f)
L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
else
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* accumulate result in output buffer */
kernel_write_pass_float4(buffer, sample, L);
if(ray.t != 0.0f) {
float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
}
else {
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
}
path_rng_end(kg, rng_state, rng);
}
@@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END

View File

@@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
state->transmission_bounce = 0;
state->transparent_bounce = 0;
#ifdef __DENOISING_FEATURES__
if(kernel_data.film.pass_denoising_data) {
state->flag |= PATH_RAY_STORE_SHADOW_INFO;
state->denoising_feature_weight = 1.0f;
}
else {
state->denoising_feature_weight = 0.0f;
}
#endif /* __DENOISING_FEATURES__ */
state->min_ray_pdf = FLT_MAX;
state->ray_pdf = 0.0f;
#ifdef __LAMP_MIS__
@@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
/* random number generator next bounce */
state->rng_offset += PRNG_BOUNCE_NUM;
if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
}
}
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)

View File

@@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
}
}
}
@@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
PathRadiance *L,
ccl_addr_space Ray *ray)
ccl_addr_space Ray *ray,
float sum_sample_weight)
{
/* sample BSDF */
float bsdf_pdf;
@@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
/* modify throughput */
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
#ifdef __DENOISING_FEATURES__
state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
#endif
/* modify path state */
path_state_next(kg, state, label);
@@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput, &L_light);
path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
}
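The division by (sum_sample_weight * num_samples) in kernel_branched_path_surface_bounce above keeps the total feature weight conserved across branching; a worked example with hypothetical numbers:

/* Two closures with sample_weight 3 (diffuse) and 1 (glossy), so
 * sum_sample_weight = 4. If the diffuse closure is branched into
 * num_samples = 2 rays and the glossy one into 1:
 *
 *   each diffuse ray: weight *= 3 / (4 * 2) = 0.375  (2 rays -> 0.75)
 *   the glossy ray:   weight *= 1 / (4 * 1) = 0.25   (1 ray  -> 0.25)
 *
 * The branched rays' feature weights sum to the parent's weight (1.0), so
 * the accumulated denoising features stay correctly normalized. */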

View File

@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
}
}
@@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
}
}
}

View File

@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
{
if(is_zero(dir))
return make_float2(0.0f, 0.0f);
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;

View File

@@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL)
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __DPDU__
/* dPdu/dPdv */
@@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
sd->N = Ng;
if(sd->shader & SHADER_SMOOTH_NORMAL)
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
# ifdef __DPDU__
/* dPdu/dPdv */
@@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
if(sd->type & PRIMITIVE_TRIANGLE) {
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL) {
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __INSTANCING__
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {

View File

@@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN
#define __PATCH_EVAL__
#define __SHADOW_TRICKS__
#define __DENOISING_FEATURES__
#ifdef __KERNEL_SHADING__
# define __SVM__
# define __EMISSION__
@@ -314,31 +316,32 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
PATH_RAY_CAMERA = 1,
PATH_RAY_REFLECT = 2,
PATH_RAY_TRANSMIT = 4,
PATH_RAY_DIFFUSE = 8,
PATH_RAY_GLOSSY = 16,
PATH_RAY_SINGULAR = 32,
PATH_RAY_TRANSPARENT = 64,
PATH_RAY_CAMERA = (1 << 0),
PATH_RAY_REFLECT = (1 << 1),
PATH_RAY_TRANSMIT = (1 << 2),
PATH_RAY_DIFFUSE = (1 << 3),
PATH_RAY_GLOSSY = (1 << 4),
PATH_RAY_SINGULAR = (1 << 5),
PATH_RAY_TRANSPARENT = (1 << 6),
PATH_RAY_SHADOW_OPAQUE = 128,
PATH_RAY_SHADOW_TRANSPARENT = 256,
PATH_RAY_SHADOW_OPAQUE = (1 << 7),
PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
/* Special flag to tag unaligned BVH nodes. */
PATH_RAY_NODE_UNALIGNED = 2048,
PATH_RAY_NODE_UNALIGNED = (1 << 11),
PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
PATH_RAY_MIS_SKIP = 4096,
PATH_RAY_DIFFUSE_ANCESTOR = 8192,
PATH_RAY_SINGLE_PASS_DONE = 16384,
PATH_RAY_SHADOW_CATCHER = 32768,
PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
PATH_RAY_MIS_SKIP = (1 << 12),
PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
PATH_RAY_SHADOW_CATCHER = (1 << 15),
PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
};
/* Closure Label */
@@ -394,6 +397,22 @@ typedef enum PassType {
#define PASS_ALL (~0)
typedef enum DenoisingPassOffsets {
DENOISING_PASS_NORMAL = 0,
DENOISING_PASS_NORMAL_VAR = 3,
DENOISING_PASS_ALBEDO = 6,
DENOISING_PASS_ALBEDO_VAR = 9,
DENOISING_PASS_DEPTH = 12,
DENOISING_PASS_DEPTH_VAR = 13,
DENOISING_PASS_SHADOW_A = 14,
DENOISING_PASS_SHADOW_B = 17,
DENOISING_PASS_COLOR = 20,
DENOISING_PASS_COLOR_VAR = 23,
DENOISING_PASS_SIZE_BASE = 26,
DENOISING_PASS_SIZE_CLEAN = 3,
} DenoisingPassOffsets;
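The offsets encode a fixed buffer layout: each float3 feature is followed by its 3-float variance, depth by a single variance value, and each shadow half buffer holds (total, shaded, variance). A compile-time sanity sketch of that arithmetic; the static_asserts are for illustration and are not part of the kernel:

static_assert(DENOISING_PASS_NORMAL_VAR == DENOISING_PASS_NORMAL + 3, "float3 + variance");
static_assert(DENOISING_PASS_ALBEDO     == DENOISING_PASS_NORMAL_VAR + 3, "");
static_assert(DENOISING_PASS_ALBEDO_VAR == DENOISING_PASS_ALBEDO + 3, "");
static_assert(DENOISING_PASS_DEPTH      == DENOISING_PASS_ALBEDO_VAR + 3, "");
static_assert(DENOISING_PASS_DEPTH_VAR  == DENOISING_PASS_DEPTH + 1, "scalar + variance");
static_assert(DENOISING_PASS_SHADOW_A   == DENOISING_PASS_DEPTH_VAR + 1, "");
static_assert(DENOISING_PASS_SHADOW_B   == DENOISING_PASS_SHADOW_A + 3, "total/shaded/variance");
static_assert(DENOISING_PASS_COLOR      == DENOISING_PASS_SHADOW_B + 3, "");
static_assert(DENOISING_PASS_COLOR_VAR  == DENOISING_PASS_COLOR + 3, "");
static_assert(DENOISING_PASS_SIZE_BASE  == DENOISING_PASS_COLOR_VAR + 3, "26 floats total");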
typedef enum BakePassFilter {
BAKE_FILTER_NONE = 0,
BAKE_FILTER_DIRECT = (1 << 0),
@@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos {
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
} BakePassFilterCombos;
typedef enum DenoiseFlag {
DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
} DenoiseFlag;
typedef ccl_addr_space struct PathRadiance {
#ifdef __PASSES__
int use_light_pass;
@@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance {
/* Color of the background on which shadow is alpha-overed. */
float3 shadow_color;
#endif
#ifdef __DENOISING_FEATURES__
float3 denoising_normal;
float3 denoising_albedo;
float denoising_depth;
#endif /* __DENOISING_FEATURES__ */
} PathRadiance;
typedef struct BsdfEval {
@@ -724,12 +761,13 @@ typedef struct AttributeDescriptor {
#define SHADER_CLOSURE_BASE \
float3 weight; \
ClosureType type; \
float sample_weight \
float sample_weight; \
float3 N
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
SHADER_CLOSURE_BASE;
float data[14]; /* pad to 80 bytes */
float data[10]; /* pad to 80 bytes */
} ShaderClosure;
/* Shader Context
@@ -960,6 +998,10 @@ typedef struct PathState {
int transmission_bounce;
int transparent_bounce;
#ifdef __DENOISING_FEATURES__
float denoising_feature_weight;
#endif /* __DENOISING_FEATURES__ */
/* multiple importance sampling */
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
float ray_pdf; /* last bounce pdf */
@@ -1137,6 +1179,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
int pass_denoising_data;
int pass_denoising_clean;
int denoising_flags;
int pad;
#ifdef __KERNEL_DEBUG__
int pass_bvh_traversed_nodes;
int pass_bvh_traversed_instances;

View File

@@ -0,0 +1,61 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* CPU kernel entry points */
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
* one with SSE2 intrinsics.
*/
#if defined(__x86_64__) || defined(_M_X64)
# define __KERNEL_SSE2__
#endif
/* When building kernel for native machine detect kernel features from the flags
* set by compiler.
*/
#ifdef WITH_KERNEL_NATIVE
# ifdef __SSE2__
# ifndef __KERNEL_SSE2__
# define __KERNEL_SSE2__
# endif
# endif
# ifdef __SSE3__
# define __KERNEL_SSE3__
# endif
# ifdef __SSSE3__
# define __KERNEL_SSSE3__
# endif
# ifdef __SSE4_1__
# define __KERNEL_SSE41__
# endif
# ifdef __AVX__
# define __KERNEL_SSE__
# define __KERNEL_AVX__
# endif
# ifdef __AVX2__
# define __KERNEL_SSE__
# define __KERNEL_AVX2__
# endif
#endif
/* quiet unused define warnings */
#if defined(__KERNEL_SSE2__)
/* do nothing */
#endif
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,39 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,40 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with AVX2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,132 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Templated common declaration part of all CPU kernels. */
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
TilesInfo *tiles,
int x,
int y,
float *unfilteredA,
float *unfilteredB,
float *sampleV,
float *sampleVV,
float *bufferV,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance);
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
TilesInfo *tiles,
int m_offset,
int v_offset,
int x,
int y,
float *mean,
float *variance,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance);
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
float *mean,
float *variance,
float *a,
float *b,
int* prefilter_rect,
int r);
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int x,
int y,
int storage_ofs,
float *transform,
int *rank,
int* rect,
int pass_stride,
int radius,
float pca_threshold);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weightImage,
float *variance,
float *differenceImage,
int* rect,
int w,
int channel_offset,
float a,
float k_2);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
float *outImage,
int* rect,
int w,
int f);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
float *outImage,
int* rect,
int w,
int f);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
int dy,
float *differenceImage,
float *image,
float *outImage,
float *accumImage,
int* rect,
int w,
int f);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
float *differenceImage,
float *buffer,
float *color_pass,
float *variance_pass,
float *transform,
int *rank,
float *XtWX,
float3 *XtWY,
int *rect,
int *filter_rect,
int w,
int h,
int f,
int pass_stride);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
float *accumImage,
int* rect,
int w);
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
int y,
int storage_ofs,
int w,
int h,
float *buffer,
int *rank,
float *XtWX,
float3 *XtWY,
int *buffer_params,
int sample);
#undef KERNEL_ARCH
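These declarations rely on the same per-architecture name mangling as the render kernels; the macro definitions below are assumptions for illustration (the real ones live alongside kernel_cpu.h), showing how one header yields a full set of entry points per architecture.

/* Assumed definitions, for illustration: */
#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)

/* With KERNEL_ARCH defined as cpu_sse2 before including this header,
 * KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur) declares
 * kernel_cpu_sse2_filter_nlm_blur, and the matching _impl.h compiled with
 * SSE2 flags provides its definition. */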

View File

@@ -0,0 +1,259 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Templated common implementation part of all CPU kernels.
*
* The idea is that each particular .cpp file sets the needed optimization flags and
* simply includes this file, without worrying about copying the actual implementation over.
*/
#include "kernel/kernel_compat_cpu.h"
#include "kernel/filter/filter_kernel.h"
#ifdef KERNEL_STUB
# include "util/util_debug.h"
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
#endif
CCL_NAMESPACE_BEGIN
/* Denoise filter */
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
TilesInfo *tiles,
int x,
int y,
float *unfilteredA,
float *unfilteredB,
float *sampleVariance,
float *sampleVarianceV,
float *bufferVariance,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
#else
kernel_filter_divide_shadow(sample, tiles,
x, y,
unfilteredA,
unfilteredB,
sampleVariance,
sampleVarianceV,
bufferVariance,
load_int4(prefilter_rect),
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
TilesInfo *tiles,
int m_offset,
int v_offset,
int x,
int y,
float *mean, float *variance,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
#else
kernel_filter_get_feature(sample, tiles,
m_offset, v_offset,
x, y,
mean, variance,
load_int4(prefilter_rect),
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
float *mean,
float *variance,
float *a,
float *b,
int* prefilter_rect,
int r)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
#else
kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int x,
int y,
int storage_ofs,
float *transform,
int *rank,
int* prefilter_rect,
int pass_stride,
int radius,
float pca_threshold)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
#else
rank += storage_ofs;
transform += storage_ofs*TRANSFORM_SIZE;
kernel_filter_construct_transform(buffer,
x, y,
load_int4(prefilter_rect),
pass_stride,
transform,
rank,
radius,
pca_threshold);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weightImage,
float *variance,
float *differenceImage,
int *rect,
int w,
int channel_offset,
float a,
float k_2)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
#else
kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
float *outImage,
int *rect,
int w,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
#else
kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
float *outImage,
int *rect,
int w,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
#else
kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
int dy,
float *differenceImage,
float *image,
float *outImage,
float *accumImage,
int *rect,
int w,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
#else
kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
float *differenceImage,
float *buffer,
float *color_pass,
float *variance_pass,
float *transform,
int *rank,
float *XtWX,
float3 *XtWY,
int *rect,
int *filter_rect,
int w,
int h,
int f,
int pass_stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
#else
kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
float *accumImage,
int *rect,
int w)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
#else
kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
int y,
int storage_ofs,
int w,
int h,
float *buffer,
int *rank,
float *XtWX,
float3 *XtWY,
int *buffer_params,
int sample)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_finalize);
#else
XtWX += storage_ofs*XTWX_SIZE;
XtWY += storage_ofs*XTWY_SIZE;
rank += storage_ofs;
kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
#endif
}
#undef KERNEL_STUB
#undef STUB_ASSERT
#undef KERNEL_ARCH
CCL_NAMESPACE_END

View File

@@ -0,0 +1,34 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE2
* optimization flags and nearly all functions inlined, while filter.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,36 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
* optimization flags and nearly all functions inlined, while filter.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,37 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
* optimization flags and nearly all functions inlined, while filter.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -17,21 +17,23 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,21 +18,23 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
#undef KERNEL_ARCH

View File

@@ -57,6 +57,11 @@
# include "kernel/split/kernel_buffer_update.h"
#endif
#ifdef KERNEL_STUB
# include "util/util_debug.h"
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
#endif
CCL_NAMESPACE_BEGIN
#ifndef __SPLIT_KERNEL__
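A note on the STUB_ASSERT definition above: # stringizes its argument and adjacent string literals concatenate, so the assert condition is the negation of a non-null string literal, which is always false, with the diagnostic text embedded in the assertion output. A minimal sketch of the same trick (MY_STUB_ASSERT is illustrative):

#include <cassert>

#define MY_STUB_ASSERT(arch, name) \
	assert(!(#name " kernel stub for architecture " #arch " was called!"))

void stub_demo()
{
	/* Expands to:
	 *   assert(!("path_trace" " kernel stub for architecture " "cpu_avx2" " was called!"));
	 * The concatenated literal decays to a non-null pointer, so !(...) is
	 * false and the assert always fires, printing the embedded message. */
	MY_STUB_ASSERT(cpu_avx2, path_trace);
}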
@@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
int offset,
int stride)
{
#ifdef __BRANCHED_PATH__
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, path_trace);
#else
# ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
kernel_branched_path_trace(kg,
buffer,
@@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
stride);
}
else
#endif
# endif
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
#endif /* KERNEL_STUB */
}
/* Film */
@@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
int offset,
int stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
#else
kernel_film_convert_to_byte(kg,
rgba,
buffer,
@@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
x, y,
offset,
stride);
#endif /* KERNEL_STUB */
}
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
int offset,
int stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
#else
kernel_film_convert_to_half_float(kg,
rgba,
buffer,
@@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
x, y,
offset,
stride);
#endif /* KERNEL_STUB */
}
/* Shader Evaluate */
@@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, shader);
#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
#ifdef __BAKING__
# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
output,
@@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
offset,
sample);
#endif
# endif
}
else {
kernel_shader_evaluate(kg,
@@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
sample);
}
#endif /* KERNEL_STUB */
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
#ifdef KERNEL_STUB
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
STUB_ASSERT(KERNEL_ARCH, name); \
}
#else
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
#endif /* KERNEL_STUB */
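DEFINE_SPLIT_KERNEL_FUNCTION stamps out one thin entry point per split-kernel stage, with the KERNEL_STUB variant swapping the body for an assert. Expanded by hand for one stage, assuming KERNEL_ARCH is cpu_sse2 (the exact symbol spelling comes from KERNEL_FUNCTION_FULL_NAME, so treat the name below as an approximation):

/* Stand-in declarations so this expansion sketch is self-contained. */
struct KernelGlobals;
struct KernelData;
void kernel_scene_intersect(KernelGlobals *kg);

/* Roughly what DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) produces in the
 * non-stub case: a wrapper that ignores the KernelData argument and forwards
 * to the stage implementation. */
void kernel_cpu_sse2_scene_intersect(KernelGlobals *kg, KernelData * /*data*/)
{
	kernel_scene_intersect(kg);
}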
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
@@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
{
#define REGISTER_NAME_STRING(name) #name
#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
REGISTER(path_trace);
REGISTER(convert_to_byte);
REGISTER(convert_to_half_float);
REGISTER(shader);
REGISTER(data_init);
REGISTER(path_init);
REGISTER(scene_intersect);
REGISTER(lamp_emission);
REGISTER(do_volume);
REGISTER(queue_enqueue);
REGISTER(indirect_background);
REGISTER(shader_setup);
REGISTER(shader_sort);
REGISTER(shader_eval);
REGISTER(holdout_emission_blurring_pathtermination_ao);
REGISTER(subsurface_scatter);
REGISTER(direct_lighting);
REGISTER(shadow_blocked_ao);
REGISTER(shadow_blocked_dl);
REGISTER(next_iteration_setup);
REGISTER(indirect_subsurface);
REGISTER(buffer_update);
#undef REGISTER
#undef REGISTER_EVAL_NAME
#undef REGISTER_NAME_STRING
}
#endif /* __SPLIT_KERNEL__ */
#undef KERNEL_STUB
#undef STUB_ASSERT
#undef KERNEL_ARCH
CCL_NAMESPACE_END
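The REGISTER_NAME_STRING/REGISTER_EVAL_NAME pair in register_functions above is the classic two-level stringize idiom: the extra macro level forces KERNEL_FUNCTION_FULL_NAME to expand before # applies, so the string handed to reg() is the real symbol name. Reduced to a standalone sketch with illustrative names:

#include <cstdio>

#define FULL_NAME(name) name##_cpu_avx2  /* stand-in for KERNEL_FUNCTION_FULL_NAME */
#define STR_IMPL(x) #x
#define STR(x) STR_IMPL(x)               /* extra level: x expands before # applies */

int main()
{
	/* STR_IMPL(FULL_NAME(path_trace)) would yield "FULL_NAME(path_trace)";
	 * STR(FULL_NAME(path_trace)) yields the expanded symbol name instead. */
	printf("%s\n", STR(FULL_NAME(path_trace)));  /* prints path_trace_cpu_avx2 */
	return 0;
}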

View File

@@ -17,22 +17,25 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,23 +18,25 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,19 +18,21 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse3
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,20 +18,22 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse41
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,15 +18,17 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse3
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,18 +18,20 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse41
# include "kernel/kernels/cpu//kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -0,0 +1,235 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* CUDA kernel entry points */
#ifdef __CUDA_ARCH__
#include "kernel_config.h"
#include "kernel/kernel_compat_cuda.h"
#include "kernel/filter/filter_kernel.h"
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
TilesInfo *tiles,
float *unfilteredA,
float *unfilteredB,
float *sampleVariance,
float *sampleVarianceV,
float *bufferVariance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_divide_shadow(sample,
tiles,
x, y,
unfilteredA,
unfilteredB,
sampleVariance,
sampleVarianceV,
bufferVariance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
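All of these kernels read the int4 rect arguments as (x0, y0, x1, y1): thread coordinates are offset by the rect origin and the bounds test masks out the overhang of the last block. A hypothetical host-side sizing helper under that assumption (not part of this commit; 16 stands in for CUDA_THREADS_BLOCK_WIDTH):

/* Number of thread blocks needed to cover [lo, hi) with a given block edge. */
inline int blocks_for(int lo, int hi, int block)
{
	return (hi - lo + block - 1)/block;  /* ceil((hi - lo)/block) */
}

/* Example: rect = (8, 8, 520, 520) with 16x16 blocks gives a 32x32 grid; a
 * thread computes x = rect.x + blockDim.x*blockIdx.x + threadIdx.x and simply
 * does nothing when x >= rect.z or y >= rect.w. */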
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample,
TilesInfo *tiles,
int m_offset,
int v_offset,
float *mean,
float *variance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_get_feature(sample,
tiles,
m_offset, v_offset,
x, y,
mean, variance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
float *transform, int *rank,
int4 filter_area, int4 rect,
int radius, float pca_threshold,
int pass_stride)
{
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
int *l_rank = rank + y*filter_area.z + x;
float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
x + filter_area.x, y + filter_area.y,
rect, pass_stride,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
threadIdx.y*blockDim.x + threadIdx.x);
}
}
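Note the storage-layout difference against the CPU entry point earlier in this commit: the CPU version advances by storage_ofs*TRANSFORM_SIZE, packing each pixel's transform contiguously, whereas here the base offset is the pixel index and the stride passed down is filter_area.z*filter_area.w, so consecutive elements of one pixel's transform sit num_pixels floats apart and neighbouring threads touch neighbouring addresses (coalesced access). Hypothetical index helpers contrasting the two layouts, assuming those conventions:

inline int transform_index_cpu(int pixel, int e, int transform_size)
{
	return pixel*transform_size + e;  /* array-of-structures: contiguous per pixel */
}

inline int transform_index_gpu(int pixel, int e, int num_pixels)
{
	return e*num_pixels + pixel;      /* structure-of-arrays: coalesced per element */
}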
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
float ccl_restrict_ptr weightImage,
float ccl_restrict_ptr varianceImage,
float *differenceImage,
int4 rect, int w,
int channel_offset,
float a, float k_2) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_update_output(int dx, int dy,
float ccl_restrict_ptr differenceImage,
float ccl_restrict_ptr image,
float *outImage, float *accumImage,
int4 rect, int w,
int f) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
}
}
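Taken together, the five nlm_* kernels above make up one non-local-means pass: for each offset (dx, dy) in the search window, per-pixel differences are computed, blurred into patchwise distances, mapped to weights, and accumulated into the output, which a single normalize then divides by the accumulated weight sum. A hypothetical host-side driver sketch; the launch_* wrappers and the exact blur/weight interleaving are assumptions, since the sequencing lives outside this file:

struct Rect { int x0, y0, x1, y1; };

/* Assumed stand-ins for the real kernel enqueues (declarations only). */
void launch_nlm_calc_difference(int dx, int dy, const float *image, const float *variance,
                                float *difference, Rect rect, int w, int ch_off, float a, float k_2);
void launch_nlm_blur(const float *in, float *out, Rect rect, int w, int f);
void launch_nlm_calc_weight(const float *in, float *out, Rect rect, int w, int f);
void launch_nlm_update_output(int dx, int dy, const float *weight, const float *image,
                              float *out, float *accum, Rect rect, int w, int f);
void launch_nlm_normalize(float *out, const float *accum, Rect rect, int w);

void nlm_pass(int r, int f, float a, float k_2, int w, Rect rect,
              float *image, float *variance, float *difference, float *blurred,
              float *out, float *accum)
{
	for(int dy = -r; dy <= r; dy++) {
		for(int dx = -r; dx <= r; dx++) {
			launch_nlm_calc_difference(dx, dy, image, variance, difference, rect, w, 0, a, k_2);
			launch_nlm_blur(difference, blurred, rect, w, f);      /* patchwise distances */
			launch_nlm_calc_weight(blurred, difference, rect, w, f);
			launch_nlm_blur(difference, blurred, rect, w, f);      /* smooth the weights */
			launch_nlm_update_output(dx, dy, blurred, image, out, accum, rect, w, f);
		}
	}
	launch_nlm_normalize(out, accum, rect, w);  /* out /= accum, per pixel */
}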
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
float ccl_restrict_ptr differenceImage,
float ccl_restrict_ptr buffer,
float *color_pass,
float *variance_pass,
float const* __restrict__ transform,
int *rank,
float *XtWX,
float3 *XtWY,
int4 rect,
int4 filter_rect,
int w, int h, int f,
int pass_stride) {
int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
kernel_filter_nlm_construct_gramian(x, y,
dx, dy,
differenceImage,
buffer,
color_pass, variance_pass,
transform, rank,
XtWX, XtWY,
rect, filter_rect,
w, h, f,
pass_stride,
threadIdx.y*blockDim.x + threadIdx.x);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_finalize(int w, int h,
float *buffer, int *rank,
float *XtWX, float3 *XtWY,
int4 filter_area, int4 buffer_params,
int sample) {
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
int storage_ofs = y*filter_area.z+x;
rank += storage_ofs;
XtWX += storage_ofs;
XtWY += storage_ofs;
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
}
}
#endif
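What finalize computes per pixel: construct_gramian has accumulated the weighted normal equations of a local least-squares fit of the noisy color against the denoising features (rank carrying the PCA-reduced feature count from construct_transform), and finalize solves them. Loosely, with X the feature design matrix, W the diagonal matrix of NLM weights and Y the noisy colors:

    X^T W X \, \alpha = X^T W Y, \qquad \hat{c}(x, y) = \alpha_0

where the constant coefficient \alpha_0 of the fit is written back to the render buffer as the denoised color.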

View File

@@ -0,0 +1,262 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* OpenCL kernel entry points */
#include "kernel/kernel_compat_opencl.h"
#include "kernel/filter/filter_kernel.h"
/* kernels */
__kernel void kernel_ocl_filter_divide_shadow(int sample,
ccl_global TilesInfo *tiles,
ccl_global float *unfilteredA,
ccl_global float *unfilteredB,
ccl_global float *sampleVariance,
ccl_global float *sampleVarianceV,
ccl_global float *bufferVariance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
char use_split_variance)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_divide_shadow(sample,
tiles,
x, y,
unfilteredA,
unfilteredB,
sampleVariance,
sampleVarianceV,
bufferVariance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
__kernel void kernel_ocl_filter_get_feature(int sample,
ccl_global TilesInfo *tiles,
int m_offset,
int v_offset,
ccl_global float *mean,
ccl_global float *variance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
char use_split_variance)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_get_feature(sample,
tiles,
m_offset, v_offset,
x, y,
mean, variance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
ccl_global float *variance,
ccl_global float *a,
ccl_global float *b,
int4 prefilter_rect,
int r)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
}
}
__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
ccl_global float *transform,
ccl_global int *rank,
int4 filter_area,
int4 rect,
int pass_stride,
int radius,
float pca_threshold)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < filter_area.z && y < filter_area.w) {
ccl_global int *l_rank = rank + y*filter_area.z + x;
ccl_global float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
x + filter_area.x, y + filter_area.y,
rect, pass_stride,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
get_local_id(1)*get_local_size(0) + get_local_id(0));
}
}
__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
int dy,
ccl_global float ccl_restrict_ptr weightImage,
ccl_global float ccl_restrict_ptr varianceImage,
ccl_global float *differenceImage,
int4 rect,
int w,
int channel_offset,
float a,
float k_2) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
}
}
__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect,
int w,
int f) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
}
}
__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect,
int w,
int f) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
}
}
__kernel void kernel_ocl_filter_nlm_update_output(int dx,
int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr image,
ccl_global float *outImage,
ccl_global float *accumImage,
int4 rect,
int w,
int f) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
}
}
__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
ccl_global float ccl_restrict_ptr accumImage,
int4 rect,
int w) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
}
}
__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr buffer,
ccl_global float *color_pass,
ccl_global float *variance_pass,
ccl_global float ccl_restrict_ptr transform,
ccl_global int *rank,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 rect,
int4 filter_rect,
int w,
int h,
int f,
int pass_stride) {
int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
kernel_filter_nlm_construct_gramian(x, y,
dx, dy,
differenceImage,
buffer,
color_pass, variance_pass,
transform, rank,
XtWX, XtWY,
rect, filter_rect,
w, h, f,
pass_stride,
get_local_id(1)*get_local_size(0) + get_local_id(0));
}
}
__kernel void kernel_ocl_filter_finalize(int w,
int h,
ccl_global float *buffer,
ccl_global int *rank,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 filter_area,
int4 buffer_params,
int sample) {
int x = get_global_id(0);
int y = get_global_id(1);
if(x < filter_area.z && y < filter_area.w) {
int storage_ofs = y*filter_area.z+x;
rank += storage_ofs;
XtWX += storage_ofs;
XtWY += storage_ofs;
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
}
}
__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
ccl_global float *buffer_1,
ccl_global float *buffer_2,
ccl_global float *buffer_3,
ccl_global float *buffer_4,
ccl_global float *buffer_5,
ccl_global float *buffer_6,
ccl_global float *buffer_7,
ccl_global float *buffer_8,
ccl_global float *buffer_9)
{
if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
tiles->buffers[0] = buffer_1;
tiles->buffers[1] = buffer_2;
tiles->buffers[2] = buffer_3;
tiles->buffers[3] = buffer_4;
tiles->buffers[4] = buffer_5;
tiles->buffers[5] = buffer_6;
tiles->buffers[6] = buffer_7;
tiles->buffers[7] = buffer_8;
tiles->buffers[8] = buffer_9;
}
}
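kernel_ocl_filter_set_tiles is OpenCL-specific plumbing: the host cannot portably write device pointers into a device-side struct, so a single work item copies the nine buffer arguments into tiles->buffers on the device. Nine buffers suggests the 3x3 neighbourhood of render tiles around the one being denoised; a sketch of the indexing that would imply (the row-major ordering is an assumption, not shown in this file):

/* Hypothetical helper: map a neighbour offset in {-1, 0, 1}^2 to buffers[0..8],
 * assuming row-major 3x3 ordering with the centre tile at index 4. */
inline int tile_buffer_index(int ox, int oy)
{
	return (oy + 1)*3 + (ox + 1);  /* tile_buffer_index(0, 0) == 4 */
}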

View File

@@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
float3 throughput = branched_state->throughput;
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
float sum_sample_weight = 0.0f;
#ifdef __DENOISING_FEATURES__
if(ps->denoising_feature_weight > 0.0f) {
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
/* transparency is not handled here, but in outer loop */
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
}
sum_sample_weight += sc->sample_weight;
}
}
else {
sum_sample_weight = 1.0f;
}
#endif /* __DENOISING_FEATURES__ */
for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
@@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG bsdf_rng = cmj_hash(rng, i);
for(int j = branched_state->next_sample; j < num_samples; j++) {
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
if(reset_path_state) {
*ps = branched_state->path_state;
}
@@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
tp,
ps,
L,
bsdf_ray))
bsdf_ray,
sum_sample_weight))
{
continue;
}

View File

@@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(state->flag & PATH_RAY_SHADOW_CATCHER) {
L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
}
else
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, L);
}
kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
#endif
float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
/* accumulate result in output buffer */
kernel_write_pass_float4(buffer, sample, L_rad);
bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
path_rng_end(kg, rng_state, rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);

View File

@@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __SHADOW_TRICKS__
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state->flag & PATH_RAY_CAMERA) {
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state->catcher_object = sd->object;
if(!kernel_data.background.transparent) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_state.throughput[ray_index] = throughput/probability;
}
}
kernel_update_denoising_features(kg, sd, state, L);
}
}

View File

@@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
&shadow))
{
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput, &L_light);
path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}

View File

@@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
bsdf->N = N;
sd->flag |= bsdf_transparent_setup(bsdf);
}
break;
@@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
bsdf->N = N;
/* todo: giving a fixed weight here will cause issues when
* mixing multiple BSDFS. energy will not be conserved and
* the throughput can blow up after multiple bounces. we

View File

@@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
strength = max(strength, 0.0f);
/* compute and output perturbed normal */
float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
if(is_zero(normal_out)) {
normal_out = normal_in;
}
else {
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
}
if(use_object_space) {
object_normal_transform(kg, sd, &normal_out);

View File

@@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
#ifdef __UV__
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
#endif
default: data = make_float3(0.0f, 0.0f, 0.0f);
}
stack_store_float3(stack, out_offset, data);

View File

@@ -317,8 +317,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
float3 co = stack_load_float3(stack, co_offset);
float2 uv;
co = normalize(co);
co = safe_normalize(co);
if(projection == 0)
uv = direction_to_equirectangular(co);
else

View File

@@ -402,7 +402,6 @@ typedef enum ClosureType {
CLOSURE_BSDF_DIFFUSE_TOON_ID,
/* Glossy */
CLOSURE_BSDF_GLOSSY_ID,
CLOSURE_BSDF_REFLECTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_ID,
CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
@@ -423,14 +422,13 @@ typedef enum ClosureType {
CLOSURE_BSDF_HAIR_REFLECTION_ID,
/* Transmission */
CLOSURE_BSDF_TRANSMISSION_ID,
CLOSURE_BSDF_TRANSLUCENT_ID,
CLOSURE_BSDF_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
CLOSURE_BSDF_SHARP_GLASS_ID,
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
@@ -465,13 +463,16 @@ typedef enum ClosureType {
/* watch this, being lazy with memory usage */
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
(type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
@@ -480,7 +481,7 @@ typedef enum ClosureType {
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
#define CLOSURE_WEIGHT_CUTOFF 1e-5f
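The CLOSURE_IS_* macros are plain value-range checks, which is why this hunk must reorder the enum (moving CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID down and re-anchoring the glossy and transmission ranges at CLOSURE_BSDF_REFLECTION_ID and CLOSURE_BSDF_TRANSLUCENT_ID) whenever the closure set changes. A compile-time guard in the same spirit, illustrative and not part of the commit:

/* Stand-in enum and range macro demonstrating the ordering dependency. */
enum DemoClosureType {
	DEMO_BSDF_REFLECTION_ID,
	DEMO_BSDF_MICROFACET_GGX_ID,
	DEMO_BSDF_HAIR_REFLECTION_ID,  /* last glossy entry */
	DEMO_BSDF_TRANSLUCENT_ID,
};

#define DEMO_IS_GLOSSY(type) \
	((type) >= DEMO_BSDF_REFLECTION_ID && (type) <= DEMO_BSDF_HAIR_REFLECTION_ID)

static_assert(DEMO_BSDF_REFLECTION_ID < DEMO_BSDF_HAIR_REFLECTION_ID,
              "glossy range macro relies on enum declaration order");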