Cycles: Implement record-all transparent shadow function for GPU
The idea is to record all possible transparent intersections when shooting a transparent ray on the GPU (similar to what we were already doing on the CPU). This avoids the need to perform whole ray-to-scene intersection queries for each intersection, and greatly speeds up cases like transparent hair, at the cost of extra memory. This commit is a base ground for now, and the feature is kept disabled until some further tweaks are done.
This commit is contained in:
@@ -357,7 +357,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
|
#if defined(__SHADOW_RECORD_ALL__) || defined(__VOLUME_RECORD_ALL__)
|
||||||
/* ToDo: Move to another file? */
|
/* ToDo: Move to another file? */
|
||||||
ccl_device int intersections_compare(const void *a, const void *b)
|
ccl_device int intersections_compare(const void *a, const void *b)
|
||||||
{
|
{
|
||||||
@@ -371,7 +371,25 @@ ccl_device int intersections_compare(const void *a, const void *b)
|
|||||||
else
|
else
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
|
||||||
|
{
|
||||||
|
#ifdef __KERNEL_GPU__
|
||||||
|
/* Use bubble sort which has more friendly memory pattern on GPU. */
|
||||||
|
int i, j;
|
||||||
|
for(i = 0; i < num_hits; ++i) {
|
||||||
|
for(j = 0; j < num_hits - 1; ++j) {
|
||||||
|
if(hits[j].t < hits[j + 1].t) {
|
||||||
|
Intersection tmp = hits[j];
|
||||||
|
hits[j] = hits[j + 1];
|
||||||
|
hits[j + 1] = tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
|
@@ -34,10 +34,8 @@ ccl_device_inline bool shadow_handle_transparent_isect(KernelGlobals *kg,
|
|||||||
kernel_volume_shadow(kg, shadow_sd, state, &segment_ray, throughput);
|
kernel_volume_shadow(kg, shadow_sd, state, &segment_ray, throughput);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Setup shader data at surface. */
|
/* Setup shader data at surface. */
|
||||||
shader_setup_from_ray(kg, shadow_sd, isect, ray);
|
shader_setup_from_ray(kg, shadow_sd, isect, ray);
|
||||||
|
|
||||||
/* Attenuation from transparent surface. */
|
/* Attenuation from transparent surface. */
|
||||||
if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
|
if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
|
||||||
path_state_modify_bounce(state, true);
|
path_state_modify_bounce(state, true);
|
||||||
@@ -51,42 +49,19 @@ ccl_device_inline bool shadow_handle_transparent_isect(KernelGlobals *kg,
|
|||||||
path_state_modify_bounce(state, false);
|
path_state_modify_bounce(state, false);
|
||||||
*throughput *= shader_bsdf_transparency(kg, shadow_sd);
|
*throughput *= shader_bsdf_transparency(kg, shadow_sd);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Stop if all light is blocked. */
|
/* Stop if all light is blocked. */
|
||||||
if(is_zero(*throughput)) {
|
if(is_zero(*throughput)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __VOLUME__
|
#ifdef __VOLUME__
|
||||||
/* Exit/enter volume. */
|
/* Exit/enter volume. */
|
||||||
kernel_volume_stack_enter_exit(kg, shadow_sd, state->volume_stack);
|
kernel_volume_stack_enter_exit(kg, shadow_sd, state->volume_stack);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __SHADOW_RECORD_ALL__
|
#ifdef __SHADOW_RECORD_ALL__
|
||||||
|
/* Shadow function to compute how much light is blocked,
|
||||||
ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
|
|
||||||
{
|
|
||||||
#ifdef __KERNEL_GPU__
|
|
||||||
/* Use bubble sort which has more friendly memory pattern on GPU. */
|
|
||||||
int i, j;
|
|
||||||
for(i = 0; i < num_hits; ++i) {
|
|
||||||
for(j = 0; j < num_hits - 1; ++j) {
|
|
||||||
if(hits[j].t < hits[j + 1].t) {
|
|
||||||
Intersection tmp = hits[j];
|
|
||||||
hits[j] = hits[j + 1];
|
|
||||||
hits[j + 1] = tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Shadow function to compute how much light is blocked, CPU variation.
|
|
||||||
*
|
*
|
||||||
* We trace a single ray. If it hits any opaque surface, or more than a given
|
* We trace a single ray. If it hits any opaque surface, or more than a given
|
||||||
* number of transparent surfaces is hit, then we consider the geometry to be
|
* number of transparent surfaces is hit, then we consider the geometry to be
|
||||||
@@ -104,12 +79,20 @@ ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
|
|||||||
* or there is a performance increase anyway due to avoiding the need to send
|
* or there is a performance increase anyway due to avoiding the need to send
|
||||||
* two rays with transparent shadows.
|
* two rays with transparent shadows.
|
||||||
*
|
*
|
||||||
* This is CPU only because of qsort, and malloc or high stack space usage to
|
* On CPU it'll handle all transparent bounces (by allocating storage for
|
||||||
* record all these intersections. */
|
* intersections when they don't fit into the stack storage).
|
||||||
|
*
|
||||||
|
* On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
|
||||||
|
* is something to be kept an eye on.
|
||||||
|
*/
|
||||||
|
|
||||||
#define STACK_MAX_HITS 64
|
#define SHADOW_STACK_MAX_HITS 64
|
||||||
|
|
||||||
ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow)
|
ccl_device_inline bool shadow_blocked_all(KernelGlobals *kg,
|
||||||
|
ShaderData *shadow_sd,
|
||||||
|
PathState *state,
|
||||||
|
Ray *ray,
|
||||||
|
float3 *shadow)
|
||||||
{
|
{
|
||||||
*shadow = make_float3(1.0f, 1.0f, 1.0f);
|
*shadow = make_float3(1.0f, 1.0f, 1.0f);
|
||||||
if(ray->t == 0.0f) {
|
if(ray->t == 0.0f) {
|
||||||
@@ -126,7 +109,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
|
|||||||
/* Intersect to find an opaque surface, or record all transparent
|
/* Intersect to find an opaque surface, or record all transparent
|
||||||
* surface hits.
|
* surface hits.
|
||||||
*/
|
*/
|
||||||
Intersection hits_stack[STACK_MAX_HITS];
|
Intersection hits_stack[SHADOW_STACK_MAX_HITS];
|
||||||
Intersection *hits = hits_stack;
|
Intersection *hits = hits_stack;
|
||||||
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
|
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
|
||||||
uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
|
uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
|
||||||
@@ -138,7 +121,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
|
|||||||
*
|
*
|
||||||
* Ignore this on GPU because of slow/unavailable malloc().
|
* Ignore this on GPU because of slow/unavailable malloc().
|
||||||
*/
|
*/
|
||||||
if(max_hits + 1 > STACK_MAX_HITS) {
|
if(max_hits + 1 > SHADOW_STACK_MAX_HITS) {
|
||||||
if(kg->transparent_shadow_intersections == NULL) {
|
if(kg->transparent_shadow_intersections == NULL) {
|
||||||
kg->transparent_shadow_intersections =
|
kg->transparent_shadow_intersections =
|
||||||
(Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
|
(Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
|
||||||
@@ -211,30 +194,27 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
|
|||||||
#endif
|
#endif
|
||||||
return blocked;
|
return blocked;
|
||||||
}
|
}
|
||||||
|
#endif /* __SHADOW_RECORD_ALL__ */
|
||||||
|
|
||||||
#undef STACK_MAX_HITS
|
#ifndef __KERNEL_CPU__
|
||||||
|
/* Shadow function to compute how much light is blocked,
|
||||||
#else
|
|
||||||
|
|
||||||
/* Shadow function to compute how much light is blocked, GPU variation.
|
|
||||||
*
|
*
|
||||||
* Here we raytrace from one transparent surface to the next step by step.
|
* Here we raytrace from one transparent surface to the next step by step.
|
||||||
* To minimize overhead in cases where we don't need transparent shadows, we
|
* To minimize overhead in cases where we don't need transparent shadows, we
|
||||||
* first trace a regular shadow ray. We check if the hit primitive was
|
* first trace a regular shadow ray. We check if the hit primitive was
|
||||||
* potentially transparent, and only in that case start marching. this gives
|
* potentially transparent, and only in that case start marching. this gives
|
||||||
* one extra ray cast for the cases were we do want transparency. */
|
* one extra ray cast for the cases were we do want transparency.
|
||||||
|
*/
|
||||||
ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
|
ccl_device_noinline bool shadow_blocked_stepped(KernelGlobals *kg,
|
||||||
ShaderData *shadow_sd,
|
ShaderData *shadow_sd,
|
||||||
ccl_addr_space PathState *state,
|
ccl_addr_space PathState *state,
|
||||||
ccl_addr_space Ray *ray_input,
|
ccl_addr_space Ray *ray_input,
|
||||||
float3 *shadow)
|
float3 *shadow)
|
||||||
{
|
{
|
||||||
*shadow = make_float3(1.0f, 1.0f, 1.0f);
|
*shadow = make_float3(1.0f, 1.0f, 1.0f);
|
||||||
|
if(ray_input->t == 0.0f) {
|
||||||
if(ray_input->t == 0.0f)
|
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
#ifdef __SPLIT_KERNEL__
|
#ifdef __SPLIT_KERNEL__
|
||||||
Ray private_ray = *ray_input;
|
Ray private_ray = *ray_input;
|
||||||
Ray *ray = &private_ray;
|
Ray *ray = &private_ray;
|
||||||
@@ -313,10 +293,32 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
|
|||||||
}
|
}
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return blocked;
|
return blocked;
|
||||||
}
|
}
|
||||||
|
#endif /* __KERNEL_CPU__ */
|
||||||
|
|
||||||
|
ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
|
||||||
|
ShaderData *shadow_sd,
|
||||||
|
PathState *state,
|
||||||
|
Ray *ray,
|
||||||
|
float3 *shadow)
|
||||||
|
{
|
||||||
|
#ifdef __SHADOW_RECORD_ALL__
|
||||||
|
# ifdef __KERNEL_CPU__
|
||||||
|
return shadow_blocked_all(kg, shadow_sd, state, ray, shadow);
|
||||||
|
# else
|
||||||
|
const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
|
||||||
|
const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
|
||||||
|
if(max_hits + 1 < SHADOW_STACK_MAX_HITS) {
|
||||||
|
return shadow_blocked_all(kg, shadow_sd, state, ray, shadow);
|
||||||
|
}
|
||||||
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
#ifndef __KERNEL_CPU__
|
||||||
|
return shadow_blocked_stepped(kg, shadow_sd, state, ray, shadow);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef SHADOW_STACK_MAX_HITS
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
Reference in New Issue
Block a user