Cycles: inline more functions on the GPU
This makes little difference for CUDA and OpenCL, but will be helpful for Optix.
This commit is contained in:

committed by
Brecht Van Lommel

parent
2b999c6a68
commit
f491c23f1e
@@ -58,6 +58,7 @@ __device__ half __float2half(const float f)
|
||||
# define ccl_device_forceinline __device__ __forceinline__
|
||||
#endif
|
||||
#define ccl_device_noinline __device__ __noinline__
|
||||
#define ccl_device_noinline_cpu ccl_device
|
||||
#define ccl_global
|
||||
#define ccl_static_constant __constant__
|
||||
#define ccl_constant const
|
||||
|
@@ -35,6 +35,7 @@
|
||||
#define ccl_device_inline ccl_device
|
||||
#define ccl_device_forceinline ccl_device
|
||||
#define ccl_device_noinline ccl_device ccl_noinline
|
||||
#define ccl_device_noinline_cpu ccl_device
|
||||
#define ccl_may_alias
|
||||
#define ccl_static_constant static __constant
|
||||
#define ccl_constant __constant
|
||||
|
@@ -17,14 +17,14 @@
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Direction Emission */
|
||||
ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
|
||||
ShaderData *emission_sd,
|
||||
LightSample *ls,
|
||||
ccl_addr_space PathState *state,
|
||||
float3 I,
|
||||
differential3 dI,
|
||||
float t,
|
||||
float time)
|
||||
ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
|
||||
ShaderData *emission_sd,
|
||||
LightSample *ls,
|
||||
ccl_addr_space PathState *state,
|
||||
float3 I,
|
||||
differential3 dI,
|
||||
float t,
|
||||
float time)
|
||||
{
|
||||
/* setup shading at emitter */
|
||||
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
|
||||
@@ -98,15 +98,15 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
|
||||
return eval;
|
||||
}
|
||||
|
||||
ccl_device_noinline bool direct_emission(KernelGlobals *kg,
|
||||
ShaderData *sd,
|
||||
ShaderData *emission_sd,
|
||||
LightSample *ls,
|
||||
ccl_addr_space PathState *state,
|
||||
Ray *ray,
|
||||
BsdfEval *eval,
|
||||
bool *is_lamp,
|
||||
float rand_terminate)
|
||||
ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
|
||||
ShaderData *sd,
|
||||
ShaderData *emission_sd,
|
||||
LightSample *ls,
|
||||
ccl_addr_space PathState *state,
|
||||
Ray *ray,
|
||||
BsdfEval *eval,
|
||||
bool *is_lamp,
|
||||
float rand_terminate)
|
||||
{
|
||||
if (ls->pdf == 0.0f)
|
||||
return false;
|
||||
@@ -208,7 +208,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
|
||||
|
||||
/* Indirect Primitive Emission */
|
||||
|
||||
ccl_device_noinline float3 indirect_primitive_emission(
|
||||
ccl_device_noinline_cpu float3 indirect_primitive_emission(
|
||||
KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
|
||||
{
|
||||
/* evaluate emissive closure */
|
||||
@@ -234,11 +234,11 @@ ccl_device_noinline float3 indirect_primitive_emission(
|
||||
|
||||
/* Indirect Lamp Emission */
|
||||
|
||||
ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
|
||||
ShaderData *emission_sd,
|
||||
ccl_addr_space PathState *state,
|
||||
Ray *ray,
|
||||
float3 *emission)
|
||||
ccl_device_noinline_cpu bool indirect_lamp_emission(KernelGlobals *kg,
|
||||
ShaderData *emission_sd,
|
||||
ccl_addr_space PathState *state,
|
||||
Ray *ray,
|
||||
float3 *emission)
|
||||
{
|
||||
bool hit_lamp = false;
|
||||
|
||||
@@ -293,10 +293,10 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
|
||||
|
||||
/* Indirect Background */
|
||||
|
||||
ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
|
||||
ShaderData *emission_sd,
|
||||
ccl_addr_space PathState *state,
|
||||
ccl_addr_space Ray *ray)
|
||||
ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg,
|
||||
ShaderData *emission_sd,
|
||||
ccl_addr_space PathState *state,
|
||||
ccl_addr_space Ray *ray)
|
||||
{
|
||||
#ifdef __BACKGROUND__
|
||||
int shader = kernel_data.background.surface_shader;
|
||||
|
@@ -182,17 +182,7 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3
|
||||
|
||||
#ifdef __BACKGROUND_MIS__
|
||||
|
||||
/* TODO(sergey): In theory it should be all fine to use noinline for all
|
||||
* devices, but we're so close to the release so better not screw things
|
||||
* up for CPU at least.
|
||||
*/
|
||||
# ifdef __KERNEL_GPU__
|
||||
ccl_device_noinline
|
||||
# else
|
||||
ccl_device
|
||||
# endif
|
||||
float3
|
||||
background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
|
||||
ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
|
||||
{
|
||||
/* for the following, the CDF values are actually a pair of floats, with the
|
||||
* function value as X and the actual CDF as Y. The last entry's function
|
||||
@@ -274,13 +264,7 @@ ccl_device
|
||||
/* TODO(sergey): Same as above, after the release we should consider using
|
||||
* 'noinline' for all devices.
|
||||
*/
|
||||
# ifdef __KERNEL_GPU__
|
||||
ccl_device_noinline
|
||||
# else
|
||||
ccl_device
|
||||
# endif
|
||||
float
|
||||
background_map_pdf(KernelGlobals *kg, float3 direction)
|
||||
ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
|
||||
{
|
||||
float2 uv = direction_to_equirectangular(direction);
|
||||
int res_x = kernel_data.integrator.pdf_background_res_x;
|
||||
|
@@ -198,14 +198,14 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
|
||||
# endif /* __VOLUME__ */
|
||||
|
||||
/* bounce off surface and integrate indirect light */
|
||||
ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
|
||||
ShaderData *sd,
|
||||
ShaderData *indirect_sd,
|
||||
ShaderData *emission_sd,
|
||||
float3 throughput,
|
||||
float num_samples_adjust,
|
||||
PathState *state,
|
||||
PathRadiance *L)
|
||||
ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
|
||||
ShaderData *sd,
|
||||
ShaderData *indirect_sd,
|
||||
ShaderData *emission_sd,
|
||||
float3 throughput,
|
||||
float num_samples_adjust,
|
||||
PathState *state,
|
||||
PathRadiance *L)
|
||||
{
|
||||
float sum_sample_weight = 0.0f;
|
||||
# ifdef __DENOISING_FEATURES__
|
||||
|
@@ -20,7 +20,7 @@ CCL_NAMESPACE_BEGIN
|
||||
defined(__BAKING__)
|
||||
/* branched path tracing: connect path directly to position on one or more lights and add it to L
|
||||
*/
|
||||
ccl_device_noinline void kernel_branched_path_surface_connect_light(
|
||||
ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light(
|
||||
KernelGlobals *kg,
|
||||
ShaderData *sd,
|
||||
ShaderData *emission_sd,
|
||||
|
@@ -57,18 +57,12 @@ ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg,
|
||||
# endif /* __EMISSION__ */
|
||||
}
|
||||
|
||||
# ifdef __KERNEL_GPU__
|
||||
ccl_device_noinline
|
||||
# else
|
||||
ccl_device
|
||||
# endif
|
||||
bool
|
||||
kernel_path_volume_bounce(KernelGlobals *kg,
|
||||
ShaderData *sd,
|
||||
ccl_addr_space float3 *throughput,
|
||||
ccl_addr_space PathState *state,
|
||||
PathRadianceState *L_state,
|
||||
ccl_addr_space Ray *ray)
|
||||
ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg,
|
||||
ShaderData *sd,
|
||||
ccl_addr_space float3 *throughput,
|
||||
ccl_addr_space PathState *state,
|
||||
PathRadianceState *L_state,
|
||||
ccl_addr_space Ray *ray)
|
||||
{
|
||||
/* sample phase function */
|
||||
float phase_pdf;
|
||||
|
@@ -672,7 +672,7 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
|
||||
* ray, with the assumption that there are no surfaces blocking light
|
||||
* between the endpoints. distance sampling is used to decide if we will
|
||||
* scatter or not. */
|
||||
ccl_device_noinline VolumeIntegrateResult
|
||||
ccl_device_noinline_cpu VolumeIntegrateResult
|
||||
kernel_volume_integrate(KernelGlobals *kg,
|
||||
ccl_addr_space PathState *state,
|
||||
ShaderData *sd,
|
||||
|
@@ -80,13 +80,7 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef __KERNEL_CUDA__
|
||||
ccl_device
|
||||
#else
|
||||
ccl_device_noinline
|
||||
#endif
|
||||
void
|
||||
svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
|
||||
ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
|
||||
{
|
||||
NodeAttributeType type = NODE_ATTR_FLOAT;
|
||||
uint out_offset = 0;
|
||||
@@ -125,13 +119,7 @@ ccl_device_noinline
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef __KERNEL_CUDA__
|
||||
ccl_device
|
||||
#else
|
||||
ccl_device_noinline
|
||||
#endif
|
||||
void
|
||||
svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
|
||||
ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
|
||||
{
|
||||
NodeAttributeType type = NODE_ATTR_FLOAT;
|
||||
uint out_offset = 0;
|
||||
|
@@ -27,16 +27,16 @@ ccl_device_inline float brick_noise(uint n) /* fast integer noise */
|
||||
return 0.5f * ((float)nn / 1073741824.0f);
|
||||
}
|
||||
|
||||
ccl_device_noinline float2 svm_brick(float3 p,
|
||||
float mortar_size,
|
||||
float mortar_smooth,
|
||||
float bias,
|
||||
float brick_width,
|
||||
float row_height,
|
||||
float offset_amount,
|
||||
int offset_frequency,
|
||||
float squash_amount,
|
||||
int squash_frequency)
|
||||
ccl_device_noinline_cpu float2 svm_brick(float3 p,
|
||||
float mortar_size,
|
||||
float mortar_smooth,
|
||||
float bias,
|
||||
float brick_width,
|
||||
float row_height,
|
||||
float offset_amount,
|
||||
int offset_frequency,
|
||||
float squash_amount,
|
||||
int squash_frequency)
|
||||
{
|
||||
int bricknum, rownum;
|
||||
float offset = 0.0f;
|
||||
|
@@ -264,7 +264,7 @@ ccl_device float3 svm_mix_clamp(float3 col)
|
||||
return outcol;
|
||||
}
|
||||
|
||||
ccl_device_noinline float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
|
||||
ccl_device_noinline_cpu float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
|
||||
{
|
||||
float t = saturate(fac);
|
||||
|
||||
|
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Magic */
|
||||
|
||||
ccl_device_noinline float3 svm_magic(float3 p, int n, float distortion)
|
||||
ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion)
|
||||
{
|
||||
float x = sinf((p.x + p.y + p.z) * 5.0f);
|
||||
float y = cosf((-p.x + p.y - p.z) * 5.0f);
|
||||
|
@@ -25,7 +25,10 @@ CCL_NAMESPACE_BEGIN
|
||||
* from "Texturing and Modelling: A procedural approach"
|
||||
*/
|
||||
|
||||
ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity, float octaves)
|
||||
ccl_device_noinline_cpu float noise_musgrave_fBm(float3 p,
|
||||
float H,
|
||||
float lacunarity,
|
||||
float octaves)
|
||||
{
|
||||
float rmd;
|
||||
float value = 0.0f;
|
||||
@@ -53,10 +56,10 @@ ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity
|
||||
* octaves: number of frequencies in the fBm
|
||||
*/
|
||||
|
||||
ccl_device_noinline float noise_musgrave_multi_fractal(float3 p,
|
||||
float H,
|
||||
float lacunarity,
|
||||
float octaves)
|
||||
ccl_device_noinline_cpu float noise_musgrave_multi_fractal(float3 p,
|
||||
float H,
|
||||
float lacunarity,
|
||||
float octaves)
|
||||
{
|
||||
float rmd;
|
||||
float value = 1.0f;
|
||||
@@ -85,7 +88,7 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p,
|
||||
* offset: raises the terrain from `sea level'
|
||||
*/
|
||||
|
||||
ccl_device_noinline float noise_musgrave_hetero_terrain(
|
||||
ccl_device_noinline_cpu float noise_musgrave_hetero_terrain(
|
||||
float3 p, float H, float lacunarity, float octaves, float offset)
|
||||
{
|
||||
float value, increment, rmd;
|
||||
@@ -121,7 +124,7 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(
|
||||
* offset: raises the terrain from `sea level'
|
||||
*/
|
||||
|
||||
ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
|
||||
ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal(
|
||||
float3 p, float H, float lacunarity, float octaves, float offset, float gain)
|
||||
{
|
||||
float result, signal, weight, rmd;
|
||||
@@ -159,7 +162,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
|
||||
* offset: raises the terrain from `sea level'
|
||||
*/
|
||||
|
||||
ccl_device_noinline float noise_musgrave_ridged_multi_fractal(
|
||||
ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal(
|
||||
float3 p, float H, float lacunarity, float octaves, float offset, float gain)
|
||||
{
|
||||
float result, signal, weight;
|
||||
|
@@ -182,7 +182,7 @@ ccl_device_inline ssef scale3_sse(const ssef &result)
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL_SSE2__
|
||||
ccl_device_noinline float perlin(float x, float y, float z)
|
||||
ccl_device_noinline_cpu float perlin(float x, float y, float z)
|
||||
{
|
||||
int X;
|
||||
float fx = floorfrac(x, &X);
|
||||
|
@@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Wave */
|
||||
|
||||
ccl_device_noinline float svm_wave(NodeWaveType type,
|
||||
NodeWaveProfile profile,
|
||||
float3 p,
|
||||
float detail,
|
||||
float distortion,
|
||||
float dscale)
|
||||
ccl_device_noinline_cpu float svm_wave(NodeWaveType type,
|
||||
NodeWaveProfile profile,
|
||||
float3 p,
|
||||
float detail,
|
||||
float distortion,
|
||||
float dscale)
|
||||
{
|
||||
float n;
|
||||
|
||||
|
@@ -30,6 +30,7 @@
|
||||
# ifndef __KERNEL_GPU__
|
||||
# define ccl_device static inline
|
||||
# define ccl_device_noinline static
|
||||
# define ccl_device_noinline_cpu ccl_device_noinline
|
||||
# define ccl_global
|
||||
# define ccl_static_constant static const
|
||||
# define ccl_constant const
|
||||
|
Reference in New Issue
Block a user