Cycles: inline more functions on the GPU

This makes little difference for CUDA and OpenCL, but will be helpful
for Optix.
This commit is contained in:
Patrick Mours
2019-08-22 17:36:54 +02:00
committed by Brecht Van Lommel
parent 2b999c6a68
commit f491c23f1e
16 changed files with 80 additions and 108 deletions

View File

@@ -58,6 +58,7 @@ __device__ half __float2half(const float f)
# define ccl_device_forceinline __device__ __forceinline__ # define ccl_device_forceinline __device__ __forceinline__
#endif #endif
#define ccl_device_noinline __device__ __noinline__ #define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
#define ccl_global #define ccl_global
#define ccl_static_constant __constant__ #define ccl_static_constant __constant__
#define ccl_constant const #define ccl_constant const

View File

@@ -35,6 +35,7 @@
#define ccl_device_inline ccl_device #define ccl_device_inline ccl_device
#define ccl_device_forceinline ccl_device #define ccl_device_forceinline ccl_device
#define ccl_device_noinline ccl_device ccl_noinline #define ccl_device_noinline ccl_device ccl_noinline
#define ccl_device_noinline_cpu ccl_device
#define ccl_may_alias #define ccl_may_alias
#define ccl_static_constant static __constant #define ccl_static_constant static __constant
#define ccl_constant __constant #define ccl_constant __constant

View File

@@ -17,7 +17,7 @@
CCL_NAMESPACE_BEGIN CCL_NAMESPACE_BEGIN
/* Direction Emission */ /* Direction Emission */
ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
ShaderData *emission_sd, ShaderData *emission_sd,
LightSample *ls, LightSample *ls,
ccl_addr_space PathState *state, ccl_addr_space PathState *state,
@@ -98,7 +98,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
return eval; return eval;
} }
ccl_device_noinline bool direct_emission(KernelGlobals *kg, ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
ShaderData *sd, ShaderData *sd,
ShaderData *emission_sd, ShaderData *emission_sd,
LightSample *ls, LightSample *ls,
@@ -208,7 +208,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
/* Indirect Primitive Emission */ /* Indirect Primitive Emission */
ccl_device_noinline float3 indirect_primitive_emission( ccl_device_noinline_cpu float3 indirect_primitive_emission(
KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf) KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
{ {
/* evaluate emissive closure */ /* evaluate emissive closure */
@@ -234,7 +234,7 @@ ccl_device_noinline float3 indirect_primitive_emission(
/* Indirect Lamp Emission */ /* Indirect Lamp Emission */
ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, ccl_device_noinline_cpu bool indirect_lamp_emission(KernelGlobals *kg,
ShaderData *emission_sd, ShaderData *emission_sd,
ccl_addr_space PathState *state, ccl_addr_space PathState *state,
Ray *ray, Ray *ray,
@@ -293,7 +293,7 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
/* Indirect Background */ /* Indirect Background */
ccl_device_noinline float3 indirect_background(KernelGlobals *kg, ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg,
ShaderData *emission_sd, ShaderData *emission_sd,
ccl_addr_space PathState *state, ccl_addr_space PathState *state,
ccl_addr_space Ray *ray) ccl_addr_space Ray *ray)

View File

@@ -182,17 +182,7 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3
#ifdef __BACKGROUND_MIS__ #ifdef __BACKGROUND_MIS__
/* TODO(sergey): In theory it should be all fine to use noinline for all ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
* devices, but we're so close to the release so better not screw things
* up for CPU at least.
*/
# ifdef __KERNEL_GPU__
ccl_device_noinline
# else
ccl_device
# endif
float3
background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
{ {
/* for the following, the CDF values are actually a pair of floats, with the /* for the following, the CDF values are actually a pair of floats, with the
* function value as X and the actual CDF as Y. The last entry's function * function value as X and the actual CDF as Y. The last entry's function
@@ -274,13 +264,7 @@ ccl_device
/* TODO(sergey): Same as above, after the release we should consider using /* TODO(sergey): Same as above, after the release we should consider using
* 'noinline' for all devices. * 'noinline' for all devices.
*/ */
# ifdef __KERNEL_GPU__ ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
ccl_device_noinline
# else
ccl_device
# endif
float
background_map_pdf(KernelGlobals *kg, float3 direction)
{ {
float2 uv = direction_to_equirectangular(direction); float2 uv = direction_to_equirectangular(direction);
int res_x = kernel_data.integrator.pdf_background_res_x; int res_x = kernel_data.integrator.pdf_background_res_x;

View File

@@ -198,7 +198,7 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
# endif /* __VOLUME__ */ # endif /* __VOLUME__ */
/* bounce off surface and integrate indirect light */ /* bounce off surface and integrate indirect light */
ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
ShaderData *sd, ShaderData *sd,
ShaderData *indirect_sd, ShaderData *indirect_sd,
ShaderData *emission_sd, ShaderData *emission_sd,

View File

@@ -20,7 +20,7 @@ CCL_NAMESPACE_BEGIN
defined(__BAKING__) defined(__BAKING__)
/* branched path tracing: connect path directly to position on one or more lights and add it to L /* branched path tracing: connect path directly to position on one or more lights and add it to L
*/ */
ccl_device_noinline void kernel_branched_path_surface_connect_light( ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light(
KernelGlobals *kg, KernelGlobals *kg,
ShaderData *sd, ShaderData *sd,
ShaderData *emission_sd, ShaderData *emission_sd,

View File

@@ -57,13 +57,7 @@ ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg,
# endif /* __EMISSION__ */ # endif /* __EMISSION__ */
} }
# ifdef __KERNEL_GPU__ ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg,
ccl_device_noinline
# else
ccl_device
# endif
bool
kernel_path_volume_bounce(KernelGlobals *kg,
ShaderData *sd, ShaderData *sd,
ccl_addr_space float3 *throughput, ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state, ccl_addr_space PathState *state,

View File

@@ -672,7 +672,7 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
* ray, with the assumption that there are no surfaces blocking light * ray, with the assumption that there are no surfaces blocking light
* between the endpoints. distance sampling is used to decide if we will * between the endpoints. distance sampling is used to decide if we will
* scatter or not. */ * scatter or not. */
ccl_device_noinline VolumeIntegrateResult ccl_device_noinline_cpu VolumeIntegrateResult
kernel_volume_integrate(KernelGlobals *kg, kernel_volume_integrate(KernelGlobals *kg,
ccl_addr_space PathState *state, ccl_addr_space PathState *state,
ShaderData *sd, ShaderData *sd,

View File

@@ -80,13 +80,7 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
} }
} }
#ifndef __KERNEL_CUDA__ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
ccl_device
#else
ccl_device_noinline
#endif
void
svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{ {
NodeAttributeType type = NODE_ATTR_FLOAT; NodeAttributeType type = NODE_ATTR_FLOAT;
uint out_offset = 0; uint out_offset = 0;
@@ -125,13 +119,7 @@ ccl_device_noinline
} }
} }
#ifndef __KERNEL_CUDA__ ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
ccl_device
#else
ccl_device_noinline
#endif
void
svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{ {
NodeAttributeType type = NODE_ATTR_FLOAT; NodeAttributeType type = NODE_ATTR_FLOAT;
uint out_offset = 0; uint out_offset = 0;

View File

@@ -27,7 +27,7 @@ ccl_device_inline float brick_noise(uint n) /* fast integer noise */
return 0.5f * ((float)nn / 1073741824.0f); return 0.5f * ((float)nn / 1073741824.0f);
} }
ccl_device_noinline float2 svm_brick(float3 p, ccl_device_noinline_cpu float2 svm_brick(float3 p,
float mortar_size, float mortar_size,
float mortar_smooth, float mortar_smooth,
float bias, float bias,

View File

@@ -264,7 +264,7 @@ ccl_device float3 svm_mix_clamp(float3 col)
return outcol; return outcol;
} }
ccl_device_noinline float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2) ccl_device_noinline_cpu float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
{ {
float t = saturate(fac); float t = saturate(fac);

View File

@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* Magic */ /* Magic */
ccl_device_noinline float3 svm_magic(float3 p, int n, float distortion) ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion)
{ {
float x = sinf((p.x + p.y + p.z) * 5.0f); float x = sinf((p.x + p.y + p.z) * 5.0f);
float y = cosf((-p.x + p.y - p.z) * 5.0f); float y = cosf((-p.x + p.y - p.z) * 5.0f);

View File

@@ -25,7 +25,10 @@ CCL_NAMESPACE_BEGIN
* from "Texturing and Modelling: A procedural approach" * from "Texturing and Modelling: A procedural approach"
*/ */
ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity, float octaves) ccl_device_noinline_cpu float noise_musgrave_fBm(float3 p,
float H,
float lacunarity,
float octaves)
{ {
float rmd; float rmd;
float value = 0.0f; float value = 0.0f;
@@ -53,7 +56,7 @@ ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity
* octaves: number of frequencies in the fBm * octaves: number of frequencies in the fBm
*/ */
ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, ccl_device_noinline_cpu float noise_musgrave_multi_fractal(float3 p,
float H, float H,
float lacunarity, float lacunarity,
float octaves) float octaves)
@@ -85,7 +88,7 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p,
* offset: raises the terrain from `sea level' * offset: raises the terrain from `sea level'
*/ */
ccl_device_noinline float noise_musgrave_hetero_terrain( ccl_device_noinline_cpu float noise_musgrave_hetero_terrain(
float3 p, float H, float lacunarity, float octaves, float offset) float3 p, float H, float lacunarity, float octaves, float offset)
{ {
float value, increment, rmd; float value, increment, rmd;
@@ -121,7 +124,7 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(
* offset: raises the terrain from `sea level' * offset: raises the terrain from `sea level'
*/ */
ccl_device_noinline float noise_musgrave_hybrid_multi_fractal( ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal(
float3 p, float H, float lacunarity, float octaves, float offset, float gain) float3 p, float H, float lacunarity, float octaves, float offset, float gain)
{ {
float result, signal, weight, rmd; float result, signal, weight, rmd;
@@ -159,7 +162,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
* offset: raises the terrain from `sea level' * offset: raises the terrain from `sea level'
*/ */
ccl_device_noinline float noise_musgrave_ridged_multi_fractal( ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal(
float3 p, float H, float lacunarity, float octaves, float offset, float gain) float3 p, float H, float lacunarity, float octaves, float offset, float gain)
{ {
float result, signal, weight; float result, signal, weight;

View File

@@ -182,7 +182,7 @@ ccl_device_inline ssef scale3_sse(const ssef &result)
#endif #endif
#ifndef __KERNEL_SSE2__ #ifndef __KERNEL_SSE2__
ccl_device_noinline float perlin(float x, float y, float z) ccl_device_noinline_cpu float perlin(float x, float y, float z)
{ {
int X; int X;
float fx = floorfrac(x, &X); float fx = floorfrac(x, &X);

View File

@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* Wave */ /* Wave */
ccl_device_noinline float svm_wave(NodeWaveType type, ccl_device_noinline_cpu float svm_wave(NodeWaveType type,
NodeWaveProfile profile, NodeWaveProfile profile,
float3 p, float3 p,
float detail, float detail,

View File

@@ -30,6 +30,7 @@
# ifndef __KERNEL_GPU__ # ifndef __KERNEL_GPU__
# define ccl_device static inline # define ccl_device static inline
# define ccl_device_noinline static # define ccl_device_noinline static
# define ccl_device_noinline_cpu ccl_device_noinline
# define ccl_global # define ccl_global
# define ccl_static_constant static const # define ccl_static_constant static const
# define ccl_constant const # define ccl_constant const