Cycles: inline more functions on the GPU

This makes little difference for CUDA and OpenCL, but will be helpful for Optix.
2019-08-22 17:36:54 +02:00
parent 2b999c6a68
commit f491c23f1e
16 changed files with 80 additions and 108 deletions
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -58,6 +58,7 @@ __device__ half __float2half(const float f)
 #  define ccl_device_forceinline __device__ __forceinline__
 #endif
 #define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
 #define ccl_global
 #define ccl_static_constant __constant__
 #define ccl_constant const
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -35,6 +35,7 @@
 #define ccl_device_inline ccl_device
 #define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device ccl_noinline
+#define ccl_device_noinline_cpu ccl_device
 #define ccl_may_alias
 #define ccl_static_constant static __constant
 #define ccl_constant __constant
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -17,14 +17,14 @@
 CCL_NAMESPACE_BEGIN

 /* Direction Emission */
-ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
-                                                ShaderData *emission_sd,
-                                                LightSample *ls,
-                                                ccl_addr_space PathState *state,
-                                                float3 I,
-                                                differential3 dI,
-                                                float t,
-                                                float time)
+ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
+                                                    ShaderData *emission_sd,
+                                                    LightSample *ls,
+                                                    ccl_addr_space PathState *state,
+                                                    float3 I,
+                                                    differential3 dI,
+                                                    float t,
+                                                    float time)
 {
  /* setup shading at emitter */
  float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -98,15 +98,15 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
  return eval;
 }

-ccl_device_noinline bool direct_emission(KernelGlobals *kg,
-                                         ShaderData *sd,
-                                         ShaderData *emission_sd,
-                                         LightSample *ls,
-                                         ccl_addr_space PathState *state,
-                                         Ray *ray,
-                                         BsdfEval *eval,
-                                         bool *is_lamp,
-                                         float rand_terminate)
+ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
+                                             ShaderData *sd,
+                                             ShaderData *emission_sd,
+                                             LightSample *ls,
+                                             ccl_addr_space PathState *state,
+                                             Ray *ray,
+                                             BsdfEval *eval,
+                                             bool *is_lamp,
+                                             float rand_terminate)
 {
  if (ls->pdf == 0.0f)
    return false;
@@ -208,7 +208,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,

 /* Indirect Primitive Emission */

-ccl_device_noinline float3 indirect_primitive_emission(
+ccl_device_noinline_cpu float3 indirect_primitive_emission(
    KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
 {
  /* evaluate emissive closure */
@@ -234,11 +234,11 @@ ccl_device_noinline float3 indirect_primitive_emission(

 /* Indirect Lamp Emission */

-ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
-                                                ShaderData *emission_sd,
-                                                ccl_addr_space PathState *state,
-                                                Ray *ray,
-                                                float3 *emission)
+ccl_device_noinline_cpu bool indirect_lamp_emission(KernelGlobals *kg,
+                                                    ShaderData *emission_sd,
+                                                    ccl_addr_space PathState *state,
+                                                    Ray *ray,
+                                                    float3 *emission)
 {
  bool hit_lamp = false;

@@ -293,10 +293,10 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,

 /* Indirect Background */

-ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
-                                               ShaderData *emission_sd,
-                                               ccl_addr_space PathState *state,
-                                               ccl_addr_space Ray *ray)
+ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg,
+                                                   ShaderData *emission_sd,
+                                                   ccl_addr_space PathState *state,
+                                                   ccl_addr_space Ray *ray)
 {
 #ifdef __BACKGROUND__
  int shader = kernel_data.background.surface_shader;
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -182,17 +182,7 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3

 #ifdef __BACKGROUND_MIS__

-/* TODO(sergey): In theory it should be all fine to use noinline for all
- * devices, but we're so close to the release so better not screw things
- * up for CPU at least.
- */
-#  ifdef __KERNEL_GPU__
-ccl_device_noinline
-#  else
-ccl_device
-#  endif
-    float3
-    background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
 {
  /* for the following, the CDF values are actually a pair of floats, with the
   * function value as X and the actual CDF as Y.  The last entry's function
@@ -274,13 +264,7 @@ ccl_device
 /* TODO(sergey): Same as above, after the release we should consider using
 * 'noinline' for all devices.
 */
-#  ifdef __KERNEL_GPU__
-ccl_device_noinline
-#  else
-ccl_device
-#  endif
-    float
-    background_map_pdf(KernelGlobals *kg, float3 direction)
+ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
 {
  float2 uv = direction_to_equirectangular(direction);
  int res_x = kernel_data.integrator.pdf_background_res_x;
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -198,14 +198,14 @@ ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
 #    endif /* __VOLUME__ */

 /* bounce off surface and integrate indirect light */
-ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-                                                                     ShaderData *sd,
-                                                                     ShaderData *indirect_sd,
-                                                                     ShaderData *emission_sd,
-                                                                     float3 throughput,
-                                                                     float num_samples_adjust,
-                                                                     PathState *state,
-                                                                     PathRadiance *L)
+ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
+                                                                         ShaderData *sd,
+                                                                         ShaderData *indirect_sd,
+                                                                         ShaderData *emission_sd,
+                                                                         float3 throughput,
+                                                                         float num_samples_adjust,
+                                                                         PathState *state,
+                                                                         PathRadiance *L)
 {
  float sum_sample_weight = 0.0f;
 #    ifdef __DENOISING_FEATURES__
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -20,7 +20,7 @@ CCL_NAMESPACE_BEGIN
    defined(__BAKING__)
 /* branched path tracing: connect path directly to position on one or more lights and add it to L
 */
-ccl_device_noinline void kernel_branched_path_surface_connect_light(
+ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light(
    KernelGlobals *kg,
    ShaderData *sd,
    ShaderData *emission_sd,
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -57,18 +57,12 @@ ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg,
 #  endif /* __EMISSION__ */
 }

-#  ifdef __KERNEL_GPU__
-ccl_device_noinline
-#  else
-ccl_device
-#  endif
-    bool
-    kernel_path_volume_bounce(KernelGlobals *kg,
-                              ShaderData *sd,
-                              ccl_addr_space float3 *throughput,
-                              ccl_addr_space PathState *state,
-                              PathRadianceState *L_state,
-                              ccl_addr_space Ray *ray)
+ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg,
+                                                       ShaderData *sd,
+                                                       ccl_addr_space float3 *throughput,
+                                                       ccl_addr_space PathState *state,
+                                                       PathRadianceState *L_state,
+                                                       ccl_addr_space Ray *ray)
 {
  /* sample phase function */
  float phase_pdf;
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -672,7 +672,7 @@ kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
 * ray, with the assumption that there are no surfaces blocking light
 * between the endpoints. distance sampling is used to decide if we will
 * scatter or not. */
-ccl_device_noinline VolumeIntegrateResult
+ccl_device_noinline_cpu VolumeIntegrateResult
 kernel_volume_integrate(KernelGlobals *kg,
                        ccl_addr_space PathState *state,
                        ShaderData *sd,
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -80,13 +80,7 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
  }
 }

-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
-    void
-    svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
  NodeAttributeType type = NODE_ATTR_FLOAT;
  uint out_offset = 0;
@@ -125,13 +119,7 @@ ccl_device_noinline
  }
 }

-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
-    void
-    svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
  NodeAttributeType type = NODE_ATTR_FLOAT;
  uint out_offset = 0;
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -27,16 +27,16 @@ ccl_device_inline float brick_noise(uint n) /* fast integer noise */
  return 0.5f * ((float)nn / 1073741824.0f);
 }

-ccl_device_noinline float2 svm_brick(float3 p,
-                                     float mortar_size,
-                                     float mortar_smooth,
-                                     float bias,
-                                     float brick_width,
-                                     float row_height,
-                                     float offset_amount,
-                                     int offset_frequency,
-                                     float squash_amount,
-                                     int squash_frequency)
+ccl_device_noinline_cpu float2 svm_brick(float3 p,
+                                         float mortar_size,
+                                         float mortar_smooth,
+                                         float bias,
+                                         float brick_width,
+                                         float row_height,
+                                         float offset_amount,
+                                         int offset_frequency,
+                                         float squash_amount,
+                                         int squash_frequency)
 {
  int bricknum, rownum;
  float offset = 0.0f;
--- a/intern/cycles/kernel/svm/svm_color_util.h
+++ b/intern/cycles/kernel/svm/svm_color_util.h
@@ -264,7 +264,7 @@ ccl_device float3 svm_mix_clamp(float3 col)
  return outcol;
 }

-ccl_device_noinline float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
+ccl_device_noinline_cpu float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
 {
  float t = saturate(fac);

--- a/intern/cycles/kernel/svm/svm_magic.h
+++ b/intern/cycles/kernel/svm/svm_magic.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN

 /* Magic */

-ccl_device_noinline float3 svm_magic(float3 p, int n, float distortion)
+ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion)
 {
  float x = sinf((p.x + p.y + p.z) * 5.0f);
  float y = cosf((-p.x + p.y - p.z) * 5.0f);
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -25,7 +25,10 @@ CCL_NAMESPACE_BEGIN
 * from "Texturing and Modelling: A procedural approach"
 */

-ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity, float octaves)
+ccl_device_noinline_cpu float noise_musgrave_fBm(float3 p,
+                                                 float H,
+                                                 float lacunarity,
+                                                 float octaves)
 {
  float rmd;
  float value = 0.0f;
@@ -53,10 +56,10 @@ ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity
 * octaves: number of frequencies in the fBm
 */

-ccl_device_noinline float noise_musgrave_multi_fractal(float3 p,
-                                                       float H,
-                                                       float lacunarity,
-                                                       float octaves)
+ccl_device_noinline_cpu float noise_musgrave_multi_fractal(float3 p,
+                                                           float H,
+                                                           float lacunarity,
+                                                           float octaves)
 {
  float rmd;
  float value = 1.0f;
@@ -85,7 +88,7 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p,
 * offset: raises the terrain from `sea level'
 */

-ccl_device_noinline float noise_musgrave_hetero_terrain(
+ccl_device_noinline_cpu float noise_musgrave_hetero_terrain(
    float3 p, float H, float lacunarity, float octaves, float offset)
 {
  float value, increment, rmd;
@@ -121,7 +124,7 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(
 * offset: raises the terrain from `sea level'
 */

-ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
+ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal(
    float3 p, float H, float lacunarity, float octaves, float offset, float gain)
 {
  float result, signal, weight, rmd;
@@ -159,7 +162,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
 * offset: raises the terrain from `sea level'
 */

-ccl_device_noinline float noise_musgrave_ridged_multi_fractal(
+ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal(
    float3 p, float H, float lacunarity, float octaves, float offset, float gain)
 {
  float result, signal, weight;
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -182,7 +182,7 @@ ccl_device_inline ssef scale3_sse(const ssef &result)
 #endif

 #ifndef __KERNEL_SSE2__
-ccl_device_noinline float perlin(float x, float y, float z)
+ccl_device_noinline_cpu float perlin(float x, float y, float z)
 {
  int X;
  float fx = floorfrac(x, &X);
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN

 /* Wave */

-ccl_device_noinline float svm_wave(NodeWaveType type,
-                                   NodeWaveProfile profile,
-                                   float3 p,
-                                   float detail,
-                                   float distortion,
-                                   float dscale)
+ccl_device_noinline_cpu float svm_wave(NodeWaveType type,
+                                       NodeWaveProfile profile,
+                                       float3 p,
+                                       float detail,
+                                       float distortion,
+                                       float dscale)
 {
  float n;

--- a/intern/cycles/util/util_defines.h
+++ b/intern/cycles/util/util_defines.h
@@ -30,6 +30,7 @@
 #  ifndef __KERNEL_GPU__
 #    define ccl_device static inline
 #    define ccl_device_noinline static
+#    define ccl_device_noinline_cpu ccl_device_noinline
 #    define ccl_global
 #    define ccl_static_constant static const
 #    define ccl_constant const