Cycles: volume light sampling

* Volume multiple importance sampling support to combine equiangular and distance sampling, for both homogeneous and heterogeneous volumes.
* Branched path "Sample All Direct Lights" and "Sample All Indirect Lights" now apply to volumes as well as surfaces.

Implementation note: For simplicity this is all done with decoupled ray marching; the only case where we do not use decoupled ray marching is distance-only sampling with one light sample. The homogeneous case should still compile on the GPU because it only requires fixed-size storage, but the heterogeneous case will be trickier to get working.
2014-04-04 16:45:49 +02:00
parent d644753319
commit a29807cd63
10 changed files with 676 additions and 288 deletions
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -111,6 +111,7 @@ enum_integrator = (
 enum_volume_homogeneous_sampling = (
    ('DISTANCE', "Distance", "Use Distance Sampling"),
    ('EQUI_ANGULAR', "Equi-angular", "Use Equi-angular Sampling"),
    ('MULTIPLE_IMPORTANCE', "Multiple Importance", "Combine distance and equi-angular sampling"),
    )
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -194,6 +194,17 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
 #ifdef __VOLUME__
 		if(state->volume_stack[0].shader != SHADER_NONE) {
 			/* shadow attenuation */
 			Ray volume_ray = *ray;
 			volume_ray.t = ls.t;
 			float3 volume_tp = make_float3(1.0f, 1.0f, 1.0f);
 			kernel_volume_shadow(kg, state, &volume_ray, &volume_tp);
 			L *= volume_tp;
 		}
 #endif
 		if(!(state->flag & PATH_RAY_MIS_SKIP)) {
 			/* multiple importance sampling, get regular light pdf,
 			 * and compute weight with respect to BSDF pdf */
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -208,8 +208,8 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3
 	return t*t/cos_pi;
 }
-ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
+ccl_device bool lamp_light_sample(KernelGlobals *kg, int lamp,
-	float randu, float randv, float3 P, LightSample *ls)
+	float randu, float randv, float3 P, LightSample *ls, bool for_volume)
 {
 	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
 	float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
@@ -224,6 +224,11 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 	ls->v = randv;
 	if(type == LIGHT_DISTANT) {
 #ifdef __VOLUME__
 		if(for_volume)
 			return false;
 #endif
 		/* distant light */
 		float3 lightD = make_float3(data0.y, data0.z, data0.w);
 		float3 D = lightD;
@@ -244,6 +249,11 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 	}
 #ifdef __BACKGROUND_MIS__
 	else if(type == LIGHT_BACKGROUND) {
 #ifdef __VOLUME__
 		if(for_volume)
 			return false;
 #endif
 		/* infinite area light (e.g. light dome or env light) */
 		float3 D = background_light_sample(kg, randu, randv, &ls->pdf);
@@ -299,6 +309,8 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 		ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
 		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
 	return true;
 }
 ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
@@ -514,7 +526,7 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 /* Generic Light */
-ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls)
+ccl_device bool light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls, bool for_volume)
 {
 	/* sample index */
 	int index = light_distribution_sample(kg, randt);
@@ -533,10 +545,12 @@ ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float
 		ls->D = normalize_len(ls->P - P, &ls->t);
 		ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		ls->shader |= shader_flag;
 		return true;
 	}
 	else {
 		int lamp = -prim-1;
-		lamp_light_sample(kg, lamp, randu, randv, P, ls);
+		return lamp_light_sample(kg, lamp, randu, randv, P, ls, for_volume);
 	}
 }
@@ -546,9 +560,9 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 	return __float_as_int(data3.x);
 }
-ccl_device void light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls)
+ccl_device bool light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls, bool for_volume)
 {
-	lamp_light_sample(kg, index, randu, randv, P, ls);
+	return lamp_light_sample(kg, index, randu, randv, P, ls, for_volume);
 }
 ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -29,7 +29,6 @@
 #include "kernel_accumulate.h"
 #include "kernel_shader.h"
 #include "kernel_light.h"
 #include "kernel_emission.h"
 #include "kernel_passes.h"
 #ifdef __SUBSURFACE__
@@ -42,6 +41,7 @@
 #include "kernel_path_state.h"
 #include "kernel_shadow.h"
 #include "kernel_emission.h"
 #include "kernel_path_surface.h"
 #include "kernel_path_volume.h"
@@ -88,17 +88,73 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			ShaderData volume_sd;
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-			VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false);
 				&volume_sd, &volume_ray, L, &throughput, rng);
-			if(result == VOLUME_PATH_SCATTERED) {
+			if(decoupled) {
-				kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L, 1.0f);
+				/* cache steps along volume for repeated sampling */
 				VolumeSegment volume_segment;
 				ShaderData volume_sd;
-				if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f))
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
-					continue;
+				kernel_volume_decoupled_record(kg, &state,
-				else
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
-					break;
+
 				/* emission */
 				if(volume_segment.closure_flag & SD_EMISSION)
 					path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
 				/* scattering */
 				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
 				bool scatter = false;
 				if(volume_segment.closure_flag & SD_SCATTER) {
 					bool all = kernel_data.integrator.sample_all_lights_indirect;
 					/* direct light sampling */
 					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
 						throughput, &state, L, 1.0f, all, &volume_ray, &volume_segment);
 					/* indirect sample. if we use distance sampling and take just
 					 * one sample for direct and indirect light, we could share
 					 * this computation, but makes code a bit complex */
 					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
 					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
 					result = kernel_volume_decoupled_scatter(kg,
 						&state, &volume_ray, &volume_sd, &throughput,
 						rphase, rscatter, &volume_segment, NULL, true);
 					if(result == VOLUME_PATH_SCATTERED)
 						scatter = kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f);
 				}
 				/* free cached steps */
 				kernel_volume_decoupled_free(kg, &volume_segment);
 				if(result == VOLUME_PATH_SCATTERED) {
 					if(scatter)
 						continue;
 					else
 						break;
 				}
 			}
 			else {
 				/* integrate along volume segment with distance sampling */
 				ShaderData volume_sd;
 				VolumeIntegrateResult result = kernel_volume_integrate(
 					kg, &state, &volume_sd, &volume_ray, L, &throughput, rng);
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* direct lighting */
 					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L, 1.0f);
 					/* indirect light bounce */
 					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f))
 						continue;
 					else
 						break;
 				}
 			}
 		}
 #endif
@@ -411,17 +467,73 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			ShaderData volume_sd;
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-			VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true);
 				&volume_sd, &volume_ray, &L, &throughput, rng);
-			if(result == VOLUME_PATH_SCATTERED) {
+			if(decoupled) {
-				kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L, 1.0f);
+				/* cache steps along volume for repeated sampling */
 				VolumeSegment volume_segment;
 				ShaderData volume_sd;
-				if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f))
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
-					continue;
+				kernel_volume_decoupled_record(kg, &state,
-				else
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
-					break;
+
 				/* emission */
 				if(volume_segment.closure_flag & SD_EMISSION)
 					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
 				/* scattering */
 				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
 				bool scatter = false;
 				if(volume_segment.closure_flag & SD_SCATTER) {
 					bool all = false;
 					/* direct light sampling */
 					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
 						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
 					/* indirect sample. if we use distance sampling and take just
 					 * one sample for direct and indirect light, we could share
 					 * this computation, but makes code a bit complex */
 					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
 					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
 					result = kernel_volume_decoupled_scatter(kg,
 						&state, &volume_ray, &volume_sd, &throughput,
 						rphase, rscatter, &volume_segment, NULL, true);
 					if(result == VOLUME_PATH_SCATTERED)
 						scatter = kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f);
 				}
 				/* free cached steps */
 				kernel_volume_decoupled_free(kg, &volume_segment);
 				if(result == VOLUME_PATH_SCATTERED) {
 					if(scatter)
 						continue;
 					else
 						break;
 				}
 			}
 			else {
 				/* integrate along volume segment with distance sampling */
 				ShaderData volume_sd;
 				VolumeIntegrateResult result = kernel_volume_integrate(
 					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng);
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* direct lighting */
 					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L, 1.0f);
 					/* indirect light bounce */
 					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f))
 						continue;
 					else
 						break;
 				}
 			}
 		}
 #endif
@@ -700,37 +812,47 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			kernel_volume_decoupled_record(kg, &state,
 				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
-			/* sample scattering */
+			/* direct light sampling */
-			int num_samples = kernel_data.integrator.volume_samples;
+			if(volume_segment.closure_flag & SD_SCATTER) {
-			float num_samples_inv = 1.0f/num_samples;
+				bool all = kernel_data.integrator.sample_all_lights_direct;
 				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
 					throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
-			for(int j = 0; j < num_samples; j++) {
+				/* indirect light sampling */
-				/* workaround to fix correlation bug in T38710, can find better solution
+				int num_samples = kernel_data.integrator.volume_samples;
-				 * in random number generator later, for now this is done here to not impact
+				float num_samples_inv = 1.0f/num_samples;
 				 * performance of rendering without volumes */
 				RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-				PathState ps = state;
+				for(int j = 0; j < num_samples; j++) {
-				Ray pray = ray;
+					/* workaround to fix correlation bug in T38710, can find better solution
-				float3 tp = throughput;
+					 * in random number generator later, for now this is done here to not impact
 					 * performance of rendering without volumes */
 					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-				/* branch RNG state */
+					PathState ps = state;
-				path_state_branch(&ps, j, num_samples);
+					Ray pray = ray;
 					float3 tp = throughput;
-				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					/* branch RNG state */
-					&ps, &volume_ray, &volume_sd, &tp, &tmp_rng, &volume_segment);
+					path_state_branch(&ps, j, num_samples);
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* todo: use all-light sampling */
 					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L, 1.0f);
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
+					/* scatter sample. if we use distance sampling and take just one
-						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+					 * sample for direct and indirect light, we could share this
 					 * computation, but makes code a bit complex */
 					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
 					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
-						/* for render passes, sum and reset indirect light pass variables
+					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						 * for the next samples */
+						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-						path_radiance_sum_indirect(&L);
+
-						path_radiance_reset_indirect(&L);
+					if(result == VOLUME_PATH_SCATTERED) {
 						if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
 							kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
 							/* for render passes, sum and reset indirect light pass variables
 							 * for the next samples */
 							path_radiance_sum_indirect(&L);
 							path_radiance_reset_indirect(&L);
 						}
 					}
 				}
 			}
@@ -759,12 +881,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				/* branch RNG state */
 				path_state_branch(&ps, j, num_samples);
-				VolumeIntegrateResult result = kernel_volume_integrate(kg, &ps,
+				VolumeIntegrateResult result = kernel_volume_integrate(
-					&volume_sd, &volume_ray, &L, &tp, rng);
+					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng);
 				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: use all-light sampling */
+					/* todo: support equiangular, MIS and all light sampling.
-					if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
+					 * alternatively get decoupled ray marching working on the GPU */
 					kernel_path_volume_connect_light(kg, rng, &volume_sd, &volume_ray, throughput, &state, &L, num_samples_inv);
 					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
 						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
 						/* for render passes, sum and reset indirect light pass variables
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -22,97 +22,101 @@ CCL_NAMESPACE_BEGIN
 ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights)
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(sd->flag & SD_BSDF_HAS_EVAL) {
+	if(!(sd->flag & SD_BSDF_HAS_EVAL))
-		Ray light_ray;
+		return;
-		BsdfEval L_light;
+
-		bool is_lamp;
+	Ray light_ray;
 	BsdfEval L_light;
 	bool is_lamp;
 #ifdef __OBJECT_MOTION__
-		light_ray.time = sd->time;
+	light_ray.time = sd->time;
 #endif
-		if(sample_all_lights) {
+	if(sample_all_lights) {
-			/* lamp sampling */
+		/* lamp sampling */
-			for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
-				int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
+			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
-				float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
-				RNG lamp_rng = cmj_hash(*rng, i);
+			RNG lamp_rng = cmj_hash(*rng, i);
-				if(kernel_data.integrator.pdf_triangles != 0.0f)
+			if(kernel_data.integrator.pdf_triangles != 0.0f)
-					num_samples_inv *= 0.5f;
+				num_samples_inv *= 0.5f;
-				for(int j = 0; j < num_samples; j++) {
+			for(int j = 0; j < num_samples; j++) {
-					float light_u, light_v;
+				float light_u, light_v;
-					path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-					LightSample ls;
+				LightSample ls;
-					light_select(kg, i, light_u, light_v, sd->P, &ls);
+				light_select(kg, i, light_u, light_v, sd->P, &ls, false);
-					if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
-						/* trace shadow ray */
+					/* trace shadow ray */
-						float3 shadow;
+					float3 shadow;
-						if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-							/* accumulate */
+						/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 						}
 					}
 				}
 			}
 			/* mesh light sampling */
 			if(kernel_data.integrator.pdf_triangles != 0.0f) {
 				int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
 				float num_samples_inv = num_samples_adjust/num_samples;
 				if(kernel_data.integrator.num_all_lights)
 					num_samples_inv *= 0.5f;
 				for(int j = 0; j < num_samples; j++) {
 					float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
 					float light_u, light_v;
 					path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 					/* only sample triangle lights */
 					if(kernel_data.integrator.num_all_lights)
 						light_t = 0.5f*light_t;
 					LightSample ls;
 					light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
 					if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 						/* trace shadow ray */
 						float3 shadow;
 						if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
 							/* accumulate */
 							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 						}
 					}
 				}
 			}
 		}
 		else {
 			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 			float light_u, light_v;
 			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-			LightSample ls;
+		/* mesh light sampling */
-			light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+		if(kernel_data.integrator.pdf_triangles != 0.0f) {
 			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
 			float num_samples_inv = num_samples_adjust/num_samples;
-			/* sample random light */
+			if(kernel_data.integrator.num_all_lights)
-			if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+				num_samples_inv *= 0.5f;
 				/* trace shadow ray */
 				float3 shadow;
-				if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+			for(int j = 0; j < num_samples; j++) {
-					/* accumulate */
+				float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
-					path_radiance_accum_light(L, throughput, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+				float light_u, light_v;
 				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
 					light_t = 0.5f*light_t;
 				LightSample ls;
 				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, false);
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
 					float3 shadow;
 					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
 						/* accumulate */
 						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 					}
 				}
 			}
 		}
 	}
 	else {
 		/* sample one light at random */
 		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 		LightSample ls;
 		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, false);
 		/* sample random light */
 		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 			/* trace shadow ray */
 			float3 shadow;
 			if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
 				/* accumulate */
 				path_radiance_accum_light(L, throughput, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
 			}
 		}
 	}
 #endif
 }
 /* branched path tracing: bounce off or through surface to with new direction stored in ray */
@@ -196,7 +200,7 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 #endif
 	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, false);
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -18,11 +18,12 @@ CCL_NAMESPACE_BEGIN
 #ifdef __VOLUME__
-ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
+ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L, float num_samples_adjust)
+	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
 	float num_samples_adjust)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+	if(!kernel_data.integrator.use_direct_light)
 		return;
 	/* sample illumination from lights to find path contribution */
@@ -32,15 +33,19 @@ ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *
 	Ray light_ray;
 	BsdfEval L_light;
 	LightSample ls;
 	bool is_lamp;
 	/* connect to light from given point where shader has been evaluated */
 #ifdef __OBJECT_MOTION__
 	light_ray.time = sd->time;
 #endif
-	LightSample ls;
+	if(!light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, true))
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+		return;
-
+	else if(ls.pdf == 0.0f)
 		return;
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
 		float3 shadow;
@@ -53,7 +58,7 @@ ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *
 #endif
 }
-ccl_device_inline bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
+ccl_device bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray,
 	float num_samples_adjust)
 {
@@ -98,6 +103,178 @@ ccl_device_inline bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	return true;
 }
 #ifdef __KERNEL_CPU__
 ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
 	float num_samples_adjust, bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
 		return;
 	Ray light_ray;
 	BsdfEval L_light;
 	bool is_lamp;
 #ifdef __OBJECT_MOTION__
 	light_ray.time = sd->time;
 #endif
 	if(sample_all_lights) {
 		/* lamp sampling */
 		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
 			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
 			RNG lamp_rng = cmj_hash(*rng, i);
 			if(kernel_data.integrator.pdf_triangles != 0.0f)
 				num_samples_inv *= 0.5f;
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on given light */
 				float light_u, light_v;
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 				LightSample ls;
 				if(!light_select(kg, i, light_u, light_v, ray->P, &ls, true))
 					continue;
 				float3 tp = throughput;
 				/* sample position on volume segment */
 				if(segment) {
 					float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
 					float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 						state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 					if(result != VOLUME_PATH_SCATTERED)
 						continue;
 					/* todo: split up light_sample so we don't have to call it again with new position */
 					if(!light_select(kg, i, light_u, light_v, sd->P, &ls, true))
 						continue;
 				}
 				if(ls.pdf == 0.0f)
 					continue;
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
 					float3 shadow;
 					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
 						/* accumulate */
 						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 					}
 				}
 			}
 		}
 		/* mesh light sampling */
 		if(kernel_data.integrator.pdf_triangles != 0.0f) {
 			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
 			float num_samples_inv = num_samples_adjust/num_samples;
 			if(kernel_data.integrator.num_all_lights)
 				num_samples_inv *= 0.5f;
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on random triangle */
 				float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT);
 				float light_u, light_v;
 				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
 					light_t = 0.5f*light_t;
 				LightSample ls;
 				if(!light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls, true))
 					continue;
 				float3 tp = throughput;
 				/* sample position on volume segment */
 				if(segment) {
 					float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
 					float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 						state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 					if(result != VOLUME_PATH_SCATTERED)
 						continue;
 					/* todo: split up light_sample so we don't have to call it again with new position */
 					if(!light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, true))
 						continue;
 				}
 				if(ls.pdf == 0.0f)
 					continue;
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
 					float3 shadow;
 					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
 						/* accumulate */
 						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 					}
 				}
 			}
 		}
 	}
 	else {
 		/* sample random position on random light */
 		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 		LightSample ls;
 		if(!light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls, true))
 			return;
 		float3 tp = throughput;
 		/* sample position on volume segment */
 		if(segment) {
 			float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
 			float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
 			VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 				state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 			if(result != VOLUME_PATH_SCATTERED)
 				return;
 			/* todo: split up light_sample so we don't have to call it again with new position */
 			if(!light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, true))
 				return;
 		}
 		if(ls.pdf == 0.0f)
 			return;
 		/* sample random light */
 		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 			/* trace shadow ray */
 			float3 shadow;
 			if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
 				/* accumulate */
 				path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
 			}
 		}
 	}
 #endif
 }
 #endif
 #endif
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -261,12 +261,12 @@ ccl_device uint lcg_init(uint seed)
 * For branches in the path we must be careful not to reuse the same number
 * in a sequence and offset accordingly. */
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
 }
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
 {
 	/* the rng_offset is not increased for transparent bounces. if we do then
 	 * fully transparent objects can become subtly visible by the different
@@ -279,17 +279,23 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *r
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
 }
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
 }
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
 }
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
 }
 /* 2D random number pair for one branch of a split path.
 *
 * Forwards to path_rng_2D() with state->sample*num_branches + branch and
 * state->num_samples*num_branches in place of the plain state->sample and
 * state->num_samples, so each branch in [0, num_branches) is handed its own
 * sample index. The dimension is offset by state->rng_offset, and the two
 * results are written to *fx and *fy. */
 ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
@@ -303,7 +309,7 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b
 	state->num_samples = state->num_samples*num_branches;
 }
-ccl_device_inline uint lcg_state_init(RNG *rng, PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble)
 {
 	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
 }
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -858,7 +858,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
 			if(phase_pdf != 0.0f) {
 				bsdf_eval_accum(result_eval, sc->type, eval);
-				sum_pdf += phase_pdf;
+				sum_pdf += phase_pdf*sc->sample_weight;
 			}
 			sum_sample_weight += sc->sample_weight;
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -136,7 +136,7 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s
 ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
 {
 	float3 tp = *throughput;
-	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
@@ -226,25 +226,6 @@ ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float s
 	return pdf;
 }
 /* Pick the light position that equi-angular sampling along the ray will use.
 *
 * Draws the PRNG_LIGHT number and the 2D pair at PRNG_LIGHT_U from the path
 * state, then calls light_sample() from ray->P at ray->time.
 *
 * light_P: receives the sampled position ls.P.
 * distant: set to true when ls.t == FLT_MAX.
 *
 * Returns false, leaving both outputs unwritten, when the light sample has
 * a zero pdf; returns true otherwise. */
 ccl_device bool kernel_volume_equiangular_light_position(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float3 *light_P, bool *distant)
 {
 	/* light RNGs */
 	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 	float light_u, light_v;
 	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 	/* light sample */
 	LightSample ls;
 	light_sample(kg, light_t, light_u, light_v, ray->time, ray->P, &ls);
 	/* a zero pdf sample is rejected before any output is written */
 	if(ls.pdf == 0.0f)
 		return false;
 	*light_P = ls.P;
 	/* ls.t == FLT_MAX is reported to the caller as a distant light */
 	*distant = ls.t == FLT_MAX;
 	return true;
 }
 /* Distance sampling */
 ccl_device float kernel_volume_distance_sample(float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
@@ -304,7 +285,7 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
 * the volume shading coefficient for the entire line segment */
 ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
-	RNG *rng)
+	RNG *rng, bool probalistic_scatter)
 {
 	VolumeShaderCoefficients coeff;
@@ -326,47 +307,37 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 		int channel = (int)(rphase*3.0f);
 		sd->randb_closure = rphase*3.0f - channel;
 		/* decide if we will hit or miss */
 		bool scatter = true;
 		float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
-		/* decide if we will hit or miss */
+		if(probalistic_scatter) {
-		float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+			float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
-		float sample_transmittance = expf(-sample_sigma_t * t);
+			float sample_transmittance = expf(-sample_sigma_t * t);
-		if(xi >= sample_transmittance) {
+			if(1.0f - xi >= sample_transmittance) {
 				scatter = true;
 				/* rescale random number so we can reuse it */
 				xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
 			}
 			else
 				scatter = false;
 		}
 		if(scatter) {
 			/* scattering */
 			float3 pdf;
 			float3 transmittance;
 			float sample_t;
-			/* rescale random number so we can reuse it */
+			/* distance sampling */
-			xi = (xi - sample_transmittance)/(1.0f - sample_transmittance);
+			sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
 			if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { 
 				/* distance sampling */
 				sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
 			}
 			else {
 				/* equiangular sampling */
 				float3 light_P;
 				float equi_pdf;
 				bool light_distant;
 				if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P, &light_distant))
 					return VOLUME_PATH_MISSED;
 				if(light_distant) {
 					/* distant light, revert to distance sampling because position is infinitely far away */
 					sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
 				}
 				else {
 					sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &equi_pdf);
 					transmittance = volume_color_transmittance(sigma_t, sample_t);
 					pdf = make_float3(equi_pdf, equi_pdf, equi_pdf);
 				}
 			}
 			/* modifiy pdf for hit/miss decision */
-			pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
+			if(probalistic_scatter)
 				pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
 			new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf);
 			t = sample_t;
@@ -385,7 +356,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	}
 	/* integrate emission attenuated by extinction */
-	if(closure_flag & SD_EMISSION) {
+	if(L && (closure_flag & SD_EMISSION)) {
 		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 		float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
 		float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
@@ -408,13 +379,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	return VOLUME_PATH_ATTENUATED;
 }
-/* heterogeneous volume: integrate stepping through the volume until we
+/* heterogeneous volume distance sampling: integrate stepping through the
- * reach the end, get absorbed entirely, or run out of iterations */
+ * volume until we reach the end, get absorbed entirely, or run out of
-ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg,
+ * iterations. this does probabilistically scatter or get transmitted through
 * for path tracing where we don't want to branch. */
 ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
 {
 	float3 tp = *throughput;
-	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
@@ -425,9 +398,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 	float t = 0.0f;
 	float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f);
-	/* cache some constant variables */
+	/* pick random color channel, we use the Veach one-sample
-	float xi;
+	 * model with balance heuristic for the channels */
-	int channel = -1;
+	float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
 	float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
 	int channel = (int)(rphase*3.0f);
 	sd->randb_closure = rphase*3.0f - channel;
 	bool has_scatter = false;
 	for(int i = 0; i < max_steps; i++) {
@@ -449,25 +425,13 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			float3 transmittance;
 			bool scatter = false;
-			/* randomly scatter, and if we do dt and new_t are shortened */
+			/* distance sampling */
 			if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) {
 				has_scatter = true;
 				/* average sigma_t and sigma_s over segment */
 				float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 				float3 sigma_s = coeff.sigma_s;
 				/* lazily set up variables for sampling */
 				if(channel == -1) {
 					/* pick random color channel, we use the Veach one-sample
 					 * model with balance heuristic for the channels */
 					xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
 					float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
 					channel = (int)(rphase*3.0f);
 					sd->randb_closure = rphase*3.0f - channel;
 				}
 				/* compute transmittance over full step */
 				transmittance = volume_color_transmittance(sigma_t, dt);
@@ -480,10 +444,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					float new_dt = -logf(1.0f - xi)/sample_sigma_t;
 					new_t = t + new_dt;
-					/* transmittance, throughput */
+					/* transmittance and pdf */
 					float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt);
-					float pdf = average(sigma_t * new_transmittance);
+					float3 pdf = sigma_t * new_transmittance;
-					new_tp = tp * sigma_s * new_transmittance / pdf;
+
 					/* throughput */
 					new_tp = tp * sigma_s * new_transmittance / average(pdf);
 					scatter = true;
 				}
 				else {
@@ -504,7 +470,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			}
 			/* integrate emission attenuated by absorption */
-			if(closure_flag & SD_EMISSION) {
+			if(L && (closure_flag & SD_EMISSION)) {
 				float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
 				path_radiance_accum_emission(L, tp, emission, state->bounce);
 			}
@@ -518,19 +484,19 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					tp = make_float3(0.0f, 0.0f, 0.0f);
 					break;
 				}
 			}
-				/* prepare to scatter to new direction */
+			/* prepare to scatter to new direction */
-				if(scatter) {
+			if(scatter) {
-					/* adjust throughput and move to new location */
+				/* adjust throughput and move to new location */
-					sd->P = ray->P + new_t*ray->D;
+				sd->P = ray->P + new_t*ray->D;
-					*throughput = tp;
+				*throughput = tp;
-					return VOLUME_PATH_SCATTERED;
+				return VOLUME_PATH_SCATTERED;
-				}
+			}
-				else {
+			else {
-					/* accumulate transmittance */
+				/* accumulate transmittance */
-					accum_transmittance *= transmittance;
+				accum_transmittance *= transmittance;
 				}
 			}
 		}
@@ -545,14 +511,35 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 	return VOLUME_PATH_ATTENUATED;
 }
 /* get the volume attenuation and emission over line segment defined by
 * ray, with the assumption that there are no surfaces blocking light
 * between the endpoints. distance sampling is used to decide if we will
 * scatter or not.
 *
 * sets up sd from the volume at the ray, then dispatches to the
 * heterogeneous distance integrator or to the homogeneous integrator
 * depending on the volume stack, and returns that integrator's result.
 * L, throughput and sd are passed through to the selected integrator.
 * the caller's *rng is only read here; a locally hashed copy is what the
 * integrators receive. */
 ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
 	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
 {
 	/* workaround to fix correlation bug in T38710, can find better solution
 	 * in random number generator later, for now this is done here to not impact
 	 * performance of rendering without volumes */
 	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
 	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
 	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
 	if(heterogeneous)
 		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng);
 	else
 		/* last argument is probalistic_scatter: always enabled on this path */
 		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true);
 }
 /* Decoupled Volume Sampling
 *
 * VolumeSegment is list of coefficients and transmittance stored at all steps
 * through a volume. This can then later be used for decoupled sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media" */
+ * "Importance Sampling Techniques for Path Tracing in Participating Media"
-
+ *
-/* CPU only because of malloc/free */
+ * On the GPU this is only supported for homogeneous volumes (1 step), due to
-#ifdef __KERNEL_CPU__
+ * no support for malloc/free and too much stack usage with a fixed size array. */
 typedef struct VolumeStep {
 	float3 sigma_s;				/* scatter coefficient */
@@ -565,7 +552,11 @@ typedef struct VolumeStep {
 } VolumeStep;
 typedef struct VolumeSegment {
 #ifdef __KERNEL_CPU__
 	VolumeStep *steps;			/* recorded steps */
 #else
 	VolumeStep steps[1];		/* recorded steps */
 #endif
 	int numsteps;				/* number of steps */
 	int closure_flag;			/* accumulated closure flags from all steps */
@@ -582,6 +573,8 @@ typedef struct VolumeSegment {
 ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
 	Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
 {
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 	/* prepare for volume stepping */
 	int max_steps;
 	float step_size, random_jitter_offset;
@@ -608,7 +601,11 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 	segment->closure_flag = 0;
 	segment->numsteps = 0;
 #ifdef __KERNEL_CPU__
 	segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
 #else
 	kernel_assert(max_steps == 1);
 #endif
 	VolumeStep *step = segment->steps;
@@ -669,6 +666,10 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		t = new_t;
 		if(t == ray->t)
 			break;
 		/* stop if nearly all light blocked */
 		if(accum_transmittance.x < tp_eps && accum_transmittance.y < tp_eps && accum_transmittance.z < tp_eps)
 			break;
 	}
 	/* store total emission and transmittance */
@@ -690,7 +691,9 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 /* Release the step storage of a recorded VolumeSegment.
 *
 * Under __KERNEL_CPU__ segment->steps is a pointer and is passed to free();
 * in other builds this function body is empty. kg is not used. */
 ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
 {
 #ifdef __KERNEL_CPU__
 	free(segment->steps);
 #endif
 }
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray
@@ -701,7 +704,8 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
 * these also do not do emission or modify throughput. */
 ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
-	float3 *throughput, RNG *rng, VolumeSegment *segment)
+	float3 *throughput, float rphase, float rscatter,
 	const VolumeSegment *segment, const float3 *light_P, bool probalistic_scatter)
 {
 	int closure_flag = segment->closure_flag;
@@ -710,38 +714,56 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
 	float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
 	int channel = (int)(rphase*3.0f);
 	sd->randb_closure = rphase*3.0f - channel;
 	float xi = rscatter;
-	float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+	/* probalistic scattering decision based on transmittance */
 	if(probalistic_scatter) {
 		float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
 		if(1.0f - xi >= sample_transmittance) {
 			/* rescale random number so we can reuse it */
 			xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
 		}
 		else
 			return VOLUME_PATH_MISSED;
 	}
 	VolumeStep *step;
 	float3 transmittance;
 	float pdf, sample_t;
 	float mis_weight = 1.0f;
 	bool distance_sample = true;
 	bool use_mis = false;
-	/* pick position on light for equiangular */
+	if(kernel_data.integrator.volume_homogeneous_sampling && light_P) {
-	bool equiangular = (kernel_data.integrator.volume_homogeneous_sampling != 0 && kernel_data.integrator.num_all_lights);
+		if(kernel_data.integrator.volume_homogeneous_sampling == 2) {
-	float3 light_P;
+			/* multiple importance sample: randomly pick between
 			 * equiangular and distance sampling strategy */
 			if(xi < 0.5f) {
 				xi *= 2.0f;
 			}
 			else {
 				xi = (xi - 0.5f)*2.0f;
 				distance_sample = false;
 			}
-	if(equiangular) {
+			use_mis = true;
-		bool light_distant;
+		}
-
+		else {
-		if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P, &light_distant))
+			/* only equiangular sampling */
-			return VOLUME_PATH_MISSED;
+			distance_sample = false;
-
+		}
 		/* distant light, revert to distance sampling because position is infinitely far away */
 		if(light_distant)
 			equiangular = false;
 	}
 	/* distance sampling */
-	if(!equiangular) {
+	if(distance_sample) {
 		/* find step in cdf */
 		step = segment->steps;
 		float prev_t = 0.0f;
-		float3 step_pdf = make_float3(1.0f, 1.0f, 1.0f);
+		float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
 		if(segment->numsteps > 1) {
 			float prev_cdf = 0.0f;
@@ -764,7 +786,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 			xi = (xi - prev_cdf)/(step_cdf - prev_cdf);
 			/* pdf for picking step */
-			step_pdf = step->cdf_distance - prev_cdf_distance;
+			step_pdf_distance = step->cdf_distance - prev_cdf_distance;
 		}
 		/* determine range in which we will sample */
@@ -773,30 +795,59 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		/* sample distance and compute transmittance */
 		float3 distance_pdf;
 		sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
-		pdf = average(distance_pdf * step_pdf);
+
 		/* modifiy pdf for hit/miss decision */
 		if(probalistic_scatter)
 			distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance;
 		pdf = average(distance_pdf * step_pdf_distance);
 		/* multiple importance sampling */
 		if(use_mis) {
 			float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t);
 			mis_weight = 2.0f*power_heuristic(pdf, equi_pdf);
 		}
 	}
 	/* equi-angular sampling */
 	else {
 		/* sample distance */
-		sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &pdf);
+		sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf);
 		/* find step in which sampled distance is located */
 		step = segment->steps;
 		float prev_t = 0.0f;
 		float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
 		if(segment->numsteps > 1) {
 			/* todo: optimize using binary search */
 			float3 prev_cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
 			for(int i = 0; i < segment->numsteps-1; i++, step++) {
 				if(sample_t < step->t)
 					break;
 				prev_t = step->t;
 				prev_cdf_distance = step->cdf_distance;
 			}
 			/* pdf for picking step with distance sampling */
 			step_pdf_distance = step->cdf_distance - prev_cdf_distance;
 		}
-		
+
 		/* determine range in which we will sample */
 		float step_t = step->t - prev_t;
 		float step_sample_t = sample_t - prev_t;
 		/* compute transmittance */
-		transmittance = volume_color_transmittance(step->sigma_t, sample_t - prev_t);
+		transmittance = volume_color_transmittance(step->sigma_t, step_sample_t);
 		/* multiple importance sampling */
 		if(use_mis) {
 			float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
 			float distance_pdf = average(distance_pdf3 * step_pdf_distance);
 			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
 		}
 	}
 	/* compute transmittance up to this step */
@@ -804,7 +855,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		transmittance *= (step-1)->accum_transmittance;
 	/* modify throughput */
-	*throughput *= step->sigma_s * transmittance / pdf;
+	*throughput *= step->sigma_s * transmittance * (mis_weight / pdf);
 	/* evaluate shader to create closures at shading point */
 	if(segment->numsteps > 1) {
@@ -820,40 +871,28 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	return VOLUME_PATH_SCATTERED;
 }
-#endif
+/* decide if we need to use decoupled or not */
-
+ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct)
 /* get the volume attenuation and emission over line segment defined by
 * ray, with the assumption that there are no surfaces blocking light
 * between the endpoints */
 ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
 	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
 {
-	/* workaround to fix correlation bug in T38710, can find better solution
+	/* decoupled ray marching for heterogenous volumes not supported on the GPU,
-	 * in random number generator later, for now this is done here to not impact
+	 * which also means equiangular and multiple importance sampling is not
-	 * performance of rendering without volumes */
+	 * support for that case */
-	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
+#ifdef __KERNEL_GPU__
 	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
 #if 0
 	/* debugging code to compare decoupled ray marching */
 	VolumeSegment segment;
 	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
 	kernel_volume_decoupled_record(kg, state, ray, sd, &segment, heterogeneous);
 	VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, throughput, &tmp_rng, &segment);
 	kernel_volume_decoupled_free(kg, &segment);
 	return result;
 #else
 	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
 	if(heterogeneous)
-		return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
+		return false;
 	else
 		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
 #endif
 	/* equiangular sampling only implemented for decoupled */
 	bool equiangular = kernel_data.integrator.volume_homogeneous_sampling != 0;
 	if(equiangular)
 		return true;
 	/* for all light sampling use decoupled, reusing shader evaluations is
 	 * typically faster in that case */
 	if(direct)
 		return kernel_data.integrator.sample_all_lights_direct;
 	else
 		return kernel_data.integrator.sample_all_lights_indirect;
 }
 /* Volume Stack
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -101,7 +101,11 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	if(!transparent_shadows)
 		kintegrator->transparent_shadows = false;
-	kintegrator->volume_homogeneous_sampling = volume_homogeneous_sampling;
+	if(kintegrator->num_all_lights > 0)
 		kintegrator->volume_homogeneous_sampling = volume_homogeneous_sampling;
 	else
 		kintegrator->volume_homogeneous_sampling = 0;
 	kintegrator->volume_max_steps = volume_max_steps;
 	kintegrator->volume_step_size = volume_step_size;
@@ -125,8 +129,15 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->mesh_light_samples = mesh_light_samples;
 	kintegrator->subsurface_samples = subsurface_samples;
 	kintegrator->volume_samples = volume_samples;
-	kintegrator->sample_all_lights_direct = sample_all_lights_direct;
+
-	kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
+	if(method == BRANCHED_PATH) {
 		kintegrator->sample_all_lights_direct = sample_all_lights_direct;
 		kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
 	}
 	else {
 		kintegrator->sample_all_lights_direct = false;
 		kintegrator->sample_all_lights_indirect = false;
 	}
 	kintegrator->sampling_pattern = sampling_pattern;
 	kintegrator->aa_samples = aa_samples;