Merge branch 'master' into 28

2017-04-12 14:23:47 +10:00
parent ed8c71da1c 0ebe08af34
commit dc1499ba1c
6 changed files with 38 additions and 20 deletions
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1613,10 +1613,23 @@ int2 CUDASplitKernel::split_kernel_local_size()
 	return make_int2(32, 1);
 }

-int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/)
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)  /* Picks the 2D global work size from the device memory that cuMemGetInfo() reports as free. */
 {
-	/* TODO(mai): implement something here to detect ideal work size */
-	return make_int2(256, 256);
+	size_t free;  /* Filled in by cuMemGetInfo() below. */
+	size_t total;  /* Filled in by cuMemGetInfo() below; not read again in this function. */
+
+	device->cuda_push_context();  /* The memory query is bracketed by a context push/pop on the device. */
+	cuda_assert(cuMemGetInfo(&free, &total));
+	device->cuda_pop_context();
+
+	VLOG(1) << "Maximum device allocation size: "  /* The value logged under this label is `free`. */
+	        << string_human_readable_number(free) << " bytes. ("
+	        << string_human_readable_size(free) << ").";
+
+	size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);  /* Only half of `free` is passed on as the size limit; presumably the result is the element count that fits -- TODO confirm against the helper. */
+	int2 global_size = make_int2(round_down((int)sqrt(num_elements), 32), (int)sqrt(num_elements));  /* x: truncated sqrt passed through round_down(.., 32); y: truncated sqrt as-is. NOTE(review): if round_down() snaps to a multiple of 32, x is 0 when num_elements < 1024 -- confirm. */
+	VLOG(1) << "Global size: " << global_size << ".";
+	return global_size;  /* The DeviceTask argument is not consulted. */
 }

 bool device_cuda_init(void)
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -128,26 +128,27 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 		local_size[1] = lsize[1];
 	}

-	/* Set gloabl size */
-	size_t global_size[2];
-	{
-		int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
-
-		/* Make sure that set work size is a multiple of local
-		 * work size dimensions.
-		 */
-		global_size[0] = round_up(gsize[0], local_size[0]);
-		global_size[1] = round_up(gsize[1], local_size[1]);
-	}
-
 	/* Number of elements in the global state buffer */
 	int num_global_elements = global_size[0] * global_size[1];
-	assert(num_global_elements % WORK_POOL_SIZE == 0);

 	/* Allocate all required global memory once. */
 	if(first_tile) {
 		first_tile = false;

+		/* Set global size */
+		{
+			int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
+
+			/* Make sure that set work size is a multiple of local
+			 * work size dimensions.
+			 */
+			global_size[0] = round_up(gsize[0], local_size[0]);
+			global_size[1] = round_up(gsize[1], local_size[1]);
+		}
+
+		num_global_elements = global_size[0] * global_size[1];
+		assert(num_global_elements % WORK_POOL_SIZE == 0);
+
 		/* Calculate max groups */

 		/* Denotes the maximum work groups possible w.r.t. current requested tile size. */
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -95,6 +95,9 @@ private:
 	/* Marked True in constructor and marked false at the end of path_trace(). */
 	bool first_tile;

+	/* Cached global size */
+	size_t global_size[2];
+
 public:
 	explicit DeviceSplitKernel(Device* device);
 	virtual ~DeviceSplitKernel();