Use generic task scheduler for threaded image processor

This allows scheduling smaller tasks without incurring thread overhead or having to worry about splitting tasks into smaller pieces. It simplifies the code in color management, which was manually splitting the task into smaller chunks to keep memory usage low. Further optimization is possible by avoiding malloc calls from threads, but that is how it has worked for ages already, and it will be optimized in a separate patch.
2013-12-25 20:32:13 +06:00
parent cdd95e354e
commit 64aef25b83
2 changed files with 63 additions and 78 deletions
--- a/source/blender/imbuf/intern/colormanagement.c
+++ b/source/blender/imbuf/intern/colormanagement.c
@@ -1318,13 +1318,11 @@ static void display_buffer_init_handle(void *handle_v, int start_line, int tot_l
 	handle->float_colorspace = init_data->float_colorspace;
 }

-static void display_buffer_apply_get_linear_buffer(DisplayBufferThread *handle, int start_scanline, int num_scanlines,
+static void display_buffer_apply_get_linear_buffer(DisplayBufferThread *handle, int height,
                                                   float *linear_buffer, bool *is_straight_alpha)
 {
 	int channels = handle->channels;
 	int width = handle->width;
-	int height = num_scanlines;
-	int scanline_offset = channels * start_scanline * width;

 	int buffer_size = channels * width * height;

@@ -1342,7 +1340,7 @@ static void display_buffer_apply_get_linear_buffer(DisplayBufferThread *handle,
 		int i;

 		/* first convert byte buffer to float, keep in image space */
-		for (i = 0, fp = linear_buffer, cp = byte_buffer + scanline_offset;
+		for (i = 0, fp = linear_buffer, cp = byte_buffer;
 		     i < width * height;
 		     i++, fp += channels, cp += channels)
 		{
@@ -1375,7 +1373,7 @@ static void display_buffer_apply_get_linear_buffer(DisplayBufferThread *handle,
 		const char *from_colorspace = handle->float_colorspace;
 		const char *to_colorspace = global_role_scene_linear;

-		memcpy(linear_buffer, handle->buffer + scanline_offset, buffer_size * sizeof(float));
+		memcpy(linear_buffer, handle->buffer, buffer_size * sizeof(float));

 		if (!is_data && !is_data_display) {
 			IMB_colormanagement_transform(linear_buffer, width, height, channels,
@@ -1391,7 +1389,7 @@ static void display_buffer_apply_get_linear_buffer(DisplayBufferThread *handle,
 		 * using duplicated buffer here
 		 */

-		memcpy(linear_buffer, handle->buffer + scanline_offset, buffer_size * sizeof(float));
+		memcpy(linear_buffer, handle->buffer, buffer_size * sizeof(float));

 		*is_straight_alpha = false;
 	}
@@ -1421,28 +1419,12 @@ static void *do_display_buffer_apply_thread(void *handle_v)
 		}
 	}
 	else {
-#define SCANLINE_BLOCK_SIZE 64
-		/* TODO(sergey): Instead of nasty scanline-blocking in per-scanline-block thread we might
-		 *               better to use generic task scheduler, but that would need extra testing
-		 *               before deploying into production.
-		 */
-
-		int scanlines = (height + SCANLINE_BLOCK_SIZE - 1) / SCANLINE_BLOCK_SIZE;
-		int i;
-		float *linear_buffer = MEM_mallocN(channels * width * SCANLINE_BLOCK_SIZE * sizeof(float),
+		bool is_straight_alpha, predivide;
+		float *linear_buffer = MEM_mallocN(channels * width * height * sizeof(float),
 		                                   "color conversion linear buffer");

-		for (i = 0; i < scanlines; i ++) {
-			int start_scanline = i * SCANLINE_BLOCK_SIZE;
-			int num_scanlines = (i == scanlines - 1) ?
-			                    (height - SCANLINE_BLOCK_SIZE * i) :
-			                    SCANLINE_BLOCK_SIZE;
-			int scanline_offset = channels * start_scanline * width;
-			int scanline_offset4 = 4 * start_scanline * width;
-			bool is_straight_alpha, predivide;
+		display_buffer_apply_get_linear_buffer(handle, height, linear_buffer, &is_straight_alpha);

-			display_buffer_apply_get_linear_buffer(handle, start_scanline, num_scanlines,
-			                                       linear_buffer, &is_straight_alpha);
 		predivide = is_straight_alpha == false;

 		if (is_data) {
@@ -1452,38 +1434,35 @@ static void *do_display_buffer_apply_thread(void *handle_v)
 		}
 		else {
 			/* apply processor */
-				IMB_colormanagement_processor_apply(cm_processor, linear_buffer, width, num_scanlines, channels,
+			IMB_colormanagement_processor_apply(cm_processor, linear_buffer, width, height, channels,
 			                                    predivide);
 		}

 		/* copy result to output buffers */
 		if (display_buffer_byte) {
 			/* do conversion */
-				IMB_buffer_byte_from_float(display_buffer_byte + scanline_offset4, linear_buffer,
+			IMB_buffer_byte_from_float(display_buffer_byte, linear_buffer,
 			                           channels, dither, IB_PROFILE_SRGB, IB_PROFILE_SRGB,
-				                           predivide, width, num_scanlines, width, width);
+			                           predivide, width, height, width, width);
 		}

 		if (display_buffer) {
-				memcpy(display_buffer + scanline_offset, linear_buffer, width * num_scanlines * channels * sizeof(float));
+			memcpy(display_buffer, linear_buffer, width * height * channels * sizeof(float));

 			if (is_straight_alpha && channels == 4) {
 				int i;
 				float *fp;

 				for (i = 0, fp = display_buffer;
-					     i < width * num_scanlines;
+				     i < width * height;
 				     i++, fp += channels)
 				{
 					straight_to_premul_v4(fp);
 				}
 			}
 		}
-		}

 		MEM_freeN(linear_buffer);
-
-#undef SCANLINE_BLOCK_SIZE
 	}

 	return NULL;
--- a/source/blender/imbuf/intern/imageprocess.c
+++ b/source/blender/imbuf/intern/imageprocess.c
@@ -41,7 +41,7 @@
 #include "MEM_guardedalloc.h"

 #include "BLI_utildefines.h"
-#include "BLI_threads.h"
+#include "BLI_task.h"
 #include "BLI_listbase.h"
 #include "BLI_math.h"

@@ -288,48 +288,54 @@ void nearest_interpolation(ImBuf *in, ImBuf *out, float x, float y, int xout, in

 /*********************** Threaded image processing *************************/

+static void processor_apply_func(TaskPool *pool, void *taskdata, int UNUSED(threadid))
+{
+	void (*do_thread) (void *) = (void (*) (void *)) BLI_task_pool_userdata(pool);
+	do_thread(taskdata);
+}
+
 void IMB_processor_apply_threaded(int buffer_lines, int handle_size, void *init_customdata,
                                  void (init_handle) (void *handle, int start_line, int tot_line,
                                                      void *customdata),
                                  void *(do_thread) (void *))
 {
+	const int lines_per_task = 64;
+
+	TaskScheduler *task_scheduler = BLI_task_scheduler_get();
+	TaskPool *task_pool;
+
 	void *handles;
-	ListBase threads;
+	int total_tasks = (buffer_lines + lines_per_task - 1) / lines_per_task;
+	int i, start_line;

-	int i, tot_thread = BLI_system_thread_count();
-	int start_line, tot_line;
+	task_pool = BLI_task_pool_create(task_scheduler, do_thread);

-	handles = MEM_callocN(handle_size * tot_thread, "processor apply threaded handles");
-
-	if (tot_thread > 1)
-		BLI_init_threads(&threads, do_thread, tot_thread);
+	handles = MEM_callocN(handle_size * total_tasks, "processor apply threaded handles");

 	start_line = 0;
-	tot_line = ((float)(buffer_lines / tot_thread)) + 0.5f;

-	for (i = 0; i < tot_thread; i++) {
-		int cur_tot_line;
+	for (i = 0; i < total_tasks; i++) {
+		int lines_per_current_task;
 		void *handle = ((char *) handles) + handle_size * i;

-		if (i < tot_thread - 1)
-			cur_tot_line = tot_line;
+		if (i < total_tasks - 1)
+			lines_per_current_task = lines_per_task;
 		else
-			cur_tot_line = buffer_lines - start_line;
+			lines_per_current_task = buffer_lines - start_line;

-		init_handle(handle, start_line, cur_tot_line, init_customdata);
+		init_handle(handle, start_line, lines_per_current_task, init_customdata);

-		if (tot_thread > 1)
-			BLI_insert_thread(&threads, handle);
+		BLI_task_pool_push(task_pool, processor_apply_func, handle, false, TASK_PRIORITY_LOW);

-		start_line += tot_line;
+		start_line += lines_per_task;
 	}

-	if (tot_thread > 1)
-		BLI_end_threads(&threads);
-	else
-		do_thread(handles);
+	/* work and wait until tasks are done */
+	BLI_task_pool_work_and_wait(task_pool);

+	/* Free memory. */
 	MEM_freeN(handles);
+	BLI_task_pool_free(task_pool);
 }

 /* Alpha-under */