Cycles OpenCL: keep the opencl context and program around for quicker rendering

the second time, as for example Intel CPU startup time is 9 seconds.

* Adds an cache for contexts and programs for each platform and device pair,
  which also ensure now no two threads try to compile and write the binary cache
  file at the same time.
* Change clFinish to clFlush so we don't block until the result is done, instead
  it will block at the moment we copy back memory.
* Fix error in Cycles time_sleep implementation, does not affect any active code
  though.
* Adds some (disabled) debugging code in the task scheduler.

Patch #35559 by Doug Gale.
This commit is contained in:
Brecht Van Lommel
2013-05-31 16:19:03 +00:00
parent db42a596aa
commit 2d0a586c29
4 changed files with 310 additions and 57 deletions

View File

@@ -104,12 +104,194 @@ static string opencl_kernel_build_options(const string& platform, const string *
if(opencl_kernel_use_debug())
build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
if (opencl_kernel_use_advanced_shading(platform))
if(opencl_kernel_use_advanced_shading(platform))
build_options += "-D__KERNEL_OPENCL_NEED_ADVANCED_SHADING__ ";
return build_options;
}
/* thread safe cache for contexts and programs */
class OpenCLCache
{
struct Slot
{
thread_mutex *mutex;
cl_context context;
cl_program program;
Slot() : mutex(NULL), context(NULL), program(NULL) {}
Slot(const Slot &rhs)
: mutex(rhs.mutex)
, context(rhs.context)
, program(rhs.program)
{
/* copy can only happen in map insert, assert that */
assert(mutex == NULL);
}
~Slot()
{
delete mutex;
mutex = NULL;
}
};
/* key is combination of platform ID and device ID */
typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
/* map of Slot objects */
typedef map<PlatformDevicePair, Slot> CacheMap;
CacheMap cache;
thread_mutex cache_lock;
/* lazy instantiate */
static OpenCLCache &global_instance()
{
static OpenCLCache instance;
return instance;
}
OpenCLCache()
{
}
~OpenCLCache()
{
/* Intel OpenCL bug raises SIGABRT due to pure virtual call
* so this is disabled. It's not necessary to free objects
* at process exit anyway.
* http://software.intel.com/en-us/forums/topic/370083#comments */
//flush();
}
/* lookup something in the cache. If this returns NULL, slot_locker
* will be holding a lock for the cache. slot_locker should refer to a
* default constructed thread_scoped_lock */
template<typename T>
static T get_something(cl_platform_id platform, cl_device_id device,
T Slot::*member, cl_int (*retain_func)(T), thread_scoped_lock &slot_locker)
{
assert(platform != NULL);
OpenCLCache &self = global_instance();
thread_scoped_lock cache_lock(self.cache_lock);
pair<CacheMap::iterator,bool> ins = self.cache.insert(
CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
Slot &slot = ins.first->second;
/* create slot lock only while holding cache lock */
if(!slot.mutex)
slot.mutex = new thread_mutex;
/* need to unlock cache before locking slot, to allow store to complete */
cache_lock.unlock();
/* lock the slot */
slot_locker = thread_scoped_lock(*slot.mutex);
/* If the thing isn't cached */
if(slot.*member == NULL) {
/* return with the caller's lock holder holding the slot lock */
return NULL;
}
/* the item was already cached, release the slot lock */
slot_locker.unlock();
/* caller is going to release it when done with it, so retain it */
cl_int ciErr = retain_func(slot.*member);
assert(ciErr == CL_SUCCESS);
(void)ciErr;
return slot.*member;
}
/* store something in the cache. you MUST have tried to get the item before storing to it */
template<typename T>
static void store_something(cl_platform_id platform, cl_device_id device, T thing,
T Slot::*member, cl_int (*retain_func)(T), thread_scoped_lock &slot_locker)
{
assert(platform != NULL);
assert(device != NULL);
assert(thing != NULL);
OpenCLCache &self = global_instance();
thread_scoped_lock cache_lock(self.cache_lock);
CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
cache_lock.unlock();
Slot &slot = i->second;
/* sanity check */
assert(i != self.cache.end());
assert(slot.*member == NULL);
slot.*member = thing;
/* unlock the slot */
slot_locker.unlock();
/* increment reference count in OpenCL.
* The caller is going to release the object when done with it. */
cl_int ciErr = retain_func(thing);
assert(ciErr == CL_SUCCESS);
(void)ciErr;
}
public:
/* see get_something comment */
static cl_context get_context(cl_platform_id platform, cl_device_id device,
thread_scoped_lock &slot_locker)
{
return get_something(platform, device, &Slot::context, clRetainContext, slot_locker);
}
/* see get_something comment */
static cl_program get_program(cl_platform_id platform, cl_device_id device,
thread_scoped_lock &slot_locker)
{
return get_something(platform, device, &Slot::program, clRetainProgram, slot_locker);
}
/* see store_something comment */
static void store_context(cl_platform_id platform, cl_device_id device, cl_context context,
thread_scoped_lock &slot_locker)
{
store_something(platform, device, context, &Slot::context, clRetainContext, slot_locker);
}
/* see store_something comment */
static void store_program(cl_platform_id platform, cl_device_id device, cl_program program,
thread_scoped_lock &slot_locker)
{
store_something(platform, device, program, &Slot::program, clRetainProgram, slot_locker);
}
/* discard all cached contexts and programs
* the parameter is a temporary workaround. See OpenCLCache::~OpenCLCache */
static void flush()
{
OpenCLCache &self = global_instance();
thread_scoped_lock cache_lock(self.cache_lock);
foreach(CacheMap::value_type &item, self.cache) {
if(item.second.program != NULL)
clReleaseProgram(item.second.program);
if(item.second.context != NULL)
clReleaseContext(item.second.context);
}
self.cache.clear();
}
};
class OpenCLDevice : public Device
{
public:
@@ -290,21 +472,34 @@ public:
opencl_error("OpenCL: no devices found.");
return;
}
else if (!cdDevice) {
else if(!cdDevice) {
opencl_error("OpenCL: specified device not found.");
return;
}
/* Create context properties array to specify platform */
const cl_context_properties context_props[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
0, 0
};
{
/* try to use cached context */
thread_scoped_lock cache_locker;
cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
/* create context */
cxContext = clCreateContext(context_props, 1, &cdDevice, NULL, NULL, &ciErr);
if(opencl_error(ciErr))
return;
if(cxContext == NULL) {
/* create context properties array to specify platform */
const cl_context_properties context_props[] = {
CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
0, 0
};
/* create context */
cxContext = clCreateContext(context_props, 1, &cdDevice,
context_notify_callback, cdDevice, &ciErr);
if(opencl_error(ciErr))
return;
/* cache it */
OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
}
}
cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
if(opencl_error(ciErr))
@@ -317,6 +512,15 @@ public:
device_initialized = true;
}
static void context_notify_callback(const char *err_info,
const void *private_info, size_t cb, void *user_data)
{
char name[256];
clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
}
bool opencl_version_check()
{
char version[256];
@@ -436,7 +640,7 @@ public:
string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
source = path_source_replace_includes(source, kernel_path);
if (debug_src)
if(debug_src)
path_write_text(*debug_src, source);
size_t source_len = source.size();
@@ -487,39 +691,49 @@ public:
return false;
}
/* verify we have right opencl version */
if(!opencl_version_check())
return false;
/* try to use cached kernel */
thread_scoped_lock cache_locker;
cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker);
/* md5 hash to detect changes */
string kernel_path = path_get("kernel");
string kernel_md5 = path_files_md5_hash(kernel_path);
string device_md5 = device_md5_hash();
/* path to cached binary */
string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
clbin = path_user_get(path_join("cache", clbin));
/* path to preprocessed source for debugging */
string clsrc, *debug_src = NULL;
if (opencl_kernel_use_debug()) {
clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
clsrc = path_user_get(path_join("cache", clsrc));
debug_src = &clsrc;
}
/* if exists already, try use it */
if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
/* kernel loaded from binary */
}
else {
/* if does not exist or loading binary failed, compile kernel */
if(!compile_kernel(kernel_path, kernel_md5, debug_src))
if(!cpProgram) {
/* verify we have right opencl version */
if(!opencl_version_check())
return false;
/* save binary for reuse */
save_binary(clbin);
/* md5 hash to detect changes */
string kernel_path = path_get("kernel");
string kernel_md5 = path_files_md5_hash(kernel_path);
string device_md5 = device_md5_hash();
/* path to cached binary */
string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
clbin = path_user_get(path_join("cache", clbin));
/* path to preprocessed source for debugging */
string clsrc, *debug_src = NULL;
if(opencl_kernel_use_debug()) {
clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
clsrc = path_user_get(path_join("cache", clsrc));
debug_src = &clsrc;
}
/* if exists already, try use it */
if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
/* kernel loaded from binary */
}
else {
/* if does not exist or loading binary failed, compile kernel */
if(!compile_kernel(kernel_path, kernel_md5, debug_src))
return false;
/* save binary for reuse */
if(!save_binary(clbin))
return false;
}
/* cache the program */
OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker);
}
/* find kernels */
@@ -563,12 +777,17 @@ public:
{
size_t size = mem.memory_size();
cl_mem_flags mem_flag;
void *mem_ptr = NULL;
if(type == MEM_READ_ONLY)
mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, size, NULL, &ciErr);
mem_flag = CL_MEM_READ_ONLY;
else if(type == MEM_WRITE_ONLY)
mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_WRITE_ONLY, size, NULL, &ciErr);
mem_flag = CL_MEM_WRITE_ONLY;
else
mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_WRITE, size, NULL, &ciErr);
mem_flag = CL_MEM_READ_WRITE;
mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
opencl_assert(ciErr);
@@ -664,7 +883,7 @@ public:
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
/* some implementations have max size 1 on 2nd dimension */
if (local_size[1] > max_work_items[1]) {
if(local_size[1] > max_work_items[1]) {
local_size[0] = workgroup_size/max_work_items[1];
local_size[1] = max_work_items[1];
}
@@ -674,7 +893,7 @@ public:
/* run kernel */
ciErr = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
opencl_assert(ciErr);
opencl_assert(clFinish(cqCommandQueue));
opencl_assert(clFlush(cqCommandQueue));
}
void path_trace(RenderTile& rtile, int sample)
@@ -789,7 +1008,7 @@ public:
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
if (task->get_cancel()) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
@@ -798,7 +1017,7 @@ public:
tile.sample = sample + 1;
task->update_progress(tile);
//task->update_progress(tile);
}
task->release_tile(tile);

View File

@@ -114,6 +114,7 @@ PFNCLGETEXTENSIONFUNCTIONADDRESS __clewGetExtensionFunctionAddress = NULL;
#endif // CLCC_GENERATE_DOCUMENTATION
#if 0
//! \brief Unloads OpenCL dynamic library, should not be called directly
static void clewExit(void)
{
@@ -124,6 +125,7 @@ static void clewExit(void)
module = NULL;
}
}
#endif
//! \param path path to dynamic library to load
//! \return CLEW_ERROR_OPEN_FAILED if the library could not be opened
@@ -138,7 +140,6 @@ int clLibraryInit()
#else
const char *path = "libOpenCL.so";
#endif
int error = 0;
// OpenCL disabled for now, only works with this environment variable set
if(!getenv("CYCLES_OPENCL_TEST"))
@@ -159,8 +160,11 @@ int clLibraryInit()
return 0;
}
// Disabled because we retain OpenCL context and it's difficult to ensure
// this will exit after releasing the context
#if 0
// Set unloading
error = atexit(clewExit);
int error = atexit(clewExit);
if (error)
{
@@ -170,6 +174,7 @@ int clLibraryInit()
return 0;
}
#endif
// Determine function entry-points
__clewGetPlatformIDs = (PFNCLGETPLATFORMIDS )CLCC_DYNLIB_IMPORT(module, "clGetPlatformIDs");

View File

@@ -21,6 +21,15 @@
#include "util_system.h"
#include "util_task.h"
//#define THREADING_DEBUG_ENABLED
#ifdef THREADING_DEBUG_ENABLED
#include <stdio.h>
#define THREADING_DEBUG(...) do { printf(__VA_ARGS__); fflush(stdout); } while(0)
#else
#define THREADING_DEBUG(...)
#endif
CCL_NAMESPACE_BEGIN
/* Task Pool */
@@ -95,8 +104,11 @@ void TaskPool::wait_work()
if(num == 0)
break;
if(!found_entry)
if(!found_entry) {
THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::wait_work !found_entry\n", num);
num_cond.wait(num_lock);
THREADING_DEBUG("num==%d, condition wait done in TaskPool::wait_work !found_entry\n", num);
}
}
}
@@ -109,8 +121,11 @@ void TaskPool::cancel()
{
thread_scoped_lock num_lock(num_mutex);
while(num)
while(num) {
THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::cancel\n", num);
num_cond.wait(num_lock);
THREADING_DEBUG("num==%d condition wait done in TaskPool::cancel\n", num);
}
}
do_cancel = false;
@@ -134,8 +149,10 @@ void TaskPool::num_decrease(int done)
num -= done;
assert(num >= 0);
if(num == 0)
if(num == 0) {
THREADING_DEBUG("num==%d, notifying all in TaskPool::num_decrease\n", num);
num_cond.notify_all();
}
num_mutex.unlock();
}
@@ -144,6 +161,7 @@ void TaskPool::num_increase()
{
thread_scoped_lock num_lock(num_mutex);
num++;
THREADING_DEBUG("num==%d, notifying all in TaskPool::num_increase\n", num);
num_cond.notify_all();
}

View File

@@ -58,12 +58,23 @@ double time_dt()
return now.tv_sec + now.tv_usec*1e-6;
}
/* sleep t seconds */
void time_sleep(double t)
{
if(t >= 1.0)
sleep((int)t);
/* get whole seconds */
int s = (int)t;
usleep((int)(t*1e6));
if(s >= 1) {
sleep(s);
/* adjust parameter to remove whole seconds */
t -= s;
}
/* get microseconds */
int us = (int)(t * 1e6);
if (us > 0)
usleep(us);
}
CCL_NAMESPACE_END