Cuda: use streams and async calls to avoid busy-waiting
This switches the CUDA API usage towards more of the Async calls, so the host no longer blocks on a context-wide synchronize after every kernel launch. Syncing and updating progress only about once every second is sufficiently cheap that I don't think it is worth doing it less often. Reviewed By: brecht Differential Revision: https://developer.blender.org/D262
This commit is contained in:
@@ -41,11 +41,14 @@ public:
|
|||||||
CUdevice cuDevice;
|
CUdevice cuDevice;
|
||||||
CUcontext cuContext;
|
CUcontext cuContext;
|
||||||
CUmodule cuModule;
|
CUmodule cuModule;
|
||||||
|
CUstream cuStream;
|
||||||
|
CUevent tileDone;
|
||||||
map<device_ptr, bool> tex_interp_map;
|
map<device_ptr, bool> tex_interp_map;
|
||||||
int cuDevId;
|
int cuDevId;
|
||||||
int cuDevArchitecture;
|
int cuDevArchitecture;
|
||||||
bool first_error;
|
bool first_error;
|
||||||
bool use_texture_storage;
|
bool use_texture_storage;
|
||||||
|
unsigned int target_update_frequency;
|
||||||
|
|
||||||
struct PixelMem {
|
struct PixelMem {
|
||||||
GLuint cuPBO;
|
GLuint cuPBO;
|
||||||
@@ -177,6 +180,8 @@ public:
|
|||||||
first_error = true;
|
first_error = true;
|
||||||
background = background_;
|
background = background_;
|
||||||
use_texture_storage = true;
|
use_texture_storage = true;
|
||||||
|
/* we try an update / sync every 1000 ms */
|
||||||
|
target_update_frequency = 1000;
|
||||||
|
|
||||||
cuDevId = info.num;
|
cuDevId = info.num;
|
||||||
cuDevice = 0;
|
cuDevice = 0;
|
||||||
@@ -207,6 +212,9 @@ public:
|
|||||||
if(cuda_error_(result, "cuCtxCreate"))
|
if(cuda_error_(result, "cuCtxCreate"))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
cuda_assert(cuStreamCreate(&cuStream, 0))
|
||||||
|
cuda_assert(cuEventCreate(&tileDone, 0x1))
|
||||||
|
|
||||||
int major, minor;
|
int major, minor;
|
||||||
cuDeviceComputeCapability(&major, &minor, cuDevId);
|
cuDeviceComputeCapability(&major, &minor, cuDevId);
|
||||||
cuDevArchitecture = major*100 + minor*10;
|
cuDevArchitecture = major*100 + minor*10;
|
||||||
@@ -223,6 +231,8 @@ public:
|
|||||||
{
|
{
|
||||||
task_pool.stop();
|
task_pool.stop();
|
||||||
|
|
||||||
|
cuda_assert(cuEventDestroy(tileDone))
|
||||||
|
cuda_assert(cuStreamDestroy(cuStream))
|
||||||
cuda_assert(cuCtxDestroy(cuContext))
|
cuda_assert(cuCtxDestroy(cuContext))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -645,9 +655,7 @@ public:
|
|||||||
|
|
||||||
cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
|
cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
|
||||||
cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
|
cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
|
||||||
cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))
|
cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream))
|
||||||
|
|
||||||
cuda_assert(cuCtxSynchronize())
|
|
||||||
|
|
||||||
cuda_pop_context();
|
cuda_pop_context();
|
||||||
}
|
}
|
||||||
@@ -964,11 +972,16 @@ public:
|
|||||||
|
|
||||||
bool branched = task->integrator_branched;
|
bool branched = task->integrator_branched;
|
||||||
|
|
||||||
|
|
||||||
/* keep rendering tiles until done */
|
/* keep rendering tiles until done */
|
||||||
while(task->acquire_tile(this, tile)) {
|
while(task->acquire_tile(this, tile)) {
|
||||||
int start_sample = tile.start_sample;
|
int start_sample = tile.start_sample;
|
||||||
int end_sample = tile.start_sample + tile.num_samples;
|
int end_sample = tile.start_sample + tile.num_samples;
|
||||||
|
|
||||||
|
boost::posix_time::ptime start_time(boost::posix_time::microsec_clock::local_time());
|
||||||
|
boost::posix_time::ptime last_time = start_time;
|
||||||
|
int sync_sample = 10;
|
||||||
|
|
||||||
for(int sample = start_sample; sample < end_sample; sample++) {
|
for(int sample = start_sample; sample < end_sample; sample++) {
|
||||||
if (task->get_cancel()) {
|
if (task->get_cancel()) {
|
||||||
if(task->need_finish_queue == false)
|
if(task->need_finish_queue == false)
|
||||||
@@ -978,8 +991,28 @@ public:
|
|||||||
path_trace(tile, sample, branched);
|
path_trace(tile, sample, branched);
|
||||||
|
|
||||||
tile.sample = sample + 1;
|
tile.sample = sample + 1;
|
||||||
|
|
||||||
task->update_progress(tile);
|
task->update_progress(tile);
|
||||||
|
|
||||||
|
if(sample == sync_sample){
|
||||||
|
cuda_push_context();
|
||||||
|
cuda_assert(cuEventRecord(tileDone, cuStream ))
|
||||||
|
cuda_assert(cuEventSynchronize(tileDone))
|
||||||
|
|
||||||
|
/* Do some time keeping to find out if we need to sync less */
|
||||||
|
boost::posix_time::ptime current_time(boost::posix_time::microsec_clock::local_time());
|
||||||
|
boost::posix_time::time_duration sample_duration = current_time - last_time;
|
||||||
|
|
||||||
|
long msec = sample_duration.total_milliseconds();
|
||||||
|
float scaling_factor = (float)target_update_frequency / (float)msec;
|
||||||
|
|
||||||
|
/* sync at earliest next sample and probably later */
|
||||||
|
sync_sample = (sample + 1) + sync_sample * ceil(scaling_factor);
|
||||||
|
|
||||||
|
sync_sample = min(end_sample - 1, sync_sample); // make sure we sync the last sample always
|
||||||
|
|
||||||
|
last_time = current_time;
|
||||||
|
cuda_pop_context();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
task->release_tile(tile);
|
task->release_tile(tile);
|
||||||
|
Reference in New Issue
Block a user