Cycles: Add support for bindless textures.
This adds support for CUDA texture objects (also known as bindless textures) on Kepler GPUs (GeForce 6xx and above). They are used for all 2D/3D image textures; data textures still use arrays as before.

User benefits:
* No more image texture limits on Kepler. We had 5 float4 and 145 byte4 slots there before; now we have 1024 float4 and 1024 byte4 slots. This can be extended further if needed (just change the define).
* Single channel texture slots (byte and float) are now supported on Kepler as well (1024 slots of each type).

ToDo / Issues:
* 3D textures don't work yet; at least they don't show up during render. I have no idea what's wrong yet.
* Dynamically allocate the bindless_mapping array?

I hope Fermi still works fine, but that should be tested on a Fermi card before pushing to master.

Part of my GSoC 2016.

Reviewers: sergey, #cycles, brecht

Subscribers: swerner, jtheninja, brecht, sergey

Differential Revision: https://developer.blender.org/D1999
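For context, here is a minimal standalone sketch (not part of this patch, requires a Kepler or newer GPU, error handling reduced to an abort helper) of the CUDA driver-API sequence the device code below builds on: create a CUarray for the pixel data, wrap it in a CUtexObject, and record the handle in a flat slot-to-texture table like the bindless_mapping array that the CUDA device uploads for the kernel. File name and sizes are invented for the example.

// bindless_sketch.cpp -- illustrative only, not Cycles code.
#include <cuda.h>
#include <stdlib.h>
#include <string.h>
#include <vector>

static void check(CUresult result)
{
	/* Reduce error handling to a hard abort for the sketch. */
	if(result != CUDA_SUCCESS)
		abort();
}

int main()
{
	check(cuInit(0));

	CUdevice dev;
	CUcontext ctx;
	check(cuDeviceGet(&dev, 0));
	check(cuCtxCreate(&ctx, 0, dev));

	/* Data storage: a small float4 2D array, as tex_alloc() does for image textures. */
	CUDA_ARRAY_DESCRIPTOR desc;
	desc.Width = 16;
	desc.Height = 16;
	desc.Format = CU_AD_FORMAT_FLOAT;
	desc.NumChannels = 4;
	CUarray handle = NULL;
	check(cuArrayCreate(&handle, &desc));

	/* Texture object: addressing and filtering live in a runtime descriptor instead of
	 * a compile-time texture reference, so the number of textures is no longer limited
	 * by the texref slots compiled into the kernel module. */
	CUDA_RESOURCE_DESC resDesc;
	memset(&resDesc, 0, sizeof(resDesc));
	resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
	resDesc.res.array.hArray = handle;

	CUDA_TEXTURE_DESC texDesc;
	memset(&texDesc, 0, sizeof(texDesc));
	texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
	texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
	texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
	texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

	CUtexObject tex = 0;
	check(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));

	/* Flat slot -> handle table; the patch stores one entry per image slot and uploads
	 * it as __bindless_mapping so the kernel can look the texture up by id. The patch
	 * narrows the 64-bit handle to uint, with a safety check. */
	std::vector<unsigned int> bindless_mapping(1024, 0);
	bindless_mapping[0] = (unsigned int)tex;

	cuTexObjectDestroy(tex);
	cuArrayDestroy(handle);
	cuCtxDestroy(ctx);
	return 0;
}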
@@ -54,7 +54,7 @@ public:
 bool display_device;
 bool advanced_shading;
 bool pack_images;
-bool extended_images; /* flag for GPU and Multi device */
+bool has_bindless_textures; /* flag for GPU and Multi device */
 bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 vector<DeviceInfo> multi_devices;
 
@@ -66,7 +66,7 @@ public:
 display_device = false;
 advanced_shading = true;
 pack_images = false;
-extended_images = false;
+has_bindless_textures = false;
 use_split_kernel = false;
 }
 };
@@ -230,6 +230,7 @@ public:
 (void)interpolation; /* Ignored. */
 (void)extension; /* Ignored. */
 };
+
 virtual void tex_free(device_memory& /*mem*/) {};
 
 /* pixel memory */
@@ -85,10 +85,10 @@ public:
 CUcontext cuContext;
 CUmodule cuModule;
 map<device_ptr, bool> tex_interp_map;
+map<device_ptr, uint> tex_bindless_map;
 int cuDevId;
 int cuDevArchitecture;
 bool first_error;
-bool use_texture_storage;
 
 struct PixelMem {
 GLuint cuPBO;
@@ -99,6 +99,10 @@ public:
 
 map<device_ptr, PixelMem> pixel_mem_map;
 
+/* Bindless Textures */
+device_vector<uint> bindless_mapping;
+bool need_bindless_mapping;
+
 CUdeviceptr cuda_device_ptr(device_ptr mem)
 {
 return (CUdeviceptr)mem;
@@ -176,12 +180,13 @@ public:
 {
 first_error = true;
 background = background_;
-use_texture_storage = true;
 
 cuDevId = info.num;
 cuDevice = 0;
 cuContext = 0;
 
+need_bindless_mapping = false;
+
 /* intialize */
 if(cuda_error(cuInit(0)))
 return;
@@ -211,11 +216,6 @@ public:
 cuDeviceComputeCapability(&major, &minor, cuDevId);
 cuDevArchitecture = major*100 + minor*10;
 
-/* In order to use full 6GB of memory on Titan cards, use arrays instead
-* of textures. On earlier cards this seems slower, but on Titan it is
-* actually slightly faster in tests. */
-use_texture_storage = (cuDevArchitecture < 300);
-
 cuda_pop_context();
 }
 
@@ -223,6 +223,10 @@ public:
 {
 task_pool.stop();
 
+if(info.has_bindless_textures) {
+tex_free(bindless_mapping);
+}
+
 cuda_assert(cuCtxDestroy(cuContext));
 }
 
@@ -400,6 +404,15 @@ public:
 return (result == CUDA_SUCCESS);
 }
 
+void load_bindless_mapping()
+{
+if(info.has_bindless_textures && need_bindless_mapping) {
+tex_free(bindless_mapping);
+tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
+need_bindless_mapping = false;
+}
+}
+
 void mem_alloc(device_memory& mem, MemoryType /*type*/)
 {
 cuda_push_context();
@@ -479,126 +492,99 @@ public:
 {
 VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
 
-string bind_name = name;
-if(mem.data_depth > 1) {
-/* Kernel uses different bind names for 2d and 3d float textures,
-* so we have to adjust couple of things here.
-*/
-vector<string> tokens;
-string_split(tokens, name, "_");
-bind_name = string_printf("__tex_image_%s_3d_%s",
-tokens[2].c_str(),
-tokens[3].c_str());
-}
+/* Check if we are on sm_30 or above.
+* We use arrays and bindles textures for storage there */
+bool has_bindless_textures = info.has_bindless_textures;
 
-/* determine format */
-CUarray_format_enum format;
+/* General variables for both architectures */
+string bind_name = name;
 size_t dsize = datatype_size(mem.data_type);
 size_t size = mem.memory_size();
-bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage;
 
-if(use_texture) {
+CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+switch(extension) {
+case EXTENSION_REPEAT:
+address_mode = CU_TR_ADDRESS_MODE_WRAP;
+break;
+case EXTENSION_EXTEND:
+address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+break;
+case EXTENSION_CLIP:
+address_mode = CU_TR_ADDRESS_MODE_BORDER;
+break;
+default:
+assert(0);
+break;
+}
 
-switch(mem.data_type) {
-case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-default: assert(0); return;
+CUfilter_mode filter_mode;
+if(interpolation == INTERPOLATION_CLOSEST) {
+filter_mode = CU_TR_FILTER_MODE_POINT;
+}
+else {
+filter_mode = CU_TR_FILTER_MODE_LINEAR;
+}
+
+CUarray_format_enum format;
+switch(mem.data_type) {
+case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+default: assert(0); return;
+}
+
+/* General variables for Fermi */
+CUtexref texref = NULL;
+
+if(!has_bindless_textures) {
+if(mem.data_depth > 1) {
+/* Kernel uses different bind names for 2d and 3d float textures,
+* so we have to adjust couple of things here.
+*/
+vector<string> tokens;
+string_split(tokens, name, "_");
+bind_name = string_printf("__tex_image_%s_3d_%s",
+tokens[2].c_str(),
+tokens[3].c_str());
 }
 
-CUtexref texref = NULL;
-
 cuda_push_context();
 cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
+cuda_pop_context();
 
 if(!texref) {
-cuda_pop_context();
 return;
 }
+}
 
-if(interpolation != INTERPOLATION_NONE) {
-CUarray handle = NULL;
+/* Data Storage */
+if(interpolation == INTERPOLATION_NONE) {
+if(has_bindless_textures) {
+mem_alloc(mem, MEM_READ_ONLY);
+mem_copy_to(mem);
 
-if(mem.data_depth > 1) {
-CUDA_ARRAY3D_DESCRIPTOR desc;
+cuda_push_context();
 
-desc.Width = mem.data_width;
-desc.Height = mem.data_height;
-desc.Depth = mem.data_depth;
-desc.Format = format;
-desc.NumChannels = mem.data_elements;
-desc.Flags = 0;
+CUdeviceptr cumem;
+size_t cubytes;
 
-cuda_assert(cuArray3DCreate(&handle, &desc));
+cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+
+if(cubytes == 8) {
+/* 64 bit device pointer */
+uint64_t ptr = mem.device_pointer;
+cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 }
 else {
-CUDA_ARRAY_DESCRIPTOR desc;
-
-desc.Width = mem.data_width;
-desc.Height = mem.data_height;
-desc.Format = format;
-desc.NumChannels = mem.data_elements;
-
-cuda_assert(cuArrayCreate(&handle, &desc));
+/* 32 bit device pointer */
+uint32_t ptr = (uint32_t)mem.device_pointer;
+cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 }
 
-if(!handle) {
-cuda_pop_context();
-return;
-}
-
-if(mem.data_depth > 1) {
-CUDA_MEMCPY3D param;
-memset(&param, 0, sizeof(param));
-param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-param.dstArray = handle;
-param.srcMemoryType = CU_MEMORYTYPE_HOST;
-param.srcHost = (void*)mem.data_pointer;
-param.srcPitch = mem.data_width*dsize*mem.data_elements;
-param.WidthInBytes = param.srcPitch;
-param.Height = mem.data_height;
-param.Depth = mem.data_depth;
-
-cuda_assert(cuMemcpy3D(&param));
-}
-else if(mem.data_height > 1) {
-CUDA_MEMCPY2D param;
-memset(&param, 0, sizeof(param));
-param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-param.dstArray = handle;
-param.srcMemoryType = CU_MEMORYTYPE_HOST;
-param.srcHost = (void*)mem.data_pointer;
-param.srcPitch = mem.data_width*dsize*mem.data_elements;
-param.WidthInBytes = param.srcPitch;
-param.Height = mem.data_height;
-
-cuda_assert(cuMemcpy2D(&param));
-}
-else
-cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
-
-cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
-
-if(interpolation == INTERPOLATION_CLOSEST) {
-cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
-}
-else if(interpolation == INTERPOLATION_LINEAR) {
-cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
-}
-else {/* CUBIC and SMART are unsupported for CUDA */
-cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
-}
-cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
-
-mem.device_pointer = (device_ptr)handle;
-mem.device_size = size;
-
-stats.mem_alloc(size);
+cuda_pop_context();
 }
 else {
-cuda_pop_context();
-
 mem_alloc(mem, MEM_READ_ONLY);
 mem_copy_to(mem);
 
@@ -607,23 +593,137 @@ public:
 cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
 cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
 cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
+
+cuda_pop_context();
+}
+}
+/* Texture Storage */
+else {
+CUarray handle = NULL;
+
+cuda_push_context();
+
+if(mem.data_depth > 1) {
+CUDA_ARRAY3D_DESCRIPTOR desc;
+
+desc.Width = mem.data_width;
+desc.Height = mem.data_height;
+desc.Depth = mem.data_depth;
+desc.Format = format;
+desc.NumChannels = mem.data_elements;
+desc.Flags = 0;
+
+cuda_assert(cuArray3DCreate(&handle, &desc));
+}
+else {
+CUDA_ARRAY_DESCRIPTOR desc;
+
+desc.Width = mem.data_width;
+desc.Height = mem.data_height;
+desc.Format = format;
+desc.NumChannels = mem.data_elements;
+
+cuda_assert(cuArrayCreate(&handle, &desc));
 }
 
-CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-switch(extension) {
-case EXTENSION_REPEAT:
-address_mode = CU_TR_ADDRESS_MODE_WRAP;
-break;
-case EXTENSION_EXTEND:
-address_mode = CU_TR_ADDRESS_MODE_CLAMP;
-break;
-case EXTENSION_CLIP:
-address_mode = CU_TR_ADDRESS_MODE_BORDER;
-break;
-default:
-assert(0);
-break;
+if(!handle) {
+cuda_pop_context();
+return;
 }
+
+/* Allocate 3D, 2D or 1D memory */
+if(mem.data_depth > 1) {
+CUDA_MEMCPY3D param;
+memset(&param, 0, sizeof(param));
+param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+param.dstArray = handle;
+param.srcMemoryType = CU_MEMORYTYPE_HOST;
+param.srcHost = (void*)mem.data_pointer;
+param.srcPitch = mem.data_width*dsize*mem.data_elements;
+param.WidthInBytes = param.srcPitch;
+param.Height = mem.data_height;
+param.Depth = mem.data_depth;
+
+cuda_assert(cuMemcpy3D(&param));
+}
+else if(mem.data_height > 1) {
+CUDA_MEMCPY2D param;
+memset(&param, 0, sizeof(param));
+param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+param.dstArray = handle;
+param.srcMemoryType = CU_MEMORYTYPE_HOST;
+param.srcHost = (void*)mem.data_pointer;
+param.srcPitch = mem.data_width*dsize*mem.data_elements;
+param.WidthInBytes = param.srcPitch;
+param.Height = mem.data_height;
+
+cuda_assert(cuMemcpy2D(&param));
+}
+else
+cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
+
+/* Fermi and Kepler */
+mem.device_pointer = (device_ptr)handle;
+mem.device_size = size;
+
+stats.mem_alloc(size);
+
+/* Bindless Textures - Kepler */
+if(has_bindless_textures) {
+int flat_slot = 0;
+if(string_startswith(name, "__tex_image")) {
+int pos = string(name).rfind("_");
+flat_slot = atoi(name + pos + 1);
+}
+else {
+assert(0);
+}
+
+CUDA_RESOURCE_DESC resDesc;
+memset(&resDesc, 0, sizeof(resDesc));
+resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+resDesc.res.array.hArray = handle;
+resDesc.flags = 0;
+
+CUDA_TEXTURE_DESC texDesc;
+memset(&texDesc, 0, sizeof(texDesc));
+texDesc.addressMode[0] = address_mode;
+texDesc.addressMode[1] = address_mode;
+texDesc.addressMode[2] = address_mode;
+texDesc.filterMode = filter_mode;
+texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+CUtexObject tex = 0;
+cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
+
+/* Safety check */
+if((uint)tex > UINT_MAX) {
+assert(0);
+}
+
+/* Resize once */
+if(flat_slot >= bindless_mapping.size())
+bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */
+
+/* Set Mapping and tag that we need to (re-)upload to device */
+bindless_mapping.get_data()[flat_slot] = (uint)tex;
+tex_bindless_map[mem.device_pointer] = (uint)tex;
+need_bindless_mapping = true;
+}
+/* Regular Textures - Fermi */
+else {
+cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
+cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
+}
+
+cuda_pop_context();
+}
+
+/* Fermi, Data and Image Textures */
+if(!has_bindless_textures) {
+cuda_push_context();
+
 cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
 cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
 if(mem.data_depth > 1) {
@@ -634,31 +734,8 @@ public:
 
 cuda_pop_context();
 }
-else {
-mem_alloc(mem, MEM_READ_ONLY);
-mem_copy_to(mem);
-
-cuda_push_context();
-
-CUdeviceptr cumem;
-size_t cubytes;
-
-cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
-
-if(cubytes == 8) {
-/* 64 bit device pointer */
-uint64_t ptr = mem.device_pointer;
-cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-}
-else {
-/* 32 bit device pointer */
-uint32_t ptr = (uint32_t)mem.device_pointer;
-cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-}
-
-cuda_pop_context();
-}
 
+/* Fermi and Kepler */
 tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
 }
 
@@ -670,6 +747,12 @@ public:
 cuArrayDestroy((CUarray)mem.device_pointer);
 cuda_pop_context();
 
+/* Free CUtexObject (Bindless Textures) */
+if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
+uint flat_slot = tex_bindless_map[mem.device_pointer];
+cuTexObjectDestroy(flat_slot);
+}
+
 tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
 mem.device_pointer = 0;
 
@@ -1111,6 +1194,9 @@ public:
 RenderTile tile;
 
 bool branched = task->integrator_branched;
 
+/* Upload Bindless Mapping */
+load_bindless_mapping();
+
 /* keep rendering tiles until done */
 while(task->acquire_tile(this, tile)) {
@@ -1134,6 +1220,9 @@ public:
 }
 }
 else if(task->type == DeviceTask::SHADER) {
+/* Upload Bindless Mapping */
+load_bindless_mapping();
+
 shader(*task);
 
 cuda_push_context();
@@ -1269,7 +1358,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 info.num = num;
 
 info.advanced_shading = (major >= 2);
-info.extended_images = (major >= 3);
+info.has_bindless_textures = (major >= 3);
 info.pack_images = false;
 
 /* if device has a kernel timeout, assume it is used for display */
@@ -352,7 +352,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
 
 info.advanced_shading = with_advanced_shading;
 info.pack_images = false;
-info.extended_images = true;
+info.has_bindless_textures = true;
 
 foreach(DeviceInfo& subinfo, devices) {
 if(subinfo.type == type) {
@@ -376,7 +376,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
 if(subinfo.display_device)
 info.display_device = true;
 info.pack_images = info.pack_images || subinfo.pack_images;
-info.extended_images = info.extended_images && subinfo.extended_images;
+info.has_bindless_textures = info.has_bindless_textures && subinfo.has_bindless_textures;
 num_added++;
 }
 }
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Return position normalized to 0..1 in mesh bounds */
 
-#ifdef __KERNEL_GPU__
+#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ < 300
 ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
 {
 float4 r;
@@ -65,7 +65,13 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 {
 float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
+# if __CUDA_ARCH__ >= 300
+CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
+float4 r = make_float4(f, f, f, 1.0);
+# else
 float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+# endif
 #else
 float4 r;
 if(sd->flag & SD_VOLUME_CUBIC)
@@ -84,7 +90,12 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
 {
 float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
+# if __CUDA_ARCH__ >= 300
+CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z);
+# else
 float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+# endif
 #else
 float4 r;
 if(sd->flag & SD_VOLUME_CUBIC)
@@ -67,20 +67,29 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Macros to handle different memory storage on different devices */
 
-/* In order to use full 6GB of memory on Titan cards, use arrays instead
-* of textures. On earlier cards this seems slower, but on Titan it is
-* actually slightly faster in tests. */
+/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images.
+* On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data.
+*
+* Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
+* Using Arrays on Fermi turned out to be slower.*/
+
+/* Fermi */
 #if __CUDA_ARCH__ < 300
 # define __KERNEL_CUDA_TEX_STORAGE__
-#endif
-
-#ifdef __KERNEL_CUDA_TEX_STORAGE__
 # define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
+
+# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
+# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
+
+/* Kepler */
 #else
 # define kernel_tex_fetch(t, index) t[(index)]
+
+# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y)
+# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y)
+# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z)
+# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z)
 #endif
-#define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
-#define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
 
 #define kernel_data __data
 
@@ -72,6 +72,8 @@ KERNEL_TEX(float, texture_float, __lookup_table)
 /* sobol */
 KERNEL_TEX(uint, texture_uint, __sobol_directions)
 
+#ifdef __KERNEL_CUDA__
+# if __CUDA_ARCH__ < 300
 /* full-float image */
 KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
 KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001)
@@ -174,66 +176,12 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_090)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_091)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_092)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_093)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_094)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_095)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_096)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_098)
 
-/* Kepler and above */
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_099)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_100)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_101)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_102)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_103)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_104)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_106)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_107)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_108)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_109)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_110)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_111)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_112)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_114)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_115)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_116)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_117)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_118)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_119)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_120)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_122)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_123)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_124)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_125)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_126)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_127)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_128)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_130)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_131)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_132)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_133)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_134)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_135)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_136)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_138)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_139)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_140)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_141)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_142)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_143)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_144)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_146)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_147)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_148)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_149)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_150)
+# else
+/* bindless textures */
+KERNEL_TEX(uint, texture_uint, __bindless_mapping)
+# endif
+#endif
 
 /* packed image (opencl) */
 KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
@@ -18,11 +18,15 @@ CCL_NAMESPACE_BEGIN
 
 /* Float4 textures on various devices. */
 #if defined(__KERNEL_CPU__)
-#define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU
 #elif defined(__KERNEL_CUDA__)
-#define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA
+# if __CUDA_ARCH__ < 300
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA
+# else
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER
+# endif
 #else
-#define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL
 #endif
 
 #ifdef __KERNEL_OPENCL__
@@ -151,6 +155,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 #else
 float4 r;
 
+# if __CUDA_ARCH__ < 300
 /* not particularly proud of this massive switch, what are the
 * alternatives?
 * - use a single big 1D texture, and do our own lookup/filtering
@@ -254,72 +259,19 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 case 90: r = kernel_tex_image_interp(__tex_image_byte4_090, x, y); break;
 case 91: r = kernel_tex_image_interp(__tex_image_byte4_091, x, y); break;
 case 92: r = kernel_tex_image_interp(__tex_image_byte4_092, x, y); break;
-
-# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
-case 93: r = kernel_tex_image_interp(__tex_image_byte4_093, x, y); break;
-case 94: r = kernel_tex_image_interp(__tex_image_byte4_094, x, y); break;
-case 95: r = kernel_tex_image_interp(__tex_image_byte4_095, x, y); break;
-case 96: r = kernel_tex_image_interp(__tex_image_byte4_096, x, y); break;
-case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break;
-case 98: r = kernel_tex_image_interp(__tex_image_byte4_098, x, y); break;
-case 99: r = kernel_tex_image_interp(__tex_image_byte4_099, x, y); break;
-case 100: r = kernel_tex_image_interp(__tex_image_byte4_100, x, y); break;
-case 101: r = kernel_tex_image_interp(__tex_image_byte4_101, x, y); break;
-case 102: r = kernel_tex_image_interp(__tex_image_byte4_102, x, y); break;
-case 103: r = kernel_tex_image_interp(__tex_image_byte4_103, x, y); break;
-case 104: r = kernel_tex_image_interp(__tex_image_byte4_104, x, y); break;
-case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break;
-case 106: r = kernel_tex_image_interp(__tex_image_byte4_106, x, y); break;
-case 107: r = kernel_tex_image_interp(__tex_image_byte4_107, x, y); break;
-case 108: r = kernel_tex_image_interp(__tex_image_byte4_108, x, y); break;
-case 109: r = kernel_tex_image_interp(__tex_image_byte4_109, x, y); break;
-case 110: r = kernel_tex_image_interp(__tex_image_byte4_110, x, y); break;
-case 111: r = kernel_tex_image_interp(__tex_image_byte4_111, x, y); break;
-case 112: r = kernel_tex_image_interp(__tex_image_byte4_112, x, y); break;
-case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break;
-case 114: r = kernel_tex_image_interp(__tex_image_byte4_114, x, y); break;
-case 115: r = kernel_tex_image_interp(__tex_image_byte4_115, x, y); break;
-case 116: r = kernel_tex_image_interp(__tex_image_byte4_116, x, y); break;
-case 117: r = kernel_tex_image_interp(__tex_image_byte4_117, x, y); break;
-case 118: r = kernel_tex_image_interp(__tex_image_byte4_118, x, y); break;
-case 119: r = kernel_tex_image_interp(__tex_image_byte4_119, x, y); break;
-case 120: r = kernel_tex_image_interp(__tex_image_byte4_120, x, y); break;
-case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break;
-case 122: r = kernel_tex_image_interp(__tex_image_byte4_122, x, y); break;
-case 123: r = kernel_tex_image_interp(__tex_image_byte4_123, x, y); break;
-case 124: r = kernel_tex_image_interp(__tex_image_byte4_124, x, y); break;
-case 125: r = kernel_tex_image_interp(__tex_image_byte4_125, x, y); break;
-case 126: r = kernel_tex_image_interp(__tex_image_byte4_126, x, y); break;
-case 127: r = kernel_tex_image_interp(__tex_image_byte4_127, x, y); break;
-case 128: r = kernel_tex_image_interp(__tex_image_byte4_128, x, y); break;
-case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break;
-case 130: r = kernel_tex_image_interp(__tex_image_byte4_130, x, y); break;
-case 131: r = kernel_tex_image_interp(__tex_image_byte4_131, x, y); break;
-case 132: r = kernel_tex_image_interp(__tex_image_byte4_132, x, y); break;
-case 133: r = kernel_tex_image_interp(__tex_image_byte4_133, x, y); break;
-case 134: r = kernel_tex_image_interp(__tex_image_byte4_134, x, y); break;
-case 135: r = kernel_tex_image_interp(__tex_image_byte4_135, x, y); break;
-case 136: r = kernel_tex_image_interp(__tex_image_byte4_136, x, y); break;
-case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break;
-case 138: r = kernel_tex_image_interp(__tex_image_byte4_138, x, y); break;
-case 139: r = kernel_tex_image_interp(__tex_image_byte4_139, x, y); break;
-case 140: r = kernel_tex_image_interp(__tex_image_byte4_140, x, y); break;
-case 141: r = kernel_tex_image_interp(__tex_image_byte4_141, x, y); break;
-case 142: r = kernel_tex_image_interp(__tex_image_byte4_142, x, y); break;
-case 143: r = kernel_tex_image_interp(__tex_image_byte4_143, x, y); break;
-case 144: r = kernel_tex_image_interp(__tex_image_byte4_144, x, y); break;
-case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break;
-case 146: r = kernel_tex_image_interp(__tex_image_byte4_146, x, y); break;
-case 147: r = kernel_tex_image_interp(__tex_image_byte4_147, x, y); break;
-case 148: r = kernel_tex_image_interp(__tex_image_byte4_148, x, y); break;
-case 149: r = kernel_tex_image_interp(__tex_image_byte4_149, x, y); break;
-case 150: r = kernel_tex_image_interp(__tex_image_byte4_150, x, y); break;
-# endif
-
 default:
 kernel_assert(0);
 return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 }
+# else
+CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+if(id < 2048) /* TODO(dingto): Make this a variable */
+r = kernel_tex_image_interp_float4(tex, x, y);
+else {
+float f = kernel_tex_image_interp_float(tex, x, y);
+r = make_float4(f, f, f, 1.0);
+}
+# endif
 #endif
 
 #ifdef __KERNEL_SSE2__
@@ -42,10 +42,21 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
 tfm.w = read_node_float(kg, offset);
 co = transform_point(&tfm, co);
 }
+float4 r;
 # if defined(__KERNEL_GPU__)
-float4 r = volume_image_texture_3d(id, co.x, co.y, co.z);
-# else
-float4 r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
+# if __CUDA_ARCH__ >= 300
+CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+if(id < 2048) /* TODO(dingto): Make this a variable */
+r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
+else {
+float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z);
+r = make_float4(f, f, f, 1.0);
+}
+# else /* __CUDA_ARCH__ >= 300 */
+r = volume_image_texture_3d(id, co.x, co.y, co.z);
+# endif
+# else /* __KERNEL_GPU__ */
+r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
 # endif
 #else
 float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -49,7 +49,7 @@ ImageManager::ImageManager(const DeviceInfo& info)
 tex_image_byte_start = TEX_IMAGE_BYTE_START_CPU;
 }
 /* CUDA (Fermi) */
-else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && !info.extended_images) {
+else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && !info.has_bindless_textures) {
 tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_IMAGES_CUDA;
 tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_IMAGES_CUDA;
 tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_IMAGES_CUDA;
@@ -59,7 +59,7 @@ ImageManager::ImageManager(const DeviceInfo& info)
 tex_image_byte_start = TEX_IMAGE_BYTE_START_CUDA;
 }
 /* CUDA (Kepler and above) */
-else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.extended_images) {
+else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.has_bindless_textures) {
 tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER;
 tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER;
 tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER;
@@ -294,7 +294,7 @@ int ImageManager::add_image(const string& filename,
 if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
 is_float = true;
 
-/* No float and byte textures on GPU yet */
+/* No single channel textures on Fermi GPUs, use available slots */
 if(type == IMAGE_DATA_TYPE_FLOAT && tex_num_images[type] == 0)
 type = IMAGE_DATA_TYPE_FLOAT4;
 if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0)
@@ -40,10 +40,10 @@ CCL_NAMESPACE_BEGIN
 #define TEX_IMAGE_BYTE_START_CUDA (TEX_NUM_FLOAT4_IMAGES_CUDA + TEX_NUM_BYTE4_IMAGES_CUDA + TEX_NUM_BYTE_IMAGES_CUDA)
 
 /* CUDA (KEPLER and above) */
-#define TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER 145
-#define TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER 5
-#define TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER 0
-#define TEX_NUM_BYTE_IMAGES_CUDA_KEPLER 0
+#define TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER 1024
+#define TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER 1024
+#define TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER 1024
+#define TEX_NUM_BYTE_IMAGES_CUDA_KEPLER 1024
 #define TEX_IMAGE_BYTE4_START_CUDA_KEPLER TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER
 #define TEX_IMAGE_FLOAT_START_CUDA_KEPLER (TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER)
 #define TEX_IMAGE_BYTE_START_CUDA_KEPLER (TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE_IMAGES_CUDA_KEPLER)
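For the kernel side, a simplified illustration (not the Cycles kernel; the entry point, parameters and file name are invented for the example) of what the kernel_tex_image_interp_* macros above expand to on sm_30 and newer: the flat slot id is turned into a texture object handle through the uploaded __bindless_mapping table and sampled with the templated tex2D fetch. Slots below 2048 (the float4 and byte4 ranges, 1024 + 1024 per the defines above) are sampled as float4, higher slots as single-channel float, mirroring the id < 2048 check in the patch.

// bindless_fetch_sketch.cu -- illustrative only; compile with nvcc -arch=sm_30.
#include <cuda_runtime.h>

__global__ void sample_image(const cudaTextureObject_t *bindless_mapping,
                             int id, float x, float y, float4 *out)
{
	/* Look the handle up by flat slot id, as kernel_tex_fetch(__bindless_mapping, id)
	 * does in the real kernel. */
	cudaTextureObject_t tex = bindless_mapping[id];

	float4 r;
	if(id < 2048) {
		/* float4/byte4 slots: sample all four channels. */
		r = tex2D<float4>(tex, x, y);
	}
	else {
		/* single channel float/byte slots. */
		float f = tex2D<float>(tex, x, y);
		r = make_float4(f, f, f, 1.0f);
	}
	*out = r;
}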