
This is an initial implementation of the BVH8 optimization structure and packed triangle intersection. The aim is to get faster ray-to-scene intersection checks. Timings (scene: BVH4, BVH8): barbershop_interior: 10:24.94, 10:10.74; bmw27: 02:41.25, 02:38.83; classroom: 08:16.49, 07:56.15; fishy_cat: 04:24.56, 04:17.29; koro: 06:03.06, 06:01.45; pavillon_barcelona: 09:21.26, 09:02.98; victor: 23:39.65, 22:53.71. As for memory, peak usage rises by about 4.7% in complex scenes. Note that BVH8 is disabled when using OSL; this is because the OSL kernel does not get per-microarchitecture optimizations and hence always considers BVH3 is used. Original BVH8 patch from Anton Gavrikov. Batched triangle intersection from Victoria Zhislina. Extra work, tests, and fixes from Maxym Dmytrychenko.
210 lines
5.7 KiB
C++
210 lines
5.7 KiB
C++
/*
|
|
* Copyright 2011-2016 Blender Foundation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "util/util_debug.h"
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include "bvh/bvh_params.h"
|
|
|
|
#include "util/util_logging.h"
|
|
#include "util/util_string.h"
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
/* Construct the CPU debug flags.
 *
 * Every member gets a well-defined value from the initializer list first;
 * reset() is then invoked so the values reflect the process environment.
 */
DebugFlags::CPU::CPU()
	: avx2(true),
	  avx(true),
	  sse41(true),
	  sse3(true),
	  sse2(true),
	  bvh_layout(BVH_LAYOUT_DEFAULT),
	  split_kernel(false)
{
	this->reset();
}
|
|
|
|
void DebugFlags::CPU::reset()
|
|
{
|
|
#define STRINGIFY(x) #x
|
|
#define CHECK_CPU_FLAGS(flag, env) \
|
|
do { \
|
|
flag = (getenv(env) == NULL); \
|
|
if(!flag) { \
|
|
VLOG(1) << "Disabling " << STRINGIFY(flag) << " instruction set."; \
|
|
} \
|
|
} while(0)
|
|
|
|
CHECK_CPU_FLAGS(avx2, "CYCLES_CPU_NO_AVX2");
|
|
CHECK_CPU_FLAGS(avx, "CYCLES_CPU_NO_AVX");
|
|
CHECK_CPU_FLAGS(sse41, "CYCLES_CPU_NO_SSE41");
|
|
CHECK_CPU_FLAGS(sse3, "CYCLES_CPU_NO_SSE3");
|
|
CHECK_CPU_FLAGS(sse2, "CYCLES_CPU_NO_SSE2");
|
|
|
|
#undef STRINGIFY
|
|
#undef CHECK_CPU_FLAGS
|
|
|
|
if (getenv("CYCLES_BVH2") != NULL) {
|
|
bvh_layout = BVH_LAYOUT_BVH2;
|
|
}
|
|
else if (getenv("CYCLES_BVH4") != NULL) {
|
|
bvh_layout = BVH_LAYOUT_BVH4;
|
|
}
|
|
else if (getenv("CYCLES_BVH8") != NULL) {
|
|
bvh_layout = BVH_LAYOUT_BVH8;
|
|
}
|
|
else {
|
|
bvh_layout = BVH_LAYOUT_DEFAULT;
|
|
}
|
|
|
|
split_kernel = false;
|
|
}
|
|
|
|
/* Construct the CUDA debug flags.
 *
 * Both members start out disabled; reset() is then invoked so that the
 * environment can influence the result.
 */
DebugFlags::CUDA::CUDA()
	: adaptive_compile(false),
	  split_kernel(false)
{
	this->reset();
}
|
|
|
|
void DebugFlags::CUDA::reset()
|
|
{
|
|
if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
|
|
adaptive_compile = true;
|
|
|
|
split_kernel = false;
|
|
}
|
|
|
|
/* Construct the OpenCL debug flags.
 *
 * The initializer list gives the listed members a defined value; reset()
 * is then invoked so the values reflect the process environment.
 */
DebugFlags::OpenCL::OpenCL()
	: device_type(DebugFlags::OpenCL::DEVICE_ALL),
	  kernel_type(DebugFlags::OpenCL::KERNEL_DEFAULT),
	  debug(false),
	  single_program(false)
{
	this->reset();
}
|
|
|
|
void DebugFlags::OpenCL::reset()
|
|
{
|
|
/* Initialize device type from environment variables. */
|
|
device_type = DebugFlags::OpenCL::DEVICE_ALL;
|
|
char *device = getenv("CYCLES_OPENCL_TEST");
|
|
if(device) {
|
|
if(strcmp(device, "NONE") == 0) {
|
|
device_type = DebugFlags::OpenCL::DEVICE_NONE;
|
|
}
|
|
else if(strcmp(device, "ALL") == 0) {
|
|
device_type = DebugFlags::OpenCL::DEVICE_ALL;
|
|
}
|
|
else if(strcmp(device, "DEFAULT") == 0) {
|
|
device_type = DebugFlags::OpenCL::DEVICE_DEFAULT;
|
|
}
|
|
else if(strcmp(device, "CPU") == 0) {
|
|
device_type = DebugFlags::OpenCL::DEVICE_CPU;
|
|
}
|
|
else if(strcmp(device, "GPU") == 0) {
|
|
device_type = DebugFlags::OpenCL::DEVICE_GPU;
|
|
}
|
|
else if(strcmp(device, "ACCELERATOR") == 0) {
|
|
device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR;
|
|
}
|
|
}
|
|
/* Initialize kernel type from environment variables. */
|
|
kernel_type = DebugFlags::OpenCL::KERNEL_DEFAULT;
|
|
if(getenv("CYCLES_OPENCL_MEGA_KERNEL_TEST") != NULL) {
|
|
kernel_type = DebugFlags::OpenCL::KERNEL_MEGA;
|
|
}
|
|
else if(getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL) {
|
|
kernel_type = DebugFlags::OpenCL::KERNEL_SPLIT;
|
|
}
|
|
/* Initialize other flags from environment variables. */
|
|
debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
|
|
single_program = (getenv("CYCLES_OPENCL_MULTI_PROGRAM") == NULL);
|
|
}
|
|
|
|
/* Construct the top-level debug flags with the static viewport BVH option
 * switched off. Nothing else is done in this constructor itself.
 */
DebugFlags::DebugFlags()
	: viewport_static_bvh(false)
{
	/* Intentionally empty. */
}
|
|
|
|
void DebugFlags::reset()
|
|
{
|
|
viewport_static_bvh = false;
|
|
cpu.reset();
|
|
cuda.reset();
|
|
opencl.reset();
|
|
}
|
|
|
|
std::ostream& operator <<(std::ostream &os,
|
|
DebugFlagsConstRef debug_flags)
|
|
{
|
|
os << "CPU flags:\n"
|
|
<< " AVX2 : " << string_from_bool(debug_flags.cpu.avx2) << "\n"
|
|
<< " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n"
|
|
<< " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
|
|
<< " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
|
|
<< " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
|
|
<< " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n"
|
|
<< " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
|
|
|
|
os << "CUDA flags:\n"
|
|
<< " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
|
|
|
|
const char *opencl_device_type,
|
|
*opencl_kernel_type;
|
|
switch(debug_flags.opencl.device_type) {
|
|
case DebugFlags::OpenCL::DEVICE_NONE:
|
|
opencl_device_type = "NONE";
|
|
break;
|
|
case DebugFlags::OpenCL::DEVICE_ALL:
|
|
opencl_device_type = "ALL";
|
|
break;
|
|
case DebugFlags::OpenCL::DEVICE_DEFAULT:
|
|
opencl_device_type = "DEFAULT";
|
|
break;
|
|
case DebugFlags::OpenCL::DEVICE_CPU:
|
|
opencl_device_type = "CPU";
|
|
break;
|
|
case DebugFlags::OpenCL::DEVICE_GPU:
|
|
opencl_device_type = "GPU";
|
|
break;
|
|
case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
|
|
opencl_device_type = "ACCELERATOR";
|
|
break;
|
|
}
|
|
switch(debug_flags.opencl.kernel_type) {
|
|
case DebugFlags::OpenCL::KERNEL_DEFAULT:
|
|
opencl_kernel_type = "DEFAULT";
|
|
break;
|
|
case DebugFlags::OpenCL::KERNEL_MEGA:
|
|
opencl_kernel_type = "MEGA";
|
|
break;
|
|
case DebugFlags::OpenCL::KERNEL_SPLIT:
|
|
opencl_kernel_type = "SPLIT";
|
|
break;
|
|
}
|
|
os << "OpenCL flags:\n"
|
|
<< " Device type : " << opencl_device_type << "\n"
|
|
<< " Kernel type : " << opencl_kernel_type << "\n"
|
|
<< " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n"
|
|
<< " Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n"
|
|
<< " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
|
|
return os;
|
|
}
|
|
|
|
CCL_NAMESPACE_END
|