Revert "Cycles: Enable inlining on Apple Silicon for 1.1x speedup"
This reverts commit b82de02e7c
. It is causing
crashes in various regression tests.
Ref D14763
This commit is contained in:
@@ -28,8 +28,7 @@ class MetalDevice : public Device {
|
|||||||
id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
|
id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
|
||||||
id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
|
id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
|
||||||
nil; /* encoder used for fetching device pointers from MTLBuffers */
|
nil; /* encoder used for fetching device pointers from MTLBuffers */
|
||||||
string source[PSO_NUM];
|
string source_used_for_compile[PSO_NUM];
|
||||||
string source_md5[PSO_NUM];
|
|
||||||
|
|
||||||
KernelParamsMetal launch_params = {0};
|
KernelParamsMetal launch_params = {0};
|
||||||
|
|
||||||
@@ -111,12 +110,6 @@ class MetalDevice : public Device {
|
|||||||
|
|
||||||
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
|
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
|
||||||
|
|
||||||
id<MTLLibrary> compile(string const &source);
|
|
||||||
|
|
||||||
const MetalKernelPipeline &get_best_pipeline(DeviceKernel kernel) const;
|
|
||||||
|
|
||||||
bool kernel_available(DeviceKernel kernel) const;
|
|
||||||
|
|
||||||
/* ------------------------------------------------------------------ */
|
/* ------------------------------------------------------------------ */
|
||||||
/* low-level memory management */
|
/* low-level memory management */
|
||||||
|
|
||||||
|
@@ -275,44 +275,96 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
|
|||||||
* active, but may still need to be rendered without motion blur if that isn't active as well. */
|
* active, but may still need to be rendered without motion blur if that isn't active as well. */
|
||||||
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
|
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
|
||||||
|
|
||||||
source[PSO_GENERIC] = get_source(kernel_features);
|
NSError *error = NULL;
|
||||||
mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
|
|
||||||
|
|
||||||
MD5Hash md5;
|
for (int i = 0; i < PSO_NUM; i++) {
|
||||||
md5.append(source[PSO_GENERIC]);
|
if (mtlLibrary[i]) {
|
||||||
source_md5[PSO_GENERIC] = md5.get_hex();
|
[mtlLibrary[i] release];
|
||||||
|
mtlLibrary[i] = nil;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
metal_printf("Front-end compilation finished (generic)\n");
|
|
||||||
|
|
||||||
bool result = kernels.load(this, false);
|
|
||||||
|
|
||||||
reserve_local_memory(kernel_features);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
id<MTLLibrary> MetalDevice::compile(string const &source)
|
|
||||||
{
|
|
||||||
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
|
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
|
||||||
|
|
||||||
options.fastMathEnabled = YES;
|
options.fastMathEnabled = YES;
|
||||||
if (@available(macOS 12.0, *)) {
|
if (@available(macOS 12.0, *)) {
|
||||||
options.languageVersion = MTLLanguageVersion2_4;
|
options.languageVersion = MTLLanguageVersion2_4;
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
NSError *error = NULL;
|
string metalsrc;
|
||||||
id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
|
|
||||||
|
/* local helper: dump source to disk and return filepath */
|
||||||
|
auto dump_source = [&](int kernel_type) -> string {
|
||||||
|
string &source = source_used_for_compile[kernel_type];
|
||||||
|
string metalsrc = path_cache_get(path_join("kernels",
|
||||||
|
string_printf("%s.%s.metal",
|
||||||
|
kernel_type_as_string(kernel_type),
|
||||||
|
util_md5_string(source).c_str())));
|
||||||
|
path_write_text(metalsrc, source);
|
||||||
|
return metalsrc;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
|
||||||
|
* then compile it into a MTLLibrary */
|
||||||
|
auto fetch_and_compile_source = [&](int kernel_type) {
|
||||||
|
/* Record the source used to compile this library, for hash building later. */
|
||||||
|
string &source = source_used_for_compile[kernel_type];
|
||||||
|
|
||||||
|
switch (kernel_type) {
|
||||||
|
case PSO_GENERIC: {
|
||||||
|
source = get_source(kernel_features);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case PSO_SPECIALISED: {
|
||||||
|
/* PSO_SPECIALISED derives from PSO_GENERIC */
|
||||||
|
string &generic_source = source_used_for_compile[PSO_GENERIC];
|
||||||
|
if (generic_source.empty()) {
|
||||||
|
generic_source = get_source(kernel_features);
|
||||||
|
}
|
||||||
|
source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* create MTLLibrary (front-end compilation) */
|
||||||
|
mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
|
||||||
options:options
|
options:options
|
||||||
error:&error];
|
error:&error];
|
||||||
|
|
||||||
if (!mtlLibrary) {
|
bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
|
||||||
NSString *err = [error localizedDescription];
|
|
||||||
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
|
if (!mtlLibrary[kernel_type] || do_source_dump) {
|
||||||
|
string metalsrc = dump_source(kernel_type);
|
||||||
|
|
||||||
|
if (!mtlLibrary[kernel_type]) {
|
||||||
|
NSString *err = [error localizedDescription];
|
||||||
|
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
fetch_and_compile_source(PSO_GENERIC);
|
||||||
|
|
||||||
|
if (use_function_specialisation) {
|
||||||
|
fetch_and_compile_source(PSO_SPECIALISED);
|
||||||
}
|
}
|
||||||
|
|
||||||
[options release];
|
metal_printf("Front-end compilation finished\n");
|
||||||
|
|
||||||
return mtlLibrary;
|
bool result = kernels.load(this, PSO_GENERIC);
|
||||||
|
|
||||||
|
[options release];
|
||||||
|
reserve_local_memory(kernel_features);
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MetalDevice::reserve_local_memory(const uint kernel_features)
|
void MetalDevice::reserve_local_memory(const uint kernel_features)
|
||||||
@@ -619,11 +671,6 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const MetalKernelPipeline &MetalDevice::get_best_pipeline(DeviceKernel kernel) const
|
|
||||||
{
|
|
||||||
return kernels.get_best_pipeline(this, kernel);
|
|
||||||
}
|
|
||||||
|
|
||||||
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
|
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
|
||||||
{
|
{
|
||||||
if (strcmp(name, "__data") == 0) {
|
if (strcmp(name, "__data") == 0) {
|
||||||
|
@@ -54,41 +54,98 @@ enum {
|
|||||||
const char *kernel_type_as_string(int kernel_type);
|
const char *kernel_type_as_string(int kernel_type);
|
||||||
|
|
||||||
struct MetalKernelPipeline {
|
struct MetalKernelPipeline {
|
||||||
|
void release()
|
||||||
|
{
|
||||||
|
if (pipeline) {
|
||||||
|
[pipeline release];
|
||||||
|
pipeline = nil;
|
||||||
|
if (@available(macOS 11.0, *)) {
|
||||||
|
for (int i = 0; i < METALRT_TABLE_NUM; i++) {
|
||||||
|
if (intersection_func_table[i]) {
|
||||||
|
[intersection_func_table[i] release];
|
||||||
|
intersection_func_table[i] = nil;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (function) {
|
||||||
|
[function release];
|
||||||
|
function = nil;
|
||||||
|
}
|
||||||
|
if (@available(macOS 11.0, *)) {
|
||||||
|
for (int i = 0; i < METALRT_TABLE_NUM; i++) {
|
||||||
|
if (intersection_func_table[i]) {
|
||||||
|
[intersection_func_table[i] release];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void compile();
|
|
||||||
|
|
||||||
id<MTLLibrary> mtlLibrary = nil;
|
|
||||||
bool scene_specialized;
|
|
||||||
string source_md5;
|
|
||||||
|
|
||||||
bool use_metalrt;
|
|
||||||
bool metalrt_hair;
|
|
||||||
bool metalrt_hair_thick;
|
|
||||||
bool metalrt_pointcloud;
|
|
||||||
|
|
||||||
int threads_per_threadgroup;
|
|
||||||
|
|
||||||
DeviceKernel device_kernel;
|
|
||||||
bool loaded = false;
|
bool loaded = false;
|
||||||
id<MTLDevice> mtlDevice = nil;
|
|
||||||
id<MTLFunction> function = nil;
|
id<MTLFunction> function = nil;
|
||||||
id<MTLComputePipelineState> pipeline = nil;
|
id<MTLComputePipelineState> pipeline = nil;
|
||||||
int num_threads_per_block = 0;
|
|
||||||
|
|
||||||
string error_str;
|
|
||||||
|
|
||||||
API_AVAILABLE(macos(11.0))
|
API_AVAILABLE(macos(11.0))
|
||||||
id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
|
id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
|
||||||
id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil};
|
};
|
||||||
|
|
||||||
|
struct MetalKernelLoadDesc {
|
||||||
|
int pso_index = 0;
|
||||||
|
const char *function_name = nullptr;
|
||||||
|
int kernel_index = 0;
|
||||||
|
int threads_per_threadgroup = 0;
|
||||||
|
MTLFunctionConstantValues *constant_values = nullptr;
|
||||||
|
NSArray *linked_functions = nullptr;
|
||||||
|
|
||||||
|
struct IntersectorFunctions {
|
||||||
|
NSArray *defaults;
|
||||||
|
NSArray *shadow;
|
||||||
|
NSArray *local;
|
||||||
|
NSArray *operator[](int index) const
|
||||||
|
{
|
||||||
|
if (index == METALRT_TABLE_DEFAULT)
|
||||||
|
return defaults;
|
||||||
|
if (index == METALRT_TABLE_SHADOW)
|
||||||
|
return shadow;
|
||||||
|
return local;
|
||||||
|
}
|
||||||
|
} intersector_functions = {nullptr};
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Metal kernel and associate occupancy information. */
|
||||||
|
class MetalDeviceKernel {
|
||||||
|
public:
|
||||||
|
~MetalDeviceKernel();
|
||||||
|
|
||||||
|
bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
|
||||||
|
|
||||||
|
void mark_loaded(int pso_index)
|
||||||
|
{
|
||||||
|
pso[pso_index].loaded = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int get_num_threads_per_block() const
|
||||||
|
{
|
||||||
|
return num_threads_per_block;
|
||||||
|
}
|
||||||
|
const MetalKernelPipeline &get_pso() const;
|
||||||
|
|
||||||
|
double load_duration = 0.0;
|
||||||
|
|
||||||
|
private:
|
||||||
|
MetalKernelPipeline pso[PSO_NUM];
|
||||||
|
|
||||||
|
int num_threads_per_block = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Cache of Metal kernels for each DeviceKernel. */
|
/* Cache of Metal kernels for each DeviceKernel. */
|
||||||
class MetalDeviceKernels {
|
class MetalDeviceKernels {
|
||||||
public:
|
public:
|
||||||
bool load(MetalDevice *device, bool scene_specialized);
|
bool load(MetalDevice *device, int kernel_type);
|
||||||
bool available(const MetalDevice *device, DeviceKernel kernel) const;
|
bool available(DeviceKernel kernel) const;
|
||||||
const MetalKernelPipeline &get_best_pipeline(const MetalDevice *device,
|
const MetalDeviceKernel &get(DeviceKernel kernel) const;
|
||||||
DeviceKernel kernel) const;
|
|
||||||
|
MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
|
||||||
|
|
||||||
id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
|
id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
@@ -108,6 +108,9 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||||||
VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
|
VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
|
||||||
<< work_size;
|
<< work_size;
|
||||||
|
|
||||||
|
const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
|
||||||
|
const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
|
||||||
|
|
||||||
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
|
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
|
||||||
|
|
||||||
/* Determine size requirement for argument buffer. */
|
/* Determine size requirement for argument buffer. */
|
||||||
@@ -209,8 +212,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||||||
}
|
}
|
||||||
bytes_written = globals_offsets + sizeof(KernelParamsMetal);
|
bytes_written = globals_offsets + sizeof(KernelParamsMetal);
|
||||||
|
|
||||||
const MetalKernelPipeline &metal_kernel_pso = metal_device->get_best_pipeline(kernel);
|
|
||||||
|
|
||||||
/* Encode ancillaries */
|
/* Encode ancillaries */
|
||||||
[metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
|
[metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
|
||||||
[metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
|
[metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
|
||||||
@@ -283,7 +284,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||||||
[mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
|
[mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
|
||||||
|
|
||||||
/* Compute kernel launch parameters. */
|
/* Compute kernel launch parameters. */
|
||||||
const int num_threads_per_block = metal_kernel_pso.num_threads_per_block;
|
const int num_threads_per_block = metal_kernel.get_num_threads_per_block();
|
||||||
|
|
||||||
int shared_mem_bytes = 0;
|
int shared_mem_bytes = 0;
|
||||||
|
|
||||||
@@ -546,8 +547,6 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
|
|||||||
computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
|
computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
|
||||||
MTLDispatchTypeSerial];
|
MTLDispatchTypeSerial];
|
||||||
|
|
||||||
[mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))];
|
|
||||||
|
|
||||||
/* declare usage of MTLBuffers etc */
|
/* declare usage of MTLBuffers etc */
|
||||||
prepare_resources(kernel);
|
prepare_resources(kernel);
|
||||||
}
|
}
|
||||||
|
@@ -29,26 +29,10 @@ using namespace metal::raytracing;
|
|||||||
|
|
||||||
/* Qualifiers */
|
/* Qualifiers */
|
||||||
|
|
||||||
#if defined(__KERNEL_METAL_APPLE__)
|
#define ccl_device
|
||||||
|
#define ccl_device_inline ccl_device
|
||||||
/* Inline everything for Apple GPUs.
|
#define ccl_device_forceinline ccl_device
|
||||||
* This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
|
#define ccl_device_noinline ccl_device __attribute__((noinline))
|
||||||
* at the cost of longer compile times (~4.5 minutes on M1 Max). */
|
|
||||||
|
|
||||||
# define ccl_device __attribute__((always_inline))
|
|
||||||
# define ccl_device_inline __attribute__((always_inline))
|
|
||||||
# define ccl_device_forceinline __attribute__((always_inline))
|
|
||||||
# define ccl_device_noinline __attribute__((always_inline))
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
# define ccl_device
|
|
||||||
# define ccl_device_inline ccl_device
|
|
||||||
# define ccl_device_forceinline ccl_device
|
|
||||||
# define ccl_device_noinline ccl_device __attribute__((noinline))
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define ccl_device_noinline_cpu ccl_device
|
#define ccl_device_noinline_cpu ccl_device
|
||||||
#define ccl_device_inline_method ccl_device
|
#define ccl_device_inline_method ccl_device
|
||||||
#define ccl_global device
|
#define ccl_global device
|
||||||
|
Reference in New Issue
Block a user