Cycles: Enable inlining on Apple Silicon for 1.1x speedup
This is a stripped down version of D14645 without the scene specialisation optimisations. The two major changes in this patch are: - Enables more aggressive inlining on Apple Silicon resulting in a 1.1x speedup and 10% reduction in spill, at the cost of longer pipeline build times - Revival of shader binary archives through a new ShaderCache which is shared between MetalDevice instances using the same physical MTLDevice. This mitigates the extra compile times via explicit caching (rather than, as before, relying on the implicit system shader cache which can be purged without notice) Reviewed By: brecht Differential Revision: https://developer.blender.org/D14763
This commit is contained in:
@@ -28,7 +28,8 @@ class MetalDevice : public Device {
|
|||||||
id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
|
id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
|
||||||
id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
|
id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
|
||||||
nil; /* encoder used for fetching device pointers from MTLBuffers */
|
nil; /* encoder used for fetching device pointers from MTLBuffers */
|
||||||
string source_used_for_compile[PSO_NUM];
|
string source[PSO_NUM];
|
||||||
|
string source_md5[PSO_NUM];
|
||||||
|
|
||||||
KernelParamsMetal launch_params = {0};
|
KernelParamsMetal launch_params = {0};
|
||||||
|
|
||||||
@@ -110,6 +111,12 @@ class MetalDevice : public Device {
|
|||||||
|
|
||||||
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
|
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
|
||||||
|
|
||||||
|
id<MTLLibrary> compile(string const &source);
|
||||||
|
|
||||||
|
const MetalKernelPipeline &get_best_pipeline(DeviceKernel kernel) const;
|
||||||
|
|
||||||
|
bool kernel_available(DeviceKernel kernel) const;
|
||||||
|
|
||||||
/* ------------------------------------------------------------------ */
|
/* ------------------------------------------------------------------ */
|
||||||
/* low-level memory management */
|
/* low-level memory management */
|
||||||
|
|
||||||
|
@@ -275,96 +275,44 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
|
|||||||
* active, but may still need to be rendered without motion blur if that isn't active as well. */
|
* active, but may still need to be rendered without motion blur if that isn't active as well. */
|
||||||
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
|
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
|
||||||
|
|
||||||
NSError *error = NULL;
|
source[PSO_GENERIC] = get_source(kernel_features);
|
||||||
|
mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
|
||||||
|
|
||||||
for (int i = 0; i < PSO_NUM; i++) {
|
MD5Hash md5;
|
||||||
if (mtlLibrary[i]) {
|
md5.append(source[PSO_GENERIC]);
|
||||||
[mtlLibrary[i] release];
|
source_md5[PSO_GENERIC] = md5.get_hex();
|
||||||
mtlLibrary[i] = nil;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
metal_printf("Front-end compilation finished (generic)\n");
|
||||||
|
|
||||||
|
bool result = kernels.load(this, false);
|
||||||
|
|
||||||
|
reserve_local_memory(kernel_features);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
id<MTLLibrary> MetalDevice::compile(string const &source)
|
||||||
|
{
|
||||||
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
|
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
|
||||||
|
|
||||||
options.fastMathEnabled = YES;
|
options.fastMathEnabled = YES;
|
||||||
if (@available(macOS 12.0, *)) {
|
if (@available(macOS 12.0, *)) {
|
||||||
options.languageVersion = MTLLanguageVersion2_4;
|
options.languageVersion = MTLLanguageVersion2_4;
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
string metalsrc;
|
NSError *error = NULL;
|
||||||
|
id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
|
||||||
/* local helper: dump source to disk and return filepath */
|
|
||||||
auto dump_source = [&](int kernel_type) -> string {
|
|
||||||
string &source = source_used_for_compile[kernel_type];
|
|
||||||
string metalsrc = path_cache_get(path_join("kernels",
|
|
||||||
string_printf("%s.%s.metal",
|
|
||||||
kernel_type_as_string(kernel_type),
|
|
||||||
util_md5_string(source).c_str())));
|
|
||||||
path_write_text(metalsrc, source);
|
|
||||||
return metalsrc;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
|
|
||||||
* then compile it into a MTLLibrary */
|
|
||||||
auto fetch_and_compile_source = [&](int kernel_type) {
|
|
||||||
/* Record the source used to compile this library, for hash building later. */
|
|
||||||
string &source = source_used_for_compile[kernel_type];
|
|
||||||
|
|
||||||
switch (kernel_type) {
|
|
||||||
case PSO_GENERIC: {
|
|
||||||
source = get_source(kernel_features);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case PSO_SPECIALISED: {
|
|
||||||
/* PSO_SPECIALISED derives from PSO_GENERIC */
|
|
||||||
string &generic_source = source_used_for_compile[PSO_GENERIC];
|
|
||||||
if (generic_source.empty()) {
|
|
||||||
generic_source = get_source(kernel_features);
|
|
||||||
}
|
|
||||||
source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* create MTLLibrary (front-end compilation) */
|
|
||||||
mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
|
|
||||||
options:options
|
options:options
|
||||||
error:&error];
|
error:&error];
|
||||||
|
|
||||||
bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
|
if (!mtlLibrary) {
|
||||||
|
NSString *err = [error localizedDescription];
|
||||||
if (!mtlLibrary[kernel_type] || do_source_dump) {
|
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
|
||||||
string metalsrc = dump_source(kernel_type);
|
|
||||||
|
|
||||||
if (!mtlLibrary[kernel_type]) {
|
|
||||||
NSString *err = [error localizedDescription];
|
|
||||||
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
};
|
|
||||||
|
|
||||||
fetch_and_compile_source(PSO_GENERIC);
|
|
||||||
|
|
||||||
if (use_function_specialisation) {
|
|
||||||
fetch_and_compile_source(PSO_SPECIALISED);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
metal_printf("Front-end compilation finished\n");
|
|
||||||
|
|
||||||
bool result = kernels.load(this, PSO_GENERIC);
|
|
||||||
|
|
||||||
[options release];
|
[options release];
|
||||||
reserve_local_memory(kernel_features);
|
|
||||||
|
|
||||||
return result;
|
return mtlLibrary;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MetalDevice::reserve_local_memory(const uint kernel_features)
|
void MetalDevice::reserve_local_memory(const uint kernel_features)
|
||||||
@@ -671,6 +619,11 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const MetalKernelPipeline &MetalDevice::get_best_pipeline(DeviceKernel kernel) const
|
||||||
|
{
|
||||||
|
return kernels.get_best_pipeline(this, kernel);
|
||||||
|
}
|
||||||
|
|
||||||
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
|
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
|
||||||
{
|
{
|
||||||
if (strcmp(name, "__data") == 0) {
|
if (strcmp(name, "__data") == 0) {
|
||||||
|
@@ -54,98 +54,41 @@ enum {
|
|||||||
const char *kernel_type_as_string(int kernel_type);
|
const char *kernel_type_as_string(int kernel_type);
|
||||||
|
|
||||||
struct MetalKernelPipeline {
|
struct MetalKernelPipeline {
|
||||||
void release()
|
|
||||||
{
|
|
||||||
if (pipeline) {
|
|
||||||
[pipeline release];
|
|
||||||
pipeline = nil;
|
|
||||||
if (@available(macOS 11.0, *)) {
|
|
||||||
for (int i = 0; i < METALRT_TABLE_NUM; i++) {
|
|
||||||
if (intersection_func_table[i]) {
|
|
||||||
[intersection_func_table[i] release];
|
|
||||||
intersection_func_table[i] = nil;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (function) {
|
|
||||||
[function release];
|
|
||||||
function = nil;
|
|
||||||
}
|
|
||||||
if (@available(macOS 11.0, *)) {
|
|
||||||
for (int i = 0; i < METALRT_TABLE_NUM; i++) {
|
|
||||||
if (intersection_func_table[i]) {
|
|
||||||
[intersection_func_table[i] release];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
void compile();
|
||||||
|
|
||||||
|
id<MTLLibrary> mtlLibrary = nil;
|
||||||
|
bool scene_specialized;
|
||||||
|
string source_md5;
|
||||||
|
|
||||||
|
bool use_metalrt;
|
||||||
|
bool metalrt_hair;
|
||||||
|
bool metalrt_hair_thick;
|
||||||
|
bool metalrt_pointcloud;
|
||||||
|
|
||||||
|
int threads_per_threadgroup;
|
||||||
|
|
||||||
|
DeviceKernel device_kernel;
|
||||||
bool loaded = false;
|
bool loaded = false;
|
||||||
|
id<MTLDevice> mtlDevice = nil;
|
||||||
id<MTLFunction> function = nil;
|
id<MTLFunction> function = nil;
|
||||||
id<MTLComputePipelineState> pipeline = nil;
|
id<MTLComputePipelineState> pipeline = nil;
|
||||||
|
int num_threads_per_block = 0;
|
||||||
|
|
||||||
|
string error_str;
|
||||||
|
|
||||||
API_AVAILABLE(macos(11.0))
|
API_AVAILABLE(macos(11.0))
|
||||||
id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
|
id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
|
||||||
};
|
id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil};
|
||||||
|
|
||||||
struct MetalKernelLoadDesc {
|
|
||||||
int pso_index = 0;
|
|
||||||
const char *function_name = nullptr;
|
|
||||||
int kernel_index = 0;
|
|
||||||
int threads_per_threadgroup = 0;
|
|
||||||
MTLFunctionConstantValues *constant_values = nullptr;
|
|
||||||
NSArray *linked_functions = nullptr;
|
|
||||||
|
|
||||||
struct IntersectorFunctions {
|
|
||||||
NSArray *defaults;
|
|
||||||
NSArray *shadow;
|
|
||||||
NSArray *local;
|
|
||||||
NSArray *operator[](int index) const
|
|
||||||
{
|
|
||||||
if (index == METALRT_TABLE_DEFAULT)
|
|
||||||
return defaults;
|
|
||||||
if (index == METALRT_TABLE_SHADOW)
|
|
||||||
return shadow;
|
|
||||||
return local;
|
|
||||||
}
|
|
||||||
} intersector_functions = {nullptr};
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Metal kernel and associate occupancy information. */
|
|
||||||
class MetalDeviceKernel {
|
|
||||||
public:
|
|
||||||
~MetalDeviceKernel();
|
|
||||||
|
|
||||||
bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
|
|
||||||
|
|
||||||
void mark_loaded(int pso_index)
|
|
||||||
{
|
|
||||||
pso[pso_index].loaded = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int get_num_threads_per_block() const
|
|
||||||
{
|
|
||||||
return num_threads_per_block;
|
|
||||||
}
|
|
||||||
const MetalKernelPipeline &get_pso() const;
|
|
||||||
|
|
||||||
double load_duration = 0.0;
|
|
||||||
|
|
||||||
private:
|
|
||||||
MetalKernelPipeline pso[PSO_NUM];
|
|
||||||
|
|
||||||
int num_threads_per_block = 0;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Cache of Metal kernels for each DeviceKernel. */
|
/* Cache of Metal kernels for each DeviceKernel. */
|
||||||
class MetalDeviceKernels {
|
class MetalDeviceKernels {
|
||||||
public:
|
public:
|
||||||
bool load(MetalDevice *device, int kernel_type);
|
bool load(MetalDevice *device, bool scene_specialized);
|
||||||
bool available(DeviceKernel kernel) const;
|
bool available(const MetalDevice *device, DeviceKernel kernel) const;
|
||||||
const MetalDeviceKernel &get(DeviceKernel kernel) const;
|
const MetalKernelPipeline &get_best_pipeline(const MetalDevice *device,
|
||||||
|
DeviceKernel kernel) const;
|
||||||
MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
|
|
||||||
|
|
||||||
id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
|
id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
@@ -108,9 +108,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||||||
VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
|
VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
|
||||||
<< work_size;
|
<< work_size;
|
||||||
|
|
||||||
const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
|
|
||||||
const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
|
|
||||||
|
|
||||||
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
|
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
|
||||||
|
|
||||||
/* Determine size requirement for argument buffer. */
|
/* Determine size requirement for argument buffer. */
|
||||||
@@ -212,6 +209,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||||||
}
|
}
|
||||||
bytes_written = globals_offsets + sizeof(KernelParamsMetal);
|
bytes_written = globals_offsets + sizeof(KernelParamsMetal);
|
||||||
|
|
||||||
|
const MetalKernelPipeline &metal_kernel_pso = metal_device->get_best_pipeline(kernel);
|
||||||
|
|
||||||
/* Encode ancillaries */
|
/* Encode ancillaries */
|
||||||
[metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
|
[metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
|
||||||
[metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
|
[metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
|
||||||
@@ -284,7 +283,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||||||
[mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
|
[mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
|
||||||
|
|
||||||
/* Compute kernel launch parameters. */
|
/* Compute kernel launch parameters. */
|
||||||
const int num_threads_per_block = metal_kernel.get_num_threads_per_block();
|
const int num_threads_per_block = metal_kernel_pso.num_threads_per_block;
|
||||||
|
|
||||||
int shared_mem_bytes = 0;
|
int shared_mem_bytes = 0;
|
||||||
|
|
||||||
@@ -547,6 +546,8 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
|
|||||||
computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
|
computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
|
||||||
MTLDispatchTypeSerial];
|
MTLDispatchTypeSerial];
|
||||||
|
|
||||||
|
[mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))];
|
||||||
|
|
||||||
/* declare usage of MTLBuffers etc */
|
/* declare usage of MTLBuffers etc */
|
||||||
prepare_resources(kernel);
|
prepare_resources(kernel);
|
||||||
}
|
}
|
||||||
|
@@ -29,10 +29,26 @@ using namespace metal::raytracing;
|
|||||||
|
|
||||||
/* Qualifiers */
|
/* Qualifiers */
|
||||||
|
|
||||||
#define ccl_device
|
#if defined(__KERNEL_METAL_APPLE__)
|
||||||
#define ccl_device_inline ccl_device
|
|
||||||
#define ccl_device_forceinline ccl_device
|
/* Inline everything for Apple GPUs.
|
||||||
#define ccl_device_noinline ccl_device __attribute__((noinline))
|
* This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
|
||||||
|
* at the cost of longer compile times (~4.5 minutes on M1 Max). */
|
||||||
|
|
||||||
|
# define ccl_device __attribute__((always_inline))
|
||||||
|
# define ccl_device_inline __attribute__((always_inline))
|
||||||
|
# define ccl_device_forceinline __attribute__((always_inline))
|
||||||
|
# define ccl_device_noinline __attribute__((always_inline))
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
# define ccl_device
|
||||||
|
# define ccl_device_inline ccl_device
|
||||||
|
# define ccl_device_forceinline ccl_device
|
||||||
|
# define ccl_device_noinline ccl_device __attribute__((noinline))
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#define ccl_device_noinline_cpu ccl_device
|
#define ccl_device_noinline_cpu ccl_device
|
||||||
#define ccl_device_inline_method ccl_device
|
#define ccl_device_inline_method ccl_device
|
||||||
#define ccl_global device
|
#define ccl_global device
|
||||||
|
Reference in New Issue
Block a user