Revert "Cycles: Enable inlining on Apple Silicon for 1.1x speedup"

This reverts commit b82de02e7c. It is causing crashes in various regression tests. Ref D14763
2022-04-28 00:46:14 +02:00
parent 3558f565f1
commit 52a5f68562
6 changed files with 576 additions and 593 deletions
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -28,8 +28,7 @@ class MetalDevice : public Device {
  id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
  id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
      nil; /* encoder used for fetching device pointers from MTLBuffers */
-  string source[PSO_NUM];
-  string source_md5[PSO_NUM];
+  string source_used_for_compile[PSO_NUM];

  KernelParamsMetal launch_params = {0};

@@ -111,12 +110,6 @@ class MetalDevice : public Device {

  virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;

-  id<MTLLibrary> compile(string const &source);
-
-  const MetalKernelPipeline &get_best_pipeline(DeviceKernel kernel) const;
-
-  bool kernel_available(DeviceKernel kernel) const;
-
  /* ------------------------------------------------------------------ */
  /* low-level memory management */

--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -275,44 +275,96 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
   * active, but may still need to be rendered without motion blur if that isn't active as well. */
  motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;

-  source[PSO_GENERIC] = get_source(kernel_features);
-  mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
+  NSError *error = NULL;

-  MD5Hash md5;
-  md5.append(source[PSO_GENERIC]);
-  source_md5[PSO_GENERIC] = md5.get_hex();
+  for (int i = 0; i < PSO_NUM; i++) {
+    if (mtlLibrary[i]) {
+      [mtlLibrary[i] release];
+      mtlLibrary[i] = nil;
+    }
+  }

-  metal_printf("Front-end compilation finished (generic)\n");
-
-  bool result = kernels.load(this, false);
-
-  reserve_local_memory(kernel_features);
-
-  return result;
-}
-
-id<MTLLibrary> MetalDevice::compile(string const &source)
-{
  MTLCompileOptions *options = [[MTLCompileOptions alloc] init];

  options.fastMathEnabled = YES;
  if (@available(macOS 12.0, *)) {
    options.languageVersion = MTLLanguageVersion2_4;
  }
+  else {
+    return false;
+  }

-  NSError *error = NULL;
-  id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
+  string metalsrc;
+
+  /* local helper: dump source to disk and return filepath */
+  auto dump_source = [&](int kernel_type) -> string {
+    string &source = source_used_for_compile[kernel_type];
+    string metalsrc = path_cache_get(path_join("kernels",
+                                               string_printf("%s.%s.metal",
+                                                             kernel_type_as_string(kernel_type),
+                                                             util_md5_string(source).c_str())));
+    path_write_text(metalsrc, source);
+    return metalsrc;
+  };
+
+  /* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
+   * then compile it into a MTLLibrary */
+  auto fetch_and_compile_source = [&](int kernel_type) {
+    /* Record the source used to compile this library, for hash building later. */
+    string &source = source_used_for_compile[kernel_type];
+
+    switch (kernel_type) {
+      case PSO_GENERIC: {
+        source = get_source(kernel_features);
+        break;
+      }
+      case PSO_SPECIALISED: {
+        /* PSO_SPECIALISED derives from PSO_GENERIC */
+        string &generic_source = source_used_for_compile[PSO_GENERIC];
+        if (generic_source.empty()) {
+          generic_source = get_source(kernel_features);
+        }
+        source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
+        break;
+      }
+      default:
+        assert(0);
+    }
+
+    /* create MTLLibrary (front-end compilation) */
+    mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
                                                      options:options
                                                        error:&error];

-  if (!mtlLibrary) {
+    bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
+
+    if (!mtlLibrary[kernel_type] || do_source_dump) {
+      string metalsrc = dump_source(kernel_type);
+
+      if (!mtlLibrary[kernel_type]) {
        NSString *err = [error localizedDescription];
        set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
+
+        return false;
+      }
+    }
+    return true;
+  };
+
+  fetch_and_compile_source(PSO_GENERIC);
+
+  if (use_function_specialisation) {
+    fetch_and_compile_source(PSO_SPECIALISED);
  }

-  [options release];
+  metal_printf("Front-end compilation finished\n");

-  return mtlLibrary;
+  bool result = kernels.load(this, PSO_GENERIC);
+
+  [options release];
+  reserve_local_memory(kernel_features);
+
+  return result;
 }

 void MetalDevice::reserve_local_memory(const uint kernel_features)
@@ -619,11 +671,6 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
  return 0;
 }

-const MetalKernelPipeline &MetalDevice::get_best_pipeline(DeviceKernel kernel) const
-{
-  return kernels.get_best_pipeline(this, kernel);
-}
-
 void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
 {
  if (strcmp(name, "__data") == 0) {
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -54,41 +54,98 @@ enum {
 const char *kernel_type_as_string(int kernel_type);

 struct MetalKernelPipeline {
+  void release()
+  {
+    if (pipeline) {
+      [pipeline release];
+      pipeline = nil;
+      if (@available(macOS 11.0, *)) {
+        for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+          if (intersection_func_table[i]) {
+            [intersection_func_table[i] release];
+            intersection_func_table[i] = nil;
+          }
+        }
+      }
+    }
+    if (function) {
+      [function release];
+      function = nil;
+    }
+    if (@available(macOS 11.0, *)) {
+      for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+        if (intersection_func_table[i]) {
+          [intersection_func_table[i] release];
+        }
+      }
+    }
+  }

-  void compile();
-
-  id<MTLLibrary> mtlLibrary = nil;
-  bool scene_specialized;
-  string source_md5;
-
-  bool use_metalrt;
-  bool metalrt_hair;
-  bool metalrt_hair_thick;
-  bool metalrt_pointcloud;
-
-  int threads_per_threadgroup;
-
-  DeviceKernel device_kernel;
  bool loaded = false;
-  id<MTLDevice> mtlDevice = nil;
  id<MTLFunction> function = nil;
  id<MTLComputePipelineState> pipeline = nil;
-  int num_threads_per_block = 0;
-
-  string error_str;

  API_AVAILABLE(macos(11.0))
  id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
-  id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil};
+};
+
+struct MetalKernelLoadDesc {
+  int pso_index = 0;
+  const char *function_name = nullptr;
+  int kernel_index = 0;
+  int threads_per_threadgroup = 0;
+  MTLFunctionConstantValues *constant_values = nullptr;
+  NSArray *linked_functions = nullptr;
+
+  struct IntersectorFunctions {
+    NSArray *defaults;
+    NSArray *shadow;
+    NSArray *local;
+    NSArray *operator[](int index) const
+    {
+      if (index == METALRT_TABLE_DEFAULT)
+        return defaults;
+      if (index == METALRT_TABLE_SHADOW)
+        return shadow;
+      return local;
+    }
+  } intersector_functions = {nullptr};
+};
+
+/* Metal kernel and associate occupancy information. */
+class MetalDeviceKernel {
+ public:
+  ~MetalDeviceKernel();
+
+  bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
+
+  void mark_loaded(int pso_index)
+  {
+    pso[pso_index].loaded = true;
+  }
+
+  int get_num_threads_per_block() const
+  {
+    return num_threads_per_block;
+  }
+  const MetalKernelPipeline &get_pso() const;
+
+  double load_duration = 0.0;
+
+ private:
+  MetalKernelPipeline pso[PSO_NUM];
+
+  int num_threads_per_block = 0;
 };

 /* Cache of Metal kernels for each DeviceKernel. */
 class MetalDeviceKernels {
 public:
-  bool load(MetalDevice *device, bool scene_specialized);
-  bool available(const MetalDevice *device, DeviceKernel kernel) const;
-  const MetalKernelPipeline &get_best_pipeline(const MetalDevice *device,
-                                               DeviceKernel kernel) const;
+  bool load(MetalDevice *device, int kernel_type);
+  bool available(DeviceKernel kernel) const;
+  const MetalDeviceKernel &get(DeviceKernel kernel) const;
+
+  MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];

  id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};

--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -9,7 +9,6 @@
 #  include "util/path.h"
 #  include "util/tbb.h"
 #  include "util/time.h"
-#  include "util/unique_ptr.h"

 CCL_NAMESPACE_BEGIN

@@ -29,266 +28,315 @@ const char *kernel_type_as_string(int kernel_type)
  return "";
 }

-bool kernel_has_intersection(DeviceKernel device_kernel)
+MetalDeviceKernel::~MetalDeviceKernel()
 {
-  return (device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
-          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
-          device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
-}
-
-struct ShaderCache {
-  ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
-  {
-  }
-  ~ShaderCache();
-
-  /* Get the fastest available pipeline for the specified kernel. */
-  MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device);
-
-  /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
-   * device. */
-  void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
-
-  void wait_for_all();
-
- private:
-  friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);
-
-  void compile_thread_func(int thread_index);
-
-  using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;
-
-  struct PipelineRequest {
-    MetalKernelPipeline *pipeline = nullptr;
-    std::function<void(MetalKernelPipeline *)> completionHandler;
-  };
-
-  std::mutex cache_mutex;
-
-  PipelineCollection pipelines[DEVICE_KERNEL_NUM];
-  id<MTLDevice> mtlDevice;
-
-  bool running = false;
-  std::condition_variable cond_var;
-  std::deque<PipelineRequest> request_queue;
-  std::vector<std::thread> compile_threads;
-  std::atomic_int incomplete_requests = 0;
-};
-
-std::mutex g_shaderCacheMutex;
-std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
-
-ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
-{
-  thread_scoped_lock lock(g_shaderCacheMutex);
-  auto it = g_shaderCache.find(mtlDevice);
-  if (it != g_shaderCache.end()) {
-    return it->second.get();
-  }
-
-  g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice);
-  return g_shaderCache[mtlDevice].get();
-}
-
-ShaderCache::~ShaderCache()
-{
-  running = false;
-  cond_var.notify_all();
-  for (auto &thread : compile_threads) {
-    thread.join();
+  for (int i = 0; i < PSO_NUM; i++) {
+    pso[i].release();
  }
 }

-void ShaderCache::wait_for_all()
+bool MetalDeviceKernel::load(MetalDevice *device,
+                             MetalKernelLoadDesc const &desc_in,
+                             MD5Hash const &md5)
 {
-  while (incomplete_requests > 0) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(100));
-  }
-}
-
-void ShaderCache::compile_thread_func(int thread_index)
-{
-  while (1) {
-
-    /* wait for / acquire next request */
-    PipelineRequest request;
-    {
-      thread_scoped_lock lock(cache_mutex);
-      cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
-      if (!running) {
-        break;
-      }
-
-      if (!request_queue.empty()) {
-        request = request_queue.front();
-        request_queue.pop_front();
-      }
-    }
-
-    /* service request */
-    if (request.pipeline) {
-      request.pipeline->compile();
-      incomplete_requests--;
-    }
-  }
-}
-
-void ShaderCache::load_kernel(DeviceKernel device_kernel,
-                              MetalDevice *device,
-                              bool scene_specialized)
-{
-  {
-    /* create compiler threads on first run */
-    thread_scoped_lock lock(cache_mutex);
-    if (compile_threads.empty()) {
-      running = true;
-      for (int i = 0; i < max_mtlcompiler_threads; i++) {
-        compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
-      }
-    }
-  }
-
-  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+  __block MetalKernelLoadDesc const desc(desc_in);
+  if (desc.kernel_index == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
    /* skip megakernel */
-    return;
+    return true;
  }

-  if (scene_specialized) {
-    /* Only specialize kernels where it can make an impact. */
-    if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-        device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-      return;
-    }
+  bool use_binary_archive = true;
+  if (device->device_vendor == METAL_GPU_APPLE) {
+    /* Workaround for T94142: Cycles Metal crash with simultaneous viewport and final render */
+    use_binary_archive = false;
  }

-  {
-    /* check whether the kernel has already been requested / cached */
-    thread_scoped_lock lock(cache_mutex);
-    for (auto &pipeline : pipelines[device_kernel]) {
-      if (scene_specialized) {
-        if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
-          /* we already requested a pipeline that is specialised for this kernel data */
-          metal_printf("Specialized kernel already requested (%s)\n",
-                       device_kernel_as_string(device_kernel));
-          return;
-        }
-      }
-      else {
-        if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
-          /* we already requested a generic pipeline for this kernel */
-          metal_printf("Generic kernel already requested (%s)\n",
-                       device_kernel_as_string(device_kernel));
-          return;
+  if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+    use_binary_archive = (atoi(str) == 0);
  }
+
+  id<MTLBinaryArchive> archive = nil;
+  string metalbin_path;
+  if (use_binary_archive) {
+    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
+    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
+    MD5Hash local_md5(md5);
+    local_md5.append(osVersion);
+    string metalbin_name = string(desc.function_name) + "." + local_md5.get_hex() +
+                           to_string(desc.pso_index) + ".bin";
+    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
+    path_create_directories(metalbin_path);
+
+    if (path_exists(metalbin_path) && use_binary_archive) {
+      if (@available(macOS 11.0, *)) {
+        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+        archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
+        archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+        [archiveDesc release];
      }
    }
  }

-  incomplete_requests++;
-
-  PipelineRequest request;
-  request.pipeline = new MetalKernelPipeline;
-  request.pipeline->scene_specialized = scene_specialized;
-  request.pipeline->mtlDevice = mtlDevice;
-  request.pipeline->source_md5 =
-      device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
-  request.pipeline->mtlLibrary =
-      device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
-  request.pipeline->device_kernel = device_kernel;
-  request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
-
-  /* metalrt options */
-  request.pipeline->use_metalrt = device->use_metalrt;
-  request.pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
-  request.pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
-  request.pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
-
-  {
-    thread_scoped_lock lock(cache_mutex);
-    pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
-    request_queue.push_back(request);
-  }
-  cond_var.notify_one();
-}
-
-MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
-{
-  thread_scoped_lock lock(cache_mutex);
-  auto &collection = pipelines[kernel];
-  if (collection.empty()) {
-    return nullptr;
-  }
-
-  /* metalrt options */
-  bool use_metalrt = device->use_metalrt;
-  bool metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
-  bool metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
-  bool metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
-
-  MetalKernelPipeline *best_pipeline = nullptr;
-  for (auto &pipeline : collection) {
-    if (!pipeline->loaded) {
-      /* still loading - ignore */
-      continue;
-    }
-
-    if (pipeline->use_metalrt != use_metalrt || pipeline->metalrt_hair != metalrt_hair ||
-        pipeline->metalrt_hair_thick != metalrt_hair_thick ||
-        pipeline->metalrt_pointcloud != metalrt_pointcloud) {
-      /* wrong combination of metalrt options */
-      continue;
-    }
-
-    if (pipeline->scene_specialized) {
-      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
-        best_pipeline = pipeline.get();
-      }
-    }
-    else if (!best_pipeline) {
-      best_pipeline = pipeline.get();
-    }
-  }
-
-  return best_pipeline;
-}
-
-void MetalKernelPipeline::compile()
-{
-  int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
-
-  const std::string function_name = std::string("cycles_metal_") +
-                                    device_kernel_as_string(device_kernel);
-
-  int threads_per_threadgroup = this->threads_per_threadgroup;
-  if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
-      device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
-    /* Always use 512 for the sorting kernels */
-    threads_per_threadgroup = 512;
-  }
-
-  NSString *entryPoint = [@(function_name.c_str()) copy];
+  NSString *entryPoint = [@(desc.function_name) copy];

  NSError *error = NULL;
  if (@available(macOS 11.0, *)) {
    MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
    func_desc.name = entryPoint;
-    function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
+    if (desc.constant_values) {
+      func_desc.constantValues = desc.constant_values;
+    }
+    pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index]
+        newFunctionWithDescriptor:func_desc
+                            error:&error];
  }
-
  [entryPoint release];

-  if (function == nil) {
+  if (pso[desc.pso_index].function == nil) {
    NSString *err = [error localizedDescription];
    string errors = [err UTF8String];
-    metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
+
+    device->set_error(
+        string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str()));
+    return false;
+  }
+
+  pso[desc.pso_index].function.label = [@(desc.function_name) copy];
+
+  __block MTLComputePipelineDescriptor *computePipelineStateDescriptor =
+      [[MTLComputePipelineDescriptor alloc] init];
+
+  computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
+  computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
+  computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
+
+  if (@available(macos 10.14, *)) {
+    computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup;
+  }
+  computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
+
+  computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function;
+  if (@available(macOS 11.0, *)) {
+    /* Attach the additional functions to an MTLLinkedFunctions object */
+    if (desc.linked_functions) {
+      computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
+      computePipelineStateDescriptor.linkedFunctions.functions = desc.linked_functions;
+    }
+
+    computePipelineStateDescriptor.maxCallStackDepth = 1;
+  }
+
+  /* Create a new Compute pipeline state object */
+  MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
+
+  bool creating_new_archive = false;
+  if (@available(macOS 11.0, *)) {
+    if (use_binary_archive) {
+      if (!archive) {
+        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+        archiveDesc.url = nil;
+        archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+        creating_new_archive = true;
+
+        double starttime = time_dt();
+
+        if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
+                                                          error:&error]) {
+          NSString *errStr = [error localizedDescription];
+          metal_printf("Failed to add PSO to archive:\n%s\n",
+                       errStr ? [errStr UTF8String] : "nil");
+        }
+        else {
+          double duration = time_dt() - starttime;
+          metal_printf("%2d | %-55s | %7.2fs\n",
+                       desc.kernel_index,
+                       device_kernel_as_string((DeviceKernel)desc.kernel_index),
+                       duration);
+
+          if (desc.pso_index == PSO_GENERIC) {
+            this->load_duration = duration;
+          }
+        }
+      }
+      computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
+      pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
+    }
+  }
+
+  double starttime = time_dt();
+
+  MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
+      id<MTLComputePipelineState> computePipelineState,
+      MTLComputePipelineReflection *reflection,
+      NSError *error) {
+    bool recreate_archive = false;
+    if (computePipelineState == nil && archive && !creating_new_archive) {
+
+      assert(0);
+
+      NSString *errStr = [error localizedDescription];
+      metal_printf(
+          "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
+          "(error: %s)\n",
+          device_kernel_as_string((DeviceKernel)desc.kernel_index),
+          errStr ? [errStr UTF8String] : "nil");
+      computePipelineState = [device->mtlDevice
+          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+                                        options:MTLPipelineOptionNone
+                                     reflection:nullptr
+                                          error:&error];
+      recreate_archive = true;
+    }
+
+    double duration = time_dt() - starttime;
+
+    if (computePipelineState == nil) {
+      NSString *errStr = [error localizedDescription];
+      device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n",
+                                      device_kernel_as_string((DeviceKernel)desc.kernel_index)) +
+                        (errStr ? [errStr UTF8String] : "nil"));
+      metal_printf("%2d | %-55s | %7.2fs | FAILED!\n",
+                   desc.kernel_index,
+                   device_kernel_as_string((DeviceKernel)desc.kernel_index),
+                   duration);
      return;
    }

-  function.label = [entryPoint copy];
+    pso[desc.pso_index].pipeline = computePipelineState;
+    num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+                                       computePipelineState.threadExecutionWidth);
+    num_threads_per_block = std::max(num_threads_per_block,
+                                     (int)computePipelineState.threadExecutionWidth);

-  if (use_metalrt) {
+    if (!use_binary_archive) {
+      metal_printf("%2d | %-55s | %7.2fs\n",
+                   desc.kernel_index,
+                   device_kernel_as_string((DeviceKernel)desc.kernel_index),
+                   duration);
+
+      if (desc.pso_index == PSO_GENERIC) {
+        this->load_duration = duration;
+      }
+    }
+
+    if (@available(macOS 11.0, *)) {
+      if (creating_new_archive || recreate_archive) {
+        if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
+                               error:&error]) {
+          metal_printf("Failed to save binary archive, error:\n%s\n",
+                       [[error localizedDescription] UTF8String]);
+        }
+      }
+    }
+
+    [computePipelineStateDescriptor release];
+    computePipelineStateDescriptor = nil;
+
+    if (device->use_metalrt && desc.linked_functions) {
+      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+        if (@available(macOS 11.0, *)) {
+          MTLIntersectionFunctionTableDescriptor *ift_desc =
+              [[MTLIntersectionFunctionTableDescriptor alloc] init];
+          ift_desc.functionCount = desc.intersector_functions[table].count;
+
+          pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline
+              newIntersectionFunctionTableWithDescriptor:ift_desc];
+
+          /* Finally write the function handles into this pipeline's table */
+          for (int i = 0; i < 2; i++) {
+            id<MTLFunctionHandle> handle = [pso[desc.pso_index].pipeline
+                functionHandleWithFunction:desc.intersector_functions[table][i]];
+            [pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i];
+          }
+        }
+      }
+    }
+
+    mark_loaded(desc.pso_index);
+  };
+
+  if (desc.pso_index == PSO_SPECIALISED) {
+    /* Asynchronous load */
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+      NSError *error;
+      id<MTLComputePipelineState> pipeline = [device->mtlDevice
+          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+                                        options:pipelineOptions
+                                     reflection:nullptr
+                                          error:&error];
+      completionHandler(pipeline, nullptr, error);
+    });
+  }
+  else {
+    /* Block on load to ensure we continue with a valid kernel function */
+    id<MTLComputePipelineState> pipeline = [device->mtlDevice
+        newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+                                      options:pipelineOptions
+                                   reflection:nullptr
+                                        error:&error];
+    completionHandler(pipeline, nullptr, error);
+  }
+
+  return true;
+}
+
+const MetalKernelPipeline &MetalDeviceKernel::get_pso() const
+{
+  if (pso[PSO_SPECIALISED].loaded) {
+    return pso[PSO_SPECIALISED];
+  }
+
+  assert(pso[PSO_GENERIC].loaded);
+  return pso[PSO_GENERIC];
+}
+
+bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type)
+{
+  bool any_error = false;
+
+  MD5Hash md5;
+
+  /* Build the function constant table */
+  MTLFunctionConstantValues *constant_values = nullptr;
+  if (kernel_type == PSO_SPECIALISED) {
+    constant_values = [MTLFunctionConstantValues new];
+
+#  define KERNEL_FILM(_type, name) \
+    [constant_values setConstantValue:&data.film.name \
+                                 type:get_MTLDataType_##_type() \
+                              atIndex:KernelData_film_##name]; \
+    md5.append((uint8_t *)&data.film.name, sizeof(data.film.name));
+
+#  define KERNEL_BACKGROUND(_type, name) \
+    [constant_values setConstantValue:&data.background.name \
+                                 type:get_MTLDataType_##_type() \
+                              atIndex:KernelData_background_##name]; \
+    md5.append((uint8_t *)&data.background.name, sizeof(data.background.name));
+
+#  define KERNEL_INTEGRATOR(_type, name) \
+    [constant_values setConstantValue:&data.integrator.name \
+                                 type:get_MTLDataType_##_type() \
+                              atIndex:KernelData_integrator_##name]; \
+    md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name));
+
+#  define KERNEL_BVH(_type, name) \
+    [constant_values setConstantValue:&data.bvh.name \
+                                 type:get_MTLDataType_##_type() \
+                              atIndex:KernelData_bvh_##name]; \
+    md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name));
+
+    /* METAL_WIP: populate constant_values based on KernelData */
+    assert(0);
+    /*
+        const KernelData &data = device->launch_params.data;
+    #    include "kernel/types/background.h"
+    #    include "kernel/types/bvh.h"
+    #    include "kernel/types/film.h"
+    #    include "kernel/types/integrator.h"
+    */
+  }
+
+  if (device->use_metalrt) {
    if (@available(macOS 11.0, *)) {
      /* create the id<MTLFunction> for each intersection function */
      const char *function_names[] = {
@@ -308,74 +356,89 @@ void MetalKernelPipeline::compile()
      assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);

      MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
+      if (kernel_type == PSO_SPECIALISED) {
+        desc.constantValues = constant_values;
+      }
      for (int i = 0; i < METALRT_FUNC_NUM; i++) {
        const char *function_name = function_names[i];
        desc.name = [@(function_name) copy];

        NSError *error = NULL;
-        rt_intersection_function[i] = [mtlLibrary newFunctionWithDescriptor:desc error:&error];
+        rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type]
+            newFunctionWithDescriptor:desc
+                                error:&error];

-        if (rt_intersection_function[i] == nil) {
+        if (rt_intersection_funcs[kernel_type][i] == nil) {
          NSString *err = [error localizedDescription];
          string errors = [err UTF8String];

-          error_str = string_printf(
-              "Error getting intersection function \"%s\": %s", function_name, errors.c_str());
+          device->set_error(string_printf(
+              "Error getting intersection function \"%s\": %s", function_name, errors.c_str()));
+          any_error = true;
          break;
        }

-        rt_intersection_function[i].label = [@(function_name) copy];
+        rt_intersection_funcs[kernel_type][i].label = [@(function_name) copy];
      }
    }
  }
+  md5.append(device->source_used_for_compile[kernel_type]);
+
+  string hash = md5.get_hex();
+  if (loaded_md5[kernel_type] == hash) {
+    return true;
+  }

+  if (!any_error) {
    NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
-  NSArray *linked_functions = nil;
+    NSArray *function_list = nil;

-  if (use_metalrt) {
+    if (device->use_metalrt) {
      id<MTLFunction> curve_intersect_default = nil;
      id<MTLFunction> curve_intersect_shadow = nil;
      id<MTLFunction> point_intersect_default = nil;
      id<MTLFunction> point_intersect_shadow = nil;
-    if (metalrt_hair) {
+      if (device->kernel_features & KERNEL_FEATURE_HAIR) {
        /* Add curve intersection programs. */
-      if (metalrt_hair_thick) {
+        if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) {
          /* Slower programs for thick hair since that also slows down ribbons.
           * Ideally this should not be needed. */
-        curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_ALL];
-        curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_ALL_SHADOW];
+          curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL];
+          curve_intersect_shadow =
+              rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW];
        }
        else {
-        curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON];
-        curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON_SHADOW];
+          curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON];
+          curve_intersect_shadow =
+              rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW];
        }
      }
-    if (metalrt_pointcloud) {
-      point_intersect_default = rt_intersection_function[METALRT_FUNC_POINT];
-      point_intersect_shadow = rt_intersection_function[METALRT_FUNC_POINT_SHADOW];
+      if (device->kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+        point_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT];
+        point_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT_SHADOW];
      }
      table_functions[METALRT_TABLE_DEFAULT] = [NSArray
-        arrayWithObjects:rt_intersection_function[METALRT_FUNC_DEFAULT_TRI],
+          arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI],
                           curve_intersect_default ?
                               curve_intersect_default :
-                             rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
+                               rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
                           point_intersect_default ?
                               point_intersect_default :
-                             rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
+                               rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
                           nil];
      table_functions[METALRT_TABLE_SHADOW] = [NSArray
-        arrayWithObjects:rt_intersection_function[METALRT_FUNC_SHADOW_TRI],
+          arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI],
                           curve_intersect_shadow ?
                               curve_intersect_shadow :
-                             rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
+                               rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
                           point_intersect_shadow ?
                               point_intersect_shadow :
-                             rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
+                               rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
                           nil];
      table_functions[METALRT_TABLE_LOCAL] = [NSArray
-        arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI],
-                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
-                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
+          arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI],
+                           rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
+                           rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
                           nil];

      NSMutableSet *unique_functions = [NSMutableSet
@@ -383,246 +446,86 @@ void MetalKernelPipeline::compile()
      [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
      [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];

-    if (kernel_has_intersection(device_kernel)) {
-      linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
+      function_list = [[NSArray arrayWithArray:[unique_functions allObjects]]
          sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
            return [f1.label compare:f2.label];
          }];
-    }
+
      unique_functions = nil;
    }

-  MTLComputePipelineDescriptor *computePipelineStateDescriptor =
-      [[MTLComputePipelineDescriptor alloc] init];
+    metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n",
+                 kernel_type_as_string(kernel_type));

-  computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
-  computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
-  computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
-
-  if (@available(macos 10.14, *)) {
-    computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
-  }
-  computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
-
-  computePipelineStateDescriptor.computeFunction = function;
-
-  if (@available(macOS 11.0, *)) {
-    /* Attach the additional functions to an MTLLinkedFunctions object */
-    if (linked_functions) {
-      computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
-      computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
-    }
-    computePipelineStateDescriptor.maxCallStackDepth = 1;
-    if (use_metalrt) {
-      computePipelineStateDescriptor.maxCallStackDepth = 8;
-    }
-  }
-
-  MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
-
-  bool use_binary_archive = true;
-  if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
-    use_binary_archive = (atoi(str) == 0);
-  }
-
-  id<MTLBinaryArchive> archive = nil;
-  string metalbin_path;
-  string metalbin_name;
-  if (use_binary_archive) {
-    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
-    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
-    MD5Hash local_md5;
-    local_md5.append(source_md5);
-    local_md5.append(osVersion);
-    local_md5.append((uint8_t *)&this->threads_per_threadgroup,
-                     sizeof(this->threads_per_threadgroup));
-
-    string options;
-    if (use_metalrt && kernel_has_intersection(device_kernel)) {
-      /* incorporate any MetalRT specialisations into the archive name */
-      options += string_printf(".hair_%d.hair_thick_%d.pointcloud_%d",
-                               metalrt_hair ? 1 : 0,
-                               metalrt_hair_thick ? 1 : 0,
-                               metalrt_pointcloud ? 1 : 0);
-    }
-
-    /* Replace non-alphanumerical characters with underscores. */
-    string device_name = [mtlDevice.name UTF8String];
-    for (char &c : device_name) {
-      if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
-        c = '_';
-      }
-    }
-
-    metalbin_name = device_name;
-    metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel));
-    metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type));
-    metalbin_name = path_join(metalbin_name, local_md5.get_hex() + options + ".bin");
-
-    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
-    path_create_directories(metalbin_path);
-
-    if (path_exists(metalbin_path) && use_binary_archive) {
-      if (@available(macOS 11.0, *)) {
-        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
-        archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
-        archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
-        [archiveDesc release];
-      }
-    }
-  }
-
-  __block bool creating_new_archive = false;
-  if (@available(macOS 11.0, *)) {
-    if (use_binary_archive) {
-      if (!archive) {
-        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
-        archiveDesc.url = nil;
-        archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
-        creating_new_archive = true;
-      }
-      computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
-      pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
-    }
-  }
-
-  double starttime = time_dt();
-
-  MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
-      id<MTLComputePipelineState> computePipelineState,
-      MTLComputePipelineReflection *reflection,
-      NSError *error) {
-    bool recreate_archive = false;
-    if (computePipelineState == nil && archive) {
-      NSString *errStr = [error localizedDescription];
-      metal_printf(
-          "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
-          "(error: %s)\n",
-          device_kernel_as_string((DeviceKernel)device_kernel),
-          errStr ? [errStr UTF8String] : "nil");
-      computePipelineState = [mtlDevice
-          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
-                                        options:MTLPipelineOptionNone
-                                     reflection:nullptr
-                                          error:&error];
-      recreate_archive = true;
-    }
-
-    double duration = time_dt() - starttime;
-
-    if (computePipelineState == nil) {
-      NSString *errStr = [error localizedDescription];
-      error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
-                                device_kernel_as_string((DeviceKernel)device_kernel));
-      error_str += (errStr ? [errStr UTF8String] : "nil");
-      metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
-                   kernel_type_as_string(pso_type),
-                   device_kernel,
-                   device_kernel_as_string((DeviceKernel)device_kernel),
-                   duration);
+    tbb::task_arena local_arena(max_mtlcompiler_threads);
+    local_arena.execute([&]() {
+      parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) {
+        /* skip megakernel */
+        if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
          return;
        }

-    int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
-                                           computePipelineState.threadExecutionWidth);
-    num_threads_per_block = std::max(num_threads_per_block,
-                                     (int)computePipelineState.threadExecutionWidth);
-    this->pipeline = computePipelineState;
-    this->num_threads_per_block = num_threads_per_block;
-
-    if (@available(macOS 11.0, *)) {
-      if (creating_new_archive || recreate_archive) {
-        if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
-                               error:&error]) {
-          metal_printf("Failed to save binary archive, error:\n%s\n",
-                       [[error localizedDescription] UTF8String]);
-        }
-      }
-    }
-  };
-
-  /* Block on load to ensure we continue with a valid kernel function */
-  if (creating_new_archive) {
-    starttime = time_dt();
-    NSError *error;
-    if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
-                                                      error:&error]) {
-      NSString *errStr = [error localizedDescription];
-      metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
-    }
-  }
-  id<MTLComputePipelineState> pipeline = [mtlDevice
-      newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
-                                    options:pipelineOptions
-                                 reflection:nullptr
-                                      error:&error];
-  completionHandler(pipeline, nullptr, error);
-
-  this->loaded = true;
-  [computePipelineStateDescriptor release];
-  computePipelineStateDescriptor = nil;
-
-  if (use_metalrt && linked_functions) {
-    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-      if (@available(macOS 11.0, *)) {
-        MTLIntersectionFunctionTableDescriptor *ift_desc =
-            [[MTLIntersectionFunctionTableDescriptor alloc] init];
-        ift_desc.functionCount = table_functions[table].count;
-        intersection_func_table[table] = [this->pipeline
-            newIntersectionFunctionTableWithDescriptor:ift_desc];
-
-        /* Finally write the function handles into this pipeline's table */
-        for (int i = 0; i < 2; i++) {
-          id<MTLFunctionHandle> handle = [pipeline
-              functionHandleWithFunction:table_functions[table][i]];
-          [intersection_func_table[table] setFunction:handle atIndex:i];
-        }
-      }
+        /* Only specialize kernels where it can make an impact. */
+        if (kernel_type == PSO_SPECIALISED) {
+          if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+              i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+            return;
          }
        }

-  double duration = time_dt() - starttime;
+        MetalDeviceKernel &kernel = kernels_[i];

-  if (!use_binary_archive) {
-    metal_printf("%16s | %2d | %-55s | %7.2fs\n",
-                 kernel_type_as_string(pso_type),
-                 int(device_kernel),
-                 device_kernel_as_string(device_kernel),
-                 duration);
+        const std::string function_name = std::string("cycles_metal_") +
+                                          device_kernel_as_string((DeviceKernel)i);
+        int threads_per_threadgroup = device->max_threads_per_threadgroup;
+        if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) {
+          /* Always use 512 for the sorting kernels */
+          threads_per_threadgroup = 512;
        }
-  else {
-    metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s\n",
-                 kernel_type_as_string(pso_type),
-                 device_kernel,
-                 device_kernel_as_string((DeviceKernel)device_kernel),
-                 duration,
-                 creating_new_archive ? " new" : "load",
-                 metalbin_name.c_str());
+
+        NSArray *kernel_function_list = nil;
+
+        if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+            i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+            i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+            i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+            i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+          kernel_function_list = function_list;
        }
+
+        MetalKernelLoadDesc desc;
+        desc.pso_index = kernel_type;
+        desc.kernel_index = i;
+        desc.linked_functions = kernel_function_list;
+        desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT];
+        desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW];
+        desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL];
+        desc.constant_values = constant_values;
+        desc.threads_per_threadgroup = threads_per_threadgroup;
+        desc.function_name = function_name.c_str();
+
+        bool success = kernel.load(device, desc, md5);
+
+        any_error |= !success;
+      });
+    });
+  }
+
+  bool loaded = !any_error;
+  if (loaded) {
+    loaded_md5[kernel_type] = hash;
+  }
+  return loaded;
 }

-bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
+const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const
 {
-  auto shader_cache = get_shader_cache(device->mtlDevice);
-  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
-    shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
-  }
-
-  if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
-    shader_cache->wait_for_all();
-  }
-  return true;
+  return kernels_[(int)kernel];
 }

-const MetalKernelPipeline &MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
-                                                                 DeviceKernel kernel) const
+bool MetalDeviceKernels::available(DeviceKernel kernel) const
 {
-  return *get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
-}
-
-bool MetalDeviceKernels::available(const MetalDevice *device, DeviceKernel kernel) const
-{
-  return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device) != nullptr;
+  return kernels_[(int)kernel].get_pso().function != nil;
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -108,6 +108,9 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
  VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
          << work_size;

+  const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
+  const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
+
  id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);

  /* Determine size requirement for argument buffer. */
@@ -209,8 +212,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
  }
  bytes_written = globals_offsets + sizeof(KernelParamsMetal);

-  const MetalKernelPipeline &metal_kernel_pso = metal_device->get_best_pipeline(kernel);
-
  /* Encode ancillaries */
  [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
  [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
@@ -283,7 +284,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];

  /* Compute kernel launch parameters. */
-  const int num_threads_per_block = metal_kernel_pso.num_threads_per_block;
+  const int num_threads_per_block = metal_kernel.get_num_threads_per_block();

  int shared_mem_bytes = 0;

@@ -546,8 +547,6 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
        computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
                                                           MTLDispatchTypeSerial];

-    [mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))];
-
    /* declare usage of MTLBuffers etc */
    prepare_resources(kernel);
  }
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -29,26 +29,10 @@ using namespace metal::raytracing;

 /* Qualifiers */

-#if defined(__KERNEL_METAL_APPLE__)
-
-/* Inline everything for Apple GPUs.
- * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
- * at the cost of longer compile times (~4.5 minutes on M1 Max). */
-
-#  define ccl_device __attribute__((always_inline))
-#  define ccl_device_inline __attribute__((always_inline))
-#  define ccl_device_forceinline __attribute__((always_inline))
-#  define ccl_device_noinline __attribute__((always_inline))
-
-#else
-
-#  define ccl_device
-#  define ccl_device_inline ccl_device
-#  define ccl_device_forceinline ccl_device
-#  define ccl_device_noinline ccl_device __attribute__((noinline))
-
-#endif
-
+#define ccl_device
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline ccl_device __attribute__((noinline))
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_device_inline_method ccl_device
 #define ccl_global device