|
|
|
@@ -9,7 +9,6 @@
|
|
|
|
|
# include "util/path.h"
|
|
|
|
|
# include "util/tbb.h"
|
|
|
|
|
# include "util/time.h"
|
|
|
|
|
# include "util/unique_ptr.h"
|
|
|
|
|
|
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
|
@@ -29,266 +28,315 @@ const char *kernel_type_as_string(int kernel_type)
|
|
|
|
|
return "";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Return true when `device_kernel` is one of the intersection kernels or the raytrace variant of
 * surface shading; every other kernel yields false. */
bool kernel_has_intersection(DeviceKernel device_kernel)
{
  return (device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
          device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
}
|
|
|
|
|
|
|
|
|
|
struct ShaderCache {
|
|
|
|
|
ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
|
|
|
|
|
{
|
|
|
|
|
}
|
|
|
|
|
~ShaderCache();
|
|
|
|
|
|
|
|
|
|
/* Get the fastest available pipeline for the specified kernel. */
|
|
|
|
|
MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device);
|
|
|
|
|
|
|
|
|
|
/* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
|
|
|
|
|
* device. */
|
|
|
|
|
void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
|
|
|
|
|
|
|
|
|
|
void wait_for_all();
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);
|
|
|
|
|
|
|
|
|
|
void compile_thread_func(int thread_index);
|
|
|
|
|
|
|
|
|
|
using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;
|
|
|
|
|
|
|
|
|
|
struct PipelineRequest {
|
|
|
|
|
MetalKernelPipeline *pipeline = nullptr;
|
|
|
|
|
std::function<void(MetalKernelPipeline *)> completionHandler;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
std::mutex cache_mutex;
|
|
|
|
|
|
|
|
|
|
PipelineCollection pipelines[DEVICE_KERNEL_NUM];
|
|
|
|
|
id<MTLDevice> mtlDevice;
|
|
|
|
|
|
|
|
|
|
bool running = false;
|
|
|
|
|
std::condition_variable cond_var;
|
|
|
|
|
std::deque<PipelineRequest> request_queue;
|
|
|
|
|
std::vector<std::thread> compile_threads;
|
|
|
|
|
std::atomic_int incomplete_requests = 0;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
std::mutex g_shaderCacheMutex;
|
|
|
|
|
std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
|
|
|
|
|
|
|
|
|
|
ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
|
|
|
|
|
{
|
|
|
|
|
thread_scoped_lock lock(g_shaderCacheMutex);
|
|
|
|
|
auto it = g_shaderCache.find(mtlDevice);
|
|
|
|
|
if (it != g_shaderCache.end()) {
|
|
|
|
|
return it->second.get();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice);
|
|
|
|
|
return g_shaderCache[mtlDevice].get();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Ask the compile threads to exit and wait for them. Requests still sitting in the queue are
 * not serviced. */
ShaderCache::~ShaderCache()
{
  {
    /* `running` is part of the predicate the compile threads wait on under `cache_mutex`.
     * Clearing it while holding the same mutex guarantees that no thread can test the predicate,
     * miss the notification below and then block forever. */
    thread_scoped_lock lock(cache_mutex);
    running = false;
  }
  cond_var.notify_all();

  for (auto &thread : compile_threads) {
    if (thread.joinable()) {
      thread.join();
    }
  }
}
|
|
|
|
|
|
|
|
|
|
void ShaderCache::wait_for_all()
|
|
|
|
|
bool MetalDeviceKernel::load(MetalDevice *device,
|
|
|
|
|
MetalKernelLoadDesc const &desc_in,
|
|
|
|
|
MD5Hash const &md5)
|
|
|
|
|
{
|
|
|
|
|
while (incomplete_requests > 0) {
|
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ShaderCache::compile_thread_func(int thread_index)
|
|
|
|
|
{
|
|
|
|
|
while (1) {
|
|
|
|
|
|
|
|
|
|
/* wait for / acquire next request */
|
|
|
|
|
PipelineRequest request;
|
|
|
|
|
{
|
|
|
|
|
thread_scoped_lock lock(cache_mutex);
|
|
|
|
|
cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
|
|
|
|
|
if (!running) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!request_queue.empty()) {
|
|
|
|
|
request = request_queue.front();
|
|
|
|
|
request_queue.pop_front();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* service request */
|
|
|
|
|
if (request.pipeline) {
|
|
|
|
|
request.pipeline->compile();
|
|
|
|
|
incomplete_requests--;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Queue an asynchronous compile of `device_kernel` for `device`.
 *
 * The compile threads are started on the first call. The request is dropped when the kernel is
 * the megakernel, when `scene_specialized` is set for a kernel outside the specializable range,
 * or when a pipeline with the same source MD5 has already been requested. Otherwise a new
 * pipeline is registered in `pipelines` and handed to the compile threads. */
void ShaderCache::load_kernel(DeviceKernel device_kernel,
                              MetalDevice *device,
                              bool scene_specialized)
{
  {
    /* create compiler threads on first run */
    thread_scoped_lock lock(cache_mutex);
    if (compile_threads.empty()) {
      running = true;
      for (int i = 0; i < max_mtlcompiler_threads; i++) {
        /* Capture the index by value: the thread starts running after this loop iteration has
         * moved on, so a reference to `i` would be read after it changed or went out of scope. */
        compile_threads.push_back(std::thread([this, i] { compile_thread_func(i); }));
      }
    }
  }

  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
    /* skip megakernel */
    return;
  }

  if (scene_specialized) {
    /* Only specialize kernels where it can make an impact. */
    if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
        device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
      return;
    }
  }

  {
    /* check whether the kernel has already been requested / cached */
    thread_scoped_lock lock(cache_mutex);
    for (auto &pipeline : pipelines[device_kernel]) {
      if (scene_specialized) {
        if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
          /* we already requested a pipeline that is specialised for this kernel data */
          metal_printf("Specialized kernel already requested (%s)\n",
                       device_kernel_as_string(device_kernel));
          return;
        }
      }
      else {
        if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
          /* we already requested a generic pipeline for this kernel */
          metal_printf("Generic kernel already requested (%s)\n",
                       device_kernel_as_string(device_kernel));
          return;
        }
      }
    }
  }

  /* Count the request before it becomes visible to the compile threads, so that
   * `wait_for_all` cannot observe zero while this request is still pending. */
  incomplete_requests++;

  /* Owned from the moment it is created; ownership moves into `pipelines` below. */
  unique_ptr<MetalKernelPipeline> pipeline = make_unique<MetalKernelPipeline>();
  pipeline->scene_specialized = scene_specialized;
  pipeline->mtlDevice = mtlDevice;
  pipeline->source_md5 = device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
  pipeline->mtlLibrary = device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
  pipeline->device_kernel = device_kernel;
  pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;

  /* metalrt options */
  pipeline->use_metalrt = device->use_metalrt;
  pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
  pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
  pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;

  /* The request only carries a non-owning pointer. */
  PipelineRequest request;
  request.pipeline = pipeline.get();

  {
    thread_scoped_lock lock(cache_mutex);
    pipelines[device_kernel].push_back(std::move(pipeline));
    request_queue.push_back(request);
  }
  cond_var.notify_one();
}
|
|
|
|
|
|
|
|
|
|
/* Pick the pipeline to use for `kernel` on `device`.
 *
 * Only pipelines that have finished loading and whose MetalRT options equal those of `device`
 * are considered. A scene-specialized pipeline is chosen when its source MD5 matches the
 * device's specialized source; a generic one is chosen only while nothing else has been picked.
 * Returns nullptr when no pipeline qualifies. */
MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
{
  thread_scoped_lock guard(cache_mutex);

  PipelineCollection &candidates = pipelines[kernel];
  if (candidates.empty()) {
    return nullptr;
  }

  /* MetalRT configuration the pipeline has to have been built with. */
  const bool want_metalrt = device->use_metalrt;
  const bool want_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
  const bool want_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
  const bool want_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;

  MetalKernelPipeline *chosen = nullptr;
  for (auto &candidate : candidates) {
    /* Pipelines that are still loading are ignored. */
    if (!candidate->loaded) {
      continue;
    }

    /* Every MetalRT option has to agree with the device. */
    const bool options_match = candidate->use_metalrt == want_metalrt &&
                               candidate->metalrt_hair == want_hair &&
                               candidate->metalrt_hair_thick == want_hair_thick &&
                               candidate->metalrt_pointcloud == want_pointcloud;
    if (!options_match) {
      continue;
    }

    if (candidate->scene_specialized) {
      /* A specialized pipeline replaces whatever was picked so far, provided it was built for
       * the device's current specialized source. */
      if (candidate->source_md5 == device->source_md5[PSO_SPECIALISED]) {
        chosen = candidate.get();
      }
    }
    else if (chosen == nullptr) {
      /* A generic pipeline is only a fallback. */
      chosen = candidate.get();
    }
  }

  return chosen;
}
|
|
|
|
|
|
|
|
|
|
void MetalKernelPipeline::compile()
|
|
|
|
|
{
|
|
|
|
|
int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
|
|
|
|
|
|
|
|
|
|
const std::string function_name = std::string("cycles_metal_") +
|
|
|
|
|
device_kernel_as_string(device_kernel);
|
|
|
|
|
|
|
|
|
|
int threads_per_threadgroup = this->threads_per_threadgroup;
|
|
|
|
|
if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
|
|
|
|
|
device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
|
|
|
|
|
/* Always use 512 for the sorting kernels */
|
|
|
|
|
threads_per_threadgroup = 512;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
NSString *entryPoint = [@(function_name.c_str()) copy];
|
|
|
|
|
NSString *entryPoint = [@(desc.function_name) copy];
|
|
|
|
|
|
|
|
|
|
NSError *error = NULL;
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
|
|
|
|
|
func_desc.name = entryPoint;
|
|
|
|
|
function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
|
|
|
|
|
if (desc.constant_values) {
|
|
|
|
|
func_desc.constantValues = desc.constant_values;
|
|
|
|
|
}
|
|
|
|
|
pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index]
|
|
|
|
|
newFunctionWithDescriptor:func_desc
|
|
|
|
|
error:&error];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
[entryPoint release];
|
|
|
|
|
|
|
|
|
|
if (function == nil) {
|
|
|
|
|
if (pso[desc.pso_index].function == nil) {
|
|
|
|
|
NSString *err = [error localizedDescription];
|
|
|
|
|
string errors = [err UTF8String];
|
|
|
|
|
metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
|
|
|
|
|
|
|
|
|
|
device->set_error(
|
|
|
|
|
string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str()));
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pso[desc.pso_index].function.label = [@(desc.function_name) copy];
|
|
|
|
|
|
|
|
|
|
__block MTLComputePipelineDescriptor *computePipelineStateDescriptor =
|
|
|
|
|
[[MTLComputePipelineDescriptor alloc] init];
|
|
|
|
|
|
|
|
|
|
computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
|
|
|
|
|
computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
|
|
|
|
|
computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
|
|
|
|
|
|
|
|
|
|
if (@available(macos 10.14, *)) {
|
|
|
|
|
computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup;
|
|
|
|
|
}
|
|
|
|
|
computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
|
|
|
|
|
|
|
|
|
|
computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function;
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
/* Attach the additional functions to an MTLLinkedFunctions object */
|
|
|
|
|
if (desc.linked_functions) {
|
|
|
|
|
computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
|
|
|
|
|
computePipelineStateDescriptor.linkedFunctions.functions = desc.linked_functions;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
computePipelineStateDescriptor.maxCallStackDepth = 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Create a new Compute pipeline state object */
|
|
|
|
|
MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
|
|
|
|
|
|
|
|
|
|
bool creating_new_archive = false;
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
if (use_binary_archive) {
|
|
|
|
|
if (!archive) {
|
|
|
|
|
MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
|
|
|
|
|
archiveDesc.url = nil;
|
|
|
|
|
archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
|
|
|
|
|
creating_new_archive = true;
|
|
|
|
|
|
|
|
|
|
double starttime = time_dt();
|
|
|
|
|
|
|
|
|
|
if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
|
|
|
|
|
error:&error]) {
|
|
|
|
|
NSString *errStr = [error localizedDescription];
|
|
|
|
|
metal_printf("Failed to add PSO to archive:\n%s\n",
|
|
|
|
|
errStr ? [errStr UTF8String] : "nil");
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
double duration = time_dt() - starttime;
|
|
|
|
|
metal_printf("%2d | %-55s | %7.2fs\n",
|
|
|
|
|
desc.kernel_index,
|
|
|
|
|
device_kernel_as_string((DeviceKernel)desc.kernel_index),
|
|
|
|
|
duration);
|
|
|
|
|
|
|
|
|
|
if (desc.pso_index == PSO_GENERIC) {
|
|
|
|
|
this->load_duration = duration;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
|
|
|
|
|
pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
double starttime = time_dt();
|
|
|
|
|
|
|
|
|
|
MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
|
|
|
|
|
id<MTLComputePipelineState> computePipelineState,
|
|
|
|
|
MTLComputePipelineReflection *reflection,
|
|
|
|
|
NSError *error) {
|
|
|
|
|
bool recreate_archive = false;
|
|
|
|
|
if (computePipelineState == nil && archive && !creating_new_archive) {
|
|
|
|
|
|
|
|
|
|
assert(0);
|
|
|
|
|
|
|
|
|
|
NSString *errStr = [error localizedDescription];
|
|
|
|
|
metal_printf(
|
|
|
|
|
"Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
|
|
|
|
|
"(error: %s)\n",
|
|
|
|
|
device_kernel_as_string((DeviceKernel)desc.kernel_index),
|
|
|
|
|
errStr ? [errStr UTF8String] : "nil");
|
|
|
|
|
computePipelineState = [device->mtlDevice
|
|
|
|
|
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
|
|
|
|
|
options:MTLPipelineOptionNone
|
|
|
|
|
reflection:nullptr
|
|
|
|
|
error:&error];
|
|
|
|
|
recreate_archive = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
double duration = time_dt() - starttime;
|
|
|
|
|
|
|
|
|
|
if (computePipelineState == nil) {
|
|
|
|
|
NSString *errStr = [error localizedDescription];
|
|
|
|
|
device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n",
|
|
|
|
|
device_kernel_as_string((DeviceKernel)desc.kernel_index)) +
|
|
|
|
|
(errStr ? [errStr UTF8String] : "nil"));
|
|
|
|
|
metal_printf("%2d | %-55s | %7.2fs | FAILED!\n",
|
|
|
|
|
desc.kernel_index,
|
|
|
|
|
device_kernel_as_string((DeviceKernel)desc.kernel_index),
|
|
|
|
|
duration);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
function.label = [entryPoint copy];
|
|
|
|
|
pso[desc.pso_index].pipeline = computePipelineState;
|
|
|
|
|
num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
|
|
|
|
|
computePipelineState.threadExecutionWidth);
|
|
|
|
|
num_threads_per_block = std::max(num_threads_per_block,
|
|
|
|
|
(int)computePipelineState.threadExecutionWidth);
|
|
|
|
|
|
|
|
|
|
if (use_metalrt) {
|
|
|
|
|
if (!use_binary_archive) {
|
|
|
|
|
metal_printf("%2d | %-55s | %7.2fs\n",
|
|
|
|
|
desc.kernel_index,
|
|
|
|
|
device_kernel_as_string((DeviceKernel)desc.kernel_index),
|
|
|
|
|
duration);
|
|
|
|
|
|
|
|
|
|
if (desc.pso_index == PSO_GENERIC) {
|
|
|
|
|
this->load_duration = duration;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
if (creating_new_archive || recreate_archive) {
|
|
|
|
|
if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
|
|
|
|
|
error:&error]) {
|
|
|
|
|
metal_printf("Failed to save binary archive, error:\n%s\n",
|
|
|
|
|
[[error localizedDescription] UTF8String]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
[computePipelineStateDescriptor release];
|
|
|
|
|
computePipelineStateDescriptor = nil;
|
|
|
|
|
|
|
|
|
|
if (device->use_metalrt && desc.linked_functions) {
|
|
|
|
|
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
MTLIntersectionFunctionTableDescriptor *ift_desc =
|
|
|
|
|
[[MTLIntersectionFunctionTableDescriptor alloc] init];
|
|
|
|
|
ift_desc.functionCount = desc.intersector_functions[table].count;
|
|
|
|
|
|
|
|
|
|
pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline
|
|
|
|
|
newIntersectionFunctionTableWithDescriptor:ift_desc];
|
|
|
|
|
|
|
|
|
|
/* Finally write the function handles into this pipeline's table */
|
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
|
id<MTLFunctionHandle> handle = [pso[desc.pso_index].pipeline
|
|
|
|
|
functionHandleWithFunction:desc.intersector_functions[table][i]];
|
|
|
|
|
[pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
mark_loaded(desc.pso_index);
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if (desc.pso_index == PSO_SPECIALISED) {
|
|
|
|
|
/* Asynchronous load */
|
|
|
|
|
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
|
|
|
|
|
NSError *error;
|
|
|
|
|
id<MTLComputePipelineState> pipeline = [device->mtlDevice
|
|
|
|
|
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
|
|
|
|
|
options:pipelineOptions
|
|
|
|
|
reflection:nullptr
|
|
|
|
|
error:&error];
|
|
|
|
|
completionHandler(pipeline, nullptr, error);
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
/* Block on load to ensure we continue with a valid kernel function */
|
|
|
|
|
id<MTLComputePipelineState> pipeline = [device->mtlDevice
|
|
|
|
|
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
|
|
|
|
|
options:pipelineOptions
|
|
|
|
|
reflection:nullptr
|
|
|
|
|
error:&error];
|
|
|
|
|
completionHandler(pipeline, nullptr, error);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const MetalKernelPipeline &MetalDeviceKernel::get_pso() const
|
|
|
|
|
{
|
|
|
|
|
if (pso[PSO_SPECIALISED].loaded) {
|
|
|
|
|
return pso[PSO_SPECIALISED];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
assert(pso[PSO_GENERIC].loaded);
|
|
|
|
|
return pso[PSO_GENERIC];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type)
|
|
|
|
|
{
|
|
|
|
|
bool any_error = false;
|
|
|
|
|
|
|
|
|
|
MD5Hash md5;
|
|
|
|
|
|
|
|
|
|
/* Build the function constant table */
|
|
|
|
|
MTLFunctionConstantValues *constant_values = nullptr;
|
|
|
|
|
if (kernel_type == PSO_SPECIALISED) {
|
|
|
|
|
constant_values = [MTLFunctionConstantValues new];
|
|
|
|
|
|
|
|
|
|
# define KERNEL_FILM(_type, name) \
|
|
|
|
|
[constant_values setConstantValue:&data.film.name \
|
|
|
|
|
type:get_MTLDataType_##_type() \
|
|
|
|
|
atIndex:KernelData_film_##name]; \
|
|
|
|
|
md5.append((uint8_t *)&data.film.name, sizeof(data.film.name));
|
|
|
|
|
|
|
|
|
|
# define KERNEL_BACKGROUND(_type, name) \
|
|
|
|
|
[constant_values setConstantValue:&data.background.name \
|
|
|
|
|
type:get_MTLDataType_##_type() \
|
|
|
|
|
atIndex:KernelData_background_##name]; \
|
|
|
|
|
md5.append((uint8_t *)&data.background.name, sizeof(data.background.name));
|
|
|
|
|
|
|
|
|
|
# define KERNEL_INTEGRATOR(_type, name) \
|
|
|
|
|
[constant_values setConstantValue:&data.integrator.name \
|
|
|
|
|
type:get_MTLDataType_##_type() \
|
|
|
|
|
atIndex:KernelData_integrator_##name]; \
|
|
|
|
|
md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name));
|
|
|
|
|
|
|
|
|
|
# define KERNEL_BVH(_type, name) \
|
|
|
|
|
[constant_values setConstantValue:&data.bvh.name \
|
|
|
|
|
type:get_MTLDataType_##_type() \
|
|
|
|
|
atIndex:KernelData_bvh_##name]; \
|
|
|
|
|
md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name));
|
|
|
|
|
|
|
|
|
|
/* METAL_WIP: populate constant_values based on KernelData */
|
|
|
|
|
assert(0);
|
|
|
|
|
/*
|
|
|
|
|
const KernelData &data = device->launch_params.data;
|
|
|
|
|
# include "kernel/types/background.h"
|
|
|
|
|
# include "kernel/types/bvh.h"
|
|
|
|
|
# include "kernel/types/film.h"
|
|
|
|
|
# include "kernel/types/integrator.h"
|
|
|
|
|
*/
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (device->use_metalrt) {
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
/* create the id<MTLFunction> for each intersection function */
|
|
|
|
|
const char *function_names[] = {
|
|
|
|
@@ -308,74 +356,89 @@ void MetalKernelPipeline::compile()
|
|
|
|
|
assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);
|
|
|
|
|
|
|
|
|
|
MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
|
|
|
|
|
if (kernel_type == PSO_SPECIALISED) {
|
|
|
|
|
desc.constantValues = constant_values;
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < METALRT_FUNC_NUM; i++) {
|
|
|
|
|
const char *function_name = function_names[i];
|
|
|
|
|
desc.name = [@(function_name) copy];
|
|
|
|
|
|
|
|
|
|
NSError *error = NULL;
|
|
|
|
|
rt_intersection_function[i] = [mtlLibrary newFunctionWithDescriptor:desc error:&error];
|
|
|
|
|
rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type]
|
|
|
|
|
newFunctionWithDescriptor:desc
|
|
|
|
|
error:&error];
|
|
|
|
|
|
|
|
|
|
if (rt_intersection_function[i] == nil) {
|
|
|
|
|
if (rt_intersection_funcs[kernel_type][i] == nil) {
|
|
|
|
|
NSString *err = [error localizedDescription];
|
|
|
|
|
string errors = [err UTF8String];
|
|
|
|
|
|
|
|
|
|
error_str = string_printf(
|
|
|
|
|
"Error getting intersection function \"%s\": %s", function_name, errors.c_str());
|
|
|
|
|
device->set_error(string_printf(
|
|
|
|
|
"Error getting intersection function \"%s\": %s", function_name, errors.c_str()));
|
|
|
|
|
any_error = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
rt_intersection_function[i].label = [@(function_name) copy];
|
|
|
|
|
rt_intersection_funcs[kernel_type][i].label = [@(function_name) copy];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
md5.append(device->source_used_for_compile[kernel_type]);
|
|
|
|
|
|
|
|
|
|
string hash = md5.get_hex();
|
|
|
|
|
if (loaded_md5[kernel_type] == hash) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!any_error) {
|
|
|
|
|
NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
|
|
|
|
|
NSArray *linked_functions = nil;
|
|
|
|
|
NSArray *function_list = nil;
|
|
|
|
|
|
|
|
|
|
if (use_metalrt) {
|
|
|
|
|
if (device->use_metalrt) {
|
|
|
|
|
id<MTLFunction> curve_intersect_default = nil;
|
|
|
|
|
id<MTLFunction> curve_intersect_shadow = nil;
|
|
|
|
|
id<MTLFunction> point_intersect_default = nil;
|
|
|
|
|
id<MTLFunction> point_intersect_shadow = nil;
|
|
|
|
|
if (metalrt_hair) {
|
|
|
|
|
if (device->kernel_features & KERNEL_FEATURE_HAIR) {
|
|
|
|
|
/* Add curve intersection programs. */
|
|
|
|
|
if (metalrt_hair_thick) {
|
|
|
|
|
if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) {
|
|
|
|
|
/* Slower programs for thick hair since that also slows down ribbons.
|
|
|
|
|
* Ideally this should not be needed. */
|
|
|
|
|
curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_ALL];
|
|
|
|
|
curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_ALL_SHADOW];
|
|
|
|
|
curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL];
|
|
|
|
|
curve_intersect_shadow =
|
|
|
|
|
rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW];
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON];
|
|
|
|
|
curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON_SHADOW];
|
|
|
|
|
curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON];
|
|
|
|
|
curve_intersect_shadow =
|
|
|
|
|
rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (metalrt_pointcloud) {
|
|
|
|
|
point_intersect_default = rt_intersection_function[METALRT_FUNC_POINT];
|
|
|
|
|
point_intersect_shadow = rt_intersection_function[METALRT_FUNC_POINT_SHADOW];
|
|
|
|
|
if (device->kernel_features & KERNEL_FEATURE_POINTCLOUD) {
|
|
|
|
|
point_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT];
|
|
|
|
|
point_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT_SHADOW];
|
|
|
|
|
}
|
|
|
|
|
table_functions[METALRT_TABLE_DEFAULT] = [NSArray
|
|
|
|
|
arrayWithObjects:rt_intersection_function[METALRT_FUNC_DEFAULT_TRI],
|
|
|
|
|
arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI],
|
|
|
|
|
curve_intersect_default ?
|
|
|
|
|
curve_intersect_default :
|
|
|
|
|
rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
|
|
|
|
|
rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
|
|
|
|
|
point_intersect_default ?
|
|
|
|
|
point_intersect_default :
|
|
|
|
|
rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
|
|
|
|
|
rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
|
|
|
|
|
nil];
|
|
|
|
|
table_functions[METALRT_TABLE_SHADOW] = [NSArray
|
|
|
|
|
arrayWithObjects:rt_intersection_function[METALRT_FUNC_SHADOW_TRI],
|
|
|
|
|
arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI],
|
|
|
|
|
curve_intersect_shadow ?
|
|
|
|
|
curve_intersect_shadow :
|
|
|
|
|
rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
|
|
|
|
|
rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
|
|
|
|
|
point_intersect_shadow ?
|
|
|
|
|
point_intersect_shadow :
|
|
|
|
|
rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
|
|
|
|
|
rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
|
|
|
|
|
nil];
|
|
|
|
|
table_functions[METALRT_TABLE_LOCAL] = [NSArray
|
|
|
|
|
arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI],
|
|
|
|
|
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
|
|
|
|
|
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
|
|
|
|
|
arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI],
|
|
|
|
|
rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
|
|
|
|
|
rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
|
|
|
|
|
nil];
|
|
|
|
|
|
|
|
|
|
NSMutableSet *unique_functions = [NSMutableSet
|
|
|
|
@@ -383,246 +446,86 @@ void MetalKernelPipeline::compile()
|
|
|
|
|
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
|
|
|
|
|
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
|
|
|
|
|
|
|
|
|
|
if (kernel_has_intersection(device_kernel)) {
|
|
|
|
|
linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
|
|
|
|
|
function_list = [[NSArray arrayWithArray:[unique_functions allObjects]]
|
|
|
|
|
sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
|
|
|
|
|
return [f1.label compare:f2.label];
|
|
|
|
|
}];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unique_functions = nil;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
MTLComputePipelineDescriptor *computePipelineStateDescriptor =
|
|
|
|
|
[[MTLComputePipelineDescriptor alloc] init];
|
|
|
|
|
metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n",
|
|
|
|
|
kernel_type_as_string(kernel_type));
|
|
|
|
|
|
|
|
|
|
computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
|
|
|
|
|
computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
|
|
|
|
|
computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
|
|
|
|
|
|
|
|
|
|
if (@available(macos 10.14, *)) {
|
|
|
|
|
computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
|
|
|
|
|
}
|
|
|
|
|
computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
|
|
|
|
|
|
|
|
|
|
computePipelineStateDescriptor.computeFunction = function;
|
|
|
|
|
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
/* Attach the additional functions to an MTLLinkedFunctions object */
|
|
|
|
|
if (linked_functions) {
|
|
|
|
|
computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
|
|
|
|
|
computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
|
|
|
|
|
}
|
|
|
|
|
computePipelineStateDescriptor.maxCallStackDepth = 1;
|
|
|
|
|
if (use_metalrt) {
|
|
|
|
|
computePipelineStateDescriptor.maxCallStackDepth = 8;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
|
|
|
|
|
|
|
|
|
|
bool use_binary_archive = true;
|
|
|
|
|
if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
|
|
|
|
|
use_binary_archive = (atoi(str) == 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
id<MTLBinaryArchive> archive = nil;
|
|
|
|
|
string metalbin_path;
|
|
|
|
|
string metalbin_name;
|
|
|
|
|
if (use_binary_archive) {
|
|
|
|
|
NSProcessInfo *processInfo = [NSProcessInfo processInfo];
|
|
|
|
|
string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
|
|
|
|
|
MD5Hash local_md5;
|
|
|
|
|
local_md5.append(source_md5);
|
|
|
|
|
local_md5.append(osVersion);
|
|
|
|
|
local_md5.append((uint8_t *)&this->threads_per_threadgroup,
|
|
|
|
|
sizeof(this->threads_per_threadgroup));
|
|
|
|
|
|
|
|
|
|
string options;
|
|
|
|
|
if (use_metalrt && kernel_has_intersection(device_kernel)) {
|
|
|
|
|
/* incorporate any MetalRT specialisations into the archive name */
|
|
|
|
|
options += string_printf(".hair_%d.hair_thick_%d.pointcloud_%d",
|
|
|
|
|
metalrt_hair ? 1 : 0,
|
|
|
|
|
metalrt_hair_thick ? 1 : 0,
|
|
|
|
|
metalrt_pointcloud ? 1 : 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Replace non-alphanumerical characters with underscores. */
|
|
|
|
|
string device_name = [mtlDevice.name UTF8String];
|
|
|
|
|
for (char &c : device_name) {
|
|
|
|
|
if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
|
|
|
|
|
c = '_';
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
metalbin_name = device_name;
|
|
|
|
|
metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel));
|
|
|
|
|
metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type));
|
|
|
|
|
metalbin_name = path_join(metalbin_name, local_md5.get_hex() + options + ".bin");
|
|
|
|
|
|
|
|
|
|
metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
|
|
|
|
|
path_create_directories(metalbin_path);
|
|
|
|
|
|
|
|
|
|
if (path_exists(metalbin_path) && use_binary_archive) {
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
|
|
|
|
|
archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
|
|
|
|
|
archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
|
|
|
|
|
[archiveDesc release];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__block bool creating_new_archive = false;
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
if (use_binary_archive) {
|
|
|
|
|
if (!archive) {
|
|
|
|
|
MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
|
|
|
|
|
archiveDesc.url = nil;
|
|
|
|
|
archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
|
|
|
|
|
creating_new_archive = true;
|
|
|
|
|
}
|
|
|
|
|
computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
|
|
|
|
|
pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
double starttime = time_dt();
|
|
|
|
|
|
|
|
|
|
MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
|
|
|
|
|
id<MTLComputePipelineState> computePipelineState,
|
|
|
|
|
MTLComputePipelineReflection *reflection,
|
|
|
|
|
NSError *error) {
|
|
|
|
|
bool recreate_archive = false;
|
|
|
|
|
if (computePipelineState == nil && archive) {
|
|
|
|
|
NSString *errStr = [error localizedDescription];
|
|
|
|
|
metal_printf(
|
|
|
|
|
"Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
|
|
|
|
|
"(error: %s)\n",
|
|
|
|
|
device_kernel_as_string((DeviceKernel)device_kernel),
|
|
|
|
|
errStr ? [errStr UTF8String] : "nil");
|
|
|
|
|
computePipelineState = [mtlDevice
|
|
|
|
|
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
|
|
|
|
|
options:MTLPipelineOptionNone
|
|
|
|
|
reflection:nullptr
|
|
|
|
|
error:&error];
|
|
|
|
|
recreate_archive = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
double duration = time_dt() - starttime;
|
|
|
|
|
|
|
|
|
|
if (computePipelineState == nil) {
|
|
|
|
|
NSString *errStr = [error localizedDescription];
|
|
|
|
|
error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
|
|
|
|
|
device_kernel_as_string((DeviceKernel)device_kernel));
|
|
|
|
|
error_str += (errStr ? [errStr UTF8String] : "nil");
|
|
|
|
|
metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
|
|
|
|
|
kernel_type_as_string(pso_type),
|
|
|
|
|
device_kernel,
|
|
|
|
|
device_kernel_as_string((DeviceKernel)device_kernel),
|
|
|
|
|
duration);
|
|
|
|
|
tbb::task_arena local_arena(max_mtlcompiler_threads);
|
|
|
|
|
local_arena.execute([&]() {
|
|
|
|
|
parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) {
|
|
|
|
|
/* skip megakernel */
|
|
|
|
|
if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
|
|
|
|
|
computePipelineState.threadExecutionWidth);
|
|
|
|
|
num_threads_per_block = std::max(num_threads_per_block,
|
|
|
|
|
(int)computePipelineState.threadExecutionWidth);
|
|
|
|
|
this->pipeline = computePipelineState;
|
|
|
|
|
this->num_threads_per_block = num_threads_per_block;
|
|
|
|
|
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
if (creating_new_archive || recreate_archive) {
|
|
|
|
|
if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
|
|
|
|
|
error:&error]) {
|
|
|
|
|
metal_printf("Failed to save binary archive, error:\n%s\n",
|
|
|
|
|
[[error localizedDescription] UTF8String]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Block on load to ensure we continue with a valid kernel function */
|
|
|
|
|
if (creating_new_archive) {
|
|
|
|
|
starttime = time_dt();
|
|
|
|
|
NSError *error;
|
|
|
|
|
if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
|
|
|
|
|
error:&error]) {
|
|
|
|
|
NSString *errStr = [error localizedDescription];
|
|
|
|
|
metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
id<MTLComputePipelineState> pipeline = [mtlDevice
|
|
|
|
|
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
|
|
|
|
|
options:pipelineOptions
|
|
|
|
|
reflection:nullptr
|
|
|
|
|
error:&error];
|
|
|
|
|
completionHandler(pipeline, nullptr, error);
|
|
|
|
|
|
|
|
|
|
this->loaded = true;
|
|
|
|
|
[computePipelineStateDescriptor release];
|
|
|
|
|
computePipelineStateDescriptor = nil;
|
|
|
|
|
|
|
|
|
|
if (use_metalrt && linked_functions) {
|
|
|
|
|
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
|
|
|
|
|
if (@available(macOS 11.0, *)) {
|
|
|
|
|
MTLIntersectionFunctionTableDescriptor *ift_desc =
|
|
|
|
|
[[MTLIntersectionFunctionTableDescriptor alloc] init];
|
|
|
|
|
ift_desc.functionCount = table_functions[table].count;
|
|
|
|
|
intersection_func_table[table] = [this->pipeline
|
|
|
|
|
newIntersectionFunctionTableWithDescriptor:ift_desc];
|
|
|
|
|
|
|
|
|
|
/* Finally write the function handles into this pipeline's table */
|
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
|
id<MTLFunctionHandle> handle = [pipeline
|
|
|
|
|
functionHandleWithFunction:table_functions[table][i]];
|
|
|
|
|
[intersection_func_table[table] setFunction:handle atIndex:i];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
/* Only specialize kernels where it can make an impact. */
|
|
|
|
|
if (kernel_type == PSO_SPECIALISED) {
|
|
|
|
|
if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
|
|
|
|
|
i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
double duration = time_dt() - starttime;
|
|
|
|
|
MetalDeviceKernel &kernel = kernels_[i];
|
|
|
|
|
|
|
|
|
|
if (!use_binary_archive) {
|
|
|
|
|
metal_printf("%16s | %2d | %-55s | %7.2fs\n",
|
|
|
|
|
kernel_type_as_string(pso_type),
|
|
|
|
|
int(device_kernel),
|
|
|
|
|
device_kernel_as_string(device_kernel),
|
|
|
|
|
duration);
|
|
|
|
|
const std::string function_name = std::string("cycles_metal_") +
|
|
|
|
|
device_kernel_as_string((DeviceKernel)i);
|
|
|
|
|
int threads_per_threadgroup = device->max_threads_per_threadgroup;
|
|
|
|
|
if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) {
|
|
|
|
|
/* Always use 512 for the sorting kernels */
|
|
|
|
|
threads_per_threadgroup = 512;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s\n",
|
|
|
|
|
kernel_type_as_string(pso_type),
|
|
|
|
|
device_kernel,
|
|
|
|
|
device_kernel_as_string((DeviceKernel)device_kernel),
|
|
|
|
|
duration,
|
|
|
|
|
creating_new_archive ? " new" : "load",
|
|
|
|
|
metalbin_name.c_str());
|
|
|
|
|
|
|
|
|
|
NSArray *kernel_function_list = nil;
|
|
|
|
|
|
|
|
|
|
if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
|
|
|
|
|
i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
|
|
|
|
|
i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
|
|
|
|
|
i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
|
|
|
|
|
i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
|
|
|
|
|
kernel_function_list = function_list;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
MetalKernelLoadDesc desc;
|
|
|
|
|
desc.pso_index = kernel_type;
|
|
|
|
|
desc.kernel_index = i;
|
|
|
|
|
desc.linked_functions = kernel_function_list;
|
|
|
|
|
desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT];
|
|
|
|
|
desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW];
|
|
|
|
|
desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL];
|
|
|
|
|
desc.constant_values = constant_values;
|
|
|
|
|
desc.threads_per_threadgroup = threads_per_threadgroup;
|
|
|
|
|
desc.function_name = function_name.c_str();
|
|
|
|
|
|
|
|
|
|
bool success = kernel.load(device, desc, md5);
|
|
|
|
|
|
|
|
|
|
any_error |= !success;
|
|
|
|
|
});
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool loaded = !any_error;
|
|
|
|
|
if (loaded) {
|
|
|
|
|
loaded_md5[kernel_type] = hash;
|
|
|
|
|
}
|
|
|
|
|
return loaded;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Request every device kernel from the shader cache belonging to `device`.
 *
 * Each index in [0, DEVICE_KERNEL_NUM) is handed to ShaderCache::load_kernel together with
 * `scene_specialized`. Afterwards wait_for_all() is called when `scene_specialized` is false or
 * the CYCLES_METAL_PROFILING environment variable is set. The first return statement yields
 * true unconditionally.
 *
 * NOTE(review): two signatures (`load` and `get`) precede this single body, and a second
 * `return` follows `return true;` where it cannot be reached. This looks like two revisions
 * of the file interleaved - confirm against the real source before relying on it. */
bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
|
|
|
|
|
const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const
|
|
|
|
|
{
|
|
|
|
|
/* The cache is looked up by the Metal device handle. */
auto shader_cache = get_shader_cache(device->mtlDevice);
|
|
|
|
|
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
|
|
|
|
|
shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
/* Waiting is skipped only for a scene-specialized request with profiling not enabled. */
if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
|
|
|
|
|
shader_cache->wait_for_all();
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
/* NOTE(review): unreachable - control has already returned on the statement above. */
return kernels_[(int)kernel];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Return the pipeline that the shader cache of `device` selects for `kernel`.
 *
 * The pointer returned by ShaderCache::get_best_pipeline is dereferenced without a check.
 * NOTE(review): `available()` compares that same pointer against nullptr, so presumably it can
 * be null - verify that callers test availability before calling this.
 *
 * NOTE(review): an `available(DeviceKernel)` signature sits between this declarator and the
 * body; it looks like a line from another revision of the file - confirm against the real
 * source. */
const MetalKernelPipeline &MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
|
|
|
|
|
DeviceKernel kernel) const
|
|
|
|
|
bool MetalDeviceKernels::available(DeviceKernel kernel) const
|
|
|
|
|
{
|
|
|
|
|
return *get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool MetalDeviceKernels::available(const MetalDevice *device, DeviceKernel kernel) const
|
|
|
|
|
{
|
|
|
|
|
return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device) != nullptr;
|
|
|
|
|
return kernels_[(int)kernel].get_pso().function != nil;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CCL_NAMESPACE_END
|
|
|
|
|