diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md index 41781fc7..5dfa1805 100644 --- a/Docs/Whats_New.md +++ b/Docs/Whats_New.md @@ -27,6 +27,9 @@ Released TBD - Rename `kMVKShaderStageMax` to `kMVKShaderStageCount`. - Fix crash when requesting `MTLCommandBuffer` logs in runtime debug mode on older OS versions. - Fix synchronization issue with locking `MTLArgumentEncoder` for Metal Argument Buffers. +- Fix race condition on submission fence during device loss. +- On command buffer submission failure, if `MVKConfiguration::resumeLostDevice` enabled, do not release + waits on `VkDevice`, and do not return `VK_ERROR_DEVICE_LOST`, unless `VkPhysicalDevice` is also lost. - Fix inconsistent handling of linear attachment decisions on Apple Silicon. - Protect against crash when retrieving `MTLTexture` when `VkImage` has no `VkDeviceMemory` bound. - Adjust some `VkPhysicalDeviceLimits` values for Vulkan and Metal compliance. diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h index a78e93cc..7a088363 100644 --- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h +++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h @@ -780,24 +780,24 @@ typedef struct { * Controls whether MoltenVK should treat a lost VkDevice as resumable, unless the * corresponding VkPhysicalDevice has also been lost. The VK_ERROR_DEVICE_LOST error has * a broad definitional range, and can mean anything from a GPU hiccup on the current - * command buffer submission, to a phyically removed GPU. In the case where this error does + * command buffer submission, to a phyisically removed GPU. In the case where this error does * not impact the VkPhysicalDevice, Vulkan requires that the app destroy and re-create a new * VkDevice. However, not all apps (including CTS) respect that requirement, leading to what * might be a transient command submission failure causing an unexpected catastophic app failure. - * If this setting is enabled, in the case of a VK_ERROR_DEVICE_LOST error that does not - * impact the VkPhysicalDevice, MoltenVK will remove the error condition on the VkDevice after - * the current queue submission is finished, allowing the VkDevice to continue to be used. - * If this setting is disabled, MoltenVK will maintain the VK_ERROR_DEVICE_LOST error condition - * on the VkDevice, and subsequent use of that VkDevice will be reduced or prohibited. * - * The value of this parameter should be changed before creating a VkDevice - * that will use it, for the change to take effect. + * If this setting is enabled, in the case of a VK_ERROR_DEVICE_LOST error that does NOT impact + * the VkPhysicalDevice, MoltenVK will log the error, but will not mark the VkDevice as lost, + * allowing the VkDevice to continue to be used. If this setting is disabled, MoltenVK will + * mark the VkDevice as lost, and subsequent use of that VkDevice will be reduced or prohibited. + * + * The value of this parameter may be changed at any time during application runtime, + * and the changed value will affect the error behavior of subsequent command submissions. * * The initial value or this parameter is set by the * MVK_CONFIG_RESUME_LOST_DEVICE * runtime environment variable or MoltenVK compile-time build setting. - * If neither is set, this setting is disabled by default, and MoltenVK will not - * resume a VkDevice that enters the VK_ERROR_DEVICE_LOST error state. + * If neither is set, this setting is disabled by default, and MoltenVK + * will mark the VkDevice as lost when a command submission failure occurs. */ VkBool32 resumeLostDevice; diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h index d2afd9ee..b1bd3577 100644 --- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h +++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h @@ -447,8 +447,8 @@ public: /** Block the current thread until all queues in this device are idle. */ VkResult waitIdle(); - /** Mark this device as lost. Releases all waits for this device. */ - VkResult markLost(); + /** Mark this device (and optionally the physical device) as lost. Releases all waits for this device. */ + VkResult markLost(bool alsoMarkPhysicalDevice = false); /** Returns whether or not the given descriptor set layout is supported. */ void getDescriptorSetLayoutSupport(const VkDescriptorSetLayoutCreateInfo* pCreateInfo, diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm index b46eea62..e70de7e3 100644 --- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm +++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm @@ -2861,9 +2861,12 @@ VkResult MVKDevice::waitIdle() { return VK_SUCCESS; } -VkResult MVKDevice::markLost() { +VkResult MVKDevice::markLost(bool alsoMarkPhysicalDevice) { lock_guard lock(_sem4Lock); + setConfigurationResult(VK_ERROR_DEVICE_LOST); + if (alsoMarkPhysicalDevice) { _physicalDevice->setConfigurationResult(VK_ERROR_DEVICE_LOST); } + for (auto* sem4 : _awaitingSemaphores) { sem4->release(); } @@ -2874,7 +2877,8 @@ VkResult MVKDevice::markLost() { } _awaitingSemaphores.clear(); _awaitingTimelineSem4s.clear(); - return VK_ERROR_DEVICE_LOST; + + return getConfigurationResult(); } void MVKDevice::getDescriptorSetLayoutSupport(const VkDescriptorSetLayoutCreateInfo* pCreateInfo, diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm index 7464ae1a..f9556afb 100644 --- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm +++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm @@ -330,19 +330,21 @@ void MVKQueueCommandBufferSubmission::commitActiveMTLCommandBuffer(bool signalCo uint64_t startTime = mvkDev->getPerformanceTimestamp(); [mtlCmdBuff addCompletedHandler: ^(id mtlCB) { if (mtlCB.status == MTLCommandBufferStatusError) { - getVulkanAPIObject()->reportError(mvkDev->markLost(), "Command buffer %p \"%s\" execution failed (code %li): %s", mtlCB, mtlCB.label ? mtlCB.label.UTF8String : "", mtlCB.error.code, mtlCB.error.localizedDescription.UTF8String); - // Some errors indicate we lost the physical device as well. + // If a command buffer error has occurred, report the error. If the error affects + // the physical device, always mark both the device and physical device as lost. + // If the error is local to this command buffer, optionally mark the device (but not the + // physical device) as lost, depending on the value of MVKConfiguration::resumeLostDevice. + getVulkanAPIObject()->reportError(VK_ERROR_DEVICE_LOST, "Command buffer %p \"%s\" execution failed (code %li): %s", mtlCB, mtlCB.label ? mtlCB.label.UTF8String : "", mtlCB.error.code, mtlCB.error.localizedDescription.UTF8String); switch (mtlCB.error.code) { case MTLCommandBufferErrorBlacklisted: - // XXX This may also be used for command buffers executed in the background without the right entitlement. - case MTLCommandBufferErrorNotPermitted: + case MTLCommandBufferErrorNotPermitted: // May also be used for command buffers executed in the background without the right entitlement. #if MVK_MACOS && !MVK_MACCAT case MTLCommandBufferErrorDeviceRemoved: #endif - mvkDev->getPhysicalDevice()->setConfigurationResult(VK_ERROR_DEVICE_LOST); + mvkDev->markLost(true); break; default: - if (mvkConfig().resumeLostDevice) { mvkDev->clearConfigurationResult(); } + if ( !mvkConfig().resumeLostDevice ) { mvkDev->markLost(); } break; } #if MVK_XCODE_12