From dd31587337b4b7017b72cd77cce54278fa7f4047 Mon Sep 17 00:00:00 2001 From: Bill Hollings Date: Sat, 12 Aug 2023 13:32:28 -0400 Subject: [PATCH] Fix sync delay between calls to vkQueueSubmit() on non-Apple-Silicon devices. The [MTLDevice sampleTimestamps:gpuTimestamp:] function turns out to be synchronized with other queue activities, and can block GPU execution if it is called between MTLCommandBuffer submissions. On non-Apple-Silicon devices, it was called before and after every vkQueueSubmit() submission, to track the correlation between GPU and CPU timestamps, and was delaying the start of GPU work on the next submission (on Apple Silicon, both CPU & GPU timestamps are specified in nanoseconds, and the call was bypassed). Move timestamp correlation from vkQueueSubmit() to vkGetPhysicalDeviceProperties(), where it is used to update VkPhysicalDeviceLimits::timestampPeriod on non-Apple-Silicon devices. Delegate MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2*) to MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties*), plus minimize wasted effort if pNext is empty (unrelated). Move the declaration of several MVKPhysicalDevice member structs to potentially reduce member spacing (unrelated). --- Docs/Whats_New.md | 1 + MoltenVK/MoltenVK/GPUObjects/MVKDevice.h | 23 ++++--------- MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm | 39 +++++++++++------------ MoltenVK/MoltenVK/GPUObjects/MVKQueue.h | 3 -- MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm | 7 ---- 5 files changed, 26 insertions(+), 47 deletions(-) diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md index 9e0fc42a..d473ae91 100644 --- a/Docs/Whats_New.md +++ b/Docs/Whats_New.md @@ -363,6 +363,7 @@ Released 2022/02/07 - Fix issue where *MSL 2.3* only available on *Apple Silicon*, even on *macOS*. - Fix memory leak of dummy `MTLTexture` in render subpasses that use no attachments. - Fix Metal object retain-release errors in assignment operators. +- Fix sync delay between calls to `vkQueueSubmit()` on non-Apple-Silicon devices. - Fix use of GPU counter sets on older versions of iOS running on the simulator. - `mvk::getShaderOutputs()` in `SPRIVReflection.h` support flattening nested structures. - Replaced ASL logging levels with `MVKConfigLogLevel`. diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h index 5186fe4a..450fad66 100644 --- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h +++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h @@ -356,20 +356,6 @@ public: return _metalFeatures.argumentBuffers && mvkConfig().useMetalArgumentBuffers != MVK_CONFIG_USE_METAL_ARGUMENT_BUFFERS_NEVER; }; - /** - * Returns the start timestamps of a timestamp correlation. - * The returned values should be later passed back to updateTimestampPeriod(). - */ - void startTimestampCorrelation(MTLTimestamp& cpuStart, MTLTimestamp& gpuStart); - - /** - * Updates the current value of VkPhysicalDeviceLimits::timestampPeriod, based on the - * correlation between the CPU time tickes and GPU time ticks, from the specified start - * values, to the current values. The cpuStart and gpuStart values should have been - * retrieved from a prior call to startTimestampCorrelation(). - */ - void updateTimestampPeriod(MTLTimestamp cpuStart, MTLTimestamp gpuStart); - #pragma mark Construction @@ -416,6 +402,7 @@ protected: void initExtensions(); void initCounterSets(); bool needsCounterSetRetained(); + void updateTimestampsAndPeriod(); MVKArrayRef getQueueFamilies(); void initPipelineCacheUUID(); uint32_t getHighestGPUCapability(); @@ -435,16 +422,18 @@ protected: VkPhysicalDeviceMemoryProperties _memoryProperties; MVKSmallVector _queueFamilies; MVKPixelFormats _pixelFormats; + VkExternalMemoryProperties _hostPointerExternalMemoryProperties; + VkExternalMemoryProperties _mtlBufferExternalMemoryProperties; + VkExternalMemoryProperties _mtlTextureExternalMemoryProperties; id _timestampMTLCounterSet; MVKSemaphoreStyle _vkSemaphoreStyle; + MTLTimestamp _prevCPUTimestamp = 0; + MTLTimestamp _prevGPUTimestamp = 0; uint32_t _allMemoryTypes; uint32_t _hostVisibleMemoryTypes; uint32_t _hostCoherentMemoryTypes; uint32_t _privateMemoryTypes; uint32_t _lazilyAllocatedMemoryTypes; - VkExternalMemoryProperties _hostPointerExternalMemoryProperties; - VkExternalMemoryProperties _mtlBufferExternalMemoryProperties; - VkExternalMemoryProperties _mtlTextureExternalMemoryProperties; }; diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm index 3c77519d..59f90df3 100644 --- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm +++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm @@ -451,10 +451,17 @@ void MVKPhysicalDevice::getFeatures(VkPhysicalDeviceFeatures2* features) { } void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties* properties) { + updateTimestampsAndPeriod(); *properties = _properties; } void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) { + + properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + getProperties(&properties->properties); + + if ( !properties->pNext ) { return; } + uint32_t uintMax = std::numeric_limits::max(); uint32_t maxSamplerCnt = getMaxSamplerCount(); bool isTier2 = supportsMetalArgumentBuffers() && (_metalFeatures.argumentBuffersTier >= MTLArgumentBuffersTier2); @@ -536,8 +543,6 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) { supportedProps12.maxTimelineSemaphoreValueDifference = std::numeric_limits::max(); supportedProps12.framebufferIntegerColorSampleCounts = _metalFeatures.supportedSampleCounts; - properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - properties->properties = _properties; for (auto* next = (VkBaseOutStructure*)properties->pNext; next; next = next->pNext) { switch ((uint32_t)next->sType) { case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES: { @@ -1562,23 +1567,17 @@ VkResult MVKPhysicalDevice::getQueueFamilyProperties(uint32_t* pCount, return rslt; } -// Don't need to do this for Apple GPUs, where the GPU and CPU timestamps -// are the same, or if we're not using GPU timestamp counters. -void MVKPhysicalDevice::startTimestampCorrelation(MTLTimestamp& cpuStart, MTLTimestamp& gpuStart) { - if (_properties.vendorID == kAppleVendorId || !_timestampMTLCounterSet) { return; } - [_mtlDevice sampleTimestamps: &cpuStart gpuTimestamp: &gpuStart]; -} - -// Don't need to do this for Apple GPUs, where the GPU and CPU timestamps -// are the same, or if we're not using GPU timestamp counters. -void MVKPhysicalDevice::updateTimestampPeriod(MTLTimestamp cpuStart, MTLTimestamp gpuStart) { - if (_properties.vendorID == kAppleVendorId || !_timestampMTLCounterSet) { return; } - - MTLTimestamp cpuEnd; - MTLTimestamp gpuEnd; - [_mtlDevice sampleTimestamps: &cpuEnd gpuTimestamp: &gpuEnd]; - - _properties.limits.timestampPeriod = (double)(cpuEnd - cpuStart) / (double)(gpuEnd - gpuStart); +// Mark both CPU and GPU timestamps, and if needed, update the timestamp period for this device. +// On Apple GPUs, the CPU & GPU timestamps are the same, and the timestamp period never changes. +void MVKPhysicalDevice::updateTimestampsAndPeriod() { + if (_properties.vendorID == kAppleVendorId) { + _prevGPUTimestamp = _prevCPUTimestamp = mvkGetElapsedNanoseconds(); + } else { + MTLTimestamp earlierCPUTs = _prevCPUTimestamp; + MTLTimestamp earlierGPUTs = _prevGPUTimestamp; + [_mtlDevice sampleTimestamps: &_prevCPUTimestamp gpuTimestamp: &_prevGPUTimestamp]; + _properties.limits.timestampPeriod = (double)(_prevCPUTimestamp - earlierCPUTs) / (double)(_prevGPUTimestamp - earlierGPUTs); + } } @@ -2606,7 +2605,7 @@ void MVKPhysicalDevice::initLimits() { _properties.limits.optimalBufferCopyRowPitchAlignment = 1; _properties.limits.timestampComputeAndGraphics = VK_TRUE; - _properties.limits.timestampPeriod = _metalFeatures.counterSamplingPoints ? 1.0 : mvkGetTimestampPeriod(); + _properties.limits.timestampPeriod = mvkGetTimestampPeriod(); // Will be 1.0 on Apple Silicon _properties.limits.pointSizeRange[0] = 1; switch (_properties.vendorID) { diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h index 96d77bc1..bcefd2f3 100644 --- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h +++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h @@ -255,11 +255,8 @@ public: protected: void submitCommandBuffers() override; - void finish() override; MVKSmallVector _cmdBuffers; - MTLTimestamp _cpuStart = 0; - MTLTimestamp _gpuStart = 0; }; diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm index 96b34cc3..0ad14307 100644 --- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm +++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm @@ -524,16 +524,9 @@ MVKQueueCommandBufferSubmission::~MVKQueueCommandBufferSubmission() { template void MVKQueueFullCommandBufferSubmission::submitCommandBuffers() { - _queue->getPhysicalDevice()->startTimestampCorrelation(_cpuStart, _gpuStart); for (auto& cb : _cmdBuffers) { cb->submit(this, &_encodingContext); } } -template -void MVKQueueFullCommandBufferSubmission::finish() { - _queue->getPhysicalDevice()->updateTimestampPeriod(_cpuStart, _gpuStart); - MVKQueueCommandBufferSubmission::finish(); -} - #pragma mark - #pragma mark MVKQueuePresentSurfaceSubmission