Fix sync delay between calls to vkQueueSubmit() on non-Apple-Silicon devices.

The [MTLDevice sampleTimestamps:gpuTimestamp:] function turns out to be
synchronized with other queue activities, and can block GPU execution
if it is called between MTLCommandBuffer submissions. On non-Apple-Silicon
devices, it was called before and after every vkQueueSubmit() submission,
to track the correlation between GPU and CPU timestamps, and was delaying
the start of GPU work on the next submission (on Apple Silicon, both
CPU & GPU timestamps are specified in nanoseconds, and the call was bypassed).

Move timestamp correlation from vkQueueSubmit() to
vkGetPhysicalDeviceProperties(), where it is used to update
VkPhysicalDeviceLimits::timestampPeriod on non-Apple-Silicon devices.

Delegate MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2*)
to MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties*), plus
minimize wasted effort if pNext is empty (unrelated).

Move the declaration of several MVKPhysicalDevice member structs to
potentially reduce member spacing (unrelated).
This commit is contained in:
Bill Hollings 2023-08-12 13:32:28 -04:00
parent d9b32e0c75
commit dd31587337
5 changed files with 26 additions and 47 deletions

View File

@ -363,6 +363,7 @@ Released 2022/02/07
- Fix issue where *MSL 2.3* only available on *Apple Silicon*, even on *macOS*.
- Fix memory leak of dummy `MTLTexture` in render subpasses that use no attachments.
- Fix Metal object retain-release errors in assignment operators.
- Fix sync delay between calls to `vkQueueSubmit()` on non-Apple-Silicon devices.
- Fix use of GPU counter sets on older versions of iOS running on the simulator.
- `mvk::getShaderOutputs()` in `SPRIVReflection.h` support flattening nested structures.
- Replaced ASL logging levels with `MVKConfigLogLevel`.

View File

@ -356,20 +356,6 @@ public:
return _metalFeatures.argumentBuffers && mvkConfig().useMetalArgumentBuffers != MVK_CONFIG_USE_METAL_ARGUMENT_BUFFERS_NEVER;
};
/**
* Returns the start timestamps of a timestamp correlation.
* The returned values should be later passed back to updateTimestampPeriod().
*/
void startTimestampCorrelation(MTLTimestamp& cpuStart, MTLTimestamp& gpuStart);
/**
* Updates the current value of VkPhysicalDeviceLimits::timestampPeriod, based on the
* correlation between the CPU time tickes and GPU time ticks, from the specified start
* values, to the current values. The cpuStart and gpuStart values should have been
* retrieved from a prior call to startTimestampCorrelation().
*/
void updateTimestampPeriod(MTLTimestamp cpuStart, MTLTimestamp gpuStart);
#pragma mark Construction
@ -416,6 +402,7 @@ protected:
void initExtensions();
void initCounterSets();
bool needsCounterSetRetained();
void updateTimestampsAndPeriod();
MVKArrayRef<MVKQueueFamily*> getQueueFamilies();
void initPipelineCacheUUID();
uint32_t getHighestGPUCapability();
@ -435,16 +422,18 @@ protected:
VkPhysicalDeviceMemoryProperties _memoryProperties;
MVKSmallVector<MVKQueueFamily*, kMVKQueueFamilyCount> _queueFamilies;
MVKPixelFormats _pixelFormats;
VkExternalMemoryProperties _hostPointerExternalMemoryProperties;
VkExternalMemoryProperties _mtlBufferExternalMemoryProperties;
VkExternalMemoryProperties _mtlTextureExternalMemoryProperties;
id<MTLCounterSet> _timestampMTLCounterSet;
MVKSemaphoreStyle _vkSemaphoreStyle;
MTLTimestamp _prevCPUTimestamp = 0;
MTLTimestamp _prevGPUTimestamp = 0;
uint32_t _allMemoryTypes;
uint32_t _hostVisibleMemoryTypes;
uint32_t _hostCoherentMemoryTypes;
uint32_t _privateMemoryTypes;
uint32_t _lazilyAllocatedMemoryTypes;
VkExternalMemoryProperties _hostPointerExternalMemoryProperties;
VkExternalMemoryProperties _mtlBufferExternalMemoryProperties;
VkExternalMemoryProperties _mtlTextureExternalMemoryProperties;
};

View File

@ -451,10 +451,17 @@ void MVKPhysicalDevice::getFeatures(VkPhysicalDeviceFeatures2* features) {
}
void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties* properties) {
updateTimestampsAndPeriod();
*properties = _properties;
}
void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) {
properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
getProperties(&properties->properties);
if ( !properties->pNext ) { return; }
uint32_t uintMax = std::numeric_limits<uint32_t>::max();
uint32_t maxSamplerCnt = getMaxSamplerCount();
bool isTier2 = supportsMetalArgumentBuffers() && (_metalFeatures.argumentBuffersTier >= MTLArgumentBuffersTier2);
@ -536,8 +543,6 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) {
supportedProps12.maxTimelineSemaphoreValueDifference = std::numeric_limits<uint64_t>::max();
supportedProps12.framebufferIntegerColorSampleCounts = _metalFeatures.supportedSampleCounts;
properties->sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
properties->properties = _properties;
for (auto* next = (VkBaseOutStructure*)properties->pNext; next; next = next->pNext) {
switch ((uint32_t)next->sType) {
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES: {
@ -1562,23 +1567,17 @@ VkResult MVKPhysicalDevice::getQueueFamilyProperties(uint32_t* pCount,
return rslt;
}
// Don't need to do this for Apple GPUs, where the GPU and CPU timestamps
// are the same, or if we're not using GPU timestamp counters.
void MVKPhysicalDevice::startTimestampCorrelation(MTLTimestamp& cpuStart, MTLTimestamp& gpuStart) {
if (_properties.vendorID == kAppleVendorId || !_timestampMTLCounterSet) { return; }
[_mtlDevice sampleTimestamps: &cpuStart gpuTimestamp: &gpuStart];
}
// Don't need to do this for Apple GPUs, where the GPU and CPU timestamps
// are the same, or if we're not using GPU timestamp counters.
void MVKPhysicalDevice::updateTimestampPeriod(MTLTimestamp cpuStart, MTLTimestamp gpuStart) {
if (_properties.vendorID == kAppleVendorId || !_timestampMTLCounterSet) { return; }
MTLTimestamp cpuEnd;
MTLTimestamp gpuEnd;
[_mtlDevice sampleTimestamps: &cpuEnd gpuTimestamp: &gpuEnd];
_properties.limits.timestampPeriod = (double)(cpuEnd - cpuStart) / (double)(gpuEnd - gpuStart);
// Mark both CPU and GPU timestamps, and if needed, update the timestamp period for this device.
// On Apple GPUs, the CPU & GPU timestamps are the same, and the timestamp period never changes.
void MVKPhysicalDevice::updateTimestampsAndPeriod() {
if (_properties.vendorID == kAppleVendorId) {
_prevGPUTimestamp = _prevCPUTimestamp = mvkGetElapsedNanoseconds();
} else {
MTLTimestamp earlierCPUTs = _prevCPUTimestamp;
MTLTimestamp earlierGPUTs = _prevGPUTimestamp;
[_mtlDevice sampleTimestamps: &_prevCPUTimestamp gpuTimestamp: &_prevGPUTimestamp];
_properties.limits.timestampPeriod = (double)(_prevCPUTimestamp - earlierCPUTs) / (double)(_prevGPUTimestamp - earlierGPUTs);
}
}
@ -2606,7 +2605,7 @@ void MVKPhysicalDevice::initLimits() {
_properties.limits.optimalBufferCopyRowPitchAlignment = 1;
_properties.limits.timestampComputeAndGraphics = VK_TRUE;
_properties.limits.timestampPeriod = _metalFeatures.counterSamplingPoints ? 1.0 : mvkGetTimestampPeriod();
_properties.limits.timestampPeriod = mvkGetTimestampPeriod(); // Will be 1.0 on Apple Silicon
_properties.limits.pointSizeRange[0] = 1;
switch (_properties.vendorID) {

View File

@ -255,11 +255,8 @@ public:
protected:
void submitCommandBuffers() override;
void finish() override;
MVKSmallVector<MVKCommandBuffer*, N> _cmdBuffers;
MTLTimestamp _cpuStart = 0;
MTLTimestamp _gpuStart = 0;
};

View File

@ -524,16 +524,9 @@ MVKQueueCommandBufferSubmission::~MVKQueueCommandBufferSubmission() {
template <size_t N>
void MVKQueueFullCommandBufferSubmission<N>::submitCommandBuffers() {
_queue->getPhysicalDevice()->startTimestampCorrelation(_cpuStart, _gpuStart);
for (auto& cb : _cmdBuffers) { cb->submit(this, &_encodingContext); }
}
template <size_t N>
void MVKQueueFullCommandBufferSubmission<N>::finish() {
_queue->getPhysicalDevice()->updateTimestampPeriod(_cpuStart, _gpuStart);
MVKQueueCommandBufferSubmission::finish();
}
#pragma mark -
#pragma mark MVKQueuePresentSurfaceSubmission