Merge pull request #1387 from billhollings/occlusion-query-fixes-for-M1

Occlusion query fixes for M1
2021-06-28 08:10:17 -04:00 · 2021-06-28 08:10:17 -04:00 · e3cf071ace
commit e3cf071ace
parent 1421c1e9a4 45ffcc9564
7 changed files with 76 additions and 41 deletions
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@ -33,6 +33,8 @@ Released TBD
 - Fix synchronization issue with locking `MTLArgumentEncoder` for Metal Argument Buffers.
 - Fix race condition on submission fence during device loss.
 - Fix crash using memoryless storage for input attachments on Apple Silicon.
+- Fix issue where M1 GPU does not support reusing Metal visibility buffer offsets
+  across separate render encoders within a single Metal command buffer (Vulkan submit).
 - On command buffer submission failure, if `MVKConfiguration::resumeLostDevice` enabled,  do not release 
  waits on `VkDevice`, and do not return `VK_ERROR_DEVICE_LOST`, unless `VkPhysicalDevice` is also lost.
 - Fix inconsistent handling of linear attachment decisions on Apple Silicon.
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
@ -46,6 +46,16 @@ class MVKComputePipeline;
 typedef uint64_t MVKMTLCommandBufferID;


+#pragma mark -
+#pragma mark MVKCommandEncodingContext
+
+/** Context for tracking information across multiple encodings. */
+typedef struct MVKCommandEncodingContext {
+	NSUInteger mtlVisibilityResultOffset = 0;
+	const MVKMTLBufferAllocation* visibilityResultBuffer = nullptr;
+} MVKCommandEncodingContext;
+
+
 #pragma mark -
 #pragma mark MVKCommandBuffer

@ -83,7 +93,7 @@ public:
 	inline MVKCommandPool* getCommandPool() { return _commandPool; }

 	/** Submit the commands in this buffer as part of the queue submission. */
-	void submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit);
+	void submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit, MVKCommandEncodingContext* pEncodingContext);

    /** Returns whether this command buffer can be submitted to a queue more than once. */
    inline bool getIsReusable() { return _isReusable; }
@ -264,7 +274,7 @@ public:
 	MVKVulkanAPIObject* getVulkanAPIObject() override { return _cmdBuffer->getVulkanAPIObject(); };

 	/** Encode commands from the command buffer onto the Metal command buffer. */
-	void encode(id<MTLCommandBuffer> mtlCmdBuff);
+	void encode(id<MTLCommandBuffer> mtlCmdBuff, MVKCommandEncodingContext* pEncodingContext);

 	/** Encode commands from the specified secondary command buffer onto the Metal command buffer. */
 	void encodeSecondary(MVKCommandBuffer* secondaryCmdBuffer);
@ -407,6 +417,9 @@ public:

 #pragma mark Dynamic encoding state accessed directly

+	/** Context for tracking information across multiple encodings. */
+	MVKCommandEncodingContext* _pEncodingContext;
+
    /** A reference to the Metal features supported by the device. */
    const MVKPhysicalDeviceMetalFeatures* _pDeviceMetalFeatures;

@ -428,9 +441,6 @@ public:
 	/** The current Metal render encoder. */
 	id<MTLRenderCommandEncoder> _mtlRenderEncoder;

-	/** The buffer used to hold occlusion query results in a render pass. */
-	const MVKMTLBufferAllocation* _visibilityResultMTLBuffer;
-
    /** Tracks the current graphics pipeline bound to the encoder. */
    MVKPipelineCommandEncoderState _graphicsPipelineState;

--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
@ -105,7 +105,8 @@ void MVKCommandBuffer::addCommand(MVKCommand* command) {
    _commandCount++;
 }

-void MVKCommandBuffer::submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit) {
+void MVKCommandBuffer::submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit,
+							  MVKCommandEncodingContext* pEncodingContext) {
 	if ( !canExecute() ) { return; }

 	if (_prefilledMTLCmdBuffer) {
@ -113,7 +114,7 @@ void MVKCommandBuffer::submit(MVKQueueCommandBufferSubmission* cmdBuffSubmit) {
 		clearPrefilledMTLCommandBuffer();
 	} else {
 		MVKCommandEncoder encoder(this);
-		encoder.encode(cmdBuffSubmit->getActiveMTLCommandBuffer());
+		encoder.encode(cmdBuffSubmit->getActiveMTLCommandBuffer(), pEncodingContext);
 	}

 	if ( !_supportsConcurrentExecution ) { _isExecutingNonConcurrently.clear(); }
@ -150,8 +151,9 @@ void MVKCommandBuffer::prefill() {
 		uint32_t qIdx = 0;
 		_prefilledMTLCmdBuffer = _commandPool->newMTLCommandBuffer(qIdx);	// retain

+		MVKCommandEncodingContext encodingContext;
 		MVKCommandEncoder encoder(this);
-		encoder.encode(_prefilledMTLCmdBuffer);
+		encoder.encode(_prefilledMTLCmdBuffer, &encodingContext);

 		// Once encoded onto Metal, if this command buffer is not reusable, we don't need the
 		// MVKCommand instances anymore, so release them in order to reduce memory pressure.
@ -246,13 +248,15 @@ MVKRenderSubpass* MVKCommandBuffer::getLastMultiviewSubpass() {
 #pragma mark -
 #pragma mark MVKCommandEncoder

-void MVKCommandEncoder::encode(id<MTLCommandBuffer> mtlCmdBuff) {
+void MVKCommandEncoder::encode(id<MTLCommandBuffer> mtlCmdBuff,
+							   MVKCommandEncodingContext* pEncodingContext) {
 	_renderPass = nullptr;
 	_subpassContents = VK_SUBPASS_CONTENTS_INLINE;
 	_renderSubpassIndex = 0;
 	_multiviewPassIndex = 0;
 	_canUseLayeredRendering = false;

+	_pEncodingContext = pEncodingContext;
 	_mtlCmdBuffer = mtlCmdBuff;		// not retained

 	setLabelIfNotNil(_mtlCmdBuffer, _cmdBuffer->_debugName);
@ -345,12 +349,12 @@ void MVKCommandEncoder::beginMetalRenderPass(bool loadOverride) {
 												  _clearValues.contents(),
 												  _isRenderingEntireAttachment,
 												  loadOverride);
-    if (_cmdBuffer->_needsVisibilityResultMTLBuffer) {
-        if (!_visibilityResultMTLBuffer) {
-            _visibilityResultMTLBuffer = getTempMTLBuffer(_pDeviceMetalFeatures->maxQueryBufferSize, true, true);
-        }
-        mtlRPDesc.visibilityResultBuffer = _visibilityResultMTLBuffer->_mtlBuffer;
-    }
+	if (_cmdBuffer->_needsVisibilityResultMTLBuffer) {
+		if ( !_pEncodingContext->visibilityResultBuffer ) {
+			_pEncodingContext->visibilityResultBuffer = getTempMTLBuffer(_pDeviceMetalFeatures->maxQueryBufferSize, true, true);
+		}
+		mtlRPDesc.visibilityResultBuffer = _pEncodingContext->visibilityResultBuffer->_mtlBuffer;
+	}

 	VkExtent2D fbExtent = _framebufferExtent;
    mtlRPDesc.renderTargetWidthMVK = max(min(_renderArea.offset.x + _renderArea.extent.width, fbExtent.width), 1u);
@ -770,7 +774,6 @@ void MVKCommandEncoder::finishQueries() {

 MVKCommandEncoder::MVKCommandEncoder(MVKCommandBuffer* cmdBuffer) : MVKBaseDeviceObject(cmdBuffer->getDevice()),
        _cmdBuffer(cmdBuffer),
-        _visibilityResultMTLBuffer(nil),
        _graphicsPipelineState(this),
        _computePipelineState(this),
        _viewportState(this),
@ -799,6 +802,7 @@ MVKCommandEncoder::MVKCommandEncoder(MVKCommandBuffer* cmdBuffer) : MVKBaseDevic
 			_mtlComputeEncoderUse = kMVKCommandUseNone;
            _mtlBlitEncoder = nil;
            _mtlBlitEncoderUse = kMVKCommandUseNone;
+			_pEncodingContext = nullptr;
 }


--- a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.h
@ -614,9 +614,18 @@ public:
 protected:
    void encodeImpl(uint32_t) override;

+	typedef struct OcclusionQueryLocation {
+		MVKOcclusionQueryPool* queryPool = nullptr;
+		uint32_t query = 0;
+		NSUInteger visibilityBufferOffset = 0;
+
+		OcclusionQueryLocation(MVKOcclusionQueryPool* qPool, uint32_t qIdx, NSUInteger vbOfst)
+		: queryPool(qPool), query(qIdx), visibilityBufferOffset(vbOfst) {}
+
+	} OcclusionQueryLocation;
+
+	MVKSmallVector<OcclusionQueryLocation> _mtlRenderPassQueries;
    MTLVisibilityResultMode _mtlVisibilityResultMode = MTLVisibilityResultModeDisabled;
-    NSUInteger _mtlVisibilityResultOffset = 0;
-	MVKSmallVector<std::pair<MVKQuerySpec, NSUInteger>> _mtlRenderPassQueries;
 };


--- a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
@ -1073,20 +1073,19 @@ void MVKComputeResourcesCommandEncoderState::encodeArgumentBufferResourceUsage(M
 #pragma mark MVKOcclusionQueryCommandEncoderState

 void MVKOcclusionQueryCommandEncoderState::endMetalRenderPass() {
+	const MVKMTLBufferAllocation* vizBuff = _cmdEncoder->_pEncodingContext->visibilityResultBuffer;
+    if ( !vizBuff || _mtlRenderPassQueries.empty() ) { return; }  // Nothing to do.

-    if (_mtlRenderPassQueries.empty()) { return; }  // Nothing to do.
-
-    id<MTLComputePipelineState> mtlAccumState = _cmdEncoder->getCommandEncodingPool()->getAccumulateOcclusionQueryResultsMTLComputePipelineState();
+	id<MTLComputePipelineState> mtlAccumState = _cmdEncoder->getCommandEncodingPool()->getAccumulateOcclusionQueryResultsMTLComputePipelineState();
    id<MTLComputeCommandEncoder> mtlAccumEncoder = _cmdEncoder->getMTLComputeEncoder(kMVKCommandUseAccumOcclusionQuery);
    [mtlAccumEncoder setComputePipelineState: mtlAccumState];
-    for (auto& query : _mtlRenderPassQueries) {
+    for (auto& qryLoc : _mtlRenderPassQueries) {
        // Accumulate the current results to the query pool's buffer.
-        auto* pQueryPool = (MVKOcclusionQueryPool*)query.first.queryPool;
-        [mtlAccumEncoder setBuffer: pQueryPool->getVisibilityResultMTLBuffer()
-                            offset: pQueryPool->getVisibilityResultOffset(query.first.query)
+        [mtlAccumEncoder setBuffer: qryLoc.queryPool->getVisibilityResultMTLBuffer()
+                            offset: qryLoc.queryPool->getVisibilityResultOffset(qryLoc.query)
                           atIndex: 0];
-        [mtlAccumEncoder setBuffer: _cmdEncoder->_visibilityResultMTLBuffer->_mtlBuffer
-                            offset: query.second
+        [mtlAccumEncoder setBuffer: vizBuff->_mtlBuffer
+                            offset: vizBuff->_offset + qryLoc.visibilityBufferOffset
                           atIndex: 1];
        [mtlAccumEncoder dispatchThreadgroups: MTLSizeMake(1, 1, 1)
                        threadsPerThreadgroup: MTLSizeMake(1, 1, 1)];
@ -1095,24 +1094,28 @@ void MVKOcclusionQueryCommandEncoderState::endMetalRenderPass() {
    _mtlRenderPassQueries.clear();
 }

+// The Metal visibility buffer has a finite size, and on some Metal platforms (looking at you M1),
+// query offsets cannnot be reused with the same MTLCommandBuffer. If enough occlusion queries are
+// begun within a single MTLCommandBuffer, it may exhaust the visibility buffer. If that occurs,
+// report an error and disable further visibility tracking for the remainder of the MTLCommandBuffer.
+// In most cases, a MTLCommandBuffer corresponds to a Vulkan command submit (VkSubmitInfo),
+// and so the error text is framed in terms of the Vulkan submit.
 void MVKOcclusionQueryCommandEncoderState::beginOcclusionQuery(MVKOcclusionQueryPool* pQueryPool, uint32_t query, VkQueryControlFlags flags) {
-
-    MVKQuerySpec querySpec;
-    querySpec.set(pQueryPool, query);
-    NSUInteger offset = _mtlRenderPassQueries.empty() ? 0 : _mtlVisibilityResultOffset + kMVKQuerySlotSizeInBytes;
-    NSUInteger maxOffset = _cmdEncoder->_pDeviceMetalFeatures->maxQueryBufferSize - kMVKQuerySlotSizeInBytes;
-    offset = min(offset, maxOffset);
-    _mtlRenderPassQueries.push_back(make_pair(querySpec, offset));
-
-    bool shouldCount = _cmdEncoder->_pDeviceFeatures->occlusionQueryPrecise && mvkAreAllFlagsEnabled(flags, VK_QUERY_CONTROL_PRECISE_BIT);
-    _mtlVisibilityResultMode = shouldCount ? MTLVisibilityResultModeCounting : MTLVisibilityResultModeBoolean;
-    _mtlVisibilityResultOffset = offset;
-
+	if (_cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset + kMVKQuerySlotSizeInBytes <= _cmdEncoder->_pDeviceMetalFeatures->maxQueryBufferSize) {
+		bool shouldCount = _cmdEncoder->_pDeviceFeatures->occlusionQueryPrecise && mvkAreAllFlagsEnabled(flags, VK_QUERY_CONTROL_PRECISE_BIT);
+		_mtlVisibilityResultMode = shouldCount ? MTLVisibilityResultModeCounting : MTLVisibilityResultModeBoolean;
+		_mtlRenderPassQueries.emplace_back(pQueryPool, query, _cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset);
+	} else {
+		reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "vkCmdBeginQuery(): The maximum number of queries in a single Vulkan command submission is %llu.", _cmdEncoder->_pDeviceMetalFeatures->maxQueryBufferSize / kMVKQuerySlotSizeInBytes);
+		_mtlVisibilityResultMode = MTLVisibilityResultModeDisabled;
+		_cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset -= kMVKQuerySlotSizeInBytes;
+	}
    markDirty();
 }

 void MVKOcclusionQueryCommandEncoderState::endOcclusionQuery(MVKOcclusionQueryPool* pQueryPool, uint32_t query) {
 	_mtlVisibilityResultMode = MTLVisibilityResultModeDisabled;
+	_cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset += kMVKQuerySlotSizeInBytes;
 	markDirty();
 }

@ -1120,5 +1123,5 @@ void MVKOcclusionQueryCommandEncoderState::encodeImpl(uint32_t stage) {
 	if (stage != kMVKGraphicsStageRasterization) { return; }

 	[_cmdEncoder->_mtlRenderEncoder setVisibilityResultMode: _mtlVisibilityResultMode
-													 offset: _mtlVisibilityResultOffset];
+													 offset: _cmdEncoder->_pEncodingContext->mtlVisibilityResultOffset];
 }
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
@ -237,7 +237,7 @@ public:
 		}

 protected:
-	void submitCommandBuffers() override { for (auto& cb : _cmdBuffers) { cb->submit(this); } }
+	void submitCommandBuffers() override;

 	MVKSmallVector<MVKCommandBuffer*, N> _cmdBuffers;
 };
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
@ -463,6 +463,13 @@ MVKQueueCommandBufferSubmission::~MVKQueueCommandBufferSubmission() {
 }


+template <size_t N>
+void MVKQueueFullCommandBufferSubmission<N>::submitCommandBuffers() {
+	MVKCommandEncodingContext encodingContext;
+	for (auto& cb : _cmdBuffers) { cb->submit(this, &encodingContext); }
+}
+
+
 #pragma mark -
 #pragma mark MVKQueuePresentSurfaceSubmission