Vulkan timestamp query pools use Metal GPU counters when available.

Add MVKPhysicalDeviceMetalFeatures::counterSamplingPoints to track platform availability of GPU counters. MVKPhysicalDevice creates and manages MTLCounterSets and checks for and enables flags within MVKPhysicalDeviceMetalFeatures::counterSamplingPoints. Add abstract MVKGPUCounterQueryPool class as parent of MVKTimestampQueryPool and MVKPipelineStatisticsQueryPool concrete classes and refactor access to host and command copy tracking data to allow extraction from MTLCounterSampleBuffer. MVKTimestampQueryPool uses MTLCounterSampleBuffer if supported, otherwise reverts to using host data for timestamps. MVKCommandEncoder encodes Vulkan timestamp commands either as Metal staged or command timestamps, depending on whether the GPU is tile-based or immediate-mode. For Metal stage counters, we use a light-weight dummy BLIT encoder to mark timestamp commands executed in the previous Metal encoding pass. Add MVKDevice::getDummyBlitMTLBuffer() to supply a dummy single-byte buffer that can be used by a stand-alone MTLBlitCommandEncoder as dummy work to mark timestamps.
2021-07-20 22:13:04 -04:00 · 2021-07-20 22:13:04 -04:00 · 6ae1745a9c
commit 6ae1745a9c
parent feb8d41444
9 changed files with 455 additions and 185 deletions
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@ -18,11 +18,13 @@ MoltenVK 1.1.5

 Released TBD

+- Vulkan timestamp query pools use Metal GPU counters when available.
 - Fix incorrect translation of clear color values on Apple Silicon.
 - Fix swizzle of depth and stencil values into RGBA (`float4`) variable in shaders.
 - Disable `VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT` for 
  `VK_FORMAT_E5B9G9R9_UFLOAT_PACK32` on macOS Apple Silicon.
 - Support alpha-to-coverage without a color attachment.
+- Update `VK_MVK_MOLTENVK_SPEC_VERSION` to `32`.



--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@ -835,6 +835,16 @@ typedef enum MVKFloatRounding {
 	MVK_FLOAT_ROUNDING_UP_MAX_ENUM = 0x7FFFFFFF
 } MVKFloatRounding;

+/** Identifies the pipeline points where GPU counter sampling can occur. Maps to MTLCounterSamplingPoint. */
+typedef enum MVKCounterSamplingBits {
+	MVK_COUNTER_SAMPLING_AT_DRAW           = 0x00000001,
+	MVK_COUNTER_SAMPLING_AT_DISPATCH       = 0x00000002,
+	MVK_COUNTER_SAMPLING_AT_BLIT           = 0x00000004,
+	MVK_COUNTER_SAMPLING_AT_PIPELINE_STAGE = 0x00000008,
+	MVK_COUNTER_SAMPLING_MAX_ENUM          = 0X7FFFFFFF
+} MVKCounterSamplingBits;
+typedef VkFlags MVKCounterSamplingFlags;
+
 /**
 * Features provided by the current implementation of Metal on the current device. You can
 * retrieve a copy of this structure using the vkGetPhysicalDeviceMetalFeaturesMVK() function.
@ -915,6 +925,7 @@ typedef struct {
 	VkBool32 argumentBuffers;						/**< If true, Metal argument buffers are supported. */
 	VkBool32 descriptorSetArgumentBuffers;			/**< If true, a Metal argument buffer can be assigned to a descriptor set, and used on any pipeline and pipeline stage. If false, a different Metal argument buffer must be used for each pipeline-stage/descriptor-set combination. */
 	MVKFloatRounding clearColorFloatRounding;		/**< Identifies the type of rounding Metal uses for MTLClearColor float to integer conversions. */
+	MVKCounterSamplingFlags counterSamplingPoints;	/**< Identifies the points where pipeline GPU counter sampling may occur. */
 } MVKPhysicalDeviceMetalFeatures;

 /** MoltenVK performance of a particular type of activity. */
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
@ -410,7 +410,7 @@ public:
    void endOcclusionQuery(MVKOcclusionQueryPool* pQueryPool, uint32_t query);

    /** Marks a timestamp for the specified query. */
-    void markTimestamp(MVKQueryPool* pQueryPool, uint32_t query);
+    void markTimestamp(MVKTimestampQueryPool* pQueryPool, uint32_t query);

    /** Reset a range of queries. */
    void resetQueries(MVKQueryPool* pQueryPool, uint32_t firstQuery, uint32_t queryCount);
@ -499,6 +499,13 @@ protected:
 	void setSubpass(MVKCommand* passCmd, VkSubpassContents subpassContents, uint32_t subpassIndex);
 	void clearRenderArea();
    NSString* getMTLRenderCommandEncoderName();
+	void encodeGPUCounterSample(MVKGPUCounterQueryPool* mvkQryPool, uint32_t sampleIndex, MVKCounterSamplingFlags samplingPoints);
+	void encodeTimestampStageCounterSamples();
+
+	typedef struct GPUCounterQuery {
+		MVKGPUCounterQueryPool* queryPool = nullptr;
+		uint32_t query = 0;
+	} GPUCounterQuery;

 	VkSubpassContents _subpassContents;
 	MVKRenderPass* _renderPass;
@ -507,6 +514,7 @@ protected:
 	uint32_t _multiviewPassIndex;
 	VkRect2D _renderArea;
    MVKActivatedQueries* _pActivatedQueries;
+	MVKSmallVector<GPUCounterQuery, 16> _timestampStageCounterQueries;
 	MVKSmallVector<VkClearValue, kMVKDefaultAttachmentCount> _clearValues;
 	MVKSmallVector<MVKImageView*, kMVKDefaultAttachmentCount> _attachments;
 	id<MTLComputeCommandEncoder> _mtlComputeEncoder;
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
@ -614,6 +614,8 @@ void MVKCommandEncoder::endCurrentMetalEncoding() {
 	[_mtlBlitEncoder endEncoding];
 	_mtlBlitEncoder = nil;          // not retained
    _mtlBlitEncoderUse = kMVKCommandUseNone;
+
+	encodeTimestampStageCounterSamples();
 }

 id<MTLComputeCommandEncoder> MVKCommandEncoder::getMTLComputeEncoder(MVKCommandUse cmdUse) {
@ -720,6 +722,23 @@ const MVKMTLBufferAllocation* MVKCommandEncoder::copyToTempMTLBufferAllocation(c

 #pragma mark Queries

+// Only executes on immediate-mode GPUs. Encode a GPU counter sample command on whichever Metal
+// encoder is currently in use, creating a temporary BLIT encoder if no encoder is currently active.
+// We only encode the GPU sample if the platform allows encoding at the associated pipeline point.
+void MVKCommandEncoder::encodeGPUCounterSample(MVKGPUCounterQueryPool* mvkQryPool, uint32_t sampleIndex, MVKCounterSamplingFlags samplingPoints){
+	if (_mtlRenderEncoder) {
+		if (mvkIsAnyFlagEnabled(samplingPoints, MVK_COUNTER_SAMPLING_AT_DRAW)) {
+			[_mtlRenderEncoder sampleCountersInBuffer: mvkQryPool->getMTLCounterBuffer() atSampleIndex: sampleIndex withBarrier: NO];
+		}
+	} else if (_mtlComputeEncoder) {
+		if (mvkIsAnyFlagEnabled(samplingPoints, MVK_COUNTER_SAMPLING_AT_DISPATCH)) {
+			[_mtlComputeEncoder sampleCountersInBuffer: mvkQryPool->getMTLCounterBuffer() atSampleIndex: sampleIndex withBarrier: NO];
+		}
+	} else if (mvkIsAnyFlagEnabled(samplingPoints, MVK_COUNTER_SAMPLING_AT_BLIT)) {
+		[getMTLBlitEncoder(kMVKCommandUseRecordGPUCounterSample) sampleCountersInBuffer: mvkQryPool->getMTLCounterBuffer() atSampleIndex: sampleIndex withBarrier: NO];
+	}
+}
+
 void MVKCommandEncoder::beginOcclusionQuery(MVKOcclusionQueryPool* pQueryPool, uint32_t query, VkQueryControlFlags flags) {
    _occlusionQueryState.beginOcclusionQuery(pQueryPool, query, flags);
    uint32_t queryCount = 1;
@ -733,12 +752,61 @@ void MVKCommandEncoder::endOcclusionQuery(MVKOcclusionQueryPool* pQueryPool, uin
    _occlusionQueryState.endOcclusionQuery(pQueryPool, query);
 }

-void MVKCommandEncoder::markTimestamp(MVKQueryPool* pQueryPool, uint32_t query) {
+void MVKCommandEncoder::markTimestamp(MVKTimestampQueryPool* pQueryPool, uint32_t query) {
    uint32_t queryCount = 1;
    if (_renderPass && getSubpass()->isMultiview()) {
        queryCount = getSubpass()->getViewCountInMetalPass(_multiviewPassIndex);
    }
 	addActivatedQueries(pQueryPool, query, queryCount);
+
+	MVKCounterSamplingFlags sampPts = _device->_pMetalFeatures->counterSamplingPoints;
+	if (sampPts) {
+		for (uint32_t qOfst = 0; qOfst < queryCount; qOfst++) {
+			if (mvkIsAnyFlagEnabled(sampPts, MVK_COUNTER_SAMPLING_AT_PIPELINE_STAGE)) {
+				_timestampStageCounterQueries.push_back({ pQueryPool, query + qOfst });
+			} else {
+				encodeGPUCounterSample(pQueryPool, query + qOfst, sampPts);
+			}
+		}
+	}
+}
+
+// Metal stage GPU counters need to be configured in a Metal render, compute, or BLIT encoder, meaning that the
+// Metal encoder needs to know about any Vulkan timestamp commands that will be executed during the execution
+// of a renderpass, or set of Vulkan dispatch or BLIT commands. In addition, there are a very small number of
+// staged timestamps (4) that can be tracked in any single render, compute, or BLIT pass, meaning a renderpass
+// that timestamped after each of many draw calls, would not be trackable. Finally, stage counters are only
+// available on tile-based GPU's, which means draw or dispatch calls cannot be individually timestamped.
+// We avoid dealing with all this complexity and mismatch between how Vulkan and Metal stage counters operate
+// by deferring all timestamps to the end of any batch of Metal encoding, and add a lightweight Metal encoder
+// that does minimal work (it won't timestamp if completely empty), and timestamps that work into all of the
+// Vulkan timestamp queries that have been executed during the execution of the previous Metal encoder.
+void MVKCommandEncoder::encodeTimestampStageCounterSamples() {
+	size_t qCnt = _timestampStageCounterQueries.size();
+	uint32_t qIdx = 0;
+	while (qIdx < qCnt) {
+
+		// With each BLIT pass, consume as many outstanding timestamp queries as possible.
+		// Attach an query result to each of the available sample buffer attachments in the BLIT pass descriptor.
+		auto* bpDesc = [[[MTLBlitPassDescriptor alloc] init] autorelease];
+		for (uint32_t attIdx = 0; attIdx < MTLMaxBlitPassSampleBuffers && qIdx < qCnt; attIdx++, qIdx++) {
+			auto* sbAttDesc = bpDesc.sampleBufferAttachments[attIdx];
+			auto& tsQry = _timestampStageCounterQueries[qIdx];
+
+			// We actually only need to use startOfEncoderSampleIndex, but apparently,
+			// and contradicting docs, Metal hits an unexpected validation error if
+			// endOfEncoderSampleIndex is left at MTLCounterDontSample.
+			sbAttDesc.startOfEncoderSampleIndex = tsQry.query;
+			sbAttDesc.endOfEncoderSampleIndex = tsQry.query;
+			sbAttDesc.sampleBuffer = tsQry.queryPool->getMTLCounterBuffer();
+		}
+
+		auto* mtlEnc = [_mtlCmdBuffer blitCommandEncoderWithDescriptor: bpDesc];
+		setLabelIfNotNil(mtlEnc, mvkMTLBlitCommandEncoderLabel(kMVKCommandUseRecordGPUCounterSample));
+		[mtlEnc fillBuffer: _device->getDummyBlitMTLBuffer() range: NSMakeRange(0, 1) value: 0];
+		[mtlEnc endEncoding];
+	}
+	_timestampStageCounterQueries.clear();
 }

 void MVKCommandEncoder::resetQueries(MVKQueryPool* pQueryPool, uint32_t firstQuery, uint32_t queryCount) {
@ -847,6 +915,7 @@ NSString* mvkMTLBlitCommandEncoderLabel(MVKCommandUse cmdUse) {
        case kMVKCommandUseUpdateBuffer:                    return @"vkCmdUpdateBuffer BlitEncoder";
        case kMVKCommandUseResetQueryPool:                  return @"vkCmdResetQueryPool BlitEncoder";
        case kMVKCommandUseCopyQueryPoolResults:            return @"vkCmdCopyQueryPoolResults BlitEncoder";
+		case kMVKCommandUseRecordGPUCounterSample:          return @"Record GPU Counter Sample BlitEncoder";
        default:                                            return @"Unknown Use BlitEncoder";
    }
 }
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
@ -324,10 +324,10 @@ public:
 	}

 	/** Returns whether the MSL version is supported on this device. */
-	inline bool mslVersionIsAtLeast(MTLLanguageVersion minVer) { return _metalFeatures.mslVersionEnum >= minVer; }
+	bool mslVersionIsAtLeast(MTLLanguageVersion minVer) { return _metalFeatures.mslVersionEnum >= minVer; }

 	/** Returns whether this device is using Metal argument buffers. */
-	inline bool isUsingMetalArgumentBuffers() const  { return _metalFeatures.argumentBuffers && mvkConfig().useMetalArgumentBuffers; };
+	bool isUsingMetalArgumentBuffers() const  { return _metalFeatures.argumentBuffers && mvkConfig().useMetalArgumentBuffers; };


 #pragma mark Construction
@ -371,6 +371,7 @@ protected:
 	uint32_t getMaxSamplerCount();
 	void initExternalMemoryProperties();
 	void initExtensions();
+	void initCounterSets();
 	MVKArrayRef<MVKQueueFamily*> getQueueFamilies();
 	void initPipelineCacheUUID();
 	uint32_t getHighestMTLFeatureSet();
@ -388,6 +389,7 @@ protected:
 	VkPhysicalDeviceMemoryProperties _memoryProperties;
 	MVKSmallVector<MVKQueueFamily*, kMVKQueueFamilyCount> _queueFamilies;
 	MVKPixelFormats _pixelFormats;
+	id<MTLCounterSet> _timestampMTLCounterSet;
 	uint32_t _allMemoryTypes;
 	uint32_t _hostVisibleMemoryTypes;
 	uint32_t _hostCoherentMemoryTypes;
@ -684,12 +686,21 @@ public:
     */
    uint32_t expandVisibilityResultMTLBuffer(uint32_t queryCount);

+	/** Returns the GPU sample counter used for timestamps. */
+	id<MTLCounterSet> getTimestampMTLCounterSet() { return _physicalDevice->_timestampMTLCounterSet; }
+
    /** Returns the memory type index corresponding to the specified Metal memory storage mode. */
    uint32_t getVulkanMemoryTypeIndex(MTLStorageMode mtlStorageMode);

 	/** Returns a default MTLSamplerState to populate empty array element descriptors. */
 	id<MTLSamplerState> getDefaultMTLSamplerState();

+	/**
+	 * Returns a MTLBuffer of length one that can be used as a dummy to
+	 * create a no-op BLIT encoder based on filling this single-byte buffer.
+	 */
+	id<MTLBuffer> getDummyBlitMTLBuffer();
+
 	/**
 	 * Returns whether MTLCommandBuffers can be prefilled.
 	 *
@ -818,6 +829,7 @@ protected:
    std::mutex _perfLock;
    id<MTLBuffer> _globalVisibilityResultMTLBuffer;
 	id<MTLSamplerState> _defaultMTLSamplerState;
+	id<MTLBuffer> _dummyBlitMTLBuffer;
    uint32_t _globalVisibilityQueryCount;
    std::mutex _vizLock;
 	bool _useMTLFenceForSemaphores;
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@ -1150,6 +1150,7 @@ MVKPhysicalDevice::MVKPhysicalDevice(MVKInstance* mvkInstance, id<MTLDevice> mtl
 	initExtensions();
 	initMemoryProperties();
 	initExternalMemoryProperties();
+	initCounterSets();
 	logGPUInfo();
 }

@ -1594,6 +1595,16 @@ void MVKPhysicalDevice::initMetalFeatures() {
 	// Currently, if we don't support descriptor set argument buffers, we can't support argument buffers.
 	_metalFeatures.argumentBuffers = _metalFeatures.descriptorSetArgumentBuffers;

+#define checkSupportsMTLCounterSamplingPoint(mtlSP, mvkSP)  \
+	if ([_mtlDevice respondsToSelector: @selector(supportsCounterSampling:)] &&  \
+		[_mtlDevice supportsCounterSampling: MTLCounterSamplingPointAt ##mtlSP ##Boundary]) {  \
+		_metalFeatures.counterSamplingPoints |= MVK_COUNTER_SAMPLING_AT_ ##mvkSP;  \
+	}
+
+	checkSupportsMTLCounterSamplingPoint(Draw, DRAW);
+	checkSupportsMTLCounterSamplingPoint(Dispatch, DISPATCH);
+	checkSupportsMTLCounterSamplingPoint(Blit, BLIT);
+	checkSupportsMTLCounterSamplingPoint(Stage, PIPELINE_STAGE);
 }

 // Initializes the physical device features of this instance.
@ -2726,6 +2737,28 @@ void MVKPhysicalDevice::initExtensions() {
 #endif
 }

+void MVKPhysicalDevice::initCounterSets() {
+	_timestampMTLCounterSet = nil;
+	@autoreleasepool {
+		if (_metalFeatures.counterSamplingPoints) {
+			NSArray<id<MTLCounterSet>>* counterSets = _mtlDevice.counterSets;
+			for (id<MTLCounterSet> cs in counterSets){
+				NSString* csName = cs.name;
+				if ( [csName caseInsensitiveCompare: MTLCommonCounterSetTimestamp] == NSOrderedSame) {
+					NSArray<id<MTLCounter>>* countersInSet = cs.counters;
+					for(id<MTLCounter> ctr in countersInSet) {
+						if ( [ctr.name caseInsensitiveCompare: MTLCommonCounterTimestamp] == NSOrderedSame) {
+							_timestampMTLCounterSet = [cs retain];		// retained
+							break;
+						}
+					}
+					break;
+				}
+			}
+		}
+	}
+}
+
 void MVKPhysicalDevice::logGPUInfo() {
 	string devTypeStr;
 	switch (_properties.deviceType) {
@ -2838,6 +2871,7 @@ void MVKPhysicalDevice::logGPUInfo() {

 MVKPhysicalDevice::~MVKPhysicalDevice() {
 	mvkDestroyContainerContents(_queueFamilies);
+	[_timestampMTLCounterSet release];
 	[_mtlDevice release];
 }

@ -3712,6 +3746,20 @@ id<MTLSamplerState> MVKDevice::getDefaultMTLSamplerState() {
 	return _defaultMTLSamplerState;
 }

+id<MTLBuffer> MVKDevice::getDummyBlitMTLBuffer() {
+	if ( !_dummyBlitMTLBuffer ) {
+
+		// Lock and check again in case another thread has created the buffer.
+		lock_guard<mutex> lock(_rezLock);
+		if ( !_dummyBlitMTLBuffer ) {
+			@autoreleasepool {
+				_dummyBlitMTLBuffer = [getMTLDevice() newBufferWithLength: 1 options: MTLResourceStorageModePrivate];
+			}
+		}
+	}
+	return _dummyBlitMTLBuffer;
+}
+
 MTLCompileOptions* MVKDevice::getMTLCompileOptions(bool useFastMath, bool preserveInvariance) {
 	MTLCompileOptions* mtlCompOpt = [MTLCompileOptions new];
 	mtlCompOpt.languageVersion = _pMetalFeatures->mslVersionEnum;
@ -3833,6 +3881,7 @@ MVKDevice::MVKDevice(MVKPhysicalDevice* physicalDevice, const VkDeviceCreateInfo
    _globalVisibilityQueryCount = 0;

 	_defaultMTLSamplerState = nil;
+	_dummyBlitMTLBuffer = nil;

 	_commandResourceFactory = new MVKCommandResourceFactory(this);

@ -4200,6 +4249,7 @@ MVKDevice::~MVKDevice() {

    [_globalVisibilityResultMTLBuffer release];
 	[_defaultMTLSamplerState release];
+	[_dummyBlitMTLBuffer release];

 	stopAutoGPUCapture(MVK_CONFIG_AUTO_GPU_CAPTURE_SCOPE_DEVICE);

--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.h
@ -38,7 +38,6 @@ class MVKCommandEncoder;
 /** 
 * Abstract class representing a Vulkan query pool.
 * Subclasses are specialized for specific query types.
- * Subclasses will generally override the beginQuery(), endQuery(), and getResult(uint32_t, void*, bool) member functions.
 */
 class MVKQueryPool : public MVKVulkanAPIDeviceObject {

@ -106,10 +105,12 @@ public:

 protected:
 	bool areQueriesHostAvailable(uint32_t firstQuery, uint32_t endQuery);
-    VkResult getResult(uint32_t query, void* pQryData, VkQueryResultFlags flags);
-	virtual void getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) {}
+	virtual NSData* getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) { return nil; }
+    VkResult getResult(uint32_t query, NSData* srcData, uint32_t srcDataQueryOffset, void* pDstData, VkQueryResultFlags flags);
 	virtual id<MTLBuffer> getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) { return nil; }
-	virtual void encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) {}
+	virtual id<MTLComputeCommandEncoder> encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) { return nil; }
+	virtual void encodeDirectCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount,
+										 MVKBuffer* destBuffer, VkDeviceSize destOffset, VkDeviceSize stride);

 	struct DeferredCopy {
 		uint32_t firstQuery;
@ -136,31 +137,6 @@ protected:
 };


-#pragma mark -
-#pragma mark MVKTimestampQueryPool
-
-/** A Vulkan query pool for timestamp queries. */
-class MVKTimestampQueryPool : public MVKQueryPool {
-
-public:
-    void endQuery(uint32_t query, MVKCommandEncoder* cmdEncoder) override;
-    void finishQueries(const MVKArrayRef<uint32_t>& queries) override;
-
-
-#pragma mark Construction
-
-	MVKTimestampQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo);
-
-protected:
-	void propagateDebugName() override {}
-	void getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) override;
-	id<MTLBuffer> getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) override;
-	void encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) override;
-
-	MVKSmallVector<uint64_t, kMVKDefaultQueryCount> _timestamps;
-};
-
-
 #pragma mark -
 #pragma mark MVKOcclusionQueryPool

@ -189,20 +165,71 @@ public:

 protected:
 	void propagateDebugName() override;
-    void getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) override;
+	NSData* getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) override;
 	id<MTLBuffer> getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) override;
-	void encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) override;
+	id<MTLComputeCommandEncoder> encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) override;

    id<MTLBuffer> _visibilityResultMTLBuffer;
    uint32_t _queryIndexOffset;
 };


+#pragma mark -
+#pragma mark MVKGPUCounterQueryPool
+
+/** An abstract parent class for query pools that use Metal GPU counters if they are supported on the platform. */
+class MVKGPUCounterQueryPool : public MVKQueryPool {
+
+public:
+
+	/**
+	 * Returns the MTLCounterBuffer being used by this query pool,
+	 * or returns nil if GPU counters are not supported.
+	 * */
+	id<MTLCounterSampleBuffer> getMTLCounterBuffer() { return _mtlCounterBuffer; }
+
+	MVKGPUCounterQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo);
+
+	~MVKGPUCounterQueryPool() override;
+
+protected:
+	void initMTLCounterSampleBuffer(const VkQueryPoolCreateInfo* pCreateInfo, id<MTLCounterSet> mtlCounterSet);
+
+	id<MTLCounterSampleBuffer> _mtlCounterBuffer;
+};
+
+
+#pragma mark -
+#pragma mark MVKTimestampQueryPool
+
+/** A Vulkan query pool for timestamp queries. */
+class MVKTimestampQueryPool : public MVKGPUCounterQueryPool {
+
+public:
+	void endQuery(uint32_t query, MVKCommandEncoder* cmdEncoder) override;
+	void finishQueries(const MVKArrayRef<uint32_t>& queries) override;
+
+#pragma mark Construction
+
+	MVKTimestampQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo);
+
+protected:
+	void propagateDebugName() override {}
+	NSData* getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) override;
+	id<MTLBuffer> getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) override;
+	id<MTLComputeCommandEncoder> encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) override;
+	void encodeDirectCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount,
+								 MVKBuffer* destBuffer, VkDeviceSize destOffset, VkDeviceSize stride) override;
+
+	MVKSmallVector<uint64_t> _timestamps;
+};
+
+
 #pragma mark -
 #pragma mark MVKPipelineStatisticsQueryPool

 /** A Vulkan query pool for a query pool type that tracks pipeline statistics. */
-class MVKPipelineStatisticsQueryPool : public MVKQueryPool {
+class MVKPipelineStatisticsQueryPool : public MVKGPUCounterQueryPool {

 public:
    MVKPipelineStatisticsQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo);
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueryPool.mm
@ -89,11 +89,14 @@ VkResult MVKQueryPool::getResults(uint32_t firstQuery,
 	}

 	VkResult rqstRslt = VK_SUCCESS;
-	uintptr_t pQryData = (uintptr_t)pData;
-	for (uint32_t query = firstQuery; query < endQuery; query++, pQryData += stride) {
-		VkResult qryRslt = getResult(query, (void*)pQryData, flags);
+	@autoreleasepool {
+		NSData* srcData = getQuerySourceData(firstQuery, queryCount);
+		uintptr_t pDstData = (uintptr_t)pData;
+		for (uint32_t query = firstQuery; query < endQuery; query++, pDstData += stride) {
+			VkResult qryRslt = getResult(query, srcData, firstQuery, (void*)pDstData, flags);
 			if (rqstRslt == VK_SUCCESS) { rqstRslt = qryRslt; }
 		}
+	}
 	return rqstRslt;
 }

@ -114,7 +117,7 @@ bool MVKQueryPool::areQueriesHostAvailable(uint32_t firstQuery, uint32_t endQuer
    return true;
 }

-VkResult MVKQueryPool::getResult(uint32_t query, void* pQryData, VkQueryResultFlags flags) {
+VkResult MVKQueryPool::getResult(uint32_t query, NSData* srcData, uint32_t srcDataQueryOffset, void* pDstData, VkQueryResultFlags flags) {

 	if (_device->getConfigurationResult() != VK_SUCCESS) { return _device->getConfigurationResult(); }

@ -123,15 +126,22 @@ VkResult MVKQueryPool::getResult(uint32_t query, void* pQryData, VkQueryResultFl
 	bool shouldOutput64Bit = mvkAreAllFlagsEnabled(flags, VK_QUERY_RESULT_64_BIT);

 	// Output the results of this query
-	if (shouldOutput) { getResult(query, pQryData, shouldOutput64Bit); }
+	if (shouldOutput) {
+		uint64_t rsltVal = ((uint64_t*)srcData.bytes)[query - srcDataQueryOffset];
+		if (shouldOutput64Bit) {
+			*(uint64_t*)pDstData = rsltVal;
+		} else {
+			*(uint32_t*)pDstData = (uint32_t)rsltVal;
+		}
+	}

 	// If requested, output the availability bit
 	if (mvkAreAllFlagsEnabled(flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
 		if (shouldOutput64Bit) {
-			uintptr_t pAvailability = (uintptr_t)pQryData + (_queryElementCount * sizeof(uint64_t));
+			uintptr_t pAvailability = (uintptr_t)pDstData + (_queryElementCount * sizeof(uint64_t));
 			*(uint64_t*)pAvailability = isAvailable;
 		} else {
-			uintptr_t pAvailability = (uintptr_t)pQryData + (_queryElementCount * sizeof(uint32_t));
+			uintptr_t pAvailability = (uintptr_t)pDstData + (_queryElementCount * sizeof(uint32_t));
 			*(uint32_t*)pAvailability = isAvailable;
 		}
 	}
@ -154,20 +164,12 @@ void MVKQueryPool::encodeCopyResults(MVKCommandEncoder* cmdEncoder,
 		stride == _queryElementCount * sizeof(uint64_t) &&
 		areQueriesDeviceAvailable(firstQuery, queryCount)) {

-		id<MTLBlitCommandEncoder> mtlBlitCmdEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyQueryPoolResults);
-		NSUInteger srcOffset;
-		id<MTLBuffer> srcBuff = getResultBuffer(cmdEncoder, firstQuery, queryCount, srcOffset);
-		[mtlBlitCmdEnc copyFromBuffer: srcBuff
-						 sourceOffset: srcOffset
-							 toBuffer: destBuffer->getMTLBuffer()
-					destinationOffset: destBuffer->getMTLBufferOffset() + destOffset
-								 size: stride * queryCount];
+		encodeDirectCopyResults(cmdEncoder, firstQuery, queryCount, destBuffer, destOffset, stride);
 		// TODO: In the case where none of the queries is ready, we can fill with 0.
 	} else {
-		id<MTLComputeCommandEncoder> mtlComputeCmdEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults);
 		id<MTLComputePipelineState> mtlCopyResultsState = cmdEncoder->getCommandEncodingPool()->getCmdCopyQueryPoolResultsMTLComputePipelineState();
+		id<MTLComputeCommandEncoder> mtlComputeCmdEnc = encodeComputeCopyResults(cmdEncoder, firstQuery, queryCount, 0);
 		[mtlComputeCmdEnc setComputePipelineState: mtlCopyResultsState];
-		encodeSetResultBuffer(cmdEncoder, firstQuery, queryCount, 0);
 		[mtlComputeCmdEnc setBuffer: destBuffer->getMTLBuffer()
 							 offset: destBuffer->getMTLBufferOffset() + destOffset
 							atIndex: 1];
@ -183,6 +185,24 @@ void MVKQueryPool::encodeCopyResults(MVKCommandEncoder* cmdEncoder,
 	}
 }

+// If this asked for 64-bit results with no availability and packed stride, then we can do a straight copy.
+void MVKQueryPool::encodeDirectCopyResults(MVKCommandEncoder* cmdEncoder,
+									 uint32_t firstQuery,
+									 uint32_t queryCount,
+									 MVKBuffer* destBuffer,
+									 VkDeviceSize destOffset,
+									 VkDeviceSize stride) {
+
+	id<MTLBlitCommandEncoder> mtlBlitCmdEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyQueryPoolResults);
+	NSUInteger srcOffset;
+	id<MTLBuffer> srcBuff = getResultBuffer(cmdEncoder, firstQuery, queryCount, srcOffset);
+	[mtlBlitCmdEnc copyFromBuffer: srcBuff
+					 sourceOffset: srcOffset
+						 toBuffer: destBuffer->getMTLBuffer()
+				destinationOffset: destBuffer->getMTLBufferOffset() + destOffset
+							 size: stride * queryCount];
+}
+
 void MVKQueryPool::deferCopyResults(uint32_t firstQuery,
 									uint32_t queryCount,
 									MVKBuffer* destBuffer,
@ -195,53 +215,6 @@ void MVKQueryPool::deferCopyResults(uint32_t firstQuery,
 }


-#pragma mark -
-#pragma mark MVKTimestampQueryPool
-
-void MVKTimestampQueryPool::endQuery(uint32_t query, MVKCommandEncoder* cmdEncoder) {
-    cmdEncoder->markTimestamp(this, query);
-    MVKQueryPool::endQuery(query, cmdEncoder);
-}
-
-// Update timestamp values, then mark queries as available
-void MVKTimestampQueryPool::finishQueries(const MVKArrayRef<uint32_t>& queries) {
-    uint64_t ts = mvkGetTimestamp();
-    for (uint32_t qry : queries) { _timestamps[qry] = ts; }
-
-    MVKQueryPool::finishQueries(queries);
-}
-
-void MVKTimestampQueryPool::getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) {
-	if (shouldOutput64Bit) {
-		*(uint64_t*)pQryData = _timestamps[query];
-	} else {
-		*(uint32_t*)pQryData = (uint32_t)_timestamps[query];
-	}
-}
-
-id<MTLBuffer> MVKTimestampQueryPool::getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) {
-	const MVKMTLBufferAllocation* tempBuff = cmdEncoder->getTempMTLBuffer(queryCount * _queryElementCount * sizeof(uint64_t));
-	void* pBuffData = tempBuff->getContents();
-	size_t size = queryCount * _queryElementCount * sizeof(uint64_t);
-	memcpy(pBuffData, &_timestamps[firstQuery], size);
-	offset = tempBuff->_offset;
-	return tempBuff->_mtlBuffer;
-}
-
-void MVKTimestampQueryPool::encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) {
-	// No need to create a temp buffer here.
-	cmdEncoder->setComputeBytes(cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults), &_timestamps[firstQuery], queryCount * _queryElementCount * sizeof(uint64_t), index);
-}
-
-
-#pragma mark Construction
-
-MVKTimestampQueryPool::MVKTimestampQueryPool(MVKDevice* device,
-											 const VkQueryPoolCreateInfo* pCreateInfo) :
-	MVKQueryPool(device, pCreateInfo, 1), _timestamps(pCreateInfo->queryCount, 0) {
-}
-
-
 #pragma mark -
 #pragma mark MVKOcclusionQueryPool

@ -285,15 +258,14 @@ void MVKOcclusionQueryPool::resetResults(uint32_t firstQuery, uint32_t queryCoun
    }
 }

-void MVKOcclusionQueryPool::getResult(uint32_t query, void* pQryData, bool shouldOutput64Bit) {
-    NSUInteger mtlBuffOffset = getVisibilityResultOffset(query);
-    uint64_t* pData = (uint64_t*)((uintptr_t)getVisibilityResultMTLBuffer().contents + mtlBuffOffset);
+NSData* MVKOcclusionQueryPool::getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) {
+	id<MTLBuffer> vizBuff = getVisibilityResultMTLBuffer();
+	return [NSData dataWithBytesNoCopy: (void*)((uintptr_t)vizBuff.contents + getVisibilityResultOffset(firstQuery))
+								length: queryCount * kMVKQuerySlotSizeInBytes
+						  freeWhenDone: false];

-    if (shouldOutput64Bit) {
-        *(uint64_t*)pQryData = *pData;
-    } else {
-        *(uint32_t*)pQryData = (uint32_t)(*pData);
-    }
+
+	return [NSData dataWithBytesNoCopy: vizBuff.contents length: vizBuff.length freeWhenDone: false];
 }

 id<MTLBuffer> MVKOcclusionQueryPool::getResultBuffer(MVKCommandEncoder*, uint32_t firstQuery, uint32_t, NSUInteger& offset) {
@ -301,10 +273,10 @@ id<MTLBuffer> MVKOcclusionQueryPool::getResultBuffer(MVKCommandEncoder*, uint32_
 	return getVisibilityResultMTLBuffer();
 }

-void MVKOcclusionQueryPool::encodeSetResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t, uint32_t index) {
-	[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults) setBuffer: getVisibilityResultMTLBuffer()
-																			 offset: getVisibilityResultOffset(firstQuery)
-																			atIndex: index];
+id<MTLComputeCommandEncoder> MVKOcclusionQueryPool::encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t, uint32_t index) {
+	id<MTLComputeCommandEncoder> mtlCmdEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults);
+	[mtlCmdEnc setBuffer: getVisibilityResultMTLBuffer() offset: getVisibilityResultOffset(firstQuery) atIndex: index];
+	return mtlCmdEnc;
 }

 void MVKOcclusionQueryPool::beginQueryAddedTo(uint32_t query, MVKCommandBuffer* cmdBuffer) {
@ -332,14 +304,12 @@ MVKOcclusionQueryPool::MVKOcclusionQueryPool(MVKDevice* device,
        _queryIndexOffset = 0;

        // Ensure we don't overflow the maximum number of queries
-        uint32_t queryCount = pCreateInfo->queryCount;
-        VkDeviceSize reqBuffLen = (VkDeviceSize)queryCount * kMVKQuerySlotSizeInBytes;
+        VkDeviceSize reqBuffLen = (VkDeviceSize)pCreateInfo->queryCount * kMVKQuerySlotSizeInBytes;
        VkDeviceSize maxBuffLen = _device->_pMetalFeatures->maxQueryBufferSize;
        VkDeviceSize newBuffLen = min(reqBuffLen, maxBuffLen);
-        queryCount = uint32_t(newBuffLen / kMVKQuerySlotSizeInBytes);

        if (reqBuffLen > maxBuffLen) {
-            reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "vkCreateQueryPool(): Each query pool can support a maximum of %d queries.", queryCount);
+            reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "vkCreateQueryPool(): Each query pool can support a maximum of %d queries.", uint32_t(newBuffLen / kMVKQuerySlotSizeInBytes));
        }

        NSUInteger mtlBuffLen = mvkAlignByteCount(newBuffLen, _device->_pMetalFeatures->mtlBufferAlignment);
@ -357,11 +327,131 @@ MVKOcclusionQueryPool::~MVKOcclusionQueryPool() {
 };


+#pragma mark -
+#pragma mark MVKGPUCounterQueryPool
+
+MVKGPUCounterQueryPool::MVKGPUCounterQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo) :
+	MVKQueryPool(device, pCreateInfo, 1), _mtlCounterBuffer(nil) {}
+
+// To establish the Metal counter sample buffer, this must be called from the construtors
+// of subclasses, because the type of MTLCounterSet is determined by the subclass.
+void MVKGPUCounterQueryPool::initMTLCounterSampleBuffer(const VkQueryPoolCreateInfo* pCreateInfo, id<MTLCounterSet> mtlCounterSet) {
+	if ( !_device->_pMetalFeatures->counterSamplingPoints ) { return; }
+
+	@autoreleasepool {
+		MTLCounterSampleBufferDescriptor* tsDesc = [[[MTLCounterSampleBufferDescriptor alloc] init] autorelease];
+		tsDesc.counterSet = mtlCounterSet;
+		tsDesc.storageMode = MTLStorageModeShared;
+		tsDesc.sampleCount = pCreateInfo->queryCount;
+
+		NSError* err = nil;
+		_mtlCounterBuffer = [getMTLDevice() newCounterSampleBufferWithDescriptor: tsDesc error: &err];
+		if (err) {
+			setConfigurationResult(reportError(VK_ERROR_INITIALIZATION_FAILED,
+											   "Could not create MTLCounterSampleBuffer for query pool of type VK_QUERY_TYPE_TIMESTAMP. Reverting to emulated GPU timestamps. (Error code %li): %s",
+											   (long)err.code, err.localizedDescription.UTF8String));
+		}
+	}
+};
+
+MVKGPUCounterQueryPool::~MVKGPUCounterQueryPool() {
+	[_mtlCounterBuffer release];
+}
+
+
+#pragma mark -
+#pragma mark MVKTimestampQueryPool
+
+void MVKTimestampQueryPool::endQuery(uint32_t query, MVKCommandEncoder* cmdEncoder) {
+	cmdEncoder->markTimestamp(this, query);
+	MVKQueryPool::endQuery(query, cmdEncoder);
+}
+
+// If not using MTLCounterSampleBuffer, update timestamp values, then mark queries as available
+void MVKTimestampQueryPool::finishQueries(const MVKArrayRef<uint32_t>& queries) {
+	if ( !_mtlCounterBuffer ) {
+		uint64_t ts = mvkGetTimestamp();
+		for (uint32_t qry : queries) { _timestamps[qry] = ts; }
+	}
+	MVKQueryPool::finishQueries(queries);
+}
+
+NSData* MVKTimestampQueryPool::getQuerySourceData(uint32_t firstQuery, uint32_t queryCount) {
+	if (_mtlCounterBuffer) {
+		return [_mtlCounterBuffer resolveCounterRange: NSMakeRange(firstQuery, queryCount)];
+	} else {
+		return [NSData dataWithBytesNoCopy: (void*)&_timestamps[firstQuery]
+									length: queryCount * kMVKQuerySlotSizeInBytes
+							  freeWhenDone: false];
+	}
+}
+
+void MVKTimestampQueryPool::encodeDirectCopyResults(MVKCommandEncoder* cmdEncoder,
+													uint32_t firstQuery,
+													uint32_t queryCount,
+													MVKBuffer* destBuffer,
+													VkDeviceSize destOffset,
+													VkDeviceSize stride) {
+	if (_mtlCounterBuffer) {
+		id<MTLBlitCommandEncoder> mtlBlitCmdEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyQueryPoolResults);
+		[mtlBlitCmdEnc resolveCounters: _mtlCounterBuffer
+							   inRange: NSMakeRange(firstQuery,  queryCount)
+					 destinationBuffer: destBuffer->getMTLBuffer()
+					 destinationOffset: destBuffer->getMTLBufferOffset() + destOffset];
+	} else {
+		MVKQueryPool::encodeDirectCopyResults(cmdEncoder, firstQuery, queryCount, destBuffer, destOffset, stride);
+	}
+}
+
+id<MTLBuffer> MVKTimestampQueryPool::getResultBuffer(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, NSUInteger& offset) {
+	const MVKMTLBufferAllocation* tempBuff = cmdEncoder->getTempMTLBuffer(queryCount * _queryElementCount * sizeof(uint64_t));
+	void* pBuffData = tempBuff->getContents();
+	size_t size = queryCount * _queryElementCount * sizeof(uint64_t);
+	memcpy(pBuffData, &_timestamps[firstQuery], size);
+	offset = tempBuff->_offset;
+	return tempBuff->_mtlBuffer;
+}
+
+id<MTLComputeCommandEncoder> MVKTimestampQueryPool::encodeComputeCopyResults(MVKCommandEncoder* cmdEncoder, uint32_t firstQuery, uint32_t queryCount, uint32_t index) {
+	if (_mtlCounterBuffer) {
+		// We first need to resolve from the MTLCounterSampleBuffer into a temp buffer using a
+		// MTLBlitCommandEncoder, before creating the compute encoder and set that temp buffer into it.
+		const MVKMTLBufferAllocation* tempBuff = cmdEncoder->getTempMTLBuffer(queryCount * _queryElementCount * sizeof(uint64_t));
+		id<MTLBlitCommandEncoder> mtlBlitCmdEnc = cmdEncoder->getMTLBlitEncoder(kMVKCommandUseCopyQueryPoolResults);
+		[mtlBlitCmdEnc resolveCounters: _mtlCounterBuffer
+							   inRange: NSMakeRange(firstQuery,  queryCount)
+					 destinationBuffer: tempBuff->_mtlBuffer
+					 destinationOffset: tempBuff->_offset];
+
+		id<MTLComputeCommandEncoder> mtlCmdEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults);
+		[mtlCmdEnc setBuffer: tempBuff->_mtlBuffer offset: tempBuff->_offset atIndex: index];
+		return mtlCmdEnc;
+	} else {
+		// We can set the timestamp bytes into the compute encoder.
+		id<MTLComputeCommandEncoder> mtlCmdEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseCopyQueryPoolResults);
+		cmdEncoder->setComputeBytes(mtlCmdEnc, &_timestamps[firstQuery], queryCount * _queryElementCount * sizeof(uint64_t), index);
+		return mtlCmdEnc;
+	}
+}
+
+
+#pragma mark Construction
+
+MVKTimestampQueryPool::MVKTimestampQueryPool(MVKDevice* device, const VkQueryPoolCreateInfo* pCreateInfo) :
+	MVKGPUCounterQueryPool(device, pCreateInfo) {
+
+		initMTLCounterSampleBuffer(pCreateInfo, _device->getTimestampMTLCounterSet());
+
+		// If we don't use a MTLCounterSampleBuffer, allocate memory to hold the timestamps.
+		if ( !_mtlCounterBuffer ) { _timestamps.resize(pCreateInfo->queryCount, 0); }
+}
+
+
 #pragma mark -
 #pragma mark MVKPipelineStatisticsQueryPool

 MVKPipelineStatisticsQueryPool::MVKPipelineStatisticsQueryPool(MVKDevice* device,
-															   const VkQueryPoolCreateInfo* pCreateInfo) : MVKQueryPool(device, pCreateInfo, 1) {
+															   const VkQueryPoolCreateInfo* pCreateInfo) : MVKGPUCounterQueryPool(device, pCreateInfo) {
 	if ( !_device->_enabledFeatures.pipelineStatisticsQuery ) {
 		setConfigurationResult(reportError(VK_ERROR_FEATURE_NOT_PRESENT, "vkCreateQueryPool: VK_QUERY_TYPE_PIPELINE_STATISTICS is not supported."));
 	}
--- a/MoltenVK/MoltenVK/Utility/MVKFoundation.h
+++ b/MoltenVK/MoltenVK/Utility/MVKFoundation.h
@ -91,7 +91,8 @@ typedef enum : uint8_t {
    kMVKCommandUseTessellationVertexTessCtl,    /**< vkCmdDraw* - vertex and tessellation control stages. */
 	kMVKCommandUseMultiviewInstanceCountAdjust, /**< vkCmdDrawIndirect* - adjust instance count for multiview. */
    kMVKCommandUseCopyQueryPoolResults,         /**< vkCmdCopyQueryPoolResults. */
-    kMVKCommandUseAccumOcclusionQuery           /**< Any command terminating a Metal render pass with active visibility buffer. */
+    kMVKCommandUseAccumOcclusionQuery,          /**< Any command terminating a Metal render pass with active visibility buffer. */
+	kMVKCommandUseRecordGPUCounterSample        /**< Any command triggering the recording of a GPU counter sample. */
 } MVKCommandUse;

 /** Represents a given stage of a graphics pipeline. */