Support the VK_EXT_subgroup_size_control extension.

This extension allows the subgroup size to vary between draw/dispatch calls, and even allows clients to declare that full subgroups must always be dispatched. It corresponds better to how Metal actually works. Unfortunately, declaring a required subgroup size is not supported (`requiredSubgroupSizeStages` is reported as 0).
2020-11-28 17:31:49 -06:00 · 2020-11-28 17:31:49 -06:00 · e0e5d3ce28
commit e0e5d3ce28
parent 9ec67edc42
7 changed files with 69 additions and 16 deletions
--- a/Docs/MoltenVK_Runtime_UserGuide.md
+++ b/Docs/MoltenVK_Runtime_UserGuide.md
@@ -290,10 +290,11 @@ In addition to core *Vulkan* functionality, **MoltenVK**  also supports the foll
 - `VK_KHR_portability_subset`
 - `VK_KHR_push_descriptor`
 - `VK_KHR_relaxed_block_layout`
-- `VK_KHR_sampler_mirror_clamp_to_edge` *(macOS)*
+- `VK_KHR_sampler_mirror_clamp_to_edge` *(requires a Mac GPU or Apple family 7 GPU)*
 - `VK_KHR_sampler_ycbcr_conversion`
 - `VK_KHR_shader_draw_parameters`
 - `VK_KHR_shader_float16_int8`
+- `VK_KHR_shader_subgroup_extended_types` *(requires Metal 2.1 on Mac or Metal 2.2 and Apple family 4 on iOS)*
 - `VK_KHR_storage_buffer_storage_class`
 - `VK_KHR_surface`
 - `VK_KHR_swapchain`
@@ -317,6 +318,7 @@ In addition to core *Vulkan* functionality, **MoltenVK**  also supports the foll
 - `VK_EXT_scalar_block_layout`
 - `VK_EXT_shader_stencil_export` *(requires Mac GPU family 2 or iOS GPU family 5)*
 - `VK_EXT_shader_viewport_index_layer`
+- `VK_EXT_subgroup_size_control` *(requires Metal 2.1 on Mac or Metal 2.2 and Apple family 4 on iOS)*
 - `VK_EXT_swapchain_colorspace`
 - `VK_EXT_vertex_attribute_divisor`
 - `VK_EXT_texel_buffer_alignment` *(requires Metal 2.0)*
--- a/Docs/Whats_New.md
+++ b/Docs/Whats_New.md
@@ -19,10 +19,12 @@ MoltenVK 1.1.1
 Released TBD

 - Add support for extensions:
+	- `VK_KHR_sampler_mirror_clamp_to_edge` (iOS)
 	- `VK_KHR_timeline_semaphore`
 	- `VK_EXT_descriptor_indexing` (initial release limited to Metal Tier 1: 96/128 textures, 16 samplers)
 	- `VK_EXT_post_depth_coverage` (macOS)
 	- `VK_EXT_private_data`
+	- `VK_EXT_subgroup_size_control`
 	- `VK_EXT_texture_compression_astc_hdr`
 	- `VK_AMD_shader_image_load_store` (macOS)
 	- `VK_IMG_format_pvrtc` (macOS)
--- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
+++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h
@@ -616,7 +616,7 @@ typedef struct {
 	VkBool32 placementHeaps;					/**< If true, MTLHeap objects support placement of resources. */
 	VkDeviceSize pushConstantSizeAlignment;		/**< The alignment used internally when allocating memory for push constants. Must be PoT. */
 	uint32_t maxTextureLayers;					/**< The maximum number of layers in an array texture. */
-    uint32_t subgroupSize;			            /**< The number of threads in a SIMD-group. */
+    uint32_t maxSubgroupSize;			        /**< The maximum number of threads in a SIMD-group. */
 	VkDeviceSize vertexStrideAlignment;         /**< The alignment used for the stride of vertex attribute bindings. */
 	VkBool32 indirectTessellationDrawing;		/**< If true, tessellation draw calls support parameters held in a GPU buffer. */
 	VkBool32 nonUniformThreadgroups;			/**< If true, the device supports arbitrary-sized grids in compute workloads. */
@@ -634,6 +634,7 @@ typedef struct {
 	VkBool32 quadPermute;						/**< If true, quadgroup permutation functions (vote, ballot, shuffle) are supported in shaders. */
 	VkBool32 simdPermute;						/**< If true, SIMD-group permutation functions (vote, ballot, shuffle) are supported in shaders. */
 	VkBool32 simdReduction;						/**< If true, SIMD-group reduction functions (arithmetic) are supported in shaders. */
+    uint32_t minSubgroupSize;			        /**< The minimum number of threads in a SIMD-group. */
 } MVKPhysicalDeviceMetalFeatures;

 /** MoltenVK performance of a particular type of activity. */
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -53,6 +53,14 @@ using namespace std;
 #define supportsMTLFeatureSet(MFS)	[_mtlDevice supportsFeatureSet: MTLFeatureSet_ ##MFS]
 #define supportsMTLGPUFamily(GPUF)	([_mtlDevice respondsToSelector: @selector(supportsFamily:)] && [_mtlDevice supportsFamily: MTLGPUFamily ##GPUF])

+static const uint32_t kAMDVendorId = 0x1002;
+static const uint32_t kAppleVendorId = 0x106b;
+static const uint32_t kIntelVendorId = 0x8086;
+static const uint32_t kNVVendorId = 0x10de;
+
+static const uint32_t kAMDRadeonRX5700XTDeviceId = 0x731f;
+static const uint32_t kAMDRadeonRX5500XTDeviceId = 0x7340;
+

 #pragma mark -
 #pragma mark MVKPhysicalDevice
@@ -192,6 +200,12 @@ void MVKPhysicalDevice::getFeatures(VkPhysicalDeviceFeatures2* features) {
 				scalarLayoutFeatures->scalarBlockLayout = true;
 				break;
 			}
+			case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT: {
+				auto* subgroupSizeFeatures = (VkPhysicalDeviceSubgroupSizeControlFeaturesEXT*)next;
+				subgroupSizeFeatures->subgroupSizeControl = _metalFeatures.simdPermute || _metalFeatures.quadPermute;
+				subgroupSizeFeatures->computeFullSubgroups = _metalFeatures.simdPermute || _metalFeatures.quadPermute;
+				break;
+			}
 			case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: {
 				auto* texelBuffAlignFeatures = (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT*)next;
 				texelBuffAlignFeatures->texelBufferAlignment = _metalFeatures.texelBuffers && [_mtlDevice respondsToSelector: @selector(minimumLinearTextureAlignmentForPixelFormat:)];
@@ -319,7 +333,7 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) {
 			}
            case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: {
                auto* subgroupProps = (VkPhysicalDeviceSubgroupProperties*)next;
-                subgroupProps->subgroupSize = _metalFeatures.subgroupSize;
+                subgroupProps->subgroupSize = _metalFeatures.maxSubgroupSize;
                subgroupProps->supportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
                if (_features.tessellationShader) {
                    subgroupProps->supportedStages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
@@ -392,6 +406,14 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) {
 				robustness2Props->robustUniformBufferAccessSizeAlignment = 1;
 				break;
 			}
+			case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT: {
+				auto* subgroupSizeProps = (VkPhysicalDeviceSubgroupSizeControlPropertiesEXT*)next;
+				subgroupSizeProps->minSubgroupSize = _metalFeatures.minSubgroupSize;
+				subgroupSizeProps->maxSubgroupSize = _metalFeatures.maxSubgroupSize;
+				subgroupSizeProps->maxComputeWorkgroupSubgroups = _properties.limits.maxComputeWorkGroupInvocations / _metalFeatures.minSubgroupSize;
+				subgroupSizeProps->requiredSubgroupSizeStages = 0;
+				break;
+			}
 			case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: {
 				auto* texelBuffAlignProps = (VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT*)next;
 				// Save the 'next' pointer; we'll unintentionally overwrite it
@@ -1435,18 +1457,43 @@ void MVKPhysicalDevice::initMetalFeatures() {
        }
    }

-    _metalFeatures.subgroupSize = 1;
+    _metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize = 1;
 #if MVK_MACOS
    if (_metalFeatures.simdPermute) {
-        static const uint32_t kAMDVendorId = 0x1002;
-        _metalFeatures.subgroupSize = (_properties.vendorID == kAMDVendorId) ? 64 : 32;
+        // Based on data from Sascha Willems' Vulkan Hardware Database.
+        // This would be a lot easier and less painful if MTLDevice had properties for this...
+        _metalFeatures.maxSubgroupSize = (_properties.vendorID == kAMDVendorId) ? 64 : 32;
+        switch (_properties.vendorID) {
+            case kIntelVendorId:
+                _metalFeatures.minSubgroupSize = 8;
+                break;
+            case kAMDVendorId:
+                switch (_properties.deviceID) {
+                    case kAMDRadeonRX5700XTDeviceId:
+                    case kAMDRadeonRX5500XTDeviceId:
+                        _metalFeatures.minSubgroupSize = 32;
+                        break;
+                    default:
+                        _metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize;
+                        break;
+                }
+                break;
+            case kAppleVendorId:
+                // XXX Minimum thread execution width for Apple GPUs is unknown, but assumed to be 4. May be greater.
+                _metalFeatures.minSubgroupSize = 4;
+                break;
+            default:
+                _metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize;
+                break;
+        }
    }
 #endif
 #if MVK_IOS
    if (_metalFeatures.simdPermute) {
-        _metalFeatures.subgroupSize = 32;
+        _metalFeatures.minSubgroupSize = 4;
+        _metalFeatures.maxSubgroupSize = 32;
    } else if (_metalFeatures.quadPermute) {
-        _metalFeatures.subgroupSize = 4;
+        _metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize = 4;
    }
 #endif

@@ -1930,8 +1977,6 @@ void MVKPhysicalDevice::initLimits() {
    _properties.limits.lineWidthRange[1] = 1;
    _properties.limits.lineWidthGranularity = 1;

-    static const uint32_t kIntelVendorId = 0x8086;
-    static const uint32_t kNVVendorId = 0x10de;
    _properties.limits.standardSampleLocations = VK_TRUE;
    _properties.limits.strictLines = _properties.vendorID == kIntelVendorId || _properties.vendorID == kNVVendorId;

@@ -2038,7 +2083,6 @@ static uint32_t mvkGetEntryProperty(io_registry_entry_t entry, CFStringRef prope

 void MVKPhysicalDevice::initGPUInfoProperties() {

-	static const uint32_t kIntelVendorId = 0x8086;
 	bool isFound = false;

 	bool isIntegrated = _mtlDevice.isLowPower;
@@ -2048,7 +2092,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() {
 	if (supportsMTLGPUFamily(Apple5)) {
 		// This is an Apple GPU. It won't have a 'device-id' property, so fill it in
 		// like on iOS/tvOS.
-		_properties.vendorID = 0x106b;	// Apple's PCI ID
+		_properties.vendorID = kAppleVendorId;
 #if MVK_MACOS_APPLE_SILICON
 		if (supportsMTLGPUFamily(Apple7)) {
 			_properties.deviceID = 0xa140;
@@ -2130,7 +2174,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() {
 		devID = coreCnt > 2 ? 0xa081 : 0xa080;
 	}

-	_properties.vendorID = 0x0000106b;	// Apple's PCI ID
+	_properties.vendorID = kAppleVendorId;
 	_properties.deviceID = devID;
 	_properties.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU;
 	strlcpy(_properties.deviceName, _mtlDevice.name.UTF8String, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE);
@@ -2147,7 +2191,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() {
 		devID = 0xa101;
 	}

-  _properties.vendorID = 0x0000106b;  // Apple's PCI ID
+  _properties.vendorID = kAppleVendorId;
  _properties.deviceID = devID;
  _properties.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU;
  strlcpy(_properties.deviceName, _mtlDevice.name.UTF8String, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE);
--- a/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm
@@ -1000,6 +1000,7 @@ bool MVKGraphicsPipeline::addTessCtlShaderToPipeline(MTLComputePipelineDescripto
 	shaderContext.options.mslOptions.buffer_size_buffer_index = _bufferSizeBufferIndex.stages[kMVKShaderStageTessCtl];
 	shaderContext.options.mslOptions.capture_output_to_buffer = true;
 	shaderContext.options.mslOptions.multi_patch_workgroup = true;
+	shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(_pTessCtlSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize;
 	addPrevStageOutputToShaderConverterContext(shaderContext, vtxOutputs);

 	MVKMTLFunction func = ((MVKShaderModule*)_pTessCtlSS->module)->getMTLFunction(&shaderContext, _pTessCtlSS->pSpecializationInfo, _pipelineCache);
@@ -1090,6 +1091,7 @@ bool MVKGraphicsPipeline::addFragmentShaderToPipeline(MTLRenderPipelineDescripto
 		shaderContext.options.mslOptions.view_mask_buffer_index = _viewRangeBufferIndex.stages[kMVKShaderStageFragment];
 		shaderContext.options.entryPointName = _pFragmentSS->pName;
 		shaderContext.options.mslOptions.capture_output_to_buffer = false;
+		shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(_pFragmentSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize;
 		if (pCreateInfo->pMultisampleState) {
 			if (pCreateInfo->pMultisampleState->pSampleMask && pCreateInfo->pMultisampleState->pSampleMask[0] != 0xffffffff) {
 				shaderContext.options.mslOptions.additional_fixed_sample_mask = pCreateInfo->pMultisampleState->pSampleMask[0];
@@ -1484,7 +1486,6 @@ void MVKGraphicsPipeline::initMVKShaderConverterContext(SPIRVToMSLConversionConf
    shaderContext.options.mslOptions.multiview = mvkRendPass->isMultiview();
    shaderContext.options.mslOptions.multiview_layered_rendering = getDevice()->getPhysicalDevice()->canUseInstancingForMultiview();
    shaderContext.options.mslOptions.view_index_from_device_index = mvkAreAllFlagsEnabled(pCreateInfo->flags, VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT);
-    shaderContext.options.mslOptions.fixed_subgroup_size = _device->_pMetalFeatures->subgroupSize;
 #if MVK_MACOS
    shaderContext.options.mslOptions.emulate_subgroups = !_device->_pMetalFeatures->simdPermute;
 #endif
@@ -1647,6 +1648,7 @@ MVKComputePipeline::MVKComputePipeline(MVKDevice* device,
 		MTLComputePipelineDescriptor* plDesc = [MTLComputePipelineDescriptor new];	// temp retain
 		plDesc.computeFunction = mtlFunc;
 		plDesc.maxTotalThreadsPerThreadgroup = _mtlThreadgroupSize.width * _mtlThreadgroupSize.height * _mtlThreadgroupSize.depth;
+		plDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = mvkIsAnyFlagEnabled(pCreateInfo->stage.flags, VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);

 		// Metal does not allow the name of the pipeline to be changed after it has been created,
 		// and we need to create the Metal pipeline immediately to provide error feedback to app.
@@ -1690,7 +1692,7 @@ MVKMTLFunction MVKComputePipeline::getMTLFunction(const VkComputePipelineCreateI
 	shaderContext.options.mslOptions.texture_buffer_native = _device->_pMetalFeatures->textureBuffers;
 	shaderContext.options.mslOptions.dispatch_base = _allowsDispatchBase;
 	shaderContext.options.mslOptions.texture_1D_as_2D = mvkTreatTexture1DAs2D();
-    shaderContext.options.mslOptions.fixed_subgroup_size = _device->_pMetalFeatures->subgroupSize;
+    shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(pSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize;
 #if MVK_MACOS
    shaderContext.options.mslOptions.emulate_subgroups = !_device->_pMetalFeatures->simdPermute;
 #endif
--- a/MoltenVK/MoltenVK/Layers/MVKExtensions.def
+++ b/MoltenVK/MoltenVK/Layers/MVKExtensions.def
@@ -95,6 +95,7 @@ MVK_EXTENSION(EXT_robustness2, EXT_ROBUSTNESS_2, DEVICE)
 MVK_EXTENSION(EXT_scalar_block_layout, EXT_SCALAR_BLOCK_LAYOUT, DEVICE)
 MVK_EXTENSION(EXT_shader_stencil_export, EXT_SHADER_STENCIL_EXPORT, DEVICE)
 MVK_EXTENSION(EXT_shader_viewport_index_layer, EXT_SHADER_VIEWPORT_INDEX_LAYER, DEVICE)
+MVK_EXTENSION(EXT_subgroup_size_control, EXT_SUBGROUP_SIZE_CONTROL, DEVICE)
 MVK_EXTENSION(EXT_swapchain_colorspace, EXT_SWAPCHAIN_COLOR_SPACE, INSTANCE)
 MVK_EXTENSION(EXT_texel_buffer_alignment, EXT_TEXEL_BUFFER_ALIGNMENT, DEVICE)
 MVK_EXTENSION(EXT_texture_compression_astc_hdr, EXT_TEXTURE_COMPRESSION_ASTC_HDR, DEVICE)
--- a/MoltenVK/MoltenVK/Layers/MVKExtensions.mm
+++ b/MoltenVK/MoltenVK/Layers/MVKExtensions.mm
@@ -69,6 +69,7 @@ static bool mvkIsSupportedOnPlatform(VkExtensionProperties* pProperties) {
 	MVK_EXTENSION_MIN_OS(EXT_MEMORY_BUDGET,                  10.13, 11.0)
 	MVK_EXTENSION_MIN_OS(EXT_POST_DEPTH_COVERAGE,            10.16, 11.0)
 	MVK_EXTENSION_MIN_OS(EXT_SHADER_STENCIL_EXPORT,          10.14, 12.0)
+	MVK_EXTENSION_MIN_OS(EXT_SUBGROUP_SIZE_CONTROL,          10.14, 13.0)
 	MVK_EXTENSION_MIN_OS(EXT_TEXEL_BUFFER_ALIGNMENT,         10.13, 11.0)
 	MVK_EXTENSION_MIN_OS(EXT_TEXTURE_COMPRESSION_ASTC_HDR,   10.16, 13.0)
 	MVK_EXTENSION_MIN_OS(AMD_SHADER_TRINARY_MINMAX,          10.14, 12.0)