Support the VK_EXT_subgroup_size_control extension.

This extension allows the subgroup size to vary between draw/dispatch
calls, and even allows clients to declare that full subgroups must
always be dispatched. This model corresponds more closely to how Metal
actually works.

Unfortunately, declaring a required subgroup size is not supported, so
`requiredSubgroupSizeStages` is reported as 0.
This commit is contained in:
Chip Davis 2020-11-28 17:31:49 -06:00
parent 9ec67edc42
commit e0e5d3ce28
7 changed files with 69 additions and 16 deletions

View File

@ -290,10 +290,11 @@ In addition to core *Vulkan* functionality, **MoltenVK** also supports the foll
- `VK_KHR_portability_subset`
- `VK_KHR_push_descriptor`
- `VK_KHR_relaxed_block_layout`
- `VK_KHR_sampler_mirror_clamp_to_edge` *(macOS)*
- `VK_KHR_sampler_mirror_clamp_to_edge` *(requires a Mac GPU or Apple family 7 GPU)*
- `VK_KHR_sampler_ycbcr_conversion`
- `VK_KHR_shader_draw_parameters`
- `VK_KHR_shader_float16_int8`
- `VK_KHR_shader_subgroup_extended_types` *(requires Metal 2.1 on Mac or Metal 2.2 and Apple family 4 on iOS)*
- `VK_KHR_storage_buffer_storage_class`
- `VK_KHR_surface`
- `VK_KHR_swapchain`
@ -317,6 +318,7 @@ In addition to core *Vulkan* functionality, **MoltenVK** also supports the foll
- `VK_EXT_scalar_block_layout`
- `VK_EXT_shader_stencil_export` *(requires Mac GPU family 2 or iOS GPU family 5)*
- `VK_EXT_shader_viewport_index_layer`
- `VK_EXT_subgroup_size_control` *(requires Metal 2.1 on Mac or Metal 2.2 and Apple family 4 on iOS)*
- `VK_EXT_swapchain_colorspace`
- `VK_EXT_vertex_attribute_divisor`
- `VK_EXT_texel_buffer_alignment` *(requires Metal 2.0)*

View File

@ -19,10 +19,12 @@ MoltenVK 1.1.1
Released TBD
- Add support for extensions:
- `VK_KHR_sampler_mirror_clamp_to_edge` (iOS)
- `VK_KHR_timeline_semaphore`
- `VK_EXT_descriptor_indexing` (initial release limited to Metal Tier 1: 96/128 textures, 16 samplers)
- `VK_EXT_post_depth_coverage` (macOS)
- `VK_EXT_private_data`
- `VK_EXT_subgroup_size_control`
- `VK_EXT_texture_compression_astc_hdr`
- `VK_AMD_shader_image_load_store` (macOS)
- `VK_IMG_format_pvrtc` (macOS)

View File

@ -616,7 +616,7 @@ typedef struct {
VkBool32 placementHeaps; /**< If true, MTLHeap objects support placement of resources. */
VkDeviceSize pushConstantSizeAlignment; /**< The alignment used internally when allocating memory for push constants. Must be PoT. */
uint32_t maxTextureLayers; /**< The maximum number of layers in an array texture. */
uint32_t subgroupSize; /**< The number of threads in a SIMD-group. */
uint32_t maxSubgroupSize; /**< The maximum number of threads in a SIMD-group. */
VkDeviceSize vertexStrideAlignment; /**< The alignment used for the stride of vertex attribute bindings. */
VkBool32 indirectTessellationDrawing; /**< If true, tessellation draw calls support parameters held in a GPU buffer. */
VkBool32 nonUniformThreadgroups; /**< If true, the device supports arbitrary-sized grids in compute workloads. */
@ -634,6 +634,7 @@ typedef struct {
VkBool32 quadPermute; /**< If true, quadgroup permutation functions (vote, ballot, shuffle) are supported in shaders. */
VkBool32 simdPermute; /**< If true, SIMD-group permutation functions (vote, ballot, shuffle) are supported in shaders. */
VkBool32 simdReduction; /**< If true, SIMD-group reduction functions (arithmetic) are supported in shaders. */
uint32_t minSubgroupSize; /**< The minimum number of threads in a SIMD-group. */
} MVKPhysicalDeviceMetalFeatures;
/** MoltenVK performance of a particular type of activity. */

View File

@ -53,6 +53,14 @@ using namespace std;
#define supportsMTLFeatureSet(MFS) [_mtlDevice supportsFeatureSet: MTLFeatureSet_ ##MFS]
#define supportsMTLGPUFamily(GPUF) ([_mtlDevice respondsToSelector: @selector(supportsFamily:)] && [_mtlDevice supportsFamily: MTLGPUFamily ##GPUF])
static const uint32_t kAMDVendorId = 0x1002;
static const uint32_t kAppleVendorId = 0x106b;
static const uint32_t kIntelVendorId = 0x8086;
static const uint32_t kNVVendorId = 0x10de;
static const uint32_t kAMDRadeonRX5700XTDeviceId = 0x731f;
static const uint32_t kAMDRadeonRX5500XTDeviceId = 0x7340;
#pragma mark -
#pragma mark MVKPhysicalDevice
@ -192,6 +200,12 @@ void MVKPhysicalDevice::getFeatures(VkPhysicalDeviceFeatures2* features) {
scalarLayoutFeatures->scalarBlockLayout = true;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT: {
auto* subgroupSizeFeatures = (VkPhysicalDeviceSubgroupSizeControlFeaturesEXT*)next;
subgroupSizeFeatures->subgroupSizeControl = _metalFeatures.simdPermute || _metalFeatures.quadPermute;
subgroupSizeFeatures->computeFullSubgroups = _metalFeatures.simdPermute || _metalFeatures.quadPermute;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: {
auto* texelBuffAlignFeatures = (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT*)next;
texelBuffAlignFeatures->texelBufferAlignment = _metalFeatures.texelBuffers && [_mtlDevice respondsToSelector: @selector(minimumLinearTextureAlignmentForPixelFormat:)];
@ -319,7 +333,7 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) {
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: {
auto* subgroupProps = (VkPhysicalDeviceSubgroupProperties*)next;
subgroupProps->subgroupSize = _metalFeatures.subgroupSize;
subgroupProps->subgroupSize = _metalFeatures.maxSubgroupSize;
subgroupProps->supportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
if (_features.tessellationShader) {
subgroupProps->supportedStages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
@ -392,6 +406,14 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) {
robustness2Props->robustUniformBufferAccessSizeAlignment = 1;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT: {
auto* subgroupSizeProps = (VkPhysicalDeviceSubgroupSizeControlPropertiesEXT*)next;
subgroupSizeProps->minSubgroupSize = _metalFeatures.minSubgroupSize;
subgroupSizeProps->maxSubgroupSize = _metalFeatures.maxSubgroupSize;
subgroupSizeProps->maxComputeWorkgroupSubgroups = _properties.limits.maxComputeWorkGroupInvocations / _metalFeatures.minSubgroupSize;
subgroupSizeProps->requiredSubgroupSizeStages = 0;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: {
auto* texelBuffAlignProps = (VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT*)next;
// Save the 'next' pointer; we'll unintentionally overwrite it
@ -1435,18 +1457,43 @@ void MVKPhysicalDevice::initMetalFeatures() {
}
}
_metalFeatures.subgroupSize = 1;
_metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize = 1;
#if MVK_MACOS
if (_metalFeatures.simdPermute) {
static const uint32_t kAMDVendorId = 0x1002;
_metalFeatures.subgroupSize = (_properties.vendorID == kAMDVendorId) ? 64 : 32;
// Based on data from Sascha Willems' Vulkan Hardware Database.
// This would be a lot easier and less painful if MTLDevice had properties for this...
_metalFeatures.maxSubgroupSize = (_properties.vendorID == kAMDVendorId) ? 64 : 32;
switch (_properties.vendorID) {
case kIntelVendorId:
_metalFeatures.minSubgroupSize = 8;
break;
case kAMDVendorId:
switch (_properties.deviceID) {
case kAMDRadeonRX5700XTDeviceId:
case kAMDRadeonRX5500XTDeviceId:
_metalFeatures.minSubgroupSize = 32;
break;
default:
_metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize;
break;
}
break;
case kAppleVendorId:
// XXX Minimum thread execution width for Apple GPUs is unknown, but assumed to be 4. May be greater.
_metalFeatures.minSubgroupSize = 4;
break;
default:
_metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize;
break;
}
}
#endif
#if MVK_IOS
if (_metalFeatures.simdPermute) {
_metalFeatures.subgroupSize = 32;
_metalFeatures.minSubgroupSize = 4;
_metalFeatures.maxSubgroupSize = 32;
} else if (_metalFeatures.quadPermute) {
_metalFeatures.subgroupSize = 4;
_metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize = 4;
}
#endif
@ -1930,8 +1977,6 @@ void MVKPhysicalDevice::initLimits() {
_properties.limits.lineWidthRange[1] = 1;
_properties.limits.lineWidthGranularity = 1;
static const uint32_t kIntelVendorId = 0x8086;
static const uint32_t kNVVendorId = 0x10de;
_properties.limits.standardSampleLocations = VK_TRUE;
_properties.limits.strictLines = _properties.vendorID == kIntelVendorId || _properties.vendorID == kNVVendorId;
@ -2038,7 +2083,6 @@ static uint32_t mvkGetEntryProperty(io_registry_entry_t entry, CFStringRef prope
void MVKPhysicalDevice::initGPUInfoProperties() {
static const uint32_t kIntelVendorId = 0x8086;
bool isFound = false;
bool isIntegrated = _mtlDevice.isLowPower;
@ -2048,7 +2092,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() {
if (supportsMTLGPUFamily(Apple5)) {
// This is an Apple GPU. It won't have a 'device-id' property, so fill it in
// like on iOS/tvOS.
_properties.vendorID = 0x106b; // Apple's PCI ID
_properties.vendorID = kAppleVendorId;
#if MVK_MACOS_APPLE_SILICON
if (supportsMTLGPUFamily(Apple7)) {
_properties.deviceID = 0xa140;
@ -2130,7 +2174,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() {
devID = coreCnt > 2 ? 0xa081 : 0xa080;
}
_properties.vendorID = 0x0000106b; // Apple's PCI ID
_properties.vendorID = kAppleVendorId;
_properties.deviceID = devID;
_properties.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU;
strlcpy(_properties.deviceName, _mtlDevice.name.UTF8String, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE);
@ -2147,7 +2191,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() {
devID = 0xa101;
}
_properties.vendorID = 0x0000106b; // Apple's PCI ID
_properties.vendorID = kAppleVendorId;
_properties.deviceID = devID;
_properties.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU;
strlcpy(_properties.deviceName, _mtlDevice.name.UTF8String, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE);

View File

@ -1000,6 +1000,7 @@ bool MVKGraphicsPipeline::addTessCtlShaderToPipeline(MTLComputePipelineDescripto
shaderContext.options.mslOptions.buffer_size_buffer_index = _bufferSizeBufferIndex.stages[kMVKShaderStageTessCtl];
shaderContext.options.mslOptions.capture_output_to_buffer = true;
shaderContext.options.mslOptions.multi_patch_workgroup = true;
shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(_pTessCtlSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize;
addPrevStageOutputToShaderConverterContext(shaderContext, vtxOutputs);
MVKMTLFunction func = ((MVKShaderModule*)_pTessCtlSS->module)->getMTLFunction(&shaderContext, _pTessCtlSS->pSpecializationInfo, _pipelineCache);
@ -1090,6 +1091,7 @@ bool MVKGraphicsPipeline::addFragmentShaderToPipeline(MTLRenderPipelineDescripto
shaderContext.options.mslOptions.view_mask_buffer_index = _viewRangeBufferIndex.stages[kMVKShaderStageFragment];
shaderContext.options.entryPointName = _pFragmentSS->pName;
shaderContext.options.mslOptions.capture_output_to_buffer = false;
shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(_pFragmentSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize;
if (pCreateInfo->pMultisampleState) {
if (pCreateInfo->pMultisampleState->pSampleMask && pCreateInfo->pMultisampleState->pSampleMask[0] != 0xffffffff) {
shaderContext.options.mslOptions.additional_fixed_sample_mask = pCreateInfo->pMultisampleState->pSampleMask[0];
@ -1484,7 +1486,6 @@ void MVKGraphicsPipeline::initMVKShaderConverterContext(SPIRVToMSLConversionConf
shaderContext.options.mslOptions.multiview = mvkRendPass->isMultiview();
shaderContext.options.mslOptions.multiview_layered_rendering = getDevice()->getPhysicalDevice()->canUseInstancingForMultiview();
shaderContext.options.mslOptions.view_index_from_device_index = mvkAreAllFlagsEnabled(pCreateInfo->flags, VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT);
shaderContext.options.mslOptions.fixed_subgroup_size = _device->_pMetalFeatures->subgroupSize;
#if MVK_MACOS
shaderContext.options.mslOptions.emulate_subgroups = !_device->_pMetalFeatures->simdPermute;
#endif
@ -1647,6 +1648,7 @@ MVKComputePipeline::MVKComputePipeline(MVKDevice* device,
MTLComputePipelineDescriptor* plDesc = [MTLComputePipelineDescriptor new]; // temp retain
plDesc.computeFunction = mtlFunc;
plDesc.maxTotalThreadsPerThreadgroup = _mtlThreadgroupSize.width * _mtlThreadgroupSize.height * _mtlThreadgroupSize.depth;
plDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = mvkIsAnyFlagEnabled(pCreateInfo->stage.flags, VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);
// Metal does not allow the name of the pipeline to be changed after it has been created,
// and we need to create the Metal pipeline immediately to provide error feedback to app.
@ -1690,7 +1692,7 @@ MVKMTLFunction MVKComputePipeline::getMTLFunction(const VkComputePipelineCreateI
shaderContext.options.mslOptions.texture_buffer_native = _device->_pMetalFeatures->textureBuffers;
shaderContext.options.mslOptions.dispatch_base = _allowsDispatchBase;
shaderContext.options.mslOptions.texture_1D_as_2D = mvkTreatTexture1DAs2D();
shaderContext.options.mslOptions.fixed_subgroup_size = _device->_pMetalFeatures->subgroupSize;
shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(pSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize;
#if MVK_MACOS
shaderContext.options.mslOptions.emulate_subgroups = !_device->_pMetalFeatures->simdPermute;
#endif

View File

@ -95,6 +95,7 @@ MVK_EXTENSION(EXT_robustness2, EXT_ROBUSTNESS_2, DEVICE)
MVK_EXTENSION(EXT_scalar_block_layout, EXT_SCALAR_BLOCK_LAYOUT, DEVICE)
MVK_EXTENSION(EXT_shader_stencil_export, EXT_SHADER_STENCIL_EXPORT, DEVICE)
MVK_EXTENSION(EXT_shader_viewport_index_layer, EXT_SHADER_VIEWPORT_INDEX_LAYER, DEVICE)
MVK_EXTENSION(EXT_subgroup_size_control, EXT_SUBGROUP_SIZE_CONTROL, DEVICE)
MVK_EXTENSION(EXT_swapchain_colorspace, EXT_SWAPCHAIN_COLOR_SPACE, INSTANCE)
MVK_EXTENSION(EXT_texel_buffer_alignment, EXT_TEXEL_BUFFER_ALIGNMENT, DEVICE)
MVK_EXTENSION(EXT_texture_compression_astc_hdr, EXT_TEXTURE_COMPRESSION_ASTC_HDR, DEVICE)

View File

@ -69,6 +69,7 @@ static bool mvkIsSupportedOnPlatform(VkExtensionProperties* pProperties) {
MVK_EXTENSION_MIN_OS(EXT_MEMORY_BUDGET, 10.13, 11.0)
MVK_EXTENSION_MIN_OS(EXT_POST_DEPTH_COVERAGE, 10.16, 11.0)
MVK_EXTENSION_MIN_OS(EXT_SHADER_STENCIL_EXPORT, 10.14, 12.0)
MVK_EXTENSION_MIN_OS(EXT_SUBGROUP_SIZE_CONTROL, 10.14, 13.0)
MVK_EXTENSION_MIN_OS(EXT_TEXEL_BUFFER_ALIGNMENT, 10.13, 11.0)
MVK_EXTENSION_MIN_OS(EXT_TEXTURE_COMPRESSION_ASTC_HDR, 10.16, 13.0)
MVK_EXTENSION_MIN_OS(AMD_SHADER_TRINARY_MINMAX, 10.14, 12.0)