Support the VK_EXT_subgroup_size_control extension.
This extension allows the subgroup size to vary between draw/dispatch calls, and even allows clients to declare that full subgroups must always be dispatched. This model corresponds better to how Metal actually works. Unfortunately, declaring a required subgroup size is not supported.
This commit is contained in:
parent
9ec67edc42
commit
e0e5d3ce28
@ -290,10 +290,11 @@ In addition to core *Vulkan* functionality, **MoltenVK** also supports the foll
|
||||
- `VK_KHR_portability_subset`
|
||||
- `VK_KHR_push_descriptor`
|
||||
- `VK_KHR_relaxed_block_layout`
|
||||
- `VK_KHR_sampler_mirror_clamp_to_edge` *(macOS)*
|
||||
- `VK_KHR_sampler_mirror_clamp_to_edge` *(requires a Mac GPU or Apple family 7 GPU)*
|
||||
- `VK_KHR_sampler_ycbcr_conversion`
|
||||
- `VK_KHR_shader_draw_parameters`
|
||||
- `VK_KHR_shader_float16_int8`
|
||||
- `VK_KHR_shader_subgroup_extended_types` *(requires Metal 2.1 on Mac or Metal 2.2 and Apple family 4 on iOS)*
|
||||
- `VK_KHR_storage_buffer_storage_class`
|
||||
- `VK_KHR_surface`
|
||||
- `VK_KHR_swapchain`
|
||||
@ -317,6 +318,7 @@ In addition to core *Vulkan* functionality, **MoltenVK** also supports the foll
|
||||
- `VK_EXT_scalar_block_layout`
|
||||
- `VK_EXT_shader_stencil_export` *(requires Mac GPU family 2 or iOS GPU family 5)*
|
||||
- `VK_EXT_shader_viewport_index_layer`
|
||||
- `VK_EXT_subgroup_size_control` *(requires Metal 2.1 on Mac or Metal 2.2 and Apple family 4 on iOS)*
|
||||
- `VK_EXT_swapchain_colorspace`
|
||||
- `VK_EXT_vertex_attribute_divisor`
|
||||
- `VK_EXT_texel_buffer_alignment` *(requires Metal 2.0)*
|
||||
|
@ -19,10 +19,12 @@ MoltenVK 1.1.1
|
||||
Released TBD
|
||||
|
||||
- Add support for extensions:
|
||||
- `VK_KHR_sampler_mirror_clamp_to_edge` (iOS)
|
||||
- `VK_KHR_timeline_semaphore`
|
||||
- `VK_EXT_descriptor_indexing` (initial release limited to Metal Tier 1: 96/128 textures, 16 samplers)
|
||||
- `VK_EXT_post_depth_coverage` (macOS)
|
||||
- `VK_EXT_private_data`
|
||||
- `VK_EXT_subgroup_size_control`
|
||||
- `VK_EXT_texture_compression_astc_hdr`
|
||||
- `VK_AMD_shader_image_load_store` (macOS)
|
||||
- `VK_IMG_format_pvrtc` (macOS)
|
||||
|
@ -616,7 +616,7 @@ typedef struct {
|
||||
VkBool32 placementHeaps; /**< If true, MTLHeap objects support placement of resources. */
|
||||
VkDeviceSize pushConstantSizeAlignment; /**< The alignment used internally when allocating memory for push constants. Must be PoT. */
|
||||
uint32_t maxTextureLayers; /**< The maximum number of layers in an array texture. */
|
||||
uint32_t subgroupSize; /**< The number of threads in a SIMD-group. */
|
||||
uint32_t maxSubgroupSize; /**< The maximum number of threads in a SIMD-group. */
|
||||
VkDeviceSize vertexStrideAlignment; /**< The alignment used for the stride of vertex attribute bindings. */
|
||||
VkBool32 indirectTessellationDrawing; /**< If true, tessellation draw calls support parameters held in a GPU buffer. */
|
||||
VkBool32 nonUniformThreadgroups; /**< If true, the device supports arbitrary-sized grids in compute workloads. */
|
||||
@ -634,6 +634,7 @@ typedef struct {
|
||||
VkBool32 quadPermute; /**< If true, quadgroup permutation functions (vote, ballot, shuffle) are supported in shaders. */
|
||||
VkBool32 simdPermute; /**< If true, SIMD-group permutation functions (vote, ballot, shuffle) are supported in shaders. */
|
||||
VkBool32 simdReduction; /**< If true, SIMD-group reduction functions (arithmetic) are supported in shaders. */
|
||||
uint32_t minSubgroupSize; /**< The minimum number of threads in a SIMD-group. */
|
||||
} MVKPhysicalDeviceMetalFeatures;
|
||||
|
||||
/** MoltenVK performance of a particular type of activity. */
|
||||
|
@ -53,6 +53,14 @@ using namespace std;
|
||||
#define supportsMTLFeatureSet(MFS) [_mtlDevice supportsFeatureSet: MTLFeatureSet_ ##MFS]
|
||||
#define supportsMTLGPUFamily(GPUF) ([_mtlDevice respondsToSelector: @selector(supportsFamily:)] && [_mtlDevice supportsFamily: MTLGPUFamily ##GPUF])
|
||||
|
||||
static const uint32_t kAMDVendorId = 0x1002;
|
||||
static const uint32_t kAppleVendorId = 0x106b;
|
||||
static const uint32_t kIntelVendorId = 0x8086;
|
||||
static const uint32_t kNVVendorId = 0x10de;
|
||||
|
||||
static const uint32_t kAMDRadeonRX5700XTDeviceId = 0x731f;
|
||||
static const uint32_t kAMDRadeonRX5500XTDeviceId = 0x7340;
|
||||
|
||||
|
||||
#pragma mark -
|
||||
#pragma mark MVKPhysicalDevice
|
||||
@ -192,6 +200,12 @@ void MVKPhysicalDevice::getFeatures(VkPhysicalDeviceFeatures2* features) {
|
||||
scalarLayoutFeatures->scalarBlockLayout = true;
|
||||
break;
|
||||
}
|
||||
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT: {
|
||||
auto* subgroupSizeFeatures = (VkPhysicalDeviceSubgroupSizeControlFeaturesEXT*)next;
|
||||
subgroupSizeFeatures->subgroupSizeControl = _metalFeatures.simdPermute || _metalFeatures.quadPermute;
|
||||
subgroupSizeFeatures->computeFullSubgroups = _metalFeatures.simdPermute || _metalFeatures.quadPermute;
|
||||
break;
|
||||
}
|
||||
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: {
|
||||
auto* texelBuffAlignFeatures = (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT*)next;
|
||||
texelBuffAlignFeatures->texelBufferAlignment = _metalFeatures.texelBuffers && [_mtlDevice respondsToSelector: @selector(minimumLinearTextureAlignmentForPixelFormat:)];
|
||||
@ -319,7 +333,7 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) {
|
||||
}
|
||||
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: {
|
||||
auto* subgroupProps = (VkPhysicalDeviceSubgroupProperties*)next;
|
||||
subgroupProps->subgroupSize = _metalFeatures.subgroupSize;
|
||||
subgroupProps->subgroupSize = _metalFeatures.maxSubgroupSize;
|
||||
subgroupProps->supportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
|
||||
if (_features.tessellationShader) {
|
||||
subgroupProps->supportedStages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
|
||||
@ -392,6 +406,14 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) {
|
||||
robustness2Props->robustUniformBufferAccessSizeAlignment = 1;
|
||||
break;
|
||||
}
|
||||
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT: {
|
||||
auto* subgroupSizeProps = (VkPhysicalDeviceSubgroupSizeControlPropertiesEXT*)next;
|
||||
subgroupSizeProps->minSubgroupSize = _metalFeatures.minSubgroupSize;
|
||||
subgroupSizeProps->maxSubgroupSize = _metalFeatures.maxSubgroupSize;
|
||||
subgroupSizeProps->maxComputeWorkgroupSubgroups = _properties.limits.maxComputeWorkGroupInvocations / _metalFeatures.minSubgroupSize;
|
||||
subgroupSizeProps->requiredSubgroupSizeStages = 0;
|
||||
break;
|
||||
}
|
||||
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: {
|
||||
auto* texelBuffAlignProps = (VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT*)next;
|
||||
// Save the 'next' pointer; we'll unintentionally overwrite it
|
||||
@ -1435,18 +1457,43 @@ void MVKPhysicalDevice::initMetalFeatures() {
|
||||
}
|
||||
}
|
||||
|
||||
_metalFeatures.subgroupSize = 1;
|
||||
_metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize = 1;
|
||||
#if MVK_MACOS
|
||||
if (_metalFeatures.simdPermute) {
|
||||
static const uint32_t kAMDVendorId = 0x1002;
|
||||
_metalFeatures.subgroupSize = (_properties.vendorID == kAMDVendorId) ? 64 : 32;
|
||||
// Based on data from Sascha Willems' Vulkan Hardware Database.
|
||||
// This would be a lot easier and less painful if MTLDevice had properties for this...
|
||||
_metalFeatures.maxSubgroupSize = (_properties.vendorID == kAMDVendorId) ? 64 : 32;
|
||||
switch (_properties.vendorID) {
|
||||
case kIntelVendorId:
|
||||
_metalFeatures.minSubgroupSize = 8;
|
||||
break;
|
||||
case kAMDVendorId:
|
||||
switch (_properties.deviceID) {
|
||||
case kAMDRadeonRX5700XTDeviceId:
|
||||
case kAMDRadeonRX5500XTDeviceId:
|
||||
_metalFeatures.minSubgroupSize = 32;
|
||||
break;
|
||||
default:
|
||||
_metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case kAppleVendorId:
|
||||
// XXX Minimum thread execution width for Apple GPUs is unknown, but assumed to be 4. May be greater.
|
||||
_metalFeatures.minSubgroupSize = 4;
|
||||
break;
|
||||
default:
|
||||
_metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize;
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if MVK_IOS
|
||||
if (_metalFeatures.simdPermute) {
|
||||
_metalFeatures.subgroupSize = 32;
|
||||
_metalFeatures.minSubgroupSize = 4;
|
||||
_metalFeatures.maxSubgroupSize = 32;
|
||||
} else if (_metalFeatures.quadPermute) {
|
||||
_metalFeatures.subgroupSize = 4;
|
||||
_metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize = 4;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -1930,8 +1977,6 @@ void MVKPhysicalDevice::initLimits() {
|
||||
_properties.limits.lineWidthRange[1] = 1;
|
||||
_properties.limits.lineWidthGranularity = 1;
|
||||
|
||||
static const uint32_t kIntelVendorId = 0x8086;
|
||||
static const uint32_t kNVVendorId = 0x10de;
|
||||
_properties.limits.standardSampleLocations = VK_TRUE;
|
||||
_properties.limits.strictLines = _properties.vendorID == kIntelVendorId || _properties.vendorID == kNVVendorId;
|
||||
|
||||
@ -2038,7 +2083,6 @@ static uint32_t mvkGetEntryProperty(io_registry_entry_t entry, CFStringRef prope
|
||||
|
||||
void MVKPhysicalDevice::initGPUInfoProperties() {
|
||||
|
||||
static const uint32_t kIntelVendorId = 0x8086;
|
||||
bool isFound = false;
|
||||
|
||||
bool isIntegrated = _mtlDevice.isLowPower;
|
||||
@ -2048,7 +2092,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() {
|
||||
if (supportsMTLGPUFamily(Apple5)) {
|
||||
// This is an Apple GPU. It won't have a 'device-id' property, so fill it in
|
||||
// like on iOS/tvOS.
|
||||
_properties.vendorID = 0x106b; // Apple's PCI ID
|
||||
_properties.vendorID = kAppleVendorId;
|
||||
#if MVK_MACOS_APPLE_SILICON
|
||||
if (supportsMTLGPUFamily(Apple7)) {
|
||||
_properties.deviceID = 0xa140;
|
||||
@ -2130,7 +2174,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() {
|
||||
devID = coreCnt > 2 ? 0xa081 : 0xa080;
|
||||
}
|
||||
|
||||
_properties.vendorID = 0x0000106b; // Apple's PCI ID
|
||||
_properties.vendorID = kAppleVendorId;
|
||||
_properties.deviceID = devID;
|
||||
_properties.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU;
|
||||
strlcpy(_properties.deviceName, _mtlDevice.name.UTF8String, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE);
|
||||
@ -2147,7 +2191,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() {
|
||||
devID = 0xa101;
|
||||
}
|
||||
|
||||
_properties.vendorID = 0x0000106b; // Apple's PCI ID
|
||||
_properties.vendorID = kAppleVendorId;
|
||||
_properties.deviceID = devID;
|
||||
_properties.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU;
|
||||
strlcpy(_properties.deviceName, _mtlDevice.name.UTF8String, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE);
|
||||
|
@ -1000,6 +1000,7 @@ bool MVKGraphicsPipeline::addTessCtlShaderToPipeline(MTLComputePipelineDescripto
|
||||
shaderContext.options.mslOptions.buffer_size_buffer_index = _bufferSizeBufferIndex.stages[kMVKShaderStageTessCtl];
|
||||
shaderContext.options.mslOptions.capture_output_to_buffer = true;
|
||||
shaderContext.options.mslOptions.multi_patch_workgroup = true;
|
||||
shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(_pTessCtlSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize;
|
||||
addPrevStageOutputToShaderConverterContext(shaderContext, vtxOutputs);
|
||||
|
||||
MVKMTLFunction func = ((MVKShaderModule*)_pTessCtlSS->module)->getMTLFunction(&shaderContext, _pTessCtlSS->pSpecializationInfo, _pipelineCache);
|
||||
@ -1090,6 +1091,7 @@ bool MVKGraphicsPipeline::addFragmentShaderToPipeline(MTLRenderPipelineDescripto
|
||||
shaderContext.options.mslOptions.view_mask_buffer_index = _viewRangeBufferIndex.stages[kMVKShaderStageFragment];
|
||||
shaderContext.options.entryPointName = _pFragmentSS->pName;
|
||||
shaderContext.options.mslOptions.capture_output_to_buffer = false;
|
||||
shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(_pFragmentSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize;
|
||||
if (pCreateInfo->pMultisampleState) {
|
||||
if (pCreateInfo->pMultisampleState->pSampleMask && pCreateInfo->pMultisampleState->pSampleMask[0] != 0xffffffff) {
|
||||
shaderContext.options.mslOptions.additional_fixed_sample_mask = pCreateInfo->pMultisampleState->pSampleMask[0];
|
||||
@ -1484,7 +1486,6 @@ void MVKGraphicsPipeline::initMVKShaderConverterContext(SPIRVToMSLConversionConf
|
||||
shaderContext.options.mslOptions.multiview = mvkRendPass->isMultiview();
|
||||
shaderContext.options.mslOptions.multiview_layered_rendering = getDevice()->getPhysicalDevice()->canUseInstancingForMultiview();
|
||||
shaderContext.options.mslOptions.view_index_from_device_index = mvkAreAllFlagsEnabled(pCreateInfo->flags, VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT);
|
||||
shaderContext.options.mslOptions.fixed_subgroup_size = _device->_pMetalFeatures->subgroupSize;
|
||||
#if MVK_MACOS
|
||||
shaderContext.options.mslOptions.emulate_subgroups = !_device->_pMetalFeatures->simdPermute;
|
||||
#endif
|
||||
@ -1647,6 +1648,7 @@ MVKComputePipeline::MVKComputePipeline(MVKDevice* device,
|
||||
MTLComputePipelineDescriptor* plDesc = [MTLComputePipelineDescriptor new]; // temp retain
|
||||
plDesc.computeFunction = mtlFunc;
|
||||
plDesc.maxTotalThreadsPerThreadgroup = _mtlThreadgroupSize.width * _mtlThreadgroupSize.height * _mtlThreadgroupSize.depth;
|
||||
plDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = mvkIsAnyFlagEnabled(pCreateInfo->stage.flags, VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);
|
||||
|
||||
// Metal does not allow the name of the pipeline to be changed after it has been created,
|
||||
// and we need to create the Metal pipeline immediately to provide error feedback to app.
|
||||
@ -1690,7 +1692,7 @@ MVKMTLFunction MVKComputePipeline::getMTLFunction(const VkComputePipelineCreateI
|
||||
shaderContext.options.mslOptions.texture_buffer_native = _device->_pMetalFeatures->textureBuffers;
|
||||
shaderContext.options.mslOptions.dispatch_base = _allowsDispatchBase;
|
||||
shaderContext.options.mslOptions.texture_1D_as_2D = mvkTreatTexture1DAs2D();
|
||||
shaderContext.options.mslOptions.fixed_subgroup_size = _device->_pMetalFeatures->subgroupSize;
|
||||
shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(pSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize;
|
||||
#if MVK_MACOS
|
||||
shaderContext.options.mslOptions.emulate_subgroups = !_device->_pMetalFeatures->simdPermute;
|
||||
#endif
|
||||
|
@ -95,6 +95,7 @@ MVK_EXTENSION(EXT_robustness2, EXT_ROBUSTNESS_2, DEVICE)
|
||||
MVK_EXTENSION(EXT_scalar_block_layout, EXT_SCALAR_BLOCK_LAYOUT, DEVICE)
|
||||
MVK_EXTENSION(EXT_shader_stencil_export, EXT_SHADER_STENCIL_EXPORT, DEVICE)
|
||||
MVK_EXTENSION(EXT_shader_viewport_index_layer, EXT_SHADER_VIEWPORT_INDEX_LAYER, DEVICE)
|
||||
MVK_EXTENSION(EXT_subgroup_size_control, EXT_SUBGROUP_SIZE_CONTROL, DEVICE)
|
||||
MVK_EXTENSION(EXT_swapchain_colorspace, EXT_SWAPCHAIN_COLOR_SPACE, INSTANCE)
|
||||
MVK_EXTENSION(EXT_texel_buffer_alignment, EXT_TEXEL_BUFFER_ALIGNMENT, DEVICE)
|
||||
MVK_EXTENSION(EXT_texture_compression_astc_hdr, EXT_TEXTURE_COMPRESSION_ASTC_HDR, DEVICE)
|
||||
|
@ -69,6 +69,7 @@ static bool mvkIsSupportedOnPlatform(VkExtensionProperties* pProperties) {
|
||||
MVK_EXTENSION_MIN_OS(EXT_MEMORY_BUDGET, 10.13, 11.0)
|
||||
MVK_EXTENSION_MIN_OS(EXT_POST_DEPTH_COVERAGE, 10.16, 11.0)
|
||||
MVK_EXTENSION_MIN_OS(EXT_SHADER_STENCIL_EXPORT, 10.14, 12.0)
|
||||
MVK_EXTENSION_MIN_OS(EXT_SUBGROUP_SIZE_CONTROL, 10.14, 13.0)
|
||||
MVK_EXTENSION_MIN_OS(EXT_TEXEL_BUFFER_ALIGNMENT, 10.13, 11.0)
|
||||
MVK_EXTENSION_MIN_OS(EXT_TEXTURE_COMPRESSION_ASTC_HDR, 10.16, 13.0)
|
||||
MVK_EXTENSION_MIN_OS(AMD_SHADER_TRINARY_MINMAX, 10.14, 12.0)
|
||||
|
Loading…
x
Reference in New Issue
Block a user