diff --git a/Docs/MoltenVK_Runtime_UserGuide.md b/Docs/MoltenVK_Runtime_UserGuide.md index 49881456..3b110afd 100644 --- a/Docs/MoltenVK_Runtime_UserGuide.md +++ b/Docs/MoltenVK_Runtime_UserGuide.md @@ -290,10 +290,11 @@ In addition to core *Vulkan* functionality, **MoltenVK** also supports the foll - `VK_KHR_portability_subset` - `VK_KHR_push_descriptor` - `VK_KHR_relaxed_block_layout` -- `VK_KHR_sampler_mirror_clamp_to_edge` *(macOS)* +- `VK_KHR_sampler_mirror_clamp_to_edge` *(requires a Mac GPU or Apple family 7 GPU)* - `VK_KHR_sampler_ycbcr_conversion` - `VK_KHR_shader_draw_parameters` - `VK_KHR_shader_float16_int8` +- `VK_KHR_shader_subgroup_extended_types` *(requires Metal 2.1 on Mac or Metal 2.2 and Apple family 4 on iOS)* - `VK_KHR_storage_buffer_storage_class` - `VK_KHR_surface` - `VK_KHR_swapchain` @@ -317,6 +318,7 @@ In addition to core *Vulkan* functionality, **MoltenVK** also supports the foll - `VK_EXT_scalar_block_layout` - `VK_EXT_shader_stencil_export` *(requires Mac GPU family 2 or iOS GPU family 5)* - `VK_EXT_shader_viewport_index_layer` +- `VK_EXT_subgroup_size_control` *(requires Metal 2.1 on Mac or Metal 2.2 and Apple family 4 on iOS)* - `VK_EXT_swapchain_colorspace` - `VK_EXT_vertex_attribute_divisor` - `VK_EXT_texel_buffer_alignment` *(requires Metal 2.0)* diff --git a/Docs/Whats_New.md b/Docs/Whats_New.md index 330df3bf..0d5b1896 100644 --- a/Docs/Whats_New.md +++ b/Docs/Whats_New.md @@ -19,10 +19,12 @@ MoltenVK 1.1.1 Released TBD - Add support for extensions: + - `VK_KHR_sampler_mirror_clamp_to_edge` (iOS) - `VK_KHR_timeline_semaphore` - `VK_EXT_descriptor_indexing` (initial release limited to Metal Tier 1: 96/128 textures, 16 samplers) - `VK_EXT_post_depth_coverage` (macOS) - `VK_EXT_private_data` + - `VK_EXT_subgroup_size_control` - `VK_EXT_texture_compression_astc_hdr` - `VK_AMD_shader_image_load_store` (macOS) - `VK_IMG_format_pvrtc` (macOS) diff --git a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h index 992c891d..47f20dec 100644 --- a/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h +++ b/MoltenVK/MoltenVK/API/vk_mvk_moltenvk.h @@ -616,7 +616,7 @@ typedef struct { VkBool32 placementHeaps; /**< If true, MTLHeap objects support placement of resources. */ VkDeviceSize pushConstantSizeAlignment; /**< The alignment used internally when allocating memory for push constants. Must be PoT. */ uint32_t maxTextureLayers; /**< The maximum number of layers in an array texture. */ - uint32_t subgroupSize; /**< The number of threads in a SIMD-group. */ + uint32_t maxSubgroupSize; /**< The maximum number of threads in a SIMD-group. */ VkDeviceSize vertexStrideAlignment; /**< The alignment used for the stride of vertex attribute bindings. */ VkBool32 indirectTessellationDrawing; /**< If true, tessellation draw calls support parameters held in a GPU buffer. */ VkBool32 nonUniformThreadgroups; /**< If true, the device supports arbitrary-sized grids in compute workloads. */ @@ -634,6 +634,7 @@ typedef struct { VkBool32 quadPermute; /**< If true, quadgroup permutation functions (vote, ballot, shuffle) are supported in shaders. */ VkBool32 simdPermute; /**< If true, SIMD-group permutation functions (vote, ballot, shuffle) are supported in shaders. */ VkBool32 simdReduction; /**< If true, SIMD-group reduction functions (arithmetic) are supported in shaders. */ + uint32_t minSubgroupSize; /**< The minimum number of threads in a SIMD-group. */ } MVKPhysicalDeviceMetalFeatures; /** MoltenVK performance of a particular type of activity. */ diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm index 70535678..1b575c6d 100644 --- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm +++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm @@ -53,6 +53,14 @@ using namespace std; #define supportsMTLFeatureSet(MFS) [_mtlDevice supportsFeatureSet: MTLFeatureSet_ ##MFS] #define supportsMTLGPUFamily(GPUF) ([_mtlDevice respondsToSelector: @selector(supportsFamily:)] && [_mtlDevice supportsFamily: MTLGPUFamily ##GPUF]) +static const uint32_t kAMDVendorId = 0x1002; +static const uint32_t kAppleVendorId = 0x106b; +static const uint32_t kIntelVendorId = 0x8086; +static const uint32_t kNVVendorId = 0x10de; + +static const uint32_t kAMDRadeonRX5700XTDeviceId = 0x731f; +static const uint32_t kAMDRadeonRX5500XTDeviceId = 0x7340; + #pragma mark - #pragma mark MVKPhysicalDevice @@ -192,6 +200,12 @@ void MVKPhysicalDevice::getFeatures(VkPhysicalDeviceFeatures2* features) { scalarLayoutFeatures->scalarBlockLayout = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT: { + auto* subgroupSizeFeatures = (VkPhysicalDeviceSubgroupSizeControlFeaturesEXT*)next; + subgroupSizeFeatures->subgroupSizeControl = _metalFeatures.simdPermute || _metalFeatures.quadPermute; + subgroupSizeFeatures->computeFullSubgroups = _metalFeatures.simdPermute || _metalFeatures.quadPermute; + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: { auto* texelBuffAlignFeatures = (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT*)next; texelBuffAlignFeatures->texelBufferAlignment = _metalFeatures.texelBuffers && [_mtlDevice respondsToSelector: @selector(minimumLinearTextureAlignmentForPixelFormat:)]; @@ -319,7 +333,7 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) { } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: { auto* subgroupProps = (VkPhysicalDeviceSubgroupProperties*)next; - subgroupProps->subgroupSize = _metalFeatures.subgroupSize; + subgroupProps->subgroupSize = _metalFeatures.maxSubgroupSize; subgroupProps->supportedStages = VK_SHADER_STAGE_COMPUTE_BIT; if (_features.tessellationShader) { subgroupProps->supportedStages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; @@ -392,6 +406,14 @@ void MVKPhysicalDevice::getProperties(VkPhysicalDeviceProperties2* properties) { robustness2Props->robustUniformBufferAccessSizeAlignment = 1; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT: { + auto* subgroupSizeProps = (VkPhysicalDeviceSubgroupSizeControlPropertiesEXT*)next; + subgroupSizeProps->minSubgroupSize = _metalFeatures.minSubgroupSize; + subgroupSizeProps->maxSubgroupSize = _metalFeatures.maxSubgroupSize; + subgroupSizeProps->maxComputeWorkgroupSubgroups = _properties.limits.maxComputeWorkGroupInvocations / _metalFeatures.minSubgroupSize; + subgroupSizeProps->requiredSubgroupSizeStages = 0; + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: { auto* texelBuffAlignProps = (VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT*)next; // Save the 'next' pointer; we'll unintentionally overwrite it @@ -1435,18 +1457,43 @@ void MVKPhysicalDevice::initMetalFeatures() { } } - _metalFeatures.subgroupSize = 1; + _metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize = 1; #if MVK_MACOS if (_metalFeatures.simdPermute) { - static const uint32_t kAMDVendorId = 0x1002; - _metalFeatures.subgroupSize = (_properties.vendorID == kAMDVendorId) ? 64 : 32; + // Based on data from Sascha Willems' Vulkan Hardware Database. + // This would be a lot easier and less painful if MTLDevice had properties for this... + _metalFeatures.maxSubgroupSize = (_properties.vendorID == kAMDVendorId) ? 64 : 32; + switch (_properties.vendorID) { + case kIntelVendorId: + _metalFeatures.minSubgroupSize = 8; + break; + case kAMDVendorId: + switch (_properties.deviceID) { + case kAMDRadeonRX5700XTDeviceId: + case kAMDRadeonRX5500XTDeviceId: + _metalFeatures.minSubgroupSize = 32; + break; + default: + _metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize; + break; + } + break; + case kAppleVendorId: + // XXX Minimum thread execution width for Apple GPUs is unknown, but assumed to be 4. May be greater. + _metalFeatures.minSubgroupSize = 4; + break; + default: + _metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize; + break; + } } #endif #if MVK_IOS if (_metalFeatures.simdPermute) { - _metalFeatures.subgroupSize = 32; + _metalFeatures.minSubgroupSize = 4; + _metalFeatures.maxSubgroupSize = 32; } else if (_metalFeatures.quadPermute) { - _metalFeatures.subgroupSize = 4; + _metalFeatures.minSubgroupSize = _metalFeatures.maxSubgroupSize = 4; } #endif @@ -1930,8 +1977,6 @@ void MVKPhysicalDevice::initLimits() { _properties.limits.lineWidthRange[1] = 1; _properties.limits.lineWidthGranularity = 1; - static const uint32_t kIntelVendorId = 0x8086; - static const uint32_t kNVVendorId = 0x10de; _properties.limits.standardSampleLocations = VK_TRUE; _properties.limits.strictLines = _properties.vendorID == kIntelVendorId || _properties.vendorID == kNVVendorId; @@ -2038,7 +2083,6 @@ static uint32_t mvkGetEntryProperty(io_registry_entry_t entry, CFStringRef prope void MVKPhysicalDevice::initGPUInfoProperties() { - static const uint32_t kIntelVendorId = 0x8086; bool isFound = false; bool isIntegrated = _mtlDevice.isLowPower; @@ -2048,7 +2092,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() { if (supportsMTLGPUFamily(Apple5)) { // This is an Apple GPU. It won't have a 'device-id' property, so fill it in // like on iOS/tvOS. - _properties.vendorID = 0x106b; // Apple's PCI ID + _properties.vendorID = kAppleVendorId; #if MVK_MACOS_APPLE_SILICON if (supportsMTLGPUFamily(Apple7)) { _properties.deviceID = 0xa140; @@ -2130,7 +2174,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() { devID = coreCnt > 2 ? 0xa081 : 0xa080; } - _properties.vendorID = 0x0000106b; // Apple's PCI ID + _properties.vendorID = kAppleVendorId; _properties.deviceID = devID; _properties.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU; strlcpy(_properties.deviceName, _mtlDevice.name.UTF8String, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE); @@ -2147,7 +2191,7 @@ void MVKPhysicalDevice::initGPUInfoProperties() { devID = 0xa101; } - _properties.vendorID = 0x0000106b; // Apple's PCI ID + _properties.vendorID = kAppleVendorId; _properties.deviceID = devID; _properties.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU; strlcpy(_properties.deviceName, _mtlDevice.name.UTF8String, VK_MAX_PHYSICAL_DEVICE_NAME_SIZE); diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm b/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm index cbe42663..7a160fa2 100644 --- a/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm +++ b/MoltenVK/MoltenVK/GPUObjects/MVKPipeline.mm @@ -1000,6 +1000,7 @@ bool MVKGraphicsPipeline::addTessCtlShaderToPipeline(MTLComputePipelineDescripto shaderContext.options.mslOptions.buffer_size_buffer_index = _bufferSizeBufferIndex.stages[kMVKShaderStageTessCtl]; shaderContext.options.mslOptions.capture_output_to_buffer = true; shaderContext.options.mslOptions.multi_patch_workgroup = true; + shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(_pTessCtlSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize; addPrevStageOutputToShaderConverterContext(shaderContext, vtxOutputs); MVKMTLFunction func = ((MVKShaderModule*)_pTessCtlSS->module)->getMTLFunction(&shaderContext, _pTessCtlSS->pSpecializationInfo, _pipelineCache); @@ -1090,6 +1091,7 @@ bool MVKGraphicsPipeline::addFragmentShaderToPipeline(MTLRenderPipelineDescripto shaderContext.options.mslOptions.view_mask_buffer_index = _viewRangeBufferIndex.stages[kMVKShaderStageFragment]; shaderContext.options.entryPointName = _pFragmentSS->pName; shaderContext.options.mslOptions.capture_output_to_buffer = false; + shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(_pFragmentSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize; if (pCreateInfo->pMultisampleState) { if (pCreateInfo->pMultisampleState->pSampleMask && pCreateInfo->pMultisampleState->pSampleMask[0] != 0xffffffff) { shaderContext.options.mslOptions.additional_fixed_sample_mask = pCreateInfo->pMultisampleState->pSampleMask[0]; @@ -1484,7 +1486,6 @@ void MVKGraphicsPipeline::initMVKShaderConverterContext(SPIRVToMSLConversionConf shaderContext.options.mslOptions.multiview = mvkRendPass->isMultiview(); shaderContext.options.mslOptions.multiview_layered_rendering = getDevice()->getPhysicalDevice()->canUseInstancingForMultiview(); shaderContext.options.mslOptions.view_index_from_device_index = mvkAreAllFlagsEnabled(pCreateInfo->flags, VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT); - shaderContext.options.mslOptions.fixed_subgroup_size = _device->_pMetalFeatures->subgroupSize; #if MVK_MACOS shaderContext.options.mslOptions.emulate_subgroups = !_device->_pMetalFeatures->simdPermute; #endif @@ -1647,6 +1648,7 @@ MVKComputePipeline::MVKComputePipeline(MVKDevice* device, MTLComputePipelineDescriptor* plDesc = [MTLComputePipelineDescriptor new]; // temp retain plDesc.computeFunction = mtlFunc; plDesc.maxTotalThreadsPerThreadgroup = _mtlThreadgroupSize.width * _mtlThreadgroupSize.height * _mtlThreadgroupSize.depth; + plDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = mvkIsAnyFlagEnabled(pCreateInfo->stage.flags, VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT); // Metal does not allow the name of the pipeline to be changed after it has been created, // and we need to create the Metal pipeline immediately to provide error feedback to app. @@ -1690,7 +1692,7 @@ MVKMTLFunction MVKComputePipeline::getMTLFunction(const VkComputePipelineCreateI shaderContext.options.mslOptions.texture_buffer_native = _device->_pMetalFeatures->textureBuffers; shaderContext.options.mslOptions.dispatch_base = _allowsDispatchBase; shaderContext.options.mslOptions.texture_1D_as_2D = mvkTreatTexture1DAs2D(); - shaderContext.options.mslOptions.fixed_subgroup_size = _device->_pMetalFeatures->subgroupSize; + shaderContext.options.mslOptions.fixed_subgroup_size = mvkIsAnyFlagEnabled(pSS->flags, VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT) ? 0 : _device->_pMetalFeatures->maxSubgroupSize; #if MVK_MACOS shaderContext.options.mslOptions.emulate_subgroups = !_device->_pMetalFeatures->simdPermute; #endif diff --git a/MoltenVK/MoltenVK/Layers/MVKExtensions.def b/MoltenVK/MoltenVK/Layers/MVKExtensions.def index 267e4f24..a35dbcdb 100644 --- a/MoltenVK/MoltenVK/Layers/MVKExtensions.def +++ b/MoltenVK/MoltenVK/Layers/MVKExtensions.def @@ -95,6 +95,7 @@ MVK_EXTENSION(EXT_robustness2, EXT_ROBUSTNESS_2, DEVICE) MVK_EXTENSION(EXT_scalar_block_layout, EXT_SCALAR_BLOCK_LAYOUT, DEVICE) MVK_EXTENSION(EXT_shader_stencil_export, EXT_SHADER_STENCIL_EXPORT, DEVICE) MVK_EXTENSION(EXT_shader_viewport_index_layer, EXT_SHADER_VIEWPORT_INDEX_LAYER, DEVICE) +MVK_EXTENSION(EXT_subgroup_size_control, EXT_SUBGROUP_SIZE_CONTROL, DEVICE) MVK_EXTENSION(EXT_swapchain_colorspace, EXT_SWAPCHAIN_COLOR_SPACE, INSTANCE) MVK_EXTENSION(EXT_texel_buffer_alignment, EXT_TEXEL_BUFFER_ALIGNMENT, DEVICE) MVK_EXTENSION(EXT_texture_compression_astc_hdr, EXT_TEXTURE_COMPRESSION_ASTC_HDR, DEVICE) diff --git a/MoltenVK/MoltenVK/Layers/MVKExtensions.mm b/MoltenVK/MoltenVK/Layers/MVKExtensions.mm index 0dc34002..0e0eba84 100644 --- a/MoltenVK/MoltenVK/Layers/MVKExtensions.mm +++ b/MoltenVK/MoltenVK/Layers/MVKExtensions.mm @@ -69,6 +69,7 @@ static bool mvkIsSupportedOnPlatform(VkExtensionProperties* pProperties) { MVK_EXTENSION_MIN_OS(EXT_MEMORY_BUDGET, 10.13, 11.0) MVK_EXTENSION_MIN_OS(EXT_POST_DEPTH_COVERAGE, 10.16, 11.0) MVK_EXTENSION_MIN_OS(EXT_SHADER_STENCIL_EXPORT, 10.14, 12.0) + MVK_EXTENSION_MIN_OS(EXT_SUBGROUP_SIZE_CONTROL, 10.14, 13.0) MVK_EXTENSION_MIN_OS(EXT_TEXEL_BUFFER_ALIGNMENT, 10.13, 11.0) MVK_EXTENSION_MIN_OS(EXT_TEXTURE_COMPRESSION_ASTC_HDR, 10.16, 13.0) MVK_EXTENSION_MIN_OS(AMD_SHADER_TRINARY_MINMAX, 10.14, 12.0)