Process multiple patches per workgroup in a tessellation control shader.

This should reduce underutilization of the GPU, especially on
GPUs where the thread execution width is greater than the number of
control points.

This also eliminates the extra invocations previously needed to read the
varyings from the vertex shader into the tessellation shader. The number
of threads per workgroup is now lcm(SIMD-size, output control points).
This should ensure we always process a whole number of patches per
workgroup, and further reduce underutilization of the GPU's SIMD units.
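
As a concrete illustration of the sizing rule, here is a minimal C++ sketch
(using std::lcm from <numeric> in place of MoltenVK's mvkLeastCommonMultiple;
the maxThreadgroupSize parameter stands in for the device limit):

    #include <cstdint>
    #include <numeric>  // std::lcm (C++17)

    // Smallest threadgroup size that holds a whole number of patches:
    // lcm(SIMD width, output control points). If it exceeds the device
    // limit, halve the SIMD width until it fits, as the code below does.
    static uint32_t tessCtlWorkgroupSize(uint32_t simdWidth,
                                         uint32_t outControlPoints,
                                         uint32_t maxThreadgroupSize) {
        uint32_t sgSize = simdWidth;
        uint32_t wgSize = std::lcm(sgSize, outControlPoints);
        while (wgSize > maxThreadgroupSize) {
            sgSize >>= 1;
            wgSize = std::lcm(sgSize, outControlPoints);
        }
        return wgSize;  // always a multiple of outControlPoints
    }

For example, with a SIMD width of 32 and 3 output control points, each
workgroup runs lcm(32, 3) = 96 threads and processes exactly 32 patches.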

To avoid the complexity of handling indices in the tessellation control
shader, I've also changed the way vertex shaders for tessellation are handled.
They are now compute kernels using Metal's support for vertex-style
stage input. This lets us always emit vertices into the buffer in order
of vertex shader execution. Now we no longer have to deal with indexing
in the tessellation control shader, nor do we always have to duplicate
the index buffer to insert gaps. This also fixes a long-standing issue
where if an index were greater than the number of vertices to draw, the
vertex shader would wind up writing outside the buffer, and the vertex
would be lost.
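
For reference, a compute kernel using vertex-style stage input has roughly
this shape (a hedged MSL sketch with hypothetical names; the real kernels are
generated by SPIRV-Cross from the vertex SPIR-V):

    struct VertexIn {
        float4 position [[attribute(0)]];  // fetched via MTLStageInputOutputDescriptor
    };
    struct VertexOut {
        float4 position;
    };
    // The grid is (vertexCount, instanceCount), matching the
    // setStageInRegion:/dispatchThreads: calls in the encoders below.
    // Writing at the grid position keeps the output buffer in vertex
    // shader execution order.
    kernel void tessVertexStage(VertexIn in           [[stage_in]],
                                device VertexOut* out [[buffer(0)]],
                                uint2 gridPos  [[thread_position_in_grid]],
                                uint2 gridSize [[threads_per_grid]]) {
        VertexOut v;
        v.position = in.position;
        out[gridPos.y * gridSize.x + gridPos.x] = v;
    }
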
Chip Davis 2020-07-26 19:22:42 -05:00
parent 4609416ef2
commit 3db2cbff6b
12 changed files with 625 additions and 447 deletions


@@ -344,6 +344,9 @@ MTLVertexFormat mvkMTLVertexFormatFromVkFormat(VkFormat vkFormat);
/** Returns the Metal MTLVertexStepFunction corresponding to the specified Vulkan VkVertexInputRate. */
MTLVertexStepFunction mvkMTLVertexStepFunctionFromVkVertexInputRate(VkVertexInputRate vkVtxStep);
/** Returns the Metal MTLStepFunction corresponding to the specified Vulkan VkVertexInputRate. */
MTLStepFunction mvkMTLStepFunctionFromVkVertexInputRate(VkVertexInputRate vkVtxStep, bool forTess = false);
/** Returns the Metal MTLPrimitiveType corresponding to the specified Vulkan VkPrimitiveTopology. */
MTLPrimitiveType mvkMTLPrimitiveTypeFromVkPrimitiveTopology(VkPrimitiveTopology vkTopology);


@@ -55,7 +55,7 @@ typedef unsigned long MTLLanguageVersion;
#define MVK_MAKE_VERSION(major, minor, patch) (((major) * 10000) + ((minor) * 100) + (patch))
#define MVK_VERSION MVK_MAKE_VERSION(MVK_VERSION_MAJOR, MVK_VERSION_MINOR, MVK_VERSION_PATCH)
#define VK_MVK_MOLTENVK_SPEC_VERSION 27
#define VK_MVK_MOLTENVK_SPEC_VERSION 28
#define VK_MVK_MOLTENVK_EXTENSION_NAME "VK_MVK_moltenvk"
/**
@@ -614,6 +614,7 @@ typedef struct {
uint32_t subgroupSize; /**< The number of threads in a SIMD-group. */
VkDeviceSize vertexStrideAlignment; /**< The alignment used for the stride of vertex attribute bindings. */
VkBool32 indirectTessellationDrawing; /**< If true, tessellation draw calls support parameters held in a GPU buffer. */
VkBool32 nonUniformThreadgroups; /**< If true, the device supports arbitrary-sized grids in compute workloads. */
} MVKPhysicalDeviceMetalFeatures;
/** MoltenVK performance of a particular type of activity. */
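
The new nonUniformThreadgroups flag drives a dispatch pattern that recurs
throughout the draw-call changes below: dispatch an exact-size grid where the
device supports it, otherwise round up to whole threadgroups. Schematically
(condensed from the code in this commit):

    if (mtlFeatures.nonUniformThreadgroups) {
        // Exact grid; the device handles the partial trailing threadgroup.
        [encoder dispatchThreads: MTLSizeMake(vertexCount, instanceCount, 1)
           threadsPerThreadgroup: MTLSizeMake(threadExecutionWidth, 1, 1)];
    } else {
        // Whole threadgroups only; round up and let the shader bounds-check.
        [encoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(vertexCount, threadExecutionWidth), instanceCount, 1)
                threadsPerThreadgroup: MTLSizeMake(threadExecutionWidth, 1, 1)];
    }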


@@ -112,21 +112,21 @@ void MVKCmdDraw::encode(MVKCommandEncoder* cmdEncoder) {
pipeline->getStages(stages);
const MVKMTLBufferAllocation* vtxOutBuff = nullptr;
const MVKMTLBufferAllocation* vtxParamsBuff = nullptr;
const MVKMTLBufferAllocation* tcOutBuff = nullptr;
const MVKMTLBufferAllocation* tcPatchOutBuff = nullptr;
const MVKMTLBufferAllocation* tcLevelBuff = nullptr;
uint32_t patchCount = 0;
uint32_t inControlPointCount = 0, outControlPointCount = 0;
struct {
uint32_t inControlPointCount = 0;
uint32_t patchCount = 0;
} tessParams;
uint32_t outControlPointCount = 0;
if (pipeline->isTessellationPipeline()) {
inControlPointCount = pipeline->getInputControlPointCount();
tessParams.inControlPointCount = pipeline->getInputControlPointCount();
outControlPointCount = pipeline->getOutputControlPointCount();
patchCount = mvkCeilingDivide(_vertexCount, inControlPointCount);
tessParams.patchCount = mvkCeilingDivide(_vertexCount, tessParams.inControlPointCount) * _instanceCount;
}
for (uint32_t s : stages) {
auto stage = MVKGraphicsStage(s);
if (stage == kMVKGraphicsStageVertex)
cmdEncoder->_depthStencilState.markDirty();
cmdEncoder->finalizeDrawState(stage); // Ensure all updated state has been submitted to Metal
if ( !pipeline->hasValidMTLPipelineStates() ) { return; } // Abort if this pipeline stage could not be compiled.
@@ -134,33 +134,25 @@ void MVKCmdDraw::encode(MVKCommandEncoder* cmdEncoder) {
id<MTLComputeCommandEncoder> mtlTessCtlEncoder = nil;
switch (stage) {
case kMVKGraphicsStageVertex:
case kMVKGraphicsStageVertex: {
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
if (pipeline->needsVertexOutputBuffer()) {
vtxOutBuff = cmdEncoder->getTempMTLBuffer(_vertexCount * _instanceCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxVertexOutputComponents);
[cmdEncoder->_mtlRenderEncoder setVertexBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageVertex]];
// The shader only needs the number of vertices, so that's all we'll give it.
// It'd be nice to be able to use setVertexBytes(), but since we can't guarantee
// more than 4 bytes alignment because of indirect draws, we're stuck doing this.
vtxParamsBuff = cmdEncoder->getTempMTLBuffer(4);
*(uint32_t*)vtxParamsBuff->getContents() = _vertexCount;
[cmdEncoder->_mtlRenderEncoder setVertexBuffer: vtxParamsBuff->_mtlBuffer
offset: vtxParamsBuff->_offset
atIndex: pipeline->getIndirectParamsIndex().stages[kMVKShaderStageVertex]];
}
if (cmdEncoder->_pDeviceMetalFeatures->baseVertexInstanceDrawing) {
[cmdEncoder->_mtlRenderEncoder drawPrimitives: MTLPrimitiveTypePoint
vertexStart: _firstVertex
vertexCount: _vertexCount
instanceCount: _instanceCount
baseInstance: _firstInstance];
} else {
[cmdEncoder->_mtlRenderEncoder drawPrimitives: MTLPrimitiveTypePoint
vertexStart: _firstVertex
vertexCount: _vertexCount
instanceCount: _instanceCount];
[mtlTessCtlEncoder setBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageVertex]];
}
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake2D(_firstVertex, _firstInstance, _vertexCount, _instanceCount)];
id<MTLComputePipelineState> vtxState = pipeline->getTessVertexStageState();
if (cmdEncoder->getDevice()->_pMetalFeatures->nonUniformThreadgroups) {
#if MVK_MACOS_OR_IOS
[mtlTessCtlEncoder dispatchThreads: MTLSizeMake(_vertexCount, _instanceCount, 1)
threadsPerThreadgroup: MTLSizeMake(vtxState.threadExecutionWidth, 1, 1)];
#endif
} else {
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(_vertexCount, vtxState.threadExecutionWidth), _instanceCount, 1)
threadsPerThreadgroup: MTLSizeMake(vtxState.threadExecutionWidth, 1, 1)];
}
// Mark pipeline, resources, and tess control push constants as dirty
// so I apply them during the next stage.
cmdEncoder->_graphicsPipelineState.beginMetalRenderPass();
@@ -168,56 +160,55 @@ void MVKCmdDraw::encode(MVKCommandEncoder* cmdEncoder) {
cmdEncoder->_depthStencilState.markDirty();
cmdEncoder->getPushConstants(VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)->beginMetalRenderPass();
break;
case kMVKGraphicsStageTessControl:
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl);
}
case kMVKGraphicsStageTessControl: {
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
if (pipeline->needsTessCtlOutputBuffer()) {
tcOutBuff = cmdEncoder->getTempMTLBuffer(outControlPointCount * patchCount * _instanceCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerVertexOutputComponents);
tcOutBuff = cmdEncoder->getTempMTLBuffer(outControlPointCount * tessParams.patchCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerVertexOutputComponents);
[mtlTessCtlEncoder setBuffer: tcOutBuff->_mtlBuffer
offset: tcOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageTessCtl]];
}
if (pipeline->needsTessCtlPatchOutputBuffer()) {
tcPatchOutBuff = cmdEncoder->getTempMTLBuffer(_instanceCount * patchCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerPatchOutputComponents);
tcPatchOutBuff = cmdEncoder->getTempMTLBuffer(tessParams.patchCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerPatchOutputComponents);
[mtlTessCtlEncoder setBuffer: tcPatchOutBuff->_mtlBuffer
offset: tcPatchOutBuff->_offset
atIndex: pipeline->getTessCtlPatchOutputBufferIndex()];
}
tcLevelBuff = cmdEncoder->getTempMTLBuffer(_instanceCount * patchCount * sizeof(MTLQuadTessellationFactorsHalf));
tcLevelBuff = cmdEncoder->getTempMTLBuffer(tessParams.patchCount * sizeof(MTLQuadTessellationFactorsHalf));
[mtlTessCtlEncoder setBuffer: tcLevelBuff->_mtlBuffer
offset: tcLevelBuff->_offset
atIndex: pipeline->getTessCtlLevelBufferIndex()];
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&inControlPointCount,
sizeof(inControlPointCount),
&tessParams,
sizeof(tessParams),
pipeline->getIndirectParamsIndex().stages[kMVKShaderStageTessCtl]);
if (pipeline->needsVertexOutputBuffer()) {
[mtlTessCtlEncoder setBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: kMVKTessCtlInputBufferIndex];
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake1D(0, _instanceCount * std::max(_vertexCount, outControlPointCount * patchCount))];
}
if (outControlPointCount > inControlPointCount) {
// In this case, we use an index buffer to avoid stepping over some of the input points.
const MVKMTLBufferAllocation* tcIndexBuff = cmdEncoder->getTempMTLBuffer(_instanceCount * patchCount * outControlPointCount * 4);
auto* indices = (uint32_t*)tcIndexBuff->getContents();
uint32_t index = 0;
for (uint32_t i = 0; i < outControlPointCount * patchCount; i++) {
if ((i % outControlPointCount) < inControlPointCount) {
indices[i] = index++;
} else {
indices[i] = 0;
}
}
[mtlTessCtlEncoder setBuffer: tcIndexBuff->_mtlBuffer
offset: tcIndexBuff->_offset
atIndex: kMVKTessCtlIndexBufferIndex];
}
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(_instanceCount * patchCount, 1, 1)
threadsPerThreadgroup: MTLSizeMake(std::max(inControlPointCount, outControlPointCount), 1, 1)];
NSUInteger sgSize = pipeline->getTessControlStageState().threadExecutionWidth;
NSUInteger wgSize = mvkLeastCommonMultiple(outControlPointCount, sgSize);
while (wgSize > cmdEncoder->getDevice()->_pProperties->limits.maxComputeWorkGroupSize[0]) {
sgSize >>= 1;
wgSize = mvkLeastCommonMultiple(outControlPointCount, sgSize);
}
if (cmdEncoder->getDevice()->_pMetalFeatures->nonUniformThreadgroups) {
#if MVK_MACOS_OR_IOS
[mtlTessCtlEncoder dispatchThreads: MTLSizeMake(tessParams.patchCount * outControlPointCount, 1, 1)
threadsPerThreadgroup: MTLSizeMake(wgSize, 1, 1)];
#endif
} else {
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(tessParams.patchCount * outControlPointCount, wgSize), 1, 1)
threadsPerThreadgroup: MTLSizeMake(wgSize, 1, 1)];
}
// Running this stage prematurely ended the render pass, so we have to start it up again.
// TODO: On iOS, maybe we could use a tile shader to avoid this.
cmdEncoder->beginMetalRenderPass(_loadOverride, _storeOverride);
break;
}
case kMVKGraphicsStageRasterization:
if (pipeline->isTessellationPipeline()) {
if (pipeline->needsTessCtlOutputBuffer()) {
@@ -238,7 +229,7 @@ void MVKCmdDraw::encode(MVKCommandEncoder* cmdEncoder) {
instanceStride: 0];
[cmdEncoder->_mtlRenderEncoder drawPatches: outControlPointCount
patchStart: 0
patchCount: _instanceCount * patchCount
patchCount: tessParams.patchCount
patchIndexBuffer: nil
patchIndexBufferOffset: 0
instanceCount: 1
@@ -310,92 +301,49 @@ void MVKCmdDrawIndexed::encode(MVKCommandEncoder* cmdEncoder) {
VkDeviceSize idxBuffOffset = ibb.offset + (_firstIndex * idxSize);
const MVKMTLBufferAllocation* vtxOutBuff = nullptr;
const MVKMTLBufferAllocation* vtxParamsBuff = nullptr;
const MVKMTLBufferAllocation* tcOutBuff = nullptr;
const MVKMTLBufferAllocation* tcPatchOutBuff = nullptr;
const MVKMTLBufferAllocation* tcLevelBuff = nullptr;
const MVKMTLBufferAllocation* tcIndexBuff = nullptr;
uint32_t patchCount = 0;
uint32_t inControlPointCount = 0, outControlPointCount = 0;
struct {
uint32_t inControlPointCount = 0;
uint32_t patchCount = 0;
} tessParams;
uint32_t outControlPointCount = 0;
if (pipeline->isTessellationPipeline()) {
inControlPointCount = pipeline->getInputControlPointCount();
tessParams.inControlPointCount = pipeline->getInputControlPointCount();
outControlPointCount = pipeline->getOutputControlPointCount();
patchCount = mvkCeilingDivide(_indexCount, inControlPointCount);
tessParams.patchCount = mvkCeilingDivide(_indexCount, tessParams.inControlPointCount) * _instanceCount;
}
for (uint32_t s : stages) {
auto stage = MVKGraphicsStage(s);
id<MTLComputeCommandEncoder> mtlTessCtlEncoder = nil;
if (stage == kMVKGraphicsStageTessControl && (outControlPointCount > inControlPointCount || _instanceCount > 1)) {
// We need make a copy of the old index buffer so we can insert gaps where
// there are more output points than input points, and also to add more indices
// to handle instancing. Do it now, before finalizing draw state, or the
// pipeline will get overridden.
// Yeah, this sucks. But there aren't many good ways for dealing with this issue.
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl);
tcIndexBuff = cmdEncoder->getTempMTLBuffer(_instanceCount * patchCount * outControlPointCount * idxSize);
id<MTLComputePipelineState> mtlCopyIndexState = cmdEncoder->getCommandEncodingPool()->getCmdDrawIndexedCopyIndexBufferMTLComputePipelineState((MTLIndexType)ibb.mtlIndexType);
[mtlTessCtlEncoder setComputePipelineState: mtlCopyIndexState];
[mtlTessCtlEncoder setBuffer: ibb.mtlBuffer
offset: ibb.offset
atIndex: 0];
[mtlTessCtlEncoder setBuffer: tcIndexBuff->_mtlBuffer
offset: tcIndexBuff->_offset
atIndex: 1];
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&inControlPointCount,
sizeof(inControlPointCount),
2);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&outControlPointCount,
sizeof(outControlPointCount),
3);
const MVKMTLBufferAllocation* indexParamsBuff = cmdEncoder->getTempMTLBuffer(sizeof(MTLDrawIndexedPrimitivesIndirectArguments));
auto* params = (MTLDrawIndexedPrimitivesIndirectArguments*)indexParamsBuff->getContents();
params->indexCount = _indexCount;
params->instanceCount = _instanceCount;
params->indexStart = _firstIndex;
[mtlTessCtlEncoder setBuffer: indexParamsBuff->_mtlBuffer
offset: indexParamsBuff->_offset
atIndex: 4];
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(1, 1, 1) threadsPerThreadgroup: MTLSizeMake(1, 1, 1)];
}
if (stage == kMVKGraphicsStageVertex)
cmdEncoder->_depthStencilState.markDirty();
cmdEncoder->finalizeDrawState(stage); // Ensure all updated state has been submitted to Metal
if ( !pipeline->hasValidMTLPipelineStates() ) { return; } // Abort if this pipeline stage could not be compiled.
switch (stage) {
case kMVKGraphicsStageVertex:
case kMVKGraphicsStageVertex: {
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
if (pipeline->needsVertexOutputBuffer()) {
vtxOutBuff = cmdEncoder->getTempMTLBuffer(_indexCount * _instanceCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxVertexOutputComponents);
[cmdEncoder->_mtlRenderEncoder setVertexBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageVertex]];
// The shader only needs the number of vertices, so that's all we'll give it.
vtxParamsBuff = cmdEncoder->getTempMTLBuffer(4);
*(uint32_t*)vtxParamsBuff->getContents() = _indexCount;
[cmdEncoder->_mtlRenderEncoder setVertexBuffer: vtxParamsBuff->_mtlBuffer
offset: vtxParamsBuff->_offset
atIndex: pipeline->getIndirectParamsIndex().stages[kMVKShaderStageVertex]];
}
if (cmdEncoder->_pDeviceMetalFeatures->baseVertexInstanceDrawing) {
[cmdEncoder->_mtlRenderEncoder drawIndexedPrimitives: MTLPrimitiveTypePoint
indexCount: _indexCount
indexType: (MTLIndexType)ibb.mtlIndexType
indexBuffer: ibb.mtlBuffer
indexBufferOffset: idxBuffOffset
instanceCount: _instanceCount
baseVertex: _vertexOffset
baseInstance: _firstInstance];
} else {
[cmdEncoder->_mtlRenderEncoder drawIndexedPrimitives: MTLPrimitiveTypePoint
indexCount: _indexCount
indexType: (MTLIndexType)ibb.mtlIndexType
indexBuffer: ibb.mtlBuffer
indexBufferOffset: idxBuffOffset
instanceCount: _instanceCount];
[mtlTessCtlEncoder setBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageVertex]];
}
[mtlTessCtlEncoder setBuffer: ibb.mtlBuffer
offset: idxBuffOffset
atIndex: pipeline->getIndirectParamsIndex().stages[kMVKShaderStageVertex]];
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake2D(_vertexOffset, _firstInstance, _indexCount, _instanceCount)];
id<MTLComputePipelineState> vtxState = ibb.mtlIndexType == MTLIndexTypeUInt16 ? pipeline->getTessVertexStageIndex16State() : pipeline->getTessVertexStageIndex32State();
if (cmdEncoder->getDevice()->_pMetalFeatures->nonUniformThreadgroups) {
#if MVK_MACOS_OR_IOS
[mtlTessCtlEncoder dispatchThreads: MTLSizeMake(_indexCount, _instanceCount, 1)
threadsPerThreadgroup: MTLSizeMake(vtxState.threadExecutionWidth, 1, 1)];
#endif
} else {
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(_indexCount, vtxState.threadExecutionWidth), _instanceCount, 1)
threadsPerThreadgroup: MTLSizeMake(vtxState.threadExecutionWidth, 1, 1)];
}
// Mark pipeline, resources, and tess control push constants as dirty
// so I apply them during the next stage.
cmdEncoder->_graphicsPipelineState.beginMetalRenderPass();
@@ -403,49 +351,56 @@ void MVKCmdDrawIndexed::encode(MVKCommandEncoder* cmdEncoder) {
cmdEncoder->_depthStencilState.markDirty();
cmdEncoder->getPushConstants(VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)->beginMetalRenderPass();
break;
case kMVKGraphicsStageTessControl:
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl);
}
case kMVKGraphicsStageTessControl: {
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
if (pipeline->needsTessCtlOutputBuffer()) {
tcOutBuff = cmdEncoder->getTempMTLBuffer(outControlPointCount * patchCount * _instanceCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerVertexOutputComponents);
tcOutBuff = cmdEncoder->getTempMTLBuffer(outControlPointCount * tessParams.patchCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerVertexOutputComponents);
[mtlTessCtlEncoder setBuffer: tcOutBuff->_mtlBuffer
offset: tcOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageTessCtl]];
}
if (pipeline->needsTessCtlPatchOutputBuffer()) {
tcPatchOutBuff = cmdEncoder->getTempMTLBuffer(_instanceCount * patchCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerPatchOutputComponents);
tcPatchOutBuff = cmdEncoder->getTempMTLBuffer(tessParams.patchCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerPatchOutputComponents);
[mtlTessCtlEncoder setBuffer: tcPatchOutBuff->_mtlBuffer
offset: tcPatchOutBuff->_offset
atIndex: pipeline->getTessCtlPatchOutputBufferIndex()];
}
tcLevelBuff = cmdEncoder->getTempMTLBuffer(_instanceCount * patchCount * sizeof(MTLQuadTessellationFactorsHalf));
tcLevelBuff = cmdEncoder->getTempMTLBuffer(tessParams.patchCount * sizeof(MTLQuadTessellationFactorsHalf));
[mtlTessCtlEncoder setBuffer: tcLevelBuff->_mtlBuffer
offset: tcLevelBuff->_offset
atIndex: pipeline->getTessCtlLevelBufferIndex()];
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&inControlPointCount,
sizeof(inControlPointCount),
&tessParams,
sizeof(tessParams),
pipeline->getIndirectParamsIndex().stages[kMVKShaderStageTessCtl]);
if (pipeline->needsVertexOutputBuffer()) {
[mtlTessCtlEncoder setBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: kMVKTessCtlInputBufferIndex];
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake1D(0, _instanceCount * std::max(_indexCount, outControlPointCount * patchCount))];
}
if (outControlPointCount > inControlPointCount || _instanceCount > 1) {
[mtlTessCtlEncoder setBuffer: tcIndexBuff->_mtlBuffer
offset: tcIndexBuff->_offset
atIndex: kMVKTessCtlIndexBufferIndex];
} else {
[mtlTessCtlEncoder setBuffer: ibb.mtlBuffer
offset: idxBuffOffset
atIndex: kMVKTessCtlIndexBufferIndex];
}
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(_instanceCount * patchCount, 1, 1)
threadsPerThreadgroup: MTLSizeMake(std::max(inControlPointCount, outControlPointCount), 1, 1)];
// The vertex shader produced output in the correct order, so there's no need to use
// an index buffer here.
NSUInteger sgSize = pipeline->getTessControlStageState().threadExecutionWidth;
NSUInteger wgSize = mvkLeastCommonMultiple(outControlPointCount, sgSize);
while (wgSize > cmdEncoder->getDevice()->_pProperties->limits.maxComputeWorkGroupSize[0]) {
sgSize >>= 1;
wgSize = mvkLeastCommonMultiple(outControlPointCount, sgSize);
}
if (cmdEncoder->getDevice()->_pMetalFeatures->nonUniformThreadgroups) {
#if MVK_MACOS_OR_IOS
[mtlTessCtlEncoder dispatchThreads: MTLSizeMake(tessParams.patchCount * outControlPointCount, 1, 1)
threadsPerThreadgroup: MTLSizeMake(wgSize, 1, 1)];
#endif
} else {
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide(tessParams.patchCount * outControlPointCount, wgSize), 1, 1)
threadsPerThreadgroup: MTLSizeMake(wgSize, 1, 1)];
}
// Running this stage prematurely ended the render pass, so we have to start it up again.
// TODO: On iOS, maybe we could use a tile shader to avoid this.
cmdEncoder->beginMetalRenderPass(_loadOverride, _storeOverride);
break;
}
case kMVKGraphicsStageRasterization:
if (pipeline->isTessellationPipeline()) {
if (pipeline->needsTessCtlOutputBuffer()) {
@@ -468,7 +423,7 @@ void MVKCmdDrawIndexed::encode(MVKCommandEncoder* cmdEncoder) {
// an index buffer here.
[cmdEncoder->_mtlRenderEncoder drawPatches: outControlPointCount
patchStart: 0
patchCount: _instanceCount * patchCount
patchCount: tessParams.patchCount
patchIndexBuffer: nil
patchIndexBufferOffset: 0
instanceCount: 1
@@ -546,13 +501,15 @@ void MVKCmdDrawIndirect::encode(MVKCommandEncoder* cmdEncoder) {
// While we're at it, we can create the temporary output buffers once and reuse them
// for each draw.
const MVKMTLBufferAllocation* tcIndirectBuff = nullptr;
const MVKMTLBufferAllocation* tcParamsBuff = nullptr;
const MVKMTLBufferAllocation* vtxOutBuff = nullptr;
const MVKMTLBufferAllocation* tcOutBuff = nullptr;
const MVKMTLBufferAllocation* tcPatchOutBuff = nullptr;
const MVKMTLBufferAllocation* tcLevelBuff = nullptr;
const MVKMTLBufferAllocation* tcIndexBuff = nullptr;
uint32_t patchCount = 0, vertexCount = 0;
uint32_t inControlPointCount = 0, outControlPointCount = 0;
VkDeviceSize paramsIncr = 0;
if (pipeline->isTessellationPipeline()) {
// We can't read the indirect buffer CPU-side, since it may change between
// encoding and execution. So we don't know how big to make the buffers.
@@ -562,11 +519,14 @@ void MVKCmdDrawIndirect::encode(MVKCommandEncoder* cmdEncoder) {
outControlPointCount = pipeline->getOutputControlPointCount();
vertexCount = kMVKDrawIndirectVertexCountUpperBound;
patchCount = mvkCeilingDivide(vertexCount, inControlPointCount);
VkDeviceSize indirectSize = (sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments)) * _drawCount;
VkDeviceSize indirectSize = (2 * sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments)) * _drawCount;
if (cmdEncoder->_pDeviceMetalFeatures->mslVersion >= 20100) {
indirectSize += sizeof(MTLStageInRegionIndirectArguments) * _drawCount;
}
paramsIncr = std::max((size_t)cmdEncoder->getDevice()->_pProperties->limits.minUniformBufferOffsetAlignment, sizeof(uint32_t) * 2);
VkDeviceSize paramsSize = paramsIncr * _drawCount;
tcIndirectBuff = cmdEncoder->getTempMTLBuffer(indirectSize);
tcParamsBuff = cmdEncoder->getTempMTLBuffer(paramsSize);
if (pipeline->needsVertexOutputBuffer()) {
vtxOutBuff = cmdEncoder->getTempMTLBuffer(vertexCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxVertexOutputComponents);
}
@@ -577,19 +537,6 @@ void MVKCmdDrawIndirect::encode(MVKCommandEncoder* cmdEncoder) {
tcPatchOutBuff = cmdEncoder->getTempMTLBuffer(patchCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerPatchOutputComponents);
}
tcLevelBuff = cmdEncoder->getTempMTLBuffer(patchCount * sizeof(MTLQuadTessellationFactorsHalf));
if (outControlPointCount > inControlPointCount) {
// In this case, we use an index buffer to avoid stepping over some of the input points.
tcIndexBuff = cmdEncoder->getTempMTLBuffer(patchCount * outControlPointCount * 4);
auto* indices = (uint32_t*)tcIndexBuff->getContents();
uint32_t index = 0;
for (uint32_t i = 0; i < tcIndexBuff->_length / 4; i++) {
if ((i % outControlPointCount) < inControlPointCount) {
indices[i] = index++;
} else {
indices[i] = 0;
}
}
}
}
MVKPiplineStages stages;
@@ -597,16 +544,24 @@ void MVKCmdDrawIndirect::encode(MVKCommandEncoder* cmdEncoder) {
VkDeviceSize mtlIndBuffOfst = _mtlIndirectBufferOffset;
VkDeviceSize mtlTCIndBuffOfst = tcIndirectBuff ? tcIndirectBuff->_offset : 0;
VkDeviceSize mtlParmBuffOfst = tcParamsBuff ? tcParamsBuff->_offset : 0;
NSUInteger vtxThreadExecWidth = pipeline->getTessVertexStageState().threadExecutionWidth;
NSUInteger sgSize = pipeline->getTessControlStageState().threadExecutionWidth;
NSUInteger tcWorkgroupSize = mvkLeastCommonMultiple(outControlPointCount, sgSize);
while (tcWorkgroupSize > cmdEncoder->getDevice()->_pProperties->limits.maxComputeWorkGroupSize[0]) {
sgSize >>= 1;
tcWorkgroupSize = mvkLeastCommonMultiple(outControlPointCount, sgSize);
}
for (uint32_t drawIdx = 0; drawIdx < _drawCount; drawIdx++) {
for (uint32_t s : stages) {
auto stage = MVKGraphicsStage(s);
id<MTLComputeCommandEncoder> mtlTessCtlEncoder = nil;
if (drawIdx == 0 && stage == kMVKGraphicsStageTessControl) {
if (drawIdx == 0 && stage == kMVKGraphicsStageVertex) {
// We need the indirect buffers now. This must be done before finalizing
// draw state, or the pipeline will get overridden. This is a good time
// to do it, since it will require switching to compute anyway. Do it all
// at once to get it over with.
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl);
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
id<MTLComputePipelineState> mtlConvertState = cmdEncoder->getCommandEncodingPool()->getCmdDrawIndirectConvertBuffersMTLComputePipelineState(false);
[mtlTessCtlEncoder setComputePipelineState: mtlConvertState];
[mtlTessCtlEncoder setBuffer: _mtlIndirectBuffer
@@ -615,46 +570,67 @@ void MVKCmdDrawIndirect::encode(MVKCommandEncoder* cmdEncoder) {
[mtlTessCtlEncoder setBuffer: tcIndirectBuff->_mtlBuffer
offset: tcIndirectBuff->_offset
atIndex: 1];
[mtlTessCtlEncoder setBuffer: tcParamsBuff->_mtlBuffer
offset: tcParamsBuff->_offset
atIndex: 2];
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&_mtlIndirectBufferStride,
sizeof(_mtlIndirectBufferStride),
2);
3);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&inControlPointCount,
sizeof(inControlPointCount),
3);
4);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&outControlPointCount,
sizeof(inControlPointCount),
4);
sizeof(outControlPointCount),
5);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&_drawCount,
sizeof(_drawCount),
5);
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide<NSUInteger>(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
6);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&vtxThreadExecWidth,
sizeof(vtxThreadExecWidth),
7);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&tcWorkgroupSize,
sizeof(tcWorkgroupSize),
8);
if (cmdEncoder->getDevice()->_pMetalFeatures->nonUniformThreadgroups) {
#if MVK_MACOS_OR_IOS
[mtlTessCtlEncoder dispatchThreads: MTLSizeMake(_drawCount, 1, 1)
threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
#endif
} else {
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide<NSUInteger>(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
}
}
if (stage == kMVKGraphicsStageVertex)
cmdEncoder->_depthStencilState.markDirty();
cmdEncoder->finalizeDrawState(stage); // Ensure all updated state has been submitted to Metal
if ( !pipeline->hasValidMTLPipelineStates() ) { return; } // Abort if this pipeline stage could not be compiled.
switch (stage) {
case kMVKGraphicsStageVertex:
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
if (pipeline->needsVertexOutputBuffer()) {
[cmdEncoder->_mtlRenderEncoder setVertexBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageVertex]];
[cmdEncoder->_mtlRenderEncoder setVertexBuffer: _mtlIndirectBuffer
offset: mtlIndBuffOfst
atIndex: pipeline->getIndirectParamsIndex().stages[kMVKShaderStageVertex]];
[mtlTessCtlEncoder setBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageVertex]];
}
[cmdEncoder->_mtlRenderEncoder drawPrimitives: MTLPrimitiveTypePoint
indirectBuffer: _mtlIndirectBuffer
indirectBufferOffset: mtlIndBuffOfst];
mtlIndBuffOfst += _mtlIndirectBufferStride;
// We must assume we can read up to the maximum number of vertices.
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake2D(0, 0, vertexCount, vertexCount)];
if ([mtlTessCtlEncoder respondsToSelector: @selector(setStageInRegionWithIndirectBuffer:indirectBufferOffset:)]) {
[mtlTessCtlEncoder setStageInRegionWithIndirectBuffer: tcIndirectBuff->_mtlBuffer
indirectBufferOffset: mtlTCIndBuffOfst];
mtlTCIndBuffOfst += sizeof(MTLStageInRegionIndirectArguments);
}
[mtlTessCtlEncoder dispatchThreadgroupsWithIndirectBuffer: tcIndirectBuff->_mtlBuffer
indirectBufferOffset: mtlTCIndBuffOfst
threadsPerThreadgroup: MTLSizeMake(vtxThreadExecWidth, 1, 1)];
mtlTCIndBuffOfst += sizeof(MTLDispatchThreadgroupsIndirectArguments);
// Mark pipeline, resources, and tess control push constants as dirty
// so I apply them during the next stage.
cmdEncoder->_graphicsPipelineState.beginMetalRenderPass();
@@ -663,7 +639,7 @@ void MVKCmdDrawIndirect::encode(MVKCommandEncoder* cmdEncoder) {
cmdEncoder->getPushConstants(VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)->beginMetalRenderPass();
break;
case kMVKGraphicsStageTessControl:
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl);
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
if (pipeline->needsTessCtlOutputBuffer()) {
[mtlTessCtlEncoder setBuffer: tcOutBuff->_mtlBuffer
offset: tcOutBuff->_offset
@@ -677,33 +653,18 @@ void MVKCmdDrawIndirect::encode(MVKCommandEncoder* cmdEncoder) {
[mtlTessCtlEncoder setBuffer: tcLevelBuff->_mtlBuffer
offset: tcLevelBuff->_offset
atIndex: pipeline->getTessCtlLevelBufferIndex()];
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&inControlPointCount,
sizeof(inControlPointCount),
pipeline->getIndirectParamsIndex().stages[kMVKShaderStageTessCtl]);
[mtlTessCtlEncoder setBuffer: tcParamsBuff->_mtlBuffer
offset: mtlParmBuffOfst
atIndex: pipeline->getIndirectParamsIndex().stages[kMVKShaderStageTessCtl]];
mtlParmBuffOfst += paramsIncr;
if (pipeline->needsVertexOutputBuffer()) {
[mtlTessCtlEncoder setBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: kMVKTessCtlInputBufferIndex];
if ([mtlTessCtlEncoder respondsToSelector: @selector(setStageInRegionWithIndirectBuffer:indirectBufferOffset:)]) {
// setStageInRegionWithIndirectBuffer appears to be broken. We have a 1D linear region anyway, so size is irrelevant
//[mtlTessCtlEncoder setStageInRegionWithIndirectBuffer: tcIndirectBuff->_mtlBuffer
// indirectBufferOffset: mtlTCIndBuffOfst];
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake1D(0, std::max(inControlPointCount, outControlPointCount) * patchCount)];
mtlTCIndBuffOfst += sizeof(MTLStageInRegionIndirectArguments);
} else {
// We must assume we can read up to the maximum number of vertices.
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake1D(0, std::max(inControlPointCount, outControlPointCount) * patchCount)];
}
}
if (outControlPointCount > inControlPointCount) {
[mtlTessCtlEncoder setBuffer: tcIndexBuff->_mtlBuffer
offset: tcIndexBuff->_offset
atIndex: kMVKTessCtlIndexBufferIndex];
}
[mtlTessCtlEncoder dispatchThreadgroupsWithIndirectBuffer: tcIndirectBuff->_mtlBuffer
indirectBufferOffset: mtlTCIndBuffOfst
threadsPerThreadgroup: MTLSizeMake(std::max(inControlPointCount, outControlPointCount), 1, 1)];
threadsPerThreadgroup: MTLSizeMake(tcWorkgroupSize, 1, 1)];
mtlTCIndBuffOfst += sizeof(MTLDispatchThreadgroupsIndirectArguments);
// Running this stage prematurely ended the render pass, so we have to start it up again.
// TODO: On iOS, maybe we could use a tile shader to avoid this.
@@ -742,7 +703,7 @@ void MVKCmdDrawIndirect::encode(MVKCommandEncoder* cmdEncoder) {
// so I apply them during the next stage.
cmdEncoder->_graphicsPipelineState.beginMetalRenderPass();
cmdEncoder->_graphicsResourcesState.beginMetalRenderPass();
cmdEncoder->getPushConstants(VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)->beginMetalRenderPass();
cmdEncoder->getPushConstants(VK_SHADER_STAGE_VERTEX_BIT)->beginMetalRenderPass();
} else {
[cmdEncoder->_mtlRenderEncoder drawPrimitives: cmdEncoder->_mtlPrimitiveType
indirectBuffer: _mtlIndirectBuffer
@@ -788,20 +749,22 @@ VkResult MVKCmdDrawIndexedIndirect::setContent(MVKCommandBuffer* cmdBuff,
void MVKCmdDrawIndexedIndirect::encode(MVKCommandEncoder* cmdEncoder) {
MVKIndexMTLBufferBinding& ibb = cmdEncoder->_graphicsResourcesState._mtlIndexBufferBinding;
size_t idxSize = mvkMTLIndexTypeSizeInBytes((MTLIndexType)ibb.mtlIndexType);
auto* pipeline = (MVKGraphicsPipeline*)cmdEncoder->_graphicsPipelineState.getPipeline();
// The indirect calls for dispatchThreadgroups:... and drawPatches:... have different formats.
// We have to convert from the drawIndexedPrimitives:... format to them.
// While we're at it, we can create the temporary output buffers once and reuse them
// for each draw.
const MVKMTLBufferAllocation* tcIndirectBuff = nullptr;
const MVKMTLBufferAllocation* tcParamsBuff = nullptr;
const MVKMTLBufferAllocation* vtxOutBuff = nullptr;
const MVKMTLBufferAllocation* tcOutBuff = nullptr;
const MVKMTLBufferAllocation* tcPatchOutBuff = nullptr;
const MVKMTLBufferAllocation* tcLevelBuff = nullptr;
const MVKMTLBufferAllocation* tcIndexBuff = nullptr;
const MVKMTLBufferAllocation* vtxIndexBuff = nullptr;
uint32_t patchCount = 0, vertexCount = 0;
uint32_t inControlPointCount = 0, outControlPointCount = 0;
VkDeviceSize paramsIncr = 0;
if (pipeline->isTessellationPipeline()) {
// We can't read the indirect buffer CPU-side, since it may change between
// encoding and execution. So we don't know how big to make the buffers.
@@ -815,7 +778,10 @@ void MVKCmdDrawIndexedIndirect::encode(MVKCommandEncoder* cmdEncoder) {
if (cmdEncoder->_pDeviceMetalFeatures->mslVersion >= 20100) {
indirectSize += sizeof(MTLStageInRegionIndirectArguments) * _drawCount;
}
paramsIncr = std::max((size_t)cmdEncoder->getDevice()->_pProperties->limits.minUniformBufferOffsetAlignment, sizeof(uint32_t) * 2);
VkDeviceSize paramsSize = paramsIncr * _drawCount;
tcIndirectBuff = cmdEncoder->getTempMTLBuffer(indirectSize);
tcParamsBuff = cmdEncoder->getTempMTLBuffer(paramsSize);
if (pipeline->needsVertexOutputBuffer()) {
vtxOutBuff = cmdEncoder->getTempMTLBuffer(vertexCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxVertexOutputComponents);
}
@@ -826,7 +792,7 @@ void MVKCmdDrawIndexedIndirect::encode(MVKCommandEncoder* cmdEncoder) {
tcPatchOutBuff = cmdEncoder->getTempMTLBuffer(patchCount * 4 * cmdEncoder->_pDeviceProperties->limits.maxTessellationControlPerPatchOutputComponents);
}
tcLevelBuff = cmdEncoder->getTempMTLBuffer(patchCount * sizeof(MTLQuadTessellationFactorsHalf));
tcIndexBuff = cmdEncoder->getTempMTLBuffer(patchCount * outControlPointCount * idxSize);
vtxIndexBuff = cmdEncoder->getTempMTLBuffer(ibb.mtlBuffer.length);
}
MVKPiplineStages stages;
@@ -834,12 +800,21 @@ void MVKCmdDrawIndexedIndirect::encode(MVKCommandEncoder* cmdEncoder) {
VkDeviceSize mtlIndBuffOfst = _mtlIndirectBufferOffset;
VkDeviceSize mtlTCIndBuffOfst = tcIndirectBuff ? tcIndirectBuff->_offset : 0;
VkDeviceSize mtlParmBuffOfst = tcParamsBuff ? tcParamsBuff->_offset : 0;
id<MTLComputePipelineState> vtxState = ibb.mtlIndexType == MTLIndexTypeUInt16 ? pipeline->getTessVertexStageIndex16State() : pipeline->getTessVertexStageIndex32State();
NSUInteger vtxThreadExecWidth = vtxState.threadExecutionWidth;
NSUInteger sgSize = pipeline->getTessControlStageState().threadExecutionWidth;
NSUInteger tcWorkgroupSize = mvkLeastCommonMultiple(outControlPointCount, sgSize);
while (tcWorkgroupSize > cmdEncoder->getDevice()->_pProperties->limits.maxComputeWorkGroupSize[0]) {
sgSize >>= 1;
tcWorkgroupSize = mvkLeastCommonMultiple(outControlPointCount, sgSize);
}
for (uint32_t drawIdx = 0; drawIdx < _drawCount; drawIdx++) {
for (uint32_t s : stages) {
auto stage = MVKGraphicsStage(s);
id<MTLComputeCommandEncoder> mtlTessCtlEncoder = nil;
if (stage == kMVKGraphicsStageTessControl) {
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl);
if (stage == kMVKGraphicsStageVertex) {
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
// We need the indirect buffers now. This must be done before finalizing
// draw state, or the pipeline will get overridden. This is a good time
// to do it, since it will require switching to compute anyway. Do it all
@@ -853,27 +828,37 @@ void MVKCmdDrawIndexedIndirect::encode(MVKCommandEncoder* cmdEncoder) {
[mtlTessCtlEncoder setBuffer: tcIndirectBuff->_mtlBuffer
offset: tcIndirectBuff->_offset
atIndex: 1];
[mtlTessCtlEncoder setBuffer: tcParamsBuff->_mtlBuffer
offset: tcParamsBuff->_offset
atIndex: 2];
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&_mtlIndirectBufferStride,
sizeof(_mtlIndirectBufferStride),
2);
3);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&inControlPointCount,
sizeof(inControlPointCount),
3);
4);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&outControlPointCount,
sizeof(inControlPointCount),
4);
5);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&_drawCount,
sizeof(_drawCount),
5);
6);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&vtxThreadExecWidth,
sizeof(vtxThreadExecWidth),
7);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&tcWorkgroupSize,
sizeof(tcWorkgroupSize),
8);
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(mvkCeilingDivide<NSUInteger>(_drawCount, mtlConvertState.threadExecutionWidth), 1, 1)
threadsPerThreadgroup: MTLSizeMake(mtlConvertState.threadExecutionWidth, 1, 1)];
}
// We actually need to make a copy of the index buffer, regardless of whether
// or not there are gaps in it, because there's no way to tell Metal to
// We actually need to make a copy of the index buffer, because there's no way to tell Metal to
// offset an index buffer from a value in an indirect buffer. This also
// means that, to make a copy, we have to use a compute shader.
id<MTLComputePipelineState> mtlCopyIndexState = cmdEncoder->getCommandEncodingPool()->getCmdDrawIndexedCopyIndexBufferMTLComputePipelineState((MTLIndexType)ibb.mtlIndexType);
@@ -881,46 +866,43 @@ void MVKCmdDrawIndexedIndirect::encode(MVKCommandEncoder* cmdEncoder) {
[mtlTessCtlEncoder setBuffer: ibb.mtlBuffer
offset: ibb.offset
atIndex: 0];
[mtlTessCtlEncoder setBuffer: tcIndexBuff->_mtlBuffer
offset: tcIndexBuff->_offset
[mtlTessCtlEncoder setBuffer: vtxIndexBuff->_mtlBuffer
offset: vtxIndexBuff->_offset
atIndex: 1];
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&inControlPointCount,
sizeof(inControlPointCount),
2);
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&outControlPointCount,
sizeof(outControlPointCount),
3);
[mtlTessCtlEncoder setBuffer: tcIndirectBuff->_mtlBuffer
offset: mtlTCIndBuffOfst
atIndex: 4];
[mtlTessCtlEncoder dispatchThreadgroups: MTLSizeMake(1, 1, 1) threadsPerThreadgroup: MTLSizeMake(1, 1, 1)];
[mtlTessCtlEncoder setBuffer: _mtlIndirectBuffer
offset: mtlIndBuffOfst
atIndex: 2];
[mtlTessCtlEncoder dispatchThreadgroupsWithIndirectBuffer: tcIndirectBuff->_mtlBuffer
indirectBufferOffset: mtlTCIndBuffOfst
threadsPerThreadgroup: MTLSizeMake(vtxThreadExecWidth, 1, 1)];
mtlIndBuffOfst += sizeof(MTLDrawIndexedPrimitivesIndirectArguments);
}
if (stage == kMVKGraphicsStageVertex)
cmdEncoder->_depthStencilState.markDirty();
cmdEncoder->finalizeDrawState(stage); // Ensure all updated state has been submitted to Metal
if ( !pipeline->hasValidMTLPipelineStates() ) { return; } // Abort if this pipeline stage could not be compiled.
switch (stage) {
case kMVKGraphicsStageVertex:
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
if (pipeline->needsVertexOutputBuffer()) {
[cmdEncoder->_mtlRenderEncoder setVertexBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageVertex]];
[cmdEncoder->_mtlRenderEncoder setVertexBuffer: _mtlIndirectBuffer
offset: mtlIndBuffOfst
atIndex: pipeline->getIndirectParamsIndex().stages[kMVKShaderStageVertex]];
[mtlTessCtlEncoder setBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: pipeline->getOutputBufferIndex().stages[kMVKShaderStageVertex]];
}
[cmdEncoder->_mtlRenderEncoder drawIndexedPrimitives: MTLPrimitiveTypePoint
indexType: (MTLIndexType)ibb.mtlIndexType
indexBuffer: ibb.mtlBuffer
indexBufferOffset: ibb.offset
indirectBuffer: _mtlIndirectBuffer
indirectBufferOffset: mtlIndBuffOfst];
mtlIndBuffOfst += _mtlIndirectBufferStride;
[mtlTessCtlEncoder setBuffer: vtxIndexBuff->_mtlBuffer
offset: vtxIndexBuff->_offset
atIndex: pipeline->getIndirectParamsIndex().stages[kMVKShaderStageVertex]];
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake2D(0, 0, vertexCount, vertexCount)];
if ([mtlTessCtlEncoder respondsToSelector: @selector(setStageInRegionWithIndirectBuffer:indirectBufferOffset:)]) {
[mtlTessCtlEncoder setStageInRegionWithIndirectBuffer: tcIndirectBuff->_mtlBuffer
indirectBufferOffset: mtlTCIndBuffOfst];
mtlTCIndBuffOfst += sizeof(MTLStageInRegionIndirectArguments);
}
[mtlTessCtlEncoder dispatchThreadgroupsWithIndirectBuffer: tcIndirectBuff->_mtlBuffer
indirectBufferOffset: mtlTCIndBuffOfst
threadsPerThreadgroup: MTLSizeMake(vtxThreadExecWidth, 1, 1)];
mtlTCIndBuffOfst += sizeof(MTLDispatchThreadgroupsIndirectArguments);
// Mark pipeline, resources, and tess control push constants as dirty
// so I apply them during the next stage.
cmdEncoder->_graphicsPipelineState.beginMetalRenderPass();
@@ -929,7 +911,7 @@ void MVKCmdDrawIndexedIndirect::encode(MVKCommandEncoder* cmdEncoder) {
cmdEncoder->getPushConstants(VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)->beginMetalRenderPass();
break;
case kMVKGraphicsStageTessControl:
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl);
mtlTessCtlEncoder = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
if (pipeline->needsTessCtlOutputBuffer()) {
[mtlTessCtlEncoder setBuffer: tcOutBuff->_mtlBuffer
offset: tcOutBuff->_offset
@@ -943,31 +925,18 @@ void MVKCmdDrawIndexedIndirect::encode(MVKCommandEncoder* cmdEncoder) {
[mtlTessCtlEncoder setBuffer: tcLevelBuff->_mtlBuffer
offset: tcLevelBuff->_offset
atIndex: pipeline->getTessCtlLevelBufferIndex()];
cmdEncoder->setComputeBytes(mtlTessCtlEncoder,
&inControlPointCount,
sizeof(inControlPointCount),
pipeline->getIndirectParamsIndex().stages[kMVKShaderStageTessCtl]);
[mtlTessCtlEncoder setBuffer: tcParamsBuff->_mtlBuffer
offset: mtlParmBuffOfst
atIndex: pipeline->getIndirectParamsIndex().stages[kMVKShaderStageTessCtl]];
mtlParmBuffOfst += paramsIncr;
if (pipeline->needsVertexOutputBuffer()) {
[mtlTessCtlEncoder setBuffer: vtxOutBuff->_mtlBuffer
offset: vtxOutBuff->_offset
atIndex: kMVKTessCtlInputBufferIndex];
if ([mtlTessCtlEncoder respondsToSelector: @selector(setStageInRegionWithIndirectBuffer:indirectBufferOffset:)]) {
// setStageInRegionWithIndirectBuffer appears to be broken. We have a 1D linear region anyway, so size is irrelevant
//[mtlTessCtlEncoder setStageInRegionWithIndirectBuffer: tcIndirectBuff->_mtlBuffer
// indirectBufferOffset: mtlTCIndBuffOfst];
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake1D(0, std::max(inControlPointCount, outControlPointCount) * patchCount)];
mtlTCIndBuffOfst += sizeof(MTLStageInRegionIndirectArguments);
} else {
// We must assume we can read up to the maximum number of vertices.
[mtlTessCtlEncoder setStageInRegion: MTLRegionMake1D(0, std::max(inControlPointCount, outControlPointCount) * patchCount)];
}
}
[mtlTessCtlEncoder setBuffer: tcIndexBuff->_mtlBuffer
offset: tcIndexBuff->_offset
atIndex: kMVKTessCtlIndexBufferIndex];
[mtlTessCtlEncoder dispatchThreadgroupsWithIndirectBuffer: tcIndirectBuff->_mtlBuffer
indirectBufferOffset: mtlTCIndBuffOfst
threadsPerThreadgroup: MTLSizeMake(std::max(inControlPointCount, outControlPointCount), 1, 1)];
threadsPerThreadgroup: MTLSizeMake(tcWorkgroupSize, 1, 1)];
mtlTCIndBuffOfst += sizeof(MTLDispatchThreadgroupsIndirectArguments);
// Running this stage prematurely ended the render pass, so we have to start it up again.
// TODO: On iOS, maybe we could use a tile shader to avoid this.

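To summarize the indirect-buffer layout used above: each draw's converted
record now holds two dispatch arguments (vertex and tess control) plus the
drawPatches: arguments, preceded by stage-in region arguments when MSL 2.1 is
available. Sketched as a struct (hypothetical name; the code computes these
offsets with sizeof arithmetic):

    struct TCIndirectRecord {
        MTLStageInRegionIndirectArguments stageIn;     // only when MSL >= 2.1
        MTLDispatchThreadgroupsIndirectArguments vtx;  // vertex-stage dispatch
        MTLDispatchThreadgroupsIndirectArguments tc;   // tess-control dispatch
        MTLDrawPatchIndirectArguments draw;            // rasterization drawPatches:
    };
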

@@ -678,7 +678,7 @@ NSString* mvkMTLComputeCommandEncoderLabel(MVKCommandUse cmdUse) {
case kMVKCommandUseCopyBufferToImage: return @"vkCmdCopyBufferToImage ComputeEncoder";
case kMVKCommandUseCopyImageToBuffer: return @"vkCmdCopyImageToBuffer ComputeEncoder";
case kMVKCommandUseFillBuffer: return @"vkCmdFillBuffer ComputeEncoder";
case kMVKCommandUseTessellationControl: return @"vkCmdDraw (tess control stage) ComputeEncoder";
case kMVKCommandUseTessellationVertexTessCtl: return @"vkCmdDraw (vertex and tess control stages) ComputeEncoder";
case kMVKCommandUseCopyQueryPoolResults:return @"vkCmdCopyQueryPoolResults ComputeEncoder";
default: return @"Unknown Use ComputeEncoder";
}


@@ -197,7 +197,13 @@ void MVKPushConstantsCommandEncoderState::encodeImpl(uint32_t stage) {
switch (_shaderStage) {
case VK_SHADER_STAGE_VERTEX_BIT:
if (stage == (isTessellating() ? kMVKGraphicsStageVertex : kMVKGraphicsStageRasterization)) {
if (stage == kMVKGraphicsStageVertex) {
_cmdEncoder->setComputeBytes(_cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl),
_pushConstants.data(),
_pushConstants.size(),
_mtlBufferIndex);
_isDirty = false; // Okay, I changed the encoder
} else if (!isTessellating() && stage == kMVKGraphicsStageRasterization) {
_cmdEncoder->setVertexBytes(_cmdEncoder->_mtlRenderEncoder,
_pushConstants.data(),
_pushConstants.size(),
@@ -207,7 +213,7 @@ void MVKPushConstantsCommandEncoderState::encodeImpl(uint32_t stage) {
break;
case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
if (stage == kMVKGraphicsStageTessControl) {
_cmdEncoder->setComputeBytes(_cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl),
_cmdEncoder->setComputeBytes(_cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl),
_pushConstants.data(),
_pushConstants.size(),
_mtlBufferIndex);
@@ -353,11 +359,6 @@ void MVKDepthStencilCommandEncoderState::encodeImpl(uint32_t stage) {
[_cmdEncoder->_mtlRenderEncoder setDepthStencilState: cmdEncPool->getMTLDepthStencilState(adjustedDSData)];
break;
}
case kMVKGraphicsStageVertex: {
// Vertex stage of tessellation pipeline requires depth/stencil testing be disabled
[_cmdEncoder->_mtlRenderEncoder setDepthStencilState: cmdEncPool->getMTLDepthStencilState(false, false)];
break;
}
default: // Do nothing on other stages
break;
}
@@ -606,7 +607,35 @@ void MVKGraphicsResourcesCommandEncoderState::encodeImpl(uint32_t stage) {
bool fullImageViewSwizzle = pipeline->fullImageViewSwizzle() || _cmdEncoder->getDevice()->_pMetalFeatures->nativeTextureSwizzle;
bool forTessellation = pipeline->isTessellationPipeline();
if (stage == (forTessellation ? kMVKGraphicsStageVertex : kMVKGraphicsStageRasterization)) {
if (stage == kMVKGraphicsStageVertex) {
encodeBindings(kMVKShaderStageVertex, "vertex", fullImageViewSwizzle,
[](MVKCommandEncoder* cmdEncoder, MVKMTLBufferBinding& b)->void {
if (b.isInline)
cmdEncoder->setComputeBytes(cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl),
b.mtlBytes,
b.size,
b.index);
else
[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl) setBuffer: b.mtlBuffer
offset: b.offset
atIndex: b.index];
},
[](MVKCommandEncoder* cmdEncoder, MVKMTLBufferBinding& b, const MVKArrayRef<uint32_t>& s)->void {
cmdEncoder->setComputeBytes(cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl),
s.data,
s.size * sizeof(uint32_t),
b.index);
},
[](MVKCommandEncoder* cmdEncoder, MVKMTLTextureBinding& b)->void {
[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl) setTexture: b.mtlTexture
atIndex: b.index];
},
[](MVKCommandEncoder* cmdEncoder, MVKMTLSamplerStateBinding& b)->void {
[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl) setSamplerState: b.mtlSamplerState
atIndex: b.index];
});
} else if (!forTessellation && stage == kMVKGraphicsStageRasterization) {
encodeBindings(kMVKShaderStageVertex, "vertex", fullImageViewSwizzle,
[pipeline](MVKCommandEncoder* cmdEncoder, MVKMTLBufferBinding& b)->void {
if (b.isInline) {
@@ -651,28 +680,28 @@ void MVKGraphicsResourcesCommandEncoderState::encodeImpl(uint32_t stage) {
encodeBindings(kMVKShaderStageTessCtl, "tessellation control", fullImageViewSwizzle,
[](MVKCommandEncoder* cmdEncoder, MVKMTLBufferBinding& b)->void {
if (b.isInline)
cmdEncoder->setComputeBytes(cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl),
cmdEncoder->setComputeBytes(cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl),
b.mtlBytes,
b.size,
b.index);
else
[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl) setBuffer: b.mtlBuffer
offset: b.offset
atIndex: b.index];
[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl) setBuffer: b.mtlBuffer
offset: b.offset
atIndex: b.index];
},
[](MVKCommandEncoder* cmdEncoder, MVKMTLBufferBinding& b, const MVKArrayRef<uint32_t>& s)->void {
cmdEncoder->setComputeBytes(cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl),
cmdEncoder->setComputeBytes(cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl),
s.data,
s.size * sizeof(uint32_t),
b.index);
},
[](MVKCommandEncoder* cmdEncoder, MVKMTLTextureBinding& b)->void {
[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl) setTexture: b.mtlTexture
atIndex: b.index];
[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl) setTexture: b.mtlTexture
atIndex: b.index];
},
[](MVKCommandEncoder* cmdEncoder, MVKMTLSamplerStateBinding& b)->void {
[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl) setSamplerState: b.mtlSamplerState
atIndex: b.index];
[cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl) setSamplerState: b.mtlSamplerState
atIndex: b.index];
});
}


@@ -173,96 +173,106 @@ struct MTLStageInRegionIndirectArguments {
#if __METAL_VERSION__ >= 120 \n\
kernel void cmdDrawIndirectConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\
device char* destBuff [[buffer(1)]], \n\
constant uint32_t& srcStride [[buffer(2)]], \n\
constant uint32_t& inControlPointCount [[buffer(3)]], \n\
constant uint32_t& outControlPointCount [[buffer(4)]], \n\
constant uint32_t& drawCount [[buffer(5)]], \n\
device char* paramsBuff [[buffer(2)]], \n\
constant uint32_t& srcStride [[buffer(3)]], \n\
constant uint32_t& inControlPointCount [[buffer(4)]], \n\
constant uint32_t& outControlPointCount [[buffer(5)]], \n\
constant uint32_t& drawCount [[buffer(6)]], \n\
constant uint32_t& vtxThreadExecWidth [[buffer(7)]], \n\
constant uint32_t& tcWorkgroupSize [[buffer(8)]], \n\
uint idx [[thread_position_in_grid]]) { \n\
if (idx >= drawCount) { return; } \n\
const device auto& src = *reinterpret_cast<const device MTLDrawPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\
device char* dest = destBuff + idx * (sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments));\n\
device char* dest; \n\
device auto* params = reinterpret_cast<device uint32_t*>(paramsBuff + idx * 256); \n\
#if __METAL_VERSION__ >= 210 \n\
dest = destBuff + idx * (sizeof(MTLStageInRegionIndirectArguments) + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\
device auto& destSI = *(device MTLStageInRegionIndirectArguments*)dest; \n\
dest += sizeof(MTLStageInRegionIndirectArguments); \n\
#else \n\
dest = destBuff + idx * (sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\
#endif \n\
device auto& destTC = *(device MTLDispatchThreadgroupsIndirectArguments*)dest; \n\
device auto& destTE = *(device MTLDrawPatchIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments));\n\
destTC.threadgroupsPerGrid[0] = (src.vertexCount * src.instanceCount + inControlPointCount - 1) / inControlPointCount;\n\
device auto& destVtx = *(device MTLDispatchThreadgroupsIndirectArguments*)dest; \n\
device auto& destTC = *(device MTLDispatchThreadgroupsIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments));\n\
device auto& destTE = *(device MTLDrawPatchIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2);\n\
uint32_t patchCount = (src.vertexCount * src.instanceCount + inControlPointCount - 1) / inControlPointCount;\n\
params[0] = inControlPointCount; \n\
params[1] = patchCount; \n\
destVtx.threadgroupsPerGrid[0] = (src.vertexCount + vtxThreadExecWidth - 1) / vtxThreadExecWidth; \n\
destVtx.threadgroupsPerGrid[1] = src.instanceCount; \n\
destVtx.threadgroupsPerGrid[2] = 1; \n\
destTC.threadgroupsPerGrid[0] = (patchCount * outControlPointCount + tcWorkgroupSize - 1) / tcWorkgroupSize;\n\
destTC.threadgroupsPerGrid[1] = destTC.threadgroupsPerGrid[2] = 1; \n\
destTE.patchCount = destTC.threadgroupsPerGrid[0]; \n\
destTE.patchCount = patchCount; \n\
destTE.instanceCount = 1; \n\
destTE.patchStart = destTE.baseInstance = 0; \n\
#if __METAL_VERSION__ >= 210 \n\
destSI.stageInOrigin[0] = destSI.stageInOrigin[1] = destSI.stageInOrigin[2] = 0; \n\
destSI.stageInSize[0] = src.instanceCount * max(src.vertexCount, outControlPointCount * destTE.patchCount); \n\
destSI.stageInSize[1] = destSI.stageInSize[2] = 1; \n\
destSI.stageInOrigin[0] = src.vertexStart; \n\
destSI.stageInOrigin[1] = src.baseInstance; \n\
destSI.stageInOrigin[2] = 0; \n\
destSI.stageInSize[0] = src.vertexCount; \n\
destSI.stageInSize[1] = src.instanceCount; \n\
destSI.stageInSize[2] = 1; \n\
#endif \n\
} \n\
\n\
kernel void cmdDrawIndexedIndirectConvertBuffers(const device char* srcBuff [[buffer(0)]], \n\
device char* destBuff [[buffer(1)]], \n\
constant uint32_t& srcStride [[buffer(2)]], \n\
constant uint32_t& inControlPointCount [[buffer(3)]], \n\
constant uint32_t& outControlPointCount [[buffer(4)]], \n\
constant uint32_t& drawCount [[buffer(5)]], \n\
device char* paramsBuff [[buffer(2)]], \n\
constant uint32_t& srcStride [[buffer(3)]], \n\
constant uint32_t& inControlPointCount [[buffer(4)]], \n\
constant uint32_t& outControlPointCount [[buffer(5)]], \n\
constant uint32_t& drawCount [[buffer(6)]], \n\
constant uint32_t& vtxThreadExecWidth [[buffer(7)]], \n\
constant uint32_t& tcWorkgroupSize [[buffer(8)]], \n\
uint idx [[thread_position_in_grid]]) { \n\
if (idx >= drawCount) { return; } \n\
const device auto& src = *reinterpret_cast<const device MTLDrawIndexedPrimitivesIndirectArguments*>(srcBuff + idx * srcStride);\n\
device char* dest = destBuff + idx * (sizeof(MTLDispatchThreadgroupsIndirectArguments) + sizeof(MTLDrawPatchIndirectArguments));\n\
device char* dest; \n\
device auto* params = reinterpret_cast<device uint32_t*>(paramsBuff + idx * 256); \n\
#if __METAL_VERSION__ >= 210 \n\
dest = destBuff + idx * (sizeof(MTLStageInRegionIndirectArguments) + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\
device auto& destSI = *(device MTLStageInRegionIndirectArguments*)dest; \n\
dest += sizeof(MTLStageInRegionIndirectArguments); \n\
#else \n\
dest = destBuff + idx * (sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2 + sizeof(MTLDrawPatchIndirectArguments));\n\
#endif \n\
device auto& destTC = *(device MTLDispatchThreadgroupsIndirectArguments*)dest; \n\
device auto& destTE = *(device MTLDrawPatchIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments));\n\
destTC.threadgroupsPerGrid[0] = (src.indexCount * src.instanceCount + inControlPointCount - 1) / inControlPointCount;\n\
device auto& destVtx = *(device MTLDispatchThreadgroupsIndirectArguments*)dest; \n\
device auto& destTC = *(device MTLDispatchThreadgroupsIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments));\n\
device auto& destTE = *(device MTLDrawPatchIndirectArguments*)(dest + sizeof(MTLDispatchThreadgroupsIndirectArguments) * 2);\n\
uint32_t patchCount = (src.indexCount * src.instanceCount + inControlPointCount - 1) / inControlPointCount;\n\
params[0] = inControlPointCount; \n\
params[1] = patchCount; \n\
destVtx.threadgroupsPerGrid[0] = (src.indexCount + vtxThreadExecWidth - 1) / vtxThreadExecWidth; \n\
destVtx.threadgroupsPerGrid[1] = src.instanceCount; \n\
destVtx.threadgroupsPerGrid[2] = 1; \n\
destTC.threadgroupsPerGrid[0] = (patchCount * outControlPointCount + tcWorkgroupSize - 1) / tcWorkgroupSize;\n\
destTC.threadgroupsPerGrid[1] = destTC.threadgroupsPerGrid[2] = 1; \n\
destTE.patchCount = destTC.threadgroupsPerGrid[0]; \n\
destTE.patchCount = patchCount; \n\
destTE.instanceCount = 1; \n\
destTE.patchStart = destTE.baseInstance = 0; \n\
#if __METAL_VERSION__ >= 210 \n\
destSI.stageInOrigin[0] = destSI.stageInOrigin[1] = destSI.stageInOrigin[2] = 0; \n\
destSI.stageInSize[0] = src.instanceCount * max(src.indexCount, outControlPointCount * destTE.patchCount); \n\
destSI.stageInSize[1] = destSI.stageInSize[2] = 1; \n\
destSI.stageInOrigin[0] = src.baseVertex; \n\
destSI.stageInOrigin[1] = src.baseInstance; \n\
destSI.stageInOrigin[2] = 0; \n\
destSI.stageInSize[0] = src.indexCount; \n\
destSI.stageInSize[1] = src.instanceCount; \n\
destSI.stageInSize[2] = 1; \n\
#endif \n\
} \n\
\n\
kernel void cmdDrawIndexedCopyIndex16Buffer(const device uint16_t* srcBuff [[buffer(0)]], \n\
device uint16_t* destBuff [[buffer(1)]], \n\
constant uint32_t& inControlPointCount [[buffer(2)]], \n\
constant uint32_t& outControlPointCount [[buffer(3)]], \n\
const device MTLDrawIndexedPrimitivesIndirectArguments& params [[buffer(4)]]) {\n\
uint patchCount = (params.indexCount + inControlPointCount - 1) / inControlPointCount; \n\
for (uint i = 0; i < params.instanceCount; i++) { \n\
for (uint j = 0; j < patchCount; j++) { \n\
for (uint k = 0; k < max(inControlPointCount, outControlPointCount); k++) { \n\
if (k < inControlPointCount) { \n\
destBuff[i * params.indexCount + j * outControlPointCount + k] = srcBuff[params.indexStart + j * inControlPointCount + k] + i * params.indexCount;\n\
} else { \n\
destBuff[i * params.indexCount + j * outControlPointCount + k] = 0; \n\
} \n\
} \n\
} \n\
} \n\
const device MTLDrawIndexedPrimitivesIndirectArguments& params [[buffer(2)]],\n\
uint i [[thread_position_in_grid]]) { \n\
destBuff[i] = srcBuff[params.indexStart + i]; \n\
} \n\
\n\
kernel void cmdDrawIndexedCopyIndex32Buffer(const device uint32_t* srcBuff [[buffer(0)]], \n\
device uint32_t* destBuff [[buffer(1)]], \n\
constant uint32_t& inControlPointCount [[buffer(2)]], \n\
constant uint32_t& outControlPointCount [[buffer(3)]], \n\
const device MTLDrawIndexedPrimitivesIndirectArguments& params [[buffer(4)]]) {\n\
uint patchCount = (params.indexCount + inControlPointCount - 1) / inControlPointCount; \n\
for (uint i = 0; i < params.instanceCount; i++) { \n\
for (uint j = 0; j < patchCount; j++) { \n\
for (uint k = 0; k < max(inControlPointCount, outControlPointCount); k++) { \n\
if (k < inControlPointCount) { \n\
destBuff[i * params.indexCount + j * outControlPointCount + k] = srcBuff[params.indexStart + j * inControlPointCount + k] + i * params.indexCount;\n\
} else { \n\
destBuff[i * params.indexCount + j * outControlPointCount + k] = 0; \n\
} \n\
} \n\
} \n\
} \n\
const device MTLDrawIndexedPrimitivesIndirectArguments& params [[buffer(2)]],\n\
uint i [[thread_position_in_grid]]) { \n\
destBuff[i] = srcBuff[params.indexStart + i]; \n\
} \n\
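The rewritten copy kernels are embarrassingly parallel: one thread copies one index, and the old per-instance/per-patch loops are gone. Since neither kernel guards its thread index, they presumably rely on an exactly sized grid; a hedged host-side sketch (names hypothetical, not MoltenVK's actual encoder code):

// One thread per index; an exact-size dispatch is assumed, e.g. via
// dispatchThreads:threadsPerThreadgroup: on hardware that supports
// non-uniform threadgroup sizes.
MTLSize grid   = MTLSizeMake(indexCount, 1, 1);
MTLSize tgSize = MTLSizeMake(copyPipelineState.threadExecutionWidth, 1, 1);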
\n\
#endif \n\


@ -1063,6 +1063,7 @@ void MVKPhysicalDevice::initMetalFeatures() {
if (supportsMTLFeatureSet(iOS_GPUFamily4_v1)) {
_metalFeatures.postDepthCoverage = true;
_metalFeatures.nonUniformThreadgroups = true;
}
if (supportsMTLFeatureSet(iOS_GPUFamily5_v1)) {
@ -1110,6 +1111,7 @@ void MVKPhysicalDevice::initMetalFeatures() {
_metalFeatures.arrayOfSamplers = true;
_metalFeatures.presentModeImmediate = true;
_metalFeatures.fences = true;
_metalFeatures.nonUniformThreadgroups = true;
}
if (supportsMTLFeatureSet(macOS_GPUFamily1_v4)) {
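The new nonUniformThreadgroups flag records whether the device can dispatch a grid that is not a whole multiple of the threadgroup size. The practical difference, sketched with illustrative values:

// Without the feature: pad the grid up and guard against overrun in the kernel.
uint32_t threadCount = 100, threadgroupSize = 32;
uint32_t groups = (threadCount + threadgroupSize - 1) / threadgroupSize;  // 4 groups = 128 threads
// -> dispatchThreadgroups:MTLSizeMake(groups, 1, 1) threadsPerThreadgroup:...
// With the feature: dispatch exactly 100 threads, no padding, no kernel-side guard.
// -> dispatchThreads:MTLSizeMake(threadCount, 1, 1) threadsPerThreadgroup:...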


@ -124,8 +124,7 @@ protected:
#pragma mark MVKPipeline
static const uint32_t kMVKTessCtlInputBufferIndex = 30;
static const uint32_t kMVKTessCtlIndexBufferIndex = 29;
static const uint32_t kMVKTessCtlNumReservedBuffers = 2;
static const uint32_t kMVKTessCtlNumReservedBuffers = 1;
static const uint32_t kMVKTessEvalInputBufferIndex = 30;
static const uint32_t kMVKTessEvalPatchInputBufferIndex = 29;
@ -228,6 +227,18 @@ public:
/** Returns the current tessellation level buffer binding for the tess. control shader. */
uint32_t getTessCtlLevelBufferIndex() { return _tessCtlLevelBufferIndex; }
/** Returns the MTLComputePipelineState object for the vertex stage of a tessellated draw with no indices. */
id<MTLComputePipelineState> getTessVertexStageState();
/** Returns the MTLComputePipelineState object for the vertex stage of a tessellated draw with 16-bit indices. */
id<MTLComputePipelineState> getTessVertexStageIndex16State();
/** Returns the MTLComputePipelineState object for the vertex stage of a tessellated draw with 32-bit indices. */
id<MTLComputePipelineState> getTessVertexStageIndex32State();
/** Returns the MTLComputePipelineState object for the tessellation control stage of a tessellated draw. */
id<MTLComputePipelineState> getTessControlStageState() { return _mtlTessControlStageState; }
/** Returns true if the vertex shader needs a buffer to store its output. */
bool needsVertexOutputBuffer() { return _needsVertexOutputBuffer; }
@ -261,17 +272,19 @@ protected:
void addVertexInputToShaderConverterContext(SPIRVToMSLConversionConfiguration& shaderContext, const VkGraphicsPipelineCreateInfo* pCreateInfo);
void addPrevStageOutputToShaderConverterContext(SPIRVToMSLConversionConfiguration& shaderContext, SPIRVShaderOutputs& outputs);
MTLRenderPipelineDescriptor* newMTLRenderPipelineDescriptor(const VkGraphicsPipelineCreateInfo* pCreateInfo, const SPIRVTessReflectionData& reflectData);
MTLRenderPipelineDescriptor* newMTLTessVertexStageDescriptor(const VkGraphicsPipelineCreateInfo* pCreateInfo, const SPIRVTessReflectionData& reflectData, SPIRVToMSLConversionConfiguration& shaderContext);
MTLComputePipelineDescriptor* newMTLTessVertexStageDescriptor(const VkGraphicsPipelineCreateInfo* pCreateInfo, const SPIRVTessReflectionData& reflectData, SPIRVToMSLConversionConfiguration& shaderContext);
MTLComputePipelineDescriptor* newMTLTessControlStageDescriptor(const VkGraphicsPipelineCreateInfo* pCreateInfo, const SPIRVTessReflectionData& reflectData, SPIRVToMSLConversionConfiguration& shaderContext);
MTLRenderPipelineDescriptor* newMTLTessRasterStageDescriptor(const VkGraphicsPipelineCreateInfo* pCreateInfo, const SPIRVTessReflectionData& reflectData, SPIRVToMSLConversionConfiguration& shaderContext);
bool addVertexShaderToPipeline(MTLRenderPipelineDescriptor* plDesc, const VkGraphicsPipelineCreateInfo* pCreateInfo, SPIRVToMSLConversionConfiguration& shaderContext);
bool addVertexShaderToPipeline(MTLComputePipelineDescriptor* plDesc, const VkGraphicsPipelineCreateInfo* pCreateInfo, SPIRVToMSLConversionConfiguration& shaderContext);
bool addTessCtlShaderToPipeline(MTLComputePipelineDescriptor* plDesc, const VkGraphicsPipelineCreateInfo* pCreateInfo, SPIRVToMSLConversionConfiguration& shaderContext, SPIRVShaderOutputs& prevOutput);
bool addTessEvalShaderToPipeline(MTLRenderPipelineDescriptor* plDesc, const VkGraphicsPipelineCreateInfo* pCreateInfo, SPIRVToMSLConversionConfiguration& shaderContext, SPIRVShaderOutputs& prevOutput);
bool addFragmentShaderToPipeline(MTLRenderPipelineDescriptor* plDesc, const VkGraphicsPipelineCreateInfo* pCreateInfo, SPIRVToMSLConversionConfiguration& shaderContext, SPIRVShaderOutputs& prevOutput);
bool addVertexInputToPipeline(MTLRenderPipelineDescriptor* plDesc, const VkPipelineVertexInputStateCreateInfo* pVI, const SPIRVToMSLConversionConfiguration& shaderContext);
template<class T>
bool addVertexInputToPipeline(T* inputDesc, const VkPipelineVertexInputStateCreateInfo* pVI, const SPIRVToMSLConversionConfiguration& shaderContext);
void addTessellationToPipeline(MTLRenderPipelineDescriptor* plDesc, const SPIRVTessReflectionData& reflectData, const VkPipelineTessellationStateCreateInfo* pTS);
void addFragmentOutputToPipeline(MTLRenderPipelineDescriptor* plDesc, const SPIRVTessReflectionData& reflectData, const VkGraphicsPipelineCreateInfo* pCreateInfo, bool isTessellationVertexPipeline = false);
bool isRenderingPoints(const VkGraphicsPipelineCreateInfo* pCreateInfo, const SPIRVTessReflectionData& reflectData);
void addFragmentOutputToPipeline(MTLRenderPipelineDescriptor* plDesc, const VkGraphicsPipelineCreateInfo* pCreateInfo);
bool isRenderingPoints(const VkGraphicsPipelineCreateInfo* pCreateInfo);
bool verifyImplicitBuffer(bool needsBuffer, MVKShaderImplicitRezBinding& index, MVKShaderStage stage, const char* name, uint32_t reservedBuffers);
uint32_t getTranslatedVertexBinding(uint32_t binding, uint32_t translationOffset, uint32_t maxBinding);
@ -288,12 +301,13 @@ protected:
MVKSmallVector<VkRect2D, kMVKCachedViewportScissorCount> _scissors;
MVKSmallVector<MVKTranslatedVertexBinding> _translatedVertexBindings;
MTLComputePipelineDescriptor* _mtlTessControlStageDesc = nil;
MTLComputePipelineDescriptor* _mtlTessVertexStageDesc = nil;
id<MTLFunction> _mtlTessVertexFunctions[3] = {nil, nil, nil};
id<MTLRenderPipelineState> _mtlTessVertexStageState = nil;
id<MTLComputePipelineState> _mtlTessVertexStageState = nil;
id<MTLComputePipelineState> _mtlTessVertexStageIndex16State = nil;
id<MTLComputePipelineState> _mtlTessVertexStageIndex32State = nil;
id<MTLComputePipelineState> _mtlTessControlStageState = nil;
id<MTLComputePipelineState> _mtlTessControlStageIndex16State = nil;
id<MTLComputePipelineState> _mtlTessControlStageIndex32State = nil;
id<MTLRenderPipelineState> _mtlPipelineState = nil;
MTLCullMode _mtlCullMode;
MTLWinding _mtlFrontWinding;
@ -316,7 +330,7 @@ protected:
bool _needsTessCtlBufferSizeBuffer = false;
bool _needsTessCtlOutputBuffer = false;
bool _needsTessCtlPatchOutputBuffer = false;
bool _needsTessCtlInput = false;
bool _needsTessCtlInputBuffer = false;
bool _needsTessEvalSwizzleBuffer = false;
bool _needsTessEvalBufferSizeBuffer = false;
bool _needsFragmentSwizzleBuffer = false;


@ -189,49 +189,41 @@ void MVKGraphicsPipeline::encode(MVKCommandEncoder* cmdEncoder, uint32_t stage)
if ( !_hasValidMTLPipelineStates ) { return; }
id<MTLRenderCommandEncoder> mtlCmdEnc = cmdEncoder->_mtlRenderEncoder;
id<MTLComputeCommandEncoder> tessCtlEnc;
if ( stage != kMVKGraphicsStageTessControl && !mtlCmdEnc ) { return; } // Pre-renderpass. Come back later.
switch (stage) {
case kMVKGraphicsStageVertex:
// Stage 1 of a tessellated draw: vertex-only pipeline with rasterization disabled.
[mtlCmdEnc setRenderPipelineState: _mtlTessVertexStageState];
break;
case kMVKGraphicsStageTessControl: {
// Stage 2 of a tessellated draw: compute pipeline to run the tess. control shader.
case kMVKGraphicsStageVertex: {
// Stage 1 of a tessellated draw: compute pipeline to run the vertex shader.
// N.B. This will prematurely terminate the current subpass. We'll have to remember to start it back up again.
// Due to yet another impedance mismatch between Metal and Vulkan, which pipeline
// state we use depends on whether or not we have an index buffer, and if we do,
// the kind of indices in it. Furthermore, to avoid fetching the wrong attribute
// data when there are more output vertices than input vertices, we use an
// indexed dispatch to force each instance to fetch the correct entry.
// the kind of indices in it.
id<MTLComputePipelineState> plState;
const char* compilerType = "Tessellation control stage pipeline";
const MVKIndexMTLBufferBinding& indexBuff = cmdEncoder->_graphicsResourcesState._mtlIndexBufferBinding;
MTLComputePipelineDescriptor* plDesc = [_mtlTessControlStageDesc copy]; // temp retain a copy to be thread-safe.
if (!indexBuff.mtlBuffer && getInputControlPointCount() >= getOutputControlPointCount()) {
plState = getOrCompilePipeline(plDesc, _mtlTessControlStageState, compilerType);
if (!indexBuff.mtlBuffer) {
plState = getTessVertexStageState();
} else if (indexBuff.mtlIndexType == MTLIndexTypeUInt16) {
plDesc.stageInputDescriptor.indexType = MTLIndexTypeUInt16;
plDesc.stageInputDescriptor.layouts[kMVKTessCtlInputBufferIndex].stepFunction = MTLStepFunctionThreadPositionInGridXIndexed;
plState = getOrCompilePipeline(plDesc, _mtlTessControlStageIndex16State, compilerType);
plState = getTessVertexStageIndex16State();
} else {
plDesc.stageInputDescriptor.indexType = MTLIndexTypeUInt32;
plDesc.stageInputDescriptor.layouts[kMVKTessCtlInputBufferIndex].stepFunction = MTLStepFunctionThreadPositionInGridXIndexed;
plState = getOrCompilePipeline(plDesc, _mtlTessControlStageIndex32State, compilerType);
plState = getTessVertexStageIndex32State();
}
[plDesc release]; // temp release
if ( !_hasValidMTLPipelineStates ) { return; }
id<MTLComputeCommandEncoder> tessCtlEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationControl);
tessCtlEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
[tessCtlEnc setComputePipelineState: plState];
if (_needsTessCtlInput) {
[tessCtlEnc setThreadgroupMemoryLength: getDevice()->_pProperties->limits.maxTessellationControlPerVertexInputComponents * 4 * getInputControlPointCount() atIndex: 0];
}
break;
}
case kMVKGraphicsStageTessControl: {
// Stage 2 of a tessellated draw: compute pipeline to run the tess. control shader.
if ( !_mtlTessControlStageState ) { return; } // Abort if pipeline could not be created.
tessCtlEnc = cmdEncoder->getMTLComputeEncoder(kMVKCommandUseTessellationVertexTessCtl);
[tessCtlEnc setComputePipelineState: _mtlTessControlStageState];
break;
}
@ -287,6 +279,46 @@ bool MVKGraphicsPipeline::supportsDynamicState(VkDynamicState state) {
}
}
static const char vtxCompilerType[] = "Vertex stage pipeline for tessellation";
id<MTLComputePipelineState> MVKGraphicsPipeline::getTessVertexStageState() {
MTLComputePipelineDescriptor* plDesc = [_mtlTessVertexStageDesc copy]; // temp retain a copy to be thread-safe.
plDesc.computeFunction = _mtlTessVertexFunctions[0];
id<MTLComputePipelineState> plState = getOrCompilePipeline(plDesc, _mtlTessVertexStageState, vtxCompilerType);
[plDesc release]; // temp release
return plState;
}
id<MTLComputePipelineState> MVKGraphicsPipeline::getTessVertexStageIndex16State() {
MTLComputePipelineDescriptor* plDesc = [_mtlTessVertexStageDesc copy]; // temp retain a copy to be thread-safe.
plDesc.computeFunction = _mtlTessVertexFunctions[1];
plDesc.stageInputDescriptor.indexType = MTLIndexTypeUInt16;
for (uint32_t i = 0; i < 31; i++) {
MTLBufferLayoutDescriptor* blDesc = plDesc.stageInputDescriptor.layouts[i];
if (blDesc.stepFunction == MTLStepFunctionThreadPositionInGridX) {
blDesc.stepFunction = MTLStepFunctionThreadPositionInGridXIndexed;
}
}
id<MTLComputePipelineState> plState = getOrCompilePipeline(plDesc, _mtlTessVertexStageIndex16State, vtxCompilerType);
[plDesc release]; // temp release
return plState;
}
id<MTLComputePipelineState> MVKGraphicsPipeline::getTessVertexStageIndex32State() {
MTLComputePipelineDescriptor* plDesc = [_mtlTessVertexStageDesc copy]; // temp retain a copy to be thread-safe.
plDesc.computeFunction = _mtlTessVertexFunctions[2];
plDesc.stageInputDescriptor.indexType = MTLIndexTypeUInt32;
for (uint32_t i = 0; i < 31; i++) {
MTLBufferLayoutDescriptor* blDesc = plDesc.stageInputDescriptor.layouts[i];
if (blDesc.stepFunction == MTLStepFunctionThreadPositionInGridX) {
blDesc.stepFunction = MTLStepFunctionThreadPositionInGridXIndexed;
}
}
id<MTLComputePipelineState> plState = getOrCompilePipeline(plDesc, _mtlTessVertexStageIndex32State, vtxCompilerType);
[plDesc release]; // temp release
return plState;
}
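For context on the loops above: an X-indexed step function makes the compute stage-in machinery fetch attributes through the bound index buffer rather than directly by thread position, which is what lets one compiled kernel body serve all three draw flavors. Conceptually (an illustrative model, not Metal internals):

// Roughly what the stage-in fetch index becomes for thread x:
uint32_t stageInFetchIndex(uint32_t x, MTLStepFunction sf, const uint32_t* indexBuf) {
    if (sf == MTLStepFunctionThreadPositionInGridXIndexed) { return indexBuf[x]; }  // indexed draw
    return x;  // MTLStepFunctionThreadPositionInGridX: non-indexed draw
}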
#pragma mark Construction
@ -343,7 +375,7 @@ MVKGraphicsPipeline::MVKGraphicsPipeline(MVKDevice* device,
// Topology
_mtlPrimitiveType = MTLPrimitiveTypePoint;
if (pCreateInfo->pInputAssemblyState && !isRenderingPoints(pCreateInfo, reflectData)) {
if (pCreateInfo->pInputAssemblyState && !isRenderingPoints(pCreateInfo)) {
_mtlPrimitiveType = mvkMTLPrimitiveTypeFromVkPrimitiveTopology(pCreateInfo->pInputAssemblyState->topology);
}
@ -427,11 +459,12 @@ id<MTLComputePipelineState> MVKGraphicsPipeline::getOrCompilePipeline(MTLCompute
// Constructs the underlying Metal render pipeline.
void MVKGraphicsPipeline::initMTLRenderPipelineState(const VkGraphicsPipelineCreateInfo* pCreateInfo, const SPIRVTessReflectionData& reflectData) {
_mtlTessVertexStageState = nil;
_mtlTessVertexStageIndex16State = nil;
_mtlTessVertexStageIndex32State = nil;
_mtlTessControlStageState = nil;
_mtlTessControlStageIndex16State = nil;
_mtlTessControlStageIndex32State = nil;
_mtlPipelineState = nil;
_mtlTessControlStageDesc = nil;
_mtlTessVertexStageDesc = nil;
for (uint32_t i = 0; i < 3; i++) { _mtlTessVertexFunctions[i] = nil; }
if (!isTessellationPipeline()) {
MTLRenderPipelineDescriptor* plDesc = newMTLRenderPipelineDescriptor(pCreateInfo, reflectData); // temp retain
if (plDesc) {
@ -440,20 +473,20 @@ void MVKGraphicsPipeline::initMTLRenderPipelineState(const VkGraphicsPipelineCre
[plDesc release]; // temp release
} else {
// In this case, we need to create three render pipelines. But, the way Metal handles
// index buffers for compute stage-in means we have to defer creation of stage 2 until
// index buffers for compute stage-in means we have to defer creation of stage 1 until
// draw time. In the meantime, we'll create and retain a descriptor for it.
SPIRVToMSLConversionConfiguration shaderContext;
initMVKShaderConverterContext(shaderContext, pCreateInfo, reflectData);
MTLRenderPipelineDescriptor* vtxPLDesc = newMTLTessVertexStageDescriptor(pCreateInfo, reflectData, shaderContext); // temp retain
_mtlTessControlStageDesc = newMTLTessControlStageDescriptor(pCreateInfo, reflectData, shaderContext); // retained
_mtlTessVertexStageDesc = newMTLTessVertexStageDescriptor(pCreateInfo, reflectData, shaderContext); // retained
MTLComputePipelineDescriptor* tcPLDesc = newMTLTessControlStageDescriptor(pCreateInfo, reflectData, shaderContext); // temp retained
MTLRenderPipelineDescriptor* rastPLDesc = newMTLTessRasterStageDescriptor(pCreateInfo, reflectData, shaderContext); // temp retained
if (vtxPLDesc && _mtlTessControlStageDesc && rastPLDesc) {
if (getOrCompilePipeline(vtxPLDesc, _mtlTessVertexStageState)) {
if (_mtlTessVertexStageDesc && tcPLDesc && rastPLDesc) {
if (getOrCompilePipeline(tcPLDesc, _mtlTessControlStageState, "Tessellation control")) {
getOrCompilePipeline(rastPLDesc, _mtlPipelineState);
}
}
[vtxPLDesc release]; // temp release
[tcPLDesc release]; // temp release
[rastPLDesc release]; // temp release
}
}
@ -479,13 +512,13 @@ MTLRenderPipelineDescriptor* MVKGraphicsPipeline::newMTLRenderPipelineDescriptor
// Vertex input
// This needs to happen before compiling the fragment shader, or we'll lose information on vertex attributes.
if (!addVertexInputToPipeline(plDesc, pCreateInfo->pVertexInputState, shaderContext)) { return nil; }
if (!addVertexInputToPipeline(plDesc.vertexDescriptor, pCreateInfo->pVertexInputState, shaderContext)) { return nil; }
// Fragment shader - only add if rasterization is enabled
if (!addFragmentShaderToPipeline(plDesc, pCreateInfo, shaderContext, vtxOutputs)) { return nil; }
// Output
addFragmentOutputToPipeline(plDesc, reflectData, pCreateInfo);
addFragmentOutputToPipeline(plDesc, pCreateInfo);
// Metal does not allow the name of the pipeline to be changed after it has been created,
// and we need to create the Metal pipeline immediately to provide error feedback to app.
@ -495,22 +528,27 @@ MTLRenderPipelineDescriptor* MVKGraphicsPipeline::newMTLRenderPipelineDescriptor
return plDesc;
}
// Returns a retained MTLRenderPipelineDescriptor for the vertex stage of a tessellated draw constructed from this instance, or nil if an error occurs.
// Returns a retained MTLComputePipelineDescriptor for the vertex stage of a tessellated draw constructed from this instance, or nil if an error occurs.
// It is the responsibility of the caller to release the returned descriptor.
MTLRenderPipelineDescriptor* MVKGraphicsPipeline::newMTLTessVertexStageDescriptor(const VkGraphicsPipelineCreateInfo* pCreateInfo,
MTLComputePipelineDescriptor* MVKGraphicsPipeline::newMTLTessVertexStageDescriptor(const VkGraphicsPipelineCreateInfo* pCreateInfo,
const SPIRVTessReflectionData& reflectData,
SPIRVToMSLConversionConfiguration& shaderContext) {
MTLRenderPipelineDescriptor* plDesc = [MTLRenderPipelineDescriptor new]; // retained
MTLComputePipelineDescriptor* plDesc = [MTLComputePipelineDescriptor new]; // retained
// Add shader stages.
if (!addVertexShaderToPipeline(plDesc, pCreateInfo, shaderContext)) { return nil; }
// Vertex input
if (!addVertexInputToPipeline(plDesc, pCreateInfo->pVertexInputState, shaderContext)) { return nil; }
plDesc.stageInputDescriptor = [MTLStageInputOutputDescriptor stageInputOutputDescriptor];
if (!addVertexInputToPipeline(plDesc.stageInputDescriptor, pCreateInfo->pVertexInputState, shaderContext)) { return nil; }
plDesc.stageInputDescriptor.indexBufferIndex = _indirectParamsIndex.stages[kMVKShaderStageVertex];
// Even though this won't be used for rasterization, we still have to set up the rasterization state to
// match the render pass, or Metal will complain.
addFragmentOutputToPipeline(plDesc, reflectData, pCreateInfo, true);
plDesc.threadGroupSizeIsMultipleOfThreadExecutionWidth = YES;
// Metal does not allow the name of the pipeline to be changed after it has been created,
// and we need to create the Metal pipeline immediately to provide error feedback to app.
// The best we can do at this point is set the pipeline name from the layout.
setLabelIfNotNil(plDesc, ((MVKPipelineLayout*)pCreateInfo->layout)->getDebugName());
return plDesc;
}
@ -628,25 +666,6 @@ MTLComputePipelineDescriptor* MVKGraphicsPipeline::newMTLTessControlStageDescrip
return nil;
}
// Stage input
plDesc.stageInputDescriptor = [MTLStageInputOutputDescriptor stageInputOutputDescriptor];
uint32_t offset = 0;
for (const SPIRVShaderOutput& output : vtxOutputs) {
if (output.builtin == spv::BuiltInPointSize && !reflectData.pointMode) { continue; }
offset = (uint32_t)mvkAlignByteCount(offset, sizeOfOutput(output));
if (shaderContext.isShaderInputLocationUsed(output.location)) {
plDesc.stageInputDescriptor.attributes[output.location].bufferIndex = kMVKTessCtlInputBufferIndex;
plDesc.stageInputDescriptor.attributes[output.location].format = (MTLAttributeFormat)getPixelFormats()->getMTLVertexFormat(mvkFormatFromOutput(output));
plDesc.stageInputDescriptor.attributes[output.location].offset = offset;
}
offset += sizeOfOutput(output);
}
if (vtxOutputs.size() > 0) {
plDesc.stageInputDescriptor.layouts[kMVKTessCtlInputBufferIndex].stepFunction = MTLStepFunctionThreadPositionInGridX;
plDesc.stageInputDescriptor.layouts[kMVKTessCtlInputBufferIndex].stride = mvkAlignByteCount(offset, sizeOfOutput(vtxOutputs[0]));
}
plDesc.stageInputDescriptor.indexBufferIndex = kMVKTessCtlIndexBufferIndex;
// Metal does not allow the name of the pipeline to be changed after it has been created,
// and we need to create the Metal pipeline immediately to provide error feedback to app.
// The best we can do at this point is set the pipeline name from the layout.
@ -767,7 +786,7 @@ MTLRenderPipelineDescriptor* MVKGraphicsPipeline::newMTLTessRasterStageDescripto
addTessellationToPipeline(plDesc, reflectData, pCreateInfo->pTessellationState);
// Output
addFragmentOutputToPipeline(plDesc, reflectData, pCreateInfo);
addFragmentOutputToPipeline(plDesc, pCreateInfo);
return plDesc;
}
@ -833,6 +852,62 @@ bool MVKGraphicsPipeline::addVertexShaderToPipeline(MTLRenderPipelineDescriptor*
return true;
}
// Adds a vertex shader compiled as a compute kernel to the pipeline description.
bool MVKGraphicsPipeline::addVertexShaderToPipeline(MTLComputePipelineDescriptor* plDesc,
const VkGraphicsPipelineCreateInfo* pCreateInfo,
SPIRVToMSLConversionConfiguration& shaderContext) {
uint32_t vbCnt = pCreateInfo->pVertexInputState->vertexBindingDescriptionCount;
shaderContext.options.entryPointStage = spv::ExecutionModelVertex;
shaderContext.options.entryPointName = _pVertexSS->pName;
shaderContext.options.mslOptions.swizzle_buffer_index = _swizzleBufferIndex.stages[kMVKShaderStageVertex];
shaderContext.options.mslOptions.shader_index_buffer_index = _indirectParamsIndex.stages[kMVKShaderStageVertex];
shaderContext.options.mslOptions.shader_output_buffer_index = _outputBufferIndex.stages[kMVKShaderStageVertex];
shaderContext.options.mslOptions.buffer_size_buffer_index = _bufferSizeBufferIndex.stages[kMVKShaderStageVertex];
shaderContext.options.mslOptions.capture_output_to_buffer = true;
shaderContext.options.mslOptions.vertex_for_tessellation = true;
shaderContext.options.mslOptions.disable_rasterization = true;
addVertexInputToShaderConverterContext(shaderContext, pCreateInfo);
static const CompilerMSL::Options::IndexType indexTypes[] = {
CompilerMSL::Options::IndexType::None,
CompilerMSL::Options::IndexType::UInt16,
CompilerMSL::Options::IndexType::UInt32,
};
// We need to compile this function three times, with no indexing, 16-bit indices, and 32-bit indices.
for (uint32_t i = 0; i < sizeof(indexTypes)/sizeof(indexTypes[0]); i++) {
shaderContext.options.mslOptions.vertex_index_type = indexTypes[i];
MVKMTLFunction func = ((MVKShaderModule*)_pVertexSS->module)->getMTLFunction(&shaderContext, _pVertexSS->pSpecializationInfo, _pipelineCache);
id<MTLFunction> mtlFunc = func.getMTLFunction();
if ( !mtlFunc ) {
setConfigurationResult(reportError(VK_ERROR_INVALID_SHADER_NV, "Vertex shader function could not be compiled into pipeline. See previous logged error."));
return false;
}
_mtlTessVertexFunctions[i] = [mtlFunc retain];
auto& funcRslts = func.shaderConversionResults;
_needsVertexSwizzleBuffer = funcRslts.needsSwizzleBuffer;
_needsVertexBufferSizeBuffer = funcRslts.needsBufferSizeBuffer;
_needsVertexOutputBuffer = funcRslts.needsOutputBuffer;
}
// If we need the swizzle buffer and there's no place to put it, we're in serious trouble.
if (!verifyImplicitBuffer(_needsVertexSwizzleBuffer, _swizzleBufferIndex, kMVKShaderStageVertex, "swizzle", vbCnt)) {
return false;
}
// Ditto buffer size buffer.
if (!verifyImplicitBuffer(_needsVertexBufferSizeBuffer, _bufferSizeBufferIndex, kMVKShaderStageVertex, "buffer size", vbCnt)) {
return false;
}
// Ditto captured output buffer.
if (!verifyImplicitBuffer(_needsVertexOutputBuffer, _outputBufferIndex, kMVKShaderStageVertex, "output", vbCnt)) {
return false;
}
if (!verifyImplicitBuffer(!shaderContext.shaderInputs.empty(), _indirectParamsIndex, kMVKShaderStageVertex, "index", vbCnt)) {
return false;
}
return true;
}
bool MVKGraphicsPipeline::addTessCtlShaderToPipeline(MTLComputePipelineDescriptor* plDesc,
const VkGraphicsPipelineCreateInfo* pCreateInfo,
SPIRVToMSLConversionConfiguration& shaderContext,
@ -841,11 +916,13 @@ bool MVKGraphicsPipeline::addTessCtlShaderToPipeline(MTLComputePipelineDescripto
shaderContext.options.entryPointName = _pTessCtlSS->pName;
shaderContext.options.mslOptions.swizzle_buffer_index = _swizzleBufferIndex.stages[kMVKShaderStageTessCtl];
shaderContext.options.mslOptions.indirect_params_buffer_index = _indirectParamsIndex.stages[kMVKShaderStageTessCtl];
shaderContext.options.mslOptions.shader_input_buffer_index = kMVKTessCtlInputBufferIndex;
shaderContext.options.mslOptions.shader_output_buffer_index = _outputBufferIndex.stages[kMVKShaderStageTessCtl];
shaderContext.options.mslOptions.shader_patch_output_buffer_index = _tessCtlPatchOutputBufferIndex;
shaderContext.options.mslOptions.shader_tess_factor_buffer_index = _tessCtlLevelBufferIndex;
shaderContext.options.mslOptions.buffer_size_buffer_index = _bufferSizeBufferIndex.stages[kMVKShaderStageTessCtl];
shaderContext.options.mslOptions.capture_output_to_buffer = true;
shaderContext.options.mslOptions.multi_patch_workgroup = true;
addPrevStageOutputToShaderConverterContext(shaderContext, vtxOutputs);
MVKMTLFunction func = ((MVKShaderModule*)_pTessCtlSS->module)->getMTLFunction(&shaderContext, _pTessCtlSS->pSpecializationInfo, _pipelineCache);
@ -861,7 +938,7 @@ bool MVKGraphicsPipeline::addTessCtlShaderToPipeline(MTLComputePipelineDescripto
_needsTessCtlBufferSizeBuffer = funcRslts.needsBufferSizeBuffer;
_needsTessCtlOutputBuffer = funcRslts.needsOutputBuffer;
_needsTessCtlPatchOutputBuffer = funcRslts.needsPatchOutputBuffer;
_needsTessCtlInput = funcRslts.needsInputThreadgroupMem;
_needsTessCtlInputBuffer = funcRslts.needsInputThreadgroupMem;
if (!verifyImplicitBuffer(_needsTessCtlSwizzleBuffer, _swizzleBufferIndex, kMVKShaderStageTessCtl, "swizzle", kMVKTessCtlNumReservedBuffers)) {
return false;
@ -957,7 +1034,8 @@ bool MVKGraphicsPipeline::addFragmentShaderToPipeline(MTLRenderPipelineDescripto
return true;
}
bool MVKGraphicsPipeline::addVertexInputToPipeline(MTLRenderPipelineDescriptor* plDesc,
template<class T>
bool MVKGraphicsPipeline::addVertexInputToPipeline(T* inputDesc,
const VkPipelineVertexInputStateCreateInfo* pVI,
const SPIRVToMSLConversionConfiguration& shaderContext) {
// Collect extension structures
@ -989,16 +1067,16 @@ bool MVKGraphicsPipeline::addVertexInputToPipeline(MTLRenderPipelineDescriptor*
maxBinding = max(pVKVB->binding, maxBinding);
uint32_t vbIdx = getMetalBufferIndexForVertexAttributeBinding(pVKVB->binding);
MTLVertexBufferLayoutDescriptor* vbDesc = plDesc.vertexDescriptor.layouts[vbIdx];
auto vbDesc = inputDesc.layouts[vbIdx];
if (pVKVB->stride == 0) {
// Stride can't be 0; it will be set later to the attributes' maximum offset + size
// to prevent it from being larger than the underlying buffer permits.
vbDesc.stride = 0;
vbDesc.stepFunction = MTLVertexStepFunctionConstant;
vbDesc.stepFunction = (decltype(vbDesc.stepFunction))MTLStepFunctionConstant;
vbDesc.stepRate = 0;
} else {
vbDesc.stride = pVKVB->stride;
vbDesc.stepFunction = mvkMTLVertexStepFunctionFromVkVertexInputRate(pVKVB->inputRate);
vbDesc.stepFunction = (decltype(vbDesc.stepFunction))mvkMTLStepFunctionFromVkVertexInputRate(pVKVB->inputRate, isTessellationPipeline());
vbDesc.stepRate = 1;
}
}
@ -1011,11 +1089,11 @@ bool MVKGraphicsPipeline::addVertexInputToPipeline(MTLRenderPipelineDescriptor*
const VkVertexInputBindingDivisorDescriptionEXT* pVKVB = &pVertexInputDivisorState->pVertexBindingDivisors[i];
uint32_t vbIdx = getMetalBufferIndexForVertexAttributeBinding(pVKVB->binding);
if (shaderContext.isVertexBufferUsed(vbIdx)) {
MTLVertexBufferLayoutDescriptor* vbDesc = plDesc.vertexDescriptor.layouts[vbIdx];
if (vbDesc.stepFunction == MTLVertexStepFunctionPerInstance) {
if ((NSUInteger)inputDesc.layouts[vbIdx].stepFunction == MTLStepFunctionPerInstance ||
(NSUInteger)inputDesc.layouts[vbIdx].stepFunction == MTLStepFunctionThreadPositionInGridY) {
if (pVKVB->divisor == 0)
vbDesc.stepFunction = MTLVertexStepFunctionConstant;
vbDesc.stepRate = pVKVB->divisor;
inputDesc.layouts[vbIdx].stepFunction = (decltype(inputDesc.layouts[vbIdx].stepFunction))MTLStepFunctionConstant;
inputDesc.layouts[vbIdx].stepRate = pVKVB->divisor;
}
}
}
@ -1040,7 +1118,7 @@ bool MVKGraphicsPipeline::addVertexInputToPipeline(MTLRenderPipelineDescriptor*
// The step is set to constant, but we need to change the stride to be non-zero for Metal.
// Look for the maximum offset + size to set as the stride.
uint32_t vbIdx = getMetalBufferIndexForVertexAttributeBinding(pVKVB->binding);
MTLVertexBufferLayoutDescriptor* vbDesc = plDesc.vertexDescriptor.layouts[vbIdx];
auto vbDesc = inputDesc.layouts[vbIdx];
uint32_t strideLowBound = vaOffset + attrSize;
if (vbDesc.stride < strideLowBound) vbDesc.stride = strideLowBound;
} else if (vaOffset + attrSize > pVKVB->stride) {
@ -1060,9 +1138,9 @@ bool MVKGraphicsPipeline::addVertexInputToPipeline(MTLRenderPipelineDescriptor*
}
}
MTLVertexAttributeDescriptor* vaDesc = plDesc.vertexDescriptor.attributes[pVKVA->location];
vaDesc.format = getPixelFormats()->getMTLVertexFormat(pVKVA->format);
vaDesc.bufferIndex = getMetalBufferIndexForVertexAttributeBinding(vaBinding);
auto vaDesc = inputDesc.attributes[pVKVA->location];
vaDesc.format = (decltype(vaDesc.format))getPixelFormats()->getMTLVertexFormat(pVKVA->format);
vaDesc.bufferIndex = (decltype(vaDesc.bufferIndex))getMetalBufferIndexForVertexAttributeBinding(vaBinding);
vaDesc.offset = vaOffset;
}
}
@ -1075,13 +1153,13 @@ bool MVKGraphicsPipeline::addVertexInputToPipeline(MTLRenderPipelineDescriptor*
uint32_t vbVACnt = shaderContext.countShaderInputsAt(pVKVB->binding);
if (vbVACnt > 0) {
uint32_t vbIdx = getMetalBufferIndexForVertexAttributeBinding(pVKVB->binding);
MTLVertexBufferLayoutDescriptor* vbDesc = plDesc.vertexDescriptor.layouts[vbIdx];
auto vbDesc = inputDesc.layouts[vbIdx];
uint32_t xldtVACnt = 0;
for (auto& xltdBind : _translatedVertexBindings) {
if (xltdBind.binding == pVKVB->binding) {
uint32_t vbXltdIdx = getMetalBufferIndexForVertexAttributeBinding(xltdBind.translationBinding);
MTLVertexBufferLayoutDescriptor* vbXltdDesc = plDesc.vertexDescriptor.layouts[vbXltdIdx];
auto vbXltdDesc = inputDesc.layouts[vbXltdIdx];
vbXltdDesc.stride = vbDesc.stride;
vbXltdDesc.stepFunction = vbDesc.stepFunction;
vbXltdDesc.stepRate = vbDesc.stepRate;
@ -1097,6 +1175,13 @@ bool MVKGraphicsPipeline::addVertexInputToPipeline(MTLRenderPipelineDescriptor*
return true;
}
template bool MVKGraphicsPipeline::addVertexInputToPipeline<MTLVertexDescriptor>(MTLVertexDescriptor* inputDesc,
const VkPipelineVertexInputStateCreateInfo* pVI,
const SPIRVToMSLConversionConfiguration& shaderContext);
template bool MVKGraphicsPipeline::addVertexInputToPipeline<MTLStageInputOutputDescriptor>(MTLStageInputOutputDescriptor* inputDesc,
const VkPipelineVertexInputStateCreateInfo* pVI,
const SPIRVToMSLConversionConfiguration& shaderContext);
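Explicit instantiation like the above forces the compiler to emit both specializations in this translation unit, the standard way to keep a template's definition out of the header. The pattern in miniature (toy names, not MoltenVK code):

template<typename T> T twice(T v) { return v + v; }  // definition kept out of the header
template int twice<int>(int);                        // emit the int specialization here
template double twice<double>(double);               // emit the double specialization here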
// Returns a translated binding for the existing binding and translation offset, creating it if needed.
uint32_t MVKGraphicsPipeline::getTranslatedVertexBinding(uint32_t binding, uint32_t translationOffset, uint32_t maxBinding) {
// See if a translated binding already exists (for example if more than one VA needs the same translation).
@ -1146,9 +1231,7 @@ void MVKGraphicsPipeline::addTessellationToPipeline(MTLRenderPipelineDescriptor*
}
void MVKGraphicsPipeline::addFragmentOutputToPipeline(MTLRenderPipelineDescriptor* plDesc,
const SPIRVTessReflectionData& reflectData,
const VkGraphicsPipelineCreateInfo* pCreateInfo,
bool isTessellationVertexPipeline) {
const VkGraphicsPipelineCreateInfo* pCreateInfo) {
// Retrieve the render subpass for which this pipeline is being constructed
MVKRenderPass* mvkRendPass = (MVKRenderPass*)pCreateInfo->renderPass;
@ -1156,7 +1239,7 @@ void MVKGraphicsPipeline::addFragmentOutputToPipeline(MTLRenderPipelineDescripto
// Topology
if (pCreateInfo->pInputAssemblyState) {
plDesc.inputPrimitiveTopologyMVK = (isTessellationVertexPipeline || isRenderingPoints(pCreateInfo, reflectData))
plDesc.inputPrimitiveTopologyMVK = isRenderingPoints(pCreateInfo)
? MTLPrimitiveTopologyClassPoint
: mvkMTLPrimitiveTopologyClassFromVkPrimitiveTopology(pCreateInfo->pInputAssemblyState->topology);
}
@ -1254,7 +1337,7 @@ void MVKGraphicsPipeline::initMVKShaderConverterContext(SPIRVToMSLConversionConf
}
shaderContext.options.mslOptions.texture_1D_as_2D = mvkTreatTexture1DAs2D();
shaderContext.options.mslOptions.enable_point_size_builtin = isRenderingPoints(pCreateInfo, reflectData);
shaderContext.options.mslOptions.enable_point_size_builtin = isRenderingPoints(pCreateInfo) || reflectData.pointMode;
shaderContext.options.mslOptions.enable_frag_depth_builtin = pixFmts->isDepthFormat(mtlDSFormat);
shaderContext.options.mslOptions.enable_frag_stencil_ref_builtin = pixFmts->isStencilFormat(mtlDSFormat);
shaderContext.options.shouldFlipVertexY = _device->_pMVKConfig->shaderConversionFlipVertexY;
@ -1324,6 +1407,8 @@ void MVKGraphicsPipeline::addPrevStageOutputToShaderConverterContext(SPIRVToMSLC
shaderContext.shaderInputs.clear();
uint32_t siCnt = (uint32_t)shaderOutputs.size();
for (uint32_t siIdx = 0; siIdx < siCnt; siIdx++) {
if (!shaderOutputs[siIdx].isUsed) { continue; }
mvk::MSLShaderInput si;
si.shaderInput.location = shaderOutputs[siIdx].location;
si.shaderInput.builtin = shaderOutputs[siIdx].builtin;
@ -1331,13 +1416,24 @@ void MVKGraphicsPipeline::addPrevStageOutputToShaderConverterContext(SPIRVToMSLC
switch (getPixelFormats()->getFormatType(mvkFormatFromOutput(shaderOutputs[siIdx]) ) ) {
case kMVKFormatColorUInt8:
si.shaderInput.format = MSL_VERTEX_FORMAT_UINT8;
si.shaderInput.format = MSL_SHADER_INPUT_FORMAT_UINT8;
break;
case kMVKFormatColorUInt16:
si.shaderInput.format = MSL_VERTEX_FORMAT_UINT16;
si.shaderInput.format = MSL_SHADER_INPUT_FORMAT_UINT16;
break;
case kMVKFormatColorHalf:
case kMVKFormatColorInt16:
si.shaderInput.format = MSL_SHADER_INPUT_FORMAT_ANY16;
break;
case kMVKFormatColorFloat:
case kMVKFormatColorInt32:
case kMVKFormatColorUInt32:
si.shaderInput.format = MSL_SHADER_INPUT_FORMAT_ANY32;
break;
default:
break;
}
@ -1347,20 +1443,21 @@ void MVKGraphicsPipeline::addPrevStageOutputToShaderConverterContext(SPIRVToMSLC
}
// We render points if either the topology or the polygon fill mode dictates it
bool MVKGraphicsPipeline::isRenderingPoints(const VkGraphicsPipelineCreateInfo* pCreateInfo, const SPIRVTessReflectionData& reflectData) {
bool MVKGraphicsPipeline::isRenderingPoints(const VkGraphicsPipelineCreateInfo* pCreateInfo) {
return ((pCreateInfo->pInputAssemblyState && (pCreateInfo->pInputAssemblyState->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST)) ||
(pCreateInfo->pRasterizationState && (pCreateInfo->pRasterizationState->polygonMode == VK_POLYGON_MODE_POINT)) ||
(reflectData.pointMode));
(pCreateInfo->pRasterizationState && (pCreateInfo->pRasterizationState->polygonMode == VK_POLYGON_MODE_POINT)));
}
MVKGraphicsPipeline::~MVKGraphicsPipeline() {
[_mtlTessControlStageDesc release];
[_mtlTessVertexStageDesc release];
[_mtlTessVertexStageState release];
[_mtlTessVertexStageIndex16State release];
[_mtlTessVertexStageIndex32State release];
[_mtlTessControlStageState release];
[_mtlTessControlStageIndex16State release];
[_mtlTessControlStageIndex32State release];
[_mtlPipelineState release];
for (id<MTLFunction> func : _mtlTessVertexFunctions) { [func release]; }
}


@ -23,8 +23,11 @@
#include "MVKCommonEnvironment.h"
#include "mvk_vulkan.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <string>
#include <simd/simd.h>
#include <type_traits>
#pragma mark Math
@ -81,7 +84,7 @@ typedef enum : uint8_t {
kMVKCommandUseClearDepthStencilImage, /**< vkCmdClearDepthStencilImage. */
kMVKCommandUseResetQueryPool, /**< vkCmdResetQueryPool. */
kMVKCommandUseDispatch, /**< vkCmdDispatch. */
kMVKCommandUseTessellationControl, /**< vkCmdDraw* - tessellation control stage. */
kMVKCommandUseTessellationVertexTessCtl,/**< vkCmdDraw* - vertex and tessellation control stages. */
kMVKCommandUseCopyQueryPoolResults /**< vkCmdCopyQueryPoolResults. */
} MVKCommandUse;
@ -346,10 +349,49 @@ const T& mvkClamp(const T& val, const T& lower, const T& upper) {
}
/** Returns the result of a division, rounded up. */
template<typename T>
T mvkCeilingDivide(T numerator, T denominator) {
template<typename T, typename U>
constexpr typename std::common_type<T, U>::type mvkCeilingDivide(T numerator, U denominator) {
typedef typename std::common_type<T, U>::type R;
// Short-circuit the very common use case of dividing by one.
return (denominator == 1) ? numerator : (numerator + denominator - 1) / denominator;
return (denominator == 1) ? numerator : (R(numerator) + denominator - 1) / denominator;
}
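Since mvkCeilingDivide is now constexpr over mixed types, its behavior can be checked at compile time (illustrative asserts):

static_assert(mvkCeilingDivide(99, 32) == 4, "99 threads need 4 groups of 32");
static_assert(mvkCeilingDivide(64, 32) == 2, "exact multiples do not round up");
static_assert(mvkCeilingDivide(7u, uint64_t(1)) == 7, "dividing by 1 short-circuits");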
/** Returns the absolute value of a number. */
template<typename R, typename T, bool = std::is_signed<T>::value>
struct MVKAbs;
template<typename R, typename T>
struct MVKAbs<R, T, true> {
static constexpr R eval(T x) noexcept {
return x >= 0 ? x : (x == std::numeric_limits<T>::min() ? -static_cast<R>(x) : -x);
}
};
template<typename R, typename T>
struct MVKAbs<R, T, false> {
static constexpr R eval(T x) noexcept {
return x;
}
};
/** Returns the greatest common divisor of two numbers. */
template<typename T>
constexpr T mvkGreatestCommonDivisorImpl(T a, T b) {
return b == 0 ? a : mvkGreatestCommonDivisorImpl(b, a % b);
}
template<typename T, typename U>
constexpr typename std::common_type<T, U>::type mvkGreatestCommonDivisor(T a, U b) {
typedef typename std::common_type<T, U>::type R;
typedef typename std::make_unsigned<R>::type UI;
return static_cast<R>(mvkGreatestCommonDivisorImpl(static_cast<UI>(MVKAbs<R, T>::eval(a)), static_cast<UI>(MVKAbs<R, U>::eval(b))));
}
/** Returns the least common multiple of two numbers. */
template<typename T, typename U>
constexpr typename std::common_type<T, U>::type mvkLeastCommonMultiple(T a, U b) {
typedef typename std::common_type<T, U>::type R;
return (a == 0 && b == 0) ? 0 : MVKAbs<R, T>::eval(a) / mvkGreatestCommonDivisor(a, b) * MVKAbs<R, U>::eval(b);
}
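A plausible use of these helpers, given the tcWorkgroupSize constant the conversion kernels above consume (values illustrative): sizing the tessellation control workgroup so it always holds a whole number of patches.

uint32_t simdWidth = 32, outControlPointCount = 3;
uint32_t tcWorkgroupSize = mvkLeastCommonMultiple(simdWidth, outControlPointCount);  // 96
uint32_t patchesPerWorkgroup = tcWorkgroupSize / outControlPointCount;               // 32 whole patches, none straddling a SIMD-group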


@ -380,6 +380,17 @@ MVK_PUBLIC_SYMBOL MTLVertexStepFunction mvkMTLVertexStepFunctionFromVkVertexInpu
}
}
MVK_PUBLIC_SYMBOL MTLStepFunction mvkMTLStepFunctionFromVkVertexInputRate(VkVertexInputRate vkVtxStep, bool forTess) {
if (!forTess) {
return (MTLStepFunction)mvkMTLVertexStepFunctionFromVkVertexInputRate(vkVtxStep);
}
switch (vkVtxStep) {
case VK_VERTEX_INPUT_RATE_VERTEX: return MTLStepFunctionThreadPositionInGridX;
case VK_VERTEX_INPUT_RATE_INSTANCE: return MTLStepFunctionThreadPositionInGridY;
default: return MTLStepFunctionThreadPositionInGridX;
}
}
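The forTess mapping reflects how a compute grid stands in for a vertex draw: per-vertex data advances along grid X, per-instance data along grid Y. For example:

MTLStepFunction sfVtx  = mvkMTLStepFunctionFromVkVertexInputRate(VK_VERTEX_INPUT_RATE_VERTEX,   true);  // ThreadPositionInGridX
MTLStepFunction sfInst = mvkMTLStepFunctionFromVkVertexInputRate(VK_VERTEX_INPUT_RATE_INSTANCE, true);  // ThreadPositionInGridY
MTLStepFunction sfRast = mvkMTLStepFunctionFromVkVertexInputRate(VK_VERTEX_INPUT_RATE_VERTEX,   false); // falls back to the MTLVertexStepFunction mapping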
#undef mvkMTLPrimitiveTypeFromVkPrimitiveTopology
MVK_PUBLIC_SYMBOL MTLPrimitiveType mvkMTLPrimitiveTypeFromVkPrimitiveTopology(VkPrimitiveTopology vkTopology) {
return mvkMTLPrimitiveTypeFromVkPrimitiveTopologyInObj(vkTopology, nullptr);


@ -214,7 +214,7 @@ namespace mvk {
if (reflect.has_member_decoration(type->self, idx, spv::DecorationLocation)) {
memberLoc = reflect.get_member_decoration(type->self, idx, spv::DecorationLocation);
}
patch = reflect.has_member_decoration(type->self, idx, spv::DecorationPatch);
patch = patch || reflect.has_member_decoration(type->self, idx, spv::DecorationPatch);
if (reflect.has_member_decoration(type->self, idx, spv::DecorationBuiltIn)) {
biType = (spv::BuiltIn)reflect.get_member_decoration(type->self, idx, spv::DecorationBuiltIn);
isUsed = reflect.has_active_builtin(biType, var.storage);