diff --git a/MoltenVK/MoltenVK/API/mvk_private_api.h b/MoltenVK/MoltenVK/API/mvk_private_api.h
index b4172602..1c9b2d3b 100644
--- a/MoltenVK/MoltenVK/API/mvk_private_api.h
+++ b/MoltenVK/MoltenVK/API/mvk_private_api.h
@@ -44,7 +44,7 @@ typedef unsigned long MTLArgumentBuffersTier;
  */
 
 
-#define MVK_PRIVATE_API_VERSION   40
+#define MVK_PRIVATE_API_VERSION   41
 
 
 #pragma mark -
@@ -376,6 +376,7 @@ typedef struct {
 typedef struct {
     uint32_t count;       /**< The number of activities of this type. */
 	double latest;        /**< The latest (most recent) value of the activity. */
+	double previous;      /**< The previous (second most recent) value of the activity. */
     double average;       /**< The average value of the activity. */
     double minimum;       /**< The minimum value of the activity. */
     double maximum;       /**< The maximum value of the activity. */
@@ -407,9 +408,11 @@ typedef struct {
 typedef struct {
 	MVKPerformanceTracker retrieveMTLCommandBuffer;     /** Retrieve a MTLCommandBuffer from a MTLQueue, in milliseconds. */
 	MVKPerformanceTracker commandBufferEncoding;        /** Encode a single VkCommandBuffer to a MTLCommandBuffer (excludes MTLCommandBuffer encoding from configured immediate prefilling), in milliseconds. */
+	MVKPerformanceTracker waitSubmitCommandBuffers;		/** Wait time from vkQueueSubmit() call to starting the encoding of the command buffers to the GPU, in milliseconds. Useful when MVK_CONFIG_SYNCHRONOUS_QUEUE_SUBMITS is disabled. */
 	MVKPerformanceTracker submitCommandBuffers;         /** Submit and encode all VkCommandBuffers in a vkQueueSubmit() operation to MTLCommandBuffers (including both prefilled and deferred encoding), in milliseconds. */
 	MVKPerformanceTracker mtlCommandBufferExecution;    /** Execute a MTLCommandBuffer on the GPU, from commit to completion callback, in milliseconds. */
 	MVKPerformanceTracker retrieveCAMetalDrawable;      /** Retrieve next CAMetalDrawable from a CAMetalLayer, in milliseconds. */
+	MVKPerformanceTracker waitPresentSwapchains;		/** Wait time from vkQueuePresentKHR() call to starting the encoding of the swapchains to the GPU, in milliseconds. Useful when MVK_CONFIG_SYNCHRONOUS_QUEUE_SUBMITS is disabled. */
 	MVKPerformanceTracker presentSwapchains;            /** Present the swapchains in a vkQueuePresentKHR() on the GPU, from commit to presentation callback, in milliseconds. */
 	MVKPerformanceTracker frameInterval;                /** Frame presentation interval (1000/FPS), in milliseconds. */
 } MVKQueuePerformance;
@@ -427,10 +430,6 @@ typedef struct {
  * than your app was, the size of this structure in your app may be larger or smaller than the
  * struct in MoltenVK. See the description of the vkGetPerformanceStatisticsMVK() function for
  * information about how to handle this.
- *
- * TO SUPPORT DYNAMIC LINKING TO THIS STRUCTURE AS DESCRIBED ABOVE, THIS STRUCTURE SHOULD NOT
- * BE CHANGED EXCEPT TO ADD ADDITIONAL MEMBERS ON THE END. EXISTING MEMBERS, AND THEIR ORDER,
- * SHOULD NOT BE CHANGED.
  */
 typedef struct {
 	MVKShaderCompilationPerformance shaderCompilation;	/** Shader compilations activities. */
@@ -526,11 +525,12 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceMetalFeaturesMVK(
  * to tell MoltenVK the limit of the size of your MVKPerformanceStatistics structure. Upon return
  * from this function, the value of *pPerfSize will hold the actual number of bytes copied into
  * your passed MVKPerformanceStatistics structure, which will be the smaller of what your app
- * thinks is the size of MVKPerformanceStatistics, and what MoltenVK thinks it is. This
- * represents the safe access area within the structure for both MoltenVK and your app.
+ * thinks is the size of MVKPerformanceStatistics, and what MoltenVK thinks it is.
  *
  * If the size that MoltenVK expects for MVKPerformanceStatistics is different than the value passed
  * in *pPerfSize, this function will return VK_INCOMPLETE, otherwise it will return VK_SUCCESS.
+ * This indicates that the data returned from this function will likely be incorrect, as the structures
+ * nested under MVKPerformanceStatistics may be different.
  *
  * Although it is not necessary, you can use this function to determine in advance the value
  * that MoltenVK expects the size of MVKPerformanceStatistics to be by setting the value of
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index c3960f47..c1774001 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -4342,6 +4342,7 @@ void MVKDevice::applyMemoryBarrier(MVKPipelineBarrier& barrier,
 void MVKDevice::updateActivityPerformance(MVKPerformanceTracker& activity, double currentValue) {
 	lock_guard<mutex> lock(_perfLock);
 
+	activity.previous = activity.latest;
 	activity.latest = currentValue;
 	activity.minimum = ((activity.minimum == 0.0)
 								? currentValue :
@@ -4364,12 +4365,13 @@ void MVKDevice::logActivityInline(MVKPerformanceTracker& activity, MVKPerformanc
 }
 void MVKDevice::logActivityDuration(MVKPerformanceTracker& activity, MVKPerformanceStatistics& perfStats, bool isInline) {
 	const char* fmt = (isInline
-					   ? "%s performance avg: %.3f ms, latest: %.3f ms, min: %.3f ms, max: %.3f ms, count: %d"
-					   : "  %-45s avg: %.3f ms, latest: %.3f ms, min: %.3f ms, max: %.3f ms, count: %d");
+					   ? "%s performance avg: %.3f ms, latest: %.3f ms, prev: %.3f ms, min: %.3f ms, max: %.3f ms, count: %d"
+					   : "  %-45s avg: %.3f ms, latest: %.3f ms, prev: %.3f ms, min: %.3f ms, max: %.3f ms, count: %d");
 	MVKLogInfo(fmt,
 			   getActivityPerformanceDescription(activity, perfStats),
 			   activity.average,
 			   activity.latest,
+			   activity.previous,
 			   activity.minimum,
 			   activity.maximum,
 			   activity.count);
@@ -4377,12 +4379,13 @@ void MVKDevice::logActivityDuration(MVKPerformanceTracker& activity, MVKPerforma
 
 void MVKDevice::logActivityByteCount(MVKPerformanceTracker& activity, MVKPerformanceStatistics& perfStats, bool isInline) {
 	const char* fmt = (isInline
-					   ? "%s avg: %5llu MB, latest: %5llu MB, min: %5llu MB, max: %5llu MB, count: %d"
-					   : "  %-45s avg: %5llu MB, latest: %5llu MB, min: %5llu MB, max: %5llu MB, count: %d");
+					   ? "%s avg: %5llu MB, latest: %5llu MB, prev: %5llu MB, min: %5llu MB, max: %5llu MB, count: %d"
+					   : "  %-45s avg: %5llu MB, latest: %5llu MB, prev: %5llu MB, min: %5llu MB, max: %5llu MB, count: %d");
 	MVKLogInfo(fmt,
 			   getActivityPerformanceDescription(activity, perfStats),
 			   uint64_t(activity.average) / KIBI,
 			   uint64_t(activity.latest) / KIBI,
+			   uint64_t(activity.previous) / KIBI,
 			   uint64_t(activity.minimum) / KIBI,
 			   uint64_t(activity.maximum) / KIBI,
 			   activity.count);
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
index 0adc557f..752fcf80 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.h
@@ -216,6 +216,7 @@ protected:
 
 	MVKQueue* _queue;
 	MVKSmallVector<MVKSemaphoreSubmitInfo> _waitSemaphores;
+	uint64_t _creationTime;
 };
 
 
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
index 40301695..38b50b16 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
@@ -415,6 +415,8 @@ MVKQueueSubmission::MVKQueueSubmission(MVKQueue* queue,
 	_queue = queue;
 	_queue->retain();	// Retain here and release in destructor. See note for MVKQueueCommandBufferSubmission::finish().
 
+	_creationTime = getDevice()->getPerformanceTimestamp();		// call getDevice() only after _queue is defined
+
 	_waitSemaphores.reserve(waitSemaphoreInfoCount);
 	for (uint32_t i = 0; i < waitSemaphoreInfoCount; i++) {
 		_waitSemaphores.emplace_back(pWaitSemaphoreSubmitInfos[i]);
@@ -428,6 +430,8 @@ MVKQueueSubmission::MVKQueueSubmission(MVKQueue* queue,
 	_queue = queue;
 	_queue->retain();	// Retain here and release in destructor. See note for MVKQueueCommandBufferSubmission::finish().
 
+	_creationTime = getDevice()->getPerformanceTimestamp();		// call getDevice() only after _queue is defined
+
 	_waitSemaphores.reserve(waitSemaphoreCount);
 	for (uint32_t i = 0; i < waitSemaphoreCount; i++) {
 		_waitSemaphores.emplace_back(pWaitSemaphores[i], pWaitDstStageMask ? pWaitDstStageMask[i] : 0);
@@ -449,6 +453,10 @@ VkResult MVKQueueCommandBufferSubmission::execute() {
 	// If using encoded semaphore waiting, do so now.
 	for (auto& ws : _waitSemaphores) { ws.encodeWait(getActiveMTLCommandBuffer()); }
 
+	// Wait time from an async vkQueueSubmit() call to starting submit and encoding of the command buffers
+	MVKDevice* mvkDev = getDevice();
+	mvkDev->addPerformanceInterval(mvkDev->_performanceStatistics.queue.waitSubmitCommandBuffers, _creationTime);
+
 	// Submit each command buffer.
 	submitCommandBuffers();
 
@@ -679,7 +687,7 @@ MVKQueueFullCommandBufferSubmission<N>::MVKQueueFullCommandBufferSubmission(MVKQ
 // If the semaphores are not encodable, wait on them inline after presenting.
 // The semaphores know what to do.
 VkResult MVKQueuePresentSurfaceSubmission::execute() {
-	// MTLCommandBuffer retain references to avoid rare case where objects are destroyed too early. 
+	// MTLCommandBuffer retain references to avoid rare case where objects are destroyed too early.
 	// Although testing could not determine which objects were being lost, queue present MTLCommandBuffers
 	// are used only once per frame, and retain so few objects, that blanket retention is still performant.
 	id<MTLCommandBuffer> mtlCmdBuff = _queue->getMTLCommandBuffer(kMVKCommandUseQueuePresent, true);
@@ -689,6 +697,10 @@ VkResult MVKQueuePresentSurfaceSubmission::execute() {
 		ws.encodeWait(nil);			// Inline semaphore waits
 	}
 
+	// Wait time from an async vkQueuePresentKHR() call to starting presentation of the swapchains
+	MVKDevice* mvkDev = getDevice();
+	mvkDev->addPerformanceInterval(mvkDev->_performanceStatistics.queue.waitPresentSwapchains, _creationTime);
+
 	for (int i = 0; i < _presentInfo.size(); i++ ) {
 		setConfigurationResult(_presentInfo[i].presentableImage->presentCAMetalDrawable(mtlCmdBuff, _presentInfo[i]));
 	}
diff --git a/MoltenVK/MoltenVK/Vulkan/mvk_api.mm b/MoltenVK/MoltenVK/Vulkan/mvk_api.mm
index cade5025..1f7bc04e 100644
--- a/MoltenVK/MoltenVK/Vulkan/mvk_api.mm
+++ b/MoltenVK/MoltenVK/Vulkan/mvk_api.mm
@@ -43,7 +43,7 @@ VkResult mvkCopyGrowingStruct(S* pDst, const S* pSrc, size_t* pCopySize) {
 		size_t origSize = *pCopySize;
 		*pCopySize = std::min(origSize, sizeof(S));
 		memcpy(pDst, pSrc, *pCopySize);
-		return (*pCopySize == origSize) ? VK_SUCCESS : VK_INCOMPLETE;
+		return (sizeof(S) == origSize) ? VK_SUCCESS : VK_INCOMPLETE;
 	} else {
 		*pCopySize = sizeof(S);
 		return VK_SUCCESS;