From 251f29dd7fa530997cd6d27a8db28c4a39efd127 Mon Sep 17 00:00:00 2001
From: Henrik Rydgard <hrydgard@gmail.com>
Date: Thu, 28 Apr 2016 21:43:49 +0200
Subject: [PATCH] Optimize the vertex loader, nearly doubling its speed.

---
 src/video_core/vertex_loader.cpp | 85 ++++++++++++++++++++------------
 src/video_core/vertex_loader.h   |  1 -
 2 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp
index c2630d9eb..38682d2be 100644
--- a/src/video_core/vertex_loader.cpp
+++ b/src/video_core/vertex_loader.cpp
@@ -46,13 +46,11 @@ void VertexLoader::Setup(const Pica::Regs& regs) {
 
             u32 attribute_index = loader_config.GetComponent(component);
             if (attribute_index < 12) {
-                int element_size = attribute_config.GetElementSizeInBytes(attribute_index);
-                offset = Common::AlignUp(offset, element_size);
+                offset = Common::AlignUp(offset, attribute_config.GetElementSizeInBytes(attribute_index));
                 vertex_attribute_sources[attribute_index] = loader_config.data_offset + offset;
                 vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
                 vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
                 vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
-                vertex_attribute_element_size[attribute_index] = element_size;
                 offset += attribute_config.GetStride(attribute_index);
             } else if (attribute_index < 16) {
                 // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, respectively
@@ -68,38 +66,63 @@ void VertexLoader::Setup(const Pica::Regs& regs) {
 void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, MemoryAccesses& memory_accesses) {
     for (int i = 0; i < num_total_attributes; ++i) {
         if (vertex_attribute_elements[i] != 0) {
+            // Load per-vertex data from the loader arrays
+            u32 source_addr = base_address + vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex;
+
+            if (g_debug_context && Pica::g_debug_context->recorder) {
+                memory_accesses.AddAccess(source_addr, vertex_attribute_elements[i] * // cover every component, not only the first
+                    ((vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
+                    : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1));
+            }
+
+            switch (vertex_attribute_formats[i]) {
+            case Regs::VertexAttributeFormat::BYTE:
+            {
+                const s8* srcdata = reinterpret_cast<const s8*>(Memory::GetPhysicalPointer(source_addr));
+                for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+                    input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
+                }
+                break;
+            }
+            case Regs::VertexAttributeFormat::UBYTE:
+            {
+                const u8* srcdata = reinterpret_cast<const u8*>(Memory::GetPhysicalPointer(source_addr));
+                for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+                    input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
+                }
+                break;
+            }
+            case Regs::VertexAttributeFormat::SHORT:
+            {
+                const s16* srcdata = reinterpret_cast<const s16*>(Memory::GetPhysicalPointer(source_addr));
+                for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+                    input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
+                }
+                break;
+            }
+            case Regs::VertexAttributeFormat::FLOAT:
+            {
+                const float* srcdata = reinterpret_cast<const float*>(Memory::GetPhysicalPointer(source_addr));
+                for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+                    input.attr[i][comp] = float24::FromFloat32(srcdata[comp]);
+                }
+                break;
+            }
+            }
+
             // Default attribute values set if array elements have < 4 components. This
             // is *not* carried over from the default attribute settings even if they're
             // enabled for this attribute.
-            static const float24 zero = float24::FromFloat32(0.0f);
-            static const float24 one = float24::FromFloat32(1.0f);
-            input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one);
-
-            // Load per-vertex data from the loader arrays
-            for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
-                u32 source_addr = base_address + vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
-                const u8* srcdata = Memory::GetPhysicalPointer(source_addr);
-
-                if (g_debug_context && Pica::g_debug_context->recorder) {
-                    memory_accesses.AddAccess(source_addr,
-                        (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
-                        : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
-                }
-
-                const float srcval =
-                    (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *reinterpret_cast<const s8*>(srcdata) :
-                    (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *reinterpret_cast<const u8*>(srcdata) :
-                    (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *reinterpret_cast<const s16*>(srcdata) :
-                    *reinterpret_cast<const float*>(srcdata);
-
-                input.attr[i][comp] = float24::FromFloat32(srcval);
-                LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f",
-                    comp, i, vertex, index,
-                    base_address,
-                    vertex_attribute_sources[i],
-                    vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
-                    input.attr[i][comp].ToFloat32());
+            for (unsigned int comp = vertex_attribute_elements[i]; comp < 4; ++comp) {
+                input.attr[i][comp] = comp == 3 ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
             }
+
+            LOG_TRACE(HW_GPU, "Loaded %d components of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f %f %f %f",
+                vertex_attribute_elements[i], i, vertex, index,
+                base_address,
+                vertex_attribute_sources[i],
+                vertex_attribute_strides[i] * vertex,
+                input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
         } else if (vertex_attribute_is_default[i]) {
             // Load the default attribute if we're configured to do so
             input.attr[i] = g_state.vs.default_attributes[i];
diff --git a/src/video_core/vertex_loader.h b/src/video_core/vertex_loader.h
index 40c5e83c9..7267ea9c6 100644
--- a/src/video_core/vertex_loader.h
+++ b/src/video_core/vertex_loader.h
@@ -47,7 +47,6 @@ private:
     u32 vertex_attribute_strides[16] = {};
     Regs::VertexAttributeFormat vertex_attribute_formats[16] = {};
     u32 vertex_attribute_elements[16] = {};
-    u32 vertex_attribute_element_size[16] = {};
     bool vertex_attribute_is_default[16];
     int num_total_attributes;
 };