From d151d797b1c281d5813ca705722f43b4be20ca6d Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 28 Dec 2014 18:20:33 -0200
Subject: [PATCH 1/7] Vertex Shader: Zero OutputVertex to avoid denormals

Unused OutputVertex attributes were being left uninitialized. The
leftover garbage sometimes decoded as denormalized floating-point
values, causing fallbacks to microcode and massive slowdowns in the rest
of the rasterization pipeline even though the results were unused. By
zeroing the structure we ensure these attributes only contain harmless
zeros.
---
 src/video_core/vertex_shader.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index e31bc3bc73..bed5081a0e 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -469,6 +469,10 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes)
 
     // Setup output register table
     OutputVertex ret;
+    // Zero output so that attributes which aren't output won't have denormals in them, which will
+    // slow us down later.
+    memset(&ret, 0, sizeof(ret));
+
     for (int i = 0; i < 7; ++i) {
         const auto& output_register_map = registers.vs_output_attributes[i];
 

From a320d1a5b4b7ce3b90372697fbe50242b78d082e Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 28 Dec 2014 00:56:32 -0200
Subject: [PATCH 2/7] Clipper: Avoid dynamic allocations

The triangle clipper was allocating its temporary input, output and work
buffers using a std::vector. Since this is a hot path, it's desirable to
use stack allocation instead.
---
 externals/boost            |  2 +-
 src/video_core/clipper.cpp | 17 +++++++----------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/externals/boost b/externals/boost
index b060148c08..97052c28ac 160000
--- a/externals/boost
+++ b/externals/boost
@@ -1 +1 @@
-Subproject commit b060148c08ae87a3a5809c4f48cb26ac667487ab
+Subproject commit 97052c28acb141dbf3c5e14114af99045344b695
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 0bcd0b8950..e89b7a0c0e 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -2,7 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <vector>
+#include <boost/container/static_vector.hpp>
 
 #include "clipper.h"
 #include "pica.h"
@@ -98,18 +98,15 @@ static void InitScreenCoordinates(OutputVertex& vtx)
 }
 
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
+    using boost::container::static_vector;
 
     // TODO (neobrain):
     // The list of output vertices has some fixed maximum size,
     // however I haven't taken the time to figure out what it is exactly.
-    // For now, we hence just assume a maximal size of 1000 vertices.
-    const size_t max_vertices = 1000;
-    std::vector<OutputVertex> buffer_vertices;
-    std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 };
-
-    // Make sure to reserve space for all vertices.
-    // Without this, buffer reallocation would invalidate references.
-    buffer_vertices.reserve(max_vertices);
+    // For now, we hence just assume a maximal size of 256 vertices.
+    static const size_t MAX_VERTICES = 256;
+    static_vector<OutputVertex, MAX_VERTICES> buffer_vertices;
+    static_vector<OutputVertex*, MAX_VERTICES> output_list = { &v0, &v1, &v2 };
 
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
@@ -120,7 +117,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                        ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
                        ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
 
-        const std::vector<OutputVertex*> input_list = output_list;
+        const static_vector<OutputVertex*, MAX_VERTICES> input_list = output_list;
         output_list.clear();
 
         const OutputVertex* reference_vertex = input_list.back();

From da049764377804f055ff1898ba0e58c8ee096805 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 28 Dec 2014 02:46:29 -0200
Subject: [PATCH 3/7] CMake: Require Boost 1.57.0 (fixes Travis OS X)

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5bb87d50dd..884520cef0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,11 +41,11 @@ else()
     message(STATUS "libpng not found. Some debugging features have been disabled.")
 endif()
 
-find_package(Boost)
+find_package(Boost 1.57.0)
 if (Boost_FOUND)
     include_directories(${Boost_INCLUDE_DIRS})
 else()
-    message(STATUS "Boost not found, falling back to externals")
+    message(STATUS "Boost 1.57.0 or newer not found, falling back to externals")
     include_directories(externals/boost)
 endif()
 

From 7e9bc85cc826c55a5aa612a3c2f14b8fb631a68c Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 28 Dec 2014 23:05:15 -0200
Subject: [PATCH 4/7] Clipper: Compact buffers on each clipping pass

Use a new buffer management scheme in the clipper that allows using a
bounded, minimal amount of buffer space. Even though it copies more
data, it is still slightly faster, likely due to using less cache.
---
 src/video_core/clipper.cpp | 55 +++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index e89b7a0c0e..0521ef8661 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -100,13 +100,15 @@ static void InitScreenCoordinates(OutputVertex& vtx)
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
     using boost::container::static_vector;
 
-    // TODO (neobrain):
-    // The list of output vertices has some fixed maximum size,
-    // however I haven't taken the time to figure out what it is exactly.
-    // For now, we hence just assume a maximal size of 256 vertices.
-    static const size_t MAX_VERTICES = 256;
-    static_vector<OutputVertex, MAX_VERTICES> buffer_vertices;
-    static_vector<OutputVertex*, MAX_VERTICES> output_list = { &v0, &v1, &v2 };
+    // Clipping a planar n-gon against a plane removes at least 1 vertex and introduces at most 2
+    // at the new edge (fewer in degenerate cases). As such, we can say that each clipping plane
+    // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
+    // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
+    static const size_t MAX_VERTICES = 9;
+    static_vector<OutputVertex, MAX_VERTICES> buffer_a = { v0, v1, v2 };
+    static_vector<OutputVertex, MAX_VERTICES> buffer_b;
+    auto* output_list = &buffer_a;
+    auto* input_list  = &buffer_b;
 
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
@@ -117,48 +119,45 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                        ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
                        ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
 
-        const static_vector<OutputVertex*, MAX_VERTICES> input_list = output_list;
-        output_list.clear();
+        std::swap(input_list, output_list);
+        output_list->clear();
 
-        const OutputVertex* reference_vertex = input_list.back();
+        const OutputVertex* reference_vertex = &input_list->back();
 
-        for (const auto& vertex : input_list) {
+        for (const auto& vertex : *input_list) {
             // NOTE: This algorithm changes vertex order in some cases!
-            if (edge.IsInside(*vertex)) {
+            if (edge.IsInside(vertex)) {
                 if (edge.IsOutSide(*reference_vertex)) {
-                    buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
-                    output_list.push_back(&(buffer_vertices.back()));
+                    output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
                 }
 
-                output_list.push_back(vertex);
+                output_list->push_back(vertex);
             } else if (edge.IsInside(*reference_vertex)) {
-                buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
-                output_list.push_back(&(buffer_vertices.back()));
+                output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
             }
-
-            reference_vertex = vertex;
+            reference_vertex = &vertex;
         }
 
         // Need to have at least a full triangle to continue...
-        if (output_list.size() < 3)
+        if (output_list->size() < 3)
             return;
     }
 
-    InitScreenCoordinates(*(output_list[0]));
-    InitScreenCoordinates(*(output_list[1]));
+    InitScreenCoordinates((*output_list)[0]);
+    InitScreenCoordinates((*output_list)[1]);
 
-    for (size_t i = 0; i < output_list.size() - 2; i ++) {
-        OutputVertex& vtx0 = *(output_list[0]);
-        OutputVertex& vtx1 = *(output_list[i+1]);
-        OutputVertex& vtx2 = *(output_list[i+2]);
+    for (size_t i = 0; i < output_list->size() - 2; i ++) {
+        OutputVertex& vtx0 = (*output_list)[0];
+        OutputVertex& vtx1 = (*output_list)[i+1];
+        OutputVertex& vtx2 = (*output_list)[i+2];
 
         InitScreenCoordinates(vtx2);
 
         LOG_TRACE(Render_Software,
-                  "Triangle %lu/%lu (%lu buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), "
+                  "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
                   "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
                   "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
-                  i,output_list.size(), buffer_vertices.size(),
+                  i, output_list->size(),
                   vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
                   vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
                   vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),

From 2012e1420f90ea86ea6975f2005f05ecd304b0c4 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Tue, 23 Dec 2014 10:59:07 -0200
Subject: [PATCH 5/7] Rasterizer: Common sub-expression elimination

Move the computation of some values out of loops so that they're not
constantly recalculated even when they don't change.
---
 src/video_core/rasterizer.cpp | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index df1f88c793..63da7104d6 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -106,6 +106,14 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
     int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
 
+    const Math::Vec3<float24> w_inverse = Math::MakeVec(
+            float24::FromFloat32(1.0f) / v0.pos.w,
+            float24::FromFloat32(1.0f) / v1.pos.w,
+            float24::FromFloat32(1.0f) / v2.pos.w);
+
+    auto textures = registers.GetTextures();
+    auto tev_stages = registers.GetTevStages();
+
     // TODO: Not sure if looping through x first might be faster
     for (u16 y = min_y; y < max_y; y += 0x10) {
         for (u16 x = min_x; x < max_x; x += 0x10) {
@@ -129,6 +137,11 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             if (w0 < 0 || w1 < 0 || w2 < 0)
                 continue;
 
+            auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast<float>(w0)),
+                                                float24::FromFloat32(static_cast<float>(w1)),
+                                                float24::FromFloat32(static_cast<float>(w2)));
+            float24 interpolated_w_inverse = float24::FromFloat32(1.0f) / Math::Dot(w_inverse, baricentric_coordinates);
+
             // Perspective correct attribute interpolation:
             // Attribute values cannot be calculated by simple linear interpolation since
             // they are not linear in screen space. For example, when interpolating a
@@ -145,19 +158,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             //
             // The generalization to three vertices is straightforward in baricentric coordinates.
             auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) {
-                auto attr_over_w = Math::MakeVec(attr0 / v0.pos.w,
-                                                 attr1 / v1.pos.w,
-                                                 attr2 / v2.pos.w);
-                auto w_inverse   = Math::MakeVec(float24::FromFloat32(1.f) / v0.pos.w,
-                                                 float24::FromFloat32(1.f) / v1.pos.w,
-                                                 float24::FromFloat32(1.f) / v2.pos.w);
-                auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast<float>(w0)),
-                                                             float24::FromFloat32(static_cast<float>(w1)),
-                                                             float24::FromFloat32(static_cast<float>(w2)));
-
+                auto attr_over_w = Math::MakeVec(attr0, attr1, attr2) * w_inverse;
                 float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates);
-                float24 interpolated_w_inverse   = Math::Dot(w_inverse,   baricentric_coordinates);
-                return interpolated_attr_over_w / interpolated_w_inverse;
+                return interpolated_attr_over_w * interpolated_w_inverse;
             };
 
             Math::Vec4<u8> primary_color{
@@ -177,7 +180,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
 
             Math::Vec4<u8> texture_color[3]{};
             for (int i = 0; i < 3; ++i) {
-                auto texture = registers.GetTextures()[i];
+                const auto& texture = textures[i];
                 if (!texture.enabled)
                     continue;
 
@@ -219,7 +222,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             // with some basic arithmetic. Alpha combiners can be configured separately but work
             // analogously.
             Math::Vec4<u8> combiner_output;
-            for (auto tev_stage : registers.GetTevStages()) {
+            for (const auto& tev_stage : tev_stages) {
                 using Source = Regs::TevStageConfig::Source;
                 using ColorModifier = Regs::TevStageConfig::ColorModifier;
                 using AlphaModifier = Regs::TevStageConfig::AlphaModifier;

From fe186d3a598837ba7337f06399dfb8ae7930a070 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Tue, 23 Dec 2014 12:27:56 -0200
Subject: [PATCH 6/7] GPU: Bitwise texture swizzling

Replace the loop-based texture address swizzling code with a
bit-twiddling implementation, providing a very small speedup. Also
simplify the addressing code.
---
 src/video_core/debug_utils/debug_utils.cpp | 49 ++++++++++------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 328386b7e4..5921185a65 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -304,7 +304,6 @@ std::unique_ptr<PicaTrace> FinishPicaTracing()
 }
 
 const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const TextureInfo& info, bool disable_alpha) {
-
     // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each
     // of which is composed of four 2x2 subtiles each of which is composed of four texels.
     // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g.
@@ -323,41 +322,39 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
     // 02 03 06 07 18 19 22 23
     // 00 01 04 05 16 17 20 21
 
-    // TODO(neobrain): Not sure if this swizzling pattern is used for all textures.
-    // To be flexible in case different but similar patterns are used, we keep this
-    // somewhat inefficient code around for now.
-    int texel_index_within_tile = 0;
-    for (int block_size_index = 0; block_size_index < 3; ++block_size_index) {
-        int sub_tile_width = 1 << block_size_index;
-        int sub_tile_height = 1 << block_size_index;
+    const unsigned int block_width = 8;
+    const unsigned int block_height = 8;
 
-        int sub_tile_index = (x & sub_tile_width) << block_size_index;
-        sub_tile_index += 2 * ((y & sub_tile_height) << block_size_index);
-        texel_index_within_tile += sub_tile_index;
-    }
+    const unsigned int coarse_x = x & ~7;
+    const unsigned int coarse_y = y & ~7;
 
-    const int block_width = 8;
-    const int block_height = 8;
+    // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
+    // arranged in a Z-order curve. More details on the bit manipulation at:
+    // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+    unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
+    i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
+    i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
+    i = (i | (i >> 7)) & 0x3F;
 
-    int coarse_x = (x / block_width) * block_width;
-    int coarse_y = (y / block_height) * block_height;
+    source += coarse_y * info.stride;
+    const unsigned int offset = coarse_x * block_height + i;
 
     switch (info.format) {
     case Regs::TextureFormat::RGBA8:
     {
-        const u8* source_ptr = source + coarse_x * block_height * 4 + coarse_y * info.stride + texel_index_within_tile * 4;
+        const u8* source_ptr = source + offset * 4;
         return { source_ptr[3], source_ptr[2], source_ptr[1], disable_alpha ? (u8)255 : source_ptr[0] };
     }
 
     case Regs::TextureFormat::RGB8:
     {
-        const u8* source_ptr = source + coarse_x * block_height * 3 + coarse_y * info.stride + texel_index_within_tile * 3;
+        const u8* source_ptr = source + offset * 3;
         return { source_ptr[2], source_ptr[1], source_ptr[0], 255 };
     }
 
     case Regs::TextureFormat::RGBA5551:
     {
-        const u16 source_ptr = *(const u16*)(source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2);
+        const u16 source_ptr = *(const u16*)(source + offset * 2);
         u8 r = (source_ptr >> 11) & 0x1F;
         u8 g = ((source_ptr) >> 6) & 0x1F;
         u8 b = (source_ptr >> 1) & 0x1F;
@@ -367,7 +364,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::RGB565:
     {
-        const u16 source_ptr = *(const u16*)(source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2);
+        const u16 source_ptr = *(const u16*)(source + offset * 2);
         u8 r = (source_ptr >> 11) & 0x1F;
         u8 g = ((source_ptr) >> 5) & 0x3F;
         u8 b = (source_ptr) & 0x1F;
@@ -376,7 +373,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::RGBA4:
     {
-        const u8* source_ptr = source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2;
+        const u8* source_ptr = source + offset * 2;
         u8 r = source_ptr[1] >> 4;
         u8 g = source_ptr[1] & 0xFF;
         u8 b = source_ptr[0] >> 4;
@@ -390,7 +387,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::IA8:
     {
-        const u8* source_ptr = source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2;
+        const u8* source_ptr = source + offset * 2;
 
         // TODO: component order not verified
 
@@ -404,13 +401,13 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::I8:
     {
-        const u8* source_ptr = source + coarse_x * block_height + coarse_y * info.stride + texel_index_within_tile;
+        const u8* source_ptr = source + offset;
         return { *source_ptr, *source_ptr, *source_ptr, 255 };
     }
 
     case Regs::TextureFormat::A8:
     {
-        const u8* source_ptr = source + coarse_x * block_height + coarse_y * info.stride + texel_index_within_tile;
+        const u8* source_ptr = source + offset;
 
         if (disable_alpha) {
             return { *source_ptr, *source_ptr, *source_ptr, 255 };
@@ -421,7 +418,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::IA4:
     {
-        const u8* source_ptr = source + coarse_x * block_height / 2 + coarse_y * info.stride + texel_index_within_tile / 2;
+        const u8* source_ptr = source + offset / 2;
 
         // TODO: component order not verified
 
@@ -440,7 +437,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::A4:
     {
-        const u8* source_ptr = source + coarse_x * block_height / 2 + coarse_y * info.stride + texel_index_within_tile / 2;
+        const u8* source_ptr = source + offset / 2;
 
         // TODO: component order not verified
 

From 8369ee58035ca98f776428f6cccbcf987fee3bc9 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Tue, 23 Dec 2014 13:05:51 -0200
Subject: [PATCH 7/7] Rasterizer: Pre-divide vertex attributes by W

Perform the division by W required for perspective-correct interpolation
in the clipper, moving it out of the rasterization inner loop.
---
 src/video_core/clipper.cpp    | 13 ++++++++++---
 src/video_core/pica.h         | 20 ++++++++++++++++++++
 src/video_core/rasterizer.cpp |  7 ++-----
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 0521ef8661..1744066ba0 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -91,10 +91,17 @@ static void InitScreenCoordinates(OutputVertex& vtx)
     viewport.zscale     = float24::FromRawFloat24(registers.viewport_depth_range);
     viewport.offset_z   = float24::FromRawFloat24(registers.viewport_depth_far_plane);
 
+    float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
+    vtx.color *= inv_w;
+    vtx.tc0 *= inv_w;
+    vtx.tc1 *= inv_w;
+    vtx.tc2 *= inv_w;
+    vtx.pos.w = inv_w;
+
     // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
-    vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
-    vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale;
+    vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
+    vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
+    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
 }
 
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 89d97e4e93..38bac748cd 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -757,6 +757,26 @@ struct float24 {
         return float24::FromFloat32(ToFloat32() - flt.ToFloat32());
     }
 
+    float24& operator *= (const float24& flt) {
+        value *= flt.ToFloat32();
+        return *this;
+    }
+
+    float24& operator /= (const float24& flt) {
+        value /= flt.ToFloat32();
+        return *this;
+    }
+
+    float24& operator += (const float24& flt) {
+        value += flt.ToFloat32();
+        return *this;
+    }
+
+    float24& operator -= (const float24& flt) {
+        value -= flt.ToFloat32();
+        return *this;
+    }
+
     float24 operator - () const {
         return float24::FromFloat32(-ToFloat32());
     }
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 63da7104d6..a801488726 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -106,10 +106,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
     int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
 
-    const Math::Vec3<float24> w_inverse = Math::MakeVec(
-            float24::FromFloat32(1.0f) / v0.pos.w,
-            float24::FromFloat32(1.0f) / v1.pos.w,
-            float24::FromFloat32(1.0f) / v2.pos.w);
+    auto w_inverse = Math::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);
 
     auto textures = registers.GetTextures();
     auto tev_stages = registers.GetTevStages();
@@ -158,7 +155,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             //
             // The generalization to three vertices is straightforward in baricentric coordinates.
             auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) {
-                auto attr_over_w = Math::MakeVec(attr0, attr1, attr2) * w_inverse;
+                auto attr_over_w = Math::MakeVec(attr0, attr1, attr2);
                 float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates);
                 return interpolated_attr_over_w * interpolated_w_inverse;
             };