diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt
index d4c0cecc3..47aaeca24 100644
--- a/src/citra_qt/CMakeLists.txt
+++ b/src/citra_qt/CMakeLists.txt
@@ -12,6 +12,7 @@ set(SRCS
             debugger/graphics_breakpoints.cpp
             debugger/graphics_cmdlists.cpp
             debugger/graphics_framebuffer.cpp
+            debugger/graphics_tracing.cpp
             debugger/graphics_vertex_shader.cpp
             debugger/profiler.cpp
             debugger/ramview.cpp
@@ -35,6 +36,7 @@ set(HEADERS
             debugger/graphics_breakpoints_p.h
             debugger/graphics_cmdlists.h
             debugger/graphics_framebuffer.h
+            debugger/graphics_tracing.h
             debugger/graphics_vertex_shader.h
             debugger/profiler.h
             debugger/ramview.h
diff --git a/src/citra_qt/debugger/graphics_breakpoint_observer.h b/src/citra_qt/debugger/graphics_breakpoint_observer.h
index f0d3361f8..02a0f4f4f 100644
--- a/src/citra_qt/debugger/graphics_breakpoint_observer.h
+++ b/src/citra_qt/debugger/graphics_breakpoint_observer.h
@@ -13,7 +13,7 @@
  * This is because the Pica breakpoint callbacks are called from a non-GUI thread, while
  * the widget usually wants to perform reactions in the GUI thread.
  */
-class BreakPointObserverDock : public QDockWidget, private Pica::DebugContext::BreakPointObserver {
+class BreakPointObserverDock : public QDockWidget, protected Pica::DebugContext::BreakPointObserver {
     Q_OBJECT
 
 public:
diff --git a/src/citra_qt/debugger/graphics_tracing.cpp b/src/citra_qt/debugger/graphics_tracing.cpp
new file mode 100644
index 000000000..3f20f149d
--- /dev/null
+++ b/src/citra_qt/debugger/graphics_tracing.cpp
@@ -0,0 +1,170 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <memory>
+
+#include <QBoxLayout>
+#include <QComboBox>
+#include <QFileDialog>
+#include <QLabel>
+#include <QMessageBox>
+#include <QPushButton>
+#include <QSpinBox>
+
+#include <boost/range/algorithm/copy.hpp>
+
+#include "core/hw/gpu.h"
+#include "core/hw/lcd.h"
+
+#include "video_core/pica.h"
+
+#include "nihstro/float24.h"
+
+#include "graphics_tracing.h"
+
+GraphicsTracingWidget::GraphicsTracingWidget(std::shared_ptr<Pica::DebugContext> debug_context,
+                                             QWidget* parent)
+    : BreakPointObserverDock(debug_context, tr("CiTrace Recorder"), parent) {
+    // Builds the dock contents: one row holding the start, stop-and-save and abort buttons.
+    setObjectName("CiTracing");
+
+    QPushButton* start_recording = new QPushButton(tr("Start Recording"));
+    QPushButton* stop_recording = new QPushButton(QIcon::fromTheme("document-save"), tr("Stop and Save"));
+    QPushButton* abort_recording = new QPushButton(tr("Abort Recording"));
+    // NOTE: despite their names, the Set*ButtonEnabled signals toggle the buttons' *visibility*.
+    connect(this, SIGNAL(SetStartTracingButtonEnabled(bool)), start_recording, SLOT(setVisible(bool)));
+    connect(this, SIGNAL(SetStopTracingButtonEnabled(bool)), stop_recording, SLOT(setVisible(bool)));
+    connect(this, SIGNAL(SetAbortTracingButtonEnabled(bool)), abort_recording, SLOT(setVisible(bool)));
+    connect(start_recording, SIGNAL(clicked()), this, SLOT(StartRecording()));
+    connect(stop_recording, SIGNAL(clicked()), this, SLOT(StopRecording()));
+    connect(abort_recording, SIGNAL(clicked()), this, SLOT(AbortRecording()));
+    // No recording exists yet, so the stop and abort buttons start out hidden.
+    stop_recording->setVisible(false);
+    abort_recording->setVisible(false);
+
+    auto main_widget = new QWidget;
+    auto main_layout = new QVBoxLayout;
+    {
+        auto sub_layout = new QHBoxLayout;
+        sub_layout->addWidget(start_recording);
+        sub_layout->addWidget(stop_recording);
+        sub_layout->addWidget(abort_recording);
+        main_layout->addLayout(sub_layout);
+    }
+    main_widget->setLayout(main_layout);
+    setWidget(main_widget);
+}
+
+void GraphicsTracingWidget::StartRecording() { // Snapshots the current state and attaches a new recorder
+    auto context = context_weak.lock();
+    if (!context)
+        return;
+
+    auto shader_binary = Pica::g_state.vs.program_code;
+    auto swizzle_data = Pica::g_state.vs.swizzle_data;
+
+    // Encode all four components of every vector to 24-bit values (the arrays are written out whole).
+    // TODO: Drop this explicit conversion once we store float24 values bit-correctly internally.
+    std::array<uint32_t, 4 * 16> default_attributes;
+    for (unsigned i = 0; i < 16; ++i) {
+        for (unsigned comp = 0; comp < 4; ++comp) {
+            default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.default_attributes[i][comp].ToFloat32());
+        }
+    }
+
+    std::array<uint32_t, 4 * 96> vs_float_uniforms;
+    for (unsigned i = 0; i < 96; ++i)
+        for (unsigned comp = 0; comp < 4; ++comp)
+            vs_float_uniforms[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.uniforms.f[i][comp].ToFloat32());
+
+    CiTrace::Recorder::InitialState state;
+    std::copy_n((u32*)&GPU::g_regs, sizeof(GPU::g_regs) / sizeof(u32), std::back_inserter(state.gpu_registers));
+    std::copy_n((u32*)&LCD::g_regs, sizeof(LCD::g_regs) / sizeof(u32), std::back_inserter(state.lcd_registers));
+    std::copy_n((u32*)&Pica::g_state.regs, sizeof(Pica::g_state.regs) / sizeof(u32), std::back_inserter(state.pica_registers));
+    boost::copy(default_attributes, std::back_inserter(state.default_attributes));
+    boost::copy(shader_binary, std::back_inserter(state.vs_program_binary));
+    boost::copy(swizzle_data, std::back_inserter(state.vs_swizzle_data));
+    boost::copy(vs_float_uniforms, std::back_inserter(state.vs_float_uniforms));
+    //boost::copy(TODO: Not implemented, std::back_inserter(state.gs_program_binary));
+    //boost::copy(TODO: Not implemented, std::back_inserter(state.gs_swizzle_data));
+    //boost::copy(TODO: Not implemented, std::back_inserter(state.gs_float_uniforms));
+
+    // From now on the hardware emulation sees a recorder on the debug context and reports to it.
+    context->recorder = std::make_shared<CiTrace::Recorder>(state);
+
+    emit SetStartTracingButtonEnabled(false);
+    emit SetStopTracingButtonEnabled(true);
+    emit SetAbortTracingButtonEnabled(true);
+}
+
+void GraphicsTracingWidget::StopRecording() {
+    auto context = context_weak.lock();
+    if (!context)
+        return;
+    // Ask for a destination file, write the trace there and detach the recorder.
+    QString filename = QFileDialog::getSaveFileName(this, tr("Save CiTrace"), "citrace.ctf",
+                                                    tr("CiTrace File (*.ctf)"));
+
+    if (filename.isEmpty()) {
+        // If the user canceled the dialog, keep recording
+        return;
+    }
+    // NOTE(review): assumes a recording is active; context->recorder is dereferenced unchecked.
+    context->recorder->Finish(filename.toStdString());
+    context->recorder = nullptr;
+
+    emit SetStopTracingButtonEnabled(false);
+    emit SetAbortTracingButtonEnabled(false);
+    emit SetStartTracingButtonEnabled(true);
+}
+
+void GraphicsTracingWidget::AbortRecording() {
+    auto context = context_weak.lock();
+    if (!context)
+        return;
+    // Detach the recorder without calling Finish(), i.e. without writing a trace file.
+    context->recorder = nullptr;
+
+    emit SetStopTracingButtonEnabled(false);
+    emit SetAbortTracingButtonEnabled(false);
+    emit SetStartTracingButtonEnabled(true);
+}
+
+void GraphicsTracingWidget::OnBreakPointHit(Pica::DebugContext::Event event, void* data) {
+    widget()->setEnabled(true); // enable the controls while stopped at a breakpoint; both arguments are unused
+}
+
+void GraphicsTracingWidget::OnResumed() {
+    widget()->setEnabled(false); // disable the controls again once execution continues
+}
+
+void GraphicsTracingWidget::OnEmulationStarting(EmuThread* emu_thread) {
+    // Disable tracing starting/stopping until a GPU breakpoint is reached (emu_thread is unused)
+    widget()->setEnabled(false);
+}
+
+void GraphicsTracingWidget::OnEmulationStopping() {
+    // TODO: Is it safe to access the context here?
+
+    auto context = context_weak.lock();
+    if (!context)
+        return;
+
+    // A recording is still attached: let the user choose between saving and discarding it.
+    if (context->recorder) {
+        auto reply = QMessageBox::question(this, tr("CiTracing still active"),
+                tr("A CiTrace is still being recorded. Do you want to save it? If not, all recorded data will be discarded."),
+                QMessageBox::Yes | QMessageBox::No, QMessageBox::Yes);
+
+        if (reply == QMessageBox::Yes) {
+            StopRecording(); // NOTE(review): canceling its file dialog leaves the recorder attached
+        } else {
+            AbortRecording();
+        }
+    }
+
+    // If the widget was disabled before, enable it now to allow starting
+    // tracing before starting the next emulation session
+    widget()->setEnabled(true);
+}
diff --git a/src/citra_qt/debugger/graphics_tracing.h b/src/citra_qt/debugger/graphics_tracing.h
new file mode 100644
index 000000000..2a0e4819b
--- /dev/null
+++ b/src/citra_qt/debugger/graphics_tracing.h
@@ -0,0 +1,32 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "graphics_breakpoint_observer.h"
+
+class EmuThread;
+
+class GraphicsTracingWidget : public BreakPointObserverDock {
+    Q_OBJECT
+    // Dock widget offering buttons to start, stop-and-save, or abort a CiTrace recording.
+public:
+    GraphicsTracingWidget(std::shared_ptr<Pica::DebugContext> debug_context, QWidget* parent = nullptr);
+
+private slots:
+    void StartRecording(); // snapshots the current state and attaches a recorder to the debug context
+    void StopRecording();  // asks for a file name, saves the trace and detaches the recorder
+    void AbortRecording(); // detaches the recorder without saving
+
+    void OnBreakPointHit(Pica::DebugContext::Event event, void* data) override; // enables the controls
+    void OnResumed() override; // disables the controls
+
+    void OnEmulationStarting(EmuThread* emu_thread); // disables the controls
+    void OnEmulationStopping(); // offers to save a still-active recording, then enables the controls
+
+signals:
+    void SetStartTracingButtonEnabled(bool enable); // wired to the start button's setVisible slot
+    void SetStopTracingButtonEnabled(bool enable);  // wired to the stop button's setVisible slot
+    void SetAbortTracingButtonEnabled(bool enable); // wired to the abort button's setVisible slot
+};
diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp
index d23bafafc..2746de779 100644
--- a/src/citra_qt/main.cpp
+++ b/src/citra_qt/main.cpp
@@ -32,6 +32,7 @@
 #include "debugger/graphics_breakpoints.h"
 #include "debugger/graphics_cmdlists.h"
 #include "debugger/graphics_framebuffer.h"
+#include "debugger/graphics_tracing.h"
 #include "debugger/graphics_vertex_shader.h"
 #include "debugger/profiler.h"
 
@@ -94,6 +95,10 @@ GMainWindow::GMainWindow() : emu_thread(nullptr)
     addDockWidget(Qt::RightDockWidgetArea, graphicsVertexShaderWidget);
     graphicsVertexShaderWidget->hide();
 
+    auto graphicsTracingWidget = new GraphicsTracingWidget(Pica::g_debug_context, this);
+    addDockWidget(Qt::RightDockWidgetArea, graphicsTracingWidget);
+    graphicsTracingWidget->hide();
+
     QMenu* debug_menu = ui.menu_View->addMenu(tr("Debugging"));
     debug_menu->addAction(profilerWidget->toggleViewAction());
     debug_menu->addAction(disasmWidget->toggleViewAction());
@@ -104,6 +109,7 @@ GMainWindow::GMainWindow() : emu_thread(nullptr)
     debug_menu->addAction(graphicsBreakpointsWidget->toggleViewAction());
     debug_menu->addAction(graphicsFramebufferWidget->toggleViewAction());
     debug_menu->addAction(graphicsVertexShaderWidget->toggleViewAction());
+    debug_menu->addAction(graphicsTracingWidget->toggleViewAction());
 
     // Set default UI state
     // geometry: 55% of the window contents are in the upper screen half, 45% in the lower half
@@ -148,6 +154,9 @@ GMainWindow::GMainWindow() : emu_thread(nullptr)
     connect(this, SIGNAL(EmulationStopping()), registersWidget, SLOT(OnEmulationStopping()));
     connect(this, SIGNAL(EmulationStarting(EmuThread*)), render_window, SLOT(OnEmulationStarting(EmuThread*)));
     connect(this, SIGNAL(EmulationStopping()), render_window, SLOT(OnEmulationStopping()));
+    connect(this, SIGNAL(EmulationStarting(EmuThread*)), graphicsTracingWidget, SLOT(OnEmulationStarting(EmuThread*)));
+    connect(this, SIGNAL(EmulationStopping()), graphicsTracingWidget, SLOT(OnEmulationStopping()));
+
 
     // Setup hotkeys
     RegisterHotkey("Main Window", "Load File", QKeySequence::Open);
diff --git a/src/common/file_util.h b/src/common/file_util.h
index 8fe772aee..9637d1b85 100644
--- a/src/common/file_util.h
+++ b/src/common/file_util.h
@@ -181,6 +181,10 @@ public:
     template <typename T>
     size_t WriteArray(const T* data, size_t length)
     {
+        static_assert(std::is_standard_layout<T>::value, "Given array does not consist of standard layout objects");
+        // TODO: gcc 4.8 does not support is_trivially_copyable, but we really should check for it here.
+        //static_assert(std::is_trivially_copyable<T>::value, "Given array does not consist of trivially copyable objects");
+
         if (!IsOpen()) {
             m_good = false;
             return -1;
@@ -203,6 +207,12 @@ public:
         return WriteArray(reinterpret_cast<const char*>(data), length);
     }
 
+    template<typename T> // Writes a single object by forwarding to WriteArray with a length of 1
+    size_t WriteObject(const T& object) {
+        static_assert(!std::is_pointer<T>::value, "Given object is a pointer"); // reject pointers at compile time
+        return WriteArray(&object, 1); // result is whatever WriteArray returns for one element
+    }
+
     bool IsOpen() { return nullptr != m_file; }
 
     // m_good is set to false when a read, write or other function fails
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 9b004440c..8267ee586 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -115,6 +115,7 @@ set(SRCS
             loader/elf.cpp
             loader/loader.cpp
             loader/ncch.cpp
+            tracer/recorder.cpp
             mem_map.cpp
             memory.cpp
             settings.cpp
@@ -243,6 +244,8 @@ set(HEADERS
             loader/elf.h
             loader/loader.h
             loader/ncch.h
+            tracer/recorder.h
+            tracer/citrace.h
             mem_map.h
             memory.h
             memory_setup.h
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index f175085e8..3910d0227 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -349,7 +349,7 @@ void SignalInterrupt(InterruptId interrupt_id) {
 /// Executes the next GSP command
 static void ExecuteCommand(const Command& command, u32 thread_id) {
     // Utility function to convert register ID to address
-    auto WriteGPURegister = [](u32 id, u32 data) {
+    static auto WriteGPURegister = [](u32 id, u32 data) {
         GPU::Write<u32>(0x1EF00000 + 4 * id, data);
     };
 
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index a1789f9c7..a3a7d128f 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -21,12 +21,17 @@
 #include "core/hw/hw.h"
 #include "core/hw/gpu.h"
 
+#include "core/tracer/recorder.h"
+
 #include "video_core/command_processor.h"
 #include "video_core/hwrasterizer_base.h"
 #include "video_core/renderer_base.h"
 #include "video_core/utils.h"
 #include "video_core/video_core.h"
 
+#include "video_core/debug_utils/debug_utils.h"
+
+
 namespace GPU {
 
 Regs g_regs;
@@ -101,39 +106,43 @@ inline void Write(u32 addr, const T data) {
         const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].trigger));
         auto& config = g_regs.memory_fill_config[is_second_filler];
 
-        if (config.address_start && config.trigger) {
-            u8* start = Memory::GetPhysicalPointer(config.GetStartAddress());
-            u8* end = Memory::GetPhysicalPointer(config.GetEndAddress());
+        if (config.trigger) {
+            if (config.address_start) { // Some games pass invalid values here
+                u8* start = Memory::GetPhysicalPointer(config.GetStartAddress());
+                u8* end = Memory::GetPhysicalPointer(config.GetEndAddress());
 
-            if (config.fill_24bit) {
-                // fill with 24-bit values
-                for (u8* ptr = start; ptr < end; ptr += 3) {
-                    ptr[0] = config.value_24bit_r;
-                    ptr[1] = config.value_24bit_g;
-                    ptr[2] = config.value_24bit_b;
+                if (config.fill_24bit) {
+                    // fill with 24-bit values
+                    for (u8* ptr = start; ptr < end; ptr += 3) {
+                        ptr[0] = config.value_24bit_r;
+                        ptr[1] = config.value_24bit_g;
+                        ptr[2] = config.value_24bit_b;
+                    }
+                } else if (config.fill_32bit) {
+                    // fill with 32-bit values
+                    for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr)
+                        *ptr = config.value_32bit;
+                } else {
+                    // fill with 16-bit values
+                    for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr)
+                        *ptr = config.value_16bit;
                 }
-            } else if (config.fill_32bit) {
-                // fill with 32-bit values
-                for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr)
-                    *ptr = config.value_32bit;
-            } else {
-                // fill with 16-bit values
-                for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr)
-                    *ptr = config.value_16bit;
+
+                LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress());
+
+                if (!is_second_filler) {
+                    GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0);
+                } else {
+                    GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC1);
+                }
+
+                VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetStartAddress(), config.GetEndAddress() - config.GetStartAddress());
             }
 
-            LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress());
-
+            // Reset "trigger" flag and set the "finish" flag
+            // NOTE: This was confirmed to happen on hardware even if "address_start" is zero.
             config.trigger = 0;
             config.finished = 1;
-
-            if (!is_second_filler) {
-                GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0);
-            } else {
-                GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC1);
-            }
-
-            VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetStartAddress(), config.GetEndAddress() - config.GetStartAddress());
         }
         break;
     }
@@ -270,6 +279,7 @@ inline void Write(u32 addr, const T data) {
                       config.GetPhysicalOutputAddress(), output_width, output_height,
                       config.output_format.Value(), config.flags);
 
+            g_regs.display_transfer_config.trigger = 0;
             GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF);
 
             VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), output_size);
@@ -284,7 +294,14 @@ inline void Write(u32 addr, const T data) {
         if (config.trigger & 1)
         {
             u32* buffer = (u32*)Memory::GetPhysicalPointer(config.GetPhysicalAddress());
+
+            if (Pica::g_debug_context && Pica::g_debug_context->recorder) {
+                Pica::g_debug_context->recorder->MemoryAccessed((u8*)buffer, config.size * sizeof(u32), config.GetPhysicalAddress());
+            }
+
             Pica::CommandProcessor::ProcessCommandList(buffer, config.size);
+
+            g_regs.command_processor_config.trigger = 0;
         }
         break;
     }
@@ -292,6 +309,13 @@ inline void Write(u32 addr, const T data) {
     default:
         break;
     }
+
+    // Notify tracer about the register write
+    // This is happening *after* handling the write to make sure we properly catch all memory reads.
+    if (Pica::g_debug_context && Pica::g_debug_context->recorder) {
+        // addr + GPU VBase - IO VBase + IO PBase
+        Pica::g_debug_context->recorder->RegisterWritten<T>(addr + 0x1EF00000 - 0x1EC00000 + 0x10100000, data);
+    }
 }
 
 // Explicitly instantiate template functions because we aren't defining this in the header:
diff --git a/src/core/hw/hw.cpp b/src/core/hw/hw.cpp
index c7006a498..b5fdbf9c1 100644
--- a/src/core/hw/hw.cpp
+++ b/src/core/hw/hw.cpp
@@ -15,6 +15,21 @@ template <typename T>
 inline void Read(T &var, const u32 addr) {
     switch (addr & 0xFFFFF000) {
     case VADDR_GPU:
+    case VADDR_GPU + 0x1000:
+    case VADDR_GPU + 0x2000:
+    case VADDR_GPU + 0x3000:
+    case VADDR_GPU + 0x4000:
+    case VADDR_GPU + 0x5000:
+    case VADDR_GPU + 0x6000:
+    case VADDR_GPU + 0x7000:
+    case VADDR_GPU + 0x8000:
+    case VADDR_GPU + 0x9000:
+    case VADDR_GPU + 0xA000:
+    case VADDR_GPU + 0xB000:
+    case VADDR_GPU + 0xC000:
+    case VADDR_GPU + 0xD000:
+    case VADDR_GPU + 0xE000:
+    case VADDR_GPU + 0xF000:
         GPU::Read(var, addr);
         break;
     case VADDR_LCD:
@@ -29,6 +44,21 @@ template <typename T>
 inline void Write(u32 addr, const T data) {
     switch (addr & 0xFFFFF000) {
     case VADDR_GPU:
+    case VADDR_GPU + 0x1000:
+    case VADDR_GPU + 0x2000:
+    case VADDR_GPU + 0x3000:
+    case VADDR_GPU + 0x4000:
+    case VADDR_GPU + 0x5000:
+    case VADDR_GPU + 0x6000:
+    case VADDR_GPU + 0x7000:
+    case VADDR_GPU + 0x8000:
+    case VADDR_GPU + 0x9000:
+    case VADDR_GPU + 0xA000:
+    case VADDR_GPU + 0xB000:
+    case VADDR_GPU + 0xC000:
+    case VADDR_GPU + 0xD000:
+    case VADDR_GPU + 0xE000:
+    case VADDR_GPU + 0xF000:
         GPU::Write(addr, data);
         break;
     case VADDR_LCD:
diff --git a/src/core/hw/lcd.cpp b/src/core/hw/lcd.cpp
index cdb757a18..6f93709e3 100644
--- a/src/core/hw/lcd.cpp
+++ b/src/core/hw/lcd.cpp
@@ -10,6 +10,9 @@
 #include "core/hw/hw.h"
 #include "core/hw/lcd.h"
 
+#include "core/tracer/recorder.h"
+#include "video_core/debug_utils/debug_utils.h"
+
 namespace LCD {
 
 Regs g_regs;
@@ -40,6 +43,13 @@ inline void Write(u32 addr, const T data) {
     }
 
     g_regs[index] = static_cast<u32>(data);
+
+    // Notify tracer about the register write
+    // This is happening *after* handling the write to make sure we properly catch all memory reads.
+    if (Pica::g_debug_context && Pica::g_debug_context->recorder) {
+        // addr + LCD VBase - IO VBase + IO PBase
+        Pica::g_debug_context->recorder->RegisterWritten<T>(addr + HW::VADDR_LCD - 0x1EC00000 + 0x10100000, data);
+    }
 }
 
 // Explicitly instantiate template functions because we aren't defining this in the header:
diff --git a/src/core/tracer/citrace.h b/src/core/tracer/citrace.h
new file mode 100644
index 000000000..5deb6ce9e
--- /dev/null
+++ b/src/core/tracer/citrace.h
@@ -0,0 +1,101 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstdint>
+
+namespace CiTrace {
+
+// NOTE: Things are stored in little-endian
+
+#pragma pack(push, 1) // on-disk layout: no padding; push/pop restores the includer's packing state
+// File header, written at the very start of a CiTrace file.
+struct CTHeader {
+    static const char* ExpectedMagicWord() {
+        return "CiTr";
+    }
+
+    static uint32_t ExpectedVersion() {
+        return 1;
+    }
+
+    char magic[4];
+    uint32_t version;
+    uint32_t header_size;
+
+    struct {
+        // NOTE: Register range sizes are technically hardware-constants, but the actual limits
+        // aren't known. Hence we store the presumed limits along the offsets.
+        // Sizes are given in uint32_t units.
+        uint32_t gpu_registers;
+        uint32_t gpu_registers_size;
+        uint32_t lcd_registers;
+        uint32_t lcd_registers_size;
+        uint32_t pica_registers;
+        uint32_t pica_registers_size;
+        uint32_t default_attributes;
+        uint32_t default_attributes_size;
+        uint32_t vs_program_binary;
+        uint32_t vs_program_binary_size;
+        uint32_t vs_swizzle_data;
+        uint32_t vs_swizzle_data_size;
+        uint32_t vs_float_uniforms;
+        uint32_t vs_float_uniforms_size;
+        uint32_t gs_program_binary;
+        uint32_t gs_program_binary_size;
+        uint32_t gs_swizzle_data;
+        uint32_t gs_swizzle_data_size;
+        uint32_t gs_float_uniforms;
+        uint32_t gs_float_uniforms_size;
+
+        // Other things we might want to store here:
+        // - Initial framebuffer data, maybe even a full copy of FCRAM/VRAM
+        // - Lookup tables for fragment lighting
+        // - Lookup tables for procedural textures
+    } initial_state_offsets;
+
+    uint32_t stream_offset;
+    uint32_t stream_size;
+};
+// Tag identifying the kind of a CTStreamElement.
+enum CTStreamElementType : uint32_t {
+    FrameMarker   = 0xE1,
+    MemoryLoad    = 0xE2,
+    RegisterWrite = 0xE3,
+};
+// Payload of a MemoryLoad element: file position, size and physical address of a memory region.
+struct CTMemoryLoad {
+    uint32_t file_offset;
+    uint32_t size;
+    uint32_t physical_address;
+    uint32_t pad;
+};
+// Payload of a RegisterWrite element: target address, access width tag and written value.
+struct CTRegisterWrite {
+    uint32_t physical_address;
+
+    enum : uint32_t {
+        SIZE_8  = 0xD1,
+        SIZE_16 = 0xD2,
+        SIZE_32 = 0xD3,
+        SIZE_64 = 0xD4
+    } size;
+
+    // TODO: Make it clearer which bits of this member are used for sizes other than 32 bits
+    uint64_t value;
+};
+// One entry of the command stream: a type tag followed by the type-specific payload.
+struct CTStreamElement {
+    CTStreamElementType type;
+
+    union {
+        CTMemoryLoad memory_load;
+        CTRegisterWrite register_write;
+    };
+};
+
+#pragma pack(pop)
+
+}
diff --git a/src/core/tracer/recorder.cpp b/src/core/tracer/recorder.cpp
new file mode 100644
index 000000000..656706c0c
--- /dev/null
+++ b/src/core/tracer/recorder.cpp
@@ -0,0 +1,187 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+
+#include "common/assert.h"
+#include "common/file_util.h"
+#include "common/logging/log.h"
+
+#include "recorder.h"
+
+namespace CiTrace {
+
+Recorder::Recorder(const InitialState& initial_state) : initial_state(initial_state) {
+    // Nothing else to do: the given snapshot is copied into the member of the same name.
+}
+
+void Recorder::Finish(const std::string& filename) {
+    // Setup CiTrace header. File layout: header, initial state arrays, extra data, stream elements.
+    CTHeader header;
+    std::memcpy(header.magic, CTHeader::ExpectedMagicWord(), 4);
+    header.version = CTHeader::ExpectedVersion();
+    header.header_size = sizeof(CTHeader);
+
+    // Calculate file offsets (sizes are counted in u32 units, offsets in bytes)
+    auto& initial = header.initial_state_offsets;
+
+    initial.gpu_registers_size      = initial_state.gpu_registers.size();
+    initial.lcd_registers_size      = initial_state.lcd_registers.size();
+    initial.pica_registers_size     = initial_state.pica_registers.size();
+    initial.default_attributes_size = initial_state.default_attributes.size();
+    initial.vs_program_binary_size  = initial_state.vs_program_binary.size();
+    initial.vs_swizzle_data_size    = initial_state.vs_swizzle_data.size();
+    initial.vs_float_uniforms_size  = initial_state.vs_float_uniforms.size();
+    initial.gs_program_binary_size  = initial_state.gs_program_binary.size();
+    initial.gs_swizzle_data_size    = initial_state.gs_swizzle_data.size();
+    initial.gs_float_uniforms_size  = initial_state.gs_float_uniforms.size();
+    header.stream_size              = stream.size();
+
+    initial.gpu_registers      = sizeof(header);
+    initial.lcd_registers      = initial.gpu_registers      + initial.gpu_registers_size * sizeof(u32);
+    initial.pica_registers     = initial.lcd_registers      + initial.lcd_registers_size * sizeof(u32);
+    initial.default_attributes = initial.pica_registers     + initial.pica_registers_size * sizeof(u32);
+    initial.vs_program_binary  = initial.default_attributes + initial.default_attributes_size * sizeof(u32);
+    initial.vs_swizzle_data    = initial.vs_program_binary  + initial.vs_program_binary_size * sizeof(u32);
+    initial.vs_float_uniforms  = initial.vs_swizzle_data    + initial.vs_swizzle_data_size * sizeof(u32);
+    initial.gs_program_binary  = initial.vs_float_uniforms  + initial.vs_float_uniforms_size * sizeof(u32);
+    initial.gs_swizzle_data    = initial.gs_program_binary  + initial.gs_program_binary_size * sizeof(u32);
+    initial.gs_float_uniforms  = initial.gs_swizzle_data    + initial.gs_swizzle_data_size * sizeof(u32);
+    header.stream_offset       = initial.gs_float_uniforms  + initial.gs_float_uniforms_size * sizeof(u32);
+
+    // Iterate through stream elements, update relevant stream element data
+    for (auto& stream_element : stream) {
+        switch (stream_element.data.type) {
+        case MemoryLoad:
+        {
+            auto& file_offset = memory_regions[stream_element.hash];
+            if (!stream_element.uses_existing_data) {
+                file_offset = header.stream_offset;
+            }
+            stream_element.data.memory_load.file_offset = file_offset;
+            break;
+        }
+
+        default:
+            // Other commands don't use any extra data
+            DEBUG_ASSERT(stream_element.extra_data.size() == 0);
+            break;
+        }
+        header.stream_offset += stream_element.extra_data.size();
+    }
+
+    try {
+        // Open file and write header
+        FileUtil::IOFile file(filename, "wb");
+        size_t written = file.WriteObject(header);
+        if (written != 1 || file.Tell() != initial.gpu_registers)
+            throw "Failed to write header";
+
+        // Write initial state
+        written = file.WriteArray(initial_state.gpu_registers.data(), initial_state.gpu_registers.size());
+        if (written != initial_state.gpu_registers.size() || file.Tell() != initial.lcd_registers)
+            throw "Failed to write GPU registers";
+
+        written = file.WriteArray(initial_state.lcd_registers.data(), initial_state.lcd_registers.size());
+        if (written != initial_state.lcd_registers.size() || file.Tell() != initial.pica_registers)
+            throw "Failed to write LCD registers";
+
+        written = file.WriteArray(initial_state.pica_registers.data(), initial_state.pica_registers.size());
+        if (written != initial_state.pica_registers.size() || file.Tell() != initial.default_attributes)
+            throw "Failed to write Pica registers";
+
+        written = file.WriteArray(initial_state.default_attributes.data(), initial_state.default_attributes.size());
+        if (written != initial_state.default_attributes.size() || file.Tell() != initial.vs_program_binary)
+            throw "Failed to write default vertex attributes";
+
+        written = file.WriteArray(initial_state.vs_program_binary.data(), initial_state.vs_program_binary.size());
+        if (written != initial_state.vs_program_binary.size() || file.Tell() != initial.vs_swizzle_data)
+            throw "Failed to write vertex shader program binary";
+
+        written = file.WriteArray(initial_state.vs_swizzle_data.data(), initial_state.vs_swizzle_data.size());
+        if (written != initial_state.vs_swizzle_data.size() || file.Tell() != initial.vs_float_uniforms)
+            throw "Failed to write vertex shader swizzle data";
+
+        written = file.WriteArray(initial_state.vs_float_uniforms.data(), initial_state.vs_float_uniforms.size());
+        if (written != initial_state.vs_float_uniforms.size() || file.Tell() != initial.gs_program_binary)
+            throw "Failed to write vertex shader float uniforms";
+
+        written = file.WriteArray(initial_state.gs_program_binary.data(), initial_state.gs_program_binary.size());
+        if (written != initial_state.gs_program_binary.size() || file.Tell() != initial.gs_swizzle_data)
+            throw "Failed to write geometry shader program binary";
+
+        written = file.WriteArray(initial_state.gs_swizzle_data.data(), initial_state.gs_swizzle_data.size());
+        if (written != initial_state.gs_swizzle_data.size() || file.Tell() != initial.gs_float_uniforms)
+            throw "Failed to write geometry shader swizzle data";
+
+        written = file.WriteArray(initial_state.gs_float_uniforms.data(), initial_state.gs_float_uniforms.size());
+        if (written != initial_state.gs_float_uniforms.size() || file.Tell() != initial.gs_float_uniforms + sizeof(u32) * initial.gs_float_uniforms_size)
+            throw "Failed to write geometry shader float uniforms";
+
+        // Iterate through stream elements, write "extra data"
+        for (const auto& stream_element : stream) {
+            if (stream_element.extra_data.size() == 0)
+                continue;
+
+            written = file.WriteBytes(stream_element.extra_data.data(), stream_element.extra_data.size());
+            if (written != stream_element.extra_data.size())
+                throw "Failed to write extra data";
+        }
+
+        if (file.Tell() != header.stream_offset)
+            throw "Unexpected end of extra data";
+
+        // Write actual stream elements
+        for (const auto& stream_element : stream) {
+            if (1 != file.WriteObject(stream_element.data))
+                throw "Failed to write stream element";
+        }
+    } catch(const char* str) {
+        LOG_ERROR(HW_GPU, "Writing CiTrace file failed: %s", str);
+    }
+}
+
+void Recorder::FrameFinished() {
+    stream.push_back( { FrameMarker } ); // append an element that carries only the FrameMarker tag
+}
+
+void Recorder::MemoryAccessed(const u8* data, u32 size, u32 physical_address) {
+    StreamElement element = { MemoryLoad };
+    element.data.memory_load.size = size;
+    element.data.memory_load.physical_address = physical_address;
+
+    // Compute hash over given memory region to check if the contents are already stored internally
+    boost::crc_32_type result;
+    result.process_bytes(data, size);
+    element.hash = result.checksum();
+
+    element.uses_existing_data = (memory_regions.find(element.hash) != memory_regions.end());
+    // Append the element first and fill the snapshot in place afterwards, so the (possibly large)
+    // buffer is stored exactly once instead of being deep-copied by push_back().
+    stream.push_back(element);
+    if (!element.uses_existing_data) {
+        stream.back().extra_data.assign(data, data + size);
+        memory_regions.insert({element.hash, 0}); // file offset will be initialized in Finish()
+    }
+}
+
+template<typename T> // Appends a RegisterWrite stream element describing a write of sizeof(T) bytes
+void Recorder::RegisterWritten(u32 physical_address, T value) {
+    StreamElement element = { RegisterWrite };
+    element.data.register_write.size = (sizeof(T) == 1) ? CTRegisterWrite::SIZE_8
+                                     : (sizeof(T) == 2) ? CTRegisterWrite::SIZE_16
+                                     : (sizeof(T) == 4) ? CTRegisterWrite::SIZE_32
+                                     :                    CTRegisterWrite::SIZE_64; // any other width is tagged as 64-bit
+    element.data.register_write.physical_address = physical_address;
+    element.data.register_write.value = value; // stored in the 64-bit value field regardless of T
+
+    stream.push_back(element);
+}
+
+template void Recorder::RegisterWritten(u32,u8); // explicit instantiations: the template body lives in this file
+template void Recorder::RegisterWritten(u32,u16);
+template void Recorder::RegisterWritten(u32,u32);
+template void Recorder::RegisterWritten(u32,u64);
+
+}
diff --git a/src/core/tracer/recorder.h b/src/core/tracer/recorder.h
new file mode 100644
index 000000000..6e4b70015
--- /dev/null
+++ b/src/core/tracer/recorder.h
@@ -0,0 +1,90 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <boost/crc.hpp>
+
+#include "common/common_types.h"
+#include "citrace.h"
+
+namespace CiTrace {
+
+class Recorder {
+public:
+    struct InitialState { // NOTE(review): presumably raw 32-bit words of each state block at recording start - confirm
+        std::vector<u32> gpu_registers;
+        std::vector<u32> lcd_registers;
+        std::vector<u32> pica_registers;
+        std::vector<u32> default_attributes;
+        std::vector<u32> vs_program_binary;
+        std::vector<u32> vs_swizzle_data;
+        std::vector<u32> vs_float_uniforms;
+        std::vector<u32> gs_program_binary;
+        std::vector<u32> gs_swizzle_data;
+        std::vector<u32> gs_float_uniforms;
+    };
+
+    /**
+     * Recorder constructor
+     * @param initial_state State the recording starts from (see InitialState).
+     * @note NOTE(review): default_attributes and the float uniforms presumably hold 32-bit-aligned 24-bit floats - confirm.
+     */
+    Recorder(const InitialState& initial_state);
+
+    /// Finish recording of this Citrace and save it using the given filename.
+    void Finish(const std::string& filename);
+
+    /// Mark the end of a frame by appending a frame marker to the command stream
+    void FrameFinished();
+
+    /**
+     * Store a copy of the given memory range in the recording.
+     * @note Use this whenever the GPU is about to access a particular memory region.
+     * @note Contents are hashed (CRC32); data whose hash was already recorded is not stored again.
+     */
+    void MemoryAccessed(const u8* data, u32 size, u32 physical_address);
+
+    /**
+     * Record a register write. Only explicitly instantiated for u8, u16, u32 and u64 values.
+     * @note Use this whenever a GPU-related MMIO register has been written to.
+     */
+    template<typename T>
+    void RegisterWritten(u32 physical_address, T value);
+
+private:
+    // Initial state at the start of the recording
+    InitialState initial_state;
+
+    // Element of the command stream
+    struct StreamElement {
+        CTStreamElement data;
+
+        /**
+          * Extra data stored alongside the "core" data.
+          * This is e.g. used for the memory contents of MemoryLoad elements.
+          */
+        std::vector<u8> extra_data;
+
+        /// Optional CRC hash (e.g. for hashing memory regions)
+        boost::crc_32_type::value_type hash;
+
+        /// If true, refer to data already written to the output file instead of extra_data
+        bool uses_existing_data;
+    };
+
+    std::vector<StreamElement> stream; // Command stream: elements are appended in the order they are recorded
+
+    /**
+     * Internal cache which maps hashes of memory contents to file offsets at which those memory
+     * contents are stored.
+     */
+    std::unordered_map<boost::crc_32_type::value_type /*hash*/, u32 /*file_offset*/> memory_regions;
+};
+
+} // namespace
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 110caec76..2a1c885a7 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -123,12 +123,55 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value());
             PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value());
 
+            if (g_debug_context) {
+                for (int i = 0; i < 3; ++i) {
+                    const auto texture = regs.GetTextures()[i];
+                    if (!texture.enabled)
+                        continue;
+
+                    u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
+                    if (g_debug_context && Pica::g_debug_context->recorder)
+                        g_debug_context->recorder->MemoryAccessed(texture_data, Pica::Regs::NibblesPerPixel(texture.format) * texture.config.width / 2 * texture.config.height, texture.config.GetPhysicalAddress());
+                }
+            }
+
+            class {
+                /// True if `range` (starting at or before `addr`) overlaps `addr` or ends at most 32 bytes before it
+                bool ReachesUpTo(std::map<u32, u32>::const_iterator range, u32 addr) const {
+                    return range->first + range->second + 32 >= addr;
+                }
+            public:
+                /// Record a memory access, combining it with overlapping and close ranges. Only its
+                /// neighbours are examined: earlier calls already left all other ranges simplified.
+                void AddAccess(u32 paddr, u32 size) {
+                    auto range = ranges.lower_bound(paddr);
+                    if (range != ranges.end() && range->first == paddr) {
+                        range->second = std::max(range->second, size); // same start address: keep the larger size
+                    } else if (range != ranges.begin() && ReachesUpTo(std::prev(range), paddr)) {
+                        --range; // the preceding range overlaps or is close: extend it instead of inserting
+                        range->second = std::max(range->second, paddr + size - range->first);
+                    } else {
+                        range = ranges.emplace_hint(range, paddr, size);
+                    }
+                    // Absorb following ranges that the (possibly grown) range now overlaps or is close to
+                    for (auto next = std::next(range); next != ranges.end() && ReachesUpTo(range, next->first); next = ranges.erase(next))
+                        range->second = std::max(range->second, next->first + next->second - range->first);
+                }
+
+                /// Map of accessed ranges (mapping start address to range size)
+                std::map<u32, u32> ranges;
+            } memory_accesses;
+
             for (unsigned int index = 0; index < regs.num_vertices; ++index)
             {
                 unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
 
                 if (is_indexed) {
                     // TODO: Implement some sort of vertex cache!
+                    if (g_debug_context && Pica::g_debug_context->recorder) {
+                        int size = index_u16 ? 2 : 1;
+                        memory_accesses.AddAccess(base_address + index_info.offset + size * index, size);
+                    }
                 }
 
                 // Initialize data for the current vertex
@@ -151,7 +194,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                     // Load per-vertex data from the loader arrays
                     for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
-                        const u8* srcdata = Memory::GetPhysicalPointer(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]);
+                        u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
+                        const u8* srcdata = Memory::GetPhysicalPointer(source_addr);
+
+                        if (g_debug_context && Pica::g_debug_context->recorder) {
+                            memory_accesses.AddAccess(source_addr,
+                                    (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
+                                    : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
+                        }
 
                         const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata :
                             (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata :
@@ -213,14 +263,20 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 }
             }
 
+            for (auto& range : memory_accesses.ranges) {
+                g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
+                                                          range.second, range.first);
+            }
+
             if (Settings::values.use_hw_renderer) {
                 VideoCore::g_renderer->hw_rasterizer->DrawTriangles();
             }
 
             geometry_dumper.Dump();
 
-            if (g_debug_context)
+            if (g_debug_context) {
                 g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+            }
 
             break;
         }
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index 7926d64ec..2573292e2 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -14,6 +14,8 @@
 
 #include "common/vector_math.h"
 
+#include "core/tracer/recorder.h"
+
 #include "video_core/pica.h"
 
 namespace Pica {
@@ -129,6 +131,8 @@ public:
     Event active_breakpoint;
     bool at_breakpoint = false;
 
+    std::shared_ptr<CiTrace::Recorder> recorder = nullptr;
+
 private:
     /**
      * Private default constructor to make sure people always construct this through Construct()
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 9799f74fa..96e12839a 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -22,6 +22,8 @@
 #include "video_core/renderer_opengl/gl_shader_util.h"
 #include "video_core/renderer_opengl/gl_shaders.h"
 
+#include "video_core/debug_utils/debug_utils.h"
+
 /**
  * Vertex structure that the drawn screen rectangles are composed of.
  */
@@ -129,6 +131,10 @@ void RendererOpenGL::SwapBuffers() {
             hw_rasterizer->Reset();
         }
     }
+
+    if (Pica::g_debug_context && Pica::g_debug_context->recorder) {
+        Pica::g_debug_context->recorder->FrameFinished();
+    }
 }
 
 /**