diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 99abe0edf3..557227b37d 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -171,7 +171,9 @@ public:
                                   bool is_written, bool is_image);
 
     [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
-                                                       bool synchronize, bool mark_as_written);
+                                                       bool synchronize = true,
+                                                       bool mark_as_written = false,
+                                                       bool discard_downloads = false);
 
     void FlushCachedWrites();
 
@@ -203,6 +205,14 @@ public:
     /// Return true when a CPU region is modified from the CPU
     [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
 
+    void SetDrawIndirect(const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) {
+        current_draw_indirect = current_draw_indirect_;
+    }
+
+    [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount();
+
+    [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer();
+
     std::mutex mutex;
     Runtime& runtime;
 
@@ -275,6 +285,8 @@ private:
 
     void BindHostVertexBuffers();
 
+    void BindHostDrawIndirectBuffers();
+
     void BindHostGraphicsUniformBuffers(size_t stage);
 
     void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
@@ -301,6 +313,8 @@ private:
 
     void UpdateVertexBuffer(u32 index);
 
+    void UpdateDrawIndirect();
+
     void UpdateUniformBuffers(size_t stage);
 
     void UpdateStorageBuffers(size_t stage);
@@ -340,6 +354,8 @@ private:
 
     bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
 
+    bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size);
+
     void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
                       std::span<BufferCopy> copies);
 
@@ -375,6 +391,8 @@ private:
     SlotVector<Buffer> slot_buffers;
     DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
 
+    const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{};
+
     u32 last_index_count = 0;
 
     Binding index_buffer;
@@ -383,6 +401,8 @@ private:
     std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
     std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers;
     std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
+    Binding count_buffer_binding;
+    Binding indirect_buffer_binding;
 
     std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
     std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
@@ -422,6 +442,7 @@ private:
 
     std::vector<BufferId> cached_write_buffer_ids;
 
+    IntervalSet discarded_ranges;
     IntervalSet uncommitted_ranges;
     IntervalSet common_ranges;
     std::deque<IntervalSet> committed_ranges;
@@ -579,13 +600,17 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
     }};
 
     boost::container::small_vector<IntervalType, 4> tmp_intervals;
+    const bool is_high_accuracy =
+        Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
     auto mirror = [&](VAddr base_address, VAddr base_address_end) {
         const u64 size = base_address_end - base_address;
         const VAddr diff = base_address - *cpu_src_address;
         const VAddr new_base_address = *cpu_dest_address + diff;
         const IntervalType add_interval{new_base_address, new_base_address + size};
-        uncommitted_ranges.add(add_interval);
         tmp_intervals.push_back(add_interval);
+        if (is_high_accuracy) {
+            uncommitted_ranges.add(add_interval);
+        }
     };
     ForEachWrittenRange(*cpu_src_address, amount, mirror);
     // This subtraction in this order is important for overlapping copies.
@@ -677,6 +702,9 @@ void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
     }
     BindHostVertexBuffers();
     BindHostTransformFeedbackBuffers();
+    if (current_draw_indirect) {
+        BindHostDrawIndirectBuffers();
+    }
 }
 
 template <class P>
@@ -796,7 +824,8 @@ void BufferCache<P>::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add
 template <class P>
 std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_addr, u32 size,
                                                                  bool synchronize,
-                                                                 bool mark_as_written) {
+                                                                 bool mark_as_written,
+                                                                 bool discard_downloads) {
     const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
     if (!cpu_addr) {
         return {&slot_buffers[NULL_BUFFER_ID], 0};
@@ -804,11 +833,17 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
     const BufferId buffer_id = FindBuffer(*cpu_addr, size);
     Buffer& buffer = slot_buffers[buffer_id];
     if (synchronize) {
-        SynchronizeBuffer(buffer, *cpu_addr, size);
+        SynchronizeBufferNoModified(buffer, *cpu_addr, size);
     }
     if (mark_as_written) {
         MarkWrittenBuffer(buffer_id, *cpu_addr, size);
     }
+    if (discard_downloads) {
+        // IntervalType is a [lower, upper) address pair, not (address, size).
+        const IntervalType interval{*cpu_addr, *cpu_addr + size};
+        ClearDownload(interval);
+        discarded_ranges.add(interval);
+    }
     return {&buffer, buffer.Offset(*cpu_addr)};
 }

@@ -827,10 +862,6 @@ bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
 
 template <class P>
 void BufferCache<P>::AccumulateFlushes() {
-    if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) {
-        uncommitted_ranges.clear();
-        return;
-    }
     if (uncommitted_ranges.empty()) {
         return;
     }
@@ -845,12 +876,15 @@ bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
 template <class P>
 void BufferCache<P>::CommitAsyncFlushesHigh() {
     AccumulateFlushes();
+
+    for (const auto& interval : discarded_ranges) {
+        common_ranges.subtract(interval);
+    }
+    discarded_ranges.clear();
     if (committed_ranges.empty()) {
         return;
     }
     MICROPROFILE_SCOPE(GPU_DownloadMemory);
-    const bool is_accuracy_normal =
-        Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal;
 
     auto it = committed_ranges.begin();
     while (it != committed_ranges.end()) {
@@ -875,9 +909,6 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
             ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
                 buffer.ForEachDownloadRangeAndClear(
                     cpu_addr, size, [&](u64 range_offset, u64 range_size) {
-                        if (is_accuracy_normal) {
-                            return;
-                        }
                         const VAddr buffer_addr = buffer.CpuAddr();
                         const auto add_download = [&](VAddr start, VAddr end) {
                             const u64 new_offset = start - buffer_addr;
@@ -891,7 +922,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
                                 buffer_id,
                             });
                             // Align up to avoid cache conflicts
-                            constexpr u64 align = 256ULL;
+                            constexpr u64 align = 8ULL;
                             constexpr u64 mask = ~(align - 1ULL);
                             total_size_bytes += (new_size + align - 1) & mask;
                             largest_copy = std::max(largest_copy, new_size);
@@ -942,12 +973,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
 
 template <class P>
 void BufferCache<P>::CommitAsyncFlushes() {
-    if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
-        CommitAsyncFlushesHigh();
-    } else {
-        uncommitted_ranges.clear();
-        committed_ranges.clear();
-    }
+    CommitAsyncFlushesHigh();
 }
 
 template <class P>
@@ -1063,6 +1089,19 @@ void BufferCache<P>::BindHostVertexBuffers() {
     }
 }
 
+template <class P>
+void BufferCache<P>::BindHostDrawIndirectBuffers() {
+    const auto bind_buffer = [this](const Binding& binding) {
+        Buffer& buffer = slot_buffers[binding.buffer_id];
+        TouchBuffer(buffer, binding.buffer_id);
+        SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
+    };
+    if (current_draw_indirect->include_count) {
+        bind_buffer(count_buffer_binding);
+    }
+    bind_buffer(indirect_buffer_binding);
+}
+
 template <class P>
 void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
     u32 dirty = ~0U;
@@ -1294,6 +1333,9 @@ void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
             UpdateStorageBuffers(stage);
             UpdateTextureBuffers(stage);
         }
+        if (current_draw_indirect) {
+            UpdateDrawIndirect();
+        }
     } while (has_deleted_buffers);
 }
 
@@ -1383,6 +1425,27 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) {
     };
 }
 
+template <class P>
+void BufferCache<P>::UpdateDrawIndirect() {
+    const auto update = [this](GPUVAddr gpu_addr, size_t size, Binding& binding) {
+        const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr);
+        if (!cpu_addr) {
+            binding = NULL_BINDING;
+            return;
+        }
+        binding = Binding{
+            .cpu_addr = *cpu_addr,
+            .size = static_cast<u32>(size),
+            .buffer_id = FindBuffer(*cpu_addr, static_cast<u32>(size)),
+        };
+    };
+    if (current_draw_indirect->include_count) {
+        update(current_draw_indirect->count_start_address, sizeof(u32), count_buffer_binding);
+    }
+    update(current_draw_indirect->indirect_start_address, current_draw_indirect->buffer_size,
+           indirect_buffer_binding);
+}
+
 template <class P>
 void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
     ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) {
@@ -1704,6 +1767,51 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s
     return false;
 }
 
+template <class P>
+bool BufferCache<P>::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) {
+    boost::container::small_vector<BufferCopy, 4> copies;
+    u64 total_size_bytes = 0;
+    u64 largest_copy = 0;
+    IntervalSet found_sets{};
+    auto make_copies = [&] {
+        for (const auto& interval : found_sets) {
+            const std::size_t sub_size = interval.upper() - interval.lower();
+            const VAddr start_address = interval.lower();
+            copies.push_back(BufferCopy{
+                .src_offset = total_size_bytes,
+                .dst_offset = start_address - buffer.CpuAddr(),
+                .size = sub_size,
+            });
+            total_size_bytes += sub_size;
+            largest_copy = std::max(largest_copy, sub_size);
+        }
+        const std::span<BufferCopy> copies_span(copies.data(), copies.size());
+        UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
+    };
+    buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+        const VAddr base_adr = buffer.CpuAddr() + range_offset;
+        const VAddr end_adr = base_adr + range_size;
+        const IntervalType add_interval{base_adr, end_adr};
+        found_sets.add(add_interval);
+    });
+    if (found_sets.empty()) {
+        return true;
+    }
+    const IntervalType search_interval{cpu_addr, cpu_addr + size};
+    auto it = common_ranges.lower_bound(search_interval);
+    auto it_end = common_ranges.upper_bound(search_interval);
+    if (it == common_ranges.end()) {
+        make_copies();
+        return false;
+    }
+    while (it != it_end) {
+        found_sets.subtract(*it);
+        ++it;
+    }
+    make_copies();
+    return false;
+}
+
 template <class P>
 void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
                                   std::span<BufferCopy> copies) {
@@ -1963,4 +2071,16 @@ bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index)
     }
 }
 
+template <class P>
+std::pair<typename BufferCache<P>::Buffer*, u32> BufferCache<P>::GetDrawIndirectCount() {
+    auto& buffer = slot_buffers[count_buffer_binding.buffer_id];
+    return std::make_pair(&buffer, buffer.Offset(count_buffer_binding.cpu_addr));
+}
+
+template <class P>
+std::pair<typename BufferCache<P>::Buffer*, u32> BufferCache<P>::GetDrawIndirectBuffer() {
+    auto& buffer = slot_buffers[indirect_buffer_binding.buffer_id];
+    return std::make_pair(&buffer, buffer.Offset(indirect_buffer_binding.cpu_addr));
+}
+
 } // namespace VideoCommon
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index eb13716123..13ff64fa3f 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -97,6 +97,7 @@ void DmaPusher::ProcessCommands(std::span<const CommandHeader> commands) {
             if (dma_state.non_incrementing) {
                 const u32 max_write = static_cast<u32>(
                     std::min<std::size_t>(index + dma_state.method_count, commands.size()) - index);
+                dma_state.dma_word_offset = static_cast<u32>(index * sizeof(u32));
                 CallMultiMethod(&command_header.argument, max_write);
                 dma_state.method_count -= max_write;
                 dma_state.is_last_call = true;
@@ -175,7 +176,7 @@ void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const {
                                dma_state.method_count);
     } else {
         auto subchannel = subchannels[dma_state.subchannel];
-        subchannel->current_dma_segment = dma_state.dma_get;
+        subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
         subchannel->CallMultiMethod(dma_state.method, base_start, num_methods,
                                     dma_state.method_count);
     }
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index ca0899ba71..da7728ded2 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -157,6 +157,7 @@ private:
         u32 method_count;      ///< Current method count
         u32 length_pending;    ///< Large NI command length pending
         GPUVAddr dma_get;      ///< Currently read segment
+        u32 dma_word_offset{}; ///< Current word offset from the segment base address
         bool non_incrementing; ///< Current command's NI flag
         bool is_last_call;
     };
diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp
index 4fa77b6843..c60f32aad5 100644
--- a/src/video_core/engines/draw_manager.cpp
+++ b/src/video_core/engines/draw_manager.cpp
@@ -216,7 +216,7 @@ void DrawManager::ProcessDrawIndirect(bool draw_indexed) {
     UpdateTopology();
 
     if (maxwell3d->ShouldExecute()) {
-        maxwell3d->rasterizer->DrawIndirect(draw_indexed);
+        maxwell3d->rasterizer->DrawIndirect();
     }
 }
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h
index 0cdb37f83d..4379901624 100644
--- a/src/video_core/engines/draw_manager.h
+++ b/src/video_core/engines/draw_manager.h
@@ -33,7 +33,10 @@ public:
     };
 
     struct IndirectParams {
-        GPUVAddr start_address;
+        bool is_indexed;
+        bool include_count;
+        GPUVAddr count_start_address;
+        GPUVAddr indirect_start_address;
         size_t buffer_size;
         size_t max_draw_counts;
         size_t stride;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 9b182b6530..cd6274a9b2 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -130,11 +130,15 @@ void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool
     }
 
     macro_params.insert(macro_params.end(), base_start, base_start + amount);
+    for (size_t i = 0; i < amount; i++) {
+        macro_addresses.push_back(current_dma_segment + i * sizeof(u32));
+    }
 
     // Call the macro when there are no more parameters in the command buffer
     if (is_last_call) {
         CallMacroMethod(executing_macro, macro_params);
         macro_params.clear();
+        macro_addresses.clear();
     }
 }
 
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 22b9043192..ac5e87563c 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -3066,6 +3066,15 @@ public:
 
     std::unique_ptr<DrawManager> draw_manager;
     friend class DrawManager;
+    
+    std::vector<u8> inline_index_draw_indexes;
+    std::vector<GPUVAddr> macro_addresses;
+
+    Core::System& system;
+    MemoryManager& memory_manager;
+
+    /// Handles a write to the CLEAR_BUFFERS register.
+    void ProcessClearBuffers(u32 layer_count);
 
 private:
     void InitializeRegisterDefaults();
@@ -3126,9 +3135,6 @@ private:
     /// Returns a query's value or an empty object if the value will be deferred through a cache.
     std::optional<u64> GetQueryResult();
 
-    Core::System& system;
-    MemoryManager& memory_manager;
-
     VideoCore::RasterizerInterface* rasterizer = nullptr;
 
     /// Start offsets of each macro in macro_memory
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 1cc202cc7f..da988cc0d5 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -9,6 +9,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/macro/macro.h"
 #include "video_core/macro/macro_hle.h"
+#include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 
 namespace Tegra {
@@ -24,15 +25,14 @@ void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
         parameters[4], parameters[1], parameters[3], parameters[5], instance_count);
 }
 
-void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
+void HLE_DrawArraysIndirect(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
     const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
     maxwell3d.draw_manager->DrawArray(
         static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]),
         parameters[3], parameters[1], parameters[4], instance_count);
 }
 
-void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
-    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+void HLE_DrawIndexedIndirect(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
     const u32 element_base = parameters[4];
     const u32 base_instance = parameters[5];
     maxwell3d.regs.vertex_id_base = element_base;
@@ -41,9 +41,18 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
     maxwell3d.CallMethod(0x8e4, element_base, true);
     maxwell3d.CallMethod(0x8e5, base_instance, true);
 
-    maxwell3d.draw_manager->DrawIndex(
-        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]),
-        parameters[3], parameters[1], element_base, base_instance, instance_count);
+    auto& params = maxwell3d.draw_manager->GetIndirectParams();
+    params.is_indexed = true;
+    params.include_count = false;
+    params.count_start_address = 0;
+    params.indirect_start_address = maxwell3d.macro_addresses[1];
+    params.buffer_size = 5 * sizeof(u32);
+    params.max_draw_counts = 1;
+    params.stride = 0;
+
+    maxwell3d.draw_manager->DrawIndexedIndirect(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]), 0,
+        1U << 18);
 
     maxwell3d.regs.vertex_id_base = 0x0;
     maxwell3d.CallMethod(0x8e3, 0x640, true);
@@ -51,8 +60,9 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
     maxwell3d.CallMethod(0x8e5, 0x0, true);
 }
 
-// Multidraw Indirect
-void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
+// Multi-draw Indexed Indirect
+void HLE_MultiDrawIndexedIndirect(Engines::Maxwell3D& maxwell3d,
+                                  const std::vector<u32>& parameters) {
     const u32 start_indirect = parameters[0];
     const u32 end_indirect = parameters[1];
     if (start_indirect >= end_indirect) {
@@ -66,7 +76,6 @@ void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
     // size of each indirect segment
     const u32 indirect_words = 5 + padding;
     const u32 stride = indirect_words * sizeof(u32);
-    const GPUVAddr start_address = maxwell3d.current_dma_segment + 4 * sizeof(u32);
     const std::size_t draw_count = end_indirect - start_indirect;
     u32 lowest_first = std::numeric_limits<u32>::max();
     u32 highest_limit = std::numeric_limits<u32>::min();
@@ -80,12 +89,16 @@ void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
 
     const u32 base_vertex = parameters[8];
     const u32 base_instance = parameters[9];
+    maxwell3d.regs.vertex_id_base = base_vertex;
     maxwell3d.CallMethod(0x8e3, 0x640, true);
     maxwell3d.CallMethod(0x8e4, base_vertex, true);
     maxwell3d.CallMethod(0x8e5, base_instance, true);
     auto& params = maxwell3d.draw_manager->GetIndirectParams();
-    params.start_address = start_address;
-    params.buffer_size = sizeof(u32) + stride * draw_count;
+    params.is_indexed = true;
+    params.include_count = true;
+    params.count_start_address = maxwell3d.macro_addresses[4];
+    params.indirect_start_address = maxwell3d.macro_addresses[5];
+    params.buffer_size = stride * draw_count;
     params.max_draw_counts = draw_count;
     params.stride = stride;
     maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
@@ -93,7 +106,7 @@ void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
 }
 
 // Multi-layer Clear
-void HLE_EAD26C3E2109B06B(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
+void HLE_MultiLayerClear(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
     ASSERT(parameters.size() == 1);
 
     const Engines::Maxwell3D::Regs::ClearSurface clear_params{parameters[0]};
@@ -107,10 +120,10 @@ void HLE_EAD26C3E2109B06B(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
 
 constexpr std::array<std::pair<u64, HLEFunction>, 5> hle_funcs{{
     {0x771BB18C62444DA0, &HLE_771BB18C62444DA0},
-    {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD},
-    {0x0217920100488FF7, &HLE_0217920100488FF7},
-    {0x3F5E74B9C9A50164, &HLE_3F5E74B9C9A50164},
-    {0xEAD26C3E2109B06B, &HLE_EAD26C3E2109B06B},
+    {0x0D61FC9FAAC9FCAD, &HLE_DrawArraysIndirect},
+    {0x0217920100488FF7, &HLE_DrawIndexedIndirect},
+    {0x3F5E74B9C9A50164, &HLE_MultiDrawIndexedIndirect},
+    {0xEAD26C3E2109B06B, &HLE_MultiLayerClear},
 }};
 
 class HLEMacroImpl final : public CachedMacro {
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index a2a651f341..641b95c7c0 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -43,7 +43,7 @@ public:
     virtual void Draw(bool is_indexed, u32 instance_count) = 0;
 
     /// Dispatches an indirect draw invocation
-    virtual void DrawIndirect(bool is_indexed) {}
+    virtual void DrawIndirect() {}
 
     /// Clear the current framebuffer
     virtual void Clear(u32 layer_count) = 0;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 6b54d7111c..487d8b4160 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -56,7 +56,8 @@ vk::Buffer CreateBuffer(const Device& device, u64 size) {
         VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
         VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
         VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
-        VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
+        VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
+        VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
     if (device.IsExtTransformFeedbackSupported()) {
         flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT;
     }
@@ -516,6 +517,7 @@ void BufferCacheRuntime::ReserveNullBuffer() {
     if (device.IsExtTransformFeedbackSupported()) {
         create_info.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT;
     }
+    create_info.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
     null_buffer = device.GetLogical().CreateBuffer(create_info);
     if (device.HasDebuggingToolAttached()) {
         null_buffer.SetObjectNameEXT("Null buffer");
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 9b75f33dd9..6f1adc97ff 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -225,25 +225,40 @@ void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
     });
 }
 
-void RasterizerVulkan::DrawIndirect(bool is_indexed) {
-    PrepareDraw(is_indexed, [this, is_indexed] {
-        const auto params = maxwell3d->draw_manager->GetIndirectParams();
-        const auto [buffer, offset] = buffer_cache.ObtainBuffer(
-            params.start_address, static_cast<u32>(params.buffer_size), true, false);
-        scheduler.Record([buffer_obj = buffer->Handle(), offset,
-                          max_draw_counts = params.max_draw_counts, stride = params.stride,
-                          is_indexed](vk::CommandBuffer cmdbuf) {
-            if (is_indexed) {
-                cmdbuf.DrawIndexedIndirectCount(buffer_obj, offset + 4ULL, buffer_obj, offset,
-                                                static_cast<u32>(max_draw_counts),
-                                                static_cast<u32>(stride));
+void RasterizerVulkan::DrawIndirect() {
+    const auto& params = maxwell3d->draw_manager->GetIndirectParams();
+    buffer_cache.SetDrawIndirect(&params);
+    PrepareDraw(params.is_indexed, [this, &params] {
+        const auto [buffer, offset] = buffer_cache.GetDrawIndirectBuffer();
+        if (params.include_count) {
+            const auto [draw_buffer, offset_base] = buffer_cache.GetDrawIndirectCount();
+            scheduler.Record([draw_buffer_obj = draw_buffer->Handle(),
+                              buffer_obj = buffer->Handle(), offset_base, offset,
+                              params](vk::CommandBuffer cmdbuf) {
+                if (params.is_indexed) {
+                    cmdbuf.DrawIndexedIndirectCount(
+                        buffer_obj, offset, draw_buffer_obj, offset_base,
+                        static_cast<u32>(params.max_draw_counts), static_cast<u32>(params.stride));
+                } else {
+                    cmdbuf.DrawIndirectCount(buffer_obj, offset, draw_buffer_obj, offset_base,
+                                             static_cast<u32>(params.max_draw_counts),
+                                             static_cast<u32>(params.stride));
+                }
+            });
+            return;
+        }
+        scheduler.Record([buffer_obj = buffer->Handle(), offset, params](vk::CommandBuffer cmdbuf) {
+            if (params.is_indexed) {
+                cmdbuf.DrawIndexedIndirect(buffer_obj, offset,
+                                           static_cast<u32>(params.max_draw_counts),
+                                           static_cast<u32>(params.stride));
             } else {
-                cmdbuf.DrawIndirectCount(buffer_obj, offset + 4ULL, buffer_obj, offset,
-                                         static_cast<u32>(max_draw_counts),
-                                         static_cast<u32>(stride));
+                cmdbuf.DrawIndirect(buffer_obj, offset, static_cast<u32>(params.max_draw_counts),
+                                    static_cast<u32>(params.stride));
             }
         });
     });
+    buffer_cache.SetDrawIndirect(nullptr);
 }
 
 void RasterizerVulkan::Clear(u32 layer_count) {
@@ -425,9 +440,6 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) {
 
 bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) {
     std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex};
-    if (!Settings::IsGPULevelHigh()) {
-        return buffer_cache.IsRegionGpuModified(addr, size);
-    }
     return texture_cache.IsRegionGpuModified(addr, size) ||
            buffer_cache.IsRegionGpuModified(addr, size);
 }
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index bc43a8a1f3..43a210da0b 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -65,7 +65,7 @@ public:
     ~RasterizerVulkan() override;
 
     void Draw(bool is_indexed, u32 instance_count) override;
-    void DrawIndirect(bool is_indexed) override;
+    void DrawIndirect() override;
     void Clear(u32 layer_count) override;
     void DispatchCompute() override;
     void ResetCounter(VideoCore::QueryType type) override;
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 477fc428bc..207fae8c91 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -351,7 +351,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             .dualSrcBlend = true,
             .logicOp = true,
             .multiDrawIndirect = true,
-            .drawIndirectFirstInstance = false,
+            .drawIndirectFirstInstance = true,
             .depthClamp = true,
             .depthBiasClamp = true,
             .fillModeNonSolid = true,
@@ -1024,6 +1024,8 @@ void Device::CheckSuitability(bool requires_swapchain) const {
         std::make_pair(features.vertexPipelineStoresAndAtomics, "vertexPipelineStoresAndAtomics"),
         std::make_pair(features.imageCubeArray, "imageCubeArray"),
         std::make_pair(features.independentBlend, "independentBlend"),
+        std::make_pair(features.multiDrawIndirect, "multiDrawIndirect"),
+        std::make_pair(features.drawIndirectFirstInstance, "drawIndirectFirstInstance"),
         std::make_pair(features.depthClamp, "depthClamp"),
         std::make_pair(features.samplerAnisotropy, "samplerAnisotropy"),
         std::make_pair(features.largePoints, "largePoints"),
@@ -1117,6 +1119,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
         test(khr_spirv_1_4, VK_KHR_SPIRV_1_4_EXTENSION_NAME, true);
         test(khr_push_descriptor, VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, true);
         test(has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false);
+        test(khr_draw_indirect_count, VK_KHR_DRAW_INDIRECT_COUNT_EXTENSION_NAME, true);
         test(ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true);
         test(ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true);
         test(has_ext_primitive_topology_list_restart,
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 6a26c4e6e2..d0d7c2299f 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -451,6 +451,7 @@ private:
     bool nv_viewport_swizzle{};                 ///< Support for VK_NV_viewport_swizzle.
     bool nv_viewport_array2{};                  ///< Support for VK_NV_viewport_array2.
     bool nv_geometry_shader_passthrough{};      ///< Support for VK_NV_geometry_shader_passthrough.
+    bool khr_draw_indirect_count{};             ///< Support for VK_KHR_draw_indirect_count.
     bool khr_uniform_buffer_standard_layout{};  ///< Support for scalar uniform buffer layouts.
     bool khr_spirv_1_4{};                       ///< Support for VK_KHR_spirv_1_4.
     bool khr_workgroup_memory_explicit_layout{}; ///< Support for explicit workgroup layouts.
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp
index c58c4c1c49..f8f8ed9f88 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.cpp
+++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp
@@ -94,8 +94,10 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
     X(vkCmdDispatch);
     X(vkCmdDraw);
     X(vkCmdDrawIndexed);
-    X(vkCmdDrawIndirectCount);
-    X(vkCmdDrawIndexedIndirectCount);
+    X(vkCmdDrawIndirect);
+    X(vkCmdDrawIndexedIndirect);
+    X(vkCmdDrawIndirectCountKHR);
+    X(vkCmdDrawIndexedIndirectCountKHR);
     X(vkCmdEndQuery);
     X(vkCmdEndRenderPass);
     X(vkCmdEndTransformFeedbackEXT);
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h
index 9bd158dce5..493a48573f 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@@ -213,8 +213,10 @@ struct DeviceDispatch : InstanceDispatch {
     PFN_vkCmdDispatch vkCmdDispatch{};
     PFN_vkCmdDraw vkCmdDraw{};
     PFN_vkCmdDrawIndexed vkCmdDrawIndexed{};
-    PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{};
-    PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{};
+    PFN_vkCmdDrawIndirect vkCmdDrawIndirect{};
+    PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{};
+    PFN_vkCmdDrawIndirectCountKHR vkCmdDrawIndirectCountKHR{};
+    PFN_vkCmdDrawIndexedIndirectCountKHR vkCmdDrawIndexedIndirectCountKHR{};
     PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{};
     PFN_vkCmdEndQuery vkCmdEndQuery{};
     PFN_vkCmdEndRenderPass vkCmdEndRenderPass{};
@@ -1021,17 +1023,27 @@ public:
                               first_instance);
     }
 
+    void DrawIndirect(VkBuffer src_buffer, VkDeviceSize src_offset, u32 draw_count,
+                      u32 stride) const noexcept {
+        dld->vkCmdDrawIndirect(handle, src_buffer, src_offset, draw_count, stride);
+    }
+
+    void DrawIndexedIndirect(VkBuffer src_buffer, VkDeviceSize src_offset, u32 draw_count,
+                             u32 stride) const noexcept {
+        dld->vkCmdDrawIndexedIndirect(handle, src_buffer, src_offset, draw_count, stride);
+    }
+
     void DrawIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset, VkBuffer count_buffer,
                            VkDeviceSize count_offset, u32 draw_count, u32 stride) const noexcept {
-        dld->vkCmdDrawIndirectCount(handle, src_buffer, src_offset, count_buffer, count_offset,
-                                    draw_count, stride);
+        dld->vkCmdDrawIndirectCountKHR(handle, src_buffer, src_offset, count_buffer, count_offset,
+                                       draw_count, stride);
     }
 
     void DrawIndexedIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset,
                                   VkBuffer count_buffer, VkDeviceSize count_offset, u32 draw_count,
                                   u32 stride) const noexcept {
-        dld->vkCmdDrawIndexedIndirectCount(handle, src_buffer, src_offset, count_buffer,
-                                           count_offset, draw_count, stride);
+        dld->vkCmdDrawIndexedIndirectCountKHR(handle, src_buffer, src_offset, count_buffer,
+                                              count_offset, draw_count, stride);
     }
 
     void ClearAttachments(Span<VkClearAttachment> attachments,