30 files changed, 1264 insertions, 314 deletions
diff --git a/externals/sirit b/externals/sirit
-Subproject a62c5bbc100a5e5a31ea0ccc4a78d8fa6a4167c
+Subproject eefca56afd49379bdebc97ded8b480839f93088
diff --git a/src/core/hle/kernel/readable_event.cpp b/src/core/hle/kernel/readable_event.cpp
index 00860fcbd..ef5e19e63 100644
--- a/src/core/hle/kernel/readable_event.cpp
+++ b/src/core/hle/kernel/readable_event.cpp
@@ -38,7 +38,7 @@ void ReadableEvent::Clear() {
 
 ResultCode ReadableEvent::Reset() {
     if (!is_signaled) {
-        LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
+        LOG_TRACE(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
                   GetObjectId(), GetTypeName(), GetName());
         return ERR_INVALID_STATE;
     }
diff --git a/src/core/settings.h b/src/core/settings.h
index 78eb33737..36cd66fd4 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -474,6 +474,7 @@ struct Values {
     bool reporting_services;
     bool quest_flag;
     bool disable_cpu_opt;
+    bool disable_macro_jit;
 
     // BCAT
     std::string bcat_backend;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index d6ee82836..2bf8d68ce 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -25,6 +25,12 @@ add_library(video_core STATIC
     engines/shader_bytecode.h
     engines/shader_header.h
     engines/shader_type.h
+    macro/macro.cpp
+    macro/macro.h
+    macro/macro_interpreter.cpp
+    macro/macro_interpreter.h
+    macro/macro_jit_x64.cpp
+    macro/macro_jit_x64.h
     fence_manager.h
     gpu.cpp
     gpu.h
@@ -36,8 +42,6 @@ add_library(video_core STATIC
     gpu_thread.h
     guest_driver.cpp
     guest_driver.h
-    macro_interpreter.cpp
-    macro_interpreter.h
     memory_manager.cpp
     memory_manager.h
     morton.cpp
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 13ef2e42d..e46b153f9 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
 Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                      MemoryManager& memory_manager)
     : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
-      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+      macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {
     dirty.flags.flip();
-
     InitializeRegisterDefaults();
 }
 
@@ -120,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
     mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
 }
 
-void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
+void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
     // Reset the current macro.
     executing_macro = 0;
 
@@ -129,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3
         ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
 
     // Execute the current macro.
-    macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters);
+    macro_engine->Execute(macro_positions[entry], parameters);
     if (mme_draw.current_mode != MMEDrawMode::Undefined) {
         FlushMMEInlineDraw();
     }
@@ -165,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
 
         // Call the macro when there are no more parameters in the command buffer
         if (is_last_call) {
-            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+            CallMacroMethod(executing_macro, macro_params);
             macro_params.clear();
         }
         return;
@@ -201,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
         break;
     }
     case MAXWELL3D_REG_INDEX(macros.data): {
-        ProcessMacroUpload(arg);
+        macro_engine->AddCode(regs.macros.upload_address, arg);
         break;
     }
     case MAXWELL3D_REG_INDEX(macros.bind): {
@@ -310,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
 
         // Call the macro when there are no more parameters in the command buffer
         if (amount == methods_pending) {
-            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+            CallMacroMethod(executing_macro, macro_params);
             macro_params.clear();
         }
         return;
@@ -424,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
 }
 
 void Maxwell3D::ProcessMacroUpload(u32 data) {
-    ASSERT_MSG(regs.macros.upload_address < macro_memory.size(),
-               "upload_address exceeded macro_memory size!");
-    macro_memory[regs.macros.upload_address++] = data;
+    macro_engine->AddCode(regs.macros.upload_address++, data);
 }
 
 void Maxwell3D::ProcessMacroBind(u32 data) {
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 05dd6b39b..b827b112f 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -23,7 +23,7 @@
 #include "video_core/engines/engine_upload.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/gpu.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro.h"
 #include "video_core/textures/texture.h"
 
 namespace Core {
@@ -1411,15 +1411,6 @@ public:
 
     const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;
 
-    /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
-    /// we've seen used.
-    using MacroMemory = std::array<u32, 0x40000>;
-
-    /// Gets a reference to macro memory.
-    const MacroMemory& GetMacroMemory() const {
-        return macro_memory;
-    }
-
     bool ShouldExecute() const {
         return execute_on;
     }
@@ -1468,16 +1459,13 @@ private:
 
     std::array<bool, Regs::NUM_REGS> mme_inline{};
 
-    /// Memory for macro code
-    MacroMemory macro_memory;
-
     /// Macro method that is currently being executed / being fed parameters.
     u32 executing_macro = 0;
     /// Parameters that have been submitted to the macro call so far.
     std::vector<u32> macro_params;
 
     /// Interpreter for the macro codes uploaded to the GPU.
-    MacroInterpreter macro_interpreter;
+    std::unique_ptr<MacroEngine> macro_engine;
 
     static constexpr u32 null_cb_data = 0xFFFFFFFF;
     struct {
@@ -1506,7 +1494,7 @@ private:
      * @param num_parameters Number of arguments
      * @param parameters Arguments to the method call
      */
-    void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters);
+    void CallMacroMethod(u32 method, const std::vector<u32>& parameters);
 
     /// Handles writes to the macro uploading register.
     void ProcessMacroUpload(u32 data);
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
new file mode 100644
index 000000000..89077a2d8
--- /dev/null
+++ b/src/video_core/macro/macro.cpp
@@ -0,0 +1,45 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/settings.h"
+#include "video_core/macro/macro.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+namespace Tegra {
+
+void MacroEngine::AddCode(u32 method, u32 data) {
+    uploaded_macro_code[method].push_back(data);
+}
+
+void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
+    auto compiled_macro = macro_cache.find(method);
+    if (compiled_macro != macro_cache.end()) {
+        compiled_macro->second->Execute(parameters, method);
+    } else {
+        // Macro not compiled, check if it's uploaded and if so, compile it
+        auto macro_code = uploaded_macro_code.find(method);
+        if (macro_code == uploaded_macro_code.end()) {
+            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+            return;
+        }
+        macro_cache[method] = Compile(macro_code->second);
+        macro_cache[method]->Execute(parameters, method);
+    }
+}
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
+    if (Settings::values.disable_macro_jit) {
+        return std::make_unique<MacroInterpreter>(maxwell3d);
+    }
+#ifdef ARCHITECTURE_x86_64
+    return std::make_unique<MacroJITx64>(maxwell3d);
+#else
+    return std::make_unique<MacroInterpreter>(maxwell3d);
+#endif
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
new file mode 100644
index 000000000..b76ed891f
--- /dev/null
+++ b/src/video_core/macro/macro.h
@@ -0,0 +1,128 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+namespace Engines {
+class Maxwell3D;
+}
+namespace Macro {
+constexpr std::size_t NUM_MACRO_REGISTERS = 8;
+enum class Operation : u32 {
+    ALU = 0,
+    AddImmediate = 1,
+    ExtractInsert = 2,
+    ExtractShiftLeftImmediate = 3,
+    ExtractShiftLeftRegister = 4,
+    Read = 5,
+    Unused = 6, // This operation doesn't seem to be a valid encoding.
+    Branch = 7,
+};
+
+enum class ALUOperation : u32 {
+    Add = 0,
+    AddWithCarry = 1,
+    Subtract = 2,
+    SubtractWithBorrow = 3,
+    // Operations 4-7 don't seem to be valid encodings.
+    Xor = 8,
+    Or = 9,
+    And = 10,
+    AndNot = 11,
+    Nand = 12
+};
+
+enum class ResultOperation : u32 {
+    IgnoreAndFetch = 0,
+    Move = 1,
+    MoveAndSetMethod = 2,
+    FetchAndSend = 3,
+    MoveAndSend = 4,
+    FetchAndSetMethod = 5,
+    MoveAndSetMethodFetchAndSend = 6,
+    MoveAndSetMethodSend = 7
+};
+
+enum class BranchCondition : u32 {
+    Zero = 0,
+    NotZero = 1,
+};
+
+union Opcode {
+    u32 raw;
+    BitField<0, 3, Operation> operation;
+    BitField<4, 3, ResultOperation> result_operation;
+    BitField<4, 1, BranchCondition> branch_condition;
+    // If set on a branch, then the branch doesn't have a delay slot.
+    BitField<5, 1, u32> branch_annul;
+    BitField<7, 1, u32> is_exit;
+    BitField<8, 3, u32> dst;
+    BitField<11, 3, u32> src_a;
+    BitField<14, 3, u32> src_b;
+    // The signed immediate overlaps the second source operand and the alu operation.
+    BitField<14, 18, s32> immediate;
+
+    BitField<17, 5, ALUOperation> alu_operation;
+
+    // Bitfield instructions data
+    BitField<17, 5, u32> bf_src_bit;
+    BitField<22, 5, u32> bf_size;
+    BitField<27, 5, u32> bf_dst_bit;
+
+    u32 GetBitfieldMask() const {
+        return (1 << bf_size) - 1;
+    }
+
+    s32 GetBranchTarget() const {
+        return static_cast<s32>(immediate * sizeof(u32));
+    }
+};
+
+union MethodAddress {
+    u32 raw;
+    BitField<0, 12, u32> address;
+    BitField<12, 6, u32> increment;
+};
+
+} // namespace Macro
+
+class CachedMacro {
+public:
+    virtual ~CachedMacro() = default;
+    /**
+     * Executes the macro code with the specified input parameters.
+     * @param code The macro byte code to execute
+     * @param parameters The parameters of the macro
+     */
+    virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
+};
+
+class MacroEngine {
+public:
+    virtual ~MacroEngine() = default;
+
+    // Store the uploaded macro code to compile them when they're called.
+    void AddCode(u32 method, u32 data);
+
+    // Compiles the macro if its not in the cache, and executes the compiled macro
+    void Execute(u32 method, const std::vector<u32>& parameters);
+
+protected:
+    virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
+
+private:
+    std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
+    std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+};
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
+
+} // namespace Tegra
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
index 947364928..5edff27aa 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -1,4 +1,4 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
@@ -6,109 +6,46 @@
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro_interpreter.h"
 
 MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
 
 namespace Tegra {
-namespace {
-enum class Operation : u32 {
-    ALU = 0,
-    AddImmediate = 1,
-    ExtractInsert = 2,
-    ExtractShiftLeftImmediate = 3,
-    ExtractShiftLeftRegister = 4,
-    Read = 5,
-    Unused = 6, // This operation doesn't seem to be a valid encoding.
-    Branch = 7,
-};
-} // Anonymous namespace
-
-enum class MacroInterpreter::ALUOperation : u32 {
-    Add = 0,
-    AddWithCarry = 1,
-    Subtract = 2,
-    SubtractWithBorrow = 3,
-    // Operations 4-7 don't seem to be valid encodings.
-    Xor = 8,
-    Or = 9,
-    And = 10,
-    AndNot = 11,
-    Nand = 12
-};
-
-enum class MacroInterpreter::ResultOperation : u32 {
-    IgnoreAndFetch = 0,
-    Move = 1,
-    MoveAndSetMethod = 2,
-    FetchAndSend = 3,
-    MoveAndSend = 4,
-    FetchAndSetMethod = 5,
-    MoveAndSetMethodFetchAndSend = 6,
-    MoveAndSetMethodSend = 7
-};
-
-enum class MacroInterpreter::BranchCondition : u32 {
-    Zero = 0,
-    NotZero = 1,
-};
-
-union MacroInterpreter::Opcode {
-    u32 raw;
-    BitField<0, 3, Operation> operation;
-    BitField<4, 3, ResultOperation> result_operation;
-    BitField<4, 1, BranchCondition> branch_condition;
-    // If set on a branch, then the branch doesn't have a delay slot.
-    BitField<5, 1, u32> branch_annul;
-    BitField<7, 1, u32> is_exit;
-    BitField<8, 3, u32> dst;
-    BitField<11, 3, u32> src_a;
-    BitField<14, 3, u32> src_b;
-    // The signed immediate overlaps the second source operand and the alu operation.
-    BitField<14, 18, s32> immediate;
-
-    BitField<17, 5, ALUOperation> alu_operation;
-
-    // Bitfield instructions data
-    BitField<17, 5, u32> bf_src_bit;
-    BitField<22, 5, u32> bf_size;
-    BitField<27, 5, u32> bf_dst_bit;
-
-    u32 GetBitfieldMask() const {
-        return (1 << bf_size) - 1;
-    }
+MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
 
-    s32 GetBranchTarget() const {
-        return static_cast<s32>(immediate * sizeof(u32));
-    }
-};
+std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
+}
 
-MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
+                                           const std::vector<u32>& code)
+    : maxwell3d(maxwell3d), code(code) {}
 
-void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) {
+void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
     MICROPROFILE_SCOPE(MacroInterp);
     Reset();
 
     registers[1] = parameters[0];
+    num_parameters = parameters.size();
 
     if (num_parameters > parameters_capacity) {
         parameters_capacity = num_parameters;
         this->parameters = std::make_unique<u32[]>(num_parameters);
     }
-    std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32));
+    std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
     this->num_parameters = num_parameters;
 
     // Execute the code until we hit an exit condition.
     bool keep_executing = true;
     while (keep_executing) {
-        keep_executing = Step(offset, false);
+        keep_executing = Step(false);
     }
 
     // Assert the the macro used all the input parameters
     ASSERT(next_parameter_index == num_parameters);
 }
 
-void MacroInterpreter::Reset() {
+void MacroInterpreterImpl::Reset() {
     registers = {};
     pc = 0;
     delayed_pc = {};
@@ -120,10 +57,10 @@ void MacroInterpreter::Reset() {
     carry_flag = false;
 }
 
-bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
+bool MacroInterpreterImpl::Step(bool is_delay_slot) {
     u32 base_address = pc;
 
-    Opcode opcode = GetOpcode(offset);
+    Macro::Opcode opcode = GetOpcode();
     pc += 4;
 
     // Update the program counter if we were delayed
@@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
     }
 
     switch (opcode.operation) {
-    case Operation::ALU: {
+    case Macro::Operation::ALU: {
         u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
                                   GetRegister(opcode.src_b));
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::AddImmediate: {
+    case Macro::Operation::AddImmediate: {
         ProcessResult(opcode.result_operation, opcode.dst,
                       GetRegister(opcode.src_a) + opcode.immediate);
         break;
     }
-    case Operation::ExtractInsert: {
+    case Macro::Operation::ExtractInsert: {
         u32 dst = GetRegister(opcode.src_a);
         u32 src = GetRegister(opcode.src_b);
 
@@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
         ProcessResult(opcode.result_operation, opcode.dst, dst);
         break;
     }
-    case Operation::ExtractShiftLeftImmediate: {
+    case Macro::Operation::ExtractShiftLeftImmediate: {
         u32 dst = GetRegister(opcode.src_a);
         u32 src = GetRegister(opcode.src_b);
 
@@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::ExtractShiftLeftRegister: {
+    case Macro::Operation::ExtractShiftLeftRegister: {
         u32 dst = GetRegister(opcode.src_a);
         u32 src = GetRegister(opcode.src_b);
 
@@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::Read: {
+    case Macro::Operation::Read: {
         u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::Branch: {
+    case Macro::Operation::Branch: {
         ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
         u32 value = GetRegister(opcode.src_a);
         bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
@@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
 
             delayed_pc = base_address + opcode.GetBranchTarget();
             // Execute one more instruction due to the delay slot.
-            return Step(offset, true);
+            return Step(true);
         }
         break;
     }
@@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
     // cause an exit if it's executed inside a delay slot.
     if (opcode.is_exit && !is_delay_slot) {
         // Exit has a delay slot, execute the next instruction
-        Step(offset, true);
+        Step(true);
         return false;
     }
 
     return true;
 }
 
-MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const {
-    const auto& macro_memory{maxwell3d.GetMacroMemory()};
-    ASSERT((pc % sizeof(u32)) == 0);
-    ASSERT((pc + offset) < macro_memory.size() * sizeof(u32));
-    return {macro_memory[offset + pc / sizeof(u32)]};
-}
-
-u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) {
+u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
     switch (operation) {
-    case ALUOperation::Add: {
+    case Macro::ALUOperation::Add: {
         const u64 result{static_cast<u64>(src_a) + src_b};
         carry_flag = result > 0xffffffff;
         return static_cast<u32>(result);
     }
-    case ALUOperation::AddWithCarry: {
+    case Macro::ALUOperation::AddWithCarry: {
         const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
         carry_flag = result > 0xffffffff;
         return static_cast<u32>(result);
     }
-    case ALUOperation::Subtract: {
+    case Macro::ALUOperation::Subtract: {
         const u64 result{static_cast<u64>(src_a) - src_b};
         carry_flag = result < 0x100000000;
         return static_cast<u32>(result);
     }
-    case ALUOperation::SubtractWithBorrow: {
+    case Macro::ALUOperation::SubtractWithBorrow: {
         const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
         carry_flag = result < 0x100000000;
         return static_cast<u32>(result);
     }
-    case ALUOperation::Xor:
+    case Macro::ALUOperation::Xor:
         return src_a ^ src_b;
-    case ALUOperation::Or:
+    case Macro::ALUOperation::Or:
         return src_a | src_b;
-    case ALUOperation::And:
+    case Macro::ALUOperation::And:
         return src_a & src_b;
-    case ALUOperation::AndNot:
+    case Macro::ALUOperation::AndNot:
         return src_a & ~src_b;
-    case ALUOperation::Nand:
+    case Macro::ALUOperation::Nand:
         return ~(src_a & src_b);
 
     default:
@@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b)
     }
 }
 
-void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) {
+void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
     switch (operation) {
-    case ResultOperation::IgnoreAndFetch:
+    case Macro::ResultOperation::IgnoreAndFetch:
         // Fetch parameter and ignore result.
         SetRegister(reg, FetchParameter());
         break;
-    case ResultOperation::Move:
+    case Macro::ResultOperation::Move:
         // Move result.
         SetRegister(reg, result);
         break;
-    case ResultOperation::MoveAndSetMethod:
+    case Macro::ResultOperation::MoveAndSetMethod:
         // Move result and use as Method Address.
         SetRegister(reg, result);
         SetMethodAddress(result);
         break;
-    case ResultOperation::FetchAndSend:
+    case Macro::ResultOperation::FetchAndSend:
         // Fetch parameter and send result.
         SetRegister(reg, FetchParameter());
         Send(result);
         break;
-    case ResultOperation::MoveAndSend:
+    case Macro::ResultOperation::MoveAndSend:
         // Move and send result.
         SetRegister(reg, result);
         Send(result);
         break;
-    case ResultOperation::FetchAndSetMethod:
+    case Macro::ResultOperation::FetchAndSetMethod:
         // Fetch parameter and use result as Method Address.
         SetRegister(reg, FetchParameter());
         SetMethodAddress(result);
         break;
-    case ResultOperation::MoveAndSetMethodFetchAndSend:
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
         // Move result and use as Method Address, then fetch and send parameter.
         SetRegister(reg, result);
         SetMethodAddress(result);
         Send(FetchParameter());
         break;
-    case ResultOperation::MoveAndSetMethodSend:
+    case Macro::ResultOperation::MoveAndSetMethodSend:
         // Move result and use as Method Address, then send bits 12:17 of result.
         SetRegister(reg, result);
         SetMethodAddress(result);
@@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res
     }
 }
 
-u32 MacroInterpreter::FetchParameter() {
-    ASSERT(next_parameter_index < num_parameters);
-    return parameters[next_parameter_index++];
+bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
+    switch (cond) {
+    case Macro::BranchCondition::Zero:
+        return value == 0;
+    case Macro::BranchCondition::NotZero:
+        return value != 0;
+    }
+    UNREACHABLE();
+    return true;
 }
 
-u32 MacroInterpreter::GetRegister(u32 register_id) const {
+Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
+    ASSERT((pc % sizeof(u32)) == 0);
+    ASSERT(pc < code.size() * sizeof(u32));
+    return {code[pc / sizeof(u32)]};
+}
+
+u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
     return registers.at(register_id);
 }
 
-void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
+void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
     // Register 0 is hardwired as the zero register.
     // Ensure no writes to it actually occur.
     if (register_id == 0) {
@@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
     registers.at(register_id) = value;
 }
 
-void MacroInterpreter::SetMethodAddress(u32 address) {
+void MacroInterpreterImpl::SetMethodAddress(u32 address) {
     method_address.raw = address;
 }
 
-void MacroInterpreter::Send(u32 value) {
+void MacroInterpreterImpl::Send(u32 value) {
     maxwell3d.CallMethodFromMME(method_address.address, value);
     // Increment the method address by the method increment.
     method_address.address.Assign(method_address.address.Value() +
                                   method_address.increment.Value());
 }
 
-u32 MacroInterpreter::Read(u32 method) const {
+u32 MacroInterpreterImpl::Read(u32 method) const {
     return maxwell3d.GetRegisterValue(method);
 }
 
-bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const {
-    switch (cond) {
-    case BranchCondition::Zero:
-        return value == 0;
-    case BranchCondition::NotZero:
-        return value != 0;
-    }
-    UNREACHABLE();
-    return true;
+u32 MacroInterpreterImpl::FetchParameter() {
+    ASSERT(next_parameter_index < num_parameters);
+    return parameters[next_parameter_index++];
 }
 
 } // namespace Tegra
diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h
index 631146d89..90217fc89 100644
--- a/src/video_core/macro_interpreter.h
+++ b/src/video_core/macro/macro_interpreter.h
@@ -1,44 +1,37 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
-
 #include <array>
 #include <optional>
-
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_types.h"
+#include "video_core/macro/macro.h"
 
 namespace Tegra {
 namespace Engines {
 class Maxwell3D;
 }
 
-class MacroInterpreter final {
+class MacroInterpreter final : public MacroEngine {
 public:
     explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);
 
-    /**
-     * Executes the macro code with the specified input parameters.
-     * @param offset Offset to start execution at.
-     * @param parameters The parameters of the macro.
-     */
-    void Execute(u32 offset, std::size_t num_parameters, const u32* parameters);
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
 
 private:
-    enum class ALUOperation : u32;
-    enum class BranchCondition : u32;
-    enum class ResultOperation : u32;
-
-    union Opcode;
+    Engines::Maxwell3D& maxwell3d;
+};
 
-    union MethodAddress {
-        u32 raw;
-        BitField<0, 12, u32> address;
-        BitField<12, 6, u32> increment;
-    };
+class MacroInterpreterImpl : public CachedMacro {
+public:
+    MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
 
+private:
     /// Resets the execution engine state, zeroing registers, etc.
     void Reset();
 
@@ -49,20 +42,20 @@ private:
      * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
      * previous instruction.
      */
-    bool Step(u32 offset, bool is_delay_slot);
+    bool Step(bool is_delay_slot);
 
     /// Calculates the result of an ALU operation. src_a OP src_b;
-    u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b);
+    u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
 
     /// Performs the result operation on the input result and stores it in the specified register
     /// (if necessary).
-    void ProcessResult(ResultOperation operation, u32 reg, u32 result);
+    void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
 
     /// Evaluates the branch condition and returns whether the branch should be taken or not.
-    bool EvaluateBranchCondition(BranchCondition cond, u32 value) const;
+    bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
 
     /// Reads an opcode at the current program counter location.
-    Opcode GetOpcode(u32 offset) const;
+    Macro::Opcode GetOpcode() const;
 
     /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
     u32 GetRegister(u32 register_id) const;
@@ -89,13 +82,11 @@ private:
     /// Program counter to execute at after the delay slot is executed.
     std::optional<u32> delayed_pc;
 
-    static constexpr std::size_t NumMacroRegisters = 8;
-
     /// General purpose macro registers.
-    std::array<u32, NumMacroRegisters> registers = {};
+    std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
 
     /// Method address to use for the next Send instruction.
-    MethodAddress method_address = {};
+    Macro::MethodAddress method_address = {};
 
     /// Input parameters of the current macro.
     std::unique_ptr<u32[]> parameters;
@@ -105,5 +96,7 @@ private:
     u32 next_parameter_index = 0;
 
     bool carry_flag = false;
+    const std::vector<u32>& code;
 };
+
 } // namespace Tegra
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
new file mode 100644
index 000000000..11c1cc3be
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -0,0 +1,640 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/microprofile.h"
+#include "common/x64/xbyak_util.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
+MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
+
+namespace Tegra {
+static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9;
+static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10;
+static const Xbyak::Reg64 STATE = Xbyak::util::r11;
+static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12;
+static const Xbyak::Reg32 RESULT = Xbyak::util::r13d;
+static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13;
+static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
+static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14;
+static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
+
+static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
+    PARAMETERS,
+    REGISTERS,
+    STATE,
+    NEXT_PARAMETER,
+    RESULT,
+    METHOD_ADDRESS,
+    BRANCH_HOLDER,
+});
+
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+
+std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
+}
+
+MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
+    : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) {
+    Compile();
+}
+
+MacroJITx64Impl::~MacroJITx64Impl() = default;
+
+void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
+    MICROPROFILE_SCOPE(MacroJitExecute);
+    ASSERT_OR_EXECUTE(program != nullptr, { return; });
+    JITState state{};
+    state.maxwell3d = &maxwell3d;
+    state.registers = {};
+    state.parameters = parameters.data();
+    program(&state);
+}
+
+void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
+    const bool is_a_zero = opcode.src_a == 0;
+    const bool is_b_zero = opcode.src_b == 0;
+    const bool valid_operation = !is_a_zero && !is_b_zero;
+    const bool is_move_operation = !is_a_zero && is_b_zero;
+    const bool has_zero_register = is_a_zero || is_b_zero;
+
+    Xbyak::Reg64 src_a;
+    Xbyak::Reg32 src_b;
+
+    if (!optimizer.zero_reg_skip) {
+        src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
+        src_b = Compile_GetRegister(opcode.src_b, ebx);
+    } else {
+        if (!is_a_zero) {
+            src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
+        }
+        if (!is_b_zero) {
+            src_b = Compile_GetRegister(opcode.src_b, ebx);
+        }
+    }
+    Xbyak::Label skip_carry{};
+
+    bool has_emitted = false;
+
+    switch (opcode.alu_operation) {
+    case Macro::ALUOperation::Add:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                add(src_a, src_b);
+            }
+        } else {
+            add(src_a, src_b);
+        }
+
+        if (!optimizer.can_skip_carry) {
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::AddWithCarry:
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
+        adc(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        break;
+    case Macro::ALUOperation::Subtract:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                sub(src_a, src_b);
+                has_emitted = true;
+            }
+        } else {
+            sub(src_a, src_b);
+            has_emitted = true;
+        }
+        if (!optimizer.can_skip_carry && has_emitted) {
+            setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        }
+        break;
+    case Macro::ALUOperation::SubtractWithBorrow:
+        bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
+        sbb(src_a, src_b);
+        setc(byte[STATE + offsetof(JITState, carry_flag)]);
+        break;
+    case Macro::ALUOperation::Xor:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                xor_(src_a, src_b);
+            }
+        } else {
+            xor_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::Or:
+        if (optimizer.zero_reg_skip) {
+            if (valid_operation) {
+                or_(src_a, src_b);
+            }
+        } else {
+            or_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::And:
+        if (optimizer.zero_reg_skip) {
+            if (!has_zero_register) {
+                and_(src_a, src_b);
+            }
+        } else {
+            and_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::AndNot:
+        if (optimizer.zero_reg_skip) {
+            if (!is_a_zero) {
+                not_(src_b);
+                and_(src_a, src_b);
+            }
+        } else {
+            not_(src_b);
+            and_(src_a, src_b);
+        }
+        break;
+    case Macro::ALUOperation::Nand:
+        if (optimizer.zero_reg_skip) {
+            if (!is_a_zero) {
+                and_(src_a, src_b);
+                not_(src_a);
+            }
+        } else {
+            and_(src_a, src_b);
+            not_(src_a);
+        }
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
+                          static_cast<std::size_t>(opcode.alu_operation.Value()));
+        break;
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
+    if (optimizer.skip_dummy_addimmediate) {
+        // Games tend to use this as an exit instruction placeholder. It's to encode an instruction
+        // without doing anything. In our case we can just not emit anything.
+        if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
+            return;
+        }
+    }
+    // Check for redundant moves
+    if (optimizer.optimize_for_method_move &&
+        opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+        if (next_opcode.has_value()) {
+            const auto next = *next_opcode;
+            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+                return;
+            }
+        }
+    }
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        if (opcode.immediate > 2) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
+    auto dst = Compile_GetRegister(opcode.src_a, RESULT);
+    auto src = Compile_GetRegister(opcode.src_b, eax);
+
+    if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) {
+        shr(src, opcode.bf_src_bit);
+    } else if (opcode.bf_src_bit == 31) {
+        xor_(src, src);
+    }
+    // Don't bother masking the whole register since we're using a 32 bit register
+    if (opcode.bf_size != 31 && opcode.bf_size != 0) {
+        and_(src, opcode.GetBitfieldMask());
+    } else if (opcode.bf_size == 0) {
+        xor_(src, src);
+    }
+    if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) {
+        shl(src, opcode.bf_dst_bit);
+    } else if (opcode.bf_dst_bit == 31) {
+        xor_(src, src);
+    }
+
+    const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
+    if (mask != 0xffffffff) {
+        and_(dst, mask);
+    }
+    or_(dst, src);
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
+    auto dst = Compile_GetRegister(opcode.src_a, eax);
+    auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    shr(src, al);
+    if (opcode.bf_size != 0 && opcode.bf_size != 31) {
+        and_(src, opcode.GetBitfieldMask());
+    } else if (opcode.bf_size == 0) {
+        xor_(src, src);
+    }
+
+    if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) {
+        shl(src, opcode.bf_dst_bit);
+    } else if (opcode.bf_dst_bit == 31) {
+        xor_(src, src);
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
+    auto dst = Compile_GetRegister(opcode.src_a, eax);
+    auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    if (opcode.bf_src_bit != 0) {
+        shr(src, opcode.bf_src_bit);
+    }
+
+    if (opcode.bf_size != 31) {
+        and_(src, opcode.GetBitfieldMask());
+    }
+    shl(src, al);
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) {
+    return maxwell3d->GetRegisterValue(method);
+}
+
+static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
+    maxwell3d->CallMethodFromMME(method_address.address, value);
+}
+
+void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        if (opcode.immediate > 2) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    mov(Common::X64::ABI_PARAM1, qword[STATE]);
+    mov(Common::X64::ABI_PARAM2, RESULT);
+    Common::X64::CallFarFunction(*this, &Read);
+    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    mov(RESULT, Common::X64::ABI_RETURN.cvt32());
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
+    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    mov(Common::X64::ABI_PARAM1, qword[STATE]);
+    mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
+    mov(Common::X64::ABI_PARAM3, value);
+    Common::X64::CallFarFunction(*this, &Send);
+    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+
+    Xbyak::Label dont_process{};
+    // Get increment
+    test(METHOD_ADDRESS, 0x3f000);
+    // If zero, method address doesn't update
+    je(dont_process);
+
+    mov(ecx, METHOD_ADDRESS);
+    and_(METHOD_ADDRESS, 0xfff);
+    shr(ecx, 12);
+    and_(ecx, 0x3f);
+    lea(eax, ptr[rcx + METHOD_ADDRESS_64]);
+    sal(ecx, 12);
+    or_(eax, ecx);
+
+    mov(METHOD_ADDRESS, eax);
+
+    L(dont_process);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
+    ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
+    const s32 jump_address =
+        static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
+
+    Xbyak::Label end;
+    auto value = Compile_GetRegister(opcode.src_a, eax);
+    test(value, value);
+    if (optimizer.has_delayed_pc) {
+        switch (opcode.branch_condition) {
+        case Macro::BranchCondition::Zero:
+            jne(end, T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            je(end, T_NEAR);
+            break;
+        }
+
+        if (opcode.branch_annul) {
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(labels[jump_address], T_NEAR);
+        } else {
+            Xbyak::Label handle_post_exit{};
+            Xbyak::Label skip{};
+            jmp(skip, T_NEAR);
+            if (opcode.is_exit) {
+                L(handle_post_exit);
+                // Execute 1 instruction
+                mov(BRANCH_HOLDER, end_of_code);
+                // Jump to next instruction to skip delay slot check
+                jmp(labels[jump_address], T_NEAR);
+            } else {
+                L(handle_post_exit);
+                xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+                jmp(labels[jump_address], T_NEAR);
+            }
+            L(skip);
+            mov(BRANCH_HOLDER, handle_post_exit);
+            jmp(delay_skip[pc], T_NEAR);
+        }
+    } else {
+        switch (opcode.branch_condition) {
+        case Macro::BranchCondition::Zero:
+            je(labels[jump_address], T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            jne(labels[jump_address], T_NEAR);
+            break;
+        }
+    }
+
+    L(end);
+}
+
+void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
+    optimizer.can_skip_carry = true;
+    optimizer.has_delayed_pc = false;
+    for (auto raw_op : code) {
+        Macro::Opcode op{};
+        op.raw = raw_op;
+
+        if (op.operation == Macro::Operation::ALU) {
+            // Scan for any ALU operations which actually use the carry flag, if they don't exist in
+            // our current code we can skip emitting the carry flag handling operations
+            if (op.alu_operation == Macro::ALUOperation::AddWithCarry ||
+                op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) {
+                optimizer.can_skip_carry = false;
+            }
+        }
+
+        if (op.operation == Macro::Operation::Branch) {
+            if (!op.branch_annul) {
+                optimizer.has_delayed_pc = true;
+            }
+        }
+    }
+}
+
+void MacroJITx64Impl::Compile() {
+    MICROPROFILE_SCOPE(MacroJitCompile);
+    bool keep_executing = true;
+    labels.fill(Xbyak::Label());
+
+    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    // JIT state
+    mov(STATE, Common::X64::ABI_PARAM1);
+    mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 +
+                          static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]);
+    mov(REGISTERS, Common::X64::ABI_PARAM1);
+    add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers)));
+    xor_(RESULT, RESULT);
+    xor_(METHOD_ADDRESS, METHOD_ADDRESS);
+    xor_(NEXT_PARAMETER, NEXT_PARAMETER);
+    xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+
+    mov(dword[REGISTERS + 4], Compile_FetchParameter());
+
+    // Track get register for zero registers and mark it as no-op
+    optimizer.zero_reg_skip = true;
+
+    // AddImmediate tends to be used as a NOP instruction, if we detect this we can
+    // completely skip the entire code path and no emit anything
+    optimizer.skip_dummy_addimmediate = true;
+
+    // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
+    // one if our register isn't "dirty"
+    optimizer.optimize_for_method_move = true;
+
+    // Check to see if we can skip emitting certain instructions
+    Optimizer_ScanFlags();
+
+    const u32 op_count = static_cast<u32>(code.size());
+    for (u32 i = 0; i < op_count; i++) {
+        if (i < op_count - 1) {
+            pc = i + 1;
+            next_opcode = GetOpCode();
+        } else {
+            next_opcode = {};
+        }
+        pc = i;
+        Compile_NextInstruction();
+    }
+
+    L(end_of_code);
+
+    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    ret();
+    ready();
+    program = getCode<ProgramType>();
+}
+
+bool MacroJITx64Impl::Compile_NextInstruction() {
+    const auto opcode = GetOpCode();
+    if (labels[pc].getAddress()) {
+        return false;
+    }
+
+    L(labels[pc]);
+
+    switch (opcode.operation) {
+    case Macro::Operation::ALU:
+        Compile_ALU(opcode);
+        break;
+    case Macro::Operation::AddImmediate:
+        Compile_AddImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractInsert:
+        Compile_ExtractInsert(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftImmediate:
+        Compile_ExtractShiftLeftImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftRegister:
+        Compile_ExtractShiftLeftRegister(opcode);
+        break;
+    case Macro::Operation::Read:
+        Compile_Read(opcode);
+        break;
+    case Macro::Operation::Branch:
+        Compile_Branch(opcode);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
+        break;
+    }
+
+    if (optimizer.has_delayed_pc) {
+        if (opcode.is_exit) {
+            mov(rax, end_of_code);
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            cmove(BRANCH_HOLDER, rax);
+            // Jump to next instruction to skip delay slot check
+            je(labels[pc + 1], T_NEAR);
+        } else {
+            // TODO(ogniK): Optimize delay slot branching
+            Xbyak::Label no_delay_slot{};
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            je(no_delay_slot, T_NEAR);
+            mov(rax, BRANCH_HOLDER);
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(rax);
+            L(no_delay_slot);
+        }
+        L(delay_skip[pc]);
+        if (opcode.is_exit) {
+            return false;
+        }
+    } else {
+        test(BRANCH_HOLDER, BRANCH_HOLDER);
+        jne(end_of_code, T_NEAR);
+        if (opcode.is_exit) {
+            inc(BRANCH_HOLDER);
+            return false;
+        }
+    }
+    return true;
+}
+
+Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
+    mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]);
+    inc(NEXT_PARAMETER);
+    return eax;
+}
+
+Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
+    if (index == 0) {
+        // Register 0 is always zero
+        xor_(dst, dst);
+    } else {
+        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
+    }
+
+    return dst;
+}
+
+Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) {
+    if (index == 0) {
+        // Register 0 is always zero
+        xor_(dst, dst);
+    } else {
+        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
+    }
+
+    return dst;
+}
+
+void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) {
+    Xbyak::Label zero{}, end{};
+    xor_(ecx, ecx);
+    shr(dst, 32);
+    setne(cl);
+    mov(dword[STATE + offsetof(JITState, carry_flag)], ecx);
+}
+
+void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
+    auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) {
+        // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
+        // register.
+        if (reg == 0) {
+            return;
+        }
+        mov(dword[REGISTERS + reg * sizeof(u32)], result);
+    };
+    auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); };
+
+    switch (operation) {
+    case Macro::ResultOperation::IgnoreAndFetch:
+        SetRegister(reg, Compile_FetchParameter());
+        break;
+    case Macro::ResultOperation::Move:
+        SetRegister(reg, RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethod:
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSend:
+        // Fetch parameter and send result.
+        SetRegister(reg, Compile_FetchParameter());
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSend:
+        // Move and send result.
+        SetRegister(reg, RESULT);
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSetMethod:
+        // Fetch parameter and use result as Method Address.
+        SetRegister(reg, Compile_FetchParameter());
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
+        // Move result and use as Method Address, then fetch and send parameter.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        Compile_Send(Compile_FetchParameter());
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodSend:
+        // Move result and use as Method Address, then send bits 12:17 of result.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        shr(RESULT, 12);
+        and_(RESULT, 0b111111);
+        Compile_Send(RESULT);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation));
+    }
+}
+
+Macro::Opcode MacroJITx64Impl::GetOpCode() const {
+    ASSERT(pc < code.size());
+    return {code[pc]};
+}
+
+std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
+    return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
+}
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
new file mode 100644
index 000000000..21ee157cf
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -0,0 +1,100 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <xbyak.h>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "common/x64/xbyak_abi.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games
+constexpr size_t MAX_CODE_SIZE = 0x10000;
+
+class MacroJITx64 final : public MacroEngine {
+public:
+    explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
+
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+};
+
+class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
+public:
+    MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+    ~MacroJITx64Impl();
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+    void Compile_ALU(Macro::Opcode opcode);
+    void Compile_AddImmediate(Macro::Opcode opcode);
+    void Compile_ExtractInsert(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
+    void Compile_Read(Macro::Opcode opcode);
+    void Compile_Branch(Macro::Opcode opcode);
+
+private:
+    void Optimizer_ScanFlags();
+
+    void Compile();
+    bool Compile_NextInstruction();
+
+    Xbyak::Reg32 Compile_FetchParameter();
+    Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
+    Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst);
+    void Compile_WriteCarry(Xbyak::Reg64 dst);
+
+    void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
+    void Compile_Send(Xbyak::Reg32 value);
+
+    Macro::Opcode GetOpCode() const;
+    std::bitset<32> PersistentCallerSavedRegs() const;
+
+    struct JITState {
+        Engines::Maxwell3D* maxwell3d{};
+        std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
+        const u32* parameters{};
+        u32 carry_flag{};
+    };
+    static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
+    using ProgramType = void (*)(JITState*);
+
+    struct OptimizerState {
+        bool can_skip_carry{};
+        bool has_delayed_pc{};
+        bool zero_reg_skip{};
+        bool skip_dummy_addimmediate{};
+        bool optimize_for_method_move{};
+    };
+    OptimizerState optimizer{};
+
+    std::optional<Macro::Opcode> next_opcode{};
+    ProgramType program{nullptr};
+
+    std::array<Xbyak::Label, MAX_CODE_SIZE> labels{};
+    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{};
+    Xbyak::Label end_of_code{};
+
+    bool is_delay_slot{};
+    u32 pc{};
+    std::optional<u32> delayed_pc;
+
+    const std::vector<u32>& code;
+    Engines::Maxwell3D& maxwell3d;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 8b424e2cb..890fc6c63 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -185,12 +185,20 @@ bool IsASTCSupported() {
 Device::Device()
     : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
-    const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+    const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
     const std::vector extensions = GetExtensions();
 
     const bool is_nvidia = vendor == "NVIDIA Corporation";
     const bool is_amd = vendor == "ATI Technologies Inc.";
 
+    bool disable_fast_buffer_sub_data = false;
+    if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
+        LOG_WARNING(
+            Render_OpenGL,
+            "Beta driver 443.24 is known to have issues. There might be performance issues.");
+        disable_fast_buffer_sub_data = true;
+    }
+
     uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
     shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
     max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
@@ -204,7 +212,7 @@ Device::Device()
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
-    has_fast_buffer_sub_data = is_nvidia;
+    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
     use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
                            GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
                            GLAD_GL_NV_transform_feedback2;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 502b95973..d6e30b321 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -626,7 +626,9 @@ private:
                 break;
             }
         }
-        if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) {
+
+        if (stage != ShaderType::Geometry &&
+            (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {
             if (ir.UsesLayer()) {
                 code.AddLine("int gl_Layer;");
             }
@@ -655,6 +657,16 @@ private:
         --code.scope;
         code.AddLine("}};");
         code.AddNewLine();
+
+        if (stage == ShaderType::Geometry) {
+            if (ir.UsesLayer()) {
+                code.AddLine("out int gl_Layer;");
+            }
+            if (ir.UsesViewportIndex()) {
+                code.AddLine("out int gl_ViewportIndex;");
+            }
+        }
+        code.AddNewLine();
     }
 
     void DeclareRegisters() {
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index e7952924a..6214fcbc3 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -751,11 +751,9 @@ void RendererOpenGL::RenderScreenshot() {
 }
 
 bool RendererOpenGL::Init() {
-    if (GLAD_GL_KHR_debug) {
+    if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
         glEnable(GL_DEBUG_OUTPUT);
-        if (Settings::values.renderer_debug) {
-            glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
-        }
+        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
         glDebugMessageCallback(DebugHandler, nullptr);
     }
 
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index 8e1b46277..281bf9ac3 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -53,8 +53,9 @@ vk::DescriptorSetLayout VKComputePipeline::CreateDescriptorSetLayout() const {
     };
     add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, entries.const_buffers.size());
     add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, entries.global_buffers.size());
-    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.texel_buffers.size());
+    add_bindings(VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, entries.uniform_texels.size());
     add_bindings(VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, entries.samplers.size());
+    add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, entries.storage_texels.size());
     add_bindings(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, entries.images.size());
 
     VkDescriptorSetLayoutCreateInfo ci;
diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
index 890fd52cf..9259b618d 100644
--- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp
@@ -42,6 +42,7 @@ vk::DescriptorPool* VKDescriptorPool::AllocateNewPool() {
         {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, num_sets * 60},
         {VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, num_sets * 64},
         {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, num_sets * 64},
+        {VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, num_sets * 64},
         {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, num_sets * 40}};
 
     VkDescriptorPoolCreateInfo ci;
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 65a1c6245..b8ccf164f 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -45,6 +45,7 @@ constexpr VkDescriptorType UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
 constexpr VkDescriptorType STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
 constexpr VkDescriptorType UNIFORM_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
 constexpr VkDescriptorType COMBINED_IMAGE_SAMPLER = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER;
 constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
 
 constexpr VideoCommon::Shader::CompilerSettings compiler_settings{
@@ -104,8 +105,9 @@ u32 FillDescriptorLayout(const ShaderEntries& entries,
     u32 binding = base_binding;
     AddBindings<UNIFORM_BUFFER>(bindings, binding, flags, entries.const_buffers);
     AddBindings<STORAGE_BUFFER>(bindings, binding, flags, entries.global_buffers);
-    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.texel_buffers);
+    AddBindings<UNIFORM_TEXEL_BUFFER>(bindings, binding, flags, entries.uniform_texels);
     AddBindings<COMBINED_IMAGE_SAMPLER>(bindings, binding, flags, entries.samplers);
+    AddBindings<STORAGE_TEXEL_BUFFER>(bindings, binding, flags, entries.storage_texels);
     AddBindings<STORAGE_IMAGE>(bindings, binding, flags, entries.images);
     return binding;
 }
@@ -377,16 +379,17 @@ void AddEntry(std::vector<VkDescriptorUpdateTemplateEntry>& template_entries, u3
         return;
     }
 
-    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER) {
-        // Nvidia has a bug where updating multiple uniform texels at once causes the driver to
-        // crash.
+    if constexpr (descriptor_type == UNIFORM_TEXEL_BUFFER ||
+                  descriptor_type == STORAGE_TEXEL_BUFFER) {
+        // Nvidia has a bug where updating multiple texels at once causes the driver to crash.
+        // Note: Fixed in driver Windows 443.24, Linux 440.66.15
         for (u32 i = 0; i < count; ++i) {
             VkDescriptorUpdateTemplateEntry& entry = template_entries.emplace_back();
             entry.dstBinding = binding + i;
             entry.dstArrayElement = 0;
             entry.descriptorCount = 1;
             entry.descriptorType = descriptor_type;
-            entry.offset = offset + i * entry_size;
+            entry.offset = static_cast<std::size_t>(offset + i * entry_size);
             entry.stride = entry_size;
         }
     } else if (count > 0) {
@@ -407,8 +410,9 @@ void FillDescriptorUpdateTemplateEntries(
     std::vector<VkDescriptorUpdateTemplateEntryKHR>& template_entries) {
     AddEntry<UNIFORM_BUFFER>(template_entries, offset, binding, entries.const_buffers);
     AddEntry<STORAGE_BUFFER>(template_entries, offset, binding, entries.global_buffers);
-    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.texel_buffers);
+    AddEntry<UNIFORM_TEXEL_BUFFER>(template_entries, offset, binding, entries.uniform_texels);
     AddEntry<COMBINED_IMAGE_SAMPLER>(template_entries, offset, binding, entries.samplers);
+    AddEntry<STORAGE_TEXEL_BUFFER>(template_entries, offset, binding, entries.storage_texels);
     AddEntry<STORAGE_IMAGE>(template_entries, offset, binding, entries.images);
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index a3d992ed3..d86c46412 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -468,8 +468,9 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
     const auto& entries = pipeline.GetEntries();
     SetupComputeConstBuffers(entries);
     SetupComputeGlobalBuffers(entries);
-    SetupComputeTexelBuffers(entries);
+    SetupComputeUniformTexels(entries);
     SetupComputeTextures(entries);
+    SetupComputeStorageTexels(entries);
     SetupComputeImages(entries);
 
     buffer_cache.Unmap();
@@ -787,8 +788,9 @@ void RasterizerVulkan::SetupShaderDescriptors(
         const auto& entries = shader->GetEntries();
         SetupGraphicsConstBuffers(entries, stage);
         SetupGraphicsGlobalBuffers(entries, stage);
-        SetupGraphicsTexelBuffers(entries, stage);
+        SetupGraphicsUniformTexels(entries, stage);
         SetupGraphicsTextures(entries, stage);
+        SetupGraphicsStorageTexels(entries, stage);
         SetupGraphicsImages(entries, stage);
     }
     texture_cache.GuardSamplers(false);
@@ -838,6 +840,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
     if (regs.tfb_enabled == 0) {
         return;
     }
+    if (!device.IsExtTransformFeedbackSupported()) {
+        LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
+        return;
+    }
 
     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
@@ -866,6 +872,9 @@ void RasterizerVulkan::EndTransformFeedback() {
     if (regs.tfb_enabled == 0) {
         return;
     }
+    if (!device.IsExtTransformFeedbackSupported()) {
+        return;
+    }
 
     scheduler.Record(
         [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
@@ -976,12 +985,12 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries,
     }
 }
 
-void RasterizerVulkan::SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) {
     MICROPROFILE_SCOPE(Vulkan_Textures);
     const auto& gpu = system.GPU().Maxwell3D();
-    for (const auto& entry : entries.texel_buffers) {
+    for (const auto& entry : entries.uniform_texels) {
         const auto image = GetTextureInfo(gpu, entry, stage).tic;
-        SetupTexelBuffer(image, entry);
+        SetupUniformTexels(image, entry);
     }
 }
 
@@ -996,6 +1005,15 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::
     }
 }
 
+void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) {
+    MICROPROFILE_SCOPE(Vulkan_Textures);
+    const auto& gpu = system.GPU().Maxwell3D();
+    for (const auto& entry : entries.storage_texels) {
+        const auto image = GetTextureInfo(gpu, entry, stage).tic;
+        SetupStorageTexel(image, entry);
+    }
+}
+
 void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {
     MICROPROFILE_SCOPE(Vulkan_Images);
     const auto& gpu = system.GPU().Maxwell3D();
@@ -1028,12 +1046,12 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {
     }
 }
 
-void RasterizerVulkan::SetupComputeTexelBuffers(const ShaderEntries& entries) {
+void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
     MICROPROFILE_SCOPE(Vulkan_Textures);
     const auto& gpu = system.GPU().KeplerCompute();
-    for (const auto& entry : entries.texel_buffers) {
+    for (const auto& entry : entries.uniform_texels) {
         const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
-        SetupTexelBuffer(image, entry);
+        SetupUniformTexels(image, entry);
     }
 }
 
@@ -1048,6 +1066,15 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
     }
 }
 
+void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
+    MICROPROFILE_SCOPE(Vulkan_Textures);
+    const auto& gpu = system.GPU().KeplerCompute();
+    for (const auto& entry : entries.storage_texels) {
+        const auto image = GetTextureInfo(gpu, entry, ComputeShaderIndex).tic;
+        SetupStorageTexel(image, entry);
+    }
+}
+
 void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
     MICROPROFILE_SCOPE(Vulkan_Images);
     const auto& gpu = system.GPU().KeplerCompute();
@@ -1097,8 +1124,8 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
     update_descriptor_queue.AddBuffer(buffer, offset, size);
 }
 
-void RasterizerVulkan::SetupTexelBuffer(const Tegra::Texture::TICEntry& tic,
-                                        const TexelBufferEntry& entry) {
+void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
+                                          const UniformTexelEntry& entry) {
     const auto view = texture_cache.GetTextureSurface(tic, entry);
     ASSERT(view->IsBufferView());
 
@@ -1120,6 +1147,14 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
     sampled_views.push_back(ImageView{std::move(view), image_layout});
 }
 
+void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic,
+                                         const StorageTexelEntry& entry) {
+    const auto view = texture_cache.GetImageSurface(tic, entry);
+    ASSERT(view->IsBufferView());
+
+    update_descriptor_queue.AddTexelBuffer(view->GetBufferView());
+}
+
 void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) {
     auto view = texture_cache.GetImageSurface(tic, entry);
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 0ed0e48c6..04be37a5e 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -193,12 +193,15 @@ private:
     /// Setup global buffers in the graphics pipeline.
     void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
 
-    /// Setup texel buffers in the graphics pipeline.
-    void SetupGraphicsTexelBuffers(const ShaderEntries& entries, std::size_t stage);
+    /// Setup uniform texels in the graphics pipeline.
+    void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);
 
     /// Setup textures in the graphics pipeline.
     void SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage);
 
+    /// Setup storage texels in the graphics pipeline.
+    void SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage);
+
     /// Setup images in the graphics pipeline.
     void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);
 
@@ -209,11 +212,14 @@ private:
     void SetupComputeGlobalBuffers(const ShaderEntries& entries);
 
     /// Setup texel buffers in the compute pipeline.
-    void SetupComputeTexelBuffers(const ShaderEntries& entries);
+    void SetupComputeUniformTexels(const ShaderEntries& entries);
 
     /// Setup textures in the compute pipeline.
     void SetupComputeTextures(const ShaderEntries& entries);
 
+    /// Setup storage texels in the compute pipeline.
+    void SetupComputeStorageTexels(const ShaderEntries& entries);
+
     /// Setup images in the compute pipeline.
     void SetupComputeImages(const ShaderEntries& entries);
 
@@ -222,10 +228,12 @@ private:
 
     void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);
 
-    void SetupTexelBuffer(const Tegra::Texture::TICEntry& image, const TexelBufferEntry& entry);
+    void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry);
 
     void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry);
 
+    void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry);
+
     void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
 
     void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index a13e8baa7..97429cc59 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -400,8 +400,9 @@ private:
         u32 binding = specialization.base_binding;
         binding = DeclareConstantBuffers(binding);
         binding = DeclareGlobalBuffers(binding);
-        binding = DeclareTexelBuffers(binding);
+        binding = DeclareUniformTexels(binding);
         binding = DeclareSamplers(binding);
+        binding = DeclareStorageTexels(binding);
         binding = DeclareImages(binding);
 
         const Id main = OpFunction(t_void, {}, TypeFunction(t_void));
@@ -889,7 +890,7 @@ private:
         return binding;
     }
 
-    u32 DeclareTexelBuffers(u32 binding) {
+    u32 DeclareUniformTexels(u32 binding) {
         for (const auto& sampler : ir.GetSamplers()) {
             if (!sampler.is_buffer) {
                 continue;
@@ -910,7 +911,7 @@ private:
             Decorate(id, spv::Decoration::Binding, binding++);
             Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
 
-            texel_buffers.emplace(sampler.index, TexelBuffer{image_type, id});
+            uniform_texels.emplace(sampler.index, TexelBuffer{image_type, id});
         }
         return binding;
     }
@@ -945,31 +946,48 @@ private:
         return binding;
     }
 
-    u32 DeclareImages(u32 binding) {
+    u32 DeclareStorageTexels(u32 binding) {
         for (const auto& image : ir.GetImages()) {
-            const auto [dim, arrayed] = GetImageDim(image);
-            constexpr int depth = 0;
-            constexpr bool ms = false;
-            constexpr int sampled = 2; // This won't be accessed with a sampler
-            constexpr auto format = spv::ImageFormat::Unknown;
-            const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
-            const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
-            const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
-            AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
-
-            Decorate(id, spv::Decoration::Binding, binding++);
-            Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
-            if (image.is_read && !image.is_written) {
-                Decorate(id, spv::Decoration::NonWritable);
-            } else if (image.is_written && !image.is_read) {
-                Decorate(id, spv::Decoration::NonReadable);
+            if (image.type != Tegra::Shader::ImageType::TextureBuffer) {
+                continue;
             }
+            DeclareImage(image, binding);
+        }
+        return binding;
+    }
 
-            images.emplace(image.index, StorageImage{image_type, id});
+    u32 DeclareImages(u32 binding) {
+        for (const auto& image : ir.GetImages()) {
+            if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
+                continue;
+            }
+            DeclareImage(image, binding);
         }
         return binding;
     }
 
+    void DeclareImage(const Image& image, u32& binding) {
+        const auto [dim, arrayed] = GetImageDim(image);
+        constexpr int depth = 0;
+        constexpr bool ms = false;
+        constexpr int sampled = 2; // This won't be accessed with a sampler
+        const auto format = image.is_atomic ? spv::ImageFormat::R32ui : spv::ImageFormat::Unknown;
+        const Id image_type = TypeImage(t_uint, dim, depth, arrayed, ms, sampled, format, {});
+        const Id pointer_type = TypePointer(spv::StorageClass::UniformConstant, image_type);
+        const Id id = OpVariable(pointer_type, spv::StorageClass::UniformConstant);
+        AddGlobalVariable(Name(id, fmt::format("image_{}", image.index)));
+
+        Decorate(id, spv::Decoration::Binding, binding++);
+        Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
+        if (image.is_read && !image.is_written) {
+            Decorate(id, spv::Decoration::NonWritable);
+        } else if (image.is_written && !image.is_read) {
+            Decorate(id, spv::Decoration::NonReadable);
+        }
+
+        images.emplace(image.index, StorageImage{image_type, id});
+    }
+
     bool IsRenderTargetEnabled(u32 rt) const {
         for (u32 component = 0; component < 4; ++component) {
             if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
@@ -1256,7 +1274,7 @@ private:
                 } else {
                     UNREACHABLE_MSG("Unmanaged offset node type");
                 }
-                pointer = OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0), buffer_index,
+                pointer = OpAccessChain(t_cbuf_float, buffer_id, v_uint_zero, buffer_index,
                                         buffer_element);
             }
             return {OpLoad(t_float, pointer), Type::Float};
@@ -1611,7 +1629,7 @@ private:
 
         const Id result = OpIAddCarry(TypeStruct({t_uint, t_uint}), op_a, op_b);
         const Id carry = OpCompositeExtract(t_uint, result, 1);
-        return {OpINotEqual(t_bool, carry, Constant(t_uint, 0)), Type::Bool};
+        return {OpINotEqual(t_bool, carry, v_uint_zero), Type::Bool};
     }
 
     Expression LogicalAssign(Operation operation) {
@@ -1674,7 +1692,7 @@ private:
         const auto& meta = std::get<MetaTexture>(operation.GetMeta());
         const u32 index = meta.sampler.index;
         if (meta.sampler.is_buffer) {
-            const auto& entry = texel_buffers.at(index);
+            const auto& entry = uniform_texels.at(index);
             return OpLoad(entry.image_type, entry.image);
         } else {
             const auto& entry = sampled_images.at(index);
@@ -1951,39 +1969,20 @@ private:
         return {};
     }
 
-    Expression AtomicImageAdd(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageMin(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageMax(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageAnd(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Expression AtomicImageOr(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
+    template <Id (Module::*func)(Id, Id, Id, Id, Id)>
+    Expression AtomicImage(Operation operation) {
+        const auto& meta{std::get<MetaImage>(operation.GetMeta())};
+        ASSERT(meta.values.size() == 1);
 
-    Expression AtomicImageXor(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
+        const Id coordinate = GetCoordinates(operation, Type::Int);
+        const Id image = images.at(meta.image.index).image;
+        const Id sample = v_uint_zero;
+        const Id pointer = OpImageTexelPointer(t_image_uint, image, coordinate, sample);
 
-    Expression AtomicImageExchange(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
+        const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
+        const Id semantics = v_uint_zero;
+        const Id value = AsUint(Visit(meta.values[0]));
+        return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
     }
 
     template <Id (Module::*func)(Id, Id, Id, Id, Id)>
@@ -1998,7 +1997,7 @@ private:
             return {v_float_zero, Type::Float};
         }
         const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
-        const Id semantics = Constant(t_uint, 0);
+        const Id semantics = v_uint_zero;
         const Id value = AsUint(Visit(operation[1]));
 
         return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
@@ -2622,11 +2621,11 @@ private:
 
         &SPIRVDecompiler::ImageLoad,
         &SPIRVDecompiler::ImageStore,
-        &SPIRVDecompiler::AtomicImageAdd,
-        &SPIRVDecompiler::AtomicImageAnd,
-        &SPIRVDecompiler::AtomicImageOr,
-        &SPIRVDecompiler::AtomicImageXor,
-        &SPIRVDecompiler::AtomicImageExchange,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicXor>,
+        &SPIRVDecompiler::AtomicImage<&Module::OpAtomicExchange>,
 
         &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
         &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
@@ -2768,8 +2767,11 @@ private:
         Decorate(TypeStruct(t_gmem_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
     const Id t_gmem_ssbo = TypePointer(spv::StorageClass::StorageBuffer, t_gmem_struct);
 
+    const Id t_image_uint = TypePointer(spv::StorageClass::Image, t_uint);
+
     const Id v_float_zero = Constant(t_float, 0.0f);
     const Id v_float_one = Constant(t_float, 1.0f);
+    const Id v_uint_zero = Constant(t_uint, 0);
 
     // Nvidia uses these defaults for varyings (e.g. position and generic attributes)
     const Id v_varying_default =
@@ -2794,15 +2796,16 @@ private:
     std::unordered_map<u8, GenericVaryingDescription> output_attributes;
     std::map<u32, Id> constant_buffers;
     std::map<GlobalMemoryBase, Id> global_buffers;
-    std::map<u32, TexelBuffer> texel_buffers;
+    std::map<u32, TexelBuffer> uniform_texels;
     std::map<u32, SampledImage> sampled_images;
+    std::map<u32, TexelBuffer> storage_texels;
     std::map<u32, StorageImage> images;
 
+    std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
     Id instance_index{};
     Id vertex_index{};
     Id base_instance{};
     Id base_vertex{};
-    std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
     Id frag_depth{};
     Id frag_coord{};
     Id front_facing{};
@@ -3058,13 +3061,17 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
     }
     for (const auto& sampler : ir.GetSamplers()) {
         if (sampler.is_buffer) {
-            entries.texel_buffers.emplace_back(sampler);
+            entries.uniform_texels.emplace_back(sampler);
         } else {
             entries.samplers.emplace_back(sampler);
         }
     }
     for (const auto& image : ir.GetImages()) {
-        entries.images.emplace_back(image);
+        if (image.type == Tegra::Shader::ImageType::TextureBuffer) {
+            entries.storage_texels.emplace_back(image);
+        } else {
+            entries.images.emplace_back(image);
+        }
     }
     for (const auto& attribute : ir.GetInputAttributes()) {
         if (IsGenericAttribute(attribute)) {
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index b7af26388..2b0e90396 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -21,8 +21,9 @@ class VKDevice;
 namespace Vulkan {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using TexelBufferEntry = VideoCommon::Shader::Sampler;
+using UniformTexelEntry = VideoCommon::Shader::Sampler;
 using SamplerEntry = VideoCommon::Shader::Sampler;
+using StorageTexelEntry = VideoCommon::Shader::Image;
 using ImageEntry = VideoCommon::Shader::Image;
 
 constexpr u32 DESCRIPTOR_SET = 0;
@@ -66,13 +67,15 @@ private:
 struct ShaderEntries {
     u32 NumBindings() const {
         return static_cast<u32>(const_buffers.size() + global_buffers.size() +
-                                texel_buffers.size() + samplers.size() + images.size());
+                                uniform_texels.size() + samplers.size() + storage_texels.size() +
+                                images.size());
     }
 
     std::vector<ConstBufferEntry> const_buffers;
     std::vector<GlobalBufferEntry> global_buffers;
-    std::vector<TexelBufferEntry> texel_buffers;
+    std::vector<UniformTexelEntry> uniform_texels;
     std::vector<SamplerEntry> samplers;
+    std::vector<StorageTexelEntry> storage_texels;
     std::vector<ImageEntry> images;
     std::set<u32> attributes;
     std::array<bool, Maxwell::NumClipDistances> clip_distances{};
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index 2f1d5021d..ea487b770 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -100,8 +100,8 @@ vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params,
     ci.pNext = nullptr;
     ci.flags = 0;
     ci.size = static_cast<VkDeviceSize>(host_memory_size);
-    ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
-               VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+    ci.usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
+               VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
     ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
     ci.queueFamilyIndexCount = 0;
     ci.pQueueFamilyIndices = nullptr;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 45e3ddd2c..6f63217a2 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -655,45 +655,63 @@ private:
      **/
     std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps,
                                                                     const SurfaceParams& params,
-                                                                    const GPUVAddr gpu_addr) {
+                                                                    GPUVAddr gpu_addr) {
         if (params.target == SurfaceTarget::Texture3D) {
-            return {};
+            return std::nullopt;
         }
-        bool modified = false;
+        const auto test_modified = [](TSurface& surface) { return surface->IsModified(); };
         TSurface new_surface = GetUncachedSurface(gpu_addr, params);
-        u32 passed_tests = 0;
+
+        if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) {
+            LoadSurface(new_surface);
+            for (const auto& surface : overlaps) {
+                Unregister(surface);
+            }
+            Register(new_surface);
+            return {{new_surface, new_surface->GetMainView()}};
+        }
+
+        std::size_t passed_tests = 0;
         for (auto& surface : overlaps) {
             const SurfaceParams& src_params = surface->GetSurfaceParams();
-            if (src_params.is_layered || src_params.num_levels > 1) {
-                // We send this cases to recycle as they are more complex to handle
-                return {};
-            }
-            const std::size_t candidate_size = surface->GetSizeInBytes();
-            auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
+            const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())};
             if (!mipmap_layer) {
                 continue;
             }
-            const auto [layer, mipmap] = *mipmap_layer;
-            if (new_surface->GetMipmapSize(mipmap) != candidate_size) {
+            const auto [base_layer, base_mipmap] = *mipmap_layer;
+            if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) {
                 continue;
             }
-            modified |= surface->IsModified();
-            // Now we got all the data set up
-            const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
-            const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
-            const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1);
-            passed_tests++;
-            ImageCopy(surface, new_surface, copy_params);
+            ++passed_tests;
+
+            // Copy all mipmaps and layers
+            const u32 block_width = params.GetDefaultBlockWidth();
+            const u32 block_height = params.GetDefaultBlockHeight();
+            for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) {
+                const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap);
+                const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap);
+                if (width < block_width || height < block_height) {
+                    // Current APIs forbid copying small compressed textures, avoid errors
+                    break;
+                }
+                const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
+                                             src_params.depth);
+                ImageCopy(surface, new_surface, copy_params);
+            }
         }
         if (passed_tests == 0) {
-            return {};
+            return std::nullopt;
+        }
+        if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
             // In Accurate GPU all tests should pass, else we recycle
-        } else if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) {
-            return {};
+            return std::nullopt;
         }
+
+        const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified);
         for (const auto& surface : overlaps) {
             Unregister(surface);
         }
+
         new_surface->MarkAsModified(modified, Tick());
         Register(new_surface);
         return {{new_surface, new_surface->GetMainView()}};
@@ -871,12 +889,9 @@ private:
             // two things either the candidate surface is a supertexture of the overlap
             // or they don't match in any known way.
             if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) {
-                if (current_surface->GetGpuAddr() == gpu_addr) {
-                    std::optional<std::pair<TSurface, TView>> view =
-                        TryReconstructSurface(overlaps, params, gpu_addr);
-                    if (view) {
-                        return *view;
-                    }
+                const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr);
+                if (view) {
+                    return *view;
                 }
                 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                       MatchTopologyResult::FullMatch);
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index b08b87426..7e9073cc3 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() {
     Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool();
     Settings::values.disable_cpu_opt =
         ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool();
+    Settings::values.disable_macro_jit =
+        ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool();
 
     qt_config->endGroup();
 }
@@ -1011,6 +1013,7 @@ void Config::SaveDebuggingValues() {
     WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false);
     WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false);
     WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false);
+    WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false);
 
     qt_config->endGroup();
 }
diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp
index c2026763e..2c77441fd 100644
--- a/src/yuzu/configuration/configure_debug.cpp
+++ b/src/yuzu/configuration/configure_debug.cpp
@@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() {
     ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt);
     ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn());
     ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug);
+    ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn());
+    ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit);
 }
 
 void ConfigureDebug::ApplyConfiguration() {
@@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() {
     Settings::values.quest_flag = ui->quest_flag->isChecked();
     Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked();
     Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked();
+    Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked();
     Debugger::ToggleConsole();
     Log::Filter filter;
     filter.ParseFilterString(Settings::values.log_filter);
diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui
index e0d4c4a44..46f0208c6 100644
--- a/src/yuzu/configuration/configure_debug.ui
+++ b/src/yuzu/configuration/configure_debug.ui
@@ -148,6 +148,19 @@
         </property>
        </widget>
       </item>
+      <item>
+       <widget class="QCheckBox" name="disable_macro_jit">
+        <property name="enabled">
+         <bool>true</bool>
+        </property>
+        <property name="whatsThis">
+         <string>When checked, it disables the macro Just In Time compiler. Enabled this makes games run slower</string>
+        </property>
+        <property name="text">
+         <string>Disable Macro JIT</string>
+        </property>
+       </widget>
+      </item>
      </layout>
     </widget>
    </item>
diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp
index e4eb5594b..a05fa64ba 100644
--- a/src/yuzu/configuration/configure_input_player.cpp
+++ b/src/yuzu/configuration/configure_input_player.cpp
@@ -480,7 +480,9 @@ void ConfigureInputPlayer::RestoreDefaults() {
             SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]);
         }
     }
+
     UpdateButtonLabels();
+    ApplyConfiguration();
 }
 
 void ConfigureInputPlayer::ClearAll() {
@@ -505,6 +507,7 @@ void ConfigureInputPlayer::ClearAll() {
     }
 
     UpdateButtonLabels();
+    ApplyConfiguration();
 }
 
 void ConfigureInputPlayer::UpdateButtonLabels() {
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index c20d48c42..7240270f5 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -432,6 +432,8 @@ void Config::ReadValues() {
     Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false);
     Settings::values.disable_cpu_opt =
         sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false);
+    Settings::values.disable_macro_jit =
+        sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false);
 
     const auto title_list = sdl2_config->Get("AddOns", "title_ids", "");
     std::stringstream ss(title_list);
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index abc6e6e65..6f53e9659 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -291,6 +291,8 @@ quest_flag =
 # Determines whether or not JIT CPU optimizations are enabled
 # false: Optimizations Enabled, true: Optimizations Disabled
 disable_cpu_opt =
+# Enables/Disables the macro JIT compiler
+disable_macro_jit=false
 
 [WebService]
 # Whether or not to enable telemetry