Diffstat
9 files changed, 352 insertions, 66 deletions
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
index a982dd8a2..cd285e2c8 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
@@ -11,6 +11,8 @@
 
 namespace Shader::Backend::GLSL {
 namespace {
+constexpr char THREAD_ID[]{"gl_SubGroupInvocationARB"};
+
 void SetInBoundsFlag(EmitContext& ctx, IR::Inst& inst) {
     IR::Inst* const in_bounds{inst.GetAssociatedPseudoOperation(IR::Opcode::GetInBoundsFromOp)};
     if (!in_bounds) {
@@ -43,84 +45,100 @@ void UseShuffleNv(EmitContext& ctx, IR::Inst& inst, std::string_view shfl_op,
     ctx.AddU32("{}={}({},{},{},shfl_in_bounds);", inst, shfl_op, value, index, width);
     SetInBoundsFlag(ctx, inst);
 }
+
+std::string_view BallotIndex(EmitContext& ctx) {
+    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
+        return ".x";
+    }
+    return "[gl_SubGroupInvocationARB>>5]";
+}
+
+std::string GetMask(EmitContext& ctx, std::string_view mask) {
+    const auto ballot_index{BallotIndex(ctx)};
+    return fmt::format("uint(uvec2({}){})", mask, ballot_index);
+}
 } // Anonymous namespace
 
 void EmitLaneId(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=gl_SubGroupInvocationARB&31u;", inst);
+    ctx.AddU32("{}={}&31u;", inst, THREAD_ID);
 }
 
 void EmitVoteAll(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
-    } else {
-        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
-        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
-        ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
+        return;
     }
+    const auto ballot_index{BallotIndex(ctx)};
+    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
+    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
+    ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
 }
 
 void EmitVoteAny(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         ctx.AddU1("{}=anyInvocationARB({});", inst, pred);
-    } else {
-        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
-        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
-        ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
+        return;
     }
+    const auto ballot_index{BallotIndex(ctx)};
+    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
+    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
+    ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
 }
 
 void EmitVoteEqual(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
-    } else {
-        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
-        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
-        const auto value{fmt::format("({}^{})", ballot, active_mask)};
-        ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
+        return;
     }
+    const auto ballot_index{BallotIndex(ctx)};
+    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
+    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
+    const auto value{fmt::format("({}^{})", ballot, active_mask)};
+    ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
 }
 
 void EmitSubgroupBallot(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
-    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
-        ctx.AddU32("{}=uvec2(ballotARB({})).x;", inst, pred);
-    } else {
-        ctx.AddU32("{}=uvec2(ballotARB({}))[gl_SubGroupInvocationARB];", inst, pred);
-    }
+    const auto ballot_index{BallotIndex(ctx)};
+    ctx.AddU32("{}=uvec2(ballotARB({})){};", inst, pred, ballot_index);
 }
 
 void EmitSubgroupEqMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupEqMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupEqMaskARB"));
 }
 
 void EmitSubgroupLtMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupLtMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLtMaskARB"));
 }
 
 void EmitSubgroupLeMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupLeMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLeMaskARB"));
 }
 
 void EmitSubgroupGtMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupGtMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGtMaskARB"));
 }
 
 void EmitSubgroupGeMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupGeMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGeMaskARB"));
 }
 
 void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                      std::string_view index, std::string_view clamp,
-                      std::string_view segmentation_mask) {
+                      std::string_view index, std::string_view clamp, std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto not_seg_mask{fmt::format("(~{})", segmentation_mask)};
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto min_thread_id{ComputeMinThreadId(thread_id, segmentation_mask)};
-    const auto max_thread_id{ComputeMaxThreadId(min_thread_id, clamp, not_seg_mask)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_index{fmt::format("{}?{}+32:{}", is_upper_partition, index, index)};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto not_seg_mask{fmt::format("(~{})", seg_mask)};
+    const auto min_thread_id{ComputeMinThreadId(THREAD_ID, seg_mask)};
+    const auto max_thread_id{
+        ComputeMaxThreadId(min_thread_id, big_warp ? upper_clamp : clamp, not_seg_mask)};
 
-    const auto lhs{fmt::format("({}&{})", index, not_seg_mask)};
+    const auto lhs{fmt::format("({}&{})", big_warp ? upper_index : index, not_seg_mask)};
     const auto src_thread_id{fmt::format("({})|({})", lhs, min_thread_id)};
     ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
@@ -128,29 +146,34 @@ void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
 }
 
 void EmitShuffleUp(EmitContext& ctx, IR::Inst& inst, std::string_view value, std::string_view index,
-                   std::string_view clamp, std::string_view segmentation_mask) {
+                   std::string_view clamp, std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
-    const auto src_thread_id{fmt::format("({}-{})", thread_id, index)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
+    const auto src_thread_id{fmt::format("({}-{})", THREAD_ID, index)};
     ctx.Add("shfl_in_bounds=int({})>=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
     ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
 }
 
 void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                     std::string_view index, std::string_view clamp,
-                     std::string_view segmentation_mask) {
+                     std::string_view index, std::string_view clamp, std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
-    const auto src_thread_id{fmt::format("({}+{})", thread_id, index)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
+    const auto src_thread_id{fmt::format("({}+{})", THREAD_ID, index)};
     ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
     ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
@@ -158,14 +181,17 @@ void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
 
 void EmitShuffleButterfly(EmitContext& ctx, IR::Inst& inst, std::string_view value,
                           std::string_view index, std::string_view clamp,
-                          std::string_view segmentation_mask) {
+                          std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
-    const auto src_thread_id{fmt::format("({}^{})", thread_id, index)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
+    const auto src_thread_id{fmt::format("({}^{})", THREAD_ID, index)};
     ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
     ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
diff --git a/src/shader_recompiler/backend/spirv/emit_context.cpp b/src/shader_recompiler/backend/spirv/emit_context.cpp
index 2d29d8c14..2885e6799 100644
--- a/src/shader_recompiler/backend/spirv/emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_context.cpp
@@ -15,6 +15,8 @@
 
 namespace Shader::Backend::SPIRV {
 namespace {
+constexpr size_t NUM_FIXEDFNCTEXTURE = 10;
+
 enum class Operation {
     Increment,
     Decrement,
@@ -427,6 +429,16 @@ Id DescType(EmitContext& ctx, Id sampled_type, Id pointer_type, u32 count) {
         return pointer_type;
     }
 }
+
+size_t FindNextUnusedLocation(const std::bitset<IR::NUM_GENERICS>& used_locations,
+                              size_t start_offset) {
+    for (size_t location = start_offset; location < used_locations.size(); ++location) {
+        if (!used_locations.test(location)) {
+            return location;
+        }
+    }
+    throw RuntimeError("Unable to get an unused location for legacy attribute");
+}
 } // Anonymous namespace
 
 void VectorTypes::Define(Sirit::Module& sirit_ctx, Id base_type, std::string_view name) {
@@ -1227,6 +1239,7 @@ void EmitContext::DefineInputs(const IR::Program& program) {
         loads[IR::Attribute::TessellationEvaluationPointV]) {
         tess_coord = DefineInput(*this, F32[3], false, spv::BuiltIn::TessCoord);
     }
+    std::bitset<IR::NUM_GENERICS> used_locations{};
     for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
         const AttributeType input_type{runtime_info.generic_input_types[index]};
         if (!runtime_info.previous_stage_stores.Generic(index)) {
@@ -1238,6 +1251,7 @@ void EmitContext::DefineInputs(const IR::Program& program) {
         if (input_type == AttributeType::Disabled) {
             continue;
         }
+        used_locations.set(index);
         const Id type{GetAttributeType(*this, input_type)};
         const Id id{DefineInput(*this, type, true)};
         Decorate(id, spv::Decoration::Location, static_cast<u32>(index));
@@ -1263,6 +1277,26 @@ void EmitContext::DefineInputs(const IR::Program& program) {
             break;
         }
     }
+    size_t previous_unused_location = 0;
+    if (loads.AnyComponent(IR::Attribute::ColorFrontDiffuseR)) {
+        const size_t location = FindNextUnusedLocation(used_locations, previous_unused_location);
+        previous_unused_location = location;
+        used_locations.set(location);
+        const Id id{DefineInput(*this, F32[4], true)};
+        Decorate(id, spv::Decoration::Location, location);
+        input_front_color = id;
+    }
+    for (size_t index = 0; index < NUM_FIXEDFNCTEXTURE; ++index) {
+        if (loads.AnyComponent(IR::Attribute::FixedFncTexture0S + index * 4)) {
+            const size_t location =
+                FindNextUnusedLocation(used_locations, previous_unused_location);
+            previous_unused_location = location;
+            used_locations.set(location);
+            const Id id{DefineInput(*this, F32[4], true)};
+            Decorate(id, spv::Decoration::Location, location);
+            input_fixed_fnc_textures[index] = id;
+        }
+    }
     if (stage == Stage::TessellationEval) {
         for (size_t index = 0; index < info.uses_patches.size(); ++index) {
             if (!info.uses_patches[index]) {
@@ -1313,9 +1347,31 @@ void EmitContext::DefineOutputs(const IR::Program& program) {
         viewport_mask = DefineOutput(*this, TypeArray(U32[1], Const(1u)), std::nullopt,
                                      spv::BuiltIn::ViewportMaskNV);
     }
+    std::bitset<IR::NUM_GENERICS> used_locations{};
     for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
         if (info.stores.Generic(index)) {
             DefineGenericOutput(*this, index, invocations);
+            used_locations.set(index);
+        }
+    }
+    size_t previous_unused_location = 0;
+    if (info.stores.AnyComponent(IR::Attribute::ColorFrontDiffuseR)) {
+        const size_t location = FindNextUnusedLocation(used_locations, previous_unused_location);
+        previous_unused_location = location;
+        used_locations.set(location);
+        const Id id{DefineOutput(*this, F32[4], invocations)};
+        Decorate(id, spv::Decoration::Location, static_cast<u32>(location));
+        output_front_color = id;
+    }
+    for (size_t index = 0; index < NUM_FIXEDFNCTEXTURE; ++index) {
+        if (info.stores.AnyComponent(IR::Attribute::FixedFncTexture0S + index * 4)) {
+            const size_t location =
+                FindNextUnusedLocation(used_locations, previous_unused_location);
+            previous_unused_location = location;
+            used_locations.set(location);
+            const Id id{DefineOutput(*this, F32[4], invocations)};
+            Decorate(id, spv::Decoration::Location, location);
+            output_fixed_fnc_textures[index] = id;
         }
     }
     switch (stage) {
diff --git a/src/shader_recompiler/backend/spirv/emit_context.h b/src/shader_recompiler/backend/spirv/emit_context.h
index e277bc358..847d0c0e6 100644
--- a/src/shader_recompiler/backend/spirv/emit_context.h
+++ b/src/shader_recompiler/backend/spirv/emit_context.h
@@ -268,10 +268,14 @@ public:
     Id write_global_func_u32x4{};
 
     Id input_position{};
+    Id input_front_color{};
+    std::array<Id, 10> input_fixed_fnc_textures{};
     std::array<Id, 32> input_generics{};
 
     Id output_point_size{};
     Id output_position{};
+    Id output_front_color{};
+    std::array<Id, 10> output_fixed_fnc_textures{};
    std::array<std::array<GenericElementInfo, 4>, 32> output_generics{};
 
     Id output_tess_level_outer{};
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 14c77f162..68f360b3c 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -43,6 +43,25 @@ Id AttrPointer(EmitContext& ctx, Id pointer_type, Id vertex, Id base, Args&&...
     }
 }
 
+bool IsFixedFncTexture(IR::Attribute attribute) {
+    return attribute >= IR::Attribute::FixedFncTexture0S &&
+           attribute <= IR::Attribute::FixedFncTexture9Q;
+}
+
+u32 FixedFncTextureAttributeIndex(IR::Attribute attribute) {
+    if (!IsFixedFncTexture(attribute)) {
+        throw InvalidArgument("Attribute {} is not a FixedFncTexture", attribute);
+    }
+    return (static_cast<u32>(attribute) - static_cast<u32>(IR::Attribute::FixedFncTexture0S)) / 4u;
+}
+
+u32 FixedFncTextureAttributeElement(IR::Attribute attribute) {
+    if (!IsFixedFncTexture(attribute)) {
+        throw InvalidArgument("Attribute {} is not a FixedFncTexture", attribute);
+    }
+    return static_cast<u32>(attribute) % 4u;
+}
+
 template <typename... Args>
 Id OutputAccessChain(EmitContext& ctx, Id result_type, Id base, Args&&... args) {
     if (ctx.stage == Stage::TessellationControl) {
@@ -74,6 +93,13 @@ std::optional<OutAttr> OutputAttrPointer(EmitContext& ctx, IR::Attribute attr) {
             return OutputAccessChain(ctx, ctx.output_f32, info.id, index_id);
         }
     }
+    if (IsFixedFncTexture(attr)) {
+        const u32 index{FixedFncTextureAttributeIndex(attr)};
+        const u32 element{FixedFncTextureAttributeElement(attr)};
+        const Id element_id{ctx.Const(element)};
+        return OutputAccessChain(ctx, ctx.output_f32, ctx.output_fixed_fnc_textures[index],
+                                 element_id);
+    }
     switch (attr) {
     case IR::Attribute::PointSize:
         return ctx.output_point_size;
@@ -85,6 +111,14 @@ std::optional<OutAttr> OutputAttrPointer(EmitContext& ctx, IR::Attribute attr) {
         const Id element_id{ctx.Const(element)};
         return OutputAccessChain(ctx, ctx.output_f32, ctx.output_position, element_id);
     }
+    case IR::Attribute::ColorFrontDiffuseR:
+    case IR::Attribute::ColorFrontDiffuseG:
+    case IR::Attribute::ColorFrontDiffuseB:
+    case IR::Attribute::ColorFrontDiffuseA: {
+        const u32 element{static_cast<u32>(attr) % 4};
+        const Id element_id{ctx.Const(element)};
+        return OutputAccessChain(ctx, ctx.output_f32, ctx.output_front_color, element_id);
+    }
     case IR::Attribute::ClipDistance0:
     case IR::Attribute::ClipDistance1:
     case IR::Attribute::ClipDistance2:
@@ -307,6 +341,12 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
         const Id value{ctx.OpLoad(type->id, pointer)};
         return type->needs_cast ? ctx.OpBitcast(ctx.F32[1], value) : value;
     }
+    if (IsFixedFncTexture(attr)) {
+        const u32 index{FixedFncTextureAttributeIndex(attr)};
+        const Id attr_id{ctx.input_fixed_fnc_textures[index]};
+        const Id attr_ptr{AttrPointer(ctx, ctx.input_f32, vertex, attr_id, ctx.Const(element))};
+        return ctx.OpLoad(ctx.F32[1], attr_ptr);
+    }
     switch (attr) {
     case IR::Attribute::PrimitiveId:
         return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.primitive_id));
@@ -316,6 +356,13 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
     case IR::Attribute::PositionW:
         return ctx.OpLoad(ctx.F32[1], AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position,
                                                   ctx.Const(element)));
+    case IR::Attribute::ColorFrontDiffuseR:
+    case IR::Attribute::ColorFrontDiffuseG:
+    case IR::Attribute::ColorFrontDiffuseB:
+    case IR::Attribute::ColorFrontDiffuseA: {
+        return ctx.OpLoad(ctx.F32[1], AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_front_color,
+                                                  ctx.Const(element)));
+    }
     case IR::Attribute::InstanceId:
         if (ctx.profile.support_vertex_instance_id) {
             return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id));
@@ -333,8 +380,9 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
         return ctx.OpBitcast(ctx.F32[1], ctx.OpISub(ctx.U32[1], index, base));
     }
     case IR::Attribute::FrontFace:
-        return ctx.OpSelect(ctx.U32[1], ctx.OpLoad(ctx.U1, ctx.front_face),
-                            ctx.Const(std::numeric_limits<u32>::max()), ctx.u32_zero_value);
+        return ctx.OpSelect(ctx.F32[1], ctx.OpLoad(ctx.U1, ctx.front_face),
+                            ctx.OpBitcast(ctx.F32[1], ctx.Const(std::numeric_limits<u32>::max())),
+                            ctx.f32_zero_value);
     case IR::Attribute::PointSpriteS:
         return ctx.OpLoad(ctx.F32[1],
                           ctx.OpAccessChain(ctx.input_f32, ctx.point_coord, ctx.u32_zero_value));
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
index 78b1e1ba7..cef52c56e 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
@@ -7,8 +7,13 @@
 
 namespace Shader::Backend::SPIRV {
 namespace {
+Id GetThreadId(EmitContext& ctx) {
+    return ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id);
+}
+
 Id WarpExtract(EmitContext& ctx, Id value) {
-    const Id local_index{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    const Id local_index{ctx.OpShiftRightArithmetic(ctx.U32[1], thread_id, ctx.Const(5U))};
     return ctx.OpVectorExtractDynamic(ctx.U32[1], value, local_index);
 }
 
@@ -48,10 +53,17 @@ Id SelectValue(EmitContext& ctx, Id in_range, Id value, Id src_thread_id) {
     return ctx.OpSelect(ctx.U32[1], in_range,
                         ctx.OpSubgroupReadInvocationKHR(ctx.U32[1], value, src_thread_id), value);
 }
+
+Id GetUpperClamp(EmitContext& ctx, Id invocation_id, Id clamp) {
+    const Id thirty_two{ctx.Const(32u)};
+    const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, invocation_id, thirty_two)};
+    const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
+    return ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
+}
 } // Anonymous namespace
 
 Id EmitLaneId(EmitContext& ctx) {
-    const Id id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id id{GetThreadId(ctx)};
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         return id;
     }
@@ -123,7 +135,15 @@ Id EmitSubgroupGeMask(EmitContext& ctx) {
 Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                     Id segmentation_mask) {
     const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)};
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        const Id thirty_two{ctx.Const(32u)};
+        const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, thread_id, thirty_two)};
+        const Id upper_index{ctx.OpIAdd(ctx.U32[1], thirty_two, index)};
+        const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
+        index = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_index, index);
+        clamp = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
+    }
 
     const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)};
     const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)};
@@ -137,7 +157,10 @@ Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id cla
 
 Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                  Id segmentation_mask) {
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        clamp = GetUpperClamp(ctx, thread_id, clamp);
+    }
     const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
     const Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)};
     const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -148,7 +171,10 @@ Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
 
 Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                    Id segmentation_mask) {
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        clamp = GetUpperClamp(ctx, thread_id, clamp);
+    }
     const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
     const Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)};
     const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -159,7 +185,10 @@ Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clam
 
 Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                         Id segmentation_mask) {
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        clamp = GetUpperClamp(ctx, thread_id, clamp);
+    }
     const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
     const Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)};
     const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};
diff --git a/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp b/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
index 8b3e0a15c..69eeaa3e6 100644
--- a/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
+++ b/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
@@ -20,6 +20,7 @@
 #include "shader_recompiler/frontend/maxwell/decode.h"
 #include "shader_recompiler/frontend/maxwell/structured_control_flow.h"
 #include "shader_recompiler/frontend/maxwell/translate/translate.h"
+#include "shader_recompiler/host_translate_info.h"
 #include "shader_recompiler/object_pool.h"
 
 namespace Shader::Maxwell {
@@ -652,7 +653,7 @@ class TranslatePass {
 public:
     TranslatePass(ObjectPool<IR::Inst>& inst_pool_, ObjectPool<IR::Block>& block_pool_,
                   ObjectPool<Statement>& stmt_pool_, Environment& env_, Statement& root_stmt,
-                  IR::AbstractSyntaxList& syntax_list_)
+                  IR::AbstractSyntaxList& syntax_list_, const HostTranslateInfo& host_info)
         : stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_}, env{env_},
           syntax_list{syntax_list_} {
         Visit(root_stmt, nullptr, nullptr);
@@ -660,6 +661,9 @@ public:
         IR::Block& first_block{*syntax_list.front().data.block};
         IR::IREmitter ir(first_block, first_block.begin());
         ir.Prologue();
+        if (uses_demote_to_helper && host_info.needs_demote_reorder) {
+            DemoteCombinationPass();
+        }
     }
 
 private:
@@ -809,7 +813,14 @@ private:
         }
         case StatementType::Return: {
             ensure_block();
-            IR::IREmitter{*current_block}.Epilogue();
+            IR::Block* return_block{block_pool.Create(inst_pool)};
+            IR::IREmitter{*return_block}.Epilogue();
+            current_block->AddBranch(return_block);
+
+            auto& merge{syntax_list.emplace_back()};
+            merge.type = IR::AbstractSyntaxNode::Type::Block;
+            merge.data.block = return_block;
+
             current_block = nullptr;
             syntax_list.emplace_back().type = IR::AbstractSyntaxNode::Type::Return;
             break;
@@ -824,6 +835,7 @@ private:
             auto& merge{syntax_list.emplace_back()};
             merge.type = IR::AbstractSyntaxNode::Type::Block;
             merge.data.block = demote_block;
+            uses_demote_to_helper = true;
             break;
         }
         case StatementType::Unreachable: {
@@ -855,11 +867,117 @@ private:
         return block_pool.Create(inst_pool);
     }
 
+    void DemoteCombinationPass() {
+        using Type = IR::AbstractSyntaxNode::Type;
+        std::vector<IR::Block*> demote_blocks;
+        std::vector<IR::U1> demote_conds;
+        u32 num_epilogues{};
+        u32 branch_depth{};
+        for (const IR::AbstractSyntaxNode& node : syntax_list) {
+            if (node.type == Type::If) {
+                ++branch_depth;
+            }
+            if (node.type == Type::EndIf) {
+                --branch_depth;
+            }
+            if (node.type != Type::Block) {
+                continue;
+            }
+            if (branch_depth > 1) {
+                // Skip reordering nested demote branches.
+                continue;
+            }
+            for (const IR::Inst& inst : node.data.block->Instructions()) {
+                const IR::Opcode op{inst.GetOpcode()};
+                if (op == IR::Opcode::DemoteToHelperInvocation) {
+                    demote_blocks.push_back(node.data.block);
+                    break;
+                }
+                if (op == IR::Opcode::Epilogue) {
+                    ++num_epilogues;
+                }
+            }
+        }
+        if (demote_blocks.size() == 0) {
+            return;
+        }
+        if (num_epilogues > 1) {
+            LOG_DEBUG(Shader, "Combining demotes with more than one return is not implemented.");
+            return;
+        }
+        s64 last_iterator_offset{};
+        auto& asl{syntax_list};
+        for (const IR::Block* demote_block : demote_blocks) {
+            const auto start_it{asl.begin() + last_iterator_offset};
+            auto asl_it{std::find_if(start_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::If && asn.data.if_node.body == demote_block;
+            })};
+            if (asl_it == asl.end()) {
+                // Demote without a conditional branch.
+                // No need to proceed since all fragment instances will be demoted regardless.
+                return;
+            }
+            const IR::Block* const end_if = asl_it->data.if_node.merge;
+            demote_conds.push_back(asl_it->data.if_node.cond);
+            last_iterator_offset = std::distance(asl.begin(), asl_it);
+
+            asl_it = asl.erase(asl_it);
+            asl_it = std::find_if(asl_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::Block && asn.data.block == demote_block;
+            });
+
+            asl_it = asl.erase(asl_it);
+            asl_it = std::find_if(asl_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::EndIf && asn.data.end_if.merge == end_if;
+            });
+            asl_it = asl.erase(asl_it);
+        }
+        const auto epilogue_func{[](const IR::AbstractSyntaxNode& asn) {
+            if (asn.type != Type::Block) {
+                return false;
+            }
+            for (const auto& inst : asn.data.block->Instructions()) {
+                if (inst.GetOpcode() == IR::Opcode::Epilogue) {
+                    return true;
+                }
+            }
+            return false;
+        }};
+        const auto reverse_it{std::find_if(asl.rbegin(), asl.rend(), epilogue_func)};
+        const auto return_block_it{(reverse_it + 1).base()};
+
+        IR::IREmitter ir{*(return_block_it - 1)->data.block};
+        IR::U1 cond(IR::Value(false));
+        for (const auto& demote_cond : demote_conds) {
+            cond = ir.LogicalOr(cond, demote_cond);
+        }
+        cond.Inst()->DestructiveAddUsage(1);
+
+        IR::AbstractSyntaxNode demote_if_node{};
+        demote_if_node.type = Type::If;
+        demote_if_node.data.if_node.cond = cond;
+        demote_if_node.data.if_node.body = demote_blocks[0];
+        demote_if_node.data.if_node.merge = return_block_it->data.block;
+
+        IR::AbstractSyntaxNode demote_node{};
+        demote_node.type = Type::Block;
+        demote_node.data.block = demote_blocks[0];
+
+        IR::AbstractSyntaxNode demote_endif_node{};
+        demote_endif_node.type = Type::EndIf;
+        demote_endif_node.data.end_if.merge = return_block_it->data.block;
+
+        asl.insert(return_block_it, demote_endif_node);
+        asl.insert(return_block_it, demote_node);
+        asl.insert(return_block_it, demote_if_node);
+    }
+
     ObjectPool<Statement>& stmt_pool;
     ObjectPool<IR::Inst>& inst_pool;
     ObjectPool<IR::Block>& block_pool;
     Environment& env;
     IR::AbstractSyntaxList& syntax_list;
+    bool uses_demote_to_helper{};
 
     // TODO: C++20 Remove this when all compilers support constexpr std::vector
 #if __cpp_lib_constexpr_vector >= 201907
@@ -871,12 +989,13 @@ private:
 } // Anonymous namespace
 
 IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
-                                Environment& env, Flow::CFG& cfg) {
+                                Environment& env, Flow::CFG& cfg,
+                                const HostTranslateInfo& host_info) {
     ObjectPool<Statement> stmt_pool{64};
     GotoPass goto_pass{cfg, stmt_pool};
     Statement& root{goto_pass.RootStatement()};
     IR::AbstractSyntaxList syntax_list;
-    TranslatePass{inst_pool, block_pool, stmt_pool, env, root, syntax_list};
+    TranslatePass{inst_pool, block_pool, stmt_pool, env, root, syntax_list, host_info};
     return syntax_list;
 }
 
diff --git a/src/shader_recompiler/frontend/maxwell/structured_control_flow.h b/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
index 88b083649..e38158da3 100644
--- a/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
+++ b/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
@@ -11,10 +11,13 @@
 #include "shader_recompiler/frontend/maxwell/control_flow.h"
 #include "shader_recompiler/object_pool.h"
 
-namespace Shader::Maxwell {
+namespace Shader {
+struct HostTranslateInfo;
+namespace Maxwell {
 
 [[nodiscard]] IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool,
                                               ObjectPool<IR::Block>& block_pool, Environment& env,
-                                              Flow::CFG& cfg);
+                                              Flow::CFG& cfg, const HostTranslateInfo& host_info);
 
-} // namespace Shader::Maxwell
+} // namespace Maxwell
+} // namespace Shader
diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
index c067d459c..012d55357 100644
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -130,7 +130,7 @@ void AddNVNStorageBuffers(IR::Program& program) {
 IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
                              Environment& env, Flow::CFG& cfg, const HostTranslateInfo& host_info) {
     IR::Program program;
-    program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg);
+    program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg, host_info);
     program.blocks = GenerateBlocks(program.syntax_list);
     program.post_order_blocks = PostOrder(program.syntax_list.front());
     program.stage = env.ShaderStage();
diff --git a/src/shader_recompiler/host_translate_info.h b/src/shader_recompiler/host_translate_info.h
index 94a584219..96468b2e7 100644
--- a/src/shader_recompiler/host_translate_info.h
+++ b/src/shader_recompiler/host_translate_info.h
@@ -11,8 +11,9 @@ namespace Shader {
 
 /// Misc information about the host
 struct HostTranslateInfo {
-    bool support_float16{}; ///< True when the device supports 16-bit floats
-    bool support_int64{};   ///< True when the device supports 64-bit integers
+    bool support_float16{};      ///< True when the device supports 16-bit floats
+    bool support_int64{};        ///< True when the device supports 64-bit integers
+    bool needs_demote_reorder{}; ///< True when the device needs DemoteToHelperInvocation reordered
 };
 
 } // namespace Shader