Diffstat
9 files changed, 352 insertions, 66 deletions
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
index a982dd8a2..cd285e2c8 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
@@ -11,6 +11,8 @@
 
 namespace Shader::Backend::GLSL {
 namespace {
+constexpr char THREAD_ID[]{"gl_SubGroupInvocationARB"};
+
 void SetInBoundsFlag(EmitContext& ctx, IR::Inst& inst) {
     IR::Inst* const in_bounds{inst.GetAssociatedPseudoOperation(IR::Opcode::GetInBoundsFromOp)};
     if (!in_bounds) {
@@ -43,84 +45,100 @@ void UseShuffleNv(EmitContext& ctx, IR::Inst& inst, std::string_view shfl_op,
     ctx.AddU32("{}={}({},{},{},shfl_in_bounds);", inst, shfl_op, value, index, width);
     SetInBoundsFlag(ctx, inst);
 }
+
+std::string_view BallotIndex(EmitContext& ctx) {
+    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
+        return ".x";
+    }
+    return "[gl_SubGroupInvocationARB>>5]";
+}
+
+std::string GetMask(EmitContext& ctx, std::string_view mask) {
+    const auto ballot_index{BallotIndex(ctx)};
+    return fmt::format("uint(uvec2({}){})", mask, ballot_index);
+}
 } // Anonymous namespace
 
 void EmitLaneId(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=gl_SubGroupInvocationARB&31u;", inst);
+    ctx.AddU32("{}={}&31u;", inst, THREAD_ID);
 }
 
 void EmitVoteAll(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
-    } else {
-        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
-        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
-        ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
+        return;
     }
+    const auto ballot_index{BallotIndex(ctx)};
+    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
+    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
+    ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
 }
 
 void EmitVoteAny(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         ctx.AddU1("{}=anyInvocationARB({});", inst, pred);
-    } else {
-        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
-        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
-        ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
+        return;
     }
+    const auto ballot_index{BallotIndex(ctx)};
+    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
+    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
+    ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
 }
 
 void EmitVoteEqual(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
-    } else {
-        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
-        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
-        const auto value{fmt::format("({}^{})", ballot, active_mask)};
-        ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
+        return;
     }
+    const auto ballot_index{BallotIndex(ctx)};
+    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
+    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
+    const auto value{fmt::format("({}^{})", ballot, active_mask)};
+    ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
 }
 
 void EmitSubgroupBallot(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
-    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
-        ctx.AddU32("{}=uvec2(ballotARB({})).x;", inst, pred);
-    } else {
-        ctx.AddU32("{}=uvec2(ballotARB({}))[gl_SubGroupInvocationARB];", inst, pred);
-    }
+    const auto ballot_index{BallotIndex(ctx)};
+    ctx.AddU32("{}=uvec2(ballotARB({})){};", inst, pred, ballot_index);
 }
 
 void EmitSubgroupEqMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupEqMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupEqMaskARB"));
 }
 
 void EmitSubgroupLtMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupLtMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLtMaskARB"));
 }
 
 void EmitSubgroupLeMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupLeMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLeMaskARB"));
 }
 
 void EmitSubgroupGtMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupGtMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGtMaskARB"));
 }
 
 void EmitSubgroupGeMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}=uint(gl_SubGroupGeMaskARB.x);", inst);
+    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGeMaskARB"));
 }
 
 void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                      std::string_view index, std::string_view clamp,
-                      std::string_view segmentation_mask) {
+                      std::string_view index, std::string_view clamp, std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto not_seg_mask{fmt::format("(~{})", segmentation_mask)};
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto min_thread_id{ComputeMinThreadId(thread_id, segmentation_mask)};
-    const auto max_thread_id{ComputeMaxThreadId(min_thread_id, clamp, not_seg_mask)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_index{fmt::format("{}?{}+32:{}", is_upper_partition, index, index)};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto not_seg_mask{fmt::format("(~{})", seg_mask)};
+    const auto min_thread_id{ComputeMinThreadId(THREAD_ID, seg_mask)};
+    const auto max_thread_id{
+        ComputeMaxThreadId(min_thread_id, big_warp ? upper_clamp : clamp, not_seg_mask)};
 
-    const auto lhs{fmt::format("({}&{})", index, not_seg_mask)};
+    const auto lhs{fmt::format("({}&{})", big_warp ? upper_index : index, not_seg_mask)};
     const auto src_thread_id{fmt::format("({})|({})", lhs, min_thread_id)};
     ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
@@ -128,29 +146,34 @@ void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
 }
 
 void EmitShuffleUp(EmitContext& ctx, IR::Inst& inst, std::string_view value, std::string_view index,
-                   std::string_view clamp, std::string_view segmentation_mask) {
+                   std::string_view clamp, std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
-    const auto src_thread_id{fmt::format("({}-{})", thread_id, index)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
+    const auto src_thread_id{fmt::format("({}-{})", THREAD_ID, index)};
     ctx.Add("shfl_in_bounds=int({})>=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
     ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
 }
 
 void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                     std::string_view index, std::string_view clamp,
-                     std::string_view segmentation_mask) {
+                     std::string_view index, std::string_view clamp, std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
-    const auto src_thread_id{fmt::format("({}+{})", thread_id, index)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
+    const auto src_thread_id{fmt::format("({}+{})", THREAD_ID, index)};
     ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
     ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
@@ -158,14 +181,17 @@ void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
 
 void EmitShuffleButterfly(EmitContext& ctx, IR::Inst& inst, std::string_view value,
                           std::string_view index, std::string_view clamp,
-                          std::string_view segmentation_mask) {
+                          std::string_view seg_mask) {
     if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, segmentation_mask);
+        UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, seg_mask);
         return;
     }
-    const auto thread_id{"gl_SubGroupInvocationARB"};
-    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
-    const auto src_thread_id{fmt::format("({}^{})", thread_id, index)};
+    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
+    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
+    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+
+    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
+    const auto src_thread_id{fmt::format("({}^{})", THREAD_ID, index)};
     ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
     SetInBoundsFlag(ctx, inst);
     ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
diff --git a/src/shader_recompiler/backend/spirv/emit_context.cpp b/src/shader_recompiler/backend/spirv/emit_context.cpp
index 2d29d8c14..2885e6799 100644
--- a/src/shader_recompiler/backend/spirv/emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_context.cpp
@@ -15,6 +15,8 @@
 
 namespace Shader::Backend::SPIRV {
 namespace {
+constexpr size_t NUM_FIXEDFNCTEXTURE = 10;
+
 enum class Operation {
     Increment,
     Decrement,
@@ -427,6 +429,16 @@ Id DescType(EmitContext& ctx, Id sampled_type, Id pointer_type, u32 count) {
         return pointer_type;
     }
 }
+
+size_t FindNextUnusedLocation(const std::bitset<IR::NUM_GENERICS>& used_locations,
+                              size_t start_offset) {
+    for (size_t location = start_offset; location < used_locations.size(); ++location) {
+        if (!used_locations.test(location)) {
+            return location;
+        }
+    }
+    throw RuntimeError("Unable to get an unused location for legacy attribute");
+}
 } // Anonymous namespace
 
 void VectorTypes::Define(Sirit::Module& sirit_ctx, Id base_type, std::string_view name) {
@@ -1227,6 +1239,7 @@ void EmitContext::DefineInputs(const IR::Program& program) {
         loads[IR::Attribute::TessellationEvaluationPointV]) {
         tess_coord = DefineInput(*this, F32[3], false, spv::BuiltIn::TessCoord);
     }
+    std::bitset<IR::NUM_GENERICS> used_locations{};
     for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
         const AttributeType input_type{runtime_info.generic_input_types[index]};
         if (!runtime_info.previous_stage_stores.Generic(index)) {
@@ -1238,6 +1251,7 @@ void EmitContext::DefineInputs(const IR::Program& program) {
         if (input_type == AttributeType::Disabled) {
             continue;
         }
+        used_locations.set(index);
         const Id type{GetAttributeType(*this, input_type)};
         const Id id{DefineInput(*this, type, true)};
         Decorate(id, spv::Decoration::Location, static_cast<u32>(index));
@@ -1263,6 +1277,26 @@ void EmitContext::DefineInputs(const IR::Program& program) {
             break;
         }
     }
+    size_t previous_unused_location = 0;
+    if (loads.AnyComponent(IR::Attribute::ColorFrontDiffuseR)) {
+        const size_t location = FindNextUnusedLocation(used_locations, previous_unused_location);
+        previous_unused_location = location;
+        used_locations.set(location);
+        const Id id{DefineInput(*this, F32[4], true)};
+        Decorate(id, spv::Decoration::Location, location);
+        input_front_color = id;
+    }
+    for (size_t index = 0; index < NUM_FIXEDFNCTEXTURE; ++index) {
+        if (loads.AnyComponent(IR::Attribute::FixedFncTexture0S + index * 4)) {
+            const size_t location =
+                FindNextUnusedLocation(used_locations, previous_unused_location);
+            previous_unused_location = location;
+            used_locations.set(location);
+            const Id id{DefineInput(*this, F32[4], true)};
+            Decorate(id, spv::Decoration::Location, location);
+            input_fixed_fnc_textures[index] = id;
+        }
+    }
     if (stage == Stage::TessellationEval) {
         for (size_t index = 0; index < info.uses_patches.size(); ++index) {
             if (!info.uses_patches[index]) {
@@ -1313,9 +1347,31 @@ void EmitContext::DefineOutputs(const IR::Program& program) {
         viewport_mask = DefineOutput(*this, TypeArray(U32[1], Const(1u)), std::nullopt,
                                      spv::BuiltIn::ViewportMaskNV);
     }
+    std::bitset<IR::NUM_GENERICS> used_locations{};
     for (size_t index = 0; index < IR::NUM_GENERICS; ++index) {
         if (info.stores.Generic(index)) {
             DefineGenericOutput(*this, index, invocations);
+            used_locations.set(index);
+        }
+    }
+    size_t previous_unused_location = 0;
+    if (info.stores.AnyComponent(IR::Attribute::ColorFrontDiffuseR)) {
+        const size_t location = FindNextUnusedLocation(used_locations, previous_unused_location);
+        previous_unused_location = location;
+        used_locations.set(location);
+        const Id id{DefineOutput(*this, F32[4], invocations)};
+        Decorate(id, spv::Decoration::Location, static_cast<u32>(location));
+        output_front_color = id;
+    }
+    for (size_t index = 0; index < NUM_FIXEDFNCTEXTURE; ++index) {
+        if (info.stores.AnyComponent(IR::Attribute::FixedFncTexture0S + index * 4)) {
+            const size_t location =
+                FindNextUnusedLocation(used_locations, previous_unused_location);
+            previous_unused_location = location;
+            used_locations.set(location);
+            const Id id{DefineOutput(*this, F32[4], invocations)};
+            Decorate(id, spv::Decoration::Location, location);
+            output_fixed_fnc_textures[index] = id;
         }
     }
     switch (stage) {
diff --git a/src/shader_recompiler/backend/spirv/emit_context.h b/src/shader_recompiler/backend/spirv/emit_context.h
index e277bc358..847d0c0e6 100644
--- a/src/shader_recompiler/backend/spirv/emit_context.h
+++ b/src/shader_recompiler/backend/spirv/emit_context.h
@@ -268,10 +268,14 @@ public:
     Id write_global_func_u32x4{};
 
     Id input_position{};
+    Id input_front_color{};
+    std::array<Id, 10> input_fixed_fnc_textures{};
     std::array<Id, 32> input_generics{};
 
     Id output_point_size{};
     Id output_position{};
+    Id output_front_color{};
+    std::array<Id, 10> output_fixed_fnc_textures{};
    std::array<std::array<GenericElementInfo, 4>, 32> output_generics{};
 
     Id output_tess_level_outer{};
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 14c77f162..68f360b3c 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -43,6 +43,25 @@ Id AttrPointer(EmitContext& ctx, Id pointer_type, Id vertex, Id base, Args&&...
     }
 }
 
+bool IsFixedFncTexture(IR::Attribute attribute) {
+    return attribute >= IR::Attribute::FixedFncTexture0S &&
+           attribute <= IR::Attribute::FixedFncTexture9Q;
+}
+
+u32 FixedFncTextureAttributeIndex(IR::Attribute attribute) {
+    if (!IsFixedFncTexture(attribute)) {
+        throw InvalidArgument("Attribute {} is not a FixedFncTexture", attribute);
+    }
+    return (static_cast<u32>(attribute) - static_cast<u32>(IR::Attribute::FixedFncTexture0S)) / 4u;
+}
+
+u32 FixedFncTextureAttributeElement(IR::Attribute attribute) {
+    if (!IsFixedFncTexture(attribute)) {
+        throw InvalidArgument("Attribute {} is not a FixedFncTexture", attribute);
+    }
+    return static_cast<u32>(attribute) % 4u;
+}
+
 template <typename... Args>
 Id OutputAccessChain(EmitContext& ctx, Id result_type, Id base, Args&&... args) {
     if (ctx.stage == Stage::TessellationControl) {
@@ -74,6 +93,13 @@ std::optional<OutAttr> OutputAttrPointer(EmitContext& ctx, IR::Attribute attr) {
             return OutputAccessChain(ctx, ctx.output_f32, info.id, index_id);
         }
     }
+    if (IsFixedFncTexture(attr)) {
+        const u32 index{FixedFncTextureAttributeIndex(attr)};
+        const u32 element{FixedFncTextureAttributeElement(attr)};
+        const Id element_id{ctx.Const(element)};
+        return OutputAccessChain(ctx, ctx.output_f32, ctx.output_fixed_fnc_textures[index],
+                                 element_id);
+    }
     switch (attr) {
     case IR::Attribute::PointSize:
         return ctx.output_point_size;
@@ -85,6 +111,14 @@ std::optional<OutAttr> OutputAttrPointer(EmitContext& ctx, IR::Attribute attr) {
         const Id element_id{ctx.Const(element)};
         return OutputAccessChain(ctx, ctx.output_f32, ctx.output_position, element_id);
     }
+    case IR::Attribute::ColorFrontDiffuseR:
+    case IR::Attribute::ColorFrontDiffuseG:
+    case IR::Attribute::ColorFrontDiffuseB:
+    case IR::Attribute::ColorFrontDiffuseA: {
+        const u32 element{static_cast<u32>(attr) % 4};
+        const Id element_id{ctx.Const(element)};
+        return OutputAccessChain(ctx, ctx.output_f32, ctx.output_front_color, element_id);
+    }
     case IR::Attribute::ClipDistance0:
     case IR::Attribute::ClipDistance1:
     case IR::Attribute::ClipDistance2:
@@ -307,6 +341,12 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
         const Id value{ctx.OpLoad(type->id, pointer)};
         return type->needs_cast ? ctx.OpBitcast(ctx.F32[1], value) : value;
     }
+    if (IsFixedFncTexture(attr)) {
+        const u32 index{FixedFncTextureAttributeIndex(attr)};
+        const Id attr_id{ctx.input_fixed_fnc_textures[index]};
+        const Id attr_ptr{AttrPointer(ctx, ctx.input_f32, vertex, attr_id, ctx.Const(element))};
+        return ctx.OpLoad(ctx.F32[1], attr_ptr);
+    }
     switch (attr) {
     case IR::Attribute::PrimitiveId:
         return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.primitive_id));
@@ -316,6 +356,13 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
     case IR::Attribute::PositionW:
         return ctx.OpLoad(ctx.F32[1], AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_position,
                                                   ctx.Const(element)));
+    case IR::Attribute::ColorFrontDiffuseR:
+    case IR::Attribute::ColorFrontDiffuseG:
+    case IR::Attribute::ColorFrontDiffuseB:
+    case IR::Attribute::ColorFrontDiffuseA: {
+        return ctx.OpLoad(ctx.F32[1], AttrPointer(ctx, ctx.input_f32, vertex, ctx.input_front_color,
+                                                  ctx.Const(element)));
+    }
     case IR::Attribute::InstanceId:
         if (ctx.profile.support_vertex_instance_id) {
             return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id));
@@ -333,8 +380,9 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
         return ctx.OpBitcast(ctx.F32[1], ctx.OpISub(ctx.U32[1], index, base));
     }
     case IR::Attribute::FrontFace:
-        return ctx.OpSelect(ctx.U32[1], ctx.OpLoad(ctx.U1, ctx.front_face),
-                            ctx.Const(std::numeric_limits<u32>::max()), ctx.u32_zero_value);
+        return ctx.OpSelect(ctx.F32[1], ctx.OpLoad(ctx.U1, ctx.front_face),
+                            ctx.OpBitcast(ctx.F32[1], ctx.Const(std::numeric_limits<u32>::max())),
+                            ctx.f32_zero_value);
     case IR::Attribute::PointSpriteS:
         return ctx.OpLoad(ctx.F32[1],
                           ctx.OpAccessChain(ctx.input_f32, ctx.point_coord, ctx.u32_zero_value));
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
index 78b1e1ba7..cef52c56e 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
@@ -7,8 +7,13 @@
 
 namespace Shader::Backend::SPIRV {
 namespace {
+Id GetThreadId(EmitContext& ctx) {
+    return ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id);
+}
+
 Id WarpExtract(EmitContext& ctx, Id value) {
-    const Id local_index{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    const Id local_index{ctx.OpShiftRightArithmetic(ctx.U32[1], thread_id, ctx.Const(5U))};
     return ctx.OpVectorExtractDynamic(ctx.U32[1], value, local_index);
 }
 
@@ -48,10 +53,17 @@ Id SelectValue(EmitContext& ctx, Id in_range, Id value, Id src_thread_id) {
     return ctx.OpSelect(ctx.U32[1], in_range,
                         ctx.OpSubgroupReadInvocationKHR(ctx.U32[1], value, src_thread_id), value);
 }
+
+Id GetUpperClamp(EmitContext& ctx, Id invocation_id, Id clamp) {
+    const Id thirty_two{ctx.Const(32u)};
+    const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, invocation_id, thirty_two)};
+    const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
+    return ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
+}
 } // Anonymous namespace
 
 Id EmitLaneId(EmitContext& ctx) {
-    const Id id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id id{GetThreadId(ctx)};
     if (!ctx.profile.warp_size_potentially_larger_than_guest) {
         return id;
     }
@@ -123,7 +135,15 @@ Id EmitSubgroupGeMask(EmitContext& ctx) {
 Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                     Id segmentation_mask) {
     const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)};
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        const Id thirty_two{ctx.Const(32u)};
+        const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, thread_id, thirty_two)};
+        const Id upper_index{ctx.OpIAdd(ctx.U32[1], thirty_two, index)};
+        const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
+        index = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_index, index);
+        clamp = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
+    }
 
     const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)};
     const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)};
@@ -137,7 +157,10 @@ Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id cla
 
 Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                  Id segmentation_mask) {
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        clamp = GetUpperClamp(ctx, thread_id, clamp);
+    }
     const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
     const Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)};
     const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -148,7 +171,10 @@ Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
 
 Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                    Id segmentation_mask) {
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        clamp = GetUpperClamp(ctx, thread_id, clamp);
+    }
     const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
     const Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)};
     const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -159,7 +185,10 @@ Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clam
 
 Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                         Id segmentation_mask) {
-    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
+    const Id thread_id{GetThreadId(ctx)};
+    if (ctx.profile.warp_size_potentially_larger_than_guest) {
+        clamp = GetUpperClamp(ctx, thread_id, clamp);
+    }
     const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
     const Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)};
     const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};
diff --git a/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp b/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
index 8b3e0a15c..69eeaa3e6 100644
--- a/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
+++ b/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
@@ -20,6 +20,7 @@
 #include "shader_recompiler/frontend/maxwell/decode.h"
 #include "shader_recompiler/frontend/maxwell/structured_control_flow.h"
 #include "shader_recompiler/frontend/maxwell/translate/translate.h"
+#include "shader_recompiler/host_translate_info.h"
 #include "shader_recompiler/object_pool.h"
 
 namespace Shader::Maxwell {
@@ -652,7 +653,7 @@ class TranslatePass {
 public:
     TranslatePass(ObjectPool<IR::Inst>& inst_pool_, ObjectPool<IR::Block>& block_pool_,
                   ObjectPool<Statement>& stmt_pool_, Environment& env_, Statement& root_stmt,
-                  IR::AbstractSyntaxList& syntax_list_)
+                  IR::AbstractSyntaxList& syntax_list_, const HostTranslateInfo& host_info)
         : stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_}, env{env_},
           syntax_list{syntax_list_} {
         Visit(root_stmt, nullptr, nullptr);
@@ -660,6 +661,9 @@ public:
         IR::Block& first_block{*syntax_list.front().data.block};
         IR::IREmitter ir(first_block, first_block.begin());
         ir.Prologue();
+        if (uses_demote_to_helper && host_info.needs_demote_reorder) {
+            DemoteCombinationPass();
+        }
     }
 
 private:
@@ -809,7 +813,14 @@ private:
         }
         case StatementType::Return: {
             ensure_block();
-            IR::IREmitter{*current_block}.Epilogue();
+            IR::Block* return_block{block_pool.Create(inst_pool)};
+            IR::IREmitter{*return_block}.Epilogue();
+            current_block->AddBranch(return_block);
+
+            auto& merge{syntax_list.emplace_back()};
+            merge.type = IR::AbstractSyntaxNode::Type::Block;
+            merge.data.block = return_block;
+
             current_block = nullptr;
             syntax_list.emplace_back().type = IR::AbstractSyntaxNode::Type::Return;
             break;
@@ -824,6 +835,7 @@ private:
             auto& merge{syntax_list.emplace_back()};
             merge.type = IR::AbstractSyntaxNode::Type::Block;
             merge.data.block = demote_block;
+            uses_demote_to_helper = true;
             break;
         }
         case StatementType::Unreachable: {
@@ -855,11 +867,117 @@ private:
         return block_pool.Create(inst_pool);
     }
 
+    void DemoteCombinationPass() {
+        using Type = IR::AbstractSyntaxNode::Type;
+        std::vector<IR::Block*> demote_blocks;
+        std::vector<IR::U1> demote_conds;
+        u32 num_epilogues{};
+        u32 branch_depth{};
+        for (const IR::AbstractSyntaxNode& node : syntax_list) {
+            if (node.type == Type::If) {
+                ++branch_depth;
+            }
+            if (node.type == Type::EndIf) {
+                --branch_depth;
+            }
+            if (node.type != Type::Block) {
+                continue;
+            }
+            if (branch_depth > 1) {
+                // Skip reordering nested demote branches.
+                continue;
+            }
+            for (const IR::Inst& inst : node.data.block->Instructions()) {
+                const IR::Opcode op{inst.GetOpcode()};
+                if (op == IR::Opcode::DemoteToHelperInvocation) {
+                    demote_blocks.push_back(node.data.block);
+                    break;
+                }
+                if (op == IR::Opcode::Epilogue) {
+                    ++num_epilogues;
+                }
+            }
+        }
+        if (demote_blocks.size() == 0) {
+            return;
+        }
+        if (num_epilogues > 1) {
+            LOG_DEBUG(Shader, "Combining demotes with more than one return is not implemented.");
+            return;
+        }
+        s64 last_iterator_offset{};
+        auto& asl{syntax_list};
+        for (const IR::Block* demote_block : demote_blocks) {
+            const auto start_it{asl.begin() + last_iterator_offset};
+            auto asl_it{std::find_if(start_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::If && asn.data.if_node.body == demote_block;
+            })};
+            if (asl_it == asl.end()) {
+                // Demote without a conditional branch.
+                // No need to proceed since all fragment instances will be demoted regardless.
+                return;
+            }
+            const IR::Block* const end_if = asl_it->data.if_node.merge;
+            demote_conds.push_back(asl_it->data.if_node.cond);
+            last_iterator_offset = std::distance(asl.begin(), asl_it);
+
+            asl_it = asl.erase(asl_it);
+            asl_it = std::find_if(asl_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::Block && asn.data.block == demote_block;
+            });
+
+            asl_it = asl.erase(asl_it);
+            asl_it = std::find_if(asl_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::EndIf && asn.data.end_if.merge == end_if;
+            });
+            asl_it = asl.erase(asl_it);
+        }
+        const auto epilogue_func{[](const IR::AbstractSyntaxNode& asn) {
+            if (asn.type != Type::Block) {
+                return false;
+            }
+            for (const auto& inst : asn.data.block->Instructions()) {
+                if (inst.GetOpcode() == IR::Opcode::Epilogue) {
+                    return true;
+                }
+            }
+            return false;
+        }};
+        const auto reverse_it{std::find_if(asl.rbegin(), asl.rend(), epilogue_func)};
+        const auto return_block_it{(reverse_it + 1).base()};
+
+        IR::IREmitter ir{*(return_block_it - 1)->data.block};
+        IR::U1 cond(IR::Value(false));
+        for (const auto& demote_cond : demote_conds) {
+            cond = ir.LogicalOr(cond, demote_cond);
+        }
+        cond.Inst()->DestructiveAddUsage(1);
+
+        IR::AbstractSyntaxNode demote_if_node{};
+        demote_if_node.type = Type::If;
+        demote_if_node.data.if_node.cond = cond;
+        demote_if_node.data.if_node.body = demote_blocks[0];
+        demote_if_node.data.if_node.merge = return_block_it->data.block;
+
+        IR::AbstractSyntaxNode demote_node{};
+        demote_node.type = Type::Block;
+        demote_node.data.block = demote_blocks[0];
+
+        IR::AbstractSyntaxNode demote_endif_node{};
+        demote_endif_node.type = Type::EndIf;
+        demote_endif_node.data.end_if.merge = return_block_it->data.block;
+
+        asl.insert(return_block_it, demote_endif_node);
+        asl.insert(return_block_it, demote_node);
+        asl.insert(return_block_it, demote_if_node);
+    }
+
     ObjectPool<Statement>& stmt_pool;
     ObjectPool<IR::Inst>& inst_pool;
     ObjectPool<IR::Block>& block_pool;
     Environment& env;
     IR::AbstractSyntaxList& syntax_list;
+    bool uses_demote_to_helper{};
 
     // TODO: C++20 Remove this when all compilers support constexpr std::vector
 #if __cpp_lib_constexpr_vector >= 201907
@@ -871,12 +989,13 @@ private:
 } // Anonymous namespace
 
 IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
-                                Environment& env, Flow::CFG& cfg) {
+                                Environment& env, Flow::CFG& cfg,
+                                const HostTranslateInfo& host_info) {
     ObjectPool<Statement> stmt_pool{64};
     GotoPass goto_pass{cfg, stmt_pool};
     Statement& root{goto_pass.RootStatement()};
     IR::AbstractSyntaxList syntax_list;
-    TranslatePass{inst_pool, block_pool, stmt_pool, env, root, syntax_list};
+    TranslatePass{inst_pool, block_pool, stmt_pool, env, root, syntax_list, host_info};
     return syntax_list;
 }
 
diff --git a/src/shader_recompiler/frontend/maxwell/structured_control_flow.h b/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
index 88b083649..e38158da3 100644
--- a/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
+++ b/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
@@ -11,10 +11,13 @@
 #include "shader_recompiler/frontend/maxwell/control_flow.h"
 #include "shader_recompiler/object_pool.h"
 
-namespace Shader::Maxwell {
+namespace Shader {
+struct HostTranslateInfo;
+namespace Maxwell {
 
 [[nodiscard]] IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool,
                                               ObjectPool<IR::Block>& block_pool, Environment& env,
-                                              Flow::CFG& cfg);
+                                              Flow::CFG& cfg, const HostTranslateInfo& host_info);
 
-} // namespace Shader::Maxwell
+} // namespace Maxwell
+} // namespace Shader
diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
index c067d459c..012d55357 100644
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -130,7 +130,7 @@ void AddNVNStorageBuffers(IR::Program& program) {
 IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
                              Environment& env, Flow::CFG& cfg, const HostTranslateInfo& host_info) {
     IR::Program program;
-    program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg);
+    program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg, host_info);
     program.blocks = GenerateBlocks(program.syntax_list);
     program.post_order_blocks = PostOrder(program.syntax_list.front());
     program.stage = env.ShaderStage();
diff --git a/src/shader_recompiler/host_translate_info.h b/src/shader_recompiler/host_translate_info.h
index 94a584219..96468b2e7 100644
--- a/src/shader_recompiler/host_translate_info.h
+++ b/src/shader_recompiler/host_translate_info.h
@@ -11,8 +11,9 @@ namespace Shader {
 
 /// Misc information about the host
 struct HostTranslateInfo {
-    bool support_float16{}; ///< True when the device supports 16-bit floats
-    bool support_int64{};   ///< True when the device supports 64-bit integers
+    bool support_float16{};      ///< True when the device supports 16-bit floats
+    bool support_int64{};        ///< True when the device supports 64-bit integers
+    bool needs_demote_reorder{}; ///< True when the device needs DemoteToHelperInvocation reordered
 };
 
 } // namespace Shader