summary refs log tree commit diff stats
path: root/src/shader_recompiler/ir_opt
diff options
context:
space:
mode:
author lat9nq <lat9nq@gmail.com> 2021-07-25 21:31:33 +0200
committer GitHub <noreply@github.com> 2021-07-25 21:31:33 +0200
commit 09d6cc99435322c5f480eaa2b0967e33f4966ba6 (patch)
tree 72cdf06f6b7d77fdf5826104fea691f3ea450f54 /src/shader_recompiler/ir_opt
parent configuration: Use combobox apply template where possible (diff)
parent Merge pull request #6575 from FernandoS27/new_settings (diff)
download yuzu-09d6cc99435322c5f480eaa2b0967e33f4966ba6.tar
yuzu-09d6cc99435322c5f480eaa2b0967e33f4966ba6.tar.gz
yuzu-09d6cc99435322c5f480eaa2b0967e33f4966ba6.tar.bz2
yuzu-09d6cc99435322c5f480eaa2b0967e33f4966ba6.tar.lz
yuzu-09d6cc99435322c5f480eaa2b0967e33f4966ba6.tar.xz
yuzu-09d6cc99435322c5f480eaa2b0967e33f4966ba6.tar.zst
yuzu-09d6cc99435322c5f480eaa2b0967e33f4966ba6.zip
Diffstat (limited to 'src/shader_recompiler/ir_opt')
-rw-r--r-- src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp 928
-rw-r--r-- src/shader_recompiler/ir_opt/constant_propagation_pass.cpp 610
-rw-r--r-- src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp 26
-rw-r--r-- src/shader_recompiler/ir_opt/dual_vertex_pass.cpp 30
-rw-r--r-- src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp 526
-rw-r--r-- src/shader_recompiler/ir_opt/identity_removal_pass.cpp 38
-rw-r--r-- src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp 143
-rw-r--r-- src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp 218
-rw-r--r-- src/shader_recompiler/ir_opt/passes.h 32
-rw-r--r-- src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp 383
-rw-r--r-- src/shader_recompiler/ir_opt/texture_pass.cpp 523
-rw-r--r-- src/shader_recompiler/ir_opt/verification_pass.cpp 98
12 files changed, 3555 insertions, 0 deletions
diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
new file mode 100644
index 000000000..5ead930f1
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
@@ -0,0 +1,928 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/alignment.h"
+#include "shader_recompiler/environment.h"
+#include "shader_recompiler/frontend/ir/modifiers.h"
+#include "shader_recompiler/frontend/ir/program.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+#include "shader_recompiler/shader_info.h"
+
+namespace Shader::Optimization {
+namespace {
+// Registers constant buffer `index` in `info` the first time it is seen.
+// The new descriptor is inserted at the lower-bound position by index, so a list that is
+// already ordered by index stays ordered. Only `count == 1` is supported; any other value
+// throws NotImplementedException.
+void AddConstantBufferDescriptor(Info& info, u32 index, u32 count) {
+    if (count != 1) {
+        throw NotImplementedException("Constant buffer descriptor indexing");
+    }
+    const auto index_bit{1U << index};
+    const bool already_registered{(info.constant_buffer_mask & index_bit) != 0};
+    if (already_registered) {
+        return;
+    }
+    info.constant_buffer_mask |= index_bit;
+
+    auto& descriptors{info.constant_buffer_descriptors};
+    const auto insert_position{
+        std::ranges::lower_bound(descriptors, index, {}, &ConstantBufferDescriptor::index)};
+    descriptors.insert(insert_position, ConstantBufferDescriptor{
+                                            .index = index,
+                                            .count = 1,
+                                        });
+}
+
+// Marks the generic patch read by the shader as used.
+// Reading any non-generic patch is unsupported and throws NotImplementedException.
+void GetPatch(Info& info, IR::Patch patch) {
+    if (IR::IsGeneric(patch)) {
+        info.uses_patches.at(IR::GenericPatchIndex(patch)) = true;
+        return;
+    }
+    throw NotImplementedException("Reading non-generic patch {}", patch);
+}
+
+// Records a patch write: generic patches are marked as used, the tessellation LOD patches
+// set the outer/inner tessellation level flags, and any other patch throws
+// NotImplementedException.
+void SetPatch(Info& info, IR::Patch patch) {
+    if (!IR::IsGeneric(patch)) {
+        switch (patch) {
+        case IR::Patch::TessellationLodLeft:
+        case IR::Patch::TessellationLodTop:
+        case IR::Patch::TessellationLodRight:
+        case IR::Patch::TessellationLodBottom:
+            info.stores_tess_level_outer = true;
+            return;
+        case IR::Patch::TessellationLodInteriorU:
+        case IR::Patch::TessellationLodInteriorV:
+            info.stores_tess_level_inner = true;
+            return;
+        default:
+            throw NotImplementedException("Set patch {}", patch);
+        }
+    }
+    info.uses_patches.at(IR::GenericPatchIndex(patch)) = true;
+}
+
+// Updates `info.nvn_buffer_used` for a constant buffer read.
+// - A non-immediate buffer index or a non-immediate offset into buffer 0 marks every slot.
+// - Reads from any buffer other than 0 are ignored.
+// - An immediate offset inside the 16 descriptors of 0x10 bytes that start at
+//   `info.nvn_buffer_base` marks the single slot it falls in.
+void CheckCBufNVN(Info& info, IR::Inst& inst) {
+    const IR::Value binding{inst.Arg(0)};
+    if (!binding.IsImmediate()) {
+        info.nvn_buffer_used.set();
+        return;
+    }
+    if (binding.U32() != 0) {
+        return;
+    }
+    const IR::Value offset_value{inst.Arg(1)};
+    if (!offset_value.IsImmediate()) {
+        info.nvn_buffer_used.set();
+        return;
+    }
+    constexpr u32 DESCRIPTOR_SIZE{0x10};
+    const u32 offset{offset_value.U32()};
+    const u32 range_end{info.nvn_buffer_base + DESCRIPTOR_SIZE * 16};
+    if (offset < info.nvn_buffer_base || offset >= range_end) {
+        return;
+    }
+    const std::size_t slot{(offset - info.nvn_buffer_base) / DESCRIPTOR_SIZE};
+    info.nvn_buffer_used.set(slot, true);
+}
+
+// Inspects a single IR instruction and records in `info` every feature it requires.
+// The opcode is matched by several independent switches in sequence because one opcode can
+// belong to more than one category (for example ConvertF16S8 needs both fp16 and int8):
+//   1. 16-bit / 64-bit floating point usage
+//   2. 8-bit integer usage
+//   3. 16-bit integer usage
+//   4. 64-bit integer usage
+//   5. global memory loads and stores
+//   6. everything else (attributes, constant buffers, images, subgroups, atomics, ...)
+// Throws NotImplementedException for constant buffer reads with a non-immediate buffer index
+// and propagates whatever GetPatch/SetPatch throw.
+void VisitUsages(Info& info, IR::Inst& inst) {
+    // 1. Floating point widths.
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::CompositeConstructF16x2:
+    case IR::Opcode::CompositeConstructF16x3:
+    case IR::Opcode::CompositeConstructF16x4:
+    case IR::Opcode::CompositeExtractF16x2:
+    case IR::Opcode::CompositeExtractF16x3:
+    case IR::Opcode::CompositeExtractF16x4:
+    case IR::Opcode::CompositeInsertF16x2:
+    case IR::Opcode::CompositeInsertF16x3:
+    case IR::Opcode::CompositeInsertF16x4:
+    case IR::Opcode::SelectF16:
+    case IR::Opcode::BitCastU16F16:
+    case IR::Opcode::BitCastF16U16:
+    case IR::Opcode::PackFloat2x16:
+    case IR::Opcode::UnpackFloat2x16:
+    case IR::Opcode::ConvertS16F16:
+    case IR::Opcode::ConvertS32F16:
+    case IR::Opcode::ConvertS64F16:
+    case IR::Opcode::ConvertU16F16:
+    case IR::Opcode::ConvertU32F16:
+    case IR::Opcode::ConvertU64F16:
+    case IR::Opcode::ConvertF16S8:
+    case IR::Opcode::ConvertF16S16:
+    case IR::Opcode::ConvertF16S32:
+    case IR::Opcode::ConvertF16S64:
+    case IR::Opcode::ConvertF16U8:
+    case IR::Opcode::ConvertF16U16:
+    case IR::Opcode::ConvertF16U32:
+    case IR::Opcode::ConvertF16U64:
+    case IR::Opcode::FPAbs16:
+    case IR::Opcode::FPAdd16:
+    case IR::Opcode::FPCeil16:
+    case IR::Opcode::FPFloor16:
+    case IR::Opcode::FPFma16:
+    case IR::Opcode::FPMul16:
+    case IR::Opcode::FPNeg16:
+    case IR::Opcode::FPRoundEven16:
+    case IR::Opcode::FPSaturate16:
+    case IR::Opcode::FPClamp16:
+    case IR::Opcode::FPTrunc16:
+    case IR::Opcode::FPOrdEqual16:
+    case IR::Opcode::FPUnordEqual16:
+    case IR::Opcode::FPOrdNotEqual16:
+    case IR::Opcode::FPUnordNotEqual16:
+    case IR::Opcode::FPOrdLessThan16:
+    case IR::Opcode::FPUnordLessThan16:
+    case IR::Opcode::FPOrdGreaterThan16:
+    case IR::Opcode::FPUnordGreaterThan16:
+    case IR::Opcode::FPOrdLessThanEqual16:
+    case IR::Opcode::FPUnordLessThanEqual16:
+    case IR::Opcode::FPOrdGreaterThanEqual16:
+    case IR::Opcode::FPUnordGreaterThanEqual16:
+    case IR::Opcode::FPIsNan16:
+    case IR::Opcode::GlobalAtomicAddF16x2:
+    case IR::Opcode::GlobalAtomicMinF16x2:
+    case IR::Opcode::GlobalAtomicMaxF16x2:
+    case IR::Opcode::StorageAtomicAddF16x2:
+    case IR::Opcode::StorageAtomicMinF16x2:
+    case IR::Opcode::StorageAtomicMaxF16x2:
+        info.uses_fp16 = true;
+        break;
+    case IR::Opcode::CompositeConstructF64x2:
+    case IR::Opcode::CompositeConstructF64x3:
+    case IR::Opcode::CompositeConstructF64x4:
+    case IR::Opcode::CompositeExtractF64x2:
+    case IR::Opcode::CompositeExtractF64x3:
+    case IR::Opcode::CompositeExtractF64x4:
+    case IR::Opcode::CompositeInsertF64x2:
+    case IR::Opcode::CompositeInsertF64x3:
+    case IR::Opcode::CompositeInsertF64x4:
+    case IR::Opcode::SelectF64:
+    case IR::Opcode::BitCastU64F64:
+    case IR::Opcode::BitCastF64U64:
+    case IR::Opcode::PackDouble2x32:
+    case IR::Opcode::UnpackDouble2x32:
+    case IR::Opcode::FPAbs64:
+    case IR::Opcode::FPAdd64:
+    case IR::Opcode::FPCeil64:
+    case IR::Opcode::FPFloor64:
+    case IR::Opcode::FPFma64:
+    case IR::Opcode::FPMax64:
+    case IR::Opcode::FPMin64:
+    case IR::Opcode::FPMul64:
+    case IR::Opcode::FPNeg64:
+    case IR::Opcode::FPRecip64:
+    case IR::Opcode::FPRecipSqrt64:
+    case IR::Opcode::FPRoundEven64:
+    case IR::Opcode::FPSaturate64:
+    case IR::Opcode::FPClamp64:
+    case IR::Opcode::FPTrunc64:
+    case IR::Opcode::FPOrdEqual64:
+    case IR::Opcode::FPUnordEqual64:
+    case IR::Opcode::FPOrdNotEqual64:
+    case IR::Opcode::FPUnordNotEqual64:
+    case IR::Opcode::FPOrdLessThan64:
+    case IR::Opcode::FPUnordLessThan64:
+    case IR::Opcode::FPOrdGreaterThan64:
+    case IR::Opcode::FPUnordGreaterThan64:
+    case IR::Opcode::FPOrdLessThanEqual64:
+    case IR::Opcode::FPUnordLessThanEqual64:
+    case IR::Opcode::FPOrdGreaterThanEqual64:
+    case IR::Opcode::FPUnordGreaterThanEqual64:
+    case IR::Opcode::FPIsNan64:
+    case IR::Opcode::ConvertS16F64:
+    case IR::Opcode::ConvertS32F64:
+    case IR::Opcode::ConvertS64F64:
+    case IR::Opcode::ConvertU16F64:
+    case IR::Opcode::ConvertU32F64:
+    case IR::Opcode::ConvertU64F64:
+    case IR::Opcode::ConvertF32F64:
+    case IR::Opcode::ConvertF64F32:
+    case IR::Opcode::ConvertF64S8:
+    case IR::Opcode::ConvertF64S16:
+    case IR::Opcode::ConvertF64S32:
+    case IR::Opcode::ConvertF64S64:
+    case IR::Opcode::ConvertF64U8:
+    case IR::Opcode::ConvertF64U16:
+    case IR::Opcode::ConvertF64U32:
+    case IR::Opcode::ConvertF64U64:
+        info.uses_fp64 = true;
+        break;
+    default:
+        break;
+    }
+    // 2. 8-bit integers.
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::GetCbufU8:
+    case IR::Opcode::GetCbufS8:
+    case IR::Opcode::UndefU8:
+    case IR::Opcode::LoadGlobalU8:
+    case IR::Opcode::LoadGlobalS8:
+    case IR::Opcode::WriteGlobalU8:
+    case IR::Opcode::WriteGlobalS8:
+    case IR::Opcode::LoadStorageU8:
+    case IR::Opcode::LoadStorageS8:
+    case IR::Opcode::WriteStorageU8:
+    case IR::Opcode::WriteStorageS8:
+    case IR::Opcode::LoadSharedU8:
+    case IR::Opcode::LoadSharedS8:
+    case IR::Opcode::WriteSharedU8:
+    case IR::Opcode::SelectU8:
+    case IR::Opcode::ConvertF16S8:
+    case IR::Opcode::ConvertF16U8:
+    case IR::Opcode::ConvertF32S8:
+    case IR::Opcode::ConvertF32U8:
+    case IR::Opcode::ConvertF64S8:
+    case IR::Opcode::ConvertF64U8:
+        info.uses_int8 = true;
+        break;
+    default:
+        break;
+    }
+    // 3. 16-bit integers.
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::GetCbufU16:
+    case IR::Opcode::GetCbufS16:
+    case IR::Opcode::UndefU16:
+    case IR::Opcode::LoadGlobalU16:
+    case IR::Opcode::LoadGlobalS16:
+    case IR::Opcode::WriteGlobalU16:
+    case IR::Opcode::WriteGlobalS16:
+    case IR::Opcode::LoadStorageU16:
+    case IR::Opcode::LoadStorageS16:
+    case IR::Opcode::WriteStorageU16:
+    case IR::Opcode::WriteStorageS16:
+    case IR::Opcode::LoadSharedU16:
+    case IR::Opcode::LoadSharedS16:
+    case IR::Opcode::WriteSharedU16:
+    case IR::Opcode::SelectU16:
+    case IR::Opcode::BitCastU16F16:
+    case IR::Opcode::BitCastF16U16:
+    case IR::Opcode::ConvertS16F16:
+    case IR::Opcode::ConvertS16F32:
+    case IR::Opcode::ConvertS16F64:
+    case IR::Opcode::ConvertU16F16:
+    case IR::Opcode::ConvertU16F32:
+    case IR::Opcode::ConvertU16F64:
+    case IR::Opcode::ConvertF16S16:
+    case IR::Opcode::ConvertF16U16:
+    case IR::Opcode::ConvertF32S16:
+    case IR::Opcode::ConvertF32U16:
+    case IR::Opcode::ConvertF64S16:
+    case IR::Opcode::ConvertF64U16:
+        info.uses_int16 = true;
+        break;
+    default:
+        break;
+    }
+    // 4. 64-bit integers.
+    // NOTE(review): the unsigned ConvertF*U64 opcodes are listed here but the signed
+    // ConvertF*S64 ones are not - confirm whether that is intentional.
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::UndefU64:
+    case IR::Opcode::LoadGlobalU8:
+    case IR::Opcode::LoadGlobalS8:
+    case IR::Opcode::LoadGlobalU16:
+    case IR::Opcode::LoadGlobalS16:
+    case IR::Opcode::LoadGlobal32:
+    case IR::Opcode::LoadGlobal64:
+    case IR::Opcode::LoadGlobal128:
+    case IR::Opcode::WriteGlobalU8:
+    case IR::Opcode::WriteGlobalS8:
+    case IR::Opcode::WriteGlobalU16:
+    case IR::Opcode::WriteGlobalS16:
+    case IR::Opcode::WriteGlobal32:
+    case IR::Opcode::WriteGlobal64:
+    case IR::Opcode::WriteGlobal128:
+    case IR::Opcode::SelectU64:
+    case IR::Opcode::BitCastU64F64:
+    case IR::Opcode::BitCastF64U64:
+    case IR::Opcode::PackUint2x32:
+    case IR::Opcode::UnpackUint2x32:
+    case IR::Opcode::IAdd64:
+    case IR::Opcode::ISub64:
+    case IR::Opcode::INeg64:
+    case IR::Opcode::ShiftLeftLogical64:
+    case IR::Opcode::ShiftRightLogical64:
+    case IR::Opcode::ShiftRightArithmetic64:
+    case IR::Opcode::ConvertS64F16:
+    case IR::Opcode::ConvertS64F32:
+    case IR::Opcode::ConvertS64F64:
+    case IR::Opcode::ConvertU64F16:
+    case IR::Opcode::ConvertU64F32:
+    case IR::Opcode::ConvertU64F64:
+    case IR::Opcode::ConvertU64U32:
+    case IR::Opcode::ConvertU32U64:
+    case IR::Opcode::ConvertF16U64:
+    case IR::Opcode::ConvertF32U64:
+    case IR::Opcode::ConvertF64U64:
+    case IR::Opcode::SharedAtomicExchange64:
+    case IR::Opcode::GlobalAtomicIAdd64:
+    case IR::Opcode::GlobalAtomicSMin64:
+    case IR::Opcode::GlobalAtomicUMin64:
+    case IR::Opcode::GlobalAtomicSMax64:
+    case IR::Opcode::GlobalAtomicUMax64:
+    case IR::Opcode::GlobalAtomicAnd64:
+    case IR::Opcode::GlobalAtomicOr64:
+    case IR::Opcode::GlobalAtomicXor64:
+    case IR::Opcode::GlobalAtomicExchange64:
+    case IR::Opcode::StorageAtomicIAdd64:
+    case IR::Opcode::StorageAtomicSMin64:
+    case IR::Opcode::StorageAtomicUMin64:
+    case IR::Opcode::StorageAtomicSMax64:
+    case IR::Opcode::StorageAtomicUMax64:
+    case IR::Opcode::StorageAtomicAnd64:
+    case IR::Opcode::StorageAtomicOr64:
+    case IR::Opcode::StorageAtomicXor64:
+    case IR::Opcode::StorageAtomicExchange64:
+        info.uses_int64 = true;
+        break;
+    default:
+        break;
+    }
+    // 5. Global memory. Writes and atomics set the store flag and then fall through to the
+    // bookkeeping shared with plain loads.
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::WriteGlobalU8:
+    case IR::Opcode::WriteGlobalS8:
+    case IR::Opcode::WriteGlobalU16:
+    case IR::Opcode::WriteGlobalS16:
+    case IR::Opcode::WriteGlobal32:
+    case IR::Opcode::WriteGlobal64:
+    case IR::Opcode::WriteGlobal128:
+    case IR::Opcode::GlobalAtomicIAdd32:
+    case IR::Opcode::GlobalAtomicSMin32:
+    case IR::Opcode::GlobalAtomicUMin32:
+    case IR::Opcode::GlobalAtomicSMax32:
+    case IR::Opcode::GlobalAtomicUMax32:
+    case IR::Opcode::GlobalAtomicInc32:
+    case IR::Opcode::GlobalAtomicDec32:
+    case IR::Opcode::GlobalAtomicAnd32:
+    case IR::Opcode::GlobalAtomicOr32:
+    case IR::Opcode::GlobalAtomicXor32:
+    case IR::Opcode::GlobalAtomicExchange32:
+    case IR::Opcode::GlobalAtomicIAdd64:
+    case IR::Opcode::GlobalAtomicSMin64:
+    case IR::Opcode::GlobalAtomicUMin64:
+    case IR::Opcode::GlobalAtomicSMax64:
+    case IR::Opcode::GlobalAtomicUMax64:
+    case IR::Opcode::GlobalAtomicAnd64:
+    case IR::Opcode::GlobalAtomicOr64:
+    case IR::Opcode::GlobalAtomicXor64:
+    case IR::Opcode::GlobalAtomicExchange64:
+    case IR::Opcode::GlobalAtomicAddF32:
+    case IR::Opcode::GlobalAtomicAddF16x2:
+    case IR::Opcode::GlobalAtomicAddF32x2:
+    case IR::Opcode::GlobalAtomicMinF16x2:
+    case IR::Opcode::GlobalAtomicMinF32x2:
+    case IR::Opcode::GlobalAtomicMaxF16x2:
+    case IR::Opcode::GlobalAtomicMaxF32x2:
+        info.stores_global_memory = true;
+        [[fallthrough]];
+    case IR::Opcode::LoadGlobalU8:
+    case IR::Opcode::LoadGlobalS8:
+    case IR::Opcode::LoadGlobalU16:
+    case IR::Opcode::LoadGlobalS16:
+    case IR::Opcode::LoadGlobal32:
+    case IR::Opcode::LoadGlobal64:
+    case IR::Opcode::LoadGlobal128:
+        info.uses_int64 = true;
+        info.uses_global_memory = true;
+        info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2;
+        info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4;
+        break;
+    default:
+        break;
+    }
+    // 6. Remaining per-opcode features.
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::DemoteToHelperInvocation:
+        info.uses_demote_to_helper_invocation = true;
+        break;
+    case IR::Opcode::GetAttribute:
+        info.loads.mask[static_cast<size_t>(inst.Arg(0).Attribute())] = true;
+        break;
+    case IR::Opcode::SetAttribute:
+        info.stores.mask[static_cast<size_t>(inst.Arg(0).Attribute())] = true;
+        break;
+    case IR::Opcode::GetPatch:
+        GetPatch(info, inst.Arg(0).Patch());
+        break;
+    case IR::Opcode::SetPatch:
+        SetPatch(info, inst.Arg(0).Patch());
+        break;
+    case IR::Opcode::GetAttributeIndexed:
+        info.loads_indexed_attributes = true;
+        break;
+    case IR::Opcode::SetAttributeIndexed:
+        info.stores_indexed_attributes = true;
+        break;
+    case IR::Opcode::SetFragColor:
+        info.stores_frag_color[inst.Arg(0).U32()] = true;
+        break;
+    case IR::Opcode::SetSampleMask:
+        info.stores_sample_mask = true;
+        break;
+    case IR::Opcode::SetFragDepth:
+        info.stores_frag_depth = true;
+        break;
+    case IR::Opcode::WorkgroupId:
+        info.uses_workgroup_id = true;
+        break;
+    case IR::Opcode::LocalInvocationId:
+        info.uses_local_invocation_id = true;
+        break;
+    case IR::Opcode::InvocationId:
+        info.uses_invocation_id = true;
+        break;
+    case IR::Opcode::SampleId:
+        info.uses_sample_id = true;
+        break;
+    case IR::Opcode::IsHelperInvocation:
+        info.uses_is_helper_invocation = true;
+        break;
+    case IR::Opcode::LaneId:
+        info.uses_subgroup_invocation_id = true;
+        break;
+    case IR::Opcode::ShuffleIndex:
+    case IR::Opcode::ShuffleUp:
+    case IR::Opcode::ShuffleDown:
+    case IR::Opcode::ShuffleButterfly:
+        info.uses_subgroup_shuffles = true;
+        break;
+    case IR::Opcode::GetCbufU8:
+    case IR::Opcode::GetCbufS8:
+    case IR::Opcode::GetCbufU16:
+    case IR::Opcode::GetCbufS16:
+    case IR::Opcode::GetCbufU32:
+    case IR::Opcode::GetCbufF32:
+    case IR::Opcode::GetCbufU32x2: {
+        const IR::Value index{inst.Arg(0)};
+        const IR::Value offset{inst.Arg(1)};
+        if (!index.IsImmediate()) {
+            throw NotImplementedException("Constant buffer with non-immediate index");
+        }
+        AddConstantBufferDescriptor(info, index.U32(), 1);
+        // Record the element type that is read and its size in bytes.
+        u32 element_size{};
+        switch (inst.GetOpcode()) {
+        case IR::Opcode::GetCbufU8:
+        case IR::Opcode::GetCbufS8:
+            info.used_constant_buffer_types |= IR::Type::U8;
+            element_size = 1;
+            break;
+        case IR::Opcode::GetCbufU16:
+        case IR::Opcode::GetCbufS16:
+            info.used_constant_buffer_types |= IR::Type::U16;
+            element_size = 2;
+            break;
+        case IR::Opcode::GetCbufU32:
+            info.used_constant_buffer_types |= IR::Type::U32;
+            element_size = 4;
+            break;
+        case IR::Opcode::GetCbufF32:
+            info.used_constant_buffer_types |= IR::Type::F32;
+            element_size = 4;
+            break;
+        case IR::Opcode::GetCbufU32x2:
+            info.used_constant_buffer_types |= IR::Type::U32x2;
+            element_size = 8;
+            break;
+        default:
+            break;
+        }
+        // Grow the tracked size to cover this read, rounded up to 16 bytes. With a dynamic
+        // offset the extent is unknown, so a fixed 0x10'000 bytes is recorded instead.
+        u32& size{info.constant_buffer_used_sizes[index.U32()]};
+        if (offset.IsImmediate()) {
+            size = Common::AlignUp(std::max(size, offset.U32() + element_size), 16u);
+        } else {
+            size = 0x10'000;
+        }
+        break;
+    }
+    case IR::Opcode::BindlessImageSampleImplicitLod:
+    case IR::Opcode::BindlessImageSampleExplicitLod:
+    case IR::Opcode::BindlessImageSampleDrefImplicitLod:
+    case IR::Opcode::BindlessImageSampleDrefExplicitLod:
+    case IR::Opcode::BindlessImageGather:
+    case IR::Opcode::BindlessImageGatherDref:
+    case IR::Opcode::BindlessImageFetch:
+    case IR::Opcode::BindlessImageQueryDimensions:
+    case IR::Opcode::BindlessImageQueryLod:
+    case IR::Opcode::BindlessImageGradient:
+    case IR::Opcode::BoundImageSampleImplicitLod:
+    case IR::Opcode::BoundImageSampleExplicitLod:
+    case IR::Opcode::BoundImageSampleDrefImplicitLod:
+    case IR::Opcode::BoundImageSampleDrefExplicitLod:
+    case IR::Opcode::BoundImageGather:
+    case IR::Opcode::BoundImageGatherDref:
+    case IR::Opcode::BoundImageFetch:
+    case IR::Opcode::BoundImageQueryDimensions:
+    case IR::Opcode::BoundImageQueryLod:
+    case IR::Opcode::BoundImageGradient:
+    case IR::Opcode::ImageGather:
+    case IR::Opcode::ImageGatherDref:
+    case IR::Opcode::ImageFetch:
+    case IR::Opcode::ImageQueryDimensions:
+    case IR::Opcode::ImageGradient: {
+        const TextureType type{inst.Flags<IR::TextureInstInfo>().type};
+        info.uses_sampled_1d |= type == TextureType::Color1D || type == TextureType::ColorArray1D;
+        info.uses_sparse_residency |=
+            inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr;
+        break;
+    }
+    case IR::Opcode::ImageSampleImplicitLod:
+    case IR::Opcode::ImageSampleExplicitLod:
+    case IR::Opcode::ImageSampleDrefImplicitLod:
+    case IR::Opcode::ImageSampleDrefExplicitLod:
+    case IR::Opcode::ImageQueryLod: {
+        const auto flags{inst.Flags<IR::TextureInstInfo>()};
+        const TextureType type{flags.type};
+        info.uses_sampled_1d |= type == TextureType::Color1D || type == TextureType::ColorArray1D;
+        info.uses_shadow_lod |= flags.is_depth != 0;
+        info.uses_sparse_residency |=
+            inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr;
+        break;
+    }
+    case IR::Opcode::ImageRead: {
+        const auto flags{inst.Flags<IR::TextureInstInfo>()};
+        info.uses_typeless_image_reads |= flags.image_format == ImageFormat::Typeless;
+        info.uses_sparse_residency |=
+            inst.GetAssociatedPseudoOperation(IR::Opcode::GetSparseFromOp) != nullptr;
+        break;
+    }
+    case IR::Opcode::ImageWrite: {
+        const auto flags{inst.Flags<IR::TextureInstInfo>()};
+        info.uses_typeless_image_writes |= flags.image_format == ImageFormat::Typeless;
+        info.uses_image_buffers |= flags.type == TextureType::Buffer;
+        break;
+    }
+    case IR::Opcode::SubgroupEqMask:
+    case IR::Opcode::SubgroupLtMask:
+    case IR::Opcode::SubgroupLeMask:
+    case IR::Opcode::SubgroupGtMask:
+    case IR::Opcode::SubgroupGeMask:
+        info.uses_subgroup_mask = true;
+        break;
+    case IR::Opcode::VoteAll:
+    case IR::Opcode::VoteAny:
+    case IR::Opcode::VoteEqual:
+    case IR::Opcode::SubgroupBallot:
+        info.uses_subgroup_vote = true;
+        break;
+    case IR::Opcode::FSwizzleAdd:
+        info.uses_fswzadd = true;
+        break;
+    case IR::Opcode::DPdxFine:
+    case IR::Opcode::DPdyFine:
+    case IR::Opcode::DPdxCoarse:
+    case IR::Opcode::DPdyCoarse:
+        info.uses_derivatives = true;
+        break;
+    case IR::Opcode::LoadStorageU8:
+    case IR::Opcode::LoadStorageS8:
+    case IR::Opcode::WriteStorageU8:
+    case IR::Opcode::WriteStorageS8:
+        info.used_storage_buffer_types |= IR::Type::U8;
+        break;
+    case IR::Opcode::LoadStorageU16:
+    case IR::Opcode::LoadStorageS16:
+    case IR::Opcode::WriteStorageU16:
+    case IR::Opcode::WriteStorageS16:
+        info.used_storage_buffer_types |= IR::Type::U16;
+        break;
+    case IR::Opcode::LoadStorage32:
+    case IR::Opcode::WriteStorage32:
+    case IR::Opcode::StorageAtomicIAdd32:
+    case IR::Opcode::StorageAtomicUMin32:
+    case IR::Opcode::StorageAtomicUMax32:
+    case IR::Opcode::StorageAtomicAnd32:
+    case IR::Opcode::StorageAtomicOr32:
+    case IR::Opcode::StorageAtomicXor32:
+    case IR::Opcode::StorageAtomicExchange32:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        break;
+    case IR::Opcode::LoadStorage64:
+    case IR::Opcode::WriteStorage64:
+        info.used_storage_buffer_types |= IR::Type::U32x2;
+        break;
+    case IR::Opcode::LoadStorage128:
+    case IR::Opcode::WriteStorage128:
+        info.used_storage_buffer_types |= IR::Type::U32x4;
+        break;
+    case IR::Opcode::SharedAtomicSMin32:
+        info.uses_atomic_s32_min = true;
+        break;
+    case IR::Opcode::SharedAtomicSMax32:
+        info.uses_atomic_s32_max = true;
+        break;
+    case IR::Opcode::SharedAtomicInc32:
+        info.uses_shared_increment = true;
+        break;
+    case IR::Opcode::SharedAtomicDec32:
+        info.uses_shared_decrement = true;
+        break;
+    case IR::Opcode::SharedAtomicExchange64:
+        info.uses_int64_bit_atomics = true;
+        break;
+    case IR::Opcode::GlobalAtomicInc32:
+    case IR::Opcode::StorageAtomicInc32:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_global_increment = true;
+        break;
+    case IR::Opcode::GlobalAtomicDec32:
+    case IR::Opcode::StorageAtomicDec32:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_global_decrement = true;
+        break;
+    case IR::Opcode::GlobalAtomicAddF32:
+    case IR::Opcode::StorageAtomicAddF32:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_atomic_f32_add = true;
+        break;
+    case IR::Opcode::GlobalAtomicAddF16x2:
+    case IR::Opcode::StorageAtomicAddF16x2:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_atomic_f16x2_add = true;
+        break;
+    case IR::Opcode::GlobalAtomicAddF32x2:
+    case IR::Opcode::StorageAtomicAddF32x2:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_atomic_f32x2_add = true;
+        break;
+    case IR::Opcode::GlobalAtomicMinF16x2:
+    case IR::Opcode::StorageAtomicMinF16x2:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_atomic_f16x2_min = true;
+        break;
+    case IR::Opcode::GlobalAtomicMinF32x2:
+    case IR::Opcode::StorageAtomicMinF32x2:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_atomic_f32x2_min = true;
+        break;
+    case IR::Opcode::GlobalAtomicMaxF16x2:
+    case IR::Opcode::StorageAtomicMaxF16x2:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_atomic_f16x2_max = true;
+        break;
+    case IR::Opcode::GlobalAtomicMaxF32x2:
+    case IR::Opcode::StorageAtomicMaxF32x2:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_atomic_f32x2_max = true;
+        break;
+    case IR::Opcode::StorageAtomicSMin32:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_atomic_s32_min = true;
+        break;
+    case IR::Opcode::StorageAtomicSMax32:
+        info.used_storage_buffer_types |= IR::Type::U32;
+        info.uses_atomic_s32_max = true;
+        break;
+    case IR::Opcode::GlobalAtomicIAdd64:
+    case IR::Opcode::GlobalAtomicSMin64:
+    case IR::Opcode::GlobalAtomicUMin64:
+    case IR::Opcode::GlobalAtomicSMax64:
+    case IR::Opcode::GlobalAtomicUMax64:
+    case IR::Opcode::GlobalAtomicAnd64:
+    case IR::Opcode::GlobalAtomicOr64:
+    case IR::Opcode::GlobalAtomicXor64:
+    case IR::Opcode::GlobalAtomicExchange64:
+    case IR::Opcode::StorageAtomicIAdd64:
+    case IR::Opcode::StorageAtomicSMin64:
+    case IR::Opcode::StorageAtomicUMin64:
+    case IR::Opcode::StorageAtomicSMax64:
+    case IR::Opcode::StorageAtomicUMax64:
+    case IR::Opcode::StorageAtomicAnd64:
+    case IR::Opcode::StorageAtomicOr64:
+    case IR::Opcode::StorageAtomicXor64:
+    // The storage exchange is handled like its global counterpart above, so a shader that
+    // only uses it still records the U64 storage type and the 64-bit atomics flag.
+    case IR::Opcode::StorageAtomicExchange64:
+        info.used_storage_buffer_types |= IR::Type::U64;
+        info.uses_int64_bit_atomics = true;
+        break;
+    case IR::Opcode::BindlessImageAtomicIAdd32:
+    case IR::Opcode::BindlessImageAtomicSMin32:
+    case IR::Opcode::BindlessImageAtomicUMin32:
+    case IR::Opcode::BindlessImageAtomicSMax32:
+    case IR::Opcode::BindlessImageAtomicUMax32:
+    case IR::Opcode::BindlessImageAtomicInc32:
+    case IR::Opcode::BindlessImageAtomicDec32:
+    case IR::Opcode::BindlessImageAtomicAnd32:
+    case IR::Opcode::BindlessImageAtomicOr32:
+    case IR::Opcode::BindlessImageAtomicXor32:
+    case IR::Opcode::BindlessImageAtomicExchange32:
+    case IR::Opcode::BoundImageAtomicIAdd32:
+    case IR::Opcode::BoundImageAtomicSMin32:
+    case IR::Opcode::BoundImageAtomicUMin32:
+    case IR::Opcode::BoundImageAtomicSMax32:
+    case IR::Opcode::BoundImageAtomicUMax32:
+    case IR::Opcode::BoundImageAtomicInc32:
+    case IR::Opcode::BoundImageAtomicDec32:
+    case IR::Opcode::BoundImageAtomicAnd32:
+    case IR::Opcode::BoundImageAtomicOr32:
+    case IR::Opcode::BoundImageAtomicXor32:
+    case IR::Opcode::BoundImageAtomicExchange32:
+    case IR::Opcode::ImageAtomicIAdd32:
+    case IR::Opcode::ImageAtomicSMin32:
+    case IR::Opcode::ImageAtomicUMin32:
+    case IR::Opcode::ImageAtomicSMax32:
+    case IR::Opcode::ImageAtomicUMax32:
+    case IR::Opcode::ImageAtomicInc32:
+    case IR::Opcode::ImageAtomicDec32:
+    case IR::Opcode::ImageAtomicAnd32:
+    case IR::Opcode::ImageAtomicOr32:
+    case IR::Opcode::ImageAtomicXor32:
+    case IR::Opcode::ImageAtomicExchange32:
+        info.uses_atomic_image_u32 = true;
+        break;
+    default:
+        break;
+    }
+}
+
+// Records which denormal handling modes the instruction asks for, per float width.
+// FTZ and FMZ set the "denorms flush" flag, None sets the "denorms preserve" flag and
+// DontCare leaves both untouched. Opcodes outside the two lists below are ignored.
+void VisitFpModifiers(Info& info, IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::FPAdd16:
+    case IR::Opcode::FPFma16:
+    case IR::Opcode::FPMul16:
+    case IR::Opcode::FPRoundEven16:
+    case IR::Opcode::FPFloor16:
+    case IR::Opcode::FPCeil16:
+    case IR::Opcode::FPTrunc16: {
+        const auto fp_control{inst.Flags<IR::FpControl>()};
+        if (fp_control.fmz_mode == IR::FmzMode::FTZ || fp_control.fmz_mode == IR::FmzMode::FMZ) {
+            info.uses_fp16_denorms_flush = true;
+        } else if (fp_control.fmz_mode == IR::FmzMode::None) {
+            info.uses_fp16_denorms_preserve = true;
+        }
+        return;
+    }
+    case IR::Opcode::FPAdd32:
+    case IR::Opcode::FPFma32:
+    case IR::Opcode::FPMul32:
+    case IR::Opcode::FPRoundEven32:
+    case IR::Opcode::FPFloor32:
+    case IR::Opcode::FPCeil32:
+    case IR::Opcode::FPTrunc32:
+    case IR::Opcode::FPOrdEqual32:
+    case IR::Opcode::FPUnordEqual32:
+    case IR::Opcode::FPOrdNotEqual32:
+    case IR::Opcode::FPUnordNotEqual32:
+    case IR::Opcode::FPOrdLessThan32:
+    case IR::Opcode::FPUnordLessThan32:
+    case IR::Opcode::FPOrdGreaterThan32:
+    case IR::Opcode::FPUnordGreaterThan32:
+    case IR::Opcode::FPOrdLessThanEqual32:
+    case IR::Opcode::FPUnordLessThanEqual32:
+    case IR::Opcode::FPOrdGreaterThanEqual32:
+    case IR::Opcode::FPUnordGreaterThanEqual32:
+    case IR::Opcode::ConvertF16F32:
+    case IR::Opcode::ConvertF64F32: {
+        const auto fp_control{inst.Flags<IR::FpControl>()};
+        if (fp_control.fmz_mode == IR::FmzMode::FTZ || fp_control.fmz_mode == IR::FmzMode::FMZ) {
+            info.uses_fp32_denorms_flush = true;
+        } else if (fp_control.fmz_mode == IR::FmzMode::None) {
+            info.uses_fp32_denorms_preserve = true;
+        }
+        return;
+    }
+    default:
+        return;
+    }
+}
+
+// Forwards every constant buffer read opcode to CheckCBufNVN; all other opcodes are ignored.
+void VisitCbufs(Info& info, IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::GetCbufU8:
+    case IR::Opcode::GetCbufS8:
+    case IR::Opcode::GetCbufU16:
+    case IR::Opcode::GetCbufS16:
+    case IR::Opcode::GetCbufU32:
+    case IR::Opcode::GetCbufF32:
+    case IR::Opcode::GetCbufU32x2:
+        CheckCBufNVN(info, inst);
+        return;
+    default:
+        return;
+    }
+}
+
+// Runs every per-instruction collector on `inst`, accumulating the results into `info`:
+// feature usage (VisitUsages), floating point denorm modes (VisitFpModifiers) and the
+// constant buffer check for NVN buffers (VisitCbufs).
+void Visit(Info& info, IR::Inst& inst) {
+ VisitUsages(info, inst);
+ VisitFpModifiers(info, inst);
+ VisitCbufs(info, inst);
+}
+
+// Fills the load/store attribute masks from the shader program header when the shader
+// accesses attributes through indexed reads or writes.
+// - Compute shaders are skipped before the header is queried.
+// - Fragment shaders with indexed loads take their generic inputs from the pixel imap.
+// - Other stages with indexed loads take generics, clip distances and system values from
+//   the header's input fields; with indexed stores the output masks are filled the same
+//   way from the omap fields.
+void GatherInfoFromHeader(Environment& env, Info& info) {
+    const Stage stage{env.ShaderStage()};
+    if (stage == Stage::Compute) {
+        return;
+    }
+    const auto& header{env.SPH()};
+    if (stage == Stage::Fragment) {
+        if (info.loads_indexed_attributes) {
+            for (size_t generic = 0; generic < IR::NUM_GENERICS; ++generic) {
+                const size_t base{static_cast<size_t>(IR::Attribute::Generic0X) + generic * 4};
+                const auto imap{header.ps.imap_generic_vector[generic]};
+                info.loads.mask[base + 0] = imap.x != PixelImap::Unused;
+                info.loads.mask[base + 1] = imap.y != PixelImap::Unused;
+                info.loads.mask[base + 2] = imap.z != PixelImap::Unused;
+                info.loads.mask[base + 3] = imap.w != PixelImap::Unused;
+            }
+        }
+        return;
+    }
+    if (info.loads_indexed_attributes) {
+        for (size_t generic = 0; generic < IR::NUM_GENERICS; ++generic) {
+            const IR::Attribute first_component{IR::Attribute::Generic0X + generic * 4};
+            const auto enabled{header.vtg.InputGeneric(generic)};
+            for (size_t component = 0; component < 4; ++component) {
+                info.loads.Set(first_component + component, enabled[component]);
+            }
+        }
+        const u16 input_clip_mask{header.vtg.clip_distances};
+        for (size_t plane = 0; plane < 8; ++plane) {
+            info.loads.Set(IR::Attribute::ClipDistance0 + plane,
+                           ((input_clip_mask >> plane) & 1) != 0);
+        }
+        info.loads.Set(IR::Attribute::PrimitiveId, header.vtg.imap_systemb.primitive_array_id != 0);
+        info.loads.Set(IR::Attribute::Layer, header.vtg.imap_systemb.rt_array_index != 0);
+        info.loads.Set(IR::Attribute::ViewportIndex, header.vtg.imap_systemb.viewport_index != 0);
+        info.loads.Set(IR::Attribute::PointSize, header.vtg.imap_systemb.point_size != 0);
+        info.loads.Set(IR::Attribute::PositionX, header.vtg.imap_systemb.position_x != 0);
+        info.loads.Set(IR::Attribute::PositionY, header.vtg.imap_systemb.position_y != 0);
+        info.loads.Set(IR::Attribute::PositionZ, header.vtg.imap_systemb.position_z != 0);
+        info.loads.Set(IR::Attribute::PositionW, header.vtg.imap_systemb.position_w != 0);
+        info.loads.Set(IR::Attribute::PointSpriteS, header.vtg.point_sprite_s != 0);
+        info.loads.Set(IR::Attribute::PointSpriteT, header.vtg.point_sprite_t != 0);
+        info.loads.Set(IR::Attribute::FogCoordinate, header.vtg.fog_coordinate != 0);
+        info.loads.Set(IR::Attribute::TessellationEvaluationPointU,
+                       header.vtg.tessellation_eval_point_u != 0);
+        info.loads.Set(IR::Attribute::TessellationEvaluationPointV,
+                       header.vtg.tessellation_eval_point_v != 0);
+        info.loads.Set(IR::Attribute::InstanceId, header.vtg.instance_id != 0);
+        info.loads.Set(IR::Attribute::VertexId, header.vtg.vertex_id != 0);
+        // TODO: Legacy varyings
+    }
+    if (info.stores_indexed_attributes) {
+        for (size_t generic = 0; generic < IR::NUM_GENERICS; ++generic) {
+            const IR::Attribute first_component{IR::Attribute::Generic0X + generic * 4};
+            const auto enabled{header.vtg.OutputGeneric(generic)};
+            for (size_t component = 0; component < 4; ++component) {
+                info.stores.Set(first_component + component, enabled[component]);
+            }
+        }
+        const u16 output_clip_mask{header.vtg.omap_systemc.clip_distances};
+        for (size_t plane = 0; plane < 8; ++plane) {
+            info.stores.Set(IR::Attribute::ClipDistance0 + plane,
+                            ((output_clip_mask >> plane) & 1) != 0);
+        }
+        info.stores.Set(IR::Attribute::PrimitiveId,
+                        header.vtg.omap_systemb.primitive_array_id != 0);
+        info.stores.Set(IR::Attribute::Layer, header.vtg.omap_systemb.rt_array_index != 0);
+        info.stores.Set(IR::Attribute::ViewportIndex, header.vtg.omap_systemb.viewport_index != 0);
+        info.stores.Set(IR::Attribute::PointSize, header.vtg.omap_systemb.point_size != 0);
+        info.stores.Set(IR::Attribute::PositionX, header.vtg.omap_systemb.position_x != 0);
+        info.stores.Set(IR::Attribute::PositionY, header.vtg.omap_systemb.position_y != 0);
+        info.stores.Set(IR::Attribute::PositionZ, header.vtg.omap_systemb.position_z != 0);
+        info.stores.Set(IR::Attribute::PositionW, header.vtg.omap_systemb.position_w != 0);
+        info.stores.Set(IR::Attribute::PointSpriteS, header.vtg.omap_systemc.point_sprite_s != 0);
+        info.stores.Set(IR::Attribute::PointSpriteT, header.vtg.omap_systemc.point_sprite_t != 0);
+        info.stores.Set(IR::Attribute::FogCoordinate, header.vtg.omap_systemc.fog_coordinate != 0);
+        info.stores.Set(IR::Attribute::TessellationEvaluationPointU,
+                        header.vtg.omap_systemc.tessellation_eval_point_u != 0);
+        info.stores.Set(IR::Attribute::TessellationEvaluationPointV,
+                        header.vtg.omap_systemc.tessellation_eval_point_v != 0);
+        info.stores.Set(IR::Attribute::InstanceId, header.vtg.omap_systemc.instance_id != 0);
+        info.stores.Set(IR::Attribute::VertexId, header.vtg.omap_systemc.vertex_id != 0);
+        // TODO: Legacy varyings
+    }
+}
+} // Anonymous namespace
+
+// Populates `program.info` for the whole program:
+//   1. picks the NVN buffer base offset that belongs to the program's stage,
+//   2. visits every instruction of every block in `program.post_order_blocks`,
+//   3. completes the attribute masks from the shader header.
+// Throws InvalidArgument when the stage is not one of the known enumerators.
+void CollectShaderInfoPass(Environment& env, IR::Program& program) {
+    Info& info{program.info};
+    const auto nvn_base_for_stage{[](const Stage stage) -> u32 {
+        switch (stage) {
+        case Stage::VertexA:
+        case Stage::VertexB:
+            return 0x110u;
+        case Stage::TessellationControl:
+            return 0x210u;
+        case Stage::TessellationEval:
+        case Stage::Compute:
+            // Both stages use the same base offset.
+            return 0x310u;
+        case Stage::Geometry:
+            return 0x410u;
+        case Stage::Fragment:
+            return 0x510u;
+        }
+        throw InvalidArgument("Invalid stage {}", stage);
+    }};
+    info.nvn_buffer_base = nvn_base_for_stage(program.stage);
+
+    for (IR::Block* const block : program.post_order_blocks) {
+        for (IR::Inst& instruction : block->Instructions()) {
+            Visit(info, instruction);
+        }
+    }
+    GatherInfoFromHeader(env, info);
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
new file mode 100644
index 000000000..8dd6d6c2c
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
@@ -0,0 +1,610 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <tuple>
+#include <type_traits>
+
+#include "common/bit_cast.h"
+#include "common/bit_util.h"
+#include "shader_recompiler/exception.h"
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+namespace {
+// Metaprogramming stuff to get arguments information out of a lambda
+// Primary template: strips references from Func and forwards to the specialization below using
+// the type of the closure's call operator.
+template <typename Func>
+struct LambdaTraits : LambdaTraits<decltype(&std::remove_reference_t<Func>::operator())> {};
+
+// Specialization matching a const, non-generic call operator `ReturnType operator()(Args...) const`
+template <typename ReturnType, typename LambdaType, typename... Args>
+struct LambdaTraits<ReturnType (LambdaType::*)(Args...) const> {
+ // Type of the I-th parameter of the call operator
+ template <size_t I>
+ using ArgType = std::tuple_element_t<I, std::tuple<Args...>>;
+
+ // Number of parameters the call operator takes
+ static constexpr size_t NUM_ARGS{sizeof...(Args)};
+};
+
+/// Reads an immediate IR value as the requested C++ type.
+/// Supported types are bool, u32, s32 (reinterpreted from the stored u32), f32 and u64.
+/// Any other type is rejected at compile time instead of silently flowing off the end of a
+/// non-void function.
+template <typename T>
+[[nodiscard]] T Arg(const IR::Value& value) {
+    if constexpr (std::is_same_v<T, bool>) {
+        return value.U1();
+    } else if constexpr (std::is_same_v<T, u32>) {
+        return value.U32();
+    } else if constexpr (std::is_same_v<T, s32>) {
+        return static_cast<s32>(value.U32());
+    } else if constexpr (std::is_same_v<T, f32>) {
+        return value.F32();
+    } else if constexpr (std::is_same_v<T, u64>) {
+        return value.U64();
+    } else {
+        // The condition depends on T so it only fires when this branch is instantiated
+        static_assert(!std::is_same_v<T, T>, "Arg<T> does not support this immediate type");
+    }
+}
+
+/// Folds a commutative binary instruction using imm_fn to combine immediates of type T.
+/// - Both operands immediate: every use is replaced with the computed constant, returns false.
+/// - One operand immediate: the immediate is merged with the immediate second argument of an
+///   operand instruction that has the same opcode when there is one; otherwise a leading
+///   immediate is moved to the second argument. Returns true.
+/// - No immediates: nothing is changed, returns true.
+template <typename T, typename ImmFn>
+bool FoldCommutative(IR::Inst& inst, ImmFn&& imm_fn) {
+    const IR::Value first{inst.Arg(0)};
+    const IR::Value second{inst.Arg(1)};
+    const bool first_is_imm{first.IsImmediate()};
+    const bool second_is_imm{second.IsImmediate()};
+
+    if (first_is_imm && second_is_imm) {
+        const auto folded{imm_fn(Arg<T>(first), Arg<T>(second))};
+        inst.ReplaceUsesWith(IR::Value{folded});
+        return false;
+    }
+    if (first_is_imm) {
+        IR::Inst* const producer{second.InstRecursive()};
+        const bool mergeable{producer->GetOpcode() == inst.GetOpcode() &&
+                             producer->Arg(1).IsImmediate()};
+        if (mergeable) {
+            const auto merged{imm_fn(Arg<T>(first), Arg<T>(producer->Arg(1)))};
+            inst.SetArg(0, producer->Arg(0));
+            inst.SetArg(1, IR::Value{merged});
+        } else {
+            // Normalize: keep the immediate on the right-hand side
+            inst.SetArg(0, second);
+            inst.SetArg(1, first);
+        }
+        return true;
+    }
+    if (second_is_imm) {
+        const IR::Inst* const producer{first.InstRecursive()};
+        const bool mergeable{producer->GetOpcode() == inst.GetOpcode() &&
+                             producer->Arg(1).IsImmediate()};
+        if (mergeable) {
+            const auto merged{imm_fn(Arg<T>(second), Arg<T>(producer->Arg(1)))};
+            inst.SetArg(0, producer->Arg(0));
+            inst.SetArg(1, IR::Value{merged});
+        }
+    }
+    return true;
+}
+
+/// Evaluates func over the instruction arguments when all of them are immediates and the
+/// instruction has no associated pseudo-operation, replacing every use with the result.
+/// Returns true when the instruction was folded.
+template <typename Func>
+bool FoldWhenAllImmediates(IR::Inst& inst, Func&& func) {
+    const bool foldable{inst.AreAllArgsImmediates() && !inst.HasAssociatedPseudoOperation()};
+    if (!foldable) {
+        return false;
+    }
+    // One index per parameter of the lambda, used to unpack the instruction arguments
+    using ArgIndices = std::make_index_sequence<LambdaTraits<decltype(func)>::NUM_ARGS>;
+    const IR::Value folded{EvalImmediates(inst, func, ArgIndices{})};
+    inst.ReplaceUsesWith(folded);
+    return true;
+}
+
+/// Replaces reads of the RZ register with the immediate zero
+void FoldGetRegister(IR::Inst& inst) {
+    const bool reads_rz{inst.Arg(0).Reg() == IR::Reg::RZ};
+    if (!reads_rz) {
+        return;
+    }
+    inst.ReplaceUsesWith(IR::Value{u32{0}});
+}
+
+/// Replaces reads of the PT predicate with the immediate true
+void FoldGetPred(IR::Inst& inst) {
+    const bool reads_pt{inst.Arg(0).Pred() == IR::Pred::PT};
+    if (!reads_pt) {
+        return;
+    }
+    inst.ReplaceUsesWith(IR::Value{true});
+}
+
+/// Replaces the pattern generated by two XMAD multiplications
+/// @param block Block owning inst; the replacement IMul32 is emitted at inst's position
+/// @param inst  IAdd32 instruction to inspect
+/// @return True when the pattern matched and the uses of inst were replaced
+bool FoldXmadMultiply(IR::Block& block, IR::Inst& inst) {
+ /*
+ * We are looking for this pattern:
+ * %rhs_bfe = BitFieldUExtract %factor_a, #0, #16
+ * %rhs_mul = IMul32 %rhs_bfe, %factor_b
+ * %lhs_bfe = BitFieldUExtract %factor_a, #16, #16
+ * %lhs_mul = IMul32 %lhs_bfe, %factor_b
+ * %lhs_shl = ShiftLeftLogical32 %lhs_mul, #16
+ * %result = IAdd32 %lhs_shl, %rhs_mul
+ *
+ * And replacing it with
+ * %result = IMul32 %factor_a, %factor_b
+ *
+ * This optimization has been proven safe by LLVM and MSVC.
+ */
+ const IR::Value lhs_arg{inst.Arg(0)};
+ const IR::Value rhs_arg{inst.Arg(1)};
+ if (lhs_arg.IsImmediate() || rhs_arg.IsImmediate()) {
+ return false;
+ }
+ // The left operand has to be the upper half product shifted left by 16 bits
+ IR::Inst* const lhs_shl{lhs_arg.InstRecursive()};
+ if (lhs_shl->GetOpcode() != IR::Opcode::ShiftLeftLogical32 ||
+ lhs_shl->Arg(1) != IR::Value{16U}) {
+ return false;
+ }
+ if (lhs_shl->Arg(0).IsImmediate()) {
+ return false;
+ }
+ // Both halves have to be multiplications sharing the same second factor
+ IR::Inst* const lhs_mul{lhs_shl->Arg(0).InstRecursive()};
+ IR::Inst* const rhs_mul{rhs_arg.InstRecursive()};
+ if (lhs_mul->GetOpcode() != IR::Opcode::IMul32 || rhs_mul->GetOpcode() != IR::Opcode::IMul32) {
+ return false;
+ }
+ if (lhs_mul->Arg(1).Resolve() != rhs_mul->Arg(1).Resolve()) {
+ return false;
+ }
+ const IR::U32 factor_b{lhs_mul->Arg(1)};
+ if (lhs_mul->Arg(0).IsImmediate() || rhs_mul->Arg(0).IsImmediate()) {
+ return false;
+ }
+ // The first factors have to be the upper and lower 16 bits extracted from the same value
+ IR::Inst* const lhs_bfe{lhs_mul->Arg(0).InstRecursive()};
+ IR::Inst* const rhs_bfe{rhs_mul->Arg(0).InstRecursive()};
+ if (lhs_bfe->GetOpcode() != IR::Opcode::BitFieldUExtract) {
+ return false;
+ }
+ if (rhs_bfe->GetOpcode() != IR::Opcode::BitFieldUExtract) {
+ return false;
+ }
+ if (lhs_bfe->Arg(1) != IR::Value{16U} || lhs_bfe->Arg(2) != IR::Value{16U}) {
+ return false;
+ }
+ if (rhs_bfe->Arg(1) != IR::Value{0U} || rhs_bfe->Arg(2) != IR::Value{16U}) {
+ return false;
+ }
+ if (lhs_bfe->Arg(0).Resolve() != rhs_bfe->Arg(0).Resolve()) {
+ return false;
+ }
+ const IR::U32 factor_a{lhs_bfe->Arg(0)};
+ // Emit the full 32-bit multiplication at the position of the addition
+ IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+ inst.ReplaceUsesWith(ir.IMul(factor_a, factor_b));
+ return true;
+}
+
+/// Folds an integer addition of type T: merges immediates, drops additions of zero and, for
+/// 32-bit additions, tries to recognize the XMAD multiplication pattern.
+/// Instructions with an associated pseudo-operation are left untouched.
+template <typename T>
+void FoldAdd(IR::Block& block, IR::Inst& inst) {
+    if (inst.HasAssociatedPseudoOperation()) {
+        return;
+    }
+    const bool still_an_add{FoldCommutative<T>(inst, [](T a, T b) { return a + b; })};
+    if (!still_an_add) {
+        return;
+    }
+    const IR::Value addend{inst.Arg(1)};
+    const bool adds_zero{addend.IsImmediate() && Arg<T>(addend) == 0};
+    if (adds_zero) {
+        // x + 0 == x
+        inst.ReplaceUsesWith(inst.Arg(0));
+        return;
+    }
+    if constexpr (std::is_same_v<T, u32>) {
+        // The result is irrelevant here: nothing else is attempted after this fold
+        static_cast<void>(FoldXmadMultiply(block, inst));
+    }
+}
+
+/// Folds 32-bit integer subtractions.
+/// - Two immediates are evaluated at compile time.
+/// - cbuf - cbuf (same constant buffer index and offset) becomes zero.
+/// - (x + cbuf) - cbuf becomes x.
+/// The last fold is only valid when the addition is the minuend: cbuf - (x + cbuf) equals -x, so
+/// the operands of the subtraction are never swapped to find a match.
+void FoldISub32(IR::Inst& inst) {
+    if (FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a - b; })) {
+        return;
+    }
+    if (inst.Arg(0).IsImmediate() || inst.Arg(1).IsImmediate()) {
+        return;
+    }
+    // ISub32 is generally used to subtract two constant buffers, compare and replace this with
+    // zero if they equal.
+    const auto equal_cbuf{[](IR::Inst* a, IR::Inst* b) {
+        return a->GetOpcode() == IR::Opcode::GetCbufU32 &&
+               b->GetOpcode() == IR::Opcode::GetCbufU32 && a->Arg(0) == b->Arg(0) &&
+               a->Arg(1) == b->Arg(1);
+    }};
+    IR::Inst* const op_a{inst.Arg(0).InstRecursive()};
+    IR::Inst* const op_b{inst.Arg(1).InstRecursive()};
+    if (equal_cbuf(op_a, op_b)) {
+        inst.ReplaceUsesWith(IR::Value{u32{0}});
+        return;
+    }
+    // It's also possible a value is being added to a cbuf and then subtracted.
+    // Subtraction is not commutative, so the addition has to be the left operand.
+    if (op_a->GetOpcode() != IR::Opcode::IAdd32) {
+        return;
+    }
+    if (op_b->GetOpcode() != IR::Opcode::GetCbufU32) {
+        return;
+    }
+    IR::Inst* const inst_cbuf{op_b};
+    IR::Value add_op_a{op_a->Arg(0)};
+    IR::Value add_op_b{op_a->Arg(1)};
+    if (add_op_b.IsImmediate()) {
+        // Canonicalize: addition is commutative, look for the cbuf on the second operand
+        std::swap(add_op_a, add_op_b);
+    }
+    if (add_op_b.IsImmediate()) {
+        return;
+    }
+    IR::Inst* const add_cbuf{add_op_b.InstRecursive()};
+    if (equal_cbuf(add_cbuf, inst_cbuf)) {
+        inst.ReplaceUsesWith(add_op_a);
+    }
+}
+
+/// Replaces a select with an immediate condition by the operand that condition picks
+void FoldSelect(IR::Inst& inst) {
+    const IR::Value condition{inst.Arg(0)};
+    if (!condition.IsImmediate()) {
+        return;
+    }
+    if (condition.U1()) {
+        inst.ReplaceUsesWith(inst.Arg(1));
+    } else {
+        inst.ReplaceUsesWith(inst.Arg(2));
+    }
+}
+
+/// Folds interpolation operations of the form (x * attr) * (1 / attr) into x when both attr
+/// operands are GetAttribute reads of the same attribute.
+/// Nothing is folded when the instruction is flagged with no_contraction.
+void FoldFPMul32(IR::Inst& inst) {
+    if (inst.Flags<IR::FpControl>().no_contraction) {
+        return;
+    }
+    const IR::Value product_value{inst.Arg(0)};
+    const IR::Value recip_value{inst.Arg(1)};
+    if (product_value.IsImmediate() || recip_value.IsImmediate()) {
+        return;
+    }
+    // Expect an inner multiplication on the left and a reciprocal on the right
+    IR::Inst* const inner_mul{product_value.InstRecursive()};
+    IR::Inst* const recip{recip_value.InstRecursive()};
+    const bool shape_matches{inner_mul->GetOpcode() == IR::Opcode::FPMul32 &&
+                             recip->GetOpcode() == IR::Opcode::FPRecip32};
+    if (!shape_matches) {
+        return;
+    }
+    const IR::Value divisor{recip->Arg(0)};
+    const IR::Value multiplier{inner_mul->Arg(1).Resolve()};
+    if (divisor.IsImmediate() || multiplier.IsImmediate()) {
+        return;
+    }
+    IR::Inst* const divisor_attr{divisor.InstRecursive()};
+    IR::Inst* const multiplier_attr{multiplier.InstRecursive()};
+    const bool both_attributes{divisor_attr->GetOpcode() == IR::Opcode::GetAttribute &&
+                               multiplier_attr->GetOpcode() == IR::Opcode::GetAttribute};
+    if (!both_attributes) {
+        return;
+    }
+    if (divisor_attr->Arg(0).Attribute() != multiplier_attr->Arg(0).Attribute()) {
+        return;
+    }
+    // The multiplication and the division by the same attribute cancel each other
+    inst.ReplaceUsesWith(inner_mul->Arg(0));
+}
+
+/// Folds logical AND: two immediates are evaluated, "x && true" becomes x and "x && false"
+/// becomes false
+void FoldLogicalAnd(IR::Inst& inst) {
+    const bool keep_folding{FoldCommutative<bool>(inst, [](bool a, bool b) { return a && b; })};
+    if (!keep_folding) {
+        return;
+    }
+    const IR::Value constant{inst.Arg(1)};
+    if (!constant.IsImmediate()) {
+        return;
+    }
+    if (!constant.U1()) {
+        inst.ReplaceUsesWith(IR::Value{false});
+        return;
+    }
+    inst.ReplaceUsesWith(inst.Arg(0));
+}
+
+/// Folds logical OR: two immediates are evaluated, "x || true" becomes true and "x || false"
+/// becomes x
+void FoldLogicalOr(IR::Inst& inst) {
+    const bool keep_folding{FoldCommutative<bool>(inst, [](bool a, bool b) { return a || b; })};
+    if (!keep_folding) {
+        return;
+    }
+    const IR::Value constant{inst.Arg(1)};
+    if (!constant.IsImmediate()) {
+        return;
+    }
+    if (!constant.U1()) {
+        inst.ReplaceUsesWith(inst.Arg(0));
+        return;
+    }
+    inst.ReplaceUsesWith(IR::Value{true});
+}
+
+/// Folds logical NOT: immediates are negated at compile time and a double negation is replaced
+/// with the original operand
+void FoldLogicalNot(IR::Inst& inst) {
+    const IR::U1 operand{inst.Arg(0)};
+    if (!operand.IsImmediate()) {
+        IR::Inst* const producer{operand.InstRecursive()};
+        if (producer->GetOpcode() == IR::Opcode::LogicalNot) {
+            // !!x == x
+            inst.ReplaceUsesWith(producer->Arg(0));
+        }
+        return;
+    }
+    inst.ReplaceUsesWith(IR::Value{!operand.U1()});
+}
+
+/// Folds a bit cast from Source to Dest.
+/// - Immediates are bit cast at compile time.
+/// - A cast applied on top of its reverse cast is replaced with the original value.
+/// - For BitCastF32U32 over a GetCbufU32 read, the cast itself is turned into a GetCbufF32
+///   reading the same constant buffer index and offset.
+/// @param reverse Opcode of the cast that undoes this one
+template <IR::Opcode op, typename Dest, typename Source>
+void FoldBitCast(IR::Inst& inst, IR::Opcode reverse) {
+    const IR::Value operand{inst.Arg(0)};
+    if (operand.IsImmediate()) {
+        inst.ReplaceUsesWith(IR::Value{Common::BitCast<Dest>(Arg<Source>(operand))});
+        return;
+    }
+    IR::Inst* const producer{operand.InstRecursive()};
+    const IR::Opcode producer_opcode{producer->GetOpcode()};
+    if (producer_opcode == reverse) {
+        inst.ReplaceUsesWith(producer->Arg(0));
+        return;
+    }
+    if constexpr (op == IR::Opcode::BitCastF32U32) {
+        if (producer_opcode != IR::Opcode::GetCbufU32) {
+            return;
+        }
+        // Replace the bitcast with a typed constant buffer read
+        inst.ReplaceOpcode(IR::Opcode::GetCbufF32);
+        inst.SetArg(0, producer->Arg(0));
+        inst.SetArg(1, producer->Arg(1));
+    }
+}
+
+/// Replaces an operation applied on top of its inverse with the original value
+/// @param reverse Opcode of the operation that undoes inst
+void FoldInverseFunc(IR::Inst& inst, IR::Opcode reverse) {
+    const IR::Value operand{inst.Arg(0)};
+    if (!operand.IsImmediate()) {
+        IR::Inst* const producer{operand.InstRecursive()};
+        if (producer->GetOpcode() == reverse) {
+            inst.ReplaceUsesWith(producer->Arg(0));
+        }
+    }
+}
+
+/// Calls func with the arguments of inst and wraps the result in an IR::Value.
+/// Argument I of the instruction is read through Arg<> using the type of the I-th parameter of
+/// func's call operator, as reported by LambdaTraits.
+template <typename Func, size_t... I>
+IR::Value EvalImmediates(const IR::Inst& inst, Func&& func, std::index_sequence<I...>) {
+ using Traits = LambdaTraits<decltype(func)>;
+ return IR::Value{func(Arg<typename Traits::template ArgType<I>>(inst.Arg(I))...)};
+}
+
+/// Walks a chain of composite inserts looking for the element stored at first_index.
+/// @param inst_value  Non-immediate composite value to inspect
+/// @param insert      Opcode of the matching composite insert
+/// @param construct   Opcode of the matching composite construct
+/// @param first_index Element index being extracted
+/// @return The element when it can be determined, std::nullopt otherwise
+std::optional<IR::Value> FoldCompositeExtractImpl(IR::Value inst_value, IR::Opcode insert,
+                                                  IR::Opcode construct, u32 first_index) {
+    for (;;) {
+        IR::Inst* const producer{inst_value.InstRecursive()};
+        const IR::Opcode producer_opcode{producer->GetOpcode()};
+        if (producer_opcode == construct) {
+            // The constructor takes one argument per element
+            return producer->Arg(first_index);
+        }
+        if (producer_opcode != insert) {
+            return std::nullopt;
+        }
+        const IR::Value inserted_index{producer->Arg(2)};
+        if (!inserted_index.IsImmediate()) {
+            return std::nullopt;
+        }
+        if (inserted_index.U32() == first_index) {
+            // This insert wrote the element being extracted
+            return producer->Arg(1);
+        }
+        // A different element was written, keep looking in the composite it was inserted into
+        const IR::Value parent_composite{producer->Arg(0)};
+        if (parent_composite.IsImmediate()) {
+            return std::nullopt;
+        }
+        inst_value = parent_composite;
+    }
+}
+
+/// Replaces a composite extract with the element it reads when that element can be found by
+/// following composite construct/insert instructions. Requires a non-immediate composite and an
+/// immediate index.
+void FoldCompositeExtract(IR::Inst& inst, IR::Opcode construct, IR::Opcode insert) {
+    const IR::Value composite{inst.Arg(0)};
+    const IR::Value element_index{inst.Arg(1)};
+    if (composite.IsImmediate() || !element_index.IsImmediate()) {
+        return;
+    }
+    const u32 first_index{element_index.U32()};
+    const std::optional element{FoldCompositeExtractImpl(composite, insert, construct, first_index)};
+    if (element.has_value()) {
+        inst.ReplaceUsesWith(*element);
+    }
+}
+
+/// Returns the resolved operand of value when value is produced by expected_cast.
+/// Immediates and values produced by any other opcode are returned unchanged.
+IR::Value GetThroughCast(IR::Value value, IR::Opcode expected_cast) {
+    if (!value.IsImmediate()) {
+        IR::Inst* const producer{value.InstRecursive()};
+        if (producer->GetOpcode() == expected_cast) {
+            return producer->Arg(0).Resolve();
+        }
+    }
+    return value;
+}
+
+/// Replaces an FSwizzleAdd whose first operand is a ShuffleButterfly of its second operand with
+/// a fine derivative instruction.
+/// Swizzle 0x99 with shuffle index 1 becomes DPdxFine, swizzle 0xA5 with shuffle index 2 becomes
+/// DPdyFine. The shuffle is required to have clamp 3 and segmentation mask 28.
+void FoldFSwizzleAdd(IR::Block& block, IR::Inst& inst) {
+    const IR::Value swizzle{inst.Arg(2)};
+    if (!swizzle.IsImmediate()) {
+        return;
+    }
+    // Look at both operands ignoring the bit casts wrapping them
+    const IR::Value shuffled{GetThroughCast(inst.Arg(0).Resolve(), IR::Opcode::BitCastF32U32)};
+    const IR::Value source{GetThroughCast(inst.Arg(1).Resolve(), IR::Opcode::BitCastF32U32)};
+    if (shuffled.IsImmediate()) {
+        return;
+    }
+    const u32 swizzle_value{swizzle.U32()};
+    const bool is_dx_swizzle{swizzle_value == 0x99};
+    const bool is_dy_swizzle{swizzle_value == 0xA5};
+    if (!is_dx_swizzle && !is_dy_swizzle) {
+        return;
+    }
+    IR::Inst* const shuffle{shuffled.InstRecursive()};
+    if (shuffle->GetOpcode() != IR::Opcode::ShuffleButterfly) {
+        return;
+    }
+    // The shuffle has to read the same value that is passed as second operand
+    const IR::Value shuffle_source{
+        GetThroughCast(shuffle->Arg(0).Resolve(), IR::Opcode::BitCastU32F32)};
+    if (source != shuffle_source) {
+        return;
+    }
+    const IR::Value index{shuffle->Arg(1)};
+    const IR::Value clamp{shuffle->Arg(2)};
+    const IR::Value segmentation_mask{shuffle->Arg(3)};
+    const bool all_immediates{index.IsImmediate() && clamp.IsImmediate() &&
+                              segmentation_mask.IsImmediate()};
+    if (!all_immediates) {
+        return;
+    }
+    if (clamp.U32() != 3 || segmentation_mask.U32() != 28) {
+        return;
+    }
+    if (is_dx_swizzle) {
+        // DPdxFine
+        if (index.U32() != 1) {
+            return;
+        }
+        IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+        inst.ReplaceUsesWith(ir.DPdxFine(IR::F32{inst.Arg(1)}));
+        return;
+    }
+    // DPdyFine
+    if (index.U32() != 2) {
+        return;
+    }
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    inst.ReplaceUsesWith(ir.DPdyFine(IR::F32{inst.Arg(1)}));
+}
+
+/// Dispatches an instruction to the folding routine matching its opcode.
+/// Opcodes without a folding routine are left untouched.
+/// @param block Block owning inst, needed by folds that emit new instructions
+/// @param inst  Instruction to fold
+/// Throws LogicError when a bit field instruction with immediate operands has an undefined
+/// result.
+void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::GetRegister:
+        return FoldGetRegister(inst);
+    case IR::Opcode::GetPred:
+        return FoldGetPred(inst);
+    case IR::Opcode::IAdd32:
+        return FoldAdd<u32>(block, inst);
+    case IR::Opcode::ISub32:
+        return FoldISub32(inst);
+    case IR::Opcode::IMul32:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; });
+        return;
+    case IR::Opcode::ShiftRightArithmetic32:
+        FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return static_cast<u32>(a >> b); });
+        return;
+    case IR::Opcode::BitCastF32U32:
+        return FoldBitCast<IR::Opcode::BitCastF32U32, f32, u32>(inst, IR::Opcode::BitCastU32F32);
+    case IR::Opcode::BitCastU32F32:
+        return FoldBitCast<IR::Opcode::BitCastU32F32, u32, f32>(inst, IR::Opcode::BitCastF32U32);
+    case IR::Opcode::IAdd64:
+        return FoldAdd<u64>(block, inst);
+    case IR::Opcode::PackHalf2x16:
+        return FoldInverseFunc(inst, IR::Opcode::UnpackHalf2x16);
+    case IR::Opcode::UnpackHalf2x16:
+        return FoldInverseFunc(inst, IR::Opcode::PackHalf2x16);
+    case IR::Opcode::SelectU1:
+    case IR::Opcode::SelectU8:
+    case IR::Opcode::SelectU16:
+    case IR::Opcode::SelectU32:
+    case IR::Opcode::SelectU64:
+    case IR::Opcode::SelectF16:
+    case IR::Opcode::SelectF32:
+    case IR::Opcode::SelectF64:
+        return FoldSelect(inst);
+    case IR::Opcode::FPMul32:
+        return FoldFPMul32(inst);
+    case IR::Opcode::LogicalAnd:
+        return FoldLogicalAnd(inst);
+    case IR::Opcode::LogicalOr:
+        return FoldLogicalOr(inst);
+    case IR::Opcode::LogicalNot:
+        return FoldLogicalNot(inst);
+    case IR::Opcode::SLessThan:
+        FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a < b; });
+        return;
+    case IR::Opcode::ULessThan:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a < b; });
+        return;
+    case IR::Opcode::SLessThanEqual:
+        FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a <= b; });
+        return;
+    case IR::Opcode::ULessThanEqual:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a <= b; });
+        return;
+    case IR::Opcode::SGreaterThan:
+        FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a > b; });
+        return;
+    case IR::Opcode::UGreaterThan:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a > b; });
+        return;
+    case IR::Opcode::SGreaterThanEqual:
+        FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a >= b; });
+        return;
+    case IR::Opcode::UGreaterThanEqual:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a >= b; });
+        return;
+    case IR::Opcode::IEqual:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a == b; });
+        return;
+    case IR::Opcode::INotEqual:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a != b; });
+        return;
+    case IR::Opcode::BitwiseAnd32:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a & b; });
+        return;
+    case IR::Opcode::BitwiseOr32:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a | b; });
+        return;
+    case IR::Opcode::BitwiseXor32:
+        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a ^ b; });
+        return;
+    case IR::Opcode::BitFieldUExtract:
+        FoldWhenAllImmediates(inst, [](u32 base, u32 shift, u32 count) {
+            if (static_cast<size_t>(shift) + static_cast<size_t>(count) > 32) {
+                throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract,
+                                 base, shift, count);
+            }
+            if (count == 0) {
+                // Empty field. This also covers shift == 32, where "base >> shift" would be a
+                // shift by the full width of the type
+                return u32{0};
+            }
+            // count == 32 implies shift == 0. "1U << 32" is undefined, build the mask explicitly
+            const u32 mask{count == 32 ? ~u32{0} : (u32{1} << count) - 1};
+            return (base >> shift) & mask;
+        });
+        return;
+    case IR::Opcode::BitFieldSExtract:
+        FoldWhenAllImmediates(inst, [](s32 base, u32 shift, u32 count) {
+            const size_t back_shift{static_cast<size_t>(shift) + static_cast<size_t>(count)};
+            const size_t left_shift{32 - back_shift};
+            const size_t right_shift{static_cast<size_t>(32 - count)};
+            if (back_shift > 32 || left_shift >= 32 || right_shift >= 32) {
+                throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldSExtract,
+                                 base, shift, count);
+            }
+            // Move the field to the top bits, then shift back to sign extend it
+            return static_cast<u32>((base << left_shift) >> right_shift);
+        });
+        return;
+    case IR::Opcode::BitFieldInsert:
+        FoldWhenAllImmediates(inst, [](u32 base, u32 insert, u32 offset, u32 bits) {
+            if (bits >= 32 || offset >= 32) {
+                throw LogicError("Undefined result in {}({}, {}, {}, {})",
+                                 IR::Opcode::BitFieldInsert, base, insert, offset, bits);
+            }
+            // Only the lowest "bits" bits of insert are written. Masking the shifted value keeps
+            // the higher bits of insert from leaking into the untouched part of base
+            const u32 field_mask{~(~0u << bits) << offset};
+            return (base & ~field_mask) | ((insert << offset) & field_mask);
+        });
+        return;
+    case IR::Opcode::CompositeExtractU32x2:
+        return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x2,
+                                    IR::Opcode::CompositeInsertU32x2);
+    case IR::Opcode::CompositeExtractU32x3:
+        return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x3,
+                                    IR::Opcode::CompositeInsertU32x3);
+    case IR::Opcode::CompositeExtractU32x4:
+        return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructU32x4,
+                                    IR::Opcode::CompositeInsertU32x4);
+    case IR::Opcode::CompositeExtractF32x2:
+        return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x2,
+                                    IR::Opcode::CompositeInsertF32x2);
+    case IR::Opcode::CompositeExtractF32x3:
+        return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x3,
+                                    IR::Opcode::CompositeInsertF32x3);
+    case IR::Opcode::CompositeExtractF32x4:
+        return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF32x4,
+                                    IR::Opcode::CompositeInsertF32x4);
+    case IR::Opcode::CompositeExtractF16x2:
+        return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x2,
+                                    IR::Opcode::CompositeInsertF16x2);
+    case IR::Opcode::CompositeExtractF16x3:
+        return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x3,
+                                    IR::Opcode::CompositeInsertF16x3);
+    case IR::Opcode::CompositeExtractF16x4:
+        return FoldCompositeExtract(inst, IR::Opcode::CompositeConstructF16x4,
+                                    IR::Opcode::CompositeInsertF16x4);
+    case IR::Opcode::FSwizzleAdd:
+        return FoldFSwizzleAdd(block, inst);
+    default:
+        break;
+    }
+}
+} // Anonymous namespace
+
+/// Runs constant propagation over every instruction of the program, visiting the blocks in
+/// reverse post-order
+void ConstantPropagationPass(IR::Program& program) {
+    auto& post_order{program.post_order_blocks};
+    std::for_each(post_order.rbegin(), post_order.rend(), [](IR::Block* const current_block) {
+        for (IR::Inst& instruction : current_block->Instructions()) {
+            ConstantPropagation(*current_block, instruction);
+        }
+    });
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp b/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp
new file mode 100644
index 000000000..400836301
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/dead_code_elimination_pass.cpp
@@ -0,0 +1,26 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+
+/// Removes every instruction that has no uses and may not have side effects
+void DeadCodeEliminationPass(IR::Program& program) {
+    // Instructions are walked backwards inside each block: removing an instruction reduces the
+    // number of uses of the instructions that come before it.
+    for (IR::Block* const current_block : program.post_order_blocks) {
+        for (auto cursor{current_block->end()}; cursor != current_block->begin();) {
+            --cursor;
+            if (cursor->HasUses() || cursor->MayHaveSideEffects()) {
+                continue;
+            }
+            cursor->Invalidate();
+            // erase() hands back the element after the removed one; the next iteration steps
+            // back over it again
+            cursor = current_block->Instructions().erase(cursor);
+        }
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp b/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp
new file mode 100644
index 000000000..055ba9c54
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/dual_vertex_pass.cpp
@@ -0,0 +1,30 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+
+/// Invalidates the first Epilogue instruction found in the program, if there is one
+void VertexATransformPass(IR::Program& program) {
+    for (IR::Block* const current_block : program.blocks) {
+        for (IR::Inst& instruction : current_block->Instructions()) {
+            if (instruction.GetOpcode() != IR::Opcode::Epilogue) {
+                continue;
+            }
+            instruction.Invalidate();
+            return;
+        }
+    }
+}
+
+/// Invalidates the first Prologue instruction found in the program, if there is one
+void VertexBTransformPass(IR::Program& program) {
+    for (IR::Block* const current_block : program.blocks) {
+        for (IR::Inst& instruction : current_block->Instructions()) {
+            if (instruction.GetOpcode() != IR::Opcode::Prologue) {
+                continue;
+            }
+            instruction.Invalidate();
+            return;
+        }
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp
new file mode 100644
index 000000000..4197b0095
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp
@@ -0,0 +1,526 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <compare>
+#include <optional>
+#include <queue>
+
+#include <boost/container/flat_set.hpp>
+#include <boost/container/small_vector.hpp>
+
+#include "common/alignment.h"
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/breadth_first_search.h"
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+namespace {
+/// Address in constant buffers to the storage buffer descriptor
+struct StorageBufferAddr {
+ // Defaulted three-way comparison so addresses can be ordered and compared for equality
+ auto operator<=>(const StorageBufferAddr&) const noexcept = default;
+
+ // Constant buffer index
+ u32 index;
+ // Offset inside the constant buffer
+ u32 offset;
+};
+
+/// Block iterator to a global memory instruction and the storage buffer it uses
+struct StorageInst {
+ // Storage buffer descriptor address used by the instruction
+ StorageBufferAddr storage_buffer;
+ // The global memory instruction
+ IR::Inst* inst;
+ // NOTE(review): presumably the block that contains inst - confirm against the code filling it
+ IR::Block* block;
+};
+
+/// Bias towards a certain range of constant buffers when looking for storage buffers
+struct Bias {
+ // Constant buffer index an address has to match
+ u32 index;
+ // First accepted offset (inclusive, see MeetsBias)
+ u32 offset_begin;
+ // End of the accepted offsets (exclusive, see MeetsBias)
+ u32 offset_end;
+};
+
+using boost::container::flat_set;
+using boost::container::small_vector;
+using StorageBufferSet =
+ flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>;
+using StorageInstVector = small_vector<StorageInst, 24>;
+using StorageWritesSet =
+ flat_set<StorageBufferAddr, std::less<StorageBufferAddr>, small_vector<StorageBufferAddr, 16>>;
+
+/// Storage buffer information accumulated by this pass
+// NOTE(review): member descriptions below are inferred from the type names - confirm against the
+// code that fills this structure
+struct StorageInfo {
+ // Sorted set of storage buffer descriptor addresses
+ StorageBufferSet set;
+ // Global memory instructions paired with the storage buffer they use
+ StorageInstVector to_replace;
+ // Sorted set of storage buffer descriptor addresses, presumably the ones that are written
+ StorageWritesSet writes;
+};
+
+/// Returns true when the instruction is a global memory instruction
+/// This covers the LoadGlobal*, WriteGlobal* and GlobalAtomic* opcodes listed below
+bool IsGlobalMemory(const IR::Inst& inst) {
+ switch (inst.GetOpcode()) {
+ case IR::Opcode::LoadGlobalS8:
+ case IR::Opcode::LoadGlobalU8:
+ case IR::Opcode::LoadGlobalS16:
+ case IR::Opcode::LoadGlobalU16:
+ case IR::Opcode::LoadGlobal32:
+ case IR::Opcode::LoadGlobal64:
+ case IR::Opcode::LoadGlobal128:
+ case IR::Opcode::WriteGlobalS8:
+ case IR::Opcode::WriteGlobalU8:
+ case IR::Opcode::WriteGlobalS16:
+ case IR::Opcode::WriteGlobalU16:
+ case IR::Opcode::WriteGlobal32:
+ case IR::Opcode::WriteGlobal64:
+ case IR::Opcode::WriteGlobal128:
+ case IR::Opcode::GlobalAtomicIAdd32:
+ case IR::Opcode::GlobalAtomicSMin32:
+ case IR::Opcode::GlobalAtomicUMin32:
+ case IR::Opcode::GlobalAtomicSMax32:
+ case IR::Opcode::GlobalAtomicUMax32:
+ case IR::Opcode::GlobalAtomicInc32:
+ case IR::Opcode::GlobalAtomicDec32:
+ case IR::Opcode::GlobalAtomicAnd32:
+ case IR::Opcode::GlobalAtomicOr32:
+ case IR::Opcode::GlobalAtomicXor32:
+ case IR::Opcode::GlobalAtomicExchange32:
+ case IR::Opcode::GlobalAtomicIAdd64:
+ case IR::Opcode::GlobalAtomicSMin64:
+ case IR::Opcode::GlobalAtomicUMin64:
+ case IR::Opcode::GlobalAtomicSMax64:
+ case IR::Opcode::GlobalAtomicUMax64:
+ case IR::Opcode::GlobalAtomicAnd64:
+ case IR::Opcode::GlobalAtomicOr64:
+ case IR::Opcode::GlobalAtomicXor64:
+ case IR::Opcode::GlobalAtomicExchange64:
+ case IR::Opcode::GlobalAtomicAddF32:
+ case IR::Opcode::GlobalAtomicAddF16x2:
+ case IR::Opcode::GlobalAtomicAddF32x2:
+ case IR::Opcode::GlobalAtomicMinF16x2:
+ case IR::Opcode::GlobalAtomicMinF32x2:
+ case IR::Opcode::GlobalAtomicMaxF16x2:
+ case IR::Opcode::GlobalAtomicMaxF32x2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Returns true when the instruction is a global memory instruction that writes memory
+/// This covers the WriteGlobal* and GlobalAtomic* opcodes listed below; LoadGlobal* opcodes
+/// return false
+bool IsGlobalMemoryWrite(const IR::Inst& inst) {
+ switch (inst.GetOpcode()) {
+ case IR::Opcode::WriteGlobalS8:
+ case IR::Opcode::WriteGlobalU8:
+ case IR::Opcode::WriteGlobalS16:
+ case IR::Opcode::WriteGlobalU16:
+ case IR::Opcode::WriteGlobal32:
+ case IR::Opcode::WriteGlobal64:
+ case IR::Opcode::WriteGlobal128:
+ case IR::Opcode::GlobalAtomicIAdd32:
+ case IR::Opcode::GlobalAtomicSMin32:
+ case IR::Opcode::GlobalAtomicUMin32:
+ case IR::Opcode::GlobalAtomicSMax32:
+ case IR::Opcode::GlobalAtomicUMax32:
+ case IR::Opcode::GlobalAtomicInc32:
+ case IR::Opcode::GlobalAtomicDec32:
+ case IR::Opcode::GlobalAtomicAnd32:
+ case IR::Opcode::GlobalAtomicOr32:
+ case IR::Opcode::GlobalAtomicXor32:
+ case IR::Opcode::GlobalAtomicExchange32:
+ case IR::Opcode::GlobalAtomicIAdd64:
+ case IR::Opcode::GlobalAtomicSMin64:
+ case IR::Opcode::GlobalAtomicUMin64:
+ case IR::Opcode::GlobalAtomicSMax64:
+ case IR::Opcode::GlobalAtomicUMax64:
+ case IR::Opcode::GlobalAtomicAnd64:
+ case IR::Opcode::GlobalAtomicOr64:
+ case IR::Opcode::GlobalAtomicXor64:
+ case IR::Opcode::GlobalAtomicExchange64:
+ case IR::Opcode::GlobalAtomicAddF32:
+ case IR::Opcode::GlobalAtomicAddF16x2:
+ case IR::Opcode::GlobalAtomicAddF32x2:
+ case IR::Opcode::GlobalAtomicMinF16x2:
+ case IR::Opcode::GlobalAtomicMinF32x2:
+ case IR::Opcode::GlobalAtomicMaxF16x2:
+ case IR::Opcode::GlobalAtomicMaxF32x2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Converts a global memory opcode to its storage buffer equivalent
+/// LoadGlobal* maps to LoadStorage*, WriteGlobal* to WriteStorage* and GlobalAtomic* to
+/// StorageAtomic*. Throws InvalidArgument for any other opcode.
+IR::Opcode GlobalToStorage(IR::Opcode opcode) {
+ switch (opcode) {
+ case IR::Opcode::LoadGlobalS8:
+ return IR::Opcode::LoadStorageS8;
+ case IR::Opcode::LoadGlobalU8:
+ return IR::Opcode::LoadStorageU8;
+ case IR::Opcode::LoadGlobalS16:
+ return IR::Opcode::LoadStorageS16;
+ case IR::Opcode::LoadGlobalU16:
+ return IR::Opcode::LoadStorageU16;
+ case IR::Opcode::LoadGlobal32:
+ return IR::Opcode::LoadStorage32;
+ case IR::Opcode::LoadGlobal64:
+ return IR::Opcode::LoadStorage64;
+ case IR::Opcode::LoadGlobal128:
+ return IR::Opcode::LoadStorage128;
+ case IR::Opcode::WriteGlobalS8:
+ return IR::Opcode::WriteStorageS8;
+ case IR::Opcode::WriteGlobalU8:
+ return IR::Opcode::WriteStorageU8;
+ case IR::Opcode::WriteGlobalS16:
+ return IR::Opcode::WriteStorageS16;
+ case IR::Opcode::WriteGlobalU16:
+ return IR::Opcode::WriteStorageU16;
+ case IR::Opcode::WriteGlobal32:
+ return IR::Opcode::WriteStorage32;
+ case IR::Opcode::WriteGlobal64:
+ return IR::Opcode::WriteStorage64;
+ case IR::Opcode::WriteGlobal128:
+ return IR::Opcode::WriteStorage128;
+ case IR::Opcode::GlobalAtomicIAdd32:
+ return IR::Opcode::StorageAtomicIAdd32;
+ case IR::Opcode::GlobalAtomicSMin32:
+ return IR::Opcode::StorageAtomicSMin32;
+ case IR::Opcode::GlobalAtomicUMin32:
+ return IR::Opcode::StorageAtomicUMin32;
+ case IR::Opcode::GlobalAtomicSMax32:
+ return IR::Opcode::StorageAtomicSMax32;
+ case IR::Opcode::GlobalAtomicUMax32:
+ return IR::Opcode::StorageAtomicUMax32;
+ case IR::Opcode::GlobalAtomicInc32:
+ return IR::Opcode::StorageAtomicInc32;
+ case IR::Opcode::GlobalAtomicDec32:
+ return IR::Opcode::StorageAtomicDec32;
+ case IR::Opcode::GlobalAtomicAnd32:
+ return IR::Opcode::StorageAtomicAnd32;
+ case IR::Opcode::GlobalAtomicOr32:
+ return IR::Opcode::StorageAtomicOr32;
+ case IR::Opcode::GlobalAtomicXor32:
+ return IR::Opcode::StorageAtomicXor32;
+ case IR::Opcode::GlobalAtomicIAdd64:
+ return IR::Opcode::StorageAtomicIAdd64;
+ case IR::Opcode::GlobalAtomicSMin64:
+ return IR::Opcode::StorageAtomicSMin64;
+ case IR::Opcode::GlobalAtomicUMin64:
+ return IR::Opcode::StorageAtomicUMin64;
+ case IR::Opcode::GlobalAtomicSMax64:
+ return IR::Opcode::StorageAtomicSMax64;
+ case IR::Opcode::GlobalAtomicUMax64:
+ return IR::Opcode::StorageAtomicUMax64;
+ case IR::Opcode::GlobalAtomicAnd64:
+ return IR::Opcode::StorageAtomicAnd64;
+ case IR::Opcode::GlobalAtomicOr64:
+ return IR::Opcode::StorageAtomicOr64;
+ case IR::Opcode::GlobalAtomicXor64:
+ return IR::Opcode::StorageAtomicXor64;
+ case IR::Opcode::GlobalAtomicExchange32:
+ return IR::Opcode::StorageAtomicExchange32;
+ case IR::Opcode::GlobalAtomicExchange64:
+ return IR::Opcode::StorageAtomicExchange64;
+ case IR::Opcode::GlobalAtomicAddF32:
+ return IR::Opcode::StorageAtomicAddF32;
+ case IR::Opcode::GlobalAtomicAddF16x2:
+ return IR::Opcode::StorageAtomicAddF16x2;
+ case IR::Opcode::GlobalAtomicMinF16x2:
+ return IR::Opcode::StorageAtomicMinF16x2;
+ case IR::Opcode::GlobalAtomicMaxF16x2:
+ return IR::Opcode::StorageAtomicMaxF16x2;
+ case IR::Opcode::GlobalAtomicAddF32x2:
+ return IR::Opcode::StorageAtomicAddF32x2;
+ case IR::Opcode::GlobalAtomicMinF32x2:
+ return IR::Opcode::StorageAtomicMinF32x2;
+ case IR::Opcode::GlobalAtomicMaxF32x2:
+ return IR::Opcode::StorageAtomicMaxF32x2;
+ default:
+ throw InvalidArgument("Invalid global memory opcode {}", opcode);
+ }
+}
+
+/// Returns true when a storage buffer address satisfies a bias: the constant buffer index has
+/// to match and the offset has to be inside [offset_begin, offset_end)
+bool MeetsBias(const StorageBufferAddr& storage_buffer, const Bias& bias) noexcept {
+    if (storage_buffer.index != bias.index) {
+        return false;
+    }
+    const u32 offset{storage_buffer.offset};
+    return offset >= bias.offset_begin && offset < bias.offset_end;
+}
+
+/// Result of TrackLowAddress
+struct LowAddrInfo {
+ // Low 32 bits of the address: first argument of the CompositeConstructU32x2 that builds it
+ IR::U32 value;
+ // Immediate offset added through IAdd64, zero when there is no such addition
+ s32 imm_offset;
+};
+
+/// Tries to track the low 32 bits of the address used by a global memory instruction
+/// @param inst Global memory instruction, its first argument is the address
+/// @return The low address value and the immediate offset applied to it, std::nullopt when the
+///         address is not built with the expected instructions
+std::optional<LowAddrInfo> TrackLowAddress(IR::Inst* inst) {
+    // The first argument is the low level GPU pointer to the global memory instruction
+    const IR::Value pointer{inst->Arg(0)};
+    if (pointer.IsImmediate()) {
+        // Not much we can do if it's an immediate
+        return std::nullopt;
+    }
+    // This address is expected to either be a PackUint2x32, a IAdd64, or a CompositeConstructU32x2
+    IR::Inst* producer{pointer.InstRecursive()};
+    s32 displacement{0};
+    if (producer->GetOpcode() == IR::Opcode::IAdd64) {
+        // The addition is expected to be canonicalized: address on the first argument and the
+        // immediate offset on the second one
+        const IR::U64 addend{producer->Arg(1)};
+        if (!addend.IsImmediate()) {
+            return std::nullopt;
+        }
+        displacement = static_cast<s32>(static_cast<s64>(addend.U64()));
+        const IR::U64 base_address{producer->Arg(0)};
+        if (base_address.IsImmediate()) {
+            return std::nullopt;
+        }
+        producer = base_address.InstRecursive();
+    }
+    // With IAdd64 handled, now PackUint2x32 is expected
+    if (producer->GetOpcode() == IR::Opcode::PackUint2x32) {
+        // PackUint2x32 is expected to be generated from a vector
+        const IR::Value packed_vector{producer->Arg(0)};
+        if (packed_vector.IsImmediate()) {
+            return std::nullopt;
+        }
+        producer = packed_vector.InstRecursive();
+    }
+    // The vector is expected to be a CompositeConstructU32x2
+    if (producer->GetOpcode() != IR::Opcode::CompositeConstructU32x2) {
+        return std::nullopt;
+    }
+    // The first argument of the CompositeConstructU32x2 is the low address
+    return LowAddrInfo{
+        .value{IR::U32{producer->Arg(0)}},
+        .imm_offset = displacement,
+    };
+}
+
+/// Runs BreadthFirstSearch from a value looking for the GetCbufU32 that reads a storage buffer
+/// address. When a bias is given, only addresses meeting that bias are accepted.
+std::optional<StorageBufferAddr> Track(const IR::Value& value, const Bias* bias) {
+    const auto match_storage_pointer{
+        [bias](const IR::Inst* inst) -> std::optional<StorageBufferAddr> {
+            if (inst->GetOpcode() != IR::Opcode::GetCbufU32) {
+                return std::nullopt;
+            }
+            const IR::Value cbuf_index{inst->Arg(0)};
+            const IR::Value cbuf_offset{inst->Arg(1)};
+            // A non-immediate index is definitely not a storage buffer, and a non-immediate
+            // offset would be an SSBO array, which is not supported yet (TODO)
+            if (!cbuf_index.IsImmediate() || !cbuf_offset.IsImmediate()) {
+                return std::nullopt;
+            }
+            const StorageBufferAddr candidate{
+                .index = cbuf_index.U32(),
+                .offset = cbuf_offset.U32(),
+            };
+            // The SSBO pointer has to be aligned
+            const bool is_aligned{Common::IsAligned(candidate.offset, 16)};
+            if (!is_aligned) {
+                return std::nullopt;
+            }
+            // Addresses outside of the bias are rejected in case we wrongly point to them
+            if (bias != nullptr && !MeetsBias(candidate, *bias)) {
+                return std::nullopt;
+            }
+            return candidate;
+        }};
+    return BreadthFirstSearch(value, match_storage_pointer);
+}
+
+/// Records the storage buffer addressed by a global memory instruction and queues the
+/// instruction for replacement. Nothing is recorded when the address cannot be tracked.
+void CollectStorageBuffers(IR::Block& block, IR::Inst& inst, StorageInfo& info) {
+    // NVN puts storage buffers in a specific range of constant buffer 0; candidates inside it
+    // are preferred to avoid getting false positives
+    static constexpr Bias nvn_bias{
+        .index = 0,
+        .offset_begin = 0x110,
+        .offset_end = 0x610,
+    };
+    const std::optional<LowAddrInfo> tracked_addr{TrackLowAddress(&inst)};
+    if (!tracked_addr) {
+        // The low address could not be tracked, leave the instruction as it is
+        return;
+    }
+    const IR::U32 low_addr{tracked_addr->value};
+    // Biased search first, unbiased search as a second attempt
+    std::optional<StorageBufferAddr> found{Track(low_addr, &nvn_bias)};
+    if (!found) {
+        found = Track(low_addr, nullptr);
+    }
+    if (!found) {
+        // Neither search found a storage buffer, leave the instruction as it is
+        return;
+    }
+    const StorageBufferAddr& storage_buffer{*found};
+    if (IsGlobalMemoryWrite(inst)) {
+        info.writes.insert(storage_buffer);
+    }
+    info.set.insert(storage_buffer);
+    info.to_replace.push_back(StorageInst{
+        .storage_buffer{storage_buffer},
+        .inst = &inst,
+        .block = &block,
+    });
+}
+
+/// Emits, right before `inst`, the IR that computes the offset of a global memory access
+/// relative to the start of its storage buffer and returns that offset.
+///
+/// The offset is the low 32 bits of the accessed address minus the 32-bit value read from the
+/// constant buffer location described by `buffer` (see the comment above the subtraction: the
+/// result is expressed in bytes).
+IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer) {
+ IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+ IR::U32 offset;
+ if (const std::optional<LowAddrInfo> low_addr{TrackLowAddress(&inst)}) {
+ // Reuse the tracked low address and re-apply the immediate offset, if any
+ offset = low_addr->value;
+ if (low_addr->imm_offset != 0) {
+ offset = ir.IAdd(offset, ir.Imm32(low_addr->imm_offset));
+ }
+ } else {
+ // Tracking failed: convert the 64-bit address operand to 32 bits instead
+ offset = ir.UConvert(32, IR::U64{inst.Arg(0)});
+ }
+ // Subtract the least significant 32 bits from the guest offset. The result is the storage
+ // buffer offset in bytes.
+ const IR::U32 low_cbuf{ir.GetCbuf(ir.Imm32(buffer.index), ir.Imm32(buffer.offset))};
+ return ir.ISub(offset, low_cbuf);
+}
+
+/// Inserts the storage buffer equivalent of a global memory load right before it and redirects
+/// every use of the old load to the new instruction
+void ReplaceLoad(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
+                 const IR::U32& offset) {
+    const IR::Opcode storage_opcode{GlobalToStorage(inst.GetOpcode())};
+    const auto insertion_point{IR::Block::InstructionList::s_iterator_to(inst)};
+    IR::Inst& storage_load{
+        *block.PrependNewInst(insertion_point, storage_opcode, {storage_index, offset})};
+    inst.ReplaceUsesWith(IR::Value{&storage_load});
+}
+
+/// Inserts the storage buffer equivalent of a global memory write right before it, forwarding
+/// the written value (argument 1), and invalidates the old write
+void ReplaceWrite(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
+                  const IR::U32& offset) {
+    const IR::Opcode storage_opcode{GlobalToStorage(inst.GetOpcode())};
+    const auto insertion_point{IR::Block::InstructionList::s_iterator_to(inst)};
+    const IR::Value written_value{inst.Arg(1)};
+    block.PrependNewInst(insertion_point, storage_opcode, {storage_index, offset, written_value});
+    inst.Invalidate();
+}
+
+/// Inserts the storage buffer equivalent of a global memory atomic right before it, forwarding
+/// the operand (argument 1), and redirects every use of the old atomic to the new instruction
+void ReplaceAtomic(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
+                   const IR::U32& offset) {
+    const IR::Opcode storage_opcode{GlobalToStorage(inst.GetOpcode())};
+    const auto insertion_point{IR::Block::InstructionList::s_iterator_to(inst)};
+    const IR::Value operand{inst.Arg(1)};
+    IR::Inst& storage_atomic{
+        *block.PrependNewInst(insertion_point, storage_opcode, {storage_index, offset, operand})};
+    inst.ReplaceUsesWith(IR::Value{&storage_atomic});
+}
+
+/// Replace a global memory instruction with its storage buffer equivalent
+///
+/// Dispatches on the opcode: loads go to ReplaceLoad, writes to ReplaceWrite and atomics to
+/// ReplaceAtomic. Any other opcode throws InvalidArgument.
+void Replace(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
+ const IR::U32& offset) {
+ switch (inst.GetOpcode()) {
+ // Loads
+ case IR::Opcode::LoadGlobalS8:
+ case IR::Opcode::LoadGlobalU8:
+ case IR::Opcode::LoadGlobalS16:
+ case IR::Opcode::LoadGlobalU16:
+ case IR::Opcode::LoadGlobal32:
+ case IR::Opcode::LoadGlobal64:
+ case IR::Opcode::LoadGlobal128:
+ return ReplaceLoad(block, inst, storage_index, offset);
+ // Writes
+ case IR::Opcode::WriteGlobalS8:
+ case IR::Opcode::WriteGlobalU8:
+ case IR::Opcode::WriteGlobalS16:
+ case IR::Opcode::WriteGlobalU16:
+ case IR::Opcode::WriteGlobal32:
+ case IR::Opcode::WriteGlobal64:
+ case IR::Opcode::WriteGlobal128:
+ return ReplaceWrite(block, inst, storage_index, offset);
+ // Atomics
+ case IR::Opcode::GlobalAtomicIAdd32:
+ case IR::Opcode::GlobalAtomicSMin32:
+ case IR::Opcode::GlobalAtomicUMin32:
+ case IR::Opcode::GlobalAtomicSMax32:
+ case IR::Opcode::GlobalAtomicUMax32:
+ case IR::Opcode::GlobalAtomicInc32:
+ case IR::Opcode::GlobalAtomicDec32:
+ case IR::Opcode::GlobalAtomicAnd32:
+ case IR::Opcode::GlobalAtomicOr32:
+ case IR::Opcode::GlobalAtomicXor32:
+ case IR::Opcode::GlobalAtomicExchange32:
+ case IR::Opcode::GlobalAtomicIAdd64:
+ case IR::Opcode::GlobalAtomicSMin64:
+ case IR::Opcode::GlobalAtomicUMin64:
+ case IR::Opcode::GlobalAtomicSMax64:
+ case IR::Opcode::GlobalAtomicUMax64:
+ case IR::Opcode::GlobalAtomicAnd64:
+ case IR::Opcode::GlobalAtomicOr64:
+ case IR::Opcode::GlobalAtomicXor64:
+ case IR::Opcode::GlobalAtomicExchange64:
+ case IR::Opcode::GlobalAtomicAddF32:
+ case IR::Opcode::GlobalAtomicAddF16x2:
+ case IR::Opcode::GlobalAtomicAddF32x2:
+ case IR::Opcode::GlobalAtomicMinF16x2:
+ case IR::Opcode::GlobalAtomicMinF32x2:
+ case IR::Opcode::GlobalAtomicMaxF16x2:
+ case IR::Opcode::GlobalAtomicMaxF32x2:
+ return ReplaceAtomic(block, inst, storage_index, offset);
+ default:
+ throw InvalidArgument("Invalid global memory opcode {}", inst.GetOpcode());
+ }
+}
+} // Anonymous namespace
+
+/// Rewrites global memory instructions into storage buffer instructions.
+///
+/// Works in three steps: collect the storage buffer used by every global memory instruction,
+/// publish one descriptor per distinct storage buffer in program.info, then replace each
+/// collected instruction using the position of its storage buffer in the collected set as the
+/// storage buffer index.
+void GlobalMemoryToStorageBufferPass(IR::Program& program) {
+ StorageInfo info;
+ // Step 1: find the storage buffers and the instructions that access them
+ for (IR::Block* const block : program.post_order_blocks) {
+ for (IR::Inst& inst : block->Instructions()) {
+ if (!IsGlobalMemory(inst)) {
+ continue;
+ }
+ CollectStorageBuffers(*block, inst, info);
+ }
+ }
+ // Step 2: emit descriptors in the iteration order of the set, so that descriptor positions
+ // match the indices computed with index_of below
+ for (const StorageBufferAddr& storage_buffer : info.set) {
+ program.info.storage_buffers_descriptors.push_back({
+ .cbuf_index = storage_buffer.index,
+ .cbuf_offset = storage_buffer.offset,
+ .count = 1,
+ .is_written = info.writes.contains(storage_buffer),
+ });
+ }
+ // Step 3: replace every collected instruction with its storage buffer equivalent
+ for (const StorageInst& storage_inst : info.to_replace) {
+ const StorageBufferAddr storage_buffer{storage_inst.storage_buffer};
+ const auto it{info.set.find(storage_inst.storage_buffer)};
+ const IR::U32 index{IR::Value{static_cast<u32>(info.set.index_of(it))}};
+ IR::Block* const block{storage_inst.block};
+ IR::Inst* const inst{storage_inst.inst};
+ const IR::U32 offset{StorageOffset(*block, *inst, storage_buffer)};
+ Replace(*block, *inst, index, offset);
+ }
+}
+
+/// Returns the index of the first descriptor satisfying `pred`; when there is none, appends
+/// `desc` and returns the index of the appended element
+template <typename Descriptors, typename Descriptor, typename Func>
+static u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) {
+    // TODO: Handle arrays
+    const auto match{std::ranges::find_if(descriptors, pred)};
+    if (match == descriptors.end()) {
+        descriptors.push_back(desc);
+        return static_cast<u32>(descriptors.size()) - 1;
+    }
+    return static_cast<u32>(std::distance(descriptors.begin(), match));
+}
+
+/// Merges the storage buffer descriptors of `source` into `base`.
+///
+/// A descriptor already present in `base` (same cbuf_index, cbuf_offset and count) only has its
+/// is_written flag OR-ed with the incoming one; any other descriptor is appended.
+void JoinStorageInfo(Info& base, Info& source) {
+    auto& merged{base.storage_buffers_descriptors};
+    for (auto& incoming : source.storage_buffers_descriptors) {
+        const auto is_same_buffer{[&incoming](const auto& existing) {
+            return incoming.cbuf_index == existing.cbuf_index &&
+                   incoming.cbuf_offset == existing.cbuf_offset &&
+                   incoming.count == existing.count;
+        }};
+        auto found{std::ranges::find_if(merged, is_same_buffer)};
+        if (found == merged.end()) {
+            merged.push_back(incoming);
+        } else {
+            found->is_written |= incoming.is_written;
+        }
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/identity_removal_pass.cpp b/src/shader_recompiler/ir_opt/identity_removal_pass.cpp
new file mode 100644
index 000000000..e9b55f835
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/identity_removal_pass.cpp
@@ -0,0 +1,38 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <vector>
+
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+
+/// Forwards every argument that refers to an identity instruction to the value behind it, then
+/// unlinks Identity and Void instructions from their blocks and invalidates them at the end
+void IdentityRemovalPass(IR::Program& program) {
+    std::vector<IR::Inst*> removed_insts;
+    for (IR::Block* const block : program.blocks) {
+        auto it{block->begin()};
+        while (it != block->end()) {
+            const size_t arg_count{it->NumArgs()};
+            for (size_t index = 0; index < arg_count; ++index) {
+                // Follow chains of identities until a real value is reached
+                for (IR::Value arg{it->Arg(index)}; arg.IsIdentity(); arg = it->Arg(index)) {
+                    it->SetArg(index, arg.Inst()->Arg(0));
+                }
+            }
+            const IR::Opcode opcode{it->GetOpcode()};
+            if (opcode != IR::Opcode::Identity && opcode != IR::Opcode::Void) {
+                ++it;
+                continue;
+            }
+            // Invalidation is deferred until every block has been processed
+            removed_insts.push_back(&*it);
+            it = block->Instructions().erase(it);
+        }
+    }
+    for (IR::Inst* const removed : removed_insts) {
+        removed->Invalidate();
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp b/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp
new file mode 100644
index 000000000..773e1f961
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp
@@ -0,0 +1,143 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+namespace {
+/// Maps a 16-bit floating point opcode to the opcode used after lowering to 32-bit floats.
+///
+/// Most opcodes map to their 32-bit counterpart. Conversions between F16 and F32 become
+/// Identity, and PackFloat2x16/UnpackFloat2x16 become PackHalf2x16/UnpackHalf2x16. Opcodes not
+/// listed here are returned unchanged.
+IR::Opcode Replace(IR::Opcode op) {
+ switch (op) {
+ case IR::Opcode::FPAbs16:
+ return IR::Opcode::FPAbs32;
+ case IR::Opcode::FPAdd16:
+ return IR::Opcode::FPAdd32;
+ case IR::Opcode::FPCeil16:
+ return IR::Opcode::FPCeil32;
+ case IR::Opcode::FPFloor16:
+ return IR::Opcode::FPFloor32;
+ case IR::Opcode::FPFma16:
+ return IR::Opcode::FPFma32;
+ case IR::Opcode::FPMul16:
+ return IR::Opcode::FPMul32;
+ case IR::Opcode::FPNeg16:
+ return IR::Opcode::FPNeg32;
+ case IR::Opcode::FPRoundEven16:
+ return IR::Opcode::FPRoundEven32;
+ case IR::Opcode::FPSaturate16:
+ return IR::Opcode::FPSaturate32;
+ case IR::Opcode::FPClamp16:
+ return IR::Opcode::FPClamp32;
+ case IR::Opcode::FPTrunc16:
+ return IR::Opcode::FPTrunc32;
+ case IR::Opcode::CompositeConstructF16x2:
+ return IR::Opcode::CompositeConstructF32x2;
+ case IR::Opcode::CompositeConstructF16x3:
+ return IR::Opcode::CompositeConstructF32x3;
+ case IR::Opcode::CompositeConstructF16x4:
+ return IR::Opcode::CompositeConstructF32x4;
+ case IR::Opcode::CompositeExtractF16x2:
+ return IR::Opcode::CompositeExtractF32x2;
+ case IR::Opcode::CompositeExtractF16x3:
+ return IR::Opcode::CompositeExtractF32x3;
+ case IR::Opcode::CompositeExtractF16x4:
+ return IR::Opcode::CompositeExtractF32x4;
+ case IR::Opcode::CompositeInsertF16x2:
+ return IR::Opcode::CompositeInsertF32x2;
+ case IR::Opcode::CompositeInsertF16x3:
+ return IR::Opcode::CompositeInsertF32x3;
+ case IR::Opcode::CompositeInsertF16x4:
+ return IR::Opcode::CompositeInsertF32x4;
+ case IR::Opcode::FPOrdEqual16:
+ return IR::Opcode::FPOrdEqual32;
+ case IR::Opcode::FPUnordEqual16:
+ return IR::Opcode::FPUnordEqual32;
+ case IR::Opcode::FPOrdNotEqual16:
+ return IR::Opcode::FPOrdNotEqual32;
+ case IR::Opcode::FPUnordNotEqual16:
+ return IR::Opcode::FPUnordNotEqual32;
+ case IR::Opcode::FPOrdLessThan16:
+ return IR::Opcode::FPOrdLessThan32;
+ case IR::Opcode::FPUnordLessThan16:
+ return IR::Opcode::FPUnordLessThan32;
+ case IR::Opcode::FPOrdGreaterThan16:
+ return IR::Opcode::FPOrdGreaterThan32;
+ case IR::Opcode::FPUnordGreaterThan16:
+ return IR::Opcode::FPUnordGreaterThan32;
+ case IR::Opcode::FPOrdLessThanEqual16:
+ return IR::Opcode::FPOrdLessThanEqual32;
+ case IR::Opcode::FPUnordLessThanEqual16:
+ return IR::Opcode::FPUnordLessThanEqual32;
+ case IR::Opcode::FPOrdGreaterThanEqual16:
+ return IR::Opcode::FPOrdGreaterThanEqual32;
+ case IR::Opcode::FPUnordGreaterThanEqual16:
+ return IR::Opcode::FPUnordGreaterThanEqual32;
+ case IR::Opcode::FPIsNan16:
+ return IR::Opcode::FPIsNan32;
+ case IR::Opcode::ConvertS16F16:
+ return IR::Opcode::ConvertS16F32;
+ case IR::Opcode::ConvertS32F16:
+ return IR::Opcode::ConvertS32F32;
+ case IR::Opcode::ConvertS64F16:
+ return IR::Opcode::ConvertS64F32;
+ case IR::Opcode::ConvertU16F16:
+ return IR::Opcode::ConvertU16F32;
+ case IR::Opcode::ConvertU32F16:
+ return IR::Opcode::ConvertU32F32;
+ case IR::Opcode::ConvertU64F16:
+ return IR::Opcode::ConvertU64F32;
+ case IR::Opcode::PackFloat2x16:
+ return IR::Opcode::PackHalf2x16;
+ case IR::Opcode::UnpackFloat2x16:
+ return IR::Opcode::UnpackHalf2x16;
+ // Conversions between F16 and F32 are replaced with Identity
+ case IR::Opcode::ConvertF32F16:
+ return IR::Opcode::Identity;
+ case IR::Opcode::ConvertF16F32:
+ return IR::Opcode::Identity;
+ case IR::Opcode::ConvertF16S8:
+ return IR::Opcode::ConvertF32S8;
+ case IR::Opcode::ConvertF16S16:
+ return IR::Opcode::ConvertF32S16;
+ case IR::Opcode::ConvertF16S32:
+ return IR::Opcode::ConvertF32S32;
+ case IR::Opcode::ConvertF16S64:
+ return IR::Opcode::ConvertF32S64;
+ case IR::Opcode::ConvertF16U8:
+ return IR::Opcode::ConvertF32U8;
+ case IR::Opcode::ConvertF16U16:
+ return IR::Opcode::ConvertF32U16;
+ case IR::Opcode::ConvertF16U32:
+ return IR::Opcode::ConvertF32U32;
+ case IR::Opcode::ConvertF16U64:
+ return IR::Opcode::ConvertF32U64;
+ case IR::Opcode::GlobalAtomicAddF16x2:
+ return IR::Opcode::GlobalAtomicAddF32x2;
+ case IR::Opcode::StorageAtomicAddF16x2:
+ return IR::Opcode::StorageAtomicAddF32x2;
+ case IR::Opcode::GlobalAtomicMinF16x2:
+ return IR::Opcode::GlobalAtomicMinF32x2;
+ case IR::Opcode::StorageAtomicMinF16x2:
+ return IR::Opcode::StorageAtomicMinF32x2;
+ case IR::Opcode::GlobalAtomicMaxF16x2:
+ return IR::Opcode::GlobalAtomicMaxF32x2;
+ case IR::Opcode::StorageAtomicMaxF16x2:
+ return IR::Opcode::StorageAtomicMaxF32x2;
+ default:
+ // Not a 16-bit floating point opcode handled by this pass
+ return op;
+ }
+}
+} // Anonymous namespace
+
+/// Replaces the opcode of every instruction in the program with the result of Replace, which
+/// turns 16-bit floating point opcodes into 32-bit ones and leaves other opcodes unchanged
+void LowerFp16ToFp32(IR::Program& program) {
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            const IR::Opcode lowered_opcode{Replace(inst.GetOpcode())};
+            inst.ReplaceOpcode(lowered_opcode);
+        }
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp b/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp
new file mode 100644
index 000000000..e80d3d1d9
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/lower_int64_to_int32.cpp
@@ -0,0 +1,218 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <utility>
+
+#include "shader_recompiler/exception.h"
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/frontend/ir/program.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+namespace {
+/// Splits a packed 64-bit value into its (low, high) 32-bit halves.
+///
+/// Immediates are split into two 32-bit immediates; any other value is split by extracting
+/// elements 0 and 1 with CompositeExtract.
+std::pair<IR::U32, IR::U32> Unpack(IR::IREmitter& ir, const IR::Value& packed) {
+    if (!packed.IsImmediate()) {
+        const IR::U32 lo{ir.CompositeExtract(packed, 0u)};
+        const IR::U32 hi{ir.CompositeExtract(packed, 1u)};
+        return {lo, hi};
+    }
+    const u64 bits{packed.U64()};
+    const IR::U32 lo{ir.Imm32(static_cast<u32>(bits))};
+    const IR::U32 hi{ir.Imm32(static_cast<u32>(bits >> 32))};
+    return {lo, hi};
+}
+
+/// Lowers IAdd64 to 32-bit additions on the (low, high) halves of both operands.
+/// Throws NotImplementedException when the instruction has an associated pseudo-operation.
+void IAdd64To32(IR::Block& block, IR::Inst& inst) {
+ if (inst.HasAssociatedPseudoOperation()) {
+ throw NotImplementedException("IAdd64 emulation with pseudo instructions");
+ }
+ // New instructions are emitted at the position of the instruction being lowered
+ IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+ const auto [a_lo, a_hi]{Unpack(ir, inst.Arg(0))};
+ const auto [b_lo, b_hi]{Unpack(ir, inst.Arg(1))};
+
+ // Add the low halves and turn the carry of that addition into 0 or 1
+ const IR::U32 ret_lo{ir.IAdd(a_lo, b_lo)};
+ const IR::U32 carry{ir.Select(ir.GetCarryFromOp(ret_lo), ir.Imm32(1u), ir.Imm32(0u))};
+
+ // High half: a_hi + b_hi + carry
+ const IR::U32 ret_hi{ir.IAdd(ir.IAdd(a_hi, b_hi), carry)};
+ inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
+}
+
+/// Lowers ISub64 to 32-bit subtractions on the (low, high) halves of both operands.
+/// Throws NotImplementedException when the instruction has an associated pseudo-operation.
+void ISub64To32(IR::Block& block, IR::Inst& inst) {
+ if (inst.HasAssociatedPseudoOperation()) {
+ throw NotImplementedException("ISub64 emulation with pseudo instructions");
+ }
+ IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+ const auto [a_lo, a_hi]{Unpack(ir, inst.Arg(0))};
+ const auto [b_lo, b_hi]{Unpack(ir, inst.Arg(1))};
+
+ const IR::U32 ret_lo{ir.ISub(a_lo, b_lo)};
+ // A borrow happened when the low result is greater than the minuend
+ // NOTE(review): the trailing `false` presumably selects an unsigned comparison - confirm
+ const IR::U1 underflow{ir.IGreaterThan(ret_lo, a_lo, false)};
+ const IR::U32 underflow_bit{ir.Select(underflow, ir.Imm32(1u), ir.Imm32(0u))};
+
+ // High half: a_hi - b_hi - borrow
+ const IR::U32 ret_hi{ir.ISub(ir.ISub(a_hi, b_hi), underflow_bit)};
+ inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
+}
+
+/// Lowers INeg64 to 32-bit operations: both halves are bitwise inverted, 1 is added to the low
+/// half and the carry of that addition is added to the high half.
+/// Throws NotImplementedException when the instruction has an associated pseudo-operation.
+void INeg64To32(IR::Block& block, IR::Inst& inst) {
+ if (inst.HasAssociatedPseudoOperation()) {
+ throw NotImplementedException("INeg64 emulation with pseudo instructions");
+ }
+ IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+ auto [lo, hi]{Unpack(ir, inst.Arg(0))};
+ lo = ir.BitwiseNot(lo);
+ hi = ir.BitwiseNot(hi);
+
+ lo = ir.IAdd(lo, ir.Imm32(1));
+
+ // Propagate the carry of the low addition into the high half
+ const IR::U32 carry{ir.Select(ir.GetCarryFromOp(lo), ir.Imm32(1u), ir.Imm32(0u))};
+ hi = ir.IAdd(hi, carry);
+
+ inst.ReplaceUsesWith(ir.CompositeConstruct(lo, hi));
+}
+
+/// Lowers ShiftLeftLogical64 to 32-bit operations on the (low, high) halves.
+///
+/// Three candidate results are computed and the final one is picked with Select:
+/// - zero:  shift == 0, both halves are passed through untouched
+/// - long:  shift - 32 >= 0, the low half becomes 0 and the high half is lo << (shift - 32)
+/// - short: otherwise, the low half is lo << shift and the high half is (hi << shift) combined
+///          with the bits extracted from the top of lo
+/// Throws NotImplementedException when the instruction has an associated pseudo-operation.
+void ShiftLeftLogical64To32(IR::Block& block, IR::Inst& inst) {
+ if (inst.HasAssociatedPseudoOperation()) {
+ throw NotImplementedException("ShiftLeftLogical64 emulation with pseudo instructions");
+ }
+ IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+ const auto [lo, hi]{Unpack(ir, inst.Arg(0))};
+ const IR::U32 shift{inst.Arg(1)};
+
+ const IR::U32 shifted_lo{ir.ShiftLeftLogical(lo, shift)};
+ const IR::U32 shifted_hi{ir.ShiftLeftLogical(hi, shift)};
+
+ // NOTE(review): the trailing `true` presumably selects a signed comparison - confirm
+ const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))};
+ const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)};
+ const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))};
+
+ const IR::U32 long_ret_lo{ir.Imm32(0)};
+ const IR::U32 long_ret_hi{ir.ShiftLeftLogical(lo, inv_shift)};
+
+ // NOTE(review): BitFieldExtract arguments look like (base, offset, count, is_signed), i.e.
+ // `shift` bits of lo starting at bit 32 - shift - confirm against IREmitter
+ const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)};
+ const IR::U32 lo_extract{ir.BitFieldExtract(lo, shift_complement, shift, false)};
+ const IR::U32 short_ret_lo{shifted_lo};
+ const IR::U32 short_ret_hi{ir.BitwiseOr(shifted_hi, lo_extract)};
+
+ const IR::U32 zero_ret_lo{lo};
+ const IR::U32 zero_ret_hi{hi};
+
+ const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)};
+ const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)};
+
+ const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)};
+ const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)};
+ inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
+}
+
+/// Lowers ShiftRightLogical64 to 32-bit operations on the (low, high) halves.
+///
+/// Three candidate results are computed and the final one is picked with Select:
+/// - zero:  shift == 0, both halves are passed through untouched
+/// - long:  shift - 32 >= 0, the high half becomes 0 and the low half is hi >> (shift - 32)
+/// - short: otherwise, the high half is hi >> shift and the low half is lo >> shift with the
+///          bits extracted from the bottom of hi inserted at bit 32 - shift
+/// Throws NotImplementedException when the instruction has an associated pseudo-operation.
+void ShiftRightLogical64To32(IR::Block& block, IR::Inst& inst) {
+ if (inst.HasAssociatedPseudoOperation()) {
+ throw NotImplementedException("ShiftRightLogical64 emulation with pseudo instructions");
+ }
+ IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+ const auto [lo, hi]{Unpack(ir, inst.Arg(0))};
+ const IR::U32 shift{inst.Arg(1)};
+
+ const IR::U32 shifted_lo{ir.ShiftRightLogical(lo, shift)};
+ const IR::U32 shifted_hi{ir.ShiftRightLogical(hi, shift)};
+
+ // NOTE(review): the trailing `true` presumably selects a signed comparison - confirm
+ const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))};
+ const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)};
+ const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))};
+
+ const IR::U32 long_ret_hi{ir.Imm32(0)};
+ const IR::U32 long_ret_lo{ir.ShiftRightLogical(hi, inv_shift)};
+
+ // NOTE(review): BitFieldExtract/BitFieldInsert arguments look like (base[, insert], offset,
+ // count) - confirm against IREmitter
+ const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)};
+ const IR::U32 short_hi_extract{ir.BitFieldExtract(hi, ir.Imm32(0), shift)};
+ const IR::U32 short_ret_hi{shifted_hi};
+ const IR::U32 short_ret_lo{
+ ir.BitFieldInsert(shifted_lo, short_hi_extract, shift_complement, shift)};
+
+ const IR::U32 zero_ret_lo{lo};
+ const IR::U32 zero_ret_hi{hi};
+
+ const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)};
+ const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)};
+
+ const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)};
+ const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)};
+ inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
+}
+
+/// Lowers ShiftRightArithmetic64 to 32-bit operations on the (low, high) halves.
+///
+/// Three candidate results are computed and the final one is picked with Select:
+/// - zero:  shift == 0, both halves are passed through untouched
+/// - long:  shift - 32 >= 0, the high half becomes hi >> 31 (arithmetic) and the low half is
+///          hi >> (shift - 32) (arithmetic)
+/// - short: otherwise, the high half is hi >> shift (arithmetic) and the low half is
+///          lo >> shift (logical) with the bits extracted from the bottom of hi inserted at
+///          bit 32 - shift
+/// Throws NotImplementedException when the instruction has an associated pseudo-operation.
+void ShiftRightArithmetic64To32(IR::Block& block, IR::Inst& inst) {
+ if (inst.HasAssociatedPseudoOperation()) {
+ throw NotImplementedException("ShiftRightArithmetic64 emulation with pseudo instructions");
+ }
+ IR::IREmitter ir(block, IR::Block::InstructionList::s_iterator_to(inst));
+ const auto [lo, hi]{Unpack(ir, inst.Arg(0))};
+ const IR::U32 shift{inst.Arg(1)};
+
+ const IR::U32 shifted_lo{ir.ShiftRightLogical(lo, shift)};
+ const IR::U32 shifted_hi{ir.ShiftRightArithmetic(hi, shift)};
+
+ // hi shifted arithmetically by 31, used as the high half of the long case
+ const IR::U32 sign_extension{ir.ShiftRightArithmetic(hi, ir.Imm32(31))};
+
+ // NOTE(review): the trailing `true` presumably selects a signed comparison - confirm
+ const IR::U32 inv_shift{ir.ISub(shift, ir.Imm32(32))};
+ const IR::U1 is_long{ir.IGreaterThanEqual(inv_shift, ir.Imm32(0), true)};
+ const IR::U1 is_zero{ir.IEqual(shift, ir.Imm32(0))};
+
+ const IR::U32 long_ret_hi{sign_extension};
+ const IR::U32 long_ret_lo{ir.ShiftRightArithmetic(hi, inv_shift)};
+
+ // NOTE(review): BitFieldExtract/BitFieldInsert arguments look like (base[, insert], offset,
+ // count) - confirm against IREmitter
+ const IR::U32 shift_complement{ir.ISub(ir.Imm32(32), shift)};
+ const IR::U32 short_hi_extract(ir.BitFieldExtract(hi, ir.Imm32(0), shift));
+ const IR::U32 short_ret_hi{shifted_hi};
+ const IR::U32 short_ret_lo{
+ ir.BitFieldInsert(shifted_lo, short_hi_extract, shift_complement, shift)};
+
+ const IR::U32 zero_ret_lo{lo};
+ const IR::U32 zero_ret_hi{hi};
+
+ const IR::U32 non_zero_lo{ir.Select(is_long, long_ret_lo, short_ret_lo)};
+ const IR::U32 non_zero_hi{ir.Select(is_long, long_ret_hi, short_ret_hi)};
+
+ const IR::U32 ret_lo{ir.Select(is_zero, zero_ret_lo, non_zero_lo)};
+ const IR::U32 ret_hi{ir.Select(is_zero, zero_ret_hi, non_zero_hi)};
+ inst.ReplaceUsesWith(ir.CompositeConstruct(ret_lo, ret_hi));
+}
+
+/// Lowers a single instruction: 64-bit pack/unpack opcodes become Identity and 64-bit integer
+/// arithmetic and shifts are handed to their 32-bit emulation. Other opcodes are left alone.
+void Lower(IR::Block& block, IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::PackUint2x32:
+    case IR::Opcode::UnpackUint2x32:
+        inst.ReplaceOpcode(IR::Opcode::Identity);
+        break;
+    case IR::Opcode::IAdd64:
+        IAdd64To32(block, inst);
+        break;
+    case IR::Opcode::ISub64:
+        ISub64To32(block, inst);
+        break;
+    case IR::Opcode::INeg64:
+        INeg64To32(block, inst);
+        break;
+    case IR::Opcode::ShiftLeftLogical64:
+        ShiftLeftLogical64To32(block, inst);
+        break;
+    case IR::Opcode::ShiftRightLogical64:
+        ShiftRightLogical64To32(block, inst);
+        break;
+    case IR::Opcode::ShiftRightArithmetic64:
+        ShiftRightArithmetic64To32(block, inst);
+        break;
+    default:
+        break;
+    }
+}
+} // Anonymous namespace
+
+/// Runs Lower on every instruction of the program, visiting post_order_blocks from last to first
+void LowerInt64ToInt32(IR::Program& program) {
+    const auto last{program.post_order_blocks.rend()};
+    auto block_it{program.post_order_blocks.rbegin()};
+    while (block_it != last) {
+        IR::Block& block{**block_it};
+        for (IR::Inst& inst : block.Instructions()) {
+            Lower(block, inst);
+        }
+        ++block_it;
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h
new file mode 100644
index 000000000..2f89b1ea0
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/passes.h
@@ -0,0 +1,32 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <span>
+
+#include "shader_recompiler/environment.h"
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/program.h"
+
+namespace Shader::Optimization {
+
+void CollectShaderInfoPass(Environment& env, IR::Program& program);
+void ConstantPropagationPass(IR::Program& program);
+void DeadCodeEliminationPass(IR::Program& program);
+/// Replaces global memory instructions with storage buffer instructions and fills
+/// program.info.storage_buffers_descriptors
+void GlobalMemoryToStorageBufferPass(IR::Program& program);
+/// Forwards arguments through identity instructions and removes Identity and Void instructions
+void IdentityRemovalPass(IR::Program& program);
+/// Replaces 16-bit floating point opcodes with their 32-bit equivalents
+void LowerFp16ToFp32(IR::Program& program);
+/// Emulates 64-bit integer additions, subtractions, negations and shifts with 32-bit operations
+void LowerInt64ToInt32(IR::Program& program);
+void SsaRewritePass(IR::Program& program);
+void TexturePass(Environment& env, IR::Program& program);
+void VerificationPass(const IR::Program& program);
+
+// Dual Vertex
+void VertexATransformPass(IR::Program& program);
+void VertexBTransformPass(IR::Program& program);
+void JoinTextureInfo(Info& base, Info& source);
+/// Merges the storage buffer descriptors of source into base
+void JoinStorageInfo(Info& base, Info& source);
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp
new file mode 100644
index 000000000..53145fb5e
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/ssa_rewrite_pass.cpp
@@ -0,0 +1,383 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+// This file implements the SSA rewriting algorithm proposed in
+//
+// Simple and Efficient Construction of Static Single Assignment Form.
+// Braun M., Buchwald S., Hack S., Leiba R., Mallon C., Zwinkau A. (2013)
+// In: Jhala R., De Bosschere K. (eds)
+// Compiler Construction. CC 2013.
+// Lecture Notes in Computer Science, vol 7791.
+// Springer, Berlin, Heidelberg
+//
+// https://link.springer.com/chapter/10.1007/978-3-642-37051-9_6
+//
+
+#include <span>
+#include <variant>
+#include <vector>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/opcodes.h"
+#include "shader_recompiler/frontend/ir/pred.h"
+#include "shader_recompiler/frontend/ir/reg.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+namespace {
+/// Common base of the flag variable tags. UndefOpcode(const FlagTag&) accepts every type
+/// derived from it.
+struct FlagTag {
+ auto operator<=>(const FlagTag&) const noexcept = default;
+};
+// Empty tag types, one per flag; each one selects its own Def/SetDef overload in DefTable
+struct ZeroFlagTag : FlagTag {};
+struct SignFlagTag : FlagTag {};
+struct CarryFlagTag : FlagTag {};
+struct OverflowFlagTag : FlagTag {};
+
+/// SSA variable that identifies a goto variable by its index.
+/// Derives from FlagTag, so it shares the FlagTag overload of UndefOpcode.
+struct GotoVariable : FlagTag {
+    GotoVariable() = default;
+    explicit GotoVariable(u32 index_) : index{index_} {}
+
+    auto operator<=>(const GotoVariable&) const noexcept = default;
+
+    // Value-initialized so that a default-constructed GotoVariable never exposes an
+    // indeterminate index to the defaulted comparison operators
+    u32 index{};
+};
+
+/// Stateless tag for the indirect branch variable; DefTable keeps a single map for it
+struct IndirectBranchVariable {
+ auto operator<=>(const IndirectBranchVariable&) const noexcept = default;
+};
+
+/// Any of the variable kinds handled by the SSA rewrite
+using Variant = std::variant<IR::Reg, IR::Pred, ZeroFlagTag, SignFlagTag, CarryFlagTag,
+ OverflowFlagTag, GotoVariable, IndirectBranchVariable>;
+/// Definition of one variable, keyed by the block the definition belongs to
+using ValueMap = boost::container::flat_map<IR::Block*, IR::Value>;
+
+/// Stores the current definition of every variable in every block.
+///
+/// Def returns the value recorded for a (block, variable) pair and SetDef records one. Register
+/// definitions are stored on the block itself; every other variable kind has its own ValueMap.
+/// The map-backed Def overloads use operator[], which default-inserts a missing entry, so a
+/// variable without a definition reads as a default-constructed IR::Value.
+struct DefTable {
+ // Registers: stored in the block
+ const IR::Value& Def(IR::Block* block, IR::Reg variable) {
+ return block->SsaRegValue(variable);
+ }
+ void SetDef(IR::Block* block, IR::Reg variable, const IR::Value& value) {
+ block->SetSsaRegValue(variable, value);
+ }
+
+ // Predicates: one map per predicate index
+ const IR::Value& Def(IR::Block* block, IR::Pred variable) {
+ return preds[IR::PredIndex(variable)][block];
+ }
+ void SetDef(IR::Block* block, IR::Pred variable, const IR::Value& value) {
+ preds[IR::PredIndex(variable)].insert_or_assign(block, value);
+ }
+
+ // Goto variables: one map per goto variable index, created on first access
+ const IR::Value& Def(IR::Block* block, GotoVariable variable) {
+ return goto_vars[variable.index][block];
+ }
+ void SetDef(IR::Block* block, GotoVariable variable, const IR::Value& value) {
+ goto_vars[variable.index].insert_or_assign(block, value);
+ }
+
+ // Indirect branch variable
+ const IR::Value& Def(IR::Block* block, IndirectBranchVariable) {
+ return indirect_branch_var[block];
+ }
+ void SetDef(IR::Block* block, IndirectBranchVariable, const IR::Value& value) {
+ indirect_branch_var.insert_or_assign(block, value);
+ }
+
+ // Zero flag
+ const IR::Value& Def(IR::Block* block, ZeroFlagTag) {
+ return zero_flag[block];
+ }
+ void SetDef(IR::Block* block, ZeroFlagTag, const IR::Value& value) {
+ zero_flag.insert_or_assign(block, value);
+ }
+
+ // Sign flag
+ const IR::Value& Def(IR::Block* block, SignFlagTag) {
+ return sign_flag[block];
+ }
+ void SetDef(IR::Block* block, SignFlagTag, const IR::Value& value) {
+ sign_flag.insert_or_assign(block, value);
+ }
+
+ // Carry flag
+ const IR::Value& Def(IR::Block* block, CarryFlagTag) {
+ return carry_flag[block];
+ }
+ void SetDef(IR::Block* block, CarryFlagTag, const IR::Value& value) {
+ carry_flag.insert_or_assign(block, value);
+ }
+
+ // Overflow flag
+ const IR::Value& Def(IR::Block* block, OverflowFlagTag) {
+ return overflow_flag[block];
+ }
+ void SetDef(IR::Block* block, OverflowFlagTag, const IR::Value& value) {
+ overflow_flag.insert_or_assign(block, value);
+ }
+
+ std::array<ValueMap, IR::NUM_USER_PREDS> preds;
+ boost::container::flat_map<u32, ValueMap> goto_vars;
+ ValueMap indirect_branch_var;
+ ValueMap zero_flag;
+ ValueMap sign_flag;
+ ValueMap carry_flag;
+ ValueMap overflow_flag;
+};
+
+// UndefOpcode returns the undefined-value opcode matching a variable kind. ReadVariable also
+// uses it, through IR::TypeOf, to set the flags of the phi nodes it creates.
+
+/// Registers use UndefU32
+IR::Opcode UndefOpcode(IR::Reg) noexcept {
+ return IR::Opcode::UndefU32;
+}
+
+/// Predicates use UndefU1
+IR::Opcode UndefOpcode(IR::Pred) noexcept {
+ return IR::Opcode::UndefU1;
+}
+
+/// Flag tags and types derived from FlagTag (GotoVariable included) use UndefU1
+IR::Opcode UndefOpcode(const FlagTag&) noexcept {
+ return IR::Opcode::UndefU1;
+}
+
+/// The indirect branch variable uses UndefU32
+IR::Opcode UndefOpcode(IndirectBranchVariable) noexcept {
+ return IR::Opcode::UndefU32;
+}
+
+/// Resume point of a ReadState entry on the explicit stack used by Pass::ReadVariable
+enum class Status {
+ // Initial state of an entry: the block has not been examined yet
+ Start,
+ // Set before pushing the single predecessor; on resume the result is written as the block's
+ // definition and the entry is popped
+ SetValue,
+ // Handled by calling prepare_phi_operand, which either pushes the next predecessor or
+ // finishes the phi
+ PreparePhiArgument,
+ // Set before pushing a predecessor of a phi; on resume the result is added as a phi operand
+ // and the predecessor iterator advances
+ PushPhiArgument,
+};
+
+/// One entry of the explicit stack used by Pass::ReadVariable in place of recursion.
+/// The Type parameter is not referenced by any member shown here.
+template <typename Type>
+struct ReadState {
+ ReadState(IR::Block* block_) : block{block_} {}
+ ReadState() = default;
+
+ // Block whose definition of the variable is being read
+ IR::Block* block{};
+ // Value produced for this entry, or handed back by the entry popped above it
+ IR::Value result{};
+ // Phi created for this block, when one was needed
+ IR::Inst* phi{};
+ // Range of immediate predecessors still to be added as phi operands
+ IR::Block* const* pred_it{};
+ IR::Block* const* pred_end{};
+ // Where to resume when this entry is on top of the stack again
+ Status pc{Status::Start};
+};
+
+// On-the-fly SSA construction state: tracks the current definition of every variable per block
+// and the phi nodes created in blocks that were read before being sealed.
+class Pass {
+public:
+    // Records `value` as the current definition of `variable` in `block`
+    template <typename Type>
+    void WriteVariable(Type variable, IR::Block* block, const IR::Value& value) {
+        current_def.SetDef(block, variable, value);
+    }
+
+    // Returns the value of `variable` as seen from `root_block`, creating phi nodes in blocks
+    // where it is needed. The lookup walks the predecessors with an explicit stack of ReadState
+    // frames instead of recursing.
+    template <typename Type>
+    IR::Value ReadVariable(Type variable, IR::Block* root_block) {
+        // The bottom frame only receives the final result; the loop ends when it is on top
+        boost::container::small_vector<ReadState<Type>, 64> stack{
+            ReadState<Type>(nullptr),
+            ReadState<Type>(root_block),
+        };
+        // Either finishes the phi of the top frame (all predecessors consumed) or pushes a frame
+        // to evaluate its next predecessor
+        const auto prepare_phi_operand{[&] {
+            if (stack.back().pred_it == stack.back().pred_end) {
+                IR::Inst* const phi{stack.back().phi};
+                IR::Block* const block{stack.back().block};
+                const IR::Value result{TryRemoveTrivialPhi(*phi, block, UndefOpcode(variable))};
+                stack.pop_back();
+                stack.back().result = result;
+                WriteVariable(variable, block, result);
+            } else {
+                IR::Block* const imm_pred{*stack.back().pred_it};
+                stack.back().pc = Status::PushPhiArgument;
+                stack.emplace_back(imm_pred);
+            }
+        }};
+        do {
+            IR::Block* const block{stack.back().block};
+            switch (stack.back().pc) {
+            case Status::Start: {
+                if (const IR::Value& def = current_def.Def(block, variable); !def.IsEmpty()) {
+                    stack.back().result = def;
+                } else if (!block->IsSsaSealed()) {
+                    // Incomplete CFG
+                    IR::Inst* phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)};
+                    phi->SetFlags(IR::TypeOf(UndefOpcode(variable)));
+
+                    incomplete_phis[block].insert_or_assign(variable, phi);
+                    stack.back().result = IR::Value{&*phi};
+                } else if (const std::span imm_preds = block->ImmPredecessors();
+                           imm_preds.size() == 1) {
+                    // Optimize the common case of one predecessor: no phi needed
+                    stack.back().pc = Status::SetValue;
+                    stack.emplace_back(imm_preds.front());
+                    break;
+                } else {
+                    // Break potential cycles with operandless phi
+                    IR::Inst* const phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)};
+                    phi->SetFlags(IR::TypeOf(UndefOpcode(variable)));
+
+                    WriteVariable(variable, block, IR::Value{phi});
+
+                    stack.back().phi = phi;
+                    stack.back().pred_it = imm_preds.data();
+                    stack.back().pred_end = imm_preds.data() + imm_preds.size();
+                    prepare_phi_operand();
+                    break;
+                }
+            }
+                [[fallthrough]];
+            case Status::SetValue: {
+                // Cache the value in this block and hand it to the frame below
+                const IR::Value result{stack.back().result};
+                WriteVariable(variable, block, result);
+                stack.pop_back();
+                stack.back().result = result;
+                break;
+            }
+            case Status::PushPhiArgument: {
+                // The frame above computed the value coming from *pred_it
+                IR::Inst* const phi{stack.back().phi};
+                phi->AddPhiOperand(*stack.back().pred_it, stack.back().result);
+                ++stack.back().pred_it;
+            }
+                [[fallthrough]];
+            case Status::PreparePhiArgument:
+                prepare_phi_operand();
+                break;
+            }
+        } while (stack.size() > 1);
+        return stack.back().result;
+    }
+
+    // Fills in the operands of every phi created in `block` while it was unsealed and then marks
+    // the block as sealed
+    void SealBlock(IR::Block* block) {
+        const auto it{incomplete_phis.find(block)};
+        if (it != incomplete_phis.end()) {
+            // Walk a snapshot of the pending phis: completing a phi reads the predecessors, and
+            // reading an unsealed predecessor inserts into incomplete_phis. That insertion would
+            // invalidate `it` and the range being iterated.
+            const auto pending{it->second};
+            for (const auto& entry : pending) {
+                IR::Inst* const phi{entry.second};
+                std::visit(
+                    [&](const auto& variable) { AddPhiOperands(variable, *phi, block); },
+                    entry.first);
+            }
+        }
+        block->SsaSeal();
+    }
+
+private:
+    // Adds one operand per immediate predecessor of `block` to `phi`, then tries to simplify it
+    template <typename Type>
+    IR::Value AddPhiOperands(Type variable, IR::Inst& phi, IR::Block* block) {
+        for (IR::Block* const imm_pred : block->ImmPredecessors()) {
+            phi.AddPhiOperand(imm_pred, ReadVariable(variable, imm_pred));
+        }
+        return TryRemoveTrivialPhi(phi, block, UndefOpcode(variable));
+    }
+
+    // Returns `phi` itself when it merges at least two distinct values. Otherwise reroutes its
+    // uses to the single value it carries (or to a new `undef_opcode` instruction when it
+    // carries none) and returns that value.
+    IR::Value TryRemoveTrivialPhi(IR::Inst& phi, IR::Block* block, IR::Opcode undef_opcode) {
+        IR::Value same;
+        const size_t num_args{phi.NumArgs()};
+        for (size_t arg_index = 0; arg_index < num_args; ++arg_index) {
+            const IR::Value& op{phi.Arg(arg_index)};
+            if (op.Resolve() == same.Resolve() || op == IR::Value{&phi}) {
+                // Unique value or self-reference
+                continue;
+            }
+            if (!same.IsEmpty()) {
+                // The phi merges at least two values: not trivial
+                return IR::Value{&phi};
+            }
+            same = op;
+        }
+        // Remove the phi node from the block, it will be reinserted
+        IR::Block::InstructionList& list{block->Instructions()};
+        list.erase(IR::Block::InstructionList::s_iterator_to(phi));
+
+        // Find the first non-phi instruction and use it as an insertion point
+        IR::Block::iterator reinsert_point{std::ranges::find_if_not(list, IR::IsPhi)};
+        if (same.IsEmpty()) {
+            // The phi is unreachable or in the start block
+            // Insert an undefined instruction and make it the phi node replacement
+            // The "phi" node reinsertion point is specified after this instruction
+            reinsert_point = block->PrependNewInst(reinsert_point, undef_opcode);
+            same = IR::Value{&*reinsert_point};
+            ++reinsert_point;
+        }
+        // Reinsert the phi node and reroute all its uses to the "same" value
+        list.insert(reinsert_point, phi);
+        phi.ReplaceUsesWith(same);
+        // TODO: Try to recursively remove all phi users, which might have become trivial
+        return same;
+    }
+
+    // Phis inserted into blocks that were read before being sealed, by block and variable
+    boost::container::flat_map<IR::Block*, boost::container::flat_map<Variant, IR::Inst*>>
+        incomplete_phis;
+    // Current definition of each variable in each block
+    DefTable current_def;
+};
+
+// Feeds one instruction to the SSA construction: setters define a variable in `block`, getters
+// have their uses replaced by the value read for that variable. RZ and PT accesses and all other
+// opcodes are left untouched.
+void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
+    const auto define{[&](auto variable, const IR::Value& value) {
+        pass.WriteVariable(variable, block, value);
+    }};
+    const auto forward{[&](auto variable) {
+        inst.ReplaceUsesWith(pass.ReadVariable(variable, block));
+    }};
+    switch (inst.GetOpcode()) {
+    // Writes
+    case IR::Opcode::SetRegister: {
+        const IR::Reg reg{inst.Arg(0).Reg()};
+        if (reg != IR::Reg::RZ) {
+            define(reg, inst.Arg(1));
+        }
+        break;
+    }
+    case IR::Opcode::SetPred: {
+        const IR::Pred pred{inst.Arg(0).Pred()};
+        if (pred != IR::Pred::PT) {
+            define(pred, inst.Arg(1));
+        }
+        break;
+    }
+    case IR::Opcode::SetGotoVariable:
+        define(GotoVariable{inst.Arg(0).U32()}, inst.Arg(1));
+        break;
+    case IR::Opcode::SetIndirectBranchVariable:
+        define(IndirectBranchVariable{}, inst.Arg(0));
+        break;
+    case IR::Opcode::SetZFlag:
+        define(ZeroFlagTag{}, inst.Arg(0));
+        break;
+    case IR::Opcode::SetSFlag:
+        define(SignFlagTag{}, inst.Arg(0));
+        break;
+    case IR::Opcode::SetCFlag:
+        define(CarryFlagTag{}, inst.Arg(0));
+        break;
+    case IR::Opcode::SetOFlag:
+        define(OverflowFlagTag{}, inst.Arg(0));
+        break;
+    // Reads
+    case IR::Opcode::GetRegister: {
+        const IR::Reg reg{inst.Arg(0).Reg()};
+        if (reg != IR::Reg::RZ) {
+            forward(reg);
+        }
+        break;
+    }
+    case IR::Opcode::GetPred: {
+        const IR::Pred pred{inst.Arg(0).Pred()};
+        if (pred != IR::Pred::PT) {
+            forward(pred);
+        }
+        break;
+    }
+    case IR::Opcode::GetGotoVariable:
+        forward(GotoVariable{inst.Arg(0).U32()});
+        break;
+    case IR::Opcode::GetIndirectBranchVariable:
+        forward(IndirectBranchVariable{});
+        break;
+    case IR::Opcode::GetZFlag:
+        forward(ZeroFlagTag{});
+        break;
+    case IR::Opcode::GetSFlag:
+        forward(SignFlagTag{});
+        break;
+    case IR::Opcode::GetCFlag:
+        forward(CarryFlagTag{});
+        break;
+    case IR::Opcode::GetOFlag:
+        forward(OverflowFlagTag{});
+        break;
+    default:
+        break;
+    }
+}
+
+// Visits every instruction of `block` in order and seals the block afterwards
+void VisitBlock(Pass& pass, IR::Block* block) {
+    auto& instructions{block->Instructions()};
+    const auto last{instructions.end()};
+    for (auto it = instructions.begin(); it != last; ++it) {
+        VisitInst(pass, block, *it);
+    }
+    pass.SealBlock(block);
+}
+} // Anonymous namespace
+
+// Runs the SSA rewrite over the whole program, walking post_order_blocks back to front
+void SsaRewritePass(IR::Program& program) {
+    Pass pass;
+    auto block_it{program.post_order_blocks.rbegin()};
+    const auto last{program.post_order_blocks.rend()};
+    while (block_it != last) {
+        VisitBlock(pass, *block_it);
+        ++block_it;
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/texture_pass.cpp b/src/shader_recompiler/ir_opt/texture_pass.cpp
new file mode 100644
index 000000000..44ad10d43
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/texture_pass.cpp
@@ -0,0 +1,523 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <bit>
+#include <optional>
+
+#include <boost/container/small_vector.hpp>
+
+#include "shader_recompiler/environment.h"
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/breadth_first_search.h"
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/ir_opt/passes.h"
+#include "shader_recompiler/shader_info.h"
+
+namespace Shader::Optimization {
+namespace {
+struct ConstBufferAddr { // Constant buffer location(s) a texture handle is read from
+ u32 index; // Constant buffer index of the (primary) handle
+ u32 offset; // Immediate offset of the (primary) handle inside that buffer
+ u32 secondary_index; // Constant buffer index of the second handle; 0 when has_secondary is false
+ u32 secondary_offset; // Offset of the second handle; 0 when has_secondary is false
+ IR::U32 dynamic_offset; // Non-immediate part of the offset; empty unless count > 1
+ u32 count; // 1 for a fixed location, 8 when TryGetConstBuffer finds a dynamic offset
+ bool has_secondary; // True when the handle is the bitwise OR of two constant buffer reads
+};
+
+struct TextureInst { // A texture/image instruction queued for rewriting by TexturePass
+ ConstBufferAddr cbuf; // Where the instruction's handle comes from
+ IR::Inst* inst; // The instruction to rewrite
+ IR::Block* block; // Block that contains `inst`
+};
+
+using TextureInstVector = boost::container::small_vector<TextureInst, 24>; // Work list of TexturePass
+
+constexpr u32 DESCRIPTOR_SIZE = 8; // NOTE(review): presumably the stride of one handle entry - confirm the unit
+constexpr u32 DESCRIPTOR_SIZE_SHIFT = static_cast<u32>(std::countr_zero(DESCRIPTOR_SIZE)); // log2(DESCRIPTOR_SIZE)
+
+// Maps a bound or bindless texture/image opcode to its descriptor-indexed counterpart.
+// Any other opcode maps to IR::Opcode::Void.
+IR::Opcode IndexedInstruction(const IR::Inst& inst) {
+    const IR::Opcode opcode{inst.GetOpcode()};
+    switch (opcode) {
+    // Sampling
+    case IR::Opcode::BoundImageSampleImplicitLod:
+    case IR::Opcode::BindlessImageSampleImplicitLod:
+        return IR::Opcode::ImageSampleImplicitLod;
+    case IR::Opcode::BoundImageSampleExplicitLod:
+    case IR::Opcode::BindlessImageSampleExplicitLod:
+        return IR::Opcode::ImageSampleExplicitLod;
+    case IR::Opcode::BoundImageSampleDrefImplicitLod:
+    case IR::Opcode::BindlessImageSampleDrefImplicitLod:
+        return IR::Opcode::ImageSampleDrefImplicitLod;
+    case IR::Opcode::BoundImageSampleDrefExplicitLod:
+    case IR::Opcode::BindlessImageSampleDrefExplicitLod:
+        return IR::Opcode::ImageSampleDrefExplicitLod;
+    case IR::Opcode::BoundImageGather:
+    case IR::Opcode::BindlessImageGather:
+        return IR::Opcode::ImageGather;
+    case IR::Opcode::BoundImageGatherDref:
+    case IR::Opcode::BindlessImageGatherDref:
+        return IR::Opcode::ImageGatherDref;
+    case IR::Opcode::BoundImageFetch:
+    case IR::Opcode::BindlessImageFetch:
+        return IR::Opcode::ImageFetch;
+    case IR::Opcode::BoundImageGradient:
+    case IR::Opcode::BindlessImageGradient:
+        return IR::Opcode::ImageGradient;
+    // Queries
+    case IR::Opcode::BoundImageQueryDimensions:
+    case IR::Opcode::BindlessImageQueryDimensions:
+        return IR::Opcode::ImageQueryDimensions;
+    case IR::Opcode::BoundImageQueryLod:
+    case IR::Opcode::BindlessImageQueryLod:
+        return IR::Opcode::ImageQueryLod;
+    // Image load/store
+    case IR::Opcode::BoundImageRead:
+    case IR::Opcode::BindlessImageRead:
+        return IR::Opcode::ImageRead;
+    case IR::Opcode::BoundImageWrite:
+    case IR::Opcode::BindlessImageWrite:
+        return IR::Opcode::ImageWrite;
+    // Image atomics
+    case IR::Opcode::BoundImageAtomicIAdd32:
+    case IR::Opcode::BindlessImageAtomicIAdd32:
+        return IR::Opcode::ImageAtomicIAdd32;
+    case IR::Opcode::BoundImageAtomicSMin32:
+    case IR::Opcode::BindlessImageAtomicSMin32:
+        return IR::Opcode::ImageAtomicSMin32;
+    case IR::Opcode::BoundImageAtomicUMin32:
+    case IR::Opcode::BindlessImageAtomicUMin32:
+        return IR::Opcode::ImageAtomicUMin32;
+    case IR::Opcode::BoundImageAtomicSMax32:
+    case IR::Opcode::BindlessImageAtomicSMax32:
+        return IR::Opcode::ImageAtomicSMax32;
+    case IR::Opcode::BoundImageAtomicUMax32:
+    case IR::Opcode::BindlessImageAtomicUMax32:
+        return IR::Opcode::ImageAtomicUMax32;
+    case IR::Opcode::BoundImageAtomicInc32:
+    case IR::Opcode::BindlessImageAtomicInc32:
+        return IR::Opcode::ImageAtomicInc32;
+    case IR::Opcode::BoundImageAtomicDec32:
+    case IR::Opcode::BindlessImageAtomicDec32:
+        return IR::Opcode::ImageAtomicDec32;
+    case IR::Opcode::BoundImageAtomicAnd32:
+    case IR::Opcode::BindlessImageAtomicAnd32:
+        return IR::Opcode::ImageAtomicAnd32;
+    case IR::Opcode::BoundImageAtomicOr32:
+    case IR::Opcode::BindlessImageAtomicOr32:
+        return IR::Opcode::ImageAtomicOr32;
+    case IR::Opcode::BoundImageAtomicXor32:
+    case IR::Opcode::BindlessImageAtomicXor32:
+        return IR::Opcode::ImageAtomicXor32;
+    case IR::Opcode::BoundImageAtomicExchange32:
+    case IR::Opcode::BindlessImageAtomicExchange32:
+        return IR::Opcode::ImageAtomicExchange32;
+    default:
+        // Not a texture or image instruction
+        return IR::Opcode::Void;
+    }
+}
+
+// Tells whether a texture/image instruction is of the Bindless* family (true) or of the Bound*
+// family (false). Throws InvalidArgument for any other opcode.
+bool IsBindless(const IR::Inst& inst) {
+    const IR::Opcode opcode{inst.GetOpcode()};
+    switch (opcode) {
+    case IR::Opcode::BoundImageSampleImplicitLod:
+    case IR::Opcode::BoundImageSampleExplicitLod:
+    case IR::Opcode::BoundImageSampleDrefImplicitLod:
+    case IR::Opcode::BoundImageSampleDrefExplicitLod:
+    case IR::Opcode::BoundImageGather:
+    case IR::Opcode::BoundImageGatherDref:
+    case IR::Opcode::BoundImageFetch:
+    case IR::Opcode::BoundImageQueryDimensions:
+    case IR::Opcode::BoundImageQueryLod:
+    case IR::Opcode::BoundImageGradient:
+    case IR::Opcode::BoundImageRead:
+    case IR::Opcode::BoundImageWrite:
+    case IR::Opcode::BoundImageAtomicIAdd32:
+    case IR::Opcode::BoundImageAtomicSMin32:
+    case IR::Opcode::BoundImageAtomicUMin32:
+    case IR::Opcode::BoundImageAtomicSMax32:
+    case IR::Opcode::BoundImageAtomicUMax32:
+    case IR::Opcode::BoundImageAtomicInc32:
+    case IR::Opcode::BoundImageAtomicDec32:
+    case IR::Opcode::BoundImageAtomicAnd32:
+    case IR::Opcode::BoundImageAtomicOr32:
+    case IR::Opcode::BoundImageAtomicXor32:
+    case IR::Opcode::BoundImageAtomicExchange32:
+        return false;
+    case IR::Opcode::BindlessImageSampleImplicitLod:
+    case IR::Opcode::BindlessImageSampleExplicitLod:
+    case IR::Opcode::BindlessImageSampleDrefImplicitLod:
+    case IR::Opcode::BindlessImageSampleDrefExplicitLod:
+    case IR::Opcode::BindlessImageGather:
+    case IR::Opcode::BindlessImageGatherDref:
+    case IR::Opcode::BindlessImageFetch:
+    case IR::Opcode::BindlessImageQueryDimensions:
+    case IR::Opcode::BindlessImageQueryLod:
+    case IR::Opcode::BindlessImageGradient:
+    case IR::Opcode::BindlessImageRead:
+    case IR::Opcode::BindlessImageWrite:
+    case IR::Opcode::BindlessImageAtomicIAdd32:
+    case IR::Opcode::BindlessImageAtomicSMin32:
+    case IR::Opcode::BindlessImageAtomicUMin32:
+    case IR::Opcode::BindlessImageAtomicSMax32:
+    case IR::Opcode::BindlessImageAtomicUMax32:
+    case IR::Opcode::BindlessImageAtomicInc32:
+    case IR::Opcode::BindlessImageAtomicDec32:
+    case IR::Opcode::BindlessImageAtomicAnd32:
+    case IR::Opcode::BindlessImageAtomicOr32:
+    case IR::Opcode::BindlessImageAtomicXor32:
+    case IR::Opcode::BindlessImageAtomicExchange32:
+        return true;
+    default:
+        throw InvalidArgument("Invalid opcode {}", inst.GetOpcode());
+    }
+}
+
+// An instruction is a texture instruction when it has an indexed counterpart
+bool IsTextureInstruction(const IR::Inst& inst) {
+    const IR::Opcode indexed{IndexedInstruction(inst)};
+    const bool has_indexed_form{indexed != IR::Opcode::Void};
+    return has_indexed_form;
+}
+
+std::optional<ConstBufferAddr> TryGetConstBuffer(const IR::Inst* inst);
+
+// Searches the instructions producing `value` (breadth first) for one that TryGetConstBuffer
+// can turn into a constant buffer address
+std::optional<ConstBufferAddr> Track(const IR::Value& value) {
+    const auto visit{[](const IR::Inst* inst) { return TryGetConstBuffer(inst); }};
+    return IR::BreadthFirstSearch(value, visit);
+}
+
+// Tries to interpret `inst` as the source of a texture handle.
+// - GetCbufU32/GetCbufU32x2 with an immediate buffer index and either an immediate offset or an
+//   IAdd32 of an immediate and a dynamic value yield a single-location address.
+// - BitwiseOr32 of two trackable single handles yields an address with a secondary location.
+// Returns std::nullopt for everything else.
+std::optional<ConstBufferAddr> TryGetConstBuffer(const IR::Inst* inst) {
+    switch (inst->GetOpcode()) {
+    case IR::Opcode::GetCbufU32:
+    case IR::Opcode::GetCbufU32x2:
+        // Handled after the switch
+        break;
+    case IR::Opcode::BitwiseOr32: {
+        std::optional lhs{Track(inst->Arg(0))};
+        std::optional rhs{Track(inst->Arg(1))};
+        if (!lhs || !rhs) {
+            return std::nullopt;
+        }
+        if (lhs->has_secondary || rhs->has_secondary) {
+            // Combining more than two handles is not supported
+            return std::nullopt;
+        }
+        if (lhs->count > 1 || rhs->count > 1) {
+            // Dynamically indexed handles cannot be combined
+            return std::nullopt;
+        }
+        // The OR is commutative, so order the pair by (index, offset) to make both operand
+        // orders produce the same address. Comparing index and offset independently is not an
+        // ordering and lets OR(a, b) and OR(b, a) end up as two different addresses.
+        const bool lhs_is_greater{lhs->index > rhs->index ||
+                                  (lhs->index == rhs->index && lhs->offset > rhs->offset)};
+        if (lhs_is_greater) {
+            std::swap(lhs, rhs);
+        }
+        return ConstBufferAddr{
+            .index = lhs->index,
+            .offset = lhs->offset,
+            .secondary_index = rhs->index,
+            .secondary_offset = rhs->offset,
+            .dynamic_offset = {},
+            .count = 1,
+            .has_secondary = true,
+        };
+    }
+    default:
+        return std::nullopt;
+    }
+    const IR::Value index{inst->Arg(0)};
+    const IR::Value offset{inst->Arg(1)};
+    if (!index.IsImmediate()) {
+        // Reading a bindless texture from variable indices is valid
+        // but not supported here at the moment
+        return std::nullopt;
+    }
+    if (offset.IsImmediate()) {
+        return ConstBufferAddr{
+            .index = index.U32(),
+            .offset = offset.U32(),
+            .secondary_index = 0,
+            .secondary_offset = 0,
+            .dynamic_offset = {},
+            .count = 1,
+            .has_secondary = false,
+        };
+    }
+    // Only "immediate + dynamic" offsets are recognized
+    IR::Inst* const offset_inst{offset.InstRecursive()};
+    if (offset_inst->GetOpcode() != IR::Opcode::IAdd32) {
+        return std::nullopt;
+    }
+    u32 base_offset{};
+    IR::U32 dynamic_offset;
+    if (offset_inst->Arg(0).IsImmediate()) {
+        base_offset = offset_inst->Arg(0).U32();
+        dynamic_offset = IR::U32{offset_inst->Arg(1)};
+    } else if (offset_inst->Arg(1).IsImmediate()) {
+        base_offset = offset_inst->Arg(1).U32();
+        dynamic_offset = IR::U32{offset_inst->Arg(0)};
+    } else {
+        return std::nullopt;
+    }
+    return ConstBufferAddr{
+        .index = index.U32(),
+        .offset = base_offset,
+        .secondary_index = 0,
+        .secondary_offset = 0,
+        .dynamic_offset = dynamic_offset,
+        .count = 8,
+        .has_secondary = false,
+    };
+}
+
+// Builds the work item for a texture instruction. Bindless instructions have their handle
+// tracked back to a constant buffer (throws NotImplementedException when that fails); bound
+// instructions read it from the environment's texture bound buffer at the immediate offset.
+TextureInst MakeInst(Environment& env, IR::Block* block, IR::Inst& inst) {
+    if (IsBindless(inst)) {
+        const std::optional<ConstBufferAddr> tracked{Track(inst.Arg(0))};
+        if (!tracked.has_value()) {
+            throw NotImplementedException("Failed to track bindless texture constant buffer");
+        }
+        return TextureInst{
+            .cbuf = *tracked,
+            .inst = &inst,
+            .block = block,
+        };
+    }
+    const ConstBufferAddr bound_addr{
+        .index = env.TextureBoundBuffer(),
+        .offset = inst.Arg(0).U32(),
+        .secondary_index = 0,
+        .secondary_offset = 0,
+        .dynamic_offset = {},
+        .count = 1,
+        .has_secondary = false,
+    };
+    return TextureInst{
+        .cbuf = bound_addr,
+        .inst = &inst,
+        .block = block,
+    };
+}
+
+// Reads the handle word(s) named by `cbuf` from the environment and asks it for the texture type
+// of the combined handle. Without a secondary location the primary word is read twice.
+TextureType ReadTextureType(Environment& env, const ConstBufferAddr& cbuf) {
+    u32 second_index{cbuf.index};
+    u32 second_offset{cbuf.offset};
+    if (cbuf.has_secondary) {
+        second_index = cbuf.secondary_index;
+        second_offset = cbuf.secondary_offset;
+    }
+    const u32 first_word{env.ReadCbufValue(cbuf.index, cbuf.offset)};
+    const u32 second_word{env.ReadCbufValue(second_index, second_offset)};
+    const u32 handle{first_word | second_word};
+    return env.ReadTextureType(handle);
+}
+
+// Deduplicating front end over the four descriptor lists of a shader. Each Add overload returns
+// the index of an equivalent existing descriptor, appending `desc` when there is none.
+class Descriptors {
+public:
+    explicit Descriptors(TextureBufferDescriptors& texture_buffer_descriptors_,
+                         ImageBufferDescriptors& image_buffer_descriptors_,
+                         TextureDescriptors& texture_descriptors_,
+                         ImageDescriptors& image_descriptors_)
+        : texture_buffer_descriptors{texture_buffer_descriptors_},
+          image_buffer_descriptors{image_buffer_descriptors_},
+          texture_descriptors{texture_descriptors_}, image_descriptors{image_descriptors_} {}
+
+    u32 Add(const TextureBufferDescriptor& desc) {
+        const auto matches{[&desc](const auto& other) {
+            return other.cbuf_index == desc.cbuf_index &&
+                   other.cbuf_offset == desc.cbuf_offset &&
+                   other.secondary_cbuf_index == desc.secondary_cbuf_index &&
+                   other.secondary_cbuf_offset == desc.secondary_cbuf_offset &&
+                   other.count == desc.count && other.size_shift == desc.size_shift &&
+                   other.has_secondary == desc.has_secondary;
+        }};
+        return Add(texture_buffer_descriptors, desc, matches);
+    }
+
+    // Read/write usage of an already known image buffer is merged into the existing entry
+    u32 Add(const ImageBufferDescriptor& desc) {
+        const auto matches{[&desc](const auto& other) {
+            return other.format == desc.format && other.cbuf_index == desc.cbuf_index &&
+                   other.cbuf_offset == desc.cbuf_offset && other.count == desc.count &&
+                   other.size_shift == desc.size_shift;
+        }};
+        const u32 slot{Add(image_buffer_descriptors, desc, matches)};
+        auto& stored{image_buffer_descriptors[slot]};
+        stored.is_written |= desc.is_written;
+        stored.is_read |= desc.is_read;
+        return slot;
+    }
+
+    u32 Add(const TextureDescriptor& desc) {
+        const auto matches{[&desc](const auto& other) {
+            return other.type == desc.type && other.is_depth == desc.is_depth &&
+                   other.has_secondary == desc.has_secondary &&
+                   other.cbuf_index == desc.cbuf_index &&
+                   other.cbuf_offset == desc.cbuf_offset &&
+                   other.secondary_cbuf_index == desc.secondary_cbuf_index &&
+                   other.secondary_cbuf_offset == desc.secondary_cbuf_offset &&
+                   other.count == desc.count && other.size_shift == desc.size_shift;
+        }};
+        return Add(texture_descriptors, desc, matches);
+    }
+
+    // Read/write usage of an already known image is merged into the existing entry
+    u32 Add(const ImageDescriptor& desc) {
+        const auto matches{[&desc](const auto& other) {
+            return other.type == desc.type && other.format == desc.format &&
+                   other.cbuf_index == desc.cbuf_index &&
+                   other.cbuf_offset == desc.cbuf_offset && other.count == desc.count &&
+                   other.size_shift == desc.size_shift;
+        }};
+        const u32 slot{Add(image_descriptors, desc, matches)};
+        auto& stored{image_descriptors[slot]};
+        stored.is_written |= desc.is_written;
+        stored.is_read |= desc.is_read;
+        return slot;
+    }
+
+private:
+    // Returns the position of the first element satisfying `pred`; when none does, appends
+    // `desc` and returns its position
+    template <typename Container, typename Descriptor, typename Func>
+    static u32 Add(Container& container, const Descriptor& desc, Func&& pred) {
+        // TODO: Handle arrays
+        const auto found{std::ranges::find_if(container, pred)};
+        if (found == container.end()) {
+            container.push_back(desc);
+            return static_cast<u32>(container.size()) - 1;
+        }
+        return static_cast<u32>(std::distance(container.begin(), found));
+    }
+
+    TextureBufferDescriptors& texture_buffer_descriptors;
+    ImageBufferDescriptors& image_buffer_descriptors;
+    TextureDescriptors& texture_descriptors;
+    ImageDescriptors& image_descriptors;
+};
+} // Anonymous namespace
+
+// Rewrites every bound/bindless texture and image instruction of `program` into its indexed
+// form, registering the descriptor it uses in program.info and storing the descriptor index in
+// the instruction flags. Argument 0 becomes the dynamic descriptor index for dynamically
+// offset handles and an empty value otherwise.
+// Throws NotImplementedException when a bindless handle cannot be tracked or when an image
+// instruction uses a combined (secondary) handle.
+void TexturePass(Environment& env, IR::Program& program) {
+    TextureInstVector to_replace;
+    for (IR::Block* const block : program.post_order_blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsTextureInstruction(inst)) {
+                continue;
+            }
+            to_replace.push_back(MakeInst(env, block, inst));
+        }
+    }
+    // Sort instructions to visit textures by constant buffer index, then by offset.
+    // One stable sort with a lexicographic comparison keeps instructions that share a handle in
+    // collection order, so descriptor numbering does not depend on the sort implementation.
+    std::stable_sort(to_replace.begin(), to_replace.end(),
+                     [](const TextureInst& lhs, const TextureInst& rhs) {
+                         if (lhs.cbuf.index != rhs.cbuf.index) {
+                             return lhs.cbuf.index < rhs.cbuf.index;
+                         }
+                         return lhs.cbuf.offset < rhs.cbuf.offset;
+                     });
+    Descriptors descriptors{
+        program.info.texture_buffer_descriptors,
+        program.info.image_buffer_descriptors,
+        program.info.texture_descriptors,
+        program.info.image_descriptors,
+    };
+    for (TextureInst& texture_inst : to_replace) {
+        // TODO: Handle arrays
+        IR::Inst* const inst{texture_inst.inst};
+        inst->ReplaceOpcode(IndexedInstruction(*inst));
+
+        const auto& cbuf{texture_inst.cbuf};
+        auto flags{inst->Flags<IR::TextureInstInfo>()};
+        // Patch the texture type from the bound handle where needed. The updated flags are
+        // written back once, after the descriptor index is known.
+        switch (inst->GetOpcode()) {
+        case IR::Opcode::ImageQueryDimensions:
+            flags.type.Assign(ReadTextureType(env, cbuf));
+            break;
+        case IR::Opcode::ImageFetch:
+            if (flags.type != TextureType::Color1D) {
+                break;
+            }
+            if (ReadTextureType(env, cbuf) == TextureType::Buffer) {
+                // Replace with the bound texture type only when it's a texture buffer
+                // If the instruction is 1D and the bound type is 2D, don't change the code and let
+                // the rasterizer robustness handle it
+                // This happens on Fire Emblem: Three Houses
+                flags.type.Assign(TextureType::Buffer);
+            }
+            break;
+        default:
+            break;
+        }
+        u32 index{};
+        switch (inst->GetOpcode()) {
+        case IR::Opcode::ImageRead:
+        case IR::Opcode::ImageAtomicIAdd32:
+        case IR::Opcode::ImageAtomicSMin32:
+        case IR::Opcode::ImageAtomicUMin32:
+        case IR::Opcode::ImageAtomicSMax32:
+        case IR::Opcode::ImageAtomicUMax32:
+        case IR::Opcode::ImageAtomicInc32:
+        case IR::Opcode::ImageAtomicDec32:
+        case IR::Opcode::ImageAtomicAnd32:
+        case IR::Opcode::ImageAtomicOr32:
+        case IR::Opcode::ImageAtomicXor32:
+        case IR::Opcode::ImageAtomicExchange32:
+        case IR::Opcode::ImageWrite: {
+            if (cbuf.has_secondary) {
+                throw NotImplementedException("Unexpected separate sampler");
+            }
+            // Atomics both read and write; plain reads and writes do only one of them
+            const bool is_written{inst->GetOpcode() != IR::Opcode::ImageRead};
+            const bool is_read{inst->GetOpcode() != IR::Opcode::ImageWrite};
+            if (flags.type == TextureType::Buffer) {
+                index = descriptors.Add(ImageBufferDescriptor{
+                    .format = flags.image_format,
+                    .is_written = is_written,
+                    .is_read = is_read,
+                    .cbuf_index = cbuf.index,
+                    .cbuf_offset = cbuf.offset,
+                    .count = cbuf.count,
+                    .size_shift = DESCRIPTOR_SIZE_SHIFT,
+                });
+            } else {
+                index = descriptors.Add(ImageDescriptor{
+                    .type = flags.type,
+                    .format = flags.image_format,
+                    .is_written = is_written,
+                    .is_read = is_read,
+                    .cbuf_index = cbuf.index,
+                    .cbuf_offset = cbuf.offset,
+                    .count = cbuf.count,
+                    .size_shift = DESCRIPTOR_SIZE_SHIFT,
+                });
+            }
+            break;
+        }
+        default:
+            if (flags.type == TextureType::Buffer) {
+                index = descriptors.Add(TextureBufferDescriptor{
+                    .has_secondary = cbuf.has_secondary,
+                    .cbuf_index = cbuf.index,
+                    .cbuf_offset = cbuf.offset,
+                    .secondary_cbuf_index = cbuf.secondary_index,
+                    .secondary_cbuf_offset = cbuf.secondary_offset,
+                    .count = cbuf.count,
+                    .size_shift = DESCRIPTOR_SIZE_SHIFT,
+                });
+            } else {
+                index = descriptors.Add(TextureDescriptor{
+                    .type = flags.type,
+                    .is_depth = flags.is_depth != 0,
+                    .has_secondary = cbuf.has_secondary,
+                    .cbuf_index = cbuf.index,
+                    .cbuf_offset = cbuf.offset,
+                    .secondary_cbuf_index = cbuf.secondary_index,
+                    .secondary_cbuf_offset = cbuf.secondary_offset,
+                    .count = cbuf.count,
+                    .size_shift = DESCRIPTOR_SIZE_SHIFT,
+                });
+            }
+            break;
+        }
+        flags.descriptor_index.Assign(index);
+        inst->SetFlags(flags);
+
+        if (cbuf.count > 1) {
+            // Turn the dynamic offset into a descriptor index, emitted right before the
+            // instruction
+            const auto insert_point{IR::Block::InstructionList::s_iterator_to(*inst)};
+            IR::IREmitter ir{*texture_inst.block, insert_point};
+            const IR::U32 shift{ir.Imm32(DESCRIPTOR_SIZE_SHIFT)};
+            inst->SetArg(0, ir.ShiftRightArithmetic(cbuf.dynamic_offset, shift));
+        } else {
+            inst->SetArg(0, IR::Value{});
+        }
+    }
+}
+
+// Adds every texture and image descriptor of `source` to `base`, reusing equivalent descriptors
+// that `base` already has
+void JoinTextureInfo(Info& base, Info& source) {
+    Descriptors merged{
+        base.texture_buffer_descriptors,
+        base.image_buffer_descriptors,
+        base.texture_descriptors,
+        base.image_descriptors,
+    };
+    const auto add_all{[&merged](auto& list) {
+        for (auto& descriptor : list) {
+            merged.Add(descriptor);
+        }
+    }};
+    add_all(source.texture_buffer_descriptors);
+    add_all(source.image_buffer_descriptors);
+    add_all(source.texture_descriptors);
+    add_all(source.image_descriptors);
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir_opt/verification_pass.cpp b/src/shader_recompiler/ir_opt/verification_pass.cpp
new file mode 100644
index 000000000..975d5aadf
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/verification_pass.cpp
@@ -0,0 +1,98 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <map>
+#include <set>
+
+#include "shader_recompiler/exception.h"
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+
+// Checks that every argument of every non-phi instruction has a type compatible with the one
+// its opcode declares. Throws LogicError with a dump of the offending block.
+static void ValidateTypes(const IR::Program& program) {
+    for (const auto& block : program.blocks) {
+        for (const IR::Inst& inst : *block) {
+            const bool is_phi{inst.GetOpcode() == IR::Opcode::Phi};
+            if (is_phi) {
+                // Skip validation on phi nodes
+                continue;
+            }
+            const size_t arg_count{inst.NumArgs()};
+            for (size_t arg = 0; arg != arg_count; ++arg) {
+                const IR::Type actual{inst.Arg(arg).Type()};
+                const IR::Type expected{IR::ArgTypeOf(inst.GetOpcode(), arg)};
+                if (IR::AreTypesCompatible(actual, expected)) {
+                    continue;
+                }
+                throw LogicError("Invalid types in block:\n{}", IR::DumpBlock(*block));
+            }
+        }
+    }
+}
+
+// Recounts how often each instruction is used as a non-immediate argument and compares the
+// result with the instruction's own use count. Throws LogicError on any mismatch.
+static void ValidateUses(const IR::Program& program) {
+    std::map<IR::Inst*, int> counted;
+    for (const auto& block : program.blocks) {
+        for (const IR::Inst& inst : *block) {
+            for (size_t arg = 0, arg_count = inst.NumArgs(); arg < arg_count; ++arg) {
+                const IR::Value operand{inst.Arg(arg)};
+                if (operand.IsImmediate()) {
+                    continue;
+                }
+                ++counted[operand.Inst()];
+            }
+        }
+    }
+    for (const auto& entry : counted) {
+        IR::Inst* const used_inst{entry.first};
+        const int expected_uses{entry.second};
+        if (used_inst->UseCount() != expected_uses) {
+            throw LogicError("Invalid uses in block: {}", IR::DumpProgram(program));
+        }
+    }
+}
+
+// Walks the blocks in program order and checks that every non-immediate argument of a non-phi
+// instruction refers to an instruction already seen. Throws LogicError otherwise.
+static void ValidateForwardDeclarations(const IR::Program& program) {
+    std::set<const IR::Inst*> seen;
+    for (const IR::Block* const block : program.blocks) {
+        for (const IR::Inst& inst : *block) {
+            seen.emplace(&inst);
+            if (inst.GetOpcode() == IR::Opcode::Phi) {
+                // Phi nodes can have forward declarations
+                continue;
+            }
+            for (size_t index = 0, count = inst.NumArgs(); index < count; ++index) {
+                const bool is_defined{inst.Arg(index).IsImmediate() ||
+                                      seen.contains(inst.Arg(index).Inst())};
+                if (!is_defined) {
+                    throw LogicError("Forward declaration in block: {}", IR::DumpBlock(*block));
+                }
+            }
+        }
+    }
+}
+
+// Checks that no phi node appears after a non-phi instruction within a block.
+// Throws LogicError with a dump of the offending block.
+static void ValidatePhiNodes(const IR::Program& program) {
+    for (const IR::Block* const block : program.blocks) {
+        bool seen_non_phi{false};
+        for (const IR::Inst& inst : *block) {
+            if (inst.GetOpcode() != IR::Opcode::Phi) {
+                seen_non_phi = true;
+                continue;
+            }
+            if (seen_non_phi) {
+                throw LogicError("Interleaved phi nodes: {}", IR::DumpBlock(*block));
+            }
+        }
+    }
+}
+
+// Runs all IR validators in a fixed order; the first failing one throws
+void VerificationPass(const IR::Program& program) {
+    using Validator = void (*)(const IR::Program&);
+    constexpr Validator validators[]{
+        ValidateTypes,
+        ValidateUses,
+        ValidateForwardDeclarations,
+        ValidatePhiNodes,
+    };
+    for (const Validator validate : validators) {
+        validate(program);
+    }
+}
+
+} // namespace Shader::Optimization