3 files changed, 176 insertions, 143 deletions
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index 3ab500760..25161df1f 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -151,6 +151,76 @@ private:
     const IntType& m_Bits;
 };
 
+enum class IntegerEncoding { JustBits, Quint, Trit };
+
+struct IntegerEncodedValue {
+    constexpr IntegerEncodedValue() = default;
+
+    constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
+        : encoding{encoding_}, num_bits{num_bits_} {}
+
+    constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
+        return encoding == other.encoding && num_bits == other.num_bits;
+    }
+
+    // Returns the number of bits required to encode num_vals values.
+    u32 GetBitLength(u32 num_vals) const {
+        u32 total_bits = num_bits * num_vals;
+        if (encoding == IntegerEncoding::Trit) {
+            total_bits += (num_vals * 8 + 4) / 5;
+        } else if (encoding == IntegerEncoding::Quint) {
+            total_bits += (num_vals * 7 + 2) / 3;
+        }
+        return total_bits;
+    }
+
+    IntegerEncoding encoding{};
+    u32 num_bits = 0;
+    u32 bit_value = 0;
+    union {
+        u32 quint_value = 0;
+        u32 trit_value;
+    };
+};
+
+// Returns a new instance of this struct that corresponds to the largest
+// encodable range whose maximum value does not exceed mav_value
+static constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) {
+    while (mav_value > 0) {
+        u32 check = mav_value + 1;
+
+        // Is mav_value of the type 2^n - 1?
+        if (!(check & (check - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value));
+        }
+
+        // Is mav_value of the type 3*2^n - 1?
+        if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1));
+        }
+
+        // Is mav_value of the type 5*2^n - 1?
+        if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1));
+        }
+
+        // Apparently it can't be represented with a bounded integer sequence...
+        // just iterate.
+        mav_value--;
+    }
+    return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
+    std::array<IntegerEncodedValue, 256> encodings{};
+    for (std::size_t i = 0; i < encodings.size(); ++i) {
+        encodings[i] = CreateEncoding(static_cast<u32>(i));
+    }
+    return encodings;
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues();
+
 namespace Tegra::Texture::ASTC {
 using IntegerEncodedVector = boost::container::static_vector<
     IntegerEncodedValue, 256,
@@ -521,35 +591,41 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
     return params;
 }
 
-static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
-                              u32 blockHeight) {
-    // Don't actually care about the void extent, just read the bits...
-    for (s32 i = 0; i < 4; ++i) {
-        strm.ReadBits<13>();
+// Replicates the low num_bits of val such that bits [(to_bit - 1):(to_bit - num_bits)]
+// are the same as bits [(num_bits - 1):0], repeating the pattern all the way down.
+template <typename IntType>
+static constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) {
+    if (num_bits == 0 || to_bit == 0) {
+        return 0;
     }
-
-    // Decode the RGBA components and renormalize them to the range [0, 255]
-    u16 r = static_cast<u16>(strm.ReadBits<16>());
-    u16 g = static_cast<u16>(strm.ReadBits<16>());
-    u16 b = static_cast<u16>(strm.ReadBits<16>());
-    u16 a = static_cast<u16>(strm.ReadBits<16>());
-
-    u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
-               (static_cast<u32>(a) & 0xFF00) << 16;
-
-    for (u32 j = 0; j < blockHeight; j++) {
-        for (u32 i = 0; i < blockWidth; i++) {
-            outBuf[j * blockWidth + i] = rgba;
+    const IntType v = val & static_cast<IntType>((1 << num_bits) - 1);
+    IntType res = v;
+    u32 reslen = num_bits;
+    while (reslen < to_bit) {
+        u32 comp = 0;
+        if (num_bits > to_bit - reslen) {
+            u32 newshift = to_bit - reslen;
+            comp = num_bits - newshift;
+            num_bits = newshift;
         }
+        res = static_cast<IntType>(res << num_bits);
+        res = static_cast<IntType>(res | (v >> comp));
+        reslen += num_bits;
     }
+    return res;
 }
 
-static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
-    for (u32 j = 0; j < blockHeight; j++) {
-        for (u32 i = 0; i < blockWidth; i++) {
-            outBuf[j * blockWidth + i] = 0xFFFF00FF;
-        }
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+    return std::size_t(1) << num_bits;
+}
+
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+        table[value] = Replicate(value, num_bits, to_bit);
     }
+    return table;
 }
 
 static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
@@ -572,6 +648,9 @@ static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>
 static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
 static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
 static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
 /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
 /// to the runtime implementation
 static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
@@ -1316,6 +1395,37 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues,
 #undef READ_INT_VALUES
 }
 
+static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
+                              u32 blockHeight) {
+    // Don't actually care about the void extent, just read the bits...
+    for (s32 i = 0; i < 4; ++i) {
+        strm.ReadBits<13>();
+    }
+
+    // Decode the RGBA components and renormalize them to the range [0, 255]
+    u16 r = static_cast<u16>(strm.ReadBits<16>());
+    u16 g = static_cast<u16>(strm.ReadBits<16>());
+    u16 b = static_cast<u16>(strm.ReadBits<16>());
+    u16 a = static_cast<u16>(strm.ReadBits<16>());
+
+    u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
+               (static_cast<u32>(a) & 0xFF00) << 16;
+
+    for (u32 j = 0; j < blockHeight; j++) {
+        for (u32 i = 0; i < blockWidth; i++) {
+            outBuf[j * blockWidth + i] = rgba;
+        }
+    }
+}
+
+static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
+    for (u32 j = 0; j < blockHeight; j++) {
+        for (u32 i = 0; i < blockWidth; i++) {
+            outBuf[j * blockWidth + i] = 0xFFFF00FF;
+        }
+    }
+}
+
 static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
                             const u32 blockHeight, std::span<u32, 12 * 12> outBuf) {
     InputBitStream strm(inBuf);
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
index 0229ae122..14d2beec0 100644
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -9,117 +9,6 @@
 
 namespace Tegra::Texture::ASTC {
 
-enum class IntegerEncoding { JustBits, Quint, Trit };
-
-struct IntegerEncodedValue {
-    constexpr IntegerEncodedValue() = default;
-
-    constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
-        : encoding{encoding_}, num_bits{num_bits_} {}
-
-    constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
-        return encoding == other.encoding && num_bits == other.num_bits;
-    }
-
-    // Returns the number of bits required to encode num_vals values.
-    u32 GetBitLength(u32 num_vals) const {
-        u32 total_bits = num_bits * num_vals;
-        if (encoding == IntegerEncoding::Trit) {
-            total_bits += (num_vals * 8 + 4) / 5;
-        } else if (encoding == IntegerEncoding::Quint) {
-            total_bits += (num_vals * 7 + 2) / 3;
-        }
-        return total_bits;
-    }
-
-    IntegerEncoding encoding{};
-    u32 num_bits = 0;
-    u32 bit_value = 0;
-    union {
-        u32 quint_value = 0;
-        u32 trit_value;
-    };
-};
-
-// Returns a new instance of this struct that corresponds to the
-// can take no more than mav_value values
-constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) {
-    while (mav_value > 0) {
-        u32 check = mav_value + 1;
-
-        // Is mav_value a power of two?
-        if (!(check & (check - 1))) {
-            return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value));
-        }
-
-        // Is mav_value of the type 3*2^n - 1?
-        if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
-            return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1));
-        }
-
-        // Is mav_value of the type 5*2^n - 1?
-        if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
-            return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1));
-        }
-
-        // Apparently it can't be represented with a bounded integer sequence...
-        // just iterate.
-        mav_value--;
-    }
-    return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
-}
-
-constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
-    std::array<IntegerEncodedValue, 256> encodings{};
-    for (std::size_t i = 0; i < encodings.size(); ++i) {
-        encodings[i] = CreateEncoding(static_cast<u32>(i));
-    }
-    return encodings;
-}
-
-constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues();
-
-// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
-// is the same as [(num_bits - 1):0] and repeats all the way down.
-template <typename IntType>
-constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) {
-    if (num_bits == 0 || to_bit == 0) {
-        return 0;
-    }
-    const IntType v = val & static_cast<IntType>((1 << num_bits) - 1);
-    IntType res = v;
-    u32 reslen = num_bits;
-    while (reslen < to_bit) {
-        u32 comp = 0;
-        if (num_bits > to_bit - reslen) {
-            u32 newshift = to_bit - reslen;
-            comp = num_bits - newshift;
-            num_bits = newshift;
-        }
-        res = static_cast<IntType>(res << num_bits);
-        res = static_cast<IntType>(res | (v >> comp));
-        reslen += num_bits;
-    }
-    return res;
-}
-
-constexpr std::size_t NumReplicateEntries(u32 num_bits) {
-    return std::size_t(1) << num_bits;
-}
-
-template <typename IntType, u32 num_bits, u32 to_bit>
-constexpr auto MakeReplicateTable() {
-    std::array<IntType, NumReplicateEntries(num_bits)> table{};
-    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
-        table[value] = Replicate(value, num_bits, to_bit);
-    }
-    return table;
-}
-
-constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
-constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
-constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
-
 void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
                 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
 
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index f1f523ad1..c32ae956a 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -18,9 +18,9 @@
 
 namespace Tegra::Texture {
 namespace {
-template <bool TO_LINEAR>
-void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
-             u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
+template <bool TO_LINEAR, u32 BYTES_PER_PIXEL>
+void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height, u32 depth,
+                 u32 block_height, u32 block_depth, u32 stride_alignment) {
     // The origin of the transformation can be configured here, leave it as zero as the current API
     // doesn't expose it.
     static constexpr u32 origin_x = 0;
@@ -28,9 +28,9 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
     static constexpr u32 origin_z = 0;
 
     // We can configure here a custom pitch
-    // As it's not exposed 'width * bpp' will be the expected pitch.
-    const u32 pitch = width * bytes_per_pixel;
-    const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel;
+    // As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch.
+    const u32 pitch = width * BYTES_PER_PIXEL;
+    const u32 stride = Common::AlignUpLog2(width, stride_alignment) * BYTES_PER_PIXEL;
 
     const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
     const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
@@ -54,14 +54,14 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
                                  ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
 
             for (u32 column = 0; column < width; ++column) {
-                const u32 x = (column + origin_x) * bytes_per_pixel;
+                const u32 x = (column + origin_x) * BYTES_PER_PIXEL;
                 const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
 
                 const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
                 const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X];
 
                 const u32 unswizzled_offset =
-                    slice * pitch * height + line * pitch + column * bytes_per_pixel;
+                    slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL;
 
                 if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset);
                     offset >= input.size()) {
@@ -73,11 +73,45 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
 
                 u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
                 const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
-                std::memcpy(dst, src, bytes_per_pixel);
+
+                std::memcpy(dst, src, BYTES_PER_PIXEL);
             }
         }
     }
 }
+
+template <bool TO_LINEAR>
+void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
+             u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
+    switch (bytes_per_pixel) {
+    case 1:
+        return SwizzleImpl<TO_LINEAR, 1>(output, input, width, height, depth, block_height,
+                                         block_depth, stride_alignment);
+    case 2:
+        return SwizzleImpl<TO_LINEAR, 2>(output, input, width, height, depth, block_height,
+                                         block_depth, stride_alignment);
+    case 3:
+        return SwizzleImpl<TO_LINEAR, 3>(output, input, width, height, depth, block_height,
+                                         block_depth, stride_alignment);
+    case 4:
+        return SwizzleImpl<TO_LINEAR, 4>(output, input, width, height, depth, block_height,
+                                         block_depth, stride_alignment);
+    case 6:
+        return SwizzleImpl<TO_LINEAR, 6>(output, input, width, height, depth, block_height,
+                                         block_depth, stride_alignment);
+    case 8:
+        return SwizzleImpl<TO_LINEAR, 8>(output, input, width, height, depth, block_height,
+                                         block_depth, stride_alignment);
+    case 12:
+        return SwizzleImpl<TO_LINEAR, 12>(output, input, width, height, depth, block_height,
+                                          block_depth, stride_alignment);
+    case 16:
+        return SwizzleImpl<TO_LINEAR, 16>(output, input, width, height, depth, block_height,
+                                          block_depth, stride_alignment);
+    default:
+        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
+    }
+}
 } // Anonymous namespace
 
 void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,