From 2985e5e94c82febcf215feb0023f4184b38bb24a Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Sat, 13 Feb 2021 15:50:12 -0500
Subject: renderer_opengl: Accelerate ASTC texture decoding with a compute
 shader

ASTC texture decoding is currently handled by a CPU decoder for GPUs without native ASTC decoding support (most desktop GPUs). This is the cause of noticeable performance degradation in titles that use the format extensively.

This commit adds support to accelerate ASTC decoding using a compute shader on OpenGL for GPUs without native support.
---
 src/video_core/host_shaders/astc_decoder.comp      | 1288 ++++++++++++++++++++
 .../renderer_opengl/gl_texture_cache.cpp           |   10 +-
 src/video_core/renderer_opengl/gl_texture_cache.h  |    2 +
 src/video_core/renderer_opengl/util_shaders.cpp    |   99 +-
 src/video_core/renderer_opengl/util_shaders.h      |   11 +
 src/video_core/textures/astc.h                     |  190 +++
 6 files changed, 1598 insertions(+), 2 deletions(-)
 create mode 100644 src/video_core/host_shaders/astc_decoder.comp

diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
new file mode 100644
index 000000000..070190a5c
--- /dev/null
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -0,0 +1,1288 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+#ifdef VULKAN
+// All resources below share one descriptor binding space, so every index must be unique.
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_ENC_BUFFER 2
+#define BINDING_6_TO_8_BUFFER 3
+#define BINDING_7_TO_8_BUFFER 4
+#define BINDING_8_TO_8_BUFFER 5
+#define BINDING_BYTE_TO_16_BUFFER 6
+#define BINDING_OUTPUT_IMAGE 7
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+// Image units are numbered independently of shader storage buffer bindings.
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout(location = n) uniform
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_ENC_BUFFER 2
+#define BINDING_6_TO_8_BUFFER 3
+#define BINDING_7_TO_8_BUFFER 4
+#define BINDING_8_TO_8_BUFFER 5
+#define BINDING_BYTE_TO_16_BUFFER 6
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; // 32x32x1 invocations per group
+// Parameters: a push-constant block under VULKAN, individually located uniforms otherwise.
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uvec2 num_image_blocks;
+UNIFORM(1) uvec2 block_dims; // per-block texel extent; bounds the loops in UnquantizeTexelWeights
+UNIFORM(2) uint layer;
+// NOTE(review): not referenced in the visible part of the file; presumably swizzle parameters.
+UNIFORM(3) uvec3 origin;
+UNIFORM(4) ivec3 destination;
+UNIFORM(5) uint bytes_per_block_log2;
+UNIFORM(6) uint layer_stride;
+UNIFORM(7) uint block_size;
+UNIFORM(8) uint x_shift;
+UNIFORM(9) uint block_height;
+UNIFORM(10) uint block_height_mask;
+
+END_PUSH_CONSTANTS
+
+uint current_index = 0;  // index of the local_buff entry ReadBit() is currently consuming
+int bitsread = 0;        // bits already taken from that entry; wraps back to 0 at 8
+uint total_bitsread = 0; // running count of all bits returned by ReadBit()
+uint local_buff[16];     // ReadBit() only uses the low 8 bits of each entry
+// Values stored in EncodingData.encoding.
+const int JustBits = 0;
+const int Quint = 1;
+const int Trit = 2;
+
+struct EncodingData {
+    uint encoding;         // JustBits, Quint or Trit
+    uint num_bits;         // plain bits stored per value
+    uint bit_value;        // plain bits read from the stream for this value
+    uint quint_trit_value; // decoded quint or trit digit; written only by the quint/trit decoders
+};
+
+struct TexelWeightParams {
+    uvec2 size;         // weight grid dimensions, derived from the block mode
+    bool dual_plane;    // doubles the number of stored weights (see GetNumWeightValues)
+    uint max_weight;    // passed to GetBitLength as the encoding_values index
+    bool Error;         // set by DecodeBlockInfo for rejected block modes
+    bool VoidExtentLDR; // set by DecodeBlockInfo when mode bits 0-8 are 0x1fc and bit 9 is clear
+    bool VoidExtentHDR; // same pattern with bit 9 set
+};
+
+// Swizzle data: lookup table read by SwizzleOffset, indexed as y * 64 + x.
+layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
+    uint swizzle_table[];
+};
+// Encoded input, addressed per byte by ReadTexel (four bytes packed into each uint).
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 {
+    uint astc_data[];
+};
+layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly image2D dest_image; // write-only target
+
+const uint GOB_SIZE_X = 64; // equals the row stride (64) used by SwizzleOffset
+const uint GOB_SIZE_Y = 8;
+const uint GOB_SIZE_Z = 1;
+const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
+// log2 of the sizes above.
+const uint GOB_SIZE_X_SHIFT = 6;
+const uint GOB_SIZE_Y_SHIFT = 3;
+const uint GOB_SIZE_Z_SHIFT = 0;
+const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
+// Position mask applied by SwizzleOffset: x is reduced to 0..63 and y to 0..7.
+const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
+
+uint SwizzleOffset(uvec2 pos) { // table entry for the position wrapped into one 64x8 tile
+    const uvec2 wrapped = pos & SWIZZLE_MASK;
+    return swizzle_table[wrapped.y * GOB_SIZE_X + wrapped.x];
+}
+
+uint ReadTexel(uint offset) {
+    // Pick the 32-bit word that holds byte `offset`, then extract that byte's 8 bits.
+    return bitfieldExtract(astc_data[offset >> 2], int((offset & 3) << 3), 8);
+}
+
+// ASTC encodings data: EncodingData entries, looked up by range index (up to 255).
+layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
+    EncodingData encoding_values[256];
+};
+// ASTC precomputed bit-replication tables, each indexed by the value to expand.
+layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 {
+    uint REPLICATE_6_BIT_TO_8_TABLE[];
+};
+layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 {
+    uint REPLICATE_7_BIT_TO_8_TABLE[];
+};
+layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 {
+    uint REPLICATE_8_BIT_TO_8_TABLE[];
+};
+layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 {
+    uint REPLICATE_BYTE_TO_16_TABLE[];
+};
+
+const int BLOCK_SIZE_IN_BYTES = 16; // presumably one encoded ASTC block (128 bits) - TODO confirm
+// NOTE(review): none of these constants is referenced in the visible part of the file.
+const int BLOCK_INFO_ERROR = 0;
+const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
+const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
+const int BLOCK_INFO_NORMAL = 3;
+
+// Fills to_bit bits by repeating the low num_bits bits of val, most significant copy
+// first. A final copy that does not fit is cut down to its highest bits.
+uint Replicate(uint val, uint num_bits, uint to_bit) {
+    if (num_bits == 0 || to_bit == 0) {
+        return 0;
+    }
+    const uint pattern = val & uint((1 << num_bits) - 1);
+    uint result = pattern;
+    uint filled = num_bits;
+    // Width of the copy appended per iteration; shrinks only for the final partial copy.
+    uint chunk = num_bits;
+    while (filled < to_bit) {
+        const uint remaining = to_bit - filled;
+        uint dropped = 0;
+        if (chunk > remaining) {
+            // Keep just the top `remaining` bits of the pattern.
+            dropped = chunk - remaining;
+            chunk = remaining;
+        }
+        result = (result << chunk) | (pattern >> dropped);
+        filled += chunk;
+    }
+    // When num_bits >= to_bit the loop never runs and the masked value is returned as is.
+    return result;
+}
+
+uvec4 ReplicateByteTo16(uvec4 value) { // component-wise lookup in REPLICATE_BYTE_TO_16_TABLE
+    for (int c = 0; c < 4; ++c) value[c] = REPLICATE_BYTE_TO_16_TABLE[value[c]];
+    return value;
+}
+
+const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
+// Expands a single bit to seven identical bits: 0 -> 0, 1 -> 127.
+uint ReplicateBitTo7(uint value) {
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+
+const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);
+uint ReplicateBitTo9(uint value) { // expands a single bit to nine identical bits: 0 or 511
+    return REPLICATE_1_BIT_TO_9_TABLE[value];
+}
+
+const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
+const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
+const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
+const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
+    uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
+const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
+    uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
+           173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
+
+uint FastReplicateTo8(uint value, uint num_bits) {
+    // Widths 1-8 are table lookups; any other width goes through the generic Replicate loop.
+    if (num_bits == 1) {
+        return REPLICATE_1_BIT_TO_8_TABLE[value];
+    } else if (num_bits == 2) {
+        return REPLICATE_2_BIT_TO_8_TABLE[value];
+    } else if (num_bits == 3) {
+        return REPLICATE_3_BIT_TO_8_TABLE[value];
+    } else if (num_bits == 4) {
+        return REPLICATE_4_BIT_TO_8_TABLE[value];
+    } else if (num_bits == 5) {
+        return REPLICATE_5_BIT_TO_8_TABLE[value];
+    } else if (num_bits == 6) {
+        return REPLICATE_6_BIT_TO_8_TABLE[value];
+    } else if (num_bits == 7) {
+        return REPLICATE_7_BIT_TO_8_TABLE[value];
+    } else if (num_bits == 8) {
+        return REPLICATE_8_BIT_TO_8_TABLE[value];
+    }
+    return Replicate(value, num_bits, 8);
+}
+
+const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
+const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
+const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
+const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
+    uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
+const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
+    uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
+           47, 49, 51, 53, 55, 57, 59, 61, 63);
+
+uint FastReplicateTo6(uint value, uint num_bits) {
+    // Widths 1-5 are table lookups; any other width goes through the generic Replicate loop.
+    if (num_bits == 1) {
+        return REPLICATE_1_BIT_TO_6_TABLE[value];
+    } else if (num_bits == 2) {
+        return REPLICATE_2_BIT_TO_6_TABLE[value];
+    } else if (num_bits == 3) {
+        return REPLICATE_3_BIT_TO_6_TABLE[value];
+    } else if (num_bits == 4) {
+        return REPLICATE_4_BIT_TO_6_TABLE[value];
+    } else if (num_bits == 5) {
+        return REPLICATE_5_BIT_TO_6_TABLE[value];
+    }
+    return Replicate(value, num_bits, 6);
+}
+
+uint hash52(uint p) { // fixed sequence of shift/xor/add mixing steps with 32-bit wraparound
+    p = p ^ (p >> 15);
+    p = p - (p << 17);
+    p = p + (p << 7);
+    p = p + (p << 4);
+    p = p ^ (p >> 5);
+    p = p + (p << 16);
+    p = p ^ (p >> 7);
+    p = p ^ (p >> 3);
+    p = p ^ (p << 6);
+    p = p ^ (p >> 17);
+    return p;
+}
+
+uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
+    if (1 == partition_count)
+        return 0;
+    // Returns the partition index (0-3) of texel (x, y, z); small blocks use doubled coordinates.
+    if (small_block) {
+        x <<= 1;
+        y <<= 1;
+        z <<= 1;
+    }
+
+    seed += (partition_count - 1) * 1024;
+    // Twelve 4-bit fields are taken from the hashed seed.
+    uint rnum = hash52(uint(seed));
+    uint seed1 = uint(rnum & 0xF);
+    uint seed2 = uint((rnum >> 4) & 0xF);
+    uint seed3 = uint((rnum >> 8) & 0xF);
+    uint seed4 = uint((rnum >> 12) & 0xF);
+    uint seed5 = uint((rnum >> 16) & 0xF);
+    uint seed6 = uint((rnum >> 20) & 0xF);
+    uint seed7 = uint((rnum >> 24) & 0xF);
+    uint seed8 = uint((rnum >> 28) & 0xF);
+    uint seed9 = uint((rnum >> 18) & 0xF);
+    uint seed10 = uint((rnum >> 22) & 0xF);
+    uint seed11 = uint((rnum >> 26) & 0xF);
+    uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF);
+    // Each field is squared ...
+    seed1 = (seed1 * seed1);
+    seed2 = (seed2 * seed2);
+    seed3 = (seed3 * seed3);
+    seed4 = (seed4 * seed4);
+    seed5 = (seed5 * seed5);
+    seed6 = (seed6 * seed6);
+    seed7 = (seed7 * seed7);
+    seed8 = (seed8 * seed8);
+    seed9 = (seed9 * seed9);
+    seed10 = (seed10 * seed10);
+    seed11 = (seed11 * seed11);
+    seed12 = (seed12 * seed12);
+    // ... and scaled down by shifts chosen from seed bits 0, 1 and 4 and the partition count.
+    int sh1, sh2, sh3;
+    if ((seed & 1) > 0) {
+        sh1 = (seed & 2) > 0 ? 4 : 5;
+        sh2 = (partition_count == 3) ? 6 : 5;
+    } else {
+        sh1 = (partition_count == 3) ? 6 : 5;
+        sh2 = (seed & 2) > 0 ? 4 : 5;
+    }
+    sh3 = (seed & 0x10) > 0 ? sh1 : sh2;
+
+    seed1 = (seed1 >> sh1);
+    seed2 = (seed2 >> sh2);
+    seed3 = (seed3 >> sh1);
+    seed4 = (seed4 >> sh2);
+    seed5 = (seed5 >> sh1);
+    seed6 = (seed6 >> sh2);
+    seed7 = (seed7 >> sh1);
+    seed8 = (seed8 >> sh2);
+    seed9 = (seed9 >> sh3);
+    seed10 = (seed10 >> sh3);
+    seed11 = (seed11 >> sh3);
+    seed12 = (seed12 >> sh3);
+    // One weighted sum of the coordinates per candidate partition, reduced to 6 bits below.
+    uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
+    uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
+    uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
+    uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+
+    a &= 0x3F;
+    b &= 0x3F;
+    c &= 0x3F;
+    d &= 0x3F;
+    // Candidates beyond partition_count are zeroed so they only win a tie among zeros.
+    if (partition_count < 4)
+        d = 0;
+    if (partition_count < 3)
+        c = 0;
+    // The largest sum decides; ties go to the lower partition index.
+    if (a >= b && a >= c && a >= d)
+        return 0;
+    else if (b >= c && b >= d)
+        return 1;
+    else if (c >= d)
+        return 2;
+    return 3;
+}
+
+uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
+    return SelectPartition(seed, x, y, 0, partition_count, small_block); // 2D case: z fixed to 0
+}
+
+uint ReadBit() {
+    // Bits are taken LSB first; only the low 8 bits of each local_buff entry are used.
+    const uint value = bitfieldExtract(local_buff[current_index], bitsread, 1);
+    ++total_bitsread;
+    if (++bitsread == 8) {
+        bitsread = 0;
+        ++current_index;
+    }
+    return value;
+}
+
+uint StreamBits(uint num_bits) {
+    uint value = 0; // the first bit read becomes bit 0 of the result
+    for (uint shift = 0; shift < num_bits; ++shift) {
+        value |= (ReadBit() & 1) << shift;
+    }
+    return value;
+}
+
+// Color endpoint bit stream, consumed by ReadColorBit() while texel_flag is false.
+uint color_endpoint_data[16];
+int color_bitsread = 0;
+uint total_color_bitsread = 0;
+int color_index = 0;
+
+// Texel weight bit stream, consumed by ReadColorBit() while texel_flag is true.
+uint texel_weight_data[16];
+int texel_bitsread = 0;
+uint total_texel_bitsread = 0;
+int texel_index = 0;
+
+bool texel_flag = false; // selects the stream and the result list used by the decode helpers
+
+uint ReadColorBit() {
+    // texel_flag picks the stream: false reads color endpoint data, true reads texel
+    // weight data. Both streams use the low 8 bits of each array entry, LSB first.
+    if (!texel_flag) {
+        const uint color_bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
+        ++total_color_bitsread;
+        if (++color_bitsread == 8) {
+            color_bitsread = 0;
+            ++color_index;
+        }
+        return color_bit;
+    }
+
+    const uint weight_bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
+    ++total_texel_bitsread;
+    if (++texel_bitsread == 8) {
+        texel_bitsread = 0;
+        ++texel_index;
+    }
+    return weight_bit;
+}
+
+uint StreamColorBits(uint num_bits) {
+    uint value = 0; // the first bit read becomes bit 0 of the result
+    for (uint shift = 0; shift < num_bits; ++shift) {
+        value |= (ReadColorBit() & 1) << shift;
+    }
+    return value;
+}
+
+EncodingData result_vector[100]; // values appended while texel_flag is false
+int result_index = 0;            // number of entries appended to result_vector
+
+EncodingData texel_vector[100]; // values appended while texel_flag is true
+int texel_vector_index = 0;     // number of entries appended to texel_vector
+
+void ResultEmplaceBack(EncodingData val) {
+    // Decoded values are appended to the list belonging to the active stream:
+    // texel_vector while texel_flag is set, result_vector otherwise.
+    if (!texel_flag) {
+        result_vector[result_index++] = val;
+        return;
+    }
+    texel_vector[texel_vector_index++] = val;
+}
+
+// Bits needed for n_vals values in encoding_values[encoding_index]: num_bits each, plus
+// ceil(8 * n / 5) shared bits for trits or ceil(7 * n / 3) shared bits for quints.
+uint GetBitLength(uint n_vals, uint encoding_index) {
+    const EncodingData enc = encoding_values[encoding_index];
+    const uint plain_bits = enc.num_bits * n_vals;
+    if (enc.encoding == Trit) {
+        return plain_bits + (n_vals * 8 + 4) / 5;
+    }
+    return enc.encoding == Quint ? plain_bits + (n_vals * 7 + 2) / 3 : plain_bits;
+}
+
+uint GetNumWeightValues(uvec2 size, bool dual_plane) { // one weight per grid cell and plane
+    const uint grid_cells = size.x * size.y;
+    if (!dual_plane) {
+        return grid_cells;
+    }
+    return grid_cells * 2;
+}
+
+uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
+    // max_weight is forwarded as the encoding table index.
+    return GetBitLength(GetNumWeightValues(size, dual_plane), max_weight);
+}
+
+uint BitsBracket(uint bits, uint pos) { // value (0 or 1) of bit number `pos`
+    return (bits >> pos) & 1u;
+}
+
+uint BitsOp(uint bits, uint start, uint end) {
+    // Extracts the inclusive bit range between start and end; either bound may be the
+    // lower one. A single-bit range is delegated to BitsBracket.
+    const uint lo = min(start, end);
+    const uint hi = max(start, end);
+    if (lo == hi) {
+        return BitsBracket(bits, lo);
+    }
+
+    const uint width = hi - lo + 1;
+    return (bits >> lo) & uint((1 << width) - 1);
+}
+
+void DecodeQuintBlock(uint num_bits) { // num_bits: plain bits stored per value
+    uint m[3]; // plain bits of the three values
+    uint q[3]; // decoded quint digit of the three values
+    uint Q;    // 7 packed bits (3 + 2 + 2), interleaved with the plain bits in the stream
+    m[0] = StreamColorBits(num_bits);
+    Q = StreamColorBits(3);
+    m[1] = StreamColorBits(num_bits);
+    Q |= StreamColorBits(2) << 3;
+    m[2] = StreamColorBits(num_bits);
+    Q |= StreamColorBits(2) << 5;
+    if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) {
+        q[0] = 4;
+        q[1] = 4;
+        q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) |
+               (BitsBracket(Q, 3) & ~BitsBracket(Q, 0));
+    } else {
+        uint C = 0; // 5-bit intermediate from which q[0] and q[1] are taken
+        if (BitsOp(Q, 1, 2) == 3) {
+            q[2] = 4;
+            C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0);
+        } else {
+            q[2] = BitsOp(Q, 5, 6);
+            C = BitsOp(Q, 0, 4);
+        }
+
+        if (BitsOp(C, 0, 2) == 5) {
+            q[1] = 4;
+            q[0] = BitsOp(C, 3, 4);
+        } else {
+            q[1] = BitsOp(C, 3, 4);
+            q[0] = BitsOp(C, 0, 2);
+        }
+    }
+    // Append the three decoded values to the list of the active stream.
+    for (uint i = 0; i < 3; i++) {
+        EncodingData val;
+        val.encoding = Quint;
+        val.num_bits = num_bits;
+        val.bit_value = m[i];
+        val.quint_trit_value = q[i];
+        ResultEmplaceBack(val);
+    }
+}
+
+void DecodeTritBlock(uint num_bits) { // num_bits: plain bits stored per value
+    uint m[5]; // plain bits of the five values
+    uint t[5]; // decoded trit digit of the five values
+    uint T;    // 8 packed bits (2 + 2 + 1 + 2 + 1), interleaved with the plain bits in the stream
+    m[0] = StreamColorBits(num_bits);
+    T = StreamColorBits(2);
+    m[1] = StreamColorBits(num_bits);
+    T |= StreamColorBits(2) << 2;
+    m[2] = StreamColorBits(num_bits);
+    T |= StreamColorBits(1) << 4;
+    m[3] = StreamColorBits(num_bits);
+    T |= StreamColorBits(2) << 5;
+    m[4] = StreamColorBits(num_bits);
+    T |= StreamColorBits(1) << 7;
+    uint C = 0; // 5-bit intermediate from which t[0], t[1] and t[2] are taken
+    if (BitsOp(T, 2, 4) == 7) {
+        C = (BitsOp(T, 5, 7) << 2) | BitsOp(T, 0, 1);
+        t[4] = 2;
+        t[3] = 2;
+    } else {
+        C = BitsOp(T, 0, 4);
+        if (BitsOp(T, 5, 6) == 3) {
+            t[4] = 2;
+            t[3] = BitsBracket(T, 7);
+        } else {
+            t[4] = BitsBracket(T, 7);
+            t[3] = BitsOp(T, 5, 6);
+        }
+    }
+    if (BitsOp(C, 0, 1) == 3) {
+        t[2] = 2;
+        t[1] = BitsBracket(C, 4);
+        t[0] = (BitsBracket(C, 3) << 1) | (BitsBracket(C, 2) & ~BitsBracket(C, 3));
+    } else if (BitsOp(C, 2, 3) == 3) {
+        t[2] = 2;
+        t[1] = 2;
+        t[0] = BitsOp(C, 0, 1);
+    } else {
+        t[2] = BitsBracket(C, 4);
+        t[1] = BitsOp(C, 2, 3);
+        t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1));
+    }
+    for (uint i = 0; i < 5; i++) { // append the five decoded values to the active list
+        EncodingData val;
+        val.encoding = Trit;
+        val.num_bits = num_bits;
+        val.bit_value = m[i];
+        val.quint_trit_value = t[i];
+        ResultEmplaceBack(val);
+    }
+}
+void DecodeIntegerSequence(uint max_range, uint num_values) {
+    // All values of a sequence share the encoding stored at encoding_values[max_range].
+    EncodingData val = encoding_values[max_range];
+    // A quint block produces 3 values and a trit block 5, so the number of decoded
+    // values can run past num_values; the surplus entries are still appended.
+    for (uint decoded = 0; decoded < num_values;) {
+        if (val.encoding == Quint) {
+            DecodeQuintBlock(val.num_bits);
+            decoded += 3;
+            continue;
+        }
+        if (val.encoding == Trit) {
+            DecodeTritBlock(val.num_bits);
+            decoded += 5;
+            continue;
+        }
+        if (val.encoding == JustBits) {
+            val.bit_value = StreamColorBits(val.num_bits);
+            ResultEmplaceBack(val);
+            decoded++;
+        }
+    }
+}
+
+// Decodes the color endpoint values of every partition into color_values.
+// modes holds one color endpoint mode per partition; color_data_bits is the bit budget
+// of the encoded values and selects the largest encoding whose bit length fits into it.
+void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitions,
+                       uint color_data_bits) {
+    uint num_values = 0;
+    for (uint i = 0; i < num_partitions; i++) {
+        num_values += ((modes[i] >> 2) + 1) << 1;
+    }
+    int range = 256; // scan the encoding table from its last index downwards
+    while (--range > 0) {
+        EncodingData val = encoding_values[range];
+        uint bitLength = GetBitLength(num_values, range);
+        if (bitLength <= color_data_bits) {
+            while (--range > 0) {
+                EncodingData newval = encoding_values[range];
+                // Entries match only when encoding AND bit width agree; either differing ends the run.
+                if (newval.encoding != val.encoding || newval.num_bits != val.num_bits) {
+                    break;
+                }
+            }
+            range++; // step back onto the last matching entry
+            break;
+        }
+    }
+    DecodeIntegerSequence(range, num_values);
+    uint out_index = 0;
+    for (int itr = 0; itr < result_index; itr++) {
+        if (out_index >= num_values) { // quint/trit blocks may have decoded surplus entries
+            break;
+        }
+        EncodingData val = result_vector[itr];
+        uint bitlen = val.num_bits;
+        uint bitval = val.bit_value;
+        uint A = 0, B = 0, C = 0, D = 0;
+        A = ReplicateBitTo9((bitval & 1));
+        switch (val.encoding) {
+        case JustBits:
+            color_values[out_index++] = FastReplicateTo8(bitval, bitlen);
+            break;
+        case Trit: {
+            D = val.quint_trit_value;
+            switch (bitlen) {
+            case 1: {
+                C = 204;
+            } break;
+            case 2: {
+                C = 93;
+                uint b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
+            } break;
+            case 3: {
+                C = 44;
+                uint cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 2) | cb;
+            } break;
+            case 4: {
+                C = 22;
+                uint dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | dcb;
+            } break;
+            case 5: {
+                C = 11;
+                uint edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 2);
+            } break;
+            case 6: {
+                C = 5;
+                uint fedcb = (bitval >> 1) & 0x1F;
+                B = (fedcb << 4) | (fedcb >> 4);
+            } break;
+            }
+        } break;
+        case Quint: {
+            D = val.quint_trit_value;
+            switch (bitlen) {
+            case 1: {
+                C = 113;
+            } break;
+            case 2: {
+                C = 54;
+                uint b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 3) | (b << 2);
+            } break;
+            case 3: {
+                C = 26;
+                uint cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 1) | (cb >> 1);
+            } break;
+            case 4: {
+                C = 13;
+                uint dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | (dcb >> 1);
+            } break;
+            case 5: {
+                C = 6;
+                uint edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 3);
+            } break;
+            }
+        } break;
+        }
+
+        if (val.encoding != JustBits) { // combine the trit/quint digit D with the bit pattern
+            uint T = (D * C) + B;
+            T ^= A;
+            T = (A & 0x80) | (T >> 2);
+            color_values[out_index++] = T;
+        }
+    }
+}
+ivec2 BitTransferSigned(int a, int b) {
+    // Element 1: b shifted right by one, with bit 7 of a placed on top.
+    const int merged = (b >> 1) | (a & 0x80);
+    // Element 0: bits 1-6 of a, read as a signed 6-bit number (bit 5 is the sign).
+    int offset = (a >> 1) & 0x3F;
+    if ((offset & 0x20) != 0) {
+        offset -= 0x40;
+    }
+
+    return ivec2(offset, merged);
+}
+
+uvec4 ClampByte(ivec4 color) {
+    // Saturate every component to [0, 255] before the unsigned conversion.
+    const ivec4 lower = ivec4(0);
+    const ivec4 upper = ivec4(255);
+    return uvec4(clamp(color, lower, upper));
+}
+ivec4 BlueContract(int a, int r, int g, int b) { // r and g are each averaged with b
+    return ivec4(a, ivec2(r + b, g + b) >> 1, b);
+}
+int colvals_index = 0; // read cursor into color_values, advanced by every ComputeEndpoints call
+// Builds the two endpoint colors (component order A, R, G, B) of one partition from the
+// next values in color_values, according to the LDR color_endpoint_mode. Modes without a
+// case below (2, 3, 7, 11, 14, 15 - the HDR modes) leave ep1 and ep2 unwritten.
+void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32],
+                      uint color_endpoint_mode) {
+#define READ_UINT_VALUES(N)                                                                        \
+    uint v[N];                                                                                     \
+    for (uint i = 0; i < N; i++) {                                                                 \
+        v[i] = color_values[colvals_index++];                                                      \
+    }
+
+#define READ_INT_VALUES(N)                                                                         \
+    int v[N];                                                                                      \
+    for (uint i = 0; i < N; i++) {                                                                 \
+        v[i] = int(color_values[colvals_index++]);                                                 \
+    }
+
+    switch (color_endpoint_mode) {
+    case 0: {
+        READ_UINT_VALUES(2)
+        ep1 = uvec4(0xFF, v[0], v[0], v[0]);
+        ep2 = uvec4(0xFF, v[1], v[1], v[1]);
+    } break;
+
+    case 1: {
+        READ_UINT_VALUES(2)
+        uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
+        uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); // saturate at 255 (was max, which forced >= 255)
+        ep1 = uvec4(0xFF, L0, L0, L0);
+        ep2 = uvec4(0xFF, L1, L1, L1);
+    } break;
+
+    case 4: {
+        READ_UINT_VALUES(4)
+        ep1 = uvec4(v[2], v[0], v[0], v[0]);
+        ep2 = uvec4(v[3], v[1], v[1], v[1]);
+    } break;
+
+    case 5: {
+        READ_INT_VALUES(4)
+        ivec2 transferred = BitTransferSigned(v[1], v[0]);
+        v[1] = transferred[0];
+        v[0] = transferred[1];
+        transferred = BitTransferSigned(v[3], v[2]);
+        v[3] = transferred[0];
+        v[2] = transferred[1];
+        ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0]));
+        ep2 = ClampByte(ivec4((v[2] + v[3]), v[0] + v[1], v[0] + v[1], v[0] + v[1]));
+    } break;
+
+    case 6: {
+        READ_UINT_VALUES(4)
+        ep1 = uvec4(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = uvec4(0xFF, v[0], v[1], v[2]);
+    } break;
+
+    case 8: {
+        READ_UINT_VALUES(6)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = uvec4(0xFF, v[0], v[2], v[4]);
+            ep2 = uvec4(0xFF, v[1], v[3], v[5]);
+        } else {
+            ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5])));
+            ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4])));
+        }
+    } break;
+
+    case 9: {
+        READ_INT_VALUES(6)
+        ivec2 transferred = BitTransferSigned(v[1], v[0]);
+        v[1] = transferred[0];
+        v[0] = transferred[1];
+        transferred = BitTransferSigned(v[3], v[2]);
+        v[3] = transferred[0];
+        v[2] = transferred[1];
+        transferred = BitTransferSigned(v[5], v[4]);
+        v[5] = transferred[0];
+        v[4] = transferred[1];
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4]));
+            ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+        } else {
+            ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+            ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4]));
+        }
+    } break;
+
+    case 10: {
+        READ_UINT_VALUES(6)
+        ep1 = uvec4(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = uvec4(v[5], v[0], v[1], v[2]);
+    } break;
+
+    case 12: {
+        READ_UINT_VALUES(8)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = uvec4(v[6], v[0], v[2], v[4]);
+            ep2 = uvec4(v[7], v[1], v[3], v[5]);
+        } else {
+            ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5])));
+            ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4])));
+        }
+    } break;
+
+    case 13: {
+        READ_INT_VALUES(8)
+        ivec2 transferred = BitTransferSigned(v[1], v[0]);
+        v[1] = transferred[0];
+        v[0] = transferred[1];
+        transferred = BitTransferSigned(v[3], v[2]);
+        v[3] = transferred[0];
+        v[2] = transferred[1];
+        transferred = BitTransferSigned(v[5], v[4]);
+        v[5] = transferred[0];
+        v[4] = transferred[1];
+        transferred = BitTransferSigned(v[7], v[6]);
+        v[7] = transferred[0];
+        v[6] = transferred[1];
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4]));
+            ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+        } else {
+            ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+            ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4]));
+        }
+    } break;
+    }
+#undef READ_UINT_VALUES
+#undef READ_INT_VALUES
+}
+
+uint UnquantizeTexelWeight(EncodingData val) { // expands one decoded weight to its final value
+    uint bitval = val.bit_value;
+    uint bitlen = val.num_bits;
+    uint A = ReplicateBitTo7((bitval & 1)); // bit 0 repeated across 7 bits
+    uint B = 0, C = 0, D = 0;
+    uint result = 0;
+    switch (val.encoding) {
+    case JustBits:
+        result = FastReplicateTo6(bitval, bitlen);
+        break;
+    case Trit: {
+        D = val.quint_trit_value;
+        switch (bitlen) {
+        case 0: { // no plain bits: the trit digit maps straight to a value
+            uint results[3] = {0, 32, 63};
+            result = results[D];
+        } break;
+        case 1: {
+            C = 50;
+        } break;
+        case 2: {
+            C = 23;
+            uint b = (bitval >> 1) & 1;
+            B = (b << 6) | (b << 2) | b;
+        } break;
+        case 3: {
+            C = 11;
+            uint cb = (bitval >> 1) & 3;
+            B = (cb << 5) | cb;
+        } break;
+        default:
+            break;
+        }
+    } break;
+    case Quint: {
+        D = val.quint_trit_value;
+        switch (bitlen) {
+        case 0: { // no plain bits: the quint digit maps straight to a value
+            uint results[5] = {0, 16, 32, 47, 63};
+            result = results[D];
+        } break;
+        case 1: {
+            C = 28;
+        } break;
+        case 2: {
+            C = 13;
+            uint b = (bitval >> 1) & 1;
+            B = (b << 6) | (b << 1);
+        } break;
+        }
+    } break;
+    }
+    if (val.encoding != JustBits && bitlen > 0) { // combine digit D with the bit pattern
+        result = D * C + B;
+        result ^= A;
+        result = (A & 0x20) | (result >> 2);
+    }
+    if (result > 32) { // values above 32 are raised by one (presumably so that 63 becomes 64)
+        result += 1;
+    }
+    return result;
+}
+
+void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 size) {
+    uint weight_idx = 0;
+    uint unquantized[2][144]; // per plane: one unquantized weight per weight-grid cell
+    uint area = size.x * size.y;
+    for (uint itr = 0; itr < texel_vector_index; itr++) {
+        unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
+        if (dual_plane) { // with two planes, entries alternate between plane 0 and plane 1
+            ++itr;
+            unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
+            if (itr == texel_vector_index) { // NOTE(review): checked after texel_vector[itr] was read
+                break;
+            }
+        }
+        if (++weight_idx >= (area))
+            break;
+    }
+    uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); // per-texel step along s
+    uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); // per-texel step along t
+    uint kPlaneScale = dual_plane ? 2 : 1;
+    for (uint plane = 0; plane < kPlaneScale; plane++)
+        for (uint t = 0; t < block_dims.y; t++)
+            for (uint s = 0; s < block_dims.x; s++) {
+                uint cs = Ds * s;
+                uint ct = Dt * t;
+                uint gs = (cs * (size.x - 1) + 32) >> 6;
+                uint gt = (ct * (size.y - 1) + 32) >> 6;
+                uint js = gs >> 4;  // grid cell column
+                uint fs = gs & 0xF; // 4-bit fraction within the cell
+                uint jt = gt >> 4;  // grid cell row
+                uint ft = gt & 0x0F;
+                uint w11 = (fs * ft + 8) >> 4;
+                uint w10 = ft - w11;
+                uint w01 = fs - w11;
+                uint w00 = 16 - fs - ft + w11;
+                uvec4 w = uvec4(w00, w01, w10, w11); // the four blend factors sum to 16
+                uint v0 = jt * size.x + js;
+                // Neighbors that would fall outside the weight grid contribute 0.
+                uvec4 p = uvec4(0);
+                if (v0 < area) {
+                    p.x = unquantized[plane][v0];
+                }
+                if ((v0 + 1) < (area)) {
+                    p.y = unquantized[plane][v0 + 1];
+                }
+                if ((v0 + size.x) < (area)) {
+                    p.z = unquantized[plane][(v0 + size.x)];
+                }
+                if ((v0 + size.x + 1) < (area)) {
+                    p.w = unquantized[plane][(v0 + size.x + 1)];
+                }
+                outbuffer[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4;
+            }
+}
+
+int FindLayout(uint mode) {
+    // Classifies the block mode bits into one of ten layouts (0-9).
+    // A non-zero value in bits 0-1 selects layouts 0-4, decided by bits 2, 3 and 8.
+    if ((mode & 3) != 0) {
+        const bool bit2 = (mode & 4) != 0;
+        const bool bit3 = (mode & 8) != 0;
+        if (!bit3) {
+            return bit2 ? 1 : 0;
+        }
+        if (!bit2) {
+            return 2;
+        }
+
+        const bool high_bit = (mode & 0x100) != 0;
+        return high_bit ? 4 : 3;
+    }
+
+    // Bits 0-1 are zero: layouts 5-9, decided by bits 5, 7 and 8.
+    const bool bit7 = (mode & 0x80) != 0;
+    const bool bit8 = (mode & 0x100) != 0;
+    if (!bit8) {
+        return bit7 ? 6 : 5;
+    }
+    if (!bit7) {
+        return 9;
+    }
+
+    const bool bit5 = (mode & 0x20) != 0;
+    return bit5 ? 8 : 7;
+}
+
+TexelWeightParams DecodeBlockInfo(uint block_index) { // Parses the block mode; block_index unused.
+    TexelWeightParams params = TexelWeightParams(uvec2(0), false, 0, false, false, false);
+    uint mode = StreamBits(11); // The block mode is the first 11 bits taken from the stream.
+    if ((mode & 0x1ff) == 0x1fc) { // Low nine bits 0x1fc: void-extent block.
+        if ((mode & 0x200) != 0) { // Bit 9 picks HDR over LDR.
+            params.VoidExtentHDR = true;
+        } else {
+            params.VoidExtentLDR = true;
+        }
+        if ((mode & 0x400) == 0 || StreamBits(1) == 0) { // Bit 10 and the next bit must both be 1.
+            params.Error = true;
+        }
+        return params;
+    }
+    if ((mode & 0xf) == 0) { // Low four bits all zero is rejected.
+        params.Error = true;
+        return params;
+    }
+    if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { // Bits 0-1 zero with bits 6-8 all set.
+        params.Error = true;
+        return params;
+    }
+    uint A, B;
+    uint mode_layout = FindLayout(mode);
+    switch (mode_layout) { // Each layout derives params.size from the A/B fields differently.
+    case 0:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x3;
+        params.size = uvec2(B + 4, A + 2);
+        break;
+    case 1:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x3;
+        params.size = uvec2(B + 8, A + 2);
+        break;
+    case 2:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x3;
+        params.size = uvec2(A + 2, B + 8);
+        break;
+    case 3:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x1; // Only one bit of B here; bit 8 was used to pick the layout.
+        params.size = uvec2(A + 2, B + 6);
+        break;
+    case 4:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x1;
+        params.size = uvec2(B + 2, A + 2);
+        break;
+    case 5:
+        A = (mode >> 5) & 0x3;
+        params.size = uvec2(12, A + 2);
+        break;
+    case 6:
+        A = (mode >> 5) & 0x3;
+        params.size = uvec2(A + 2, 12);
+        break;
+    case 7:
+        params.size = uvec2(6, 10);
+        break;
+    case 8:
+        params.size = uvec2(10, 6);
+        break;
+    case 9:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 9) & 0x3; // Layout 9 takes B from bits 9-10.
+        params.size = uvec2(A + 6, B + 6);
+        break;
+    default: // FindLayout only returns 0-9, so this is a safety net.
+        params.Error = true;
+        break;
+    }
+    params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); // Bit 10, except layout 9.
+    uint weight_index = (mode & 0x10) != 0 ? 1 : 0; // Bit 4 becomes bit 0 of the index.
+    if (mode_layout < 5) {
+        weight_index |= (mode & 0x3) << 1; // Layouts 0-4: mode bits 0-1 become index bits 1-2.
+    } else {
+        weight_index |= (mode & 0xc) >> 1; // Layouts 5-9: mode bits 2-3 become index bits 1-2.
+    }
+    weight_index -= 2; // Rebase so the value indexes the six-entry tables below.
+    if ((mode_layout != 9) && ((mode & 0x200) != 0)) { // Bit 9 picks the larger-range table.
+        const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31);
+        params.max_weight = max_weights[weight_index];
+    } else {
+        const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7);
+        params.max_weight = max_weights[weight_index];
+    }
+    return params;
+}
+
+void FillError(ivec3 coord) { // Stores the constant (1, 1, 0, 1) to every texel of the block.
+    for (uint j = 0; j < block_dims.y; j++) {
+        for (uint i = 0; i < block_dims.x; i++) {
+            imageStore(dest_image, coord.xy + ivec2(i, j), vec4(1.0, 1.0, 0.0, 1.0));
+        }
+    }
+    return; // Redundant in a void function.
+}
+
+void FillVoidExtentLDR(ivec3 coord, uint block_index) { // block_index is unused in this body.
+    for (int i = 0; i < 4; i++) { // Consume and discard four 13-bit fields.
+        StreamBits(13);
+    }
+
+    uint r_u = StreamBits(16); // Four 16-bit values follow, read in r, g, b, a order.
+    uint g_u = StreamBits(16);
+    uint b_u = StreamBits(16);
+    uint a_u = StreamBits(16);
+    float a = float(a_u) / 65535.0f; // Each 16-bit value is scaled by 1 / 65535.
+    float r = float(r_u) / 65535.0f;
+    float g = float(g_u) / 65535.0f;
+    float b = float(b_u) / 65535.0f;
+    for (uint j = 0; j < block_dims.y; j++) { // Every texel of the block gets the same value.
+        for (uint i = 0; i < block_dims.x; i++) {
+            imageStore(dest_image, coord.xy + ivec2(i, j), vec4(r, g, b, a));
+        }
+    }
+}
+
+void DecompressBlock(ivec3 coord, uint block_index) { // Decodes one block and stores its texels.
+    TexelWeightParams params;
+    params = DecodeBlockInfo(block_index);
+    if (params.Error) {
+        FillError(coord);
+        return;
+    }
+    if (params.VoidExtentHDR) { // HDR void-extent is not decoded; it is filled like an error.
+        FillError(coord);
+        return;
+    }
+    if (params.VoidExtentLDR) {
+        FillVoidExtentLDR(coord, block_index);
+        return;
+    }
+    if (params.size.x > block_dims.x || params.size.y > block_dims.y) { // Grid exceeds block.
+        FillError(coord);
+        return;
+    }
+    uint num_partitions = StreamBits(2) + 1; // 1 to 4.
+    if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) {
+        FillError(coord);
+        return;
+    }
+    int plane_index = -1;
+    uint partition_index = 1;
+    uvec4 color_endpoint_mode = uvec4(0);
+    uint ced_pointer = 0;
+    uint base_cem = 0;
+    if (num_partitions == 1) { // Single partition: a 4-bit endpoint mode, no partition index.
+        color_endpoint_mode[0] = StreamBits(4);
+        partition_index = 0;
+    } else { // Several partitions: a 10-bit partition index and a 6-bit base CEM field.
+        partition_index = StreamBits(10);
+        base_cem = StreamBits(6);
+    }
+    uint base_mode = base_cem & 3; // Low two bits of the base CEM field.
+    uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight);
+    uint remaining_bits = 128 - weight_bits - total_bitsread; // Out of the 128-bit block.
+    uint extra_cem_bits = 0;
+    if (base_mode > 0) { // Extra CEM bits depend on the partition count: 2, 5 or 8.
+        switch (num_partitions) {
+        case 2:
+            extra_cem_bits += 2;
+            break;
+        case 3:
+            extra_cem_bits += 5;
+            break;
+        case 4:
+            extra_cem_bits += 8;
+            break;
+        default:
+            return; // Leaves without storing any texel.
+        }
+    }
+    remaining_bits -= extra_cem_bits;
+    uint plane_selector_bits = 0;
+    if (params.dual_plane) {
+        plane_selector_bits = 2;
+    }
+    remaining_bits -= plane_selector_bits;
+    // Read the color endpoint data, at most 8 bits per stored entry
+    uint color_data_bits = remaining_bits;
+    while (remaining_bits > 0) {
+        uint nb = min(remaining_bits, 8);
+        uint b = StreamBits(nb);
+        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, 8));
+        ced_pointer++;
+        remaining_bits -= nb;
+    }
+    plane_index = int(StreamBits(plane_selector_bits)); // Read after the color data.
+    if (base_mode > 0) {
+        uint extra_cem = StreamBits(extra_cem_bits);
+        uint cem = (extra_cem << 6) | base_cem;
+        cem >>= 2; // Drop the two base_mode bits.
+        uint C[4] = {0, 0, 0, 0}; // One bit per partition.
+        for (uint i = 0; i < num_partitions; i++) {
+            C[i] = cem & 1;
+            cem >>= 1;
+        }
+        uint M[4] = {0, 0, 0, 0}; // Two bits per partition.
+        for (uint i = 0; i < num_partitions; i++) {
+            M[i] = cem & 3;
+            cem >>= 2;
+        }
+        for (uint i = 0; i < num_partitions; i++) { // Mode = ((base_mode - !C) << 2) | M.
+            color_endpoint_mode[i] = base_mode;
+            if ((C[i]) == 0) {
+                color_endpoint_mode[i] -= 1;
+            }
+            color_endpoint_mode[i] <<= 2;
+            color_endpoint_mode[i] |= M[i];
+        }
+    } else if (num_partitions > 1) { // base_mode == 0: all partitions share one mode.
+        uint cem = base_cem >> 2;
+        for (uint i = 0; i < num_partitions; i++) {
+            color_endpoint_mode[i] = cem;
+        }
+    }
+
+    uint color_values[32]; // Four values, two endpoints, four maximum partitions
+    DecodeColorValues(color_values, color_endpoint_mode, num_partitions, color_data_bits);
+    uvec4 endpoints[4][2];
+    for (uint i = 0; i < num_partitions; i++) {
+        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_values, color_endpoint_mode[i]);
+    }
+    for (uint i = 0; i < 16; i++) { // Work on a copy of the 16 loaded block values.
+        texel_weight_data[i] = local_buff[i];
+    }
+    for (uint i = 0; i < 8; i++) { // Mirror the copy: swap i and 15 - i, bit-reversing each.
+#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16
+        uint a = REVERSE_BYTE(texel_weight_data[i]);
+        uint b = REVERSE_BYTE(texel_weight_data[15 - i]);
+#undef REVERSE_BYTE
+        texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8));
+        texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8));
+    }
+    uint clear_byte_start = // First entry lying wholly past the packed weight bits.
+        (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1;
+    texel_weight_data[clear_byte_start - 1] = // Keep only the weight bits of the partial entry.
+        texel_weight_data[clear_byte_start - 1] &
+        uint(
+            ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1));
+    for (uint i = 0; i < 16 - clear_byte_start; i++) { // Zero every entry after it.
+        texel_weight_data[clear_byte_start + i] = uint(0U);
+    }
+    texel_flag = true; // use texel "vector" and bit stream in integer decoding
+    DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
+    uint weights[2][144];
+    UnquantizeTexelWeights(weights, params.dual_plane, params.size);
+    for (uint j = 0; j < block_dims.y; j++) { // One output texel per (i, j).
+        for (uint i = 0; i < block_dims.x; i++) {
+            uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
+                                                     (block_dims.y * block_dims.x) < 32);
+            vec4 p;
+            uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
+            uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
+            uvec4 plane_vec = uvec4(0);
+            uvec4 weight_vec = uvec4(0);
+            for (uint c = 0; c < 4; c++) { // Pick the weight plane used by each channel.
+                if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
+                    plane_vec[c] = 1; // This channel reads weights[1] instead of weights[0].
+                }
+                weight_vec[c] = weights[plane_vec[c]][j * block_dims.x + i];
+            }
+            vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6);
+            p = (Cf / 65535.0); // Scale the interpolated channels by 1 / 65535.
+            imageStore(dest_image, coord.xy + ivec2(i, j), p.gbar); // Note the .gbar swizzle.
+        }
+    }
+}
+
+void main() { // Entry point: finds this invocation's block, loads 16 values and decodes it.
+    uvec3 pos = gl_GlobalInvocationID + origin;
+    pos.x <<= bytes_per_block_log2; // Scale x by the block size given as a log2 byte count.
+
+    // Read as soon as possible due to its latency
+    const uint swizzle = SwizzleOffset(pos.xy);
+
+    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
+
+    uint offset = 0; // Source offset: layer + block row + row inside block + column + swizzle.
+    offset += layer * layer_stride;
+    offset += (block_y >> block_height) * block_size;
+    offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
+    offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
+    offset += swizzle;
+
+    const ivec3 invocation_destination = ivec3(gl_GlobalInvocationID + destination);
+    const ivec3 coord = ivec3(invocation_destination * uvec3(block_dims, 1.0)); // In texels.
+    uint block_index = // NOTE(review): uses pos.x after the shift above - confirm intent.
+        layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x;
+    current_index = 0; // Presumably the StreamBits read cursor - defined outside this chunk.
+    bitsread = 0;
+    for (int i = 0; i < 16; i++) { // Load 16 consecutive values starting at offset.
+        local_buff[i] = ReadTexel(offset + i);
+    }
+    DecompressBlock(coord, block_index);
+}
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index e028677e9..29105ecad 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -307,7 +307,8 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4
 
 [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime,
                                     const VideoCommon::ImageInfo& info) {
-    // Disable accelerated uploads for now as they don't implement swizzled uploads
+    return (!runtime.HasNativeASTC() && IsPixelFormatASTC(info.format));
+    // Disable other accelerated uploads for now as they don't implement swizzled uploads
     return false;
     switch (info.type) {
     case ImageType::e2D:
@@ -567,6 +568,9 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
 
 void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map,
                                                 std::span<const SwizzleParameters> swizzles) {
+    if (IsPixelFormatASTC(image.info.format)) {
+        return util_shaders.ASTCDecode(image, map, swizzles);
+    }
     switch (image.info.type) {
     case ImageType::e2D:
         return util_shaders.BlockLinearUpload2D(image, map, swizzles);
@@ -599,6 +603,10 @@ FormatProperties TextureCacheRuntime::FormatInfo(ImageType type, GLenum internal
     }
 }
 
+bool TextureCacheRuntime::HasNativeASTC() const noexcept { // Forwards to device.HasASTC().
+    return device.HasASTC();
+}
+
 TextureCacheRuntime::StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_)
     : storage_flags{storage_flags_}, map_flags{map_flags_} {}
 
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 3fbaa102f..3c871541b 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -95,6 +95,8 @@ public:
         return has_broken_texture_view_formats;
     }
 
+    bool HasNativeASTC() const noexcept;
+
 private:
     struct StagingBuffers {
         explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_);
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 2fe4799bc..2a4220661 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -3,7 +3,10 @@
 // Refer to the license.txt file included.
 
 #include <bit>
+#include <fstream>
 #include <span>
+#include <streambuf>
+#include <string>
 #include <string_view>
 
 #include <glad/glad.h>
@@ -24,11 +27,13 @@
 #include "video_core/texture_cache/accelerated_swizzle.h"
 #include "video_core/texture_cache/types.h"
 #include "video_core/texture_cache/util.h"
+#include "video_core/textures/astc.h"
 #include "video_core/textures/decoders.h"
 
 namespace OpenGL {
 
 using namespace HostShaders;
+using namespace Tegra::Texture::ASTC;
 
 using VideoCommon::Extent3D;
 using VideoCommon::ImageCopy;
@@ -63,12 +68,104 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
       pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
       copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)),
       copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
+    // TODO: Load shader string as a header
+    std::string astc_path = "astc_decoder.comp";
+    std::ifstream t(astc_path);
+    std::string str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
+    astc_decoder_program = MakeProgram(str);
+    MakeBuffers();
+}
+
+UtilShaders::~UtilShaders() = default;
+
+void UtilShaders::MakeBuffers() { // Creates the GL buffers holding the constant lookup tables.
     const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
     swizzle_table_buffer.Create();
     glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
+
+    astc_encodings_buffer.Create(); // Holds the EncodingsValues table.
+    glNamedBufferStorage(astc_encodings_buffer.handle, sizeof(EncodingsValues), &EncodingsValues,
+                         0); // flags = 0, as for every buffer below
+    replicate_6_to_8_buffer.Create(); // 6-bit to 8-bit replication table.
+    glNamedBufferStorage(replicate_6_to_8_buffer.handle, sizeof(REPLICATE_6_BIT_TO_8_TABLE),
+                         &REPLICATE_6_BIT_TO_8_TABLE, 0);
+    replicate_7_to_8_buffer.Create(); // 7-bit to 8-bit replication table.
+    glNamedBufferStorage(replicate_7_to_8_buffer.handle, sizeof(REPLICATE_7_BIT_TO_8_TABLE),
+                         &REPLICATE_7_BIT_TO_8_TABLE, 0);
+    replicate_8_to_8_buffer.Create(); // 8-bit to 8-bit replication table.
+    glNamedBufferStorage(replicate_8_to_8_buffer.handle, sizeof(REPLICATE_8_BIT_TO_8_TABLE),
+                         &REPLICATE_8_BIT_TO_8_TABLE, 0);
+    replicate_byte_to_16_buffer.Create(); // 8-bit to 16-bit replication table.
+    glNamedBufferStorage(replicate_byte_to_16_buffer.handle, sizeof(REPLICATE_BYTE_TO_16_TABLE),
+                         &REPLICATE_BYTE_TO_16_TABLE, 0);
 }
 
-UtilShaders::~UtilShaders() = default;
+void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
+                             std::span<const VideoCommon::SwizzleParameters> swizzles) {
+    static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; // Shader storage buffer binding points.
+    static constexpr GLuint BINDING_INPUT_BUFFER = 1;
+    static constexpr GLuint BINDING_ENC_BUFFER = 2;
+
+    static constexpr GLuint BINDING_6_TO_8_BUFFER = 3;
+    static constexpr GLuint BINDING_7_TO_8_BUFFER = 4;
+    static constexpr GLuint BINDING_8_TO_8_BUFFER = 5;
+    static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6;
+
+    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; // Image unit receiving the decoded texels.
+    static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0; // Uniform locations 0-2.
+    static constexpr GLuint LOC_BLOCK_DIMS = 1;
+    static constexpr GLuint LOC_LAYER = 2;
+
+    const Extent3D tile_size = { // Only width and height are given here.
+        VideoCore::Surface::DefaultBlockWidth(image.info.format),
+        VideoCore::Surface::DefaultBlockHeight(image.info.format),
+    };
+    program_manager.BindHostCompute(astc_decoder_program.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_encodings_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER,
+                     replicate_6_to_8_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER,
+                     replicate_7_to_8_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER,
+                     replicate_8_to_8_buffer.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER,
+                     replicate_byte_to_16_buffer.handle);
+
+    glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
+    glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height); // Same for every dispatch.
+
+    for (u32 layer = 0; layer < image.info.resources.layers; layer++) { // One dispatch per
+        for (const SwizzleParameters& swizzle : swizzles) {             // (layer, swizzle) pair.
+            glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_FALSE,
+                               layer, GL_WRITE_ONLY, GL_RGBA8); // layered = GL_FALSE: one layer
+            const size_t input_offset = swizzle.buffer_offset + map.offset; // Used as bind offset.
+            const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
+            const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
+
+            glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height);
+            glUniform1ui(LOC_LAYER, layer);
+
+            // Block-linear parameters used to unswizzle the ASTC data (uniform locations 3-10)
+            const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
+            glUniform3uiv(3, 1, params.origin.data());
+            glUniform3iv(4, 1, params.destination.data());
+            glUniform1ui(5, params.bytes_per_block_log2);
+            glUniform1ui(6, params.layer_stride);
+            glUniform1ui(7, params.block_size);
+            glUniform1ui(8, params.x_shift);
+            glUniform1ui(9, params.block_height);
+            glUniform1ui(10, params.block_height_mask);
+
+            // ASTC texture data: the rest of the guest image starting at this swizzle's offset
+            glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer,
+                              input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+
+            glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); // ceil(tiles / 32) per axis
+        }
+    }
+    program_manager.RestoreGuestCompute();
+}
 
 void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
                                       std::span<const SwizzleParameters> swizzles) {
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index 93b009743..08a1cb9b2 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -40,6 +40,11 @@ public:
     explicit UtilShaders(ProgramManager& program_manager);
     ~UtilShaders();
 
+    void MakeBuffers();
+
+    void ASTCDecode(Image& image, const ImageBufferMap& map,
+                    std::span<const VideoCommon::SwizzleParameters> swizzles);
+
     void BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
                              std::span<const VideoCommon::SwizzleParameters> swizzles);
 
@@ -59,7 +64,13 @@ private:
     ProgramManager& program_manager;
 
     OGLBuffer swizzle_table_buffer;
+    OGLBuffer astc_encodings_buffer;
+    OGLBuffer replicate_6_to_8_buffer;
+    OGLBuffer replicate_7_to_8_buffer;
+    OGLBuffer replicate_8_to_8_buffer;
+    OGLBuffer replicate_byte_to_16_buffer;
 
+    OGLProgram astc_decoder_program;
     OGLProgram block_linear_unswizzle_2d_program;
     OGLProgram block_linear_unswizzle_3d_program;
     OGLProgram pitch_unswizzle_program;
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
index 9105119bc..bc8bddaec 100644
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -8,6 +8,196 @@
 
 namespace Tegra::Texture::ASTC {
 
+/// Returns how many bits of n are set.
+constexpr u32 Popcnt(u32 n) {
+    u32 set_bits = 0;
+    for (; n != 0; n &= n - 1) { // each step clears the lowest set bit
+        ++set_bits;
+    }
+    return set_bits;
+}
+
+enum class IntegerEncoding { JustBits, Qus32, Trit }; // NOTE(review): "Qus32" may mean "Quint".
+
+struct IntegerEncodedValue { // Pairs an IntegerEncoding with a bit count, plus value fields.
+    constexpr IntegerEncodedValue() = default;
+
+    constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
+        : encoding{encoding_}, num_bits{num_bits_} {}
+
+    constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { // Values ignored.
+        return encoding == other.encoding && num_bits == other.num_bits;
+    }
+
+    // Returns the number of bits required to encode nVals values.
+    u32 GetBitLength(u32 nVals) const {
+        u32 totalBits = num_bits * nVals;
+        if (encoding == IntegerEncoding::Trit) {
+            totalBits += (nVals * 8 + 4) / 5; // ceil(8 * nVals / 5) extra bits
+        } else if (encoding == IntegerEncoding::Qus32) {
+            totalBits += (nVals * 7 + 2) / 3; // ceil(7 * nVals / 3) extra bits
+        }
+        return totalBits;
+    }
+
+    IntegerEncoding encoding{};
+    u32 num_bits = 0;
+    u32 bit_value = 0;
+    union { // Both members share the same storage.
+        u32 qus32_value = 0;
+        u32 trit_value;
+    };
+};
+
+// Returns the encoding for a range whose largest value is maxVal: the nearest value
+// <= maxVal of the form 2^n - 1, 3*2^n - 1 or 5*2^n - 1 decides the kind and bit count.
+static constexpr IntegerEncodedValue CreateEncoding(u32 maxVal) {
+    while (maxVal > 0) {
+        u32 check = maxVal + 1;
+
+        // Is maxVal of the type 2^n - 1 (i.e. is check a power of two)?
+        if (!(check & (check - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::JustBits, Popcnt(maxVal));
+        }
+
+        // Is maxVal of the type 3*2^n - 1?
+        if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Trit, Popcnt(check / 3 - 1));
+        }
+
+        // Is maxVal of the type 5*2^n - 1?
+        if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Qus32, Popcnt(check / 5 - 1));
+        }
+
+        // No form matches this value, so it cannot be represented with a bounded
+        // integer sequence; try the next smaller value.
+        maxVal--;
+    }
+    return IntegerEncodedValue(IntegerEncoding::JustBits, 0); // maxVal == 0: zero bits
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { // [i] for maxVal i
+    std::array<IntegerEncodedValue, 256> encodings{};
+    for (std::size_t i = 0; i < encodings.size(); ++i) {
+        encodings[i] = CreateEncoding(static_cast<u32>(i));
+    }
+    return encodings;
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> EncodingsValues = MakeEncodedValues();
+
+// Repeats the low numBits bits of val until toBit bits are filled, most significant copy first;
+// the final copy is cut down to its upper bits when it does not fit completely.
+template <typename IntType>
+static constexpr IntType Replicate(IntType val, u32 numBits, u32 toBit) {
+    if (numBits == 0) {
+        return 0;
+    }
+    if (toBit == 0) {
+        return 0;
+    }
+    const IntType v = val & static_cast<IntType>((1 << numBits) - 1); // Source pattern.
+    IntType res = v;
+    u32 reslen = numBits; // Number of result bits produced so far.
+    while (reslen < toBit) {
+        u32 comp = 0;
+        if (numBits > toBit - reslen) { // Less room left than one full copy.
+            u32 newshift = toBit - reslen;
+            comp = numBits - newshift; // Low bits of v dropped from the last copy.
+            numBits = newshift;
+        }
+        res = static_cast<IntType>(res << numBits);
+        res = static_cast<IntType>(res | (v >> comp));
+        reslen += numBits;
+    }
+    return res;
+}
+
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) { // 2^num_bits
+    return std::size_t(1) << num_bits;
+}
+
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() { // table[i] = Replicate(i, num_bits, to_bit)
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (std::size_t index = 0; index < table.size(); ++index) { // size_t bound cannot truncate
+        table[index] = Replicate(static_cast<IntType>(index), num_bits, to_bit);
+    }
+    return table;
+}
+
+static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
+static constexpr u32 ReplicateByteTo16(std::size_t value) { // Unchecked lookup; value < 256.
+    return REPLICATE_BYTE_TO_16_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_7_TABLE = MakeReplicateTable<u32, 1, 7>();
+static constexpr u32 ReplicateBitTo7(std::size_t value) { // Unchecked lookup; value is 0 or 1.
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+
+static constexpr auto REPLICATE_BIT_TO_9_TABLE = MakeReplicateTable<u32, 1, 9>();
+static constexpr u32 ReplicateBitTo9(std::size_t value) { // Unchecked lookup; value is 0 or 1.
+    return REPLICATE_BIT_TO_9_TABLE[value];
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_8_TABLE = MakeReplicateTable<u32, 1, 8>();
+static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>();
+static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
+static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
+static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
+/// Use a precomputed table for the common bit widths (1-8); any other width falls back to the
+/// runtime implementation
+static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { // value must fit in num_bits
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_8_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_8_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_8_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_8_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_8_TABLE[value];
+    case 6:
+        return REPLICATE_6_BIT_TO_8_TABLE[value];
+    case 7:
+        return REPLICATE_7_BIT_TO_8_TABLE[value];
+    case 8:
+        return REPLICATE_8_BIT_TO_8_TABLE[value];
+    default:
+        return Replicate(value, num_bits, 8);
+    }
+}
+
+static constexpr auto REPLICATE_1_BIT_TO_6_TABLE = MakeReplicateTable<u32, 1, 6>();
+static constexpr auto REPLICATE_2_BIT_TO_6_TABLE = MakeReplicateTable<u32, 2, 6>();
+static constexpr auto REPLICATE_3_BIT_TO_6_TABLE = MakeReplicateTable<u32, 3, 6>();
+static constexpr auto REPLICATE_4_BIT_TO_6_TABLE = MakeReplicateTable<u32, 4, 6>();
+static constexpr auto REPLICATE_5_BIT_TO_6_TABLE = MakeReplicateTable<u32, 5, 6>();
+
+static constexpr u32 FastReplicateTo6(u32 value, u32 num_bits) { // Tables for widths 1-5.
+    switch (num_bits) {
+    case 1:
+        return REPLICATE_1_BIT_TO_6_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_6_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_6_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_6_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_6_TABLE[value];
+    default: // Any other width is computed at run time.
+        return Replicate(value, num_bits, 6);
+    }
+}
+
 void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
                 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
 
-- 
cgit v1.2.3