From 2985e5e94c82febcf215feb0023f4184b38bb24a Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Sat, 13 Feb 2021 15:50:12 -0500
Subject: renderer_opengl: Accelerate ASTC texture decoding with a compute
 shader

ASTC texture decoding is currently handled by a CPU decoder for GPUs without native ASTC decoding support (most desktop GPUs). This causes noticeable performance degradation in titles that use the format extensively.

This commit adds support to accelerate ASTC decoding using a compute shader on OpenGL for GPUs without native support.
---
 src/video_core/host_shaders/astc_decoder.comp | 1288 +++++++++++++++++++++++++
 1 file changed, 1288 insertions(+)
 create mode 100644 src/video_core/host_shaders/astc_decoder.comp

(limited to 'src/video_core/host_shaders')

diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp
new file mode 100644
index 000000000..070190a5c
--- /dev/null
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -0,0 +1,1288 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+#ifdef VULKAN // Vulkan build: parameters are members of a single push-constant block
+
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n) // expands to nothing: block members take no per-variable qualifier here
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_ENC_BUFFER 2
+#define BINDING_6_TO_8_BUFFER 3
+#define BINDING_7_TO_8_BUFFER 4
+#define BINDING_8_TO_8_BUFFER 5
+#define BINDING_BYTE_TO_16_BUFFER 6
+#define BINDING_OUTPUT_IMAGE 3 // NOTE(review): same index as BINDING_6_TO_8_BUFFER; confirm
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout(location = n) uniform // each parameter is a standalone uniform
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_ENC_BUFFER 2
+#define BINDING_6_TO_8_BUFFER 3
+#define BINDING_7_TO_8_BUFFER 4
+#define BINDING_8_TO_8_BUFFER 5
+#define BINDING_BYTE_TO_16_BUFFER 6
+#define BINDING_OUTPUT_IMAGE 0 // binding of dest_image (an image, not a buffer)
+
+#endif
+
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; // 32 x 32 x 1 per work group
+
+BEGIN_PUSH_CONSTANTS // push-constant block header on Vulkan, empty on OpenGL
+UNIFORM(0) uvec2 num_image_blocks; // presumably the image size in blocks - TODO confirm
+UNIFORM(1) uvec2 block_dims; // texels per block in x and y; bounds the per-block texel loops
+UNIFORM(2) uint layer;
+
+UNIFORM(3) uvec3 origin;
+UNIFORM(4) ivec3 destination;
+UNIFORM(5) uint bytes_per_block_log2;
+UNIFORM(6) uint layer_stride;
+UNIFORM(7) uint block_size;
+UNIFORM(8) uint x_shift;
+UNIFORM(9) uint block_height;
+UNIFORM(10) uint block_height_mask; // the numbers above are only used as OpenGL locations
+
+END_PUSH_CONSTANTS
+
+uint current_index = 0; // element of local_buff that ReadBit() is currently consuming
+int bitsread = 0; // bit position inside that element; ReadBit() resets it after 8 bits
+uint total_bitsread = 0; // running count of every bit returned by ReadBit()
+uint local_buff[16]; // bit source for ReadBit(); only the low 8 bits of each element are read
+
+const int JustBits = 0; // EncodingData.encoding: values are plain bits only
+const int Quint = 1; // EncodingData.encoding: plain bits plus a quint digit (0-4)
+const int Trit = 2; // EncodingData.encoding: plain bits plus a trit digit (0-2)
+
+struct EncodingData { // one encoding description, or one decoded value using it
+    uint encoding; // JustBits, Quint or Trit
+    uint num_bits; // number of plain bits stored per value
+    uint bit_value; // plain bits of a decoded value
+    uint quint_trit_value; // quint or trit digit of a decoded value
+};
+
+struct TexelWeightParams { // result of DecodeBlockInfo()
+    uvec2 size; // weight grid dimensions selected by the block mode
+    bool dual_plane; // doubles the number of weights (see GetNumWeightValues)
+    uint max_weight; // passed to GetBitLength() as the encoding index for the weights
+    bool Error; // block mode is reserved or malformed
+    bool VoidExtentLDR; // void-extent bit pattern with mode bit 9 clear
+    bool VoidExtentHDR; // void-extent bit pattern with mode bit 9 set
+};
+
+// Swizzle lookup table, addressed through SwizzleOffset().
+layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
+    uint swizzle_table[]; // indexed as y * 64 + x
+};
+
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 {
+    uint astc_data[]; // input bytes packed four per uint; unpacked by ReadTexel()
+};
+layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly image2D dest_image; // imageStore target
+
+const uint GOB_SIZE_X = 64; // GOB extents; units not shown here (presumably bytes x rows - confirm)
+const uint GOB_SIZE_Y = 8;
+const uint GOB_SIZE_Z = 1;
+const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; // 512
+
+const uint GOB_SIZE_X_SHIFT = 6; // log2 of the extents above
+const uint GOB_SIZE_Y_SHIFT = 3;
+const uint GOB_SIZE_Z_SHIFT = 0;
+const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; // log2(GOB_SIZE)
+
+const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); // wraps a position into one GOB
+
+uint SwizzleOffset(uvec2 pos) {
+    const uvec2 wrapped = pos & SWIZZLE_MASK; // keep only the position inside one GOB
+    return swizzle_table[wrapped.y * GOB_SIZE_X + wrapped.x];
+}
+
+uint ReadTexel(uint offset) {
+    // Pick byte (offset % 4) out of the packed 32-bit word that contains it.
+    return bitfieldExtract(astc_data[offset >> 2], int((offset & 3) << 3), 8);
+}
+
+// ASTC encodings table supplied by the host.
+layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
+    EncodingData encoding_values[256]; // looked up by range / max-weight index
+};
+// Precomputed bit-replication tables supplied by the host.
+layout(binding = BINDING_6_TO_8_BUFFER, std430) readonly buffer REPLICATE_6_BIT_TO_8 {
+    uint REPLICATE_6_BIT_TO_8_TABLE[]; // FastReplicateTo8() lookup for 6-bit values
+};
+layout(binding = BINDING_7_TO_8_BUFFER, std430) readonly buffer REPLICATE_7_BIT_TO_8 {
+    uint REPLICATE_7_BIT_TO_8_TABLE[]; // FastReplicateTo8() lookup for 7-bit values
+};
+layout(binding = BINDING_8_TO_8_BUFFER, std430) readonly buffer REPLICATE_8_BIT_TO_8 {
+    uint REPLICATE_8_BIT_TO_8_TABLE[]; // FastReplicateTo8() lookup for 8-bit values
+};
+layout(binding = BINDING_BYTE_TO_16_BUFFER, std430) readonly buffer REPLICATE_BYTE_TO_16 {
+    uint REPLICATE_BYTE_TO_16_TABLE[]; // ReplicateByteTo16() lookup, indexed by component value
+};
+
+const int BLOCK_SIZE_IN_BYTES = 16; // size of one compressed block (same length as local_buff)
+
+const int BLOCK_INFO_ERROR = 0; // block classification codes; not referenced in this part of the file
+const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
+const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
+const int BLOCK_INFO_NORMAL = 3;
+
+// Widens the low num_bits bits of val to to_bit bits by repeating them from the most
+// significant end downwards; the last copy is truncated to its top bits if it does not fit.
+uint Replicate(uint val, uint num_bits, uint to_bit) {
+    // Nothing to copy from, or no room to copy into.
+    if (num_bits == 0 || to_bit == 0) {
+        return 0;
+    }
+    const uint pattern = val & uint((1 << num_bits) - 1);
+    uint result = pattern;
+    uint filled = num_bits;
+    while (filled < to_bit) {
+        // Append a full copy, or only the top bits of one when fewer bits remain.
+        const uint remaining = to_bit - filled;
+        uint dropped = 0;
+        if (num_bits > remaining) {
+            dropped = num_bits - remaining;
+            num_bits = remaining;
+        }
+        result <<= num_bits;
+        result |= pattern >> dropped;
+        filled += num_bits;
+    }
+    // When num_bits is already >= to_bit the masked value is returned unchanged.
+    return result;
+}
+
+uvec4 ReplicateByteTo16(uvec4 value) { // maps each component through REPLICATE_BYTE_TO_16_TABLE
+    return uvec4(REPLICATE_BYTE_TO_16_TABLE[value.x], REPLICATE_BYTE_TO_16_TABLE[value.y],
+                 REPLICATE_BYTE_TO_16_TABLE[value.z], REPLICATE_BYTE_TO_16_TABLE[value.w]);
+}
+
+const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
+uint ReplicateBitTo7(uint value) {
+    // value must be 0 or 1; a set bit expands to seven set bits (127).
+    return REPLICATE_BIT_TO_7_TABLE[value];
+}
+
+const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511); // a set bit becomes nine set bits
+uint ReplicateBitTo9(uint value) { // value must be 0 or 1 (table has two entries)
+    return REPLICATE_1_BIT_TO_9_TABLE[value];
+}
+
+const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255); // N-bit value -> 8-bit replication
+const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
+const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
+const uint REPLICATE_4_BIT_TO_8_TABLE[16] =
+    uint[](0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255);
+const uint REPLICATE_5_BIT_TO_8_TABLE[32] =
+    uint[](0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, 132, 140, 148, 156, 165,
+           173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255);
+
+uint FastReplicateTo8(uint value, uint num_bits) { // table-driven Replicate(value, num_bits, 8)
+    switch (num_bits) { // value must fit in num_bits bits: it indexes the table directly
+    case 1:
+        return REPLICATE_1_BIT_TO_8_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_8_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_8_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_8_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_8_TABLE[value];
+    case 6:
+        return REPLICATE_6_BIT_TO_8_TABLE[value]; // widths 6-8 use the host-provided buffers
+    case 7:
+        return REPLICATE_7_BIT_TO_8_TABLE[value];
+    case 8:
+        return REPLICATE_8_BIT_TO_8_TABLE[value];
+    }
+    return Replicate(value, num_bits, 8); // any other width: compute it bit by bit
+}
+
+const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63); // N-bit value -> 6-bit replication
+const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
+const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
+const uint REPLICATE_4_BIT_TO_6_TABLE[16] =
+    uint[](0, 4, 8, 12, 17, 21, 25, 29, 34, 38, 42, 46, 51, 55, 59, 63);
+const uint REPLICATE_5_BIT_TO_6_TABLE[32] =
+    uint[](0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 33, 35, 37, 39, 41, 43, 45,
+           47, 49, 51, 53, 55, 57, 59, 61, 63);
+
+uint FastReplicateTo6(uint value, uint num_bits) { // table-driven Replicate(value, num_bits, 6)
+    switch (num_bits) { // value must fit in num_bits bits: it indexes the table directly
+    case 1:
+        return REPLICATE_1_BIT_TO_6_TABLE[value];
+    case 2:
+        return REPLICATE_2_BIT_TO_6_TABLE[value];
+    case 3:
+        return REPLICATE_3_BIT_TO_6_TABLE[value];
+    case 4:
+        return REPLICATE_4_BIT_TO_6_TABLE[value];
+    case 5:
+        return REPLICATE_5_BIT_TO_6_TABLE[value];
+    }
+    return Replicate(value, num_bits, 6); // any other width: compute it bit by bit
+}
+
+uint hash52(uint p) { // 32-bit integer mixer; SelectPartition() slices the result into seeds
+    p = p ^ (p >> 15);
+    p = p - (p << 17);
+    p = p + (p << 7);
+    p = p + (p << 4);
+    p = p ^ (p >> 5);
+    p = p + (p << 16);
+    p = p ^ (p >> 7);
+    p = p ^ (p >> 3);
+    p = p ^ (p << 6);
+    p = p ^ (p >> 17);
+    return p;
+}
+
+uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
+    if (1 == partition_count) // a single partition owns every texel
+        return 0;
+
+    if (small_block) { // caller-selected: texel coordinates are doubled
+        x <<= 1;
+        y <<= 1;
+        z <<= 1;
+    }
+
+    seed += (partition_count - 1) * 1024; // fold the partition count into the seed
+
+    uint rnum = hash52(uint(seed));
+    uint seed1 = uint(rnum & 0xF); // twelve 4-bit slices of the hash
+    uint seed2 = uint((rnum >> 4) & 0xF);
+    uint seed3 = uint((rnum >> 8) & 0xF);
+    uint seed4 = uint((rnum >> 12) & 0xF);
+    uint seed5 = uint((rnum >> 16) & 0xF);
+    uint seed6 = uint((rnum >> 20) & 0xF);
+    uint seed7 = uint((rnum >> 24) & 0xF);
+    uint seed8 = uint((rnum >> 28) & 0xF);
+    uint seed9 = uint((rnum >> 18) & 0xF);
+    uint seed10 = uint((rnum >> 22) & 0xF);
+    uint seed11 = uint((rnum >> 26) & 0xF);
+    uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF);
+
+    seed1 = (seed1 * seed1); // square every slice
+    seed2 = (seed2 * seed2);
+    seed3 = (seed3 * seed3);
+    seed4 = (seed4 * seed4);
+    seed5 = (seed5 * seed5);
+    seed6 = (seed6 * seed6);
+    seed7 = (seed7 * seed7);
+    seed8 = (seed8 * seed8);
+    seed9 = (seed9 * seed9);
+    seed10 = (seed10 * seed10);
+    seed11 = (seed11 * seed11);
+    seed12 = (seed12 * seed12);
+
+    int sh1, sh2, sh3; // right-shift amounts chosen from seed bits 0, 1, 4 and the partition count
+    if ((seed & 1) > 0) {
+        sh1 = (seed & 2) > 0 ? 4 : 5;
+        sh2 = (partition_count == 3) ? 6 : 5;
+    } else {
+        sh1 = (partition_count == 3) ? 6 : 5;
+        sh2 = (seed & 2) > 0 ? 4 : 5;
+    }
+    sh3 = (seed & 0x10) > 0 ? sh1 : sh2;
+
+    seed1 = (seed1 >> sh1); // odd-numbered seeds 1-7 use sh1, even 2-8 use sh2, 9-12 use sh3
+    seed2 = (seed2 >> sh2);
+    seed3 = (seed3 >> sh1);
+    seed4 = (seed4 >> sh2);
+    seed5 = (seed5 >> sh1);
+    seed6 = (seed6 >> sh2);
+    seed7 = (seed7 >> sh1);
+    seed8 = (seed8 >> sh2);
+    seed9 = (seed9 >> sh3);
+    seed10 = (seed10 >> sh3);
+    seed11 = (seed11 >> sh3);
+    seed12 = (seed12 >> sh3);
+
+    uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); // one score per partition
+    uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
+    uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
+    uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+
+    a &= 0x3F; // only the low six bits of each score are compared
+    b &= 0x3F;
+    c &= 0x3F;
+    d &= 0x3F;
+
+    if (partition_count < 4) // zero the scores of partitions that are not in use
+        d = 0;
+    if (partition_count < 3)
+        c = 0;
+
+    if (a >= b && a >= c && a >= d) // highest score wins; ties go to the lower index
+        return 0;
+    else if (b >= c && b >= d)
+        return 1;
+    else if (c >= d)
+        return 2;
+    return 3;
+}
+
+uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
+    return SelectPartition(seed, x, y, 0, partition_count, small_block); // 2D case: z fixed at 0
+}
+
+uint ReadBit() {
+    const uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1);
+    total_bitsread++;
+    if (++bitsread == 8) {
+        // Current byte exhausted: continue with the next element of local_buff.
+        bitsread = 0;
+        current_index++;
+    }
+    return bit;
+}
+
+uint StreamBits(uint num_bits) {
+    uint value = 0; // assembled least-significant bit first
+    for (uint shift = 0; shift < num_bits; ++shift) {
+        value |= (ReadBit() & 1) << shift;
+    }
+    return value;
+}
+
+// Color endpoint bit stream: source bytes plus the read cursor used by ReadColorBit().
+uint color_endpoint_data[16]; // only the low 8 bits of each element are read
+int color_bitsread = 0; // bit position inside the current element
+uint total_color_bitsread = 0; // running count of color bits consumed
+int color_index = 0; // current element of color_endpoint_data
+
+// Texel weight bit stream: source bytes plus the read cursor used by ReadColorBit().
+uint texel_weight_data[16]; // only the low 8 bits of each element are read
+int texel_bitsread = 0; // bit position inside the current element
+uint total_texel_bitsread = 0; // running count of weight bits consumed
+int texel_index = 0; // current element of texel_weight_data
+
+bool texel_flag = false; // true: ReadColorBit()/ResultEmplaceBack() use the weight stream
+
+uint ReadColorBit() { // reads one bit from the stream selected by texel_flag
+    uint bit = 0;
+    if (texel_flag) { // weight stream
+        bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
+        texel_bitsread++;
+        total_texel_bitsread++;
+        if (texel_bitsread == 8) { // byte exhausted: advance to the next element
+            texel_index++;
+            texel_bitsread = 0;
+        }
+    } else { // color endpoint stream
+        bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
+        color_bitsread++;
+        total_color_bitsread++;
+        if (color_bitsread == 8) { // byte exhausted: advance to the next element
+            color_index++;
+            color_bitsread = 0;
+        }
+    }
+    return bit;
+}
+
+uint StreamColorBits(uint num_bits) {
+    uint value = 0; // assembled least-significant bit first
+    for (uint shift = 0; shift < num_bits; ++shift) {
+        value |= (ReadColorBit() & 1) << shift;
+    }
+    return value;
+}
+
+EncodingData result_vector[100]; // decoded color values, appended while texel_flag is false
+int result_index = 0; // number of entries written to result_vector
+
+EncodingData texel_vector[100]; // decoded weight values, appended while texel_flag is true
+int texel_vector_index = 0; // number of entries written to texel_vector
+
+void ResultEmplaceBack(EncodingData val) {
+    // texel_flag picks the destination: weight values or color endpoint values.
+    if (!texel_flag) {
+        result_vector[result_index++] = val;
+        return;
+    }
+    // Weight decoding is active.
+    texel_vector[texel_vector_index++] = val;
+}
+
+// Returns how many bits n_vals values occupy under encoding_values[encoding_index].
+uint GetBitLength(uint n_vals, uint encoding_index) {
+    const EncodingData enc = encoding_values[encoding_index];
+    uint digit_bits = 0; // bits taken by the trit/quint digits, on top of the plain bits
+    if (enc.encoding == Trit) {
+        digit_bits = (n_vals * 8 + 4) / 5;
+    } else if (enc.encoding == Quint) {
+        digit_bits = (n_vals * 7 + 2) / 3;
+    }
+    return enc.num_bits * n_vals + digit_bits;
+}
+
+uint GetNumWeightValues(uvec2 size, bool dual_plane) {
+    // One weight per grid point, doubled when a second weight plane is present.
+    const uint grid_points = size.x * size.y;
+    if (!dual_plane) {
+        return grid_points;
+    }
+    return grid_points * 2;
+}
+
+uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
+    // Bit cost of all weights under the encoding selected by max_weight.
+    return GetBitLength(GetNumWeightValues(size, dual_plane), max_weight);
+}
+
+uint BitsBracket(uint bits, uint pos) { // returns bit number pos of bits as 0 or 1
+    return ((bits >> pos) & 1);
+}
+
+uint BitsOp(uint bits, uint start, uint end) {
+    // Extracts an inclusive bit range; the bounds may be given in either order.
+    const uint low = min(start, end);
+    const uint high = max(start, end);
+    if (low == high) {
+        // Single-bit range.
+        return (bits >> low) & 1;
+    }
+    const uint width = high - low + 1;
+    const uint mask = (1 << width) - 1;
+    return (bits >> low) & mask;
+}
+
+void DecodeQuintBlock(uint num_bits) { // num_bits: plain bits stored with each of the 3 values
+    uint m[3]; // plain-bit parts of the three values
+    uint q[3]; // quint digits (0-4) of the three values
+    uint Q; // 7-bit packed quint code, interleaved with the plain bits in the stream
+    m[0] = StreamColorBits(num_bits);
+    Q = StreamColorBits(3);
+    m[1] = StreamColorBits(num_bits);
+    Q |= StreamColorBits(2) << 3;
+    m[2] = StreamColorBits(num_bits);
+    Q |= StreamColorBits(2) << 5;
+    if (BitsOp(Q, 1, 2) == 3 && BitsOp(Q, 5, 6) == 0) { // code with the first two digits both 4
+        q[0] = 4;
+        q[1] = 4;
+        q[2] = (BitsBracket(Q, 0) << 2) | ((BitsBracket(Q, 4) & ~BitsBracket(Q, 0)) << 1) |
+               (BitsBracket(Q, 3) & ~BitsBracket(Q, 0));
+    } else {
+        uint C = 0; // intermediate 5-bit code holding the first two digits
+        if (BitsOp(Q, 1, 2) == 3) {
+            q[2] = 4;
+            C = (BitsOp(Q, 3, 4) << 3) | ((~BitsOp(Q, 5, 6) & 3) << 1) | BitsBracket(Q, 0);
+        } else {
+            q[2] = BitsOp(Q, 5, 6);
+            C = BitsOp(Q, 0, 4);
+        }
+
+        if (BitsOp(C, 0, 2) == 5) {
+            q[1] = 4;
+            q[0] = BitsOp(C, 3, 4);
+        } else {
+            q[1] = BitsOp(C, 3, 4);
+            q[0] = BitsOp(C, 0, 2);
+        }
+    }
+
+    for (uint i = 0; i < 3; i++) { // append the three decoded values in order
+        EncodingData val;
+        val.encoding = Quint;
+        val.num_bits = num_bits;
+        val.bit_value = m[i];
+        val.quint_trit_value = q[i];
+        ResultEmplaceBack(val);
+    }
+}
+
+void DecodeTritBlock(uint num_bits) { // num_bits: plain bits stored with each of the 5 values
+    uint m[5]; // plain-bit parts of the five values
+    uint t[5]; // trit digits (0-2) of the five values
+    uint T; // 8-bit packed trit code, interleaved with the plain bits in the stream
+    m[0] = StreamColorBits(num_bits);
+    T = StreamColorBits(2);
+    m[1] = StreamColorBits(num_bits);
+    T |= StreamColorBits(2) << 2;
+    m[2] = StreamColorBits(num_bits);
+    T |= StreamColorBits(1) << 4;
+    m[3] = StreamColorBits(num_bits);
+    T |= StreamColorBits(2) << 5;
+    m[4] = StreamColorBits(num_bits);
+    T |= StreamColorBits(1) << 7;
+    uint C = 0; // intermediate 5-bit code holding the first three digits
+    if (BitsOp(T, 2, 4) == 7) { // code with the last two digits both 2
+        C = (BitsOp(T, 5, 7) << 2) | BitsOp(T, 0, 1);
+        t[4] = 2;
+        t[3] = 2;
+    } else {
+        C = BitsOp(T, 0, 4);
+        if (BitsOp(T, 5, 6) == 3) {
+            t[4] = 2;
+            t[3] = BitsBracket(T, 7);
+        } else {
+            t[4] = BitsBracket(T, 7);
+            t[3] = BitsOp(T, 5, 6);
+        }
+    }
+    if (BitsOp(C, 0, 1) == 3) { // unpack the first three digits from C
+        t[2] = 2;
+        t[1] = BitsBracket(C, 4);
+        t[0] = (BitsBracket(C, 3) << 1) | (BitsBracket(C, 2) & ~BitsBracket(C, 3));
+    } else if (BitsOp(C, 2, 3) == 3) {
+        t[2] = 2;
+        t[1] = 2;
+        t[0] = BitsOp(C, 0, 1);
+    } else {
+        t[2] = BitsBracket(C, 4);
+        t[1] = BitsOp(C, 2, 3);
+        t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1));
+    }
+    for (uint i = 0; i < 5; i++) { // append the five decoded values in order
+        EncodingData val;
+        val.encoding = Trit;
+        val.num_bits = num_bits;
+        val.bit_value = m[i];
+        val.quint_trit_value = t[i];
+        ResultEmplaceBack(val);
+    }
+}
+void DecodeIntegerSequence(uint max_range, uint num_values) { // decodes >= num_values values
+    EncodingData val = encoding_values[max_range]; // encoding to use for the whole sequence
+    uint vals_decoded = 0;
+    while (vals_decoded < num_values) { // quint/trit blocks may overshoot num_values
+        switch (val.encoding) {
+        case Quint:
+            DecodeQuintBlock(val.num_bits); // appends 3 values
+            vals_decoded += 3;
+            break;
+
+        case Trit:
+            DecodeTritBlock(val.num_bits); // appends 5 values
+            vals_decoded += 5;
+            break;
+
+        case JustBits:
+            val.bit_value = StreamColorBits(val.num_bits); // one plain value per iteration
+            ResultEmplaceBack(val);
+            vals_decoded++;
+            break;
+        }
+    }
+}
+
+void DecodeColorValues(out uint color_values[32], uvec4 modes, uint num_partitions,
+                       uint color_data_bits) { // fills color_values with unquantized 8-bit values
+    uint num_values = 0; // total endpoint values needed by all partitions
+    for (uint i = 0; i < num_partitions; i++) {
+        num_values += ((modes[i] >> 2) + 1) << 1; // 2, 4, 6 or 8 values per endpoint mode
+    }
+    int range = 256; // find the largest range whose encoding fits in color_data_bits
+    while (--range > 0) {
+        EncodingData val = encoding_values[range];
+        uint bitLength = GetBitLength(num_values, range);
+        if (bitLength <= color_data_bits) {
+            while (--range > 0) { // walk down to the first table entry with this same encoding
+                EncodingData newval = encoding_values[range];
+                if (newval.encoding != val.encoding || newval.num_bits != val.num_bits) {
+                    break; // either field differing means the encoding no longer matches
+                }
+            }
+            range++; // step back onto the last entry that still matched
+            break;
+        }
+    }
+    DecodeIntegerSequence(range, num_values); // appends the raw values to result_vector
+    uint out_index = 0;
+    for (int itr = 0; itr < result_index; itr++) {
+        if (out_index >= num_values) { // ignore surplus values from a partial trit/quint block
+            break;
+        }
+        EncodingData val = result_vector[itr];
+        uint bitlen = val.num_bits;
+        uint bitval = val.bit_value;
+        uint A = 0, B = 0, C = 0, D = 0; // unquantization terms: result derives from D * C + B and A
+        A = ReplicateBitTo9((bitval & 1)); // lowest plain bit replicated over 9 bits
+        switch (val.encoding) {
+        case JustBits:
+            color_values[out_index++] = FastReplicateTo8(bitval, bitlen);
+            break;
+        case Trit: {
+            D = val.quint_trit_value;
+            switch (bitlen) { // C and B depend on the number of plain bits
+            case 1: {
+                C = 204;
+            } break;
+            case 2: {
+                C = 93;
+                uint b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
+            } break;
+
+            case 3: {
+                C = 44;
+                uint cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 2) | cb;
+            } break;
+
+            case 4: {
+                C = 22;
+                uint dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | dcb;
+            } break;
+
+            case 5: {
+                C = 11;
+                uint edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 2);
+            } break;
+
+            case 6: {
+                C = 5;
+                uint fedcb = (bitval >> 1) & 0x1F;
+                B = (fedcb << 4) | (fedcb >> 4);
+            } break;
+            }
+        } break;
+        case Quint: {
+            D = val.quint_trit_value;
+            switch (bitlen) { // C and B depend on the number of plain bits
+            case 1: {
+                C = 113;
+            } break;
+            case 2: {
+                C = 54;
+                uint b = (bitval >> 1) & 1;
+                B = (b << 8) | (b << 3) | (b << 2);
+            } break;
+            case 3: {
+                C = 26;
+                uint cb = (bitval >> 1) & 3;
+                B = (cb << 7) | (cb << 1) | (cb >> 1);
+            } break;
+            case 4: {
+                C = 13;
+                uint dcb = (bitval >> 1) & 7;
+                B = (dcb << 6) | (dcb >> 1);
+            } break;
+            case 5: {
+                C = 6;
+                uint edcb = (bitval >> 1) & 0xF;
+                B = (edcb << 5) | (edcb >> 3);
+            } break;
+            }
+        } break;
+        }
+
+        if (val.encoding != JustBits) { // trit/quint values: combine the terms into 8 bits
+            uint T = (D * C) + B;
+            T ^= A;
+            T = (A & 0x80) | (T >> 2);
+            color_values[out_index++] = T;
+        }
+    }
+}
+ivec2 BitTransferSigned(int a, int b) {
+    // New b: shifted down one bit, with bit 7 of a moved into its bit 7.
+    const int b_out = (b >> 1) | (a & 0x80);
+    // New a: bits [6:1] of a, read as a 6-bit two's-complement number.
+    int a_out = (a >> 1) & 0x3F;
+    if ((a_out & 0x20) != 0) {
+        a_out -= 0x40;
+    }
+    // Component 0 carries the new a, component 1 the new b.
+    return ivec2(a_out, b_out);
+}
+
+uvec4 ClampByte(ivec4 color) {
+    // Saturate every component to the unsigned byte range [0, 255].
+    const ivec4 lower = ivec4(0);
+    const ivec4 upper = ivec4(255);
+    return uvec4(clamp(color, lower, upper));
+}
+ivec4 BlueContract(int a, int r, int g, int b) { // averages r and g with b; a and b pass through
+    return ivec4(a, (r + b) >> 1, (g + b) >> 1, b); // component order is (a, r, g, b)
+}
+int colvals_index = 0; // read cursor into color_values, shared across ComputeEndpoints() calls
+void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_values[32],
+                      uint color_endpoint_mode) { // endpoints are written in (a, r, g, b) order
+#define READ_UINT_VALUES(N)                                                                        \
+    uint v[N];                                                                                     \
+    for (uint i = 0; i < N; i++) {                                                                 \
+        v[i] = color_values[colvals_index++];                                                      \
+    }
+
+#define READ_INT_VALUES(N)                                                                         \
+    int v[N];                                                                                      \
+    for (uint i = 0; i < N; i++) {                                                                 \
+        v[i] = int(color_values[colvals_index++]);                                                 \
+    }
+
+    switch (color_endpoint_mode) { // modes without a case leave ep1/ep2 and the cursor untouched
+    case 0: { // luminance, direct
+        READ_UINT_VALUES(2)
+        ep1 = uvec4(0xFF, v[0], v[0], v[0]);
+        ep2 = uvec4(0xFF, v[1], v[1], v[1]);
+    } break;
+
+    case 1: { // luminance, base + delta
+        READ_UINT_VALUES(2)
+        uint L0 = (v[0] >> 2) | (v[1] & 0xC0);
+        uint L1 = min(L0 + (v[1] & 0x3F), 0xFFU); // saturate base + delta at 255
+        ep1 = uvec4(0xFF, L0, L0, L0);
+        ep2 = uvec4(0xFF, L1, L1, L1);
+    } break;
+
+    case 4: { // luminance + alpha, direct
+        READ_UINT_VALUES(4)
+        ep1 = uvec4(v[2], v[0], v[0], v[0]);
+        ep2 = uvec4(v[3], v[1], v[1], v[1]);
+    } break;
+
+    case 5: { // luminance + alpha, base + offset
+        READ_INT_VALUES(4)
+        ivec2 transferred = BitTransferSigned(v[1], v[0]);
+        v[1] = transferred[0];
+        v[0] = transferred[1];
+        transferred = BitTransferSigned(v[3], v[2]);
+        v[3] = transferred[0];
+        v[2] = transferred[1];
+        ep1 = ClampByte(ivec4(v[2], v[0], v[0], v[0]));
+        ep2 = ClampByte(ivec4((v[2] + v[3]), v[0] + v[1], v[0] + v[1], v[0] + v[1]));
+    } break;
+
+    case 6: { // RGB, base + scale: ep1 is ep2 scaled by v[3] / 256
+        READ_UINT_VALUES(4)
+        ep1 = uvec4(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = uvec4(0xFF, v[0], v[1], v[2]);
+    } break;
+
+    case 8: { // RGB, direct
+        READ_UINT_VALUES(6)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = uvec4(0xFF, v[0], v[2], v[4]);
+            ep2 = uvec4(0xFF, v[1], v[3], v[5]);
+        } else { // second sum is smaller: swap the endpoints and apply blue contraction
+            ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5])));
+            ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4])));
+        }
+    } break;
+
+    case 9: { // RGB, base + offset
+        READ_INT_VALUES(6)
+        ivec2 transferred = BitTransferSigned(v[1], v[0]);
+        v[1] = transferred[0];
+        v[0] = transferred[1];
+        transferred = BitTransferSigned(v[3], v[2]);
+        v[3] = transferred[0];
+        v[2] = transferred[1];
+        transferred = BitTransferSigned(v[5], v[4]);
+        v[5] = transferred[0];
+        v[4] = transferred[1];
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4]));
+            ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+        } else { // negative offset sum: swap the endpoints and apply blue contraction
+            ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+            ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4]));
+        }
+    } break;
+
+    case 10: { // RGB base + scale, with two direct alpha values
+        READ_UINT_VALUES(6)
+        ep1 = uvec4(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+        ep2 = uvec4(v[5], v[0], v[1], v[2]);
+    } break;
+
+    case 12: { // RGBA, direct
+        READ_UINT_VALUES(8)
+        if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+            ep1 = uvec4(v[6], v[0], v[2], v[4]);
+            ep2 = uvec4(v[7], v[1], v[3], v[5]);
+        } else { // second sum is smaller: swap the endpoints and apply blue contraction
+            ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5])));
+            ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4])));
+        }
+    } break;
+
+    case 13: { // RGBA, base + offset
+        READ_INT_VALUES(8)
+        ivec2 transferred = BitTransferSigned(v[1], v[0]);
+        v[1] = transferred[0];
+        v[0] = transferred[1];
+        transferred = BitTransferSigned(v[3], v[2]);
+        v[3] = transferred[0];
+        v[2] = transferred[1];
+
+        transferred = BitTransferSigned(v[5], v[4]);
+        v[5] = transferred[0];
+        v[4] = transferred[1];
+
+        transferred = BitTransferSigned(v[7], v[6]);
+        v[7] = transferred[0];
+        v[6] = transferred[1];
+
+        if (v[1] + v[3] + v[5] >= 0) {
+            ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4]));
+            ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+        } else { // negative offset sum: swap the endpoints and apply blue contraction
+            ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
+            ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4]));
+        }
+    } break;
+    }
+#undef READ_UINT_VALUES
+#undef READ_INT_VALUES
+}
+
+uint UnquantizeTexelWeight(EncodingData val) { // maps one decoded weight onto 0..64
+    uint bitval = val.bit_value;
+    uint bitlen = val.num_bits;
+    uint A = ReplicateBitTo7((bitval & 1)); // lowest plain bit replicated over 7 bits
+    uint B = 0, C = 0, D = 0; // unquantization terms: result derives from D * C + B and A
+    uint result = 0;
+    switch (val.encoding) {
+    case JustBits:
+        result = FastReplicateTo6(bitval, bitlen);
+        break;
+    case Trit: {
+        D = val.quint_trit_value;
+        switch (bitlen) {
+        case 0: { // digit only: direct lookup
+            uint results[3] = {0, 32, 63};
+            result = results[D];
+        } break;
+        case 1: {
+            C = 50;
+        } break;
+        case 2: {
+            C = 23;
+            uint b = (bitval >> 1) & 1;
+            B = (b << 6) | (b << 2) | b;
+        } break;
+        case 3: {
+            C = 11;
+            uint cb = (bitval >> 1) & 3;
+            B = (cb << 5) | cb;
+        } break;
+        default:
+            break;
+        }
+    } break;
+    case Quint: {
+        D = val.quint_trit_value;
+        switch (bitlen) {
+        case 0: { // digit only: direct lookup
+            uint results[5] = {0, 16, 32, 47, 63};
+            result = results[D];
+        } break;
+        case 1: {
+            C = 28;
+        } break;
+        case 2: {
+            C = 13;
+            uint b = (bitval >> 1) & 1;
+            B = (b << 6) | (b << 1);
+        } break;
+        }
+    } break;
+    }
+    if (val.encoding != JustBits && bitlen > 0) { // digit plus plain bits: combine the terms
+        result = D * C + B;
+        result ^= A;
+        result = (A & 0x20) | (result >> 2);
+    }
+    if (result > 32) { // stretch 0..63 to 0..64 by skipping one step above the midpoint
+        result += 1;
+    }
+    return result;
+}
+
+void UnquantizeTexelWeights(out uint outbuffer[2][144], bool dual_plane, uvec2 size) {
+    uint weight_idx = 0; // next slot of the weight grid to fill
+    uint unquantized[2][144]; // per-plane grid weights; outbuffer receives per-texel weights
+    uint area = size.x * size.y; // number of grid points per plane
+    for (uint itr = 0; itr < texel_vector_index; itr++) {
+        unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
+        if (dual_plane) { // planes are interleaved: the next entry belongs to plane 1
+            ++itr;
+            unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
+            if (itr == texel_vector_index) { // NOTE(review): checked only after the read above
+                break;
+            }
+        }
+        if (++weight_idx >= (area))
+            break;
+    }
+    uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1)); // per-texel step along s
+    uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1)); // per-texel step along t
+    uint kPlaneScale = dual_plane ? 2 : 1; // number of planes to produce
+    for (uint plane = 0; plane < kPlaneScale; plane++)
+        for (uint t = 0; t < block_dims.y; t++)
+            for (uint s = 0; s < block_dims.x; s++) {
+                uint cs = Ds * s;
+                uint ct = Dt * t;
+                uint gs = (cs * (size.x - 1) + 32) >> 6; // grid position with 4 fractional bits
+                uint gt = (ct * (size.y - 1) + 32) >> 6;
+                uint js = gs >> 4; // integer grid coordinate
+                uint fs = gs & 0xF; // fractional part in sixteenths
+                uint jt = gt >> 4;
+                uint ft = gt & 0x0F;
+                uint w11 = (fs * ft + 8) >> 4; // the four blend factors always sum to 16
+                uint w10 = ft - w11;
+                uint w01 = fs - w11;
+                uint w00 = 16 - fs - ft + w11;
+                uvec4 w = uvec4(w00, w01, w10, w11);
+                uint v0 = jt * size.x + js; // index of the first of the four grid weights
+
+                uvec4 p = uvec4(0); // neighbours at or beyond index area contribute zero
+                if (v0 < area) {
+                    p.x = unquantized[plane][v0];
+                }
+                if ((v0 + 1) < (area)) {
+                    p.y = unquantized[plane][v0 + 1];
+                }
+                if ((v0 + size.x) < (area)) {
+                    p.z = unquantized[plane][(v0 + size.x)];
+                }
+                if ((v0 + size.x + 1) < (area)) {
+                    p.w = unquantized[plane][(v0 + size.x + 1)];
+                }
+                outbuffer[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4; // rounded /16
+            }
+}
+
+int FindLayout(uint mode) {
+    // Only these block-mode bits take part in choosing the layout.
+    const bool low_bits_set = (mode & 3) != 0;
+    const bool bit2 = (mode & 4) != 0;
+    const bool bit3 = (mode & 8) != 0;
+    const bool bit5 = (mode & 0x20) != 0;
+    const bool bit7 = (mode & 0x80) != 0;
+    const bool bit8 = (mode & 0x100) != 0;
+
+    if (low_bits_set) {
+        // Layouts 0-4: at least one of bits 0-1 is set.
+        if (!bit3) {
+            return bit2 ? 1 : 0;
+        }
+        if (!bit2) {
+            return 2;
+        }
+        return bit8 ? 4 : 3;
+    }
+
+    // Layouts 5-9: bits 0-1 are both clear.
+    if (!bit8) {
+        return bit7 ? 6 : 5;
+    }
+    if (!bit7) {
+        return 9;
+    }
+    // Bits 7 and 8 both set: bit 5 separates the last two layouts.
+    return bit5 ? 8 : 7;
+}
+
+TexelWeightParams DecodeBlockInfo(uint block_index) { // block_index is not used in the body
+    TexelWeightParams params = TexelWeightParams(uvec2(0), false, 0, false, false, false);
+    uint mode = StreamBits(11); // the block mode is the first 11 bits of the block
+    if ((mode & 0x1ff) == 0x1fc) { // void-extent pattern in the low nine bits
+        if ((mode & 0x200) != 0) { // bit 9 selects HDR
+            params.VoidExtentHDR = true;
+        } else {
+            params.VoidExtentLDR = true;
+        }
+        if ((mode & 0x400) == 0 || StreamBits(1) == 0) { // bit 10 and the next bit must be 1
+            params.Error = true;
+        }
+        return params;
+    }
+    if ((mode & 0xf) == 0) { // low four bits all zero: treated as an error
+        params.Error = true;
+        return params;
+    }
+    if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) { // bits 0-1 clear with bits 6-8 all set
+        params.Error = true;
+        return params;
+    }
+    uint A, B; // small fields of the mode; where they sit depends on the layout
+    uint mode_layout = FindLayout(mode);
+    switch (mode_layout) { // each layout maps A and B to the weight grid size differently
+    case 0:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x3;
+        params.size = uvec2(B + 4, A + 2);
+        break;
+    case 1:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x3;
+        params.size = uvec2(B + 8, A + 2);
+        break;
+    case 2:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x3;
+        params.size = uvec2(A + 2, B + 8);
+        break;
+    case 3:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x1;
+        params.size = uvec2(A + 2, B + 6);
+        break;
+    case 4:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 7) & 0x1;
+        params.size = uvec2(B + 2, A + 2);
+        break;
+    case 5:
+        A = (mode >> 5) & 0x3;
+        params.size = uvec2(12, A + 2);
+        break;
+    case 6:
+        A = (mode >> 5) & 0x3;
+        params.size = uvec2(A + 2, 12);
+        break;
+    case 7:
+        params.size = uvec2(6, 10);
+        break;
+    case 8:
+        params.size = uvec2(10, 6);
+        break;
+    case 9:
+        A = (mode >> 5) & 0x3;
+        B = (mode >> 9) & 0x3;
+        params.size = uvec2(A + 6, B + 6);
+        break;
+    default: // not reachable with the values FindLayout() returns
+        params.Error = true;
+        break;
+    }
+    params.dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0); // bit 10, except layout 9
+    uint weight_index = (mode & 0x10) != 0 ? 1 : 0; // bit 4 is the low bit of the index
+    if (mode_layout < 5) {
+        weight_index |= (mode & 0x3) << 1; // upper index bits come from mode bits 0-1
+    } else {
+        weight_index |= (mode & 0xc) >> 1; // upper index bits come from mode bits 2-3
+    }
+    weight_index -= 2; // the checks above leave 2..7 here, so this yields 0..5
+    if ((mode_layout != 9) && ((mode & 0x200) != 0)) { // bit 9 selects the larger table
+        const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31);
+        params.max_weight = max_weights[weight_index];
+    } else {
+        const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7);
+        params.max_weight = max_weights[weight_index];
+    }
+    return params;
+}
+
+// Marks an undecodable block: every texel of the block at coord.xy is set to (1, 1, 0, 1).
+void FillError(ivec3 coord) {
+    for (uint y = 0; y < block_dims.y; y++) {
+        for (uint x = 0; x < block_dims.x; x++) {
+            imageStore(dest_image, coord.xy + ivec2(x, y), vec4(1.0, 1.0, 0.0, 1.0));
+        }
+    }
+}
+
+// Fills the block at coord.xy with the single color carried by an LDR void-extent block.
+void FillVoidExtentLDR(ivec3 coord, uint block_index) {
+    // Discard four 13-bit fields; their values are not needed to fill the block.
+    for (int field = 0; field < 4; field++) {
+        StreamBits(13);
+    }
+    // The stream order is R, G, B, A; each channel is 16 bits wide.
+    uvec4 raw;
+    raw.r = StreamBits(16);
+    raw.g = StreamBits(16);
+    raw.b = StreamBits(16);
+    raw.a = StreamBits(16);
+    const vec4 color = vec4(raw) / 65535.0f;
+    for (uint y = 0; y < block_dims.y; y++) {
+        for (uint x = 0; x < block_dims.x; x++) {
+            imageStore(dest_image, coord.xy + ivec2(x, y), color);
+        }
+    }
+}
+
+// Decodes the 128-bit ASTC block staged in local_buff and writes its texels to dest_image,
+// starting at coord.xy. Invalid blocks, HDR void-extent blocks and blocks whose header needs
+// more than 128 bits are painted by FillError instead.
+void DecompressBlock(ivec3 coord, uint block_index) {
+    TexelWeightParams params = DecodeBlockInfo(block_index);
+    if (params.Error || params.VoidExtentHDR) {
+        FillError(coord);
+        return;
+    }
+    if (params.VoidExtentLDR) {
+        FillVoidExtentLDR(coord, block_index);
+        return;
+    }
+    if (params.size.x > block_dims.x || params.size.y > block_dims.y) {
+        FillError(coord);
+        return;
+    }
+    uint num_partitions = StreamBits(2) + 1;
+    if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) {
+        FillError(coord);
+        return;
+    }
+    uint partition_index = 1;
+    uvec4 color_endpoint_mode = uvec4(0);
+    uint ced_pointer = 0;
+    uint base_cem = 0;
+    if (num_partitions == 1) {
+        color_endpoint_mode[0] = StreamBits(4);
+        partition_index = 0;
+    } else {
+        partition_index = StreamBits(10);
+        base_cem = StreamBits(6);
+    }
+    uint base_mode = base_cem & 3;
+    uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight);
+    uint extra_cem_bits = 0;
+    if (base_mode > 0) {
+        switch (num_partitions) {
+        case 2:
+            extra_cem_bits += 2;
+            break;
+        case 3:
+            extra_cem_bits += 5;
+            break;
+        case 4:
+            extra_cem_bits += 8;
+            break;
+        default: // Unreachable: base_cem stays 0 when there is a single partition
+            return;
+        }
+    }
+    uint plane_selector_bits = params.dual_plane ? 2 : 0;
+    // Bits that are not color data. If they exceed the 128-bit block, the unsigned
+    // subtraction below would wrap around and the read loop would run for billions of bits,
+    // writing far past the end of color_endpoint_data, so treat that as an error block.
+    uint reserved_bits = weight_bits + total_bitsread + extra_cem_bits + plane_selector_bits;
+    if (reserved_bits > 128) {
+        FillError(coord);
+        return;
+    }
+    uint remaining_bits = 128 - reserved_bits;
+    // Read color data...
+    uint color_data_bits = remaining_bits;
+    while (remaining_bits > 0) {
+        uint nb = min(remaining_bits, 8);
+        uint b = StreamBits(nb);
+        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, 8));
+        ced_pointer++;
+        remaining_bits -= nb;
+    }
+    int plane_index = int(StreamBits(plane_selector_bits));
+    if (base_mode > 0) { // Per-partition modes: one class bit C and two mode bits M each
+        uint extra_cem = StreamBits(extra_cem_bits);
+        uint cem = (extra_cem << 6) | base_cem;
+        cem >>= 2;
+        uint C[4] = {0, 0, 0, 0};
+        for (uint i = 0; i < num_partitions; i++) {
+            C[i] = cem & 1;
+            cem >>= 1;
+        }
+        uint M[4] = {0, 0, 0, 0};
+        for (uint i = 0; i < num_partitions; i++) {
+            M[i] = cem & 3;
+            cem >>= 2;
+        }
+        for (uint i = 0; i < num_partitions; i++) {
+            color_endpoint_mode[i] = base_mode;
+            if ((C[i]) == 0) {
+                color_endpoint_mode[i] -= 1;
+            }
+            color_endpoint_mode[i] <<= 2;
+            color_endpoint_mode[i] |= M[i];
+        }
+    } else if (num_partitions > 1) { // All partitions share the mode in base_cem bits 2-5
+        uint cem = base_cem >> 2;
+        for (uint i = 0; i < num_partitions; i++) {
+            color_endpoint_mode[i] = cem;
+        }
+    }
+
+    uint color_values[32]; // Four values, two endpoints, four maximum partitions
+    DecodeColorValues(color_values, color_endpoint_mode, num_partitions, color_data_bits);
+    uvec4 endpoints[4][2];
+    for (uint i = 0; i < num_partitions; i++) {
+        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_values, color_endpoint_mode[i]);
+    }
+    for (uint i = 0; i < 16; i++) {
+        texel_weight_data[i] = local_buff[i];
+    }
+    // Reverse the block end to end: swap bytes i and 15 - i and reverse the bits of each byte.
+    for (uint i = 0; i < 8; i++) {
+#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16
+        uint a = REVERSE_BYTE(texel_weight_data[i]);
+        uint b = REVERSE_BYTE(texel_weight_data[15 - i]);
+#undef REVERSE_BYTE
+        texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8));
+        texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8));
+    }
+    uint clear_byte_start =
+        (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1;
+    texel_weight_data[clear_byte_start - 1] =
+        texel_weight_data[clear_byte_start - 1] &
+        uint(
+            ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1));
+    for (uint i = 0; i < 16 - clear_byte_start; i++) { // Zero every byte past the weight bits
+        texel_weight_data[clear_byte_start + i] = uint(0U);
+    }
+    texel_flag = true; // use texel "vector" and bit stream in integer decoding
+    DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
+    uint weights[2][144];
+    UnquantizeTexelWeights(weights, params.dual_plane, params.size);
+    for (uint j = 0; j < block_dims.y; j++) {
+        for (uint i = 0; i < block_dims.x; i++) {
+            uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
+                                                     (block_dims.y * block_dims.x) < 32);
+            uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
+            uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
+            uvec4 plane_vec = uvec4(0);
+            uvec4 weight_vec = uvec4(0);
+            for (uint c = 0; c < 4; c++) {
+                if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
+                    plane_vec[c] = 1; // This channel takes its weight from the second plane
+                }
+                weight_vec[c] = weights[plane_vec[c]][j * block_dims.x + i];
+            }
+            vec4 Cf = vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) >> 6);
+            vec4 p = (Cf / 65535.0);
+            imageStore(dest_image, coord.xy + ivec2(i, j), p.gbar); // Components 1, 2, 3, 0
+        }
+    }
+}
+
+// Shader entry point: each invocation fetches one compressed block and decodes it.
+void main() {
+    uvec3 pos = gl_GlobalInvocationID + origin;
+    pos.x <<= bytes_per_block_log2;
+    // Read as soon as possible due to its latency
+    const uint swizzle = SwizzleOffset(pos.xy);
+
+    // Input offset of the block: layer base, GOB row terms, GOB column term, then swizzle.
+    const uint gob_row = pos.y >> GOB_SIZE_Y_SHIFT;
+    const uint offset = layer * layer_stride + (gob_row >> block_height) * block_size +
+                        ((gob_row & block_height_mask) << GOB_SIZE_SHIFT) +
+                        ((pos.x >> GOB_SIZE_X_SHIFT) << x_shift) + swizzle;
+
+    // Output position of the block: its destination block coordinate scaled by block_dims.
+    const ivec3 dst_block = ivec3(gl_GlobalInvocationID + destination);
+    const ivec3 coord = ivec3(dst_block * uvec3(block_dims, 1.0));
+    uint block_index =
+        layer * num_image_blocks.x * num_image_blocks.y + pos.y * num_image_blocks.x + pos.x;
+
+    // Clear current_index and bitsread, then stage the 16 input values of this block.
+    current_index = 0;
+    bitsread = 0;
+    for (int n = 0; n < 16; n++) {
+        local_buff[n] = ReadTexel(offset + n);
+    }
+    DecompressBlock(coord, block_index);
+}
-- 
cgit v1.2.3