// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450
// Backend abstraction macros.
// When VULKAN is defined the shader parameters are declared inside a push-constant block;
// otherwise each parameter becomes a standalone uniform with an explicit location.
#ifdef VULKAN
// Opens / closes the push-constant block that wraps the parameter list.
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
// Members of the push-constant block take no per-member qualifier.
#define UNIFORM(n)
// The input buffer and output image use distinct binding indices here.
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
// No enclosing block: the parameter list expands to plain uniforms.
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
// Both resources use binding index 0 on this path.
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0
#endif
// 8x8x1 invocations per workgroup; each invocation decodes one ASTC block (see main()).
layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
// Shader parameters (push constants or uniforms depending on the backend macros).
BEGIN_PUSH_CONSTANTS
// Size of one ASTC block in texels (x = width, y = height); used as the per-block loop
// bounds and to scale the invocation ID into destination texel coordinates.
UNIFORM(1) uvec2 block_dims;
// Multiplied by the invocation's z index when computing the input offset.
UNIFORM(2) uint layer_stride;
// The remaining values feed the swizzled input offset computation in main().
UNIFORM(3) uint block_size;
UNIFORM(4) uint x_shift;
UNIFORM(5) uint block_height;
UNIFORM(6) uint block_height_mask;
END_PUSH_CONSTANTS
// One decoded integer-sequence element packed into a single 32-bit word.
// Field layout, as read and written by the accessor helpers:
//   bits  0..7   encoding kind (JUST_BITS, QUINT or TRIT)
//   bits  8..15  number of plain bits
//   bits 16..23  plain bit value
//   bits 24..31  quint or trit value
struct EncodingData {
uint data;
};
// Weight-grid parameters extracted from the block mode by DecodeBlockInfo().
struct TexelWeightParams {
// Weight grid dimensions (x = width, y = height).
uvec2 size;
// Index into encoding_values selecting how the weights are encoded.
uint max_weight;
// True when the block carries two weights per grid point.
bool dual_plane;
};
// Compressed input: one uvec4 (128 bits) per ASTC block, read-only.
layout(binding = BINDING_INPUT_BUFFER, std430) readonly restrict buffer InputBufferU32 {
uvec4 astc_data[];
};
// Decoded output: write-only RGBA8 2D array image.
layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly restrict image2DArray dest_image;
// Shifts used by main() when converting a block position into a swizzled byte offset.
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
// log2 of the bytes per compressed block (16 bytes); main() shifts the x position by this.
const uint BYTES_PER_BLOCK_LOG2 = 4;
// Encoding kinds stored in the low byte of an EncodingData word.
const uint JUST_BITS = 0u;
const uint QUINT = 1u;
const uint TRIT = 2u;
// ASTC Encodings data, sorted in ascending order based on their BitLength value
// (see GetBitLength() function)
// Each entry packs the encoding kind in bits 0..7 and the plain bit count in bits 8..15.
// Entry i lives at encoding_values[i / 4][i % 4]; the final two slots are zero padding.
const uvec4 encoding_values[6] = uvec4[](
uvec4((JUST_BITS), (JUST_BITS | (1u << 8u)), (TRIT), (JUST_BITS | (2u << 8u))),
uvec4((QUINT), (TRIT | (1u << 8u)), (JUST_BITS | (3u << 8u)), (QUINT | (1u << 8u))),
uvec4((TRIT | (2u << 8u)), (JUST_BITS | (4u << 8u)), (QUINT | (2u << 8u)), (TRIT | (3u << 8u))),
uvec4((JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)), (JUST_BITS | (6u << 8u))),
uvec4((QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)), (QUINT | (5u << 8u))),
uvec4((TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)), 0u, 0u));
// Input ASTC texture globals
// Read cursor, in bits, into local_buff; advanced by StreamBits() and SkipBits().
int total_bitsread = 0;
// The 128-bit compressed block this invocation is decoding (loaded in main()).
uvec4 local_buff;

// Color data globals
// Bit stream consumed by StreamColorBits(); DecompressBlock() fills it first with the
// color endpoint bits and later with the bit-reversed weight bits.
uvec4 color_endpoint_data;
// Read cursor, in bits, into color_endpoint_data.
int color_bitsread = 0;

// Global "vector" to be pushed into when decoding
// At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode
// At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode
// So the maximum would be 144 (12 x 12) elements, x 2 for two planes
// Arguments and result are fully parenthesized so the macro stays correct when it is
// given compound expressions or used inside a larger expression.
#define DIVCEIL(number, divisor) (((number) + (divisor) - 1) / (divisor))
#define ARRAY_NUM_ELEMENTS 144
#define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4)
// Packed EncodingData words, four per uvec4; element i is result_vector[i / 4][i % 4].
uvec4 result_vector[VECTOR_ARRAY_SIZE];
// Number of elements currently stored in result_vector.
uint result_index = 0;
// Capacity limit for the current decode phase, enforced by ResultEmplaceBack().
uint result_vector_max_index;
// Set by ResultEmplaceBack() once a push was rejected because the limit was hit.
bool result_limit_reached = false;
// EncodingData helpers
// Returns the encoding kind held in bits 0..7 of the packed word.
uint Encoding(EncodingData val) {
    return val.data & 0xFFu;
}
// Returns the plain bit count held in bits 8..15 of the packed word.
uint NumBits(EncodingData val) {
    return (val.data >> 8u) & 0xFFu;
}
// Returns the plain bit value held in bits 16..23 of the packed word.
uint BitValue(EncodingData val) {
    return (val.data >> 16u) & 0xFFu;
}
// Returns the quint/trit value held in bits 24..31 of the packed word.
uint QuintTritValue(EncodingData val) {
    return (val.data >> 24u) & 0xFFu;
}
// Overwrites bits 0..7 of the packed word with the low byte of v.
void Encoding(inout EncodingData val, uint v) {
    const uint field_mask = 0xFFu;
    val.data = (val.data & ~field_mask) | (v & 0xFFu);
}
// Overwrites bits 8..15 of the packed word with the low byte of v.
void NumBits(inout EncodingData val, uint v) {
    const uint field_mask = 0xFFu << 8u;
    val.data = (val.data & ~field_mask) | ((v & 0xFFu) << 8u);
}
// Overwrites bits 16..23 of the packed word with the low byte of v.
void BitValue(inout EncodingData val, uint v) {
    const uint field_mask = 0xFFu << 16u;
    val.data = (val.data & ~field_mask) | ((v & 0xFFu) << 16u);
}
// Overwrites bits 24..31 of the packed word with the low byte of v.
void QuintTritValue(inout EncodingData val, uint v) {
    const uint field_mask = 0xFFu << 24u;
    val.data = (val.data & ~field_mask) | ((v & 0xFFu) << 24u);
}
// Packs the four fields into one EncodingData word.
// The inputs are shifted into place but not masked, so callers are expected to pass
// values that fit in 8 bits each.
EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint quint_trit_val) {
    uint packed = encoding;
    packed |= num_bits << 8u;
    packed |= bit_val << 16u;
    packed |= quint_trit_val << 24u;
    return EncodingData(packed);
}
// Appends val to result_vector unless the current phase limit has been reached.
// When the limit is hit nothing is stored and result_limit_reached is raised so that
// callers can stop decoding more than this phase needs.
void ResultEmplaceBack(EncodingData val) {
    if (result_index < result_vector_max_index) {
        // Four packed words per uvec4: slot = index / 4, lane = index % 4.
        result_vector[result_index >> 2u][result_index & 3u] = val.data;
        result_index++;
    } else {
        result_limit_reached = true;
    }
}
// Multiplies every component by 0x101 (257), which copies a byte-sized value into
// both bytes of a 16-bit result.
uvec4 ReplicateByteTo16(uvec4 value) {
    const uint byte_to_word_scale = 257u;
    return value * byte_to_word_scale;
}
// Multiplies value by 127 (0x7F); for a single-bit input this yields 0 or seven set bits.
uint ReplicateBitTo7(uint value) {
    const uint seven_ones = 0x7Fu;
    return value * seven_ones;
}
// Multiplies value by 511 (0x1FF); for a single-bit input this yields 0 or nine set bits.
uint ReplicateBitTo9(uint value) {
    const uint nine_ones = 0x1FFu;
    return value * nine_ones;
}
// Expands a num_bits-wide value to 8 bits using per-width lookup tables.
//   value    - the quantized input; zero always maps to zero.
//   num_bits - width of the input; widths 2..7 use a table, width 1 maps any non-zero
//              value to 255, and any other width returns value unchanged.
// Table entries are addressed as table[value / 4][value % 4]; the caller must pass a
// value that fits in num_bits, otherwise the table index is out of range.
uint FastReplicateTo8(uint value, uint num_bits) {
if (value == 0) {
return 0;
}
// Slot and lane of the table entry for this value.
const uint array_index = value / 4;
const uint vector_index = bitfieldExtract(value, 0, 2);
switch (num_bits) {
case 1:
// The only non-zero 1-bit value is 1.
return 255;
case 2: {
const uvec4 REPLICATE_2_BIT_TO_8_TABLE = (uvec4(0, 85, 170, 255));
return REPLICATE_2_BIT_TO_8_TABLE[vector_index];
}
case 3: {
const uvec4 REPLICATE_3_BIT_TO_8_TABLE[2] =
uvec4[](uvec4(0, 36, 73, 109), uvec4(146, 182, 219, 255));
return REPLICATE_3_BIT_TO_8_TABLE[array_index][vector_index];
}
case 4: {
const uvec4 REPLICATE_4_BIT_TO_8_TABLE[4] =
uvec4[](uvec4(0, 17, 34, 51), uvec4(68, 85, 102, 119), uvec4(136, 153, 170, 187),
uvec4(204, 221, 238, 255));
return REPLICATE_4_BIT_TO_8_TABLE[array_index][vector_index];
}
case 5: {
const uvec4 REPLICATE_5_BIT_TO_8_TABLE[8] =
uvec4[](uvec4(0, 8, 16, 24), uvec4(33, 41, 49, 57), uvec4(66, 74, 82, 90),
uvec4(99, 107, 115, 123), uvec4(132, 140, 148, 156), uvec4(165, 173, 181, 189),
uvec4(198, 206, 214, 222), uvec4(231, 239, 247, 255));
return REPLICATE_5_BIT_TO_8_TABLE[array_index][vector_index];
}
case 6: {
const uvec4 REPLICATE_6_BIT_TO_8_TABLE[16] = uvec4[](
uvec4(0, 4, 8, 12), uvec4(16, 20, 24, 28), uvec4(32, 36, 40, 44), uvec4(48, 52, 56, 60),
uvec4(65, 69, 73, 77), uvec4(81, 85, 89, 93), uvec4(97, 101, 105, 109),
uvec4(113, 117, 121, 125), uvec4(130, 134, 138, 142), uvec4(146, 150, 154, 158),
uvec4(162, 166, 170, 174), uvec4(178, 182, 186, 190), uvec4(195, 199, 203, 207),
uvec4(211, 215, 219, 223), uvec4(227, 231, 235, 239), uvec4(243, 247, 251, 255));
return REPLICATE_6_BIT_TO_8_TABLE[array_index][vector_index];
}
case 7: {
const uvec4 REPLICATE_7_BIT_TO_8_TABLE[32] =
uvec4[](uvec4(0, 2, 4, 6), uvec4(8, 10, 12, 14), uvec4(16, 18, 20, 22),
uvec4(24, 26, 28, 30), uvec4(32, 34, 36, 38), uvec4(40, 42, 44, 46),
uvec4(48, 50, 52, 54), uvec4(56, 58, 60, 62), uvec4(64, 66, 68, 70),
uvec4(72, 74, 76, 78), uvec4(80, 82, 84, 86), uvec4(88, 90, 92, 94),
uvec4(96, 98, 100, 102), uvec4(104, 106, 108, 110), uvec4(112, 114, 116, 118),
uvec4(120, 122, 124, 126), uvec4(129, 131, 133, 135), uvec4(137, 139, 141, 143),
uvec4(145, 147, 149, 151), uvec4(153, 155, 157, 159), uvec4(161, 163, 165, 167),
uvec4(169, 171, 173, 175), uvec4(177, 179, 181, 183), uvec4(185, 187, 189, 191),
uvec4(193, 195, 197, 199), uvec4(201, 203, 205, 207), uvec4(209, 211, 213, 215),
uvec4(217, 219, 221, 223), uvec4(225, 227, 229, 231), uvec4(233, 235, 237, 239),
uvec4(241, 243, 245, 247), uvec4(249, 251, 253, 255));
return REPLICATE_7_BIT_TO_8_TABLE[array_index][vector_index];
}
}
// Widths without a table (e.g. 8) are returned as-is.
return value;
}
// Expands a num_bits-wide value to 6 bits using per-width lookup tables.
//   value    - the quantized input; zero always maps to zero.
//   num_bits - width of the input; widths 2..5 use a table, width 1 maps any non-zero
//              value to 63, and any other width returns value unchanged.
// Table entries are addressed as table[value / 4][value % 4]; the caller must pass a
// value that fits in num_bits, otherwise the table index is out of range.
uint FastReplicateTo6(uint value, uint num_bits) {
if (value == 0) {
return 0;
}
// Slot and lane of the table entry for this value.
const uint array_index = value / 4;
const uint vector_index = bitfieldExtract(value, 0, 2);
switch (num_bits) {
case 1:
// The only non-zero 1-bit value is 1.
return 63;
case 2: {
const uvec4 REPLICATE_2_BIT_TO_6_TABLE = uvec4(0, 21, 42, 63);
return REPLICATE_2_BIT_TO_6_TABLE[vector_index];
}
case 3: {
const uvec4 REPLICATE_3_BIT_TO_6_TABLE[2] =
uvec4[](uvec4(0, 9, 18, 27), uvec4(36, 45, 54, 63));
return REPLICATE_3_BIT_TO_6_TABLE[array_index][vector_index];
}
case 4: {
const uvec4 REPLICATE_4_BIT_TO_6_TABLE[4] =
uvec4[](uvec4(0, 4, 8, 12), uvec4(17, 21, 25, 29), uvec4(34, 38, 42, 46),
uvec4(51, 55, 59, 63));
return REPLICATE_4_BIT_TO_6_TABLE[array_index][vector_index];
}
case 5: {
const uvec4 REPLICATE_5_BIT_TO_6_TABLE[8] =
uvec4[](uvec4(0, 2, 4, 6), uvec4(8, 10, 12, 14), uvec4(16, 18, 20, 22),
uvec4(24, 26, 28, 30), uvec4(33, 35, 37, 39), uvec4(41, 43, 45, 47),
uvec4(49, 51, 53, 55), uvec4(57, 59, 61, 63));
return REPLICATE_5_BIT_TO_6_TABLE[array_index][vector_index];
}
}
// Widths without a table are returned as-is.
return value;
}
// Approximates floor(v / 3) with a multiply and shift: 0x5556 is roughly 2^16 / 3.
// NOTE(review): exact only for small inputs such as the bit counts used in this file;
// large v loses accuracy and eventually overflows the 32-bit product.
uint Div3Floor(uint v) {
    const uint reciprocal_q16 = 21846u; // 0x5556
    const uint scaled = v * reciprocal_q16;
    return scaled >> 16u;
}
// Ceiling division by three, built on Div3Floor by biasing the input by (3 - 1).
uint Div3Ceil(uint v) {
    const uint biased = v + 2u;
    return Div3Floor(biased);
}
// Approximates floor(v / 5) with a multiply and shift: 0x3334 is roughly 2^16 / 5.
// NOTE(review): exact only for small inputs such as the bit counts used in this file;
// large v loses accuracy and eventually overflows the 32-bit product.
uint Div5Floor(uint v) {
    const uint reciprocal_q16 = 13108u; // 0x3334
    const uint scaled = v * reciprocal_q16;
    return scaled >> 16u;
}
// Ceiling division by five, built on Div5Floor by biasing the input by (5 - 1).
uint Div5Ceil(uint v) {
    const uint biased = v + 4u;
    return Div5Floor(biased);
}
// Integer mixing hash used to derive the partition seeds in Select2DPartition().
// The sequence of shift/add/xor steps must be applied in exactly this order; all
// arithmetic wraps modulo 2^32.
uint Hash52(uint p) {
    p = p ^ (p >> 15);
    p = p - (p << 17);
    p = p + (p << 7);
    p = p + (p << 4);
    p = p ^ (p >> 5);
    p = p + (p << 16);
    p = p ^ (p >> 7);
    p = p ^ (p >> 3);
    p = p ^ (p << 6);
    p = p ^ (p >> 17);
    return p;
}
// Chooses which partition (0..3) the texel at (x, y) belongs to.
//   seed            - partition index read from the block
//   x, y            - texel position inside the block
//   partition_count - number of partitions in the block
//   small_block     - when true the coordinates are doubled before use
// Eight 4-bit seeds are pulled from Hash52(seed), squared and shifted, then combined
// with the coordinates into four 6-bit scores; the highest score wins, with ties
// going to the lower partition number.
uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
    if (small_block) {
        x <<= 1;
        y <<= 1;
    }
    seed += (partition_count - 1u) * 1024u;
    const uint rnum = Hash52(seed);
    const uvec4 rnum4 = uvec4(rnum);

    // x_seeds holds seeds 1, 3, 5, 7 and y_seeds holds seeds 2, 4, 6, 8.
    uvec4 x_seeds = (rnum4 >> uvec4(0u, 8u, 16u, 24u)) & uvec4(0xFu);
    uvec4 y_seeds = (rnum4 >> uvec4(4u, 12u, 20u, 28u)) & uvec4(0xFu);
    x_seeds *= x_seeds;
    y_seeds *= y_seeds;

    // The two candidate shifts swap roles depending on the low bit of the seed.
    const uint seed_shift = (seed & 2u) != 0u ? 4u : 5u;
    const uint count_shift = (partition_count == 3u) ? 6u : 5u;
    const bool odd_seed = (seed & 1u) != 0u;
    const uint sh1 = odd_seed ? seed_shift : count_shift;
    const uint sh2 = odd_seed ? count_shift : seed_shift;
    x_seeds >>= sh1;
    y_seeds >>= sh2;

    // One score per partition, reduced to 6 bits.
    uvec4 score = x_seeds * x + y_seeds * y + (rnum4 >> uvec4(14u, 10u, 6u, 2u));
    score &= uvec4(0x3Fu);
    // Partitions that do not exist can never win.
    if (partition_count < 4u) {
        score.w = 0u;
    }
    if (partition_count < 3u) {
        score.z = 0u;
    }

    if (score.x >= score.y && score.x >= score.z && score.x >= score.w) {
        return 0;
    }
    if (score.y >= score.z && score.y >= score.w) {
        return 1;
    }
    if (score.z >= score.w) {
        return 2;
    }
    return 3;
}
// Reads `bits` consecutive bits starting at bit `offset` from a 128-bit payload.
//   payload - four 32-bit words; bit 0 is the least significant bit of payload[0]
//   offset  - index of the first bit to read
//   bits    - number of bits to read; values outside 1..32 yield 0
// Returns the bits packed into the low end of the result. Bits that would lie outside
// the 128-bit payload read as zero, so the vector is never indexed out of range.
uint ExtractBits(uvec4 payload, int offset, int bits) {
    if (bits <= 0 || bits > 32) {
        return 0;
    }
    // Nothing to read when the request starts outside the payload.
    if (offset < 0 || offset >= 128) {
        return 0;
    }
    const int word = offset >> 5;
    const int bit = offset & 31;
    // Portion of the request that lives in the first word.
    const int low_count = min(bits, 32 - bit);
    uint result = bitfieldExtract(payload[word], bit, low_count);
    // Remainder spills into the next word, if there is one.
    const int high_count = bits - low_count;
    if (high_count > 0 && word < 3) {
        result |= bitfieldExtract(payload[word + 1], 0, high_count) << low_count;
    }
    return result;
}
// Reads num_bits from local_buff at the current cursor and advances the cursor.
uint StreamBits(uint num_bits) {
    const int count = int(num_bits);
    const int start = total_bitsread;
    total_bitsread = start + count;
    return ExtractBits(local_buff, start, count);
}
// Advances the local_buff cursor by num_bits without reading anything.
void SkipBits(uint num_bits) {
    total_bitsread += int(num_bits);
}
// Reads num_bits from color_endpoint_data at its cursor and advances that cursor.
uint StreamColorBits(uint num_bits) {
    const int count = int(num_bits);
    const int start = color_bitsread;
    color_bitsread = start + count;
    return ExtractBits(color_endpoint_data, start, count);
}
// Fetches element `index` of result_vector as an EncodingData.
EncodingData GetEncodingFromVector(uint index) {
    // Four packed words per uvec4: slot = index / 4, lane = index % 4.
    return EncodingData(result_vector[index >> 2u][index & 3u]);
}
// Returns the number of bits required to encode n_vals values with the encoding at
// position encoding_index of encoding_values.
// Every value costs its plain bits; trits add ceil(8 * n / 5) bits in total and
// quints add ceil(7 * n / 3) bits in total.
uint GetBitLength(uint n_vals, uint encoding_index) {
    const EncodingData entry =
        EncodingData(encoding_values[encoding_index >> 2u][encoding_index & 3u]);
    const uint kind = Encoding(entry);
    uint total_bits = NumBits(entry) * n_vals;
    switch (kind) {
    case TRIT:
        total_bits += Div5Ceil(n_vals * 8);
        break;
    case QUINT:
        total_bits += Div3Ceil(n_vals * 7);
        break;
    default:
        break;
    }
    return total_bits;
}
// Number of weights stored for a grid of the given size: one per grid point, or two
// per grid point in dual-plane mode.
uint GetNumWeightValues(uvec2 size, bool dual_plane) {
    const uint grid_points = size.x * size.y;
    return dual_plane ? grid_points * 2u : grid_points;
}
// Number of bits occupied by the weight data for the given grid and encoding index.
uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
    return GetBitLength(GetNumWeightValues(size, dual_plane), max_weight);
}
// Returns bit number `pos` of `bits` as 0 or 1.
uint BitsBracket(uint bits, uint pos) {
    const uint shifted = bits >> pos;
    return shifted & 1u;
}
// Returns the inclusive bit range [start, end] of `bits`, shifted down to bit 0.
// The range is expected to be narrower than 32 bits.
uint BitsOp(uint bits, uint start, uint end) {
    const uint width = end - start + 1u;
    const uint mask = (1u << width) - 1u;
    return (bits >> start) & mask;
}
// Decodes one quint block (three values) from the color bit stream and pushes the
// three results into result_vector.
//   num_bits - number of plain bits stored alongside each quint
// The stream interleaves the plain bits of each value with pieces of a 7-bit packed
// quint field (3 + 2 + 2 bits), which is then unpacked into three quints.
void DecodeQuintBlock(uint num_bits) {
// Plain bits of the three values.
uvec3 m;
// qQ.w holds the packed 7-bit field; qQ.x/y/z receive the three unpacked quints.
uvec4 qQ;
m[0] = StreamColorBits(num_bits);
qQ.w = StreamColorBits(3);
m[1] = StreamColorBits(num_bits);
qQ.w |= StreamColorBits(2) << 3;
m[2] = StreamColorBits(num_bits);
qQ.w |= StreamColorBits(2) << 5;
// Special packing: bits 1..2 all set and bits 5..6 clear means the first two quints are 4.
if (BitsOp(qQ.w, 1, 2) == 3 && BitsOp(qQ.w, 5, 6) == 0) {
qQ.x = 4;
qQ.y = 4;
qQ.z = (BitsBracket(qQ.w, 0) << 2) | ((BitsBracket(qQ.w, 4) & ~BitsBracket(qQ.w, 0)) << 1) |
(BitsBracket(qQ.w, 3) & ~BitsBracket(qQ.w, 0));
} else {
// C is the 5-bit intermediate from which the first two quints are taken.
uint C = 0;
if (BitsOp(qQ.w, 1, 2) == 3) {
qQ.z = 4;
C = (BitsOp(qQ.w, 3, 4) << 3) | ((~BitsOp(qQ.w, 5, 6) & 3) << 1) | BitsBracket(qQ.w, 0);
} else {
qQ.z = BitsOp(qQ.w, 5, 6);
C = BitsOp(qQ.w, 0, 4);
}
if (BitsOp(C, 0, 2) == 5) {
qQ.y = 4;
qQ.x = BitsOp(C, 3, 4);
} else {
qQ.y = BitsOp(C, 3, 4);
qQ.x = BitsOp(C, 0, 2);
}
}
// Emit the three (plain bits, quint) pairs in order.
for (uint i = 0; i < 3; i++) {
const EncodingData val = CreateEncodingData(QUINT, num_bits, m[i], qQ[i]);
ResultEmplaceBack(val);
}
}
// Decodes one trit block (five values) from the color bit stream and pushes the five
// results into result_vector.
//   num_bits - number of plain bits stored alongside each trit
// The stream interleaves the plain bits of each value with pieces of an 8-bit packed
// trit field (2 + 2 + 1 + 2 + 1 bits), which is then unpacked into five trits.
void DecodeTritBlock(uint num_bits) {
// Plain bits of the first four values.
uvec4 m;
// Trits of the first four values.
uvec4 t;
// x = packed 8-bit trit field, y = plain bits of the fifth value, z = fifth trit.
uvec3 Tm5t5;
m[0] = StreamColorBits(num_bits);
Tm5t5.x = StreamColorBits(2);
m[1] = StreamColorBits(num_bits);
Tm5t5.x |= StreamColorBits(2) << 2;
m[2] = StreamColorBits(num_bits);
Tm5t5.x |= StreamColorBits(1) << 4;
m[3] = StreamColorBits(num_bits);
Tm5t5.x |= StreamColorBits(2) << 5;
Tm5t5.y = StreamColorBits(num_bits);
Tm5t5.x |= StreamColorBits(1) << 7;
// C is the 5-bit intermediate from which the first three trits are taken.
uint C = 0;
if (BitsOp(Tm5t5.x, 2, 4) == 7) {
C = (BitsOp(Tm5t5.x, 5, 7) << 2) | BitsOp(Tm5t5.x, 0, 1);
Tm5t5.z = 2;
t[3] = 2;
} else {
C = BitsOp(Tm5t5.x, 0, 4);
if (BitsOp(Tm5t5.x, 5, 6) == 3) {
Tm5t5.z = 2;
t[3] = BitsBracket(Tm5t5.x, 7);
} else {
Tm5t5.z = BitsBracket(Tm5t5.x, 7);
t[3] = BitsOp(Tm5t5.x, 5, 6);
}
}
if (BitsOp(C, 0, 1) == 3) {
t[2] = 2;
t[1] = BitsBracket(C, 4);
t[0] = (BitsBracket(C, 3) << 1) | (BitsBracket(C, 2) & ~BitsBracket(C, 3));
} else if (BitsOp(C, 2, 3) == 3) {
t[2] = 2;
t[1] = 2;
t[0] = BitsOp(C, 0, 1);
} else {
t[2] = BitsBracket(C, 4);
t[1] = BitsOp(C, 2, 3);
t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1));
}
// Emit the first four (plain bits, trit) pairs, then the fifth.
for (uint i = 0; i < 4; i++) {
const EncodingData val = CreateEncodingData(TRIT, num_bits, m[i], t[i]);
ResultEmplaceBack(val);
}
const EncodingData val = CreateEncodingData(TRIT, num_bits, Tm5t5.y, Tm5t5.z);
ResultEmplaceBack(val);
}
// Decodes num_values integers from the color bit stream into result_vector.
//   max_range  - index into encoding_values selecting the encoding to use
//   num_values - how many values the caller needs
// Quint and trit encodings are decoded a whole block at a time (3 or 5 values), so
// more than num_values entries may be produced. Decoding stops early once
// ResultEmplaceBack() reports that the phase limit was reached.
void DecodeIntegerSequence(uint max_range, uint num_values) {
    EncodingData val = EncodingData(encoding_values[max_range >> 2u][max_range & 3u]);
    const uint encoding = Encoding(val);
    const uint num_bits = NumBits(val);
    for (uint vals_decoded = 0; vals_decoded < num_values && !result_limit_reached;) {
        if (encoding == QUINT) {
            DecodeQuintBlock(num_bits);
            vals_decoded += 3;
        } else if (encoding == TRIT) {
            DecodeTritBlock(num_bits);
            vals_decoded += 5;
        } else if (encoding == JUST_BITS) {
            // Plain values reuse the table entry and only fill in the bit value.
            BitValue(val, StreamColorBits(num_bits));
            ResultEmplaceBack(val);
            vals_decoded++;
        }
    }
}
// Unquantized color values produced by DecodeColorValues() and consumed by
// ComputeEndpoints(); value i is stored at color_values[i / 4][i % 4] (32 slots).
uvec4 color_values[8];
// Decodes and unquantizes the color endpoint values of a block into color_values.
//   modes           - color endpoint mode of each partition
//   num_partitions  - number of valid entries in modes
//   color_data_bits - number of bits available for color data
// Steps: count the values required by the modes, choose the largest encoding whose
// bit length still fits in color_data_bits, decode the integer sequence, then expand
// every decoded element to an 8-bit value.
void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
// Each mode contributes ((mode >> 2) + 1) * 2 values.
uint num_values = 0;
for (uint i = 0; i < num_partitions; i++) {
num_values += ((modes[i] >> 2) + 1) << 1;
}
// Find the largest encoding that's within color_data_bits
// TODO(ameerj): profile with binary search
// The "- 2" skips the two zero padding slots at the end of encoding_values.
int range = 0;
while (++range < ((encoding_values.length() * 4) - 2)) {
const uint bit_length = GetBitLength(num_values, range);
if (bit_length > color_data_bits) {
break;
}
}
// `range` is the first encoding that does not fit (or one past the table), so use the previous one.
DecodeIntegerSequence(range - 1, num_values);
uint out_index = 0;
for (int itr = 0; itr < result_index; ++itr) {
// Whole-block decoding may have produced more elements than required.
if (out_index >= num_values) {
break;
}
const EncodingData val = GetEncodingFromVector(itr);
const uint encoding = Encoding(val);
const uint bitlen = NumBits(val);
const uint bitval = BitValue(val);
// A, B, C, D are the terms of the trit/quint unquantization T = D * C + B, mixed with A.
uint A = 0, B = 0, C = 0, D = 0;
// A replicates the lowest plain bit across 9 bits.
A = ReplicateBitTo9((bitval & 1));
switch (encoding) {
case JUST_BITS:
// Plain values are expanded to 8 bits by table lookup.
color_values[out_index / 4][out_index % 4] = FastReplicateTo8(bitval, bitlen);
++out_index;
break;
case TRIT: {
D = QuintTritValue(val);
// C is a per-width constant; B is built from the plain bits above the lowest one.
switch (bitlen) {
case 1:
C = 204;
break;
case 2: {
C = 93;
const uint b = (bitval >> 1) & 1;
B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
break;
}
case 3: {
C = 44;
const uint cb = (bitval >> 1) & 3;
B = (cb << 7) | (cb << 2) | cb;
break;
}
case 4: {
C = 22;
const uint dcb = (bitval >> 1) & 7;
B = (dcb << 6) | dcb;
break;
}
case 5: {
C = 11;
const uint edcb = (bitval >> 1) & 0xF;
B = (edcb << 5) | (edcb >> 2);
break;
}
case 6: {
C = 5;
const uint fedcb = (bitval >> 1) & 0x1F;
B = (fedcb << 4) | (fedcb >> 4);
break;
}
}
break;
}
case QUINT: {
D = QuintTritValue(val);
// C is a per-width constant; B is built from the plain bits above the lowest one.
switch (bitlen) {
case 1:
C = 113;
break;
case 2: {
C = 54;
const uint b = (bitval >> 1) & 1;
B = (b << 8) | (b << 3) | (b << 2);
break;
}
case 3: {
C = 26;
const uint cb = (bitval >> 1) & 3;
B = (cb << 7) | (cb << 1) | (cb >> 1);
break;
}
case 4: {
C = 13;
const uint dcb = (bitval >> 1) & 7;
B = (dcb << 6) | (dcb >> 1);
break;
}
case 5: {
C = 6;
const uint edcb = (bitval >> 1) & 0xF;
B = (edcb << 5) | (edcb >> 3);
break;
}
}
break;
}
}
// Combine the terms for trit/quint elements into the final 8-bit value.
if (encoding != JUST_BITS) {
uint T = (D * C) + B;
T ^= A;
T = (A & 0x80) | (T >> 2);
color_values[out_index / 4][out_index % 4] = T;
++out_index;
}
}
}
// Moves the top bit of `a` into `b` and turns the rest of `a` into a signed offset.
// Returns ivec2(new_a, new_b):
//   new_b = (b >> 1) with bit 7 taken from a
//   new_a = bits 1..6 of a, interpreted as a signed 6-bit value (-32..31)
ivec2 BitTransferSigned(int a, int b) {
    const int merged_b = (b >> 1) | (a & 0x80);
    int signed_a = (a >> 1) & 0x3F;
    // Sign-extend from bit 5.
    if ((signed_a & 0x20) != 0) {
        signed_a -= 0x40;
    }
    return ivec2(signed_a, merged_b);
}
// Clamps every component to the byte range [0, 255] and converts to unsigned.
uvec4 ClampByte(ivec4 color) {
    const ivec4 clamped = clamp(color, ivec4(0), ivec4(255));
    return uvec4(clamped);
}
// Averages red and green with blue (rounding down) and returns the result in
// (a, r, g, b) component order; alpha and blue pass through unchanged.
ivec4 BlueContract(int a, int r, int g, int b) {
    const ivec2 contracted_rg = (ivec2(r, g) + b) >> 1;
    return ivec4(a, contracted_rg, b);
}
// Builds the two color endpoints of one partition from the unquantized color values.
//   ep1, ep2            - receive the endpoints; the first component is alpha, followed
//                         by red, green, blue (same order as BlueContract's parameters)
//   color_endpoint_mode - endpoint mode of this partition; modes without a case below
//                         produce a fixed fallback color
//   colvals_index       - in/out cursor into color_values, advanced by the number of
//                         values the mode consumes (the fallback consumes none)
void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode,
inout uint colvals_index) {
// Helper macros: declare V[2] and load the next N color values into it, either as
// unsigned or as signed integers. V[0] holds values 0..3 and V[1] holds values 4..7.
#define READ_UINT_VALUES(N) \
uvec4 V[2]; \
for (uint i = 0; i < N; i++) { \
V[i / 4][i % 4] = color_values[colvals_index / 4][colvals_index % 4]; \
++colvals_index; \
}
#define READ_INT_VALUES(N) \
ivec4 V[2]; \
for (uint i = 0; i < N; i++) { \
V[i / 4][i % 4] = int(color_values[colvals_index / 4][colvals_index % 4]); \
++colvals_index; \
}
switch (color_endpoint_mode) {
// Two values: one luminance per endpoint, opaque alpha.
case 0: {
READ_UINT_VALUES(2)
ep1 = uvec4(0xFF, V[0].x, V[0].x, V[0].x);
ep2 = uvec4(0xFF, V[0].y, V[0].y, V[0].y);
break;
}
// Two values: base luminance plus a 6-bit offset, saturated at 255.
case 1: {
READ_UINT_VALUES(2)
const uint L0 = (V[0].x >> 2) | (V[0].y & 0xC0);
const uint L1 = min(L0 + (V[0].y & 0x3F), 0xFFU);
ep1 = uvec4(0xFF, L0, L0, L0);
ep2 = uvec4(0xFF, L1, L1, L1);
break;
}
// Four values: luminance and alpha given directly for each endpoint.
case 4: {
READ_UINT_VALUES(4)
ep1 = uvec4(V[0].z, V[0].x, V[0].x, V[0].x);
ep2 = uvec4(V[0].w, V[0].y, V[0].y, V[0].y);
break;
}
// Four values: luminance and alpha as base plus signed offset.
case 5: {
READ_INT_VALUES(4)
ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
V[0].y = transferred.x;
V[0].x = transferred.y;
transferred = BitTransferSigned(V[0].w, V[0].z);
V[0].w = transferred.x;
V[0].z = transferred.y;
ep1 = ClampByte(ivec4(V[0].z, V[0].x, V[0].x, V[0].x));
ep2 = ClampByte(ivec4(V[0].z + V[0].w, V[0].x + V[0].y, V[0].x + V[0].y, V[0].x + V[0].y));
break;
}
// Four values: RGB plus a scale factor applied to obtain the first endpoint.
case 6: {
READ_UINT_VALUES(4)
ep1 = uvec4(0xFF, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8);
ep2 = uvec4(0xFF, V[0].x, V[0].y, V[0].z);
break;
}
// Six values: direct RGB pairs; endpoints are swapped and blue-contracted when the
// second triple sums lower than the first.
case 8: {
READ_UINT_VALUES(6)
if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) {
ep1 = uvec4(0xFF, V[0].x, V[0].z, V[1].x);
ep2 = uvec4(0xFF, V[0].y, V[0].w, V[1].y);
} else {
ep1 = uvec4(BlueContract(0xFF, int(V[0].y), int(V[0].w), int(V[1].y)));
ep2 = uvec4(BlueContract(0xFF, int(V[0].x), int(V[0].z), int(V[1].x)));
}
break;
}
// Six values: RGB as base plus signed offset; a negative offset sum selects the
// swapped, blue-contracted form.
case 9: {
READ_INT_VALUES(6)
ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
V[0].y = transferred.x;
V[0].x = transferred.y;
transferred = BitTransferSigned(V[0].w, V[0].z);
V[0].w = transferred.x;
V[0].z = transferred.y;
transferred = BitTransferSigned(V[1].y, V[1].x);
V[1].y = transferred.x;
V[1].x = transferred.y;
if ((V[0].y + V[0].w + V[1].y) >= 0) {
ep1 = ClampByte(ivec4(0xFF, V[0].x, V[0].z, V[1].x));
ep2 = ClampByte(ivec4(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
} else {
ep1 = ClampByte(BlueContract(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
ep2 = ClampByte(BlueContract(0xFF, V[0].x, V[0].z, V[1].x));
}
break;
}
// Six values: RGB plus scale as in mode 6, with an explicit alpha per endpoint.
case 10: {
READ_UINT_VALUES(6)
ep1 = uvec4(V[1].x, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8, (V[0].z * V[0].w) >> 8);
ep2 = uvec4(V[1].y, V[0].x, V[0].y, V[0].z);
break;
}
// Eight values: direct RGBA pairs, otherwise handled like mode 8.
case 12: {
READ_UINT_VALUES(8)
if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) {
ep1 = uvec4(V[1].z, V[0].x, V[0].z, V[1].x);
ep2 = uvec4(V[1].w, V[0].y, V[0].w, V[1].y);
} else {
ep1 = uvec4(BlueContract(int(V[1].w), int(V[0].y), int(V[0].w), int(V[1].y)));
ep2 = uvec4(BlueContract(int(V[1].z), int(V[0].x), int(V[0].z), int(V[1].x)));
}
break;
}
// Eight values: RGBA as base plus signed offset, otherwise handled like mode 9.
case 13: {
READ_INT_VALUES(8)
ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
V[0].y = transferred.x;
V[0].x = transferred.y;
transferred = BitTransferSigned(V[0].w, V[0].z);
V[0].w = transferred.x;
V[0].z = transferred.y;
transferred = BitTransferSigned(V[1].y, V[1].x);
V[1].y = transferred.x;
V[1].x = transferred.y;
transferred = BitTransferSigned(V[1].w, V[1].z);
V[1].w = transferred.x;
V[1].z = transferred.y;
if ((V[0].y + V[0].w + V[1].y) >= 0) {
ep1 = ClampByte(ivec4(V[1].z, V[0].x, V[0].z, V[1].x));
ep2 = ClampByte(ivec4(V[1].w + V[1].z, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
} else {
ep1 = ClampByte(BlueContract(V[1].z + V[1].w, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
ep2 = ClampByte(BlueContract(V[1].z, V[0].x, V[0].z, V[1].x));
}
break;
}
default: {
// HDR mode, or more likely a bug computing the color_endpoint_mode
// Both endpoints get the same fixed color (alpha and red at 0xFF, green and blue 0).
ep1 = uvec4(0xFF, 0xFF, 0, 0);
ep2 = uvec4(0xFF, 0xFF, 0, 0);
break;
}
}
#undef READ_UINT_VALUES
#undef READ_INT_VALUES
}
// Unquantizes a single decoded weight element.
//   val - packed element (encoding kind, plain bit count, plain bits, quint/trit value)
// Plain values are expanded to 6 bits via FastReplicateTo6. Trit and quint values with
// no plain bits come from small fixed tables; those with plain bits are computed as
// D * C + B, mixed with the replicated low bit A. Finally any result above 32 is
// incremented by one.
uint UnquantizeTexelWeight(EncodingData val) {
const uint encoding = Encoding(val);
const uint bitlen = NumBits(val);
const uint bitval = BitValue(val);
// A replicates the lowest plain bit across 7 bits.
const uint A = ReplicateBitTo7((bitval & 1));
uint B = 0, C = 0, D = 0;
uint result = 0;
switch (encoding) {
case JUST_BITS:
result = FastReplicateTo6(bitval, bitlen);
break;
case TRIT: {
D = QuintTritValue(val);
switch (bitlen) {
case 0: {
// Pure trit: direct table lookup, D is expected to be 0..2.
const uint results[3] = {0, 32, 63};
result = results[D];
break;
}
case 1: {
C = 50;
break;
}
case 2: {
C = 23;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 2) | b;
break;
}
case 3: {
C = 11;
const uint cb = (bitval >> 1) & 3;
B = (cb << 5) | cb;
break;
}
default:
break;
}
break;
}
case QUINT: {
D = QuintTritValue(val);
switch (bitlen) {
case 0: {
// Pure quint: direct table lookup, D is expected to be 0..4.
const uint results[5] = {0, 16, 32, 47, 63};
result = results[D];
break;
}
case 1: {
C = 28;
break;
}
case 2: {
C = 13;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 1);
break;
}
}
break;
}
}
// Trit/quint values that carry plain bits are combined from the terms above.
if (encoding != JUST_BITS && bitlen > 0) {
result = D * C + B;
result ^= A;
result = (A & 0x20) | (result >> 2);
}
// Skip one step above the midpoint (presumably so the top of the range reaches 64).
if (result > 32) {
result += 1;
}
return result;
}
// Per-texel interpolated weights written by UnquantizeTexelWeights(). Texel (s, t) of
// plane p is stored at offset (t * block_dims.x + s) + ARRAY_NUM_ELEMENTS * p, four
// weights per uvec4.
uvec4 unquantized_texel_weights[VECTOR_ARRAY_SIZE];
// Unquantizes the decoded weight grid and interpolates it up to one weight per texel.
//   size          - weight grid dimensions
//   is_dual_plane - true when the grid stores two interleaved weights per grid point
// First every decoded element in result_vector is replaced by its unquantized value.
// Then, for each plane and texel, the four surrounding grid weights are blended
// bilinearly (weights in sixteenths) and written to unquantized_texel_weights.
void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
    // Scale factors that map a texel coordinate onto the weight grid.
    const uint scale_s = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
    const uint scale_t = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
    const uint plane_count = is_dual_plane ? 2u : 1u;
    const uint grid_area = size.x * size.y;

    // Pass 1: unquantize in place.
    const uint decoded_count = min(result_index, grid_area * plane_count);
    for (uint n = 0; n < decoded_count; ++n) {
        result_vector[n >> 2u][n & 3u] = UnquantizeTexelWeight(GetEncodingFromVector(n));
    }

    // Pass 2: bilinear infill for every texel of every plane.
    for (uint plane = 0; plane < plane_count; ++plane) {
        for (uint t = 0; t < block_dims.y; ++t) {
            for (uint s = 0; s < block_dims.x; ++s) {
                // Grid position in 1/16 units: integer part j, fraction f.
                const uvec2 scaled = uvec2(scale_s * s, scale_t * t);
                const uvec2 grid_pos = (scaled * (size - 1u) + 32u) >> 6u;
                const uvec2 j = grid_pos >> 4u;
                const uvec2 f = grid_pos & 0xFu;

                // Blend factors for the four neighbours; they sum to 16.
                const uint w11 = (f.x * f.y + 8u) >> 4u;
                const uvec4 w = uvec4(16u - f.x - f.y + w11, f.x - w11, f.y - w11, w11);

                // Grid indices of the neighbours: here, right, below, below-right.
                const uint v0 = j.y * size.x + j.x;
                const uvec4 neighbours = uvec4(v0, v0 + 1u, v0 + size.x, v0 + size.x + 1u);
                uvec4 p = uvec4(0);
                for (uint k = 0; k < 4; ++k) {
                    const uint base = neighbours[k];
                    // Neighbours outside the grid contribute zero.
                    if (base < grid_area) {
                        // Dual-plane grids interleave the two planes' weights.
                        const uint src = is_dual_plane ? 2u * base + plane : base;
                        p[k] = result_vector[src >> 2u][src & 3u];
                    }
                }

                const uint dst = (t * block_dims.x + s) + ARRAY_NUM_ELEMENTS * plane;
                unquantized_texel_weights[dst >> 2u][dst & 3u] = (uint(dot(p, w)) + 8) >> 4;
            }
        }
    }
}
// Classifies a block mode into one of ten layouts (0..9) by inspecting its bits.
// Layouts 0..4 are used when either of the two low bits is set and are chosen from
// bits 2, 3 and 8; layouts 5..9 cover the remaining modes and are chosen from bits
// 5, 7 and 8.
int FindLayout(uint mode) {
    const bool bit2 = (mode & 0x4) != 0;
    const bool bit3 = (mode & 0x8) != 0;
    const bool bit5 = (mode & 0x20) != 0;
    const bool bit7 = (mode & 0x80) != 0;
    const bool bit8 = (mode & 0x100) != 0;

    if ((mode & 3) != 0) {
        if (!bit3) {
            return bit2 ? 1 : 0;
        }
        if (!bit2) {
            return 2;
        }
        return bit8 ? 4 : 3;
    }
    if (!bit8) {
        return bit7 ? 6 : 5;
    }
    if (!bit7) {
        return 9;
    }
    return bit5 ? 8 : 7;
}
// Writes the error color (all components zero) to every texel of the block whose
// top-left texel is at `coord`.
void FillError(ivec3 coord) {
    const vec4 error_color = vec4(0.0);
    for (uint y = 0; y < block_dims.y; ++y) {
        for (uint x = 0; x < block_dims.x; ++x) {
            imageStore(dest_image, coord + ivec3(x, y, 0), error_color);
        }
    }
}
// Handles a void-extent block: skips 52 bits of the stream, reads four 16-bit
// channels in R, G, B, A order, normalizes them to [0, 1] and writes that single
// color to every texel of the block at `coord`.
void FillVoidExtentLDR(ivec3 coord) {
    SkipBits(52);
    // Read one channel per statement so the stream order is explicit.
    uvec4 channels;
    channels.r = StreamBits(16);
    channels.g = StreamBits(16);
    channels.b = StreamBits(16);
    channels.a = StreamBits(16);
    const vec4 color = vec4(channels) / 65535.0f;
    for (uint y = 0; y < block_dims.y; ++y) {
        for (uint x = 0; x < block_dims.x; ++x) {
            imageStore(dest_image, coord + ivec3(x, y, 0), color);
        }
    }
}
// Returns true when the block mode marks a block this decoder rejects.
// For void-extent modes (low nine bits == 0x1fc) the block is rejected when bit 9 is
// set (HDR void extent), when bit 10 is clear, or when the next stream bit is zero;
// note that this path consumes one bit from the stream only if bit 10 is set.
// Other modes are rejected when the low four bits are all zero, or when the low two
// bits are zero and bits 6..8 are all set.
bool IsError(uint mode) {
    const bool is_void_extent = (mode & 0x1ff) == 0x1fc;
    if (is_void_extent) {
        if ((mode & 0x200) != 0) {
            // params.void_extent_hdr = true;
            return true;
        }
        if ((mode & 0x400) == 0) {
            return true;
        }
        return StreamBits(1) == 0;
    }
    const bool low_nibble_clear = (mode & 0xf) == 0;
    const bool reserved_pattern = (mode & 3) == 0 && (mode & 0x1c0) == 0x1c0;
    return low_nibble_clear || reserved_pattern;
}
// Extracts the weight grid size, dual-plane flag and weight encoding index from a
// block mode.
// The grid size depends on the layout returned by FindLayout(); the encoding index is
// assembled from bit 4 plus two further mode bits, optionally offset by 6 when bit 9
// is set (layouts other than 9 only).
TexelWeightParams DecodeBlockInfo(uint mode) {
    const uint mode_layout = FindLayout(mode);
    // Size fields shared by several layouts.
    const uint A = (mode >> 5) & 0x3;
    const uint B_wide = (mode >> 7) & 0x3;
    const uint B_narrow = (mode >> 7) & 0x1;

    uvec2 size = uvec2(0);
    switch (mode_layout) {
    case 0:
        size = uvec2(B_wide + 4, A + 2);
        break;
    case 1:
        size = uvec2(B_wide + 8, A + 2);
        break;
    case 2:
        size = uvec2(A + 2, B_wide + 8);
        break;
    case 3:
        size = uvec2(A + 2, B_narrow + 6);
        break;
    case 4:
        size = uvec2(B_narrow + 2, A + 2);
        break;
    case 5:
        size = uvec2(12, A + 2);
        break;
    case 6:
        size = uvec2(A + 2, 12);
        break;
    case 7:
        size = uvec2(6, 10);
        break;
    case 8:
        size = uvec2(10, 6);
        break;
    case 9:
        size = uvec2(A + 6, ((mode >> 9) & 0x3) + 6);
        break;
    default:
        break;
    }

    const bool not_layout_9 = mode_layout != 9;
    const bool dual_plane = not_layout_9 && ((mode & 0x400) != 0);

    // Bit 4 is the low bit of the index; the upper two bits come from bits 0..1 for
    // layouts 0..4 and from bits 2..3 otherwise.
    uint weight_index = (mode >> 4) & 1u;
    if (mode_layout < 5) {
        weight_index |= (mode & 0x3) << 1;
    } else {
        weight_index |= (mode & 0xc) >> 1;
    }
    weight_index -= 2;
    if (not_layout_9 && ((mode & 0x200) != 0)) {
        weight_index += 6;
    }
    return TexelWeightParams(size, weight_index + 1, dual_plane);
}
// Decodes the ASTC block held in local_buff and writes its texels to dest_image.
//   coord - destination coordinate of the block's top-left texel (z = layer)
// Stages: validate the block mode, read partition and endpoint-mode information,
// extract and decode the color endpoint data, decode and interpolate the weights,
// then blend the two endpoints per texel. Every rejected block is filled with the
// error color so no destination texel is left unwritten.
void DecompressBlock(ivec3 coord) {
    uint mode = StreamBits(11);
    if (IsError(mode)) {
        FillError(coord);
        return;
    }
    if ((mode & 0x1ff) == 0x1fc) {
        // params.void_extent_ldr = true;
        FillVoidExtentLDR(coord);
        return;
    }
    // Only decode the weight parameters once the mode is known to be a regular block.
    const TexelWeightParams params = DecodeBlockInfo(mode);
    if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) {
        FillError(coord);
        return;
    }
    const uint num_partitions = StreamBits(2) + 1;
    if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) {
        FillError(coord);
        return;
    }
    uint partition_index = 1;
    uvec4 color_endpoint_mode = uvec4(0);
    uint ced_pointer = 0;
    uint base_cem = 0;
    if (num_partitions == 1) {
        color_endpoint_mode.x = StreamBits(4);
        partition_index = 0;
    } else {
        partition_index = StreamBits(10);
        base_cem = StreamBits(6);
    }
    const uint base_mode = base_cem & 3;
    const uint weight_bits = GetPackedBitSize(params.size, params.dual_plane, params.max_weight);
    // Whatever is left of the 128 bits after the header and the weights holds color
    // data, the plane selector and any extra endpoint-mode bits.
    uint remaining_bits = 128 - weight_bits - total_bitsread;
    uint extra_cem_bits = 0;
    if (base_mode > 0) {
        switch (num_partitions) {
        case 2:
            extra_cem_bits += 2;
            break;
        case 3:
            extra_cem_bits += 5;
            break;
        case 4:
            extra_cem_bits += 8;
            break;
        default:
            // A non-zero base mode requires multiple partitions.
            FillError(coord);
            return;
        }
    }
    remaining_bits -= extra_cem_bits;
    const uint plane_selector_bits = params.dual_plane ? 2 : 0;
    remaining_bits -= plane_selector_bits;
    if (remaining_bits > 128) {
        // Bad data: the subtraction wrapped around, i.e. the block claims more bits
        // than the 16 bytes it has. Emit the error color instead of leaving the
        // destination texels unwritten.
        FillError(coord);
        return;
    }
    // Read color data...
    const uint color_data_bits = remaining_bits;
    // Start from zero so that words not covered by the loop below read as zero when a
    // trailing trit/quint block looks past the end of the color data.
    color_endpoint_data = uvec4(0);
    while (remaining_bits > 0) {
        const int nb = int(min(remaining_bits, 32U));
        const uint b = StreamBits(nb);
        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
        ++ced_pointer;
        remaining_bits -= nb;
    }
    const uint plane_index = uint(StreamBits(plane_selector_bits));
    if (base_mode > 0) {
        // Per-partition modes: one class bit and two mode bits for each partition.
        const uint extra_cem = StreamBits(extra_cem_bits);
        uint cem = (extra_cem << 6) | base_cem;
        cem >>= 2;
        uvec4 C = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            C[i] = (cem & 1);
            cem >>= 1;
        }
        uvec4 M = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            M[i] = cem & 3;
            cem >>= 2;
        }
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = base_mode;
            if (C[i] == 0) {
                --color_endpoint_mode[i];
            }
            color_endpoint_mode[i] <<= 2;
            color_endpoint_mode[i] |= M[i];
        }
    } else if (num_partitions > 1) {
        // All partitions share the same mode.
        const uint cem = base_cem >> 2;
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = cem;
        }
    }
    uvec4 endpoints0[4];
    uvec4 endpoints1[4];
    {
        // This decode phase should at most push 32 elements into the vector
        result_vector_max_index = 32;
        // uvec4 color_values[8];
        uint colvals_index = 0;
        DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);
        for (uint i = 0; i < num_partitions; i++) {
            ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i],
                             colvals_index);
        }
    }
    // The weights are stored from the top of the block downwards, so reverse the whole
    // block to stream them from bit 0 and reuse the color stream reader.
    color_endpoint_data = local_buff;
    color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx;
    // Keep only the first weight_bits bits: mask the partially used byte, then clear
    // every byte after it.
    const uint clear_byte_start = (weight_bits >> 3) + 1;
    const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) &
                             uint(((1 << (weight_bits % 8)) - 1));
    const uint vec_index = (clear_byte_start - 1) >> 2;
    color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert,
                                                    int((clear_byte_start - 1) % 4) * 8, 8);
    for (uint i = clear_byte_start; i < 16; ++i) {
        const uint idx = i >> 2;
        color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8);
    }
    // Re-init vector variables for next decode phase
    result_index = 0;
    color_bitsread = 0;
    result_limit_reached = false;
    // The limit for the Unquantize phase, avoids decoding more data than needed.
    result_vector_max_index = params.size.x * params.size.y;
    if (params.dual_plane) {
        result_vector_max_index *= 2;
    }
    DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
    UnquantizeTexelWeights(params.size, params.dual_plane);
    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
            uint local_partition = 0;
            if (num_partitions > 1) {
                local_partition = Select2DPartition(partition_index, i, j, num_partitions,
                                                    (block_dims.y * block_dims.x) < 32);
            }
            const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]);
            const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]);
            const uint weight_offset = (j * block_dims.x + i);
            const uint array_index = weight_offset / 4;
            const uint vector_index = weight_offset % 4;
            const uint primary_weight = unquantized_texel_weights[array_index][vector_index];
            uvec4 weight_vec = uvec4(primary_weight);
            if (params.dual_plane) {
                const uint secondary_weight_offset = (j * block_dims.x + i) + ARRAY_NUM_ELEMENTS;
                const uint secondary_array_index = secondary_weight_offset / 4;
                const uint secondary_vector_index = secondary_weight_offset % 4;
                const uint secondary_weight =
                    unquantized_texel_weights[secondary_array_index][secondary_vector_index];
                // Endpoints are stored alpha-first, so selector n maps to component
                // (n + 1) & 3.
                for (uint c = 0; c < 4; c++) {
                    const bool is_secondary = ((plane_index + 1u) & 3u) == c;
                    weight_vec[c] = is_secondary ? secondary_weight : primary_weight;
                }
            }
            // Interpolate in 16-bit space with a weight out of 64, then normalize.
            const vec4 Cf =
                vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
            const vec4 p = (Cf / 65535.0);
            // Reorder from (a, r, g, b) storage to (r, g, b, a) for the image.
            imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
        }
    }
}
// Computes the byte offset of position `pos` inside its 64x8 region by interleaving
// bits of x and y:
//   offset bit 8     = x bit 5
//   offset bits 6..7 = y bits 1..2
//   offset bit 5     = x bit 4
//   offset bit 4     = y bit 0
//   offset bits 0..3 = x bits 0..3
uint SwizzleOffset(uvec2 pos) {
    uint offset = ((pos.x >> 5u) & 1u) << 8u;
    offset += ((pos.y >> 1u) & 3u) << 6u;
    offset += ((pos.x >> 4u) & 1u) << 5u;
    offset += (pos.y & 1u) << 4u;
    offset += pos.x & 15u;
    return offset;
}
// Entry point: each invocation decodes one ASTC block.
// The invocation ID selects the block; its byte offset in the swizzled input buffer
// is computed from the push constants, the block is loaded into local_buff and then
// decompressed to the matching texel region of dest_image. Invocations whose
// destination lies outside the image do nothing.
void main() {
    const uvec3 invocation = gl_GlobalInvocationID;
    // Position in bytes along x (16 bytes per block) and in block rows along y.
    const uvec2 byte_pos = uvec2(invocation.x << BYTES_PER_BLOCK_LOG2, invocation.y);
    const uint block_y = byte_pos.y >> GOB_SIZE_Y_SHIFT;

    const uint offset = invocation.z * layer_stride +
                        (block_y >> block_height) * block_size +
                        ((block_y & block_height_mask) << GOB_SIZE_SHIFT) +
                        ((byte_pos.x >> GOB_SIZE_X_SHIFT) << x_shift) +
                        SwizzleOffset(byte_pos);

    const ivec3 coord = ivec3(invocation * uvec3(block_dims, 1));
    if (any(greaterThanEqual(coord, imageSize(dest_image)))) {
        return;
    }
    // astc_data is indexed in 16-byte blocks.
    local_buff = astc_data[offset / 16];
    DecompressBlock(coord);
}