From a5bff8e9b3eed5419bbb38abaaa642ed11b235db Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 16 Jan 2022 15:52:34 -0500 Subject: astc_decoder: Combine FastReplicate functions to work around new NV driver bug The new Nvidia drivers have a bug where the FastReplicateTo6 function produces a lookup into the REPLICATE_TO_8 table rather than the REPLICATE_TO_6 table. This seems to be an optimization gone wrong. Combining the logic of the FastReplicate functions seems to address the bug. --- src/video_core/host_shaders/astc_decoder.comp | 80 +++++++++++++++------------ 1 file changed, 46 insertions(+), 34 deletions(-) (limited to 'src') diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index f34c5f5d9..3a10578cb 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -155,9 +155,6 @@ uint SwizzleOffset(uvec2 pos) { // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] // is the same as [(num_bits - 1):0] and repeats all the way down. uint Replicate(uint val, uint num_bits, uint to_bit) { - if (num_bits == 0 || to_bit == 0) { - return 0; - } const uint v = val & uint((1 << num_bits) - 1); uint res = v; uint reslen = num_bits; @@ -187,42 +184,57 @@ uint ReplicateBitTo9(uint value) { return REPLICATE_1_BIT_TO_9_TABLE[value]; } -uint FastReplicateTo8(uint value, uint num_bits) { - switch (num_bits) { - case 1: - return REPLICATE_1_BIT_TO_8_TABLE[value]; - case 2: - return REPLICATE_2_BIT_TO_8_TABLE[value]; - case 3: - return REPLICATE_3_BIT_TO_8_TABLE[value]; - case 4: - return REPLICATE_4_BIT_TO_8_TABLE[value]; - case 5: - return REPLICATE_5_BIT_TO_8_TABLE[value]; - case 6: - return REPLICATE_6_BIT_TO_8_TABLE[value]; - case 7: - return REPLICATE_7_BIT_TO_8_TABLE[value]; - case 8: +uint FastReplicate(uint value, uint num_bits, uint to_bit) { + if (num_bits == 0) { + return 0; + } + if (num_bits == to_bit) { return value; } - return Replicate(value, num_bits, 8); + if (to_bit == 6) { + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_6_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_6_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_6_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_6_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_6_TABLE[value]; + default: + break; + } + } else { /* if (to_bit == 8) */ + switch (num_bits) { + case 1: + return REPLICATE_1_BIT_TO_8_TABLE[value]; + case 2: + return REPLICATE_2_BIT_TO_8_TABLE[value]; + case 3: + return REPLICATE_3_BIT_TO_8_TABLE[value]; + case 4: + return REPLICATE_4_BIT_TO_8_TABLE[value]; + case 5: + return REPLICATE_5_BIT_TO_8_TABLE[value]; + case 6: + return REPLICATE_6_BIT_TO_8_TABLE[value]; + case 7: + return REPLICATE_7_BIT_TO_8_TABLE[value]; + default: + break; + } + } + return Replicate(value, num_bits, to_bit); +} + +uint FastReplicateTo8(uint value, uint num_bits) { + return FastReplicate(value, num_bits, 8); } uint FastReplicateTo6(uint value, uint num_bits) { - switch (num_bits) { - case 1: - return REPLICATE_1_BIT_TO_6_TABLE[value]; - case 2: - return REPLICATE_2_BIT_TO_6_TABLE[value]; - case 3: - return REPLICATE_3_BIT_TO_6_TABLE[value]; - case 4: - return REPLICATE_4_BIT_TO_6_TABLE[value]; - case 5: - return REPLICATE_5_BIT_TO_6_TABLE[value]; - } - return Replicate(value, num_bits, 6); + return FastReplicate(value, num_bits, 6); } uint Div3Floor(uint v) { -- cgit v1.2.3