From 5665d055476fa793192523c3cb6fe06369d58674 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 4 Jul 2021 22:48:41 -0400 Subject: astc_decoder: Optimize the use EncodingData This buffer was a list of EncodingData structures sorted by their bit length, with some duplication from the cpu decoder implementation. We can take advantage of its sorted property to optimize its usage in the shader. Thanks to wwylele for the optimization idea. --- src/video_core/renderer_vulkan/vk_compute_pass.cpp | 42 +++++----------------- 1 file changed, 9 insertions(+), 33 deletions(-) (limited to 'src/video_core/renderer_vulkan') diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 561cf5e11..328813a57 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -30,16 +30,13 @@ namespace Vulkan { using Tegra::Texture::SWIZZLE_TABLE; -using Tegra::Texture::ASTC::ASTC_ENCODINGS_VALUES; -using namespace Tegra::Texture::ASTC; namespace { constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; -constexpr u32 ASTC_BINDING_ENC_BUFFER = 1; -constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 2; -constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 3; -constexpr size_t ASTC_NUM_BINDINGS = 4; +constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 1; +constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 2; +constexpr size_t ASTC_NUM_BINDINGS = 3; template inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{ @@ -75,7 +72,7 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .score = 2, }; -constexpr std::array ASTC_DESCRIPTOR_SET_BINDINGS{{ +constexpr std::array ASTC_DESCRIPTOR_SET_BINDINGS{{ { .binding = ASTC_BINDING_INPUT_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, @@ -83,13 +80,6 @@ constexpr std::array ASTC_DESCRIPTOR_SET_BINDIN .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, - { - .binding = ASTC_BINDING_ENC_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, { .binding = ASTC_BINDING_SWIZZLE_BUFFER, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, @@ -108,12 +98,12 @@ constexpr std::array ASTC_DESCRIPTOR_SET_BINDIN constexpr DescriptorBankInfo ASTC_BANK_INFO{ .uniform_buffers = 0, - .storage_buffers = 3, + .storage_buffers = 2, .texture_buffers = 0, .image_buffers = 0, .textures = 0, .images = 1, - .score = 4, + .score = 3, }; constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{ @@ -135,14 +125,6 @@ constexpr std::array .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, - { - .dstBinding = ASTC_BINDING_ENC_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_ENC_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, { .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, .dstArrayElement = 0, @@ -355,7 +337,7 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, ASTCDecoderPass::~ASTCDecoderPass() = default; void ASTCDecoderPass::MakeDataBuffer() { - constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_ENCODINGS_VALUES) + sizeof(SWIZZLE_TABLE); + constexpr size_t TOTAL_BUFFER_SIZE = sizeof(SWIZZLE_TABLE); data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, @@ -369,11 +351,7 @@ void ASTCDecoderPass::MakeDataBuffer() { data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); - std::memcpy(staging_ref.mapped_span.data(), &ASTC_ENCODINGS_VALUES, - sizeof(ASTC_ENCODINGS_VALUES)); - // Tack on the swizzle table at the end of the buffer - std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_ENCODINGS_VALUES), &SWIZZLE_TABLE, - sizeof(SWIZZLE_TABLE)); + std::memcpy(staging_ref.mapped_span.data(), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE)); scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) { @@ -443,9 +421,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); - update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(ASTC_ENCODINGS_VALUES)); - update_descriptor_queue.AddBuffer(*data_buffer, sizeof(ASTC_ENCODINGS_VALUES), - sizeof(SWIZZLE_TABLE)); + update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(SWIZZLE_TABLE)); update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); const void* const descriptor_data{update_descriptor_queue.UpdateData()}; -- cgit v1.2.3 From 5ab80535118e593ef3add3ce2b5935437e1dc1d3 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 31 Jul 2021 22:24:15 -0400 Subject: astc_decoder: Compute offset swizzles in-shader Alleviates the dependency on the swizzle table and a uniform which is constant for all ASTC texture sizes. --- src/video_core/renderer_vulkan/vk_compute_pass.cpp | 67 ++-------------------- src/video_core/renderer_vulkan/vk_compute_pass.h | 5 -- 2 files changed, 5 insertions(+), 67 deletions(-) (limited to 'src/video_core/renderer_vulkan') diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 328813a57..d13d58e8c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -34,9 +34,8 @@ using Tegra::Texture::SWIZZLE_TABLE; namespace { constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; -constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 1; -constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 2; -constexpr size_t ASTC_NUM_BINDINGS = 3; +constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1; +constexpr size_t ASTC_NUM_BINDINGS = 2; template inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{ @@ -80,13 +79,6 @@ constexpr std::array ASTC_DESCR .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, - { - .binding = ASTC_BINDING_SWIZZLE_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, { .binding = ASTC_BINDING_OUTPUT_IMAGE, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, @@ -98,12 +90,12 @@ constexpr std::array ASTC_DESCR constexpr DescriptorBankInfo ASTC_BANK_INFO{ .uniform_buffers = 0, - .storage_buffers = 2, + .storage_buffers = 1, .texture_buffers = 0, .image_buffers = 0, .textures = 0, .images = 1, - .score = 3, + .score = 2, }; constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{ @@ -125,14 +117,6 @@ constexpr std::array .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, - { - .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, { .dstBinding = ASTC_BINDING_OUTPUT_IMAGE, .dstArrayElement = 0, @@ -145,7 +129,6 @@ constexpr std::array struct AstcPushConstants { std::array blocks_dims; - u32 bytes_per_block_log2; u32 layer_stride; u32 block_size; u32 x_shift; @@ -336,42 +319,6 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, ASTCDecoderPass::~ASTCDecoderPass() = default; -void ASTCDecoderPass::MakeDataBuffer() { - constexpr size_t TOTAL_BUFFER_SIZE = sizeof(SWIZZLE_TABLE); - data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = TOTAL_BUFFER_SIZE, - .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - }); - data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); - - const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); - std::memcpy(staging_ref.mapped_span.data(), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE)); - - scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, - TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) { - static constexpr VkMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, - }; - const VkBufferCopy copy{ - .srcOffset = offset, - .dstOffset = 0, - .size = TOTAL_BUFFER_SIZE, - }; - cmdbuf.CopyBuffer(src, dst, copy); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - 0, write_barrier); - }); -} - void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, std::span swizzles) { using namespace VideoCommon::Accelerated; @@ -380,9 +327,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, VideoCore::Surface::DefaultBlockHeight(image.info.format), }; scheduler.RequestOutsideRenderPassOperationContext(); - if (!data_buffer) { - MakeDataBuffer(); - } const VkPipeline vk_pipeline = *pipeline; const VkImageAspectFlags aspect_mask = image.AspectMask(); const VkImage vk_image = image.Handle(); @@ -421,7 +365,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); - update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(SWIZZLE_TABLE)); update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); const void* const descriptor_data{update_descriptor_queue.UpdateData()}; @@ -429,11 +372,11 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); ASSERT(params.origin == (std::array{0, 0, 0})); ASSERT(params.destination == (std::array{0, 0, 0})); + ASSERT(params.bytes_per_block_log2 == 4); scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims, params, descriptor_data](vk::CommandBuffer cmdbuf) { const AstcPushConstants uniforms{ .blocks_dims = block_dims, - .bytes_per_block_log2 = params.bytes_per_block_log2, .layer_stride = params.layer_stride, .block_size = params.block_size, .x_shift = params.x_shift, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 114aef2bd..c7b92cce0 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -96,15 +96,10 @@ public: std::span swizzles); private: - void MakeDataBuffer(); - VKScheduler& scheduler; StagingBufferPool& staging_buffer_pool; VKUpdateDescriptorQueue& update_descriptor_queue; MemoryAllocator& memory_allocator; - - vk::Buffer data_buffer; - MemoryCommit data_buffer_commit; }; } // namespace Vulkan -- cgit v1.2.3 From c439fc9be994583801418743ab202fb63d1c83a0 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 31 Jul 2021 23:55:20 -0400 Subject: astc_decoder: Reduce workgroup size This reduces the amount of over dispatching when there are odd dimensions (i.e. ASTC 8x5), which rarely evenly divide into 32x32. --- src/video_core/renderer_vulkan/vk_compute_pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/video_core/renderer_vulkan') diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index d13d58e8c..3e96c0f60 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -358,8 +358,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, }); for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { const size_t input_offset = swizzle.buffer_offset + map.offset; - const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); - const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); + const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U); + const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U); const u32 num_dispatches_z = image.info.resources.layers; update_descriptor_queue.Acquire(); -- cgit v1.2.3