diff options
31 files changed, 661 insertions, 545 deletions
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index 36724569f..c4c5199b1 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -132,7 +132,8 @@ std::shared_ptr<ResourceLimit> Process::GetResourceLimit() const { u64 Process::GetTotalPhysicalMemoryAvailable() const { const u64 capacity{resource_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory) + - page_table->GetTotalHeapSize() + image_size + main_thread_stack_size}; + page_table->GetTotalHeapSize() + GetSystemResourceSize() + image_size + + main_thread_stack_size}; if (capacity < memory_usage_capacity) { return capacity; @@ -146,7 +147,8 @@ u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const { } u64 Process::GetTotalPhysicalMemoryUsed() const { - return image_size + main_thread_stack_size + page_table->GetTotalHeapSize(); + return image_size + main_thread_stack_size + page_table->GetTotalHeapSize() + + GetSystemResourceSize(); } u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const { diff --git a/src/core/hle/kernel/resource_limit.cpp b/src/core/hle/kernel/resource_limit.cpp index d9beaa3a4..212e442f4 100644 --- a/src/core/hle/kernel/resource_limit.cpp +++ b/src/core/hle/kernel/resource_limit.cpp @@ -24,13 +24,9 @@ bool ResourceLimit::Reserve(ResourceType resource, s64 amount, u64 timeout) { const std::size_t index{ResourceTypeToIndex(resource)}; s64 new_value = current[index] + amount; - while (new_value > limit[index] && available[index] + amount <= limit[index]) { + if (new_value > limit[index] && available[index] + amount <= limit[index]) { // TODO(bunnei): This is wrong for multicore, we should wait the calling thread for timeout new_value = current[index] + amount; - - if (timeout >= 0) { - break; - } } if (new_value <= limit[index]) { diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 2bf8d68ce..39d5d8401 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -49,8 +49,6 @@ add_library(video_core STATIC query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h - rasterizer_cache.cpp - rasterizer_cache.h rasterizer_interface.h renderer_base.cpp renderer_base.h @@ -93,6 +91,7 @@ add_library(video_core STATIC renderer_opengl/utils.h sampler_cache.cpp sampler_cache.h + shader_cache.h shader/decode/arithmetic.cpp shader/decode/arithmetic_immediate.cpp shader/decode/bfe.cpp diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b88fce2cd..77ae34339 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -110,19 +110,23 @@ public: }); } - void Map(std::size_t max_size) { + /// Prepares the buffer cache for data uploading + /// @param max_size Maximum number of bytes that will be uploaded + /// @return True when a stream buffer invalidation was required, false otherwise + bool Map(std::size_t max_size) { std::lock_guard lock{mutex}; + bool invalidated; std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); buffer_offset = buffer_offset_base; + + return invalidated; } - /// Finishes the upload stream, returns true on bindings invalidation. - bool Unmap() { + /// Finishes the upload stream + void Unmap() { std::lock_guard lock{mutex}; - stream_buffer->Unmap(buffer_offset - buffer_offset_base); - return std::exchange(invalidated, false); } void TickFrame() { @@ -576,8 +580,6 @@ private: std::unique_ptr<StreamBuffer> stream_buffer; BufferType stream_buffer_handle{}; - bool invalidated = false; - u8* buffer_ptr = nullptr; u64 buffer_offset = 0; u64 buffer_offset_base = 0; diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h index ebe139504..f46e81bb7 100644 --- a/src/video_core/engines/const_buffer_engine_interface.h +++ b/src/video_core/engines/const_buffer_engine_interface.h @@ -93,6 +93,7 @@ public: virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0; virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const = 0; + virtual SamplerDescriptor AccessSampler(u32 handle) const = 0; virtual u32 GetBoundBuffer() const = 0; virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index f6237fc6a..a82b06a38 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con ASSERT(stage == ShaderType::Compute); const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset; + return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 18ceedfaf..b7f668d88 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -219,6 +219,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index e46b153f9..ea3c8a963 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -740,8 +740,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; const auto& tex_info_buffer = shader.const_buffers[const_buffer]; const GPUVAddr tex_info_address = tex_info_buffer.address + offset; + return AccessSampler(memory_manager.Read<u32>(tex_info_address)); +} - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; +SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const { + const Texture::TextureHandle tex_handle{handle}; const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 79fc9bbea..d5fe25065 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1404,6 +1404,8 @@ public: SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer, u64 offset) const override; + SamplerDescriptor AccessSampler(u32 handle) const override; + u32 GetBoundBuffer() const override { return regs.tex_cb_index; } diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp deleted file mode 100644 index 093b2cdf4..000000000 --- a/src/video_core/rasterizer_cache.cpp +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "video_core/rasterizer_cache.h" - -RasterizerCacheObject::~RasterizerCacheObject() = default; diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h deleted file mode 100644 index 096ee337c..000000000 --- a/src/video_core/rasterizer_cache.h +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <mutex> -#include <set> -#include <unordered_map> - -#include <boost/icl/interval_map.hpp> -#include <boost/range/iterator_range_core.hpp> - -#include "common/common_types.h" -#include "core/settings.h" -#include "video_core/gpu.h" -#include "video_core/rasterizer_interface.h" - -class RasterizerCacheObject { -public: - explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {} - - virtual ~RasterizerCacheObject(); - - VAddr GetCpuAddr() const { - return cpu_addr; - } - - /// Gets the size of the shader in guest memory, required for cache management - virtual std::size_t GetSizeInBytes() const = 0; - - /// Sets whether the cached object should be considered registered - void SetIsRegistered(bool registered) { - is_registered = registered; - } - - /// Returns true if the cached object is registered - bool IsRegistered() const { - return is_registered; - } - - /// Returns true if the cached object is dirty - bool IsDirty() const { - return is_dirty; - } - - /// Returns ticks from when this cached object was last modified - u64 GetLastModifiedTicks() const { - return last_modified_ticks; - } - - /// Marks an object as recently modified, used to specify whether it is clean or dirty - template <class T> - void MarkAsModified(bool dirty, T& cache) { - is_dirty = dirty; - last_modified_ticks = cache.GetModifiedTicks(); - } - - void SetMemoryMarked(bool is_memory_marked_) { - is_memory_marked = is_memory_marked_; - } - - bool IsMemoryMarked() const { - return is_memory_marked; - } - - void SetSyncPending(bool is_sync_pending_) { - is_sync_pending = is_sync_pending_; - } - - bool IsSyncPending() const { - return is_sync_pending; - } - -private: - bool is_registered{}; ///< Whether the object is currently registered with the cache - bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory) - bool is_memory_marked{}; ///< Whether the object is marking rasterizer memory. - bool is_sync_pending{}; ///< Whether the object is pending deletion. - u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing - VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space -}; - -template <class T> -class RasterizerCache : NonCopyable { - friend class RasterizerCacheObject; - -public: - explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {} - - /// Write any cached resources overlapping the specified region back to memory - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - FlushObject(object); - } - } - - /// Mark the specified region as being invalidated - void InvalidateRegion(VAddr addr, u64 size) { - std::lock_guard lock{mutex}; - - const auto& objects{GetSortedObjectsFromRegion(addr, size)}; - for (auto& object : objects) { - if (!object->IsRegistered()) { - // Skip duplicates - continue; - } - Unregister(object); - } - } - - void OnCPUWrite(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - for (const auto& object : GetSortedObjectsFromRegion(addr, size)) { - if (object->IsRegistered()) { - UnmarkMemory(object); - object->SetSyncPending(true); - marked_for_unregister.emplace_back(object); - } - } - } - - void SyncGuestHost() { - std::lock_guard lock{mutex}; - - for (const auto& object : marked_for_unregister) { - if (object->IsRegistered()) { - object->SetSyncPending(false); - Unregister(object); - } - } - marked_for_unregister.clear(); - } - - /// Invalidates everything in the cache - void InvalidateAll() { - std::lock_guard lock{mutex}; - - while (interval_cache.begin() != interval_cache.end()) { - Unregister(*interval_cache.begin()->second.begin()); - } - } - -protected: - /// Tries to get an object from the cache with the specified cache address - T TryGet(VAddr addr) const { - const auto iter = map_cache.find(addr); - if (iter != map_cache.end()) - return iter->second; - return nullptr; - } - - /// Register an object into the cache - virtual void Register(const T& object) { - std::lock_guard lock{mutex}; - - object->SetIsRegistered(true); - interval_cache.add({GetInterval(object), ObjectSet{object}}); - map_cache.insert({object->GetCpuAddr(), object}); - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1); - object->SetMemoryMarked(true); - } - - /// Unregisters an object from the cache - virtual void Unregister(const T& object) { - std::lock_guard lock{mutex}; - - UnmarkMemory(object); - object->SetIsRegistered(false); - if (object->IsSyncPending()) { - marked_for_unregister.remove(object); - object->SetSyncPending(false); - } - const VAddr addr = object->GetCpuAddr(); - interval_cache.subtract({GetInterval(object), ObjectSet{object}}); - map_cache.erase(addr); - } - - void UnmarkMemory(const T& object) { - if (!object->IsMemoryMarked()) { - return; - } - rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1); - object->SetMemoryMarked(false); - } - - /// Returns a ticks counter used for tracking when cached objects were last modified - u64 GetModifiedTicks() { - std::lock_guard lock{mutex}; - - return ++modified_ticks; - } - - virtual void FlushObjectInner(const T& object) = 0; - - /// Flushes the specified object, updating appropriate cache state as needed - void FlushObject(const T& object) { - std::lock_guard lock{mutex}; - - if (!object->IsDirty()) { - return; - } - FlushObjectInner(object); - object->MarkAsModified(false, *this); - } - - std::recursive_mutex mutex; - -private: - /// Returns a list of cached objects from the specified memory region, ordered by access time - std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) { - if (size == 0) { - return {}; - } - - std::vector<T> objects; - const ObjectInterval interval{addr, addr + size}; - for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) { - for (auto& cached_object : pair.second) { - if (!cached_object) { - continue; - } - objects.push_back(cached_object); - } - } - - std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool { - return a->GetLastModifiedTicks() < b->GetLastModifiedTicks(); - }); - - return objects; - } - - using ObjectSet = std::set<T>; - using ObjectCache = std::unordered_map<VAddr, T>; - using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>; - using ObjectInterval = typename IntervalCache::interval_type; - - static auto GetInterval(const T& object) { - return ObjectInterval::right_open(object->GetCpuAddr(), - object->GetCpuAddr() + object->GetSizeInBytes()); - } - - ObjectCache map_cache; - IntervalCache interval_cache; ///< Cache of objects - u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing - VideoCore::RasterizerInterface& rasterizer; - std::list<T> marked_for_unregister; -}; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a9e86cfc7..679b9b1d7 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -10,7 +10,6 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f802fd384..2d6c11320 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -30,6 +30,7 @@ #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" +#include "video_core/shader_cache.h" namespace OpenGL { @@ -65,10 +66,22 @@ constexpr std::size_t NumSupportedVertexAttributes = 16; template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, ShaderType shader_type, std::size_t index = 0) { + if constexpr (std::is_same_v<Entry, SamplerEntry>) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } if (entry.is_bindless) { - const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); - return engine.GetTextureInfo(tex_handle); + const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return engine.GetTextureInfo(handle); } + const auto& gpu_profile = engine.AccessGuestDriverProfile(); const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { @@ -310,7 +323,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { continue; } - Shader shader{shader_cache.GetStageProgram(program)}; + Shader* const shader = shader_cache.GetStageProgram(program); if (device.UseAssemblyShaders()) { // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this @@ -604,7 +617,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); // Prepare the vertex array. - buffer_cache.Map(buffer_size); + const bool invalidated = buffer_cache.Map(buffer_size); + + if (invalidated) { + // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty + auto& dirty = gpu.dirty.flags; + dirty[Dirty::VertexBuffers] = true; + for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { + dirty[index] = true; + } + } // Prepare vertex array format. SetupVertexFormat(); @@ -870,7 +892,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { static constexpr std::array PARAMETER_LUT = { GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, @@ -900,7 +922,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad } } -void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_UBO); const auto& launch_desc = system.GPU().KeplerCompute().launch_description; const auto& entries = kernel->GetEntries(); @@ -969,7 +991,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, } } -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; @@ -984,7 +1006,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad } } -void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; @@ -1007,7 +1029,7 @@ void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& e static_cast<GLsizeiptr>(size)); } -void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).sampler; @@ -1020,7 +1042,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& } } -void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) { +void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) { MICROPROFILE_SCOPE(OpenGL_Texture); const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; @@ -1049,7 +1071,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu } } -void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) { +void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) { const auto& maxwell3d = system.GPU().Maxwell3D(); u32 binding = device.GetBaseBindings(stage_index).image; for (const auto& entry : shader->GetEntries().images) { @@ -1059,7 +1081,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh } } -void RasterizerOpenGL::SetupComputeImages(const Shader& shader) { +void RasterizerOpenGL::SetupComputeImages(Shader* shader) { const auto& compute = system.GPU().KeplerCompute(); u32 binding = 0; for (const auto& entry : shader->GetEntries().images) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 7abc8fdbd..4f082592f 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -19,7 +19,6 @@ #include "video_core/engines/const_buffer_info.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_accelerated.h" -#include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" @@ -100,10 +99,10 @@ private: void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil); /// Configures the current constbuffers to use for the draw command. - void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader); + void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); /// Configures the current constbuffers to use for the kernel invocation. - void SetupComputeConstBuffers(const Shader& kernel); + void SetupComputeConstBuffers(Shader* kernel); /// Configures a constant buffer. void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, @@ -111,30 +110,30 @@ private: std::size_t unified_offset); /// Configures the current global memory entries to use for the draw command. - void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); + void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader); /// Configures the current global memory entries to use for the kernel invocation. - void SetupComputeGlobalMemory(const Shader& kernel); + void SetupComputeGlobalMemory(Shader* kernel); /// Configures a constant buffer. void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size); /// Configures the current textures to use for the draw command. - void SetupDrawTextures(std::size_t stage_index, const Shader& shader); + void SetupDrawTextures(std::size_t stage_index, Shader* shader); /// Configures the textures used in a compute shader. - void SetupComputeTextures(const Shader& kernel); + void SetupComputeTextures(Shader* kernel); /// Configures a texture. void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); /// Configures images in a graphics shader. - void SetupDrawImages(std::size_t stage_index, const Shader& shader); + void SetupDrawImages(std::size_t stage_index, Shader* shader); /// Configures images in a compute shader. - void SetupComputeImages(const Shader& shader); + void SetupComputeImages(Shader* shader); /// Configures an image. void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index a991ca64a..c28486b1d 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -29,6 +29,7 @@ #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" namespace OpenGL { @@ -194,12 +195,9 @@ std::unordered_set<GLenum> GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, ProgramSharedPtr program_) - : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, - size_in_bytes{size_in_bytes}, program{std::move(program_)} { - // Assign either the assembly program or source program. We can't have both. +Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_, + ProgramSharedPtr program_) + : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} { handle = program->assembly_program.handle; if (handle == 0) { handle = program->source_program.handle; @@ -207,16 +205,16 @@ CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, ASSERT(handle != 0); } -CachedShader::~CachedShader() = default; +Shader::~Shader() = default; -GLuint CachedShader::GetHandle() const { +GLuint Shader::GetHandle() const { DEBUG_ASSERT(registry->IsConsistent()); return handle; } -Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, ProgramCode code, - ProgramCode code_b) { +std::unique_ptr<Shader> Shader::CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode code, ProgramCode code_b) { const auto shader_type = GetShaderType(program_type); const std::size_t size_in_bytes = code.size() * sizeof(u64); @@ -241,12 +239,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr<CachedShader>( - new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), - MakeEntries(params.device, ir, shader_type), std::move(program))); + return std::unique_ptr<Shader>(new Shader( + std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program))); } -Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { +std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code) { const std::size_t size_in_bytes = code.size() * sizeof(u64); auto& engine = params.system.GPU().KeplerCompute(); @@ -266,23 +264,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); - return std::shared_ptr<CachedShader>( - new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), - MakeEntries(params.device, ir, ShaderType::Compute), std::move(program))); + return std::unique_ptr<Shader>(new Shader(std::move(registry), + MakeEntries(params.device, ir, ShaderType::Compute), + std::move(program))); } -Shader CachedShader::CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes) { - return std::shared_ptr<CachedShader>( - new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry, - precompiled_shader.entries, precompiled_shader.program)); +std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader) { + return std::unique_ptr<Shader>(new Shader( + precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program)); } ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, Core::Frontend::EmuWindow& emu_window, const Device& device) - : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device}, - disk_cache{system} {} + : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, + emu_window{emu_window}, device{device}, disk_cache{system} {} + +ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { @@ -436,7 +434,7 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( return program; } -Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { +Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { return last_shaders[static_cast<std::size_t>(program)]; } @@ -446,8 +444,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { // Look up shader in the cache based on address const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; - Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader}; - if (shader) { + if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) { return last_shaders[static_cast<std::size_t>(program)] = shader; } @@ -468,30 +465,29 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { const ShaderParameters params{system, disk_cache, device, *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> shader; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - shader = CachedShader::CreateStageFromMemory(params, program, std::move(code), - std::move(code_b)); + shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + shader = Shader::CreateFromCache(params, found->second); } + Shader* const result = shader.get(); if (cpu_addr) { - Register(shader); + Register(std::move(shader), *cpu_addr, code.size() * sizeof(u64)); } else { - null_shader = shader; + null_shader = std::move(shader); } - return last_shaders[static_cast<std::size_t>(program)] = shader; + return last_shaders[static_cast<std::size_t>(program)] = result; } -Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { +Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { auto& memory_manager{system.GPU().MemoryManager()}; const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)}; - auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel; - if (kernel) { + if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) { return kernel; } @@ -503,20 +499,21 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { const ShaderParameters params{system, disk_cache, device, *cpu_addr, host_ptr, unique_identifier}; + std::unique_ptr<Shader> kernel; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + kernel = Shader::CreateKernelFromMemory(params, std::move(code)); } else { - const std::size_t size_in_bytes = code.size() * sizeof(u64); - kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes); + kernel = Shader::CreateFromCache(params, found->second); } + Shader* const result = kernel.get(); if (cpu_addr) { - Register(kernel); + Register(std::move(kernel), *cpu_addr, code.size() * sizeof(u64)); } else { - null_kernel = kernel; + null_kernel = std::move(kernel); } - return kernel; + return result; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index b2ae8d7f9..6848f1388 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -18,12 +18,12 @@ #include "common/common_types.h" #include "video_core/engines/shader_type.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" namespace Core { class System; @@ -35,12 +35,10 @@ class EmuWindow; namespace OpenGL { -class CachedShader; class Device; class RasterizerOpenGL; struct UnspecializedShader; -using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; struct ProgramHandle { @@ -64,62 +62,53 @@ struct ShaderParameters { u64 unique_identifier; }; -class CachedShader final : public RasterizerCacheObject { +class Shader final { public: - ~CachedShader(); + ~Shader(); /// Gets the GL program handle for the shader GLuint GetHandle() const; - /// Returns the size in bytes of the shader - std::size_t GetSizeInBytes() const override { - return size_in_bytes; - } - /// Gets the shader entries for the shader const ShaderEntries& GetEntries() const { return entries; } - static Shader CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode program_code, ProgramCode program_code_b); - static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); + static std::unique_ptr<Shader> CreateStageFromMemory(const ShaderParameters& params, + Maxwell::ShaderProgram program_type, + ProgramCode program_code, + ProgramCode program_code_b); + static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params, + ProgramCode code); - static Shader CreateFromCache(const ShaderParameters& params, - const PrecompiledShader& precompiled_shader, - std::size_t size_in_bytes); + static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params, + const PrecompiledShader& precompiled_shader); private: - explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, - std::shared_ptr<VideoCommon::Shader::Registry> registry, - ShaderEntries entries, ProgramSharedPtr program); + explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries, + ProgramSharedPtr program); std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; - std::size_t size_in_bytes = 0; ProgramSharedPtr program; GLuint handle = 0; }; -class ShaderCacheOpenGL final : public RasterizerCache<Shader> { +class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> { public: explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, Core::Frontend::EmuWindow& emu_window, const Device& device); + ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game void LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program - Shader GetStageProgram(Maxwell::ShaderProgram program); + Shader* GetStageProgram(Maxwell::ShaderProgram program); /// Gets a compute kernel in the passed address - Shader GetComputeKernel(GPUVAddr code_addr); - -protected: - // We do not have to flush this cache as things in it are never modified by us. - void FlushObjectInner(const Shader& object) override {} + Shader* GetComputeKernel(GPUVAddr code_addr); private: ProgramSharedPtr GeneratePrecompiledProgram( @@ -132,10 +121,10 @@ private: ShaderDiskCacheOpenGL disk_cache; std::unordered_map<u64, PrecompiledShader> runtime_cache; - Shader null_shader{}; - Shader null_kernel{}; + std::unique_ptr<Shader> null_shader; + std::unique_ptr<Shader> null_kernel; - std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; + std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 9e95a122b..653c3f2f9 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap; namespace { +using VideoCommon::Shader::SeparateSamplerKey; + using ShaderCacheVersionHash = std::array<u8, 64>; struct ConstBufferKey { @@ -37,18 +39,26 @@ struct ConstBufferKey { u32 value = 0; }; -struct BoundSamplerKey { +struct BoundSamplerEntry { u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -struct BindlessSamplerKey { +struct SeparateSamplerEntry { + u32 cbuf1 = 0; + u32 cbuf2 = 0; + u32 offset1 = 0; + u32 offset2 = 0; + Tegra::Engines::SamplerDescriptor sampler; +}; + +struct BindlessSamplerEntry { u32 cbuf = 0; u32 offset = 0; Tegra::Engines::SamplerDescriptor sampler; }; -constexpr u32 NativeVersion = 20; +constexpr u32 NativeVersion = 21; ShaderCacheVersionHash GetShaderCacheVersionHash() { ShaderCacheVersionHash hash{}; @@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { u32 texture_handler_size_value; u32 num_keys; u32 num_bound_samplers; + u32 num_separate_samplers; u32 num_bindless_samplers; if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 || file.ReadArray(&is_texture_handler_size_known, 1) != 1 || file.ReadArray(&texture_handler_size_value, 1) != 1 || file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 || + file.ReadArray(&num_separate_samplers, 1) != 1 || file.ReadArray(&num_bindless_samplers, 1) != 1) { return false; } @@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) { } std::vector<ConstBufferKey> flat_keys(num_keys); - std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers); - std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers); + std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers); + std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers); + std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers); if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() || file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) != flat_bound_samplers.size() || + file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) != + flat_separate_samplers.size() || file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) != flat_bindless_samplers.size()) { return false; } - for (const auto& key : flat_keys) { - keys.insert({{key.cbuf, key.offset}, key.value}); + for (const auto& entry : flat_keys) { + keys.insert({{entry.cbuf, entry.offset}, entry.value}); } - for (const auto& key : flat_bound_samplers) { - bound_samplers.emplace(key.offset, key.sampler); + for (const auto& entry : flat_bound_samplers) { + bound_samplers.emplace(entry.offset, entry.sampler); } - for (const auto& key : flat_bindless_samplers) { - bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler}); + for (const auto& entry : flat_separate_samplers) { + SeparateSamplerKey key; + key.buffers = {entry.cbuf1, entry.cbuf2}; + key.offsets = {entry.offset1, entry.offset2}; + separate_samplers.emplace(key, entry.sampler); + } + for (const auto& entry : flat_bindless_samplers) { + bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler}); } return true; @@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 || file.WriteObject(static_cast<u32>(keys.size())) != 1 || file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 || + file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 || file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) { return false; } @@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const { flat_keys.push_back(ConstBufferKey{address.first, address.second, value}); } - std::vector<BoundSamplerKey> flat_bound_samplers; + std::vector<BoundSamplerEntry> flat_bound_samplers; flat_bound_samplers.reserve(bound_samplers.size()); for (const auto& [address, sampler] : bound_samplers) { - flat_bound_samplers.push_back(BoundSamplerKey{address, sampler}); + flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler}); + } + + std::vector<SeparateSamplerEntry> flat_separate_samplers; + flat_separate_samplers.reserve(separate_samplers.size()); + for (const auto& [key, sampler] : separate_samplers) { + SeparateSamplerEntry entry; + std::tie(entry.cbuf1, entry.cbuf2) = key.buffers; + std::tie(entry.offset1, entry.offset2) = key.offsets; + entry.sampler = sampler; + flat_separate_samplers.push_back(entry); } - std::vector<BindlessSamplerKey> flat_bindless_samplers; + std::vector<BindlessSamplerEntry> flat_bindless_samplers; flat_bindless_samplers.reserve(bindless_samplers.size()); for (const auto& [address, sampler] : bindless_samplers) { flat_bindless_samplers.push_back( - BindlessSamplerKey{address.first, address.second, sampler}); + BindlessSamplerEntry{address.first, address.second, sampler}); } return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() && file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) == flat_bound_samplers.size() && + file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) == + flat_separate_samplers.size() && file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) == flat_bindless_samplers.size(); } diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index d5be52e40..a79cef0e9 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry { VideoCommon::Shader::ComputeInfo compute_info; VideoCommon::Shader::KeyMap keys; VideoCommon::Shader::BoundSamplerMap bound_samplers; + VideoCommon::Shader::SeparateSamplerMap separate_samplers; VideoCommon::Shader::BindlessSamplerMap bindless_samplers; }; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index a54583e7d..65cb3c8ad 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -8,7 +8,6 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index b8ccf164f..ea66e621e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -27,6 +27,7 @@ #include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/compiler_settings.h" #include "video_core/shader/memory_util.h" +#include "video_core/shader_cache.h" namespace Vulkan { @@ -132,19 +133,18 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con return std::memcmp(&rhs, this, sizeof *this) == 0; } -CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, - GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code, - u32 main_offset) - : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)}, +Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset) + : gpu_addr{gpu_addr}, program_code{std::move(program_code)}, registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset, compiler_settings, registry}, entries{GenerateShaderEntries(shader_ir)} {} -CachedShader::~CachedShader() = default; +Shader::~Shader() = default; -Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine( - Core::System& system, Tegra::Engines::ShaderType stage) { - if (stage == Tegra::Engines::ShaderType::Compute) { +Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system, + Tegra::Engines::ShaderType stage) { + if (stage == ShaderType::Compute) { return system.GPU().KeplerCompute(); } else { return system.GPU().Maxwell3D(); @@ -156,16 +156,16 @@ VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasteri VKDescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& update_descriptor_queue, VKRenderPassCache& renderpass_cache) - : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, - descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, - renderpass_cache{renderpass_cache} {} + : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, device{device}, + scheduler{scheduler}, descriptor_pool{descriptor_pool}, + update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {} VKPipelineCache::~VKPipelineCache() = default; -std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { +std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { const auto& gpu = system.GPU().Maxwell3D(); - std::array<Shader, Maxwell::MaxShaderProgram> shaders; + std::array<Shader*, Maxwell::MaxShaderProgram> shaders{}; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto program{static_cast<Maxwell::ShaderProgram>(index)}; @@ -178,24 +178,28 @@ std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { const GPUVAddr program_addr{GetShaderAddress(system, program)}; const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader; - if (!shader) { + + Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); + if (!result) { const auto host_ptr{memory_manager.GetPointer(program_addr)}; // No shader found - create a new one constexpr u32 stage_offset = STAGE_MAIN_OFFSET; - const auto stage = static_cast<Tegra::Engines::ShaderType>(index == 0 ? 0 : index - 1); + const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1); ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader = std::make_unique<Shader>(system, stage, program_addr, std::move(code), + stage_offset); + result = shader.get(); - shader = std::make_shared<CachedShader>(system, stage, program_addr, *cpu_addr, - std::move(code), stage_offset); if (cpu_addr) { - Register(shader); + Register(std::move(shader), *cpu_addr, size_in_bytes); } else { - null_shader = shader; + null_shader = std::move(shader); } } - shaders[index] = std::move(shader); + shaders[index] = result; } return last_shaders = shaders; } @@ -236,19 +240,22 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr); ASSERT(cpu_addr); - auto shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel; + Shader* shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get(); if (!shader) { // No shader found - create a new one const auto host_ptr = memory_manager.GetPointer(program_addr); ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true); - shader = std::make_shared<CachedShader>(system, Tegra::Engines::ShaderType::Compute, - program_addr, *cpu_addr, std::move(code), - KERNEL_MAIN_OFFSET); + const std::size_t size_in_bytes = code.size() * sizeof(u64); + + auto shader_info = std::make_unique<Shader>(system, ShaderType::Compute, program_addr, + std::move(code), KERNEL_MAIN_OFFSET); + shader = shader_info.get(); + if (cpu_addr) { - Register(shader); + Register(std::move(shader_info), *cpu_addr, size_in_bytes); } else { - null_kernel = shader; + null_kernel = std::move(shader_info); } } @@ -264,7 +271,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach return *entry; } -void VKPipelineCache::Unregister(const Shader& shader) { +void VKPipelineCache::OnShaderRemoval(Shader* shader) { bool finished = false; const auto Finish = [&] { // TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and @@ -296,8 +303,6 @@ void VKPipelineCache::Unregister(const Shader& shader) { Finish(); it = compute_cache.erase(it); } - - RasterizerCache::Unregister(shader); } std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> @@ -332,12 +337,11 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { } const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum); - const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); - const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader; - ASSERT(shader); + const std::optional<VAddr> cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); + Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get(); const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 - const auto program_type = GetShaderType(program_enum); + const ShaderType program_type = GetShaderType(program_enum); const auto& entries = shader->GetEntries(); program[stage] = { Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 0b5796fef..0a36e5112 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -17,7 +17,6 @@ #include "common/common_types.h" #include "video_core/engines/const_buffer_engine_interface.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_cache.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" @@ -26,6 +25,7 @@ #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader_cache.h" namespace Core { class System; @@ -41,8 +41,6 @@ class VKFence; class VKScheduler; class VKUpdateDescriptorQueue; -class CachedShader; -using Shader = std::shared_ptr<CachedShader>; using Maxwell = Tegra::Engines::Maxwell3D::Regs; struct GraphicsPipelineCacheKey { @@ -102,21 +100,16 @@ struct hash<Vulkan::ComputePipelineCacheKey> { namespace Vulkan { -class CachedShader final : public RasterizerCacheObject { +class Shader { public: - explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, - VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code, - u32 main_offset); - ~CachedShader(); + explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset); + ~Shader(); GPUVAddr GetGpuAddr() const { return gpu_addr; } - std::size_t GetSizeInBytes() const override { - return program_code.size() * sizeof(u64); - } - VideoCommon::Shader::ShaderIR& GetIR() { return shader_ir; } @@ -144,25 +137,23 @@ private: ShaderEntries entries; }; -class VKPipelineCache final : public RasterizerCache<Shader> { +class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> { public: explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& update_descriptor_queue, VKRenderPassCache& renderpass_cache); - ~VKPipelineCache(); + ~VKPipelineCache() override; - std::array<Shader, Maxwell::MaxShaderProgram> GetShaders(); + std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders(); VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key); VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key); protected: - void Unregister(const Shader& shader) override; - - void FlushObjectInner(const Shader& object) override {} + void OnShaderRemoval(Shader* shader) final; private: std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders( @@ -175,10 +166,10 @@ private: VKUpdateDescriptorQueue& update_descriptor_queue; VKRenderPassCache& renderpass_cache; - Shader null_shader{}; - Shader null_kernel{}; + std::unique_ptr<Shader> null_shader; + std::unique_ptr<Shader> null_kernel; - std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; + std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{}; GraphicsPipelineCacheKey last_graphics_key; VKGraphicsPipeline* last_graphics_pipeline = nullptr; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 19b8f9da3..184b2238a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -38,6 +38,7 @@ #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/shader_cache.h" namespace Vulkan { @@ -98,7 +99,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) { } std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses( - const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses; for (std::size_t i = 0; i < std::size(addresses); ++i) { addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0; @@ -117,6 +118,17 @@ template <typename Engine, typename Entry> Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, std::size_t stage, std::size_t index = 0) { const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage); + if constexpr (std::is_same_v<Entry, SamplerEntry>) { + if (entry.is_separated) { + const u32 buffer_1 = entry.buffer; + const u32 buffer_2 = entry.secondary_buffer; + const u32 offset_1 = entry.offset; + const u32 offset_2 = entry.secondary_offset; + const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1); + const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2); + return engine.GetTextureInfo(handle_1 | handle_2); + } + } if (entry.is_bindless) { const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset); return engine.GetTextureInfo(tex_handle); @@ -776,12 +788,12 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt } void RasterizerVulkan::SetupShaderDescriptors( - const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) { + const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { texture_cache.GuardSamplers(true); for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { // Skip VertexA stage - const auto& shader = shaders[stage + 1]; + Shader* const shader = shaders[stage + 1]; if (!shader) { continue; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 04be37a5e..c8c187606 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -168,7 +168,7 @@ private: bool is_indexed, bool is_instanced); /// Setup descriptors in the graphics pipeline. - void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders); + void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders); void SetupImageTransitions(Texceptions texceptions, const std::array<View, Maxwell::NumRenderTargets>& color_attachments, diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 8f0bb996e..29ebf65ba 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { return pc; } -ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, - std::optional<u32> buffer) { +ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo( + SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) { if (info.IsComplete()) { return info; } - const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset) - : registry.ObtainBoundSampler(offset); if (!sampler) { LOG_WARNING(HW_GPU, "Unknown sampler info"); info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D); @@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset, std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo sampler_info) { - const auto offset = static_cast<u32>(sampler.index.Value()); - const auto info = GetSamplerInfo(sampler_info, offset); + const u32 offset = static_cast<u32>(sampler.index.Value()); + const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), @@ -404,20 +402,19 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, const Node sampler_register = GetRegister(reg); const auto [base_node, tracked_sampler_info] = TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); - ASSERT(base_node != nullptr); - if (base_node == nullptr) { + if (!base_node) { + UNREACHABLE(); return std::nullopt; } - if (const auto bindless_sampler_info = - std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { - const u32 buffer = bindless_sampler_info->GetIndex(); - const u32 offset = bindless_sampler_info->GetOffset(); - info = GetSamplerInfo(info, offset, buffer); + if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) { + const u32 buffer = sampler_info->index; + const u32 offset = sampler_info->offset; + info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer = buffer, offset = offset](const Sampler& entry) { + [buffer, offset](const Sampler& entry) { return entry.buffer == buffer && entry.offset == offset; }); if (it != used_samplers.end()) { @@ -431,10 +428,32 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array, *info.is_shadow, *info.is_buffer, false); } - if (const auto array_sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { - const u32 base_offset = array_sampler_info->GetBaseOffset() / 4; - index_var = GetCustomVariable(array_sampler_info->GetIndexVar()); - info = GetSamplerInfo(info, base_offset); + if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) { + const std::pair indices = sampler_info->indices; + const std::pair offsets = sampler_info->offsets; + info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets)); + + // Try to use an already created sampler if it exists + const auto it = std::find_if( + used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) { + return offsets == std::pair{entry.offset, entry.secondary_offset} && + indices == std::pair{entry.buffer, entry.secondary_buffer}; + }); + if (it != used_samplers.end()) { + ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array && + it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); + return *it; + } + + // Otherwise create a new mapping for this sampler + const u32 next_index = static_cast<u32>(used_samplers.size()); + return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array, + *info.is_shadow, *info.is_buffer); + } + if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) { + const u32 base_offset = sampler_info->base_offset / 4; + index_var = GetCustomVariable(sampler_info->bindless_var); + info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset)); // If this sampler has already been used, return the existing mapping. const auto it = std::find_if( diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index c5e5165ff..8f230d57a 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -275,10 +275,11 @@ using Node = std::shared_ptr<NodeData>; using Node4 = std::array<Node, 4>; using NodeBlock = std::vector<Node>; -class BindlessSamplerNode; -class ArraySamplerNode; +struct ArraySamplerNode; +struct BindlessSamplerNode; +struct SeparateSamplerNode; -using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>; +using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>; using TrackSampler = std::shared_ptr<TrackSamplerData>; struct Sampler { @@ -288,63 +289,51 @@ struct Sampler { : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_indexed{is_indexed} {} + /// Separate sampler constructor + constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers, + Tegra::Shader::TextureType type, bool is_array, bool is_shadow, + bool is_buffer) + : index{index}, offset{offsets.first}, secondary_offset{offsets.second}, + buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array}, + is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {} + /// Bindless samplers constructor constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} - u32 index = 0; ///< Emulated index given for the this sampler. - u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. - u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers). - u32 size = 1; ///< Size of the sampler. + u32 index = 0; ///< Emulated index given for the this sampler. + u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. + u32 secondary_offset = 0; ///< Secondary offset in the const buffer. + u32 buffer = 0; ///< Buffer where the bindless sampler is read. + u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read. + u32 size = 1; ///< Size of the sampler. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) - bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. - bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. - bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. - bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. - bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_array = false; ///< Whether the texture is being sampled as an array texture or not. + bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not. + bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler. + bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not. + bool is_indexed = false; ///< Whether this sampler is an indexed array of textures. + bool is_separated = false; ///< Whether the image and sampler is separated or not. }; /// Represents a tracked bindless sampler into a direct const buffer -class ArraySamplerNode final { -public: - explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var) - : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetBaseOffset() const { - return base_offset; - } - - constexpr u32 GetIndexVar() const { - return bindless_var; - } - -private: +struct ArraySamplerNode { u32 index; u32 base_offset; u32 bindless_var; }; -/// Represents a tracked bindless sampler into a direct const buffer -class BindlessSamplerNode final { -public: - explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {} - - constexpr u32 GetIndex() const { - return index; - } - - constexpr u32 GetOffset() const { - return offset; - } +/// Represents a tracked separate sampler image pair that was folded statically +struct SeparateSamplerNode { + std::pair<u32, u32> indices; + std::pair<u32, u32> offsets; +}; -private: +/// Represents a tracked bindless sampler into a direct const buffer +struct BindlessSamplerNode { u32 index; u32 offset; }; diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h index 11231bbea..1e0886185 100644 --- a/src/video_core/shader/node_helper.h +++ b/src/video_core/shader/node_helper.h @@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) { template <typename T, typename... Args> TrackSampler MakeTrackSampler(Args&&... args) { static_assert(std::is_convertible_v<T, TrackSamplerData>); - return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...)); + return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...}); } template <typename... Args> diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp index af70b3f35..cdf274e54 100644 --- a/src/video_core/shader/registry.cpp +++ b/src/video_core/shader/registry.cpp @@ -93,6 +93,26 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) { return value; } +std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler( + std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) { + SeparateSamplerKey key; + key.buffers = buffers; + key.offsets = offsets; + const auto iter = separate_samplers.find(key); + if (iter != separate_samplers.end()) { + return iter->second; + } + if (!engine) { + return std::nullopt; + } + + const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first); + const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second); + const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2); + separate_samplers.emplace(key, value); + return value; +} + std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, u32 offset) { const std::pair key = {buffer, offset}; diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h index 0c80d35fd..231206765 100644 --- a/src/video_core/shader/registry.h +++ b/src/video_core/shader/registry.h @@ -19,8 +19,39 @@ namespace VideoCommon::Shader { +struct SeparateSamplerKey { + std::pair<u32, u32> buffers; + std::pair<u32, u32> offsets; +}; + +} // namespace VideoCommon::Shader + +namespace std { + +template <> +struct hash<VideoCommon::Shader::SeparateSamplerKey> { + std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept { + return std::hash<u32>{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^ + key.offsets.second); + } +}; + +template <> +struct equal_to<VideoCommon::Shader::SeparateSamplerKey> { + bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs, + const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept { + return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets; + } +}; + +} // namespace std + +namespace VideoCommon::Shader { + using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>; using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>; +using SeparateSamplerMap = + std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>; using BindlessSamplerMap = std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>; @@ -73,6 +104,9 @@ public: std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset); + std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler( + std::pair<u32, u32> buffers, std::pair<u32, u32> offsets); + std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset); /// Inserts a key. @@ -128,6 +162,7 @@ private: Tegra::Engines::ConstBufferEngineInterface* engine = nullptr; KeyMap keys; BoundSamplerMap bound_samplers; + SeparateSamplerMap separate_samplers; BindlessSamplerMap bindless_samplers; u32 bound_buffer; GraphicsInfo graphics_info; diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 15ae152f2..3a98b2104 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -330,8 +330,8 @@ private: OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); /// Queries the missing sampler info from the execution context. - SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset, - std::optional<u32> buffer = std::nullopt); + SamplerInfo GetSamplerInfo(SamplerInfo info, + std::optional<Tegra::Engines::SamplerDescriptor> sampler); /// Accesses a texture sampler. std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); @@ -409,8 +409,14 @@ private: std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; - std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor); + std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor); + + std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf, + const OperationNode& operation, + Node gpr, Node base_offset, + Node tracked, const NodeBlock& code, + s64 cursor); std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index eb97bfd41..d5ed81442 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -14,6 +14,7 @@ namespace VideoCommon::Shader { namespace { + std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, OperationCode operation_code) { for (; cursor >= 0; --cursor) { @@ -63,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { if (const auto operation = std::get_if<OperationNode>(&*node)) { operation->SetAmendIndex(amend_index); return true; - } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { + } + if (const auto conditional = std::get_if<ConditionalNode>(&*node)) { conditional->SetAmendIndex(amend_index); return true; } @@ -72,40 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) { } // Anonymous namespace -std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, - s64 cursor) { +std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code, + s64 cursor) { if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { + const u32 cbuf_index = cbuf->GetIndex(); + // Constant buffer found, test if it's an immediate const auto& offset = cbuf->GetOffset(); if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { - auto track = - MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue()); + auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue()); return {tracked, track}; } if (const auto operation = std::get_if<OperationNode>(&*offset)) { const u32 bound_buffer = registry.GetBoundBuffer(); - if (bound_buffer != cbuf->GetIndex()) { + if (bound_buffer != cbuf_index) { return {}; } - const auto pair = DecoupleIndirectRead(*operation); - if (!pair) { - return {}; + if (const std::optional pair = DecoupleIndirectRead(*operation)) { + auto [gpr, base_offset] = *pair; + return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked, + code, cursor); } - auto [gpr, base_offset] = *pair; - const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset); - const auto& gpu_driver = registry.AccessGuestDriverProfile(); - const u32 bindless_cv = NewCustomVariable(); - Node op = - Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize())); - - const Node cv_node = GetCustomVariable(bindless_cv); - Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op)); - const std::size_t amend_index = DeclareAmend(std::move(amend_op)); - AmendNodeCv(amend_index, code[cursor]); - // TODO Implement Bindless Index custom variable - auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(), - offset_inm->GetValue(), bindless_cv); - return {tracked, track}; } return {}; } @@ -122,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons return TrackBindlessSampler(source, code, new_cursor); } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { - for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { - if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor); - std::get<0>(found)) { - // Cbuf found in operand. + const OperationNode& op = *operation; + + const OperationCode opcode = operation->GetCode(); + if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) { + ASSERT(op.GetOperandsCount() == 2); + auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor); + auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor); + if (node_a && node_b) { + auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b}, + std::pair{offset_a, offset_b}); + return {tracked, std::move(track)}; + } + } + std::size_t i = op.GetOperandsCount(); + while (i--) { + if (auto found = TrackBindlessSampler(op[i - 1], code, cursor); std::get<0>(found)) { + // Constant buffer found in operand. return found; } } @@ -139,6 +141,26 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons return {}; } +std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead( + const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked, + const NodeBlock& code, s64 cursor) { + const auto offset_imm = std::get<ImmediateNode>(*base_offset); + const auto& gpu_driver = registry.AccessGuestDriverProfile(); + const u32 bindless_cv = NewCustomVariable(); + const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize(); + Node op = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size)); + + Node cv_node = GetCustomVariable(bindless_cv); + Node amend_op = Operation(OperationCode::Assign, std::move(cv_node), std::move(op)); + const std::size_t amend_index = DeclareAmend(std::move(amend_op)); + AmendNodeCv(amend_index, code[cursor]); + + // TODO: Implement bindless index custom variable + auto track = + MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv); + return {tracked, track}; +} + std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const { if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) { diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h new file mode 100644 index 000000000..a23c23886 --- /dev/null +++ b/src/video_core/shader_cache.h @@ -0,0 +1,228 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <memory> +#include <mutex> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <class T> +class ShaderCache { + static constexpr u64 PAGE_SHIFT = 14; + + struct Entry { + VAddr addr_start; + VAddr addr_end; + T* data; + + bool is_memory_marked = true; + + constexpr bool Overlaps(VAddr start, VAddr end) const noexcept { + return start < addr_end && addr_start < end; + } + }; + +public: + virtual ~ShaderCache() = default; + + /// @brief Removes shaders inside a given region + /// @note Checks for ranges + /// @param addr Start address of the invalidation + /// @param size Number of bytes of the invalidation + void InvalidateRegion(VAddr addr, std::size_t size) { + std::scoped_lock lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + RemovePendingShaders(); + } + + /// @brief Unmarks a memory region as cached and marks it for removal + /// @param addr Start address of the CPU write operation + /// @param size Number of bytes of the CPU write operation + void OnCPUWrite(VAddr addr, std::size_t size) { + std::lock_guard lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + } + + /// @brief Flushes delayed removal operations + void SyncGuestHost() { + std::scoped_lock lock{invalidation_mutex}; + RemovePendingShaders(); + } + + /// @brief Tries to obtain a cached shader starting in a given address + /// @note Doesn't check for ranges, the given address has to be the start of the shader + /// @param addr Start address of the shader, this doesn't cache for region + /// @return Pointer to a valid shader, nullptr when nothing is found + T* TryGet(VAddr addr) const { + std::scoped_lock lock{lookup_mutex}; + + const auto it = lookup_cache.find(addr); + if (it == lookup_cache.end()) { + return nullptr; + } + return it->second->data; + } + +protected: + explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {} + + /// @brief Register in the cache a given entry + /// @param data Shader to store in the cache + /// @param addr Start address of the shader that will be registered + /// @param size Size in bytes of the shader + void Register(std::unique_ptr<T> data, VAddr addr, std::size_t size) { + std::scoped_lock lock{invalidation_mutex, lookup_mutex}; + + const VAddr addr_end = addr + size; + Entry* const entry = NewEntry(addr, addr_end, data.get()); + + const u64 page_end = addr_end >> PAGE_SHIFT; + for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) { + invalidation_cache[page].push_back(entry); + } + + storage.push_back(std::move(data)); + + rasterizer.UpdatePagesCachedCount(addr, size, 1); + } + + /// @brief Called when a shader is going to be removed + /// @param shader Shader that will be removed + /// @pre invalidation_cache is locked + /// @pre lookup_mutex is locked + virtual void OnShaderRemoval([[maybe_unused]] T* shader) {} + +private: + /// @brief Invalidate pages in a given region + /// @pre invalidation_mutex is locked + void InvalidatePagesInRegion(VAddr addr, std::size_t size) { + const VAddr addr_end = addr + size; + const u64 page_end = addr_end >> PAGE_SHIFT; + for (u64 page = addr >> PAGE_SHIFT; page <= page_end; ++page) { + const auto it = invalidation_cache.find(page); + if (it == invalidation_cache.end()) { + continue; + } + + std::vector<Entry*>& entries = it->second; + InvalidatePageEntries(entries, addr, addr_end); + + // If there's nothing else in this page, remove it to avoid overpopulating the hash map. + if (entries.empty()) { + invalidation_cache.erase(it); + } + } + } + + /// @brief Remove shaders marked for deletion + /// @pre invalidation_mutex is locked + void RemovePendingShaders() { + if (marked_for_removal.empty()) { + return; + } + std::scoped_lock lock{lookup_mutex}; + + std::vector<T*> removed_shaders; + removed_shaders.reserve(marked_for_removal.size()); + + for (Entry* const entry : marked_for_removal) { + if (lookup_cache.erase(entry->addr_start) > 0) { + removed_shaders.push_back(entry->data); + } + } + marked_for_removal.clear(); + + if (!removed_shaders.empty()) { + RemoveShadersFromStorage(std::move(removed_shaders)); + } + } + + /// @brief Invalidates entries in a given range for the passed page + /// @param entries Vector of entries in the page, it will be modified on overlaps + /// @param addr Start address of the invalidation + /// @param addr_end Non-inclusive end address of the invalidation + /// @pre invalidation_mutex is locked + void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) { + auto it = entries.begin(); + while (it != entries.end()) { + Entry* const entry = *it; + if (!entry->Overlaps(addr, addr_end)) { + ++it; + continue; + } + UnmarkMemory(entry); + marked_for_removal.push_back(entry); + + it = entries.erase(it); + } + } + + /// @brief Unmarks an entry from the rasterizer cache + /// @param entry Entry to unmark from memory + void UnmarkMemory(Entry* entry) { + if (!entry->is_memory_marked) { + return; + } + entry->is_memory_marked = false; + + const VAddr addr = entry->addr_start; + const std::size_t size = entry->addr_end - addr; + rasterizer.UpdatePagesCachedCount(addr, size, -1); + } + + /// @brief Removes a vector of shaders from a list + /// @param removed_shaders Shaders to be removed from the storage, it can contain duplicates + /// @pre invalidation_mutex is locked + /// @pre lookup_mutex is locked + void RemoveShadersFromStorage(std::vector<T*> removed_shaders) { + // Remove duplicates + std::sort(removed_shaders.begin(), removed_shaders.end()); + removed_shaders.erase(std::unique(removed_shaders.begin(), removed_shaders.end()), + removed_shaders.end()); + + // Now that there are no duplicates, we can notify removals + for (T* const shader : removed_shaders) { + OnShaderRemoval(shader); + } + + // Remove them from the cache + const auto is_removed = [&removed_shaders](std::unique_ptr<T>& shader) { + return std::find(removed_shaders.begin(), removed_shaders.end(), shader.get()) != + removed_shaders.end(); + }; + storage.erase(std::remove_if(storage.begin(), storage.end(), is_removed), storage.end()); + } + + /// @brief Creates a new entry in the lookup cache and returns its pointer + /// @pre lookup_mutex is locked + Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) { + auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data}); + Entry* const entry_pointer = entry.get(); + + lookup_cache.emplace(addr, std::move(entry)); + return entry_pointer; + } + + VideoCore::RasterizerInterface& rasterizer; + + mutable std::mutex lookup_mutex; + std::mutex invalidation_mutex; + + std::unordered_map<u64, std::unique_ptr<Entry>> lookup_cache; + std::unordered_map<u64, std::vector<Entry*>> invalidation_cache; + std::vector<std::unique_ptr<T>> storage; + std::vector<Entry*> marked_for_removal; +}; + +} // namespace VideoCommon |