From 76ca2a5f82f4df64cb839af42c93acb6705411ae Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 2 Nov 2019 04:08:31 -0300 Subject: gl_rasterizer: Upload constant buffers with glNamedBufferSubData Nvidia's OpenGL driver maps gl(Named)BufferSubData with some requirements to a fast. This path has an extra memcpy but updates the buffer without orphaning or waiting for previous calls. It can be seen as a better model for "push constants" that can upload a whole UBO instead of 256 bytes. This path has some requirements established here: http://on-demand.gputechconf.com/gtc/2014/presentations/S4379-opengl-44-scene-rendering-techniques.pdf#page=24 Instead of using the stream buffer, this commits moves constant buffers uploads to calls of glNamedBufferSubData and from my testing it brings a performance improvement. This is disabled when the vendor is not Nvidia since it brings performance regressions. --- src/video_core/renderer_opengl/gl_rasterizer.cpp | 29 ++++++++++++++++-------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'src/video_core/renderer_opengl/gl_rasterizer.cpp') diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 6a4d2c83a..28fa8a8be 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -67,7 +67,7 @@ static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buf RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, ScreenInfo& info) : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device}, - system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} { + system{system}, screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { shader_program_manager = std::make_unique(); state.draw.shader_program = 0; state.Apply(); @@ -558,6 +558,8 @@ void RasterizerOpenGL::DrawPrelude() { SyncPolygonOffset(); SyncAlphaTest(); + buffer_cache.Acquire(); + // Draw the vertex batch const bool is_indexed = accelerate_draw == AccelDraw::Indexed; @@ -573,9 +575,11 @@ void RasterizerOpenGL::DrawPrelude() { (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; - // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + if (!device.HasFastBufferSubData()) { + // Add space for at least 18 constant buffers + buffer_size += Maxwell::MaxConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + } // Prepare the vertex array. buffer_cache.Map(buffer_size); @@ -739,10 +743,12 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { state.draw.shader_program = program; state.draw.program_pipeline = 0; - const std::size_t buffer_size = - Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - buffer_cache.Map(buffer_size); + if (!device.HasFastBufferSubData()) { + const std::size_t buffer_size = + Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + buffer_cache.Map(buffer_size); + } bind_ubo_pushbuffer.Setup(0); bind_ssbo_pushbuffer.Setup(0); @@ -750,7 +756,9 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { SetupComputeConstBuffers(kernel); SetupComputeGlobalMemory(kernel); - buffer_cache.Unmap(); + if (!device.HasFastBufferSubData()) { + buffer_cache.Unmap(); + } bind_ubo_pushbuffer.Bind(); bind_ssbo_pushbuffer.Bind(); @@ -879,7 +887,8 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); const auto alignment = device.GetUniformBufferAlignment(); - const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment); + const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, + device.HasFastBufferSubData()); bind_ubo_pushbuffer.Push(cbuf, offset, size); } -- cgit v1.2.3 From 442a1cc0211131cb237b5291fd49dbd2f37399e9 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 2 Nov 2019 13:19:07 -0300 Subject: gl_rasterizer: Re-enable stream buffer memory due to global memory Global memory is still using the stream buffer when it shouldn't. As a temporary fix re-enable the stream buffer on compute. --- src/video_core/renderer_opengl/gl_rasterizer.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'src/video_core/renderer_opengl/gl_rasterizer.cpp') diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 28fa8a8be..e9abd901e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -575,11 +575,9 @@ void RasterizerOpenGL::DrawPrelude() { (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; - if (!device.HasFastBufferSubData()) { - // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - } + // Add space for at least 18 constant buffers + buffer_size += Maxwell::MaxConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); // Prepare the vertex array. buffer_cache.Map(buffer_size); @@ -743,12 +741,10 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { state.draw.shader_program = program; state.draw.program_pipeline = 0; - if (!device.HasFastBufferSubData()) { - const std::size_t buffer_size = - Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - buffer_cache.Map(buffer_size); - } + const std::size_t buffer_size = + Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + buffer_cache.Map(buffer_size); bind_ubo_pushbuffer.Setup(0); bind_ssbo_pushbuffer.Setup(0); @@ -756,9 +752,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { SetupComputeConstBuffers(kernel); SetupComputeGlobalMemory(kernel); - if (!device.HasFastBufferSubData()) { - buffer_cache.Unmap(); - } + buffer_cache.Unmap(); bind_ubo_pushbuffer.Bind(); bind_ssbo_pushbuffer.Bind(); -- cgit v1.2.3