From 668e80a9f42fb4ce0e16f6381d05bcbd286b2da1 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 30 Jan 2022 10:31:13 +0100 Subject: VideoCore: Refactor syncing. --- src/core/core.cpp | 12 + src/core/core.h | 9 + .../hle/service/nvdrv/devices/nvdisp_disp0.cpp | 2 +- src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp | 19 +- src/core/hle/service/nvdrv/devices/nvhost_ctrl.h | 4 + src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp | 10 +- src/core/hle/service/nvflinger/nvflinger.cpp | 9 +- src/video_core/CMakeLists.txt | 40 +- src/video_core/cdma_pusher.cpp | 25 +- src/video_core/cdma_pusher.h | 15 +- src/video_core/command_classes/codecs/codec.cpp | 310 ------- src/video_core/command_classes/codecs/codec.h | 81 -- src/video_core/command_classes/codecs/h264.cpp | 277 ------ src/video_core/command_classes/codecs/h264.h | 173 ---- src/video_core/command_classes/codecs/vp8.cpp | 53 -- src/video_core/command_classes/codecs/vp8.h | 73 -- src/video_core/command_classes/codecs/vp9.cpp | 946 --------------------- src/video_core/command_classes/codecs/vp9.h | 192 ----- src/video_core/command_classes/codecs/vp9_types.h | 306 ------- src/video_core/command_classes/host1x.h | 34 - src/video_core/command_classes/nvdec.cpp | 47 - src/video_core/command_classes/nvdec.h | 33 - src/video_core/command_classes/nvdec_common.h | 97 --- src/video_core/command_classes/sync_manager.cpp | 43 - src/video_core/command_classes/sync_manager.h | 47 - src/video_core/command_classes/vic.cpp | 238 ------ src/video_core/command_classes/vic.h | 61 -- src/video_core/control/channel_state.cpp | 2 +- src/video_core/control/channel_state.h | 2 +- src/video_core/control/channel_state_cache.h | 4 + src/video_core/control/scheduler.cpp | 2 +- src/video_core/control/scheduler.h | 2 +- src/video_core/dma_pusher.h | 26 +- src/video_core/engines/puller.cpp | 65 +- src/video_core/engines/puller.h | 1 - src/video_core/fence_manager.h | 12 +- src/video_core/gpu.cpp | 197 ++--- src/video_core/gpu.h | 19 +- src/video_core/gpu_thread.cpp | 6 +- src/video_core/gpu_thread.h | 2 + src/video_core/host1x/codecs/codec.cpp | 310 +++++++ src/video_core/host1x/codecs/codec.h | 81 ++ src/video_core/host1x/codecs/h264.cpp | 277 ++++++ src/video_core/host1x/codecs/h264.h | 173 ++++ src/video_core/host1x/codecs/vp8.cpp | 53 ++ src/video_core/host1x/codecs/vp8.h | 74 ++ src/video_core/host1x/codecs/vp9.cpp | 946 +++++++++++++++++++++ src/video_core/host1x/codecs/vp9.h | 194 +++++ src/video_core/host1x/codecs/vp9_types.h | 306 +++++++ src/video_core/host1x/control.cpp | 35 + src/video_core/host1x/control.h | 41 + src/video_core/host1x/host1x.h | 33 + src/video_core/host1x/nvdec.cpp | 47 + src/video_core/host1x/nvdec.h | 38 + src/video_core/host1x/nvdec_common.h | 97 +++ src/video_core/host1x/sync_manager.cpp | 51 ++ src/video_core/host1x/sync_manager.h | 53 ++ src/video_core/host1x/syncpoint_manager.cpp | 93 ++ src/video_core/host1x/syncpoint_manager.h | 99 +++ src/video_core/host1x/vic.cpp | 243 ++++++ src/video_core/host1x/vic.h | 66 ++ 61 files changed, 3601 insertions(+), 3205 deletions(-) delete mode 100644 src/video_core/command_classes/codecs/codec.cpp delete mode 100644 src/video_core/command_classes/codecs/codec.h delete mode 100644 src/video_core/command_classes/codecs/h264.cpp delete mode 100644 src/video_core/command_classes/codecs/h264.h delete mode 100644 src/video_core/command_classes/codecs/vp8.cpp delete mode 100644 src/video_core/command_classes/codecs/vp8.h delete mode 100644 src/video_core/command_classes/codecs/vp9.cpp delete mode 100644 src/video_core/command_classes/codecs/vp9.h delete mode 100644 src/video_core/command_classes/codecs/vp9_types.h delete mode 100644 src/video_core/command_classes/host1x.h delete mode 100644 src/video_core/command_classes/nvdec.cpp delete mode 100644 src/video_core/command_classes/nvdec.h delete mode 100644 src/video_core/command_classes/nvdec_common.h delete mode 100644 src/video_core/command_classes/sync_manager.cpp delete mode 100644 src/video_core/command_classes/sync_manager.h delete mode 100644 src/video_core/command_classes/vic.cpp delete mode 100644 src/video_core/command_classes/vic.h create mode 100644 src/video_core/host1x/codecs/codec.cpp create mode 100644 src/video_core/host1x/codecs/codec.h create mode 100644 src/video_core/host1x/codecs/h264.cpp create mode 100644 src/video_core/host1x/codecs/h264.h create mode 100644 src/video_core/host1x/codecs/vp8.cpp create mode 100644 src/video_core/host1x/codecs/vp8.h create mode 100644 src/video_core/host1x/codecs/vp9.cpp create mode 100644 src/video_core/host1x/codecs/vp9.h create mode 100644 src/video_core/host1x/codecs/vp9_types.h create mode 100644 src/video_core/host1x/control.cpp create mode 100644 src/video_core/host1x/control.h create mode 100644 src/video_core/host1x/host1x.h create mode 100644 src/video_core/host1x/nvdec.cpp create mode 100644 src/video_core/host1x/nvdec.h create mode 100644 src/video_core/host1x/nvdec_common.h create mode 100644 src/video_core/host1x/sync_manager.cpp create mode 100644 src/video_core/host1x/sync_manager.h create mode 100644 src/video_core/host1x/syncpoint_manager.cpp create mode 100644 src/video_core/host1x/syncpoint_manager.h create mode 100644 src/video_core/host1x/vic.cpp create mode 100644 src/video_core/host1x/vic.h (limited to 'src') diff --git a/src/core/core.cpp b/src/core/core.cpp index 121092868..fa059a394 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -51,6 +51,7 @@ #include "core/telemetry_session.h" #include "core/tools/freezer.h" #include "network/network.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" #include "video_core/video_core.h" @@ -215,6 +216,7 @@ struct System::Impl { telemetry_session = std::make_unique(); + host1x_core = std::make_unique(); gpu_core = VideoCore::CreateGPU(emu_window, system); if (!gpu_core) { return SystemResultStatus::ErrorVideoCore; @@ -373,6 +375,7 @@ struct System::Impl { app_loader.reset(); audio_core.reset(); gpu_core.reset(); + host1x_core.reset(); perf_stats.reset(); kernel.Shutdown(); memory.Reset(); @@ -450,6 +453,7 @@ struct System::Impl { /// AppLoader used to load the current executing application std::unique_ptr app_loader; std::unique_ptr gpu_core; + std::unique_ptr host1x_core; std::unique_ptr interrupt_manager; std::unique_ptr device_memory; std::unique_ptr audio_core; @@ -668,6 +672,14 @@ const Tegra::GPU& System::GPU() const { return *impl->gpu_core; } +Tegra::Host1x::Host1x& System::Host1x() { + return *impl->host1x_core; +} + +const Tegra::Host1x::Host1x& System::Host1x() const { + return *impl->host1x_core; +} + Core::Hardware::InterruptManager& System::InterruptManager() { return *impl->interrupt_manager; } diff --git a/src/core/core.h b/src/core/core.h index 0ce3b1d60..e4168a921 100644 --- a/src/core/core.h +++ b/src/core/core.h @@ -74,6 +74,9 @@ class TimeManager; namespace Tegra { class DebugContext; class GPU; +namespace Host1x { +class Host1x; +} // namespace Host1x } // namespace Tegra namespace VideoCore { @@ -260,6 +263,12 @@ public: /// Gets an immutable reference to the GPU interface. [[nodiscard]] const Tegra::GPU& GPU() const; + /// Gets a mutable reference to the Host1x interface + [[nodiscard]] Tegra::Host1x::Host1x& Host1x(); + + /// Gets an immutable reference to the Host1x interface. + [[nodiscard]] const Tegra::Host1x::Host1x& Host1x() const; + /// Gets a mutable reference to the renderer. [[nodiscard]] VideoCore::RendererBase& Renderer(); diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp index b1c0e9eb2..e6a976714 100644 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp @@ -50,7 +50,7 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, android::PixelFormat form stride, format, transform, crop_rect}; system.GetPerfStats().EndSystemFrame(); - system.GPU().SwapBuffers(&framebuffer); + system.GPU().RequestSwapBuffers(&framebuffer, nullptr, 0); system.SpeedLimiter().DoSpeedLimiting(system.CoreTiming().GetGlobalTimeUs()); system.GetPerfStats().BeginSystemFrame(); } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp index 54074af75..ffe42d423 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp @@ -18,6 +18,7 @@ #include "core/hle/service/nvdrv/core/syncpoint_manager.h" #include "core/hle/service/nvdrv/devices/nvhost_ctrl.h" #include "video_core/gpu.h" +#include "video_core/host1x/host1x.h" namespace Service::Nvidia::Devices { @@ -129,7 +130,7 @@ NvResult nvhost_ctrl::IocCtrlEventWait(const std::vector& input, std::vector return NvResult::Success; } - auto& gpu = system.GPU(); + auto& host1x_syncpoint_manager = system.Host1x().GetSyncpointManager(); const u32 target_value = params.fence.value; auto lock = NvEventsLock(); @@ -149,7 +150,7 @@ NvResult nvhost_ctrl::IocCtrlEventWait(const std::vector& input, std::vector if (events[slot].fails > 2) { { auto lk = system.StallProcesses(); - gpu.WaitFence(fence_id, target_value); + host1x_syncpoint_manager.WaitHost(fence_id, target_value); system.UnstallProcesses(); } params.value.raw = target_value; @@ -198,7 +199,15 @@ NvResult nvhost_ctrl::IocCtrlEventWait(const std::vector& input, std::vector } params.value.raw |= slot; - gpu.RegisterSyncptInterrupt(fence_id, target_value); + event.wait_handle = + host1x_syncpoint_manager.RegisterHostAction(fence_id, target_value, [this, slot]() { + auto& event = events[slot]; + if (event.status.exchange(EventState::Signalling, std::memory_order_acq_rel) == + EventState::Waiting) { + event.kevent->GetWritableEvent().Signal(); + } + event.status.store(EventState::Signalled, std::memory_order_release); + }); return NvResult::Timeout; } @@ -288,8 +297,10 @@ NvResult nvhost_ctrl::IocCtrlClearEventWait(const std::vector& input, std::v auto& event = events[event_id]; if (event.status.exchange(EventState::Cancelling, std::memory_order_acq_rel) == EventState::Waiting) { - system.GPU().CancelSyncptInterrupt(event.assigned_syncpt, event.assigned_value); + auto& host1x_syncpoint_manager = system.Host1x().GetSyncpointManager(); + host1x_syncpoint_manager.DeregisterHostAction(event.assigned_syncpt, event.wait_handle); syncpoint_manager.RefreshSyncpoint(event.assigned_syncpt); + event.wait_handle = {}; } event.fails++; event.status.store(EventState::Cancelled, std::memory_order_release); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h index d56aea405..136a1e925 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h @@ -11,6 +11,7 @@ #include "common/common_types.h" #include "core/hle/service/nvdrv/devices/nvdevice.h" #include "core/hle/service/nvdrv/nvdrv.h" +#include "video_core/host1x/syncpoint_manager.h" namespace Service::Nvidia::NvCore { class Container; @@ -78,6 +79,9 @@ private: // Tells if an NVEvent is registered or not bool registered{}; + // Used for waiting on a syncpoint & canceling it. + Tegra::Host1x::SyncpointManager::ActionHandle wait_handle{}; + bool IsBeingUsed() { const auto current_status = status.load(std::memory_order_acquire); return current_status == EventState::Waiting || diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 38d45cb79..db3e266ad 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -210,10 +210,10 @@ NvResult nvhost_gpu::AllocateObjectContext(const std::vector& input, std::ve static std::vector BuildWaitCommandList(NvFence fence) { return { - Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceValue, 1, + Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointPayload, 1, Tegra::SubmissionMode::Increasing), {fence.value}, - Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceAction, 1, + Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointOperation, 1, Tegra::SubmissionMode::Increasing), BuildFenceAction(Tegra::Engines::Puller::FenceOperation::Acquire, fence.id), }; @@ -222,12 +222,12 @@ static std::vector BuildWaitCommandList(NvFence fence) { static std::vector BuildIncrementCommandList(NvFence fence, u32 add_increment) { std::vector result{ - Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceValue, 1, + Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointPayload, 1, Tegra::SubmissionMode::Increasing), {}}; for (u32 count = 0; count < add_increment; ++count) { - result.emplace_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceAction, 1, + result.emplace_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointOperation, 1, Tegra::SubmissionMode::Increasing)); result.emplace_back( BuildFenceAction(Tegra::Engines::Puller::FenceOperation::Increment, fence.id)); @@ -239,7 +239,7 @@ static std::vector BuildIncrementCommandList(NvFence fence static std::vector BuildIncrementWithWfiCommandList(NvFence fence, u32 add_increment) { std::vector result{ - Tegra::BuildCommandHeader(Tegra::BufferMethods::WaitForInterrupt, 1, + Tegra::BuildCommandHeader(Tegra::BufferMethods::WaitForIdle, 1, Tegra::SubmissionMode::Increasing), {}}; const std::vector increment{ diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp index 8c3013f83..aa112021d 100644 --- a/src/core/hle/service/nvflinger/nvflinger.cpp +++ b/src/core/hle/service/nvflinger/nvflinger.cpp @@ -24,6 +24,8 @@ #include "core/hle/service/vi/layer/vi_layer.h" #include "core/hle/service/vi/vi_results.h" #include "video_core/gpu.h" +#include "video_core/host1x/host1x.h" +#include "video_core/host1x/syncpoint_manager.h" namespace Service::NVFlinger { @@ -267,12 +269,12 @@ void NVFlinger::Compose() { return; // We are likely shutting down } - auto& gpu = system.GPU(); + auto& syncpoint_manager = system.Host1x().GetSyncpointManager(); const auto& multi_fence = buffer.fence; guard->unlock(); for (u32 fence_id = 0; fence_id < multi_fence.num_fences; fence_id++) { const auto& fence = multi_fence.fences[fence_id]; - gpu.WaitFence(fence.id, fence.value); + syncpoint_manager.WaitGuest(fence.id, fence.value); } guard->lock(); @@ -284,6 +286,7 @@ void NVFlinger::Compose() { auto nvdisp = nvdrv->GetDevice(disp_fd); ASSERT(nvdisp); + guard->unlock(); Common::Rectangle crop_rect{ static_cast(buffer.crop.Left()), static_cast(buffer.crop.Top()), static_cast(buffer.crop.Right()), static_cast(buffer.crop.Bottom())}; @@ -292,6 +295,8 @@ void NVFlinger::Compose() { igbp_buffer.Width(), igbp_buffer.Height(), igbp_buffer.Stride(), static_cast(buffer.transform), crop_rect); + guard->lock(); + swap_interval = buffer.swap_interval; auto fence = android::Fence::NoFence(); diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 35faa70a0..723f9b67c 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -4,7 +4,7 @@ add_subdirectory(host_shaders) if(LIBVA_FOUND) - set_source_files_properties(command_classes/codecs/codec.cpp + set_source_files_properties(host1x/codecs/codec.cpp PROPERTIES COMPILE_DEFINITIONS LIBVA_FOUND=1) list(APPEND FFmpeg_LIBRARIES ${LIBVA_LIBRARIES}) endif() @@ -15,24 +15,6 @@ add_library(video_core STATIC buffer_cache/buffer_cache.h cdma_pusher.cpp cdma_pusher.h - command_classes/codecs/codec.cpp - command_classes/codecs/codec.h - command_classes/codecs/h264.cpp - command_classes/codecs/h264.h - command_classes/codecs/vp8.cpp - command_classes/codecs/vp8.h - command_classes/codecs/vp9.cpp - command_classes/codecs/vp9.h - command_classes/codecs/vp9_types.h - command_classes/host1x.cpp - command_classes/host1x.h - command_classes/nvdec.cpp - command_classes/nvdec.h - command_classes/nvdec_common.h - command_classes/sync_manager.cpp - command_classes/sync_manager.h - command_classes/vic.cpp - command_classes/vic.h compatible_formats.cpp compatible_formats.h control/channel_state.cpp @@ -63,6 +45,26 @@ add_library(video_core STATIC engines/puller.cpp engines/puller.h framebuffer_config.h + host1x/codecs/codec.cpp + host1x/codecs/codec.h + host1x/codecs/h264.cpp + host1x/codecs/h264.h + host1x/codecs/vp8.cpp + host1x/codecs/vp8.h + host1x/codecs/vp9.cpp + host1x/codecs/vp9.h + host1x/codecs/vp9_types.h + host1x/control.cpp + host1x/control.h + host1x/nvdec.cpp + host1x/nvdec.h + host1x/nvdec_common.h + host1x/sync_manager.cpp + host1x/sync_manager.h + host1x/syncpoint_manager.cpp + host1x/syncpoint_manager.h + host1x/vic.cpp + host1x/vic.h macro/macro.cpp macro/macro.h macro/macro_hle.cpp diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index 8e890a85e..148126347 100644 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp @@ -2,20 +2,22 @@ // SPDX-License-Identifier: MIT #include -#include "command_classes/host1x.h" -#include "command_classes/nvdec.h" -#include "command_classes/vic.h" #include "video_core/cdma_pusher.h" -#include "video_core/command_classes/sync_manager.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" +#include "video_core/host1x/control.h" +#include "video_core/host1x/nvdec.h" +#include "video_core/host1x/nvdec_common.h" +#include "video_core/host1x/sync_manager.h" +#include "video_core/host1x/vic.h" +#include "video_core/memory_manager.h" namespace Tegra { CDmaPusher::CDmaPusher(GPU& gpu_) - : gpu{gpu_}, nvdec_processor(std::make_shared(gpu)), - vic_processor(std::make_unique(gpu, nvdec_processor)), - host1x_processor(std::make_unique(gpu)), - sync_manager(std::make_unique(gpu)) {} + : gpu{gpu_}, nvdec_processor(std::make_shared(gpu)), + vic_processor(std::make_unique(gpu, nvdec_processor)), + host1x_processor(std::make_unique(gpu)), + sync_manager(std::make_unique(gpu)) {} CDmaPusher::~CDmaPusher() = default; @@ -109,16 +111,17 @@ void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { case ThiMethod::SetMethod1: LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", static_cast(vic_thi_state.method_0), data); - vic_processor->ProcessMethod(static_cast(vic_thi_state.method_0), data); + vic_processor->ProcessMethod(static_cast(vic_thi_state.method_0), + data); break; default: break; } break; - case ChClassId::Host1x: + case ChClassId::Control: // This device is mainly for syncpoint synchronization LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); - host1x_processor->ProcessMethod(static_cast(offset), data); + host1x_processor->ProcessMethod(static_cast(offset), data); break; default: UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast(current_class)); diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index d6ffef95f..de17c2082 100644 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -13,10 +13,13 @@ namespace Tegra { class GPU; -class Host1x; + +namespace Host1x { +class Control; class Nvdec; class SyncptIncrManager; class Vic; +} // namespace Host1x enum class ChSubmissionMode : u32 { SetClass = 0, @@ -30,7 +33,7 @@ enum class ChSubmissionMode : u32 { enum class ChClassId : u32 { NoClass = 0x0, - Host1x = 0x1, + Control = 0x1, VideoEncodeMpeg = 0x20, VideoEncodeNvEnc = 0x21, VideoStreamingVi = 0x30, @@ -102,10 +105,10 @@ private: void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument); GPU& gpu; - std::shared_ptr nvdec_processor; - std::unique_ptr vic_processor; - std::unique_ptr host1x_processor; - std::unique_ptr sync_manager; + std::shared_ptr nvdec_processor; + std::unique_ptr vic_processor; + std::unique_ptr host1x_processor; + std::unique_ptr sync_manager; ChClassId current_class{}; ThiRegisters vic_thi_state{}; ThiRegisters nvdec_thi_state{}; diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp deleted file mode 100644 index a5eb97b7f..000000000 --- a/src/video_core/command_classes/codecs/codec.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include -#include "common/assert.h" -#include "common/settings.h" -#include "video_core/command_classes/codecs/codec.h" -#include "video_core/command_classes/codecs/h264.h" -#include "video_core/command_classes/codecs/vp8.h" -#include "video_core/command_classes/codecs/vp9.h" -#include "video_core/gpu.h" -#include "video_core/memory_manager.h" - -extern "C" { -#include -#ifdef LIBVA_FOUND -// for querying VAAPI driver information -#include -#endif -} - -namespace Tegra { -namespace { -constexpr AVPixelFormat PREFERRED_GPU_FMT = AV_PIX_FMT_NV12; -constexpr AVPixelFormat PREFERRED_CPU_FMT = AV_PIX_FMT_YUV420P; -constexpr std::array PREFERRED_GPU_DECODERS = { - AV_HWDEVICE_TYPE_CUDA, -#ifdef _WIN32 - AV_HWDEVICE_TYPE_D3D11VA, - AV_HWDEVICE_TYPE_DXVA2, -#elif defined(__unix__) - AV_HWDEVICE_TYPE_VAAPI, - AV_HWDEVICE_TYPE_VDPAU, -#endif - // last resort for Linux Flatpak (w/ NVIDIA) - AV_HWDEVICE_TYPE_VULKAN, -}; - -void AVPacketDeleter(AVPacket* ptr) { - av_packet_free(&ptr); -} - -using AVPacketPtr = std::unique_ptr; - -AVPixelFormat GetGpuFormat(AVCodecContext* av_codec_ctx, const AVPixelFormat* pix_fmts) { - for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) { - if (*p == av_codec_ctx->pix_fmt) { - return av_codec_ctx->pix_fmt; - } - } - LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU"); - av_buffer_unref(&av_codec_ctx->hw_device_ctx); - av_codec_ctx->pix_fmt = PREFERRED_CPU_FMT; - return PREFERRED_CPU_FMT; -} - -// List all the currently available hwcontext in ffmpeg -std::vector ListSupportedContexts() { - std::vector contexts{}; - AVHWDeviceType current_device_type = AV_HWDEVICE_TYPE_NONE; - do { - current_device_type = av_hwdevice_iterate_types(current_device_type); - contexts.push_back(current_device_type); - } while (current_device_type != AV_HWDEVICE_TYPE_NONE); - return contexts; -} - -} // namespace - -void AVFrameDeleter(AVFrame* ptr) { - av_frame_free(&ptr); -} - -Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs) - : gpu(gpu_), state{regs}, h264_decoder(std::make_unique(gpu)), - vp8_decoder(std::make_unique(gpu)), - vp9_decoder(std::make_unique(gpu)) {} - -Codec::~Codec() { - if (!initialized) { - return; - } - // Free libav memory - avcodec_free_context(&av_codec_ctx); - av_buffer_unref(&av_gpu_decoder); -} - -bool Codec::CreateGpuAvDevice() { - static constexpr auto HW_CONFIG_METHOD = AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX; - static const auto supported_contexts = ListSupportedContexts(); - for (const auto& type : PREFERRED_GPU_DECODERS) { - if (std::none_of(supported_contexts.begin(), supported_contexts.end(), - [&type](const auto& context) { return context == type; })) { - LOG_DEBUG(Service_NVDRV, "{} explicitly unsupported", av_hwdevice_get_type_name(type)); - continue; - } - // Avoid memory leak from not cleaning up after av_hwdevice_ctx_create - av_buffer_unref(&av_gpu_decoder); - const int hwdevice_res = av_hwdevice_ctx_create(&av_gpu_decoder, type, nullptr, nullptr, 0); - if (hwdevice_res < 0) { - LOG_DEBUG(Service_NVDRV, "{} av_hwdevice_ctx_create failed {}", - av_hwdevice_get_type_name(type), hwdevice_res); - continue; - } -#ifdef LIBVA_FOUND - if (type == AV_HWDEVICE_TYPE_VAAPI) { - // we need to determine if this is an impersonated VAAPI driver - AVHWDeviceContext* hwctx = - static_cast(static_cast(av_gpu_decoder->data)); - AVVAAPIDeviceContext* vactx = static_cast(hwctx->hwctx); - const char* vendor_name = vaQueryVendorString(vactx->display); - if (strstr(vendor_name, "VDPAU backend")) { - // VDPAU impersonated VAAPI impl's are super buggy, we need to skip them - LOG_DEBUG(Service_NVDRV, "Skipping vdapu impersonated VAAPI driver"); - continue; - } else { - // according to some user testing, certain vaapi driver (Intel?) could be buggy - // so let's log the driver name which may help the developers/supporters - LOG_DEBUG(Service_NVDRV, "Using VAAPI driver: {}", vendor_name); - } - } -#endif - for (int i = 0;; i++) { - const AVCodecHWConfig* config = avcodec_get_hw_config(av_codec, i); - if (!config) { - LOG_DEBUG(Service_NVDRV, "{} decoder does not support device type {}.", - av_codec->name, av_hwdevice_get_type_name(type)); - break; - } - if ((config->methods & HW_CONFIG_METHOD) != 0 && config->device_type == type) { -#if defined(__unix__) - // Some linux decoding backends are reported to crash with this config method - // TODO(ameerj): Properly support this method - if ((config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX) != 0) { - // skip zero-copy decoders, we don't currently support them - LOG_DEBUG(Service_NVDRV, "Skipping decoder {} with unsupported capability {}.", - av_hwdevice_get_type_name(type), config->methods); - continue; - } -#endif - LOG_INFO(Service_NVDRV, "Using {} GPU decoder", av_hwdevice_get_type_name(type)); - av_codec_ctx->pix_fmt = config->pix_fmt; - return true; - } - } - } - return false; -} - -void Codec::InitializeAvCodecContext() { - av_codec_ctx = avcodec_alloc_context3(av_codec); - av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); -} - -void Codec::InitializeGpuDecoder() { - if (!CreateGpuAvDevice()) { - av_buffer_unref(&av_gpu_decoder); - return; - } - auto* hw_device_ctx = av_buffer_ref(av_gpu_decoder); - ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed"); - av_codec_ctx->hw_device_ctx = hw_device_ctx; - av_codec_ctx->get_format = GetGpuFormat; -} - -void Codec::Initialize() { - const AVCodecID codec = [&] { - switch (current_codec) { - case NvdecCommon::VideoCodec::H264: - return AV_CODEC_ID_H264; - case NvdecCommon::VideoCodec::VP8: - return AV_CODEC_ID_VP8; - case NvdecCommon::VideoCodec::VP9: - return AV_CODEC_ID_VP9; - default: - UNIMPLEMENTED_MSG("Unknown codec {}", current_codec); - return AV_CODEC_ID_NONE; - } - }(); - av_codec = avcodec_find_decoder(codec); - - InitializeAvCodecContext(); - if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::GPU) { - InitializeGpuDecoder(); - } - if (const int res = avcodec_open2(av_codec_ctx, av_codec, nullptr); res < 0) { - LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed with result {}", res); - avcodec_free_context(&av_codec_ctx); - av_buffer_unref(&av_gpu_decoder); - return; - } - if (!av_codec_ctx->hw_device_ctx) { - LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding"); - } - initialized = true; -} - -void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { - if (current_codec != codec) { - current_codec = codec; - LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", GetCurrentCodecName()); - } -} - -void Codec::Decode() { - const bool is_first_frame = !initialized; - if (is_first_frame) { - Initialize(); - } - if (!initialized) { - return; - } - bool vp9_hidden_frame = false; - const auto& frame_data = [&]() { - switch (current_codec) { - case Tegra::NvdecCommon::VideoCodec::H264: - return h264_decoder->ComposeFrame(state, is_first_frame); - case Tegra::NvdecCommon::VideoCodec::VP8: - return vp8_decoder->ComposeFrame(state); - case Tegra::NvdecCommon::VideoCodec::VP9: - vp9_decoder->ComposeFrame(state); - vp9_hidden_frame = vp9_decoder->WasFrameHidden(); - return vp9_decoder->GetFrameBytes(); - default: - ASSERT(false); - return std::vector{}; - } - }(); - AVPacketPtr packet{av_packet_alloc(), AVPacketDeleter}; - if (!packet) { - LOG_ERROR(Service_NVDRV, "av_packet_alloc failed"); - return; - } - packet->data = const_cast(frame_data.data()); - packet->size = static_cast(frame_data.size()); - if (const int res = avcodec_send_packet(av_codec_ctx, packet.get()); res != 0) { - LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", res); - return; - } - // Only receive/store visible frames - if (vp9_hidden_frame) { - return; - } - AVFramePtr initial_frame{av_frame_alloc(), AVFrameDeleter}; - AVFramePtr final_frame{nullptr, AVFrameDeleter}; - ASSERT_MSG(initial_frame, "av_frame_alloc initial_frame failed"); - if (const int ret = avcodec_receive_frame(av_codec_ctx, initial_frame.get()); ret) { - LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret); - return; - } - if (initial_frame->width == 0 || initial_frame->height == 0) { - LOG_WARNING(Service_NVDRV, "Zero width or height in frame"); - return; - } - if (av_codec_ctx->hw_device_ctx) { - final_frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter}; - ASSERT_MSG(final_frame, "av_frame_alloc final_frame failed"); - // Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp - // because Intel drivers crash unless using AV_PIX_FMT_NV12 - final_frame->format = PREFERRED_GPU_FMT; - const int ret = av_hwframe_transfer_data(final_frame.get(), initial_frame.get(), 0); - ASSERT_MSG(!ret, "av_hwframe_transfer_data error {}", ret); - } else { - final_frame = std::move(initial_frame); - } - if (final_frame->format != PREFERRED_CPU_FMT && final_frame->format != PREFERRED_GPU_FMT) { - UNIMPLEMENTED_MSG("Unexpected video format: {}", final_frame->format); - return; - } - av_frames.push(std::move(final_frame)); - if (av_frames.size() > 10) { - LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame"); - av_frames.pop(); - } -} - -AVFramePtr Codec::GetCurrentFrame() { - // Sometimes VIC will request more frames than have been decoded. - // in this case, return a nullptr and don't overwrite previous frame data - if (av_frames.empty()) { - return AVFramePtr{nullptr, AVFrameDeleter}; - } - AVFramePtr frame = std::move(av_frames.front()); - av_frames.pop(); - return frame; -} - -NvdecCommon::VideoCodec Codec::GetCurrentCodec() const { - return current_codec; -} - -std::string_view Codec::GetCurrentCodecName() const { - switch (current_codec) { - case NvdecCommon::VideoCodec::None: - return "None"; - case NvdecCommon::VideoCodec::H264: - return "H264"; - case NvdecCommon::VideoCodec::VP8: - return "VP8"; - case NvdecCommon::VideoCodec::H265: - return "H265"; - case NvdecCommon::VideoCodec::VP9: - return "VP9"; - default: - return "Unknown"; - } -} -} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h deleted file mode 100644 index 0c2405465..000000000 --- a/src/video_core/command_classes/codecs/codec.h +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include -#include - -#include "video_core/command_classes/nvdec_common.h" - -extern "C" { -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" -#endif -#include -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic pop -#endif -} - -namespace Tegra { -class GPU; - -void AVFrameDeleter(AVFrame* ptr); -using AVFramePtr = std::unique_ptr; - -namespace Decoder { -class H264; -class VP8; -class VP9; -} // namespace Decoder - -class Codec { -public: - explicit Codec(GPU& gpu, const NvdecCommon::NvdecRegisters& regs); - ~Codec(); - - /// Initialize the codec, returning success or failure - void Initialize(); - - /// Sets NVDEC video stream codec - void SetTargetCodec(NvdecCommon::VideoCodec codec); - - /// Call decoders to construct headers, decode AVFrame with ffmpeg - void Decode(); - - /// Returns next decoded frame - [[nodiscard]] AVFramePtr GetCurrentFrame(); - - /// Returns the value of current_codec - [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const; - - /// Return name of the current codec - [[nodiscard]] std::string_view GetCurrentCodecName() const; - -private: - void InitializeAvCodecContext(); - - void InitializeGpuDecoder(); - - bool CreateGpuAvDevice(); - - bool initialized{}; - NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; - - const AVCodec* av_codec{nullptr}; - AVCodecContext* av_codec_ctx{nullptr}; - AVBufferRef* av_gpu_decoder{nullptr}; - - GPU& gpu; - const NvdecCommon::NvdecRegisters& state; - std::unique_ptr h264_decoder; - std::unique_ptr vp8_decoder; - std::unique_ptr vp9_decoder; - - std::queue av_frames{}; -}; - -} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp deleted file mode 100644 index e2acd54d4..000000000 --- a/src/video_core/command_classes/codecs/h264.cpp +++ /dev/null @@ -1,277 +0,0 @@ -// SPDX-FileCopyrightText: Ryujinx Team and Contributors -// SPDX-License-Identifier: MIT - -#include -#include - -#include "common/settings.h" -#include "video_core/command_classes/codecs/h264.h" -#include "video_core/gpu.h" -#include "video_core/memory_manager.h" - -namespace Tegra::Decoder { -namespace { -// ZigZag LUTs from libavcodec. -constexpr std::array zig_zag_direct{ - 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, - 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, - 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, -}; - -constexpr std::array zig_zag_scan{ - 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, - 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, -}; -} // Anonymous namespace - -H264::H264(GPU& gpu_) : gpu(gpu_) {} - -H264::~H264() = default; - -const std::vector& H264::ComposeFrame(const NvdecCommon::NvdecRegisters& state, - bool is_first_frame) { - H264DecoderContext context; - gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); - - const s64 frame_number = context.h264_parameter_set.frame_number.Value(); - if (!is_first_frame && frame_number != 0) { - frame.resize(context.stream_len); - gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); - return frame; - } - - // Encode header - H264BitWriter writer{}; - writer.WriteU(1, 24); - writer.WriteU(0, 1); - writer.WriteU(3, 2); - writer.WriteU(7, 5); - writer.WriteU(100, 8); - writer.WriteU(0, 8); - writer.WriteU(31, 8); - writer.WriteUe(0); - const u32 chroma_format_idc = - static_cast(context.h264_parameter_set.chroma_format_idc.Value()); - writer.WriteUe(chroma_format_idc); - if (chroma_format_idc == 3) { - writer.WriteBit(false); - } - - writer.WriteUe(0); - writer.WriteUe(0); - writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag - writer.WriteBit(false); // Scaling matrix present flag - - writer.WriteUe(static_cast(context.h264_parameter_set.log2_max_frame_num_minus4.Value())); - - const auto order_cnt_type = - static_cast(context.h264_parameter_set.pic_order_cnt_type.Value()); - writer.WriteUe(order_cnt_type); - if (order_cnt_type == 0) { - writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); - } else if (order_cnt_type == 1) { - writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); - - writer.WriteSe(0); - writer.WriteSe(0); - writer.WriteUe(0); - } - - const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units / - (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); - - // TODO (ameerj): Where do we get this number, it seems to be particular for each stream - const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue(); - const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::GPU; - const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u; - writer.WriteUe(max_num_ref_frames); - writer.WriteBit(false); - writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); - writer.WriteUe(pic_height - 1); - writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); - - if (!context.h264_parameter_set.frame_mbs_only_flag) { - writer.WriteBit(context.h264_parameter_set.flags.mbaff_frame.Value() != 0); - } - - writer.WriteBit(context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); - writer.WriteBit(false); // Frame cropping flag - writer.WriteBit(false); // VUI parameter present flag - - writer.End(); - - // H264 PPS - writer.WriteU(1, 24); - writer.WriteU(0, 1); - writer.WriteU(3, 2); - writer.WriteU(8, 5); - - writer.WriteUe(0); - writer.WriteUe(0); - - writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); - writer.WriteBit(false); - writer.WriteUe(0); - writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); - writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); - writer.WriteBit(context.h264_parameter_set.flags.weighted_pred.Value() != 0); - writer.WriteU(static_cast(context.h264_parameter_set.weighted_bipred_idc.Value()), 2); - s32 pic_init_qp = static_cast(context.h264_parameter_set.pic_init_qp_minus26.Value()); - writer.WriteSe(pic_init_qp); - writer.WriteSe(0); - s32 chroma_qp_index_offset = - static_cast(context.h264_parameter_set.chroma_qp_index_offset.Value()); - - writer.WriteSe(chroma_qp_index_offset); - writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_present_flag != 0); - writer.WriteBit(context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); - writer.WriteBit(context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); - writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); - - writer.WriteBit(true); - - for (s32 index = 0; index < 6; index++) { - writer.WriteBit(true); - std::span matrix{context.weight_scale}; - writer.WriteScalingList(matrix, index * 16, 16); - } - - if (context.h264_parameter_set.transform_8x8_mode_flag) { - for (s32 index = 0; index < 2; index++) { - writer.WriteBit(true); - std::span matrix{context.weight_scale_8x8}; - writer.WriteScalingList(matrix, index * 64, 64); - } - } - - s32 chroma_qp_index_offset2 = - static_cast(context.h264_parameter_set.second_chroma_qp_index_offset.Value()); - - writer.WriteSe(chroma_qp_index_offset2); - - writer.End(); - - const auto& encoded_header = writer.GetByteArray(); - frame.resize(encoded_header.size() + context.stream_len); - std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); - - gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, - frame.data() + encoded_header.size(), context.stream_len); - - return frame; -} - -H264BitWriter::H264BitWriter() = default; - -H264BitWriter::~H264BitWriter() = default; - -void H264BitWriter::WriteU(s32 value, s32 value_sz) { - WriteBits(value, value_sz); -} - -void H264BitWriter::WriteSe(s32 value) { - WriteExpGolombCodedInt(value); -} - -void H264BitWriter::WriteUe(u32 value) { - WriteExpGolombCodedUInt(value); -} - -void H264BitWriter::End() { - WriteBit(true); - Flush(); -} - -void H264BitWriter::WriteBit(bool state) { - WriteBits(state ? 1 : 0, 1); -} - -void H264BitWriter::WriteScalingList(std::span list, s32 start, s32 count) { - std::vector scan(count); - if (count == 16) { - std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); - } else { - std::memcpy(scan.data(), zig_zag_direct.data(), scan.size()); - } - u8 last_scale = 8; - - for (s32 index = 0; index < count; index++) { - const u8 value = list[start + scan[index]]; - const s32 delta_scale = static_cast(value - last_scale); - - WriteSe(delta_scale); - - last_scale = value; - } -} - -std::vector& H264BitWriter::GetByteArray() { - return byte_array; -} - -const std::vector& H264BitWriter::GetByteArray() const { - return byte_array; -} - -void H264BitWriter::WriteBits(s32 value, s32 bit_count) { - s32 value_pos = 0; - - s32 remaining = bit_count; - - while (remaining > 0) { - s32 copy_size = remaining; - - const s32 free_bits = GetFreeBufferBits(); - - if (copy_size > free_bits) { - copy_size = free_bits; - } - - const s32 mask = (1 << copy_size) - 1; - - const s32 src_shift = (bit_count - value_pos) - copy_size; - const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; - - buffer |= ((value >> src_shift) & mask) << dst_shift; - - value_pos += copy_size; - buffer_pos += copy_size; - remaining -= copy_size; - } -} - -void H264BitWriter::WriteExpGolombCodedInt(s32 value) { - const s32 sign = value <= 0 ? 0 : 1; - if (value < 0) { - value = -value; - } - value = (value << 1) - sign; - WriteExpGolombCodedUInt(value); -} - -void H264BitWriter::WriteExpGolombCodedUInt(u32 value) { - const s32 size = 32 - std::countl_zero(value + 1); - WriteBits(1, size); - - value -= (1U << (size - 1)) - 1; - WriteBits(static_cast(value), size - 1); -} - -s32 H264BitWriter::GetFreeBufferBits() { - if (buffer_pos == buffer_size) { - Flush(); - } - - return buffer_size - buffer_pos; -} - -void H264BitWriter::Flush() { - if (buffer_pos == 0) { - return; - } - byte_array.push_back(static_cast(buffer)); - - buffer = 0; - buffer_pos = 0; -} -} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/h264.h b/src/video_core/command_classes/codecs/h264.h deleted file mode 100644 index 261574364..000000000 --- a/src/video_core/command_classes/codecs/h264.h +++ /dev/null @@ -1,173 +0,0 @@ -// SPDX-FileCopyrightText: Ryujinx Team and Contributors -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include "common/bit_field.h" -#include "common/common_funcs.h" -#include "common/common_types.h" -#include "video_core/command_classes/nvdec_common.h" - -namespace Tegra { -class GPU; -namespace Decoder { - -class H264BitWriter { -public: - H264BitWriter(); - ~H264BitWriter(); - - /// The following Write methods are based on clause 9.1 in the H.264 specification. - /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax - void WriteU(s32 value, s32 value_sz); - void WriteSe(s32 value); - void WriteUe(u32 value); - - /// Finalize the bitstream - void End(); - - /// append a bit to the stream, equivalent value to the state parameter - void WriteBit(bool state); - - /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification - /// Writes the scaling matrices of the sream - void WriteScalingList(std::span list, s32 start, s32 count); - - /// Return the bitstream as a vector. - [[nodiscard]] std::vector& GetByteArray(); - [[nodiscard]] const std::vector& GetByteArray() const; - -private: - void WriteBits(s32 value, s32 bit_count); - void WriteExpGolombCodedInt(s32 value); - void WriteExpGolombCodedUInt(u32 value); - [[nodiscard]] s32 GetFreeBufferBits(); - void Flush(); - - s32 buffer_size{8}; - - s32 buffer{}; - s32 buffer_pos{}; - std::vector byte_array; -}; - -class H264 { -public: - explicit H264(GPU& gpu); - ~H264(); - - /// Compose the H264 frame for FFmpeg decoding - [[nodiscard]] const std::vector& ComposeFrame(const NvdecCommon::NvdecRegisters& state, - bool is_first_frame = false); - -private: - std::vector frame; - GPU& gpu; - - struct H264ParameterSet { - s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 - s32 delta_pic_order_always_zero_flag; ///< 0x04 - s32 frame_mbs_only_flag; ///< 0x08 - u32 pic_width_in_mbs; ///< 0x0C - u32 frame_height_in_map_units; ///< 0x10 - union { ///< 0x14 - BitField<0, 2, u32> tile_format; - BitField<2, 3, u32> gob_height; - }; - u32 entropy_coding_mode_flag; ///< 0x18 - s32 pic_order_present_flag; ///< 0x1C - s32 num_refidx_l0_default_active; ///< 0x20 - s32 num_refidx_l1_default_active; ///< 0x24 - s32 deblocking_filter_control_present_flag; ///< 0x28 - s32 redundant_pic_cnt_present_flag; ///< 0x2C - u32 transform_8x8_mode_flag; ///< 0x30 - u32 pitch_luma; ///< 0x34 - u32 pitch_chroma; ///< 0x38 - u32 luma_top_offset; ///< 0x3C - u32 luma_bot_offset; ///< 0x40 - u32 luma_frame_offset; ///< 0x44 - u32 chroma_top_offset; ///< 0x48 - u32 chroma_bot_offset; ///< 0x4C - u32 chroma_frame_offset; ///< 0x50 - u32 hist_buffer_size; ///< 0x54 - union { ///< 0x58 - union { - BitField<0, 1, u64> mbaff_frame; - BitField<1, 1, u64> direct_8x8_inference; - BitField<2, 1, u64> weighted_pred; - BitField<3, 1, u64> constrained_intra_pred; - BitField<4, 1, u64> ref_pic; - BitField<5, 1, u64> field_pic; - BitField<6, 1, u64> bottom_field; - BitField<7, 1, u64> second_field; - } flags; - BitField<8, 4, u64> log2_max_frame_num_minus4; - BitField<12, 2, u64> chroma_format_idc; - BitField<14, 2, u64> pic_order_cnt_type; - BitField<16, 6, s64> pic_init_qp_minus26; - BitField<22, 5, s64> chroma_qp_index_offset; - BitField<27, 5, s64> second_chroma_qp_index_offset; - BitField<32, 2, u64> weighted_bipred_idc; - BitField<34, 7, u64> curr_pic_idx; - BitField<41, 5, u64> curr_col_idx; - BitField<46, 16, u64> frame_number; - BitField<62, 1, u64> frame_surfaces; - BitField<63, 1, u64> output_memory_layout; - }; - }; - static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); - - struct H264DecoderContext { - INSERT_PADDING_WORDS_NOINIT(18); ///< 0x0000 - u32 stream_len; ///< 0x0048 - INSERT_PADDING_WORDS_NOINIT(3); ///< 0x004C - H264ParameterSet h264_parameter_set; ///< 0x0058 - INSERT_PADDING_WORDS_NOINIT(66); ///< 0x00B8 - std::array weight_scale; ///< 0x01C0 - std::array weight_scale_8x8; ///< 0x0220 - }; - static_assert(sizeof(H264DecoderContext) == 0x2A0, "H264DecoderContext is an invalid size"); - -#define ASSERT_POSITION(field_name, position) \ - static_assert(offsetof(H264ParameterSet, field_name) == position, \ - "Field " #field_name " has invalid position") - - ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); - ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); - ASSERT_POSITION(frame_mbs_only_flag, 0x08); - ASSERT_POSITION(pic_width_in_mbs, 0x0C); - ASSERT_POSITION(frame_height_in_map_units, 0x10); - ASSERT_POSITION(tile_format, 0x14); - ASSERT_POSITION(entropy_coding_mode_flag, 0x18); - ASSERT_POSITION(pic_order_present_flag, 0x1C); - ASSERT_POSITION(num_refidx_l0_default_active, 0x20); - ASSERT_POSITION(num_refidx_l1_default_active, 0x24); - ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); - ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); - ASSERT_POSITION(transform_8x8_mode_flag, 0x30); - ASSERT_POSITION(pitch_luma, 0x34); - ASSERT_POSITION(pitch_chroma, 0x38); - ASSERT_POSITION(luma_top_offset, 0x3C); - ASSERT_POSITION(luma_bot_offset, 0x40); - ASSERT_POSITION(luma_frame_offset, 0x44); - ASSERT_POSITION(chroma_top_offset, 0x48); - ASSERT_POSITION(chroma_bot_offset, 0x4C); - ASSERT_POSITION(chroma_frame_offset, 0x50); - ASSERT_POSITION(hist_buffer_size, 0x54); - ASSERT_POSITION(flags, 0x58); -#undef ASSERT_POSITION - -#define ASSERT_POSITION(field_name, position) \ - static_assert(offsetof(H264DecoderContext, field_name) == position, \ - "Field " #field_name " has invalid position") - - ASSERT_POSITION(stream_len, 0x48); - ASSERT_POSITION(h264_parameter_set, 0x58); - ASSERT_POSITION(weight_scale, 0x1C0); -#undef ASSERT_POSITION -}; - -} // namespace Decoder -} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp8.cpp b/src/video_core/command_classes/codecs/vp8.cpp deleted file mode 100644 index c83b9bbc2..000000000 --- a/src/video_core/command_classes/codecs/vp8.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include - -#include "video_core/command_classes/codecs/vp8.h" -#include "video_core/gpu.h" -#include "video_core/memory_manager.h" - -namespace Tegra::Decoder { -VP8::VP8(GPU& gpu_) : gpu(gpu_) {} - -VP8::~VP8() = default; - -const std::vector& VP8::ComposeFrame(const NvdecCommon::NvdecRegisters& state) { - VP8PictureInfo info; - gpu.MemoryManager().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo)); - - const bool is_key_frame = info.key_frame == 1u; - const auto bitstream_size = static_cast(info.vld_buffer_size); - const size_t header_size = is_key_frame ? 10u : 3u; - frame.resize(header_size + bitstream_size); - - // Based on page 30 of the VP8 specification. - // https://datatracker.ietf.org/doc/rfc6386/ - frame[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). - frame[0] |= static_cast((info.version & 7u) << 1u); // 3-bit version number - frame[0] |= static_cast(1u << 4u); // 1-bit show_frame flag - - // The next 19-bits are the first partition size - frame[0] |= static_cast((info.first_part_size & 7u) << 5u); - frame[1] = static_cast((info.first_part_size & 0x7f8u) >> 3u); - frame[2] = static_cast((info.first_part_size & 0x7f800u) >> 11u); - - if (is_key_frame) { - frame[3] = 0x9du; - frame[4] = 0x01u; - frame[5] = 0x2au; - // TODO(ameerj): Horizontal/Vertical Scale - // 16 bits: (2 bits Horizontal Scale << 14) | Width (14 bits) - frame[6] = static_cast(info.frame_width & 0xff); - frame[7] = static_cast(((info.frame_width >> 8) & 0x3f)); - // 16 bits:(2 bits Vertical Scale << 14) | Height (14 bits) - frame[8] = static_cast(info.frame_height & 0xff); - frame[9] = static_cast(((info.frame_height >> 8) & 0x3f)); - } - const u64 bitstream_offset = state.frame_bitstream_offset; - gpu.MemoryManager().ReadBlock(bitstream_offset, frame.data() + header_size, bitstream_size); - - return frame; -} - -} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/vp8.h b/src/video_core/command_classes/codecs/vp8.h deleted file mode 100644 index 3357667b0..000000000 --- a/src/video_core/command_classes/codecs/vp8.h +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include - -#include "common/common_funcs.h" -#include "common/common_types.h" -#include "video_core/command_classes/nvdec_common.h" - -namespace Tegra { -class GPU; -namespace Decoder { - -class VP8 { -public: - explicit VP8(GPU& gpu); - ~VP8(); - - /// Compose the VP8 frame for FFmpeg decoding - [[nodiscard]] const std::vector& ComposeFrame(const NvdecCommon::NvdecRegisters& state); - -private: - std::vector frame; - GPU& gpu; - - struct VP8PictureInfo { - INSERT_PADDING_WORDS_NOINIT(14); - u16 frame_width; // actual frame width - u16 frame_height; // actual frame height - u8 key_frame; - u8 version; - union { - u8 raw; - BitField<0, 2, u8> tile_format; - BitField<2, 3, u8> gob_height; - BitField<5, 3, u8> reserverd_surface_format; - }; - u8 error_conceal_on; // 1: error conceal on; 0: off - u32 first_part_size; // the size of first partition(frame header and mb header partition) - u32 hist_buffer_size; // in units of 256 - u32 vld_buffer_size; // in units of 1 - // Current frame buffers - std::array frame_stride; // [y_c] - u32 luma_top_offset; // offset of luma top field in units of 256 - u32 luma_bot_offset; // offset of luma bottom field in units of 256 - u32 luma_frame_offset; // offset of luma frame in units of 256 - u32 chroma_top_offset; // offset of chroma top field in units of 256 - u32 chroma_bot_offset; // offset of chroma bottom field in units of 256 - u32 chroma_frame_offset; // offset of chroma frame in units of 256 - - INSERT_PADDING_BYTES_NOINIT(0x1c); // NvdecDisplayParams - - // Decode picture buffer related - s8 current_output_memory_layout; - // output NV12/NV24 setting. index 0: golden; 1: altref; 2: last - std::array output_memory_layout; - - u8 segmentation_feature_data_update; - INSERT_PADDING_BYTES_NOINIT(3); - - // ucode return result - u32 result_value; - std::array partition_offset; - INSERT_PADDING_WORDS_NOINIT(3); - }; - static_assert(sizeof(VP8PictureInfo) == 0xc0, "PictureInfo is an invalid size"); -}; - -} // namespace Decoder -} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp deleted file mode 100644 index c01431441..000000000 --- a/src/video_core/command_classes/codecs/vp9.cpp +++ /dev/null @@ -1,946 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include // for std::copy -#include -#include "common/assert.h" -#include "video_core/command_classes/codecs/vp9.h" -#include "video_core/gpu.h" -#include "video_core/memory_manager.h" - -namespace Tegra::Decoder { -namespace { -constexpr u32 diff_update_probability = 252; -constexpr u32 frame_sync_code = 0x498342; - -// Default compressed header probabilities once frame context resets -constexpr Vp9EntropyProbs default_probs{ - .y_mode_prob{ - 65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78, - 173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29, - }, - .partition_prob{ - 199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0, - 174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0, - 177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0, - 222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0, - }, - .coef_probs{ - 195, 29, 183, 84, 49, 136, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 31, 107, 169, 35, 99, 159, 17, 82, 140, 8, 66, 114, 2, 44, 76, 1, 19, 32, - 40, 132, 201, 29, 114, 187, 13, 91, 157, 7, 75, 127, 3, 58, 95, 1, 28, 47, - 69, 142, 221, 42, 122, 201, 15, 91, 159, 6, 67, 121, 1, 42, 77, 1, 17, 31, - 102, 148, 228, 67, 117, 204, 17, 82, 154, 6, 59, 114, 2, 39, 75, 1, 15, 29, - 156, 57, 233, 119, 57, 212, 58, 48, 163, 29, 40, 124, 12, 30, 81, 3, 12, 31, - 191, 107, 226, 124, 117, 204, 25, 99, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 29, 148, 210, 37, 126, 194, 8, 93, 157, 2, 68, 118, 1, 39, 69, 1, 17, 33, - 41, 151, 213, 27, 123, 193, 3, 82, 144, 1, 58, 105, 1, 32, 60, 1, 13, 26, - 59, 159, 220, 23, 126, 198, 4, 88, 151, 1, 66, 114, 1, 38, 71, 1, 18, 34, - 114, 136, 232, 51, 114, 207, 11, 83, 155, 3, 56, 105, 1, 33, 65, 1, 17, 34, - 149, 65, 234, 121, 57, 215, 61, 49, 166, 28, 36, 114, 12, 25, 76, 3, 16, 42, - 214, 49, 220, 132, 63, 188, 42, 65, 137, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 85, 137, 221, 104, 131, 216, 49, 111, 192, 21, 87, 155, 2, 49, 87, 1, 16, 28, - 89, 163, 230, 90, 137, 220, 29, 100, 183, 10, 70, 135, 2, 42, 81, 1, 17, 33, - 108, 167, 237, 55, 133, 222, 15, 97, 179, 4, 72, 135, 1, 45, 85, 1, 19, 38, - 124, 146, 240, 66, 124, 224, 17, 88, 175, 4, 58, 122, 1, 36, 75, 1, 18, 37, - 141, 79, 241, 126, 70, 227, 66, 58, 182, 30, 44, 136, 12, 34, 96, 2, 20, 47, - 229, 99, 249, 143, 111, 235, 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 82, 158, 236, 94, 146, 224, 25, 117, 191, 9, 87, 149, 3, 56, 99, 1, 33, 57, - 83, 167, 237, 68, 145, 222, 10, 103, 177, 2, 72, 131, 1, 41, 79, 1, 20, 39, - 99, 167, 239, 47, 141, 224, 10, 104, 178, 2, 73, 133, 1, 44, 85, 1, 22, 47, - 127, 145, 243, 71, 129, 228, 17, 93, 177, 3, 61, 124, 1, 41, 84, 1, 21, 52, - 157, 78, 244, 140, 72, 231, 69, 58, 184, 31, 44, 137, 14, 38, 105, 8, 23, 61, - 125, 34, 187, 52, 41, 133, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 37, 109, 153, 51, 102, 147, 23, 87, 128, 8, 67, 101, 1, 41, 63, 1, 19, 29, - 31, 154, 185, 17, 127, 175, 6, 96, 145, 2, 73, 114, 1, 51, 82, 1, 28, 45, - 23, 163, 200, 10, 131, 185, 2, 93, 148, 1, 67, 111, 1, 41, 69, 1, 14, 24, - 29, 176, 217, 12, 145, 201, 3, 101, 156, 1, 69, 111, 1, 39, 63, 1, 14, 23, - 57, 192, 233, 25, 154, 215, 6, 109, 167, 3, 78, 118, 1, 48, 69, 1, 21, 29, - 202, 105, 245, 108, 106, 216, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 33, 172, 219, 64, 149, 206, 14, 117, 177, 5, 90, 141, 2, 61, 95, 1, 37, 57, - 33, 179, 220, 11, 140, 198, 1, 89, 148, 1, 60, 104, 1, 33, 57, 1, 12, 21, - 30, 181, 221, 8, 141, 198, 1, 87, 145, 1, 58, 100, 1, 31, 55, 1, 12, 20, - 32, 186, 224, 7, 142, 198, 1, 86, 143, 1, 58, 100, 1, 31, 55, 1, 12, 22, - 57, 192, 227, 20, 143, 204, 3, 96, 154, 1, 68, 112, 1, 42, 69, 1, 19, 32, - 212, 35, 215, 113, 47, 169, 29, 48, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 74, 129, 203, 106, 120, 203, 49, 107, 178, 19, 84, 144, 4, 50, 84, 1, 15, 25, - 71, 172, 217, 44, 141, 209, 15, 102, 173, 6, 76, 133, 2, 51, 89, 1, 24, 42, - 64, 185, 231, 31, 148, 216, 8, 103, 175, 3, 74, 131, 1, 46, 81, 1, 18, 30, - 65, 196, 235, 25, 157, 221, 5, 105, 174, 1, 67, 120, 1, 38, 69, 1, 15, 30, - 65, 204, 238, 30, 156, 224, 7, 107, 177, 2, 70, 124, 1, 42, 73, 1, 18, 34, - 225, 86, 251, 144, 104, 235, 42, 99, 181, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 85, 175, 239, 112, 165, 229, 29, 136, 200, 12, 103, 162, 6, 77, 123, 2, 53, 84, - 75, 183, 239, 30, 155, 221, 3, 106, 171, 1, 74, 128, 1, 44, 76, 1, 17, 28, - 73, 185, 240, 27, 159, 222, 2, 107, 172, 1, 75, 127, 1, 42, 73, 1, 17, 29, - 62, 190, 238, 21, 159, 222, 2, 107, 172, 1, 72, 122, 1, 40, 71, 1, 18, 32, - 61, 199, 240, 27, 161, 226, 4, 113, 180, 1, 76, 129, 1, 46, 80, 1, 23, 41, - 7, 27, 153, 5, 30, 95, 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 50, 75, 127, 57, 75, 124, 27, 67, 108, 10, 54, 86, 1, 33, 52, 1, 12, 18, - 43, 125, 151, 26, 108, 148, 7, 83, 122, 2, 59, 89, 1, 38, 60, 1, 17, 27, - 23, 144, 163, 13, 112, 154, 2, 75, 117, 1, 50, 81, 1, 31, 51, 1, 14, 23, - 18, 162, 185, 6, 123, 171, 1, 78, 125, 1, 51, 86, 1, 31, 54, 1, 14, 23, - 15, 199, 227, 3, 150, 204, 1, 91, 146, 1, 55, 95, 1, 30, 53, 1, 11, 20, - 19, 55, 240, 19, 59, 196, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 41, 166, 207, 104, 153, 199, 31, 123, 181, 14, 101, 152, 5, 72, 106, 1, 36, 52, - 35, 176, 211, 12, 131, 190, 2, 88, 144, 1, 60, 101, 1, 36, 60, 1, 16, 28, - 28, 183, 213, 8, 134, 191, 1, 86, 142, 1, 56, 96, 1, 30, 53, 1, 12, 20, - 20, 190, 215, 4, 135, 192, 1, 84, 139, 1, 53, 91, 1, 28, 49, 1, 11, 20, - 13, 196, 216, 2, 137, 192, 1, 86, 143, 1, 57, 99, 1, 32, 56, 1, 13, 24, - 211, 29, 217, 96, 47, 156, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 78, 120, 193, 111, 116, 186, 46, 102, 164, 15, 80, 128, 2, 49, 76, 1, 18, 28, - 71, 161, 203, 42, 132, 192, 10, 98, 150, 3, 69, 109, 1, 44, 70, 1, 18, 29, - 57, 186, 211, 30, 140, 196, 4, 93, 146, 1, 62, 102, 1, 38, 65, 1, 16, 27, - 47, 199, 217, 14, 145, 196, 1, 88, 142, 1, 57, 98, 1, 36, 62, 1, 15, 26, - 26, 219, 229, 5, 155, 207, 1, 94, 151, 1, 60, 104, 1, 36, 62, 1, 16, 28, - 233, 29, 248, 146, 47, 220, 43, 52, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 100, 163, 232, 179, 161, 222, 63, 142, 204, 37, 113, 174, 26, 89, 137, 18, 68, 97, - 85, 181, 230, 32, 146, 209, 7, 100, 164, 3, 71, 121, 1, 45, 77, 1, 18, 30, - 65, 187, 230, 20, 148, 207, 2, 97, 159, 1, 68, 116, 1, 40, 70, 1, 14, 29, - 40, 194, 227, 8, 147, 204, 1, 94, 155, 1, 65, 112, 1, 39, 66, 1, 14, 26, - 16, 208, 228, 3, 151, 207, 1, 98, 160, 1, 67, 117, 1, 41, 74, 1, 17, 31, - 17, 38, 140, 7, 34, 80, 1, 17, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 37, 75, 128, 41, 76, 128, 26, 66, 116, 12, 52, 94, 2, 32, 55, 1, 10, 16, - 50, 127, 154, 37, 109, 152, 16, 82, 121, 5, 59, 85, 1, 35, 54, 1, 13, 20, - 40, 142, 167, 17, 110, 157, 2, 71, 112, 1, 44, 72, 1, 27, 45, 1, 11, 17, - 30, 175, 188, 9, 124, 169, 1, 74, 116, 1, 48, 78, 1, 30, 49, 1, 11, 18, - 10, 222, 223, 2, 150, 194, 1, 83, 128, 1, 48, 79, 1, 27, 45, 1, 11, 17, - 36, 41, 235, 29, 36, 193, 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 85, 165, 222, 177, 162, 215, 110, 135, 195, 57, 113, 168, 23, 83, 120, 10, 49, 61, - 85, 190, 223, 36, 139, 200, 5, 90, 146, 1, 60, 103, 1, 38, 65, 1, 18, 30, - 72, 202, 223, 23, 141, 199, 2, 86, 140, 1, 56, 97, 1, 36, 61, 1, 16, 27, - 55, 218, 225, 13, 145, 200, 1, 86, 141, 1, 57, 99, 1, 35, 61, 1, 13, 22, - 15, 235, 212, 1, 132, 184, 1, 84, 139, 1, 57, 97, 1, 34, 56, 1, 14, 23, - 181, 21, 201, 61, 37, 123, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 47, 106, 172, 95, 104, 173, 42, 93, 159, 18, 77, 131, 4, 50, 81, 1, 17, 23, - 62, 147, 199, 44, 130, 189, 28, 102, 154, 18, 75, 115, 2, 44, 65, 1, 12, 19, - 55, 153, 210, 24, 130, 194, 3, 93, 146, 1, 61, 97, 1, 31, 50, 1, 10, 16, - 49, 186, 223, 17, 148, 204, 1, 96, 142, 1, 53, 83, 1, 26, 44, 1, 11, 17, - 13, 217, 212, 2, 136, 180, 1, 78, 124, 1, 50, 83, 1, 29, 49, 1, 14, 23, - 197, 13, 247, 82, 17, 222, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137, - 111, 197, 242, 46, 158, 219, 9, 104, 171, 2, 65, 125, 1, 44, 80, 1, 17, 91, - 104, 208, 245, 39, 168, 224, 3, 109, 162, 1, 79, 124, 1, 50, 102, 1, 43, 102, - 84, 220, 246, 31, 177, 231, 2, 115, 180, 1, 79, 134, 1, 55, 77, 1, 60, 79, - 43, 243, 240, 8, 180, 217, 1, 115, 166, 1, 84, 121, 1, 51, 67, 1, 16, 6, - }, - .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144}, - .inter_mode_prob{ - 2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94, - 66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0, - }, - .intra_inter_prob{9, 102, 187, 225}, - .comp_inter_prob{9, 102, 187, 225, 0}, - .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247}, - .comp_ref_prob{50, 126, 123, 221, 226}, - .tx_32x32_prob{3, 136, 37, 5, 52, 13}, - .tx_16x16_prob{20, 152, 15, 101}, - .tx_8x8_prob{100, 66}, - .skip_probs{192, 128, 64}, - .joints{32, 64, 96}, - .sign{128, 128}, - .classes{ - 224, 144, 192, 168, 192, 176, 192, 198, 198, 245, - 216, 128, 176, 160, 176, 176, 192, 198, 198, 208, - }, - .class_0{216, 208}, - .prob_bits{ - 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, - 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, - }, - .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64}, - .fr{64, 96, 64, 64, 96, 64}, - .class_0_hp{160, 160}, - .high_precision{128, 128}, -}; - -constexpr std::array norm_lut{ - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -constexpr std::array map_lut{ - 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, - 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54, - 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, - 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, - 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, - 125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, - 143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177, - 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, - 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, - 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17, - 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247, - 248, 249, 250, 251, 252, 253, 19, -}; - -// 6.2.14 Tile size calculation - -[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) { - const s32 sb64_cols = (frame_width + 63) / 64; - s32 min_log2 = 0; - - while ((64 << min_log2) < sb64_cols) { - min_log2++; - } - - return min_log2; -} - -[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) { - const s32 sb64_cols = (frame_width + 63) / 64; - s32 max_log2 = 1; - - while ((sb64_cols >> max_log2) >= 4) { - max_log2++; - } - - return max_log2 - 1; -} - -// Recenters probability. Based on section 6.3.6 of VP9 Specification -[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) { - if (new_prob > old_prob * 2) { - return new_prob; - } - - if (new_prob >= old_prob) { - return (new_prob - old_prob) * 2; - } - - return (old_prob - new_prob) * 2 - 1; -} - -// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification -[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) { - new_prob--; - old_prob--; - - std::size_t index{}; - - if (old_prob * 2 <= 0xff) { - index = static_cast(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1)); - } else { - index = static_cast( - std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); - } - - return static_cast(map_lut[index]); -} -} // Anonymous namespace - -VP9::VP9(GPU& gpu_) : gpu{gpu_} {} - -VP9::~VP9() = default; - -void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { - const bool update = new_prob != old_prob; - - writer.Write(update, diff_update_probability); - - if (update) { - WriteProbabilityDelta(writer, new_prob, old_prob); - } -} -template -void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array& new_prob, - const std::array& old_prob) { - for (std::size_t offset = 0; offset < new_prob.size(); ++offset) { - WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]); - } -} - -template -void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array& new_prob, - const std::array& old_prob) { - for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) { - WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]); - WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]); - WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]); - } -} - -void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { - const int delta = RemapProbability(new_prob, old_prob); - - EncodeTermSubExp(writer, delta); -} - -void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) { - if (WriteLessThan(writer, value, 16)) { - writer.Write(value, 4); - } else if (WriteLessThan(writer, value, 32)) { - writer.Write(value - 16, 4); - } else if (WriteLessThan(writer, value, 64)) { - writer.Write(value - 32, 5); - } else { - value -= 64; - - constexpr s32 size = 8; - - const s32 mask = (1 << size) - 191; - - const s32 delta = value - mask; - - if (delta < 0) { - writer.Write(value, size - 1); - } else { - writer.Write(delta / 2 + mask, size - 1); - writer.Write(delta & 1, 1); - } - } -} - -bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) { - const bool is_lt = value < test; - writer.Write(!is_lt); - return is_lt; -} - -void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, - const std::array& new_prob, - const std::array& old_prob) { - constexpr u32 block_bytes = 2 * 2 * 6 * 6 * 3; - - const auto needs_update = [&](u32 base_index) { - return !std::equal(new_prob.begin() + base_index, - new_prob.begin() + base_index + block_bytes, - old_prob.begin() + base_index); - }; - - for (u32 block_index = 0; block_index < 4; block_index++) { - const u32 base_index = block_index * block_bytes; - const bool update = needs_update(base_index); - writer.Write(update); - - if (update) { - u32 index = base_index; - for (s32 i = 0; i < 2; i++) { - for (s32 j = 0; j < 2; j++) { - for (s32 k = 0; k < 6; k++) { - for (s32 l = 0; l < 6; l++) { - if (k != 0 || l < 3) { - WriteProbabilityUpdate(writer, new_prob[index + 0], - old_prob[index + 0]); - WriteProbabilityUpdate(writer, new_prob[index + 1], - old_prob[index + 1]); - WriteProbabilityUpdate(writer, new_prob[index + 2], - old_prob[index + 2]); - } - index += 3; - } - } - } - } - } - if (block_index == static_cast(tx_mode)) { - break; - } - } -} - -void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { - const bool update = new_prob != old_prob; - writer.Write(update, diff_update_probability); - - if (update) { - writer.Write(new_prob >> 1, 7); - } -} - -Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) { - PictureInfo picture_info; - gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); - Vp9PictureInfo vp9_info = picture_info.Convert(); - - InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); - - // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following - // order: last, golden, altref, current. - std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4, - vp9_info.frame_offsets.begin()); - - return vp9_info; -} - -void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { - EntropyProbs entropy; - gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); - entropy.Convert(dst); -} - -Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) { - Vp9FrameContainer current_frame{}; - { - gpu.SyncGuestHost(); - current_frame.info = GetVp9PictureInfo(state); - current_frame.bit_stream.resize(current_frame.info.bitstream_size); - gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(), - current_frame.info.bitstream_size); - } - if (!next_frame.bit_stream.empty()) { - Vp9FrameContainer temp{ - .info = current_frame.info, - .bit_stream = std::move(current_frame.bit_stream), - }; - next_frame.info.show_frame = current_frame.info.last_frame_shown; - current_frame.info = next_frame.info; - current_frame.bit_stream = std::move(next_frame.bit_stream); - next_frame = std::move(temp); - } else { - next_frame.info = current_frame.info; - next_frame.bit_stream = current_frame.bit_stream; - } - return current_frame; -} - -std::vector VP9::ComposeCompressedHeader() { - VpxRangeEncoder writer{}; - const bool update_probs = !current_frame_info.is_key_frame && current_frame_info.show_frame; - if (!current_frame_info.lossless) { - if (static_cast(current_frame_info.transform_mode) >= 3) { - writer.Write(3, 2); - writer.Write(current_frame_info.transform_mode == 4); - } else { - writer.Write(current_frame_info.transform_mode, 2); - } - } - - if (current_frame_info.transform_mode == 4) { - // tx_mode_probs() in the spec - WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob, - prev_frame_probs.tx_8x8_prob); - WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob, - prev_frame_probs.tx_16x16_prob); - WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob, - prev_frame_probs.tx_32x32_prob); - if (update_probs) { - prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob; - prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob; - prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob; - } - } - // read_coef_probs() in the spec - WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode, - current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs); - // read_skip_probs() in the spec - WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs, - prev_frame_probs.skip_probs); - - if (update_probs) { - prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs; - prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs; - } - - if (!current_frame_info.intra_only) { - // read_inter_probs() in the spec - WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob, - prev_frame_probs.inter_mode_prob); - - if (current_frame_info.interp_filter == 4) { - // read_interp_filter_probs() in the spec - WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob, - prev_frame_probs.switchable_interp_prob); - if (update_probs) { - prev_frame_probs.switchable_interp_prob = - current_frame_info.entropy.switchable_interp_prob; - } - } - - // read_is_inter_probs() in the spec - WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob, - prev_frame_probs.intra_inter_prob); - - // frame_reference_mode() in the spec - if ((current_frame_info.ref_frame_sign_bias[1] & 1) != - (current_frame_info.ref_frame_sign_bias[2] & 1) || - (current_frame_info.ref_frame_sign_bias[1] & 1) != - (current_frame_info.ref_frame_sign_bias[3] & 1)) { - if (current_frame_info.reference_mode >= 1) { - writer.Write(1, 1); - writer.Write(current_frame_info.reference_mode == 2); - } else { - writer.Write(0, 1); - } - } - - // frame_reference_mode_probs() in the spec - if (current_frame_info.reference_mode == 2) { - WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob, - prev_frame_probs.comp_inter_prob); - if (update_probs) { - prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob; - } - } - - if (current_frame_info.reference_mode != 1) { - WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob, - prev_frame_probs.single_ref_prob); - if (update_probs) { - prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob; - } - } - - if (current_frame_info.reference_mode != 0) { - WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob, - prev_frame_probs.comp_ref_prob); - if (update_probs) { - prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob; - } - } - - // read_y_mode_probs - for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size(); - ++index) { - WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index], - prev_frame_probs.y_mode_prob[index]); - } - - // read_partition_probs - WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob, - prev_frame_probs.partition_prob); - - // mv_probs - for (s32 i = 0; i < 3; i++) { - WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i], - prev_frame_probs.joints[i]); - } - if (update_probs) { - prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob; - prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob; - prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob; - prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob; - prev_frame_probs.joints = current_frame_info.entropy.joints; - } - - for (s32 i = 0; i < 2; i++) { - WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i], - prev_frame_probs.sign[i]); - for (s32 j = 0; j < 10; j++) { - const int index = i * 10 + j; - WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index], - prev_frame_probs.classes[index]); - } - WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i], - prev_frame_probs.class_0[i]); - - for (s32 j = 0; j < 10; j++) { - const int index = i * 10 + j; - WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index], - prev_frame_probs.prob_bits[index]); - } - } - - for (s32 i = 0; i < 2; i++) { - for (s32 j = 0; j < 2; j++) { - for (s32 k = 0; k < 3; k++) { - const int index = i * 2 * 3 + j * 3 + k; - WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index], - prev_frame_probs.class_0_fr[index]); - } - } - - for (s32 j = 0; j < 3; j++) { - const int index = i * 3 + j; - WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index], - prev_frame_probs.fr[index]); - } - } - - if (current_frame_info.allow_high_precision_mv) { - for (s32 index = 0; index < 2; index++) { - WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index], - prev_frame_probs.class_0_hp[index]); - WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index], - prev_frame_probs.high_precision[index]); - } - } - - // save previous probs - if (update_probs) { - prev_frame_probs.sign = current_frame_info.entropy.sign; - prev_frame_probs.classes = current_frame_info.entropy.classes; - prev_frame_probs.class_0 = current_frame_info.entropy.class_0; - prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits; - prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr; - prev_frame_probs.fr = current_frame_info.entropy.fr; - prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp; - prev_frame_probs.high_precision = current_frame_info.entropy.high_precision; - } - } - writer.End(); - return writer.GetBuffer(); -} - -VpxBitStreamWriter VP9::ComposeUncompressedHeader() { - VpxBitStreamWriter uncomp_writer{}; - - uncomp_writer.WriteU(2, 2); // Frame marker. - uncomp_writer.WriteU(0, 2); // Profile. - uncomp_writer.WriteBit(false); // Show existing frame. - uncomp_writer.WriteBit(!current_frame_info.is_key_frame); // is key frame? - uncomp_writer.WriteBit(current_frame_info.show_frame); // show frame? - uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // error reslience - - if (current_frame_info.is_key_frame) { - uncomp_writer.WriteU(frame_sync_code, 24); - uncomp_writer.WriteU(0, 3); // Color space. - uncomp_writer.WriteU(0, 1); // Color range. - uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); - uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); - uncomp_writer.WriteBit(false); // Render and frame size different. - - // Reset context - prev_frame_probs = default_probs; - swap_ref_indices = false; - loop_filter_ref_deltas.fill(0); - loop_filter_mode_deltas.fill(0); - frame_ctxs.fill(default_probs); - - // intra only, meaning the frame can be recreated with no other references - current_frame_info.intra_only = true; - } else { - if (!current_frame_info.show_frame) { - uncomp_writer.WriteBit(current_frame_info.intra_only); - } else { - current_frame_info.intra_only = false; - } - if (!current_frame_info.error_resilient_mode) { - uncomp_writer.WriteU(0, 2); // Reset frame context. - } - const auto& curr_offsets = current_frame_info.frame_offsets; - const auto& next_offsets = next_frame.info.frame_offsets; - const bool ref_frames_different = curr_offsets[1] != curr_offsets[2]; - const bool next_references_swap = - (next_offsets[1] == curr_offsets[2]) || (next_offsets[2] == curr_offsets[1]); - const bool needs_ref_swap = ref_frames_different && next_references_swap; - if (needs_ref_swap) { - swap_ref_indices = !swap_ref_indices; - } - union { - u32 raw; - BitField<0, 1, u32> refresh_last; - BitField<1, 2, u32> refresh_golden; - BitField<2, 1, u32> refresh_alt; - } refresh_frame_flags; - - refresh_frame_flags.raw = 0; - for (u32 index = 0; index < 3; ++index) { - // Refresh indices that use the current frame as an index - if (curr_offsets[3] == next_offsets[index]) { - refresh_frame_flags.raw |= 1u << index; - } - } - if (swap_ref_indices) { - const u32 temp = refresh_frame_flags.refresh_golden; - refresh_frame_flags.refresh_golden.Assign(refresh_frame_flags.refresh_alt.Value()); - refresh_frame_flags.refresh_alt.Assign(temp); - } - if (current_frame_info.intra_only) { - uncomp_writer.WriteU(frame_sync_code, 24); - uncomp_writer.WriteU(refresh_frame_flags.raw, 8); - uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); - uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); - uncomp_writer.WriteBit(false); // Render and frame size different. - } else { - const bool swap_indices = needs_ref_swap ^ swap_ref_indices; - const auto ref_frame_index = swap_indices ? std::array{0, 2, 1} : std::array{0, 1, 2}; - uncomp_writer.WriteU(refresh_frame_flags.raw, 8); - for (size_t index = 1; index < 4; index++) { - uncomp_writer.WriteU(ref_frame_index[index - 1], 3); - uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1); - } - uncomp_writer.WriteBit(true); // Frame size with refs. - uncomp_writer.WriteBit(false); // Render and frame size different. - uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv); - uncomp_writer.WriteBit(current_frame_info.interp_filter == 4); - - if (current_frame_info.interp_filter != 4) { - uncomp_writer.WriteU(current_frame_info.interp_filter, 2); - } - } - } - - if (!current_frame_info.error_resilient_mode) { - uncomp_writer.WriteBit(true); // Refresh frame context. where do i get this info from? - uncomp_writer.WriteBit(true); // Frame parallel decoding mode. - } - - int frame_ctx_idx = 0; - if (!current_frame_info.show_frame) { - frame_ctx_idx = 1; - } - - uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index. - prev_frame_probs = frame_ctxs[frame_ctx_idx]; // reference probabilities for compressed header - frame_ctxs[frame_ctx_idx] = current_frame_info.entropy; - - uncomp_writer.WriteU(current_frame_info.first_level, 6); - uncomp_writer.WriteU(current_frame_info.sharpness_level, 3); - uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled); - - if (current_frame_info.mode_ref_delta_enabled) { - // check if ref deltas are different, update accordingly - std::array update_loop_filter_ref_deltas; - std::array update_loop_filter_mode_deltas; - - bool loop_filter_delta_update = false; - - for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { - const s8 old_deltas = loop_filter_ref_deltas[index]; - const s8 new_deltas = current_frame_info.ref_deltas[index]; - const bool differing_delta = old_deltas != new_deltas; - - update_loop_filter_ref_deltas[index] = differing_delta; - loop_filter_delta_update |= differing_delta; - } - - for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { - const s8 old_deltas = loop_filter_mode_deltas[index]; - const s8 new_deltas = current_frame_info.mode_deltas[index]; - const bool differing_delta = old_deltas != new_deltas; - - update_loop_filter_mode_deltas[index] = differing_delta; - loop_filter_delta_update |= differing_delta; - } - - uncomp_writer.WriteBit(loop_filter_delta_update); - - if (loop_filter_delta_update) { - for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { - uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]); - - if (update_loop_filter_ref_deltas[index]) { - uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6); - } - } - - for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { - uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]); - - if (update_loop_filter_mode_deltas[index]) { - uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6); - } - } - // save new deltas - loop_filter_ref_deltas = current_frame_info.ref_deltas; - loop_filter_mode_deltas = current_frame_info.mode_deltas; - } - } - - uncomp_writer.WriteU(current_frame_info.base_q_index, 8); - - uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q); - uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); - uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); - - ASSERT(!current_frame_info.segment_enabled); - uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). - - const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); - const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); - - const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2; - const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1; - - // If it's less than the maximum, we need to add an extra 0 on the bitstream - // to indicate that it should stop reading. - if (current_frame_info.log2_tile_cols < max_tile_cols_log2) { - uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1); - } else { - uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff); - } - - const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0; - - uncomp_writer.WriteBit(tile_rows_log2_is_nonzero); - - if (tile_rows_log2_is_nonzero) { - uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1); - } - - return uncomp_writer; -} - -void VP9::ComposeFrame(const NvdecCommon::NvdecRegisters& state) { - std::vector bitstream; - { - Vp9FrameContainer curr_frame = GetCurrentFrame(state); - current_frame_info = curr_frame.info; - bitstream = std::move(curr_frame.bit_stream); - } - // The uncompressed header routine sets PrevProb parameters needed for the compressed header - auto uncomp_writer = ComposeUncompressedHeader(); - std::vector compressed_header = ComposeCompressedHeader(); - - uncomp_writer.WriteU(static_cast(compressed_header.size()), 16); - uncomp_writer.Flush(); - std::vector uncompressed_header = uncomp_writer.GetByteArray(); - - // Write headers and frame to buffer - frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); - std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame.begin()); - std::copy(compressed_header.begin(), compressed_header.end(), - frame.begin() + uncompressed_header.size()); - std::copy(bitstream.begin(), bitstream.end(), - frame.begin() + uncompressed_header.size() + compressed_header.size()); -} - -VpxRangeEncoder::VpxRangeEncoder() { - Write(false); -} - -VpxRangeEncoder::~VpxRangeEncoder() = default; - -void VpxRangeEncoder::Write(s32 value, s32 value_size) { - for (s32 bit = value_size - 1; bit >= 0; bit--) { - Write(((value >> bit) & 1) != 0); - } -} - -void VpxRangeEncoder::Write(bool bit) { - Write(bit, half_probability); -} - -void VpxRangeEncoder::Write(bool bit, s32 probability) { - u32 local_range = range; - const u32 split = 1 + (((local_range - 1) * static_cast(probability)) >> 8); - local_range = split; - - if (bit) { - low_value += split; - local_range = range - split; - } - - s32 shift = static_cast(norm_lut[local_range]); - local_range <<= shift; - count += shift; - - if (count >= 0) { - const s32 offset = shift - count; - - if (((low_value << (offset - 1)) >> 31) != 0) { - const s32 current_pos = static_cast(base_stream.GetPosition()); - base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); - while (PeekByte() == 0xff) { - base_stream.WriteByte(0); - - base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos); - } - base_stream.WriteByte(static_cast((PeekByte() + 1))); - base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin); - } - base_stream.WriteByte(static_cast((low_value >> (24 - offset)))); - - low_value <<= offset; - shift = count; - low_value &= 0xffffff; - count -= 8; - } - - low_value <<= shift; - range = local_range; -} - -void VpxRangeEncoder::End() { - for (std::size_t index = 0; index < 32; ++index) { - Write(false); - } -} - -u8 VpxRangeEncoder::PeekByte() { - const u8 value = base_stream.ReadByte(); - base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); - - return value; -} - -VpxBitStreamWriter::VpxBitStreamWriter() = default; - -VpxBitStreamWriter::~VpxBitStreamWriter() = default; - -void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) { - WriteBits(value, value_size); -} - -void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) { - const bool sign = value < 0; - if (sign) { - value = -value; - } - - WriteBits(static_cast(value << 1) | (sign ? 1 : 0), value_size + 1); -} - -void VpxBitStreamWriter::WriteDeltaQ(u32 value) { - const bool delta_coded = value != 0; - WriteBit(delta_coded); - - if (delta_coded) { - WriteBits(value, 4); - } -} - -void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) { - s32 value_pos = 0; - s32 remaining = bit_count; - - while (remaining > 0) { - s32 copy_size = remaining; - - const s32 free = GetFreeBufferBits(); - - if (copy_size > free) { - copy_size = free; - } - - const s32 mask = (1 << copy_size) - 1; - - const s32 src_shift = (bit_count - value_pos) - copy_size; - const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; - - buffer |= ((value >> src_shift) & mask) << dst_shift; - - value_pos += copy_size; - buffer_pos += copy_size; - remaining -= copy_size; - } -} - -void VpxBitStreamWriter::WriteBit(bool state) { - WriteBits(state ? 1 : 0, 1); -} - -s32 VpxBitStreamWriter::GetFreeBufferBits() { - if (buffer_pos == buffer_size) { - Flush(); - } - - return buffer_size - buffer_pos; -} - -void VpxBitStreamWriter::Flush() { - if (buffer_pos == 0) { - return; - } - byte_array.push_back(static_cast(buffer)); - buffer = 0; - buffer_pos = 0; -} - -std::vector& VpxBitStreamWriter::GetByteArray() { - return byte_array; -} - -const std::vector& VpxBitStreamWriter::GetByteArray() const { - return byte_array; -} - -} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/vp9.h b/src/video_core/command_classes/codecs/vp9.h deleted file mode 100644 index ecc40e8b1..000000000 --- a/src/video_core/command_classes/codecs/vp9.h +++ /dev/null @@ -1,192 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include - -#include "common/common_types.h" -#include "common/stream.h" -#include "video_core/command_classes/codecs/vp9_types.h" -#include "video_core/command_classes/nvdec_common.h" - -namespace Tegra { -class GPU; -namespace Decoder { - -/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the -/// VP9 header bitstreams. - -class VpxRangeEncoder { -public: - VpxRangeEncoder(); - ~VpxRangeEncoder(); - - VpxRangeEncoder(const VpxRangeEncoder&) = delete; - VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete; - - VpxRangeEncoder(VpxRangeEncoder&&) = default; - VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default; - - /// Writes the rightmost value_size bits from value into the stream - void Write(s32 value, s32 value_size); - - /// Writes a single bit with half probability - void Write(bool bit); - - /// Writes a bit to the base_stream encoded with probability - void Write(bool bit, s32 probability); - - /// Signal the end of the bitstream - void End(); - - [[nodiscard]] std::vector& GetBuffer() { - return base_stream.GetBuffer(); - } - - [[nodiscard]] const std::vector& GetBuffer() const { - return base_stream.GetBuffer(); - } - -private: - u8 PeekByte(); - Common::Stream base_stream{}; - u32 low_value{}; - u32 range{0xff}; - s32 count{-24}; - s32 half_probability{128}; -}; - -class VpxBitStreamWriter { -public: - VpxBitStreamWriter(); - ~VpxBitStreamWriter(); - - VpxBitStreamWriter(const VpxBitStreamWriter&) = delete; - VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete; - - VpxBitStreamWriter(VpxBitStreamWriter&&) = default; - VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default; - - /// Write an unsigned integer value - void WriteU(u32 value, u32 value_size); - - /// Write a signed integer value - void WriteS(s32 value, u32 value_size); - - /// Based on 6.2.10 of VP9 Spec, writes a delta coded value - void WriteDeltaQ(u32 value); - - /// Write a single bit. - void WriteBit(bool state); - - /// Pushes current buffer into buffer_array, resets buffer - void Flush(); - - /// Returns byte_array - [[nodiscard]] std::vector& GetByteArray(); - - /// Returns const byte_array - [[nodiscard]] const std::vector& GetByteArray() const; - -private: - /// Write bit_count bits from value into buffer - void WriteBits(u32 value, u32 bit_count); - - /// Gets next available position in buffer, invokes Flush() if buffer is full - s32 GetFreeBufferBits(); - - s32 buffer_size{8}; - - s32 buffer{}; - s32 buffer_pos{}; - std::vector byte_array; -}; - -class VP9 { -public: - explicit VP9(GPU& gpu_); - ~VP9(); - - VP9(const VP9&) = delete; - VP9& operator=(const VP9&) = delete; - - VP9(VP9&&) = default; - VP9& operator=(VP9&&) = delete; - - /// Composes the VP9 frame from the GPU state information. - /// Based on the official VP9 spec documentation - void ComposeFrame(const NvdecCommon::NvdecRegisters& state); - - /// Returns true if the most recent frame was a hidden frame. - [[nodiscard]] bool WasFrameHidden() const { - return !current_frame_info.show_frame; - } - - /// Returns a const reference to the composed frame data. - [[nodiscard]] const std::vector& GetFrameBytes() const { - return frame; - } - -private: - /// Generates compressed header probability updates in the bitstream writer - template - void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array& new_prob, - const std::array& old_prob); - - /// Generates compressed header probability updates in the bitstream writer - /// If probs are not equal, WriteProbabilityDelta is invoked - void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); - - /// Generates compressed header probability deltas in the bitstream writer - void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); - - /// Inverse of 6.3.4 Decode term subexp - void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value); - - /// Writes if the value is less than the test value - bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test); - - /// Writes probability updates for the Coef probabilities - void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, - const std::array& new_prob, - const std::array& old_prob); - - /// Write probabilities for 4-byte aligned structures - template - void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array& new_prob, - const std::array& old_prob); - - /// Write motion vector probability updates. 6.3.17 in the spec - void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); - - /// Returns VP9 information from NVDEC provided offset and size - [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state); - - /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct - void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); - - /// Returns frame to be decoded after buffering - [[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state); - - /// Use NVDEC providied information to compose the headers for the current frame - [[nodiscard]] std::vector ComposeCompressedHeader(); - [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); - - GPU& gpu; - std::vector frame; - - std::array loop_filter_ref_deltas{}; - std::array loop_filter_mode_deltas{}; - - Vp9FrameContainer next_frame{}; - std::array frame_ctxs{}; - bool swap_ref_indices{}; - - Vp9PictureInfo current_frame_info{}; - Vp9EntropyProbs prev_frame_probs{}; -}; - -} // namespace Decoder -} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h deleted file mode 100644 index bb3d8df6e..000000000 --- a/src/video_core/command_classes/codecs/vp9_types.h +++ /dev/null @@ -1,306 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include -#include "common/common_funcs.h" -#include "common/common_types.h" - -namespace Tegra { -class GPU; - -namespace Decoder { -struct Vp9FrameDimensions { - s16 width; - s16 height; - s16 luma_pitch; - s16 chroma_pitch; -}; -static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size"); - -enum class FrameFlags : u32 { - IsKeyFrame = 1 << 0, - LastFrameIsKeyFrame = 1 << 1, - FrameSizeChanged = 1 << 2, - ErrorResilientMode = 1 << 3, - LastShowFrame = 1 << 4, - IntraOnly = 1 << 5, -}; -DECLARE_ENUM_FLAG_OPERATORS(FrameFlags) - -enum class TxSize { - Tx4x4 = 0, // 4x4 transform - Tx8x8 = 1, // 8x8 transform - Tx16x16 = 2, // 16x16 transform - Tx32x32 = 3, // 32x32 transform - TxSizes = 4 -}; - -enum class TxMode { - Only4X4 = 0, // Only 4x4 transform used - Allow8X8 = 1, // Allow block transform size up to 8x8 - Allow16X16 = 2, // Allow block transform size up to 16x16 - Allow32X32 = 3, // Allow block transform size up to 32x32 - TxModeSelect = 4, // Transform specified for each block - TxModes = 5 -}; - -struct Segmentation { - u8 enabled; - u8 update_map; - u8 temporal_update; - u8 abs_delta; - std::array feature_mask; - std::array, 8> feature_data; -}; -static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); - -struct LoopFilter { - u8 mode_ref_delta_enabled; - std::array ref_deltas; - std::array mode_deltas; -}; -static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size"); - -struct Vp9EntropyProbs { - std::array y_mode_prob; ///< 0x0000 - std::array partition_prob; ///< 0x0024 - std::array coef_probs; ///< 0x0064 - std::array switchable_interp_prob; ///< 0x0724 - std::array inter_mode_prob; ///< 0x072C - std::array intra_inter_prob; ///< 0x0748 - std::array comp_inter_prob; ///< 0x074C - std::array single_ref_prob; ///< 0x0751 - std::array comp_ref_prob; ///< 0x075B - std::array tx_32x32_prob; ///< 0x0760 - std::array tx_16x16_prob; ///< 0x0766 - std::array tx_8x8_prob; ///< 0x076A - std::array skip_probs; ///< 0x076C - std::array joints; ///< 0x076F - std::array sign; ///< 0x0772 - std::array classes; ///< 0x0774 - std::array class_0; ///< 0x0788 - std::array prob_bits; ///< 0x078A - std::array class_0_fr; ///< 0x079E - std::array fr; ///< 0x07AA - std::array class_0_hp; ///< 0x07B0 - std::array high_precision; ///< 0x07B2 -}; -static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size"); - -struct Vp9PictureInfo { - u32 bitstream_size; - std::array frame_offsets; - std::array ref_frame_sign_bias; - s32 base_q_index; - s32 y_dc_delta_q; - s32 uv_dc_delta_q; - s32 uv_ac_delta_q; - s32 transform_mode; - s32 interp_filter; - s32 reference_mode; - s32 log2_tile_cols; - s32 log2_tile_rows; - std::array ref_deltas; - std::array mode_deltas; - Vp9EntropyProbs entropy; - Vp9FrameDimensions frame_size; - u8 first_level; - u8 sharpness_level; - bool is_key_frame; - bool intra_only; - bool last_frame_was_key; - bool error_resilient_mode; - bool last_frame_shown; - bool show_frame; - bool lossless; - bool allow_high_precision_mv; - bool segment_enabled; - bool mode_ref_delta_enabled; -}; - -struct Vp9FrameContainer { - Vp9PictureInfo info{}; - std::vector bit_stream; -}; - -struct PictureInfo { - INSERT_PADDING_WORDS_NOINIT(12); ///< 0x00 - u32 bitstream_size; ///< 0x30 - INSERT_PADDING_WORDS_NOINIT(5); ///< 0x34 - Vp9FrameDimensions last_frame_size; ///< 0x48 - Vp9FrameDimensions golden_frame_size; ///< 0x50 - Vp9FrameDimensions alt_frame_size; ///< 0x58 - Vp9FrameDimensions current_frame_size; ///< 0x60 - FrameFlags vp9_flags; ///< 0x68 - std::array ref_frame_sign_bias; ///< 0x6C - u8 first_level; ///< 0x70 - u8 sharpness_level; ///< 0x71 - u8 base_q_index; ///< 0x72 - u8 y_dc_delta_q; ///< 0x73 - u8 uv_ac_delta_q; ///< 0x74 - u8 uv_dc_delta_q; ///< 0x75 - u8 lossless; ///< 0x76 - u8 tx_mode; ///< 0x77 - u8 allow_high_precision_mv; ///< 0x78 - u8 interp_filter; ///< 0x79 - u8 reference_mode; ///< 0x7A - INSERT_PADDING_BYTES_NOINIT(3); ///< 0x7B - u8 log2_tile_cols; ///< 0x7E - u8 log2_tile_rows; ///< 0x7F - Segmentation segmentation; ///< 0x80 - LoopFilter loop_filter; ///< 0xE4 - INSERT_PADDING_BYTES_NOINIT(21); ///< 0xEB - - [[nodiscard]] Vp9PictureInfo Convert() const { - return { - .bitstream_size = bitstream_size, - .frame_offsets{}, - .ref_frame_sign_bias = ref_frame_sign_bias, - .base_q_index = base_q_index, - .y_dc_delta_q = y_dc_delta_q, - .uv_dc_delta_q = uv_dc_delta_q, - .uv_ac_delta_q = uv_ac_delta_q, - .transform_mode = tx_mode, - .interp_filter = interp_filter, - .reference_mode = reference_mode, - .log2_tile_cols = log2_tile_cols, - .log2_tile_rows = log2_tile_rows, - .ref_deltas = loop_filter.ref_deltas, - .mode_deltas = loop_filter.mode_deltas, - .entropy{}, - .frame_size = current_frame_size, - .first_level = first_level, - .sharpness_level = sharpness_level, - .is_key_frame = True(vp9_flags & FrameFlags::IsKeyFrame), - .intra_only = True(vp9_flags & FrameFlags::IntraOnly), - .last_frame_was_key = True(vp9_flags & FrameFlags::LastFrameIsKeyFrame), - .error_resilient_mode = True(vp9_flags & FrameFlags::ErrorResilientMode), - .last_frame_shown = True(vp9_flags & FrameFlags::LastShowFrame), - .show_frame = true, - .lossless = lossless != 0, - .allow_high_precision_mv = allow_high_precision_mv != 0, - .segment_enabled = segmentation.enabled != 0, - .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0, - }; - } -}; -static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); - -struct EntropyProbs { - INSERT_PADDING_BYTES_NOINIT(1024); ///< 0x0000 - std::array inter_mode_prob; ///< 0x0400 - std::array intra_inter_prob; ///< 0x041C - INSERT_PADDING_BYTES_NOINIT(80); ///< 0x0420 - std::array tx_8x8_prob; ///< 0x0470 - std::array tx_16x16_prob; ///< 0x0472 - std::array tx_32x32_prob; ///< 0x0476 - std::array y_mode_prob_e8; ///< 0x047C - std::array, 4> y_mode_prob_e0e7; ///< 0x0480 - INSERT_PADDING_BYTES_NOINIT(64); ///< 0x04A0 - std::array partition_prob; ///< 0x04E0 - INSERT_PADDING_BYTES_NOINIT(10); ///< 0x0520 - std::array switchable_interp_prob; ///< 0x052A - std::array comp_inter_prob; ///< 0x0532 - std::array skip_probs; ///< 0x0537 - INSERT_PADDING_BYTES_NOINIT(1); ///< 0x053A - std::array joints; ///< 0x053B - std::array sign; ///< 0x053E - std::array class_0; ///< 0x0540 - std::array fr; ///< 0x0542 - std::array class_0_hp; ///< 0x0548 - std::array high_precision; ///< 0x054A - std::array classes; ///< 0x054C - std::array class_0_fr; ///< 0x0560 - std::array pred_bits; ///< 0x056C - std::array single_ref_prob; ///< 0x0580 - std::array comp_ref_prob; ///< 0x058A - INSERT_PADDING_BYTES_NOINIT(17); ///< 0x058F - std::array coef_probs; ///< 0x05A0 - - void Convert(Vp9EntropyProbs& fc) { - fc.inter_mode_prob = inter_mode_prob; - fc.intra_inter_prob = intra_inter_prob; - fc.tx_8x8_prob = tx_8x8_prob; - fc.tx_16x16_prob = tx_16x16_prob; - fc.tx_32x32_prob = tx_32x32_prob; - - for (std::size_t i = 0; i < 4; i++) { - for (std::size_t j = 0; j < 9; j++) { - fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i]; - } - } - - fc.partition_prob = partition_prob; - fc.switchable_interp_prob = switchable_interp_prob; - fc.comp_inter_prob = comp_inter_prob; - fc.skip_probs = skip_probs; - fc.joints = joints; - fc.sign = sign; - fc.class_0 = class_0; - fc.fr = fr; - fc.class_0_hp = class_0_hp; - fc.high_precision = high_precision; - fc.classes = classes; - fc.class_0_fr = class_0_fr; - fc.prob_bits = pred_bits; - fc.single_ref_prob = single_ref_prob; - fc.comp_ref_prob = comp_ref_prob; - - // Skip the 4th element as it goes unused - for (std::size_t i = 0; i < coef_probs.size(); i += 4) { - const std::size_t j = i - i / 4; - fc.coef_probs[j] = coef_probs[i]; - fc.coef_probs[j + 1] = coef_probs[i + 1]; - fc.coef_probs[j + 2] = coef_probs[i + 2]; - } - } -}; -static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size"); - -enum class Ref { Last, Golden, AltRef }; - -struct RefPoolElement { - s64 frame{}; - Ref ref{}; - bool refresh{}; -}; - -#define ASSERT_POSITION(field_name, position) \ - static_assert(offsetof(Vp9EntropyProbs, field_name) == position, \ - "Field " #field_name " has invalid position") - -ASSERT_POSITION(partition_prob, 0x0024); -ASSERT_POSITION(switchable_interp_prob, 0x0724); -ASSERT_POSITION(sign, 0x0772); -ASSERT_POSITION(class_0_fr, 0x079E); -ASSERT_POSITION(high_precision, 0x07B2); -#undef ASSERT_POSITION - -#define ASSERT_POSITION(field_name, position) \ - static_assert(offsetof(PictureInfo, field_name) == position, \ - "Field " #field_name " has invalid position") - -ASSERT_POSITION(bitstream_size, 0x30); -ASSERT_POSITION(last_frame_size, 0x48); -ASSERT_POSITION(first_level, 0x70); -ASSERT_POSITION(segmentation, 0x80); -ASSERT_POSITION(loop_filter, 0xE4); -#undef ASSERT_POSITION - -#define ASSERT_POSITION(field_name, position) \ - static_assert(offsetof(EntropyProbs, field_name) == position, \ - "Field " #field_name " has invalid position") - -ASSERT_POSITION(inter_mode_prob, 0x400); -ASSERT_POSITION(tx_8x8_prob, 0x470); -ASSERT_POSITION(partition_prob, 0x4E0); -ASSERT_POSITION(class_0, 0x540); -ASSERT_POSITION(class_0_fr, 0x560); -ASSERT_POSITION(coef_probs, 0x5A0); -#undef ASSERT_POSITION - -}; // namespace Decoder -}; // namespace Tegra diff --git a/src/video_core/command_classes/host1x.h b/src/video_core/command_classes/host1x.h deleted file mode 100644 index bb48a4381..000000000 --- a/src/video_core/command_classes/host1x.h +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include "common/common_types.h" - -namespace Tegra { -class GPU; -class Nvdec; - -class Host1x { -public: - enum class Method : u32 { - WaitSyncpt = 0x8, - LoadSyncptPayload32 = 0x4e, - WaitSyncpt32 = 0x50, - }; - - explicit Host1x(GPU& gpu); - ~Host1x(); - - /// Writes the method into the state, Invoke Execute() if encountered - void ProcessMethod(Method method, u32 argument); - -private: - /// For Host1x, execute is waiting on a syncpoint previously written into the state - void Execute(u32 data); - - u32 syncpoint_value{}; - GPU& gpu; -}; - -} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp deleted file mode 100644 index 4fbbe3da6..000000000 --- a/src/video_core/command_classes/nvdec.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include "common/assert.h" -#include "video_core/command_classes/nvdec.h" -#include "video_core/gpu.h" - -namespace Tegra { - -#define NVDEC_REG_INDEX(field_name) \ - (offsetof(NvdecCommon::NvdecRegisters, field_name) / sizeof(u64)) - -Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), state{}, codec(std::make_unique(gpu, state)) {} - -Nvdec::~Nvdec() = default; - -void Nvdec::ProcessMethod(u32 method, u32 argument) { - state.reg_array[method] = static_cast(argument) << 8; - - switch (method) { - case NVDEC_REG_INDEX(set_codec_id): - codec->SetTargetCodec(static_cast(argument)); - break; - case NVDEC_REG_INDEX(execute): - Execute(); - break; - } -} - -AVFramePtr Nvdec::GetFrame() { - return codec->GetCurrentFrame(); -} - -void Nvdec::Execute() { - switch (codec->GetCurrentCodec()) { - case NvdecCommon::VideoCodec::H264: - case NvdecCommon::VideoCodec::VP8: - case NvdecCommon::VideoCodec::VP9: - codec->Decode(); - break; - default: - UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName()); - break; - } -} - -} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h deleted file mode 100644 index 488531fc6..000000000 --- a/src/video_core/command_classes/nvdec.h +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include -#include "common/common_types.h" -#include "video_core/command_classes/codecs/codec.h" - -namespace Tegra { -class GPU; - -class Nvdec { -public: - explicit Nvdec(GPU& gpu); - ~Nvdec(); - - /// Writes the method into the state, Invoke Execute() if encountered - void ProcessMethod(u32 method, u32 argument); - - /// Return most recently decoded frame - [[nodiscard]] AVFramePtr GetFrame(); - -private: - /// Invoke codec to decode a frame - void Execute(); - - GPU& gpu; - NvdecCommon::NvdecRegisters state; - std::unique_ptr codec; -}; -} // namespace Tegra diff --git a/src/video_core/command_classes/nvdec_common.h b/src/video_core/command_classes/nvdec_common.h deleted file mode 100644 index 521e5b52b..000000000 --- a/src/video_core/command_classes/nvdec_common.h +++ /dev/null @@ -1,97 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include "common/bit_field.h" -#include "common/common_funcs.h" -#include "common/common_types.h" - -namespace Tegra::NvdecCommon { - -enum class VideoCodec : u64 { - None = 0x0, - H264 = 0x3, - VP8 = 0x5, - H265 = 0x7, - VP9 = 0x9, -}; - -// NVDEC should use a 32-bit address space, but is mapped to 64-bit, -// doubling the sizes here is compensating for that. -struct NvdecRegisters { - static constexpr std::size_t NUM_REGS = 0x178; - - union { - struct { - INSERT_PADDING_WORDS_NOINIT(256); ///< 0x0000 - VideoCodec set_codec_id; ///< 0x0400 - INSERT_PADDING_WORDS_NOINIT(126); ///< 0x0408 - u64 execute; ///< 0x0600 - INSERT_PADDING_WORDS_NOINIT(126); ///< 0x0608 - struct { ///< 0x0800 - union { - BitField<0, 3, VideoCodec> codec; - BitField<4, 1, u64> gp_timer_on; - BitField<13, 1, u64> mb_timer_on; - BitField<14, 1, u64> intra_frame_pslc; - BitField<17, 1, u64> all_intra_frame; - }; - } control_params; - u64 picture_info_offset; ///< 0x0808 - u64 frame_bitstream_offset; ///< 0x0810 - u64 frame_number; ///< 0x0818 - u64 h264_slice_data_offsets; ///< 0x0820 - u64 h264_mv_dump_offset; ///< 0x0828 - INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 - u64 frame_stats_offset; ///< 0x0848 - u64 h264_last_surface_luma_offset; ///< 0x0850 - u64 h264_last_surface_chroma_offset; ///< 0x0858 - std::array surface_luma_offset; ///< 0x0860 - std::array surface_chroma_offset; ///< 0x08E8 - INSERT_PADDING_WORDS_NOINIT(68); ///< 0x0970 - u64 vp8_prob_data_offset; ///< 0x0A80 - u64 vp8_header_partition_buf_offset; ///< 0x0A88 - INSERT_PADDING_WORDS_NOINIT(60); ///< 0x0A90 - u64 vp9_entropy_probs_offset; ///< 0x0B80 - u64 vp9_backward_updates_offset; ///< 0x0B88 - u64 vp9_last_frame_segmap_offset; ///< 0x0B90 - u64 vp9_curr_frame_segmap_offset; ///< 0x0B98 - INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BA0 - u64 vp9_last_frame_mvs_offset; ///< 0x0BA8 - u64 vp9_curr_frame_mvs_offset; ///< 0x0BB0 - INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BB8 - }; - std::array reg_array; - }; -}; -static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size"); - -#define ASSERT_REG_POSITION(field_name, position) \ - static_assert(offsetof(NvdecRegisters, field_name) == position * sizeof(u64), \ - "Field " #field_name " has invalid position") - -ASSERT_REG_POSITION(set_codec_id, 0x80); -ASSERT_REG_POSITION(execute, 0xC0); -ASSERT_REG_POSITION(control_params, 0x100); -ASSERT_REG_POSITION(picture_info_offset, 0x101); -ASSERT_REG_POSITION(frame_bitstream_offset, 0x102); -ASSERT_REG_POSITION(frame_number, 0x103); -ASSERT_REG_POSITION(h264_slice_data_offsets, 0x104); -ASSERT_REG_POSITION(frame_stats_offset, 0x109); -ASSERT_REG_POSITION(h264_last_surface_luma_offset, 0x10A); -ASSERT_REG_POSITION(h264_last_surface_chroma_offset, 0x10B); -ASSERT_REG_POSITION(surface_luma_offset, 0x10C); -ASSERT_REG_POSITION(surface_chroma_offset, 0x11D); -ASSERT_REG_POSITION(vp8_prob_data_offset, 0x150); -ASSERT_REG_POSITION(vp8_header_partition_buf_offset, 0x151); -ASSERT_REG_POSITION(vp9_entropy_probs_offset, 0x170); -ASSERT_REG_POSITION(vp9_backward_updates_offset, 0x171); -ASSERT_REG_POSITION(vp9_last_frame_segmap_offset, 0x172); -ASSERT_REG_POSITION(vp9_curr_frame_segmap_offset, 0x173); -ASSERT_REG_POSITION(vp9_last_frame_mvs_offset, 0x175); -ASSERT_REG_POSITION(vp9_curr_frame_mvs_offset, 0x176); - -#undef ASSERT_REG_POSITION - -} // namespace Tegra::NvdecCommon diff --git a/src/video_core/command_classes/sync_manager.cpp b/src/video_core/command_classes/sync_manager.cpp deleted file mode 100644 index 67e58046f..000000000 --- a/src/video_core/command_classes/sync_manager.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-FileCopyrightText: Ryujinx Team and Contributors -// SPDX-License-Identifier: MIT - -#include -#include "sync_manager.h" -#include "video_core/gpu.h" - -namespace Tegra { -SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {} -SyncptIncrManager::~SyncptIncrManager() = default; - -void SyncptIncrManager::Increment(u32 id) { - increments.emplace_back(0, 0, id, true); - IncrementAllDone(); -} - -u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) { - const u32 handle = current_id++; - increments.emplace_back(handle, class_id, id); - return handle; -} - -void SyncptIncrManager::SignalDone(u32 handle) { - const auto done_incr = - std::find_if(increments.begin(), increments.end(), - [handle](const SyncptIncr& incr) { return incr.id == handle; }); - if (done_incr != increments.cend()) { - done_incr->complete = true; - } - IncrementAllDone(); -} - -void SyncptIncrManager::IncrementAllDone() { - std::size_t done_count = 0; - for (; done_count < increments.size(); ++done_count) { - if (!increments[done_count].complete) { - break; - } - gpu.IncrementSyncPoint(increments[done_count].syncpt_id); - } - increments.erase(increments.begin(), increments.begin() + done_count); -} -} // namespace Tegra diff --git a/src/video_core/command_classes/sync_manager.h b/src/video_core/command_classes/sync_manager.h deleted file mode 100644 index 6dfaae080..000000000 --- a/src/video_core/command_classes/sync_manager.h +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: Ryujinx Team and Contributors -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include "common/common_types.h" - -namespace Tegra { -class GPU; -struct SyncptIncr { - u32 id; - u32 class_id; - u32 syncpt_id; - bool complete; - - SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false) - : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {} -}; - -class SyncptIncrManager { -public: - explicit SyncptIncrManager(GPU& gpu); - ~SyncptIncrManager(); - - /// Add syncpoint id and increment all - void Increment(u32 id); - - /// Returns a handle to increment later - u32 IncrementWhenDone(u32 class_id, u32 id); - - /// IncrememntAllDone, including handle - void SignalDone(u32 handle); - - /// Increment all sequential pending increments that are already done. - void IncrementAllDone(); - -private: - std::vector increments; - std::mutex increment_lock; - u32 current_id{}; - - GPU& gpu; -}; - -} // namespace Tegra diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp deleted file mode 100644 index 7c17df353..000000000 --- a/src/video_core/command_classes/vic.cpp +++ /dev/null @@ -1,238 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include - -extern "C" { -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" -#endif -#include -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic pop -#endif -} - -#include "common/assert.h" -#include "common/bit_field.h" -#include "common/logging/log.h" - -#include "video_core/command_classes/nvdec.h" -#include "video_core/command_classes/vic.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/gpu.h" -#include "video_core/memory_manager.h" -#include "video_core/textures/decoders.h" - -namespace Tegra { -namespace { -enum class VideoPixelFormat : u64_le { - RGBA8 = 0x1f, - BGRA8 = 0x20, - RGBX8 = 0x23, - YUV420 = 0x44, -}; -} // Anonymous namespace - -union VicConfig { - u64_le raw{}; - BitField<0, 7, VideoPixelFormat> pixel_format; - BitField<7, 2, u64_le> chroma_loc_horiz; - BitField<9, 2, u64_le> chroma_loc_vert; - BitField<11, 4, u64_le> block_linear_kind; - BitField<15, 4, u64_le> block_linear_height_log2; - BitField<32, 14, u64_le> surface_width_minus1; - BitField<46, 14, u64_le> surface_height_minus1; -}; - -Vic::Vic(GPU& gpu_, std::shared_ptr nvdec_processor_) - : gpu(gpu_), - nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {} - -Vic::~Vic() = default; - -void Vic::ProcessMethod(Method method, u32 argument) { - LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast(method)); - const u64 arg = static_cast(argument) << 8; - switch (method) { - case Method::Execute: - Execute(); - break; - case Method::SetConfigStructOffset: - config_struct_address = arg; - break; - case Method::SetOutputSurfaceLumaOffset: - output_surface_luma_address = arg; - break; - case Method::SetOutputSurfaceChromaOffset: - output_surface_chroma_address = arg; - break; - default: - break; - } -} - -void Vic::Execute() { - if (output_surface_luma_address == 0) { - LOG_ERROR(Service_NVDRV, "VIC Luma address not set."); - return; - } - const VicConfig config{gpu.MemoryManager().Read(config_struct_address + 0x20)}; - const AVFramePtr frame_ptr = nvdec_processor->GetFrame(); - const auto* frame = frame_ptr.get(); - if (!frame) { - return; - } - const u64 surface_width = config.surface_width_minus1 + 1; - const u64 surface_height = config.surface_height_minus1 + 1; - if (static_cast(frame->width) != surface_width || - static_cast(frame->height) != surface_height) { - // TODO: Properly support multiple video streams with differing frame dimensions - LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}", - frame->width, frame->height, surface_width, surface_height); - } - switch (config.pixel_format) { - case VideoPixelFormat::RGBA8: - case VideoPixelFormat::BGRA8: - case VideoPixelFormat::RGBX8: - WriteRGBFrame(frame, config); - break; - case VideoPixelFormat::YUV420: - WriteYUVFrame(frame, config); - break; - default: - UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value()); - break; - } -} - -void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) { - LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); - - if (!scaler_ctx || frame->width != scaler_width || frame->height != scaler_height) { - const AVPixelFormat target_format = [pixel_format = config.pixel_format]() { - switch (pixel_format) { - case VideoPixelFormat::RGBA8: - return AV_PIX_FMT_RGBA; - case VideoPixelFormat::BGRA8: - return AV_PIX_FMT_BGRA; - case VideoPixelFormat::RGBX8: - return AV_PIX_FMT_RGB0; - default: - return AV_PIX_FMT_RGBA; - } - }(); - - sws_freeContext(scaler_ctx); - // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format - scaler_ctx = sws_getContext(frame->width, frame->height, - static_cast(frame->format), frame->width, - frame->height, target_format, 0, nullptr, nullptr, nullptr); - scaler_width = frame->width; - scaler_height = frame->height; - converted_frame_buffer.reset(); - } - if (!converted_frame_buffer) { - const size_t frame_size = frame->width * frame->height * 4; - converted_frame_buffer = AVMallocPtr{static_cast(av_malloc(frame_size)), av_free}; - } - const std::array converted_stride{frame->width * 4, frame->height * 4, 0, 0}; - u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; - sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, &converted_frame_buf_addr, - converted_stride.data()); - - // Use the minimum of surface/frame dimensions to avoid buffer overflow. - const u32 surface_width = static_cast(config.surface_width_minus1) + 1; - const u32 surface_height = static_cast(config.surface_height_minus1) + 1; - const u32 width = std::min(surface_width, static_cast(frame->width)); - const u32 height = std::min(surface_height, static_cast(frame->height)); - const u32 blk_kind = static_cast(config.block_linear_kind); - if (blk_kind != 0) { - // swizzle pitch linear to block linear - const u32 block_height = static_cast(config.block_linear_height_log2); - const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); - luma_buffer.resize(size); - Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(), - converted_frame_buf_addr, block_height, 0, 0); - - gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); - } else { - // send pitch linear frame - const size_t linear_size = width * height * 4; - gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, - linear_size); - } -} - -void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) { - LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); - - const std::size_t surface_width = config.surface_width_minus1 + 1; - const std::size_t surface_height = config.surface_height_minus1 + 1; - const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; - // Use the minimum of surface/frame dimensions to avoid buffer overflow. - const auto frame_width = std::min(surface_width, static_cast(frame->width)); - const auto frame_height = std::min(surface_height, static_cast(frame->height)); - - const auto stride = static_cast(frame->linesize[0]); - - luma_buffer.resize(aligned_width * surface_height); - chroma_buffer.resize(aligned_width * surface_height / 2); - - // Populate luma buffer - const u8* luma_src = frame->data[0]; - for (std::size_t y = 0; y < frame_height; ++y) { - const std::size_t src = y * stride; - const std::size_t dst = y * aligned_width; - for (std::size_t x = 0; x < frame_width; ++x) { - luma_buffer[dst + x] = luma_src[src + x]; - } - } - gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), - luma_buffer.size()); - - // Chroma - const std::size_t half_height = frame_height / 2; - const auto half_stride = static_cast(frame->linesize[1]); - - switch (frame->format) { - case AV_PIX_FMT_YUV420P: { - // Frame from FFmpeg software - // Populate chroma buffer from both channels with interleaving. - const std::size_t half_width = frame_width / 2; - const u8* chroma_b_src = frame->data[1]; - const u8* chroma_r_src = frame->data[2]; - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * half_stride; - const std::size_t dst = y * aligned_width; - - for (std::size_t x = 0; x < half_width; ++x) { - chroma_buffer[dst + x * 2] = chroma_b_src[src + x]; - chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x]; - } - } - break; - } - case AV_PIX_FMT_NV12: { - // Frame from VA-API hardware - // This is already interleaved so just copy - const u8* chroma_src = frame->data[1]; - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * stride; - const std::size_t dst = y * aligned_width; - for (std::size_t x = 0; x < frame_width; ++x) { - chroma_buffer[dst + x] = chroma_src[src + x]; - } - } - break; - } - default: - ASSERT(false); - break; - } - gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), - chroma_buffer.size()); -} - -} // namespace Tegra diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h deleted file mode 100644 index 010daa6b6..000000000 --- a/src/video_core/command_classes/vic.h +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include -#include "common/common_types.h" - -struct SwsContext; - -namespace Tegra { -class GPU; -class Nvdec; -union VicConfig; - -class Vic { -public: - enum class Method : u32 { - Execute = 0xc0, - SetControlParams = 0x1c1, - SetConfigStructOffset = 0x1c2, - SetOutputSurfaceLumaOffset = 0x1c8, - SetOutputSurfaceChromaOffset = 0x1c9, - SetOutputSurfaceChromaUnusedOffset = 0x1ca - }; - - explicit Vic(GPU& gpu, std::shared_ptr nvdec_processor); - - ~Vic(); - - /// Write to the device state. - void ProcessMethod(Method method, u32 argument); - -private: - void Execute(); - - void WriteRGBFrame(const AVFrame* frame, const VicConfig& config); - - void WriteYUVFrame(const AVFrame* frame, const VicConfig& config); - - GPU& gpu; - std::shared_ptr nvdec_processor; - - /// Avoid reallocation of the following buffers every frame, as their - /// size does not change during a stream - using AVMallocPtr = std::unique_ptr; - AVMallocPtr converted_frame_buffer; - std::vector luma_buffer; - std::vector chroma_buffer; - - GPUVAddr config_struct_address{}; - GPUVAddr output_surface_luma_address{}; - GPUVAddr output_surface_chroma_address{}; - - SwsContext* scaler_ctx{}; - s32 scaler_width{}; - s32 scaler_height{}; -}; - -} // namespace Tegra diff --git a/src/video_core/control/channel_state.cpp b/src/video_core/control/channel_state.cpp index 67803fe94..3613c4992 100644 --- a/src/video_core/control/channel_state.cpp +++ b/src/video_core/control/channel_state.cpp @@ -1,5 +1,5 @@ // Copyright 2021 yuzu Emulator Project -// Licensed under GPLv2 or any later version +// Licensed under GPLv3 or any later version // Refer to the license.txt file included. #include "common/assert.h" diff --git a/src/video_core/control/channel_state.h b/src/video_core/control/channel_state.h index 82808a6b8..08a7591e1 100644 --- a/src/video_core/control/channel_state.h +++ b/src/video_core/control/channel_state.h @@ -1,5 +1,5 @@ // Copyright 2021 yuzu Emulator Project -// Licensed under GPLv2 or any later version +// Licensed under GPLv3 or any later version // Refer to the license.txt file included. #pragma once diff --git a/src/video_core/control/channel_state_cache.h b/src/video_core/control/channel_state_cache.h index 31d80e8b7..dbf833de7 100644 --- a/src/video_core/control/channel_state_cache.h +++ b/src/video_core/control/channel_state_cache.h @@ -1,3 +1,7 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv3 or any later version +// Refer to the license.txt file included. + #pragma once #include diff --git a/src/video_core/control/scheduler.cpp b/src/video_core/control/scheduler.cpp index e1abcb188..a9bb00aa7 100644 --- a/src/video_core/control/scheduler.cpp +++ b/src/video_core/control/scheduler.cpp @@ -1,5 +1,5 @@ // Copyright 2021 yuzu Emulator Project -// Licensed under GPLv2 or any later version +// Licensed under GPLv3 or any later version // Refer to the license.txt file included. #include diff --git a/src/video_core/control/scheduler.h b/src/video_core/control/scheduler.h index 802e9caff..c1a773946 100644 --- a/src/video_core/control/scheduler.h +++ b/src/video_core/control/scheduler.h @@ -1,5 +1,5 @@ // Copyright 2021 yuzu Emulator Project -// Licensed under GPLv2 or any later version +// Licensed under GPLv3 or any later version // Refer to the license.txt file included. #pragma once diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index fd7c936c4..938f0f11c 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -37,24 +37,32 @@ enum class SubmissionMode : u32 { // Note that, traditionally, methods are treated as 4-byte addressable locations, and hence // their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. // So the values you see in docs might be multiplied by 4. +// Register documentation: +// https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/classes/host/cla26f.h +// +// Register Description (approx): +// https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_pbdma.ref.txt enum class BufferMethods : u32 { BindObject = 0x0, + Illegal = 0x1, Nop = 0x2, SemaphoreAddressHigh = 0x4, SemaphoreAddressLow = 0x5, - SemaphoreSequence = 0x6, - SemaphoreTrigger = 0x7, - NotifyIntr = 0x8, + SemaphoreSequencePayload = 0x6, + SemaphoreOperation = 0x7, + NonStallInterrupt = 0x8, WrcacheFlush = 0x9, - Unk28 = 0xA, - UnkCacheFlush = 0xB, + MemOpA = 0xA, + MemOpB = 0xB, + MemOpC = 0xC, + MemOpD = 0xD, RefCnt = 0x14, SemaphoreAcquire = 0x1A, SemaphoreRelease = 0x1B, - FenceValue = 0x1C, - FenceAction = 0x1D, - WaitForInterrupt = 0x1E, - Unk7c = 0x1F, + SyncpointPayload = 0x1C, + SyncpointOperation = 0x1D, + WaitForIdle = 0x1E, + CRCCheck = 0x1F, Yield = 0x20, NonPullerMethods = 0x40, }; diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp index 3866c8746..8c17639e4 100644 --- a/src/video_core/engines/puller.cpp +++ b/src/video_core/engines/puller.cpp @@ -68,11 +68,6 @@ void Puller::ProcessFenceActionMethod() { } } -void Puller::ProcessWaitForInterruptMethod() { - // TODO(bunnei) ImplementMe - LOG_WARNING(HW_GPU, "(STUBBED) called"); -} - void Puller::ProcessSemaphoreTriggerMethod() { const auto semaphoreOperationMask = 0xF; const auto op = @@ -91,29 +86,33 @@ void Puller::ProcessSemaphoreTriggerMethod() { block.timestamp = gpu.GetTicks(); memory_manager.WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, sizeof(block)); } else { - const u32 word{memory_manager.Read(regs.semaphore_address.SemaphoreAddress())}; - if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) || - (op == GpuSemaphoreOperation::AcquireGequal && - static_cast(word - regs.semaphore_sequence) > 0) || - (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) { - // Nothing to do in this case - } else { + do { + const u32 word{memory_manager.Read(regs.semaphore_address.SemaphoreAddress())}; regs.acquire_source = true; regs.acquire_value = regs.semaphore_sequence; if (op == GpuSemaphoreOperation::AcquireEqual) { regs.acquire_active = true; regs.acquire_mode = false; + if (word != regs.acquire_value) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + continue; + } } else if (op == GpuSemaphoreOperation::AcquireGequal) { regs.acquire_active = true; regs.acquire_mode = true; + if (word < regs.acquire_value) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + continue; + } } else if (op == GpuSemaphoreOperation::AcquireMask) { - // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with - // semaphore_sequence, gives a non-0 result - LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented"); + if (word & regs.semaphore_sequence == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + continue; + } } else { LOG_ERROR(HW_GPU, "Invalid semaphore operation"); } - } + } while (false); } } @@ -124,6 +123,7 @@ void Puller::ProcessSemaphoreRelease() { void Puller::ProcessSemaphoreAcquire() { const u32 word = memory_manager.Read(regs.semaphore_address.SemaphoreAddress()); const auto value = regs.semaphore_acquire; + std::this_thread::sleep_for(std::chrono::milliseconds(5)); if (word != value) { regs.acquire_active = true; regs.acquire_value = value; @@ -146,32 +146,39 @@ void Puller::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::Nop: case BufferMethods::SemaphoreAddressHigh: case BufferMethods::SemaphoreAddressLow: - case BufferMethods::SemaphoreSequence: - case BufferMethods::UnkCacheFlush: + case BufferMethods::SemaphoreSequencePayload: case BufferMethods::WrcacheFlush: - case BufferMethods::FenceValue: + case BufferMethods::SyncpointPayload: break; case BufferMethods::RefCnt: rasterizer->SignalReference(); break; - case BufferMethods::FenceAction: + case BufferMethods::SyncpointOperation: ProcessFenceActionMethod(); break; - case BufferMethods::WaitForInterrupt: - ProcessWaitForInterruptMethod(); + case BufferMethods::WaitForIdle: + rasterizer->WaitForIdle(); break; - case BufferMethods::SemaphoreTrigger: { + case BufferMethods::SemaphoreOperation: { ProcessSemaphoreTriggerMethod(); break; } - case BufferMethods::NotifyIntr: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); + case BufferMethods::NonStallInterrupt: { + LOG_ERROR(HW_GPU, "Special puller engine method NonStallInterrupt not implemented"); break; } - case BufferMethods::Unk28: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); + case BufferMethods::MemOpA: { + LOG_ERROR(HW_GPU, "Memory Operation A"); + break; + } + case BufferMethods::MemOpB: { + // Implement this better. + rasterizer->SyncGuestHost(); + break; + } + case BufferMethods::MemOpC: + case BufferMethods::MemOpD: { + LOG_ERROR(HW_GPU, "Memory Operation C,D"); break; } case BufferMethods::SemaphoreAcquire: { diff --git a/src/video_core/engines/puller.h b/src/video_core/engines/puller.h index d948ec790..b4619e9a8 100644 --- a/src/video_core/engines/puller.h +++ b/src/video_core/engines/puller.h @@ -141,7 +141,6 @@ private: void ProcessSemaphoreAcquire(); void ProcessSemaphoreRelease(); void ProcessSemaphoreTriggerMethod(); - void ProcessWaitForInterruptMethod(); [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); /// Mapping of command subchannels to their bound engine ids diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index d658e038d..03a70e5e0 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -11,6 +11,8 @@ #include "common/common_types.h" #include "video_core/delayed_destruction_ring.h" #include "video_core/gpu.h" +#include "video_core/host1x/host1x.h" +#include "video_core/host1x/syncpoint_manager.h" #include "video_core/rasterizer_interface.h" namespace VideoCommon { @@ -72,6 +74,7 @@ public: } void SignalSyncPoint(u32 value) { + syncpoint_manager.IncrementGuest(value); TryReleasePendingFences(); const bool should_flush = ShouldFlush(); CommitAsyncFlushes(); @@ -96,7 +99,7 @@ public: auto payload = current_fence->GetPayload(); std::memcpy(address, &payload, sizeof(payload)); } else { - gpu.IncrementSyncPoint(current_fence->GetPayload()); + syncpoint_manager.IncrementHost(current_fence->GetPayload()); } PopFence(); } @@ -106,8 +109,8 @@ protected: explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, TTextureCache& texture_cache_, TTBufferCache& buffer_cache_, TQueryCache& query_cache_) - : rasterizer{rasterizer_}, gpu{gpu_}, texture_cache{texture_cache_}, - buffer_cache{buffer_cache_}, query_cache{query_cache_} {} + : rasterizer{rasterizer_}, gpu{gpu_}, syncpoint_manager{gpu.Host1x().GetSyncpointManager()}, + texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} {} virtual ~FenceManager() = default; @@ -125,6 +128,7 @@ protected: VideoCore::RasterizerInterface& rasterizer; Tegra::GPU& gpu; + Tegra::Host1x::SyncpointManager& syncpoint_manager; TTextureCache& texture_cache; TTBufferCache& buffer_cache; TQueryCache& query_cache; @@ -142,7 +146,7 @@ private: const auto payload = current_fence->GetPayload(); std::memcpy(address, &payload, sizeof(payload)); } else { - gpu.IncrementSyncPoint(current_fence->GetPayload()); + syncpoint_manager.IncrementHost(current_fence->GetPayload()); } PopFence(); } diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index eebd7f3ff..1097db08a 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -28,6 +28,8 @@ #include "video_core/engines/maxwell_dma.h" #include "video_core/gpu.h" #include "video_core/gpu_thread.h" +#include "video_core/host1x/host1x.h" +#include "video_core/host1x/syncpoint_manager.h" #include "video_core/memory_manager.h" #include "video_core/renderer_base.h" #include "video_core/shader_notify.h" @@ -38,7 +40,7 @@ MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); struct GPU::Impl { explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_) - : gpu{gpu_}, system{system_}, use_nvdec{use_nvdec_}, + : gpu{gpu_}, system{system_}, host1x{system.Host1x()}, use_nvdec{use_nvdec_}, shader_notify{std::make_unique()}, is_async{is_async_}, gpu_thread{system_, is_async_}, scheduler{std::make_unique(gpu)} {} @@ -115,31 +117,35 @@ struct GPU::Impl { } /// Request a host GPU memory flush from the CPU. - [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size) { - std::unique_lock lck{flush_request_mutex}; - const u64 fence = ++last_flush_fence; - flush_requests.emplace_back(fence, addr, size); + template + [[nodiscard]] u64 RequestSyncOperation(Func&& action) { + std::unique_lock lck{sync_request_mutex}; + const u64 fence = ++last_sync_fence; + sync_requests.emplace_back(action); return fence; } /// Obtains current flush request fence id. - [[nodiscard]] u64 CurrentFlushRequestFence() const { - return current_flush_fence.load(std::memory_order_relaxed); + [[nodiscard]] u64 CurrentSyncRequestFence() const { + return current_sync_fence.load(std::memory_order_relaxed); + } + + void WaitForSyncOperation(const u64 fence) { + std::unique_lock lck{sync_request_mutex}; + sync_request_cv.wait(lck, [this, fence] { return CurrentSyncRequestFence() >= fence; }); } /// Tick pending requests within the GPU. void TickWork() { - std::unique_lock lck{flush_request_mutex}; - while (!flush_requests.empty()) { - auto& request = flush_requests.front(); - const u64 fence = request.fence; - const VAddr addr = request.addr; - const std::size_t size = request.size; - flush_requests.pop_front(); - flush_request_mutex.unlock(); - rasterizer->FlushRegion(addr, size); - current_flush_fence.store(fence); - flush_request_mutex.lock(); + std::unique_lock lck{sync_request_mutex}; + while (!sync_requests.empty()) { + auto request = std::move(sync_requests.front()); + sync_requests.pop_front(); + sync_request_mutex.unlock(); + request(); + current_sync_fence.fetch_add(1, std::memory_order_release); + sync_request_mutex.lock(); + sync_request_cv.notify_all(); } } @@ -207,78 +213,26 @@ struct GPU::Impl { /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame. void WaitFence(u32 syncpoint_id, u32 value) { - // Synced GPU, is always in sync - if (!is_async) { - return; - } if (syncpoint_id == UINT32_MAX) { - // TODO: Research what this does. - LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); return; } MICROPROFILE_SCOPE(GPU_wait); - std::unique_lock lock{sync_mutex}; - sync_cv.wait(lock, [=, this] { - if (shutting_down.load(std::memory_order_relaxed)) { - // We're shutting down, ensure no threads continue to wait for the next syncpoint - return true; - } - return syncpoints.at(syncpoint_id).load() >= value; - }); + host1x.GetSyncpointManager().WaitHost(syncpoint_id, value); } void IncrementSyncPoint(u32 syncpoint_id) { - auto& syncpoint = syncpoints.at(syncpoint_id); - syncpoint++; - std::scoped_lock lock{sync_mutex}; - sync_cv.notify_all(); - auto& interrupt = syncpt_interrupts.at(syncpoint_id); - if (!interrupt.empty()) { - u32 value = syncpoint.load(); - auto it = interrupt.begin(); - while (it != interrupt.end()) { - if (value >= *it) { - TriggerCpuInterrupt(syncpoint_id, *it); - it = interrupt.erase(it); - continue; - } - it++; - } - } + host1x.GetSyncpointManager().IncrementHost(syncpoint_id); } [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const { - return syncpoints.at(syncpoint_id).load(); + return host1x.GetSyncpointManager().GetHostSyncpointValue(syncpoint_id); } void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) { - std::scoped_lock lock{sync_mutex}; - u32 current_value = syncpoints.at(syncpoint_id).load(); - if ((static_cast(current_value) - static_cast(value)) >= 0) { + auto& syncpoint_manager = host1x.GetSyncpointManager(); + syncpoint_manager.RegisterHostAction(syncpoint_id, value, [this, syncpoint_id, value]() { TriggerCpuInterrupt(syncpoint_id, value); - return; - } - auto& interrupt = syncpt_interrupts.at(syncpoint_id); - bool contains = std::any_of(interrupt.begin(), interrupt.end(), - [value](u32 in_value) { return in_value == value; }); - if (contains) { - return; - } - interrupt.emplace_back(value); - } - - [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value) { - std::scoped_lock lock{sync_mutex}; - auto& interrupt = syncpt_interrupts.at(syncpoint_id); - const auto iter = - std::find_if(interrupt.begin(), interrupt.end(), - [value](u32 interrupt_value) { return value == interrupt_value; }); - - if (iter == interrupt.end()) { - return false; - } - interrupt.erase(iter); - return true; + }); } [[nodiscard]] u64 GetTicks() const { @@ -387,8 +341,48 @@ struct GPU::Impl { interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); } + void RequestSwapBuffers(const Tegra::FramebufferConfig* framebuffer, + Service::Nvidia::NvFence* fences, size_t num_fences) { + size_t current_request_counter{}; + { + std::unique_lock lk(request_swap_mutex); + if (free_swap_counters.empty()) { + current_request_counter = request_swap_counters.size(); + request_swap_counters.emplace_back(num_fences); + } else { + current_request_counter = free_swap_counters.front(); + request_swap_counters[current_request_counter] = num_fences; + free_swap_counters.pop_front(); + } + } + const auto wait_fence = + RequestSyncOperation([this, current_request_counter, framebuffer, fences, num_fences] { + auto& syncpoint_manager = host1x.GetSyncpointManager(); + if (num_fences == 0) { + renderer->SwapBuffers(framebuffer); + } + const auto executer = [this, current_request_counter, + framebuffer_copy = *framebuffer]() { + { + std::unique_lock lk(request_swap_mutex); + if (--request_swap_counters[current_request_counter] != 0) { + return; + } + free_swap_counters.push_back(current_request_counter); + } + renderer->SwapBuffers(&framebuffer_copy); + }; + for (size_t i = 0; i < num_fences; i++) { + syncpoint_manager.RegisterGuestAction(fences[i].id, fences[i].value, executer); + } + }); + gpu_thread.TickGPU(); + WaitForSyncOperation(wait_fence); + } + GPU& gpu; Core::System& system; + Host1x::Host1x& host1x; std::map> cdma_pushers; std::unique_ptr renderer; @@ -411,18 +405,11 @@ struct GPU::Impl { std::condition_variable sync_cv; - struct FlushRequest { - explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_) - : fence{fence_}, addr{addr_}, size{size_} {} - u64 fence; - VAddr addr; - std::size_t size; - }; - - std::list flush_requests; - std::atomic current_flush_fence{}; - u64 last_flush_fence{}; - std::mutex flush_request_mutex; + std::list> sync_requests; + std::atomic current_sync_fence{}; + u64 last_sync_fence{}; + std::mutex sync_request_mutex; + std::condition_variable sync_request_cv; const bool is_async; @@ -433,6 +420,10 @@ struct GPU::Impl { std::unordered_map> channels; Tegra::Control::ChannelState* current_channel; s32 bound_channel{-1}; + + std::deque free_swap_counters; + std::deque request_swap_counters; + std::mutex request_swap_mutex; }; GPU::GPU(Core::System& system, bool is_async, bool use_nvdec) @@ -477,17 +468,32 @@ void GPU::OnCommandListEnd() { } u64 GPU::RequestFlush(VAddr addr, std::size_t size) { - return impl->RequestFlush(addr, size); + return impl->RequestSyncOperation( + [this, addr, size]() { impl->rasterizer->FlushRegion(addr, size); }); +} + +u64 GPU::CurrentSyncRequestFence() const { + return impl->CurrentSyncRequestFence(); } -u64 GPU::CurrentFlushRequestFence() const { - return impl->CurrentFlushRequestFence(); +void GPU::WaitForSyncOperation(u64 fence) { + return impl->WaitForSyncOperation(fence); } void GPU::TickWork() { impl->TickWork(); } +/// Gets a mutable reference to the Host1x interface +Host1x::Host1x& GPU::Host1x() { + return impl->host1x; +} + +/// Gets an immutable reference to the Host1x interface. +const Host1x::Host1x& GPU::Host1x() const { + return impl->host1x; +} + Engines::Maxwell3D& GPU::Maxwell3D() { return impl->Maxwell3D(); } @@ -536,6 +542,11 @@ const VideoCore::ShaderNotify& GPU::ShaderNotify() const { return impl->ShaderNotify(); } +void GPU::RequestSwapBuffers(const Tegra::FramebufferConfig* framebuffer, + Service::Nvidia::NvFence* fences, size_t num_fences) { + impl->RequestSwapBuffers(framebuffer, fences, num_fences); +} + void GPU::WaitFence(u32 syncpoint_id, u32 value) { impl->WaitFence(syncpoint_id, value); } @@ -552,10 +563,6 @@ void GPU::RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) { impl->RegisterSyncptInterrupt(syncpoint_id, value); } -bool GPU::CancelSyncptInterrupt(u32 syncpoint_id, u32 value) { - return impl->CancelSyncptInterrupt(syncpoint_id, value); -} - u64 GPU::GetTicks() const { return impl->GetTicks(); } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 7e84b0d2f..c1a538257 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -93,6 +93,10 @@ namespace Control { struct ChannelState; } +namespace Host1x { +class Host1x; +} // namespace Host1x + class MemoryManager; class GPU final { @@ -124,11 +128,19 @@ public: [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size); /// Obtains current flush request fence id. - [[nodiscard]] u64 CurrentFlushRequestFence() const; + [[nodiscard]] u64 CurrentSyncRequestFence() const; + + void WaitForSyncOperation(u64 fence); /// Tick pending requests within the GPU. void TickWork(); + /// Gets a mutable reference to the Host1x interface + [[nodiscard]] Host1x::Host1x& Host1x(); + + /// Gets an immutable reference to the Host1x interface. + [[nodiscard]] const Host1x::Host1x& Host1x() const; + /// Returns a reference to the Maxwell3D GPU engine. [[nodiscard]] Engines::Maxwell3D& Maxwell3D(); @@ -174,8 +186,6 @@ public: void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value); - bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); - [[nodiscard]] u64 GetTicks() const; [[nodiscard]] bool IsAsync() const; @@ -184,6 +194,9 @@ public: void RendererFrameEndNotify(); + void RequestSwapBuffers(const Tegra::FramebufferConfig* framebuffer, + Service::Nvidia::NvFence* fences, size_t num_fences); + /// Performs any additional setup necessary in order to begin GPU emulation. /// This can be used to launch any necessary threads and register any necessary /// core timing events. diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 9844cde43..2c03545bf 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -93,8 +93,12 @@ void ThreadManager::FlushRegion(VAddr addr, u64 size) { } auto& gpu = system.GPU(); u64 fence = gpu.RequestFlush(addr, size); + TickGPU(); + gpu.WaitForSyncOperation(fence); +} + +void ThreadManager::TickGPU() { PushCommand(GPUTickCommand(), true); - ASSERT(fence <= gpu.CurrentFlushRequestFence()); } void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index c5078a2b3..64628d3e3 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -135,6 +135,8 @@ public: void OnCommandListEnd(); + void TickGPU(); + private: /// Pushes a command to be executed by the GPU thread u64 PushCommand(CommandData&& command_data, bool block = false); diff --git a/src/video_core/host1x/codecs/codec.cpp b/src/video_core/host1x/codecs/codec.cpp new file mode 100644 index 000000000..70c47ae03 --- /dev/null +++ b/src/video_core/host1x/codecs/codec.cpp @@ -0,0 +1,310 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include "common/assert.h" +#include "common/settings.h" +#include "video_core/gpu.h" +#include "video_core/host1x/codecs/codec.h" +#include "video_core/host1x/codecs/h264.h" +#include "video_core/host1x/codecs/vp8.h" +#include "video_core/host1x/codecs/vp9.h" +#include "video_core/memory_manager.h" + +extern "C" { +#include +#ifdef LIBVA_FOUND +// for querying VAAPI driver information +#include +#endif +} + +namespace Tegra { +namespace { +constexpr AVPixelFormat PREFERRED_GPU_FMT = AV_PIX_FMT_NV12; +constexpr AVPixelFormat PREFERRED_CPU_FMT = AV_PIX_FMT_YUV420P; +constexpr std::array PREFERRED_GPU_DECODERS = { + AV_HWDEVICE_TYPE_CUDA, +#ifdef _WIN32 + AV_HWDEVICE_TYPE_D3D11VA, + AV_HWDEVICE_TYPE_DXVA2, +#elif defined(__unix__) + AV_HWDEVICE_TYPE_VAAPI, + AV_HWDEVICE_TYPE_VDPAU, +#endif + // last resort for Linux Flatpak (w/ NVIDIA) + AV_HWDEVICE_TYPE_VULKAN, +}; + +void AVPacketDeleter(AVPacket* ptr) { + av_packet_free(&ptr); +} + +using AVPacketPtr = std::unique_ptr; + +AVPixelFormat GetGpuFormat(AVCodecContext* av_codec_ctx, const AVPixelFormat* pix_fmts) { + for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) { + if (*p == av_codec_ctx->pix_fmt) { + return av_codec_ctx->pix_fmt; + } + } + LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU"); + av_buffer_unref(&av_codec_ctx->hw_device_ctx); + av_codec_ctx->pix_fmt = PREFERRED_CPU_FMT; + return PREFERRED_CPU_FMT; +} + +// List all the currently available hwcontext in ffmpeg +std::vector ListSupportedContexts() { + std::vector contexts{}; + AVHWDeviceType current_device_type = AV_HWDEVICE_TYPE_NONE; + do { + current_device_type = av_hwdevice_iterate_types(current_device_type); + contexts.push_back(current_device_type); + } while (current_device_type != AV_HWDEVICE_TYPE_NONE); + return contexts; +} + +} // namespace + +void AVFrameDeleter(AVFrame* ptr) { + av_frame_free(&ptr); +} + +Codec::Codec(GPU& gpu_, const Host1x::NvdecCommon::NvdecRegisters& regs) + : gpu(gpu_), state{regs}, h264_decoder(std::make_unique(gpu)), + vp8_decoder(std::make_unique(gpu)), + vp9_decoder(std::make_unique(gpu)) {} + +Codec::~Codec() { + if (!initialized) { + return; + } + // Free libav memory + avcodec_free_context(&av_codec_ctx); + av_buffer_unref(&av_gpu_decoder); +} + +bool Codec::CreateGpuAvDevice() { + static constexpr auto HW_CONFIG_METHOD = AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX; + static const auto supported_contexts = ListSupportedContexts(); + for (const auto& type : PREFERRED_GPU_DECODERS) { + if (std::none_of(supported_contexts.begin(), supported_contexts.end(), + [&type](const auto& context) { return context == type; })) { + LOG_DEBUG(Service_NVDRV, "{} explicitly unsupported", av_hwdevice_get_type_name(type)); + continue; + } + // Avoid memory leak from not cleaning up after av_hwdevice_ctx_create + av_buffer_unref(&av_gpu_decoder); + const int hwdevice_res = av_hwdevice_ctx_create(&av_gpu_decoder, type, nullptr, nullptr, 0); + if (hwdevice_res < 0) { + LOG_DEBUG(Service_NVDRV, "{} av_hwdevice_ctx_create failed {}", + av_hwdevice_get_type_name(type), hwdevice_res); + continue; + } +#ifdef LIBVA_FOUND + if (type == AV_HWDEVICE_TYPE_VAAPI) { + // we need to determine if this is an impersonated VAAPI driver + AVHWDeviceContext* hwctx = + static_cast(static_cast(av_gpu_decoder->data)); + AVVAAPIDeviceContext* vactx = static_cast(hwctx->hwctx); + const char* vendor_name = vaQueryVendorString(vactx->display); + if (strstr(vendor_name, "VDPAU backend")) { + // VDPAU impersonated VAAPI impl's are super buggy, we need to skip them + LOG_DEBUG(Service_NVDRV, "Skipping vdapu impersonated VAAPI driver"); + continue; + } else { + // according to some user testing, certain vaapi driver (Intel?) could be buggy + // so let's log the driver name which may help the developers/supporters + LOG_DEBUG(Service_NVDRV, "Using VAAPI driver: {}", vendor_name); + } + } +#endif + for (int i = 0;; i++) { + const AVCodecHWConfig* config = avcodec_get_hw_config(av_codec, i); + if (!config) { + LOG_DEBUG(Service_NVDRV, "{} decoder does not support device type {}.", + av_codec->name, av_hwdevice_get_type_name(type)); + break; + } + if ((config->methods & HW_CONFIG_METHOD) != 0 && config->device_type == type) { +#if defined(__unix__) + // Some linux decoding backends are reported to crash with this config method + // TODO(ameerj): Properly support this method + if ((config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX) != 0) { + // skip zero-copy decoders, we don't currently support them + LOG_DEBUG(Service_NVDRV, "Skipping decoder {} with unsupported capability {}.", + av_hwdevice_get_type_name(type), config->methods); + continue; + } +#endif + LOG_INFO(Service_NVDRV, "Using {} GPU decoder", av_hwdevice_get_type_name(type)); + av_codec_ctx->pix_fmt = config->pix_fmt; + return true; + } + } + } + return false; +} + +void Codec::InitializeAvCodecContext() { + av_codec_ctx = avcodec_alloc_context3(av_codec); + av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); +} + +void Codec::InitializeGpuDecoder() { + if (!CreateGpuAvDevice()) { + av_buffer_unref(&av_gpu_decoder); + return; + } + auto* hw_device_ctx = av_buffer_ref(av_gpu_decoder); + ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed"); + av_codec_ctx->hw_device_ctx = hw_device_ctx; + av_codec_ctx->get_format = GetGpuFormat; +} + +void Codec::Initialize() { + const AVCodecID codec = [&] { + switch (current_codec) { + case Host1x::NvdecCommon::VideoCodec::H264: + return AV_CODEC_ID_H264; + case Host1x::NvdecCommon::VideoCodec::VP8: + return AV_CODEC_ID_VP8; + case Host1x::NvdecCommon::VideoCodec::VP9: + return AV_CODEC_ID_VP9; + default: + UNIMPLEMENTED_MSG("Unknown codec {}", current_codec); + return AV_CODEC_ID_NONE; + } + }(); + av_codec = avcodec_find_decoder(codec); + + InitializeAvCodecContext(); + if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::GPU) { + InitializeGpuDecoder(); + } + if (const int res = avcodec_open2(av_codec_ctx, av_codec, nullptr); res < 0) { + LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed with result {}", res); + avcodec_free_context(&av_codec_ctx); + av_buffer_unref(&av_gpu_decoder); + return; + } + if (!av_codec_ctx->hw_device_ctx) { + LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding"); + } + initialized = true; +} + +void Codec::SetTargetCodec(Host1x::NvdecCommon::VideoCodec codec) { + if (current_codec != codec) { + current_codec = codec; + LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", GetCurrentCodecName()); + } +} + +void Codec::Decode() { + const bool is_first_frame = !initialized; + if (is_first_frame) { + Initialize(); + } + if (!initialized) { + return; + } + bool vp9_hidden_frame = false; + const auto& frame_data = [&]() { + switch (current_codec) { + case Tegra::Host1x::NvdecCommon::VideoCodec::H264: + return h264_decoder->ComposeFrame(state, is_first_frame); + case Tegra::Host1x::NvdecCommon::VideoCodec::VP8: + return vp8_decoder->ComposeFrame(state); + case Tegra::Host1x::NvdecCommon::VideoCodec::VP9: + vp9_decoder->ComposeFrame(state); + vp9_hidden_frame = vp9_decoder->WasFrameHidden(); + return vp9_decoder->GetFrameBytes(); + default: + ASSERT(false); + return std::vector{}; + } + }(); + AVPacketPtr packet{av_packet_alloc(), AVPacketDeleter}; + if (!packet) { + LOG_ERROR(Service_NVDRV, "av_packet_alloc failed"); + return; + } + packet->data = const_cast(frame_data.data()); + packet->size = static_cast(frame_data.size()); + if (const int res = avcodec_send_packet(av_codec_ctx, packet.get()); res != 0) { + LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", res); + return; + } + // Only receive/store visible frames + if (vp9_hidden_frame) { + return; + } + AVFramePtr initial_frame{av_frame_alloc(), AVFrameDeleter}; + AVFramePtr final_frame{nullptr, AVFrameDeleter}; + ASSERT_MSG(initial_frame, "av_frame_alloc initial_frame failed"); + if (const int ret = avcodec_receive_frame(av_codec_ctx, initial_frame.get()); ret) { + LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret); + return; + } + if (initial_frame->width == 0 || initial_frame->height == 0) { + LOG_WARNING(Service_NVDRV, "Zero width or height in frame"); + return; + } + if (av_codec_ctx->hw_device_ctx) { + final_frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter}; + ASSERT_MSG(final_frame, "av_frame_alloc final_frame failed"); + // Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp + // because Intel drivers crash unless using AV_PIX_FMT_NV12 + final_frame->format = PREFERRED_GPU_FMT; + const int ret = av_hwframe_transfer_data(final_frame.get(), initial_frame.get(), 0); + ASSERT_MSG(!ret, "av_hwframe_transfer_data error {}", ret); + } else { + final_frame = std::move(initial_frame); + } + if (final_frame->format != PREFERRED_CPU_FMT && final_frame->format != PREFERRED_GPU_FMT) { + UNIMPLEMENTED_MSG("Unexpected video format: {}", final_frame->format); + return; + } + av_frames.push(std::move(final_frame)); + if (av_frames.size() > 10) { + LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame"); + av_frames.pop(); + } +} + +AVFramePtr Codec::GetCurrentFrame() { + // Sometimes VIC will request more frames than have been decoded. + // in this case, return a nullptr and don't overwrite previous frame data + if (av_frames.empty()) { + return AVFramePtr{nullptr, AVFrameDeleter}; + } + AVFramePtr frame = std::move(av_frames.front()); + av_frames.pop(); + return frame; +} + +Host1x::NvdecCommon::VideoCodec Codec::GetCurrentCodec() const { + return current_codec; +} + +std::string_view Codec::GetCurrentCodecName() const { + switch (current_codec) { + case Host1x::NvdecCommon::VideoCodec::None: + return "None"; + case Host1x::NvdecCommon::VideoCodec::H264: + return "H264"; + case Host1x::NvdecCommon::VideoCodec::VP8: + return "VP8"; + case Host1x::NvdecCommon::VideoCodec::H265: + return "H265"; + case Host1x::NvdecCommon::VideoCodec::VP9: + return "VP9"; + default: + return "Unknown"; + } +} +} // namespace Tegra diff --git a/src/video_core/host1x/codecs/codec.h b/src/video_core/host1x/codecs/codec.h new file mode 100644 index 000000000..117cb3ccd --- /dev/null +++ b/src/video_core/host1x/codecs/codec.h @@ -0,0 +1,81 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include "common/common_types.h" +#include "video_core/host1x/nvdec_common.h" + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#include +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +namespace Tegra { +class GPU; + +void AVFrameDeleter(AVFrame* ptr); +using AVFramePtr = std::unique_ptr; + +namespace Decoder { +class H264; +class VP8; +class VP9; +} // namespace Decoder + +class Codec { +public: + explicit Codec(GPU& gpu, const Host1x::NvdecCommon::NvdecRegisters& regs); + ~Codec(); + + /// Initialize the codec, returning success or failure + void Initialize(); + + /// Sets NVDEC video stream codec + void SetTargetCodec(Host1x::NvdecCommon::VideoCodec codec); + + /// Call decoders to construct headers, decode AVFrame with ffmpeg + void Decode(); + + /// Returns next decoded frame + [[nodiscard]] AVFramePtr GetCurrentFrame(); + + /// Returns the value of current_codec + [[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const; + + /// Return name of the current codec + [[nodiscard]] std::string_view GetCurrentCodecName() const; + +private: + void InitializeAvCodecContext(); + + void InitializeGpuDecoder(); + + bool CreateGpuAvDevice(); + + bool initialized{}; + Host1x::NvdecCommon::VideoCodec current_codec{Host1x::NvdecCommon::VideoCodec::None}; + + const AVCodec* av_codec{nullptr}; + AVCodecContext* av_codec_ctx{nullptr}; + AVBufferRef* av_gpu_decoder{nullptr}; + + GPU& gpu; + const Host1x::NvdecCommon::NvdecRegisters& state; + std::unique_ptr h264_decoder; + std::unique_ptr vp8_decoder; + std::unique_ptr vp9_decoder; + + std::queue av_frames{}; +}; + +} // namespace Tegra diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp new file mode 100644 index 000000000..95534bc85 --- /dev/null +++ b/src/video_core/host1x/codecs/h264.cpp @@ -0,0 +1,277 @@ +// SPDX-FileCopyrightText: Ryujinx Team and Contributors +// SPDX-License-Identifier: MIT + +#include +#include + +#include "common/settings.h" +#include "video_core/gpu.h" +#include "video_core/host1x/codecs/h264.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// ZigZag LUTs from libavcodec. +constexpr std::array zig_zag_direct{ + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, + 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, +}; + +constexpr std::array zig_zag_scan{ + 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, + 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, +}; +} // Anonymous namespace + +H264::H264(GPU& gpu_) : gpu(gpu_) {} + +H264::~H264() = default; + +const std::vector& H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, + bool is_first_frame) { + H264DecoderContext context; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); + + const s64 frame_number = context.h264_parameter_set.frame_number.Value(); + if (!is_first_frame && frame_number != 0) { + frame.resize(context.stream_len); + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); + return frame; + } + + // Encode header + H264BitWriter writer{}; + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(7, 5); + writer.WriteU(100, 8); + writer.WriteU(0, 8); + writer.WriteU(31, 8); + writer.WriteUe(0); + const u32 chroma_format_idc = + static_cast(context.h264_parameter_set.chroma_format_idc.Value()); + writer.WriteUe(chroma_format_idc); + if (chroma_format_idc == 3) { + writer.WriteBit(false); + } + + writer.WriteUe(0); + writer.WriteUe(0); + writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag + writer.WriteBit(false); // Scaling matrix present flag + + writer.WriteUe(static_cast(context.h264_parameter_set.log2_max_frame_num_minus4.Value())); + + const auto order_cnt_type = + static_cast(context.h264_parameter_set.pic_order_cnt_type.Value()); + writer.WriteUe(order_cnt_type); + if (order_cnt_type == 0) { + writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); + } else if (order_cnt_type == 1) { + writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + + writer.WriteSe(0); + writer.WriteSe(0); + writer.WriteUe(0); + } + + const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units / + (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); + + // TODO (ameerj): Where do we get this number, it seems to be particular for each stream + const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue(); + const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::GPU; + const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u; + writer.WriteUe(max_num_ref_frames); + writer.WriteBit(false); + writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(pic_height - 1); + writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + + if (!context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(context.h264_parameter_set.flags.mbaff_frame.Value() != 0); + } + + writer.WriteBit(context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); + writer.WriteBit(false); // Frame cropping flag + writer.WriteBit(false); // VUI parameter present flag + + writer.End(); + + // H264 PPS + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(8, 5); + + writer.WriteUe(0); + writer.WriteUe(0); + + writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(false); + writer.WriteUe(0); + writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); + writer.WriteBit(context.h264_parameter_set.flags.weighted_pred.Value() != 0); + writer.WriteU(static_cast(context.h264_parameter_set.weighted_bipred_idc.Value()), 2); + s32 pic_init_qp = static_cast(context.h264_parameter_set.pic_init_qp_minus26.Value()); + writer.WriteSe(pic_init_qp); + writer.WriteSe(0); + s32 chroma_qp_index_offset = + static_cast(context.h264_parameter_set.chroma_qp_index_offset.Value()); + + writer.WriteSe(chroma_qp_index_offset); + writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_present_flag != 0); + writer.WriteBit(context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); + writer.WriteBit(context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); + writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + + writer.WriteBit(true); + + for (s32 index = 0; index < 6; index++) { + writer.WriteBit(true); + std::span matrix{context.weight_scale}; + writer.WriteScalingList(matrix, index * 16, 16); + } + + if (context.h264_parameter_set.transform_8x8_mode_flag) { + for (s32 index = 0; index < 2; index++) { + writer.WriteBit(true); + std::span matrix{context.weight_scale_8x8}; + writer.WriteScalingList(matrix, index * 64, 64); + } + } + + s32 chroma_qp_index_offset2 = + static_cast(context.h264_parameter_set.second_chroma_qp_index_offset.Value()); + + writer.WriteSe(chroma_qp_index_offset2); + + writer.End(); + + const auto& encoded_header = writer.GetByteArray(); + frame.resize(encoded_header.size() + context.stream_len); + std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, + frame.data() + encoded_header.size(), context.stream_len); + + return frame; +} + +H264BitWriter::H264BitWriter() = default; + +H264BitWriter::~H264BitWriter() = default; + +void H264BitWriter::WriteU(s32 value, s32 value_sz) { + WriteBits(value, value_sz); +} + +void H264BitWriter::WriteSe(s32 value) { + WriteExpGolombCodedInt(value); +} + +void H264BitWriter::WriteUe(u32 value) { + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::End() { + WriteBit(true); + Flush(); +} + +void H264BitWriter::WriteBit(bool state) { + WriteBits(state ? 1 : 0, 1); +} + +void H264BitWriter::WriteScalingList(std::span list, s32 start, s32 count) { + std::vector scan(count); + if (count == 16) { + std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); + } else { + std::memcpy(scan.data(), zig_zag_direct.data(), scan.size()); + } + u8 last_scale = 8; + + for (s32 index = 0; index < count; index++) { + const u8 value = list[start + scan[index]]; + const s32 delta_scale = static_cast(value - last_scale); + + WriteSe(delta_scale); + + last_scale = value; + } +} + +std::vector& H264BitWriter::GetByteArray() { + return byte_array; +} + +const std::vector& H264BitWriter::GetByteArray() const { + return byte_array; +} + +void H264BitWriter::WriteBits(s32 value, s32 bit_count) { + s32 value_pos = 0; + + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free_bits = GetFreeBufferBits(); + + if (copy_size > free_bits) { + copy_size = free_bits; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void H264BitWriter::WriteExpGolombCodedInt(s32 value) { + const s32 sign = value <= 0 ? 0 : 1; + if (value < 0) { + value = -value; + } + value = (value << 1) - sign; + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::WriteExpGolombCodedUInt(u32 value) { + const s32 size = 32 - std::countl_zero(value + 1); + WriteBits(1, size); + + value -= (1U << (size - 1)) - 1; + WriteBits(static_cast(value), size - 1); +} + +s32 H264BitWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void H264BitWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast(buffer)); + + buffer = 0; + buffer_pos = 0; +} +} // namespace Tegra::Decoder diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h new file mode 100644 index 000000000..a98730474 --- /dev/null +++ b/src/video_core/host1x/codecs/h264.h @@ -0,0 +1,173 @@ +// SPDX-FileCopyrightText: Ryujinx Team and Contributors +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include "common/bit_field.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/host1x/nvdec_common.h" + +namespace Tegra { +class GPU; +namespace Decoder { + +class H264BitWriter { +public: + H264BitWriter(); + ~H264BitWriter(); + + /// The following Write methods are based on clause 9.1 in the H.264 specification. + /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax + void WriteU(s32 value, s32 value_sz); + void WriteSe(s32 value); + void WriteUe(u32 value); + + /// Finalize the bitstream + void End(); + + /// append a bit to the stream, equivalent value to the state parameter + void WriteBit(bool state); + + /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification + /// Writes the scaling matrices of the sream + void WriteScalingList(std::span list, s32 start, s32 count); + + /// Return the bitstream as a vector. + [[nodiscard]] std::vector& GetByteArray(); + [[nodiscard]] const std::vector& GetByteArray() const; + +private: + void WriteBits(s32 value, s32 bit_count); + void WriteExpGolombCodedInt(s32 value); + void WriteExpGolombCodedUInt(u32 value); + [[nodiscard]] s32 GetFreeBufferBits(); + void Flush(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector byte_array; +}; + +class H264 { +public: + explicit H264(GPU& gpu); + ~H264(); + + /// Compose the H264 frame for FFmpeg decoding + [[nodiscard]] const std::vector& ComposeFrame( + const Host1x::NvdecCommon::NvdecRegisters& state, bool is_first_frame = false); + +private: + std::vector frame; + GPU& gpu; + + struct H264ParameterSet { + s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 + s32 delta_pic_order_always_zero_flag; ///< 0x04 + s32 frame_mbs_only_flag; ///< 0x08 + u32 pic_width_in_mbs; ///< 0x0C + u32 frame_height_in_map_units; ///< 0x10 + union { ///< 0x14 + BitField<0, 2, u32> tile_format; + BitField<2, 3, u32> gob_height; + }; + u32 entropy_coding_mode_flag; ///< 0x18 + s32 pic_order_present_flag; ///< 0x1C + s32 num_refidx_l0_default_active; ///< 0x20 + s32 num_refidx_l1_default_active; ///< 0x24 + s32 deblocking_filter_control_present_flag; ///< 0x28 + s32 redundant_pic_cnt_present_flag; ///< 0x2C + u32 transform_8x8_mode_flag; ///< 0x30 + u32 pitch_luma; ///< 0x34 + u32 pitch_chroma; ///< 0x38 + u32 luma_top_offset; ///< 0x3C + u32 luma_bot_offset; ///< 0x40 + u32 luma_frame_offset; ///< 0x44 + u32 chroma_top_offset; ///< 0x48 + u32 chroma_bot_offset; ///< 0x4C + u32 chroma_frame_offset; ///< 0x50 + u32 hist_buffer_size; ///< 0x54 + union { ///< 0x58 + union { + BitField<0, 1, u64> mbaff_frame; + BitField<1, 1, u64> direct_8x8_inference; + BitField<2, 1, u64> weighted_pred; + BitField<3, 1, u64> constrained_intra_pred; + BitField<4, 1, u64> ref_pic; + BitField<5, 1, u64> field_pic; + BitField<6, 1, u64> bottom_field; + BitField<7, 1, u64> second_field; + } flags; + BitField<8, 4, u64> log2_max_frame_num_minus4; + BitField<12, 2, u64> chroma_format_idc; + BitField<14, 2, u64> pic_order_cnt_type; + BitField<16, 6, s64> pic_init_qp_minus26; + BitField<22, 5, s64> chroma_qp_index_offset; + BitField<27, 5, s64> second_chroma_qp_index_offset; + BitField<32, 2, u64> weighted_bipred_idc; + BitField<34, 7, u64> curr_pic_idx; + BitField<41, 5, u64> curr_col_idx; + BitField<46, 16, u64> frame_number; + BitField<62, 1, u64> frame_surfaces; + BitField<63, 1, u64> output_memory_layout; + }; + }; + static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); + + struct H264DecoderContext { + INSERT_PADDING_WORDS_NOINIT(18); ///< 0x0000 + u32 stream_len; ///< 0x0048 + INSERT_PADDING_WORDS_NOINIT(3); ///< 0x004C + H264ParameterSet h264_parameter_set; ///< 0x0058 + INSERT_PADDING_WORDS_NOINIT(66); ///< 0x00B8 + std::array weight_scale; ///< 0x01C0 + std::array weight_scale_8x8; ///< 0x0220 + }; + static_assert(sizeof(H264DecoderContext) == 0x2A0, "H264DecoderContext is an invalid size"); + +#define ASSERT_POSITION(field_name, position) \ + static_assert(offsetof(H264ParameterSet, field_name) == position, \ + "Field " #field_name " has invalid position") + + ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); + ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); + ASSERT_POSITION(frame_mbs_only_flag, 0x08); + ASSERT_POSITION(pic_width_in_mbs, 0x0C); + ASSERT_POSITION(frame_height_in_map_units, 0x10); + ASSERT_POSITION(tile_format, 0x14); + ASSERT_POSITION(entropy_coding_mode_flag, 0x18); + ASSERT_POSITION(pic_order_present_flag, 0x1C); + ASSERT_POSITION(num_refidx_l0_default_active, 0x20); + ASSERT_POSITION(num_refidx_l1_default_active, 0x24); + ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); + ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); + ASSERT_POSITION(transform_8x8_mode_flag, 0x30); + ASSERT_POSITION(pitch_luma, 0x34); + ASSERT_POSITION(pitch_chroma, 0x38); + ASSERT_POSITION(luma_top_offset, 0x3C); + ASSERT_POSITION(luma_bot_offset, 0x40); + ASSERT_POSITION(luma_frame_offset, 0x44); + ASSERT_POSITION(chroma_top_offset, 0x48); + ASSERT_POSITION(chroma_bot_offset, 0x4C); + ASSERT_POSITION(chroma_frame_offset, 0x50); + ASSERT_POSITION(hist_buffer_size, 0x54); + ASSERT_POSITION(flags, 0x58); +#undef ASSERT_POSITION + +#define ASSERT_POSITION(field_name, position) \ + static_assert(offsetof(H264DecoderContext, field_name) == position, \ + "Field " #field_name " has invalid position") + + ASSERT_POSITION(stream_len, 0x48); + ASSERT_POSITION(h264_parameter_set, 0x58); + ASSERT_POSITION(weight_scale, 0x1C0); +#undef ASSERT_POSITION +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp new file mode 100644 index 000000000..aac026e17 --- /dev/null +++ b/src/video_core/host1x/codecs/vp8.cpp @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +#include "video_core/gpu.h" +#include "video_core/host1x/codecs/vp8.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +VP8::VP8(GPU& gpu_) : gpu(gpu_) {} + +VP8::~VP8() = default; + +const std::vector& VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { + VP8PictureInfo info; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo)); + + const bool is_key_frame = info.key_frame == 1u; + const auto bitstream_size = static_cast(info.vld_buffer_size); + const size_t header_size = is_key_frame ? 10u : 3u; + frame.resize(header_size + bitstream_size); + + // Based on page 30 of the VP8 specification. + // https://datatracker.ietf.org/doc/rfc6386/ + frame[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). + frame[0] |= static_cast((info.version & 7u) << 1u); // 3-bit version number + frame[0] |= static_cast(1u << 4u); // 1-bit show_frame flag + + // The next 19-bits are the first partition size + frame[0] |= static_cast((info.first_part_size & 7u) << 5u); + frame[1] = static_cast((info.first_part_size & 0x7f8u) >> 3u); + frame[2] = static_cast((info.first_part_size & 0x7f800u) >> 11u); + + if (is_key_frame) { + frame[3] = 0x9du; + frame[4] = 0x01u; + frame[5] = 0x2au; + // TODO(ameerj): Horizontal/Vertical Scale + // 16 bits: (2 bits Horizontal Scale << 14) | Width (14 bits) + frame[6] = static_cast(info.frame_width & 0xff); + frame[7] = static_cast(((info.frame_width >> 8) & 0x3f)); + // 16 bits:(2 bits Vertical Scale << 14) | Height (14 bits) + frame[8] = static_cast(info.frame_height & 0xff); + frame[9] = static_cast(((info.frame_height >> 8) & 0x3f)); + } + const u64 bitstream_offset = state.frame_bitstream_offset; + gpu.MemoryManager().ReadBlock(bitstream_offset, frame.data() + header_size, bitstream_size); + + return frame; +} + +} // namespace Tegra::Decoder diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h new file mode 100644 index 000000000..a1dfa5f03 --- /dev/null +++ b/src/video_core/host1x/codecs/vp8.h @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/host1x/nvdec_common.h" + +namespace Tegra { +class GPU; +namespace Decoder { + +class VP8 { +public: + explicit VP8(GPU& gpu); + ~VP8(); + + /// Compose the VP8 frame for FFmpeg decoding + [[nodiscard]] const std::vector& ComposeFrame( + const Host1x::NvdecCommon::NvdecRegisters& state); + +private: + std::vector frame; + GPU& gpu; + + struct VP8PictureInfo { + INSERT_PADDING_WORDS_NOINIT(14); + u16 frame_width; // actual frame width + u16 frame_height; // actual frame height + u8 key_frame; + u8 version; + union { + u8 raw; + BitField<0, 2, u8> tile_format; + BitField<2, 3, u8> gob_height; + BitField<5, 3, u8> reserverd_surface_format; + }; + u8 error_conceal_on; // 1: error conceal on; 0: off + u32 first_part_size; // the size of first partition(frame header and mb header partition) + u32 hist_buffer_size; // in units of 256 + u32 vld_buffer_size; // in units of 1 + // Current frame buffers + std::array frame_stride; // [y_c] + u32 luma_top_offset; // offset of luma top field in units of 256 + u32 luma_bot_offset; // offset of luma bottom field in units of 256 + u32 luma_frame_offset; // offset of luma frame in units of 256 + u32 chroma_top_offset; // offset of chroma top field in units of 256 + u32 chroma_bot_offset; // offset of chroma bottom field in units of 256 + u32 chroma_frame_offset; // offset of chroma frame in units of 256 + + INSERT_PADDING_BYTES_NOINIT(0x1c); // NvdecDisplayParams + + // Decode picture buffer related + s8 current_output_memory_layout; + // output NV12/NV24 setting. index 0: golden; 1: altref; 2: last + std::array output_memory_layout; + + u8 segmentation_feature_data_update; + INSERT_PADDING_BYTES_NOINIT(3); + + // ucode return result + u32 result_value; + std::array partition_offset; + INSERT_PADDING_WORDS_NOINIT(3); + }; + static_assert(sizeof(VP8PictureInfo) == 0xc0, "PictureInfo is an invalid size"); +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp new file mode 100644 index 000000000..bc50c6ba4 --- /dev/null +++ b/src/video_core/host1x/codecs/vp9.cpp @@ -0,0 +1,946 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include // for std::copy +#include +#include "common/assert.h" +#include "video_core/gpu.h" +#include "video_core/host1x/codecs/vp9.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +constexpr u32 diff_update_probability = 252; +constexpr u32 frame_sync_code = 0x498342; + +// Default compressed header probabilities once frame context resets +constexpr Vp9EntropyProbs default_probs{ + .y_mode_prob{ + 65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78, + 173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29, + }, + .partition_prob{ + 199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0, + 174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0, + 177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0, + 222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0, + }, + .coef_probs{ + 195, 29, 183, 84, 49, 136, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 31, 107, 169, 35, 99, 159, 17, 82, 140, 8, 66, 114, 2, 44, 76, 1, 19, 32, + 40, 132, 201, 29, 114, 187, 13, 91, 157, 7, 75, 127, 3, 58, 95, 1, 28, 47, + 69, 142, 221, 42, 122, 201, 15, 91, 159, 6, 67, 121, 1, 42, 77, 1, 17, 31, + 102, 148, 228, 67, 117, 204, 17, 82, 154, 6, 59, 114, 2, 39, 75, 1, 15, 29, + 156, 57, 233, 119, 57, 212, 58, 48, 163, 29, 40, 124, 12, 30, 81, 3, 12, 31, + 191, 107, 226, 124, 117, 204, 25, 99, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 29, 148, 210, 37, 126, 194, 8, 93, 157, 2, 68, 118, 1, 39, 69, 1, 17, 33, + 41, 151, 213, 27, 123, 193, 3, 82, 144, 1, 58, 105, 1, 32, 60, 1, 13, 26, + 59, 159, 220, 23, 126, 198, 4, 88, 151, 1, 66, 114, 1, 38, 71, 1, 18, 34, + 114, 136, 232, 51, 114, 207, 11, 83, 155, 3, 56, 105, 1, 33, 65, 1, 17, 34, + 149, 65, 234, 121, 57, 215, 61, 49, 166, 28, 36, 114, 12, 25, 76, 3, 16, 42, + 214, 49, 220, 132, 63, 188, 42, 65, 137, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 85, 137, 221, 104, 131, 216, 49, 111, 192, 21, 87, 155, 2, 49, 87, 1, 16, 28, + 89, 163, 230, 90, 137, 220, 29, 100, 183, 10, 70, 135, 2, 42, 81, 1, 17, 33, + 108, 167, 237, 55, 133, 222, 15, 97, 179, 4, 72, 135, 1, 45, 85, 1, 19, 38, + 124, 146, 240, 66, 124, 224, 17, 88, 175, 4, 58, 122, 1, 36, 75, 1, 18, 37, + 141, 79, 241, 126, 70, 227, 66, 58, 182, 30, 44, 136, 12, 34, 96, 2, 20, 47, + 229, 99, 249, 143, 111, 235, 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 82, 158, 236, 94, 146, 224, 25, 117, 191, 9, 87, 149, 3, 56, 99, 1, 33, 57, + 83, 167, 237, 68, 145, 222, 10, 103, 177, 2, 72, 131, 1, 41, 79, 1, 20, 39, + 99, 167, 239, 47, 141, 224, 10, 104, 178, 2, 73, 133, 1, 44, 85, 1, 22, 47, + 127, 145, 243, 71, 129, 228, 17, 93, 177, 3, 61, 124, 1, 41, 84, 1, 21, 52, + 157, 78, 244, 140, 72, 231, 69, 58, 184, 31, 44, 137, 14, 38, 105, 8, 23, 61, + 125, 34, 187, 52, 41, 133, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 37, 109, 153, 51, 102, 147, 23, 87, 128, 8, 67, 101, 1, 41, 63, 1, 19, 29, + 31, 154, 185, 17, 127, 175, 6, 96, 145, 2, 73, 114, 1, 51, 82, 1, 28, 45, + 23, 163, 200, 10, 131, 185, 2, 93, 148, 1, 67, 111, 1, 41, 69, 1, 14, 24, + 29, 176, 217, 12, 145, 201, 3, 101, 156, 1, 69, 111, 1, 39, 63, 1, 14, 23, + 57, 192, 233, 25, 154, 215, 6, 109, 167, 3, 78, 118, 1, 48, 69, 1, 21, 29, + 202, 105, 245, 108, 106, 216, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 33, 172, 219, 64, 149, 206, 14, 117, 177, 5, 90, 141, 2, 61, 95, 1, 37, 57, + 33, 179, 220, 11, 140, 198, 1, 89, 148, 1, 60, 104, 1, 33, 57, 1, 12, 21, + 30, 181, 221, 8, 141, 198, 1, 87, 145, 1, 58, 100, 1, 31, 55, 1, 12, 20, + 32, 186, 224, 7, 142, 198, 1, 86, 143, 1, 58, 100, 1, 31, 55, 1, 12, 22, + 57, 192, 227, 20, 143, 204, 3, 96, 154, 1, 68, 112, 1, 42, 69, 1, 19, 32, + 212, 35, 215, 113, 47, 169, 29, 48, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 74, 129, 203, 106, 120, 203, 49, 107, 178, 19, 84, 144, 4, 50, 84, 1, 15, 25, + 71, 172, 217, 44, 141, 209, 15, 102, 173, 6, 76, 133, 2, 51, 89, 1, 24, 42, + 64, 185, 231, 31, 148, 216, 8, 103, 175, 3, 74, 131, 1, 46, 81, 1, 18, 30, + 65, 196, 235, 25, 157, 221, 5, 105, 174, 1, 67, 120, 1, 38, 69, 1, 15, 30, + 65, 204, 238, 30, 156, 224, 7, 107, 177, 2, 70, 124, 1, 42, 73, 1, 18, 34, + 225, 86, 251, 144, 104, 235, 42, 99, 181, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 85, 175, 239, 112, 165, 229, 29, 136, 200, 12, 103, 162, 6, 77, 123, 2, 53, 84, + 75, 183, 239, 30, 155, 221, 3, 106, 171, 1, 74, 128, 1, 44, 76, 1, 17, 28, + 73, 185, 240, 27, 159, 222, 2, 107, 172, 1, 75, 127, 1, 42, 73, 1, 17, 29, + 62, 190, 238, 21, 159, 222, 2, 107, 172, 1, 72, 122, 1, 40, 71, 1, 18, 32, + 61, 199, 240, 27, 161, 226, 4, 113, 180, 1, 76, 129, 1, 46, 80, 1, 23, 41, + 7, 27, 153, 5, 30, 95, 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 50, 75, 127, 57, 75, 124, 27, 67, 108, 10, 54, 86, 1, 33, 52, 1, 12, 18, + 43, 125, 151, 26, 108, 148, 7, 83, 122, 2, 59, 89, 1, 38, 60, 1, 17, 27, + 23, 144, 163, 13, 112, 154, 2, 75, 117, 1, 50, 81, 1, 31, 51, 1, 14, 23, + 18, 162, 185, 6, 123, 171, 1, 78, 125, 1, 51, 86, 1, 31, 54, 1, 14, 23, + 15, 199, 227, 3, 150, 204, 1, 91, 146, 1, 55, 95, 1, 30, 53, 1, 11, 20, + 19, 55, 240, 19, 59, 196, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 41, 166, 207, 104, 153, 199, 31, 123, 181, 14, 101, 152, 5, 72, 106, 1, 36, 52, + 35, 176, 211, 12, 131, 190, 2, 88, 144, 1, 60, 101, 1, 36, 60, 1, 16, 28, + 28, 183, 213, 8, 134, 191, 1, 86, 142, 1, 56, 96, 1, 30, 53, 1, 12, 20, + 20, 190, 215, 4, 135, 192, 1, 84, 139, 1, 53, 91, 1, 28, 49, 1, 11, 20, + 13, 196, 216, 2, 137, 192, 1, 86, 143, 1, 57, 99, 1, 32, 56, 1, 13, 24, + 211, 29, 217, 96, 47, 156, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 78, 120, 193, 111, 116, 186, 46, 102, 164, 15, 80, 128, 2, 49, 76, 1, 18, 28, + 71, 161, 203, 42, 132, 192, 10, 98, 150, 3, 69, 109, 1, 44, 70, 1, 18, 29, + 57, 186, 211, 30, 140, 196, 4, 93, 146, 1, 62, 102, 1, 38, 65, 1, 16, 27, + 47, 199, 217, 14, 145, 196, 1, 88, 142, 1, 57, 98, 1, 36, 62, 1, 15, 26, + 26, 219, 229, 5, 155, 207, 1, 94, 151, 1, 60, 104, 1, 36, 62, 1, 16, 28, + 233, 29, 248, 146, 47, 220, 43, 52, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100, 163, 232, 179, 161, 222, 63, 142, 204, 37, 113, 174, 26, 89, 137, 18, 68, 97, + 85, 181, 230, 32, 146, 209, 7, 100, 164, 3, 71, 121, 1, 45, 77, 1, 18, 30, + 65, 187, 230, 20, 148, 207, 2, 97, 159, 1, 68, 116, 1, 40, 70, 1, 14, 29, + 40, 194, 227, 8, 147, 204, 1, 94, 155, 1, 65, 112, 1, 39, 66, 1, 14, 26, + 16, 208, 228, 3, 151, 207, 1, 98, 160, 1, 67, 117, 1, 41, 74, 1, 17, 31, + 17, 38, 140, 7, 34, 80, 1, 17, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 37, 75, 128, 41, 76, 128, 26, 66, 116, 12, 52, 94, 2, 32, 55, 1, 10, 16, + 50, 127, 154, 37, 109, 152, 16, 82, 121, 5, 59, 85, 1, 35, 54, 1, 13, 20, + 40, 142, 167, 17, 110, 157, 2, 71, 112, 1, 44, 72, 1, 27, 45, 1, 11, 17, + 30, 175, 188, 9, 124, 169, 1, 74, 116, 1, 48, 78, 1, 30, 49, 1, 11, 18, + 10, 222, 223, 2, 150, 194, 1, 83, 128, 1, 48, 79, 1, 27, 45, 1, 11, 17, + 36, 41, 235, 29, 36, 193, 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 85, 165, 222, 177, 162, 215, 110, 135, 195, 57, 113, 168, 23, 83, 120, 10, 49, 61, + 85, 190, 223, 36, 139, 200, 5, 90, 146, 1, 60, 103, 1, 38, 65, 1, 18, 30, + 72, 202, 223, 23, 141, 199, 2, 86, 140, 1, 56, 97, 1, 36, 61, 1, 16, 27, + 55, 218, 225, 13, 145, 200, 1, 86, 141, 1, 57, 99, 1, 35, 61, 1, 13, 22, + 15, 235, 212, 1, 132, 184, 1, 84, 139, 1, 57, 97, 1, 34, 56, 1, 14, 23, + 181, 21, 201, 61, 37, 123, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 47, 106, 172, 95, 104, 173, 42, 93, 159, 18, 77, 131, 4, 50, 81, 1, 17, 23, + 62, 147, 199, 44, 130, 189, 28, 102, 154, 18, 75, 115, 2, 44, 65, 1, 12, 19, + 55, 153, 210, 24, 130, 194, 3, 93, 146, 1, 61, 97, 1, 31, 50, 1, 10, 16, + 49, 186, 223, 17, 148, 204, 1, 96, 142, 1, 53, 83, 1, 26, 44, 1, 11, 17, + 13, 217, 212, 2, 136, 180, 1, 78, 124, 1, 50, 83, 1, 29, 49, 1, 14, 23, + 197, 13, 247, 82, 17, 222, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137, + 111, 197, 242, 46, 158, 219, 9, 104, 171, 2, 65, 125, 1, 44, 80, 1, 17, 91, + 104, 208, 245, 39, 168, 224, 3, 109, 162, 1, 79, 124, 1, 50, 102, 1, 43, 102, + 84, 220, 246, 31, 177, 231, 2, 115, 180, 1, 79, 134, 1, 55, 77, 1, 60, 79, + 43, 243, 240, 8, 180, 217, 1, 115, 166, 1, 84, 121, 1, 51, 67, 1, 16, 6, + }, + .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144}, + .inter_mode_prob{ + 2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94, + 66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0, + }, + .intra_inter_prob{9, 102, 187, 225}, + .comp_inter_prob{9, 102, 187, 225, 0}, + .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247}, + .comp_ref_prob{50, 126, 123, 221, 226}, + .tx_32x32_prob{3, 136, 37, 5, 52, 13}, + .tx_16x16_prob{20, 152, 15, 101}, + .tx_8x8_prob{100, 66}, + .skip_probs{192, 128, 64}, + .joints{32, 64, 96}, + .sign{128, 128}, + .classes{ + 224, 144, 192, 168, 192, 176, 192, 198, 198, 245, + 216, 128, 176, 160, 176, 176, 192, 198, 198, 208, + }, + .class_0{216, 208}, + .prob_bits{ + 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, + 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, + }, + .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64}, + .fr{64, 96, 64, 64, 96, 64}, + .class_0_hp{160, 160}, + .high_precision{128, 128}, +}; + +constexpr std::array norm_lut{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +constexpr std::array map_lut{ + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177, + 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 19, +}; + +// 6.2.14 Tile size calculation + +[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 min_log2 = 0; + + while ((64 << min_log2) < sb64_cols) { + min_log2++; + } + + return min_log2; +} + +[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 max_log2 = 1; + + while ((sb64_cols >> max_log2) >= 4) { + max_log2++; + } + + return max_log2 - 1; +} + +// Recenters probability. Based on section 6.3.6 of VP9 Specification +[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) { + if (new_prob > old_prob * 2) { + return new_prob; + } + + if (new_prob >= old_prob) { + return (new_prob - old_prob) * 2; + } + + return (old_prob - new_prob) * 2 - 1; +} + +// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification +[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) { + new_prob--; + old_prob--; + + std::size_t index{}; + + if (old_prob * 2 <= 0xff) { + index = static_cast(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1)); + } else { + index = static_cast( + std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); + } + + return static_cast(map_lut[index]); +} +} // Anonymous namespace + +VP9::VP9(GPU& gpu_) : gpu{gpu_} {} + +VP9::~VP9() = default; + +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + + writer.Write(update, diff_update_probability); + + if (update) { + WriteProbabilityDelta(writer, new_prob, old_prob); + } +} +template +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array& new_prob, + const std::array& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); ++offset) { + WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]); + } +} + +template +void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array& new_prob, + const std::array& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) { + WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]); + WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]); + WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]); + } +} + +void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const int delta = RemapProbability(new_prob, old_prob); + + EncodeTermSubExp(writer, delta); +} + +void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) { + if (WriteLessThan(writer, value, 16)) { + writer.Write(value, 4); + } else if (WriteLessThan(writer, value, 32)) { + writer.Write(value - 16, 4); + } else if (WriteLessThan(writer, value, 64)) { + writer.Write(value - 32, 5); + } else { + value -= 64; + + constexpr s32 size = 8; + + const s32 mask = (1 << size) - 191; + + const s32 delta = value - mask; + + if (delta < 0) { + writer.Write(value, size - 1); + } else { + writer.Write(delta / 2 + mask, size - 1); + writer.Write(delta & 1, 1); + } + } +} + +bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) { + const bool is_lt = value < test; + writer.Write(!is_lt); + return is_lt; +} + +void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, + const std::array& new_prob, + const std::array& old_prob) { + constexpr u32 block_bytes = 2 * 2 * 6 * 6 * 3; + + const auto needs_update = [&](u32 base_index) { + return !std::equal(new_prob.begin() + base_index, + new_prob.begin() + base_index + block_bytes, + old_prob.begin() + base_index); + }; + + for (u32 block_index = 0; block_index < 4; block_index++) { + const u32 base_index = block_index * block_bytes; + const bool update = needs_update(base_index); + writer.Write(update); + + if (update) { + u32 index = base_index; + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 6; k++) { + for (s32 l = 0; l < 6; l++) { + if (k != 0 || l < 3) { + WriteProbabilityUpdate(writer, new_prob[index + 0], + old_prob[index + 0]); + WriteProbabilityUpdate(writer, new_prob[index + 1], + old_prob[index + 1]); + WriteProbabilityUpdate(writer, new_prob[index + 2], + old_prob[index + 2]); + } + index += 3; + } + } + } + } + } + if (block_index == static_cast(tx_mode)) { + break; + } + } +} + +void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + writer.Write(update, diff_update_probability); + + if (update) { + writer.Write(new_prob >> 1, 7); + } +} + +Vp9PictureInfo VP9::GetVp9PictureInfo(const Host1x::NvdecCommon::NvdecRegisters& state) { + PictureInfo picture_info; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); + Vp9PictureInfo vp9_info = picture_info.Convert(); + + InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); + + // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following + // order: last, golden, altref, current. + std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4, + vp9_info.frame_offsets.begin()); + + return vp9_info; +} + +void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { + EntropyProbs entropy; + gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + entropy.Convert(dst); +} + +Vp9FrameContainer VP9::GetCurrentFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { + Vp9FrameContainer current_frame{}; + { + gpu.SyncGuestHost(); + current_frame.info = GetVp9PictureInfo(state); + current_frame.bit_stream.resize(current_frame.info.bitstream_size); + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(), + current_frame.info.bitstream_size); + } + if (!next_frame.bit_stream.empty()) { + Vp9FrameContainer temp{ + .info = current_frame.info, + .bit_stream = std::move(current_frame.bit_stream), + }; + next_frame.info.show_frame = current_frame.info.last_frame_shown; + current_frame.info = next_frame.info; + current_frame.bit_stream = std::move(next_frame.bit_stream); + next_frame = std::move(temp); + } else { + next_frame.info = current_frame.info; + next_frame.bit_stream = current_frame.bit_stream; + } + return current_frame; +} + +std::vector VP9::ComposeCompressedHeader() { + VpxRangeEncoder writer{}; + const bool update_probs = !current_frame_info.is_key_frame && current_frame_info.show_frame; + if (!current_frame_info.lossless) { + if (static_cast(current_frame_info.transform_mode) >= 3) { + writer.Write(3, 2); + writer.Write(current_frame_info.transform_mode == 4); + } else { + writer.Write(current_frame_info.transform_mode, 2); + } + } + + if (current_frame_info.transform_mode == 4) { + // tx_mode_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob, + prev_frame_probs.tx_8x8_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob, + prev_frame_probs.tx_16x16_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob, + prev_frame_probs.tx_32x32_prob); + if (update_probs) { + prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob; + prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob; + prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob; + } + } + // read_coef_probs() in the spec + WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode, + current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs); + // read_skip_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs, + prev_frame_probs.skip_probs); + + if (update_probs) { + prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs; + prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs; + } + + if (!current_frame_info.intra_only) { + // read_inter_probs() in the spec + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob, + prev_frame_probs.inter_mode_prob); + + if (current_frame_info.interp_filter == 4) { + // read_interp_filter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob, + prev_frame_probs.switchable_interp_prob); + if (update_probs) { + prev_frame_probs.switchable_interp_prob = + current_frame_info.entropy.switchable_interp_prob; + } + } + + // read_is_inter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob, + prev_frame_probs.intra_inter_prob); + + // frame_reference_mode() in the spec + if ((current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[2] & 1) || + (current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[3] & 1)) { + if (current_frame_info.reference_mode >= 1) { + writer.Write(1, 1); + writer.Write(current_frame_info.reference_mode == 2); + } else { + writer.Write(0, 1); + } + } + + // frame_reference_mode_probs() in the spec + if (current_frame_info.reference_mode == 2) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob, + prev_frame_probs.comp_inter_prob); + if (update_probs) { + prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob; + } + } + + if (current_frame_info.reference_mode != 1) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob, + prev_frame_probs.single_ref_prob); + if (update_probs) { + prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob; + } + } + + if (current_frame_info.reference_mode != 0) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob, + prev_frame_probs.comp_ref_prob); + if (update_probs) { + prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob; + } + } + + // read_y_mode_probs + for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size(); + ++index) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index], + prev_frame_probs.y_mode_prob[index]); + } + + // read_partition_probs + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob, + prev_frame_probs.partition_prob); + + // mv_probs + for (s32 i = 0; i < 3; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i], + prev_frame_probs.joints[i]); + } + if (update_probs) { + prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob; + prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob; + prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob; + prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob; + prev_frame_probs.joints = current_frame_info.entropy.joints; + } + + for (s32 i = 0; i < 2; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i], + prev_frame_probs.sign[i]); + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index], + prev_frame_probs.classes[index]); + } + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i], + prev_frame_probs.class_0[i]); + + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index], + prev_frame_probs.prob_bits[index]); + } + } + + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 3; k++) { + const int index = i * 2 * 3 + j * 3 + k; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index], + prev_frame_probs.class_0_fr[index]); + } + } + + for (s32 j = 0; j < 3; j++) { + const int index = i * 3 + j; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index], + prev_frame_probs.fr[index]); + } + } + + if (current_frame_info.allow_high_precision_mv) { + for (s32 index = 0; index < 2; index++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index], + prev_frame_probs.class_0_hp[index]); + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index], + prev_frame_probs.high_precision[index]); + } + } + + // save previous probs + if (update_probs) { + prev_frame_probs.sign = current_frame_info.entropy.sign; + prev_frame_probs.classes = current_frame_info.entropy.classes; + prev_frame_probs.class_0 = current_frame_info.entropy.class_0; + prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits; + prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr; + prev_frame_probs.fr = current_frame_info.entropy.fr; + prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp; + prev_frame_probs.high_precision = current_frame_info.entropy.high_precision; + } + } + writer.End(); + return writer.GetBuffer(); +} + +VpxBitStreamWriter VP9::ComposeUncompressedHeader() { + VpxBitStreamWriter uncomp_writer{}; + + uncomp_writer.WriteU(2, 2); // Frame marker. + uncomp_writer.WriteU(0, 2); // Profile. + uncomp_writer.WriteBit(false); // Show existing frame. + uncomp_writer.WriteBit(!current_frame_info.is_key_frame); // is key frame? + uncomp_writer.WriteBit(current_frame_info.show_frame); // show frame? + uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // error reslience + + if (current_frame_info.is_key_frame) { + uncomp_writer.WriteU(frame_sync_code, 24); + uncomp_writer.WriteU(0, 3); // Color space. + uncomp_writer.WriteU(0, 1); // Color range. + uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); + uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); + uncomp_writer.WriteBit(false); // Render and frame size different. + + // Reset context + prev_frame_probs = default_probs; + swap_ref_indices = false; + loop_filter_ref_deltas.fill(0); + loop_filter_mode_deltas.fill(0); + frame_ctxs.fill(default_probs); + + // intra only, meaning the frame can be recreated with no other references + current_frame_info.intra_only = true; + } else { + if (!current_frame_info.show_frame) { + uncomp_writer.WriteBit(current_frame_info.intra_only); + } else { + current_frame_info.intra_only = false; + } + if (!current_frame_info.error_resilient_mode) { + uncomp_writer.WriteU(0, 2); // Reset frame context. + } + const auto& curr_offsets = current_frame_info.frame_offsets; + const auto& next_offsets = next_frame.info.frame_offsets; + const bool ref_frames_different = curr_offsets[1] != curr_offsets[2]; + const bool next_references_swap = + (next_offsets[1] == curr_offsets[2]) || (next_offsets[2] == curr_offsets[1]); + const bool needs_ref_swap = ref_frames_different && next_references_swap; + if (needs_ref_swap) { + swap_ref_indices = !swap_ref_indices; + } + union { + u32 raw; + BitField<0, 1, u32> refresh_last; + BitField<1, 2, u32> refresh_golden; + BitField<2, 1, u32> refresh_alt; + } refresh_frame_flags; + + refresh_frame_flags.raw = 0; + for (u32 index = 0; index < 3; ++index) { + // Refresh indices that use the current frame as an index + if (curr_offsets[3] == next_offsets[index]) { + refresh_frame_flags.raw |= 1u << index; + } + } + if (swap_ref_indices) { + const u32 temp = refresh_frame_flags.refresh_golden; + refresh_frame_flags.refresh_golden.Assign(refresh_frame_flags.refresh_alt.Value()); + refresh_frame_flags.refresh_alt.Assign(temp); + } + if (current_frame_info.intra_only) { + uncomp_writer.WriteU(frame_sync_code, 24); + uncomp_writer.WriteU(refresh_frame_flags.raw, 8); + uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16); + uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16); + uncomp_writer.WriteBit(false); // Render and frame size different. + } else { + const bool swap_indices = needs_ref_swap ^ swap_ref_indices; + const auto ref_frame_index = swap_indices ? std::array{0, 2, 1} : std::array{0, 1, 2}; + uncomp_writer.WriteU(refresh_frame_flags.raw, 8); + for (size_t index = 1; index < 4; index++) { + uncomp_writer.WriteU(ref_frame_index[index - 1], 3); + uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1); + } + uncomp_writer.WriteBit(true); // Frame size with refs. + uncomp_writer.WriteBit(false); // Render and frame size different. + uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv); + uncomp_writer.WriteBit(current_frame_info.interp_filter == 4); + + if (current_frame_info.interp_filter != 4) { + uncomp_writer.WriteU(current_frame_info.interp_filter, 2); + } + } + } + + if (!current_frame_info.error_resilient_mode) { + uncomp_writer.WriteBit(true); // Refresh frame context. where do i get this info from? + uncomp_writer.WriteBit(true); // Frame parallel decoding mode. + } + + int frame_ctx_idx = 0; + if (!current_frame_info.show_frame) { + frame_ctx_idx = 1; + } + + uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index. + prev_frame_probs = frame_ctxs[frame_ctx_idx]; // reference probabilities for compressed header + frame_ctxs[frame_ctx_idx] = current_frame_info.entropy; + + uncomp_writer.WriteU(current_frame_info.first_level, 6); + uncomp_writer.WriteU(current_frame_info.sharpness_level, 3); + uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled); + + if (current_frame_info.mode_ref_delta_enabled) { + // check if ref deltas are different, update accordingly + std::array update_loop_filter_ref_deltas; + std::array update_loop_filter_mode_deltas; + + bool loop_filter_delta_update = false; + + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + const s8 old_deltas = loop_filter_ref_deltas[index]; + const s8 new_deltas = current_frame_info.ref_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_ref_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + const s8 old_deltas = loop_filter_mode_deltas[index]; + const s8 new_deltas = current_frame_info.mode_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_mode_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + uncomp_writer.WriteBit(loop_filter_delta_update); + + if (loop_filter_delta_update) { + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]); + + if (update_loop_filter_ref_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6); + } + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]); + + if (update_loop_filter_mode_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6); + } + } + // save new deltas + loop_filter_ref_deltas = current_frame_info.ref_deltas; + loop_filter_mode_deltas = current_frame_info.mode_deltas; + } + } + + uncomp_writer.WriteU(current_frame_info.base_q_index, 8); + + uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); + + ASSERT(!current_frame_info.segment_enabled); + uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). + + const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); + const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); + + const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2; + const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1; + + // If it's less than the maximum, we need to add an extra 0 on the bitstream + // to indicate that it should stop reading. + if (current_frame_info.log2_tile_cols < max_tile_cols_log2) { + uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1); + } else { + uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff); + } + + const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0; + + uncomp_writer.WriteBit(tile_rows_log2_is_nonzero); + + if (tile_rows_log2_is_nonzero) { + uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1); + } + + return uncomp_writer; +} + +void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { + std::vector bitstream; + { + Vp9FrameContainer curr_frame = GetCurrentFrame(state); + current_frame_info = curr_frame.info; + bitstream = std::move(curr_frame.bit_stream); + } + // The uncompressed header routine sets PrevProb parameters needed for the compressed header + auto uncomp_writer = ComposeUncompressedHeader(); + std::vector compressed_header = ComposeCompressedHeader(); + + uncomp_writer.WriteU(static_cast(compressed_header.size()), 16); + uncomp_writer.Flush(); + std::vector uncompressed_header = uncomp_writer.GetByteArray(); + + // Write headers and frame to buffer + frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame.begin()); + std::copy(compressed_header.begin(), compressed_header.end(), + frame.begin() + uncompressed_header.size()); + std::copy(bitstream.begin(), bitstream.end(), + frame.begin() + uncompressed_header.size() + compressed_header.size()); +} + +VpxRangeEncoder::VpxRangeEncoder() { + Write(false); +} + +VpxRangeEncoder::~VpxRangeEncoder() = default; + +void VpxRangeEncoder::Write(s32 value, s32 value_size) { + for (s32 bit = value_size - 1; bit >= 0; bit--) { + Write(((value >> bit) & 1) != 0); + } +} + +void VpxRangeEncoder::Write(bool bit) { + Write(bit, half_probability); +} + +void VpxRangeEncoder::Write(bool bit, s32 probability) { + u32 local_range = range; + const u32 split = 1 + (((local_range - 1) * static_cast(probability)) >> 8); + local_range = split; + + if (bit) { + low_value += split; + local_range = range - split; + } + + s32 shift = static_cast(norm_lut[local_range]); + local_range <<= shift; + count += shift; + + if (count >= 0) { + const s32 offset = shift - count; + + if (((low_value << (offset - 1)) >> 31) != 0) { + const s32 current_pos = static_cast(base_stream.GetPosition()); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + while (PeekByte() == 0xff) { + base_stream.WriteByte(0); + + base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos); + } + base_stream.WriteByte(static_cast((PeekByte() + 1))); + base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin); + } + base_stream.WriteByte(static_cast((low_value >> (24 - offset)))); + + low_value <<= offset; + shift = count; + low_value &= 0xffffff; + count -= 8; + } + + low_value <<= shift; + range = local_range; +} + +void VpxRangeEncoder::End() { + for (std::size_t index = 0; index < 32; ++index) { + Write(false); + } +} + +u8 VpxRangeEncoder::PeekByte() { + const u8 value = base_stream.ReadByte(); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + + return value; +} + +VpxBitStreamWriter::VpxBitStreamWriter() = default; + +VpxBitStreamWriter::~VpxBitStreamWriter() = default; + +void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) { + WriteBits(value, value_size); +} + +void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) { + const bool sign = value < 0; + if (sign) { + value = -value; + } + + WriteBits(static_cast(value << 1) | (sign ? 1 : 0), value_size + 1); +} + +void VpxBitStreamWriter::WriteDeltaQ(u32 value) { + const bool delta_coded = value != 0; + WriteBit(delta_coded); + + if (delta_coded) { + WriteBits(value, 4); + } +} + +void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) { + s32 value_pos = 0; + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free = GetFreeBufferBits(); + + if (copy_size > free) { + copy_size = free; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void VpxBitStreamWriter::WriteBit(bool state) { + WriteBits(state ? 1 : 0, 1); +} + +s32 VpxBitStreamWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void VpxBitStreamWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast(buffer)); + buffer = 0; + buffer_pos = 0; +} + +std::vector& VpxBitStreamWriter::GetByteArray() { + return byte_array; +} + +const std::vector& VpxBitStreamWriter::GetByteArray() const { + return byte_array; +} + +} // namespace Tegra::Decoder diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h new file mode 100644 index 000000000..a425c0fa4 --- /dev/null +++ b/src/video_core/host1x/codecs/vp9.h @@ -0,0 +1,194 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +#include "common/common_types.h" +#include "common/stream.h" +#include "video_core/host1x/codecs/vp9_types.h" +#include "video_core/host1x/nvdec_common.h" + +namespace Tegra { +class GPU; +namespace Decoder { + +/// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the +/// VP9 header bitstreams. + +class VpxRangeEncoder { +public: + VpxRangeEncoder(); + ~VpxRangeEncoder(); + + VpxRangeEncoder(const VpxRangeEncoder&) = delete; + VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete; + + VpxRangeEncoder(VpxRangeEncoder&&) = default; + VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default; + + /// Writes the rightmost value_size bits from value into the stream + void Write(s32 value, s32 value_size); + + /// Writes a single bit with half probability + void Write(bool bit); + + /// Writes a bit to the base_stream encoded with probability + void Write(bool bit, s32 probability); + + /// Signal the end of the bitstream + void End(); + + [[nodiscard]] std::vector& GetBuffer() { + return base_stream.GetBuffer(); + } + + [[nodiscard]] const std::vector& GetBuffer() const { + return base_stream.GetBuffer(); + } + +private: + u8 PeekByte(); + Common::Stream base_stream{}; + u32 low_value{}; + u32 range{0xff}; + s32 count{-24}; + s32 half_probability{128}; +}; + +class VpxBitStreamWriter { +public: + VpxBitStreamWriter(); + ~VpxBitStreamWriter(); + + VpxBitStreamWriter(const VpxBitStreamWriter&) = delete; + VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete; + + VpxBitStreamWriter(VpxBitStreamWriter&&) = default; + VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default; + + /// Write an unsigned integer value + void WriteU(u32 value, u32 value_size); + + /// Write a signed integer value + void WriteS(s32 value, u32 value_size); + + /// Based on 6.2.10 of VP9 Spec, writes a delta coded value + void WriteDeltaQ(u32 value); + + /// Write a single bit. + void WriteBit(bool state); + + /// Pushes current buffer into buffer_array, resets buffer + void Flush(); + + /// Returns byte_array + [[nodiscard]] std::vector& GetByteArray(); + + /// Returns const byte_array + [[nodiscard]] const std::vector& GetByteArray() const; + +private: + /// Write bit_count bits from value into buffer + void WriteBits(u32 value, u32 bit_count); + + /// Gets next available position in buffer, invokes Flush() if buffer is full + s32 GetFreeBufferBits(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector byte_array; +}; + +class VP9 { +public: + explicit VP9(GPU& gpu_); + ~VP9(); + + VP9(const VP9&) = delete; + VP9& operator=(const VP9&) = delete; + + VP9(VP9&&) = default; + VP9& operator=(VP9&&) = delete; + + /// Composes the VP9 frame from the GPU state information. + /// Based on the official VP9 spec documentation + void ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state); + + /// Returns true if the most recent frame was a hidden frame. + [[nodiscard]] bool WasFrameHidden() const { + return !current_frame_info.show_frame; + } + + /// Returns a const reference to the composed frame data. + [[nodiscard]] const std::vector& GetFrameBytes() const { + return frame; + } + +private: + /// Generates compressed header probability updates in the bitstream writer + template + void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array& new_prob, + const std::array& old_prob); + + /// Generates compressed header probability updates in the bitstream writer + /// If probs are not equal, WriteProbabilityDelta is invoked + void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Generates compressed header probability deltas in the bitstream writer + void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Inverse of 6.3.4 Decode term subexp + void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value); + + /// Writes if the value is less than the test value + bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test); + + /// Writes probability updates for the Coef probabilities + void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode, + const std::array& new_prob, + const std::array& old_prob); + + /// Write probabilities for 4-byte aligned structures + template + void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array& new_prob, + const std::array& old_prob); + + /// Write motion vector probability updates. 6.3.17 in the spec + void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + + /// Returns VP9 information from NVDEC provided offset and size + [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo( + const Host1x::NvdecCommon::NvdecRegisters& state); + + /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct + void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); + + /// Returns frame to be decoded after buffering + [[nodiscard]] Vp9FrameContainer GetCurrentFrame( + const Host1x::NvdecCommon::NvdecRegisters& state); + + /// Use NVDEC providied information to compose the headers for the current frame + [[nodiscard]] std::vector ComposeCompressedHeader(); + [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); + + GPU& gpu; + std::vector frame; + + std::array loop_filter_ref_deltas{}; + std::array loop_filter_mode_deltas{}; + + Vp9FrameContainer next_frame{}; + std::array frame_ctxs{}; + bool swap_ref_indices{}; + + Vp9PictureInfo current_frame_info{}; + Vp9EntropyProbs prev_frame_probs{}; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9_types.h b/src/video_core/host1x/codecs/vp9_types.h new file mode 100644 index 000000000..bb3d8df6e --- /dev/null +++ b/src/video_core/host1x/codecs/vp9_types.h @@ -0,0 +1,306 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class GPU; + +namespace Decoder { +struct Vp9FrameDimensions { + s16 width; + s16 height; + s16 luma_pitch; + s16 chroma_pitch; +}; +static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size"); + +enum class FrameFlags : u32 { + IsKeyFrame = 1 << 0, + LastFrameIsKeyFrame = 1 << 1, + FrameSizeChanged = 1 << 2, + ErrorResilientMode = 1 << 3, + LastShowFrame = 1 << 4, + IntraOnly = 1 << 5, +}; +DECLARE_ENUM_FLAG_OPERATORS(FrameFlags) + +enum class TxSize { + Tx4x4 = 0, // 4x4 transform + Tx8x8 = 1, // 8x8 transform + Tx16x16 = 2, // 16x16 transform + Tx32x32 = 3, // 32x32 transform + TxSizes = 4 +}; + +enum class TxMode { + Only4X4 = 0, // Only 4x4 transform used + Allow8X8 = 1, // Allow block transform size up to 8x8 + Allow16X16 = 2, // Allow block transform size up to 16x16 + Allow32X32 = 3, // Allow block transform size up to 32x32 + TxModeSelect = 4, // Transform specified for each block + TxModes = 5 +}; + +struct Segmentation { + u8 enabled; + u8 update_map; + u8 temporal_update; + u8 abs_delta; + std::array feature_mask; + std::array, 8> feature_data; +}; +static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); + +struct LoopFilter { + u8 mode_ref_delta_enabled; + std::array ref_deltas; + std::array mode_deltas; +}; +static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size"); + +struct Vp9EntropyProbs { + std::array y_mode_prob; ///< 0x0000 + std::array partition_prob; ///< 0x0024 + std::array coef_probs; ///< 0x0064 + std::array switchable_interp_prob; ///< 0x0724 + std::array inter_mode_prob; ///< 0x072C + std::array intra_inter_prob; ///< 0x0748 + std::array comp_inter_prob; ///< 0x074C + std::array single_ref_prob; ///< 0x0751 + std::array comp_ref_prob; ///< 0x075B + std::array tx_32x32_prob; ///< 0x0760 + std::array tx_16x16_prob; ///< 0x0766 + std::array tx_8x8_prob; ///< 0x076A + std::array skip_probs; ///< 0x076C + std::array joints; ///< 0x076F + std::array sign; ///< 0x0772 + std::array classes; ///< 0x0774 + std::array class_0; ///< 0x0788 + std::array prob_bits; ///< 0x078A + std::array class_0_fr; ///< 0x079E + std::array fr; ///< 0x07AA + std::array class_0_hp; ///< 0x07B0 + std::array high_precision; ///< 0x07B2 +}; +static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size"); + +struct Vp9PictureInfo { + u32 bitstream_size; + std::array frame_offsets; + std::array ref_frame_sign_bias; + s32 base_q_index; + s32 y_dc_delta_q; + s32 uv_dc_delta_q; + s32 uv_ac_delta_q; + s32 transform_mode; + s32 interp_filter; + s32 reference_mode; + s32 log2_tile_cols; + s32 log2_tile_rows; + std::array ref_deltas; + std::array mode_deltas; + Vp9EntropyProbs entropy; + Vp9FrameDimensions frame_size; + u8 first_level; + u8 sharpness_level; + bool is_key_frame; + bool intra_only; + bool last_frame_was_key; + bool error_resilient_mode; + bool last_frame_shown; + bool show_frame; + bool lossless; + bool allow_high_precision_mv; + bool segment_enabled; + bool mode_ref_delta_enabled; +}; + +struct Vp9FrameContainer { + Vp9PictureInfo info{}; + std::vector bit_stream; +}; + +struct PictureInfo { + INSERT_PADDING_WORDS_NOINIT(12); ///< 0x00 + u32 bitstream_size; ///< 0x30 + INSERT_PADDING_WORDS_NOINIT(5); ///< 0x34 + Vp9FrameDimensions last_frame_size; ///< 0x48 + Vp9FrameDimensions golden_frame_size; ///< 0x50 + Vp9FrameDimensions alt_frame_size; ///< 0x58 + Vp9FrameDimensions current_frame_size; ///< 0x60 + FrameFlags vp9_flags; ///< 0x68 + std::array ref_frame_sign_bias; ///< 0x6C + u8 first_level; ///< 0x70 + u8 sharpness_level; ///< 0x71 + u8 base_q_index; ///< 0x72 + u8 y_dc_delta_q; ///< 0x73 + u8 uv_ac_delta_q; ///< 0x74 + u8 uv_dc_delta_q; ///< 0x75 + u8 lossless; ///< 0x76 + u8 tx_mode; ///< 0x77 + u8 allow_high_precision_mv; ///< 0x78 + u8 interp_filter; ///< 0x79 + u8 reference_mode; ///< 0x7A + INSERT_PADDING_BYTES_NOINIT(3); ///< 0x7B + u8 log2_tile_cols; ///< 0x7E + u8 log2_tile_rows; ///< 0x7F + Segmentation segmentation; ///< 0x80 + LoopFilter loop_filter; ///< 0xE4 + INSERT_PADDING_BYTES_NOINIT(21); ///< 0xEB + + [[nodiscard]] Vp9PictureInfo Convert() const { + return { + .bitstream_size = bitstream_size, + .frame_offsets{}, + .ref_frame_sign_bias = ref_frame_sign_bias, + .base_q_index = base_q_index, + .y_dc_delta_q = y_dc_delta_q, + .uv_dc_delta_q = uv_dc_delta_q, + .uv_ac_delta_q = uv_ac_delta_q, + .transform_mode = tx_mode, + .interp_filter = interp_filter, + .reference_mode = reference_mode, + .log2_tile_cols = log2_tile_cols, + .log2_tile_rows = log2_tile_rows, + .ref_deltas = loop_filter.ref_deltas, + .mode_deltas = loop_filter.mode_deltas, + .entropy{}, + .frame_size = current_frame_size, + .first_level = first_level, + .sharpness_level = sharpness_level, + .is_key_frame = True(vp9_flags & FrameFlags::IsKeyFrame), + .intra_only = True(vp9_flags & FrameFlags::IntraOnly), + .last_frame_was_key = True(vp9_flags & FrameFlags::LastFrameIsKeyFrame), + .error_resilient_mode = True(vp9_flags & FrameFlags::ErrorResilientMode), + .last_frame_shown = True(vp9_flags & FrameFlags::LastShowFrame), + .show_frame = true, + .lossless = lossless != 0, + .allow_high_precision_mv = allow_high_precision_mv != 0, + .segment_enabled = segmentation.enabled != 0, + .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0, + }; + } +}; +static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); + +struct EntropyProbs { + INSERT_PADDING_BYTES_NOINIT(1024); ///< 0x0000 + std::array inter_mode_prob; ///< 0x0400 + std::array intra_inter_prob; ///< 0x041C + INSERT_PADDING_BYTES_NOINIT(80); ///< 0x0420 + std::array tx_8x8_prob; ///< 0x0470 + std::array tx_16x16_prob; ///< 0x0472 + std::array tx_32x32_prob; ///< 0x0476 + std::array y_mode_prob_e8; ///< 0x047C + std::array, 4> y_mode_prob_e0e7; ///< 0x0480 + INSERT_PADDING_BYTES_NOINIT(64); ///< 0x04A0 + std::array partition_prob; ///< 0x04E0 + INSERT_PADDING_BYTES_NOINIT(10); ///< 0x0520 + std::array switchable_interp_prob; ///< 0x052A + std::array comp_inter_prob; ///< 0x0532 + std::array skip_probs; ///< 0x0537 + INSERT_PADDING_BYTES_NOINIT(1); ///< 0x053A + std::array joints; ///< 0x053B + std::array sign; ///< 0x053E + std::array class_0; ///< 0x0540 + std::array fr; ///< 0x0542 + std::array class_0_hp; ///< 0x0548 + std::array high_precision; ///< 0x054A + std::array classes; ///< 0x054C + std::array class_0_fr; ///< 0x0560 + std::array pred_bits; ///< 0x056C + std::array single_ref_prob; ///< 0x0580 + std::array comp_ref_prob; ///< 0x058A + INSERT_PADDING_BYTES_NOINIT(17); ///< 0x058F + std::array coef_probs; ///< 0x05A0 + + void Convert(Vp9EntropyProbs& fc) { + fc.inter_mode_prob = inter_mode_prob; + fc.intra_inter_prob = intra_inter_prob; + fc.tx_8x8_prob = tx_8x8_prob; + fc.tx_16x16_prob = tx_16x16_prob; + fc.tx_32x32_prob = tx_32x32_prob; + + for (std::size_t i = 0; i < 4; i++) { + for (std::size_t j = 0; j < 9; j++) { + fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i]; + } + } + + fc.partition_prob = partition_prob; + fc.switchable_interp_prob = switchable_interp_prob; + fc.comp_inter_prob = comp_inter_prob; + fc.skip_probs = skip_probs; + fc.joints = joints; + fc.sign = sign; + fc.class_0 = class_0; + fc.fr = fr; + fc.class_0_hp = class_0_hp; + fc.high_precision = high_precision; + fc.classes = classes; + fc.class_0_fr = class_0_fr; + fc.prob_bits = pred_bits; + fc.single_ref_prob = single_ref_prob; + fc.comp_ref_prob = comp_ref_prob; + + // Skip the 4th element as it goes unused + for (std::size_t i = 0; i < coef_probs.size(); i += 4) { + const std::size_t j = i - i / 4; + fc.coef_probs[j] = coef_probs[i]; + fc.coef_probs[j + 1] = coef_probs[i + 1]; + fc.coef_probs[j + 2] = coef_probs[i + 2]; + } + } +}; +static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size"); + +enum class Ref { Last, Golden, AltRef }; + +struct RefPoolElement { + s64 frame{}; + Ref ref{}; + bool refresh{}; +}; + +#define ASSERT_POSITION(field_name, position) \ + static_assert(offsetof(Vp9EntropyProbs, field_name) == position, \ + "Field " #field_name " has invalid position") + +ASSERT_POSITION(partition_prob, 0x0024); +ASSERT_POSITION(switchable_interp_prob, 0x0724); +ASSERT_POSITION(sign, 0x0772); +ASSERT_POSITION(class_0_fr, 0x079E); +ASSERT_POSITION(high_precision, 0x07B2); +#undef ASSERT_POSITION + +#define ASSERT_POSITION(field_name, position) \ + static_assert(offsetof(PictureInfo, field_name) == position, \ + "Field " #field_name " has invalid position") + +ASSERT_POSITION(bitstream_size, 0x30); +ASSERT_POSITION(last_frame_size, 0x48); +ASSERT_POSITION(first_level, 0x70); +ASSERT_POSITION(segmentation, 0x80); +ASSERT_POSITION(loop_filter, 0xE4); +#undef ASSERT_POSITION + +#define ASSERT_POSITION(field_name, position) \ + static_assert(offsetof(EntropyProbs, field_name) == position, \ + "Field " #field_name " has invalid position") + +ASSERT_POSITION(inter_mode_prob, 0x400); +ASSERT_POSITION(tx_8x8_prob, 0x470); +ASSERT_POSITION(partition_prob, 0x4E0); +ASSERT_POSITION(class_0, 0x540); +ASSERT_POSITION(class_0_fr, 0x560); +ASSERT_POSITION(coef_probs, 0x5A0); +#undef ASSERT_POSITION + +}; // namespace Decoder +}; // namespace Tegra diff --git a/src/video_core/host1x/control.cpp b/src/video_core/host1x/control.cpp new file mode 100644 index 000000000..b72b01aa3 --- /dev/null +++ b/src/video_core/host1x/control.cpp @@ -0,0 +1,35 @@ +// Copyright 2022 yuzu Emulator Project +// Licensed under GPLv3 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "video_core/gpu.h" +#include "video_core/host1x/control.h" +#include "video_core/host1x/host1x.h" + +namespace Tegra::Host1x { + +Control::Control(GPU& gpu_) : gpu(gpu_) {} + +Control::~Control() = default; + +void Control::ProcessMethod(Method method, u32 argument) { + switch (method) { + case Method::LoadSyncptPayload32: + syncpoint_value = argument; + break; + case Method::WaitSyncpt: + case Method::WaitSyncpt32: + Execute(argument); + break; + default: + UNIMPLEMENTED_MSG("Control method 0x{:X}", static_cast(method)); + break; + } +} + +void Control::Execute(u32 data) { + gpu.Host1x().GetSyncpointManager().WaitHost(data, syncpoint_value); +} + +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/control.h b/src/video_core/host1x/control.h new file mode 100644 index 000000000..04dac7d51 --- /dev/null +++ b/src/video_core/host1x/control.h @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: 2021 yuzu emulator team and Skyline Team and Contributors +// (https://github.com/skyline-emu/) +// SPDX-License-Identifier: GPL-3.0-or-later Licensed under GPLv3 +// or any later version Refer to the license.txt file included. + +#pragma once + +#include "common/common_types.h" + +namespace Tegra { +class GPU; + +namespace Host1x { + +class Nvdec; + +class Control { +public: + enum class Method : u32 { + WaitSyncpt = 0x8, + LoadSyncptPayload32 = 0x4e, + WaitSyncpt32 = 0x50, + }; + + explicit Control(GPU& gpu); + ~Control(); + + /// Writes the method into the state, Invoke Execute() if encountered + void ProcessMethod(Method method, u32 argument); + +private: + /// For Host1x, execute is waiting on a syncpoint previously written into the state + void Execute(u32 data); + + u32 syncpoint_value{}; + GPU& gpu; +}; + +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/host1x.h b/src/video_core/host1x/host1x.h new file mode 100644 index 000000000..2971be286 --- /dev/null +++ b/src/video_core/host1x/host1x.h @@ -0,0 +1,33 @@ +// Copyright 2022 yuzu Emulator Project +// Licensed under GPLv3 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_types.h" + +#include "video_core/host1x/syncpoint_manager.h" + +namespace Tegra { + +namespace Host1x { + +class Host1x { +public: + Host1x() : syncpoint_manager{} {} + + SyncpointManager& GetSyncpointManager() { + return syncpoint_manager; + } + + const SyncpointManager& GetSyncpointManager() const { + return syncpoint_manager; + } + +private: + SyncpointManager syncpoint_manager; +}; + +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp new file mode 100644 index 000000000..5f6decd0d --- /dev/null +++ b/src/video_core/host1x/nvdec.cpp @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/assert.h" +#include "video_core/gpu.h" +#include "video_core/host1x/nvdec.h" + +namespace Tegra::Host1x { + +#define NVDEC_REG_INDEX(field_name) \ + (offsetof(NvdecCommon::NvdecRegisters, field_name) / sizeof(u64)) + +Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), state{}, codec(std::make_unique(gpu, state)) {} + +Nvdec::~Nvdec() = default; + +void Nvdec::ProcessMethod(u32 method, u32 argument) { + state.reg_array[method] = static_cast(argument) << 8; + + switch (method) { + case NVDEC_REG_INDEX(set_codec_id): + codec->SetTargetCodec(static_cast(argument)); + break; + case NVDEC_REG_INDEX(execute): + Execute(); + break; + } +} + +AVFramePtr Nvdec::GetFrame() { + return codec->GetCurrentFrame(); +} + +void Nvdec::Execute() { + switch (codec->GetCurrentCodec()) { + case NvdecCommon::VideoCodec::H264: + case NvdecCommon::VideoCodec::VP8: + case NvdecCommon::VideoCodec::VP9: + codec->Decode(); + break; + default: + UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName()); + break; + } +} + +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/nvdec.h b/src/video_core/host1x/nvdec.h new file mode 100644 index 000000000..41ba1f7a0 --- /dev/null +++ b/src/video_core/host1x/nvdec.h @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include "common/common_types.h" +#include "video_core/host1x/codecs/codec.h" + +namespace Tegra { +class GPU; + +namespace Host1x { + +class Nvdec { +public: + explicit Nvdec(GPU& gpu); + ~Nvdec(); + + /// Writes the method into the state, Invoke Execute() if encountered + void ProcessMethod(u32 method, u32 argument); + + /// Return most recently decoded frame + [[nodiscard]] AVFramePtr GetFrame(); + +private: + /// Invoke codec to decode a frame + void Execute(); + + GPU& gpu; + NvdecCommon::NvdecRegisters state; + std::unique_ptr codec; +}; + +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/nvdec_common.h b/src/video_core/host1x/nvdec_common.h new file mode 100644 index 000000000..49d67ebbe --- /dev/null +++ b/src/video_core/host1x/nvdec_common.h @@ -0,0 +1,97 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/bit_field.h" +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra::Host1x::NvdecCommon { + +enum class VideoCodec : u64 { + None = 0x0, + H264 = 0x3, + VP8 = 0x5, + H265 = 0x7, + VP9 = 0x9, +}; + +// NVDEC should use a 32-bit address space, but is mapped to 64-bit, +// doubling the sizes here is compensating for that. +struct NvdecRegisters { + static constexpr std::size_t NUM_REGS = 0x178; + + union { + struct { + INSERT_PADDING_WORDS_NOINIT(256); ///< 0x0000 + VideoCodec set_codec_id; ///< 0x0400 + INSERT_PADDING_WORDS_NOINIT(126); ///< 0x0408 + u64 execute; ///< 0x0600 + INSERT_PADDING_WORDS_NOINIT(126); ///< 0x0608 + struct { ///< 0x0800 + union { + BitField<0, 3, VideoCodec> codec; + BitField<4, 1, u64> gp_timer_on; + BitField<13, 1, u64> mb_timer_on; + BitField<14, 1, u64> intra_frame_pslc; + BitField<17, 1, u64> all_intra_frame; + }; + } control_params; + u64 picture_info_offset; ///< 0x0808 + u64 frame_bitstream_offset; ///< 0x0810 + u64 frame_number; ///< 0x0818 + u64 h264_slice_data_offsets; ///< 0x0820 + u64 h264_mv_dump_offset; ///< 0x0828 + INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 + u64 frame_stats_offset; ///< 0x0848 + u64 h264_last_surface_luma_offset; ///< 0x0850 + u64 h264_last_surface_chroma_offset; ///< 0x0858 + std::array surface_luma_offset; ///< 0x0860 + std::array surface_chroma_offset; ///< 0x08E8 + INSERT_PADDING_WORDS_NOINIT(68); ///< 0x0970 + u64 vp8_prob_data_offset; ///< 0x0A80 + u64 vp8_header_partition_buf_offset; ///< 0x0A88 + INSERT_PADDING_WORDS_NOINIT(60); ///< 0x0A90 + u64 vp9_entropy_probs_offset; ///< 0x0B80 + u64 vp9_backward_updates_offset; ///< 0x0B88 + u64 vp9_last_frame_segmap_offset; ///< 0x0B90 + u64 vp9_curr_frame_segmap_offset; ///< 0x0B98 + INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BA0 + u64 vp9_last_frame_mvs_offset; ///< 0x0BA8 + u64 vp9_curr_frame_mvs_offset; ///< 0x0BB0 + INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BB8 + }; + std::array reg_array; + }; +}; +static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size"); + +#define ASSERT_REG_POSITION(field_name, position) \ + static_assert(offsetof(NvdecRegisters, field_name) == position * sizeof(u64), \ + "Field " #field_name " has invalid position") + +ASSERT_REG_POSITION(set_codec_id, 0x80); +ASSERT_REG_POSITION(execute, 0xC0); +ASSERT_REG_POSITION(control_params, 0x100); +ASSERT_REG_POSITION(picture_info_offset, 0x101); +ASSERT_REG_POSITION(frame_bitstream_offset, 0x102); +ASSERT_REG_POSITION(frame_number, 0x103); +ASSERT_REG_POSITION(h264_slice_data_offsets, 0x104); +ASSERT_REG_POSITION(frame_stats_offset, 0x109); +ASSERT_REG_POSITION(h264_last_surface_luma_offset, 0x10A); +ASSERT_REG_POSITION(h264_last_surface_chroma_offset, 0x10B); +ASSERT_REG_POSITION(surface_luma_offset, 0x10C); +ASSERT_REG_POSITION(surface_chroma_offset, 0x11D); +ASSERT_REG_POSITION(vp8_prob_data_offset, 0x150); +ASSERT_REG_POSITION(vp8_header_partition_buf_offset, 0x151); +ASSERT_REG_POSITION(vp9_entropy_probs_offset, 0x170); +ASSERT_REG_POSITION(vp9_backward_updates_offset, 0x171); +ASSERT_REG_POSITION(vp9_last_frame_segmap_offset, 0x172); +ASSERT_REG_POSITION(vp9_curr_frame_segmap_offset, 0x173); +ASSERT_REG_POSITION(vp9_last_frame_mvs_offset, 0x175); +ASSERT_REG_POSITION(vp9_curr_frame_mvs_offset, 0x176); + +#undef ASSERT_REG_POSITION + +} // namespace Tegra::Host1x::NvdecCommon diff --git a/src/video_core/host1x/sync_manager.cpp b/src/video_core/host1x/sync_manager.cpp new file mode 100644 index 000000000..8694f77e2 --- /dev/null +++ b/src/video_core/host1x/sync_manager.cpp @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: Ryujinx Team and Contributors +// SPDX-License-Identifier: MIT + +#include +#include "sync_manager.h" +#include "video_core/gpu.h" +#include "video_core/host1x/host1x.h" +#include "video_core/host1x/syncpoint_manager.h" + +namespace Tegra { +namespace Host1x { + +SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {} +SyncptIncrManager::~SyncptIncrManager() = default; + +void SyncptIncrManager::Increment(u32 id) { + increments.emplace_back(0, 0, id, true); + IncrementAllDone(); +} + +u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) { + const u32 handle = current_id++; + increments.emplace_back(handle, class_id, id); + return handle; +} + +void SyncptIncrManager::SignalDone(u32 handle) { + const auto done_incr = + std::find_if(increments.begin(), increments.end(), + [handle](const SyncptIncr& incr) { return incr.id == handle; }); + if (done_incr != increments.cend()) { + done_incr->complete = true; + } + IncrementAllDone(); +} + +void SyncptIncrManager::IncrementAllDone() { + std::size_t done_count = 0; + for (; done_count < increments.size(); ++done_count) { + if (!increments[done_count].complete) { + break; + } + auto& syncpoint_manager = gpu.Host1x().GetSyncpointManager(); + syncpoint_manager.IncrementGuest(increments[done_count].syncpt_id); + syncpoint_manager.IncrementHost(increments[done_count].syncpt_id); + } + increments.erase(increments.begin(), increments.begin() + done_count); +} + +} // namespace Host1x +} // namespace Tegra diff --git a/src/video_core/host1x/sync_manager.h b/src/video_core/host1x/sync_manager.h new file mode 100644 index 000000000..aba72d5c5 --- /dev/null +++ b/src/video_core/host1x/sync_manager.h @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: Ryujinx Team and Contributors +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include "common/common_types.h" + +namespace Tegra { + +class GPU; + +namespace Host1x { + +struct SyncptIncr { + u32 id; + u32 class_id; + u32 syncpt_id; + bool complete; + + SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false) + : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {} +}; + +class SyncptIncrManager { +public: + explicit SyncptIncrManager(GPU& gpu); + ~SyncptIncrManager(); + + /// Add syncpoint id and increment all + void Increment(u32 id); + + /// Returns a handle to increment later + u32 IncrementWhenDone(u32 class_id, u32 id); + + /// IncrememntAllDone, including handle + void SignalDone(u32 handle); + + /// Increment all sequential pending increments that are already done. + void IncrementAllDone(); + +private: + std::vector increments; + std::mutex increment_lock; + u32 current_id{}; + + GPU& gpu; +}; + +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/syncpoint_manager.cpp b/src/video_core/host1x/syncpoint_manager.cpp new file mode 100644 index 000000000..c606b8bd0 --- /dev/null +++ b/src/video_core/host1x/syncpoint_manager.cpp @@ -0,0 +1,93 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv3 or any later version +// Refer to the license.txt file included. + +#include "video_core/host1x/syncpoint_manager.h" + +namespace Tegra { + +namespace Host1x { + +SyncpointManager::ActionHandle SyncpointManager::RegisterAction( + std::atomic& syncpoint, std::list& action_storage, u32 expected_value, + std::function& action) { + if (syncpoint.load(std::memory_order_acquire) >= expected_value) { + action(); + return {}; + } + + std::unique_lock lk(guard); + if (syncpoint.load(std::memory_order_relaxed) >= expected_value) { + action(); + return {}; + } + auto it = action_storage.begin(); + while (it != action_storage.end()) { + if (it->expected_value >= expected_value) { + break; + } + ++it; + } + return action_storage.emplace(it, expected_value, action); +} + +void SyncpointManager::DeregisterAction(std::list& action_storage, + ActionHandle& handle) { + std::unique_lock lk(guard); + action_storage.erase(handle); +} + +void SyncpointManager::DeregisterGuestAction(u32 syncpoint_id, ActionHandle& handle) { + DeregisterAction(guest_action_storage[syncpoint_id], handle); +} + +void SyncpointManager::DeregisterHostAction(u32 syncpoint_id, ActionHandle& handle) { + DeregisterAction(host_action_storage[syncpoint_id], handle); +} + +void SyncpointManager::IncrementGuest(u32 syncpoint_id) { + Increment(syncpoints_guest[syncpoint_id], wait_guest_cv, guest_action_storage[syncpoint_id]); +} + +void SyncpointManager::IncrementHost(u32 syncpoint_id) { + Increment(syncpoints_host[syncpoint_id], wait_host_cv, host_action_storage[syncpoint_id]); +} + +void SyncpointManager::WaitGuest(u32 syncpoint_id, u32 expected_value) { + Wait(syncpoints_guest[syncpoint_id], wait_guest_cv, expected_value); +} + +void SyncpointManager::WaitHost(u32 syncpoint_id, u32 expected_value) { + Wait(syncpoints_host[syncpoint_id], wait_host_cv, expected_value); +} + +void SyncpointManager::Increment(std::atomic& syncpoint, std::condition_variable& wait_cv, + std::list& action_storage) { + auto new_value{syncpoint.fetch_add(1, std::memory_order_acq_rel) + 1}; + + std::unique_lock lk(guard); + auto it = action_storage.begin(); + while (it != action_storage.end()) { + if (it->expected_value > new_value) { + break; + } + it->action(); + it = action_storage.erase(it); + } + wait_cv.notify_all(); +} + +void SyncpointManager::Wait(std::atomic& syncpoint, std::condition_variable& wait_cv, + u32 expected_value) { + const auto pred = [&]() { return syncpoint.load(std::memory_order_acquire) >= expected_value; }; + if (pred()) { + return; + } + + std::unique_lock lk(guard); + wait_cv.wait(lk, pred); +} + +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/syncpoint_manager.h b/src/video_core/host1x/syncpoint_manager.h new file mode 100644 index 000000000..0ecc040ab --- /dev/null +++ b/src/video_core/host1x/syncpoint_manager.h @@ -0,0 +1,99 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv3 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/common_types.h" + +namespace Tegra { + +namespace Host1x { + +class SyncpointManager { +public: + u32 GetGuestSyncpointValue(u32 id) { + return syncpoints_guest[id].load(std::memory_order_acquire); + } + + u32 GetHostSyncpointValue(u32 id) { + return syncpoints_host[id].load(std::memory_order_acquire); + } + + struct RegisteredAction { + RegisteredAction(u32 expected_value_, std::function& action_) + : expected_value{expected_value_}, action{action_} {} + u32 expected_value; + std::function action; + }; + using ActionHandle = std::list::iterator; + + template + ActionHandle RegisterGuestAction(u32 syncpoint_id, u32 expected_value, Func&& action) { + std::function func(action); + return RegisterAction(syncpoints_guest[syncpoint_id], guest_action_storage[syncpoint_id], + expected_value, func); + } + + template + ActionHandle RegisterHostAction(u32 syncpoint_id, u32 expected_value, Func&& action) { + std::function func(action); + return RegisterAction(syncpoints_host[syncpoint_id], host_action_storage[syncpoint_id], + expected_value, func); + } + + void DeregisterGuestAction(u32 syncpoint_id,ActionHandle& handle); + + void DeregisterHostAction(u32 syncpoint_id,ActionHandle& handle); + + void IncrementGuest(u32 syncpoint_id); + + void IncrementHost(u32 syncpoint_id); + + void WaitGuest(u32 syncpoint_id, u32 expected_value); + + void WaitHost(u32 syncpoint_id, u32 expected_value); + + bool IsReadyGuest(u32 syncpoint_id, u32 expected_value) { + return syncpoints_guest[syncpoint_id].load(std::memory_order_acquire) >= expected_value; + } + + bool IsReadyHost(u32 syncpoint_id, u32 expected_value) { + return syncpoints_host[syncpoint_id].load(std::memory_order_acquire) >= expected_value; + } + +private: + void Increment(std::atomic& syncpoint, std::condition_variable& wait_cv, + std::list& action_storage); + + ActionHandle RegisterAction(std::atomic& syncpoint, + std::list& action_storage, u32 expected_value, + std::function& action); + + void DeregisterAction(std::list& action_storage, ActionHandle& handle); + + void Wait(std::atomic& syncpoint, std::condition_variable& wait_cv, u32 expected_value); + + static constexpr size_t NUM_MAX_SYNCPOINTS = 192; + + std::array, NUM_MAX_SYNCPOINTS> syncpoints_guest{}; + std::array, NUM_MAX_SYNCPOINTS> syncpoints_host{}; + + std::array, NUM_MAX_SYNCPOINTS> guest_action_storage; + std::array, NUM_MAX_SYNCPOINTS> host_action_storage; + + std::mutex guard; + std::condition_variable wait_guest_cv; + std::condition_variable wait_host_cv; +}; + +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp new file mode 100644 index 000000000..a9422670a --- /dev/null +++ b/src/video_core/host1x/vic.cpp @@ -0,0 +1,243 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#include +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +#include "common/assert.h" +#include "common/bit_field.h" +#include "common/logging/log.h" + +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/host1x/nvdec.h" +#include "video_core/host1x/vic.h" +#include "video_core/memory_manager.h" +#include "video_core/textures/decoders.h" + +namespace Tegra { + +namespace Host1x { + +namespace { +enum class VideoPixelFormat : u64_le { + RGBA8 = 0x1f, + BGRA8 = 0x20, + RGBX8 = 0x23, + YUV420 = 0x44, +}; +} // Anonymous namespace + +union VicConfig { + u64_le raw{}; + BitField<0, 7, VideoPixelFormat> pixel_format; + BitField<7, 2, u64_le> chroma_loc_horiz; + BitField<9, 2, u64_le> chroma_loc_vert; + BitField<11, 4, u64_le> block_linear_kind; + BitField<15, 4, u64_le> block_linear_height_log2; + BitField<32, 14, u64_le> surface_width_minus1; + BitField<46, 14, u64_le> surface_height_minus1; +}; + +Vic::Vic(GPU& gpu_, std::shared_ptr nvdec_processor_) + : gpu(gpu_), + nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {} + +Vic::~Vic() = default; + +void Vic::ProcessMethod(Method method, u32 argument) { + LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast(method)); + const u64 arg = static_cast(argument) << 8; + switch (method) { + case Method::Execute: + Execute(); + break; + case Method::SetConfigStructOffset: + config_struct_address = arg; + break; + case Method::SetOutputSurfaceLumaOffset: + output_surface_luma_address = arg; + break; + case Method::SetOutputSurfaceChromaOffset: + output_surface_chroma_address = arg; + break; + default: + break; + } +} + +void Vic::Execute() { + if (output_surface_luma_address == 0) { + LOG_ERROR(Service_NVDRV, "VIC Luma address not set."); + return; + } + const VicConfig config{gpu.MemoryManager().Read(config_struct_address + 0x20)}; + const AVFramePtr frame_ptr = nvdec_processor->GetFrame(); + const auto* frame = frame_ptr.get(); + if (!frame) { + return; + } + const u64 surface_width = config.surface_width_minus1 + 1; + const u64 surface_height = config.surface_height_minus1 + 1; + if (static_cast(frame->width) != surface_width || + static_cast(frame->height) != surface_height) { + // TODO: Properly support multiple video streams with differing frame dimensions + LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}", + frame->width, frame->height, surface_width, surface_height); + } + switch (config.pixel_format) { + case VideoPixelFormat::RGBA8: + case VideoPixelFormat::BGRA8: + case VideoPixelFormat::RGBX8: + WriteRGBFrame(frame, config); + break; + case VideoPixelFormat::YUV420: + WriteYUVFrame(frame, config); + break; + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value()); + break; + } +} + +void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) { + LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + + if (!scaler_ctx || frame->width != scaler_width || frame->height != scaler_height) { + const AVPixelFormat target_format = [pixel_format = config.pixel_format]() { + switch (pixel_format) { + case VideoPixelFormat::RGBA8: + return AV_PIX_FMT_RGBA; + case VideoPixelFormat::BGRA8: + return AV_PIX_FMT_BGRA; + case VideoPixelFormat::RGBX8: + return AV_PIX_FMT_RGB0; + default: + return AV_PIX_FMT_RGBA; + } + }(); + + sws_freeContext(scaler_ctx); + // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format + scaler_ctx = sws_getContext(frame->width, frame->height, + static_cast(frame->format), frame->width, + frame->height, target_format, 0, nullptr, nullptr, nullptr); + scaler_width = frame->width; + scaler_height = frame->height; + converted_frame_buffer.reset(); + } + if (!converted_frame_buffer) { + const size_t frame_size = frame->width * frame->height * 4; + converted_frame_buffer = AVMallocPtr{static_cast(av_malloc(frame_size)), av_free}; + } + const std::array converted_stride{frame->width * 4, frame->height * 4, 0, 0}; + u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; + sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, &converted_frame_buf_addr, + converted_stride.data()); + + // Use the minimum of surface/frame dimensions to avoid buffer overflow. + const u32 surface_width = static_cast(config.surface_width_minus1) + 1; + const u32 surface_height = static_cast(config.surface_height_minus1) + 1; + const u32 width = std::min(surface_width, static_cast(frame->width)); + const u32 height = std::min(surface_height, static_cast(frame->height)); + const u32 blk_kind = static_cast(config.block_linear_kind); + if (blk_kind != 0) { + // swizzle pitch linear to block linear + const u32 block_height = static_cast(config.block_linear_height_log2); + const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); + luma_buffer.resize(size); + Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(), + converted_frame_buf_addr, block_height, 0, 0); + + gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); + } else { + // send pitch linear frame + const size_t linear_size = width * height * 4; + gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, + linear_size); + } +} + +void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) { + LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); + + const std::size_t surface_width = config.surface_width_minus1 + 1; + const std::size_t surface_height = config.surface_height_minus1 + 1; + const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; + // Use the minimum of surface/frame dimensions to avoid buffer overflow. + const auto frame_width = std::min(surface_width, static_cast(frame->width)); + const auto frame_height = std::min(surface_height, static_cast(frame->height)); + + const auto stride = static_cast(frame->linesize[0]); + + luma_buffer.resize(aligned_width * surface_height); + chroma_buffer.resize(aligned_width * surface_height / 2); + + // Populate luma buffer + const u8* luma_src = frame->data[0]; + for (std::size_t y = 0; y < frame_height; ++y) { + const std::size_t src = y * stride; + const std::size_t dst = y * aligned_width; + for (std::size_t x = 0; x < frame_width; ++x) { + luma_buffer[dst + x] = luma_src[src + x]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), + luma_buffer.size()); + + // Chroma + const std::size_t half_height = frame_height / 2; + const auto half_stride = static_cast(frame->linesize[1]); + + switch (frame->format) { + case AV_PIX_FMT_YUV420P: { + // Frame from FFmpeg software + // Populate chroma buffer from both channels with interleaving. + const std::size_t half_width = frame_width / 2; + const u8* chroma_b_src = frame->data[1]; + const u8* chroma_r_src = frame->data[2]; + for (std::size_t y = 0; y < half_height; ++y) { + const std::size_t src = y * half_stride; + const std::size_t dst = y * aligned_width; + + for (std::size_t x = 0; x < half_width; ++x) { + chroma_buffer[dst + x * 2] = chroma_b_src[src + x]; + chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x]; + } + } + break; + } + case AV_PIX_FMT_NV12: { + // Frame from VA-API hardware + // This is already interleaved so just copy + const u8* chroma_src = frame->data[1]; + for (std::size_t y = 0; y < half_height; ++y) { + const std::size_t src = y * stride; + const std::size_t dst = y * aligned_width; + for (std::size_t x = 0; x < frame_width; ++x) { + chroma_buffer[dst + x] = chroma_src[src + x]; + } + } + break; + } + default: + ASSERT(false); + break; + } + gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), + chroma_buffer.size()); +} + +} // namespace Host1x + +} // namespace Tegra diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h new file mode 100644 index 000000000..c51f8af7e --- /dev/null +++ b/src/video_core/host1x/vic.h @@ -0,0 +1,66 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include "common/common_types.h" + +struct SwsContext; + +namespace Tegra { +class GPU; + +namespace Host1x { + +class Nvdec; +union VicConfig; + +class Vic { +public: + enum class Method : u32 { + Execute = 0xc0, + SetControlParams = 0x1c1, + SetConfigStructOffset = 0x1c2, + SetOutputSurfaceLumaOffset = 0x1c8, + SetOutputSurfaceChromaOffset = 0x1c9, + SetOutputSurfaceChromaUnusedOffset = 0x1ca + }; + + explicit Vic(GPU& gpu, std::shared_ptr nvdec_processor); + + ~Vic(); + + /// Write to the device state. + void ProcessMethod(Method method, u32 argument); + +private: + void Execute(); + + void WriteRGBFrame(const AVFrame* frame, const VicConfig& config); + + void WriteYUVFrame(const AVFrame* frame, const VicConfig& config); + + GPU& gpu; + std::shared_ptr nvdec_processor; + + /// Avoid reallocation of the following buffers every frame, as their + /// size does not change during a stream + using AVMallocPtr = std::unique_ptr; + AVMallocPtr converted_frame_buffer; + std::vector luma_buffer; + std::vector chroma_buffer; + + GPUVAddr config_struct_address{}; + GPUVAddr output_surface_luma_address{}; + GPUVAddr output_surface_chroma_address{}; + + SwsContext* scaler_ctx{}; + s32 scaler_width{}; + s32 scaler_height{}; +}; + +} // namespace Host1x + +} // namespace Tegra -- cgit v1.2.3