From 2868d4ba84f43c9bf3c7b6997ddcafb6e65c4a02 Mon Sep 17 00:00:00 2001 From: yzct12345 <87620833+yzct12345@users.noreply.github.com> Date: Wed, 4 Aug 2021 03:43:11 +0000 Subject: nvdec: Implement VA-API hardware video acceleration (#6713) * nvdec: VA-API * Verify formatting * Forgot a semicolon for Windows * Clarify comment about AV_PIX_FMT_NV12 * Fix assert log spam from missing negation * vic: Remove forgotten debug code * Address lioncash's review * Mention VA-API is Intel/AMD * Address v1993's review * Hopefully fix CMakeLists style this time * vic: Improve cache locality * vic: Fix off-by-one error * codec: Async * codec: Forgot the GetValue() * nvdec: Address ameerj's review * codec: Fallback to CPU without VA-API support * cmake: Address lat9nq's review * cmake: Make VA-API optional * vaapi: Multiple GPU * Apply suggestions from code review Co-authored-by: Ameer J <52414509+ameerj@users.noreply.github.com> * nvdec: Address ameerj's review * codec: Use anonymous instead of static * nvdec: Remove enum and fix memory leak * nvdec: Address ameerj's review * codec: Remove preparation for threading Co-authored-by: Ameer J <52414509+ameerj@users.noreply.github.com> --- src/video_core/CMakeLists.txt | 5 + src/video_core/command_classes/codecs/codec.cpp | 144 ++++++++++++++++++------ src/video_core/command_classes/codecs/codec.h | 4 +- src/video_core/command_classes/vic.cpp | 87 ++++++++------ src/video_core/command_classes/vic.h | 7 +- 5 files changed, 175 insertions(+), 72 deletions(-) (limited to 'src') diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 333f6f35f..1eb67c051 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,5 +1,10 @@ add_subdirectory(host_shaders) +if(LIBVA_FOUND) + set_source_files_properties(command_classes/codecs/codec.cpp + PROPERTIES COMPILE_DEFINITIONS LIBVA_FOUND=1) +endif() + add_library(video_core STATIC buffer_cache/buffer_base.h buffer_cache/buffer_cache.cpp diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp index 1b4bbc8ac..f798a0053 100644 --- a/src/video_core/command_classes/codecs/codec.cpp +++ b/src/video_core/command_classes/codecs/codec.cpp @@ -2,7 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include #include #include #include "common/assert.h" @@ -17,10 +16,47 @@ extern "C" { } namespace Tegra { +#if defined(LIBVA_FOUND) +// Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c originally under MIT license +namespace { +constexpr std::array VAAPI_DRIVERS = { + "i915", + "amdgpu", +}; + +AVPixelFormat GetHwFormat(AVCodecContext*, const AVPixelFormat* pix_fmts) { + for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) { + if (*p == AV_PIX_FMT_VAAPI) { + return AV_PIX_FMT_VAAPI; + } + } + LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU"); + return *pix_fmts; +} + +bool CreateVaapiHwdevice(AVBufferRef** av_hw_device) { + AVDictionary* hwdevice_options = nullptr; + av_dict_set(&hwdevice_options, "connection_type", "drm", 0); + for (const auto& driver : VAAPI_DRIVERS) { + av_dict_set(&hwdevice_options, "kernel_driver", driver, 0); + const int hwdevice_error = av_hwdevice_ctx_create(av_hw_device, AV_HWDEVICE_TYPE_VAAPI, + nullptr, hwdevice_options, 0); + if (hwdevice_error >= 0) { + LOG_INFO(Service_NVDRV, "Using VA-API with {}", driver); + av_dict_free(&hwdevice_options); + return true; + } + LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed {}", hwdevice_error); + } + LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed for all drivers"); + av_dict_free(&hwdevice_options); + return false; +} +} // namespace +#endif void AVFrameDeleter(AVFrame* ptr) { - av_frame_unref(ptr); - av_free(ptr); + av_frame_free(&ptr); } Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs) @@ -32,19 +68,31 @@ Codec::~Codec() { return; } // Free libav memory - AVFrame* av_frame{nullptr}; avcodec_send_packet(av_codec_ctx, nullptr); - av_frame = av_frame_alloc(); + AVFrame* av_frame = av_frame_alloc(); avcodec_receive_frame(av_codec_ctx, av_frame); avcodec_flush_buffers(av_codec_ctx); - - av_frame_unref(av_frame); - av_free(av_frame); + av_frame_free(&av_frame); avcodec_close(av_codec_ctx); + av_buffer_unref(&av_hw_device); +} + +void Codec::InitializeHwdec() { + // Prioritize integrated GPU to mitigate bandwidth bottlenecks +#if defined(LIBVA_FOUND) + if (CreateVaapiHwdevice(&av_hw_device)) { + const auto hw_device_ctx = av_buffer_ref(av_hw_device); + ASSERT_MSG(hw_device_ctx, "av_buffer_ref failed"); + av_codec_ctx->hw_device_ctx = hw_device_ctx; + av_codec_ctx->get_format = GetHwFormat; + return; + } +#endif + // TODO more GPU accelerated decoders } void Codec::Initialize() { - AVCodecID codec{AV_CODEC_ID_NONE}; + AVCodecID codec; switch (current_codec) { case NvdecCommon::VideoCodec::H264: codec = AV_CODEC_ID_H264; @@ -53,22 +101,24 @@ void Codec::Initialize() { codec = AV_CODEC_ID_VP9; break; default: + UNIMPLEMENTED_MSG("Unknown codec {}", current_codec); return; } av_codec = avcodec_find_decoder(codec); av_codec_ctx = avcodec_alloc_context3(av_codec); av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); - - // TODO(ameerj): libavcodec gpu hw acceleration - + InitializeHwdec(); + if (!av_codec_ctx->hw_device_ctx) { + LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding"); + } const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr); if (av_error < 0) { LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed."); avcodec_close(av_codec_ctx); + av_buffer_unref(&av_hw_device); return; } initialized = true; - return; } void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { @@ -80,36 +130,64 @@ void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { void Codec::Decode() { const bool is_first_frame = !initialized; - if (!initialized) { + if (is_first_frame) { Initialize(); } - bool vp9_hidden_frame = false; - AVPacket packet{}; - av_init_packet(&packet); std::vector frame_data; - if (current_codec == NvdecCommon::VideoCodec::H264) { frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame); } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { frame_data = vp9_decoder->ComposeFrameHeader(state); vp9_hidden_frame = vp9_decoder->WasFrameHidden(); } - + AVPacket packet{}; + av_init_packet(&packet); packet.data = frame_data.data(); packet.size = static_cast(frame_data.size()); - - avcodec_send_packet(av_codec_ctx, &packet); - - if (!vp9_hidden_frame) { - // Only receive/store visible frames - AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter}; - avcodec_receive_frame(av_codec_ctx, frame.get()); - av_frames.push(std::move(frame)); - // Limit queue to 10 frames. Workaround for ZLA decode and queue spam - if (av_frames.size() > 10) { - av_frames.pop(); - } + if (const int ret = avcodec_send_packet(av_codec_ctx, &packet); ret) { + LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", ret); + return; + } + // Only receive/store visible frames + if (vp9_hidden_frame) { + return; + } + AVFrame* hw_frame = av_frame_alloc(); + AVFrame* sw_frame = hw_frame; + ASSERT_MSG(hw_frame, "av_frame_alloc hw_frame failed"); + if (const int ret = avcodec_receive_frame(av_codec_ctx, hw_frame); ret) { + LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret); + av_frame_free(&hw_frame); + return; + } + if (!hw_frame->width || !hw_frame->height) { + LOG_WARNING(Service_NVDRV, "Zero width or height in frame"); + av_frame_free(&hw_frame); + return; + } +#if defined(LIBVA_FOUND) + // Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c under MIT license + if (hw_frame->format == AV_PIX_FMT_VAAPI) { + sw_frame = av_frame_alloc(); + ASSERT_MSG(sw_frame, "av_frame_alloc sw_frame failed"); + // Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp + // because Intel drivers crash unless using AV_PIX_FMT_NV12 + sw_frame->format = AV_PIX_FMT_NV12; + const int transfer_data_ret = av_hwframe_transfer_data(sw_frame, hw_frame, 0); + ASSERT_MSG(!transfer_data_ret, "av_hwframe_transfer_data error {}", transfer_data_ret); + av_frame_free(&hw_frame); + } +#endif + if (sw_frame->format != AV_PIX_FMT_YUV420P && sw_frame->format != AV_PIX_FMT_NV12) { + UNIMPLEMENTED_MSG("Unexpected video format from host graphics: {}", sw_frame->format); + av_frame_free(&sw_frame); + return; + } + av_frames.push(AVFramePtr{sw_frame, AVFrameDeleter}); + if (av_frames.size() > 10) { + LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame"); + av_frames.pop(); } } @@ -119,7 +197,6 @@ AVFramePtr Codec::GetCurrentFrame() { if (av_frames.empty()) { return AVFramePtr{nullptr, AVFrameDeleter}; } - AVFramePtr frame = std::move(av_frames.front()); av_frames.pop(); return frame; @@ -144,6 +221,5 @@ std::string_view Codec::GetCurrentCodecName() const { default: return "Unknown"; } -}; - +} } // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h index 96c823c76..71936203f 100644 --- a/src/video_core/command_classes/codecs/codec.h +++ b/src/video_core/command_classes/codecs/codec.h @@ -22,7 +22,6 @@ extern "C" { namespace Tegra { class GPU; -struct VicRegisters; void AVFrameDeleter(AVFrame* ptr); using AVFramePtr = std::unique_ptr; @@ -55,10 +54,13 @@ public: [[nodiscard]] std::string_view GetCurrentCodecName() const; private: + void InitializeHwdec(); + bool initialized{}; NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; AVCodec* av_codec{nullptr}; + AVBufferRef* av_hw_device{nullptr}; AVCodecContext* av_codec_ctx{nullptr}; GPU& gpu; diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index ffb7c82a1..d5e77941c 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp @@ -46,11 +46,8 @@ void Vic::ProcessMethod(Method method, u32 argument) { case Method::SetOutputSurfaceLumaOffset: output_surface_luma_address = arg; break; - case Method::SetOutputSurfaceChromaUOffset: - output_surface_chroma_u_address = arg; - break; - case Method::SetOutputSurfaceChromaVOffset: - output_surface_chroma_v_address = arg; + case Method::SetOutputSurfaceChromaOffset: + output_surface_chroma_address = arg; break; default: break; @@ -65,11 +62,10 @@ void Vic::Execute() { const VicConfig config{gpu.MemoryManager().Read(config_struct_address + 0x20)}; const AVFramePtr frame_ptr = nvdec_processor->GetFrame(); const auto* frame = frame_ptr.get(); - if (!frame || frame->width == 0 || frame->height == 0) { + if (!frame) { return; } - const VideoPixelFormat pixel_format = - static_cast(config.pixel_format.Value()); + const auto pixel_format = static_cast(config.pixel_format.Value()); switch (pixel_format) { case VideoPixelFormat::BGRA8: case VideoPixelFormat::RGBA8: { @@ -83,16 +79,18 @@ void Vic::Execute() { sws_freeContext(scaler_ctx); scaler_ctx = nullptr; - // FFmpeg returns all frames in YUV420, convert it into expected format - scaler_ctx = - sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width, - frame->height, target_format, 0, nullptr, nullptr, nullptr); + // Frames are decoded into either YUV420 or NV12 formats. Convert to desired format + scaler_ctx = sws_getContext(frame->width, frame->height, + static_cast(frame->format), frame->width, + frame->height, target_format, 0, nullptr, nullptr, nullptr); scaler_width = frame->width; scaler_height = frame->height; } // Get Converted frame - const std::size_t linear_size = frame->width * frame->height * 4; + const u32 width = static_cast(frame->width); + const u32 height = static_cast(frame->height); + const std::size_t linear_size = width * height * 4; // Only allocate frame_buffer once per stream, as the size is not expected to change if (!converted_frame_buffer) { @@ -109,11 +107,10 @@ void Vic::Execute() { if (blk_kind != 0) { // swizzle pitch linear to block linear const u32 block_height = static_cast(config.block_linear_height_log2); - const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, - block_height, 0); + const auto size = + Tegra::Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); luma_buffer.resize(size); - Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4, - frame->width, 4, luma_buffer.data(), + Tegra::Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(), converted_frame_buffer.get(), block_height, 0, 0); gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); @@ -131,41 +128,65 @@ void Vic::Execute() { const std::size_t surface_height = config.surface_height_minus1 + 1; const auto frame_width = std::min(surface_width, static_cast(frame->width)); const auto frame_height = std::min(surface_height, static_cast(frame->height)); - const std::size_t half_width = frame_width / 2; - const std::size_t half_height = frame_height / 2; - const std::size_t aligned_width = (surface_width + 0xff) & ~0xff; + const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; - const auto* luma_ptr = frame->data[0]; - const auto* chroma_b_ptr = frame->data[1]; - const auto* chroma_r_ptr = frame->data[2]; const auto stride = static_cast(frame->linesize[0]); - const auto half_stride = static_cast(frame->linesize[1]); luma_buffer.resize(aligned_width * surface_height); chroma_buffer.resize(aligned_width * surface_height / 2); // Populate luma buffer + const u8* luma_src = frame->data[0]; for (std::size_t y = 0; y < frame_height; ++y) { const std::size_t src = y * stride; const std::size_t dst = y * aligned_width; for (std::size_t x = 0; x < frame_width; ++x) { - luma_buffer[dst + x] = luma_ptr[src + x]; + luma_buffer[dst + x] = luma_src[src + x]; } } gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), luma_buffer.size()); - // Populate chroma buffer from both channels with interleaving. - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * half_stride; - const std::size_t dst = y * aligned_width; + // Chroma + const std::size_t half_height = frame_height / 2; + const auto half_stride = static_cast(frame->linesize[1]); - for (std::size_t x = 0; x < half_width; ++x) { - chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; - chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x]; + switch (frame->format) { + case AV_PIX_FMT_YUV420P: { + // Frame from FFmpeg software + // Populate chroma buffer from both channels with interleaving. + const std::size_t half_width = frame_width / 2; + const u8* chroma_b_src = frame->data[1]; + const u8* chroma_r_src = frame->data[2]; + for (std::size_t y = 0; y < half_height; ++y) { + const std::size_t src = y * half_stride; + const std::size_t dst = y * aligned_width; + + for (std::size_t x = 0; x < half_width; ++x) { + chroma_buffer[dst + x * 2] = chroma_b_src[src + x]; + chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x]; + } + } + break; + } + case AV_PIX_FMT_NV12: { + // Frame from VA-API hardware + // This is already interleaved so just copy + const u8* chroma_src = frame->data[1]; + for (std::size_t y = 0; y < half_height; ++y) { + const std::size_t src = y * stride; + const std::size_t dst = y * aligned_width; + for (std::size_t x = 0; x < frame_width; ++x) { + chroma_buffer[dst + x] = chroma_src[src + x]; + } } + break; + } + default: + UNREACHABLE(); + break; } - gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(), + gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), chroma_buffer.size()); break; } diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h index f5a2ed100..74246e08c 100644 --- a/src/video_core/command_classes/vic.h +++ b/src/video_core/command_classes/vic.h @@ -22,8 +22,8 @@ public: SetControlParams = 0x1c1, SetConfigStructOffset = 0x1c2, SetOutputSurfaceLumaOffset = 0x1c8, - SetOutputSurfaceChromaUOffset = 0x1c9, - SetOutputSurfaceChromaVOffset = 0x1ca + SetOutputSurfaceChromaOffset = 0x1c9, + SetOutputSurfaceChromaUnusedOffset = 0x1ca }; explicit Vic(GPU& gpu, std::shared_ptr nvdec_processor); @@ -64,8 +64,7 @@ private: GPUVAddr config_struct_address{}; GPUVAddr output_surface_luma_address{}; - GPUVAddr output_surface_chroma_u_address{}; - GPUVAddr output_surface_chroma_v_address{}; + GPUVAddr output_surface_chroma_address{}; SwsContext* scaler_ctx{}; s32 scaler_width{}; -- cgit v1.2.3