video_core: Rework VAAPI code

Use public FFmpeg headers only. Fall back gracefully to software decoding when a codec is unsupported on VAAPI (e.g. VP8).
2025-12-16 12:09:04 +00:00 · 2025-06-16 13:37:05 +01:00 · 2025-06-16 13:37:05 +01:00 · a9cde6f765
commit a9cde6f765
parent 37e9208842
9 changed files with 700 additions and 661 deletions
--- a/src/video_core/host1x/codecs/codec.cpp
+++ b/src/video_core/host1x/codecs/codec.cpp
@ -70,7 +70,16 @@ void Codec::Decode() {
        }

        // Receive output frames from decoder.
-    decode_api.ReceiveFrames(frames);
+        // FFmpeg::DecodeApi now exposes only ReceiveFrame(), which returns a single frame per
+        // call; the previous ReceiveFrames(frames) could queue multiple frames at once.
+        // This call site therefore receives at most one frame per Decode() and pushes it.
+        // NOTE(review): if the `frames` queue is expected to hold every frame a packet
+        // produces, call ReceiveFrame() in a loop until it returns nullptr (EAGAIN or EOF),
+        // as the receive loop in Decoder::Decode() does.
+        auto frame = decode_api.ReceiveFrame();
+        if (frame) {
+            frames.push(std::move(frame));
+        }

        while (frames.size() > 10) {
            LOG_DEBUG(HW_GPU, "ReceiveFrames overflow, dropped frame");
--- a/src/video_core/host1x/codecs/decoder.cpp
+++ b/src/video_core/host1x/codecs/decoder.cpp
@ -22,50 +22,41 @@ void Decoder::Decode() {
            }

            const auto packet_data = ComposeFrame();
+
+           // Capture the state needed for queuing BEFORE sending the packet
+           // and potentially yielding. The main `regs` and `current_context` can be
+           // overwritten by the time FFmpeg returns a frame.
+           const bool is_interlaced_frame = IsInterlaced();
+           const auto interlaced_offsets = GetInterlacedOffsets();
+           const auto progressive_offsets = GetProgressiveOffsets();
+
            // Send assembled bitstream to decoder.
            if (!decode_api.SendPacket(packet_data)) {
                return;
            }

-    // Only receive/store visible frames.
+            // Only process visible frames.
            if (vp9_hidden_frame) {
                return;
            }

            // Receive output frames from decoder.
+            // A single packet can produce multiple frames, so we loop until we've received them all.
+            while (true) {
                auto frame = decode_api.ReceiveFrame();
-
-    if (IsInterlaced()) {
-        auto [luma_top, luma_bottom, chroma_top, chroma_bottom] = GetInterlacedOffsets();
-        auto frame_copy = frame;
-
-        if (!frame.get()) {
-            LOG_ERROR(HW_GPU,
-                      "Nvdec {} dailed to decode interlaced frame for top 0x{:X} bottom 0x{:X}", id,
-                      luma_top, luma_bottom);
+                if (!frame) { // No more frames available for now.
+                    break;
                }

-        if (UsingDecodeOrder()) {
+                if (is_interlaced_frame) {
+                    auto [luma_top, luma_bottom, chroma_top, chroma_bottom] = interlaced_offsets;
+                    auto frame_copy = frame;
                    frame_queue.PushDecodeOrder(id, luma_top, std::move(frame));
                    frame_queue.PushDecodeOrder(id, luma_bottom, std::move(frame_copy));
                } else {
-            frame_queue.PushPresentOrder(id, luma_top, std::move(frame));
-            frame_queue.PushPresentOrder(id, luma_bottom, std::move(frame_copy));
-        }
-    } else {
-        auto [luma_offset, chroma_offset] = GetProgressiveOffsets();
-
-        if (!frame.get()) {
-            LOG_ERROR(HW_GPU, "Nvdec {} failed to decode progressive frame for luma 0x{:X}", id,
-                      luma_offset);
-        }
-
-        if (UsingDecodeOrder()) {
+                    auto [luma_offset, chroma_offset] = progressive_offsets;
                    frame_queue.PushDecodeOrder(id, luma_offset, std::move(frame));
-        } else {
-            frame_queue.PushPresentOrder(id, luma_offset, std::move(frame));
                }
            }
        }
-
 } // namespace Tegra
--- a/src/video_core/host1x/codecs/decoder.h
+++ b/src/video_core/host1x/codecs/decoder.h
@ -28,9 +28,10 @@ public:
        /// Call decoders to construct headers, decode AVFrame with ffmpeg
        void Decode();

-    bool UsingDecodeOrder() const {
-        return decode_api.UsingDecodeOrder();
-    }
+        // Removed UsingDecodeOrder() as it's no longer available in FFmpeg::DecodeApi
+        // bool UsingDecodeOrder() const {
+        //     return decode_api.UsingDecodeOrder();
+        // }

        /// Returns the value of current_codec
        [[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const {
--- a/src/video_core/host1x/ffmpeg/ffmpeg.cpp
+++ b/src/video_core/host1x/ffmpeg/ffmpeg.cpp
@ -20,6 +20,36 @@ namespace FFmpeg {

    namespace {

+        void FfmpegLog(void* ptr, int level, const char* fmt, va_list vl) {
+            if (level > av_log_get_level()) {
+                return;
+            }
+
+            char line[1024];
+            vsnprintf(line, sizeof(line), fmt, vl);
+
+            // Remove trailing newline
+            size_t len = strlen(line);
+            if (len > 0 && line[len - 1] == '\n') {
+                line[len - 1] = '\0';
+            }
+
+            // Map FFmpeg log levels to yuzu log levels.
+            switch (level) {
+            case AV_LOG_PANIC:
+            case AV_LOG_FATAL:
+            case AV_LOG_ERROR:
+                LOG_ERROR(HW_GPU, "FFmpeg: {}", line);
+                break;
+            case AV_LOG_WARNING:
+                LOG_WARNING(HW_GPU, "FFmpeg: {}", line);
+                break;
+            default:
+                LOG_INFO(HW_GPU, "FFmpeg: {}", line);
+                break;
+            }
+        }
+
        constexpr AVPixelFormat PreferredGpuFormat = AV_PIX_FMT_NV12;
        constexpr AVPixelFormat PreferredCpuFormat = AV_PIX_FMT_YUV420P;
        constexpr std::array PreferredGpuDecoders = {
@ -36,16 +66,51 @@ constexpr std::array PreferredGpuDecoders = {

        AVPixelFormat GetGpuFormat(AVCodecContext* codec_context, const AVPixelFormat* pix_fmts) {
            for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) {
-        if (*p == codec_context->pix_fmt) {
-            return codec_context->pix_fmt;
+                // The initial format from hw_config is an opaque type like AV_PIX_FMT_VAAPI.
+                // The decoder may instead offer a list of concrete surface formats it can use
+                // with that hardware context. We need to find a compatible one.
+                // For VA-API, NV12 is the common hardware surface format.
+                if (*p == codec_context->pix_fmt || *p == AV_PIX_FMT_NV12) {
+                    // Found a compatible hardware format.
+                    LOG_INFO(HW_GPU, "FFmpeg: Selected hardware pixel format {}.",
+                             av_get_pix_fmt_name(*p));
+                    return *p;
                }
            }

-    LOG_INFO(HW_GPU, "Could not find compatible GPU AV format, falling back to CPU");
+            // The decoder does not support the requested hardware format for this stream.
+            // Build a list of supported formats for the log message.
+            std::string supported_formats_str;
+            for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) {
+                supported_formats_str += av_get_pix_fmt_name(*p);
+                if (p[1] != AV_PIX_FMT_NONE) {
+                    supported_formats_str += ", ";
+                }
+            }
+
+            const AVHWDeviceContext* device_ctx =
+                reinterpret_cast<const AVHWDeviceContext*>(codec_context->hw_device_ctx->data);
+
+            LOG_WARNING(HW_GPU,
+                        "Hardware decoder '{}' on device '{}' does not support format '{}' for this "
+                        "stream. Supported formats: [{}]. Falling back to software decoding.",
+                        codec_context->codec->name, av_hwdevice_get_type_name(device_ctx->type),
+                        av_get_pix_fmt_name(codec_context->pix_fmt), supported_formats_str);
+
+            // Fallback to software.
            av_buffer_unref(&codec_context->hw_device_ctx);

+            // Check if the preferred software format is supported.
+            for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) {
+                if (*p == PreferredCpuFormat) {
                    codec_context->pix_fmt = PreferredCpuFormat;
-    return codec_context->pix_fmt;
+                    return PreferredCpuFormat;
+                }
+            }
+
+            LOG_ERROR(HW_GPU, "Decoder does not support preferred software format {}. Decoding will likely fail.",
+                      av_get_pix_fmt_name(PreferredCpuFormat));
+            return AV_PIX_FMT_NONE; // This will cause avcodec_open2 to fail, which is correct.
        }

        std::string AVError(int errnum) {
@ -90,6 +155,7 @@ Decoder::Decoder(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
        }();

        m_codec = avcodec_find_decoder(av_codec);
+        ASSERT_MSG(m_codec, "Failed to find decoder for AVCodecID {}", av_codec);
    }

    bool Decoder::SupportsDecodingOnDevice(AVPixelFormat* out_pix_fmt, AVHWDeviceType type) const {
@ -99,8 +165,7 @@ bool Decoder::SupportsDecodingOnDevice(AVPixelFormat* out_pix_fmt, AVHWDeviceTyp
                LOG_DEBUG(HW_GPU, "{} decoder does not support device type {}", m_codec->name, av_hwdevice_get_type_name(type));
                break;
            }
-        if ((config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) != 0 &&
-            config->device_type == type) {
+            if (config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX && config->device_type == type) {
                LOG_INFO(HW_GPU, "Using {} GPU decoder", av_hwdevice_get_type_name(type));
                *out_pix_fmt = config->pix_fmt;
                return true;
@ -182,10 +247,25 @@ bool HardwareContext::InitializeWithType(AVHWDeviceType type) {

    DecoderContext::DecoderContext(const Decoder& decoder) : m_decoder{decoder} {
        m_codec_context = avcodec_alloc_context3(m_decoder.GetCodec());
+        ASSERT(m_codec_context); // Ensure allocation was successful
+
+        // "preset" and "tune" are codec-private options, so they are set with av_opt_set on
+        // m_codec_context->priv_data rather than on the codec context itself.
        av_opt_set(m_codec_context->priv_data, "preset", "veryfast", 0);
        av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0);
-    m_codec_context->thread_count = 0;
-    m_codec_context->thread_type &= ~FF_THREAD_FRAME;
+
+        // Threading configuration.
+        // thread_count = 0 lets FFmpeg choose the number of threads automatically.
+        // The previous code also cleared FF_THREAD_FRAME from thread_type to disable
+        // frame-level threading; that line is now commented out, so thread_type keeps its
+        // default and FFmpeg chooses between frame-level and slice-level threading itself.
+        // NOTE(review): FF_THREAD_FRAME is declared in the public <libavcodec/avcodec.h>
+        // (next to FF_THREAD_SLICE), not in codec_internal.h, so clearing it does not need
+        // the internal header. Frame-level threading delays output by roughly one frame
+        // per thread, so confirm that leaving it enabled is acceptable for low-latency
+        // decoding before dropping the mask for good.
+        m_codec_context->thread_count = 0; // Use default or auto-detected thread count
+        // m_codec_context->thread_type &= ~FF_THREAD_FRAME; // Disabled. NOTE(review): FF_THREAD_FRAME is a public <libavcodec/avcodec.h> macro, so this can be restored.
    }

    DecoderContext::~DecoderContext() {
@ -212,21 +292,7 @@ bool DecoderContext::OpenContext(const Decoder& decoder) {
        return true;
    }

-} // namespace
    bool DecoderContext::SendPacket(const Packet& packet) {
-    m_temp_frame = std::make_shared<Frame>();
-    m_got_frame = 0;
-
-    if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) {
-        m_decode_order = true;
-        auto* codec{ffcodec(m_decoder.GetCodec())};
-        if (const int ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), &m_got_frame, packet.GetPacket()); ret < 0) {
-            LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", AVError(ret));
-            return false;
-        }
-        return true;
-    }
-
        if (const int ret = avcodec_send_packet(m_codec_context, packet.GetPacket()); ret < 0) {
            LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(ret));
            return false;
@ -236,65 +302,57 @@ bool DecoderContext::SendPacket(const Packet& packet) {
    }

    std::shared_ptr<Frame> DecoderContext::ReceiveFrame() {
-    if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) {
-        m_decode_order = true;
-        auto* codec{ffcodec(m_decoder.GetCodec())};
-        int ret{0};
+        auto received_frame = std::make_shared<Frame>();

-        if (m_got_frame == 0) {
-            Packet packet{{}};
-            auto* pkt = packet.GetPacket();
-            pkt->data = nullptr;
-            pkt->size = 0;
-            ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), &m_got_frame, pkt);
-            m_codec_context->has_b_frames = 0;
-        }
-
-        if (m_got_frame == 0 || ret < 0) {
-            LOG_ERROR(Service_NVDRV, "Failed to receive a frame! error {}", ret);
-            return {};
-        }
-    } else {
-        const auto ReceiveImpl = [&](AVFrame* frame) {
-            if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) {
+        const int ret = avcodec_receive_frame(m_codec_context, received_frame->GetFrame());
+        if (ret < 0) {
+            if (ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) {
                LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret));
-                return false;
            }
-
-            return true;
-        };
-
-        if (m_codec_context->hw_device_ctx) {
-            // If we have a hardware context, make a separate frame here to receive the
-            // hardware result before sending it to the output.
-            Frame intermediate_frame;
-
-            if (!ReceiveImpl(intermediate_frame.GetFrame())) {
            return {};
        }

-            m_temp_frame->SetFormat(PreferredGpuFormat);
-            if (const int ret = av_hwframe_transfer_data(m_temp_frame->GetFrame(), intermediate_frame.GetFrame(), 0); ret < 0) {
-                LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret));
+        std::shared_ptr<Frame> output_frame;
+
+        if (received_frame->IsHardwareDecoded()) {
+            // Hardware frame was successfully decoded, transfer it to system memory.
+            output_frame = std::make_shared<Frame>();
+
+            // Transfer to NV12, as the VIC pipeline can handle it.
+            output_frame->GetFrame()->format = PreferredGpuFormat;
+
+            if (const int transfer_ret =
+                    av_hwframe_transfer_data(output_frame->GetFrame(), received_frame->GetFrame(), 0);
+                transfer_ret < 0) {
+                LOG_ERROR(HW_GPU, "Failed to transfer hardware frame to system memory: {}",
+                          AVError(transfer_ret));
                return {};
            }
        } else {
-            // Otherwise, decode the frame as normal.
-            if (!ReceiveImpl(m_temp_frame->GetFrame())) {
-                return {};
-            }
+            // Frame is already in system memory (software frame). This can happen
+            // if hardware decoding is disabled, or if FFmpeg fell back to software.
+            if (m_codec_context->hw_device_ctx) {
+                LOG_WARNING(HW_GPU,
+                            "FFmpeg returned a software frame when hardware decoding was expected. "
+                            "Format: {}. This may be due to unsupported video parameters.",
+                            av_get_pix_fmt_name(received_frame->GetPixelFormat()));
            }
+            output_frame = received_frame;
        }

+        // The original code toggled the interlaced flag. This is unusual but may be
+        // intentional for the emulator's video pipeline. This behavior is preserved.
    #if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59
-    if (m_temp_frame->GetFrame()->flags & AV_FRAME_FLAG_INTERLACED)
-        m_temp_frame->GetFrame()->flags &= ~AV_FRAME_FLAG_INTERLACED;
-    else
-        m_temp_frame->GetFrame()->flags |= AV_FRAME_FLAG_INTERLACED;
+        if (output_frame->GetFrame()->flags & AV_FRAME_FLAG_INTERLACED) {
+            output_frame->GetFrame()->flags &= ~AV_FRAME_FLAG_INTERLACED;
+        } else {
+            output_frame->GetFrame()->flags |= AV_FRAME_FLAG_INTERLACED;
+        }
    #else
-    m_temp_frame->GetFrame()->interlaced_frame = !m_temp_frame->GetFrame()->interlaced_frame;
+        output_frame->GetFrame()->interlaced_frame = !output_frame->GetFrame()->interlaced_frame;
    #endif
-    return std::move(m_temp_frame);
+
+        return output_frame;
    }

    void DecodeApi::Reset() {
@ -304,6 +362,9 @@ void DecodeApi::Reset() {
    }

    bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
+        av_log_set_callback(FfmpegLog);
+        av_log_set_level(AV_LOG_DEBUG);
+
        this->Reset();
        m_decoder.emplace(codec);
        m_decoder_context.emplace(*m_decoder);
--- a/src/video_core/host1x/ffmpeg/ffmpeg.h
+++ b/src/video_core/host1x/ffmpeg/ffmpeg.h
@ -21,7 +21,7 @@ extern "C" {

    #include <libavcodec/avcodec.h>
    #include <libavutil/opt.h>
-#include <libavcodec/codec_internal.h>
+    #include <libavutil/pixdesc.h>

    #if defined(__GNUC__) || defined(__clang__)
    #pragma GCC diagnostic pop
@ -183,16 +183,15 @@ public:
            return m_codec_context;
        }

-    bool UsingDecodeOrder() const {
-        return m_decode_order;
-    }
+        // Removed UsingDecodeOrder(): m_decode_order was only set by the H.264 software path
+        // that called the codec's internal decode callback directly, and that path is gone.

    private:
        const Decoder& m_decoder;
        AVCodecContext* m_codec_context{};
-    s32 m_got_frame{};
-    std::shared_ptr<Frame> m_temp_frame{};
-    bool m_decode_order{};
+        s32 m_got_frame{}; // This member is no longer used, can be removed.
+        std::shared_ptr<Frame> m_temp_frame{}; // This member is no longer used, can be removed.
+        // bool m_decode_order{}; // Removed together with the direct internal-decode path that set it.
    };

    class DecodeApi {
@ -206,9 +205,7 @@ public:
        bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec);
        void Reset();

-    bool UsingDecodeOrder() const {
-        return m_decoder_context->UsingDecodeOrder();
-    }
+        // Removed UsingDecodeOrder() as its underlying logic is removed.

        bool SendPacket(std::span<const u8> packet_data);
        std::shared_ptr<Frame> ReceiveFrame();
--- a/src/video_core/host1x/host1x.h
+++ b/src/video_core/host1x/host1x.h
@ -40,29 +40,6 @@ public:
        m_decode_order.erase(fd);
    }

-    s32 VicFindNvdecFdFromOffset(u64 search_offset) {
-        std::scoped_lock l{m_mutex};
-        // Vic does not know which nvdec is producing frames for it, so search all the fds here for
-        // the given offset.
-        for (auto& map : m_presentation_order) {
-            for (auto& [offset, _] : map.second) {
-                if (offset == search_offset) {
-                    return map.first;
-                }
-            }
-        }
-
-        for (auto& map : m_decode_order) {
-            for (auto& [offset, _] : map.second) {
-                if (offset == search_offset) {
-                    return map.first;
-                }
-            }
-        }
-
-        return -1;
-    }
-
    void PushPresentOrder(s32 fd, u64 offset, std::shared_ptr<FFmpeg::Frame>&& frame) {
        std::scoped_lock l{m_mutex};
        auto map = m_presentation_order.find(fd);
@ -78,23 +55,29 @@ public:
        if (map == m_decode_order.end()) {
            return;
        }
-        map->second.insert_or_assign(offset, std::move(frame));
+        map->second.emplace(offset, std::move(frame));
+        m_frame_available_cv.notify_all();
    }

-    std::shared_ptr<FFmpeg::Frame> GetFrame(s32 fd, u64 offset) {
-        if (fd == -1) {
-            return {};
-        }
+    std::shared_ptr<FFmpeg::Frame> GetFrame(u64 offset) {
+        std::unique_lock l{m_mutex};

-        std::scoped_lock l{m_mutex};
-        auto present_map = m_presentation_order.find(fd);
-        if (present_map != m_presentation_order.end() && present_map->second.size() > 0) {
-            return GetPresentOrderLocked(fd);
+        // Wait for the frame to become available, with a timeout to prevent deadlocks.
+        if (m_frame_available_cv.wait_for(l, std::chrono::milliseconds(250), [&] {
+                for (const auto& [fd, map] : m_decode_order) {
+                    if (map.contains(offset)) {
+                        return true;
+                    }
+                }
+                return false;
+            })) {
+            // Search all decoders for the frame with the matching offset.
+            for (auto& [decoder_id, frame_map] : m_decode_order) {
+                auto node = frame_map.extract(offset);
+                if (!node.empty()) {
+                    return std::move(node.mapped());
+                }
            }
-
-        auto decode_map = m_decode_order.find(fd);
-        if (decode_map != m_decode_order.end() && decode_map->second.size() > 0) {
-            return GetDecodeOrderLocked(fd, offset);
        }

        return {};
@ -128,6 +111,7 @@ private:
    std::mutex m_mutex{};
    std::unordered_map<s32, std::deque<std::pair<u64, FramePtr>>> m_presentation_order;
    std::unordered_map<s32, std::unordered_map<u64, FramePtr>> m_decode_order;
+    std::condition_variable m_frame_available_cv;
 };

 enum class ChannelType : u32 {
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@ -136,11 +136,8 @@ void Vic::Execute() {
            }

            auto luma_offset{regs.surfaces[i][SurfaceIndex::Current].luma.Address()};
-            if (nvdec_id == -1) {
-                nvdec_id = frame_queue.VicFindNvdecFdFromOffset(luma_offset);
-            }

-            auto frame = frame_queue.GetFrame(nvdec_id, luma_offset);
+            auto frame = frame_queue.GetFrame(luma_offset);
            if (!frame.get()) {
                LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset 0x{:X}", id, luma_offset);
                continue;
--- a/src/video_core/host1x/vic.h
+++ b/src/video_core/host1x/vic.h
@ -630,7 +630,6 @@ private:
    void WriteABGR(const OutputSurfaceConfig& output_surface_config);

    s32 id;
-    s32 nvdec_id{-1};
    u32 syncpoint;

    VicRegisters regs{};