[SOLVED] Error encoding H.264 video with libav from frame grabber source

I have been struggling with this issue for days. I have spent hours on internet searches, ChatGPT sessions, code reviews, etc., but I cannot get it to work as expected.
Base: Windows 11 Pro x64, own LGPL build of libav with libmfx, x64 C++ VC program.
The situation is as follows: I receive frames from a frame grabber card (a YUAN SDI 2K in the sample below) and want to encode them with the Intel Quick Sync (QSV) H.264 hardware encoder. This works fine so far. The primary processing steps are:
Receive frame from frame grabber card in YUV420P pixel format
Scale frame to NV12 format used by QSV encoder
Encode this frame with Intel QSV encoder
Write / mux these frames into mp4 file
This works so far, but I am struggling with the PTS and DTS timestamps. I tried many combinations of hints from ChatGPT, but I cannot get it working well enough to produce a valid video. I get errors that PTS/DTS is not monotonically increasing or did not change at all, that PTS and DTS have completely different values, and so on. Once I got a video where the time base seemed to be fine and libav did not report any errors, but the order of the frames in the video was not correct.
Here is the current sample code (a prototype: no proper cleanup, no flushing, etc.):
#define __STDC_LIMIT_MACROS

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <Windows.h>

using namespace _DSHOWLIB_NAMESPACE;

#ifdef _WIN32
//Windows
extern "C"
{
#include "libavcodec/avcodec.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
#include "libavdevice/avdevice.h"
#include "libavutil/hwcontext_qsv.h"
};
#endif
#include <iostream>

void uSleep(double waitTimeInUs, LARGE_INTEGER frequency)
{
    LARGE_INTEGER startTime, currentTime;

    QueryPerformanceCounter(&startTime);

    if (waitTimeInUs > 16500.0)
        Sleep(1);

    do
    {
        YieldProcessor();
        //Sleep(0);
        QueryPerformanceCounter(&currentTime);
    } while (waitTimeInUs > (currentTime.QuadPart - startTime.QuadPart) * 1000000.0 / frequency.QuadPart);
}

// Terminates the program when a libav call reported a failure.
//
// ret  return code of a libav function; negative values are treated as errors.
//
// For a negative code the textual description obtained from av_strerror is
// written to std::cerr and the process exits with status 1. For any other
// value the function returns without doing anything.
void check_error(int ret)
{
    if (ret >= 0)
    {
        return;
    }

    // Zero-initialised so the buffer is a valid C string whatever av_strerror writes.
    char errbuf[128] = {};
    av_strerror(ret, errbuf, sizeof(errbuf));
    std::cerr << "Error: " << errbuf << '\n';
    exit(1);
}

// Run flag polled by the capture loop in main; cleared by consoleHandler.
// It is atomic because Windows invokes console control handlers on a thread of
// their own, so the write below and the reads in main happen concurrently.
std::atomic<bool> _isRunning{ true };

// Console control handler registered through SetConsoleCtrlHandler.
//
// signal  control event code delivered by the console.
//
// On CTRL_C_EVENT the run flag is cleared so the capture loop can finish.
// TRUE is returned for every event, i.e. every event is reported as handled.
BOOL WINAPI consoleHandler(DWORD signal)
{
    if (signal == CTRL_C_EVENT)
    {
        _isRunning = false;
    }

    return TRUE;
}

int main(int argc, char* argv[])
{
    if (!SetConsoleCtrlHandler(consoleHandler, TRUE)) 
    {
        std::cerr << "Could not set control handler!" << '\n';
        return 1;
    }

    unsigned int videoIndex = 0;
    
    avdevice_register_all();

    av_log_set_level(AV_LOG_TRACE);

    const AVInputFormat * pFrameGrabberInputFormat = av_find_input_format("dshow");

    constexpr int frameGrabberPixelWidth = 1920;
    constexpr int frameGrabberPixelHeight = 1080;
    constexpr int frameGrabberFrameRate = 25;

    char shortStringBuffer[32];

    AVDictionary* pFrameGrabberOptions = nullptr;

    _snprintf_s(shortStringBuffer, sizeof(shortStringBuffer), "%dx%d", frameGrabberPixelWidth, frameGrabberPixelHeight);
    av_dict_set(&pFrameGrabberOptions, "video_size", shortStringBuffer, 0);

    _snprintf_s(shortStringBuffer, sizeof(shortStringBuffer), "%d", frameGrabberFrameRate);

    av_dict_set(&pFrameGrabberOptions, "framerate", shortStringBuffer, 0);
    av_dict_set(&pFrameGrabberOptions, "pixel_format", "yuv420p", 0);

    AVFormatContext* pFrameGrabberFormatContext = avformat_alloc_context();

    pFrameGrabberFormatContext->flags = AVFMT_FLAG_NOBUFFER | AVFMT_FLAG_FLUSH_PACKETS;

    if(avformat_open_input(&pFrameGrabberFormatContext, "video=MZ0380 PCI, Analog 01 Capture", pFrameGrabberInputFormat, &pFrameGrabberOptions) != 0)
    {
        std::cerr << "Couldn't open input stream." << '\n';
        return -1;
    }

    if(avformat_find_stream_info(pFrameGrabberFormatContext, nullptr) < 0)
    {
        std::cerr << "Couldn't find stream information." << '\n';
        return -1;
    }

    bool foundVideoStream = false;

    for(unsigned int loop_videoIndex = 0; loop_videoIndex < pFrameGrabberFormatContext->nb_streams; loop_videoIndex++)
    {
        if(pFrameGrabberFormatContext->streams[loop_videoIndex]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
        {
            videoIndex = loop_videoIndex;
            foundVideoStream = true;
            break;
        }
    }

    if(!foundVideoStream)
    {
        std::cerr << "Couldn't find a video stream." << '\n';
        return -1;
    }

    const AVCodec* pFrameGrabberCodec = avcodec_find_decoder(pFrameGrabberFormatContext->streams[videoIndex]->codecpar->codec_id);

    AVCodecContext* pFrameGrabberCodecContext = avcodec_alloc_context3(pFrameGrabberCodec);

    if(pFrameGrabberCodec == nullptr)
    {
        std::cerr << "Codec not found." << '\n';
        return -1;
    }

    pFrameGrabberCodecContext->pix_fmt = AV_PIX_FMT_YUV420P;
    pFrameGrabberCodecContext->width = frameGrabberPixelWidth;
    pFrameGrabberCodecContext->height = frameGrabberPixelHeight;

    int ret = avcodec_open2(pFrameGrabberCodecContext, pFrameGrabberCodec, nullptr);

    if(ret < 0)
    {
        std::cerr << "Could not open pVideoCodec." << '\n';
        return -1;
    }

    const char* outputFilePath = "c:\\temp\\output.mp4";
    constexpr int outputWidth = frameGrabberPixelWidth;
    constexpr int outputHeight = frameGrabberPixelHeight;
    constexpr int outputFrameRate = frameGrabberFrameRate;

    SwsContext* img_convert_ctx = sws_getContext(frameGrabberPixelWidth, frameGrabberPixelHeight, AV_PIX_FMT_YUV420P, outputWidth, outputHeight, AV_PIX_FMT_NV12, SWS_BICUBIC, nullptr, nullptr, nullptr);

    constexpr double frameTimeinUs = 1000000.0 / frameGrabberFrameRate;

    LARGE_INTEGER frequency;
    LARGE_INTEGER lastTime, currentTime;

    QueryPerformanceFrequency(&frequency);
    QueryPerformanceCounter(&lastTime);

    const AVCodec* pVideoCodec = avcodec_find_encoder_by_name("h264_qsv");

    if (!pVideoCodec)
    {
        std::cerr << "Codec not found" << '\n';
        return 1;
    }

    AVCodecContext* pVideoCodecContext = avcodec_alloc_context3(pVideoCodec);

    if (!pVideoCodecContext)
    {
        std::cerr << "Could not allocate video pVideoCodec context" << '\n';
        return 1;
    }

    AVBufferRef* pHardwareDeviceContextRef = nullptr;

    ret = av_hwdevice_ctx_create(&pHardwareDeviceContextRef, AV_HWDEVICE_TYPE_QSV, nullptr, nullptr, 0);

    pVideoCodecContext->bit_rate = static_cast<int64_t>(outputWidth * outputHeight) * 2;
    pVideoCodecContext->width = outputWidth;
    pVideoCodecContext->height = outputHeight;
    pVideoCodecContext->framerate = { outputFrameRate, 1 };
    pVideoCodecContext->time_base = { 1, outputFrameRate };
    pVideoCodecContext->pix_fmt = AV_PIX_FMT_QSV;
    pVideoCodecContext->gop_size = 10;  
    pVideoCodecContext->max_b_frames = 2;

    AVBufferRef* pHardwareFramesContextRef = av_hwframe_ctx_alloc(pHardwareDeviceContextRef);

    AVHWFramesContext* pHardwareFramesContext = reinterpret_cast<AVHWFramesContext*>(pHardwareFramesContextRef->data);

    pHardwareFramesContext->format = AV_PIX_FMT_QSV;
    pHardwareFramesContext->sw_format = AV_PIX_FMT_NV12;
    pHardwareFramesContext->width = outputWidth;
    pHardwareFramesContext->height = outputHeight;
    pHardwareFramesContext->initial_pool_size = 32;

    ret = av_hwframe_ctx_init(pHardwareFramesContextRef);

    pVideoCodecContext->hw_frames_ctx = av_buffer_ref(pHardwareFramesContextRef);
    pVideoCodecContext->hw_device_ctx = av_buffer_ref(pHardwareDeviceContextRef);

    ret = avcodec_open2(pVideoCodecContext, pVideoCodec, nullptr);//&pVideoOptionsDict);
    check_error(ret);

    AVFormatContext* pVideoFormatContext = nullptr;

    avformat_alloc_output_context2(&pVideoFormatContext, nullptr, nullptr, outputFilePath);

    if (!pVideoFormatContext)
    {
        std::cerr << "Could not create output context" << '\n';
        return 1;
    }

    const AVOutputFormat* pVideoOutputFormat = pVideoFormatContext->oformat;

    if (pVideoFormatContext->oformat->flags & AVFMT_GLOBALHEADER)
    {
        pVideoCodecContext->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    }

    AVStream* pVideoStream = avformat_new_stream(pVideoFormatContext, pVideoCodec);

    if (!pVideoStream)
    {
        std::cerr << "Could not allocate stream" << '\n';
        return 1;
    }

    pVideoStream->time_base = pVideoCodecContext->time_base;

    ret = avcodec_parameters_from_context(pVideoStream->codecpar, pVideoCodecContext);

    check_error(ret);

    if (!(pVideoOutputFormat->flags & AVFMT_NOFILE)) 
    {
        ret = avio_open(&pVideoFormatContext->pb, outputFilePath, AVIO_FLAG_WRITE);
        check_error(ret);
    }

    pVideoFormatContext->flags |= AVFMT_FLAG_GENPTS & AVFMT_FLAG_IGNDTS;

    ret = avformat_write_header(pVideoFormatContext, nullptr);

    check_error(ret);

    AVFrame* pHardwareFrame = av_frame_alloc();

    if (av_hwframe_get_buffer(pVideoCodecContext->hw_frames_ctx, pHardwareFrame, 0) < 0)
    {
        std::cerr << "Error allocating a hw frame" << '\n';
        return -1;
    }

    AVFrame* pFrameGrabberFrame = av_frame_alloc();
    AVPacket* pFrameGrabberPacket = av_packet_alloc();

    AVPacket* pVideoPacket = av_packet_alloc();
    AVFrame* pVideoFrame = av_frame_alloc();

    while (_isRunning)
    {
        if (av_read_frame(pFrameGrabberFormatContext, pFrameGrabberPacket) == 0)
        {
            if (pFrameGrabberPacket->stream_index == videoIndex)
            {
                ret = avcodec_send_packet(pFrameGrabberCodecContext, pFrameGrabberPacket);

                if (ret < 0)
                {
                    std::cerr << "Error sending a packet for decoding!" << '\n';
                    return -1;
                }

                ret = avcodec_receive_frame(pFrameGrabberCodecContext, pFrameGrabberFrame);

                if (ret != 0)
                {
                    std::cerr << "Receiving frame failed!" << '\n';
                    return -1;
                }

                if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF))
                {
                    std::cout << "End of stream detected. Exiting now." << '\n';
                    return 0;
                }

                if (ret != 0)
                {
                    std::cerr << "Decode Error!" << '\n';
                    return -1;
                }

                QueryPerformanceCounter(&currentTime);

                const double elapsedTime = (currentTime.QuadPart - lastTime.QuadPart) * 1000000.0 / frequency.QuadPart;

                if (elapsedTime > 0.0 && elapsedTime < frameTimeinUs)
                {
                    uSleep(frameTimeinUs - elapsedTime, frequency);
                }

                ret = sws_scale_frame(img_convert_ctx, pVideoFrame, pFrameGrabberFrame);

                if (ret < 0)
                {
                    std::cerr << "Scaling frame for Intel QS Encoder did fail!" << '\n';
                    return -1;
                }

                if (av_hwframe_transfer_data(pHardwareFrame, pVideoFrame, 0) < 0) 
                {
                    std::cerr << "Error transferring frame data to hw frame!" << '\n';
                    return -1;
                }

                av_packet_unref(pVideoPacket);

                pHardwareFrame->pts = av_rescale_q(pHardwareFrame->pts, { 1, outputFrameRate }, pVideoCodecContext->time_base);

                ret = avcodec_send_frame(pVideoCodecContext, pHardwareFrame);

                if (ret < 0)
                {
                    std::cerr << "Error sending a frame for encoding" << '\n';
                    check_error(ret);
                }

                while (ret >= 0) 
                {
                    ret = avcodec_receive_packet(pVideoCodecContext, pVideoPacket);

                    if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
                    {
                        break;
                    }

                    if (ret < 0) 
                    {
                        std::cerr << "Error during encoding" << '\n';
                        return 1;
                    }

                    pVideoPacket->stream_index = 0;

                    av_packet_rescale_ts(pVideoPacket, pVideoCodecContext->time_base, pVideoFormatContext->streams[0]->time_base);

                    ret = av_interleaved_write_frame(pVideoFormatContext, pVideoPacket);

                    check_error(ret);

                    av_packet_unref(pVideoPacket);
                }

                av_packet_unref(pFrameGrabberPacket);

                QueryPerformanceCounter(&lastTime);
            }
        }
    }   

    av_write_trailer(pVideoFormatContext);
    av_buffer_unref(&pHardwareDeviceContextRef);
    avcodec_free_context(&pVideoCodecContext);
    avio_closep(&pVideoFormatContext->pb);
    avformat_free_context(pVideoFormatContext);
    av_packet_free(&pVideoPacket);

    avcodec_free_context(&pFrameGrabberCodecContext);
    av_frame_free(&pFrameGrabberFrame);
    av_packet_free(&pFrameGrabberPacket);
    avformat_close_input(&pFrameGrabberFormatContext);

    return 0;
}