I'm attempting to render video using using the Microsoft sample DX11VideoRenderer found at: https://github.com/Microsoft/Windows-classic-samples/tree/master/Samples/DX11VideoRenderer From my extensive research it seems that using DirectX 11 with hardware-acceleration is the most up-to-date method (least likely to be deprecated) and offers the best performance solution.
There are 2 similar functions within Presenter.cpp that process frames but I cannot figure out what the difference is between them. ProcessFrameUsingD3D11()
uses VideoProcessorBlt()
to actually do the render. The mystery is that ProcessFrameUsingXVP()
does not use this function so how does it actually do the render? Or is it doing something else entirely? Also it appears that my implementation is using ProcessFrameUsingXVP()
based in the value of the variable m_useXVP
which is by default set to '1'. Here is the code sample:
if (m_useXVP)
{
BOOL bInputFrameUsed = FALSE;
hr = ProcessFrameUsingXVP( pCurrentType, pSample, pTexture2D, rcDest, ppOutputSample, &bInputFrameUsed );
if (SUCCEEDED(hr) && !bInputFrameUsed)
{
*pbProcessAgain = TRUE;
}
}
else
{
hr = ProcessFrameUsingD3D11( pTexture2D, pEVTexture2D, dwViewIndex, dwEVViewIndex, rcDest, *punInterlaceMode, ppOutputSample );
LONGLONG hnsDuration = 0;
LONGLONG hnsTime = 0;
DWORD dwSampleFlags = 0;
if (ppOutputSample != NULL && *ppOutputSample != NULL)
{
if (SUCCEEDED(pSample->GetSampleDuration(&hnsDuration)))
{
(*ppOutputSample)->SetSampleDuration(hnsDuration);
}
if (SUCCEEDED(pSample->GetSampleTime(&hnsTime)))
{
(*ppOutputSample)->SetSampleTime(hnsTime);
}
if (SUCCEEDED(pSample->GetSampleFlags(&dwSampleFlags)))
{
(*ppOutputSample)->SetSampleFlags(dwSampleFlags);
}
}
}
The reason for setting m_useXVP
is also a mystery to me and I cannot find an answer in my research. It uses a registry key that does not exist on my particular Windows10 PC so the value is not modified.
const TCHAR* lpcszInVP = TEXT("XVP");
const TCHAR* lpcszREGKEY = TEXT("SOFTWARE\\Microsoft\\Scrunch\\CodecPack\\MSDVD");
if(0 == RegOpenKeyEx(HKEY_CURRENT_USER, lpcszREGKEY, 0, KEY_READ, &hk))
{
dwData = 0;
cbData = sizeof(DWORD);
if (0 == RegQueryValueEx(hk, lpcszInVP, 0, &cbType, (LPBYTE)&dwData, &cbData))
{
m_useXVP = dwData;
}
}
So since my PC does not have this key, the code is defaulting to using ProcessFrameUsingXVP()
. Here is the definition:
HRESULT DX11VideoRenderer::CPresenter::ProcessFrameUsingXVP(IMFMediaType* pCurrentType, IMFSample* pVideoFrame, ID3D11Texture2D* pTexture2D, RECT rcDest, IMFSample** ppVideoOutFrame, BOOL* pbInputFrameUsed)
{
HRESULT hr = S_OK;
ID3D11VideoContext* pVideoContext = NULL;
ID3D11Texture2D* pDXGIBackBuffer = NULL;
IMFSample* pRTSample = NULL;
IMFMediaBuffer* pBuffer = NULL;
IMFAttributes* pAttributes = NULL;
D3D11_VIDEO_PROCESSOR_CAPS vpCaps = { 0 };
do
{
if (!m_pDX11VideoDevice)
{
hr = m_pD3D11Device->QueryInterface(__uuidof(ID3D11VideoDevice), (void**)&m_pDX11VideoDevice);
if (FAILED(hr))
{
break;
}
}
hr = m_pD3DImmediateContext->QueryInterface(__uuidof(ID3D11VideoContext), (void**)&pVideoContext);
if (FAILED(hr))
{
break;
}
// remember the original rectangles
RECT TRectOld = m_rcDstApp;
RECT SRectOld = m_rcSrcApp;
UpdateRectangles(&TRectOld, &SRectOld);
//Update destination rect with current client rect
m_rcDstApp = rcDest;
D3D11_TEXTURE2D_DESC surfaceDesc;
pTexture2D->GetDesc(&surfaceDesc);
BOOL fTypeChanged = FALSE;
if (!m_pVideoProcessorEnum || !m_pSwapChain1 || m_imageWidthInPixels != surfaceDesc.Width || m_imageHeightInPixels != surfaceDesc.Height)
{
SafeRelease(m_pVideoProcessorEnum);
SafeRelease(m_pSwapChain1);
m_imageWidthInPixels = surfaceDesc.Width;
m_imageHeightInPixels = surfaceDesc.Height;
fTypeChanged = TRUE;
D3D11_VIDEO_PROCESSOR_CONTENT_DESC ContentDesc;
ZeroMemory(&ContentDesc, sizeof(ContentDesc));
ContentDesc.InputFrameFormat = D3D11_VIDEO_FRAME_FORMAT_INTERLACED_TOP_FIELD_FIRST;
ContentDesc.InputWidth = surfaceDesc.Width;
ContentDesc.InputHeight = surfaceDesc.Height;
ContentDesc.OutputWidth = surfaceDesc.Width;
ContentDesc.OutputHeight = surfaceDesc.Height;
ContentDesc.Usage = D3D11_VIDEO_USAGE_PLAYBACK_NORMAL;
hr = m_pDX11VideoDevice->CreateVideoProcessorEnumerator(&ContentDesc, &m_pVideoProcessorEnum);
if (FAILED(hr))
{
break;
}
m_rcSrcApp.left = 0;
m_rcSrcApp.top = 0;
m_rcSrcApp.right = m_uiRealDisplayWidth;
m_rcSrcApp.bottom = m_uiRealDisplayHeight;
if (m_b3DVideo)
{
hr = m_pVideoProcessorEnum->GetVideoProcessorCaps(&vpCaps);
if (FAILED(hr))
{
break;
}
if (vpCaps.FeatureCaps & D3D11_VIDEO_PROCESSOR_FEATURE_CAPS_STEREO)
{
m_bStereoEnabled = TRUE;
}
DXGI_MODE_DESC1 modeFilter = { 0 };
modeFilter.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
modeFilter.Width = surfaceDesc.Width;
modeFilter.Height = surfaceDesc.Height;
modeFilter.Stereo = m_bStereoEnabled;
DXGI_MODE_DESC1 matchedMode;
if (m_bFullScreenState)
{
hr = m_pDXGIOutput1->FindClosestMatchingMode1(&modeFilter, &matchedMode, m_pD3D11Device);
if (FAILED(hr))
{
break;
}
}
hr = m_pXVP->GetAttributes(&pAttributes);
if (FAILED(hr))
{
break;
}
hr = pAttributes->SetUINT32(MF_ENABLE_3DVIDEO_OUTPUT, (0 != m_vp3DOutput) ? MF3DVideoOutputType_Stereo : MF3DVideoOutputType_BaseView);
if (FAILED(hr))
{
break;
}
}
}
// now create the input and output media types - these need to reflect
// the src and destination rectangles that we have been given.
RECT TRect = m_rcDstApp;
RECT SRect = m_rcSrcApp;
UpdateRectangles(&TRect, &SRect);
const BOOL fDestRectChanged = !EqualRect(&TRect, &TRectOld);
const BOOL fSrcRectChanged = !EqualRect(&SRect, &SRectOld);
if (!m_pSwapChain1 || fDestRectChanged)
{
hr = UpdateDXGISwapChain();
if (FAILED(hr))
{
break;
}
}
if (fTypeChanged || fSrcRectChanged || fDestRectChanged)
{
// stop streaming to avoid multiple start\stop calls internally in XVP
hr = m_pXVP->ProcessMessage(MFT_MESSAGE_NOTIFY_END_STREAMING, 0);
if (FAILED(hr))
{
break;
}
if (fTypeChanged)
{
hr = SetXVPOutputMediaType(pCurrentType, DXGI_FORMAT_B8G8R8A8_UNORM);
if (FAILED(hr))
{
break;
}
}
if (fDestRectChanged)
{
hr = m_pXVPControl->SetDestinationRectangle(&m_rcDstApp);
if (FAILED(hr))
{
break;
}
}
if (fSrcRectChanged)
{
hr = m_pXVPControl->SetSourceRectangle(&SRect);
if (FAILED(hr))
{
break;
}
}
hr = m_pXVP->ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, 0);
if (FAILED(hr))
{
break;
}
}
m_bCanProcessNextSample = FALSE;
// Get Backbuffer
hr = m_pSwapChain1->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&pDXGIBackBuffer);
if (FAILED(hr))
{
break;
}
// create the output media sample
hr = MFCreateSample(&pRTSample);
if (FAILED(hr))
{
break;
}
hr = MFCreateDXGISurfaceBuffer(__uuidof(ID3D11Texture2D), pDXGIBackBuffer, 0, FALSE, &pBuffer);
if (FAILED(hr))
{
break;
}
hr = pRTSample->AddBuffer(pBuffer);
if (FAILED(hr))
{
break;
}
if (m_b3DVideo && 0 != m_vp3DOutput)
{
SafeRelease(pBuffer);
hr = MFCreateDXGISurfaceBuffer(__uuidof(ID3D11Texture2D), pDXGIBackBuffer, 1, FALSE, &pBuffer);
if (FAILED(hr))
{
break;
}
hr = pRTSample->AddBuffer(pBuffer);
if (FAILED(hr))
{
break;
}
}
DWORD dwStatus = 0;
MFT_OUTPUT_DATA_BUFFER outputDataBuffer = {};
outputDataBuffer.pSample = pRTSample;
hr = m_pXVP->ProcessOutput(0, 1, &outputDataBuffer, &dwStatus);
if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT)
{
//call process input on the MFT to deliver the YUV video sample
// and the call process output to extract of newly processed frame
hr = m_pXVP->ProcessInput(0, pVideoFrame, 0);
if (FAILED(hr))
{
break;
}
*pbInputFrameUsed = TRUE;
hr = m_pXVP->ProcessOutput(0, 1, &outputDataBuffer, &dwStatus);
if (FAILED(hr))
{
break;
}
}
else
{
*pbInputFrameUsed = FALSE;
}
if (ppVideoOutFrame != NULL)
{
*ppVideoOutFrame = pRTSample;
(*ppVideoOutFrame)->AddRef();
}
} while (FALSE);
SafeRelease(pAttributes);
SafeRelease(pBuffer);
SafeRelease(pRTSample);
SafeRelease(pDXGIBackBuffer);
SafeRelease(pVideoContext);
return hr;
}
And here is the definition of ProcessFrameUsingD3D11()
:
HRESULT DX11VideoRenderer::CPresenter::ProcessFrameUsingD3D11( ID3D11Texture2D* pLeftTexture2D, ID3D11Texture2D* pRightTexture2D, UINT dwLeftViewIndex, UINT dwRightViewIndex,
RECT rcDest, UINT32 unInterlaceMode, IMFSample** ppVideoOutFrame )
{
HRESULT hr = S_OK;
ID3D11VideoContext* pVideoContext = NULL;
ID3D11VideoProcessorInputView* pLeftInputView = NULL;
ID3D11VideoProcessorInputView* pRightInputView = NULL;
ID3D11VideoProcessorOutputView* pOutputView = NULL;
ID3D11Texture2D* pDXGIBackBuffer = NULL;
ID3D11RenderTargetView* pRTView = NULL;
IMFSample* pRTSample = NULL;
IMFMediaBuffer* pBuffer = NULL;
D3D11_VIDEO_PROCESSOR_CAPS vpCaps = {0};
LARGE_INTEGER lpcStart,lpcEnd;
do
{
if (!m_pDX11VideoDevice)
{
hr = m_pD3D11Device->QueryInterface(__uuidof(ID3D11VideoDevice), (void**)&m_pDX11VideoDevice);
if (FAILED(hr))
{
break;
}
}
hr = m_pD3DImmediateContext->QueryInterface(__uuidof( ID3D11VideoContext ), (void**)&pVideoContext);
if (FAILED(hr))
{
break;
}
// remember the original rectangles
RECT TRectOld = m_rcDstApp;
RECT SRectOld = m_rcSrcApp;
UpdateRectangles(&TRectOld, &SRectOld);
//Update destination rect with current client rect
m_rcDstApp = rcDest;
D3D11_TEXTURE2D_DESC surfaceDesc;
pLeftTexture2D->GetDesc(&surfaceDesc);
if (!m_pVideoProcessorEnum || !m_pVideoProcessor || m_imageWidthInPixels != surfaceDesc.Width || m_imageHeightInPixels != surfaceDesc.Height)
{
SafeRelease(m_pVideoProcessorEnum);
SafeRelease(m_pVideoProcessor);
m_imageWidthInPixels = surfaceDesc.Width;
m_imageHeightInPixels = surfaceDesc.Height;
D3D11_VIDEO_PROCESSOR_CONTENT_DESC ContentDesc;
ZeroMemory( &ContentDesc, sizeof( ContentDesc ) );
ContentDesc.InputFrameFormat = D3D11_VIDEO_FRAME_FORMAT_INTERLACED_TOP_FIELD_FIRST;
ContentDesc.InputWidth = surfaceDesc.Width;
ContentDesc.InputHeight = surfaceDesc.Height;
ContentDesc.OutputWidth = surfaceDesc.Width;
ContentDesc.OutputHeight = surfaceDesc.Height;
ContentDesc.Usage = D3D11_VIDEO_USAGE_PLAYBACK_NORMAL;
hr = m_pDX11VideoDevice->CreateVideoProcessorEnumerator(&ContentDesc, &m_pVideoProcessorEnum);
if (FAILED(hr))
{
break;
}
UINT uiFlags;
DXGI_FORMAT VP_Output_Format = DXGI_FORMAT_B8G8R8A8_UNORM;
hr = m_pVideoProcessorEnum->CheckVideoProcessorFormat(VP_Output_Format, &uiFlags);
if (FAILED(hr) || 0 == (uiFlags & D3D11_VIDEO_PROCESSOR_FORMAT_SUPPORT_OUTPUT))
{
hr = MF_E_UNSUPPORTED_D3D_TYPE;
break;
}
m_rcSrcApp.left = 0;
m_rcSrcApp.top = 0;
m_rcSrcApp.right = m_uiRealDisplayWidth;
m_rcSrcApp.bottom = m_uiRealDisplayHeight;
DWORD index;
hr = FindBOBProcessorIndex(&index); // GG This may not be needed. BOB is something to do with deinterlacing.
if (FAILED(hr))
{
break;
}
hr = m_pDX11VideoDevice->CreateVideoProcessor(m_pVideoProcessorEnum, index, &m_pVideoProcessor);
if (FAILED(hr))
{
break;
}
if (m_b3DVideo)
{
hr = m_pVideoProcessorEnum->GetVideoProcessorCaps(&vpCaps);
if (FAILED(hr))
{
break;
}
if (vpCaps.FeatureCaps & D3D11_VIDEO_PROCESSOR_FEATURE_CAPS_STEREO)
{
m_bStereoEnabled = TRUE;
}
DXGI_MODE_DESC1 modeFilter = { 0 };
modeFilter.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
modeFilter.Width = surfaceDesc.Width;
modeFilter.Height = surfaceDesc.Height;
modeFilter.Stereo = m_bStereoEnabled;
DXGI_MODE_DESC1 matchedMode;
if (m_bFullScreenState)
{
hr = m_pDXGIOutput1->FindClosestMatchingMode1(&modeFilter, &matchedMode, m_pD3D11Device);
if (FAILED(hr))
{
break;
}
}
}
}
// now create the input and output media types - these need to reflect
// the src and destination rectangles that we have been given.
RECT TRect = m_rcDstApp;
RECT SRect = m_rcSrcApp;
UpdateRectangles(&TRect, &SRect);
const BOOL fDestRectChanged = !EqualRect(&TRect, &TRectOld);
if (!m_pSwapChain1 || fDestRectChanged)
{
hr = UpdateDXGISwapChain();
if (FAILED(hr))
{
break;
}
}
m_bCanProcessNextSample = FALSE;
// Get Backbuffer
hr = m_pSwapChain1->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&pDXGIBackBuffer);
if (FAILED(hr))
{
break;
}
// create the output media sample
hr = MFCreateSample(&pRTSample);
if (FAILED(hr))
{
break;
}
hr = MFCreateDXGISurfaceBuffer(__uuidof(ID3D11Texture2D), pDXGIBackBuffer, 0, FALSE, &pBuffer);
if (FAILED(hr))
{
break;
}
hr = pRTSample->AddBuffer(pBuffer);
if (FAILED(hr))
{
break;
}
// GG For 3D - don't need.
if (m_b3DVideo && 0 != m_vp3DOutput)
{
SafeRelease(pBuffer);
hr = MFCreateDXGISurfaceBuffer(__uuidof(ID3D11Texture2D), pDXGIBackBuffer, 1, FALSE, &pBuffer);
if (FAILED(hr))
{
break;
}
hr = pRTSample->AddBuffer(pBuffer);
if (FAILED(hr))
{
break;
}
}
QueryPerformanceCounter(&lpcStart);
QueryPerformanceCounter(&lpcEnd);
//
// Create Output View of Output Surfaces.
//
D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC OutputViewDesc;
ZeroMemory( &OutputViewDesc, sizeof( OutputViewDesc ) );
if (m_b3DVideo && m_bStereoEnabled)
{
OutputViewDesc.ViewDimension = D3D11_VPOV_DIMENSION_TEXTURE2DARRAY;
}
else
{
OutputViewDesc.ViewDimension = D3D11_VPOV_DIMENSION_TEXTURE2D;
}
OutputViewDesc.Texture2D.MipSlice = 0;
OutputViewDesc.Texture2DArray.MipSlice = 0;
OutputViewDesc.Texture2DArray.FirstArraySlice = 0;
if (m_b3DVideo && 0 != m_vp3DOutput)
{
OutputViewDesc.Texture2DArray.ArraySize = 2; // STEREO
}
QueryPerformanceCounter(&lpcStart);
hr = m_pDX11VideoDevice->CreateVideoProcessorOutputView(pDXGIBackBuffer, m_pVideoProcessorEnum, &OutputViewDesc, &pOutputView);
if (FAILED(hr))
{
break;
}
D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC InputLeftViewDesc;
ZeroMemory( &InputLeftViewDesc, sizeof( InputLeftViewDesc ) );
InputLeftViewDesc.FourCC = 0;
InputLeftViewDesc.ViewDimension = D3D11_VPIV_DIMENSION_TEXTURE2D;
InputLeftViewDesc.Texture2D.MipSlice = 0;
InputLeftViewDesc.Texture2D.ArraySlice = dwLeftViewIndex;
hr = m_pDX11VideoDevice->CreateVideoProcessorInputView(pLeftTexture2D, m_pVideoProcessorEnum, &InputLeftViewDesc, &pLeftInputView);
if (FAILED(hr))
{
break;
}
if (m_b3DVideo && MFVideo3DSampleFormat_MultiView == m_vp3DOutput && pRightTexture2D)
{
D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC InputRightViewDesc;
ZeroMemory( &InputRightViewDesc, sizeof( InputRightViewDesc ) );
InputRightViewDesc.FourCC = 0;
InputRightViewDesc.ViewDimension = D3D11_VPIV_DIMENSION_TEXTURE2D;
InputRightViewDesc.Texture2D.MipSlice = 0;
InputRightViewDesc.Texture2D.ArraySlice = dwRightViewIndex;
hr = m_pDX11VideoDevice->CreateVideoProcessorInputView(pRightTexture2D, m_pVideoProcessorEnum, &InputRightViewDesc, &pRightInputView);
if (FAILED(hr))
{
break;
}
}
QueryPerformanceCounter(&lpcEnd);
QueryPerformanceCounter(&lpcStart);
SetVideoContextParameters(pVideoContext, &SRect, &TRect, unInterlaceMode);
// Enable/Disable Stereo
if (m_b3DVideo)
{
pVideoContext->VideoProcessorSetOutputStereoMode(m_pVideoProcessor, m_bStereoEnabled);
D3D11_VIDEO_PROCESSOR_STEREO_FORMAT vpStereoFormat = D3D11_VIDEO_PROCESSOR_STEREO_FORMAT_SEPARATE;
if (MFVideo3DSampleFormat_Packed_LeftRight == m_vp3DOutput)
{
vpStereoFormat = D3D11_VIDEO_PROCESSOR_STEREO_FORMAT_HORIZONTAL;
}
else if (MFVideo3DSampleFormat_Packed_TopBottom == m_vp3DOutput)
{
vpStereoFormat = D3D11_VIDEO_PROCESSOR_STEREO_FORMAT_VERTICAL;
}
pVideoContext->VideoProcessorSetStreamStereoFormat(m_pVideoProcessor,
0, m_bStereoEnabled, vpStereoFormat, TRUE, TRUE, D3D11_VIDEO_PROCESSOR_STEREO_FLIP_NONE, 0);
}
QueryPerformanceCounter(&lpcEnd);
QueryPerformanceCounter(&lpcStart);
D3D11_VIDEO_PROCESSOR_STREAM StreamData;
ZeroMemory( &StreamData, sizeof( StreamData ) );
StreamData.Enable = TRUE;
StreamData.OutputIndex = 0;
StreamData.InputFrameOrField = 0;
StreamData.PastFrames = 0;
StreamData.FutureFrames = 0;
StreamData.ppPastSurfaces = NULL;
StreamData.ppFutureSurfaces = NULL;
StreamData.pInputSurface = pLeftInputView;
StreamData.ppPastSurfacesRight = NULL;
StreamData.ppFutureSurfacesRight = NULL;
if (m_b3DVideo && MFVideo3DSampleFormat_MultiView == m_vp3DOutput && pRightTexture2D)
{
StreamData.pInputSurfaceRight = pRightInputView;
}
hr = pVideoContext->VideoProcessorBlt(m_pVideoProcessor, pOutputView, 0, 1, &StreamData );
if (FAILED(hr))
{
break;
}
QueryPerformanceCounter(&lpcEnd);
if (ppVideoOutFrame != NULL)
{
*ppVideoOutFrame = pRTSample;
(*ppVideoOutFrame)->AddRef();
}
}
while (FALSE);
SafeRelease(pBuffer);
SafeRelease(pRTSample);
SafeRelease(pDXGIBackBuffer);
SafeRelease(pOutputView);
SafeRelease(pLeftInputView);
SafeRelease(pRightInputView);
SafeRelease(pVideoContext);
return hr;
}
One last note, the documentation states that:
Specifically, this sample shows how to:
- Decode the video using the Media Foundation APIs
- Render the decoded video using the DirectX 11 APIs
- Output the video stream to multi-monitor displays
I cannot find anything that does decoding unless by some MF magic chant phrase that I haven't stumbled across yet. But it's not a showstopper because I can stick an H.264 decoder MFT in front no problem. I would just like to clarify the documentation.
Any help would be much appreciated. Thank you!
There are 2 similar functions within Presenter.cpp that process frames but I cannot figure out what the difference is between them. ProcessFrameUsingD3D11()uses VideoProcessorBlt() to actually do the render.
The functions are not rendering - they are two ways to scale video frames. Scaling might be done with a readily available Media Foundation transform internally managed by the renderer's presenter, or scaling might be done with the help of Direct3D 11 processor. Actually both utilize Direct3D 11, so the two methods are close one to another and are just one step in the rendering process.
I cannot find anything that does decoding unless by some MF magic chant phrase that I haven't stumbled across yet.
There is no decoding and the list of sink video formats in StreamSink.cpp suggests that by only listing uncompressed video formats. The renderer presents frames carried by Direct3D 11 textures, which in turn assumes that a decode, esp. hardware decoder such as DXVA2 based already supplies the decoded textures on the renderer input.