Commit 7460c70

vkd3d: Don't try to overtake the LL2 sleep cycle.

If the GPU queue is too deep, back off from trying to use latency fences
and instead rely on low-latency sleeps to clean it up. This avoids some
questionable NV driver behavior where the driver will not sleep while we
are also blocking on swapchain backpressure.

Signed-off-by: Hans-Kristian Arntzen <[email protected]>
1 parent fdcbc0e commit 7460c70
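The core of the change is the queue-depth heuristic described above. Here is a minimal sketch of that decision in isolation (the helper name and parameters are illustrative, not the vkd3d-proton API): having just submitted frame N, if frame N - 2 has already completed on the GPU, at most two frames of work are queued and blocking on the internal latency event is safe; otherwise the wait is skipped and the LL2 sleep is left in charge.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper, not part of the commit: the decision rule only.
 * completed: timeline semaphore payload, bumped as each frame's GPU work retires.
 * submitted: CPU-side count of frames submitted so far (frame N). */
static bool should_block_on_latency_event(uint64_t completed, uint64_t submitted)
{
    /* If frame N - 2 is done, <= 2 frames of GPU work are in flight:
     * FIFO or CPU bound, so the internal latency wait is harmless.
     * Otherwise the GPU queue is deep; defer to the low-latency sleep. */
    return completed + 2 >= submitted;
}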

File tree: 1 file changed, +74 −1 lines changed

libs/vkd3d/swapchain.c

Lines changed: 74 additions & 1 deletion
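The new wait path below keys off vkGetSemaphoreCounterValue, the core Vulkan 1.2 query for a timeline semaphore's 64-bit payload. As background, a minimal self-contained sketch of that mechanism (standalone Vulkan rather than vkd3d-proton's VK_CALL dispatch; the helper names are hypothetical):

#include <vulkan/vulkan.h>

/* Create a timeline semaphore whose payload starts at 0. Each queue
 * submission that finishes a frame signals it to a monotonically
 * increasing value. */
static VkSemaphore create_timeline_semaphore(VkDevice device)
{
    VkSemaphoreTypeCreateInfo type_info = {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
        .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
        .initialValue = 0,
    };
    VkSemaphoreCreateInfo info = {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
        .pNext = &type_info,
    };
    VkSemaphore semaphore = VK_NULL_HANDLE;
    vkCreateSemaphore(device, &info, NULL, &semaphore);
    return semaphore;
}

/* Read how many submissions the GPU has retired. Comparing this against the
 * CPU-side submit count is exactly the queue-depth probe the diff performs. */
static uint64_t completed_submissions(VkDevice device, VkSemaphore timeline)
{
    uint64_t value = 0;
    if (vkGetSemaphoreCounterValue(device, timeline, &value) != VK_SUCCESS)
        value = 0; /* Query failure: treat as "nothing known complete". */
    return value;
}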
@@ -90,6 +90,7 @@ struct dxgi_vk_swap_chain
     vkd3d_native_sync_handle frame_latency_event_internal;
     vkd3d_native_sync_handle present_request_done_event;
     bool outstanding_present_request;
+    uint32_t frame_latency_event_internal_wait_counts;
 
     UINT frame_latency;
     UINT frame_latency_internal;
@@ -885,11 +886,81 @@ static bool dxgi_vk_swap_chain_present_is_occluded(struct dxgi_vk_swap_chain *ch
 
 static void dxgi_vk_swap_chain_present_callback(void *chain);
 
+static void dxgi_vk_swap_chain_wait_internal_handle(struct dxgi_vk_swap_chain *chain, bool low_latency_enable)
+{
+    const struct vkd3d_vk_device_procs *vk_procs = &chain->queue->device->vk_procs;
+    bool non_blocking_internal_handle_wait = low_latency_enable;
+    uint64_t completed_submissions = 0;
+    uint64_t user_submissions = 0;
+
+    chain->frame_latency_event_internal_wait_counts++;
+
+    if (non_blocking_internal_handle_wait)
+    {
+        /* If we're using low latency mode, we expect that applications sleep on their own in LatencySleep.
+         * If we start sleeping ourselves here, we sometimes end up fighting with NV's LL2 implementation over
+         * which sleep cycle gets to dominate. This can manifest as a random pumping pattern.
+         *
+         * If our sleep dominates, we end up in an unstable situation where LL2 may think we're
+         * more CPU bound than we actually are.
+         *
+         * In a FIFO bound scenario, however, where the GPU completes long before vblank hits,
+         * we should rely on frame latency sleeps.
+         *
+         * Use a very simple heuristic: if the blit timeline semaphore lags behind by 2+ frames, assume we're
+         * fully GPU bound and should back off, letting low latency deal with it more gracefully. */
+        user_submissions = chain->user.blit_count;
+
+        if (VK_CALL(vkGetSemaphoreCounterValue(chain->queue->device->vk_device,
+                chain->present.vk_complete_semaphore,
+                &completed_submissions)) == VK_SUCCESS)
+        {
+            /* We just submitted frame N. If N - 2 is already complete, there is <= 2 frames' worth of GPU work
+             * queued up. For a FIFO bound or CPU bound game this is the expected case, so use latency fences here.
+             * If we're GPU bound with <= 2 frames queued up, we'll likely not block in our own latency handles anyway. */
+            if (completed_submissions + 2 >= user_submissions)
+            {
+                non_blocking_internal_handle_wait = false;
+            }
+            else if (chain->debug_latency)
+            {
+                INFO("Completed count: %"PRIu64", submitted count: %"PRIu64". GPU queue is too deep, deferring to low latency sleep.\n",
+                        completed_submissions, user_submissions);
+            }
+        }
+        else
+        {
+            ERR("Failed to query semaphore completed value.\n");
+            non_blocking_internal_handle_wait = false;
+        }
+    }
+
+    if (non_blocking_internal_handle_wait)
+    {
+        /* Just make sure the counter doesn't grow unbounded. */
+        while (chain->frame_latency_event_internal_wait_counts &&
+                vkd3d_native_sync_handle_acquire_timeout(chain->frame_latency_event_internal, 0))
+        {
+            chain->frame_latency_event_internal_wait_counts--;
+        }
+    }
+    else
+    {
+        while (chain->frame_latency_event_internal_wait_counts)
+        {
+            vkd3d_native_sync_handle_acquire(chain->frame_latency_event_internal);
+            chain->frame_latency_event_internal_wait_counts--;
+        }
+    }
+}
+
 static HRESULT STDMETHODCALLTYPE dxgi_vk_swap_chain_Present(IDXGIVkSwapChain *iface, UINT SyncInterval, UINT PresentFlags, const DXGI_PRESENT_PARAMETERS *pPresentParameters)
 {
     struct dxgi_vk_swap_chain *chain = impl_from_IDXGIVkSwapChain(iface);
     struct dxgi_vk_swap_chain_present_request *request;
     struct vkd3d_queue_timeline_trace_cookie cookie;
+    bool low_latency_enable;
+
     TRACE("iface %p, SyncInterval %u, PresentFlags #%x, pPresentParameters %p.\n",
             iface, SyncInterval, PresentFlags, pPresentParameters);
     (void)pPresentParameters;
@@ -937,12 +1008,14 @@ static HRESULT STDMETHODCALLTYPE dxgi_vk_swap_chain_Present(IDXGIVkSwapChain *if
         request->requested_low_latency_state = chain->requested_low_latency_state;
         request->low_latency_update_requested = chain->low_latency_update_requested;
         chain->low_latency_update_requested = false;
+        low_latency_enable = chain->requested_low_latency_state.mode;
         pthread_mutex_unlock(&chain->present.low_latency_state_update_lock);
     }
     else
     {
         memset(&request->requested_low_latency_state, 0, sizeof(request->requested_low_latency_state));
         request->low_latency_update_requested = false;
+        low_latency_enable = false;
     }
 
     /* Need to process this task in queue thread to deal with wait-before-signal.
@@ -960,7 +1033,7 @@ static HRESULT STDMETHODCALLTYPE dxgi_vk_swap_chain_Present(IDXGIVkSwapChain *if
 
     /* Relevant if application does not use latency fence, or we force a lower latency through VKD3D_SWAPCHAIN_FRAME_LATENCY overrides. */
     if (vkd3d_native_sync_handle_is_valid(chain->frame_latency_event_internal))
-        vkd3d_native_sync_handle_acquire(chain->frame_latency_event_internal);
+        dxgi_vk_swap_chain_wait_internal_handle(chain, low_latency_enable);
 
     if (vkd3d_native_sync_handle_is_valid(chain->present_request_done_event))
     {
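The two loops at the end of the new function implement a simple pending-wait ledger: the non-blocking branch consumes only signals that are already available (timeout 0), while the blocking branch retires every outstanding wait. A self-contained sketch of the same bookkeeping, with a POSIX counting semaphore standing in for vkd3d_native_sync_handle (hypothetical names, assuming one sem_post per completed present):

#include <semaphore.h>
#include <stdbool.h>

static unsigned int pending_waits; /* mirrors frame_latency_event_internal_wait_counts */

static void wait_internal(sem_t *latency_sem, bool non_blocking)
{
    pending_waits++; /* every present owes one wait */

    if (non_blocking)
    {
        /* Drain whatever has already been signaled; never sleep here,
         * so the counter stays bounded without fighting the LL2 sleep. */
        while (pending_waits && sem_trywait(latency_sem) == 0)
            pending_waits--;
    }
    else
    {
        /* Settle the full debt, blocking as needed; this is where
         * swapchain backpressure actually throttles the CPU. */
        while (pending_waits)
        {
            sem_wait(latency_sem);
            pending_waits--;
        }
    }
}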
