@@ -90,6 +90,7 @@ struct dxgi_vk_swap_chain
     vkd3d_native_sync_handle frame_latency_event_internal;
     vkd3d_native_sync_handle present_request_done_event;
     bool outstanding_present_request;
+    uint32_t frame_latency_event_internal_wait_counts;
 
     UINT frame_latency;
     UINT frame_latency_internal;
@@ -885,11 +886,81 @@ static bool dxgi_vk_swap_chain_present_is_occluded(struct dxgi_vk_swap_chain *ch
 
 static void dxgi_vk_swap_chain_present_callback(void *chain);
 
+static void dxgi_vk_swap_chain_wait_internal_handle(struct dxgi_vk_swap_chain *chain, bool low_latency_enable)
+{
+    const struct vkd3d_vk_device_procs *vk_procs = &chain->queue->device->vk_procs;
+    bool non_blocking_internal_handle_wait = low_latency_enable;
+    uint64_t completed_submissions = 0;
+    uint64_t user_submissions = 0;
+
+    chain->frame_latency_event_internal_wait_counts++;
+
+    if (non_blocking_internal_handle_wait)
+    {
+        /* If we're using low latency mode, we expect that applications sleep on their own in LatencySleep.
+         * If we start sleeping ourselves here, we sometimes end up fighting with NV's LL2 implementation over
+         * which sleep cycle gets to dominate. This can manifest as a random pumping pattern.
+         *
+         * If our sleep dominates, we end up in an unstable situation where LL2 may think we're
+         * more CPU bound than we actually are.
+         *
+         * In a FIFO bound scenario, however, where the GPU completes long before vblank hits,
+         * we should rely on frame latency sleeps.
+         *
+         * Use a very simple heuristic: if the blit timeline semaphore lags behind by 2+ frames, assume we're
+         * fully GPU bound, and we should back off and let low latency deal with it more gracefully. */
+        user_submissions = chain->user.blit_count;
+
+        if (VK_CALL(vkGetSemaphoreCounterValue(chain->queue->device->vk_device,
+                chain->present.vk_complete_semaphore,
+                &completed_submissions)) == VK_SUCCESS)
+        {
+            /* We just submitted frame N. If N - 2 is already complete, there are <= 2 frames' worth of GPU work
+             * queued up. For a FIFO bound or CPU bound game, this is the case we expect, so we should use latency fences here.
+             * If we're GPU bound with <= 2 frames queued up, we'll likely not block in our own latency handles anyway. */
+            if (completed_submissions + 2 >= user_submissions)
+            {
+                non_blocking_internal_handle_wait = false;
+            }
+            else if (chain->debug_latency)
+            {
+                INFO("Completed count: %" PRIu64 ", submitted count: %" PRIu64 ". GPU queue is too deep, deferring to low latency sleep.\n",
+                        completed_submissions, user_submissions);
+            }
+        }
+        else
+        {
+            ERR("Failed to query semaphore completed value.\n");
+            non_blocking_internal_handle_wait = false;
+        }
+    }
+
+    if (non_blocking_internal_handle_wait)
+    {
+        /* Just make sure the counter doesn't grow unbounded. */
+        while (chain->frame_latency_event_internal_wait_counts &&
+                vkd3d_native_sync_handle_acquire_timeout(chain->frame_latency_event_internal, 0))
+        {
+            chain->frame_latency_event_internal_wait_counts--;
+        }
+    }
+    else
+    {
+        while (chain->frame_latency_event_internal_wait_counts)
+        {
+            vkd3d_native_sync_handle_acquire(chain->frame_latency_event_internal);
+            chain->frame_latency_event_internal_wait_counts--;
+        }
+    }
+}
+
 static HRESULT STDMETHODCALLTYPE dxgi_vk_swap_chain_Present(IDXGIVkSwapChain *iface, UINT SyncInterval, UINT PresentFlags, const DXGI_PRESENT_PARAMETERS *pPresentParameters)
 {
     struct dxgi_vk_swap_chain *chain = impl_from_IDXGIVkSwapChain(iface);
     struct dxgi_vk_swap_chain_present_request *request;
     struct vkd3d_queue_timeline_trace_cookie cookie;
+    bool low_latency_enable;
+
     TRACE("iface %p, SyncInterval %u, PresentFlags #%x, pPresentParameters %p.\n",
             iface, SyncInterval, PresentFlags, pPresentParameters);
     (void)pPresentParameters;
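
The heart of the patch is the queue-depth check above. The same idea can be shown in isolation: a minimal sketch, assuming a Vulkan 1.2 device with timeline semaphores, where device, timeline, and submitted_count are hypothetical stand-ins (in the patch they correspond to chain->queue->device->vk_device, chain->present.vk_complete_semaphore, and chain->user.blit_count).

/* Hypothetical helper, not vkd3d-proton code. Returns true when it is
 * safe to block on the internal frame latency event. */
#include <stdbool.h>
#include <stdint.h>
#include <vulkan/vulkan.h>

static bool gpu_queue_is_shallow(VkDevice device, VkSemaphore timeline,
        uint64_t submitted_count)
{
    uint64_t completed_count = 0;

    /* Timeline semaphores expose the last signalled value directly. */
    if (vkGetSemaphoreCounterValue(device, timeline, &completed_count) != VK_SUCCESS)
        return true; /* Fail safe: fall back to the old blocking behaviour. */

    /* Frame N was just submitted. If frame N - 2 has already signalled,
     * at most two frames of GPU work are in flight, so a blocking latency
     * wait will not fight the application's LatencySleep. */
    return completed_count + 2 >= submitted_count;
}

Note the failure path mirrors the patch's conservative ERR branch: if the query fails, the code behaves exactly as it did before the change.
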
@@ -937,12 +1008,14 @@ static HRESULT STDMETHODCALLTYPE dxgi_vk_swap_chain_Present(IDXGIVkSwapChain *if
         request->requested_low_latency_state = chain->requested_low_latency_state;
         request->low_latency_update_requested = chain->low_latency_update_requested;
         chain->low_latency_update_requested = false;
+        low_latency_enable = chain->requested_low_latency_state.mode;
        pthread_mutex_unlock(&chain->present.low_latency_state_update_lock);
     }
     else
     {
         memset(&request->requested_low_latency_state, 0, sizeof(request->requested_low_latency_state));
         request->low_latency_update_requested = false;
+        low_latency_enable = false;
     }
 
     /* Need to process this task in queue thread to deal with wait-before-signal.
@@ -960,7 +1033,7 @@ static HRESULT STDMETHODCALLTYPE dxgi_vk_swap_chain_Present(IDXGIVkSwapChain *if
 
     /* Relevant if application does not use latency fence, or we force a lower latency through VKD3D_SWAPCHAIN_FRAME_LATENCY overrides. */
     if (vkd3d_native_sync_handle_is_valid(chain->frame_latency_event_internal))
-        vkd3d_native_sync_handle_acquire(chain->frame_latency_event_internal);
+        dxgi_vk_swap_chain_wait_internal_handle(chain, low_latency_enable);
 
     if (vkd3d_native_sync_handle_is_valid(chain->present_request_done_event))
     {
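
The wait-count bookkeeping is what keeps the non-blocking path sound: every Present() still owes exactly one latency wait, and waits skipped in low latency mode are repaid the next time the blocking path runs. The same pattern can be sketched with a plain Win32 semaphore standing in for vkd3d_native_sync_handle; drain_latency_event, its parameters, and wait_counts (mirroring frame_latency_event_internal_wait_counts) are all hypothetical.

#include <windows.h>
#include <stdbool.h>
#include <stdint.h>

static void drain_latency_event(HANDLE latency_event, uint32_t *wait_counts,
        bool non_blocking)
{
    ++*wait_counts; /* One wait owed per Present(). */

    if (non_blocking)
    {
        /* Low latency mode: consume whatever is already signalled, but never
         * block. The counter keeps the outstanding debt bounded without
         * stalling the application's own LatencySleep pacing. */
        while (*wait_counts && WaitForSingleObject(latency_event, 0) == WAIT_OBJECT_0)
            --*wait_counts;
    }
    else
    {
        /* Blocking mode: settle the whole debt, including waits skipped by
         * earlier non-blocking frames. */
        while (*wait_counts)
        {
            WaitForSingleObject(latency_event, INFINITE);
            --*wait_counts;
        }
    }
}
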