Skip to content

HDRP Metal fixes [Hold] #2044

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: HDRP/staging
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion com.unity.render-pipelines.core/ShaderLibrary/Common.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,13 @@
#define PREFER_HALF 1
#endif

#if HAS_HALF && PREFER_HALF
// Unity has historically had a mixed stance on default sampler precision between mobile and other platforms,
// and changing it without altering existing behavior/performance is difficult.
//
// When UNITY_UNIFIED_SHADER_PRECISION_MODEL is defined, the expectation is to have full precision
// on all platforms and explicitly optimize against lower precision when useful.

#if !defined(UNITY_UNIFIED_SHADER_PRECISION_MODEL) && HAS_HALF && PREFER_HALF
#define REAL_IS_HALF 1
#else
#define REAL_IS_HALF 0
Expand Down Expand Up @@ -221,6 +227,12 @@
#define LANE_SWIZZLE_OFFSET(andMask, orMask, xorMask) (andMask | (orMask << 5) | (xorMask << 10))
#endif

// For multi_compile: when the PLATFORM_LANE_COUNT_32 keyword is defined, drop any
// earlier PLATFORM_LANE_COUNT definition and force the lane count to 32.
#ifdef PLATFORM_LANE_COUNT_32
#undef PLATFORM_LANE_COUNT
#define PLATFORM_LANE_COUNT 32
#endif

#include "Packages/com.unity.render-pipelines.core/ShaderLibrary/CommonDeprecated.hlsl"

#if !defined(SHADER_API_GLES)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -312,25 +312,6 @@ protected int GetNumMips(int dim)
return iNumMips;
}

/// <summary>
/// True when targeting a mobile platform. In the editor this is decided from the
/// active build target (iOS or Android); in a player it forwards
/// Application.isMobilePlatform.
/// </summary>
public static bool isMobileBuildTarget
{
    get
    {
#if UNITY_EDITOR
        // In the editor, look at the build target currently selected in the build settings.
        var activeTarget = EditorUserBuildSettings.activeBuildTarget;
        return activeTarget == BuildTarget.iOS || activeTarget == BuildTarget.Android;
#else
        return Application.isMobilePlatform;
#endif
    }
}

public static TextureFormat GetPreferredHDRCompressedTextureFormat
{
get
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ unsafe struct ShaderVariablesDebugDisplay
public int _DebugSingleShadowIndex;

public int _DebugProbeVolumeMode;
public Vector3 _DebugDisplayPad0;
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ CBUFFER_START(ShaderVariablesDebugDisplay)
float _MatcapViewScale;
int _DebugSingleShadowIndex;
int _DebugProbeVolumeMode;
float3 _DebugDisplayPad0;
CBUFFER_END


Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
#pragma kernel ClearList

// TODO: Fix PLATFORM_LANE_COUNT=32 support here and in Lightloop.cs if needed
//#pragma multi_compile _ PLATFORM_LANE_COUNT_32

RWStructuredBuffer<uint> _LightListToClear;
int _LightListEntries;

[numthreads(64, 1, 1)]
// Thread-group size for the kernel below: follow PLATFORM_LANE_COUNT when it is
// defined, otherwise fall back to 64.
#ifdef PLATFORM_LANE_COUNT
#define NR_THREADS PLATFORM_LANE_COUNT
#else
#define NR_THREADS 64 // default to 64 threads per group
#endif

[numthreads(NR_THREADS, 1, 1)]
void ClearList(uint3 id : SV_DispatchThreadID)
{
if (id.x < (uint)_LightListEntries)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,23 +280,6 @@ public partial class HDRenderPipeline
internal const int k_MaxLightsPerClusterCell = 24;
internal static readonly Vector3 k_BoxCullingExtentThreshold = Vector3.one * 0.01f;

#if UNITY_SWITCH
static bool k_PreferFragment = true;
#else
static bool k_PreferFragment = false;
#endif
#if !UNITY_EDITOR && UNITY_SWITCH
const bool k_HasNativeQuadSupport = true;
#else
const bool k_HasNativeQuadSupport = false;
#endif

#if !UNITY_EDITOR && UNITY_SWITCH
const int k_ThreadGroupOptimalSize = 32;
#else
const int k_ThreadGroupOptimalSize = 64;
#endif

int m_MaxDirectionalLightsOnScreen;
int m_MaxPunctualLightsOnScreen;
int m_MaxAreaLightsOnScreen;
Expand Down Expand Up @@ -464,7 +447,7 @@ public void AllocateNonRenderGraphResolutionDependentBuffers(HDCamera hdCamera,
var nrClustersY = (height + LightDefinitions.s_TileSizeClustered - 1) / LightDefinitions.s_TileSizeClustered;
var nrClusterTiles = nrClustersX * nrClustersY * viewCount;

perVoxelOffset = new ComputeBuffer((int)LightCategory.Count * (1 << k_Log2NumClusters) * nrClusterTiles, sizeof(uint));
perVoxelOffset = new ComputeBuffer((int)LightCategory.Count * (1 << DeviceInfo.log2NumClusters) * nrClusterTiles, sizeof(uint));
perVoxelLightLists = new ComputeBuffer(NumLightIndicesPerClusteredTile() * nrClusterTiles, sizeof(uint));

if (clusterNeedsDepth)
Expand Down Expand Up @@ -720,11 +703,6 @@ enum ClusterDepthSource : int

const bool k_UseDepthBuffer = true; // only has an impact when EnableClustered is true (requires a depth-prepass)

#if !UNITY_EDITOR && UNITY_SWITCH
const int k_Log2NumClusters = 5; // accepted range is from 0 to 5 (NR_THREADS is set to 32 on Switch). NumClusters is 1<<g_iLog2NumClusters
#else
const int k_Log2NumClusters = 6; // accepted range is from 0 to 6 (NR_THREADS is set to 64 on other platforms). NumClusters is 1<<g_iLog2NumClusters
#endif
const float k_ClustLogBase = 1.02f; // each slice 2% bigger than the previous
float m_ClusterScale;

Expand Down Expand Up @@ -879,41 +857,41 @@ void InitializeLightLoop(IBLFilterBSDF[] iBLFilterBSDFArray)
m_MaxLightsOnScreen = m_MaxDirectionalLightsOnScreen + m_MaxPunctualLightsOnScreen + m_MaxAreaLightsOnScreen + m_MaxEnvLightsOnScreen;
m_MaxPlanarReflectionOnScreen = lightLoopSettings.maxPlanarReflectionOnScreen;

s_GenAABBKernel = buildScreenAABBShader.FindKernel("ScreenBoundsAABB");
s_GenAABBKernel = DeviceInfo.FindKernel(buildScreenAABBShader, "ScreenBoundsAABB");

// Cluster
{
s_ClearVoxelAtomicKernel = clearClusterAtomicIndexShader.FindKernel("ClearAtomic");
s_ClearVoxelAtomicKernel = DeviceInfo.FindKernel(clearClusterAtomicIndexShader, "ClearAtomic");

for (int i = 0; i < (int)ClusterPrepassSource.Count; ++i)
{
for (int j = 0; j < (int)ClusterDepthSource.Count; ++j)
{
s_ClusterKernels[i, j] = buildPerVoxelLightListShader.FindKernel(s_ClusterKernelNames[i, j]);
s_ClusterObliqueKernels[i, j] = buildPerVoxelLightListShader.FindKernel(s_ClusterObliqueKernelNames[i, j]);
s_ClusterKernels[i, j] = DeviceInfo.FindKernel(buildPerVoxelLightListShader, s_ClusterKernelNames[i, j]);
s_ClusterObliqueKernels[i, j] = DeviceInfo.FindKernel(buildPerVoxelLightListShader, s_ClusterObliqueKernelNames[i, j]);
}
}
}

s_GenListPerTileKernel = buildPerTileLightListShader.FindKernel("TileLightListGen");
s_GenListPerTileKernel = DeviceInfo.FindKernel(buildPerTileLightListShader, "TileLightListGen");

s_GenListPerBigTileKernel = buildPerBigTileLightListShader.FindKernel("BigTileLightListGen");
s_GenListPerBigTileKernel = DeviceInfo.FindKernel(buildPerBigTileLightListShader, "BigTileLightListGen");

s_BuildIndirectKernel = buildDispatchIndirectShader.FindKernel("BuildIndirect");
s_ClearDispatchIndirectKernel = clearDispatchIndirectShader.FindKernel("ClearDispatchIndirect");
s_BuildIndirectKernel = DeviceInfo.FindKernel(buildDispatchIndirectShader, "BuildIndirect");
s_ClearDispatchIndirectKernel = DeviceInfo.FindKernel(clearDispatchIndirectShader, "ClearDispatchIndirect");

s_ClearDrawProceduralIndirectKernel = clearDispatchIndirectShader.FindKernel("ClearDrawProceduralIndirect");
s_ClearDrawProceduralIndirectKernel = DeviceInfo.FindKernel(clearDispatchIndirectShader, "ClearDrawProceduralIndirect");

s_BuildMaterialFlagsWriteKernel = buildMaterialFlagsShader.FindKernel("MaterialFlagsGen");
s_BuildMaterialFlagsWriteKernel = DeviceInfo.FindKernel(buildMaterialFlagsShader, "MaterialFlagsGen");

s_shadeOpaqueDirectFptlKernel = deferredComputeShader.FindKernel("Deferred_Direct_Fptl");
s_shadeOpaqueDirectFptlDebugDisplayKernel = deferredComputeShader.FindKernel("Deferred_Direct_Fptl_DebugDisplay");
s_shadeOpaqueDirectFptlKernel = DeviceInfo.FindKernel(deferredComputeShader, "Deferred_Direct_Fptl");
s_shadeOpaqueDirectFptlDebugDisplayKernel = DeviceInfo.FindKernel(deferredComputeShader, "Deferred_Direct_Fptl_DebugDisplay");

s_deferredContactShadowKernel = contactShadowComputeShader.FindKernel("DeferredContactShadow");
s_deferredContactShadowKernel = DeviceInfo.FindKernel(contactShadowComputeShader, "DeferredContactShadow");

for (int variant = 0; variant < LightDefinitions.s_NumFeatureVariants; variant++)
{
s_shadeOpaqueIndirectFptlKernels[variant] = deferredComputeShader.FindKernel("Deferred_Indirect_Fptl_Variant" + variant);
s_shadeOpaqueIndirectFptlKernels[variant] = DeviceInfo.FindKernel(deferredComputeShader, "Deferred_Indirect_Fptl_Variant" + variant);
}

m_TextureCaches.Initialize(asset, defaultResources, iBLFilterBSDFArray);
Expand Down Expand Up @@ -1096,7 +1074,7 @@ void LightLoopNewFrame(CommandBuffer cmd, HDCamera hdCamera)

static int NumLightIndicesPerClusteredTile()
{
return 32 * (1 << k_Log2NumClusters); // total footprint for all layers of the tile (measured in light index entries)
return 32 * (1 << DeviceInfo.log2NumClusters); // total footprint for all layers of the tile (measured in light index entries)
}

void LightLoopAllocResolutionDependentBuffers(HDCamera hdCamera, int width, int height)
Expand Down Expand Up @@ -3061,6 +3039,7 @@ static void ClearLightList(in BuildGPULightListParameters parameters, CommandBuf
cmd.SetComputeBufferParam(parameters.clearLightListCS, parameters.clearLightListKernel, HDShaderIDs._LightListToClear, bufferToClear);
cmd.SetComputeIntParam(parameters.clearLightListCS, HDShaderIDs._LightListEntries, bufferToClear.count);

// TODO: Use DeviceInfo.optimalThreadGroupSize instead of the hard-coded 64 when rounding up the group count, so the ClearList kernel is dispatched with the optimal thread-group size
int groupSize = 64;
cmd.DispatchCompute(parameters.clearLightListCS, parameters.clearLightListKernel, (bufferToClear.count + groupSize - 1) / groupSize, 1, 1);
}
Expand Down Expand Up @@ -3267,7 +3246,7 @@ static void BuildDispatchIndirectArguments(in BuildGPULightListParameters parame
{
cmd.SetComputeBufferParam(parameters.clearDispatchIndirectShader, s_ClearDrawProceduralIndirectKernel, HDShaderIDs.g_DispatchIndirectBuffer, resources.dispatchIndirectBuffer);
cmd.SetComputeIntParam(parameters.clearDispatchIndirectShader, HDShaderIDs.g_NumTiles, parameters.numTilesFPTL);
cmd.SetComputeIntParam(parameters.clearDispatchIndirectShader, HDShaderIDs.g_VertexPerTile, k_HasNativeQuadSupport ? 4 : 6);
cmd.SetComputeIntParam(parameters.clearDispatchIndirectShader, HDShaderIDs.g_VertexPerTile, SystemInfo.supportsHardwareQuadTopology ? 4 : 6);
cmd.DispatchCompute(parameters.clearDispatchIndirectShader, s_ClearDrawProceduralIndirectKernel, 1, 1, 1);

}
Expand All @@ -3283,14 +3262,14 @@ static void BuildDispatchIndirectArguments(in BuildGPULightListParameters parame
cmd.SetComputeBufferParam(parameters.buildDispatchIndirectShader, s_BuildIndirectKernel, HDShaderIDs.g_TileFeatureFlags, resources.tileFeatureFlags);
cmd.SetComputeIntParam(parameters.buildDispatchIndirectShader, HDShaderIDs.g_NumTiles, parameters.numTilesFPTL);
cmd.SetComputeIntParam(parameters.buildDispatchIndirectShader, HDShaderIDs.g_NumTilesX, parameters.numTilesFPTLX);
// Round on k_ThreadGroupOptimalSize so we have optimal thread for buildDispatchIndirectShader kernel
cmd.DispatchCompute(parameters.buildDispatchIndirectShader, s_BuildIndirectKernel, (parameters.numTilesFPTL + k_ThreadGroupOptimalSize - 1) / k_ThreadGroupOptimalSize, 1, parameters.viewCount);
// Round on DeviceInfo.optimalThreadGroupSize so we have optimal thread for buildDispatchIndirectShader kernel
cmd.DispatchCompute(parameters.buildDispatchIndirectShader, s_BuildIndirectKernel, (parameters.numTilesFPTL + DeviceInfo.optimalThreadGroupSize - 1) / DeviceInfo.optimalThreadGroupSize, 1, parameters.viewCount);
}
}

static bool DeferredUseComputeAsPixel(FrameSettings frameSettings)
{
return frameSettings.IsEnabled(FrameSettingsField.DeferredTile) && (!frameSettings.IsEnabled(FrameSettingsField.ComputeLightEvaluation) || k_PreferFragment);
return frameSettings.IsEnabled(FrameSettingsField.DeferredTile) && (!frameSettings.IsEnabled(FrameSettingsField.ComputeLightEvaluation) || !DeviceInfo.preferComputeKernels);
}

unsafe BuildGPULightListParameters PrepareBuildGPULightListParameters( HDCamera hdCamera,
Expand Down Expand Up @@ -3404,7 +3383,7 @@ unsafe BuildGPULightListParameters PrepareBuildGPULightListParameters( HDCamera

// Clear light lists
parameters.clearLightListCS = defaultResources.shaders.clearLightListsCS;
parameters.clearLightListKernel = parameters.clearLightListCS.FindKernel("ClearList");
parameters.clearLightListKernel = DeviceInfo.FindKernel(parameters.clearLightListCS, "ClearList");

// Screen space AABB
parameters.screenSpaceAABBShader = buildScreenAABBShader;
Expand Down Expand Up @@ -3625,7 +3604,7 @@ unsafe void UpdateShaderVariablesGlobalLightLoop(ref ShaderVariablesGlobal cb, H
cb._EnableDecalLayers = hdCamera.frameSettings.IsEnabled(FrameSettingsField.DecalLayers) ? 1u : 0u;
cb._EnvLightSkyEnabled = m_SkyManager.IsLightingSkyValid(hdCamera) ? 1 : 0;

const float C = (float)(1 << k_Log2NumClusters);
float C = (float)(1 << DeviceInfo.log2NumClusters);
var geomSeries = (1.0 - Mathf.Pow(k_ClustLogBase, C)) / (1 - k_ClustLogBase); // geometric series: sum_k=0^{C-1} base^k

// Tile/Cluster
Expand All @@ -3635,7 +3614,7 @@ unsafe void UpdateShaderVariablesGlobalLightLoop(ref ShaderVariablesGlobal cb, H
cb.g_fClustBase = k_ClustLogBase;
cb.g_fNearPlane = hdCamera.camera.nearClipPlane;
cb.g_fFarPlane = hdCamera.camera.farClipPlane;
cb.g_iLog2NumClusters = k_Log2NumClusters;
cb.g_iLog2NumClusters = DeviceInfo.log2NumClusters;
cb.g_isLogBaseBufferEnabled = k_UseDepthBuffer ? 1 : 0;
cb._NumTileClusteredX = (uint)GetNumTileClusteredX(hdCamera);
cb._NumTileClusteredY = (uint)GetNumTileClusteredY(hdCamera);
Expand Down Expand Up @@ -3961,7 +3940,7 @@ void RenderDeferredLighting(HDCamera hdCamera, CommandBuffer cmd)

if (parameters.enableTile)
{
bool useCompute = parameters.useComputeLightingEvaluation && !k_PreferFragment;
bool useCompute = parameters.useComputeLightingEvaluation && DeviceInfo.preferComputeKernels;
if (useCompute)
RenderComputeDeferredLighting(parameters, resources, cmd);
else
Expand Down Expand Up @@ -4057,7 +4036,7 @@ static void RenderComputeAsPixelDeferredLighting(in DeferredLightingParameters p

cmd.EnableShaderKeyword(s_variantNames[variant]);

MeshTopology topology = k_HasNativeQuadSupport ? MeshTopology.Quads : MeshTopology.Triangles;
MeshTopology topology = SystemInfo.supportsHardwareQuadTopology ? MeshTopology.Quads : MeshTopology.Triangles;
cmd.DrawProceduralIndirect(Matrix4x4.identity, deferredMat, 0, topology, resources.dispatchIndirectBuffer, variant * 4 * sizeof(uint), null);

// Must disable variant keyword because it will not get overridden.
Expand Down Expand Up @@ -4368,5 +4347,5 @@ static void RenderProbeVolumeDebugOverlay(in DebugParameters debugParameters, Co
// cmd.DispatchCompute(parameters.buildMaterialFlagsShader, buildMaterialFlagsKernel, parameters.numTilesFPTLX, parameters.numTilesFPTLY, parameters.viewCount);
// cmd.DispatchCompute(parameters.clearDispatchIndirectShader, s_ClearDispatchIndirectKernel, 1, 1, 1);
// BuildDispatchIndirectArguments
// cmd.DispatchCompute(parameters.buildDispatchIndirectShader, s_BuildDispatchIndirectKernel, (parameters.numTilesFPTL + k_ThreadGroupOptimalSize - 1) / k_ThreadGroupOptimalSize, 1, parameters.viewCount);
// cmd.DispatchCompute(parameters.buildDispatchIndirectShader, s_BuildDispatchIndirectKernel, (parameters.numTilesFPTL + DeviceInfo.optimalThreadGroupSize - 1) / DeviceInfo.optimalThreadGroupSize, 1, parameters.viewCount);
// Then dispatch indirect will trigger the number of tile for a variant x4 as we process by wavefront of 64 (16x16 => 4 x 8x8)
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma kernel BuildIndirect

#pragma multi_compile _ IS_DRAWPROCEDURALINDIRECT
#pragma multi_compile _ PLATFORM_LANE_COUNT_32

#pragma only_renderers d3d11 playstation xboxone vulkan metal switch

Expand All @@ -12,10 +13,10 @@

#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Material/Lit/Lit.hlsl"

#ifdef PLATFORM_LANE_COUNT // We can infer the size of a wave. This is currently not possible on non-consoles, so we have to fallback to a sensible default in those cases.
#ifdef PLATFORM_LANE_COUNT
#define NR_THREADS PLATFORM_LANE_COUNT
#else
#define NR_THREADS 64 // default to 64 threads per group on other platforms..
#define NR_THREADS 64 // default to 64 threads per group
#endif

RWBuffer<uint> g_DispatchIndirectBuffer : register( u0 ); // Indirect arguments have to be in a _buffer_, not a structured buffer
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#pragma kernel ClearDispatchIndirect
#pragma kernel ClearDrawProceduralIndirect

#pragma multi_compile _ PLATFORM_LANE_COUNT_32

#pragma only_renderers d3d11 playstation xboxone vulkan metal switch

RWBuffer<uint> g_DispatchIndirectBuffer : register( u0 ); // Indirect arguments have to be in a _buffer_, not a structured buffer
Expand All @@ -8,9 +11,9 @@ RWBuffer<uint> g_DispatchIndirectBuffer : register( u0 ); // Indirect argument
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/LightLoop.cs.hlsl"

#ifdef PLATFORM_LANE_COUNT
#define NR_THREADS PLATFORM_LANE_COUNT
#define NR_THREADS PLATFORM_LANE_COUNT
#else
#define NR_THREADS 64 // default to 64 threads per group on other platforms..
#define NR_THREADS 64 // default to 64 threads per group
#endif

uniform uint g_NumTiles;
Expand Down
Loading