From 1cfe16c33f155ac81ddbccf2cd2bc953bae6fb57 Mon Sep 17 00:00:00 2001 From: Andrew Au <3410332+cshung@users.noreply.github.com> Date: Wed, 20 May 2026 18:00:46 +0000 Subject: [PATCH 01/14] [ARM32] Eliminate red zone usage in runtime stubs On ARM32 Linux, the area below SP is not guaranteed to be preserved across signal delivery. Replace red zone reads/writes with explicit stack adjustments (push/pop) in: - NativeAOT interop thunks (ldr pc dispatch, no stack intermediate) - NativeAOT UniversalTransition (caller pushes args onto stack) - NativeAOT interface dispatch stubs (PROLOG_STACK_ALLOC instead of sub-SP stores) - CoreCLR VTableCallStub (pre-indexed str/post-indexed ldr) Guarded by FEATURE_AVOID_RED_ZONE, enabled for ARM32 Linux targets. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/CMakeLists.txt | 3 + src/coreclr/nativeaot/Runtime/EHHelpers.cpp | 7 ++ .../nativeaot/Runtime/StackFrameIterator.cpp | 5 ++ .../nativeaot/Runtime/ThunksMapping.cpp | 25 ++++++- .../Runtime/arm/InteropThunksHelpers.S | 5 ++ .../Runtime/arm/UniversalTransition.S | 28 ++++++- src/coreclr/runtime/arm/StubDispatch.S | 73 +++++++++++++++++++ src/coreclr/vm/CMakeLists.txt | 4 + src/coreclr/vm/arm/virtualcallstubcpu.hpp | 18 ++++- 9 files changed, 162 insertions(+), 6 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/CMakeLists.txt b/src/coreclr/nativeaot/Runtime/CMakeLists.txt index a98536fb5adaae..50749056415aee 100644 --- a/src/coreclr/nativeaot/Runtime/CMakeLists.txt +++ b/src/coreclr/nativeaot/Runtime/CMakeLists.txt @@ -287,6 +287,9 @@ add_compile_definitions($<$:FEATURE_GC_STRESS>) add_definitions(-DFEATURE_NATIVEAOT) add_definitions(-DVERIFY_HEAP) add_definitions(-DNATIVEAOT) +if(CLR_CMAKE_TARGET_ARCH_ARM AND CLR_CMAKE_TARGET_LINUX) + add_definitions(-DFEATURE_AVOID_RED_ZONE) +endif() add_definitions(-D_LIB) if(NOT CLR_CMAKE_TARGET_ARCH_WASM) add_definitions(-DGC_DESCRIPTOR) diff --git 
a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp index 30d6ab453632bd..44b9ee10588b01 100644 --- a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp @@ -326,6 +326,13 @@ static uintptr_t UnwindSimpleHelperToCaller( pContext->SetSp(sp+sizeof(uintptr_t)); // pop the stack #elif defined(HOST_ARM) || defined(HOST_ARM64) uintptr_t adjustedFaultingIP = pContext->GetLr(); +#if defined(HOST_ARM) && defined(FEATURE_AVOID_RED_ZONE) + // When FEATURE_AVOID_RED_ZONE is active, interface dispatch stubs allocate stack space + // (PROLOG_STACK_ALLOC 8) before the AV location. We must restore SP to the caller's + // original value so the exception handler sees the correct frame. + if (InInterfaceDispatchHelper(pContext->GetIp())) + pContext->SetSp(pContext->GetSp() + 8); +#endif #elif defined(HOST_LOONGARCH64) || defined(HOST_RISCV64) uintptr_t adjustedFaultingIP = pContext->GetRa(); #else diff --git a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp index d3b67edb83c65b..6648464f77ecef 100644 --- a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp +++ b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp @@ -1429,7 +1429,12 @@ struct UniversalTransitionStackFrame uint64_t m_fpArgRegs[8]; // ChildSP+008 CallerSP-070 (0x40 bytes) (d0-d7) uint64_t m_returnBlock[4]; // ChildSP+048 CallerSP-030 (0x20 bytes) uintptr_t m_intArgRegs[4]; // ChildSP+068 CallerSP-010 (0x10 bytes) (r0-r3) +#ifdef FEATURE_AVOID_RED_ZONE + uintptr_t m_callerPushedArgs[2]; // ChildSP+078 CallerSP-008 (0x8 bytes) (extra arg + target fn) + uintptr_t m_stackPassedArgs[1]; // ChildSP+080 CallerSP+000 (unknown size) +#else uintptr_t m_stackPassedArgs[1]; // ChildSP+078 CallerSP+000 (unknown size) +#endif public: PTR_uintptr_t get_CallerSP() { return GET_POINTER_TO_FIELD(m_stackPassedArgs[0]); } diff --git a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp 
b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp index f29c73333e1184..ce360d8acb69c6 100644 --- a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp +++ b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp @@ -201,26 +201,45 @@ EXTERN_C HRESULT QCALLTYPE RhAllocateThunksMapping(void** ppThunksSection) #elif TARGET_ARM +#ifdef FEATURE_AVOID_RED_ZONE + // mov r12, + // ldr pc,[r12, ] + // r12 retains data address; RhCommonStub reads it directly without stack + + EncodeThumb2Mov32((uint16_t*)pCurrentThunkAddress, (uint32_t)pCurrentDataAddress, 12); + pCurrentThunkAddress += 8; + + // ldr pc, [r12, #offset] — Thumb2 T3 encoding with Rt=PC(15), Rn=r12 + *((uint32_t*)pCurrentThunkAddress) = 0xf000f8dc | ((OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2)) << 16); + pCurrentThunkAddress += 4; + + // Unreachable padding to maintain THUNK_SIZE (20 bytes) + *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; + *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; + *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; + *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; +#else // mov r12, // str r12,[sp,#-4] - // ldr r12,[r12, ] // bx r12 EncodeThumb2Mov32((uint16_t*)pCurrentThunkAddress, (uint32_t)pCurrentDataAddress, 12); pCurrentThunkAddress += 8; - *((uint32_t*)pCurrentThunkAddress) = 0xcc04f84d; + *((uint32_t*)pCurrentThunkAddress) = 0xcc04f84d; // str r12,[sp,#-4] (no writeback) pCurrentThunkAddress += 4; *((uint32_t*)pCurrentThunkAddress) = 0xc000f8dc | ((OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2)) << 16); pCurrentThunkAddress += 4; - *((uint16_t*)pCurrentThunkAddress) = 0x4760; + *((uint16_t*)pCurrentThunkAddress) = 0x4760; // bx r12 pCurrentThunkAddress += 2; // nops for alignment *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; +#endif #elif TARGET_ARM64 diff --git a/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S 
b/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S index dc2babe8b6ab02..71a7b55314d53f 100644 --- a/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S +++ b/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S @@ -14,9 +14,14 @@ // NESTED_ENTRY RhCommonStub, _TEXT, NoHandler // Custom calling convention: +#ifdef FEATURE_AVOID_RED_ZONE + // r12 already has the current thunk's data block pointer + // (thunk branched here via ldr pc, preserving r12) +#else // red zone has pointer to the current thunk's data block (data contains 2 pointer values: context + target pointers) // Copy red zone value into r12 so that the PROLOG_PUSH doesn't destroy it ldr r12, [sp, #-4] +#endif PROLOG_PUSH "{r0-r4, lr}" PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers diff --git a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S index c0373dd9db7be3..42725f75116f56 100644 --- a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S +++ b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S @@ -48,10 +48,15 @@ // sp-4 will contain the managed function that is to be called by this transition function // sp-8 will contain the pointer sized extra argument to the managed function // +// When FEATURE_AVOID_RED_ZONE is defined, the caller pushes the 2 extra arguments onto the +// stack instead (sp adjusted by -8 before branching here): +// [sp+4] = managed function to be called +// [sp+0] = pointer sized extra argument +// // When invoking the callee: // // r0 shall contain a pointer to the TransitionBlock -// r1 shall contain the value that was in sp-8 at entry to this function +// r1 shall contain the value that was in sp-8 (or [sp+0] with FEATURE_AVOID_RED_ZONE) // // Frame layout is: // @@ -65,6 +70,16 @@ // {PushedLR} ChildSP+004 CallerSP-074 // {PushedR11} ChildSP+000 CallerSP-078 // +// When FEATURE_AVOID_RED_ZONE is defined, the frame includes the caller's 8-byte push: +// +// 
{StackPassedArgs} ChildSP+080 CallerSP+000 +// {CallerPushedArgs (extra+target) (8 bytes)} ChildSP+078 CallerSP-008 +// {IntArgRegs (r0-r3) (0x10 bytes)} ChildSP+068 CallerSP-018 +// {ReturnBlock (0x20 bytes)} ChildSP+048 CallerSP-038 +// {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+008 CallerSP-078 +// {PushedLR} ChildSP+004 CallerSP-07C +// {PushedR11} ChildSP+000 CallerSP-080 +// // NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure // must be updated as well. // @@ -81,11 +96,19 @@ NESTED_ENTRY Rhp\FunctionName, _TEXT, NoHandler // Save argument registers (including floating point) and the return address. +#ifdef FEATURE_AVOID_RED_ZONE + // Caller pushed 8 bytes: [sp]=extra arg, [sp+4]=target fn + ldr r12, [sp, #4] // Capture target function into r12 + PROLOG_PUSH "{r3}" // Save original r3 (sp -= 4; caller data now at sp+4 and sp+8) + ldr r3, [sp, #4] // Capture extra arg (was at caller's [sp], now at sp+4 after push) + PROLOG_PUSH "{r0-r2}" // Push the rest of the argument registers +#else // NOTE: While we do that, capture the two arguments in the red zone into r12 and r3. ldr r12, [sp, #-4] // Capture first argument from red zone into r12 PROLOG_PUSH "{r3}" // Push r3 ldr r3, [sp, #-4] // Capture second argument from red zone into r3 PROLOG_PUSH "{r0-r2}" // Push the rest of the registers +#endif PROLOG_STACK_ALLOC RETURN_BLOCK_SIZE // Save space a buffer to be used to hold return buffer data. PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers PROLOG_PUSH "{r11,lr}" // Save caller's frame chain pointer and PC @@ -134,6 +157,9 @@ GLOBAL_LABEL ReturnFrom\FunctionName EPILOG_VPOP {d0-d7} EPILOG_STACK_FREE RETURN_BLOCK_SIZE // pop return block conservatively reported area EPILOG_POP "{r0-r3}" +#ifdef FEATURE_AVOID_RED_ZONE + EPILOG_STACK_FREE 8 // free the 8 bytes pushed by the caller +#endif // Tailcall to the target address. 
EPILOG_BRANCH_REG r12 diff --git a/src/coreclr/runtime/arm/StubDispatch.S b/src/coreclr/runtime/arm/StubDispatch.S index f196ea9a076379..8ecf140f2a235c 100644 --- a/src/coreclr/runtime/arm/StubDispatch.S +++ b/src/coreclr/runtime/arm/StubDispatch.S @@ -12,6 +12,62 @@ .macro DEFINE_INTERFACE_DISPATCH_STUB entries NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler +#ifdef FEATURE_AVOID_RED_ZONE + // r12 currently contains the indirection cell address. But we need more scratch registers and + // we may A/V on a null this. Save r1 and r2 on the stack. + PROLOG_STACK_ALLOC 8 + str r1, [sp] + str r2, [sp, #4] + + // r12 currently holds the indirection cell address. We need to get the cache structure instead. + ldr r2, [r12, #OFFSETOF__InterfaceDispatchCell__m_pCache] + + // Validate r2 is a cache pointer matching the expected cache size. + tst r2, #IDC_CACHE_POINTER_MASK + bne LOCAL_LABEL(RaceRetry_\entries) + ldr r1, [r2, #OFFSETOF__InterfaceDispatchCache__m_cEntries] + cmp r1, #\entries + bne LOCAL_LABEL(RaceRetry_\entries) + + // Load the MethodTable from the object instance in r0. + GLOBAL_LABEL RhpInterfaceDispatchAVLocation\entries + ldr r1, [r0] + + CurrentOffset = OFFSETOF__InterfaceDispatchCache__m_rgEntries + // R1 : Instance MethodTable* + // R2 : Cache data structure + // R12 : Trashed. On successful check, set to the target address to jump to. 
+ .rept \entries + ldr r12, [r2, #CurrentOffset] + cmp r1, r12 + bne 0f + ldr r12, [r2, #(CurrentOffset + 4)] + b LOCAL_LABEL(99_\entries) + 0: + CurrentOffset = CurrentOffset + 8 + .endr + + // Point r12 to the indirection cell using the back pointer in the cache block + ldr r12, [r2, #OFFSETOF__InterfaceDispatchCache__m_pCell] + + ldr r1, [sp] + ldr r2, [sp, #4] + EPILOG_STACK_FREE 8 + b C_FUNC(RhpInterfaceDispatchSlow) + +LOCAL_LABEL(RaceRetry_\entries): + ldr r1, [sp] + ldr r2, [sp, #4] + EPILOG_STACK_FREE 8 + ldr pc, [r12] + +LOCAL_LABEL(99_\entries): + ldr r1, [sp] + ldr r2, [sp, #4] + EPILOG_STACK_FREE 8 + EPILOG_BRANCH_REG r12 + +#else // !FEATURE_AVOID_RED_ZONE // r12 currently contains the indirection cell address. But we need more scratch registers and // we may A/V on a null this. Store r1 and r2 in red zone. str r1, [sp, #-8] @@ -75,6 +131,7 @@ LOCAL_LABEL(99_\entries): str r2, [sp, #-8] ldr r2, [sp, #-4] EPILOG_BRANCH_REG r12 +#endif // FEATURE_AVOID_RED_ZONE NESTED_END RhpInterfaceDispatch\entries, _TEXT @@ -108,6 +165,21 @@ LEAF_END RhpInitialDynamicInterfaceDispatch, _TEXT // Cache miss case, call the runtime to resolve the target and update the cache. // Use universal transition helper to allow an exception to flow out of resolution +#ifdef FEATURE_AVOID_RED_ZONE +NESTED_ENTRY RhpInterfaceDispatchSlow, _TEXT, NoHandler + // r12 has the interface dispatch cell address in it. + // Push the two arguments that the universal transition thunk expects: + // [sp] = parameter for the universal thunk target (cell address) + // [sp+4] = universal thunk target address (RhpCidResolve) + PROLOG_STACK_ALLOC 8 + str r12, [sp] + PREPARE_EXTERNAL_VAR RhpCidResolve, r12 + str r12, [sp, #4] + + // jump to universal transition thunk + b C_FUNC(RhpUniversalTransitionTailCall) +NESTED_END RhpInterfaceDispatchSlow, _TEXT +#else LEAF_ENTRY RhpInterfaceDispatchSlow, _TEXT // r12 has the interface dispatch cell address in it. 
// The calling convention of the universal thunk is that the parameter @@ -120,5 +192,6 @@ LEAF_ENTRY RhpInterfaceDispatchSlow, _TEXT // jump to universal transition thunk b C_FUNC(RhpUniversalTransitionTailCall) LEAF_END RhpInterfaceDispatchSlow, _TEXT +#endif #endif // FEATURE_CACHED_INTERFACE_DISPATCH diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt index a7a29f40a1ca1a..40392a0aeb8746 100644 --- a/src/coreclr/vm/CMakeLists.txt +++ b/src/coreclr/vm/CMakeLists.txt @@ -23,6 +23,10 @@ if(CLR_CMAKE_TARGET_ANDROID OR CLR_CMAKE_TARGET_OPENBSD) add_definitions(-DFEATURE_EMULATED_TLS) endif() +if(CLR_CMAKE_TARGET_ARCH_ARM AND CLR_CMAKE_TARGET_LINUX) + add_definitions(-DFEATURE_AVOID_RED_ZONE) +endif() + foreach (Config DEBUG CHECKED) add_compile_definitions($<$:WRITE_BARRIER_CHECK>) endforeach (Config) diff --git a/src/coreclr/vm/arm/virtualcallstubcpu.hpp b/src/coreclr/vm/arm/virtualcallstubcpu.hpp index ede098dde691bf..071349cabb8c9b 100644 --- a/src/coreclr/vm/arm/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/arm/virtualcallstubcpu.hpp @@ -289,7 +289,11 @@ struct VTableCallStub size_t cbSize = 4; // First ldr instruction // If we never save r0 to the red zone, we have the short version of the stub +#ifdef FEATURE_AVOID_RED_ZONE + if (*(UINT32*)(&pStubCode[cbSize]) != 0x0d04f84d) +#else if (*(UINT32*)(&pStubCode[cbSize]) != 0x0c04f84d) +#endif { return 4 + // ldr r12,[r0] @@ -299,7 +303,7 @@ struct VTableCallStub 4; // Slot value (data storage, not a real instruction) } - cbSize += 4; // Saving r0 into red zone + cbSize += 4; // Saving r0 (push or red zone) cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 4 : 12); // Loading of vtable into r12 cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 4 : 12); // Loading of targe address into r12 @@ -335,7 +339,7 @@ struct VTableCallHolder int indirectionsSize = (offsetOfIndirection > 0xFFF ? 12 : 4) + (offsetAfterIndirection > 0xFFF ? 
12 : 4); if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) - indirectionsSize += 8; // Save/restore r0 using red zone + indirectionsSize += 8; // Save/restore r0 (push/pop with FEATURE_AVOID_RED_ZONE, red zone otherwise) return 6 + indirectionsSize + 4; } @@ -429,8 +433,13 @@ void VTableCallHolder::Initialize(unsigned slot) if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) { +#ifdef FEATURE_AVOID_RED_ZONE + // str r0, [sp, #-4]! (pre-indexed: decrements SP, stores at new SP) + *(UINT32*)p = 0x0d04f84d; p += 4; +#else // str r0, [sp, #-4]. Save r0 in the red zone *(UINT32*)p = 0x0c04f84d; p += 4; +#endif } if (offsetOfIndirection > 0xFFF) @@ -463,8 +472,13 @@ void VTableCallHolder::Initialize(unsigned slot) if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) { +#ifdef FEATURE_AVOID_RED_ZONE + // ldr r0, [sp], #4 (post-indexed: loads from SP, then increments SP) + *(UINT32*)p = 0x0b04f85d; p += 4; +#else // ldr r0, [sp, #-4]. Restore r0 from the red zone. *(UINT32*)p = 0x0c04f85d; p += 4; +#endif } // bx r12 From f4b43f8d58ecf9088721f2d519f7b3f84f42747a Mon Sep 17 00:00:00 2001 From: Andrew Au <3410332+cshung@users.noreply.github.com> Date: Mon, 15 Jun 2026 14:56:58 +0000 Subject: [PATCH 02/14] Remove FEATURE_AVOID_RED_ZONE ifdef, make fix unconditional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Windows ARM32 is no longer supported, so every ARM32 target is Linux. The red zone avoidance is always needed — remove the preprocessor guard and delete the old red zone code paths entirely. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/CMakeLists.txt | 3 - src/coreclr/nativeaot/Runtime/EHHelpers.cpp | 8 +- .../nativeaot/Runtime/StackFrameIterator.cpp | 4 - .../nativeaot/Runtime/ThunksMapping.cpp | 23 ------ .../Runtime/arm/InteropThunksHelpers.S | 6 -- .../Runtime/arm/UniversalTransition.S | 28 +------ src/coreclr/runtime/arm/StubDispatch.S | 82 ------------------- src/coreclr/vm/CMakeLists.txt | 4 - src/coreclr/vm/arm/virtualcallstubcpu.hpp | 20 +---- 9 files changed, 10 insertions(+), 168 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/CMakeLists.txt b/src/coreclr/nativeaot/Runtime/CMakeLists.txt index 50749056415aee..a98536fb5adaae 100644 --- a/src/coreclr/nativeaot/Runtime/CMakeLists.txt +++ b/src/coreclr/nativeaot/Runtime/CMakeLists.txt @@ -287,9 +287,6 @@ add_compile_definitions($<$:FEATURE_GC_STRESS>) add_definitions(-DFEATURE_NATIVEAOT) add_definitions(-DVERIFY_HEAP) add_definitions(-DNATIVEAOT) -if(CLR_CMAKE_TARGET_ARCH_ARM AND CLR_CMAKE_TARGET_LINUX) - add_definitions(-DFEATURE_AVOID_RED_ZONE) -endif() add_definitions(-D_LIB) if(NOT CLR_CMAKE_TARGET_ARCH_WASM) add_definitions(-DGC_DESCRIPTOR) diff --git a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp index 44b9ee10588b01..374dc8f22a7f8f 100644 --- a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp @@ -326,10 +326,10 @@ static uintptr_t UnwindSimpleHelperToCaller( pContext->SetSp(sp+sizeof(uintptr_t)); // pop the stack #elif defined(HOST_ARM) || defined(HOST_ARM64) uintptr_t adjustedFaultingIP = pContext->GetLr(); -#if defined(HOST_ARM) && defined(FEATURE_AVOID_RED_ZONE) - // When FEATURE_AVOID_RED_ZONE is active, interface dispatch stubs allocate stack space - // (PROLOG_STACK_ALLOC 8) before the AV location. We must restore SP to the caller's - // original value so the exception handler sees the correct frame. 
+#if defined(HOST_ARM) + // Interface dispatch stubs allocate stack space (PROLOG_STACK_ALLOC 8) before the AV + // location. We must restore SP to the caller's original value so the exception handler + // sees the correct frame. if (InInterfaceDispatchHelper(pContext->GetIp())) pContext->SetSp(pContext->GetSp() + 8); #endif diff --git a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp index 6648464f77ecef..d600acbbf3d8fe 100644 --- a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp +++ b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp @@ -1429,12 +1429,8 @@ struct UniversalTransitionStackFrame uint64_t m_fpArgRegs[8]; // ChildSP+008 CallerSP-070 (0x40 bytes) (d0-d7) uint64_t m_returnBlock[4]; // ChildSP+048 CallerSP-030 (0x20 bytes) uintptr_t m_intArgRegs[4]; // ChildSP+068 CallerSP-010 (0x10 bytes) (r0-r3) -#ifdef FEATURE_AVOID_RED_ZONE uintptr_t m_callerPushedArgs[2]; // ChildSP+078 CallerSP-008 (0x8 bytes) (extra arg + target fn) uintptr_t m_stackPassedArgs[1]; // ChildSP+080 CallerSP+000 (unknown size) -#else - uintptr_t m_stackPassedArgs[1]; // ChildSP+078 CallerSP+000 (unknown size) -#endif public: PTR_uintptr_t get_CallerSP() { return GET_POINTER_TO_FIELD(m_stackPassedArgs[0]); } diff --git a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp index ce360d8acb69c6..8c3d095eb46129 100644 --- a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp +++ b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp @@ -201,7 +201,6 @@ EXTERN_C HRESULT QCALLTYPE RhAllocateThunksMapping(void** ppThunksSection) #elif TARGET_ARM -#ifdef FEATURE_AVOID_RED_ZONE // mov r12, // ldr pc,[r12, ] // r12 retains data address; RhCommonStub reads it directly without stack @@ -218,28 +217,6 @@ EXTERN_C HRESULT QCALLTYPE RhAllocateThunksMapping(void** ppThunksSection) *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; *((uint16_t*)pCurrentThunkAddress) = 0xbf00; 
pCurrentThunkAddress += 2; *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; -#else - // mov r12, - // str r12,[sp,#-4] - // ldr r12,[r12, ] - // bx r12 - - EncodeThumb2Mov32((uint16_t*)pCurrentThunkAddress, (uint32_t)pCurrentDataAddress, 12); - pCurrentThunkAddress += 8; - - *((uint32_t*)pCurrentThunkAddress) = 0xcc04f84d; // str r12,[sp,#-4] (no writeback) - pCurrentThunkAddress += 4; - - *((uint32_t*)pCurrentThunkAddress) = 0xc000f8dc | ((OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2)) << 16); - pCurrentThunkAddress += 4; - - *((uint16_t*)pCurrentThunkAddress) = 0x4760; // bx r12 - pCurrentThunkAddress += 2; - - // nops for alignment - *((uint16_t*)pCurrentThunkAddress) = 0xbf00; - pCurrentThunkAddress += 2; -#endif #elif TARGET_ARM64 diff --git a/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S b/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S index 71a7b55314d53f..20c5c3a9914b21 100644 --- a/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S +++ b/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S @@ -14,14 +14,8 @@ // NESTED_ENTRY RhCommonStub, _TEXT, NoHandler // Custom calling convention: -#ifdef FEATURE_AVOID_RED_ZONE // r12 already has the current thunk's data block pointer // (thunk branched here via ldr pc, preserving r12) -#else - // red zone has pointer to the current thunk's data block (data contains 2 pointer values: context + target pointers) - // Copy red zone value into r12 so that the PROLOG_PUSH doesn't destroy it - ldr r12, [sp, #-4] -#endif PROLOG_PUSH "{r0-r4, lr}" PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers diff --git a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S index 42725f75116f56..0826f507e660e7 100644 --- a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S +++ b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S @@ -48,30 +48,18 @@ // sp-4 will contain the managed function 
that is to be called by this transition function // sp-8 will contain the pointer sized extra argument to the managed function // -// When FEATURE_AVOID_RED_ZONE is defined, the caller pushes the 2 extra arguments onto the -// stack instead (sp adjusted by -8 before branching here): +// The caller pushes the 2 extra arguments onto the stack (sp adjusted by -8 before +// branching here): // [sp+4] = managed function to be called // [sp+0] = pointer sized extra argument // // When invoking the callee: // // r0 shall contain a pointer to the TransitionBlock -// r1 shall contain the value that was in sp-8 (or [sp+0] with FEATURE_AVOID_RED_ZONE) +// r1 shall contain the value that was in [sp+0] at entry to this function // // Frame layout is: // -// {StackPassedArgs} ChildSP+078 CallerSP+000 -// {IntArgRegs (r0-r3) (0x10 bytes)} ChildSP+068 CallerSP-010 -// {ReturnBlock (0x20 bytes)} ChildSP+048 CallerSP-030 -// -- The base address of the Return block is the TransitionBlock pointer, the floating point args are -// in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact -// layout of all pieces of the frame that lie at or above the pushed floating point registers. -// {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+008 CallerSP-070 -// {PushedLR} ChildSP+004 CallerSP-074 -// {PushedR11} ChildSP+000 CallerSP-078 -// -// When FEATURE_AVOID_RED_ZONE is defined, the frame includes the caller's 8-byte push: -// // {StackPassedArgs} ChildSP+080 CallerSP+000 // {CallerPushedArgs (extra+target) (8 bytes)} ChildSP+078 CallerSP-008 // {IntArgRegs (r0-r3) (0x10 bytes)} ChildSP+068 CallerSP-018 @@ -96,19 +84,11 @@ NESTED_ENTRY Rhp\FunctionName, _TEXT, NoHandler // Save argument registers (including floating point) and the return address. 
-#ifdef FEATURE_AVOID_RED_ZONE // Caller pushed 8 bytes: [sp]=extra arg, [sp+4]=target fn ldr r12, [sp, #4] // Capture target function into r12 PROLOG_PUSH "{r3}" // Save original r3 (sp -= 4; caller data now at sp+4 and sp+8) ldr r3, [sp, #4] // Capture extra arg (was at caller's [sp], now at sp+4 after push) PROLOG_PUSH "{r0-r2}" // Push the rest of the argument registers -#else - // NOTE: While we do that, capture the two arguments in the red zone into r12 and r3. - ldr r12, [sp, #-4] // Capture first argument from red zone into r12 - PROLOG_PUSH "{r3}" // Push r3 - ldr r3, [sp, #-4] // Capture second argument from red zone into r3 - PROLOG_PUSH "{r0-r2}" // Push the rest of the registers -#endif PROLOG_STACK_ALLOC RETURN_BLOCK_SIZE // Save space a buffer to be used to hold return buffer data. PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers PROLOG_PUSH "{r11,lr}" // Save caller's frame chain pointer and PC @@ -157,9 +137,7 @@ GLOBAL_LABEL ReturnFrom\FunctionName EPILOG_VPOP {d0-d7} EPILOG_STACK_FREE RETURN_BLOCK_SIZE // pop return block conservatively reported area EPILOG_POP "{r0-r3}" -#ifdef FEATURE_AVOID_RED_ZONE EPILOG_STACK_FREE 8 // free the 8 bytes pushed by the caller -#endif // Tailcall to the target address. EPILOG_BRANCH_REG r12 diff --git a/src/coreclr/runtime/arm/StubDispatch.S b/src/coreclr/runtime/arm/StubDispatch.S index 8ecf140f2a235c..6c65e5b5dbac30 100644 --- a/src/coreclr/runtime/arm/StubDispatch.S +++ b/src/coreclr/runtime/arm/StubDispatch.S @@ -12,7 +12,6 @@ .macro DEFINE_INTERFACE_DISPATCH_STUB entries NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler -#ifdef FEATURE_AVOID_RED_ZONE // r12 currently contains the indirection cell address. But we need more scratch registers and // we may A/V on a null this. Save r1 and r2 on the stack. 
PROLOG_STACK_ALLOC 8 @@ -67,72 +66,6 @@ LOCAL_LABEL(99_\entries): EPILOG_STACK_FREE 8 EPILOG_BRANCH_REG r12 -#else // !FEATURE_AVOID_RED_ZONE - // r12 currently contains the indirection cell address. But we need more scratch registers and - // we may A/V on a null this. Store r1 and r2 in red zone. - str r1, [sp, #-8] - str r2, [sp, #-4] - - // r12 currently holds the indirection cell address. We need to get the cache structure instead. - ldr r2, [r12, #OFFSETOF__InterfaceDispatchCell__m_pCache] - - // Validate r2 is a cache pointer matching the expected cache size. - // This compensates for a race where the stub was updated to expect a larger cache - // but we loaded a stale (smaller or non-cache) m_pCache value. - tst r2, #IDC_CACHE_POINTER_MASK - bne LOCAL_LABEL(RaceRetry_\entries) - ldr r1, [r2, #OFFSETOF__InterfaceDispatchCache__m_cEntries] - cmp r1, #\entries - bne LOCAL_LABEL(RaceRetry_\entries) - - // Load the MethodTable from the object instance in r0. - GLOBAL_LABEL RhpInterfaceDispatchAVLocation\entries - ldr r1, [r0] - - CurrentOffset = OFFSETOF__InterfaceDispatchCache__m_rgEntries - // For each entry in the cache, see if its MethodTable type matches the MethodTable in r1. - // If so, call the second cache entry. If not, skip the InterfaceDispatchCacheEntry. - // R1 : Instance MethodTable* - // R2: Cache data structure - // R12 : Trashed. On successful check, set to the target address to jump to. - .rept \entries - ldr r12, [r2, #CurrentOffset] - cmp r1, r12 - bne 0f - ldr r12, [r2, #(CurrentOffset + 4)] - b LOCAL_LABEL(99_\entries) - 0: - CurrentOffset = CurrentOffset + 8 - .endr - - // Point r12 to the indirection cell using the back pointer in the cache block - ldr r12, [r2, #OFFSETOF__InterfaceDispatchCache__m_pCell] - - ldr r1, [sp, #-8] - ldr r2, [sp, #-4] - b C_FUNC(RhpInterfaceDispatchSlow) - - // Race detected: r12 still holds the indirection cell address (not yet clobbered). 
- // Re-dispatch through the indirection cell to retry with the current stub and cache pair. - // ldr pc, [r12] branches to the current m_pStub without clobbering r12. -LOCAL_LABEL(RaceRetry_\entries): - ldr r1, [sp, #-8] - ldr r2, [sp, #-4] - ldr pc, [r12] - - // Common epilog for cache hits. Have to out of line it here due to limitation on the number of - // epilogs imposed by the unwind code macros. -LOCAL_LABEL(99_\entries): - // R2 contains address of the cache block. We store it in the red zone in case the target we jump - // to needs it. - // R12 contains the target address to jump to - ldr r1, [sp, #-8] - // We have to store R2 with address of the cache block into red zone before restoring original r2. - str r2, [sp, #-8] - ldr r2, [sp, #-4] - EPILOG_BRANCH_REG r12 -#endif // FEATURE_AVOID_RED_ZONE - NESTED_END RhpInterfaceDispatch\entries, _TEXT .endm // DEFINE_INTERFACE_DISPATCH_STUB @@ -165,7 +98,6 @@ LEAF_END RhpInitialDynamicInterfaceDispatch, _TEXT // Cache miss case, call the runtime to resolve the target and update the cache. // Use universal transition helper to allow an exception to flow out of resolution -#ifdef FEATURE_AVOID_RED_ZONE NESTED_ENTRY RhpInterfaceDispatchSlow, _TEXT, NoHandler // r12 has the interface dispatch cell address in it. // Push the two arguments that the universal transition thunk expects: @@ -179,19 +111,5 @@ NESTED_ENTRY RhpInterfaceDispatchSlow, _TEXT, NoHandler // jump to universal transition thunk b C_FUNC(RhpUniversalTransitionTailCall) NESTED_END RhpInterfaceDispatchSlow, _TEXT -#else -LEAF_ENTRY RhpInterfaceDispatchSlow, _TEXT - // r12 has the interface dispatch cell address in it. 
- // The calling convention of the universal thunk is that the parameter - // for the universal thunk target is to be placed in sp-8 - // and the universal thunk target address is to be placed in sp-4 - str r12, [sp, #-8] - PREPARE_EXTERNAL_VAR RhpCidResolve, r12 - str r12, [sp, #-4] - - // jump to universal transition thunk - b C_FUNC(RhpUniversalTransitionTailCall) -LEAF_END RhpInterfaceDispatchSlow, _TEXT -#endif #endif // FEATURE_CACHED_INTERFACE_DISPATCH diff --git a/src/coreclr/vm/CMakeLists.txt b/src/coreclr/vm/CMakeLists.txt index 40392a0aeb8746..a7a29f40a1ca1a 100644 --- a/src/coreclr/vm/CMakeLists.txt +++ b/src/coreclr/vm/CMakeLists.txt @@ -23,10 +23,6 @@ if(CLR_CMAKE_TARGET_ANDROID OR CLR_CMAKE_TARGET_OPENBSD) add_definitions(-DFEATURE_EMULATED_TLS) endif() -if(CLR_CMAKE_TARGET_ARCH_ARM AND CLR_CMAKE_TARGET_LINUX) - add_definitions(-DFEATURE_AVOID_RED_ZONE) -endif() - foreach (Config DEBUG CHECKED) add_compile_definitions($<$:WRITE_BARRIER_CHECK>) endforeach (Config) diff --git a/src/coreclr/vm/arm/virtualcallstubcpu.hpp b/src/coreclr/vm/arm/virtualcallstubcpu.hpp index 071349cabb8c9b..e3d21529fd803d 100644 --- a/src/coreclr/vm/arm/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/arm/virtualcallstubcpu.hpp @@ -288,12 +288,8 @@ struct VTableCallStub size_t cbSize = 4; // First ldr instruction - // If we never save r0 to the red zone, we have the short version of the stub -#ifdef FEATURE_AVOID_RED_ZONE + // If we never save r0 to the stack, we have the short version of the stub if (*(UINT32*)(&pStubCode[cbSize]) != 0x0d04f84d) -#else - if (*(UINT32*)(&pStubCode[cbSize]) != 0x0c04f84d) -#endif { return 4 + // ldr r12,[r0] @@ -303,7 +299,7 @@ struct VTableCallStub 4; // Slot value (data storage, not a real instruction) } - cbSize += 4; // Saving r0 (push or red zone) + cbSize += 4; // Saving r0 (push) cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 4 : 12); // Loading of vtable into r12 cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 
4 : 12); // Loading of targe address into r12 @@ -339,7 +335,7 @@ struct VTableCallHolder int indirectionsSize = (offsetOfIndirection > 0xFFF ? 12 : 4) + (offsetAfterIndirection > 0xFFF ? 12 : 4); if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) - indirectionsSize += 8; // Save/restore r0 (push/pop with FEATURE_AVOID_RED_ZONE, red zone otherwise) + indirectionsSize += 8; // Save/restore r0 (push/pop) return 6 + indirectionsSize + 4; } @@ -433,13 +429,8 @@ void VTableCallHolder::Initialize(unsigned slot) if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) { -#ifdef FEATURE_AVOID_RED_ZONE // str r0, [sp, #-4]! (pre-indexed: decrements SP, stores at new SP) *(UINT32*)p = 0x0d04f84d; p += 4; -#else - // str r0, [sp, #-4]. Save r0 in the red zone - *(UINT32*)p = 0x0c04f84d; p += 4; -#endif } if (offsetOfIndirection > 0xFFF) @@ -472,13 +463,8 @@ void VTableCallHolder::Initialize(unsigned slot) if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) { -#ifdef FEATURE_AVOID_RED_ZONE // ldr r0, [sp], #4 (post-indexed: loads from SP, then increments SP) *(UINT32*)p = 0x0b04f85d; p += 4; -#else - // ldr r0, [sp, #-4]. Restore r0 from the red zone. - *(UINT32*)p = 0x0c04f85d; p += 4; -#endif } // bx r12 From a88a9ef8ccf7a1af67eb27f81817878bb8270595 Mon Sep 17 00:00:00 2001 From: Andrew Au <3410332+cshung@users.noreply.github.com> Date: Mon, 15 Jun 2026 18:16:25 +0000 Subject: [PATCH 03/14] Shrink ARM32 THUNK_SIZE from 20 to 12 bytes The ldr pc dispatch needs only 12 bytes (mov r12 + ldr pc), no padding required. This increases thunks per page from 204 to 341 (67% more). Also shorten verbose comments per review feedback. 
Co-authored-by: Jan Kotas Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/ThunksMapping.cpp | 10 ++-------- src/coreclr/vm/arm/virtualcallstubcpu.hpp | 4 ++-- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp index 8c3d095eb46129..9e00ebeded19a3 100644 --- a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp +++ b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp @@ -19,7 +19,7 @@ #elif TARGET_X86 #define THUNK_SIZE 12 #elif TARGET_ARM -#define THUNK_SIZE 20 +#define THUNK_SIZE 12 #elif TARGET_ARM64 #define THUNK_SIZE 16 #elif TARGET_LOONGARCH64 @@ -208,16 +208,10 @@ EXTERN_C HRESULT QCALLTYPE RhAllocateThunksMapping(void** ppThunksSection) EncodeThumb2Mov32((uint16_t*)pCurrentThunkAddress, (uint32_t)pCurrentDataAddress, 12); pCurrentThunkAddress += 8; - // ldr pc, [r12, #offset] — Thumb2 T3 encoding with Rt=PC(15), Rn=r12 + // ldr pc, [r12, #offset] *((uint32_t*)pCurrentThunkAddress) = 0xf000f8dc | ((OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2)) << 16); pCurrentThunkAddress += 4; - // Unreachable padding to maintain THUNK_SIZE (20 bytes) - *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; - *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; - *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; - *((uint16_t*)pCurrentThunkAddress) = 0xbf00; pCurrentThunkAddress += 2; - #elif TARGET_ARM64 //adr xip0, diff --git a/src/coreclr/vm/arm/virtualcallstubcpu.hpp b/src/coreclr/vm/arm/virtualcallstubcpu.hpp index e3d21529fd803d..3f49e99cd2e5d5 100644 --- a/src/coreclr/vm/arm/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/arm/virtualcallstubcpu.hpp @@ -429,7 +429,7 @@ void VTableCallHolder::Initialize(unsigned slot) if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) { - // str r0, [sp, #-4]! 
(pre-indexed: decrements SP, stores at new SP) + // str r0, [sp, #-4]! *(UINT32*)p = 0x0d04f84d; p += 4; } @@ -463,7 +463,7 @@ void VTableCallHolder::Initialize(unsigned slot) if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) { - // ldr r0, [sp], #4 (post-indexed: loads from SP, then increments SP) + // ldr r0, [sp], #4 *(UINT32*)p = 0x0b04f85d; p += 4; } From 87288df9b2d8c45076351415ac71a06857fdee59 Mon Sep 17 00:00:00 2001 From: Andrew Au <3410332+cshung@users.noreply.github.com> Date: Mon, 15 Jun 2026 18:16:35 +0000 Subject: [PATCH 04/14] Address review feedback: simplify prolog/epilog sequences - StubDispatch: use PROLOG_PUSH/EPILOG_POP {r1,r2} instead of manual STACK_ALLOC + str/ldr - UniversalTransition: replace interleaved ldr/push dance with a single PROLOG_PUSH {r0-r3} then load caller args from known stack offsets - Clean up stale red zone comments Co-authored-by: Jan Kotas Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Runtime/arm/UniversalTransition.S | 19 +++++-------------- src/coreclr/runtime/arm/StubDispatch.S | 16 ++++------------ 2 files changed, 9 insertions(+), 26 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S index 0826f507e660e7..78ee662b8086e3 100644 --- a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S +++ b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S @@ -43,13 +43,8 @@ // // At input to this function, r0-3, d0-7 and the stack may contain any number of arguments. // -// In addition, there are 2 extra arguments passed in the RED ZONE (8 byte negative space -// off of sp). 
-// sp-4 will contain the managed function that is to be called by this transition function -// sp-8 will contain the pointer sized extra argument to the managed function -// -// The caller pushes the 2 extra arguments onto the stack (sp adjusted by -8 before -// branching here): +// In addition, there are 2 extra arguments passed on the stack. The caller pushes them +// (sp adjusted by -8 before branching here): // [sp+4] = managed function to be called // [sp+0] = pointer sized extra argument // @@ -85,17 +80,13 @@ NESTED_ENTRY Rhp\FunctionName, _TEXT, NoHandler // Save argument registers (including floating point) and the return address. // Caller pushed 8 bytes: [sp]=extra arg, [sp+4]=target fn - ldr r12, [sp, #4] // Capture target function into r12 - PROLOG_PUSH "{r3}" // Save original r3 (sp -= 4; caller data now at sp+4 and sp+8) - ldr r3, [sp, #4] // Capture extra arg (was at caller's [sp], now at sp+4 after push) - PROLOG_PUSH "{r0-r2}" // Push the rest of the argument registers + PROLOG_PUSH "{r0-r3}" // Push the argument registers + ldr r12, [sp, #20] // Capture target function (caller's [sp+4], now at sp+16+4) + ldr r1, [sp, #16] // Capture extra arg (caller's [sp], now at sp+16) PROLOG_STACK_ALLOC RETURN_BLOCK_SIZE // Save space a buffer to be used to hold return buffer data. PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers PROLOG_PUSH "{r11,lr}" // Save caller's frame chain pointer and PC - // Setup the arguments to the transition thunk. - mov r1, r3 - #ifdef TRASH_SAVED_ARGUMENT_REGISTERS // Before calling out, trash all of the argument registers except the ones (r0, r1) that diff --git a/src/coreclr/runtime/arm/StubDispatch.S b/src/coreclr/runtime/arm/StubDispatch.S index 6c65e5b5dbac30..842f79995b34a3 100644 --- a/src/coreclr/runtime/arm/StubDispatch.S +++ b/src/coreclr/runtime/arm/StubDispatch.S @@ -14,9 +14,7 @@ NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler // r12 currently contains the indirection cell address. 
But we need more scratch registers and // we may A/V on a null this. Save r1 and r2 on the stack. - PROLOG_STACK_ALLOC 8 - str r1, [sp] - str r2, [sp, #4] + PROLOG_PUSH "{r1,r2}" // r12 currently holds the indirection cell address. We need to get the cache structure instead. ldr r2, [r12, #OFFSETOF__InterfaceDispatchCell__m_pCache] @@ -49,21 +47,15 @@ NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler // Point r12 to the indirection cell using the back pointer in the cache block ldr r12, [r2, #OFFSETOF__InterfaceDispatchCache__m_pCell] - ldr r1, [sp] - ldr r2, [sp, #4] - EPILOG_STACK_FREE 8 + EPILOG_POP "{r1,r2}" b C_FUNC(RhpInterfaceDispatchSlow) LOCAL_LABEL(RaceRetry_\entries): - ldr r1, [sp] - ldr r2, [sp, #4] - EPILOG_STACK_FREE 8 + EPILOG_POP "{r1,r2}" ldr pc, [r12] LOCAL_LABEL(99_\entries): - ldr r1, [sp] - ldr r2, [sp, #4] - EPILOG_STACK_FREE 8 + EPILOG_POP "{r1,r2}" EPILOG_BRANCH_REG r12 NESTED_END RhpInterfaceDispatch\entries, _TEXT From a3f998e324bb92267ef214e15cb62e1290c6b947 Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Mon, 15 Jun 2026 13:06:31 -0700 Subject: [PATCH 05/14] Apply suggestions from code review Co-authored-by: Jan Kotas --- .../nativeaot/Runtime/arm/UniversalTransition.S | 1 + src/coreclr/runtime/arm/StubDispatch.S | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S index 78ee662b8086e3..e00dc50a6b9b18 100644 --- a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S +++ b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S @@ -80,6 +80,7 @@ NESTED_ENTRY Rhp\FunctionName, _TEXT, NoHandler // Save argument registers (including floating point) and the return address. 
// Caller pushed 8 bytes: [sp]=extra arg, [sp+4]=target fn + .pad #8 PROLOG_PUSH "{r0-r3}" // Push the argument registers ldr r12, [sp, #20] // Capture target function (caller's [sp+4], now at sp+16+4) ldr r1, [sp, #16] // Capture extra arg (caller's [sp], now at sp+16) diff --git a/src/coreclr/runtime/arm/StubDispatch.S b/src/coreclr/runtime/arm/StubDispatch.S index 842f79995b34a3..64429e12a5c48b 100644 --- a/src/coreclr/runtime/arm/StubDispatch.S +++ b/src/coreclr/runtime/arm/StubDispatch.S @@ -12,14 +12,16 @@ .macro DEFINE_INTERFACE_DISPATCH_STUB entries NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler - // r12 currently contains the indirection cell address. But we need more scratch registers and - // we may A/V on a null this. Save r1 and r2 on the stack. + // r12 currently contains the indirection cell address. But we need more scratch registers. + // Save r1 and r2 on the stack. PROLOG_PUSH "{r1,r2}" // r12 currently holds the indirection cell address. We need to get the cache structure instead. ldr r2, [r12, #OFFSETOF__InterfaceDispatchCell__m_pCache] // Validate r2 is a cache pointer matching the expected cache size. + // This compensates for a race where the stub was updated to expect a larger cache + // but we loaded a stale (smaller or non-cache) m_pCache value. tst r2, #IDC_CACHE_POINTER_MASK bne LOCAL_LABEL(RaceRetry_\entries) ldr r1, [r2, #OFFSETOF__InterfaceDispatchCache__m_cEntries] @@ -31,6 +33,8 @@ NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler ldr r1, [r0] CurrentOffset = OFFSETOF__InterfaceDispatchCache__m_rgEntries + // For each entry in the cache, see if its MethodTable type matches the MethodTable in r1. + // If so, call the second cache entry. If not, skip the InterfaceDispatchCacheEntry. // R1 : Instance MethodTable* // R2 : Cache data structure // R12 : Trashed. On successful check, set to the target address to jump to. 
@@ -50,6 +54,8 @@ NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler EPILOG_POP "{r1,r2}" b C_FUNC(RhpInterfaceDispatchSlow) + // Race detected: r12 still holds the indirection cell address (not yet clobbered). + // Re-dispatch through the indirection cell to retry with the current stub and cache pair. LOCAL_LABEL(RaceRetry_\entries): EPILOG_POP "{r1,r2}" ldr pc, [r12] From 3998fca397565331a214bcf79062ed5c5de469eb Mon Sep 17 00:00:00 2001 From: Andrew Au <3410332+cshung@users.noreply.github.com> Date: Tue, 16 Jun 2026 04:19:36 +0000 Subject: [PATCH 06/14] Fix DispatchResolve.S: eliminate red zone, push args for UniversalTransition DispatchResolve.S was the only caller of RhpUniversalTransitionTailCall that still wrote arguments to the red zone ([sp, #-8], [sp, #-4]) instead of pushing them on the stack. The mismatch with the updated UniversalTransition.S (which now expects pushed args at [sp+0], [sp+4]) caused every interface dispatch cache miss to crash (SIGSEGV on startup). Additionally, the entire function used 28 bytes of red zone for register spills during the hash table probe. Convert from LEAF_ENTRY to NESTED_ENTRY and spill registers with push/pop ({r1,r2} on entry, {r3,r4,r5,r6,r8,lr} on the hash table path) to eliminate all red zone usage. Update the EHHelpers.cpp comment to describe the 8-byte {r1,r2} push; the SP adjustment itself stays at 8. Update StackFrameIterator.cpp CallerSP offset comments for accuracy.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/EHHelpers.cpp | 5 +- .../nativeaot/Runtime/StackFrameIterator.cpp | 10 +-- .../nativeaot/Runtime/arm/DispatchResolve.S | 87 ++++++++----------- 3 files changed, 43 insertions(+), 59 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp index 374dc8f22a7f8f..e4cd5ba8bcc0ae 100644 --- a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp @@ -327,9 +327,8 @@ static uintptr_t UnwindSimpleHelperToCaller( #elif defined(HOST_ARM) || defined(HOST_ARM64) uintptr_t adjustedFaultingIP = pContext->GetLr(); #if defined(HOST_ARM) - // Interface dispatch stubs allocate stack space (PROLOG_STACK_ALLOC 8) before the AV - // location. We must restore SP to the caller's original value so the exception handler - // sees the correct frame. + // Interface dispatch pushes {r1,r2} (8 bytes) before the potential null-this AV. + // Restore SP to the caller's original value. if (InInterfaceDispatchHelper(pContext->GetIp())) pContext->SetSp(pContext->GetSp() + 8); #endif diff --git a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp index d600acbbf3d8fe..22b0dd6e596e42 100644 --- a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp +++ b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp @@ -1424,11 +1424,11 @@ struct UniversalTransitionStackFrame // Conservative GC reporting must be applied to everything between the base of the // ReturnBlock and the top of the StackPassedArgs. 
private: - uintptr_t m_pushedR11; // ChildSP+000 CallerSP-078 (0x4 bytes) (r11) - uintptr_t m_pushedLR; // ChildSP+004 CallerSP-074 (0x4 bytes) (lr) - uint64_t m_fpArgRegs[8]; // ChildSP+008 CallerSP-070 (0x40 bytes) (d0-d7) - uint64_t m_returnBlock[4]; // ChildSP+048 CallerSP-030 (0x20 bytes) - uintptr_t m_intArgRegs[4]; // ChildSP+068 CallerSP-010 (0x10 bytes) (r0-r3) + uintptr_t m_pushedR11; // ChildSP+000 CallerSP-080 (0x4 bytes) (r11) + uintptr_t m_pushedLR; // ChildSP+004 CallerSP-07C (0x4 bytes) (lr) + uint64_t m_fpArgRegs[8]; // ChildSP+008 CallerSP-078 (0x40 bytes) (d0-d7) + uint64_t m_returnBlock[4]; // ChildSP+048 CallerSP-038 (0x20 bytes) + uintptr_t m_intArgRegs[4]; // ChildSP+068 CallerSP-018 (0x10 bytes) (r0-r3) uintptr_t m_callerPushedArgs[2]; // ChildSP+078 CallerSP-008 (0x8 bytes) (extra arg + target fn) uintptr_t m_stackPassedArgs[1]; // ChildSP+080 CallerSP+000 (unknown size) diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index ec2c22c8a7ad1a..113b13bc8a7ece 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -7,12 +7,12 @@ #include // Dispatching version of RhpResolveInterfaceMethod -LEAF_ENTRY RhpInterfaceDispatch, _TEXT - // r12 currently contains the indirection cell address. But we need more scratch registers and - // we may A/V on a null this. Store r1 and r2 in red zone. - str r1, [sp, #-8] - str r2, [sp, #-4] +NESTED_ENTRY RhpInterfaceDispatch, _TEXT, NoHandler + + // r12 currently contains the indirection cell address. We need scratch registers. + // Allocate stack frame for register spills. + PROLOG_PUSH "{r1,r2}" // Load the MethodTable from the object instance in r0. // The label marks the location of a potential nullref for the unwinder. @@ -25,25 +25,18 @@ LEAF_ENTRY RhpInterfaceDispatch, _TEXT bne LOCAL_LABEL(Hashtable) // Fast path: restore r1/r2 before jumping to cached code. 
- ldr r1, [sp, #-8] - ldr r2, [sp, #-4] + EPILOG_POP "{r1,r2}" // dmb ensures that the Code load below sees the value written before // MethodTable. ARM32 has no load-acquire instruction (LDAR is ARMv8 only). dmb ldr r12, [r12, #4] // load the cached monomorphic resolved code address - bx r12 + EPILOG_BRANCH_REG r12 LOCAL_LABEL(Hashtable): // r1 = MethodTable, r12 = indirection cell address // Look up the target in the dispatch cache hashtable (GenericCache). - // Spill additional registers to the red zone below sp - // so we don't modify sp (this is a LEAF_ENTRY with no unwind info). - str r3, [sp, #-12] - str lr, [sp, #-16] - str r4, [sp, #-20] - str r5, [sp, #-24] - str r6, [sp, #-28] + push {r3,r4,r5,r6,r8,lr} // Load the _table field (Entry[]) from the cache struct. PREPARE_EXTERNAL_VAR_INDIRECT g_pDispatchCache, r2 @@ -53,13 +46,13 @@ LOCAL_LABEL(Hashtable): // hash = (RotateLeft(dispatchCell, 16) ^ objectType) * GoldenRatio ror r3, r12, #16 eor r3, r3, r1 - movw lr, #0x79B9 - movt lr, #0x9E37 // lr = 0x9E3779B9 - mul r3, r3, lr + movw r8, #0x79B9 + movt r8, #0x9E37 // r8 = 0x9E3779B9 + mul r3, r3, r8 // HashToBucket: bucket = hash >> hashShift - ldrb lr, [r2, #8] - lsr r3, r3, lr + ldrb r8, [r2, #8] + lsr r3, r3, r8 mov r4, #0 // i = 0 @@ -75,15 +68,15 @@ LOCAL_LABEL(ProbeLoop): dmb // Compare key (dispatchCell, objectType) - ldr lr, [r5, #4] - cmp r12, lr + ldr r8, [r5, #4] + cmp r12, r8 bne LOCAL_LABEL(ProbeMiss) - ldr lr, [r5, #8] - cmp r1, lr + ldr r8, [r5, #8] + cmp r1, r8 bne LOCAL_LABEL(ProbeMiss) // Read the cached code pointer, then re-verify the version has not changed. - ldr lr, [r5, #12] + ldr r8, [r5, #12] dmb // Verify: (original version & ~1) == re-read version. @@ -94,17 +87,12 @@ LOCAL_LABEL(ProbeLoop): bne LOCAL_LABEL(CacheMiss) // Dispatch to cached target. 
- mov r12, lr + mov r12, r8 - ldr r6, [sp, #-28] - ldr r5, [sp, #-24] - ldr r4, [sp, #-20] - ldr r3, [sp, #-12] - ldr lr, [sp, #-16] - ldr r1, [sp, #-8] - ldr r2, [sp, #-4] + pop {r3,r4,r5,r6,r8,lr} + EPILOG_POP "{r1,r2}" - bx r12 + EPILOG_BRANCH_REG r12 LOCAL_LABEL(ProbeMiss): // If version is zero the rest of the bucket is unclaimed — stop probing. @@ -114,28 +102,25 @@ LOCAL_LABEL(ProbeMiss): // Quadratic reprobe: i++; index = (index + i) & tableMask add r4, r4, #1 add r3, r3, r4 - ldr lr, [r2, #4] - sub lr, lr, #2 - and r3, r3, lr + ldr r8, [r2, #4] + sub r8, r8, #2 + and r3, r3, r8 cmp r4, #8 blt LOCAL_LABEL(ProbeLoop) LOCAL_LABEL(CacheMiss): - ldr r6, [sp, #-28] - ldr r5, [sp, #-24] - ldr r4, [sp, #-20] - -LOCAL_LABEL(SlowPath): - // restore original value of r1, r2, r3, lr - ldr r3, [sp, #-12] - ldr lr, [sp, #-16] - ldr r1, [sp, #-8] - ldr r2, [sp, #-4] - - str r12, [sp, #-8] + pop {r3,r4,r5,r6,r8,lr} + + // restore original value of r1, r2 + EPILOG_POP "{r1,r2}" + + // Push args for RhpUniversalTransitionTailCall: + // [sp+0] = extra arg (indirection cell), [sp+4] = target fn (RhpCidResolve) + sub sp, sp, #8 + str r12, [sp, #0] PREPARE_EXTERNAL_VAR RhpCidResolve, r12 - str r12, [sp, #-4] + str r12, [sp, #4] b C_FUNC(RhpUniversalTransitionTailCall) -LEAF_END RhpInterfaceDispatch, _TEXT +NESTED_END RhpInterfaceDispatch, _TEXT From 69e2dd3c6085d8b5db58fec5642341dab9b7eb16 Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Tue, 16 Jun 2026 22:49:33 -0700 Subject: [PATCH 07/14] Apply suggestions from code review Co-authored-by: Jan Kotas --- src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index 113b13bc8a7ece..ec0797f501621e 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -36,7 +36,7 @@ LOCAL_LABEL(Hashtable): 
// r1 = MethodTable, r12 = indirection cell address // Look up the target in the dispatch cache hashtable (GenericCache). - push {r3,r4,r5,r6,r8,lr} + PROLOG_PUSH "{r3,r4,r5,r6,r8}" // Load the _table field (Entry[]) from the cache struct. PREPARE_EXTERNAL_VAR_INDIRECT g_pDispatchCache, r2 @@ -89,7 +89,7 @@ LOCAL_LABEL(ProbeLoop): // Dispatch to cached target. mov r12, r8 - pop {r3,r4,r5,r6,r8,lr} + EPILOG_POP "{r3,r4,r5,r6,r8}" EPILOG_POP "{r1,r2}" EPILOG_BRANCH_REG r12 @@ -109,7 +109,7 @@ LOCAL_LABEL(ProbeMiss): blt LOCAL_LABEL(ProbeLoop) LOCAL_LABEL(CacheMiss): - pop {r3,r4,r5,r6,r8,lr} + EPILOG_POP "{r3,r4,r5,r6,r8}" // restore original value of r1, r2 EPILOG_POP "{r1,r2}" From 3bab15b698675aef2c7bfa042757518f9b2be2da Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Tue, 16 Jun 2026 22:51:01 -0700 Subject: [PATCH 08/14] Apply suggestion from @jkotas --- src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index ec0797f501621e..b85f1d86a00b00 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -25,7 +25,7 @@ NESTED_ENTRY RhpInterfaceDispatch, _TEXT, NoHandler bne LOCAL_LABEL(Hashtable) // Fast path: restore r1/r2 before jumping to cached code. - EPILOG_POP "{r1,r2}" + EPILOG_POP "{r1,r2}" // dmb ensures that the Code load below sees the value written before // MethodTable. ARM32 has no load-acquire instruction (LDAR is ARMv8 only). 
dmb From 00b526ef6cea6f8fd02eba4deaf014c12681fe6b Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Tue, 16 Jun 2026 22:52:56 -0700 Subject: [PATCH 09/14] Apply suggestion from @jkotas --- src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index b85f1d86a00b00..f012e279e2af7f 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -89,8 +89,8 @@ LOCAL_LABEL(ProbeLoop): // Dispatch to cached target. mov r12, r8 - EPILOG_POP "{r3,r4,r5,r6,r8}" - EPILOG_POP "{r1,r2}" + EPILOG_POP "{r3,r4,r5,r6,r8}" + EPILOG_POP "{r1,r2}" EPILOG_BRANCH_REG r12 From 78bdf7f669a8a7ad8819af6b00cbdb6293a033ed Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Tue, 16 Jun 2026 22:53:35 -0700 Subject: [PATCH 10/14] Apply suggestion from @jkotas --- src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index f012e279e2af7f..f0f98cc329eed7 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -112,7 +112,7 @@ LOCAL_LABEL(CacheMiss): EPILOG_POP "{r3,r4,r5,r6,r8}" // restore original value of r1, r2 - EPILOG_POP "{r1,r2}" + EPILOG_POP "{r1,r2}" // Push args for RhpUniversalTransitionTailCall: // [sp+0] = extra arg (indirection cell), [sp+4] = target fn (RhpCidResolve) From 5ffee391af2de9a1479106b347acbfa0dff54da2 Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Tue, 16 Jun 2026 23:16:18 -0700 Subject: [PATCH 11/14] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index f0f98cc329eed7..ef0010e17d93e6 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -116,7 +116,7 @@ LOCAL_LABEL(CacheMiss): // Push args for RhpUniversalTransitionTailCall: // [sp+0] = extra arg (indirection cell), [sp+4] = target fn (RhpCidResolve) - sub sp, sp, #8 + PROLOG_STACK_ALLOC 8 str r12, [sp, #0] PREPARE_EXTERNAL_VAR RhpCidResolve, r12 str r12, [sp, #4] From bf61b67e4e38102c94e7e6784c0f82f2e5f5fef6 Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Wed, 17 Jun 2026 10:03:44 -0700 Subject: [PATCH 12/14] Apply suggestion from @jkotas --- src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index ef0010e17d93e6..16fcf79e95cab7 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -34,6 +34,9 @@ NESTED_ENTRY RhpInterfaceDispatch, _TEXT, NoHandler LOCAL_LABEL(Hashtable): + // Match what the fast path has pushed. + .save {r1,r2} + // r1 = MethodTable, r12 = indirection cell address // Look up the target in the dispatch cache hashtable (GenericCache). 
PROLOG_PUSH "{r3,r4,r5,r6,r8}" From bcae72e3a93fd6e623e425a7582913a5221432a9 Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Wed, 17 Jun 2026 21:23:18 -0700 Subject: [PATCH 13/14] Apply suggestion from @MichalStrehovsky MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Michal Strehovský --- src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index 16fcf79e95cab7..f686968ee72512 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -12,6 +12,7 @@ NESTED_ENTRY RhpInterfaceDispatch, _TEXT, NoHandler // r12 currently contains the indirection cell address. We need scratch registers. // Allocate stack frame for register spills. + // Unwinder special cases this push to be able to unwind out of the potential nullref below. PROLOG_PUSH "{r1,r2}" // Load the MethodTable from the object instance in r0. 
From 62a6695b17f5cee28e4ee986de0de8b578a86a84 Mon Sep 17 00:00:00 2001 From: Andrew Au <3410332+cshung@users.noreply.github.com> Date: Thu, 18 Jun 2026 16:37:14 +0000 Subject: [PATCH 14/14] Address review feedback: preserve original frame layout, use PROLOG_PUSH/EPILOG_POP - DispatchResolve.S: use PROLOG_PUSH/EPILOG_POP for {r3,r4,r5,r6,r8}, add .save {r1,r2} at Hashtable entry, drop lr from push list - UniversalTransition.S: rewrite prolog to preserve original frame layout (push r0-r1, capture caller args, store r2-r3 into caller slots) - StackFrameIterator.cpp: revert to original UniversalTransitionStackFrame layout (no m_callerPushedArgs) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../nativeaot/Runtime/StackFrameIterator.cpp | 13 +++++----- .../nativeaot/Runtime/arm/DispatchResolve.S | 6 +---- .../Runtime/arm/UniversalTransition.S | 25 +++++++++++-------- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp index 22b0dd6e596e42..d3b67edb83c65b 100644 --- a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp +++ b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp @@ -1424,13 +1424,12 @@ struct UniversalTransitionStackFrame // Conservative GC reporting must be applied to everything between the base of the // ReturnBlock and the top of the StackPassedArgs. 
private: - uintptr_t m_pushedR11; // ChildSP+000 CallerSP-080 (0x4 bytes) (r11) - uintptr_t m_pushedLR; // ChildSP+004 CallerSP-07C (0x4 bytes) (lr) - uint64_t m_fpArgRegs[8]; // ChildSP+008 CallerSP-078 (0x40 bytes) (d0-d7) - uint64_t m_returnBlock[4]; // ChildSP+048 CallerSP-038 (0x20 bytes) - uintptr_t m_intArgRegs[4]; // ChildSP+068 CallerSP-018 (0x10 bytes) (r0-r3) - uintptr_t m_callerPushedArgs[2]; // ChildSP+078 CallerSP-008 (0x8 bytes) (extra arg + target fn) - uintptr_t m_stackPassedArgs[1]; // ChildSP+080 CallerSP+000 (unknown size) + uintptr_t m_pushedR11; // ChildSP+000 CallerSP-078 (0x4 bytes) (r11) + uintptr_t m_pushedLR; // ChildSP+004 CallerSP-074 (0x4 bytes) (lr) + uint64_t m_fpArgRegs[8]; // ChildSP+008 CallerSP-070 (0x40 bytes) (d0-d7) + uint64_t m_returnBlock[4]; // ChildSP+048 CallerSP-030 (0x20 bytes) + uintptr_t m_intArgRegs[4]; // ChildSP+068 CallerSP-010 (0x10 bytes) (r0-r3) + uintptr_t m_stackPassedArgs[1]; // ChildSP+078 CallerSP+000 (unknown size) public: PTR_uintptr_t get_CallerSP() { return GET_POINTER_TO_FIELD(m_stackPassedArgs[0]); } diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index f686968ee72512..351fc74b10a2e4 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -10,8 +10,6 @@ NESTED_ENTRY RhpInterfaceDispatch, _TEXT, NoHandler - // r12 currently contains the indirection cell address. We need scratch registers. - // Allocate stack frame for register spills. // Unwinder special cases this push to be able to unwind out of the potential nullref below. PROLOG_PUSH "{r1,r2}" @@ -40,7 +38,7 @@ LOCAL_LABEL(Hashtable): // r1 = MethodTable, r12 = indirection cell address // Look up the target in the dispatch cache hashtable (GenericCache). - PROLOG_PUSH "{r3,r4,r5,r6,r8}" + PROLOG_PUSH "{r3,r4,r5,r6,r8}" // Load the _table field (Entry[]) from the cache struct. 
PREPARE_EXTERNAL_VAR_INDIRECT g_pDispatchCache, r2 @@ -114,8 +112,6 @@ LOCAL_LABEL(ProbeMiss): LOCAL_LABEL(CacheMiss): EPILOG_POP "{r3,r4,r5,r6,r8}" - - // restore original value of r1, r2 EPILOG_POP "{r1,r2}" // Push args for RhpUniversalTransitionTailCall: diff --git a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S index e00dc50a6b9b18..8d942868421597 100644 --- a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S +++ b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S @@ -55,13 +55,15 @@ // // Frame layout is: // -// {StackPassedArgs} ChildSP+080 CallerSP+000 -// {CallerPushedArgs (extra+target) (8 bytes)} ChildSP+078 CallerSP-008 -// {IntArgRegs (r0-r3) (0x10 bytes)} ChildSP+068 CallerSP-018 -// {ReturnBlock (0x20 bytes)} ChildSP+048 CallerSP-038 -// {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+008 CallerSP-078 -// {PushedLR} ChildSP+004 CallerSP-07C -// {PushedR11} ChildSP+000 CallerSP-080 +// {StackPassedArgs} ChildSP+078 CallerSP+000 +// {IntArgRegs (r0-r3) (0x10 bytes)} ChildSP+068 CallerSP-010 +// {ReturnBlock (0x20 bytes)} ChildSP+048 CallerSP-030 +// -- The base address of the Return block is the TransitionBlock pointer, the floating point args are +// in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact +// layout of all pieces of the frame that lie at or above the pushed floating point registers. +// {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+008 CallerSP-070 +// {PushedLR} ChildSP+004 CallerSP-074 +// {PushedR11} ChildSP+000 CallerSP-078 // // NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure // must be updated as well. @@ -81,9 +83,11 @@ NESTED_ENTRY Rhp\FunctionName, _TEXT, NoHandler // Save argument registers (including floating point) and the return address. 
// Caller pushed 8 bytes: [sp]=extra arg, [sp+4]=target fn .pad #8 - PROLOG_PUSH "{r0-r3}" // Push the argument registers - ldr r12, [sp, #20] // Capture target function (caller's [sp+4], now at sp+16+4) - ldr r1, [sp, #16] // Capture extra arg (caller's [sp], now at sp+16) + PROLOG_PUSH "{r0-r1}" + ldr r12, [sp, #12] // Capture target function (caller's [sp+4], now at sp+8+4) + ldr r1, [sp, #8] // Capture extra arg (caller's [sp], now at sp+8) + str r3, [sp, #12] // Store remaining arg registers into the space used for the hidden args + str r2, [sp, #8] PROLOG_STACK_ALLOC RETURN_BLOCK_SIZE // Save space a buffer to be used to hold return buffer data. PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers PROLOG_PUSH "{r11,lr}" // Save caller's frame chain pointer and PC @@ -129,7 +133,6 @@ GLOBAL_LABEL ReturnFrom\FunctionName EPILOG_VPOP {d0-d7} EPILOG_STACK_FREE RETURN_BLOCK_SIZE // pop return block conservatively reported area EPILOG_POP "{r0-r3}" - EPILOG_STACK_FREE 8 // free the 8 bytes pushed by the caller // Tailcall to the target address. EPILOG_BRANCH_REG r12