diff --git a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp index 30d6ab453632bd..e4cd5ba8bcc0ae 100644 --- a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp @@ -326,6 +326,12 @@ static uintptr_t UnwindSimpleHelperToCaller( pContext->SetSp(sp+sizeof(uintptr_t)); // pop the stack #elif defined(HOST_ARM) || defined(HOST_ARM64) uintptr_t adjustedFaultingIP = pContext->GetLr(); +#if defined(HOST_ARM) + // Interface dispatch pushes {r1,r2} (8 bytes) before the potential null-this AV. + // Restore SP to the caller's original value. + if (InInterfaceDispatchHelper(pContext->GetIp())) + pContext->SetSp(pContext->GetSp() + 8); +#endif #elif defined(HOST_LOONGARCH64) || defined(HOST_RISCV64) uintptr_t adjustedFaultingIP = pContext->GetRa(); #else diff --git a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp index f29c73333e1184..9e00ebeded19a3 100644 --- a/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp +++ b/src/coreclr/nativeaot/Runtime/ThunksMapping.cpp @@ -19,7 +19,7 @@ #elif TARGET_X86 #define THUNK_SIZE 12 #elif TARGET_ARM -#define THUNK_SIZE 20 +#define THUNK_SIZE 12 #elif TARGET_ARM64 #define THUNK_SIZE 16 #elif TARGET_LOONGARCH64 @@ -202,26 +202,16 @@ EXTERN_C HRESULT QCALLTYPE RhAllocateThunksMapping(void** ppThunksSection) #elif TARGET_ARM // mov r12, - // str r12,[sp,#-4] - // ldr r12,[r12, ] + // r12 retains data address; RhCommonStub reads it directly without stack EncodeThumb2Mov32((uint16_t*)pCurrentThunkAddress, (uint32_t)pCurrentDataAddress, 12); pCurrentThunkAddress += 8; - *((uint32_t*)pCurrentThunkAddress) = 0xcc04f84d; + // ldr pc, [r12, #offset] + *((uint32_t*)pCurrentThunkAddress) = 0xf000f8dc | ((OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2)) << 16); pCurrentThunkAddress += 4; - *((uint32_t*)pCurrentThunkAddress) = 0xc000f8dc | ((OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2)) << 16); - pCurrentThunkAddress += 4; - - *((uint16_t*)pCurrentThunkAddress) = 0x4760; - pCurrentThunkAddress += 2; - - // nops for alignment - *((uint16_t*)pCurrentThunkAddress) = 0xbf00; - pCurrentThunkAddress += 2; - #elif TARGET_ARM64 //adr xip0, diff --git a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S index ec2c22c8a7ad1a..351fc74b10a2e4 100644 --- a/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S +++ b/src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S @@ -7,12 +7,11 @@ #include // Dispatching version of RhpResolveInterfaceMethod -LEAF_ENTRY RhpInterfaceDispatch, _TEXT - // r12 currently contains the indirection cell address. But we need more scratch registers and - // we may A/V on a null this. Store r1 and r2 in red zone. - str r1, [sp, #-8] - str r2, [sp, #-4] +NESTED_ENTRY RhpInterfaceDispatch, _TEXT, NoHandler + + // Unwinder special cases this push to be able to unwind out of the potential nullref below. + PROLOG_PUSH "{r1,r2}" // Load the MethodTable from the object instance in r0. // The label marks the location of a potential nullref for the unwinder. @@ -25,25 +24,21 @@ LEAF_ENTRY RhpInterfaceDispatch, _TEXT bne LOCAL_LABEL(Hashtable) // Fast path: restore r1/r2 before jumping to cached code. - ldr r1, [sp, #-8] - ldr r2, [sp, #-4] + EPILOG_POP "{r1,r2}" // dmb ensures that the Code load below sees the value written before // MethodTable. ARM32 has no load-acquire instruction (LDAR is ARMv8 only). dmb ldr r12, [r12, #4] // load the cached monomorphic resolved code address - bx r12 + EPILOG_BRANCH_REG r12 LOCAL_LABEL(Hashtable): + // Match what the fast path has pushed. + .save {r1,r2} + // r1 = MethodTable, r12 = indirection cell address // Look up the target in the dispatch cache hashtable (GenericCache). - // Spill additional registers to the red zone below sp - // so we don't modify sp (this is a LEAF_ENTRY with no unwind info). - str r3, [sp, #-12] - str lr, [sp, #-16] - str r4, [sp, #-20] - str r5, [sp, #-24] - str r6, [sp, #-28] + PROLOG_PUSH "{r3,r4,r5,r6,r8}" // Load the _table field (Entry[]) from the cache struct. PREPARE_EXTERNAL_VAR_INDIRECT g_pDispatchCache, r2 @@ -53,13 +48,13 @@ LOCAL_LABEL(Hashtable): // hash = (RotateLeft(dispatchCell, 16) ^ objectType) * GoldenRatio ror r3, r12, #16 eor r3, r3, r1 - movw lr, #0x79B9 - movt lr, #0x9E37 // lr = 0x9E3779B9 - mul r3, r3, lr + movw r8, #0x79B9 + movt r8, #0x9E37 // r8 = 0x9E3779B9 + mul r3, r3, r8 // HashToBucket: bucket = hash >> hashShift - ldrb lr, [r2, #8] - lsr r3, r3, lr + ldrb r8, [r2, #8] + lsr r3, r3, r8 mov r4, #0 // i = 0 @@ -75,15 +70,15 @@ LOCAL_LABEL(ProbeLoop): dmb // Compare key (dispatchCell, objectType) - ldr lr, [r5, #4] - cmp r12, lr + ldr r8, [r5, #4] + cmp r12, r8 bne LOCAL_LABEL(ProbeMiss) - ldr lr, [r5, #8] - cmp r1, lr + ldr r8, [r5, #8] + cmp r1, r8 bne LOCAL_LABEL(ProbeMiss) // Read the cached code pointer, then re-verify the version has not changed. - ldr lr, [r5, #12] + ldr r8, [r5, #12] dmb // Verify: (original version & ~1) == re-read version. @@ -94,17 +89,12 @@ LOCAL_LABEL(ProbeLoop): bne LOCAL_LABEL(CacheMiss) // Dispatch to cached target. - mov r12, lr + mov r12, r8 - ldr r6, [sp, #-28] - ldr r5, [sp, #-24] - ldr r4, [sp, #-20] - ldr r3, [sp, #-12] - ldr lr, [sp, #-16] - ldr r1, [sp, #-8] - ldr r2, [sp, #-4] + EPILOG_POP "{r3,r4,r5,r6,r8}" + EPILOG_POP "{r1,r2}" - bx r12 + EPILOG_BRANCH_REG r12 LOCAL_LABEL(ProbeMiss): // If version is zero the rest of the bucket is unclaimed — stop probing. @@ -114,28 +104,23 @@ LOCAL_LABEL(ProbeMiss): // Quadratic reprobe: i++; index = (index + i) & tableMask add r4, r4, #1 add r3, r3, r4 - ldr lr, [r2, #4] - sub lr, lr, #2 - and r3, r3, lr + ldr r8, [r2, #4] + sub r8, r8, #2 + and r3, r3, r8 cmp r4, #8 blt LOCAL_LABEL(ProbeLoop) LOCAL_LABEL(CacheMiss): - ldr r6, [sp, #-28] - ldr r5, [sp, #-24] - ldr r4, [sp, #-20] - -LOCAL_LABEL(SlowPath): - // restore original value of r1, r2, r3, lr - ldr r3, [sp, #-12] - ldr lr, [sp, #-16] - ldr r1, [sp, #-8] - ldr r2, [sp, #-4] - - str r12, [sp, #-8] + EPILOG_POP "{r3,r4,r5,r6,r8}" + EPILOG_POP "{r1,r2}" + + // Push args for RhpUniversalTransitionTailCall: + // [sp+0] = extra arg (indirection cell), [sp+4] = target fn (RhpCidResolve) + PROLOG_STACK_ALLOC 8 + str r12, [sp, #0] PREPARE_EXTERNAL_VAR RhpCidResolve, r12 - str r12, [sp, #-4] + str r12, [sp, #4] b C_FUNC(RhpUniversalTransitionTailCall) -LEAF_END RhpInterfaceDispatch, _TEXT +NESTED_END RhpInterfaceDispatch, _TEXT diff --git a/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S b/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S index dc2babe8b6ab02..20c5c3a9914b21 100644 --- a/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S +++ b/src/coreclr/nativeaot/Runtime/arm/InteropThunksHelpers.S @@ -14,9 +14,8 @@ // NESTED_ENTRY RhCommonStub, _TEXT, NoHandler // Custom calling convention: - // red zone has pointer to the current thunk's data block (data contains 2 pointer values: context + target pointers) - // Copy red zone value into r12 so that the PROLOG_PUSH doesn't destroy it - ldr r12, [sp, #-4] + // r12 already has the current thunk's data block pointer + // (thunk branched here via ldr pc, preserving r12) PROLOG_PUSH "{r0-r4, lr}" PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers diff --git a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S index c0373dd9db7be3..8d942868421597 100644 --- a/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S +++ b/src/coreclr/nativeaot/Runtime/arm/UniversalTransition.S @@ -43,15 +43,15 @@ // // At input to this function, r0-3, d0-7 and the stack may contain any number of arguments. // -// In addition, there are 2 extra arguments passed in the RED ZONE (8 byte negative space -// off of sp). -// sp-4 will contain the managed function that is to be called by this transition function -// sp-8 will contain the pointer sized extra argument to the managed function +// In addition, there are 2 extra arguments passed on the stack. The caller pushes them +// (sp adjusted by -8 before branching here): +// [sp+4] = managed function to be called +// [sp+0] = pointer sized extra argument // // When invoking the callee: // // r0 shall contain a pointer to the TransitionBlock -// r1 shall contain the value that was in sp-8 at entry to this function +// r1 shall contain the value that was in [sp+0] at entry to this function // // Frame layout is: // @@ -81,18 +81,17 @@ NESTED_ENTRY Rhp\FunctionName, _TEXT, NoHandler // Save argument registers (including floating point) and the return address. - // NOTE: While we do that, capture the two arguments in the red zone into r12 and r3. - ldr r12, [sp, #-4] // Capture first argument from red zone into r12 - PROLOG_PUSH "{r3}" // Push r3 - ldr r3, [sp, #-4] // Capture second argument from red zone into r3 - PROLOG_PUSH "{r0-r2}" // Push the rest of the registers + // Caller pushed 8 bytes: [sp]=extra arg, [sp+4]=target fn + .pad #8 + PROLOG_PUSH "{r0-r1}" + ldr r12, [sp, #12] // Capture target function (caller's [sp+4], now at sp+8+4) + ldr r1, [sp, #8] // Capture extra arg (caller's [sp], now at sp+8) + str r3, [sp, #12] // Store remaining arg registers into the space used for the hidden args + str r2, [sp, #8] PROLOG_STACK_ALLOC RETURN_BLOCK_SIZE // Save space a buffer to be used to hold return buffer data. PROLOG_VPUSH {d0-d7} // Capture the floating point argument registers PROLOG_PUSH "{r11,lr}" // Save caller's frame chain pointer and PC - // Setup the arguments to the transition thunk. - mov r1, r3 - #ifdef TRASH_SAVED_ARGUMENT_REGISTERS // Before calling out, trash all of the argument registers except the ones (r0, r1) that diff --git a/src/coreclr/runtime/arm/StubDispatch.S b/src/coreclr/runtime/arm/StubDispatch.S index f196ea9a076379..64429e12a5c48b 100644 --- a/src/coreclr/runtime/arm/StubDispatch.S +++ b/src/coreclr/runtime/arm/StubDispatch.S @@ -12,10 +12,9 @@ .macro DEFINE_INTERFACE_DISPATCH_STUB entries NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler - // r12 currently contains the indirection cell address. But we need more scratch registers and - // we may A/V on a null this. Store r1 and r2 in red zone. - str r1, [sp, #-8] - str r2, [sp, #-4] + // r12 currently contains the indirection cell address. But we need more scratch registers. + // Save r1 and r2 on the stack. + PROLOG_PUSH "{r1,r2}" // r12 currently holds the indirection cell address. We need to get the cache structure instead. ldr r2, [r12, #OFFSETOF__InterfaceDispatchCell__m_pCache] @@ -37,7 +36,7 @@ NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler // For each entry in the cache, see if its MethodTable type matches the MethodTable in r1. // If so, call the second cache entry. If not, skip the InterfaceDispatchCacheEntry. // R1 : Instance MethodTable* - // R2: Cache data structure + // R2 : Cache data structure // R12 : Trashed. On successful check, set to the target address to jump to. .rept \entries ldr r12, [r2, #CurrentOffset] @@ -52,28 +51,17 @@ NESTED_ENTRY RhpInterfaceDispatch\entries, _TEXT, NoHandler // Point r12 to the indirection cell using the back pointer in the cache block ldr r12, [r2, #OFFSETOF__InterfaceDispatchCache__m_pCell] - ldr r1, [sp, #-8] - ldr r2, [sp, #-4] + EPILOG_POP "{r1,r2}" b C_FUNC(RhpInterfaceDispatchSlow) // Race detected: r12 still holds the indirection cell address (not yet clobbered). // Re-dispatch through the indirection cell to retry with the current stub and cache pair. - // ldr pc, [r12] branches to the current m_pStub without clobbering r12. LOCAL_LABEL(RaceRetry_\entries): - ldr r1, [sp, #-8] - ldr r2, [sp, #-4] + EPILOG_POP "{r1,r2}" ldr pc, [r12] - // Common epilog for cache hits. Have to out of line it here due to limitation on the number of - // epilogs imposed by the unwind code macros. LOCAL_LABEL(99_\entries): - // R2 contains address of the cache block. We store it in the red zone in case the target we jump - // to needs it. - // R12 contains the target address to jump to - ldr r1, [sp, #-8] - // We have to store R2 with address of the cache block into red zone before restoring original r2. - str r2, [sp, #-8] - ldr r2, [sp, #-4] + EPILOG_POP "{r1,r2}" EPILOG_BRANCH_REG r12 NESTED_END RhpInterfaceDispatch\entries, _TEXT @@ -108,17 +96,18 @@ LEAF_END RhpInitialDynamicInterfaceDispatch, _TEXT // Cache miss case, call the runtime to resolve the target and update the cache. // Use universal transition helper to allow an exception to flow out of resolution -LEAF_ENTRY RhpInterfaceDispatchSlow, _TEXT +NESTED_ENTRY RhpInterfaceDispatchSlow, _TEXT, NoHandler // r12 has the interface dispatch cell address in it. - // The calling convention of the universal thunk is that the parameter - // for the universal thunk target is to be placed in sp-8 - // and the universal thunk target address is to be placed in sp-4 - str r12, [sp, #-8] + // Push the two arguments that the universal transition thunk expects: + // [sp] = parameter for the universal thunk target (cell address) + // [sp+4] = universal thunk target address (RhpCidResolve) + PROLOG_STACK_ALLOC 8 + str r12, [sp] PREPARE_EXTERNAL_VAR RhpCidResolve, r12 - str r12, [sp, #-4] + str r12, [sp, #4] // jump to universal transition thunk b C_FUNC(RhpUniversalTransitionTailCall) -LEAF_END RhpInterfaceDispatchSlow, _TEXT +NESTED_END RhpInterfaceDispatchSlow, _TEXT #endif // FEATURE_CACHED_INTERFACE_DISPATCH diff --git a/src/coreclr/vm/arm/virtualcallstubcpu.hpp b/src/coreclr/vm/arm/virtualcallstubcpu.hpp index ede098dde691bf..3f49e99cd2e5d5 100644 --- a/src/coreclr/vm/arm/virtualcallstubcpu.hpp +++ b/src/coreclr/vm/arm/virtualcallstubcpu.hpp @@ -288,8 +288,8 @@ struct VTableCallStub size_t cbSize = 4; // First ldr instruction - // If we never save r0 to the red zone, we have the short version of the stub - if (*(UINT32*)(&pStubCode[cbSize]) != 0x0c04f84d) + // If we never save r0 to the stack, we have the short version of the stub + if (*(UINT32*)(&pStubCode[cbSize]) != 0x0d04f84d) { return 4 + // ldr r12,[r0] @@ -299,7 +299,7 @@ struct VTableCallStub 4; // Slot value (data storage, not a real instruction) } - cbSize += 4; // Saving r0 into red zone + cbSize += 4; // Saving r0 (push) cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 4 : 12); // Loading of vtable into r12 cbSize += (*(WORD*)(&pStubCode[cbSize]) == 0xf8dc ? 4 : 12); // Loading of targe address into r12 @@ -335,7 +335,7 @@ struct VTableCallHolder int indirectionsSize = (offsetOfIndirection > 0xFFF ? 12 : 4) + (offsetAfterIndirection > 0xFFF ? 12 : 4); if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) - indirectionsSize += 8; // Save/restore r0 using red zone + indirectionsSize += 8; // Save/restore r0 (push/pop) return 6 + indirectionsSize + 4; } @@ -429,8 +429,8 @@ void VTableCallHolder::Initialize(unsigned slot) if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) { - // str r0, [sp, #-4]. Save r0 in the red zone - *(UINT32*)p = 0x0c04f84d; p += 4; + // str r0, [sp, #-4]! + *(UINT32*)p = 0x0d04f84d; p += 4; } if (offsetOfIndirection > 0xFFF) @@ -463,8 +463,8 @@ void VTableCallHolder::Initialize(unsigned slot) if (offsetOfIndirection > 0xFFF || offsetAfterIndirection > 0xFFF) { - // ldr r0, [sp, #-4]. Restore r0 from the red zone. - *(UINT32*)p = 0x0c04f85d; p += 4; + // ldr r0, [sp], #4 + *(UINT32*)p = 0x0b04f85d; p += 4; } // bx r12