pulseengine · avrabe · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/crates/synth-backend/src/arm_encoder.rs b/crates/synth-backend/src/arm_encoder.rs
@@ -564,11 +564,12 @@ impl ArmEncoder {
             ArmOp::F32Sqrt { sd, sm } => encode_vfp_2reg(0xEEB10AC0, sd, sm),
 
             // f32 pseudo-ops — multi-instruction sequences
+            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
             ArmOp::F32Ceil { sd, sm } => {
-                return self.encode_arm_f32_rounding(sd, sm, 0b10); // VCVT toward +Inf
+                return self.encode_arm_f32_rounding(sd, sm, 0b01); // Round toward +Inf
             }
             ArmOp::F32Floor { sd, sm } => {
-                return self.encode_arm_f32_rounding(sd, sm, 0b01); // VCVT toward -Inf
+                return self.encode_arm_f32_rounding(sd, sm, 0b10); // Round toward -Inf
             }
             ArmOp::F32Trunc { sd, sm } => {
                 return self.encode_arm_f32_rounding(sd, sm, 0b11); // VCVT toward zero
@@ -646,11 +647,12 @@ impl ArmEncoder {
             ArmOp::F64Sqrt { dd, dm } => encode_vfp_2reg_f64(0xEEB10BC0, dd, dm),
 
             // f64 pseudo-ops
+            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
             ArmOp::F64Ceil { dd, dm } => {
-                return self.encode_arm_f64_rounding(dd, dm, 0b10);
+                return self.encode_arm_f64_rounding(dd, dm, 0b01);
             }
             ArmOp::F64Floor { dd, dm } => {
-                return self.encode_arm_f64_rounding(dd, dm, 0b01);
+                return self.encode_arm_f64_rounding(dd, dm, 0b10);
             }
             ArmOp::F64Trunc { dd, dm } => {
                 return self.encode_arm_f64_rounding(dd, dm, 0b11);
@@ -826,20 +828,59 @@ impl ArmEncoder {
     /// Strategy: VCVT.S32.F32 Sd, Sm (toward zero), then VCVT.F32.S32 Sd, Sd
     /// For ceil/floor/nearest, we use VCVTR (round toward mode) + convert back.
     /// Simplified: convert to int (toward zero for trunc) then back to float.
-    fn encode_arm_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, _mode: u8) -> Result<Vec<u8>> {
+    /// Encode F32 rounding as ARM32.
+    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
+    ///
+    /// For trunc (mode=0b11): uses VCVTR.S32.F32 (always rounds toward zero).
+    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant
+    /// which honours FPSCR rmode), then restores FPSCR.
+    fn encode_arm_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
         let mut bytes = Vec::new();
         let sm_num = vfp_sreg_to_num(sm);
         let sd_num = vfp_sreg_to_num(sd);
         let (vd_s, d_s) = encode_sreg(sd_num);
         let (vm_s, m_s) = encode_sreg(sm_num);
 
-        // VCVT.S32.F32 Sd, Sm (truncate toward zero)
-        // 0xEEBD0AC0 | D << 22 | Vd << 12 | M << 5 | Vm
-        let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
-        bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
+        if mode == 0b11 {
+            // Trunc (toward zero): VCVTR.S32.F32 — the "R" variant always truncates.
+            // 0xEEBD0AC0: bit[7]=1 => round toward zero regardless of FPSCR
+            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
+            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
+        } else {
+            // ceil/floor/nearest: manipulate FPSCR rounding mode
+            let rt: u32 = 12; // R12/IP as temp
+
+            // VMRS R12, FPSCR
+            let vmrs = 0xEEF10A10 | (rt << 12);
+            bytes.extend_from_slice(&vmrs.to_le_bytes());
+
+            // BIC R12, R12, #(3 << 22) — clear RMode bits [23:22]
+            // 3<<22 = 0x00C00000. ARM rotated imm: 0x03 ror 10 (rotation=5, imm8=0x03)
+            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
+            bytes.extend_from_slice(&bic.to_le_bytes());
 
-        // VCVT.F32.S32 Sd, Sd (convert back to float)
-        // 0xEEB80A40 | D << 22 | Vd << 12 | M << 5 | Vm
+            // ORR R12, R12, #(mode << 22) — set desired rounding mode
+            if mode != 0 {
+                // mode<<22: rotation=5, imm8=mode
+                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
+                bytes.extend_from_slice(&orr.to_le_bytes());
+            }
+
+            // VMSR FPSCR, R12
+            let vmsr = 0xEEE10A10 | (rt << 12);
+            bytes.extend_from_slice(&vmsr.to_le_bytes());
+
+            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rounding mode
+            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
+            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
+
+            // Restore FPSCR: clear rmode bits back to nearest (default)
+            bytes.extend_from_slice(&vmrs.to_le_bytes());
+            bytes.extend_from_slice(&bic.to_le_bytes());
+            bytes.extend_from_slice(&vmsr.to_le_bytes());
+        }
+
+        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
         let (vd2, d2) = encode_sreg(sd_num);
         let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
         bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
@@ -1035,21 +1076,56 @@ impl ArmEncoder {
     }
 
     /// Encode F64 rounding pseudo-op as ARM32 via VCVT to integer and back.
-    fn encode_arm_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, _mode: u8) -> Result<Vec<u8>> {
+    /// Encode F64 rounding as ARM32.
+    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
+    ///
+    /// For trunc: uses VCVTR.S32.F64 (always truncates).
+    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F64 (non-R variant),
+    /// then restores FPSCR.
+    fn encode_arm_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
         let mut bytes = Vec::new();
         let dm_num = vfp_dreg_to_num(dm);
         let dd_num = vfp_dreg_to_num(dd);
         let (vm, m) = encode_dreg(dm_num);
         let (vd, d) = encode_dreg(dd_num);
 
-        // VCVT.S32.F64 S0, Dm (truncate toward zero)
-        // S0: Vd=0, D=0
-        let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
-        bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
+        if mode == 0b11 {
+            // Trunc (toward zero): VCVTR.S32.F64 — bit[7]=1, always truncates
+            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
+            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
+        } else {
+            // ceil/floor/nearest: manipulate FPSCR rounding mode
+            let rt: u32 = 12;
+
+            // VMRS R12, FPSCR
+            let vmrs = 0xEEF10A10 | (rt << 12);
+            bytes.extend_from_slice(&vmrs.to_le_bytes());
+
+            // BIC R12, R12, #(3 << 22)
+            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
+            bytes.extend_from_slice(&bic.to_le_bytes());
+
+            // ORR R12, R12, #(mode << 22)
+            if mode != 0 {
+                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
+                bytes.extend_from_slice(&orr.to_le_bytes());
+            }
+
+            // VMSR FPSCR, R12
+            let vmsr = 0xEEE10A10 | (rt << 12);
+            bytes.extend_from_slice(&vmsr.to_le_bytes());
+
+            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0), uses FPSCR rmode
+            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
+            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
+
+            // Restore FPSCR
+            bytes.extend_from_slice(&vmrs.to_le_bytes());
+            bytes.extend_from_slice(&bic.to_le_bytes());
+            bytes.extend_from_slice(&vmsr.to_le_bytes());
+        }
 
         // VCVT.F64.S32 Dd, S0 (convert back to double)
-        // 0xEEB80B40 | D << 22 | Vd << 12 | M << 5 | Vm
-        // S0: Vm=0, M=0
         let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
         bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
 
@@ -3985,8 +4061,9 @@ impl ArmEncoder {
             }
 
             // f32 pseudo-ops — multi-instruction sequences
-            ArmOp::F32Ceil { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b10),
-            ArmOp::F32Floor { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b01),
+            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
+            ArmOp::F32Ceil { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b01),
+            ArmOp::F32Floor { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b10),
             ArmOp::F32Trunc { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b11),
             ArmOp::F32Nearest { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b00),
             ArmOp::F32Min { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, true),
@@ -4051,8 +4128,9 @@ impl ArmEncoder {
             }
 
             // f64 pseudo-ops
-            ArmOp::F64Ceil { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b10),
-            ArmOp::F64Floor { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b01),
+            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
+            ArmOp::F64Ceil { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b01),
+            ArmOp::F64Floor { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b10),
             ArmOp::F64Trunc { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b11),
             ArmOp::F64Nearest { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b00),
             ArmOp::F64Min { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, true),
@@ -4246,18 +4324,64 @@ impl ArmEncoder {
     }
 
     /// Encode F32 rounding pseudo-op as Thumb-2 via VCVT to integer and back
-    fn encode_thumb_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, _mode: u8) -> Result<Vec<u8>> {
+    /// Encode F32 rounding as Thumb-2.
+    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
+    ///
+    /// For trunc: uses VCVTR.S32.F32 (always truncates).
+    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant),
+    /// then restores FPSCR.
+    fn encode_thumb_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
         let mut bytes = Vec::new();
         let sm_num = vfp_sreg_to_num(sm);
         let sd_num = vfp_sreg_to_num(sd);
         let (vd_s, d_s) = encode_sreg(sd_num);
         let (vm_s, m_s) = encode_sreg(sm_num);
 
-        // VCVT.S32.F32 Sd, Sm (truncate toward zero)
-        let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
-        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
+        if mode == 0b11 {
+            // Trunc (toward zero): VCVTR.S32.F32 — bit[7]=1, always truncates
+            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
+        } else {
+            // ceil/floor/nearest: manipulate FPSCR rounding mode
+            let rt: u32 = 12; // R12/IP as temp
+
+            // VMRS R12, FPSCR
+            let vmrs = 0xEEF10A10 | (rt << 12);
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
+
+            // BIC.W R12, R12, #(3 << 22) — clear RMode bits [23:22]
+            // Thumb-2 modified immediate for 3<<22 = 0x00C00000:
+            // BIC.W encoding: 11110 i 0 0001 S Rn | 0 imm3 Rd imm8
+            // 0x00C00000 = 0x03 shifted left by 22 => Thumb mod-imm: i=0, imm3=0b101, imm8=0x03
+            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF); // BIC, Rn=R12
+            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
+            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
+            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
+
+            // ORR.W R12, R12, #(mode << 22)
+            if mode != 0 {
+                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF); // ORR, Rn=R12
+                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
+                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
+                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
+            }
+
+            // VMSR FPSCR, R12
+            let vmsr = 0xEEE10A10 | (rt << 12);
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
+
+            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rmode
+            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
+
+            // Restore FPSCR: clear rmode bits back to nearest (default)
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
+            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
+            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
+        }
 
-        // VCVT.F32.S32 Sd, Sd (convert back to float)
+        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
         let (vd2, d2) = encode_sreg(sd_num);
         let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
         bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
@@ -4487,16 +4611,54 @@ impl ArmEncoder {
     }
 
     /// Encode F64 rounding pseudo-op as Thumb-2 via VCVT to integer and back
-    fn encode_thumb_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, _mode: u8) -> Result<Vec<u8>> {
+    /// Encode F64 rounding as Thumb-2.
+    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
+    fn encode_thumb_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
         let mut bytes = Vec::new();
         let dm_num = vfp_dreg_to_num(dm);
         let dd_num = vfp_dreg_to_num(dd);
         let (vm, m) = encode_dreg(dm_num);
         let (vd, d) = encode_dreg(dd_num);
 
-        // VCVT.S32.F64 S0, Dm
-        let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
-        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
+        if mode == 0b11 {
+            // Trunc: VCVTR.S32.F64 — bit[7]=1, always truncates
+            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
+        } else {
+            let rt: u32 = 12;
+
+            // VMRS R12, FPSCR
+            let vmrs = 0xEEF10A10 | (rt << 12);
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
+
+            // BIC.W R12, R12, #(3 << 22)
+            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF);
+            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
+            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
+            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
+
+            // ORR.W R12, R12, #(mode << 22)
+            if mode != 0 {
+                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF);
+                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
+                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
+                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
+            }
+
+            // VMSR FPSCR, R12
+            let vmsr = 0xEEE10A10 | (rt << 12);
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
+
+            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0)
+            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
+
+            // Restore FPSCR
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
+            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
+            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
+            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
+        }
 
         // VCVT.F64.S32 Dd, S0
         let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
@@ -5557,7 +5719,8 @@ mod tests {
             sm: VfpReg::S2,
         };
         let code = encoder.encode(&op).unwrap();
-        assert_eq!(code.len(), 8); // VCVT.S32.F32 + VCVT.F32.S32
+        // VMRS + BIC + ORR + VMSR + VCVT.S32.F32 + VMRS + BIC + VMSR + VCVT.F32.S32
+        assert_eq!(code.len(), 36);
     }
 
     #[test]
@@ -5568,7 +5731,8 @@ mod tests {
             sm: VfpReg::S2,
         };
         let code = encoder.encode(&op).unwrap();
-        assert_eq!(code.len(), 8); // Two VFP instructions as Thumb-2
+        // VMRS + BIC.W + ORR.W + VMSR + VCVT + VMRS + BIC.W + VMSR + VCVT.F32.S32
+        assert_eq!(code.len(), 36);
     }
 
     #[test]

diff --git a/crates/synth-backend/tests/f32_vfp_encoding_test.rs b/crates/synth-backend/tests/f32_vfp_encoding_test.rs
@@ -425,7 +425,8 @@ fn test_f32_ceil_encodes_successfully() {
         sm: VfpReg::S0,
     });
     assert!(result.is_ok(), "F32Ceil should encode successfully");
-    assert_eq!(result.unwrap().len(), 8); // VCVT.S32.F32 + VCVT.F32.S32
+    // VMRS + BIC.W + ORR.W + VMSR + VCVT + VMRS + BIC.W + VMSR + VCVT.F32.S32
+    assert_eq!(result.unwrap().len(), 36);
 }
 
 #[test]