diff --git a/README.md b/README.md index f718271ea..df0afd5c6 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,8 @@ App|Description App|Description ---|--- -[hello_encrypted](encrypted/hello_encrypted) | Create a self-decrypting binary. +[hello_encrypted](encrypted/hello_encrypted) | Create a self-decrypting binary, using the hardened decryption stage. This should be secure against side channel attacks. +[hello_encrypted_mbedtls](encrypted/hello_encrypted) | Create a self-decrypting binary, using the MbedTLS decryption stage. This is not secure against side channel attacks, so is fast but provides limited protection. ### HSTX (RP235x Only) diff --git a/bootloaders/encrypted/CMakeLists.txt b/bootloaders/encrypted/CMakeLists.txt index 2d6d77f0d..fb7eb2d48 100644 --- a/bootloaders/encrypted/CMakeLists.txt +++ b/bootloaders/encrypted/CMakeLists.txt @@ -1,15 +1,19 @@ # Encrypted Bootloader add_executable(enc_bootloader enc_bootloader.c - aes.S + mbedtls_aes.c ) # pull in common dependencies -target_link_libraries(enc_bootloader pico_stdlib pico_rand) +target_link_libraries(enc_bootloader pico_stdlib pico_rand pico_mbedtls) # use stack guards, as AES variables are written near the stack target_compile_definitions(enc_bootloader PRIVATE PICO_USE_STACK_GUARDS=1) +target_link_options(enc_bootloader PUBLIC -Wl,--print-memory-usage) + +target_include_directories(enc_bootloader PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + # set as no_flash binary pico_set_binary_type(enc_bootloader no_flash) @@ -35,8 +39,8 @@ function(add_linker_script target origin length) pico_set_linker_script(${target} ${CMAKE_CURRENT_BINARY_DIR}/${target}.ld) endfunction() -# create linker script to run from 0x20078000 -add_linker_script(enc_bootloader "0x20078000" "32k") +# create linker script to run from 0x20070000 +add_linker_script(enc_bootloader "0x20070000" "64k") # sign, hash, and clear SRAM pico_sign_binary(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/private.pem) @@ -50,6 +54,9 @@ 
pico_embed_pt_in_binary(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/enc-pt.json) pico_set_uf2_family(enc_bootloader "absolute") pico_package_uf2_output(enc_bootloader 0x10000000) +# optionally enable USB output in addition to UART +# pico_enable_stdio_usb(enc_bootloader 1) + # create map/bin/hex/uf2 file etc. pico_add_extra_outputs(enc_bootloader) @@ -83,6 +90,9 @@ pico_encrypt_binary(hello_serial_enc ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin ${ # package uf2 in flash pico_package_uf2_output(hello_serial_enc 0x10000000) +# optionally enable USB output in addition to UART +# pico_enable_stdio_usb(hello_serial_enc 1) + # create map/bin/hex/uf2 file etc. pico_add_extra_outputs(hello_serial_enc) diff --git a/bootloaders/encrypted/README.md b/bootloaders/encrypted/README.md index 0e10e5e3d..790b9605b 100644 --- a/bootloaders/encrypted/README.md +++ b/bootloaders/encrypted/README.md @@ -1,5 +1,7 @@ For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Make sure you **don't lose your keys and salts**, else you may not be able to update the code on your device. +This bootloader uses MbedTLS for decryption, so it is not secure against side channel attacks and therefore only offers limited protection against physical attackers. + Your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with: ```bash diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S deleted file mode 100644 index 093c4b0f1..000000000 --- a/bootloaders/encrypted/aes.S +++ /dev/null @@ -1,1944 +0,0 @@ -/* MEMORY LAYOUT ASSUMPTIONS - -The "chaff" area must be located at the start of Y scratch RAM, 0x20081000: see -the macro getchaffaddress. - -The stack must be located at the end of Y scratch RAM: see the memory -wiping at the end of ctr_crypt_s where memory between the start of Y -scratch RAM and the stack pointer is overwritten. 
-*/ - -.syntax unified -.cpu cortex-m33 -.thumb - -#include "config.h" -#include "hardware/platform_defs.h" -#include "hardware/regs/addressmap.h" -#include "hardware/regs/clocks.h" -#include "hardware/regs/sha256.h" -#include "hardware/regs/resets.h" -#include "hardware/regs/rosc.h" -#include "hardware/regs/trng.h" -#include "hardware/rcp.h" - -.global decrypt -.global chaff - -.extern lock_key - -@ RCP macros - -#define CTAG0 0x2a -#define CTAG1 0x2b -#define CTAG2 0x2c -#define CTAG3 0x2d -#define CTAG4 0x2e -#define CTAG5 0x30 -#define CTAG6 0x31 -#define CTAG7 0x32 -#define CTAG8 0x33 -#define CTAG9 0x34 -#define CTAG10 0x35 @ not used -#define CTAG11 0x36 @ not used -#define CTAG12 0x37 -#define CTAG13 0x38 -#define CTAG14 0x39 -#define CTAG15 0x3a -#define CTAG16 0x3b -#define CTAG17 0x3c -#define CTAG18 0x3d @ not used - -@ number of blocks from the TRNG processed to initialise rstate_sha -#define TRNG_BLOCKS 25 - -@ The lower jitterpriorty is, the more the jitter -.macro SET_COUNT n,jitterpriority -.if RC_COUNT -.if RC_JITTER > \jitterpriority - rcp_count_set \n -.else - rcp_count_set_nodelay \n -.endif -.endif -.endm - -.macro CHK_COUNT n,jitterpriority -.if RC_COUNT -.if RC_JITTER > \jitterpriority - rcp_count_check \n -.else - rcp_count_check_nodelay \n -.endif -.endif -.endm - -.macro GET_CANARY rx,tag,jitterpriority -.if RC_CANARY -.if RC_JITTER > \jitterpriority - rcp_canary_get \rx,\tag -.else - rcp_canary_get_nodelay \rx,\tag -.endif -.endif -.endm - -.macro CHK_CANARY rx,tag,jitterpriority -.if RC_CANARY -.if RC_JITTER > \jitterpriority - rcp_canary_check \rx,\tag -.else - rcp_canary_check_nodelay \rx,\tag -.endif -.endif -.endm - -@ Clear internal stripe load registers, and r0-r3 -@ 0 <= offset <= 32 -.macro clear03 offset=0 - getchaffaddress r0,\offset - ldmia r0,{r0-r3} -.endm - -.macro clear03_preserve_r3 offset=0 - getchaffaddress r0,\offset - ldmia r0!,{r1-r2} - ldmia r0!,{r1-r2} -.endm - -.macro clear01 offset=0 - getchaffaddress r0,\offset 
- ldmia r0,{r0,r1} -.endm - -@ Put workspace in the second scratch area -@ The "a"=allocatable attribute (and possibly the %progbits attribute) are necessary to store the murmur3 constants, -@ otherwise they may end up silently replaced with 0 or 0xffffffff -.section .scratch_y.aes,"aw",%progbits - -workspace_start: - -@ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress -@ (It seems ADR does not work, nor is it possible to assert that chaff==0x20081000) -@ getchaffaddress is used by clear03 and clear01 and other sensitive cases which require the first load to be a random one -@ chaff has to be 0 mod 16 for other reasons -.macro getchaffaddress rx,offset=0 -@ ldr \rx,=(chaff+\offset) - mov \rx,#(0x1000+\offset) - movt \rx,#0x2008 -.endm -chaff: -.space 48 - -.balign 16 -rkey_s: @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words - @ see comment at init_key_4way for description of layout and meaning of rkey_s -.space 600 -rkey4way: @ scratch area for init_key_4way; could overlap this with other scratch space if need to save space -.space 128 -.if CT_BPERM -bperm_rand: @ 32 half words that define the oblivious permutation of blocks -.space 64 -.endif - -.balign 16 -permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s) -perm16: -.space 16 -@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s -.balign 16 -fourway: @ Must be 0 mod 16 -shareA: @ 0 mod 16 -.space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16 -shareB: @ 4 mod 16 -.space 20 -shareC: @ 8 mod 16 -.space 4 -statevperm: @ 12 mod 16 -.space 4 @ vperm state rotation: only last two bits are operational; other bits random -RKshareC: @ Round key common share C; see comment at init_key_4way for explanation -.space 4 -RKshareCchange: @ Temporary used by ref_roundkey_share_s -.space 4 -IV0: @ 2-way share of IV for block 0 -.space 36 @ Considering IV0 as a word pointer, the 
format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16) - @ The gap at IV0[4] is to defeat unsharing by internal striped memory registers - @ I.e., there are implicit XORs IV0[0]^IV0[4], IV0[1]^IV0[5], ..., that the 1 word offset renders useless - -@ Regardless of configuration, the code uses a single 256-entry LUT, -@ which is a simple S-box table. -@ The LUT is represented as two shares, lut_a and lut_b, -@ whose values must be EORed. Furthermore, the contents of each share are -@ scambled according to a 4-byte "map". The map comprises two bytes that -@ are EORed into the addressing of the share, and two bytes that are -@ EORed into the data read back from the share. Performing a lookup -@ of a value x involves computing -@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ c₁ ^ lut_b[x ^ b₀ ^ b₁] ^ d₀ ^ d₁ -@ where a₀, a₁, c₀ and c₁ are the "map" of the lut_a share and -@ b₀, b₁, d₀ and d₁ are the "map" of the lut_b share. -@ In practice the result of a lookup is itself represented in two -@ shares, namely -@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ d₀ and -@ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁ -.balign 16 -lut_a: @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup) -.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 -.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 -.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 -.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 -.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 -.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf -.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 -.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 -.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 -.byte 
0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb -.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 -.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 -.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a -.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e -.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf -.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 -lut_a_map: @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b -.space 4 -.space 4 @ align to 8 mod 16 -lut_b: @ LUT share B (must be 8 mod 16 so that init_key_sbox knows how to mask the lookup) -.space 256 -lut_b_map: -.space 4 -.space 4 @ align to multiple of 8 - -.balign 16 -rstate_all_start: @ Mark start of RNG data to allow selective memory wipe -rstate_sha: @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero -.space 16 -jstate: @ 32-bit jitter state -.space 4 -rstate_lfsr: @ 32-bit LFSR random state and constant used to step it -.space 4 -.word 0x1d872b41 @ constant that defines a maximal-length LFSR -rstate_all_end: @ Mark end of RNG data to allow selective memory wipe - -.if CT_BPERM -.balign 16 -murmur3_constants: @ Five constants used in murmur3_32 hash -.word 0xcc9e2d51 -.word 0x1b873593 -.word 0xe6546b64 -.word 0x85ebca6b -.word 0xc2b2ae35 -.endif - -scratch_y_end: - -@ Initialisation code in main .text section -.section .text,"ax",%progbits - -@ The following is copied from the A2 boot ROM code at src/main/arm/varm_boot_path.c with adjustments. -@ We feed a stream of bits from the TRNG into the SHA hardware accelerator to generate some -@ random numbers. 
-@ Trashes r0-r6 -.balign 4 -init_rstate: - CHK_COUNT 24,6 - ldr r4,=TRNG_BASE+TRNG_RNG_IMR_OFFSET - ldr r5,=SHA256_BASE - movs r1,#1 - str r1,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] - ldr r6,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] @ reads as 0 - movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS @ initialise SHA internal state by writing START bit - str r1,[r5,#SHA256_CSR_OFFSET] - str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET -TRNG_RNG_IMR_OFFSET] - movs r6,#TRNG_BLOCKS*2+1 @ odd so that we break out of the loop half-way through loading the SHA hardware, giving - @ time for previous SHA computation to complete -2: - movs r1,#0xff @ TRNG setup is inside loop in case it is skipped. - str r1,[r4,#TRNG_TRNG_DEBUG_CONTROL_OFFSET-TRNG_RNG_IMR_OFFSET] @ disable checks and bypass decorrelators,to stream raw TRNG ROSC samples - str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] @ start ROSC if it is not already started - str r1,[r4,#TRNG_RNG_ICR_OFFSET -TRNG_RNG_IMR_OFFSET] @ clear all interrupts (including EHR_VLD) - adds r0,r4,#TRNG_EHR_DATA0_OFFSET -TRNG_RNG_IMR_OFFSET - movs r2,#TRNG_TRNG_BUSY_OFFSET -TRNG_RNG_IMR_OFFSET -1: - ldr r1,[r4,r2] @ wait for 192 ROSC samples to fill EHR,should take constant time - cmp r1,#0 - bne 1b - subs r6,#1 @ done? 
- beq 3f - movs r1,#8 -1: - ldmia r0!,{r2} @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1) - str r2,[r5,#SHA256_WDATA_OFFSET] @ for a total of half a SHA-256 block - subs r1,#1 - bne 1b - ldr r2,[r5,#SHA256_SUM0_OFFSET] @ TRNG is now sampling again; use some SHA bits to modulate the chain length - str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] - b.n 2b - -3: - CHK_COUNT 25,6 - str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] @ turn off rand source and wipe SHA bits left in TRNG config; r1=0 - str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] - adds r5,r5,#SHA256_SUM0_OFFSET -@ r5=SHA256 SUM0 register (r5+4=SUM1, r4+8=SUM2, etc) - ldmia r5,{r0-r3} @ load first 4 words of the 8 word SHA256 output - ldr r6,=rstate_sha -@ r5=SHA256 SUM0 register (r5+4=SUM1, r4+8=SUM2, etc), r6=rstate_sha - stmia r6,{r0-r3} - CHK_COUNT 26,6 - movs r0,#0 - strb r0,[r6] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data" - -@ try to find a non-zero initialiser to create a non-degenerate LFSR random state - ldr r1,[r5,#16] @ SHA SUM4 - cbnz r1,1f @ is word 4 non-zero? then use it - ldr r1,[r5,#20] @ SHA SUM5 - cbnz r1,1f @ otherwise, is word 5 non-zero? use it - mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) -1: - str r1,[r6,#rstate_lfsr-rstate_sha] - -@ try to find a non-zero initialiser to create a non-degenerate ROSC random state - ldr r1,[r5,#24] @ SHA SUM6 - cbnz r1,1f @ is word 6 non-zero? then use it - ldr r1,[r5,#28] @ SHA SUM7 - cbnz r1,1f @ otherwise, is word 7 non-zero? 
use it - mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) -1: - ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE - str r1,[r2,#0] @ Initialise ROSC LFSR - CHK_COUNT 27,6 - -.if GEN_RAND_SHA -.if SH_JITTER - movs r2,#0 - str r2,[r6,#jstate-rstate_sha] -.endif -.endif - - CHK_COUNT 28,6 - bx r14 - -@ Put AES core code in first scratch area -.section .scratch_x.aes,"ax",%progbits - -.if GEN_RAND_SHA -@ we need SHA256_SUM0_OFFSET==8 (see note below) -.if SHA256_SUM0_OFFSET!=8 -.err -.endif - -@ Return single random word in r0 -@ Preserves r1-r13 -.balign 4 -gen_rand_sha: - push {r14} - GET_CANARY r14,CTAG1,2 - push {r1-r3,r14} -.if SH_JITTER - ldr r2,=rstate_sha - ldr r0,[r2,#jstate-rstate_sha] - movs r1,#1 - ands r3,r0,#3 - movs r3,r3,lsl#2 - movs r3,r1,lsl r3 @ 1<<(4*(r0&3)) - udiv r3,r3,r1 @ Takes constant + (r0&3) cycles - lsrs r0,r0,#2 - bne 1f - bl gen_rand_sha_nonpres - ldr r2,=rstate_sha -1: - str r0,[r2,#jstate-rstate_sha] -.endif - bl gen_rand_sha_nonpres - pop {r1-r3,r14} - CHK_CANARY r14,CTAG1,0 - pop {r15} - -@ Return single random word in r0 -@ Trashes r1-r3 -.balign 4 -gen_rand_sha_nonpres: - ldr r0,=SHA256_BASE - ldr r2,=rstate_sha - ldrb r1,[r2] @ get word counter from bottom byte of rstate_sha[] (offset into SUM registers) - subs r3,r1,#4 @ decrement it to previous SUM register - ble 1f @ if the offset was 4 or less we have run out of SUM register values - ldr r0,[r0,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8 - strb r3,[r2] @ save updated SUM register offset in bottom byte of rstate_sha[] - bx r14 -1: -@ [CK_JITTER code was here] - movs r3,#SHA256_SUM6_OFFSET+1 - strb r3,[r2] @ reset word counter: the +1 is compensated for later - movw r1,#(1<>30, vpermB=Bptr[4]>>30, and -@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) -@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 
-.balign 4 -.thumb_func -ref_roundkey_shares_s: - mov r11,#15 @ there are 15 expanded keys -ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds - ldr r4,=rkey_s - loadlfsr - steplfsr @ r0=change in RKshareC - ldr r2,=RKshareCchange - str r0,[r2] - ldr r3,=RKshareC - ldr r5,[r3] - eors r5,r5,r0 - str r5,[r3] - @ r0=lfsr_state, r1=lfsr_const, r4=roundkey_ptr, r11=roundcounter - -ref_roundkey_shares_s_loop: - ldmia r4!,{r5-r8,r10} @ r5-r8 = rkey shareA, r10=X_A=vperm+rotations of rkey shareA - - ldr r12,[r4,#16] @ r12 = X_B=vperm+rotations of rkey shareB - mov r2,r12,lsr#30 @ r2 = vpermB - sub r9,r2,r10,lsr#30 @ r9 = vpermB - vpermA (|junk) - mov r2,r9,lsl#3 @ r2 = 8*(vpermB - vpermA) mod 32 - mov r12,r12,ror r2 - usub8 r12,r10,r12 @ r12 = rotsA - (rotsB ror r2) - - @ r2,r3,r10=workspace, r0=lfsr_state, r1=lfsr_const, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=rotsA, r11=roundcounter, r12=rotdiff - steplfsr; eors r5,r5,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r6,r6,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2] - - ldr r3,=RKshareCchange - ldr r3,[r3] - movs r2,#0 - usub8 r10,r2,r10 - ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2 - ror r2,r3,r10; mov r10,r10,ror#8; eors r6,r6,r2 - ror r2,r3,r10; mov r10,r10,ror#8; eors r7,r7,r2 - ror r2,r3,r10; eors r8,r8,r2 - - subs r4,r4,#20 - stmia r4,{r5-r8} - adds r4,r4,#40 - subs r11,r11,#1 - - bne ref_roundkey_shares_s_loop - ldr r2,=rstate_lfsr @ restore rstate_lfsr - savelfsr @ Save lfsr_state - clear03 24 
-ref_roundkey_shares_s_exit: - bx r14 - -.balign 4 -.thumb_func -@ Rotates roundkey vperms and RK_ROR rotations by random amounts -@ Trashes r0-r10 -@ If i = word number 0..3, -@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then -@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and -@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) -@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 -ref_roundkey_hvperms_s: - movs r7,#30 -ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r10,CTAG9,6 - push {r10,r14} - ldr r10,=rkey_s -ref_roundkey_hvperms_s_loop: - bl gen_rand_lfsr_nonpres @ r0=new vperm high|rotations - ldmia r10,{r2-r5,r9} @ r2-r5=roundkey share A/B, r9=old vperm high|rotations - str r0,[r10,#16] - mov r8,r0,lsr#30 @ r8=new vperm low - sub r6,r8,r9,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk - mov r8,r6,lsl#3 @ r8=8*((new vperm low)-(old vperm low)) mod 32 - mov r0,r0,ror r8 - usub8 r0,r9,r0 @ i^th byte of r0 = (i^th byte of old rotations) - ((i+newvperm-oldvperm)^th byte of new rotations) - movs r2,r2,ror r0; ands r6,r6,#3; str r2,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 - movs r3,r3,ror r0; ands r6,r6,#3; str r3,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 - movs r4,r4,ror r0; ands r6,r6,#3; str r4,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 - movs r5,r5,ror r0; ands r6,r6,#3; str r5,[r10,r6,lsl#2] - adds r10,r10,#20 - subs r7,r7,#1 - bne ref_roundkey_hvperms_s_loop - clear03 28 -ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code - pop {r10,r14} - CHK_CANARY r10,CTAG9,6 - bx r14 - -.else - -@ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC -@ Trashes r0-r11 -.balign 4 -.thumb_func -ref_roundkey_shares_s: - mov r11,#15 @ there are 15 expanded keys 
-ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds - GET_CANARY r4,CTAG8,6 - push {r4,r14} - ldr r4,=rkey_s - loadlfsr - steplfsr @ r0=change in RKshareC - ldr r3,=RKshareC - ldr r5,[r3] - eors r5,r5,r0 - str r5,[r3] - mov r10,r0 -ref_roundkey_shares_s_loop: - ldmia r4!,{r5-r9} @ r5-r8 = rkey shareA with vperm r9 - - @ clear03: would need to do this with, say r2,r3,r12 (reloading r2 later) - - ldr r3,[r4,#16] @ rkey shareB has a vperm of r10>>30 - movs r3,r3,lsr#30 - sub r9,r3,r9,lsr#30 @ r9 = vperm_B - vperm_A (|junk) - @ r3,r12=workspace, r0=lfsr_state, r1=lfsr_const, r2=rstate_lfsr, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=RKshareCchange, r11=roundcounter - - steplfsr; eors r5,r5,r0; and r9,r9,#3; eors r5,r5,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r6,r6,r0; and r9,r9,#3; eors r6,r6,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r7,r7,r0; and r9,r9,#3; eors r7,r7,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r8,r8,r0; and r9,r9,#3; eors r8,r8,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2] - - subs r4,r4,#20 - stmia r4,{r5-r8} - adds r4,r4,#40 - subs r11,r11,#1 - - @ clear03: would need to do this with, say r3,r5-r8 - - bne ref_roundkey_shares_s_loop - savelfsr - clear03 24 -ref_roundkey_shares_s_exit: - pop {r4,r14} - CHK_CANARY r4,CTAG8,6 - bx r14 - -.balign 4 -.thumb_func -@ Rotates roundkey vperms by random amounts -@ Trashes r0-r9 -ref_roundkey_hvperms_s: - movs r7,#30 -ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r0,CTAG9,6 - push {r0,r14} - bl gen_rand_lfsr_nonpres - ldr r1,=rkey_s -ref_roundkey_hvperms_s_loop: - cmp r7,#15 - bne 2f -@ Get a new random r0 after using 15 x 2 bits of the original one -@ Note that the junk bits (2-31) in the vperms are not adjusted 
independently, but that's no big loss, -@ and the gain is only calling gen_rand_lfsr twice instead of 30 times. - push {r1}; bl gen_rand_lfsr_nonpres; pop {r1} - 2: - ldmia r1,{r2-r5,r9} @ roundkey share A/B=r2-r5, vperm=r9 (including junk bits) - mov r8,r9,lsr#30 @ r8=old vperm (low) - add r6,r9,r0 @ r6=new vperm (high) | new junk - str r6,[r1,#16] - rsb r6,r8,r6,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk bits - ands r6,r6,#3; str r2,[r1,r6,lsl#2]; adds r6,r6,#1 - ands r6,r6,#3; str r3,[r1,r6,lsl#2]; adds r6,r6,#1 - ands r6,r6,#3; str r4,[r1,r6,lsl#2]; adds r6,r6,#1 - ands r6,r6,#3; str r5,[r1,r6,lsl#2] - adds r1,r1,#20 - movs r0,r0,ror#2 - subs r7,r7,#1 - bne ref_roundkey_hvperms_s_loop - clear03 28 -ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code - pop {r0,r14} - CHK_CANARY r0,CTAG9,6 - bx r14 - -.endif - -.ltorg - -.if ST_VPERM -.balign 4 -.thumb_func -@ Cycle share registers r4-r7, r8-r11 (r4->r5-r6->r7->r4 etc.) by an addtional amount -@ given in the bottom two bits of R0 and update the rotation recorded at statevperm. -@ On entry R1 must point to statevperm. -@ Trashes r0-r3,r12 -@ Maintains r4=rorig(4+(-!r1)%4), r5=rorig(4+(1-!r1)%4), ... -@ r8=rorig(8+(-!r1)%4), r9=rorig(8+(1-!r1)%4), ... -@ Note: only low 2 bits of !r1 are used. The rest are random to add to the noise. 
-addstatevperm: - ldr r2,[r1] - adds r2,r2,r0 - str r2,[r1] - - ldr r1,=shareA - ands r0,r0,#3; str r4,[r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r5,[r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r6,[r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r7,[r1,r0,lsl#2]; adds r0,r0,#1 - ldmia r1,{r4-r7} - - getchaffaddress r12 @ Overwrite temporary storage with random numbers - ldmia r12!,{r2,r3} - stmia r1!,{r2,r3} - ldmia r12!,{r2,r3} - stmia r1!,{r2,r3} - - ldr r1,=shareB - ands r0,r0,#3; str r8, [r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r9, [r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r10,[r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r11,[r1,r0,lsl#2]; adds r0,r0,#1 - ldmia r1,{r8-r11} - - getchaffaddress r0,16 @ Overwrite temporary storage with random numbers - ldmia r0!,{r2,r3} - stmia r1!,{r2,r3} - ldmia r0!,{r2,r3} - stmia r1!,{r2,r3} - -addstatevperm_exit: @ label exit point to be to able to specify to analysis code - bx r14 -.endif - -@ Conjugate lut_a, lut_b with (state) shareC -@ I.e., EOR the input and output with shareC. -@ We need to pick one input for each share A and B, and one output for ONE of the shares A and B -@ Arbitrarily choosing a0, b1 and d0 -.balign 4 -conjshareC: -.if ST_SHAREC - ldr r1,=shareC - ldr r0,[r1] @ Get shareC as a word (all bytes the same) - ldr r1,=lut_a @ Need to EOR share C into inputs of both lut_a and lut_b, and one of their outputs... - ldr r2,[r1,#0x100] - eors r2,r2,r0,lsr#24 - str r2,[r1,#0x100] - movs r0,r0,lsr#16 - ldr r1,=lut_b @ ... (continued) Here we're EORing share C into a0, b1 and d0. - ldr r2,[r1,#0x100] - eors r2,r2,r0,lsl#8 - str r2,[r1,#0x100] -.endif - bx r14 - -.balign 4 -.thumb_func -shift_rows_s: -@ First "rotate" the two most-significant bytes of the state by two registers -@ Trashes r0-r3 -@ Slightly faster (but not shorter?) 
with ubfx/bfi - eors r0,r4,r6 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta; - lsrs r0,r0,#16 - lsls r0,r0,#16 - eors r4,r4,r0 - eors r6,r6,r0 - eors r0,r5,r7 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta; - lsrs r0,r0,#16 - lsls r0,r0,#16 - eors r5,r5,r0 - eors r7,r7,r0 -@ next "rotate" the two odd-significance bytes of the state by one register - eors r1,r7,r4 @ tb=state[3]^state[0]; tb&=0xff00ff00; - ands r1,r1,#0xff00ff00 - eors r0,r4,r5 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta; - ands r0,r0,#0xff00ff00 - eors r4,r4,r0 - eors r0,r5,r6 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta; - ands r0,r0,#0xff00ff00 - eors r5,r5,r0 - eors r0,r6,r7 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; - ands r0,r0,#0xff00ff00 - eors r6,r6,r0 - eors r7,r7,r1 @ state[3]^=tb; -@ repeat for other share, conjugated by ror#16 - clear01 @ barrier - eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0x0000ffff; state[0]^=ta; state[2]^=ta; - lsls r0,r0,#16 - lsrs r0,r0,#16 - eors r8,r8,r0 - eors r10,r10,r0 - eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0x0000ffff; state[1]^=ta; state[3]^=ta; - lsls r0,r0,#16 - lsrs r0,r0,#16 - eors r9,r9,r0 - eors r11,r11,r0 - eors r1,r11,r8 @ tb=state[3]^state[0]; tb&=0xff00ff00; - ands r1,r1,#0xff00ff00 - eors r0,r8,r9 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta; - ands r0,r0,#0xff00ff00 - eors r8,r8,r0 - eors r0,r9,r10 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta; - ands r0,r0,#0xff00ff00 - eors r9,r9,r0 - eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; - ands r0,r0,#0xff00ff00 - eors r10,r10,r0 - - eors r11,r11,r1 @ state[3]^=tb; - - clear01 @ barrier - bx r14 - -@ multiply polynomial over GF(2⁸) by c(x) = 0x03x³ + 0x01x² + 0x01x + 0x02 modulo x⁴+1 -@ r0x00 is a register holding 0x00000000; r0x1b is a register holding 0x1b1b1b1b -.macro mixcol rx,rt,ru,r0x00,r0x1b - @ let rx=(a,b,c,d) - uadd8 \rt,\rx,\rx @ MSB of each byte into the GE flags - sel 
\ru,\r0x1b,\r0x00 @ get bytewise correction for bytewise field multiplication by 2 - eors \rt,\rt,\ru @ (2a,2b,2c,2d) - - eors \ru,\rt,\rx @ (3a,3b,3c,3d) - eors \rt,\rt,\rx,ror#24 @ (2a+b,2b+c,2c+d,2d+a) - eors \rt,\rt,\rx,ror#16 @ (2a+b+c,2b+c+d,2c+d+a,2d+a+b) - eors \rx,\rt,\ru,ror#8 @ (2a+b+c+3d,2b+c+d+3a,2c+d+a+3b,2d+a+b+3c) -.endm - -@ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1 -.macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b - uadd8 \rt,\rx,\rx @ field multiplication by 2 as above - sel \rw,\r0x1b,\r0x00 - eors \rt,\rt,\rw @ 2x - uadd8 \ru,\rt,\rt - sel \rw,\r0x1b,\r0x00 - eors \ru,\ru,\rw @ 4x - uadd8 \rv,\ru,\ru - sel \rw,\r0x1b,\r0x00 - eors \rv,\rv,\rw @ 8x - - eors \rx,\rx,\rv @ 9x - eors \rw,\rx,\rt @ 11x - eors \rw,\rw,\rx,ror#16 @ 11x ^ 9x ROL #16 - eors \rx,\rx,\ru @ 13x - eors \rw,\rw,\rx,ror#8 @ 11x ^ 9x ROL #16 ^ 13x ROL #24 - eors \rt,\rt,\ru @ 6x - eors \rt,\rt,\rv @ 14x - eors \rx,\rt,\rw,ror#8 @ 14x ^ 9x ROL #8 ^ 13x ROL #16 ^ 11x ROL #24 -.endm - -.balign 4 -.thumb_func -@ Trashes r0-r3,r12 -mix_cols_s: - mov r2,#0x00000000 - mov r3,#0x1b1b1b1b - mixcol r4 ,r0,r1,r2,r3 @ apply mixcol to each state word - mixcol r5 ,r0,r1,r2,r3 - mixcol r6 ,r0,r1,r2,r3 - mixcol r7 ,r0,r1,r2,r3 - ldr r12,=chaff - ldmia r12!,{r0,r1} @ overwrite sensitive shareA-related quantities r0,r1 with random numbers - mixcol r8 ,r0,r1,r2,r3 - mixcol r9 ,r0,r1,r2,r3 - mixcol r10,r0,r1,r2,r3 - mixcol r11,r0,r1,r2,r3 - ldmia r12!,{r0,r1} @ overwrite sensitive shareB-related quantities r0,r1 with random numbers - bx r14 - -@ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups) -.macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3 - ubfx \Rspare0,\Rtarg,#0, #8 - ubfx \Rspare1,\Rtarg,#8, #8 - ubfx \Rspare2,\Rtarg,#16, #8 - ubfx \Rspare3,\Rtarg,#24, #8 - - ldrb \Rspare0,[\Rtable,\Rspare0] - ldrb \Rspare1,[\Rtable,\Rspare1] - ldrb \Rspare2,[\Rtable,\Rspare2] - 
ldrb \Rspare3,[\Rtable,\Rspare3] - orr \Rspare0,\Rspare0,\Rspare1,lsl#8 - orr \Rspare2,\Rspare2,\Rspare3,lsl#8 - orr \Rtarg,\Rspare0,\Rspare2,lsl#16 -.endm - -@ map all bytes of the state through the split LUT, lut_a and lut_b -@ Trashes r0-r3,r12 -.balign 4 -.thumb_func -map_sbox_s: - GET_CANARY r12,CTAG12,3 - push {r12,r14} - - ldr r0,=shareA @ Write out state share A to memory -@ stmia r0,{r4-r7} @ Used to do a STM - getchaffaddress r1 - ldr r2,[r1] - str r4,[r0] @ Interperse with dummy writes to prevent implicit broadcasting of HW(ShareA_word0^ShareA_word1)+cyclic perms, - str r2,[r1] @ which arise due to internal write buffer. Such a quantity could (without such interspersing) be paired - str r5,[r0,#4] @ via 2nd order with its share B counterpart, resulting in broadcasting HW(word0^word1)+cyclic. - str r2,[r1] @ shareC doesn't guard against this, because word0^shareC^word1^shareC=word0^word1. - str r6,[r0,#8] @ Broadcasting of HW(ShareA_word0)+cyclic on the other hand is not prevented by interspersing, but - str r2,[r1] @ it isn't useful at 2nd order because shareC kills its relationship with HW(ShareB_word0)+cyclic. - str r7,[r0,#12] - str r2,[r1] - - ldr r0,=shareB @ Write out state share B to memory - stmia r0,{r8-r11} @ Not essential to intersperse share B too because i0B^i1B etc should have nothing in share A to couple with - - bl makeperm16 @ Rebuild random 16-way permutation. 
Maybe do this less frequently -@ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation - - bl gen_rand_sha_nonpres - mov r11,r0 - ldr r8,=lut_a - ldr r9,=lut_b - ldr r0,[r8,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) - eors r3,r0,r0,lsr#8 @ R3 = a0^a1 | junk - uxtb r10,r3 - ldr r1,[r9,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) - eors r1,r0,r1 - eors r2,r1,r1,lsr#8 - movs r12,r1,lsr#16 @ R12 = c0^d0 | (c1^d1)<<8 - bfi r12,r2,#16,#8 @ R12 = c0^d0 | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 - - ldr r4,=perm16 - ldr r5,=shareA - ldr r6,=shareB - movs r1,#0;movs r2,#0;movs r3,#0 -@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=Random, r12=(c0^d0) | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 - movs r0,#15 -1: @ (Ordering instructions to minimise result delays) - ldrb r1,[r4,r0] @ r1 = perm[r0] - mov r11,r11,ror#11 @ Rotate random 32 bits to present a new low 8 bits - eors r7,r1,#2 @ r7 = perm[r0]^2 - ldrb r2,[r5,r1] @ r2 = shareA[perm[r0]] - eor r11,r11,r2,ror#8 @ Transfer some of the share-randomness of the input to the output (the share-randomness would otherwise be lost/wasted) - ldrb r3,[r6,r7] @ r3 = shareB[perm[r0]^2] - eor r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1 - eors r2,r2,r3 @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2] - ldrb r3,[r8,r2] @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]] - eor r2,r2,r12,lsr#16 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2] - eor r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8) - eor r3,r3,r11 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand | (junk<<8) - strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand - ldrb r3,[r9,r2] @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]] - subs r0,r0,#1 - eor r3,r3,r11 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand - eor r3,r3,r12,lsr#8 @ r3 = 
lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 | (junk<<8) - strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 - bpl 1b - clear03 8 @ barrier - - ldmia r6,{r8-r11} @ Read state share B back from memory - clear03 12 @ barrier - getchaffaddress r0,16 - bfi r0,r5,#0,#4 @ match chaff pointer (r0) to share A location (R5) mod 16 - @ldmia r5,{r4-r7} @ Read state share A back from memory - @clear03 16 @ barrier - ldr r4,[r5] @ Intersperse barriers to prevent HW(o0^o1)+cyclic being broadcast; see similar note re stores at the start of map_sbox_s - ldr r1,[r0] - ldr r6,[r5,#8] - ldr r1,[r0,#8] - ldr r7,[r5,#12] - ldr r1,[r0,#12] - ldr r5,[r5,#4] @ Do r5 last because it's the address register - ldr r1,[r0,#4] - -@ Refresh state shares because luts only give imperfect share-by-value -@ Probably not necessary now that we use R11 and input-reuse random resharing during the sbox operation (though the R11 bytes are not fully independent) -@ loadlfsr -@ steplfsr; eors r4,r4,r0; mov r12,#0; eor r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc -@ steplfsr; eors r5,r5,r0; mov r12,#0; eor r9,r9,r0,ror#16 -@ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16 -@ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16 -@ savelfsr - - pop {r12,r14} - CHK_CANARY r12,CTAG12,5 - bx r14 - -.ltorg - -.balign 4 -.thumb_func -randomisechaff: -@ Randomise 48 bytes of chaff values (random load values) -@ Uses 12 bytes of permscratch -@ Trashes r0-3 - GET_CANARY r0,CTAG13,6 - push {r0,r14} - movs r0,#12 - ldr r1,=permscratch - bl makesmallperm @ Store the random words in a random order to make 2nd order attacks harder - movs r1,#11 -1: - push {r1} - bl gen_rand_sha_nonpres - pop {r1} - ldr r2,=permscratch - ldrb r2,[r2,r1] - getchaffaddress r3 - str r0,[r3,r2,lsl#2] - subs r1,r1,#1 - bpl 1b - pop {r0,r14} - CHK_CANARY r0,CTAG13,6 - bx r14 - -.balign 4 -refreshchaff_and_lfsr: -@ Update 48 
bytes of chaff values (random load values) using faster RNG than used for randomisechaff -@ Re-randomise LFSR with SHA -@ Uses 12 bytes of permscratch -@ Trashes r0-3,12 - GET_CANARY r0,CTAG14,6 - push {r0,r14} - -@ Refresh LFSR using SHA to make it harder to reverse-engineer LFSR sequence - bl gen_rand_sha_nonpres - ldr r1,=rstate_lfsr - ldr r2,[r1] - adds r2,r2,r0 - beq 1f @ Don't update LFSR state to 0 - str r2,[r1] -1: - -@ Choose a random order to update chaff words to make 2nd order attacks harder - movs r0,#12 - ldr r1,=permscratch - bl makesmallperm - - movs r1,#11 -1: - push {r1} - bl gen_rand_lfsr_nonpres - pop {r1} - ldr r2,=permscratch - ldr r3,=chaff - ldrb r2,[r2,r1] - ldr r12,[r3,r2,lsl#2] - add r0,r0,r12 - str r0,[r3,r2,lsl#2] - subs r1,r1,#1 - bpl 1b - pop {r0,r14} - CHK_CANARY r0,CTAG14,6 - bx r14 - -.balign 4 -.thumb_func -@ Do sbox on the four bytes of the 4-way share r4-r7 -@ Trashes r0,r8-r12 -init_key_sbox: - GET_CANARY r12,CTAG15,6 - push {r1-r3,r12,r14} - bl gen_rand_sha_nonpres; mov r8,r0 - bl gen_rand_sha_nonpres; mov r9,r0 - bl gen_rand_sha_nonpres; mov r10,r0 - bl gen_rand_sha_nonpres; mov r11,r0 - ldr r0,=fourway @ Write out 4-way share to memory - stmia r0,{r8-r11} @ Save random values first to obscure saving of state - stmia r0,{r4-r7} - movs r4,#0 @ Clear r4-r7 so that they don't interact with makesmallperm - movs r5,#0 - movs r6,#0 - movs r7,#0 - - bl randomisechaff @ Randomise block of memory mainly used for obscuring loads - - movs r0,#4 - ldr r1,=permscratch - bl makesmallperm @ Build random 4-way permutation determining order of bytes to be SBOXed - ldr r1,=permscratch @ Write out random addresses in advance to save two registers (reusing permscratch) - ldr r4,[r1] - ldr r0,=fourway - uxtab r5,r0,r4 - uxtab r6,r0,r4,ror#8 - uxtab r7,r0,r4,ror#16 - uxtab r8,r0,r4,ror#24 - stmia r1,{r5-r8} @ Store at r1=permscratch: fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3] - - bl gen_rand_sha @ Save some randomness for 
the resharing operation later - movs r7,r0 - bl gen_rand_sha - movs r8,r0 - - ldr r2,=lut_a - ldr r3,=lut_b - ldr r0,[r2,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) - eors r10,r0,r0,lsr#8 - uxtb r10,r10 @ R10 = a0^a1 - ldr r1,[r3,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) - eors r1,r0,r1 - eors r4,r1,r1,lsr#8 - uxtb r11,r4 @ R11 = a0^a1^b0^b1 - eor r10,r10,r11,lsl#8 @ R10 = a0^a1 | (a0^a1^b0^b1)<<8 - movs r12,r1,ror#16 @ R12 = c0^d0 | (c1^d1)<<8 | junk<<16 | junk<<24 - - ldr r1,=permscratch - ldr r11,=chaff -@ Using r1=permutedfourwaypointer, r2=lut_a, r3=lut_b, r7,r8=randomness, r10=(a0^a1)|(a0^a1^b0^b1)<<8, r11=chaff, r12=(c0^d0)|(c1^d1)<<8|junk -1: - ands r5,r1,#12 - adds r5,r11,r5 @ Align chaff address to r1 - ldr r6,[r1],#4 @ r6 = fourway + perm[i] (i=0-3, loop iteration) - ldr r5,[r5] @ Random load to mask previous load - - ands r9,r6,#12 - add r9,r11,r9 @ r9 = chaff address aligned to (r6 bic 3) mod 16 - ldrb r4,[r6,#0] - ldr r14,[r9,#0] @ Random load to mask previous load - eor r4,r4,r10 - eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - - ldrb r5,[r6,#4] - ldr r14,[r9,#4] @ Random load to mask previous load - eors r4,r4,r5 - eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - - ldrb r5,[r6,#8] - ldr r14,[r9,#8] @ Random load to mask previous load - eors r4,r4,r5 - eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - - ldrb r5,[r6,#12] - ldr r14,[r9,#12] @ Random load to mask previous load - eors r4,r4,r5 @ r4 = unsharedbyte[perm[i]]^a0^a1 | junk - eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - - ands r14,r4,#255 - ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] - and r14,r4,#15 - add r14,r14,#32 - ldrb r14,[r11,r14] @ Random load to mask previous load (r2 and r11 are both 0 mod 16) - eors r5,r5,r12 @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]^c0^d0 | junk<<8 | junk<<16 | junk<<24 -@ split r5 into two shares and store at [r6,#0] and [r6,#4] - strb r7,[r6,#0] - eors r5,r5,r7 - strb r5,[r6,#4] - - mov 
r5,r10,lsr#8 @ r5=a0^a1^b0^b1 - ldr r14,[r11,#44] @ Need to eor into a random destination register - eors r14,r4,r5 @ r14 = unsharedbyte[perm[i]]^b0^b1 | junk<<8 - and r14,r14,#255 - - ldrb r5,[r3,r14] @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1] - and r14,r14,#15 - add r4,r11,#24 - ldrb r14,[r4,r14] @ Random load to mask previous load (r3==8 and r11==0 mod 16) - eor r5,r5,r12,ror#8 @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]^c1^d1 | junk<<8 | junk<<16 | junk<<24 -@ split r5 into two shares and store at [r6,#8] and [r6,#12] - strb r8,[r6,#8] - eors r5,r5,r8 - strb r5,[r6,#12] - - movs r7,r7,ror#8 - movs r8,r8,ror#8 - - tst r1,#12 @ This does 4 loop iterations because permscratch is guaranteed to be 0 mod 16 - bne 1b - - ldr r0,=fourway - ldmia r0,{r4-r7} @ Load SBOXed values back into register r4-r7 - ldmia r11,{r8-r12,r14} @ Random load to mask previous load and to obfuscate registers - - pop {r1-r3,r12,r14} - CHK_CANARY r12,CTAG15,6 - bx r14 - -.balign 4 -.thumb_func -@ r1 = pointer to 4 x 4-way share (16 words); left unchanged -@ r3 = rkey_s+40*roundkeynumber; advanced by 40 -@ Trashes r8-r12 -@ If i = word number 0..3, -@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then -@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and -@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4]) -@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of Bptr[4])+16 -storeroundkey: - GET_CANARY r8,CTAG16,6 - push {r2,r8,r14} - -@ eor two 4-way share components to make a component of a 2-way share -@ Note that we load from 4-way share at a random address then convert to 2-way share and -@ store at a fixed address, rather than the other way around, so that 2-way shares are obscured -@ by vperm (we don't know which 2-way share is being processed at a particular point in time). 
-@ And (if RK_ROR) we rotate first before EORing down to 2-way, so there is never an unrotated 2-way share - - bl gen_rand_sha @ Get r0 = vperm for shareA of the round key - str r0,[r3,#16] - mov r8,r0,lsr#30 - rsb r8,r8,#0 @ r8=-vperm -.if RK_ROR - movs r2,#0 - usub8 r2,r2,r0 @ r2=-hperms -.endif - mov r9,#4 -1: - and r8,r8,#3 - adds r0,r1,r8,lsl#4 - - ldmia r0,{r10,r11} -.if RK_ROR - mov r10,r10,ror r2 - mov r11,r11,ror r2 - movs r2,r2,ror#8 -.endif - eor r10,r10,r11 - str r10,[r3],#4 - add r8,r8,#1 - subs r9,r9,#1 - bne 1b - - adds r1,r1,#8 - adds r3,r3,#4 @ skip over vperm (already stored) - - bl gen_rand_sha @ Get r0 = vperm for shareB of the round key - str r0,[r3,#16] - mov r8,r0,lsr#30 - rsb r8,r8,#0 @ r8=-vperm -.if RK_ROR - movs r2,#0 - usub8 r2,r2,r0 @ r2=-hperms -.endif - mov r9,#4 - ldr r12,=RKshareC - ldr r12,[r12] -1: - and r8,r8,#3 - adds r0,r1,r8,lsl#4 - ldmia r0,{r10,r11} - eor r10,r10,r12 @ Mix in RKshareC into round key shareB -.if RK_ROR - mov r10,r10,ror r2 - mov r11,r11,ror r2 - movs r2,r2,ror#8 -.endif - mov r10,r10,ror#16 - mov r11,r11,ror#16 - eor r10,r10,r11 - str r10,[r3],#4 - add r8,r8,#1 - subs r9,r9,#1 - bne 1b - - subs r1,r1,#8 @ Restore r1 = (r1 on entry) - adds r3,r3,#4 @ Set r3 = (r3 on entry) + 40 - - pop {r2,r8,r14} - CHK_CANARY r8,CTAG16,6 - bx r14 - -.balign 4 -.thumb_func -init_key_4way: -@ On entry, r0 points to 4-way shared raw key data (128 bytes) -@ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7 -@ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K. -@ -@ On exit, rkeys_s, a 40*15=600-byte region, is filled as follows. -@ Each of the 15 round keys is represented as two 5-word regions rka[0..4] and rkb[0..4], -@ each of which consists of 4 words of round key followed by a word encoding vperm and rotation (RK_ROR) information. -@ In addition a common share word, RKshareC, is set randomly. 
-@ For a given round, rk[i] = the i^th word of the actual round key is given by: -@ vpermA=rka[4]>>30 -@ vpermB=rkb[4]>>30 -@ rka_unrot[i] = rka[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of rka[4]) -@ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16 -@ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC - - GET_CANARY r12,CTAG17,6 - push {r0-r12,r14} - -@ Transfer 4-way key into local workspace, rerandomising the shares - mov r5,r0 @ r5=4-way key input - bl randomisechaff - ldr r6,=rkey4way - movs r7,#8 -1: - ldmia r5!,{r1-r4} - bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0 - bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0 - bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0 - stmia r6!,{r1-r4} - subs r7,r7,#1 - bne 1b - -@ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for -@ the 128-bit roundkeys 0 and 1, then expand from 2 to 15 roundkeys. - bl gen_rand_sha_nonpres - ldr r12,=RKshareC - str r0,[r12] @ Make RKshareC random word - ldr r3,=rkey_s @ r3=rkey_s - ldr r1,=rkey4way @ r1=rkey4way - bl storeroundkey @ Store round key 0 and advance r3 by 40 - adds r1,r1,#64 - bl storeroundkey @ Store round key 1 and advance r3 by 40 - adds r1,r1,#48 - ldmia r1!,{r4-r7} @ r4-r7 = 4-way share of previous round key word - @ r1=rkey4way+128 on entry to main loop - movs r2,#0 @ r2=word counter (0-51), offset from word 8 - -@ Note that r1-r3 are not sensitive values, so it's safe to stack -@ them and conditionally branch on them. - -@ rkey4way = 8 x 4 consecutive 4-way share words as cyclic buffer of -@ Rounds 0,1 Rounds 2,3 Rounds 12,13 Round 14 -@ a0 b0 c0 d0 -> a8 b8 c8 d8 -> ... 
-> a48 b48 c48 d48 -> a56 b56 c56 d56 -@ a1 b1 c1 d1 -> a9 b9 c9 d9 a49 b49 c49 d49 a57 b57 c57 d57 -@ a2 b2 c2 d2 etc a50 b50 c50 d50 a58 b58 c58 d58 -@ a3 b3 c3 d3 a51 b51 c51 d51 a59 b59 c59 d59 -@ a4 b4 c4 d4 a52 b52 c52 d52 =============== -@ a5 b5 c5 d5 a53 b53 c53 d53 -@ a6 b6 c6 d6 a54 b54 c54 d54 -@ a7 b7 c7 d7 a55 b55 c55 d55 - -init_key_expandloop: -@ r1 = pointer past one of eight 4-way shares of a roundkey word in the above cyclic buffer (r1=rkey4way+16i for i=1,...,8) -@ r2 = round key word counter (0-51), offset from word 8 (counting expanded roundkey words) -@ r3 = pointer to rkey_s+40*roundnumber = rkey_s+40*(2+[r2/4]) -@ r4-r7 = 4-way share of previous roundkey word - - tst r2,#7 - bne 1f - subs r1,r1,#128 @ Every 8th word, reset cyclic buffer pointer and do ROTWORD - movs r4,r4,ror#8 - movs r5,r5,ror#8 - movs r6,r6,ror#8 - movs r7,r7,ror#8 -1: - - tst r2,#3 - bne 1f - bl init_key_sbox @ Every 4th word, do SUBBYTES (sbox) on r4-r7 -1: - - tst r2,#7 - bne 1f - movs r0,r2,lsr#3 - mov r8,#1 - movs r8,r8,lsl r0 - eors r4,r4,r8 @ Every 8th word, add in round constant -1: - - ldmia r1,{r8-r11} @ eor with key from two rounds ago and advance r1 by 16 - eors r4,r4,r8 - eors r5,r5,r9 - eors r6,r6,r10 - eors r7,r7,r11 - stmia r1!,{r4-r7} - - add r2,r2,#1 - tst r2,#3 - bne 1f - subs r1,r1,#64 - bl storeroundkey @ Store round key 1+r2/4 and advance r3 by 40 - adds r1,r1,#64 -1: - - cmp r2,#52 - bne init_key_expandloop - - pop {r0-r12,r14} - CHK_CANARY r12,CTAG17,6 - bx r14 - -.ltorg - -@ Add the round key shares pointed to by r12 into the state shares -@ Trashes r0-r3 -.balign 4 -addrkey_s: - - ldr r0,=chaff @ guaranteed 0 mod 16 -.if ST_VPERM - ldr r3,=statevperm - ldr r3,[r3] @ r3=vperm state rotation in bottom two bits - ldr r2,[r0,#12] @ barrier load -.else - movs r3,#0 -.endif - bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 - ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits - ldr r2,[r0,#16] @ barrier load - - rsb 
r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot -@ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot -@ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr -.if RK_ROR - movs r0,r2,lsl#3 - movs r1,r1,ror r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; rors r0,r0,r1; eors r4,r4,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r5,r5,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r6,r6,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r7,r7,r0 -.else - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r4,r4,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r5,r5,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r6,r6,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r7,r7,r0 -.endif - clear03_preserve_r3 - add r12,r12,#20 - @ r0=chaff+16, r3=statevperm, r4-r11=state, r12=roundkeyBptr - - bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 - ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits - ldr r2,[r0,#16] @ barrier load - rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot - ldr r3,=RKshareC @ r3=common round key shareC - bfi r0,r3,#0,#4 - ldr r3,[r3] - ldr r0,[r0] @ barrier load - -@ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot -@ r1=rkeyBrotdata, r2=vpermkeyrot-vpermstaterot, r3=RKshareC, r4-r11=state, r12=roundkeyB ptr -.if RK_ROR - movs r0,r2,lsl#3 - movs r1,r1,ror r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; rors r0,r0,r1; eor r8,r8,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r9,r9,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors 
r0,r0,r1; eor r10,r10,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eor r11,r11,r0 -.else - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; eors r8,r8,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; eors r9,r9,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; eors r10,r10,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; eors r11,r11,r0 -.endif - clear03 - bx r14 - -.balign 4 -.thumb_func -@ de/encrypt data in place -@ r0: ivec -@ r1: buf -@ r2: n, number of blocks, n>0 -.if CT_BPERM -@ In AES-CTR each block can be independently en/decrypted as the encryption only depends on the IV, -@ the key, and the block number. We can therefore process them in any order, and using a -@ random order helps to defeat attacks that work on the output of the AES, since an attacker -@ wouldn't know what plaintext or ciphertext corresponds to a particular instruction. 
-.endif - -ctr_crypt_s: -@ r0=IV_shareA, r1=IV_shareB, r2=cipher/plaintext buffer, r3=number of blocks - GET_CANARY r12,CTAG0,6 - push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets - - push {r0-r3} - - SET_COUNT 93,6 - -.if CT_BPERM -@ Initialise 32 random numbers (which fit in half-words) -@ r3=number of blocks - ldr r4,=bperm_rand - movs r5,#32 -1: - bl gen_rand_sha - umull r0,r2,r0,r3 @ Random number between 0 and n-1 (n=#blocks) - strh r2,[r4],#2 - subs r5,r5,#1 - bne 1b -.endif - - bl randomisechaff - -@ Refresh IVshareA and IVshareB, convert to ror#16 format and store the result at IV0 -@ Not doing shareC or state vperm at this point - pop {r0} - ldmia r0,{r4-r7} @ r4-r7 = IVshareA - clear03 16 - pop {r1} - ldmia r1,{r8-r11} @ r8-r11 = IVshareB - clear03 32 - bl gen_rand_sha_nonpres; eors r4,r4,r0; movs r1,#0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc - bl gen_rand_sha_nonpres; eors r5,r5,r0; movs r1,#0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16 - bl gen_rand_sha_nonpres; eors r6,r6,r0; movs r1,#0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16 - bl gen_rand_sha_nonpres; eors r7,r7,r0; movs r1,#0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16 - ldr r0,=IV0 - stmia r0,{r4-r7} - adds r0,r0,#20 - stmia r0,{r8-r11} -@ "Decommission" IV0 so that it doesn't get stacked - bl gen_rand_sha_nonpres; movs r4,r0 - bl gen_rand_sha_nonpres; movs r5,r0 - bl gen_rand_sha_nonpres; movs r6,r0 - bl gen_rand_sha_nonpres; movs r7,r0 - bl gen_rand_sha_nonpres; mov r8,r0 - bl gen_rand_sha_nonpres; mov r9,r0 - bl gen_rand_sha_nonpres; mov r10,r0 - bl gen_rand_sha_nonpres; mov r11,r0 - pop {r1,r2} -@ r1=cipher/plaintext buffer, r2=number of blocks - - movs r3,#0 - CHK_COUNT 93,6 - -ctr_crypt_mainloop: - SET_COUNT 80,6 -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter - -@ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it) - push 
{r1-r3} -@ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret) - - tst r3,#(REFCHAFF_PERIOD-1) - bne 1f - bl refreshchaff_and_lfsr -1: - - ldr r3,[r13,#8] @ get block count off the stack - tst r3,#(REMAP_PERIOD-1) - bne 1f - bl remap @ shuffle the LUTs; this preserves R3 -1: - CHK_COUNT 80,6 - - tst r3,#(REFROUNDKEYSHARES_PERIOD-1) - bne 1f - bl ref_roundkey_shares_s @ refresh the round key shares -1: - - ldr r3,[r13,#8] @ get block count off the stack - tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1) - bne 1f - bl ref_roundkey_hvperms_s @ refresh the round key vperms -1: - - CHK_COUNT 81,6 - - pop {r1-r3} -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter - -@ Now calculate r12 = block number-to-be-deciphered from r3 = block counter -.if CT_BPERM -@ Use a "swap-or-not" method to generate an "oblivious" permutation; see makeperm.py version 7 - push {r1} - ldr r0,=murmur3_constants - ldmia r0,{r9-r12,r14} @ load five murmur3_32 hash constants - ldr r0,=bperm_rand - movs r1,#31 - movs r4,r3 @ r4=i -1: - ldrh r5,[r0],#2 @ r5=k - subs r5,r5,r4 @ r5=k-i - ands r6,r2,r5,asr#31 @ r6=n*(k-i<0) - adds r5,r5,r6 @ r5=j=(k-i)%n - adds r6,r4,r5 @ r6=i+j - subs r7,r4,r5 @ r7=i-j - and r8,r7,r7,asr#31 @ r8=min(i-j,0) - sub r7,r7,r8,lsl#1 @ r7=|i-j| - mla r6,r6,r2,r7 @ r6=n(i+j)+|i-j|, encodes the unordered pair {i,j} - eors r6,r6,r1,lsl#27 @ mix with swap-or-not round counter to get different hash functions -@ Now do murmur3_32 hash of r6 - mul r6,r6,r9 - movs r6,r6,ror#17 - mul r6,r6,r10 - movs r6,r6,ror#19 - adds r6,r6,r6,lsl#2 - add r6,r6,r11 - eors r6,r6,#4 - eors r6,r6,r6,lsr#16 - mul r6,r6,r12 - eors r6,r6,r6,lsr#13 - mul r6,r6,r14 - eors r6,r6,r6,lsr#16 @ not actually used here -@ Now set i to j, conditional on the top bit of r6 - subs r7,r5,r4 @ r7=j-i - ands r7,r7,r6,asr#31 @ r7=(j-i)*(top bit of r6) - adds r4,r4,r7 @ r4=j if top bit of r6, else i - subs r1,r1,#1 - bpl 1b - pop {r1} - mov r12,r4 -.else - mov 
r12,r3 -.endif - CHK_COUNT 82,6 - -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered) - push {r1-r3,r12} -@ r4-r11 = IV0, r12=block number - -processIV: @ non-target label to assist power analysis - ldr r8,=IV0 - ldmia r8,{r4-r7} @ load IV0_A - clear03 16 - add r8,r8,#20 - ldmia r8,{r8-r11} @ load IV0_B - clear03 32 - rev r0,r12 - eor r7,r7,r0 @ XOR in block number to IV0. IV(block n) = IV0 ^ n, cf standard CTR mode IV0 + n. - @ XOR (vs addition) is compatible with XOR-shares, so stealthier/simpler because don't have to unshare to work out IV(block n) -@ r4-r11 = IV for the current block - CHK_COUNT 83,6 -.if ST_SHAREC - bl gen_rand_sha_nonpres @ Create state share C; all bytes the same - ands r0,r0,#255 - orrs r0,r0,r0,lsl#8 - orrs r12,r0,r0,lsl#16 - ldr r1,=shareC - str r12,[r1] -.else - movs r12,#0 -.endif -@ r4-r11 = IV for the current block w/o shareC, r12=shareC -@ refresh state shares and mix in shareC - bl gen_rand_sha_nonpres; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc - bl gen_rand_sha_nonpres; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16 - bl gen_rand_sha_nonpres; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16 - bl gen_rand_sha_nonpres; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16 -.if ST_VPERM - bl gen_rand_sha_nonpres - ldr r1,=statevperm - movs r2,#0 - str r2,[r1] - bl addstatevperm @ Initialise state vperm (use SHA RNG to start with, later refreshes are with LFSR RNG) -.endif - - CHK_COUNT 84,6 - bl conjshareC @ Add the effect of shareC to lut_a, lut_b - CHK_COUNT 85,6 -@ now perform the 15 encryption rounds on (key, state=IV+x) -@ here r4-r7, r8-r11: state - mov r2,#0 @ round counter -rounds_s_mainloop: - ldr r12,=rkey_s - add r12,r12,r2,lsl#5 @ pointer to key shares for this round - add r12,r12,r2,lsl#3 - push {r2} @ save round count - bl addrkey_s 
- bl map_sbox_s - bl shift_rows_s -.if ST_VPERM - ldr r2,[r13] @ peek at stack to get round count - cmp r2,#NUMREFSTATEVPERM - bcs 1f - bl gen_rand_lfsr_nonpres - ldr r1,=statevperm - bl addstatevperm @ V shuffle of r4-r11 -1: -.endif - pop {r2} - adds r2,r2,#1 @ increment round counter - cmp r2,#14 - beq 2f @ break from loop? (last round has no mix_cols) - push {r2} - bl mix_cols_s - pop {r2} - b rounds_s_mainloop -2: - CHK_COUNT 86,6 - ldr r12,=rkey_s+14*40 @ final round key shares - bl addrkey_s - CHK_COUNT 87,6 - bl conjshareC @ Undo the effect of shareC from lut_a, lut_b - CHK_COUNT 88,6 -.if ST_VPERM -@ Undo the effects of vperm rotation recorded in statevperm - ldr r1,=statevperm - ldr r2,[r1] - rsbs r0,r2,#0 - bl addstatevperm -.endif - - pop {r1-r3,r12} - push {r3} -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered - -decryption_start: -@ Decrypt ciphertext using AES output in shares: r4-r11 -.if ST_SHAREC - ldr r0,=shareC - ldr r0,[r0] -.else - movs r0,#0 -.endif - ldr r14,=chaff -@ r0=shareC, r1=cipher/plaintext buffer, r2=number of blocks, r3=free, r4-r11=stateA/B, r12=block to be deciphered, r14=chaff - CHK_COUNT 89,6 - add r1,r1,r12,lsl#4 @ Temporarily r1 points to block-to-be-deciphered - ldr r3,[r1] @ r3=ciphertext word - eors r3,r3,r4 @ r3=r3^shareA - ldr r4,[r14] @ barrier load - eor r3,r3,r8,ror#16 @ r3=r3^shareB - eors r3,r3,r0 @ r3=r3^shareC - str r3,[r1] @ plaintext word=r3 - ldr r3,[r1,#4] @ and similarly for words 1,2,3 of block... 
- ldr r4,[r14,#4] - eors r3,r3,r5 - eor r3,r3,r9,ror#16 - eors r3,r3,r0 - str r3,[r1,#4] - ldr r3,[r1,#8] - ldr r4,[r14,#8] - eors r3,r3,r6 - eor r3,r3,r10,ror#16 - eors r3,r3,r0 - str r3,[r1,#8] - ldr r3,[r1,#12] - ldr r4,[r14,#12] - eors r3,r3,r7 - eor r3,r3,r11,ror#16 - eors r3,r3,r0 - str r3,[r1,#12] - - sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer - CHK_COUNT 90,6 - - pop {r3} @ Restore block counter -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter -decryption_end: - - adds r3,r3,#1 - cmp r3,r2 - CHK_COUNT 91,6 - bne ctr_crypt_mainloop - -#if WIPE_MEMORY -@ Wipe memory from workspace_start up to the stack pointer -@ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals - ldr r4,=workspace_start - ldr r5,=rstate_all_start -1: - bl gen_rand_sha_nonpres - stmia r4!,{r0} - cmp r4,r5 - bcc 1b - ldr r4,=rstate_all_end - mov r5,r13 @ gcc arm assembler says cmp r4,r13 is deprecated, so use another register -1: - bl gen_rand_sha_nonpres - stmia r4!,{r0} - cmp r4,r5 - bcc 1b - -@ Then fill everything with zeros so as not to leave behind clues about the RNG state - ldr r4,=workspace_start - movs r0,#0 - mov r5,r13 -1: - stmia r4!,{r0} - cmp r4,r5 - bcc 1b -#endif - -.if GEN_RAND_SHA - SET_COUNT 23,6 - bl reset_sha_trng @ clear out the SHA hardware -.endif - pop {r0-r12,r14} - CHK_CANARY r12,CTAG0,6 - bx r14 diff --git a/bootloaders/encrypted/config.h b/bootloaders/encrypted/config.h deleted file mode 100644 index 2c4ce0d03..000000000 --- a/bootloaders/encrypted/config.h +++ /dev/null @@ -1,90 +0,0 @@ -#pragma once - -// These options (up to long /////////////// line) should be enabled because the security risk of not using them is too high -// or because the time cost is very low so you may as well have them. -// They can be set to 0 for analysis or testing purposes. 
- -#ifndef GEN_RAND_SHA -#define GEN_RAND_SHA 1 // use SHA256 hardware to generate some random numbers -#endif - // Some RNG calls are hard coded to LFSR RNG, others to SHA RNG - // Setting GEN_RAND_SHA to 0 has the effect of redirecting the latter to LFSR RNG -#ifndef ST_SHAREC -#define ST_SHAREC 1 // This creates a partial extra share at almost no extra cost -#endif -#ifndef ST_VPERM -#define ST_VPERM 1 // insert random vertical permutations in state during de/encryption? -#endif -#ifndef CT_BPERM -#define CT_BPERM 1 // process blocks in a random order in counter mode? -#endif -#ifndef RK_ROR -#define RK_ROR 1 // store round key shares with random rotations within each word -#endif - -#ifndef WIPE_MEMORY -#define WIPE_MEMORY 1 // Wipe memory after decryption -#endif - -// The following options should be enabled to increase resistance to glitching attacks. - -#ifndef RC_CANARY -#define RC_CANARY 1 // use rcp_canary feature -#endif -#ifndef RC_COUNT -#define RC_COUNT 1 // use rcp_count feature -#endif - -// Although jitter/timing-variation may be circumventable in theory, in practice -// randomising the timing of operations can make side-channel attacks very much more -// effort to carry out. These can be disabled for analysis or testing purposes. -// It is advisable to use a least one form of jitter. - -// RC_JITTER is quite slow, and is probably the most predictable of the three, so it is disabled by default. -// (Leaving it as an option because it's just possible that the large delays it produces are advantageous in defeating certain side-channel attacks.) -#ifndef RC_JITTER -#define RC_JITTER 0 // 0-7. Higher = more jitter. Governs use of random-delay versions of RCP instructions. 
-#endif - -#ifndef SH_JITTER -#define SH_JITTER 1 // Insert random delays, tagged onto SHA RNG -#endif - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// The following options can be adjusted, affecting the performance/security tradeoff - -// Period = X means that the operation in question occurs every X blocks, so higher = more performance and lower security. -// No point in making them more than 16 or so, since the time taken by the subroutines would be negligible. -// These must be a power of 2. Timings as of commit 82d31652 -// -// Baseline time per 16-byte block = 14109 (with no jitter) cycles -#ifndef REFCHAFF_PERIOD -#define REFCHAFF_PERIOD 1 // Extra cost per 16-byte block = 474/REFCHAFF_PERIOD cycles -#endif -#ifndef REMAP_PERIOD -#define REMAP_PERIOD 4 // Extra cost per 16-byte block = 4148/REMAP_PERIOD cycles -#endif -#ifndef REFROUNDKEYSHARES_PERIOD -#define REFROUNDKEYSHARES_PERIOD 1 // Extra cost per 16-byte block = 1304/REFROUNDKEYSHARES_PERIOD cycles -#endif -#ifndef REFROUNDKEYHVPERMS_PERIOD -#define REFROUNDKEYHVPERMS_PERIOD 1 // Extra cost per 16-byte block = 1486/REFROUNDKEYVPERM_PERIOD cycles -#endif - -// Setting NUMREFSTATEVPERM to X means that state vperm refreshing happens on the first X AES rounds only, -// so lower = more performance and lower security. -// The rationale for doing it this way is that later rounds should be protected by CT_BPERM. -// NUMREFSTATEVPERM can be from 0 to 14. 
-#ifndef NUMREFSTATEVPERM -#define NUMREFSTATEVPERM 7 // Extra cost per 16-byte block = 61*NUMREFSTATEVPERM cycles -#endif - -//////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#define MAX_NUM_BLOCKS 32768 - -#if SH_JITTER && !GEN_RAND_SHA -#error GEN_RAND_SHA must be set if you want to use SH_JITTER -#endif diff --git a/bootloaders/encrypted/enc-pt.json b/bootloaders/encrypted/enc-pt.json index e9a12b7dd..9c5c3a17e 100644 --- a/bootloaders/encrypted/enc-pt.json +++ b/bootloaders/encrypted/enc-pt.json @@ -12,8 +12,8 @@ { "name": "A", "id": 0, - "start": "40K", - "size": "480K", + "start": "64K", + "size": "448K", "families": ["rp2350-arm-s"], "permissions": { "secure": "rw", @@ -24,7 +24,7 @@ { "name": "B", "id": 1, - "size": "480K", + "size": "448K", "families": ["rp2350-arm-s"], "permissions": { "secure": "rw", diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c index 54e89d2e5..d6cce4d6a 100644 --- a/bootloaders/encrypted/enc_bootloader.c +++ b/bootloaders/encrypted/enc_bootloader.c @@ -13,17 +13,94 @@ #include "hardware/structs/otp.h" #include "hardware/structs/qmi.h" #include "hardware/structs/xip_ctrl.h" +#include "hardware/clocks.h" +#include "hardware/xosc.h" +#include "hardware/structs/rosc.h" +#include "hardware/pll.h" -#include "config.h" - -#define OTP_KEY_PAGE 30 +#define OTP_KEY_PAGE 29 extern void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk); +// These just have to be higher than the actual frequency, to prevent overclocking unused peripherals +#define ROSC_HZ 300*MHZ +#define OTHER_CLK_DIV 30 + + +void runtime_init_clocks(void) { + // Disable resus that may be enabled from previous software + clocks_hw->resus.ctrl = 0; + + uint32_t rosc_div = 2; // default divider 2 + uint32_t rosc_drive = 0x7777; // default drives of 0b111 (0x7) + + // Bump up ROSC speed to ~110MHz + rosc_hw->freqa = 0; // reset 
the drive strengths + rosc_hw->div = rosc_div | ROSC_DIV_VALUE_PASS; // set divider + // Increment the freqency range one step at a time - this is safe provided the current config is not TOOHIGH + // because ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH == ROSC_CTRL_FREQ_RANGE_VALUE_HIGH + static_assert((ROSC_CTRL_FREQ_RANGE_VALUE_LOW | ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM) == ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM); + static_assert((ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH) == ROSC_CTRL_FREQ_RANGE_VALUE_HIGH); + hw_set_bits(&rosc_hw->ctrl, ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM); + hw_set_bits(&rosc_hw->ctrl, ROSC_CTRL_FREQ_RANGE_VALUE_HIGH); + + // Enable rosc randomisation + rosc_hw->freqa = (ROSC_FREQA_PASSWD_VALUE_PASS << ROSC_FREQA_PASSWD_LSB) | + rosc_drive | ROSC_FREQA_DS1_RANDOM_BITS | ROSC_FREQA_DS0_RANDOM_BITS; // enable randomisation + + // Not used with FREQ_RANGE_VALUE_HIGH, but should still be set to the maximum drive + rosc_hw->freqb = (ROSC_FREQB_PASSWD_VALUE_PASS << ROSC_FREQB_PASSWD_LSB) | + ROSC_FREQB_DS7_LSB | ROSC_FREQB_DS6_LSB | ROSC_FREQB_DS5_LSB | ROSC_FREQB_DS4_LSB; + + // CLK SYS = ROSC directly, as it's running slowly enough + clock_configure_int_divider(clk_sys, + CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLKSRC_CLK_SYS_AUX, + CLOCKS_CLK_SYS_CTRL_AUXSRC_VALUE_ROSC_CLKSRC, + ROSC_HZ, // this doesn't have to be accurate + 1); + + // CLK_REF = ROSC / OTHER_CLK_DIV - this isn't really used, so just needs to be set to a low enough frequency + clock_configure_int_divider(clk_ref, + CLOCKS_CLK_REF_CTRL_SRC_VALUE_ROSC_CLKSRC_PH, + 0, + ROSC_HZ, + OTHER_CLK_DIV); + + + // Everything else should run from PLL USB, so we can use UART and USB for output + xosc_init(); + pll_init(pll_usb, PLL_USB_REFDIV, PLL_USB_VCO_FREQ_HZ, PLL_USB_POSTDIV1, PLL_USB_POSTDIV2); + + // CLK USB = PLL USB 48MHz / 1 = 48MHz + clock_configure_undivided(clk_usb, + 0, // No GLMUX + CLOCKS_CLK_USB_CTRL_AUXSRC_VALUE_CLKSRC_PLL_USB, + USB_CLK_HZ); + + 
// CLK ADC = PLL USB 48MHz / 1 = 48MHz + clock_configure_undivided(clk_adc, + 0, // No GLMUX + CLOCKS_CLK_ADC_CTRL_AUXSRC_VALUE_CLKSRC_PLL_USB, + USB_CLK_HZ); + + // CLK PERI = PLL USB 48MHz / 1 = 48MHz. Used as reference clock for UART and SPI serial. + clock_configure_undivided(clk_peri, + 0, + CLOCKS_CLK_PERI_CTRL_AUXSRC_VALUE_CLKSRC_PLL_USB, + USB_CLK_HZ); + + // CLK_HSTX = PLL USB 48MHz / 1 = 48MHz. Transmit bit clock for the HSTX peripheral. + clock_configure_undivided(clk_hstx, + 0, + CLOCKS_CLK_HSTX_CTRL_AUXSRC_VALUE_CLKSRC_PLL_USB, + USB_CLK_HZ); +} + // The function lock_key() is called from decrypt() after key initialisation is complete and before decryption begins. // That is a suitable point to lock the OTP area where key information is stored. void lock_key() { otp_hw->sw_lock[OTP_KEY_PAGE] = 0xf; + otp_hw->sw_lock[OTP_KEY_PAGE + 1] = 0xf; } @@ -121,15 +198,6 @@ int main() { reset_usb_boot(0, 0); } - printf("OTP Valid Keys %x\n", otp_hw->key_valid); - - printf("Unlocking\n"); - for (int i=0; i<4; i++) { - uint32_t key_i = ((i*2+1) << 24) | ((i*2+1) << 16) | - (i*2 << 8) | i*2; - otp_hw->crt_key_w[i] = key_i; - } - uint8_t iv[16]; data_start_addr += first_mb_end; memcpy(iv, (void*)(XIP_BASE + data_start_addr), sizeof(iv)); @@ -153,12 +221,12 @@ int main() { decrypt( (uint8_t*)&(otp_data[OTP_KEY_PAGE * 0x40]), - (uint8_t*)&(otp_data[(OTP_KEY_PAGE + 1) * 0x40]), + (uint8_t*)&(otp_data[(OTP_KEY_PAGE + 2) * 0x40]), iv, (void*)SRAM_BASE, data_size/16 ); // Lock the IV salt - otp_hw->sw_lock[OTP_KEY_PAGE + 1] = 0xf; + otp_hw->sw_lock[OTP_KEY_PAGE + 2] = 0xf; printf("Post decryption image begins with\n"); for (int i=0; i < 4; i++) @@ -166,7 +234,7 @@ int main() { printf("Chaining into %x, size %x\n", SRAM_BASE, data_size); - stdio_deinit_all(); + stdio_uart_deinit(); // stdio_usb_deinit doesn't work here, so only deinit UART rc = rom_chain_image( workarea, @@ -175,7 +243,7 @@ int main() { data_size ); - stdio_init_all(); + stdio_uart_init(); printf("Shouldn't 
return from ROM call %d\n", rc);
 
     reset_usb_boot(0, 0);
diff --git a/bootloaders/encrypted/mbedtls_aes.c b/bootloaders/encrypted/mbedtls_aes.c
new file mode 100644
index 000000000..9f19c9b4d
--- /dev/null
+++ b/bootloaders/encrypted/mbedtls_aes.c
@@ -0,0 +1,73 @@
+#include <mbedtls/aes.h>
+#include "pico/stdlib.h"
+
+extern void lock_key();
+
+int mb_aes_crypt_ctr_xor(mbedtls_aes_context *ctx,
+                         size_t length,
+                         unsigned char iv0[16],
+                         unsigned char nonce_xor[16],
+                         unsigned char stream_block[16],
+                         const unsigned char *input,
+                         unsigned char *output)
+{
+    int c;
+    int ret = 0;
+    size_t n = 0;
+    uint32_t counter = 0;
+
+    assert(length == (uint32_t)length);
+
+    while (length--) {
+        if (n == 0) {
+            for (int i = 16; i > 0; i--) {
+                nonce_xor[i-1] = iv0[i-1];
+                if (i - (int)(16 - sizeof(counter)) > (int)0) {
+                    nonce_xor[i-1] ^= (unsigned char)(counter >> ((16-i)*8));
+                }
+            }
+
+            ret = mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, nonce_xor, stream_block);
+            if (ret != 0) {
+                break;
+            }
+            counter++;
+        }
+        c = *input++;
+        *output++ = (unsigned char) (c ^ stream_block[n]);
+
+        n = (n + 1) & 0x0F;
+    }
+
+    return ret;
+}
+
+void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk) {
+    mbedtls_aes_context aes;
+
+    uint32_t aes_key[8];
+    uint32_t* key4waywords = (uint32_t*)key4way;
+    // Key is stored as a 4-way share of each word, ie X[0] = A[0] ^ B[0] ^ C[0] ^ D[0], stored as A[0], B[0], C[0], D[0]
+    for (int i=0; i < count_of(aes_key); i++) {
+        int skip = (i/4)*16;    // skip every other 16 words (64 bytes), due to the FIB workaround
+        aes_key[i] = key4waywords[i*4 + skip]
+                   ^ key4waywords[i*4 + 1 + skip]
+                   ^ key4waywords[i*4 + 2 + skip]
+                   ^ key4waywords[i*4 + 3 + skip];
+    }
+
+    uint8_t iv[16];
+    for (int i=0; i < sizeof(iv); i++) {
+        iv[i] = IV_OTPsalt[i] ^ IV_public[i];
+    }
+
+    int len = nblk * 16;
+
+    mbedtls_aes_setkey_enc(&aes, (uint8_t*)aes_key, 256);
+
+    lock_key();
+
+    uint8_t xor_working_block[16] = {0};
+    uint8_t stream_block[16] = {0};
+    
mb_aes_crypt_ctr_xor(&aes, len, (uint8_t*)iv, xor_working_block, stream_block, (uint8_t*)buf, (uint8_t*)buf); +} diff --git a/bootloaders/encrypted/mbedtls_config.h b/bootloaders/encrypted/mbedtls_config.h new file mode 100644 index 000000000..7b1c073c1 --- /dev/null +++ b/bootloaders/encrypted/mbedtls_config.h @@ -0,0 +1,9 @@ +#ifndef _MBEDTLS_CONFIG_H +#define _MBEDTLS_CONFIG_H + +#define MBEDTLS_HAVE_ASM +#define MBEDTLS_AES_C +#define MBEDTLS_AES_ROM_TABLES +#define MBEDTLS_CIPHER_MODE_CTR + +#endif diff --git a/encrypted/hello_encrypted/CMakeLists.txt b/encrypted/hello_encrypted/CMakeLists.txt index 7ec352727..443e5dfa4 100644 --- a/encrypted/hello_encrypted/CMakeLists.txt +++ b/encrypted/hello_encrypted/CMakeLists.txt @@ -1,4 +1,4 @@ -# Example encrypted binary +# Example encrypted binary - this should be secure against side channel attacks add_executable(hello_encrypted hello_encrypted.c secret.S @@ -35,8 +35,7 @@ pico_hash_binary(hello_encrypted) pico_encrypt_binary(hello_encrypted ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin ${CMAKE_CURRENT_LIST_DIR}/ivsalt.bin - EMBED - OTP_KEY_PAGE 29) + EMBED) # package uf2 in flash pico_package_uf2_output(hello_encrypted 0x10000000) @@ -48,7 +47,7 @@ pico_add_extra_outputs(hello_encrypted) example_auto_set_url(hello_encrypted) -# Example encrypted binary using MbedTLS +# Example encrypted binary using MbedTLS - this is faster, but not secure against side channel attacks add_executable(hello_encrypted_mbedtls hello_encrypted.c secret.S