diff --git a/README.md b/README.md index f718271ea..df0afd5c6 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,8 @@ App|Description App|Description ---|--- -[hello_encrypted](encrypted/hello_encrypted) | Create a self-decrypting binary. +[hello_encrypted](encrypted/hello_encrypted) | Create a self-decrypting binary, using the hardened decryption stage. This should be secure against side channel attacks. +[hello_encrypted_mbedtls](encrypted/hello_encrypted) | Create a self-decrypting binary, using the MbedTLS decryption stage. This is not secure against side channel attacks, so is fast but provides limited protection. ### HSTX (RP235x Only) diff --git a/bootloaders/encrypted/CMakeLists.txt b/bootloaders/encrypted/CMakeLists.txt index 2d6d77f0d..fb7eb2d48 100644 --- a/bootloaders/encrypted/CMakeLists.txt +++ b/bootloaders/encrypted/CMakeLists.txt @@ -1,15 +1,19 @@ # Encrypted Bootloader add_executable(enc_bootloader enc_bootloader.c - aes.S + mbedtls_aes.c ) # pull in common dependencies -target_link_libraries(enc_bootloader pico_stdlib pico_rand) +target_link_libraries(enc_bootloader pico_stdlib pico_rand pico_mbedtls) # use stack guards, as AES variables are written near the stack target_compile_definitions(enc_bootloader PRIVATE PICO_USE_STACK_GUARDS=1) +target_link_options(enc_bootloader PUBLIC -Wl,--print-memory-usage) + +target_include_directories(enc_bootloader PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + # set as no_flash binary pico_set_binary_type(enc_bootloader no_flash) @@ -35,8 +39,8 @@ function(add_linker_script target origin length) pico_set_linker_script(${target} ${CMAKE_CURRENT_BINARY_DIR}/${target}.ld) endfunction() -# create linker script to run from 0x20078000 -add_linker_script(enc_bootloader "0x20078000" "32k") +# create linker script to run from 0x20070000 +add_linker_script(enc_bootloader "0x20070000" "64k") # sign, hash, and clear SRAM pico_sign_binary(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/private.pem) @@ -50,6 +54,9 @@ 
pico_embed_pt_in_binary(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/enc-pt.json) pico_set_uf2_family(enc_bootloader "absolute") pico_package_uf2_output(enc_bootloader 0x10000000) +# optionally enable USB output in addition to UART +# pico_enable_stdio_usb(enc_bootloader 1) + # create map/bin/hex/uf2 file etc. pico_add_extra_outputs(enc_bootloader) @@ -83,6 +90,9 @@ pico_encrypt_binary(hello_serial_enc ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin ${ # package uf2 in flash pico_package_uf2_output(hello_serial_enc 0x10000000) +# optionally enable USB output in addition to UART +# pico_enable_stdio_usb(hello_serial_enc 1) + # create map/bin/hex/uf2 file etc. pico_add_extra_outputs(hello_serial_enc) diff --git a/bootloaders/encrypted/README.md b/bootloaders/encrypted/README.md index 0e10e5e3d..790b9605b 100644 --- a/bootloaders/encrypted/README.md +++ b/bootloaders/encrypted/README.md @@ -1,5 +1,7 @@ For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Make sure you **don't lose your keys and salts**, else you may not be able to update the code on your device. +This bootloader uses MbedTLS for decryption, so it is not secure against side channel attacks and therefore only offers limited protection against physical attackers. + Your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with: ```bash diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S deleted file mode 100644 index 093c4b0f1..000000000 --- a/bootloaders/encrypted/aes.S +++ /dev/null @@ -1,1944 +0,0 @@ -/* MEMORY LAYOUT ASSUMPTIONS - -The "chaff" area must be located at the start of Y scratch RAM, 0x20081000: see -the macro getchaffaddress. - -The stack must be located at the end of Y scratch RAM: see the memory -wiping at the end of ctr_crypt_s where memory between the start of Y -scratch RAM and the stack pointer is overwritten. 
-*/ - -.syntax unified -.cpu cortex-m33 -.thumb - -#include "config.h" -#include "hardware/platform_defs.h" -#include "hardware/regs/addressmap.h" -#include "hardware/regs/clocks.h" -#include "hardware/regs/sha256.h" -#include "hardware/regs/resets.h" -#include "hardware/regs/rosc.h" -#include "hardware/regs/trng.h" -#include "hardware/rcp.h" - -.global decrypt -.global chaff - -.extern lock_key - -@ RCP macros - -#define CTAG0 0x2a -#define CTAG1 0x2b -#define CTAG2 0x2c -#define CTAG3 0x2d -#define CTAG4 0x2e -#define CTAG5 0x30 -#define CTAG6 0x31 -#define CTAG7 0x32 -#define CTAG8 0x33 -#define CTAG9 0x34 -#define CTAG10 0x35 @ not used -#define CTAG11 0x36 @ not used -#define CTAG12 0x37 -#define CTAG13 0x38 -#define CTAG14 0x39 -#define CTAG15 0x3a -#define CTAG16 0x3b -#define CTAG17 0x3c -#define CTAG18 0x3d @ not used - -@ number of blocks from the TRNG processed to initialise rstate_sha -#define TRNG_BLOCKS 25 - -@ The lower jitterpriorty is, the more the jitter -.macro SET_COUNT n,jitterpriority -.if RC_COUNT -.if RC_JITTER > \jitterpriority - rcp_count_set \n -.else - rcp_count_set_nodelay \n -.endif -.endif -.endm - -.macro CHK_COUNT n,jitterpriority -.if RC_COUNT -.if RC_JITTER > \jitterpriority - rcp_count_check \n -.else - rcp_count_check_nodelay \n -.endif -.endif -.endm - -.macro GET_CANARY rx,tag,jitterpriority -.if RC_CANARY -.if RC_JITTER > \jitterpriority - rcp_canary_get \rx,\tag -.else - rcp_canary_get_nodelay \rx,\tag -.endif -.endif -.endm - -.macro CHK_CANARY rx,tag,jitterpriority -.if RC_CANARY -.if RC_JITTER > \jitterpriority - rcp_canary_check \rx,\tag -.else - rcp_canary_check_nodelay \rx,\tag -.endif -.endif -.endm - -@ Clear internal stripe load registers, and r0-r3 -@ 0 <= offset <= 32 -.macro clear03 offset=0 - getchaffaddress r0,\offset - ldmia r0,{r0-r3} -.endm - -.macro clear03_preserve_r3 offset=0 - getchaffaddress r0,\offset - ldmia r0!,{r1-r2} - ldmia r0!,{r1-r2} -.endm - -.macro clear01 offset=0 - getchaffaddress r0,\offset 
- ldmia r0,{r0,r1} -.endm - -@ Put workspace in the second scratch area -@ The "a"=allocatable attribute (and possibly the %progbits attribute) are necessary to store the murmur3 constants, -@ otherwise they may end up silently replaced with 0 or 0xffffffff -.section .scratch_y.aes,"aw",%progbits - -workspace_start: - -@ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress -@ (It seems ADR does not work, nor is it possible to assert that chaff==0x20081000) -@ getchaffaddress is used by clear03 and clear01 and other sensitive cases which require the first load to be a random one -@ chaff has to be 0 mod 16 for other reasons -.macro getchaffaddress rx,offset=0 -@ ldr \rx,=(chaff+\offset) - mov \rx,#(0x1000+\offset) - movt \rx,#0x2008 -.endm -chaff: -.space 48 - -.balign 16 -rkey_s: @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words - @ see comment at init_key_4way for description of layout and meaning of rkey_s -.space 600 -rkey4way: @ scratch area for init_key_4way; could overlap this with other scratch space if need to save space -.space 128 -.if CT_BPERM -bperm_rand: @ 32 half words that define the oblivious permutation of blocks -.space 64 -.endif - -.balign 16 -permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s) -perm16: -.space 16 -@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s -.balign 16 -fourway: @ Must be 0 mod 16 -shareA: @ 0 mod 16 -.space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16 -shareB: @ 4 mod 16 -.space 20 -shareC: @ 8 mod 16 -.space 4 -statevperm: @ 12 mod 16 -.space 4 @ vperm state rotation: only last two bits are operational; other bits random -RKshareC: @ Round key common share C; see comment at init_key_4way for explanation -.space 4 -RKshareCchange: @ Temporary used by ref_roundkey_share_s -.space 4 -IV0: @ 2-way share of IV for block 0 -.space 36 @ Considering IV0 as a word pointer, the 
format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16) - @ The gap at IV0[4] is to defeat unsharing by internal striped memory registers - @ I.e., there are implicit XORs IV0[0]^IV0[4], IV0[1]^IV0[5], ..., that the 1 word offset renders useless - -@ Regardless of configuration, the code uses a single 256-entry LUT, -@ which is a simple S-box table. -@ The LUT is represented as two shares, lut_a and lut_b, -@ whose values must be EORed. Furthermore, the contents of each share are -@ scambled according to a 4-byte "map". The map comprises two bytes that -@ are EORed into the addressing of the share, and two bytes that are -@ EORed into the data read back from the share. Performing a lookup -@ of a value x involves computing -@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ c₁ ^ lut_b[x ^ b₀ ^ b₁] ^ d₀ ^ d₁ -@ where a₀, a₁, c₀ and c₁ are the "map" of the lut_a share and -@ b₀, b₁, d₀ and d₁ are the "map" of the lut_b share. -@ In practice the result of a lookup is itself represented in two -@ shares, namely -@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ d₀ and -@ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁ -.balign 16 -lut_a: @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup) -.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 -.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 -.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 -.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 -.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 -.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf -.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 -.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 -.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 -.byte 
0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb -.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 -.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 -.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a -.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e -.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf -.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 -lut_a_map: @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b -.space 4 -.space 4 @ align to 8 mod 16 -lut_b: @ LUT share B (must be 8 mod 16 so that init_key_sbox knows how to mask the lookup) -.space 256 -lut_b_map: -.space 4 -.space 4 @ align to multiple of 8 - -.balign 16 -rstate_all_start: @ Mark start of RNG data to allow selective memory wipe -rstate_sha: @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero -.space 16 -jstate: @ 32-bit jitter state -.space 4 -rstate_lfsr: @ 32-bit LFSR random state and constant used to step it -.space 4 -.word 0x1d872b41 @ constant that defines a maximal-length LFSR -rstate_all_end: @ Mark end of RNG data to allow selective memory wipe - -.if CT_BPERM -.balign 16 -murmur3_constants: @ Five constants used in murmur3_32 hash -.word 0xcc9e2d51 -.word 0x1b873593 -.word 0xe6546b64 -.word 0x85ebca6b -.word 0xc2b2ae35 -.endif - -scratch_y_end: - -@ Initialisation code in main .text section -.section .text,"ax",%progbits - -@ The following is copied from the A2 boot ROM code at src/main/arm/varm_boot_path.c with adjustments. -@ We feed a stream of bits from the TRNG into the SHA hardware accelerator to generate some -@ random numbers. 
-@ Trashes r0-r6 -.balign 4 -init_rstate: - CHK_COUNT 24,6 - ldr r4,=TRNG_BASE+TRNG_RNG_IMR_OFFSET - ldr r5,=SHA256_BASE - movs r1,#1 - str r1,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] - ldr r6,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] @ reads as 0 - movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS @ initialise SHA internal state by writing START bit - str r1,[r5,#SHA256_CSR_OFFSET] - str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET -TRNG_RNG_IMR_OFFSET] - movs r6,#TRNG_BLOCKS*2+1 @ odd so that we break out of the loop half-way through loading the SHA hardware, giving - @ time for previous SHA computation to complete -2: - movs r1,#0xff @ TRNG setup is inside loop in case it is skipped. - str r1,[r4,#TRNG_TRNG_DEBUG_CONTROL_OFFSET-TRNG_RNG_IMR_OFFSET] @ disable checks and bypass decorrelators,to stream raw TRNG ROSC samples - str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] @ start ROSC if it is not already started - str r1,[r4,#TRNG_RNG_ICR_OFFSET -TRNG_RNG_IMR_OFFSET] @ clear all interrupts (including EHR_VLD) - adds r0,r4,#TRNG_EHR_DATA0_OFFSET -TRNG_RNG_IMR_OFFSET - movs r2,#TRNG_TRNG_BUSY_OFFSET -TRNG_RNG_IMR_OFFSET -1: - ldr r1,[r4,r2] @ wait for 192 ROSC samples to fill EHR,should take constant time - cmp r1,#0 - bne 1b - subs r6,#1 @ done? 
- beq 3f - movs r1,#8 -1: - ldmia r0!,{r2} @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1) - str r2,[r5,#SHA256_WDATA_OFFSET] @ for a total of half a SHA-256 block - subs r1,#1 - bne 1b - ldr r2,[r5,#SHA256_SUM0_OFFSET] @ TRNG is now sampling again; use some SHA bits to modulate the chain length - str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] - b.n 2b - -3: - CHK_COUNT 25,6 - str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] @ turn off rand source and wipe SHA bits left in TRNG config; r1=0 - str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] - adds r5,r5,#SHA256_SUM0_OFFSET -@ r5=SHA256 SUM0 register (r5+4=SUM1, r4+8=SUM2, etc) - ldmia r5,{r0-r3} @ load first 4 words of the 8 word SHA256 output - ldr r6,=rstate_sha -@ r5=SHA256 SUM0 register (r5+4=SUM1, r4+8=SUM2, etc), r6=rstate_sha - stmia r6,{r0-r3} - CHK_COUNT 26,6 - movs r0,#0 - strb r0,[r6] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data" - -@ try to find a non-zero initialiser to create a non-degenerate LFSR random state - ldr r1,[r5,#16] @ SHA SUM4 - cbnz r1,1f @ is word 4 non-zero? then use it - ldr r1,[r5,#20] @ SHA SUM5 - cbnz r1,1f @ otherwise, is word 5 non-zero? use it - mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) -1: - str r1,[r6,#rstate_lfsr-rstate_sha] - -@ try to find a non-zero initialiser to create a non-degenerate ROSC random state - ldr r1,[r5,#24] @ SHA SUM6 - cbnz r1,1f @ is word 6 non-zero? then use it - ldr r1,[r5,#28] @ SHA SUM7 - cbnz r1,1f @ otherwise, is word 7 non-zero? 
use it - mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) -1: - ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE - str r1,[r2,#0] @ Initialise ROSC LFSR - CHK_COUNT 27,6 - -.if GEN_RAND_SHA -.if SH_JITTER - movs r2,#0 - str r2,[r6,#jstate-rstate_sha] -.endif -.endif - - CHK_COUNT 28,6 - bx r14 - -@ Put AES core code in first scratch area -.section .scratch_x.aes,"ax",%progbits - -.if GEN_RAND_SHA -@ we need SHA256_SUM0_OFFSET==8 (see note below) -.if SHA256_SUM0_OFFSET!=8 -.err -.endif - -@ Return single random word in r0 -@ Preserves r1-r13 -.balign 4 -gen_rand_sha: - push {r14} - GET_CANARY r14,CTAG1,2 - push {r1-r3,r14} -.if SH_JITTER - ldr r2,=rstate_sha - ldr r0,[r2,#jstate-rstate_sha] - movs r1,#1 - ands r3,r0,#3 - movs r3,r3,lsl#2 - movs r3,r1,lsl r3 @ 1<<(4*(r0&3)) - udiv r3,r3,r1 @ Takes constant + (r0&3) cycles - lsrs r0,r0,#2 - bne 1f - bl gen_rand_sha_nonpres - ldr r2,=rstate_sha -1: - str r0,[r2,#jstate-rstate_sha] -.endif - bl gen_rand_sha_nonpres - pop {r1-r3,r14} - CHK_CANARY r14,CTAG1,0 - pop {r15} - -@ Return single random word in r0 -@ Trashes r1-r3 -.balign 4 -gen_rand_sha_nonpres: - ldr r0,=SHA256_BASE - ldr r2,=rstate_sha - ldrb r1,[r2] @ get word counter from bottom byte of rstate_sha[] (offset into SUM registers) - subs r3,r1,#4 @ decrement it to previous SUM register - ble 1f @ if the offset was 4 or less we have run out of SUM register values - ldr r0,[r0,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8 - strb r3,[r2] @ save updated SUM register offset in bottom byte of rstate_sha[] - bx r14 -1: -@ [CK_JITTER code was here] - movs r3,#SHA256_SUM6_OFFSET+1 - strb r3,[r2] @ reset word counter: the +1 is compensated for later - movw r1,#(1<>30, vpermB=Bptr[4]>>30, and -@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) -@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 
-.balign 4 -.thumb_func -ref_roundkey_shares_s: - mov r11,#15 @ there are 15 expanded keys -ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds - ldr r4,=rkey_s - loadlfsr - steplfsr @ r0=change in RKshareC - ldr r2,=RKshareCchange - str r0,[r2] - ldr r3,=RKshareC - ldr r5,[r3] - eors r5,r5,r0 - str r5,[r3] - @ r0=lfsr_state, r1=lfsr_const, r4=roundkey_ptr, r11=roundcounter - -ref_roundkey_shares_s_loop: - ldmia r4!,{r5-r8,r10} @ r5-r8 = rkey shareA, r10=X_A=vperm+rotations of rkey shareA - - ldr r12,[r4,#16] @ r12 = X_B=vperm+rotations of rkey shareB - mov r2,r12,lsr#30 @ r2 = vpermB - sub r9,r2,r10,lsr#30 @ r9 = vpermB - vpermA (|junk) - mov r2,r9,lsl#3 @ r2 = 8*(vpermB - vpermA) mod 32 - mov r12,r12,ror r2 - usub8 r12,r10,r12 @ r12 = rotsA - (rotsB ror r2) - - @ r2,r3,r10=workspace, r0=lfsr_state, r1=lfsr_const, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=rotsA, r11=roundcounter, r12=rotdiff - steplfsr; eors r5,r5,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r6,r6,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2] - - ldr r3,=RKshareCchange - ldr r3,[r3] - movs r2,#0 - usub8 r10,r2,r10 - ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2 - ror r2,r3,r10; mov r10,r10,ror#8; eors r6,r6,r2 - ror r2,r3,r10; mov r10,r10,ror#8; eors r7,r7,r2 - ror r2,r3,r10; eors r8,r8,r2 - - subs r4,r4,#20 - stmia r4,{r5-r8} - adds r4,r4,#40 - subs r11,r11,#1 - - bne ref_roundkey_shares_s_loop - ldr r2,=rstate_lfsr @ restore rstate_lfsr - savelfsr @ Save lfsr_state - clear03 24 
-ref_roundkey_shares_s_exit: - bx r14 - -.balign 4 -.thumb_func -@ Rotates roundkey vperms and RK_ROR rotations by random amounts -@ Trashes r0-r10 -@ If i = word number 0..3, -@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then -@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and -@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) -@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 -ref_roundkey_hvperms_s: - movs r7,#30 -ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r10,CTAG9,6 - push {r10,r14} - ldr r10,=rkey_s -ref_roundkey_hvperms_s_loop: - bl gen_rand_lfsr_nonpres @ r0=new vperm high|rotations - ldmia r10,{r2-r5,r9} @ r2-r5=roundkey share A/B, r9=old vperm high|rotations - str r0,[r10,#16] - mov r8,r0,lsr#30 @ r8=new vperm low - sub r6,r8,r9,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk - mov r8,r6,lsl#3 @ r8=8*((new vperm low)-(old vperm low)) mod 32 - mov r0,r0,ror r8 - usub8 r0,r9,r0 @ i^th byte of r0 = (i^th byte of old rotations) - ((i+newvperm-oldvperm)^th byte of new rotations) - movs r2,r2,ror r0; ands r6,r6,#3; str r2,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 - movs r3,r3,ror r0; ands r6,r6,#3; str r3,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 - movs r4,r4,ror r0; ands r6,r6,#3; str r4,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 - movs r5,r5,ror r0; ands r6,r6,#3; str r5,[r10,r6,lsl#2] - adds r10,r10,#20 - subs r7,r7,#1 - bne ref_roundkey_hvperms_s_loop - clear03 28 -ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code - pop {r10,r14} - CHK_CANARY r10,CTAG9,6 - bx r14 - -.else - -@ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC -@ Trashes r0-r11 -.balign 4 -.thumb_func -ref_roundkey_shares_s: - mov r11,#15 @ there are 15 expanded keys 
-ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds - GET_CANARY r4,CTAG8,6 - push {r4,r14} - ldr r4,=rkey_s - loadlfsr - steplfsr @ r0=change in RKshareC - ldr r3,=RKshareC - ldr r5,[r3] - eors r5,r5,r0 - str r5,[r3] - mov r10,r0 -ref_roundkey_shares_s_loop: - ldmia r4!,{r5-r9} @ r5-r8 = rkey shareA with vperm r9 - - @ clear03: would need to do this with, say r2,r3,r12 (reloading r2 later) - - ldr r3,[r4,#16] @ rkey shareB has a vperm of r10>>30 - movs r3,r3,lsr#30 - sub r9,r3,r9,lsr#30 @ r9 = vperm_B - vperm_A (|junk) - @ r3,r12=workspace, r0=lfsr_state, r1=lfsr_const, r2=rstate_lfsr, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=RKshareCchange, r11=roundcounter - - steplfsr; eors r5,r5,r0; and r9,r9,#3; eors r5,r5,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r6,r6,r0; and r9,r9,#3; eors r6,r6,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r7,r7,r0; and r9,r9,#3; eors r7,r7,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 - steplfsr; eors r8,r8,r0; and r9,r9,#3; eors r8,r8,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2] - - subs r4,r4,#20 - stmia r4,{r5-r8} - adds r4,r4,#40 - subs r11,r11,#1 - - @ clear03: would need to do this with, say r3,r5-r8 - - bne ref_roundkey_shares_s_loop - savelfsr - clear03 24 -ref_roundkey_shares_s_exit: - pop {r4,r14} - CHK_CANARY r4,CTAG8,6 - bx r14 - -.balign 4 -.thumb_func -@ Rotates roundkey vperms by random amounts -@ Trashes r0-r9 -ref_roundkey_hvperms_s: - movs r7,#30 -ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r0,CTAG9,6 - push {r0,r14} - bl gen_rand_lfsr_nonpres - ldr r1,=rkey_s -ref_roundkey_hvperms_s_loop: - cmp r7,#15 - bne 2f -@ Get a new random r0 after using 15 x 2 bits of the original one -@ Note that the junk bits (2-31) in the vperms are not adjusted 
independently, but that's no big loss, -@ and the gain is only calling gen_rand_lfsr twice instead of 30 times. - push {r1}; bl gen_rand_lfsr_nonpres; pop {r1} - 2: - ldmia r1,{r2-r5,r9} @ roundkey share A/B=r2-r5, vperm=r9 (including junk bits) - mov r8,r9,lsr#30 @ r8=old vperm (low) - add r6,r9,r0 @ r6=new vperm (high) | new junk - str r6,[r1,#16] - rsb r6,r8,r6,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk bits - ands r6,r6,#3; str r2,[r1,r6,lsl#2]; adds r6,r6,#1 - ands r6,r6,#3; str r3,[r1,r6,lsl#2]; adds r6,r6,#1 - ands r6,r6,#3; str r4,[r1,r6,lsl#2]; adds r6,r6,#1 - ands r6,r6,#3; str r5,[r1,r6,lsl#2] - adds r1,r1,#20 - movs r0,r0,ror#2 - subs r7,r7,#1 - bne ref_roundkey_hvperms_s_loop - clear03 28 -ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code - pop {r0,r14} - CHK_CANARY r0,CTAG9,6 - bx r14 - -.endif - -.ltorg - -.if ST_VPERM -.balign 4 -.thumb_func -@ Cycle share registers r4-r7, r8-r11 (r4->r5-r6->r7->r4 etc.) by an addtional amount -@ given in the bottom two bits of R0 and update the rotation recorded at statevperm. -@ On entry R1 must point to statevperm. -@ Trashes r0-r3,r12 -@ Maintains r4=rorig(4+(-!r1)%4), r5=rorig(4+(1-!r1)%4), ... -@ r8=rorig(8+(-!r1)%4), r9=rorig(8+(1-!r1)%4), ... -@ Note: only low 2 bits of !r1 are used. The rest are random to add to the noise. 
-addstatevperm: - ldr r2,[r1] - adds r2,r2,r0 - str r2,[r1] - - ldr r1,=shareA - ands r0,r0,#3; str r4,[r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r5,[r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r6,[r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r7,[r1,r0,lsl#2]; adds r0,r0,#1 - ldmia r1,{r4-r7} - - getchaffaddress r12 @ Overwrite temporary storage with random numbers - ldmia r12!,{r2,r3} - stmia r1!,{r2,r3} - ldmia r12!,{r2,r3} - stmia r1!,{r2,r3} - - ldr r1,=shareB - ands r0,r0,#3; str r8, [r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r9, [r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r10,[r1,r0,lsl#2]; adds r0,r0,#1 - ands r0,r0,#3; str r11,[r1,r0,lsl#2]; adds r0,r0,#1 - ldmia r1,{r8-r11} - - getchaffaddress r0,16 @ Overwrite temporary storage with random numbers - ldmia r0!,{r2,r3} - stmia r1!,{r2,r3} - ldmia r0!,{r2,r3} - stmia r1!,{r2,r3} - -addstatevperm_exit: @ label exit point to be to able to specify to analysis code - bx r14 -.endif - -@ Conjugate lut_a, lut_b with (state) shareC -@ I.e., EOR the input and output with shareC. -@ We need to pick one input for each share A and B, and one output for ONE of the shares A and B -@ Arbitrarily choosing a0, b1 and d0 -.balign 4 -conjshareC: -.if ST_SHAREC - ldr r1,=shareC - ldr r0,[r1] @ Get shareC as a word (all bytes the same) - ldr r1,=lut_a @ Need to EOR share C into inputs of both lut_a and lut_b, and one of their outputs... - ldr r2,[r1,#0x100] - eors r2,r2,r0,lsr#24 - str r2,[r1,#0x100] - movs r0,r0,lsr#16 - ldr r1,=lut_b @ ... (continued) Here we're EORing share C into a0, b1 and d0. - ldr r2,[r1,#0x100] - eors r2,r2,r0,lsl#8 - str r2,[r1,#0x100] -.endif - bx r14 - -.balign 4 -.thumb_func -shift_rows_s: -@ First "rotate" the two most-significant bytes of the state by two registers -@ Trashes r0-r3 -@ Slightly faster (but not shorter?) 
with ubfx/bfi - eors r0,r4,r6 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta; - lsrs r0,r0,#16 - lsls r0,r0,#16 - eors r4,r4,r0 - eors r6,r6,r0 - eors r0,r5,r7 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta; - lsrs r0,r0,#16 - lsls r0,r0,#16 - eors r5,r5,r0 - eors r7,r7,r0 -@ next "rotate" the two odd-significance bytes of the state by one register - eors r1,r7,r4 @ tb=state[3]^state[0]; tb&=0xff00ff00; - ands r1,r1,#0xff00ff00 - eors r0,r4,r5 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta; - ands r0,r0,#0xff00ff00 - eors r4,r4,r0 - eors r0,r5,r6 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta; - ands r0,r0,#0xff00ff00 - eors r5,r5,r0 - eors r0,r6,r7 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; - ands r0,r0,#0xff00ff00 - eors r6,r6,r0 - eors r7,r7,r1 @ state[3]^=tb; -@ repeat for other share, conjugated by ror#16 - clear01 @ barrier - eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0x0000ffff; state[0]^=ta; state[2]^=ta; - lsls r0,r0,#16 - lsrs r0,r0,#16 - eors r8,r8,r0 - eors r10,r10,r0 - eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0x0000ffff; state[1]^=ta; state[3]^=ta; - lsls r0,r0,#16 - lsrs r0,r0,#16 - eors r9,r9,r0 - eors r11,r11,r0 - eors r1,r11,r8 @ tb=state[3]^state[0]; tb&=0xff00ff00; - ands r1,r1,#0xff00ff00 - eors r0,r8,r9 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta; - ands r0,r0,#0xff00ff00 - eors r8,r8,r0 - eors r0,r9,r10 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta; - ands r0,r0,#0xff00ff00 - eors r9,r9,r0 - eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; - ands r0,r0,#0xff00ff00 - eors r10,r10,r0 - - eors r11,r11,r1 @ state[3]^=tb; - - clear01 @ barrier - bx r14 - -@ multiply polynomial over GF(2⁸) by c(x) = 0x03x³ + 0x01x² + 0x01x + 0x02 modulo x⁴+1 -@ r0x00 is a register holding 0x00000000; r0x1b is a register holding 0x1b1b1b1b -.macro mixcol rx,rt,ru,r0x00,r0x1b - @ let rx=(a,b,c,d) - uadd8 \rt,\rx,\rx @ MSB of each byte into the GE flags - sel 
\ru,\r0x1b,\r0x00 @ get bytewise correction for bytewise field multiplication by 2 - eors \rt,\rt,\ru @ (2a,2b,2c,2d) - - eors \ru,\rt,\rx @ (3a,3b,3c,3d) - eors \rt,\rt,\rx,ror#24 @ (2a+b,2b+c,2c+d,2d+a) - eors \rt,\rt,\rx,ror#16 @ (2a+b+c,2b+c+d,2c+d+a,2d+a+b) - eors \rx,\rt,\ru,ror#8 @ (2a+b+c+3d,2b+c+d+3a,2c+d+a+3b,2d+a+b+3c) -.endm - -@ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1 -.macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b - uadd8 \rt,\rx,\rx @ field multiplication by 2 as above - sel \rw,\r0x1b,\r0x00 - eors \rt,\rt,\rw @ 2x - uadd8 \ru,\rt,\rt - sel \rw,\r0x1b,\r0x00 - eors \ru,\ru,\rw @ 4x - uadd8 \rv,\ru,\ru - sel \rw,\r0x1b,\r0x00 - eors \rv,\rv,\rw @ 8x - - eors \rx,\rx,\rv @ 9x - eors \rw,\rx,\rt @ 11x - eors \rw,\rw,\rx,ror#16 @ 11x ^ 9x ROL #16 - eors \rx,\rx,\ru @ 13x - eors \rw,\rw,\rx,ror#8 @ 11x ^ 9x ROL #16 ^ 13x ROL #24 - eors \rt,\rt,\ru @ 6x - eors \rt,\rt,\rv @ 14x - eors \rx,\rt,\rw,ror#8 @ 14x ^ 9x ROL #8 ^ 13x ROL #16 ^ 11x ROL #24 -.endm - -.balign 4 -.thumb_func -@ Trashes r0-r3,r12 -mix_cols_s: - mov r2,#0x00000000 - mov r3,#0x1b1b1b1b - mixcol r4 ,r0,r1,r2,r3 @ apply mixcol to each state word - mixcol r5 ,r0,r1,r2,r3 - mixcol r6 ,r0,r1,r2,r3 - mixcol r7 ,r0,r1,r2,r3 - ldr r12,=chaff - ldmia r12!,{r0,r1} @ overwrite sensitive shareA-related quantities r0,r1 with random numbers - mixcol r8 ,r0,r1,r2,r3 - mixcol r9 ,r0,r1,r2,r3 - mixcol r10,r0,r1,r2,r3 - mixcol r11,r0,r1,r2,r3 - ldmia r12!,{r0,r1} @ overwrite sensitive shareB-related quantities r0,r1 with random numbers - bx r14 - -@ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups) -.macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3 - ubfx \Rspare0,\Rtarg,#0, #8 - ubfx \Rspare1,\Rtarg,#8, #8 - ubfx \Rspare2,\Rtarg,#16, #8 - ubfx \Rspare3,\Rtarg,#24, #8 - - ldrb \Rspare0,[\Rtable,\Rspare0] - ldrb \Rspare1,[\Rtable,\Rspare1] - ldrb \Rspare2,[\Rtable,\Rspare2] - 
ldrb \Rspare3,[\Rtable,\Rspare3] - orr \Rspare0,\Rspare0,\Rspare1,lsl#8 - orr \Rspare2,\Rspare2,\Rspare3,lsl#8 - orr \Rtarg,\Rspare0,\Rspare2,lsl#16 -.endm - -@ map all bytes of the state through the split LUT, lut_a and lut_b -@ Trashes r0-r3,r12 -.balign 4 -.thumb_func -map_sbox_s: - GET_CANARY r12,CTAG12,3 - push {r12,r14} - - ldr r0,=shareA @ Write out state share A to memory -@ stmia r0,{r4-r7} @ Used to do a STM - getchaffaddress r1 - ldr r2,[r1] - str r4,[r0] @ Interperse with dummy writes to prevent implicit broadcasting of HW(ShareA_word0^ShareA_word1)+cyclic perms, - str r2,[r1] @ which arise due to internal write buffer. Such a quantity could (without such interspersing) be paired - str r5,[r0,#4] @ via 2nd order with its share B counterpart, resulting in broadcasting HW(word0^word1)+cyclic. - str r2,[r1] @ shareC doesn't guard against this, because word0^shareC^word1^shareC=word0^word1. - str r6,[r0,#8] @ Broadcasting of HW(ShareA_word0)+cyclic on the other hand is not prevented by interspersing, but - str r2,[r1] @ it isn't useful at 2nd order because shareC kills its relationship with HW(ShareB_word0)+cyclic. - str r7,[r0,#12] - str r2,[r1] - - ldr r0,=shareB @ Write out state share B to memory - stmia r0,{r8-r11} @ Not essential to intersperse share B too because i0B^i1B etc should have nothing in share A to couple with - - bl makeperm16 @ Rebuild random 16-way permutation. 
Maybe do this less frequently -@ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation - - bl gen_rand_sha_nonpres - mov r11,r0 - ldr r8,=lut_a - ldr r9,=lut_b - ldr r0,[r8,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) - eors r3,r0,r0,lsr#8 @ R3 = a0^a1 | junk - uxtb r10,r3 - ldr r1,[r9,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) - eors r1,r0,r1 - eors r2,r1,r1,lsr#8 - movs r12,r1,lsr#16 @ R12 = c0^d0 | (c1^d1)<<8 - bfi r12,r2,#16,#8 @ R12 = c0^d0 | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 - - ldr r4,=perm16 - ldr r5,=shareA - ldr r6,=shareB - movs r1,#0;movs r2,#0;movs r3,#0 -@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=Random, r12=(c0^d0) | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 - movs r0,#15 -1: @ (Ordering instructions to minimise result delays) - ldrb r1,[r4,r0] @ r1 = perm[r0] - mov r11,r11,ror#11 @ Rotate random 32 bits to present a new low 8 bits - eors r7,r1,#2 @ r7 = perm[r0]^2 - ldrb r2,[r5,r1] @ r2 = shareA[perm[r0]] - eor r11,r11,r2,ror#8 @ Transfer some of the share-randomness of the input to the output (the share-randomness would otherwise be lost/wasted) - ldrb r3,[r6,r7] @ r3 = shareB[perm[r0]^2] - eor r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1 - eors r2,r2,r3 @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2] - ldrb r3,[r8,r2] @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]] - eor r2,r2,r12,lsr#16 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2] - eor r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8) - eor r3,r3,r11 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand | (junk<<8) - strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand - ldrb r3,[r9,r2] @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]] - subs r0,r0,#1 - eor r3,r3,r11 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand - eor r3,r3,r12,lsr#8 @ r3 = 
lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 | (junk<<8) - strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 - bpl 1b - clear03 8 @ barrier - - ldmia r6,{r8-r11} @ Read state share B back from memory - clear03 12 @ barrier - getchaffaddress r0,16 - bfi r0,r5,#0,#4 @ match chaff pointer (r0) to share A location (R5) mod 16 - @ldmia r5,{r4-r7} @ Read state share A back from memory - @clear03 16 @ barrier - ldr r4,[r5] @ Intersperse barriers to prevent HW(o0^o1)+cyclic being broadcast; see similar note re stores at the start of map_sbox_s - ldr r1,[r0] - ldr r6,[r5,#8] - ldr r1,[r0,#8] - ldr r7,[r5,#12] - ldr r1,[r0,#12] - ldr r5,[r5,#4] @ Do r5 last because it's the address register - ldr r1,[r0,#4] - -@ Refresh state shares because luts only give imperfect share-by-value -@ Probably not necessary now that we use R11 and input-reuse random resharing during the sbox operation (though the R11 bytes are not fully independent) -@ loadlfsr -@ steplfsr; eors r4,r4,r0; mov r12,#0; eor r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc -@ steplfsr; eors r5,r5,r0; mov r12,#0; eor r9,r9,r0,ror#16 -@ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16 -@ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16 -@ savelfsr - - pop {r12,r14} - CHK_CANARY r12,CTAG12,5 - bx r14 - -.ltorg - -.balign 4 -.thumb_func -randomisechaff: -@ Randomise 48 bytes of chaff values (random load values) -@ Uses 12 bytes of permscratch -@ Trashes r0-3 - GET_CANARY r0,CTAG13,6 - push {r0,r14} - movs r0,#12 - ldr r1,=permscratch - bl makesmallperm @ Store the random words in a random order to make 2nd order attacks harder - movs r1,#11 -1: - push {r1} - bl gen_rand_sha_nonpres - pop {r1} - ldr r2,=permscratch - ldrb r2,[r2,r1] - getchaffaddress r3 - str r0,[r3,r2,lsl#2] - subs r1,r1,#1 - bpl 1b - pop {r0,r14} - CHK_CANARY r0,CTAG13,6 - bx r14 - -.balign 4 -refreshchaff_and_lfsr: -@ Update 48 
bytes of chaff values (random load values) using faster RNG than used for randomisechaff -@ Re-randomise LFSR with SHA -@ Uses 12 bytes of permscratch -@ Trashes r0-3,12 - GET_CANARY r0,CTAG14,6 - push {r0,r14} - -@ Refresh LFSR using SHA to make it harder to reverse-engineer LFSR sequence - bl gen_rand_sha_nonpres - ldr r1,=rstate_lfsr - ldr r2,[r1] - adds r2,r2,r0 - beq 1f @ Don't update LFSR state to 0 - str r2,[r1] -1: - -@ Choose a random order to update chaff words to make 2nd order attacks harder - movs r0,#12 - ldr r1,=permscratch - bl makesmallperm - - movs r1,#11 -1: - push {r1} - bl gen_rand_lfsr_nonpres - pop {r1} - ldr r2,=permscratch - ldr r3,=chaff - ldrb r2,[r2,r1] - ldr r12,[r3,r2,lsl#2] - add r0,r0,r12 - str r0,[r3,r2,lsl#2] - subs r1,r1,#1 - bpl 1b - pop {r0,r14} - CHK_CANARY r0,CTAG14,6 - bx r14 - -.balign 4 -.thumb_func -@ Do sbox on the four bytes of the 4-way share r4-r7 -@ Trashes r0,r8-r12 -init_key_sbox: - GET_CANARY r12,CTAG15,6 - push {r1-r3,r12,r14} - bl gen_rand_sha_nonpres; mov r8,r0 - bl gen_rand_sha_nonpres; mov r9,r0 - bl gen_rand_sha_nonpres; mov r10,r0 - bl gen_rand_sha_nonpres; mov r11,r0 - ldr r0,=fourway @ Write out 4-way share to memory - stmia r0,{r8-r11} @ Save random values first to obscure saving of state - stmia r0,{r4-r7} - movs r4,#0 @ Clear r4-r7 so that they don't interact with makesmallperm - movs r5,#0 - movs r6,#0 - movs r7,#0 - - bl randomisechaff @ Randomise block of memory mainly used for obscuring loads - - movs r0,#4 - ldr r1,=permscratch - bl makesmallperm @ Build random 4-way permutation determining order of bytes to be SBOXed - ldr r1,=permscratch @ Write out random addresses in advance to save two registers (reusing permscratch) - ldr r4,[r1] - ldr r0,=fourway - uxtab r5,r0,r4 - uxtab r6,r0,r4,ror#8 - uxtab r7,r0,r4,ror#16 - uxtab r8,r0,r4,ror#24 - stmia r1,{r5-r8} @ Store at r1=permscratch: fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3] - - bl gen_rand_sha @ Save some randomness for 
the resharing operation later - movs r7,r0 - bl gen_rand_sha - movs r8,r0 - - ldr r2,=lut_a - ldr r3,=lut_b - ldr r0,[r2,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) - eors r10,r0,r0,lsr#8 - uxtb r10,r10 @ R10 = a0^a1 - ldr r1,[r3,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) - eors r1,r0,r1 - eors r4,r1,r1,lsr#8 - uxtb r11,r4 @ R11 = a0^a1^b0^b1 - eor r10,r10,r11,lsl#8 @ R10 = a0^a1 | (a0^a1^b0^b1)<<8 - movs r12,r1,ror#16 @ R12 = c0^d0 | (c1^d1)<<8 | junk<<16 | junk<<24 - - ldr r1,=permscratch - ldr r11,=chaff -@ Using r1=permutedfourwaypointer, r2=lut_a, r3=lut_b, r7,r8=randomness, r10=(a0^a1)|(a0^a1^b0^b1)<<8, r11=chaff, r12=(c0^d0)|(c1^d1)<<8|junk -1: - ands r5,r1,#12 - adds r5,r11,r5 @ Align chaff address to r1 - ldr r6,[r1],#4 @ r6 = fourway + perm[i] (i=0-3, loop iteration) - ldr r5,[r5] @ Random load to mask previous load - - ands r9,r6,#12 - add r9,r11,r9 @ r9 = chaff address aligned to (r6 bic 3) mod 16 - ldrb r4,[r6,#0] - ldr r14,[r9,#0] @ Random load to mask previous load - eor r4,r4,r10 - eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - - ldrb r5,[r6,#4] - ldr r14,[r9,#4] @ Random load to mask previous load - eors r4,r4,r5 - eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - - ldrb r5,[r6,#8] - ldr r14,[r9,#8] @ Random load to mask previous load - eors r4,r4,r5 - eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - - ldrb r5,[r6,#12] - ldr r14,[r9,#12] @ Random load to mask previous load - eors r4,r4,r5 @ r4 = unsharedbyte[perm[i]]^a0^a1 | junk - eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - - ands r14,r4,#255 - ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] - and r14,r4,#15 - add r14,r14,#32 - ldrb r14,[r11,r14] @ Random load to mask previous load (r2 and r11 are both 0 mod 16) - eors r5,r5,r12 @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]^c0^d0 | junk<<8 | junk<<16 | junk<<24 -@ split r5 into two shares and store at [r6,#0] and [r6,#4] - strb r7,[r6,#0] - eors r5,r5,r7 - strb r5,[r6,#4] - - mov 
r5,r10,lsr#8 @ r5=a0^a1^b0^b1 - ldr r14,[r11,#44] @ Need to eor into a random destination register - eors r14,r4,r5 @ r14 = unsharedbyte[perm[i]]^b0^b1 | junk<<8 - and r14,r14,#255 - - ldrb r5,[r3,r14] @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1] - and r14,r14,#15 - add r4,r11,#24 - ldrb r14,[r4,r14] @ Random load to mask previous load (r3==8 and r11==0 mod 16) - eor r5,r5,r12,ror#8 @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]^c1^d1 | junk<<8 | junk<<16 | junk<<24 -@ split r5 into two shares and store at [r6,#8] and [r6,#12] - strb r8,[r6,#8] - eors r5,r5,r8 - strb r5,[r6,#12] - - movs r7,r7,ror#8 - movs r8,r8,ror#8 - - tst r1,#12 @ This does 4 loop iterations because permscratch is guaranteed to be 0 mod 16 - bne 1b - - ldr r0,=fourway - ldmia r0,{r4-r7} @ Load SBOXed values back into register r4-r7 - ldmia r11,{r8-r12,r14} @ Random load to mask previous load and to obfuscate registers - - pop {r1-r3,r12,r14} - CHK_CANARY r12,CTAG15,6 - bx r14 - -.balign 4 -.thumb_func -@ r1 = pointer to 4 x 4-way share (16 words); left unchanged -@ r3 = rkey_s+40*roundkeynumber; advanced by 40 -@ Trashes r8-r12 -@ If i = word number 0..3, -@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then -@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and -@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4]) -@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of Bptr[4])+16 -storeroundkey: - GET_CANARY r8,CTAG16,6 - push {r2,r8,r14} - -@ eor two 4-way share components to make a component of a 2-way share -@ Note that we load from 4-way share at a random address then convert to 2-way share and -@ store at a fixed address, rather than the other way around, so that 2-way shares are obscured -@ by vperm (we don't know which 2-way share is being processed at a particular point in time). 
-@ And (if RK_ROR) we rotate first before EORing down to 2-way, so there is never an unrotated 2-way share - - bl gen_rand_sha @ Get r0 = vperm for shareA of the round key - str r0,[r3,#16] - mov r8,r0,lsr#30 - rsb r8,r8,#0 @ r8=-vperm -.if RK_ROR - movs r2,#0 - usub8 r2,r2,r0 @ r2=-hperms -.endif - mov r9,#4 -1: - and r8,r8,#3 - adds r0,r1,r8,lsl#4 - - ldmia r0,{r10,r11} -.if RK_ROR - mov r10,r10,ror r2 - mov r11,r11,ror r2 - movs r2,r2,ror#8 -.endif - eor r10,r10,r11 - str r10,[r3],#4 - add r8,r8,#1 - subs r9,r9,#1 - bne 1b - - adds r1,r1,#8 - adds r3,r3,#4 @ skip over vperm (already stored) - - bl gen_rand_sha @ Get r0 = vperm for shareB of the round key - str r0,[r3,#16] - mov r8,r0,lsr#30 - rsb r8,r8,#0 @ r8=-vperm -.if RK_ROR - movs r2,#0 - usub8 r2,r2,r0 @ r2=-hperms -.endif - mov r9,#4 - ldr r12,=RKshareC - ldr r12,[r12] -1: - and r8,r8,#3 - adds r0,r1,r8,lsl#4 - ldmia r0,{r10,r11} - eor r10,r10,r12 @ Mix in RKshareC into round key shareB -.if RK_ROR - mov r10,r10,ror r2 - mov r11,r11,ror r2 - movs r2,r2,ror#8 -.endif - mov r10,r10,ror#16 - mov r11,r11,ror#16 - eor r10,r10,r11 - str r10,[r3],#4 - add r8,r8,#1 - subs r9,r9,#1 - bne 1b - - subs r1,r1,#8 @ Restore r1 = (r1 on entry) - adds r3,r3,#4 @ Set r3 = (r3 on entry) + 40 - - pop {r2,r8,r14} - CHK_CANARY r8,CTAG16,6 - bx r14 - -.balign 4 -.thumb_func -init_key_4way: -@ On entry, r0 points to 4-way shared raw key data (128 bytes) -@ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7 -@ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K. -@ -@ On exit, rkeys_s, a 40*15=600-byte region, is filled as follows. -@ Each of the 15 round keys is represented as two 5-word regions rka[0..4] and rkb[0..4], -@ each of which consists of 4 words of round key followed by a word encoding vperm and rotation (RK_ROR) information. -@ In addition a common share word, RKshareC, is set randomly. 
-@ For a given round, rk[i] = the i^th word of the actual round key is given by: -@ vpermA=rka[4]>>30 -@ vpermB=rkb[4]>>30 -@ rka_unrot[i] = rka[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of rka[4]) -@ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16 -@ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC - - GET_CANARY r12,CTAG17,6 - push {r0-r12,r14} - -@ Transfer 4-way key into local workspace, rerandomising the shares - mov r5,r0 @ r5=4-way key input - bl randomisechaff - ldr r6,=rkey4way - movs r7,#8 -1: - ldmia r5!,{r1-r4} - bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0 - bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0 - bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0 - stmia r6!,{r1-r4} - subs r7,r7,#1 - bne 1b - -@ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for -@ the 128-bit roundkeys 0 and 1, then expand from 2 to 15 roundkeys. - bl gen_rand_sha_nonpres - ldr r12,=RKshareC - str r0,[r12] @ Make RKshareC random word - ldr r3,=rkey_s @ r3=rkey_s - ldr r1,=rkey4way @ r1=rkey4way - bl storeroundkey @ Store round key 0 and advance r3 by 40 - adds r1,r1,#64 - bl storeroundkey @ Store round key 1 and advance r3 by 40 - adds r1,r1,#48 - ldmia r1!,{r4-r7} @ r4-r7 = 4-way share of previous round key word - @ r1=rkey4way+128 on entry to main loop - movs r2,#0 @ r2=word counter (0-51), offset from word 8 - -@ Note that r1-r3 are not sensitive values, so it's safe to stack -@ them and conditionally branch on them. - -@ rkey4way = 8 x 4 consecutive 4-way share words as cyclic buffer of -@ Rounds 0,1 Rounds 2,3 Rounds 12,13 Round 14 -@ a0 b0 c0 d0 -> a8 b8 c8 d8 -> ... 
-> a48 b48 c48 d48 -> a56 b56 c56 d56 -@ a1 b1 c1 d1 -> a9 b9 c9 d9 a49 b49 c49 d49 a57 b57 c57 d57 -@ a2 b2 c2 d2 etc a50 b50 c50 d50 a58 b58 c58 d58 -@ a3 b3 c3 d3 a51 b51 c51 d51 a59 b59 c59 d59 -@ a4 b4 c4 d4 a52 b52 c52 d52 =============== -@ a5 b5 c5 d5 a53 b53 c53 d53 -@ a6 b6 c6 d6 a54 b54 c54 d54 -@ a7 b7 c7 d7 a55 b55 c55 d55 - -init_key_expandloop: -@ r1 = pointer past one of eight 4-way shares of a roundkey word in the above cyclic buffer (r1=rkey4way+16i for i=1,...,8) -@ r2 = round key word counter (0-51), offset from word 8 (counting expanded roundkey words) -@ r3 = pointer to rkey_s+40*roundnumber = rkey_s+40*(2+[r2/4]) -@ r4-r7 = 4-way share of previous roundkey word - - tst r2,#7 - bne 1f - subs r1,r1,#128 @ Every 8th word, reset cyclic buffer pointer and do ROTWORD - movs r4,r4,ror#8 - movs r5,r5,ror#8 - movs r6,r6,ror#8 - movs r7,r7,ror#8 -1: - - tst r2,#3 - bne 1f - bl init_key_sbox @ Every 4th word, do SUBBYTES (sbox) on r4-r7 -1: - - tst r2,#7 - bne 1f - movs r0,r2,lsr#3 - mov r8,#1 - movs r8,r8,lsl r0 - eors r4,r4,r8 @ Every 8th word, add in round constant -1: - - ldmia r1,{r8-r11} @ eor with key from two rounds ago and advance r1 by 16 - eors r4,r4,r8 - eors r5,r5,r9 - eors r6,r6,r10 - eors r7,r7,r11 - stmia r1!,{r4-r7} - - add r2,r2,#1 - tst r2,#3 - bne 1f - subs r1,r1,#64 - bl storeroundkey @ Store round key 1+r2/4 and advance r3 by 40 - adds r1,r1,#64 -1: - - cmp r2,#52 - bne init_key_expandloop - - pop {r0-r12,r14} - CHK_CANARY r12,CTAG17,6 - bx r14 - -.ltorg - -@ Add the round key shares pointed to by r12 into the state shares -@ Trashes r0-r3 -.balign 4 -addrkey_s: - - ldr r0,=chaff @ guaranteed 0 mod 16 -.if ST_VPERM - ldr r3,=statevperm - ldr r3,[r3] @ r3=vperm state rotation in bottom two bits - ldr r2,[r0,#12] @ barrier load -.else - movs r3,#0 -.endif - bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 - ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits - ldr r2,[r0,#16] @ barrier load - - rsb 
r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot -@ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot -@ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr -.if RK_ROR - movs r0,r2,lsl#3 - movs r1,r1,ror r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; rors r0,r0,r1; eors r4,r4,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r5,r5,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r6,r6,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r7,r7,r0 -.else - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r4,r4,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r5,r5,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r6,r6,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r7,r7,r0 -.endif - clear03_preserve_r3 - add r12,r12,#20 - @ r0=chaff+16, r3=statevperm, r4-r11=state, r12=roundkeyBptr - - bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 - ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits - ldr r2,[r0,#16] @ barrier load - rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot - ldr r3,=RKshareC @ r3=common round key shareC - bfi r0,r3,#0,#4 - ldr r3,[r3] - ldr r0,[r0] @ barrier load - -@ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot -@ r1=rkeyBrotdata, r2=vpermkeyrot-vpermstaterot, r3=RKshareC, r4-r11=state, r12=roundkeyB ptr -.if RK_ROR - movs r0,r2,lsl#3 - movs r1,r1,ror r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; rors r0,r0,r1; eor r8,r8,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r9,r9,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors 
r0,r0,r1; eor r10,r10,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eor r11,r11,r0 -.else - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; eors r8,r8,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; eors r9,r9,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; eors r10,r10,r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; eors r11,r11,r0 -.endif - clear03 - bx r14 - -.balign 4 -.thumb_func -@ de/encrypt data in place -@ r0: ivec -@ r1: buf -@ r2: n, number of blocks, n>0 -.if CT_BPERM -@ In AES-CTR each block can be independently en/decrypted as the encryption only depends on the IV, -@ the key, and the block number. We can therefore process them in any order, and using a -@ random order helps to defeat attacks that work on the output of the AES, since an attacker -@ wouldn't know what plaintext or ciphertext corresponds to a particular instruction. 
-.endif - -ctr_crypt_s: -@ r0=IV_shareA, r1=IV_shareB, r2=cipher/plaintext buffer, r3=number of blocks - GET_CANARY r12,CTAG0,6 - push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets - - push {r0-r3} - - SET_COUNT 93,6 - -.if CT_BPERM -@ Initialise 32 random numbers (which fit in half-words) -@ r3=number of blocks - ldr r4,=bperm_rand - movs r5,#32 -1: - bl gen_rand_sha - umull r0,r2,r0,r3 @ Random number between 0 and n-1 (n=#blocks) - strh r2,[r4],#2 - subs r5,r5,#1 - bne 1b -.endif - - bl randomisechaff - -@ Refresh IVshareA and IVshareB, convert to ror#16 format and store the result at IV0 -@ Not doing shareC or state vperm at this point - pop {r0} - ldmia r0,{r4-r7} @ r4-r7 = IVshareA - clear03 16 - pop {r1} - ldmia r1,{r8-r11} @ r8-r11 = IVshareB - clear03 32 - bl gen_rand_sha_nonpres; eors r4,r4,r0; movs r1,#0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc - bl gen_rand_sha_nonpres; eors r5,r5,r0; movs r1,#0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16 - bl gen_rand_sha_nonpres; eors r6,r6,r0; movs r1,#0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16 - bl gen_rand_sha_nonpres; eors r7,r7,r0; movs r1,#0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16 - ldr r0,=IV0 - stmia r0,{r4-r7} - adds r0,r0,#20 - stmia r0,{r8-r11} -@ "Decommission" IV0 so that it doesn't get stacked - bl gen_rand_sha_nonpres; movs r4,r0 - bl gen_rand_sha_nonpres; movs r5,r0 - bl gen_rand_sha_nonpres; movs r6,r0 - bl gen_rand_sha_nonpres; movs r7,r0 - bl gen_rand_sha_nonpres; mov r8,r0 - bl gen_rand_sha_nonpres; mov r9,r0 - bl gen_rand_sha_nonpres; mov r10,r0 - bl gen_rand_sha_nonpres; mov r11,r0 - pop {r1,r2} -@ r1=cipher/plaintext buffer, r2=number of blocks - - movs r3,#0 - CHK_COUNT 93,6 - -ctr_crypt_mainloop: - SET_COUNT 80,6 -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter - -@ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it) - push 
{r1-r3} -@ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret) - - tst r3,#(REFCHAFF_PERIOD-1) - bne 1f - bl refreshchaff_and_lfsr -1: - - ldr r3,[r13,#8] @ get block count off the stack - tst r3,#(REMAP_PERIOD-1) - bne 1f - bl remap @ shuffle the LUTs; this preserves R3 -1: - CHK_COUNT 80,6 - - tst r3,#(REFROUNDKEYSHARES_PERIOD-1) - bne 1f - bl ref_roundkey_shares_s @ refresh the round key shares -1: - - ldr r3,[r13,#8] @ get block count off the stack - tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1) - bne 1f - bl ref_roundkey_hvperms_s @ refresh the round key vperms -1: - - CHK_COUNT 81,6 - - pop {r1-r3} -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter - -@ Now calculate r12 = block number-to-be-deciphered from r3 = block counter -.if CT_BPERM -@ Use a "swap-or-not" method to generate an "oblivious" permutation; see makeperm.py version 7 - push {r1} - ldr r0,=murmur3_constants - ldmia r0,{r9-r12,r14} @ load five murmur3_32 hash constants - ldr r0,=bperm_rand - movs r1,#31 - movs r4,r3 @ r4=i -1: - ldrh r5,[r0],#2 @ r5=k - subs r5,r5,r4 @ r5=k-i - ands r6,r2,r5,asr#31 @ r6=n*(k-i<0) - adds r5,r5,r6 @ r5=j=(k-i)%n - adds r6,r4,r5 @ r6=i+j - subs r7,r4,r5 @ r7=i-j - and r8,r7,r7,asr#31 @ r8=min(i-j,0) - sub r7,r7,r8,lsl#1 @ r7=|i-j| - mla r6,r6,r2,r7 @ r6=n(i+j)+|i-j|, encodes the unordered pair {i,j} - eors r6,r6,r1,lsl#27 @ mix with swap-or-not round counter to get different hash functions -@ Now do murmur3_32 hash of r6 - mul r6,r6,r9 - movs r6,r6,ror#17 - mul r6,r6,r10 - movs r6,r6,ror#19 - adds r6,r6,r6,lsl#2 - add r6,r6,r11 - eors r6,r6,#4 - eors r6,r6,r6,lsr#16 - mul r6,r6,r12 - eors r6,r6,r6,lsr#13 - mul r6,r6,r14 - eors r6,r6,r6,lsr#16 @ not actually used here -@ Now set i to j, conditional on the top bit of r6 - subs r7,r5,r4 @ r7=j-i - ands r7,r7,r6,asr#31 @ r7=(j-i)*(top bit of r6) - adds r4,r4,r7 @ r4=j if top bit of r6, else i - subs r1,r1,#1 - bpl 1b - pop {r1} - mov r12,r4 -.else - mov 
r12,r3 -.endif - CHK_COUNT 82,6 - -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered) - push {r1-r3,r12} -@ r4-r11 = IV0, r12=block number - -processIV: @ non-target label to assist power analysis - ldr r8,=IV0 - ldmia r8,{r4-r7} @ load IV0_A - clear03 16 - add r8,r8,#20 - ldmia r8,{r8-r11} @ load IV0_B - clear03 32 - rev r0,r12 - eor r7,r7,r0 @ XOR in block number to IV0. IV(block n) = IV0 ^ n, cf standard CTR mode IV0 + n. - @ XOR (vs addition) is compatible with XOR-shares, so stealthier/simpler because don't have to unshare to work out IV(block n) -@ r4-r11 = IV for the current block - CHK_COUNT 83,6 -.if ST_SHAREC - bl gen_rand_sha_nonpres @ Create state share C; all bytes the same - ands r0,r0,#255 - orrs r0,r0,r0,lsl#8 - orrs r12,r0,r0,lsl#16 - ldr r1,=shareC - str r12,[r1] -.else - movs r12,#0 -.endif -@ r4-r11 = IV for the current block w/o shareC, r12=shareC -@ refresh state shares and mix in shareC - bl gen_rand_sha_nonpres; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc - bl gen_rand_sha_nonpres; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16 - bl gen_rand_sha_nonpres; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16 - bl gen_rand_sha_nonpres; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16 -.if ST_VPERM - bl gen_rand_sha_nonpres - ldr r1,=statevperm - movs r2,#0 - str r2,[r1] - bl addstatevperm @ Initialise state vperm (use SHA RNG to start with, later refreshes are with LFSR RNG) -.endif - - CHK_COUNT 84,6 - bl conjshareC @ Add the effect of shareC to lut_a, lut_b - CHK_COUNT 85,6 -@ now perform the 15 encryption rounds on (key, state=IV+x) -@ here r4-r7, r8-r11: state - mov r2,#0 @ round counter -rounds_s_mainloop: - ldr r12,=rkey_s - add r12,r12,r2,lsl#5 @ pointer to key shares for this round - add r12,r12,r2,lsl#3 - push {r2} @ save round count - bl addrkey_s 
- bl map_sbox_s - bl shift_rows_s -.if ST_VPERM - ldr r2,[r13] @ peek at stack to get round count - cmp r2,#NUMREFSTATEVPERM - bcs 1f - bl gen_rand_lfsr_nonpres - ldr r1,=statevperm - bl addstatevperm @ V shuffle of r4-r11 -1: -.endif - pop {r2} - adds r2,r2,#1 @ increment round counter - cmp r2,#14 - beq 2f @ break from loop? (last round has no mix_cols) - push {r2} - bl mix_cols_s - pop {r2} - b rounds_s_mainloop -2: - CHK_COUNT 86,6 - ldr r12,=rkey_s+14*40 @ final round key shares - bl addrkey_s - CHK_COUNT 87,6 - bl conjshareC @ Undo the effect of shareC from lut_a, lut_b - CHK_COUNT 88,6 -.if ST_VPERM -@ Undo the effects of vperm rotation recorded in statevperm - ldr r1,=statevperm - ldr r2,[r1] - rsbs r0,r2,#0 - bl addstatevperm -.endif - - pop {r1-r3,r12} - push {r3} -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered - -decryption_start: -@ Decrypt ciphertext using AES output in shares: r4-r11 -.if ST_SHAREC - ldr r0,=shareC - ldr r0,[r0] -.else - movs r0,#0 -.endif - ldr r14,=chaff -@ r0=shareC, r1=cipher/plaintext buffer, r2=number of blocks, r3=free, r4-r11=stateA/B, r12=block to be deciphered, r14=chaff - CHK_COUNT 89,6 - add r1,r1,r12,lsl#4 @ Temporarily r1 points to block-to-be-deciphered - ldr r3,[r1] @ r3=ciphertext word - eors r3,r3,r4 @ r3=r3^shareA - ldr r4,[r14] @ barrier load - eor r3,r3,r8,ror#16 @ r3=r3^shareB - eors r3,r3,r0 @ r3=r3^shareC - str r3,[r1] @ plaintext word=r3 - ldr r3,[r1,#4] @ and similarly for words 1,2,3 of block... 
- ldr r4,[r14,#4] - eors r3,r3,r5 - eor r3,r3,r9,ror#16 - eors r3,r3,r0 - str r3,[r1,#4] - ldr r3,[r1,#8] - ldr r4,[r14,#8] - eors r3,r3,r6 - eor r3,r3,r10,ror#16 - eors r3,r3,r0 - str r3,[r1,#8] - ldr r3,[r1,#12] - ldr r4,[r14,#12] - eors r3,r3,r7 - eor r3,r3,r11,ror#16 - eors r3,r3,r0 - str r3,[r1,#12] - - sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer - CHK_COUNT 90,6 - - pop {r3} @ Restore block counter -@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter -decryption_end: - - adds r3,r3,#1 - cmp r3,r2 - CHK_COUNT 91,6 - bne ctr_crypt_mainloop - -#if WIPE_MEMORY -@ Wipe memory from workspace_start up to the stack pointer -@ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals - ldr r4,=workspace_start - ldr r5,=rstate_all_start -1: - bl gen_rand_sha_nonpres - stmia r4!,{r0} - cmp r4,r5 - bcc 1b - ldr r4,=rstate_all_end - mov r5,r13 @ gcc arm assembler says cmp r4,r13 is deprecated, so use another register -1: - bl gen_rand_sha_nonpres - stmia r4!,{r0} - cmp r4,r5 - bcc 1b - -@ Then fill everything with zeros so as not to leave behind clues about the RNG state - ldr r4,=workspace_start - movs r0,#0 - mov r5,r13 -1: - stmia r4!,{r0} - cmp r4,r5 - bcc 1b -#endif - -.if GEN_RAND_SHA - SET_COUNT 23,6 - bl reset_sha_trng @ clear out the SHA hardware -.endif - pop {r0-r12,r14} - CHK_CANARY r12,CTAG0,6 - bx r14 diff --git a/bootloaders/encrypted/config.h b/bootloaders/encrypted/config.h deleted file mode 100644 index 2c4ce0d03..000000000 --- a/bootloaders/encrypted/config.h +++ /dev/null @@ -1,90 +0,0 @@ -#pragma once - -// These options (up to long /////////////// line) should be enabled because the security risk of not using them is too high -// or because the time cost is very low so you may as well have them. -// They can be set to 0 for analysis or testing purposes. 
- -#ifndef GEN_RAND_SHA -#define GEN_RAND_SHA 1 // use SHA256 hardware to generate some random numbers -#endif - // Some RNG calls are hard coded to LFSR RNG, others to SHA RNG - // Setting GEN_RAND_SHA to 0 has the effect of redirecting the latter to LFSR RNG -#ifndef ST_SHAREC -#define ST_SHAREC 1 // This creates a partial extra share at almost no extra cost -#endif -#ifndef ST_VPERM -#define ST_VPERM 1 // insert random vertical permutations in state during de/encryption? -#endif -#ifndef CT_BPERM -#define CT_BPERM 1 // process blocks in a random order in counter mode? -#endif -#ifndef RK_ROR -#define RK_ROR 1 // store round key shares with random rotations within each word -#endif - -#ifndef WIPE_MEMORY -#define WIPE_MEMORY 1 // Wipe memory after decryption -#endif - -// The following options should be enabled to increase resistance to glitching attacks. - -#ifndef RC_CANARY -#define RC_CANARY 1 // use rcp_canary feature -#endif -#ifndef RC_COUNT -#define RC_COUNT 1 // use rcp_count feature -#endif - -// Although jitter/timing-variation may be circumventable in theory, in practice -// randomising the timing of operations can make side-channel attacks very much more -// effort to carry out. These can be disabled for analysis or testing purposes. -// It is advisable to use a least one form of jitter. - -// RC_JITTER is quite slow, and is probably the most predictable of the three, so it is disabled by default. -// (Leaving it as an option because it's just possible that the large delays it produces are advantageous in defeating certain side-channel attacks.) -#ifndef RC_JITTER -#define RC_JITTER 0 // 0-7. Higher = more jitter. Governs use of random-delay versions of RCP instructions. 
-#endif - -#ifndef SH_JITTER -#define SH_JITTER 1 // Insert random delays, tagged onto SHA RNG -#endif - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// The following options can be adjusted, affecting the performance/security tradeoff - -// Period = X means that the operation in question occurs every X blocks, so higher = more performance and lower security. -// No point in making them more than 16 or so, since the time taken by the subroutines would be negligible. -// These must be a power of 2. Timings as of commit 82d31652 -// -// Baseline time per 16-byte block = 14109 (with no jitter) cycles -#ifndef REFCHAFF_PERIOD -#define REFCHAFF_PERIOD 1 // Extra cost per 16-byte block = 474/REFCHAFF_PERIOD cycles -#endif -#ifndef REMAP_PERIOD -#define REMAP_PERIOD 4 // Extra cost per 16-byte block = 4148/REMAP_PERIOD cycles -#endif -#ifndef REFROUNDKEYSHARES_PERIOD -#define REFROUNDKEYSHARES_PERIOD 1 // Extra cost per 16-byte block = 1304/REFROUNDKEYSHARES_PERIOD cycles -#endif -#ifndef REFROUNDKEYHVPERMS_PERIOD -#define REFROUNDKEYHVPERMS_PERIOD 1 // Extra cost per 16-byte block = 1486/REFROUNDKEYVPERM_PERIOD cycles -#endif - -// Setting NUMREFSTATEVPERM to X means that state vperm refreshing happens on the first X AES rounds only, -// so lower = more performance and lower security. -// The rationale for doing it this way is that later rounds should be protected by CT_BPERM. -// NUMREFSTATEVPERM can be from 0 to 14. 
-#ifndef NUMREFSTATEVPERM -#define NUMREFSTATEVPERM 7 // Extra cost per 16-byte block = 61*NUMREFSTATEVPERM cycles -#endif - -//////////////////////////////////////////////////////////////////////////////////////////////////////////// - -#define MAX_NUM_BLOCKS 32768 - -#if SH_JITTER && !GEN_RAND_SHA -#error GEN_RAND_SHA must be set if you want to use SH_JITTER -#endif diff --git a/bootloaders/encrypted/enc-pt.json b/bootloaders/encrypted/enc-pt.json index e9a12b7dd..9c5c3a17e 100644 --- a/bootloaders/encrypted/enc-pt.json +++ b/bootloaders/encrypted/enc-pt.json @@ -12,8 +12,8 @@ { "name": "A", "id": 0, - "start": "40K", - "size": "480K", + "start": "64K", + "size": "448K", "families": ["rp2350-arm-s"], "permissions": { "secure": "rw", @@ -24,7 +24,7 @@ { "name": "B", "id": 1, - "size": "480K", + "size": "448K", "families": ["rp2350-arm-s"], "permissions": { "secure": "rw", diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c index 54e89d2e5..d6cce4d6a 100644 --- a/bootloaders/encrypted/enc_bootloader.c +++ b/bootloaders/encrypted/enc_bootloader.c @@ -13,17 +13,94 @@ #include "hardware/structs/otp.h" #include "hardware/structs/qmi.h" #include "hardware/structs/xip_ctrl.h" +#include "hardware/clocks.h" +#include "hardware/xosc.h" +#include "hardware/structs/rosc.h" +#include "hardware/pll.h" -#include "config.h" - -#define OTP_KEY_PAGE 30 +#define OTP_KEY_PAGE 29 extern void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk); +// These just have to be higher than the actual frequency, to prevent overclocking unused peripherals +#define ROSC_HZ 300*MHZ +#define OTHER_CLK_DIV 30 + + +void runtime_init_clocks(void) { + // Disable resus that may be enabled from previous software + clocks_hw->resus.ctrl = 0; + + uint32_t rosc_div = 2; // default divider 2 + uint32_t rosc_drive = 0x7777; // default drives of 0b111 (0x7) + + // Bump up ROSC speed to ~110MHz + rosc_hw->freqa = 0; // reset 
the drive strengths + rosc_hw->div = rosc_div | ROSC_DIV_VALUE_PASS; // set divider + // Increment the freqency range one step at a time - this is safe provided the current config is not TOOHIGH + // because ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH == ROSC_CTRL_FREQ_RANGE_VALUE_HIGH + static_assert((ROSC_CTRL_FREQ_RANGE_VALUE_LOW | ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM) == ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM); + static_assert((ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM | ROSC_CTRL_FREQ_RANGE_VALUE_HIGH) == ROSC_CTRL_FREQ_RANGE_VALUE_HIGH); + hw_set_bits(&rosc_hw->ctrl, ROSC_CTRL_FREQ_RANGE_VALUE_MEDIUM); + hw_set_bits(&rosc_hw->ctrl, ROSC_CTRL_FREQ_RANGE_VALUE_HIGH); + + // Enable rosc randomisation + rosc_hw->freqa = (ROSC_FREQA_PASSWD_VALUE_PASS << ROSC_FREQA_PASSWD_LSB) | + rosc_drive | ROSC_FREQA_DS1_RANDOM_BITS | ROSC_FREQA_DS0_RANDOM_BITS; // enable randomisation + + // Not used with FREQ_RANGE_VALUE_HIGH, but should still be set to the maximum drive + rosc_hw->freqb = (ROSC_FREQB_PASSWD_VALUE_PASS << ROSC_FREQB_PASSWD_LSB) | + ROSC_FREQB_DS7_LSB | ROSC_FREQB_DS6_LSB | ROSC_FREQB_DS5_LSB | ROSC_FREQB_DS4_LSB; + + // CLK SYS = ROSC directly, as it's running slowly enough + clock_configure_int_divider(clk_sys, + CLOCKS_CLK_SYS_CTRL_SRC_VALUE_CLKSRC_CLK_SYS_AUX, + CLOCKS_CLK_SYS_CTRL_AUXSRC_VALUE_ROSC_CLKSRC, + ROSC_HZ, // this doesn't have to be accurate + 1); + + // CLK_REF = ROSC / OTHER_CLK_DIV - this isn't really used, so just needs to be set to a low enough frequency + clock_configure_int_divider(clk_ref, + CLOCKS_CLK_REF_CTRL_SRC_VALUE_ROSC_CLKSRC_PH, + 0, + ROSC_HZ, + OTHER_CLK_DIV); + + + // Everything else should run from PLL USB, so we can use UART and USB for output + xosc_init(); + pll_init(pll_usb, PLL_USB_REFDIV, PLL_USB_VCO_FREQ_HZ, PLL_USB_POSTDIV1, PLL_USB_POSTDIV2); + + // CLK USB = PLL USB 48MHz / 1 = 48MHz + clock_configure_undivided(clk_usb, + 0, // No GLMUX + CLOCKS_CLK_USB_CTRL_AUXSRC_VALUE_CLKSRC_PLL_USB, + USB_CLK_HZ); + + 
// CLK ADC = PLL USB 48MHz / 1 = 48MHz + clock_configure_undivided(clk_adc, + 0, // No GLMUX + CLOCKS_CLK_ADC_CTRL_AUXSRC_VALUE_CLKSRC_PLL_USB, + USB_CLK_HZ); + + // CLK PERI = PLL USB 48MHz / 1 = 48MHz. Used as reference clock for UART and SPI serial. + clock_configure_undivided(clk_peri, + 0, + CLOCKS_CLK_PERI_CTRL_AUXSRC_VALUE_CLKSRC_PLL_USB, + USB_CLK_HZ); + + // CLK_HSTX = PLL USB 48MHz / 1 = 48MHz. Transmit bit clock for the HSTX peripheral. + clock_configure_undivided(clk_hstx, + 0, + CLOCKS_CLK_HSTX_CTRL_AUXSRC_VALUE_CLKSRC_PLL_USB, + USB_CLK_HZ); +} + // The function lock_key() is called from decrypt() after key initialisation is complete and before decryption begins. // That is a suitable point to lock the OTP area where key information is stored. void lock_key() { otp_hw->sw_lock[OTP_KEY_PAGE] = 0xf; + otp_hw->sw_lock[OTP_KEY_PAGE + 1] = 0xf; } @@ -121,15 +198,6 @@ int main() { reset_usb_boot(0, 0); } - printf("OTP Valid Keys %x\n", otp_hw->key_valid); - - printf("Unlocking\n"); - for (int i=0; i<4; i++) { - uint32_t key_i = ((i*2+1) << 24) | ((i*2+1) << 16) | - (i*2 << 8) | i*2; - otp_hw->crt_key_w[i] = key_i; - } - uint8_t iv[16]; data_start_addr += first_mb_end; memcpy(iv, (void*)(XIP_BASE + data_start_addr), sizeof(iv)); @@ -153,12 +221,12 @@ int main() { decrypt( (uint8_t*)&(otp_data[OTP_KEY_PAGE * 0x40]), - (uint8_t*)&(otp_data[(OTP_KEY_PAGE + 1) * 0x40]), + (uint8_t*)&(otp_data[(OTP_KEY_PAGE + 2) * 0x40]), iv, (void*)SRAM_BASE, data_size/16 ); // Lock the IV salt - otp_hw->sw_lock[OTP_KEY_PAGE + 1] = 0xf; + otp_hw->sw_lock[OTP_KEY_PAGE + 2] = 0xf; printf("Post decryption image begins with\n"); for (int i=0; i < 4; i++) @@ -166,7 +234,7 @@ int main() { printf("Chaining into %x, size %x\n", SRAM_BASE, data_size); - stdio_deinit_all(); + stdio_uart_deinit(); // stdio_usb_deinit doesn't work here, so only deinit UART rc = rom_chain_image( workarea, @@ -175,7 +243,7 @@ int main() { data_size ); - stdio_init_all(); + stdio_uart_init(); printf("Shouldn't 
return from ROM call %d\n", rc);
 
     reset_usb_boot(0, 0);
diff --git a/bootloaders/encrypted/mbedtls_aes.c b/bootloaders/encrypted/mbedtls_aes.c
new file mode 100644
index 000000000..9f19c9b4d
--- /dev/null
+++ b/bootloaders/encrypted/mbedtls_aes.c
@@ -0,0 +1,73 @@
+#include <mbedtls/aes.h>
+#include "pico/stdlib.h"
+
+extern void lock_key();
+
+int mb_aes_crypt_ctr_xor(mbedtls_aes_context *ctx,
+                         size_t length,
+                         unsigned char iv0[16],
+                         unsigned char nonce_xor[16],
+                         unsigned char stream_block[16],
+                         const unsigned char *input,
+                         unsigned char *output)
+{
+    int c;
+    int ret = 0;
+    size_t n = 0;
+    uint32_t counter = 0;
+
+    assert(length == (uint32_t)length);
+
+    while (length--) {
+        if (n == 0) {
+            for (int i = 16; i > 0; i--) {
+                nonce_xor[i-1] = iv0[i-1];
+                if (i - (int)(16 - sizeof(counter)) > (int)0) {
+                    nonce_xor[i-1] ^= (unsigned char)(counter >> ((16-i)*8));
+                }
+            }
+
+            ret = mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_ENCRYPT, nonce_xor, stream_block);
+            if (ret != 0) {
+                break;
+            }
+            counter++;
+        }
+        c = *input++;
+        *output++ = (unsigned char) (c ^ stream_block[n]);
+
+        n = (n + 1) & 0x0F;
+    }
+
+    return ret;
+}
+
+void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk) {
+    mbedtls_aes_context aes;
+
+    uint32_t aes_key[8];
+    uint32_t* key4waywords = (uint32_t*)key4way;
+    // Key is stored as a 4-way share of each word, ie X[0] = A[0] ^ B[0] ^ C[0] ^ D[0], stored as A[0], B[0], C[0], D[0]
+    for (int i=0; i < count_of(aes_key); i++) {
+        int skip = (i/4)*16;    // skip every other 16 words (64 bytes), due to the FIB workaround
+        aes_key[i] = key4waywords[i*4 + skip]
+                   ^ key4waywords[i*4 + 1 + skip]
+                   ^ key4waywords[i*4 + 2 + skip]
+                   ^ key4waywords[i*4 + 3 + skip];
+    }
+
+    uint8_t iv[16];
+    for (int i=0; i < sizeof(iv); i++) {
+        iv[i] = IV_OTPsalt[i] ^ IV_public[i];
+    }
+
+    int len = nblk * 16;
+
+    mbedtls_aes_setkey_enc(&aes, (uint8_t*)aes_key, 256);
+
+    lock_key();
+
+    uint8_t xor_working_block[16] = {0};
+    uint8_t stream_block[16] = {0};
+    
mb_aes_crypt_ctr_xor(&aes, len, (uint8_t*)iv, xor_working_block, stream_block, (uint8_t*)buf, (uint8_t*)buf); +} diff --git a/bootloaders/encrypted/mbedtls_config.h b/bootloaders/encrypted/mbedtls_config.h new file mode 100644 index 000000000..7b1c073c1 --- /dev/null +++ b/bootloaders/encrypted/mbedtls_config.h @@ -0,0 +1,9 @@ +#ifndef _MBEDTLS_CONFIG_H +#define _MBEDTLS_CONFIG_H + +#define MBEDTLS_HAVE_ASM +#define MBEDTLS_AES_C +#define MBEDTLS_AES_ROM_TABLES +#define MBEDTLS_CIPHER_MODE_CTR + +#endif diff --git a/encrypted/hello_encrypted/CMakeLists.txt b/encrypted/hello_encrypted/CMakeLists.txt index 7ec352727..443e5dfa4 100644 --- a/encrypted/hello_encrypted/CMakeLists.txt +++ b/encrypted/hello_encrypted/CMakeLists.txt @@ -1,4 +1,4 @@ -# Example encrypted binary +# Example encrypted binary - this should be secure against side channel attacks add_executable(hello_encrypted hello_encrypted.c secret.S @@ -35,8 +35,7 @@ pico_hash_binary(hello_encrypted) pico_encrypt_binary(hello_encrypted ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin ${CMAKE_CURRENT_LIST_DIR}/ivsalt.bin - EMBED - OTP_KEY_PAGE 29) + EMBED) # package uf2 in flash pico_package_uf2_output(hello_encrypted 0x10000000) @@ -48,7 +47,7 @@ pico_add_extra_outputs(hello_encrypted) example_auto_set_url(hello_encrypted) -# Example encrypted binary using MbedTLS +# Example encrypted binary using MbedTLS - this is faster, but not secure against side channel attacks add_executable(hello_encrypted_mbedtls hello_encrypted.c secret.S