From 6a6c18f9b07df10383edd1eb13bf2f9d46cc793c Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Tue, 14 Jan 2025 15:33:37 +0000 Subject: [PATCH 01/20] Use key share for AES file Update CMake tooling to use 128 byte key files (a 4-way share of the 32 byte key). Also temporarily update the enc_bootloader to deshare this key - the actual fix will need to be in aes.S. --- bootloaders/encrypted/README.md | 9 ++- bootloaders/encrypted/enc_bootloader.c | 14 +++- bootloaders/encrypted/otp.json | 100 ++++++++++++++++++++++++- bootloaders/encrypted/privateaes.bin | Bin 32 -> 128 bytes bootloaders/encrypted/update-key.cmake | 2 +- 5 files changed, 119 insertions(+), 6 deletions(-) diff --git a/bootloaders/encrypted/README.md b/bootloaders/encrypted/README.md index f079d9469..cd909ddff 100644 --- a/bootloaders/encrypted/README.md +++ b/bootloaders/encrypted/README.md @@ -4,10 +4,15 @@ Replace private.pem and privateaes.bin with your own keys - your signing key mus openssl ecparam -name secp256k1 -genkey -out private.pem ``` -The AES key is just be a 32 byte binary file - you can create one with +The AES key is stored as a 4-way share in a 128 byte binary file - you can create one with ```bash -dd if=/dev/urandom of=privateaes.bin bs=1 count=32 +dd if=/dev/urandom of=privateaes.bin bs=1 count=128 +``` + +or in Powershell 7 +```powershell +[byte[]] $(Get-SecureRandom -Maximum 256 -Count 128) | Set-Content privateaes.bin -AsByteStream ``` Then either drag & drop the UF2 files to the device in order (enc_bootloader first, then hello_serial_enc) waiting for a reboot in-between, or run diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c index 1df509101..dc828d62b 100644 --- a/bootloaders/encrypted/enc_bootloader.c +++ b/bootloaders/encrypted/enc_bootloader.c @@ -180,7 +180,19 @@ int main() { init_lut_map(); // Read key directly from OTP - guarded reads will throw a bus fault if there are any errors uint16_t* otp_data = 
(uint16_t*)OTP_DATA_GUARDED_BASE; - init_key(rkey_s, (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x780)])); + + // Temporary de-sharing - REMOVE THIS AND MODIFY ASM INSTEAD + uint8_t* shared_key_a = (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x780)]); + uint8_t* shared_key_b = (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x790)]); + uint8_t* shared_key_c = (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x7A0)]); + uint8_t* shared_key_d = (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x7B0)]); + uint8_t deshared_key[32]; + for (int i=0; i < sizeof(deshared_key); i++) { + deshared_key[i] = shared_key_a[i] ^ shared_key_b[i] ^ shared_key_c[i] ^ shared_key_d[i]; + } + init_key(rkey_s, deshared_key); + + // init_key(rkey_s, (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x780)])); otp_hw->sw_lock[30] = 0xf; flush_reg(); ctr_crypt_s(iv, (void*)SRAM_BASE, data_size/16); diff --git a/bootloaders/encrypted/otp.json b/bootloaders/encrypted/otp.json index f86a9e019..412c11078 100644 --- a/bootloaders/encrypted/otp.json +++ b/bootloaders/encrypted/otp.json @@ -35,7 +35,103 @@ "0xc0", "0xd0", "0xe0", - "0xf0" + "0xf0", + "0x0f", + "0x0e", + "0x0d", + "0x0c", + "0x0b", + "0x0a", + "0x09", + "0x08", + "0x07", + "0x06", + "0x05", + "0x04", + "0x03", + "0x02", + "0x01", + "0x00", + "0xf0", + "0xe0", + "0xd0", + "0xc0", + "0xb0", + "0xa0", + "0x90", + "0x80", + "0x70", + "0x60", + "0x50", + "0x40", + "0x30", + "0x20", + "0x10", + "0x00", + "0x08", + "0x09", + "0x0a", + "0x0b", + "0x0c", + "0x0d", + "0x0e", + "0x0f", + "0x00", + "0x01", + "0x02", + "0x03", + "0x04", + "0x05", + "0x06", + "0x07", + "0x80", + "0x90", + "0xa0", + "0xb0", + "0xc0", + "0xd0", + "0xe0", + "0xf0", + "0x00", + "0x10", + "0x20", + "0x30", + "0x40", + "0x50", + "0x60", + "0x70", + "0x07", + "0x06", + "0x05", + "0x04", + "0x03", + "0x02", + "0x01", + "0x00", + "0x0f", + "0x0e", + "0x0d", + "0x0c", + "0x0b", + "0x0a", + "0x09", + "0x08", + "0x70", + "0x60", + "0x50", + "0x40", + "0x30", + "0x20", + "0x10", + "0x00", + "0xf0", + "0xe0", + 
"0xd0", + "0xc0", + "0xb0", + "0xa0", + "0x90", + "0x80" ] }, "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ], @@ -43,4 +139,4 @@ "OTP_DATA_KEY2" : [ 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0 ], "OTP_DATA_KEY2_VALID" : "0x010101", "PAGE30_LOCK0" : "0x4a4a4a" -} \ No newline at end of file +} diff --git a/bootloaders/encrypted/privateaes.bin b/bootloaders/encrypted/privateaes.bin index 0122f8a2ce6c1a5666cec893fed33fe49d39227c..ef7a0dc1d6662d847d48d6fc1a4f6ee3ce8fcd7a 100644 GIT binary patch literal 128 zcmZQzWMXDvWn<^yO}%P%v-^NGNESuwcW13lBc<^YQX Date: Tue, 14 Jan 2025 17:17:42 +0000 Subject: [PATCH 02/20] Improve checking for malicious flash data Add data_max_size to prevent overwriting the bootloader with data from flash --- bootloaders/encrypted/enc_bootloader.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c index dc828d62b..02f81ff09 100644 --- a/bootloaders/encrypted/enc_bootloader.c +++ b/bootloaders/encrypted/enc_bootloader.c @@ -92,31 +92,35 @@ int main() { rc = rom_get_partition_table_info((uint32_t*)workarea, 0x8, PT_INFO_PARTITION_LOCATION_AND_FLAGS | PT_INFO_SINGLE_PARTITION | (boot_partition << 24)); - uint32_t data_start_addr; - uint32_t data_end_addr; + uint32_t data_start_addr = 0; + uint32_t data_end_addr = 0; + uint32_t data_max_size = 0; if (rc != 3) { printf("No boot partition - assuming bin at start of flash\n"); data_start_addr = 0; data_end_addr = 0x70000; // must fit into 0x20000000 -> 0x20070000 + data_max_size = data_end_addr - data_start_addr; } else { uint16_t first_sector_number = (((uint32_t*)workarea)[1] & PICOBIN_PARTITION_LOCATION_FIRST_SECTOR_BITS) >> PICOBIN_PARTITION_LOCATION_FIRST_SECTOR_LSB; uint16_t last_sector_number = (((uint32_t*)workarea)[1] & PICOBIN_PARTITION_LOCATION_LAST_SECTOR_BITS) >> PICOBIN_PARTITION_LOCATION_LAST_SECTOR_LSB; data_start_addr = first_sector_number * 
0x1000; data_end_addr = (last_sector_number + 1) * 0x1000; + data_max_size = data_end_addr - data_start_addr; - printf("Partition Start %x, End %x\n", data_start_addr, data_end_addr); + printf("Partition Start %x, End %x, Max Size %x\n", data_start_addr, data_end_addr, data_max_size); } printf("Decrypting the chosen image\n"); uint32_t first_mb_start = 0; + bool first_mb_start_found = false; uint32_t first_mb_end = 0; uint32_t last_mb_start = 0; - for (uint16_t i=0; i <= 0x1000; i += 4) { + for (uint16_t i=0; i < 0x1000; i += 4) { if (*(uint32_t*)(XIP_BASE + data_start_addr + i) == 0xffffded3) { printf("Found first block start\n"); first_mb_start = i; - } - if (*(uint32_t*)(XIP_BASE + data_start_addr + i) == 0xab123579) { + first_mb_start_found = true; + } else if (first_mb_start_found && (*(uint32_t*)(XIP_BASE + data_start_addr + i) == 0xab123579)) { printf("Found first block end\n"); first_mb_end = i + 4; last_mb_start = *(uint32_t*)(XIP_BASE + data_start_addr + i-4) + first_mb_start; @@ -124,6 +128,12 @@ int main() { } } + if (last_mb_start > data_max_size) { + // todo - harden this check + printf("ERROR: Encrypted binary is too big for it's partition - resetting\n"); + reset_usb_boot(0, 0); + } + if (*(uint32_t*)(XIP_BASE + data_start_addr + last_mb_start) == 0xffffded3) { printf("Found last block start where expected\n"); } else { From 55fee2f9f3b68af084230fef23f6d20639ce1c53 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Wed, 15 Jan 2025 15:04:22 +0000 Subject: [PATCH 03/20] Incorporate latest changes to aes.S Also shrink the space allocated for the bootloader to 32K (plus 8K scratch) --- bootloaders/encrypted/CMakeLists.txt | 4 +- bootloaders/encrypted/aes.S | 2260 +++++++++++++++--------- bootloaders/encrypted/config.h | 157 +- bootloaders/encrypted/enc-pt.json | 8 +- bootloaders/encrypted/enc_bootloader.c | 49 +- 5 files changed, 1507 insertions(+), 971 deletions(-) diff --git a/bootloaders/encrypted/CMakeLists.txt 
b/bootloaders/encrypted/CMakeLists.txt index f29f0efe2..65cf86f78 100644 --- a/bootloaders/encrypted/CMakeLists.txt +++ b/bootloaders/encrypted/CMakeLists.txt @@ -46,8 +46,8 @@ function(add_linker_script target origin length) pico_set_linker_script(${target} ${CMAKE_CURRENT_BINARY_DIR}/${target}.ld) endfunction() -# create linker script to run from 0x20070000 -add_linker_script(enc_bootloader "0x20070000" "64k") +# create linker script to run from 0x20078000 +add_linker_script(enc_bootloader "0x20078000" "32k") # configure otp output pico_set_otp_key_output_file(enc_bootloader ${CMAKE_CURRENT_BINARY_DIR}/otp.json) diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S index feccaae68..fb10d8745 100644 --- a/bootloaders/encrypted/aes.S +++ b/bootloaders/encrypted/aes.S @@ -5,14 +5,10 @@ #include "hardware/platform_defs.h" #include "hardware/regs/addressmap.h" #include "hardware/regs/sha256.h" -#include "hardware/rcp.h" #include "config.h" .global delay -.global aes_start -.global aes_end -.global flush_reg .global isr_systick .extern systick_data @@ -33,89 +29,31 @@ .endif .global remap -.global gen_rand +.global gen_rand_sha +.global gen_irand .global init_key .global rkey_s .global lut_a,lut_a_map .global lut_b,lut_b_map -.global rstate - -@ RCP macros - -#define CTAG0 0x2a -#define CTAG1 0x2b -#define CTAG2 0x2c -#define CTAG3 0x2d -#define CTAG4 0x2e -#define CTAG5 0x30 -#define CTAG6 0x31 -#define CTAG7 0x32 -#define CTAG8 0x33 -#define CTAG9 0x34 -#define CTAG10 0x35 -#define CTAG11 0x36 -#define CTAG12 0x37 -#define CTAG13 0x38 -#define CTAG14 0x39 -#define CTAG15 0x3a -#define CTAG16 0x3b -#define CTAG17 0x3c - -.macro SET_COUNT n -.if RC_COUNT -.if RC_JITTER - rcp_count_set \n -.else - rcp_count_set_nodelay \n -.endif -.endif -.endm - -.macro CHK_COUNT n -.if RC_COUNT -.if RC_JITTER - rcp_count_check \n -.else - rcp_count_check_nodelay \n -.endif -.endif -.endm - -.macro GET_CANARY rx,tag -.if RC_CANARY -.if RC_JITTER - rcp_canary_get 
\rx,\tag -.else - rcp_canary_get_nodelay \rx,\tag -.endif -.endif -.endm - -.macro CHK_CANARY rx,tag -.if RC_CANARY -.if RC_JITTER - rcp_canary_check \rx,\tag -.else - rcp_canary_check_nodelay \rx,\tag -.endif -.endif -.endm +.global rstate_sha,rstate_lfsr -.macro GET_CANARY_NJ rx,tag @ with no jitter even if you ask for it (otherwise slows down gen_rand a lot) -.if RC_CANARY - rcp_canary_get_nodelay \rx,\tag +.if CT_BPERM +@ Use .data section here because everything is initialised to zero in a .bss section +.section .data.aes +.balign 16 +murmur3_constants: @ Five constants used in murmur3_32 hash +.word 0xcc9e2d51 +.word 0x1b873593 +.word 0xe6546b64 +.word 0x85ebca6b +.word 0xc2b2ae35 .endif -.endm -.macro CHK_CANARY_NJ rx,tag @ with no jitter even if you ask for it -.if RC_CANARY - rcp_canary_check_nodelay \rx,\tag -.endif -.endm +@ Put workspace in the second scratch area (was .section .bss.aes) +.section .scratch_y.aes -.section .stack.aes -@ Regardless of configuration the code uses a single 256-entry LUT. If both +@ Regardless of configuration, the code uses a single 256-entry LUT. 
If both @ encryption and decryption are enabled then this is a table of inverses @ of GF(2⁸) field elements, from which both the S-box and inverse S-box @ functions can be derived; otherwise it can be a simple inverse S-box @@ -133,67 +71,105 @@ @ shares, namely @ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ d₀ and @ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁ -lut_a: @ LUT share A +.balign 16 +lut_a: @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup) .space 256 lut_a_map: @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b .space 4 -.space 4 @ align to multiple of 8 -lut_b: @ LUT share B +.space 4 @ align to 8 mod 16 +lut_b: @ LUT share B (must be 8 mod 16 so that init_key_sbox knows how to mask the lookup) .space 256 lut_b_map: .space 4 .space 4 @ align to multiple of 8 -rkey_s: @ round key shares -.if RK_ROR +rkey_s: @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words + @ every fourth word has a word that is used as a vperm count, and also as a spacer to misalign the shares mod 16 .space 600 -.else -.space 480 -.endif +rkey4way: @ scratch area for init_key; could overlap this with other scratch space if need to save space +.space 128 .if CT_BPERM -ctr_scratch: @ scratch area for CTR code to use when "decrypting" out-of-range blocks -.space 16 +bperm_rand: @ 32 half words that define the oblivious permutation of blocks +.space 64 .endif -rstate: @ SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero +.balign 16 +rstate_sha: @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero +.space 16 +rstate_lfsr: @ 32-bit LFSR random state and constant used to step it (initialised by C program) +.space 8 +.balign 16 +permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s) +perm16: .space 16 +@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s +.balign 16 +fourway: @ 
Must be 0 mod 16 +shareA: @ 0 mod 16 +.space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16 +shareB: @ 4 mod 16 +.space 20 +shareC: @ 8 mod 16 +.space 4 +statevperm: @ 12 mod 16 +.space 4 @ vperm state rotation: only last two bits are operational; other bits random +.balign 16 +chaff: @ Must be 0 mod 16; This will be filled with random numbers to do barrier loads +.space 48 +.balign 16 + +@ Put main code in first scratch area (was .section .text.aes,"ax",%progbits) +.section .scratch_x.aes,"ax",%progbits + +.macro gpioput pin,state,reg1,reg2 + mov \reg1,#0xd0000000 + mov \reg2,#(1<<\pin) + str \reg2,[\reg1,#32-8*\state] +.endm -.section .text.aes,"ax",%progbits +.macro clear03 offset=0 + ldr r0,=(chaff+\offset) + ldmia r0,{r0-r3} +.endm -.thumb_func -aes_start: - nop +.macro clear01 offset=0 + ldr r0,=(chaff+\offset) + ldmia r0,{r0,r1} + rev r0,r0 +.endm .if GEN_RAND_SHA -.balign 4 -.thumb_func @ random numbers using SHA256 hardware -@ preserves r1-r3 -gen_rand: - GET_CANARY_NJ r0,CTAG1 - push {r0-r3,r14} - ldr r0,=#SHA256_BASE -4: - ldr r2,=#rstate - ldrb r1,[r2] @ get word counter from bottom byte of rstate[] (offset into SUM registers) - subs r3,r1,#4 @ decrement it to previous SUM register - ble 1f @ if the offset was 4 or less we have run out of SUM register values .if SHA256_SUM0_OFFSET!=8 .err .endif -2: - ldr r0,[r0,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8 - strb r3,[r2] @ save updated SUM register offset in bottom byte of rstate[] - pop {r1} - CHK_CANARY_NJ r1,CTAG1 + +@ Return single random word in r0 +@ Preserves r1-r13 +.balign 4 +gen_rand_sha: + push {r1-r3,r14} + bl gen_rand_sha_nonpres pop {r1-r3,r15} +@ Return single random word in r0 +@ Trashes r1-r3 +.balign 4 +gen_rand_sha_nonpres: + ldr r0,=SHA256_BASE + ldr r2,=rstate_sha + ldrb r1,[r2] @ get word counter from bottom byte of rstate_sha[] (offset into SUM registers) + subs r3,r1,#4 @ decrement it to previous SUM register + ble 1f @ if the 
offset was 4 or less we have run out of SUM register values + ldr r0,[r0,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8 + strb r3,[r2] @ save updated SUM register offset in bottom byte of rstate_sha[] + bx r14 1: movs r3,#SHA256_SUM6_OFFSET+1 strb r3,[r2] @ reset word counter: the +1 is compensated for later movw r1,#(1<>30, vpermB=Bptr[4]>>30, and +@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) +@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 .balign 4 .thumb_func -hperm: -@ rotate state within registers -@ r0: B0: rotate amount for r4,r8; B1: rotate amount for r5,r9; B2: rotate amount for r6,r10; B3: rotate amount for r7,r11 -@ return r0 value required to undo - movs r1,#0x18 @ constant for subsequent ANDs - and r2,r1,r0,lsl#3 @ extract amount - rors r4,r4,r2 @ rotate share A - rors r8,r8,r2 @ rotate share B - and r2,r1,r0,lsr#5 @ etc. - rors r5,r5,r2 - rors r9,r9,r2 - and r2,r1,r0,lsr#13 - rors r6,r6,r2 - rors r10,r10,r2 - and r2,r1,r0,lsr#21 - rors r7,r7,r2 - rors r11,r11,r2 -@ movs r1,#0 @ not needed as 0x18 has zeros in all the required places to do a two-bit-wise negate - usub8 r0,r1,r0 - bx r14 -.endif +ref_roundkey_shares_s: + mov r11,#15 @ there are 15 expanded keys +ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds + push {r14} + ldr r4,=rkey_s +ref_roundkey_shares_s_loop: + ldmia r4!,{r5-r8,r10} @ r5-r8 = rkey shareA, r10=X_A=vperm+rotations of rkey shareA + +@ ldr r0,=chaff +@ and r1,r11,#7 +@ add r0,r0,r1,lsl#2 +@ ldmia r0,{r0-r3} + + ldr r12,[r4,#16] @ r12 = X_B=vperm+rotations of rkey shareB + mov r0,r12,lsr#30 + sub r9,r0,r10,lsr#30 @ r9 = vperm_B - vperm_A (|junk) + mov r0,r9,lsl#3 @ r0 = 8*(vperm_B - vperm_A) mod 32 + mov r12,r12,ror r0 + usub8 r12,r10,r12 @ r12 = X_A - (X_B ror r0) + bl gen_rand_lfsr4 + eors r5,r5,r0; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; ror r0,r0,r12; eor r10,r10,r0,ror#16; mov 
r12,r12,ror#8; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 + eors r6,r6,r1; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; ror r1,r1,r12; eor r10,r10,r1,ror#16; mov r12,r12,ror#8; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 + eors r7,r7,r2; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; ror r2,r2,r12; eor r10,r10,r2,ror#16; mov r12,r12,ror#8; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 + eors r8,r8,r3; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; ror r3,r3,r12; eor r10,r10,r3,ror#16; str r10,[r4,r9,lsl#2] + subs r4,r4,#20 + stmia r4,{r5-r8} + adds r4,r4,#40 + subs r11,r11,#1 + +@ ldr r0,=chaff +@ add r1,r11,#3 +@ and r1,r1,#7 +@ add r0,r0,r1,lsl#2 +@ ldmia r0,{r0-r3} + + bne ref_roundkey_shares_s_loop + clear03 24 +ref_roundkey_shares_s_exit: + pop {r15} -.if NEED_VPERM .balign 4 .thumb_func -vperm: -@ rotate state registers r4->r5-r6->r7->r4 etc. in constant time -@ r0: b0..1: rotate amount -@ returns r0 value required to undo -@ preserves r2 - and r1,r0,#2 - rsbs r1,r1,#0 @ 0 or fffffffe depending on b1 of r0 - uadd8 r1,r1,r1 @ set/clear all GE flags according to b1 of r0: set if rotate of two places is required - mov r1,r4 - sel r4,r6,r4 - sel r6,r1,r6 - mov r1,r5 - sel r5,r7,r5 - sel r7,r1,r7 - mov r1,r8 - sel r8,r10,r8 - sel r10,r1,r10 - mov r1,r9 - sel r9,r11,r9 - sel r11,r1,r11 - and r1,r0,#1 - rsbs r1,r1,#0 @ 0 or ffffffff depending on b0 of r0 - uadd8 r1,r1,r1 @ set/clear all GE flags according to b0 of r0: set if rotate of one place is required - mov r1,r4 - sel r4,r5,r4 - sel r5,r6,r5 - sel r6,r7,r6 - sel r7,r1,r7 - mov r1,r8 - sel r8, r9 ,r8 - sel r9, r10 ,r9 - sel r10,r11,r10 - sel r11,r1 ,r11 - rsbs r0,r0,#0 @ generate control value for inverse operation - bx r14 -.endif +@ Rotates roundkey vperms and RK_ROR rotations by random amounts +@ Trashes r0-r10 +@ If i = word number 0..3, +@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then +@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and +@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 
4)^th byte of Aptr[4]) +@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 +ref_roundkey_hvperms_s: + movs r7,#30 +ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares + push {r14} + ldr r10,=rkey_s +ref_roundkey_hvperms_s_loop: + bl gen_rand_lfsr_nonpres @ r0=new vperm high|rotations + ldmia r10,{r2-r5,r9} @ r2-r5=roundkey share A/B, r9=old vperm high|rotations + str r0,[r10,#16] + mov r8,r0,lsr#30 @ r8=new vperm low + sub r6,r8,r9,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk + mov r8,r6,lsl#3 @ r8=8*((new vperm low)-(old vperm low)) mod 32 + mov r0,r0,ror r8 + usub8 r0,r9,r0 @ i^th byte of r0 = (i^th byte of old rotations) - ((i+newvperm-oldvperm)^th byte of new rotations) + movs r2,r2,ror r0; ands r6,r6,#3; str r2,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 + movs r3,r3,ror r0; ands r6,r6,#3; str r3,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 + movs r4,r4,ror r0; ands r6,r6,#3; str r4,[r10,r6,lsl#2]; movs r0,r0,ror#8; adds r6,r6,#1 + movs r5,r5,ror r0; ands r6,r6,#3; str r5,[r10,r6,lsl#2] + adds r10,r10,#20 + subs r7,r7,#1 + bne ref_roundkey_hvperms_s_loop + clear03 28 +ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code + pop {r15} -.if IK_SHUFREAD -@ randomly shuffle an array n bytes long, n≤65536 a power of 2, by performing k random exchanges, k>0 -@ r0: array pointer p -@ r1: n -@ r2: k -@ does not need to be a subroutine!!! 
-array_shuf: - push {r4-r6,r14} - mov r4,r0 - subs r5,r1,#1 @ mask for random number generation - mov r6,r2 -1: - bl gen_rand - and r1,r5,r0,lsr#16 - and r0,r5,r0 @ r0,r1 are two random numbers 0..n-1 - ldrb r2,[r4,r0] - ldrb r3,[r4,r1] - strb r3,[r4,r0] - strb r2,[r4,r1] - subs r6,r6,#1 - bne 1b - pop {r4-r6,r15} -.endif +.else @ "refresh" shares of rkeys by random eor into both shares of each word -.if RK_ROR -@ and randomly change rotate amount on each word of each share -.endif -@ preserves r0-r11 +@ Trashes r0-r11 .balign 4 -ref_round_keys_s: +.thumb_func +ref_roundkey_shares_s: + mov r11,#15 @ there are 15 expanded keys +ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds push {r14} - GET_CANARY r14,CTAG4 - push {r0-r11,r14} - ldr r0,=rkey_s - mov r1,#15 @ there are 15 expanded keys -1: -.if RK_ROR - ldmia r0,{r2-r11} - push {r0-r1} - - bl gen_rand @ xra=random extra rotates for share A - usub8 r6,r6,r0 @ ra-=xra bytewise - rors r2,r2,r0 @ a=ror(a,xra) - rev16 r0,r0 @ byte order 2301, i.e. B1 at the bottom - rors r3,r3,r0 @ a=ror(a,xra) - rev r0,r0 @ byte order 1032, i.e. B2 at the bottom - rors r4,r4,r0 @ a=ror(a,xra) - rev16 r0,r0 @ byte order 0123, i.e. 
B3 at the bottom - rors r5,r5,r0 @ a=ror(a,xra) - - bl gen_rand @ xrb=random extra rotates for share B - usub8 r11,r11,r0 @ rb-=xrb bytewise - rors r7,r7,r0 @ b=ror(b,xrb) - rev16 r0,r0 - rors r8,r8,r0 @ b=ror(b,xrb) - rev r0,r0 - rors r9,r9,r0 @ b=ror(b,xrb) - rev16 r0,r0 - rors r10,r10,r0 @ b=ror(b,xrb) - usub8 r1,r6,r11 @ ra-rb bytewise - - bl gen_rand @ xab=extra exclusive OR into shares - eors r2,r2,r0 @ a^=xab - rors r0,r0,r1 @ ror(xab,ra-rb) - eors r7,r7,r0 @ b^=ror(xab,ra-rb) - rev16 r1,r1 - - bl gen_rand @ xab - eors r3,r3,r0 @ a^=xab - rors r0,r0,r1 @ ror(xab,ra-rb) - eors r8,r8,r0 @ b^=ror(xab,ra-rb) - rev r1,r1 - - bl gen_rand @ xab - eors r4,r4,r0 @ a^=xab - rors r0,r0,r1 @ ror(xab,ra-rb) - eors r9,r9,r0 @ b^=ror(xab,ra-rb) - rev16 r1,r1 - - bl gen_rand @ xab - eors r5,r5,r0 @ a^=xab - rors r0,r0,r1 @ ror(xab,ra-rb) - eors r10,r10,r0 @ b^=ror(xab,ra-rb) - - pop {r0-r1} - stmia r0!,{r2-r11} -.else - ldmia r0,{r4-r11} @ EOR random data into the shares - push {r0-r1} - bl gen_rand - eor r4,r4,r0 - eor r8,r8,r0 - bl gen_rand - eor r5,r5,r0 - eor r9,r9,r0 - bl gen_rand - eor r6,r6,r0 - eor r10,r10,r0 - bl gen_rand - eor r7,r7,r0 - eor r11,r11,r0 - pop {r0-r1} - stmia r0!,{r4-r11} + ldr r4,=rkey_s +ref_roundkey_shares_s_loop: + ldmia r4!,{r5-r9} @ r5-r8 = rkey shareA with vperm r9 + +@ ldr r0,=chaff +@ and r1,r11,#7 +@ add r0,r0,r1,lsl#2 +@ ldmia r0,{r0-r3} + + ldr r10,[r4,#16] @ rkey shareB has a vperm of r10>>30 + mov r10,r10,lsr#30 + sub r9,r10,r9,lsr#30 @ r9 = vperm_B - vperm_A (|junk) + bl gen_rand_lfsr4 + eors r5,r5,r0; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; eor r10,r10,r0,ror#16; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 + eors r6,r6,r1; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; eor r10,r10,r1,ror#16; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 + eors r7,r7,r2; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; eor r10,r10,r2,ror#16; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 + eors r8,r8,r3; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; eor r10,r10,r3,ror#16; str r10,[r4,r9,lsl#2] + subs 
r4,r4,#20 + stmia r4,{r5-r8} + adds r4,r4,#40 + subs r11,r11,#1 + +@ ldr r0,=chaff +@ add r1,r11,#3 +@ and r1,r1,#7 +@ add r0,r0,r1,lsl#2 +@ ldmia r0,{r0-r3} + + bne ref_roundkey_shares_s_loop + clear03 24 +ref_roundkey_shares_s_exit: + pop {r15} + +.balign 4 +.thumb_func +@ Rotates roundkey vperms by random amounts +@ Trashes r0-r9 +ref_roundkey_hvperms_s: + movs r7,#30 +ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares + push {r14} + bl gen_rand_lfsr_nonpres + ldr r1,=rkey_s +ref_roundkey_hvperms_s_loop: + cmp r7,#15 + bne 2f +@ Get a new random r0 after using 15 x 2 bits of the original one +@ Note that the junk bits (2-31) in the vperms are not adjusted independently, but that's no big loss, +@ and the gain is only calling gen_rand_lfsr twice instead of 30 times. + push {r1}; bl gen_rand_lfsr_nonpres; pop {r1} + 2: + ldmia r1,{r2-r5,r9} @ roundkey share A/B=r2-r5, vperm=r9 (including junk bits) + mov r8,r9,lsr#30 @ r8=old vperm (low) + add r6,r9,r0 @ r6=new vperm (high) | new junk + str r6,[r1,#16] + rsb r6,r8,r6,lsr#30 @ r6=(new vperm low)-(old vperm low) | junk bits + ands r6,r6,#3; str r2,[r1,r6,lsl#2]; adds r6,r6,#1 + ands r6,r6,#3; str r3,[r1,r6,lsl#2]; adds r6,r6,#1 + ands r6,r6,#3; str r4,[r1,r6,lsl#2]; adds r6,r6,#1 + ands r6,r6,#3; str r5,[r1,r6,lsl#2] + adds r1,r1,#20 + movs r0,r0,ror#2 + subs r7,r7,#1 + bne ref_roundkey_hvperms_s_loop + clear03 28 +ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code + pop {r15} + .endif - subs r1,r1,#1 - bne 1b - pop {r0-r11,r14} - CHK_CANARY r14,CTAG4 + +.if NEED_VPERM +.balign 4 +.thumb_func +vpermundo: +@ Undo the effects of vperm rotation on share registers r4-r7, r8-r11 +@ Expect r1=statevperm (state rotations) on entry +@ Trashes r0-r3,r12 + push {r14} + ldr r1,=statevperm + ldr r2,[r1] + rsbs r0,r2,#0 + b vpermaddr0 + +.balign 4 +.thumb_func +refreshstatevperm: + +@ Rotate share registers r4-r7, r8-r11 (r4->r5-r6->r7->r4 etc.) 
by an addtional random amount and update the rotation at !r1 +@ Trashes r0-r3,r12 +@ Maintains r4=rorig(4+(-!r1)%4), r5=rorig(4+(1-!r1)%4), ... +@ r8=rorig(8+(-!r1)%4), r9=rorig(8+(1-!r1)%4), ... +@ Note: only low 2 bits of !r1 are used. The rest are random to add to the noise. + + push {r14} + bl gen_rand_lfsr_nonpres + ldr r1,=statevperm + ldr r2,[r1] +vpermaddr0: + adds r2,r2,r0 + str r2,[r1] + + ldr r1,=shareA + ands r0,r0,#3; str r4,[r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r5,[r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r6,[r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r7,[r1,r0,lsl#2]; adds r0,r0,#1 + ldmia r1,{r4-r7} + + ldr r12,=chaff @ Overwrite temperorary storage with random numbers + ldmia r12,{r2,r3,r12,r14} + stmia r1,{r2,r3,r12,r14} + + ldr r1,=shareB + ands r0,r0,#3; str r8, [r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r9, [r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r10,[r1,r0,lsl#2]; adds r0,r0,#1 + ands r0,r0,#3; str r11,[r1,r0,lsl#2]; adds r0,r0,#1 + ldmia r1,{r8-r11} + + ldr r12,=chaff+16 @ Overwrite temperorary storage with random numbers + ldmia r12,{r2,r3,r12,r14} + stmia r1,{r2,r3,r12,r14} + +refreshstatevperm_exit: @ label exit point to be to able to specify to analysis code pop {r15} +.endif -@ switch from non-shared to shared state +@ Switch from non-shared to shared state +@ Trashes r0-r3,r12 .balign 4 ns_to_s: push {r14} - GET_CANARY r14,CTAG5 - push {r0-r3,r14} - bl gen_rand - mov r8,r0 - bl gen_rand - mov r9,r0 - bl gen_rand - mov r10,r0 - bl gen_rand - mov r11,r0 - eors r4,r4,r8 - eors r5,r5,r9 - eors r6,r6,r10 - eors r7,r7,r11 - pop {r0-r3,r14} - CHK_CANARY r14,CTAG5 +.if ST_SHAREC + bl gen_rand_sha_nonpres @ Create state share C; all bytes the same + ands r0,r0,#255 + orrs r0,r0,r0,lsl#8 + orrs r12,r0,r0,lsl#16 + ldr r1,=shareC + str r12,[r1] +.else + movs r12,#0 +.endif + bl gen_rand_sha_nonpres + eors r4,r4,r0 + eor r8,r12,r0,ror#16 + bl gen_rand_sha_nonpres + eors r5,r5,r0 + eor r9,r12,r0,ror#16 + bl 
gen_rand_sha_nonpres + eors r6,r6,r0 + eor r10,r12,r0,ror#16 + bl gen_rand_sha_nonpres + eors r7,r7,r0 + eor r11,r12,r0,ror#16 +.if ST_VPERM + bl gen_rand_sha_nonpres +.endif + ldr r1,=statevperm + movs r2,#0 + str r2,[r1] +.if ST_VPERM + b vpermaddr0 @ Tail call. Initialise state vperm with SHA RNG, refresh with LFSR RNG +.else pop {r15} +.endif +@ Conjugate lut_a, lut_b with shareC +@ I.e., EOR the input and output with shareC. +@ We need to pick one input for each share A and B, and one output for ONE of the shares A and B +@ Arbitrarily choosing a0, b1 and d0 +.balign 4 +conjshareC: +.if ST_SHAREC + ldr r1,=shareC + ldr r0,[r1] @ Get shareC as a word (all bytes the same) + ldr r1,=lut_a @ Need to EOR share C into inputs of both lut_a and lut_b, and one of their outputs... + ldr r2,[r1,#0x100] + eors r2,r2,r0,lsr#24 + str r2,[r1,#0x100] + movs r0,r0,lsr#16 + ldr r1,=lut_b @ ... (continued) Here we're EORing share C into a0, b1 and d0. + ldr r2,[r1,#0x100] + eors r2,r2,r0,lsl#8 + str r2,[r1,#0x100] +.endif + bx r14 + .if NEED_ROUNDS .balign 4 .thumb_func shift_rows_s: -@ first "rotate" the two most-significant bytes of the state by two registers -@ slightly faster (but not shorter?) with ubfx/bfi +@ First "rotate" the two most-significant bytes of the state by two registers +@ Trashes r0-r3 +@ Slightly faster (but not shorter?) 
with ubfx/bfi eors r0,r4,r6 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta; lsrs r0,r0,#16 lsls r0,r0,#16 @@ -567,18 +770,18 @@ shift_rows_s: ands r0,r0,#0xff00ff00 eors r6,r6,r0 eors r7,r7,r1 @ state[3]^=tb; -@ repeat for other share - eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta; - lsrs r0,r0,#16 +@ repeat for other share, conjugated by ror#16 + clear01 @ barrier + eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0x0000ffff; state[0]^=ta; state[2]^=ta; lsls r0,r0,#16 + lsrs r0,r0,#16 eors r8,r8,r0 eors r10,r10,r0 - eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta; - lsrs r0,r0,#16 + eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0x0000ffff; state[1]^=ta; state[3]^=ta; lsls r0,r0,#16 + lsrs r0,r0,#16 eors r9,r9,r0 eors r11,r11,r0 - eors r1,r11,r8 @ tb=state[3]^state[0]; tb&=0xff00ff00; ands r1,r1,#0xff00ff00 eors r0,r8,r9 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta; @@ -590,7 +793,10 @@ shift_rows_s: eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; ands r0,r0,#0xff00ff00 eors r10,r10,r0 + eors r11,r11,r1 @ state[3]^=tb; + + clear01 @ barrier bx r14 .endif @@ -690,6 +896,7 @@ inv_shift_rows_s: .if NEED_ROUNDS .balign 4 .thumb_func +@ Trashes r0-r3,r12 mix_cols_s: mov r2,#0x00000000 mov r3,#0x1b1b1b1b @@ -697,10 +904,13 @@ mix_cols_s: mixcol r5 ,r0,r1,r2,r3 mixcol r6 ,r0,r1,r2,r3 mixcol r7 ,r0,r1,r2,r3 + ldr r12,=chaff + ldmia r12!,{r0,r1} @ overwrite sensitive shareA-related quantities r0,r1 with random numbers mixcol r8 ,r0,r1,r2,r3 mixcol r9 ,r0,r1,r2,r3 mixcol r10,r0,r1,r2,r3 mixcol r11,r0,r1,r2,r3 + ldmia r12!,{r0,r1} @ overwrite sensitive shareB-related quantities r0,r1 with random numbers bx r14 .endif @@ -708,8 +918,6 @@ mix_cols_s: .balign 4 .thumb_func inv_mix_cols_s: - push {r14} - GET_CANARY r14,CTAG6 push {r14} mov r12,#0x00000000 mov r14,#0x1b1b1b1b @@ -721,8 +929,6 @@ inv_mix_cols_s: invmixcol r9 ,r0,r1,r2,r3,r12,r14 invmixcol 
r10,r0,r1,r2,r3,r12,r14 invmixcol r11,r0,r1,r2,r3,r12,r14 - pop {r14} - CHK_CANARY r14,CTAG6 pop {r15} .endif @@ -756,9 +962,7 @@ inv_mix_cols_s: .balign 4 .thumb_func -map_sbox_s: - push {r14} - GET_CANARY r14,CTAG7 +map_sbox_s: @ (we're currently still under .if SBOX_VIA_INV) version of map_sbox_x that uses lutmap_state_s as a lookup into a table of inverses push {r14} bl lutmap_state_s @ the S-box function is an inverse followed by an affine transformation: conv_0x1f r4 ,r0,r1 @ see https://en.wikipedia.org/wiki/Rijndael_S-box @@ -777,16 +981,12 @@ map_sbox_s: eor r9 ,r9 ,#0x96969696 eor r10,r10,#0x6f6f6f6f eor r11,r11,#0xc1c1c1c1 - pop {r14} - CHK_CANARY r14,CTAG7 pop {r15} .if NEED_INV_ROUNDS .balign 4 .thumb_func -inv_map_sbox_s: - push {r14} - GET_CANARY r14,CTAG8 +inv_map_sbox_s: @ version that computes via tables of inverses push {r14} @ similarly, the inverse S-box is an affine transformation followed by an inverse conv_0x4a r4 ,r0,r1 conv_0x4a r5 ,r0,r1 @@ -805,8 +1005,6 @@ inv_map_sbox_s: eor r10,r10,#0xf9f9f9f9 eor r11,r11,#0x3f3f3f3f bl lutmap_state_s - pop {r14} - CHK_CANARY r14,CTAG8 pop {r15} .endif @@ -815,12 +1013,11 @@ inv_map_sbox_s: .balign 4 .thumb_func gen_lut_sbox: -@ set both lut_a and lut_b to the S-box table +@ gen_lut_sbox sets both lut_a and lut_b to the S-box table and @ returns r0=lut_a+256, r1=lut_b+256 push {r14} - GET_CANARY r14,CTAG9 - push {r14} @ similarly, the inverse S-box is an affine transformation followed by an inverse bl gen_lut_inverse @ first generate the table of inverses in lut_a + @ At this point r0=lut_a, r1=lut_b, lut_a[] contains inverses and lut_b[] contains other stuff mov r14,#256 1: ldrb r2,[r0] @@ -829,12 +1026,10 @@ gen_lut_sbox: eors r3,r3,r2,lsl#4 eors r2,r3,r3,lsr#8 eor r2,r2,#0x63 @ and add 0x63 - strb r2,[r0],#1 - strb r2,[r1],#1 + strb r2,[r0],#1 @ let lut_a[i]=sbox[i] + strb r2,[r1],#1 @ let lut_b[i]=sbox[i] subs r14,r14,#1 bne 1b - pop {r14} - CHK_CANARY r14,CTAG9 pop {r15} .if NEED_INV_ROUNDS @@ 
-842,8 +1037,6 @@ gen_lut_sbox: .thumb_func gen_lut_inv_sbox: @ set lut_a to the inverse S-box table - push {r14} - GET_CANARY r14,CTAG10 push {r14} bl gen_lut_sbox @ get the forwards S-box sub r0,r0,#256 @@ -855,12 +1048,26 @@ gen_lut_inv_sbox: adds r2,r2,#1 cmp r2,#255 bls 1b - pop {r14} - CHK_CANARY r14,CTAG10 pop {r15} .endif .endif +@ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups) +.macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3 + ubfx \Rspare0,\Rtarg,#0, #8 + ubfx \Rspare1,\Rtarg,#8, #8 + ubfx \Rspare2,\Rtarg,#16, #8 + ubfx \Rspare3,\Rtarg,#24, #8 + + ldrb \Rspare0,[\Rtable,\Rspare0] + ldrb \Rspare1,[\Rtable,\Rspare1] + ldrb \Rspare2,[\Rtable,\Rspare2] + ldrb \Rspare3,[\Rtable,\Rspare3] + orr \Rspare0,\Rspare0,\Rspare1,lsl#8 + orr \Rspare2,\Rspare2,\Rspare3,lsl#8 + orr \Rtarg,\Rspare0,\Rspare2,lsl#16 +.endm + @ if we are using direct S-box lookup then [inv_]map_sbox_s is the same as lutmap_state_s .if !SBOX_VIA_INV .balign 4 @@ -872,88 +1079,72 @@ inv_map_sbox_s: .endif .endif -@ map all bytes of the state through the LUT +@ lutmap_state_s maps all bytes of the state through the split LUT, lut_a and lut_b +@ This is either the whole of map_sbox_s (if SBOX_VIA_INV=0), or (if SBOX_VIA_INV=1) it's a subroutine called by map_sbox_s +@ Trashes r0-r3,r12 .balign 4 lutmap_state_s: + push {r14} - GET_CANARY r14,CTAG11 - push {r14} - ldr r12,=lut_a - ldr r14,=lut_b - mov r0,#0x8000 @ "counter" for bytes of state mapped -1: - ldr r3,[r12,#0x100] @ lut_a_map - eor r1,r4,r3 @ share A of x ^ share A of lut_a address map - eor r1,r1,r8 @ ^ share B of x - eor r1,r1,r3,ror#8 @ ^ share B of lut_a address map - uxtb r1,r1 - ldrb r1,[r12,r1] @ look up in lut_a - eor r1,r1,r3,ror#16 @ ^ share A of lut_a data map - ldr r3,[r14,#0x100] @ lut_b_map - eor r1,r1,r3,ror#24 @ ^ share B of lut_b data map, generating share A of the result - - eor r2,r4,r3 @ share A of x ^ share A of lut_b address map - eor r2,r2,r8 @ ^ 
share B of x - eor r2,r2,r3,ror#8 @ ^ share B of lut_b address map - uxtb r2,r2 - ldrb r2,[r14,r2] @ look up in lut_b - eor r2,r2,r3,ror#16 @ ^ share A of lut_b data map - ldr r3,[r12,#0x100] @ lut_a_map - eor r2,r2,r3,ror#24 @ ^ share B of lut_a data map, generating share B of the result - - lsrs r4,#8 @ shift share A of state down one byte... - orrs r4,r4,r5,lsl#24 - lsrs r5,#8 - orrs r5,r5,r6,lsl#24 - lsrs r6,#8 - orrs r6,r6,r7,lsl#24 - lsrs r7,#8 - orrs r7,r7,r1,lsl#24 @ and insert share A of mapped byte - - lsrs r8,#8 @ shift share B of state down one byte... - orrs r8,r8,r9,lsl#24 - lsrs r9,#8 - orrs r9,r9,r10,lsl#24 - lsrs r10,#8 - orrs r10,r10,r11,lsl#24 - lsrs r11,#8 - orrs r11,r11,r2,lsl#24 @ and insert share B of mapped byte - - lsrs r0,#1 @ count 16 iterations - bne 1b - pop {r14} - CHK_CANARY r14,CTAG11 - pop {r15} + + ldr r0,=shareA @ Write out state share A to memory + stmia r0,{r4-r7} + clear03 @ barrier + + ldr r0,=shareB @ Write out state share B to memory + stmia r0,{r8-r11} + clear03 4 @ barrier + + bl makeperm16 @ Rebuild random 16-way permutation. 
Maybe do this less frequently + @ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation + + ldr r8,=lut_a + ldr r9,=lut_b + ldr r0,[r8,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) + eors r10,r0,r0,lsr#8 + uxtb r10,r10 @ R10 = a0^a1 + ldr r1,[r9,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) + eors r1,r0,r1 + eors r2,r1,r1,lsr#8 + uxtb r11,r2 @ R11 = a0^a1^b0^b1 + movs r12,r1,lsr#16 @ R12 = c0^d0 | (c1^d1)<<8 + + ldr r4,=perm16 + ldr r5,=shareA + ldr r6,=shareB +@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=a0^a1^b0^b1, r12=(c0^d0) | (c1^d1)<<8 + movs r0,#15 +1: @ (Ordering instructions to minimise result delays) + ldrb r1,[r4,r0] @ r1 = perm[r0] + eors r7,r1,#2 @ r7 = perm[r0]^2 + ldrb r2,[r5,r1] @ r2 = shareA[perm[r0]] + ldrb r3,[r6,r7] @ r3 = shareB[perm[r0]^2] + eors r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1 + eors r2,r2,r3 @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2] + ldrb r3,[r8,r2] @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]] + eors r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8) + eors r2,r2,r11 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2] + strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 + ldrb r3,[r9,r2] @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]] + subs r0,r0,#1 + eor r3,r3,r12,lsr#8 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^c1^d1 + strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^c1^d1 + bpl 1b + clear03 8 @ barrier + + ldmia r6,{r8-r11} @ Read state share B back from memory + clear03 12 @ barrier + ldmia r5,{r4-r7} @ Read state share A back from memory + clear03 16 @ barrier + +@ Refresh state shares because luts only give imperfect share-by-value + bl gen_rand_lfsr4 + eors r4,r4,r0; mov r12,#0; eors r8,r8,r0,ror#16 @ Barriers between each pair of eors to 
prevent implicit r4^r8 etc + eors r5,r5,r1; mov r12,#0; eors r9,r9,r1,ror#16 + eors r6,r6,r2; mov r12,#0; eors r10,r10,r2,ror#16 + eors r7,r7,r3; mov r12,#0; eors r11,r11,r3,ror#16 -@ perform one EOR step in round key generation -@ !!! can we introduce some more randomness into the shares here? -.balign 4 -grk_s_step: - ldmia r0!,{r5-r7,r12} @ from last round key_a but one - eors r5,r5,r4 - eors r6,r6,r5 - eors r7,r7,r6 - eors r12,r12,r7 - stmia r1!,{r5-r7,r12} - mov r4,r12 -.if RK_ROR - movs r12,#0 - str r12,[r0],#4 - str r12,[r1],#4 -.endif - ldmia r0!,{r9-r11,r12} @ from last round key_a but one - eors r9,r9,r8 - eors r10,r10,r9 - eors r11,r11,r10 - eors r12,r12,r11 - stmia r1!,{r9-r11,r12} - mov r8,r12 -.if RK_ROR - movs r12,#0 - str r12,[r0],#4 - str r12,[r1],#4 -.endif - bx r14 + pop {r15} .macro jitter rx .if IK_JITTER @@ -967,273 +1158,494 @@ grk_s_step: .balign 4 .thumb_func -init_key: -@ r0: rkeys_s -@ r1: raw key data (32 bytes) -.if RK_ROR -@ rkeys_s is a 40*15=600-byte region -@ each of the 15 round keys is represented as two 4-word regions rka[0..3] and rkb[0..3], each of which is followed by a word containing -@ four byte-wide rotate values ra[i] and rb[i] -@ such that rk[i]=(rka[i] ROR ra[i])^(rkb[i] ROR rb[i]) gives the round keys -@ rotations always operate mod 32, so we do not bother to mask the rotate amounts to 5 bits -.else -@ rkeys_s is a 32*15=480-byte region -@ each of the 15 round keys is represented as two 4-word regions rka[0..3] and rkb[0..3] -@ such that rk[i]=rka[i]^rkb[i] gives the round keys -.endif - GET_CANARY r12,CTAG12 - push {r4-r12,r14} -.if IK_JITTER - push {r0,r1} - bl gen_rand - mov r12,r0 - pop {r0,r1} -.endif - jitter r12 - mov r4,r0 - mov r5,r1 -.if IK_SHUFREAD - SET_COUNT 73 - add r6,r4,#128 @ use 64 bytes of temporary space at r0+128 for buf - mov r7,#0 +randomisechaff: +@ Randomise 48 bytes of chaff values (random load values) +@ Uses 12 bytes of permscratch +@ Trashes r0-3 + push {r14} + movs r0,#12 + ldr 
r1,=permscratch + bl makesmallperm @ Store the random words in a random order to make 2nd order attacks harder + movs r1,#11 1: - bl gen_rand - and r0,r0,#0x1f - strb r0,[r6,#32] @ buf contains each number 0..31 and 32 more random numbers in that range - strb r7,[r6],#1 @ so each number at least once... - adds r7,r7,#1 - cmp r7,#32 - bne 1b - CHK_COUNT 73 - add r0,r4,#128 - mov r10,r0 - movs r1,#64 - movs r2,#200 - bl array_shuf @ ... in a random order - mov r11,#63 - CHK_COUNT 74 -.else - mov r6,#31 -.endif + push {r1} + bl gen_rand_sha_nonpres + pop {r1} + ldr r2,=permscratch + ldrb r2,[r2,r1] + ldr r3,=chaff + str r0,[r3,r2,lsl#2] + subs r1,r1,#1 + bpl 1b + pop {r15} + +.balign 4 +refreshchaff: +@ Update 48 bytes of chaff values (random load values) using faster RNG than used for randomisechaff +@ Uses 12 bytes of permscratch +@ Trashes r0-3,12 + push {r14} + movs r0,#12 + ldr r1,=permscratch + bl makesmallperm @ Update the random words in a random order to make 2nd order attacks harder + movs r1,#11 1: - SET_COUNT 104 - jitter r12 -.if IK_SHUFREAD - ldrb r6,[r10,r11] @ now process the raw key bytes in the order given by buf, some more than once -.endif - lsrs r8,r6,#4 -.if RK_ROR - add r7,r6,r8,lsl#3 - add r7,r7,r8,lsl#4 @ 0..15 -> 0..15, 16..31 -> 40..55 -.else - add r7,r6,r8,lsl#4 @ 0..15 -> 0..15, 16..31 -> 32..47 -.endif - ldrb r9,[r5,r6] @ fetch key byte - bl gen_rand @ make random shares of round key 0 - CHK_COUNT 104 - eor r9,r9,r0 - strb r9,[r4,r7] -.if RK_ROR - adds r7,#20 -.else - adds r7,#16 -.endif - strb r0,[r4,r7] -.if IK_SHUFREAD - subs r11,r11,#1 -.else - subs r6,r6,#1 -.endif - CHK_COUNT 105 + push {r1} + bl gen_rand_lfsr_nonpres + pop {r1} + ldr r2,=permscratch + ldr r3,=chaff + ldrb r2,[r2,r1] + ldr r12,[r3,r2,lsl#2] + add r0,r0,r12 + str r0,[r3,r2,lsl#2] + subs r1,r1,#1 bpl 1b - CHK_COUNT 106 - mov r0,r4 + pop {r15} + +.balign 4 +.thumb_func +@ Do sbox on the four bytes of the 4-way share r4-r7 +@ Trashes r0,r8-r12 +init_key_sbox: + push 
{r1-r3,r14} + bl gen_rand_sha_nonpres; mov r8,r0 + bl gen_rand_sha_nonpres; mov r9,r0 + bl gen_rand_sha_nonpres; mov r10,r0 + bl gen_rand_sha_nonpres; mov r11,r0 + ldr r0,=fourway @ Write out 4-way share to memory + stmia r0,{r8-r11} @ Save random values first to obscure saving of state + stmia r0,{r4-r7} + movs r4,#0 @ Clear r4-r7 so that they don't interact with makesmallperm + movs r5,#0 + movs r6,#0 + movs r7,#0 + + bl randomisechaff @ Randomise block of memory mainly used for obscuring loads + + movs r0,#4 + ldr r1,=permscratch + bl makesmallperm @ Build random 4-way permutation determining order of bytes to be SBOXed + ldr r1,=permscratch @ Write out random addresses in advance to save two registers + ldr r4,[r1] + ldr r0,=fourway + uxtab r5,r0,r4 + uxtab r6,r0,r4,ror#8 + uxtab r7,r0,r4,ror#16 + uxtab r8,r0,r4,ror#24 + stmia r1,{r5-r8} @ Store fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3] + + bl gen_rand_sha @ Save some randomness for the resharing operation later + movs r7,r0 + bl gen_rand_sha + movs r8,r0 + + ldr r2,=lut_a + ldr r3,=lut_b + ldr r0,[r2,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) + eors r10,r0,r0,lsr#8 + uxtb r10,r10 @ R10 = a0^a1 + ldr r1,[r3,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) + eors r1,r0,r1 + eors r4,r1,r1,lsr#8 + uxtb r11,r4 @ R11 = a0^a1^b0^b1 + eor r10,r10,r11,lsl#8 @ R10 = a0^a1 | (a0^a1^b0^b1)<<8 + movs r12,r1,ror#16 @ R12 = c0^d0 | (c1^d1)<<8 | junk<<16 | junk<<24 + + ldr r1,=permscratch + ldr r11,=chaff + @ Using r1=permutedfourwaypointer, r2=lut_a, r3=lut_b, r7,r8=randomness, r10=(a0^a1)|(a0^a1^b0^b1)<<8, r11=chaff, r12=(c0^d0)|(c1^d1)<<8|junk +1: + ands r5,r1,#12 + adds r5,r11,r5 @ Align chaff address to r1 + ldr r6,[r1],#4 @ r6 = fourway + perm[i] (i=0-3, loop iteration) + ldr r5,[r5] @ Random load to mask previous load + + ands r9,r6,#12 @ r9 = chaff address aligned to r6 mod 16 + add r9,r11,r9 + ldrb r4,[r6,#0] + ldr r14,[r9,#0] @ Random load to mask previous load + eor 
r4,r4,r10 + eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 + + ldrb r5,[r6,#4] + ldr r14,[r9,#4] @ Random load to mask previous load + eors r4,r4,r5 + eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 + + ldrb r5,[r6,#8] + ldr r14,[r9,#8] @ Random load to mask previous load + eors r4,r4,r5 + eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 + + ldrb r5,[r6,#12] + ldr r14,[r9,#12] @ Random load to mask previous load + eors r4,r4,r5 @ r4 = unsharedbyte[perm[i]]^a0^a1 | junk + eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 + + ands r14,r4,#255 + ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] + and r14,r4,#15 + add r14,r14,#32 + ldrb r14,[r11,r14] @ Random load to mask previous load (r2 and r11 are both 0 mod 16) + eors r5,r5,r12 @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]^c0^d0 | junk<<8 | junk<<16 | junk<<24 + @ split r5 into two shares and store at [r6,#0] and [r6,#4] + strb r7,[r6,#0] + eors r5,r5,r7 + strb r5,[r6,#4] + + mov r5,r10,lsr#8 @ r5=a0^a1^b0^b1 + ldr r14,[r11,#44] @ Need to eor into a random destination register + eors r14,r4,r5 @ r14 = unsharedbyte[perm[i]]^b0^b1 | junk<<8 + and r14,r14,#255 + + ldrb r5,[r3,r14] @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1] + and r14,r14,#15 + add r4,r11,#24 + ldrb r14,[r4,r14] @ Random load to mask previous load (r3==8 and r11==0 mod 16) + eor r5,r5,r12,ror#8 @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]^c1^d1 | junk<<8 | junk<<16 | junk<<24 + @ split r5 into two shares and store at [r6,#8] and [r6,#12] + strb r8,[r6,#8] + eors r5,r5,r8 + strb r5,[r6,#12] + + movs r7,r7,ror#8 + movs r8,r8,ror#8 + + tst r1,#12 @ This does 4 loop iterations because permscratch is guaranteed to be 0 mod 16 + bne 1b + + ldr r0,=fourway + ldmia r0,{r4-r7} @ Load SBOXed values back into register r4-r7 + ldmia r11,{r8-r12,r14} @ Random load to mask previous load and to obfuscate registers + + pop {r1-r3,r15} + +.balign 4 +.thumb_func +@ r1 = pointer to 4 x 4-way share (16 words); left unchanged +@ r3 = 
rkey_s+40*roundkeynumber; advanced by 40 +@ Trashes r8-r11 +@ If i = word number 0..3, +@ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then +@ vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and +@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) +@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 +storeroundkey: + push {r2,r14} + +@ eor two 4-way share components to make a component of a 2-way share +@ Note that we load from 4-way share at a random address then convert to 2-way share and +@ store at a fixed address, rather than the other way around, so that 2-way shares are obscured +@ by vperm (we don't know which 2-way share is being processed at a particular point in time). +@ And (if RK_ROR) we rotate first before EORing down to 2-way, so there is never an unrotated 2-way share + + bl gen_rand_sha @ Get r0 = vperm for shareA of the round key + str r0,[r3,#16] + mov r8,r0,lsr#30 + rsb r8,r8,#0 @ r8=-vperm .if RK_ROR - movs r1,#0 - str r1,[r0,#16] - str r1,[r0,#36] + movs r2,#0 + usub8 r2,r2,r0 @ r2=-hperms .endif -@ now generate the other round keys - movs r2,#1 @ round constant + mov r9,#4 +1: + and r8,r8,#3 + adds r0,r1,r8,lsl#4 + + ldmia r0,{r10,r11} .if RK_ROR - add r1,r0,#80 - ldr r4,[r0,#52] @ last word from previous round key_a - ldr r8,[r0,#72] @ last word from previous round key_b -.else - add r1,r0,#64 - ldr r4,[r0,#44] @ last word from previous round key_a - ldr r8,[r0,#60] @ last word from previous round key_b + mov r10,r10,ror r2 + mov r11,r11,ror r2 + movs r2,r2,ror#8 +.endif + eor r10,r10,r11 + str r10,[r3],#4 + add r8,r8,#1 + subs r9,r9,#1 + bne 1b + + adds r1,r1,#8 + adds r3,r3,#4 @ skip over vperm (already stored) + + bl gen_rand_sha @ Get r0 = vperm for shareB of the round key + str r0,[r3,#16] + mov r8,r0,lsr#30 + rsb r8,r8,#0 @ r8=-vperm +.if RK_ROR + movs r2,#0 + usub8 r2,r2,r0 @ r2=-hperms .endif - CHK_COUNT 107 + mov 
r9,#4 1: - SET_COUNT 42 - rors r4,r4,#8 - rors r8,r8,#8 - push {r0-r3} -.if IK_JUNK - bl gen_rand @ put some junk in r5-r7, r9-r11 - mov r5,r0 - bl gen_rand - mov r6,r0 - bl gen_rand - mov r7,r0 - bl gen_rand - mov r9,r0 - bl gen_rand - mov r10,r0 - bl gen_rand - mov r11,r0 -.endif - CHK_COUNT 42 -.if IK_REMAP - bl remap -.endif - CHK_COUNT 43 -.if IK_PERM - bl gen_rand - bl vperm - push {r0} - bl gen_rand - bl hperm + and r8,r8,#3 + adds r0,r1,r8,lsl#4 + ldmia r0,{r10,r11} +.if RK_ROR + mov r10,r10,ror r2 + mov r11,r11,ror r2 + movs r2,r2,ror#8 +.endif + mov r10,r10,ror#16 + mov r11,r11,ror#16 + eor r10,r10,r11 + str r10,[r3],#4 + add r8,r8,#1 + subs r9,r9,#1 + bne 1b + + subs r1,r1,#8 @ Restore r1 = (r1 on entry) + adds r3,r3,#4 @ Set r3 = (r3 on entry) + 40 + + pop {r2,r15} + +.balign 4 +.thumb_func +init_key: +@ r0: rkeys_s (this input is ignored because it's defined here in the assembler file) +@ r1: raw key data (32 bytes) +@ rkeys_s is a 40*15=600-byte region +@ each of the 15 round keys is represented as two 4-word regions rka[0..3] and rkb[0..3] (each of which is followed by a zero word), +@ such that rk[i]=rka[i-r]^(rkb[i-r] ROR#16) gives the round keys, where r=!vpermkeyrot and i-r is interpreted in the relevant range, and i-r specifies mod 4 + + push {r4-r11,r14} + +.if IK_JITTER push {r0} - bl map_sbox_s @ this actually maps all of r4..r7, r8..r11 - i.e., trashes r5, r6, r7, r9, r10, r11 - pop {r0} - bl hperm + bl gen_rand_sha + mov r12,r0 pop {r0} - bl vperm -.else - bl map_sbox_s @ this actually maps all of r4..r7, r8..r11 - i.e., trashes r5, r6, r7, r9, r10, r11 .endif - CHK_COUNT 44 - pop {r0-r3} - eors r4,r4,r2 @ round constant - bl grk_s_step - CHK_COUNT 45 - lsls r2,#1 @ step round constant - cmp r2,#0x40 @ done? 
- bhi 2f - push {r0-r2} - bl map_sbox_s @ this actually maps all of r4..r7, r8..r11 - i.e., trashes r5, r6, r7, r9, r10, r11 - CHK_COUNT 46 - pop {r0-r2} - bl grk_s_step - CHK_COUNT 47 - b 1b + jitter r12 + + mov r5,r1 @ Here and for the rawkey reading loop, R5=raw key data + + jitter r12 + + @ Make lots of small perms so that it's harder for attacker to correlate permutation creation steps with the permutation's use + @ Can use rkey_s space because it won't be used before init_key_expandloop + ldr r1,=rkey_s + movs r2,#64 +1: + movs r0,#8 + push {r1,r2} + bl makesmallperm @ make a random permutation of 8 things (to randomise reading of key words) + pop {r1,r2} + adds r1,r1,#8 + subs r2,r2,#1 + bne 1b + bl gen_rand_sha_nonpres @ Choose a random one of these 64 to use + ands r0,r0,#63 + ldr r1,=rkey_s + adds r7,r1,r0,lsl#3 + +init_key_loadrawkey: + + bl randomisechaff + +@ Loading the raw key and turning it into 4-way shares for round 0 and 1 + ldr r11,=chaff @ This needs to have 48 bytes of chaff + sub r0,r7,r11; ands r0,r0,#15; add r10,r11,r0 @ align r10 to r7 mod 16 (permutation array) + sub r0,r5,r11; ands r0,r0,#15; add r11,r11,r0 @ align r11 to r5 mod 16 (raw key data) + ldr r4,=rkey4way @ 128 byte scratch space for 4-way shares, laid out in words as a0 b0 c0 d0 a1 b1 c1 d1 ... 
a7 b7 c7 d7 + movs r6,#7 +@ r4=rkey4way, r5=rawkeydata, r6=loopcounter, r7=permutationarray, r10,r11=zeroarray (same mod 16 alignment as r7,r5 resp) 2: - CHK_COUNT 46 - pop {r4-r12,r14} - CHK_CANARY r12,CTAG12 - bx r14 +@ Do calls to gen_rand_sha before we have sensitive values, so that gen_rand_sha doesn't push them on the stack + bl gen_rand_sha_nonpres; movs r8,r0 + bl gen_rand_sha_nonpres; movs r9,r0 + bl gen_rand_sha_nonpres; movs r1,r0 + bl gen_rand_sha @ r0,r1,r8,r9 are fresh random numbers + ldrb r12,[r10,r6] @ barrier to following load + ldrb r2,[r7,r6] @ r2 = perm8[r6] = which key word to load + ldrb r12,[r10,r6] @ barrier load to erase internal version of r2 + movs r14,r0,lsr#29 @ temporarily borrow some randomness to create a random address offset + ldr r12,[r11,r14,lsl#2] @ + ldr r3,[r11,r2,lsl#2] @ barrier to following load (random value, same memory bank) + ldr r3,[r5,r2,lsl#2] @ r3 = key word + ldr r12,[r11,r2,lsl#2] @ barrier load to erase internal version of r3 + ldr r12,[r11,r14,lsl#2] @ erase internal address + mov r14,#0 @ erase r14 + ldr r12,[r11,#32] + eor r12,r12,r12 + eors r9,r3,r8 @ extra care: sacrifice random r9 to further mask this operation + eors r3,r9,r0 @ r9=r0^r3^r8 (also has the effect of safely retiring the sensitive value r3) + eors r3,r3,r1 @ r9=r0^r1^r3^r8 so r0,r1,r8,r9 is a 4-way share of r3 + adds r2,r4,r2,lsl#4 + stmia r2,{r0,r1,r3,r8} @ Store 4-way share of this key word + movs r0,#0 @ Clear sensitive working values so they don't get used somehow (e.g., pushed onto the stack by gen_rand_sha) + movs r1,#0 + movs r2,#0 + movs r3,#0 + subs r6,r6,#1 + bpl 2b + mov r8,#0 + mov r9,#0 + + +@ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for +@ the 128-bit roundkeys 0 and 1, then expand from 2 to 15 roundkeys. 
+ + ldr r3,=rkey_s @ r3=rkey_s + ldr r1,=rkey4way @ r1=rkey4way + bl storeroundkey @ Store round key 0 and advance r3 by 40 + adds r1,r1,#64 + bl storeroundkey @ Store round key 1 and advance r3 by 40 + adds r1,r1,#48 + ldmia r1!,{r4-r7} @ r4-r7 = 4-way share of previous round key word + @ r1=rkey4way+128 on entry to main loop + movs r2,#0 @ r2=word counter (0-51), offset from word 8 + +@ Note that r1-r3 are not sensitive values, so it's safe to stack +@ them and conditionally branch on them. + +@ rkey4way = 8 x 4 consecutive 4-way share words as cyclic buffer of +@ Rounds 0,1 Rounds 2,3 Rounds 12,13 Round 14 +@ a0 b0 c0 d0 -> a8 b8 c8 d8 -> ... -> a48 b48 c48 d48 -> a56 b56 c56 d56 +@ a1 b1 c1 d1 -> a9 b9 c9 d9 a49 b49 c49 d49 a57 b57 c57 d57 +@ a2 b2 c2 d2 etc a50 b50 c50 d50 a58 b58 c58 d58 +@ a3 b3 c3 d3 a51 b51 c51 d51 a59 b59 c59 d59 +@ a4 b4 c4 d4 a52 b52 c52 d52 =============== +@ a5 b5 c5 d5 a53 b53 c53 d53 +@ a6 b6 c6 d6 a54 b54 c54 d54 +@ a7 b7 c7 d7 a55 b55 c55 d55 + +init_key_expandloop: + @ r1 = pointer past one of eight 4-way shares of a roundkey word in the above cyclic buffer (r1=rkey4way+16i for i=1,...,8) + @ r2 = round key word counter (0-51), offset from word 8 (counting expanded roundkey words) + @ r3 = pointer to rkey_s+40*roundnumber = rkey_s+40*(2+[r2/4]) + @ r4-r7 = 4-way share of previous roundkey word + + tst r2,#7 + bne 1f + subs r1,r1,#128 @ Every 8th word, reset cyclic buffer pointer and do ROTWORD + movs r4,r4,ror#8 + movs r5,r5,ror#8 + movs r6,r6,ror#8 + movs r7,r7,ror#8 +1: + + tst r2,#3 + bne 1f + bl init_key_sbox @ Every 4th word, do SUBBYTES (sbox) on r4-r7 +1: + + tst r2,#7 + bne 1f + movs r0,r2,lsr#3 + mov r8,#1 + movs r8,r8,lsl r0 + eors r4,r4,r8 @ Every 8th word, add in round constant +1: + + ldmia r1,{r8-r11} @ eor with key from two rounds ago and advance r1 by 16 + eors r4,r4,r8 + eors r5,r5,r9 + eors r6,r6,r10 + eors r7,r7,r11 + stmia r1!,{r4-r7} + + add r2,r2,#1 + tst r2,#3 + bne 1f + subs r1,r1,#64 + bl storeroundkey @ 
Store round key 1+r2/4 and advance r3 by 40 + adds r1,r1,#64 +1: + + cmp r2,#52 + bne init_key_expandloop -@ add the round key shares pointed to by r12 into the state shares + pop {r4-r11,r15} + +@ Add the round key shares pointed to by r12 into the state shares +@ Trashes r0-r3 .balign 4 addrkey_s: - push {r14} - GET_CANARY r14,CTAG13 - push {r0-r3,r14} + + ldr r0,=statevperm + ldr r0,[r0] @ r0=vperm state rotation in bottom two bits + ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits + rsbs r3,r0,r1,lsr#30 + @ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot .if RK_ROR - ldmia r12!,{r0-r3,r14} @ share A of round key + ROR data - rors r0,r0,r14 @ ROR first word - eors r4,r4,r0 @ add to state - rev16 r0,r14 @ move byte 1 of ROR data into byte 0 - rors r1,r1,r0 - eors r5,r5,r1 - rev r0,r0 @ move byte 2 of ROR data into byte 0 - rors r2,r2,r0 - eors r6,r6,r2 - rev16 r0,r0 @ move byte 3 of ROR data into byte 0 - rors r3,r3,r0 - eors r7,r7,r3 + add r2,r12,#16 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r4,r4,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r5,r5,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r6,r6,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r7,r7,r0 .else - ldmia r12!,{r0-r3} @ share A of round key - eors r4,r4,r0 - eors r5,r5,r1 - eors r6,r6,r2 - eors r7,r7,r3 -.endif + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r4,r4,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r5,r5,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r6,r6,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r7,r7,r0 +.endif + adds r12,r12,#20 + + clear03 @ barrier to clear internal load registers + + ldr r0,=statevperm + ldr r0,[r0] @ r0=vperm state rotation in bottom two bits + ldr 
r1,[r12,#16] @ r1=vperm key rotation in top two bits + rsbs r3,r0,r1,lsr#30 + @ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot .if RK_ROR - ldmia r12!,{r0-r3,r14} @ share B of round key + ROR data - rors r0,r0,r14 @ ROR first word - eors r8,r8,r0 @ etc., as above - rev16 r0,r14 - rors r1,r1,r0 - eors r9,r9,r1 - rev r0,r0 - rors r2,r2,r0 - eors r10,r10,r2 - rev16 r0,r0 - rors r3,r3,r0 - eors r11,r11,r3 + add r2,r12,#16 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r8,r8,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r9,r9,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r10,r10,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r11,r11,r0 .else - ldmia r12!,{r0-r3} @ share B of round key - eors r8 ,r8 ,r0 - eors r9 ,r9 ,r1 - eors r10,r10,r2 - eors r11,r11,r3 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r8,r8,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r9,r9,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r10,r10,r0; adds r3,r3,#1 + ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r11,r11,r0 .endif - pop {r0-r3,r14} - CHK_CANARY r14,CTAG13 - pop {r15} + adds r12,r12,#20 + + clear03 20 @ barrier to clear internal load registers + bx r14 + .if NEED_ROUNDS @ perform encryption rounds @ r4-r7, r8-r11: state -@ preserves r0-r3,r12 +@ Trashes r0-r3,r12 .balign 4 rounds_s: push {r14} - GET_CANARY r14,CTAG14 - push {r0-r3,r12,r14} mov r2,#0 @ round counter -1: +rounds_s_mainloop: ldr r12,=rkey_s add r12,r12,r2,lsl#5 @ pointer to key shares for this round -.if RK_ROR add r12,r12,r2,lsl#3 -.endif + push {r2} @ save round count bl addrkey_s -.if ST_VPERM - bl gen_rand - bl vperm @ V shuffle -.endif - push {r0,r2} @ save round count -.if ST_HPERM - bl gen_rand - bl hperm @ H shuffle - push {r0} -.endif bl 
map_sbox_s -.if ST_HPERM - pop {r0} - bl hperm @ undo H shuffle -.endif bl shift_rows_s - ldr r2,[r13,#4] @ increment round counter on stack - adds r2,r2,#1 - str r2,[r13,#4] +.if ST_VPERM + ldmia r13,{r2} @ peek at stack to get round count + cmp r2,#NUMREFSTATEVPERM + bcs 1f + bl refreshstatevperm @ V shuffle of r4-r11 +1: +.endif + pop {r2} + adds r2,r2,#1 @ increment round counter cmp r2,#14 beq 2f @ break from loop? (last round has no mix_cols) + push {r2} bl mix_cols_s - pop {r0,r2} -.if ST_VPERM - bl vperm @ undo V shuffle -.endif - b 1b + pop {r2} + b rounds_s_mainloop 2: -@ bl inv_mix_cols_s @ or could skip in last round above - pop {r0,r2} -.if ST_VPERM - bl vperm @ undo V shuffle -.endif -.if RK_ROR - ldr r12,=rkey_s+14*40 @ final round key shares -.else - ldr r12,=rkey_s+14*32 @ final round key shares -.endif + ldr r12,=rkey_s+14*40 @ final round key shares bl addrkey_s - pop {r0-r3,r12,r14} - CHK_CANARY r14,CTAG14 + @eor r0,r4,r8;bl logword + @eor r0,r5,r9;bl logword + @eor r0,r6,r10;bl logword + @eor r0,r7,r11;bl logword pop {r15} .endif @@ -1243,19 +1655,13 @@ rounds_s: @ preserves r0-r2 .balign 4 inv_rounds_s: - push {r14} - GET_CANARY r14,CTAG15 push {r0-r2,r14} -.if RK_ROR - ldr r12,=rkey_s+14*40 @ final round key shares -.else - ldr r12,=rkey_s+14*32 @ final round key shares -.endif + ldr r12,=rkey_s+14*40 @ final round key shares bl addrkey_s mov r2,#13 @ round counter push {r2} .if ST_VPERM - bl gen_rand + bl gen_rand_sha bl vperm @ V shuffle push {r0} .endif @@ -1263,23 +1669,14 @@ inv_rounds_s: 1: push {r2} .if ST_VPERM - bl gen_rand + bl gen_rand_sha bl vperm @ V shuffle push {r0} .endif bl inv_mix_cols_s 2: bl inv_shift_rows_s -.if ST_HPERM - bl gen_rand - bl hperm @ H shuffle - push {r0} -.endif bl inv_map_sbox_s -.if ST_HPERM - pop {r0} - bl hperm @ undo H shuffle -.endif .if ST_VPERM pop {r0} bl vperm @ undo V shuffle @@ -1287,15 +1684,11 @@ inv_rounds_s: pop {r2} ldr r12,=rkey_s add r12,r12,r2,lsl#5 @ pointer to key shares for this round 
-.if RK_ROR add r12,r12,r2,lsl#3 -.endif bl addrkey_s subs r2,r2,#1 bpl 1b - pop {r0-r2,r14} - CHK_CANARY r14,CTAG15 - pop {r15} + pop {r0-r2,r15} .endif .if INCLUDE_ENCRYPT_CBC @@ -1303,13 +1696,11 @@ inv_rounds_s: .thumb_func @ encrypt data in place @ r0: ivec -@ r1: buf +@ r1: buf: starts with plaintext; ends up with ciphertext @ r2: number of blocks @ this implementation does not scramble the shares properly; consider a better implementation @ if security is required in encryption cbc_encrypt_s: - push {r14} - GET_CANARY r14,CTAG16 push {r4-r11,r14} ldmia r0,{r4-r7} @ load iv into share a 2: @@ -1322,9 +1713,7 @@ cbc_encrypt_s: stmia r1!,{r4-r7} subs r2,r2,#1 bne 2b - pop {r4-r11,r14} - CHK_CANARY r14,CTAG16 - pop {r15} + pop {r4-r11,r15} .endif .if INCLUDE_DECRYPT_CBC @@ -1339,8 +1728,6 @@ cbc_encrypt_s: @ r0=1: fault detected @ could be simplified to use more ldmia:s at the cost of another 8 words of stack cbc_decrypt_s: - push {r14} - GET_CANARY r14,CTAG17 push {r4-r11,r14} ldmia r0,{r4-r7} @ load IV bl ns_to_s @@ -1437,16 +1824,112 @@ cbc_decrypt_s: bne 2b add r13,#32 mov r0,#0 @ return OK status - pop {r4-r11,r14} - CHK_CANARY r14,CTAG17 - pop {r15} + pop {r4-r11,r15} .if ROUND_TRIP_TEST 1: @ fault here - rcp_panic -.endif -.endif + add r13,#32 + mov r0,#1 @ return fault status + pop {r4-r11,r15} +.endif +.endif + +@ Does mov r(i),#(0x80+i)*0x1010101 for i=flushfrom,flushfrom+1,...,12 +@ Assume 0 <= flushfrom <= 3 +@ Not possible to do this in a loop (or recursively) in gas without .altmacro? 
+.macro flush_regs flushfrom +.if \flushfrom<1 + mov r0,#0x80808080 +.endif +.if \flushfrom<2 + mov r1,#0x81818181 +.endif +.if \flushfrom<3 + mov r2,#0x82828282 @ (0x80+2)*0x1010101: r2 needs its own sentinel, distinct from r3's 0x83838383 +.endif + mov r3, #0x83838383 + mov r4, #0x84848484 + mov r5, #0x85858585 + mov r6, #0x86868686 + mov r7, #0x87878787 + mov r8, #0x88888888 + mov r9, #0x89898989 + mov r10, #0x8a8a8a8a + mov r11, #0x8b8b8b8b + mov r12, #0x8c8c8c8c +.endm + + +@ numargs is the number of arguments of the function-to-be-wrapped (i.e., excluding systick), assumed to be <=3 +.macro prewrap numargs + push {r4-r12,r14} + +@ Reset DWT count registers + mov r4,#0xe0000000 + add r4,r4,#0x1000 + add r4,r4,#4 @ r4=0xe0001004 + mov r5,#0 + mov r6,#0 + stmia r4!,{r5-r6} @ zero the words at 0xe0001004 and 0xe0001008 + add r4,r4,#8 @ skip the two words at 0xe000100c and 0xe0001010 + stmia r4!,{r5-r6} @ zero the words at 0xe0001014 and 0xe0001018 + +@ Clear any possible pending SysTick interrupt status + mov r4,#0xe0000000 + add r4,r4,#0xed00 + mov r5,#1<<25 + str r5,[r4,#4] @ ICSR at e000ed04 + + isb sy + dsb sy + +@ Allow SysTick interrupts, depending on r0=0 or 1 input + mov r0,r0,lsl#1 + add r0,r0,#5 @ r0 = 5 (input 0) or 7 (input 1) + mov r4,#0xe000e000 + str r0,[r4,#0x10] @ SysTick CSR + + gpioput 16,1,r4,r5 @ ADC trigger high (starts power trace capture) + +@ Shift arguments down to remove systick argument +.if \numargs>=1 + mov r0,r1 +.if \numargs>=2 + mov r1,r2 +.if \numargs>=3 + mov r2,r3 +.endif +.endif +.endif + +@ Set registers r\numargs - r12 to definite values + flush_regs \numargs +@ Set r3 back to non-sentinel value in case the test program never changes r3 or r12 which would confuse the auto-detect of start/end + mov r3,#0 + +.endm + +@ numreturn is the number of return values, assumed to be 0 or 1 +.macro postwrap numreturn + gpioput 16,0,r1,r2 @ ADC trigger low + flush_regs \numreturn + mov r1,#0xe000e000 + mov r2,#4 + str r2,[r1,#0x10] @ Disable SysTick + ldr r2,[r1,#0x18] @ read the word at 0xe000e018; stored to lastsystickcvr below + ldr r1,=lastsystickcvr + str r2,[r1] + +@ Get final DWT cycle count + ldr r1,=0xe0001000 + ldr r2,[r1,#4] @ read the word at 0xe0001004; stored to lastdwtcount below + ldr r1,=lastdwtcount + str r2,[r1] + + pop {r4-r12,r15} +.endm + .if INCLUDE_CRYPT_CTR .balign 4 @@ -1456,143 +1939,220 @@ 
cbc_decrypt_s: @ r1: buf @ r2: n, number of blocks, n>0 .if CT_BPERM -@ In AES-CTR each block can be independently en/decrypted as the encryption only depends on -@ the IV, the key, and the block number. We can therefore process them in any order. Hence -@ we generate all the residues mod u=2^k such that u≥n in a pseudo-random order using a linear conguential -@ generator (x_i+1 = a x_i + c mod u), and process the blocks in that order. We choose -@ x_0 and a randomly (subject to a=5 mod 8), as well as adding an overall random offset -@ to the sequence, which is equivalent to choosing a random c. -@ -@ For residues greater than or equal to n we "decrypt" an area of scratch -@ memory, taking the same time as a real decryption. The inefficiency -@ due to rounding up the number of blocks processed to the next power of -@ two is a factor of 2 in the worst case. -@ q.v. https://en.wikipedia.org/wiki/Linear_congruential_generator#m_a_power_of_2,_c_%E2%89%A0_0 +@ In AES-CTR each block can be independently en/decrypted as the encryption only depends on the IV, +@ the key, and the block number. We can therefore process them in any order, and using a +@ random order helps to defeat attacks that work on the output of the AES, since an attacker +@ wouldn't know what plaintext or ciphertext corresponds to a particular instruction. .endif + ctr_crypt_s: - GET_CANARY r3,CTAG0 - SET_COUNT 171 + +@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks + push {r0,r4-r11,r14} + + push {r0-r2} + .if CT_BPERM - push {r0,r1,r3,r4-r11,r14} - mvn r4,#0 - subs r5,r2,#1 @ make sure we generate optimal mask for n an exact power of 2 - clz r5,r5 - lsrs r4,r4,r5 @ mask m=2^k-1 s.t. 
m≥n - orrs r4,r4,#7 @ m≥7 - bl gen_rand - bic r5,r0,#7 - adds r5,r5,#5 @ multiplier a, randomly initialised, but make sure it is 5 mod 8 - bl gen_rand - mov r7,r0 @ initial block pointer x₀, randomly initialised - bl gen_rand - mov r8,r0 @ sequence offset, randomly initialised: this is equivalent to choosing a random c - mov r6,r4 -.else - push {r0,r3,r4-r11,r14} - movs r12,#0 -.endif - CHK_COUNT 171 +@ Initialise 32 random numbers (which fit in half-words) + ldr r4,=bperm_rand + movs r5,#32 1: - SET_COUNT 129 + bl gen_rand_sha + umull r0,r3,r0,r2 @ Random number between 0 and n-1 (n=#blocks) + strh r3,[r4],#2 + subs r5,r5,#1 + bne 1b +.endif + + bl randomisechaff + pop {r0-r2} + movs r3,#0 + +ctr_crypt_mainloop: +@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter + +@ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it) + push {r0-r2} + +@ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret) + + tst r3,#(REFCHAFF_PERIOD-1) + bne 1f + push {r3} + bl refreshchaff + pop {r3} + 1: + + tst r3,#(REMAP_PERIOD-1) + bne 1f + push {r3} + bl remap @ shuffle the LUts + pop {r3} + 1: + + tst r3,#(REFROUNDKEYSHARES_PERIOD-1) + bne 1f + push {r3} + bl ref_roundkey_shares_s @ refresh the round key shares + pop {r3} + 1: + + tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1) + bne 1f + push {r3} + bl ref_roundkey_hvperms_s @ refresh the round key vperms + pop {r3} + 1: + + pop {r0-r2} +@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter + +@ Now calculate r12 = block number-to-be-deciphered from r3 = block counter .if CT_BPERM - add r12,r7,r8 @ add sequence offset - and r12,r12,r4 @ get block pointer mod 2^k - cmp r12,r2 @ set C if beyond end of buffer - sbcs r3,r3,r3 @ r3==0xffffffff in buffer, 0x00000000 past end - uadd8 r3,r3,r3 @ set/clear all GE flags if in buffer/past end - ldr r1,[r13,#4] @ get buffer address from stack - add 
r1,r1,r12,lsl#4 @ calculate address of block - ldr r3,=ctr_scratch - sel r1,r1,r3 @ if beyond end of buffer, just process scratch area - ldr r0,[r13] @ get IV address from stack - push {r4-r8,r12} +@ Use a "swap-or-not" method to generate an "oblivious" permutation; see makeperm.py version 7 + push {r0,r1} + ldr r0,=murmur3_constants + ldmia r0,{r9-r12,r14} @ load five murmur3_32 hash constants + ldr r0,=bperm_rand + movs r1,#31 + movs r4,r3 @ r4=i +1: + ldrh r5,[r0],#2 @ r5=k + subs r5,r5,r4 @ r5=k-i + ands r6,r2,r5,asr#31 @ r6=n*(k-i<0) + adds r5,r5,r6 @ r5=j=(k-i)%n + adds r6,r4,r5 @ r6=i+j + subs r7,r4,r5 @ r7=i-j + and r8,r7,r7,asr#31 @ r8=min(i-j,0) + sub r7,r7,r8,lsl#1 @ r7=|i-j| + mla r6,r6,r2,r7 @ r6=n(i+j)+|i-j| + eors r6,r6,r1,lsl#27 @ mix with swap-or-not round counter to get different hash functions +@ Now do murmur3_32 hash of r6 + mul r6,r6,r9 + movs r6,r6,ror#17 + mul r6,r6,r10 + movs r6,r6,ror#19 + adds r6,r6,r6,lsl#2 + add r6,r6,r11 + eors r6,r6,#4 + eors r6,r6,r6,lsr#16 + mul r6,r6,r12 + eors r6,r6,r6,lsr#13 + mul r6,r6,r14 + eors r6,r6,r6,lsr#16 @ not actually used here +@ Now set i to j, conditional on the top bit of r6 + subs r7,r5,r4 @ r7=j-i + ands r7,r7,r6,asr#31 @ r7=(j-i)*(top bit of r6) + adds r4,r4,r7 @ r4=j if top bit of r6, else i + subs r1,r1,#1 + bpl 1b + pop {r0,r1} + mov r12,r4 .else - ldr r0,[r13] @ get IV address from stack - push {r12} + mov r12,r3 .endif - CHK_COUNT 129 + +@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered + push {r0-r3,r12} + +processIV: @ non-target label to assist power analysis + @ It is not clear if the following addition of the block number in r12 to the IV can usefully @ be done in terms of shares. Instead we do an addition and subtraction whose overall effect -@ is the same, and which provides a small degree of masking. The IV is not a secret anyway. - ldmia r0,{r4-r7} @ load IV - rev r7,r7 @ prepare for byte-big-endian, bit-little-endian (!)
addition - rev r6,r6 - rev r5,r5 - rev r4,r4 - bl gen_rand - bic r8,r0,#0x80000000 @ only 31 bits so we don't get any overflows in the following +@ is the same, and which provides a small degree of masking. The IV is not traditionally a secret, +@ though it will make it harder for the attacker if it is obscured. + bl gen_rand_sha + movs r8,r0,lsr#16 @ only use 16 low bits so we don't get any overflows in the following, and so that a carry from the first word is rare add r9,r8,r12 @ "masked" block number - adds r7,r7,r9 @ 128-bit addition - adcs r6,r6,#0 - adcs r5,r5,#0 - adcs r4,r4,#0 - subs r7,r7,r8 @ 128-bit subtraction, unmasking block number - sbcs r6,r6,r8,asr#31 - sbcs r5,r5,r8,asr#31 - sbcs r4,r4,r8,asr#31 - rev r7,r7 - rev r6,r6 - rev r5,r5 - rev r4,r4 - CHK_COUNT 130 - bl remap @ shuffle the LUts - CHK_COUNT 131 - bl ref_round_keys_s @ refresh the round keys - CHK_COUNT 132 - bl ns_to_s @ convert IV+x to shares - CHK_COUNT 133 - bl rounds_s @ forward AES rounds on IV+x - CHK_COUNT 134 - ldr r3,[r1] @ decrypt ciphertext +@ r8=random, r9=(block number)+r8, stack=IV,... + + ldr r0,[r13] @ peek at stack to restore r0=IV ptr + ldmia r0,{r4-r7} @ load IV + clear03 @ barrier to remove traces of IV from internal CPU load registers + push {r0-r3} @ We want to randomise the internal memory registers associated with the above LDM load, but this + pop {r0-r3} @ may come from non-scratch memory and have its own internal registers, so we clear it using a + @ stack save/load. Either R13 is in non-scratch memory, in which case this works, or it isn't, in + @ which case it doesn't matter, because the only subsequent use of non-scratch memory is the stack. + +@ Add in r9 in byte-big-endian, bit-little-endian (!) fashion, while trying to avoid rev operations +@ as far as possible as these tend to expose (via power fluctuations) byte-level hamming weights. +@ It's worth avoiding revs on r6, r5, r4, even at the cost of introducing a small timing dependency. 
+ +@ First do 128-bit addition of r9 to byte-reversed IV + rev r7,r7; adds r7,r7,r9; bcc 1f + rev r6,r6; adcs r6,r6,#0; rev r6,r6; bcc 1f + rev r5,r5; adcs r5,r5,#0; rev r5,r5; bcc 1f + rev r4,r4; adcs r4,r4,#0; rev r4,r4 +1: +@ At this point, r7 is reversed and r4-r6 are not +@ Now do 128-bit subtraction of r8 from byte-reversed IV + subs r7,r7,r8; rev r7,r7; bcs 1f + rev r6,r6; sbcs r6,r6,#0; rev r6,r6; bcs 1f + rev r5,r5; sbcs r5,r5,#0; rev r5,r5; bcs 1f + rev r4,r4; sbcs r4,r4,#0; rev r4,r4 +1: + clear01 16 + +@ r4-r7 = IV for the current block + bl ns_to_s @ convert IV+x to shares, which includes choosing and incorporating a random shareC + bl conjshareC @ Add the effect of shareC to lut_a, lut_b + bl rounds_s @ Do the 15 AES rounds on (key, state=IV+x), with the (shared) result in the state, R4-R11 + bl conjshareC @ Undo the effect of shareC from lut_a, lut_b +.if ST_VPERM + bl vpermundo @ Undo vperm on the state shares +.endif + + pop {r0-r3,r12} + push {r0,r3} +@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered + +@ Decrypt ciphertext using AES output in shares: r4-r11 +.if ST_SHAREC + ldr r0,=shareC + ldr r0,[r0] +.else + movs r0,#0 +.endif + add r1,r1,r12,lsl#4 @ Temporarily r1 points to block-to-be-deciphered + ldr r3,[r1] eors r3,r3,r4 - eors r3,r3,r8 + eors r3,r3,r8,ror#16 @ Now r4 and r8 are free + eors r3,r3,r0 str r3,[r1] ldr r3,[r1,#4] eors r3,r3,r5 - eors r3,r3,r9 + eors r3,r3,r9,ror#16 + eors r3,r3,r0 str r3,[r1,#4] ldr r3,[r1,#8] eors r3,r3,r6 - eors r3,r3,r10 + eors r3,r3,r10,ror#16 + eors r3,r3,r0 str r3,[r1,#8] ldr r3,[r1,#12] eors r3,r3,r7 - eors r3,r3,r11 + eors r3,r3,r11,ror#16 + eors r3,r3,r0 str r3,[r1,#12] - CHK_COUNT 135 -.if CT_BPERM - pop {r4-r8,r12} - muls r7,r7,r5 @ LCG step: x<-ax+1 - adds r7,r7,#1 - subs r6,r6,#1 - CHK_COUNT 136 - bcs 1b - pop {r0,r1,r3,r4-r11,r14} -.else - pop {r12} - adds r1,r1,#16 - add r12,r12,#1 - cmp r12,r2 - CHK_COUNT 136 - bne 1b - pop {r0,r3,r4-r11,r14} 
-.endif - CHK_COUNT 137 - CHK_CANARY r3,CTAG0 - bx r14 -.endif + sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer + + pop {r0,r3} @ Restore IV and block counter +@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter -.ltorg + adds r3,r3,#1 + cmp r3,r2 + bne ctr_crypt_mainloop + pop {r0,r4-r11,r15} -.thumb_func -aes_end: - nop +.endif + +.section .text.debugging,"ax",%progbits @@@@@@@@@@@@@@@@@@@@@@@@@ test functions @@@@@@@@@@@@@@@@@@@@@@@@@ @ .global test_v -@ .section .text.test_v,"ax",%progbits +@@ .section .text.test_v,"ax",%progbits @ .macro fn @ ldr.n r0,=0x12345678 @ ldr.n r0,=0xedcba987 @@ -1639,7 +2199,9 @@ aes_end: @ eor r7,r7,r11 @ bx r14 -.section .text.debugging,"ax",%progbits +.extern o8hex +.extern osp +.extern onl .thumb_func delay: @@ -1651,26 +2213,27 @@ delay: bcs delay bx r14 -.thumb_func -flush_reg: -@ put known values into r0-r3,r12 - mov r0, #0x80808080 - mov r1, #0x81818181 - mov r2, #0x82828282 - mov r3, #0x83838383 - mov r12,#0x8c8c8c8c - bx r14 .thumb_func isr_systick: - mov.w r2,#0xd0000000 @ set GPIO24 - mov.w r3,#0x01000000 - str r3,[r2,#24] - ldr r0,=systick_data + @ Stop SysTick counting + mov r0,#0xe000e000 + mov r1,#4 + str r1,[r0,#0x10] @ SysTick Control and Status Register + + @ Clear any possible pending SysTick interrupt status due to SysTick count timing out during its own handler + add r0,r0,#0xd00 + mov r1,#1<<25 + str r1,[r0,#4] @ ICSR at e000ed04 + + gpioput 24,1,r2,r3 @ set GPIO24 + + ldr r0,=systick_data ldr r1,[r0] adds r1,r1,#1 stmia r0!,{r1} + ldr r1,[r13,#0] @ r0..r2 ldr r2,[r13,#4] ldr r3,[r13,#8] @@ -1689,10 +2252,47 @@ isr_systick: @ RETPSR still in r3 stmia r0!,{r1-r3} - ldr r0,=0xe000e010 - mov r1,#5 - str r1,[r0] @ write to CSR - mov.w r2,#0xd0000000 - mov.w r3,#0x01000000 - str r3,[r2,#32] @ clear GPIO24 - bx r14 \ No newline at end of file +@ Store DWT counts CYCCNT, CPICNT, LSUCNT, FOLDCNT in sysdata[18-21] + ldr r1,=0xe0001004 + ldmia r1!,{r2,r3} + stmia r0!,{r2,r3} + add 
r1,r1,#8 + ldmia r1!,{r2,r3} + stmia r0!,{r2,r3} + + gpioput 24,0,r2,r3 @ clear GPIO24 + + bx r14 + +.balign 4 +.thumb_func +@ Takes SHA256 of 64-bits (r0,r1) and stores the result at memory pointed to by r2 (32 bytes) +@ This is used to generate random inputs (key and IV) to repeated instances of the crypt code. +@ These random numbers are mimicked in powerpair.py which can then analyse the effect of these random inputs on the power signal. +@ Preserves r0-r13 +gen_irand: + push {r0-r8,r14} + mov r8,r2 + ldr r4,=SHA256_BASE + movw r2,#(1<sw_lock[30] = 0xf; - flush_reg(); + // flush_reg(); ctr_crypt_s(iv, (void*)SRAM_BASE, data_size/16); - flush_reg(); + // flush_reg(); printf("Post decryption image begins with\n"); for (int i=0; i < 4; i++) From a7291e191663c9f351edc7bce7295d1a9e1a1c39 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Mon, 20 Jan 2025 11:16:50 +0000 Subject: [PATCH 04/20] Encorporated latest encryption code with 4-way shares Also switch to random default key --- bootloaders/encrypted/aes.S | 1646 ++++++++---------------- bootloaders/encrypted/config.h | 84 +- bootloaders/encrypted/enc_bootloader.c | 65 +- bootloaders/encrypted/otp.json | 236 ++-- bootloaders/encrypted/privateaes.bin | Bin 128 -> 128 bytes bootloaders/encrypted/update-key.cmake | 2 +- 6 files changed, 725 insertions(+), 1308 deletions(-) diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S index fb10d8745..d51605a4a 100644 --- a/bootloaders/encrypted/aes.S +++ b/bootloaders/encrypted/aes.S @@ -2,35 +2,16 @@ .cpu cortex-m33 .thumb +#include "config.h" #include "hardware/platform_defs.h" #include "hardware/regs/addressmap.h" #include "hardware/regs/sha256.h" +#include "hardware/rcp.h" -#include "config.h" - -.global delay -.global isr_systick -.extern systick_data - -.global gen_lut_inverse .global gen_lut_sbox -.if NEED_INV_ROUNDS -.global gen_lut_inv_sbox -.endif - -.if INCLUDE_ENCRYPT_CBC -.global cbc_encrypt_s -.endif -.if INCLUDE_DECRYPT_CBC -.global 
cbc_decrypt_s -.endif -.if INCLUDE_CRYPT_CTR .global ctr_crypt_s -.endif - .global remap .global gen_rand_sha -.global gen_irand .global init_key .global rkey_s @@ -38,27 +19,116 @@ .global lut_b,lut_b_map .global rstate_sha,rstate_lfsr -.if CT_BPERM -@ Use .data section here because everything is initialised to zero in a .bss section -.section .data.aes -.balign 16 -murmur3_constants: @ Five constants used in murmur3_32 hash -.word 0xcc9e2d51 -.word 0x1b873593 -.word 0xe6546b64 -.word 0x85ebca6b -.word 0xc2b2ae35 +@ RCP macros + +#define CTAG0 0x2a +#define CTAG1 0x2b +#define CTAG2 0x2c +#define CTAG3 0x2d @ not used +#define CTAG4 0x2e +#define CTAG5 0x30 +#define CTAG6 0x31 +#define CTAG7 0x32 +#define CTAG8 0x33 +#define CTAG9 0x34 +#define CTAG10 0x35 @ not used +#define CTAG11 0x36 +#define CTAG12 0x37 +#define CTAG13 0x38 +#define CTAG14 0x39 +#define CTAG15 0x3a +#define CTAG16 0x3b +#define CTAG17 0x3c +#define CTAG18 0x3d @ not used + +.macro SET_COUNT n +.if RC_COUNT +.if RC_JITTER + rcp_count_set \n +.else + rcp_count_set_nodelay \n +.endif +.endif +.endm + +.macro CHK_COUNT n +.if RC_COUNT +.if RC_JITTER + rcp_count_check \n +.else + rcp_count_check_nodelay \n +.endif +.endif +.endm + +.macro GET_CANARY rx,tag +.if RC_CANARY +.if RC_JITTER + rcp_canary_get \rx,\tag +.else + rcp_canary_get_nodelay \rx,\tag +.endif +.endif +.endm + +.macro CHK_CANARY rx,tag +.if RC_CANARY +.if RC_JITTER + rcp_canary_check \rx,\tag +.else + rcp_canary_check_nodelay \rx,\tag +.endif +.endif +.endm + +.macro GET_CANARY_NJ rx,tag @ with no jitter even if you ask for it (for situations where it would otherwise slow things down a lot) +.if RC_CANARY + rcp_canary_get_nodelay \rx,\tag +.endif +.endm + +.macro CHK_CANARY_NJ rx,tag @ with no jitter even if you ask for it +.if RC_CANARY + rcp_canary_check_nodelay \rx,\tag .endif +.endm + +.macro clear03 offset=0 + getchaffaddress r0,\offset + ldmia r0,{r0-r3} +.endm + +.macro clear03_preserve_r3 offset=0 + getchaffaddress 
r0,\offset + ldmia r0!,{r1-r2} + ldmia r0!,{r1-r2} +.endm -@ Put workspace in the second scratch area (was .section .bss.aes) -.section .scratch_y.aes +.macro clear01 offset=0 + getchaffaddress r0,\offset + ldmia r0,{r0,r1} +.endm + +@ Put workspace in the second scratch area +@ The "a"=allocatable attribute (and possibly the %progbits attribute) are necessary to store the murmur3 constants, +@ otherwise they may end up silently replaced with 0 or 0xffffffff +.section .scratch_y.aes,"a",%progbits + +@ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress +@ (It seems ADR does not work, nor is it possible to assert that chaff==0x20081000) +@ getchaffaddress is used by clear03 and clear01 and other sensitive cases which require the first load to be a random one +@ chaff has to be 0 mod 16 for other reasons +.macro getchaffaddress rx,offset=0 +@ ldr \rx,=(chaff+\offset) + mov \rx,#(0x1000+\offset) + movt \rx,#0x2008 +.endm +chaff: +.space 48 -@ Regardless of configuration, the code uses a single 256-entry LUT. If both -@ encryption and decryption are enabled then this is a table of inverses -@ of GF(2⁸) field elements, from which both the S-box and inverse S-box -@ functions can be derived; otherwise it can be a simple inverse S-box -@ table. -@ In either case the LUT is represented as two shares, lut_a and lut_b, +@ Regardless of configuration, the code uses a single 256-entry LUT, +@ which is a simple S-box table. +@ The LUT is represented as two shares, lut_a and lut_b, @ whose values must be EORed. Furthermore, the contents of each share are @ scambled according to a 4-byte "map". 
The map comprises two bytes that @ are EORed into the addressing of the share, and two bytes that are @@ -111,33 +181,25 @@ shareC: @ 8 mod 16 .space 4 statevperm: @ 12 mod 16 .space 4 @ vperm state rotation: only last two bits are operational; other bits random +RKshareC: +.space 4 .balign 16 -chaff: @ Must be 0 mod 16; This will be filled with random numbers to do barrier loads -.space 48 + +.if CT_BPERM .balign 16 +murmur3_constants: @ Five constants used in murmur3_32 hash +.word 0xcc9e2d51 +.word 0x1b873593 +.word 0xe6546b64 +.word 0x85ebca6b +.word 0xc2b2ae35 +.endif -@ Put main code in first scratch area (was .section .text.aes,"ax",%progbits) +@ Put main code in first scratch area .section .scratch_x.aes,"ax",%progbits -.macro gpioput pin,state,reg1,reg2 - mov \reg1,#0xd0000000 - mov \reg2,#(1<<\pin) - str \reg2,[\reg1,#32-8*\state] -.endm - -.macro clear03 offset=0 - ldr r0,=(chaff+\offset) - ldmia r0,{r0-r3} -.endm - -.macro clear01 offset=0 - ldr r0,=(chaff+\offset) - ldmia r0,{r0,r1} - rev r0,r0 -.endm - .if GEN_RAND_SHA -@ random numbers using SHA256 hardware +@ we need SHA256_SUM0_OFFSET==8 (see note below) .if SHA256_SUM0_OFFSET!=8 .err .endif @@ -146,9 +208,13 @@ chaff: @ Must be 0 mod 16; This will be filled with ran @ Preserves r1-r13 .balign 4 gen_rand_sha: + push {r14} + GET_CANARY_NJ r14,CTAG1 push {r1-r3,r14} bl gen_rand_sha_nonpres - pop {r1-r3,r15} + pop {r1-r3,r14} + CHK_CANARY_NJ r14,CTAG1 + pop {r15} @ Return single random word in r0 @ Trashes r1-r3 @@ -205,11 +271,15 @@ gen_rand_sha_nonpres: .thumb_func .if !GEN_RAND_SHA gen_rand_sha: -.endif -gen_rand_lfsr: +gen_rand_lfsr: @ Not used + push {r14} + GET_CANARY_NJ r14,CTAG2 push {r1,r2,r14} bl gen_rand_lfsr_nonpres - pop {r1,r2,r15} + pop {r1,r2,r14} + CHK_CANARY_NJ r14,CTAG2 + pop {r15} +.endif @ Trashes r1,r2 @ 12 cycles including branch = 12 cycles/word @@ -219,103 +289,93 @@ gen_rand_sha_nonpres: .endif gen_rand_lfsr_nonpres: ldr r2,=rstate_lfsr - ldr r0,[r2] - ldr r1,=0x1d872b41 @ 
constant for a maximum-length sequence + ldmia r2,{r0-r1} @ r0=state_in, r1=0x1d872b41=constant for a maximum-length sequence and r1,r1,r0,asr#31 @ will we be shifting out a 1? keep the constant, otherwise 0 eor r0,r1,r0,lsl#1 str r0,[r2] bx r14 -@ Return two random words in r0,r1 -@ Trashes r2,r3 -@ 16 cycles including branch = 8 cycles/word -.balign 4 -gen_rand_lfsr2: +.macro loadlfsr ldr r2,=rstate_lfsr - ldmia r2,{r1,r3} @ r1=state_in, r3=0x1d872b41 = constant for a maximum-length sequence - and r0,r3,r1,asr#31; eor r0,r0,r1,lsl#1 @ Get new state r0 - and r1,r3,r0,asr#31; eor r1,r1,r0,lsl#1 @ Get new state r1 - str r1,[r2] - bx r14 + ldmia r2,{r0-r1} @ r0=lfsr_state, r1=lfsr_const=0x1d872b41 for a maximum-length sequence +.endm -@ Return four random words in r0-r3 -@ 27 cycles including branch = 6.75 cycles/word -.balign 4 -gen_rand_lfsr4: - push {r14} - ldr r14,=rstate_lfsr - ldmia r14,{r3,r14} @ r3=state_in, r14=0x1d872b41 = constant for a maximum-length sequence - and r0,r14,r3,asr#31; eor r0,r0,r3,lsl#1 @ Get new state r0 - and r1,r14,r0,asr#31; eor r1,r1,r0,lsl#1 @ Get new state r1 - and r2,r14,r1,asr#31; eor r2,r2,r1,lsl#1 @ Get new state r2 - and r3,r14,r2,asr#31; eor r3,r3,r2,lsl#1 @ Get new state r3 - ldr r14,=rstate_lfsr - str r3,[r14] - pop {r15} +.macro steplfsr + ands r3,r1,r0,asr#31 @ will we be shifting out a 1? keep the constant, otherwise 0 + eors r0,r3,r0,lsl#1 +.endm + +.macro savelfsr + str r0,[r2] +.endm .ltorg .balign 4 .thumb_func makesmallperm: - @ Make a uniformly random permutation of R0 bytes and stores the resulting byte array at R1 - @ Should be very uniform up to R0=10; maybe 11 or 12 are also OK. (10! 
<< 2^32) - @ To make it valid up to R0=256, move the bl gen_rand_sha inside the loop - @ Uses inside-out method (slightly more efficient variant of Fisher-Yates) - @ Trashes r0-r3 +@ Make a uniformly random permutation of R0 bytes and stores the resulting byte array at R1 +@ Should be very uniform up to R0=10; maybe 11 or 12 are also OK. (10! << 2^32) +@ To make it valid up to R0=256, move the bl gen_rand_sha inside the loop +@ Uses inside-out method (slightly more efficient variant of Fisher-Yates) +@ Trashes r0-r3 + push {r14} + GET_CANARY_NJ r14,CTAG4 push {r4-r6,r14} movs r4,r1 movs r6,r0 movs r1,#0 movs r2,#1 bl gen_rand_sha - 1: - @ r1,r2=i,i+1, i=0, 2, 4, ... +@ r1,r2=i,i+1, i=0, 2, 4, ... cmp r1,r6 beq 2f - + umull r0,r3,r0,r2 ldrb r5,[r4,r3] strb r5,[r4,r1] strb r1,[r4,r3] adds r1,r1,#2 - @ r2,r1=i,i+1, i=1, 3, 5, ... +@ r2,r1=i,i+1, i=1, 3, 5, ... cmp r2,r6 beq 2f - + umull r0,r3,r0,r1 ldrb r5,[r4,r3] strb r5,[r4,r2] strb r2,[r4,r3] adds r2,r2,#2 - + b 1b - + 2: - pop {r4-r6,r15} + pop {r4-r6,r14} + CHK_CANARY_NJ r14,CTAG4 + pop {r15} .balign 4 .thumb_func makeperm16: - @ Make a random permutation of 16 things using the inside-out method (slightly more efficient variant of Fisher-Yates) - @ Store it in the 16 bytes at perm16 - @ More efficient than calling makeperm with R0=16, R1=perm16 - fewer calls to gen_rand_sha - @ Trashes r0-r5 +@ Make a random permutation of 16 things using the inside-out method (slightly more efficient variant of Fisher-Yates) +@ Store it in the 16 bytes at perm16 +@ More efficient than calling makeperm with R0=16, R1=perm16 - fewer calls to gen_rand_sha +@ Trashes r0-r5 - push {r14} + GET_CANARY r0,CTAG5 + push {r0,r14} ldr r4,=perm16 bl gen_rand_sha_nonpres - - @ i=0 + +@ i=0 movs r1,#0 movs r2,#1 @ r1,r2=i,i+1 strb r1,[r4] - - @ i=1 + +@ i=1 adds r1,r1,#2 @ r1,r2=i+1,i umull r0,r3,r0,r1 ldrb r5,[r4,r3] @@ -323,14 +383,14 @@ makeperm16: strb r2,[r4,r3] 1: - @ i=2, 4, 6, 8 +@ i=2, 4, 6, 8 adds r2,r2,#2 @ r1,r2=i,i+1 umull 
r0,r3,r0,r2 ldrb r5,[r4,r3] strb r5,[r4,r1] strb r1,[r4,r3] - @ i=3, 5, 7, 9 +@ i=3, 5, 7, 9 adds r1,r1,#2 @ r1,r2=i+1,i umull r0,r3,r0,r1 ldrb r5,[r4,r3] @@ -339,19 +399,19 @@ makeperm16: strb r2,[r4,r3] bne 1b - @ refresh random number after extracting 10! from it - @ 10! and 16!/10! are both much less than 2^32, so the permutation will be extremely close to uniform +@ refresh random number after extracting 10! from it +@ 10! and 16!/10! are both much less than 2^32, so the permutation will be extremely close to uniform bl gen_rand_sha 1: - @ i=10, 12, 14 +@ i=10, 12, 14 adds r2,r2,#2 @ r1,r2=i,i+1 umull r0,r3,r0,r2 ldrb r5,[r4,r3] strb r5,[r4,r1] strb r1,[r4,r3] - @ i=11, 13, 15 +@ i=11, 13, 15 adds r1,r1,#2 @ r1,r2=i+1,i umull r0,r3,r0,r1 ldrb r5,[r4,r3] @@ -360,59 +420,34 @@ makeperm16: strb r2,[r4,r3] bne 1b - @ Finished making permutation - pop {r15} - -.balign 4 -.thumb_func -gen_lut_inverse: -@ set lut_a to be a table of GF(2⁸) inverses, using lut_b as temporary storage -@ return r0=lut_a, r1=lut_b - ldr r0,=lut_a - ldr r1,=lut_b -@ first set lut_a to be a table of antilogarithms, lut_b a table of logarithms - mov r2,#0 - strb r2,[r0] @ (*) - mov r3,#1 @ we maintain invariant that r2=log(r3) -1: - strb r2,[r0,r3] @ log table - strb r3,[r1,r2] @ antilog table - lsls r12,r3,#25 - it cs - eorcs r12,r12,#0x1b000000 @ multiply by x - eor r3,r3,r12,lsr#24 @ multiply by x+1 ("3"), which is a primitive element - add r2,r2,#1 - cmp r2,#255 - bls 1b - movs r2,#255 -1: - ldrb r3,[r0,r2] @ for each i≠0, find log,... - eor r3,r3,#255 @ ... negate... - ldrb r3,[r1,r3] @ ... 
and antilog to get inverse - strb r3,[r0,r2] - subs r2,r2,#1 - bne 1b @ note that inverse(0)=0 by (*) above + pop {r0,r14} + CHK_CANARY r0,CTAG5 bx r14 .balign 4 .thumb_func remap: @ do a random remap of the LUTs -@ preserves r0-r11 - push {r0-r11,r14} +@ preserves r0-r11; trashes r12 + GET_CANARY r12,CTAG6 + push {r0-r12,r14} bl gen_rand_sha_nonpres ldr r1,=lut_a bl remap_1 bl gen_rand_sha_nonpres ldr r1,=lut_b bl remap_1 - pop {r0-r11,r15} + pop {r0-r12,r14} + CHK_CANARY r12,CTAG6 + bx r14 + remap_1: @ r0: B0:xa B1:xb B2:ya B3:yb @ r1: array of 256 bytes, followed by a 4-byte map @ shuffle LUT share array such that new[i]=old[i^xa^xb]^ya^yb, update map according to r0 - push {r14} + GET_CANARY_NJ r6,CTAG7 + push {r6,r14} mov r14,0x01010101 ubfx r6,r0,#16,#8 ubfx r7,r0,#24,#8 @@ -455,12 +490,13 @@ remap_1: str r8,[r1,r3] subs r2,r2,#4 bpl 1b - pop {r15} - + pop {r6,r14} + CHK_CANARY_NJ r6,CTAG7 + bx r14 .if RK_ROR -@ "refresh" shares of rkeys by random eor into both shares of each word +@ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC @ Trashes r0-r12 @ If i = word number 0..3, @ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then @@ -472,42 +508,55 @@ remap_1: ref_roundkey_shares_s: mov r11,#15 @ there are 15 expanded keys ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds - push {r14} ldr r4,=rkey_s + loadlfsr + steplfsr @ r0=change in RKshareC + adr r2,RKshareCchange + str r0,[r2] + ldr r3,=RKshareC + ldr r5,[r3] + eors r5,r5,r0 + str r5,[r3] + @ r0=lfsr_state, r1=lfsr_const, r4=roundkey_ptr, r11=roundcounter + ref_roundkey_shares_s_loop: ldmia r4!,{r5-r8,r10} @ r5-r8 = rkey shareA, r10=X_A=vperm+rotations of rkey shareA -@ ldr r0,=chaff -@ and r1,r11,#7 -@ add r0,r0,r1,lsl#2 -@ ldmia r0,{r0-r3} - ldr r12,[r4,#16] @ r12 = X_B=vperm+rotations of rkey shareB - mov r0,r12,lsr#30 - sub r9,r0,r10,lsr#30 @ r9 = 
vperm_B - vperm_A (|junk) - mov r0,r9,lsl#3 @ r0 = 8*(vperm_B - vperm_A) mod 32 - mov r12,r12,ror r0 - usub8 r12,r10,r12 @ r12 = X_A - (X_B ror r0) - bl gen_rand_lfsr4 - eors r5,r5,r0; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; ror r0,r0,r12; eor r10,r10,r0,ror#16; mov r12,r12,ror#8; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 - eors r6,r6,r1; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; ror r1,r1,r12; eor r10,r10,r1,ror#16; mov r12,r12,ror#8; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 - eors r7,r7,r2; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; ror r2,r2,r12; eor r10,r10,r2,ror#16; mov r12,r12,ror#8; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 - eors r8,r8,r3; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; ror r3,r3,r12; eor r10,r10,r3,ror#16; str r10,[r4,r9,lsl#2] + mov r2,r12,lsr#30 @ r2 = vpermB + sub r9,r2,r10,lsr#30 @ r9 = vpermB - vpermA (|junk) + mov r2,r9,lsl#3 @ r2 = 8*(vpermB - vpermA) mod 32 + mov r12,r12,ror r2 + usub8 r12,r10,r12 @ r12 = rotsA - (rotsB ror r2) + + @ r2,r3,r10=workspace, r0=lfsr_state, r1=lfsr_const, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=rotsA, r11=roundcounter, r12=rotdiff + steplfsr; eors r5,r5,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r6,r6,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2] + + ldr r3,RKshareCchange + movs r2,#0 + usub8 r10,r2,r10 + ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2 + ror r2,r3,r10; mov r10,r10,ror#8; eors r6,r6,r2 + ror r2,r3,r10; mov r10,r10,ror#8; eors r7,r7,r2 + ror r2,r3,r10; eors r8,r8,r2 + subs r4,r4,#20 stmia r4,{r5-r8} adds r4,r4,#40 subs r11,r11,#1 - -@ ldr 
r0,=chaff -@ add r1,r11,#3 -@ and r1,r1,#7 -@ add r0,r0,r1,lsl#2 -@ ldmia r0,{r0-r3} - + bne ref_roundkey_shares_s_loop + ldr r2,=rstate_lfsr @ restore rstate_lfsr + savelfsr @ Save lfsr_state clear03 24 ref_roundkey_shares_s_exit: - pop {r15} + bx r14 + .balign 4 +RKshareCchange: + .space 4 .balign 4 .thumb_func @@ -521,7 +570,8 @@ ref_roundkey_shares_s_exit: ref_roundkey_hvperms_s: movs r7,#30 ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - push {r14} + GET_CANARY r10,CTAG9 + push {r10,r14} ldr r10,=rkey_s ref_roundkey_hvperms_s_loop: bl gen_rand_lfsr_nonpres @ r0=new vperm high|rotations @@ -541,50 +591,58 @@ ref_roundkey_hvperms_s_loop: bne ref_roundkey_hvperms_s_loop clear03 28 ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code - pop {r15} + pop {r10,r14} + CHK_CANARY r10,CTAG9 + bx r14 .else -@ "refresh" shares of rkeys by random eor into both shares of each word +@ "refresh" shares of rkeys by random eor into both shares of each word, and also randomise the single word RKshareC @ Trashes r0-r11 .balign 4 .thumb_func ref_roundkey_shares_s: mov r11,#15 @ there are 15 expanded keys ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds - push {r14} + GET_CANARY r4,CTAG8 + push {r4,r14} ldr r4,=rkey_s + loadlfsr + steplfsr @ r0=change in RKshareC + ldr r3,=RKshareC + ldr r5,[r3] + eors r5,r5,r0 + str r5,[r3] + mov r10,r0 ref_roundkey_shares_s_loop: ldmia r4!,{r5-r9} @ r5-r8 = rkey shareA with vperm r9 -@ ldr r0,=chaff -@ and r1,r11,#7 -@ add r0,r0,r1,lsl#2 -@ ldmia r0,{r0-r3} - - ldr r10,[r4,#16] @ rkey shareB has a vperm of r10>>30 - mov r10,r10,lsr#30 - sub r9,r10,r9,lsr#30 @ r9 = vperm_B - vperm_A (|junk) - bl gen_rand_lfsr4 - eors r5,r5,r0; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; eor r10,r10,r0,ror#16; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 - eors r6,r6,r1; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; eor r10,r10,r1,ror#16; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 - 
eors r7,r7,r2; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; eor r10,r10,r2,ror#16; str r10,[r4,r9,lsl#2]; adds r9,r9,#1 - eors r8,r8,r3; ands r9,r9,#3; ldr r10,[r4,r9,lsl#2]; eor r10,r10,r3,ror#16; str r10,[r4,r9,lsl#2] + @ clear03: would need to do this with, say r2,r3,r12 (reloading r2 later) + + ldr r3,[r4,#16] @ rkey shareB has a vperm of r10>>30 + movs r3,r3,lsr#30 + sub r9,r3,r9,lsr#30 @ r9 = vperm_B - vperm_A (|junk) + @ r3,r12=workspace, r0=lfsr_state, r1=lfsr_const, r2=rstate_lfsr, r4=roundkeyB_ptr, r5-r8=roundkeyA, r9=vpermdiff, r10=RKshareCchange, r11=roundcounter + + steplfsr; eors r5,r5,r0; and r9,r9,#3; eors r5,r5,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r6,r6,r0; and r9,r9,#3; eors r6,r6,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r7,r7,r0; and r9,r9,#3; eors r7,r7,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 + steplfsr; eors r8,r8,r0; and r9,r9,#3; eors r8,r8,r10; ldr r3,[r4,r9,lsl#2]; eors r3,r3,r0,ror#16; str r3,[r4,r9,lsl#2] + subs r4,r4,#20 stmia r4,{r5-r8} adds r4,r4,#40 subs r11,r11,#1 - -@ ldr r0,=chaff -@ add r1,r11,#3 -@ and r1,r1,#7 -@ add r0,r0,r1,lsl#2 -@ ldmia r0,{r0-r3} - + + @ clear03: would need to do this with, say r3,r5-r8 + bne ref_roundkey_shares_s_loop + savelfsr clear03 24 ref_roundkey_shares_s_exit: - pop {r15} + pop {r4,r14} + CHK_CANARY r4,CTAG8 + bx r14 .balign 4 .thumb_func @@ -593,7 +651,8 @@ ref_roundkey_shares_s_exit: ref_roundkey_hvperms_s: movs r7,#30 ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - push {r14} + GET_CANARY r0,CTAG9 + push {r0,r14} bl gen_rand_lfsr_nonpres ldr r1,=rkey_s ref_roundkey_hvperms_s_loop: @@ -619,51 +678,39 @@ ref_roundkey_hvperms_s_loop: bne ref_roundkey_hvperms_s_loop clear03 28 ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code - pop {r15} + pop {r0,r14} + 
CHK_CANARY r0,CTAG9 + bx r14 .endif -.if NEED_VPERM -.balign 4 -.thumb_func -vpermundo: -@ Undo the effects of vperm rotation on share registers r4-r7, r8-r11 -@ Expect r1=statevperm (state rotations) on entry -@ Trashes r0-r3,r12 - push {r14} - ldr r1,=statevperm - ldr r2,[r1] - rsbs r0,r2,#0 - b vpermaddr0 - +.if ST_VPERM .balign 4 .thumb_func -refreshstatevperm: - -@ Rotate share registers r4-r7, r8-r11 (r4->r5-r6->r7->r4 etc.) by an addtional random amount and update the rotation at !r1 +@ Rotate share registers r4-r7, r8-r11 (r4->r5-r6->r7->r4 etc.) by an addtional amount +@ given in the bottom two bits of R0 and update the rotation recorded at statevperm. +@ On entry R1 must point to statevperm. @ Trashes r0-r3,r12 @ Maintains r4=rorig(4+(-!r1)%4), r5=rorig(4+(1-!r1)%4), ... @ r8=rorig(8+(-!r1)%4), r9=rorig(8+(1-!r1)%4), ... @ Note: only low 2 bits of !r1 are used. The rest are random to add to the noise. - - push {r14} - bl gen_rand_lfsr_nonpres - ldr r1,=statevperm +addstatevperm: ldr r2,[r1] -vpermaddr0: adds r2,r2,r0 str r2,[r1] - + ldr r1,=shareA ands r0,r0,#3; str r4,[r1,r0,lsl#2]; adds r0,r0,#1 ands r0,r0,#3; str r5,[r1,r0,lsl#2]; adds r0,r0,#1 ands r0,r0,#3; str r6,[r1,r0,lsl#2]; adds r0,r0,#1 ands r0,r0,#3; str r7,[r1,r0,lsl#2]; adds r0,r0,#1 ldmia r1,{r4-r7} - - ldr r12,=chaff @ Overwrite temperorary storage with random numbers - ldmia r12,{r2,r3,r12,r14} - stmia r1,{r2,r3,r12,r14} + + getchaffaddress r12 @ Overwrite temporary storage with random numbers + ldmia r12!,{r2,r3} + stmia r1!,{r2,r3} + ldmia r12!,{r2,r3} + stmia r1!,{r2,r3} ldr r1,=shareB ands r0,r0,#3; str r8, [r1,r0,lsl#2]; adds r0,r0,#1 @@ -671,20 +718,23 @@ vpermaddr0: ands r0,r0,#3; str r10,[r1,r0,lsl#2]; adds r0,r0,#1 ands r0,r0,#3; str r11,[r1,r0,lsl#2]; adds r0,r0,#1 ldmia r1,{r8-r11} - - ldr r12,=chaff+16 @ Overwrite temperorary storage with random numbers - ldmia r12,{r2,r3,r12,r14} - stmia r1,{r2,r3,r12,r14} -refreshstatevperm_exit: @ label exit point to be to able to specify 
to analysis code - pop {r15} + getchaffaddress r0,16 @ Overwrite temporary storage with random numbers + ldmia r0!,{r2,r3} + stmia r1!,{r2,r3} + ldmia r0!,{r2,r3} + stmia r1!,{r2,r3} + +addstatevperm_exit: @ label exit point to be to able to specify to analysis code + bx r14 .endif @ Switch from non-shared to shared state @ Trashes r0-r3,r12 .balign 4 ns_to_s: - push {r14} + GET_CANARY r12,CTAG11 + push {r12,r14} .if ST_SHAREC bl gen_rand_sha_nonpres @ Create state share C; all bytes the same ands r0,r0,#255 @@ -709,15 +759,14 @@ ns_to_s: eor r11,r12,r0,ror#16 .if ST_VPERM bl gen_rand_sha_nonpres -.endif ldr r1,=statevperm movs r2,#0 str r2,[r1] -.if ST_VPERM - b vpermaddr0 @ Tail call. Initialise state vperm with SHA RNG, refresh with LFSR RNG -.else - pop {r15} + bl addstatevperm @ Initialise state vperm with SHA RNG, refresh with LFSR RNG .endif + pop {r12,r14} + CHK_CANARY r12,CTAG11 + bx r14 @ Conjugate lut_a, lut_b with shareC @ I.e., EOR the input and output with shareC. @@ -739,8 +788,7 @@ conjshareC: str r2,[r1,#0x100] .endif bx r14 - -.if NEED_ROUNDS + .balign 4 .thumb_func shift_rows_s: @@ -793,67 +841,11 @@ shift_rows_s: eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta; ands r0,r0,#0xff00ff00 eors r10,r10,r0 - - eors r11,r11,r1 @ state[3]^=tb; - - clear01 @ barrier - bx r14 -.endif - -.if NEED_INV_ROUNDS -.balign 4 -.thumb_func -inv_shift_rows_s: -@ first half is the same as shift_rows; halves could be done in opposite order for tail chain - eors r0,r4,r6 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta; - lsrs r0,r0,#16 - lsls r0,r0,#16 - eors r4,r4,r0 - eors r6,r6,r0 - eors r0,r5,r7 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta; - lsrs r0,r0,#16 - lsls r0,r0,#16 - eors r5,r5,r0 - eors r7,r7,r0 - - eors r1,r7,r4 @ tb=state[3]^state[0]; tb&=0xff00ff00; - ands r1,r1,#0xff00ff00 - eors r0,r6,r7 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[3]^=ta; - ands r0,r0,#0xff00ff00 - eors r7,r7,r0 - eors 
r0,r5,r6 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[2]^=ta; - ands r0,r0,#0xff00ff00 - eors r6,r6,r0 - eors r0,r4,r5 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[1]^=ta; - ands r0,r0,#0xff00ff00 - eors r5,r5,r0 - eors r4,r4,r1 @ state[0]^=tb; - eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta; - lsrs r0,r0,#16 - lsls r0,r0,#16 - eors r8,r8,r0 - eors r10,r10,r0 - eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta; - lsrs r0,r0,#16 - lsls r0,r0,#16 - eors r9,r9,r0 - eors r11,r11,r0 + eors r11,r11,r1 @ state[3]^=tb; - eors r1,r11,r8 @ tb=state[3]^state[0]; tb&=0xff00ff00; - ands r1,r1,#0xff00ff00 - eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[3]^=ta; - ands r0,r0,#0xff00ff00 - eors r11,r11,r0 - eors r0,r9,r10 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[2]^=ta; - ands r0,r0,#0xff00ff00 - eors r10,r10,r0 - eors r0,r8,r9 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[1]^=ta; - ands r0,r0,#0xff00ff00 - eors r9,r9,r0 - eors r8,r8,r1 @ state[0]^=tb; + clear01 @ barrier bx r14 -.endif @ multiply polynomial over GF(2⁸) by c(x) = 0x03x³ + 0x01x² + 0x01x + 0x02 modulo x⁴+1 @ r0x00 is a register holding 0x00000000; r0x1b is a register holding 0x1b1b1b1b @@ -893,7 +885,6 @@ inv_shift_rows_s: eors \rx,\rt,\rw,ror#8 @ 14x ^ 9x ROL #8 ^ 13x ROL #16 ^ 11x ROL #24 .endm -.if NEED_ROUNDS .balign 4 .thumb_func @ Trashes r0-r3,r12 @@ -912,113 +903,39 @@ mix_cols_s: mixcol r11,r0,r1,r2,r3 ldmia r12!,{r0,r1} @ overwrite sensitive shareB-related quantities r0,r1 with random numbers bx r14 -.endif - -.if NEED_INV_ROUNDS -.balign 4 -.thumb_func -inv_mix_cols_s: - push {r14} - mov r12,#0x00000000 - mov r14,#0x1b1b1b1b - invmixcol r4 ,r0,r1,r2,r3,r12,r14 @ apply invmixcol to each state word - invmixcol r5 ,r0,r1,r2,r3,r12,r14 - invmixcol r6 ,r0,r1,r2,r3,r12,r14 - invmixcol r7 ,r0,r1,r2,r3,r12,r14 - invmixcol r8 ,r0,r1,r2,r3,r12,r14 - invmixcol r9 ,r0,r1,r2,r3,r12,r14 - invmixcol r10,r0,r1,r2,r3,r12,r14 - 
invmixcol r11,r0,r1,r2,r3,r12,r14 - pop {r15} -.endif - -.if SBOX_VIA_INV -@ bytewise EOR-convolution with constant 0x1f -.macro conv_0x1f rx,rt,ru - eors \rt,\rx,\rx,ror#31 @ t=x^ROL(x,1); - eors \rt,\rt,\rt,ror#30 @ t=t^ROL(t,2); - eors \rt,\rt,\rx,ror#28 @ t=t^ROL(x,4); @ convolution with byte boundaries "trashed" - ands \ru,\rx,#0xf0f0f0f0 @ u=x&0xf0f0f0f0; - eors \ru,\ru,\ru,ror#31 @ u=u^ROL(u,1); - eors \ru,\ru,\ru,ror#30 @ u=u^ROL(u,2); - ands \ru,\ru,#0x87878787 @ u=u&0x87878787; @ compensation for trashing - eors \ru,\ru,\ru,ror#24 @ u=u^ROL(u,8); - eors \rx,\rt,\ru,ror#7 @ t^=ROR(u,7); @ with trashing fixed -.endm - -@ bytewise EOR-convolution with constant 0x4a -.macro conv_0x4a rx,rt,ru - eors \rt,\rx,\rx,ror#30 @ t=x^ROL(x,2); - eors \rt,\rt,\rx,ror#27 @ t=t^ROL(x,5); - ands \ru,\rx,#0xf8f8f8f8 @ u=x&0xf8f8f8f8; - eors \ru,\ru,\ru,ror#29 @ u=u^ROL(u,3); - ands \ru,\ru,#0xc7c7c7c7 @ u=u&0xc7c7c7c7; - eors \ru,\ru,\ru,ror#24 @ u=u^ROL(u,8); - eors \rt,\rt,\ru,ror#6 @ t^=ROR(u,6); - ands \ru,\rt,#0x80808080 @ t=rorbytes(t,7); - uadd8 \rt,\rt,\rt - orrs \rx,\rt,\ru,lsr#7 -.endm - -.balign 4 -.thumb_func -map_sbox_s: @ (we're currently still under .if SBOX_VIA_INV) version of map_sbox_x that uses lutmap_state_s as a lookup into a table of inverses - push {r14} - bl lutmap_state_s @ the S-box function is an inverse followed by an affine transformation: - conv_0x1f r4 ,r0,r1 @ see https://en.wikipedia.org/wiki/Rijndael_S-box - conv_0x1f r5 ,r0,r1 - conv_0x1f r6 ,r0,r1 - conv_0x1f r7 ,r0,r1 - conv_0x1f r8 ,r0,r1 - conv_0x1f r9 ,r0,r1 - conv_0x1f r10,r0,r1 - conv_0x1f r11,r0,r1 - eor r4 ,r4 ,#0xcacacaca @ scramble the shares slightly: 0x63=0xca^0xa9 etc. 
- eor r5 ,r5 ,#0xf5f5f5f5 - eor r6 ,r6 ,#0x0c0c0c0c - eor r7 ,r7 ,#0xa2a2a2a2 - eor r8 ,r8 ,#0xa9a9a9a9 - eor r9 ,r9 ,#0x96969696 - eor r10,r10,#0x6f6f6f6f - eor r11,r11,#0xc1c1c1c1 - pop {r15} - -.if NEED_INV_ROUNDS -.balign 4 -.thumb_func -inv_map_sbox_s: @ version that computes via tables of inverses - push {r14} @ similarly, the inverse S-box is an affine transformation followed by an inverse - conv_0x4a r4 ,r0,r1 - conv_0x4a r5 ,r0,r1 - conv_0x4a r6 ,r0,r1 - conv_0x4a r7 ,r0,r1 - conv_0x4a r8 ,r0,r1 - conv_0x4a r9 ,r0,r1 - conv_0x4a r10,r0,r1 - conv_0x4a r11,r0,r1 - eor r4 ,r4 ,#0xd1d1d1d1 @ scramble the shares slightly: 0x05=0xd1^0xd4 etc. - eor r5 ,r5 ,#0x94949494 - eor r6 ,r6 ,#0xfcfcfcfc - eor r7 ,r7 ,#0x3a3a3a3a - eor r8 ,r8 ,#0xd4d4d4d4 - eor r9 ,r9 ,#0x91919191 - eor r10,r10,#0xf9f9f9f9 - eor r11,r11,#0x3f3f3f3f - bl lutmap_state_s - pop {r15} -.endif - -.else .balign 4 .thumb_func gen_lut_sbox: @ gen_lut_sbox sets both lut_a and lut_b to the S-box table and @ returns r0=lut_a+256, r1=lut_b+256 - push {r14} - bl gen_lut_inverse @ first generate the table of inverses in lut_a - @ At this point r0=lut_a, r1=lut_b, lut_a[] contains inverses and lut_b[] contains other stuff - mov r14,#256 +@ first set lut_a to be a table of GF(2⁸) inverses, using lut_b as temporary storage + ldr r0,=lut_a + ldr r1,=lut_b +@ first set lut_a to be a table of antilogarithms, lut_b a table of logarithms + mov r2,#0 + strb r2,[r0] @ (*) + mov r3,#1 @ we maintain invariant that r2=log(r3) +1: + strb r2,[r0,r3] @ log table + strb r3,[r1,r2] @ antilog table + lsls r12,r3,#25 + it cs + eorcs r12,r12,#0x1b000000 @ multiply by x + eor r3,r3,r12,lsr#24 @ multiply by x+1 ("3"), which is a primitive element + add r2,r2,#1 + cmp r2,#255 + bls 1b + movs r2,#255 +1: + ldrb r3,[r0,r2] @ for each i≠0, find log,... + eor r3,r3,#255 @ ... negate... + ldrb r3,[r1,r3] @ ... 
and antilog to get inverse + strb r3,[r0,r2] + subs r2,r2,#1 + bne 1b @ note that inverse(0)=0 by (*) above +@ At this point r0=lut_a, r1=lut_b, lut_a[] contains inverses and lut_b[] contains other stuff + mov r12,#256 1: ldrb r2,[r0] eors r3,r2,r2,lsl#1 @ convolve byte with 0x1f @@ -1028,29 +945,9 @@ gen_lut_sbox: eor r2,r2,#0x63 @ and add 0x63 strb r2,[r0],#1 @ let lut_a[i]=sbox[i] strb r2,[r1],#1 @ let lut_b[i]=sbox[i] - subs r14,r14,#1 + subs r12,r12,#1 bne 1b - pop {r15} - -.if NEED_INV_ROUNDS -.balign 4 -.thumb_func -gen_lut_inv_sbox: -@ set lut_a to the inverse S-box table - push {r14} - bl gen_lut_sbox @ get the forwards S-box - sub r0,r0,#256 - sub r1,r1,#256 - mov r2,#0 -1: - ldrb r3,[r1],#1 @ get y=S-box(x)... - strb r2,[r0,r3] @ ... and store x at location y - adds r2,r2,#1 - cmp r2,#255 - bls 1b - pop {r15} -.endif -.endif + bx r14 @ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups) .macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3 @@ -1068,25 +965,14 @@ gen_lut_inv_sbox: orr \Rtarg,\Rspare0,\Rspare2,lsl#16 .endm -@ if we are using direct S-box lookup then [inv_]map_sbox_s is the same as lutmap_state_s -.if !SBOX_VIA_INV +@ map all bytes of the state through the split LUT, lut_a and lut_b +@ Trashes r0-r3,r12 .balign 4 .thumb_func map_sbox_s: -.if NEED_INV_ROUNDS -.thumb_func -inv_map_sbox_s: -.endif -.endif - -@ lutmap_state_s maps all bytes of the state through the split LUT, lut_a and lut_b -@ This is either the whole of map_sbox_s (if SBOX_VIA_INV=0), or (if SBOX_VIA_INV=1) it's a subroutine called by map_sbox_s -@ Trashes r0-r3,r12 -.balign 4 -lutmap_state_s: + GET_CANARY r12,CTAG12 + push {r12,r14} - push {r14} - ldr r0,=shareA @ Write out state share A to memory stmia r0,{r4-r7} clear03 @ barrier @@ -1096,7 +982,7 @@ lutmap_state_s: clear03 4 @ barrier bl makeperm16 @ Rebuild random 16-way permutation. 
Maybe do this less frequently - @ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation +@ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation ldr r8,=lut_a ldr r9,=lut_b @@ -1108,7 +994,7 @@ lutmap_state_s: eors r2,r1,r1,lsr#8 uxtb r11,r2 @ R11 = a0^a1^b0^b1 movs r12,r1,lsr#16 @ R12 = c0^d0 | (c1^d1)<<8 - + ldr r4,=perm16 ldr r5,=shareA ldr r6,=shareB @@ -1131,30 +1017,24 @@ lutmap_state_s: strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^c1^d1 bpl 1b clear03 8 @ barrier - + ldmia r6,{r8-r11} @ Read state share B back from memory clear03 12 @ barrier ldmia r5,{r4-r7} @ Read state share A back from memory clear03 16 @ barrier @ Refresh state shares because luts only give imperfect share-by-value - bl gen_rand_lfsr4 - eors r4,r4,r0; mov r12,#0; eors r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc - eors r5,r5,r1; mov r12,#0; eors r9,r9,r1,ror#16 - eors r6,r6,r2; mov r12,#0; eors r10,r10,r2,ror#16 - eors r7,r7,r3; mov r12,#0; eors r11,r11,r3,ror#16 - pop {r15} + loadlfsr + steplfsr; eors r4,r4,r0; mov r12,#0; eors r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc + steplfsr; eors r5,r5,r0; mov r12,#0; eors r9,r9,r0,ror#16 + steplfsr; eors r6,r6,r0; mov r12,#0; eors r10,r10,r0,ror#16 + steplfsr; eors r7,r7,r0; mov r12,#0; eors r11,r11,r0,ror#16 + savelfsr -.macro jitter rx -.if IK_JITTER - rors \rx,\rx,#1 - bcc \@f -\@: -.else -@ nothing -.endif -.endm + pop {r12,r14} + CHK_CANARY r12,CTAG12 + bx r14 .balign 4 .thumb_func @@ -1162,7 +1042,8 @@ randomisechaff: @ Randomise 48 bytes of chaff values (random load values) @ Uses 12 bytes of permscratch @ Trashes r0-3 - push {r14} + GET_CANARY r0,CTAG13 + push {r0,r14} movs r0,#12 ldr r1,=permscratch bl makesmallperm @ Store the random words in a random order to make 2nd order 
attacks harder @@ -1173,18 +1054,21 @@ randomisechaff: pop {r1} ldr r2,=permscratch ldrb r2,[r2,r1] - ldr r3,=chaff + getchaffaddress r3 str r0,[r3,r2,lsl#2] subs r1,r1,#1 bpl 1b - pop {r15} + pop {r0,r14} + CHK_CANARY r0,CTAG13 + bx r14 .balign 4 refreshchaff: @ Update 48 bytes of chaff values (random load values) using faster RNG than used for randomisechaff @ Uses 12 bytes of permscratch @ Trashes r0-3,12 - push {r14} + GET_CANARY r0,CTAG14 + push {r0,r14} movs r0,#12 ldr r1,=permscratch bl makesmallperm @ Update the random words in a random order to make 2nd order attacks harder @@ -1201,14 +1085,17 @@ refreshchaff: str r0,[r3,r2,lsl#2] subs r1,r1,#1 bpl 1b - pop {r15} + pop {r0,r14} + CHK_CANARY r0,CTAG14 + bx r14 .balign 4 .thumb_func @ Do sbox on the four bytes of the 4-way share r4-r7 @ Trashes r0,r8-r12 init_key_sbox: - push {r1-r3,r14} + GET_CANARY r12,CTAG15 + push {r1-r3,r12,r14} bl gen_rand_sha_nonpres; mov r8,r0 bl gen_rand_sha_nonpres; mov r9,r0 bl gen_rand_sha_nonpres; mov r10,r0 @@ -1220,7 +1107,7 @@ init_key_sbox: movs r5,#0 movs r6,#0 movs r7,#0 - + bl randomisechaff @ Randomise block of memory mainly used for obscuring loads movs r0,#4 @@ -1251,45 +1138,45 @@ init_key_sbox: uxtb r11,r4 @ R11 = a0^a1^b0^b1 eor r10,r10,r11,lsl#8 @ R10 = a0^a1 | (a0^a1^b0^b1)<<8 movs r12,r1,ror#16 @ R12 = c0^d0 | (c1^d1)<<8 | junk<<16 | junk<<24 - + ldr r1,=permscratch ldr r11,=chaff - @ Using r1=permutedfourwaypointer, r2=lut_a, r3=lut_b, r7,r8=randomness, r10=(a0^a1)|(a0^a1^b0^b1)<<8, r11=chaff, r12=(c0^d0)|(c1^d1)<<8|junk +@ Using r1=permutedfourwaypointer, r2=lut_a, r3=lut_b, r7,r8=randomness, r10=(a0^a1)|(a0^a1^b0^b1)<<8, r11=chaff, r12=(c0^d0)|(c1^d1)<<8|junk 1: ands r5,r1,#12 adds r5,r11,r5 @ Align chaff address to r1 ldr r6,[r1],#4 @ r6 = fourway + perm[i] (i=0-3, loop iteration) ldr r5,[r5] @ Random load to mask previous load - + ands r9,r6,#12 @ r9 = chaff address aligned to r6 mod 16 add r9,r11,r9 ldrb r4,[r6,#0] ldr r14,[r9,#0] @ Random load to mask 
previous load eor r4,r4,r10 eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - + ldrb r5,[r6,#4] ldr r14,[r9,#4] @ Random load to mask previous load eors r4,r4,r5 eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - + ldrb r5,[r6,#8] ldr r14,[r9,#8] @ Random load to mask previous load eors r4,r4,r5 eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - + ldrb r5,[r6,#12] ldr r14,[r9,#12] @ Random load to mask previous load eors r4,r4,r5 @ r4 = unsharedbyte[perm[i]]^a0^a1 | junk eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 - + ands r14,r4,#255 ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] and r14,r4,#15 add r14,r14,#32 ldrb r14,[r11,r14] @ Random load to mask previous load (r2 and r11 are both 0 mod 16) eors r5,r5,r12 @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1]^c0^d0 | junk<<8 | junk<<16 | junk<<24 - @ split r5 into two shares and store at [r6,#0] and [r6,#4] +@ split r5 into two shares and store at [r6,#0] and [r6,#4] strb r7,[r6,#0] eors r5,r5,r7 strb r5,[r6,#4] @@ -1304,7 +1191,7 @@ init_key_sbox: add r4,r11,#24 ldrb r14,[r4,r14] @ Random load to mask previous load (r3==8 and r11==0 mod 16) eor r5,r5,r12,ror#8 @ r5 = lut_b[unsharedbyte[perm[i]]^b0^b1]^c1^d1 | junk<<8 | junk<<16 | junk<<24 - @ split r5 into two shares and store at [r6,#8] and [r6,#12] +@ split r5 into two shares and store at [r6,#8] and [r6,#12] strb r8,[r6,#8] eors r5,r5,r8 strb r5,[r6,#12] @@ -1318,21 +1205,24 @@ init_key_sbox: ldr r0,=fourway ldmia r0,{r4-r7} @ Load SBOXed values back into register r4-r7 ldmia r11,{r8-r12,r14} @ Random load to mask previous load and to obfuscate registers - - pop {r1-r3,r15} + + pop {r1-r3,r12,r14} + CHK_CANARY r12,CTAG15 + bx r14 .balign 4 .thumb_func @ r1 = pointer to 4 x 4-way share (16 words); left unchanged @ r3 = rkey_s+40*roundkeynumber; advanced by 40 -@ Trashes r8-r11 +@ Trashes r8-r12 @ If i = word number 0..3, @ Aptr=memory word pointer to block of 20 bytes containing H&V-rotated share A roundkey (similarly B), then @ 
vpermA=Aptr[4]>>30, vpermB=Bptr[4]>>30, and -@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror ((i+vpermA mod 4)^th byte of Aptr[4]) -@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror ((i+vpermB mod 4)^th byte of Bptr[4])+16 +@ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4]) +@ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of Bptr[4])+16 storeroundkey: - push {r2,r14} + GET_CANARY r8,CTAG16 + push {r2,r8,r14} @ eor two 4-way share components to make a component of a 2-way share @ Note that we load from 4-way share at a random address then convert to 2-way share and @@ -1377,10 +1267,13 @@ storeroundkey: usub8 r2,r2,r0 @ r2=-hperms .endif mov r9,#4 + ldr r12,=RKshareC + ldr r12,[r12] 1: and r8,r8,#3 adds r0,r1,r8,lsl#4 ldmia r0,{r10,r11} + eor r10,r10,r12 @ Mix in RKshareC into round key shareB .if RK_ROR mov r10,r10,ror r2 mov r11,r11,ror r2 @@ -1397,95 +1290,46 @@ storeroundkey: subs r1,r1,#8 @ Restore r1 = (r1 on entry) adds r3,r3,#4 @ Set r3 = (r3 on entry) + 40 - pop {r2,r15} + pop {r2,r8,r14} + CHK_CANARY r8,CTAG16 + bx r14 .balign 4 .thumb_func init_key: -@ r0: rkeys_s (this input is ignored because it's defined here in the assembler file) -@ r1: raw key data (32 bytes) -@ rkeys_s is a 40*15=600-byte region -@ each of the 15 round keys is represented as two 4-word regions rka[0..3] and rkb[0..3] (each of which is followed by a zero word), -@ such that rk[i]=rka[i-r]^(rkb[i-r] ROR#16) gives the round keys, where r=!vpermkeyrot and i-r is interpreted in the relevant range, and i-r specifies mod 4 - - push {r4-r11,r14} - -.if IK_JITTER - push {r0} - bl gen_rand_sha - mov r12,r0 - pop {r0} -.endif - jitter r12 - - mov r5,r1 @ Here and for the rawkey reading loop, R5=raw key data - - jitter r12 - - @ Make lots of small perms so that it's harder for attacker to correlate permutation creation steps with the permutation's use - @ Can use rkey_s space because it won't be used before init_key_expandloop - ldr 
r1,=rkey_s - movs r2,#64 -1: - movs r0,#8 - push {r1,r2} - bl makesmallperm @ make a random permutation of 8 things (to randomise reading of key words) - pop {r1,r2} - adds r1,r1,#8 - subs r2,r2,#1 - bne 1b - bl gen_rand_sha_nonpres @ Choose a random one of these 64 to use - ands r0,r0,#63 - ldr r1,=rkey_s - adds r7,r1,r0,lsl#3 - -init_key_loadrawkey: - +@ On entry, r0 points to 4-way shared raw key data (128 bytes) +@ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7 +@ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K. +@ +@ On exit, rkeys_s, a 40*15=600-byte region, is filled as follows. +@ Each of the 15 round keys is represented as two 5-word regions rka[0..4] and rkb[0..4], +@ each of which consists of 4 words of round key followed by a word encoding vperm and rotation (RK_ROR) information. +@ In addition a common share word, RKshareC, is set randomly. +@ For a given round, rk[i] = the i^th word of the actual round key is given by: +@ vpermA=rka[4]>>30 +@ vpermB=rkb[4]>>30 +@ rka_unrot[i] = rka[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of rka[4]) +@ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16 +@ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC + + GET_CANARY r12,CTAG17 + push {r4-r11,r12,r14} + + mov r5,r0 @ r5=4-way key input bl randomisechaff - -@ Loading the raw key and turning it into 4-way shares for round 0 and 1 - ldr r11,=chaff @ This needs to have 48 bytes of chaff - sub r0,r7,r11; ands r0,r0,#15; add r10,r11,r0 @ align r10 to r7 mod 16 (permutation array) - sub r0,r5,r11; ands r0,r0,#15; add r11,r11,r0 @ align r11 to r5 mod 16 (raw key data) - ldr r4,=rkey4way @ 128 byte scratch space for 4-way shares, laid out in words as a0 b0 c0 d0 a1 b1 c1 d1 ... 
a7 b7 c7 d7 - movs r6,#7 -@ r4=rkey4way, r5=rawkeydata, r6=loopcounter, r7=permutationarray, r10,r11=zeroarray (same mod 16 alignment as r7,r5 resp) -2: -@ Do calls to gen_rand_sha before we have sensitive values, so that gen_rand_sha doesn't push them on the stack - bl gen_rand_sha_nonpres; movs r8,r0 - bl gen_rand_sha_nonpres; movs r9,r0 - bl gen_rand_sha_nonpres; movs r1,r0 - bl gen_rand_sha @ r0,r1,r8,r9 are fresh random numbers - ldrb r12,[r10,r6] @ barrier to following load - ldrb r2,[r7,r6] @ r2 = perm8[r6] = which key word to load - ldrb r12,[r10,r6] @ barrier load to erase internal version of r2 - movs r14,r0,lsr#29 @ temporarily borrow some randomness to create a random address offset - ldr r12,[r11,r14,lsl#2] @ - ldr r3,[r11,r2,lsl#2] @ barrier to following load (random value, same memory bank) - ldr r3,[r5,r2,lsl#2] @ r3 = key word - ldr r12,[r11,r2,lsl#2] @ barrier load to erase internal version of r3 - ldr r12,[r11,r14,lsl#2] @ erase internal address - mov r14,#0 @ erase r14 - ldr r12,[r11,#32] - eor r12,r12,r12 - eors r9,r3,r8 @ extra care: sacrifice random r9 to further mask this operation - eors r3,r9,r0 @ r9=r0^r3^r8 (also has the effect of safely retiring the sensitive value r3) - eors r3,r3,r1 @ r9=r0^r1^r3^r8 so r0,r1,r8,r9 is a 4-way share of r3 - adds r2,r4,r2,lsl#4 - stmia r2,{r0,r1,r3,r8} @ Store 4-way share of this key word - movs r0,#0 @ Clear sensitive working values so they don't get used somehow (e.g., pushed onto the stack by gen_rand_sha) - movs r1,#0 - movs r2,#0 - movs r3,#0 + ldr r4,=rkey4way + movs r6,#8 +1: + ldmia r5!,{r0-r3} + stmia r4!,{r0-r3} subs r6,r6,#1 - bpl 2b - mov r8,#0 - mov r9,#0 - + bne 1b @ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for @ the 128-bit roundkeys 0 and 1, then expand from 2 to 15 roundkeys. 
- + bl gen_rand_sha_nonpres + ldr r12,=RKshareC + str r0,[r12] @ Make RKshareC random word ldr r3,=rkey_s @ r3=rkey_s ldr r1,=rkey4way @ r1=rkey4way bl storeroundkey @ Store round key 0 and advance r3 by 40 @@ -1495,7 +1339,7 @@ init_key_loadrawkey: ldmia r1!,{r4-r7} @ r4-r7 = 4-way share of previous round key word @ r1=rkey4way+128 on entry to main loop movs r2,#0 @ r2=word counter (0-51), offset from word 8 - + @ Note that r1-r3 are not sensitive values, so it's safe to stack @ them and conditionally branch on them. @@ -1511,10 +1355,10 @@ init_key_loadrawkey: @ a7 b7 c7 d7 a55 b55 c55 d55 init_key_expandloop: - @ r1 = pointer past one of eight 4-way shares of a roundkey word in the above cyclic buffer (r1=rkey4way+16i for i=1,...,8) - @ r2 = round key word counter (0-51), offset from word 8 (counting expanded roundkey words) - @ r3 = pointer to rkey_s+40*roundnumber = rkey_s+40*(2+[r2/4]) - @ r4-r7 = 4-way share of previous roundkey word +@ r1 = pointer past one of eight 4-way shares of a roundkey word in the above cyclic buffer (r1=rkey4way+16i for i=1,...,8) +@ r2 = round key word counter (0-51), offset from word 8 (counting expanded roundkey words) +@ r3 = pointer to rkey_s+40*roundnumber = rkey_s+40*(2+[r2/4]) +@ r4-r7 = 4-way share of previous roundkey word tst r2,#7 bne 1f @@ -1556,382 +1400,75 @@ init_key_expandloop: cmp r2,#52 bne init_key_expandloop - pop {r4-r11,r15} + pop {r4-r11,r12,r14} + CHK_CANARY r12,CTAG17 + bx r14 @ Add the round key shares pointed to by r12 into the state shares @ Trashes r0-r3 .balign 4 addrkey_s: - ldr r0,=statevperm - ldr r0,[r0] @ r0=vperm state rotation in bottom two bits - ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits - rsbs r3,r0,r1,lsr#30 - @ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot -.if RK_ROR - add r2,r12,#16 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r4,r4,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr 
r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r5,r5,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r6,r6,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r7,r7,r0 + ldr r0,=chaff @ guaranteed 0 mod 16 +.if ST_VPERM + ldr r3,=statevperm + ldr r3,[r3] @ r3=vperm state rotation in bottom two bits + ldr r2,[r0,#12] @ barrier load .else - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r4,r4,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r5,r5,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r6,r6,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r7,r7,r0 + movs r3,#0 .endif - adds r12,r12,#20 - - clear03 @ barrier to clear internal load registers - - ldr r0,=statevperm - ldr r0,[r0] @ r0=vperm state rotation in bottom two bits + bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits - rsbs r3,r0,r1,lsr#30 - @ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot + ldr r2,[r0,#16] @ barrier load + + rsbs r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot +@ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by -vpermstaterot +@ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr .if RK_ROR - add r2,r12,#16 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r8,r8,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r9,r9,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r10,r10,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; ldrb r1,[r2,r3]; rors r0,r0,r1; eors r11,r11,r0 + movs r0,r2,lsl#3 + movs r1,r1,ror r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; rors r0,r0,r1; eors r4,r4,r0; adds 
r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r5,r5,r0; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r6,r6,r0; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r7,r7,r0 .else - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r8,r8,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r9,r9,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r10,r10,r0; adds r3,r3,#1 - ands r3,r3,#3; ldr r0,[r12,r3,lsl#2]; eors r11,r11,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r4,r4,r0; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r5,r5,r0; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r6,r6,r0; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r7,r7,r0 .endif - adds r12,r12,#20 - - clear03 20 @ barrier to clear internal load registers - - bx r14 + clear03_preserve_r3 + add r12,r12,#20 + @ r0=chaff+16, r3=statevperm, r4-r11=state, r12=roundkeyBptr -.if NEED_ROUNDS - -@ perform encryption rounds -@ r4-r7, r8-r11: state -@ Trashes r0-r3,r12 -.balign 4 -rounds_s: - push {r14} - mov r2,#0 @ round counter -rounds_s_mainloop: - ldr r12,=rkey_s - add r12,r12,r2,lsl#5 @ pointer to key shares for this round - add r12,r12,r2,lsl#3 - push {r2} @ save round count - bl addrkey_s - bl map_sbox_s - bl shift_rows_s -.if ST_VPERM - ldmia r13,{r2} @ peek at stack to get round count - cmp r2,#NUMREFSTATEVPERM - bcs 1f - bl refreshstatevperm @ V shuffle of r4-r11 -1: -.endif - pop {r2} - adds r2,r2,#1 @ increment round counter - cmp r2,#14 - beq 2f @ break from loop? 
(last round has no mix_cols) - push {r2} - bl mix_cols_s - pop {r2} - b rounds_s_mainloop -2: - ldr r12,=rkey_s+14*40 @ final round key shares - bl addrkey_s - @eor r0,r4,r8;bl logword - @eor r0,r5,r9;bl logword - @eor r0,r6,r10;bl logword - @eor r0,r7,r11;bl logword - pop {r15} -.endif - -.if NEED_INV_ROUNDS -@ perform decryption rounds -@ r4-r7, r8-r11: state -@ preserves r0-r2 -.balign 4 -inv_rounds_s: - push {r0-r2,r14} - ldr r12,=rkey_s+14*40 @ final round key shares - bl addrkey_s - mov r2,#13 @ round counter - push {r2} -.if ST_VPERM - bl gen_rand_sha - bl vperm @ V shuffle - push {r0} -.endif - b 2f @ into middle of loop (last round has no mix_cols) -1: - push {r2} -.if ST_VPERM - bl gen_rand_sha - bl vperm @ V shuffle - push {r0} -.endif - bl inv_mix_cols_s -2: - bl inv_shift_rows_s - bl inv_map_sbox_s -.if ST_VPERM - pop {r0} - bl vperm @ undo V shuffle -.endif - pop {r2} - ldr r12,=rkey_s - add r12,r12,r2,lsl#5 @ pointer to key shares for this round - add r12,r12,r2,lsl#3 - bl addrkey_s - subs r2,r2,#1 - bpl 1b - pop {r0-r2,r15} -.endif - -.if INCLUDE_ENCRYPT_CBC -.balign 4 -.thumb_func -@ encrypt data in place -@ r0: ivec -@ r1: buf: starts with plaintext; ends up with ciphertext -@ r2: number of blocks -@ this implementation does not scramble the shares properly; consider a better implementation -@ if security is required in encryption -cbc_encrypt_s: - push {r4-r11,r14} - ldmia r0,{r4-r7} @ load iv into share a -2: - ldmia r1,{r8-r11} @ load plaintext into share b - bl rounds_s - eor r4,r4,r8 @ convert shared to non-shared - eor r5,r5,r9 - eor r6,r6,r10 - eor r7,r7,r11 - stmia r1!,{r4-r7} - subs r2,r2,#1 - bne 2b - pop {r4-r11,r15} -.endif - -.if INCLUDE_DECRYPT_CBC -.balign 4 -.thumb_func -@ decrypt data in place -@ r0: ivec -@ r1: buf -@ r2: number of blocks -@ return -@ r0=0 OK -@ r0=1: fault detected -@ could be simplified to use more ldmia:s at the cost of another 8 words of stack -cbc_decrypt_s: - push {r4-r11,r14} - ldmia r0,{r4-r7} @ load IV - 
bl ns_to_s - push {r4-r11} @ IV shares on the stack -2: - bl remap - bl ref_round_keys_s @ refresh the round keys - ldmia r1,{r4-r7} @ load the ciphertext - bl ns_to_s @ convert to shares - bl inv_rounds_s @ do decryption rounds - -.if ROUND_TRIP_TEST - -@ compute plaintext {r4-r7}^{r8-r11}^{SP[0..3]}^{SP[4..7]} -@ as shares {r4-r7}^{SP[0..3]}, {r8-r11}^{SP[4..7]} - ldrd r0,r3,[r13,#0] - eor r0,r0,r4 - eor r3,r3,r5 - strd r0,r3,[r13,#0] - ldrd r0,r3,[r13,#8] - eor r0,r0,r6 - eor r3,r3,r7 - strd r0,r3,[r13,#8] - ldrd r0,r3,[r13,#16] - eor r0,r0,r8 - eor r3,r3,r9 - strd r0,r3,[r13,#16] - ldrd r0,r3,[r13,#24] - eor r0,r0,r10 - eor r3,r3,r11 - strd r0,r3,[r13,#24] @ plaintext_s now on the stack - bl rounds_s @ restore original ciphertext (or we could have saved it) - - ldmia r1!,{r0,r3} @ reload actual ciphertext and compare to check for faults - eors r0,r0,r4 - eors r0,r0,r8 - bne 1f @ mismatch? could repeat this bne or add other protection against its being skipped - eors r3,r3,r5 - eors r3,r3,r9 - bne 1f - ldmia r1!,{r0,r3} - eors r0,r0,r6 - eors r0,r0,r10 - bne 1f - eors r3,r3,r7 - eors r3,r3,r11 - bne 1f - subs r1,r1,#16 - - pop {r0,r3} @ now EOR plaintext shares on stack to recover non-shared plaintext - ldr r14,[sp,#8] - eors r0,r0,r14 - ldr r14,[sp,#12] - eors r3,r3,r14 - stmia r1!,{r0,r3} @ overwrite ciphertext with plaintext - - pop {r0,r3} - ldr r14,[sp,#8] - eors r0,r0,r14 - ldr r14,[sp,#12] - eors r3,r3,r14 - stmia r1!,{r0,r3} @ overwrite ciphertext with plaintext - - add r13,#16 @ first share of plaintext has now been popped; skip the other share - -.else - -@ compute plaintext {r4-r7}^{r8-r11}^{SP[0..3]}^{SP[4..7]} -@ as shares {r4-r7}^{SP[0..3]}, {r8-r11}^{SP[4..7]} - pop {r0,r3} - eor r4,r0,r4 - eor r5,r3,r5 - pop {r0,r3} - eor r6,r0,r6 - eor r7,r3,r7 - pop {r0,r3} - eor r8,r0,r8 - eor r9,r3,r9 - pop {r0,r3} - eor r10,r0,r10 - eor r11,r3,r11 @ now plaintext_s in r4-r11 - eor r8,r8,r4 @ convert to non-shared - eor r9,r9,r5 - eor r10,r10,r6 - eor 
r11,r11,r7 @ now plaintext_ns in r8-r11 - ldmia r1,{r4-r7} @ ciphertext_ns in r4-r7 - stmia r1!,{r8-r11} @ overwrite ciphertext_ns with plaintext_ns - bl ns_to_s @ convert non-shared ciphertext to shared - -.endif - - push {r4-r11} @ push ciphertext_s, replacing iv or previous ciphertext_s on stack - subs r2,r2,#1 @ count the blocks - bne 2b - add r13,#32 - mov r0,#0 @ return OK status - pop {r4-r11,r15} - -.if ROUND_TRIP_TEST -1: -@ fault here - add r13,#32 - mov r0,#1 @ return fault status - pop {r4-r11,r15} -.endif -.endif - -@ Does mov r(i),#(0x80+i)*0x1010101 for i=flushfrom,flushfrom+1,...,12 -@ Assume 0 <= flushfrom <= 3 -@ Not possible to do this in a loop (or recursively) in gas without .altmacro? -.macro flush_regs flushfrom -.if \flushfrom<1 - mov r0,#0x80808080 -.endif -.if \flushfrom<2 - mov r1,#0x81818181 -.endif -.if \flushfrom<3 - mov r2,#0x83838383 -.endif - mov r3, #0x83838383 - mov r4, #0x84848484 - mov r5, #0x85858585 - mov r6, #0x86868686 - mov r7, #0x87878787 - mov r8, #0x88888888 - mov r9, #0x89898989 - mov r10, #0x8a8a8a8a - mov r11, #0x8b8b8b8b - mov r12, #0x8c8c8c8c -.endm - - -@ numargs is the number of arguments of the function-to-be-wrapped (i.e., excluding systick), assumed to be <=3 -.macro prewrap numargs - push {r4-r12,r14} - -@ Reset DWT count registers - mov r4,#0xe0000000 - add r4,r4,#0x1000 - add r4,r4,#4 - mov r5,#0 - mov r6,#0 - stmia r4!,{r5-r6} - add r4,r4,#8 - stmia r4!,{r5-r6} - -@ Clear any possible pending SysTick interrupt status - mov r4,#0xe0000000 - add r4,r4,#0xed00 - mov r5,#1<<25 - str r5,[r4,#4] @ ICSR at e000ed04 - - isb sy - dsb sy - -@ Allow SysTick interrupts, depending on r0=0 or 1 input - mov r0,r0,lsl#1 - add r0,r0,#5 - mov r4,#0xe000e000 - str r0,[r4,#0x10] @ SysTick CSR + bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 + ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits + ldr r2,[r0,#16] @ barrier load + rsbs r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot + ldr 
r3,=RKshareC @ r3=common round key shareC + bfi r0,r3,#0,#4 + ldr r3,[r3] + ldr r0,[r0] @ barrier load - gpioput 16,1,r4,r5 @ ADC trigger high (starts power trace capture) - -@ Shift arguments down to remove systick argument -.if \numargs>=1 - mov r0,r1 -.if \numargs>=2 - mov r1,r2 -.if \numargs>=3 - mov r2,r3 -.endif -.endif +@ Read shareB of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareB of state, offset by -vpermstaterot +@ r1=rkeyBrotdata, r2=vpermkeyrot-vpermstaterot, r3=RKshareC, r4-r11=state, r12=roundkeyB ptr +.if RK_ROR + movs r0,r2,lsl#3 + movs r1,r1,ror r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r8,r8,r3,ror#16; rors r0,r0,r1; eors r8,r8,r0; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r9,r9,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eors r9,r9,r0; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r10,r10,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eors r10,r10,r0; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r11,r11,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eors r11,r11,r0 +.else + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r8,r8,r0; eors r8,r8,r3,ror#16; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r9,r9,r0; eors r9,r9,r3,ror#16; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r10,r10,r0; eors r10,r10,r3,ror#16; adds r2,r2,#1 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r11,r11,r0; eors r11,r11,r3,ror#16 .endif - -@ Set registers r\numargs - r12 to definite values - flush_regs \numargs -@ Set r3 back to non-sentinel value in case the test program never changes r3 or r12 which would confuse the auto-detect of start/end - mov r3,#0 - -.endm - -@ numreturn is the number of return values, assumed to be 0 or 1 -.macro postwrap numreturn - gpioput 16,0,r1,r2 @ ADC trigger low - flush_regs \numreturn - mov r1,#0xe000e000 - mov r2,#4 - str r2,[r1,#0x10] @ Disable SysTick - ldr r2,[r1,#0x18] - ldr r1,=lastsystickcvr - str r2,[r1] - -@ Get final DWT cycle count - ldr 
r1,=0xe0001000 - ldr r2,[r1,#4] - ldr r1,=lastdwtcount - str r2,[r1] + clear03 - pop {r4-r12,r15} -.endm - + bx r14 -.if INCLUDE_CRYPT_CTR .balign 4 .thumb_func @ de/encrypt data in place @@ -1946,11 +1483,12 @@ cbc_decrypt_s: .endif ctr_crypt_s: - @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks - push {r0,r4-r11,r14} - + GET_CANARY r12,CTAG0 + push {r0,r4-r11,r12,r14} + push {r0-r2} + SET_COUNT 93 .if CT_BPERM @ Initialise 32 random numbers (which fit in half-words) @@ -1967,44 +1505,41 @@ ctr_crypt_s: bl randomisechaff pop {r0-r2} movs r3,#0 + CHK_COUNT 93 ctr_crypt_mainloop: -@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter + SET_COUNT 80 +@ here r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter @ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it) - push {r0-r2} - + push {r0-r3} @ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret) tst r3,#(REFCHAFF_PERIOD-1) bne 1f - push {r3} bl refreshchaff - pop {r3} - 1: +1: + ldr r3,[r13,#12] @ get block count off the stack tst r3,#(REMAP_PERIOD-1) bne 1f - push {r3} - bl remap @ shuffle the LUts - pop {r3} - 1: + bl remap @ shuffle the LUTs; this preserves R3 +1: + CHK_COUNT 80 tst r3,#(REFROUNDKEYSHARES_PERIOD-1) bne 1f - push {r3} bl ref_roundkey_shares_s @ refresh the round key shares - pop {r3} - 1: +1: + ldr r3,[r13,#12] @ get block count off the stack tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1) bne 1f - push {r3} bl ref_roundkey_hvperms_s @ refresh the round key vperms - pop {r3} - 1: +1: - pop {r0-r2} + CHK_COUNT 81 + pop {r0-r3} @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter @ Now calculate r12 = block number-to-be-deciphered from r3 = block counter @@ -2025,7 +1560,7 @@ ctr_crypt_mainloop: subs r7,r4,r5 @ r7=i-j and r8,r7,r7,asr#31 @ r8=min(i-j,0) sub r7,r7,r8,lsl#1 @ r7=|i-j| - mla r6,r6,r2,r7 @ r6=n(i+j)+|i-j| + mla r6,r6,r2,r7 @ 
r6=n(i+j)+|i-j|, encodes the unordered pair {i,j} eors r6,r6,r1,lsl#27 @ mix with swap-or-not round counter to get different hash functions @ Now do murmur3_32 hash of r6 mul r6,r6,r9 @@ -2042,7 +1577,7 @@ ctr_crypt_mainloop: eors r6,r6,r6,lsr#16 @ not actually used here @ Now set i to j, conditional on the top bit of r6 subs r7,r5,r4 @ r7=j-i - ands r7,r7,r6,asr#31 @ r7=(j-1)*(top bit of r6) + ands r7,r7,r6,asr#31 @ r7=(j-i)*(top bit of r6) adds r4,r4,r7 @ r4=j if top bit of r6, else i subs r1,r1,#1 bpl 1b @@ -2051,6 +1586,7 @@ ctr_crypt_mainloop: .else mov r12,r3 .endif + CHK_COUNT 82 @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered push {r0-r3,r12} @@ -2073,7 +1609,7 @@ processIV: @ non-target label to assist power analysis pop {r0-r3} @ may come from non-scratch memory and have its own internal registers, so we clear it using a @ stack save/load. Either R13 is in non-scratch memory, in which case this works, or it isn't, in @ which case it doesn't matter, because the only subsequent use of non-scratch memory is the stack. - + @ Add in r9 in byte-big-endian, bit-little-endian (!) fashion, while trying to avoid rev operations @ as far as possible as these tend to expose (via power fluctuations) byte-level hamming weights. @ It's worth avoiding revs on r6, r5, r4, even at the cost of introducing a small timing dependency. 
@@ -2092,14 +1628,54 @@ processIV: @ non-target label to assist power analysis rev r4,r4; sbcs r4,r4,#0; rev r4,r4 1: clear01 16 - + CHK_COUNT 83 + @ r4-r7 = IV for the current block bl ns_to_s @ convert IV+x to shares, which includes choosing and incorporating a random shareC + CHK_COUNT 84 bl conjshareC @ Add the effect of shareC to lut_a, lut_b - bl rounds_s @ Do the 15 AES rounds on (key, state=IV+x), with the (shared) result in the state, R4-R11 + CHK_COUNT 85 +@ now perform the 15 encryption rounds on (key, state=IV+x) +@ here r4-r7, r8-r11: state + mov r2,#0 @ round counter +rounds_s_mainloop: + ldr r12,=rkey_s + add r12,r12,r2,lsl#5 @ pointer to key shares for this round + add r12,r12,r2,lsl#3 + push {r2} @ save round count + bl addrkey_s + bl map_sbox_s + bl shift_rows_s +.if ST_VPERM + ldmia r13,{r2} @ peek at stack to get round count + cmp r2,#NUMREFSTATEVPERM + bcs 1f + bl gen_rand_lfsr_nonpres + ldr r1,=statevperm + bl addstatevperm @ V shuffle of r4-r11 +1: +.endif + pop {r2} + adds r2,r2,#1 @ increment round counter + cmp r2,#14 + beq 2f @ break from loop? 
(last round has no mix_cols) + push {r2} + bl mix_cols_s + pop {r2} + b rounds_s_mainloop +2: + CHK_COUNT 86 + ldr r12,=rkey_s+14*40 @ final round key shares + bl addrkey_s + CHK_COUNT 87 bl conjshareC @ Undo the effect of shareC from lut_a, lut_b + CHK_COUNT 88 .if ST_VPERM - bl vpermundo @ Undo vperm on the state shares +@ Undo the effects of vperm rotation recorded in statevperm + ldr r1,=statevperm + ldr r2,[r1] + rsbs r0,r2,#0 + bl addstatevperm .endif pop {r0-r3,r12} @@ -2113,6 +1689,7 @@ processIV: @ non-target label to assist power analysis .else movs r0,#0 .endif + CHK_COUNT 89 add r1,r1,r12,lsl#4 @ Temporarily r1 points to block-to-be-deciphered ldr r3,[r1] eors r3,r3,r4 @@ -2135,164 +1712,15 @@ processIV: @ non-target label to assist power analysis eors r3,r3,r0 str r3,[r1,#12] sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer - + CHK_COUNT 90 + pop {r0,r3} @ Restore IV and block counter @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter adds r3,r3,#1 cmp r3,r2 + CHK_COUNT 91 bne ctr_crypt_mainloop - pop {r0,r4-r11,r15} - -.endif - -.section .text.debugging,"ax",%progbits - -@@@@@@@@@@@@@@@@@@@@@@@@@ test functions @@@@@@@@@@@@@@@@@@@@@@@@@ - -@ .global test_v - -@@ .section .text.test_v,"ax",%progbits -@ .macro fn -@ ldr.n r0,=0x12345678 -@ ldr.n r0,=0xedcba987 -@ .endm -@ .macro tenfn -@ fn -@ fn -@ fn -@ fn -@ fn -@ fn -@ fn -@ fn -@ fn -@ fn -@ .endm -@ .macro hundredfn -@ tenfn -@ tenfn -@ tenfn -@ tenfn -@ tenfn -@ tenfn -@ tenfn -@ tenfn -@ tenfn -@ tenfn -@ .endm -@ -@ .thumb_func -@ test_v: -@ .balign 4 -@ 1: -@ hundredfn -@ b 1b -@ bx r14 -@ .ltorg - -@ switch from shared to non-shared state -@ s_to_ns: -@ eor r4,r4,r8 -@ eor r5,r5,r9 -@ eor r6,r6,r10 -@ eor r7,r7,r11 -@ bx r14 - -.extern o8hex -.extern osp -.extern onl - -.thumb_func -delay: -.if CHIPW - subs r0,r0,#3 @ we are clocked approximately three times slower -.else - subs r0,r0,#1 -.endif - bcs delay + pop {r0,r4-r11,r12,r14} + CHK_CANARY r12,CTAG0 
bx r14 - - -.thumb_func -isr_systick: - - @ Stop SysTick counting - mov r0,#0xe000e000 - mov r1,#4 - str r1,[r0,#0x10] @ SysTick Control and Status Register - - @ Clear any possible pending SysTick interrupt status due to SysTick count timing out during its own handler - add r0,r0,#0xd00 - mov r1,#1<<25 - str r1,[r0,#4] @ ICSR at e000ed04 - - gpioput 24,1,r2,r3 @ set GPIO24 - - ldr r0,=systick_data - ldr r1,[r0] - adds r1,r1,#1 - stmia r0!,{r1} - - ldr r1,[r13,#0] @ r0..r2 - ldr r2,[r13,#4] - ldr r3,[r13,#8] - stmia r0!,{r1-r3} - ldr r1,[r13,#12] @ r3 - stmia r0!,{r1,r4-r11} - ldr r1,[r13,#16] @ r12 - ldr r3,[r13,#28] @ RETPSR - ubfx r2,r3,#9,#1 @ SPREALIGN - add r2,r13,r2,lsl#2 @ add 4 to SP if SPREALIGN set in RETPSR - add r2,r2,#0x68 @ r13 - stmia r0!,{r1-r2} - - ldr r1,[r13,#20] @ r14 - ldr r2,[r13,#24] @ ReturnAddress -@ RETPSR still in r3 - stmia r0!,{r1-r3} - -@ Store DWT counts CYCCNT, CPICNT, LSUCNT, FOLDCNT in sysdata[18-21] - ldr r1,=0xe0001004 - ldmia r1!,{r2,r3} - stmia r0!,{r2,r3} - add r1,r1,#8 - ldmia r1!,{r2,r3} - stmia r0!,{r2,r3} - - gpioput 24,0,r2,r3 @ clear GPIO24 - - bx r14 - -.balign 4 -.thumb_func -@ Takes SHA256 of 64-bits (r0,r1) and stores the result at memory pointed to by r2 (32 bytes) -@ This is used to generate random inputs (key and IV) to repeated instances of the crypt code. -@ These random numbers are mimicked in powerpair.py which can then analyse the effect of these random inputs on the power signal. 
-@ Preserves r0-r13 -gen_irand: - push {r0-r8,r14} - mov r8,r2 - ldr r4,=SHA256_BASE - movw r2,#(1<sw_lock[30] = 0xf; - // flush_reg(); ctr_crypt_s(iv, (void*)SRAM_BASE, data_size/16); - // flush_reg(); printf("Post decryption image begins with\n"); for (int i=0; i < 4; i++) diff --git a/bootloaders/encrypted/otp.json b/bootloaders/encrypted/otp.json index 412c11078..e6393cfb0 100644 --- a/bootloaders/encrypted/otp.json +++ b/bootloaders/encrypted/otp.json @@ -4,134 +4,134 @@ "ecc" : true, "value" : [ - "0x00", - "0x01", - "0x02", + "0x31", + "0xb6", + "0xd8", + "0x18", + "0x23", + "0x2e", + "0x7b", + "0x7c", + "0xa3", + "0xb1", + "0xb7", + "0x90", + "0x7b", + "0x2f", + "0x41", + "0xd2", + "0x51", + "0xb5", "0x03", - "0x04", - "0x05", - "0x06", - "0x07", - "0x08", - "0x09", - "0x0a", - "0x0b", + "0x62", + "0xd6", + "0x21", "0x0c", + "0xb5", + "0x8d", + "0x17", + "0xe6", + "0xd5", + "0x6b", "0x0d", - "0x0e", - "0x0f", - "0x00", - "0x10", - "0x20", - "0x30", - "0x40", - "0x50", - "0x60", - "0x70", - "0x80", - "0x90", - "0xa0", - "0xb0", - "0xc0", - "0xd0", - "0xe0", - "0xf0", - "0x0f", - "0x0e", - "0x0d", - "0x0c", - "0x0b", - "0x0a", - "0x09", - "0x08", - "0x07", - "0x06", + "0x87", + "0x8d", + "0x2b", + "0x74", + "0xa4", + "0xba", + "0xb9", + "0x14", + "0x75", + "0x88", + "0x9b", "0x05", - "0x04", - "0x03", - "0x02", - "0x01", - "0x00", - "0xf0", - "0xe0", - "0xd0", - "0xc0", - "0xb0", - "0xa0", - "0x90", - "0x80", - "0x70", - "0x60", - "0x50", - "0x40", - "0x30", - "0x20", - "0x10", - "0x00", - "0x08", + "0x2d", + "0x32", + "0x51", + "0xc1", + "0x35", "0x09", - "0x0a", - "0x0b", - "0x0c", - "0x0d", - "0x0e", - "0x0f", - "0x00", - "0x01", - "0x02", + "0x78", + "0xbb", + "0x6d", + "0xc2", + "0xbb", + "0xa6", + "0x5e", + "0x95", + "0xa2", + "0x29", + "0x32", + "0x34", + "0x5b", + "0x2c", + "0xd3", + "0xf8", + "0x5d", + "0xe2", + "0x5f", + "0x23", + "0xeb", + "0x27", + "0xa4", + "0xcd", + "0xb0", + "0x8e", + "0xf4", + "0x6e", + "0x94", + "0x86", + "0x19", + "0x93", + 
"0x3a", + "0xd8", + "0x97", + "0x65", + "0x29", + "0x25", + "0x57", + "0x65", + "0x49", "0x03", - "0x04", - "0x05", - "0x06", - "0x07", + "0xfe", + "0xc6", + "0xe9", + "0x8b", + "0xa3", + "0x7e", + "0x2b", + "0x53", "0x80", - "0x90", - "0xa0", - "0xb0", - "0xc0", - "0xd0", - "0xe0", - "0xf0", - "0x00", - "0x10", - "0x20", - "0x30", - "0x40", - "0x50", - "0x60", - "0x70", - "0x07", - "0x06", + "0x68", + "0xdd", "0x05", - "0x04", - "0x03", - "0x02", - "0x01", - "0x00", - "0x0f", - "0x0e", - "0x0d", - "0x0c", - "0x0b", - "0x0a", - "0x09", - "0x08", - "0x70", - "0x60", - "0x50", - "0x40", - "0x30", - "0x20", "0x10", - "0x00", - "0xf0", - "0xe0", - "0xd0", - "0xc0", - "0xb0", - "0xa0", + "0x17", + "0xca", + "0xc3", + "0xa8", + "0x04", + "0x8d", + "0x12", + "0xaf", + "0xd9", + "0x49", + "0xa9", + "0x6d", "0x90", - "0x80" + "0x7c", + "0xb3", + "0x63", + "0x4f", + "0x36", + "0xc5", + "0x00", + "0xb5", + "0x71", + "0x74", + "0xe6", + "0x9a" ] }, "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ], diff --git a/bootloaders/encrypted/privateaes.bin b/bootloaders/encrypted/privateaes.bin index ef7a0dc1d6662d847d48d6fc1a4f6ee3ce8fcd7a..21a47756d7b947b1e8a7c3a74b0ef5edc3984f87 100644 GIT binary patch literal 128 zcmV-`0Du26w%8aWE_-~VvA2+WFG12#wF6?-Aq=&R7v|M#4Tp^@bfmhu6m^K31uZgB z!8HkZyKTa|re2kzDKa!$EYtX1;$I`{C#21=j`VJnh8dGO*q3D~C0Auh1OCS8i=%!k iQ-EmQ1rQg?!>9y}60g}wscn#avtv&-#Q?Q&bmp4XE<5l5 literal 128 zcmZQzWMXDvWn<^yO}%P%v-^NGNESuwcW13lBc<^YQX Date: Mon, 20 Jan 2025 14:09:03 +0000 Subject: [PATCH 05/20] Apply encrypted-example 6de8084b6eda --- bootloaders/encrypted/aes.S | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S index d51605a4a..ad6c448d8 100644 --- a/bootloaders/encrypted/aes.S +++ b/bootloaders/encrypted/aes.S @@ -1427,15 +1427,15 @@ addrkey_s: .if RK_ROR movs r0,r2,lsl#3 movs r1,r1,ror r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; rors r0,r0,r1; 
eors r4,r4,r0; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r5,r5,r0; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r6,r6,r0; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r7,r7,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; rors r0,r0,r1; eors r4,r4,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r5,r5,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eors r6,r6,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; movs r1,r1,ror#8; rors r0,r0,r1; eors r7,r7,r0 .else - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r4,r4,r0; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r5,r5,r0; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r6,r6,r0; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r7,r7,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r4,r4,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r5,r5,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; adds r2,r2,#1; eors r6,r6,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r7,r7,r0 .endif clear03_preserve_r3 add r12,r12,#20 @@ -1455,15 +1455,15 @@ addrkey_s: .if RK_ROR movs r0,r2,lsl#3 movs r1,r1,ror r0 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r8,r8,r3,ror#16; rors r0,r0,r1; eors r8,r8,r0; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r9,r9,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eors r9,r9,r0; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r10,r10,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eors r10,r10,r0; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r11,r11,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eors r11,r11,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; rors r0,r0,r1; eor r8,r8,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; 
movs r1,r1,ror#8; rors r0,r0,r1; eor r9,r9,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; movs r1,r1,ror#8; rors r0,r0,r1; eor r10,r10,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; movs r1,r1,ror#8; rors r0,r0,r1; eor r11,r11,r0 .else - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r8,r8,r0; eors r8,r8,r3,ror#16; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r9,r9,r0; eors r9,r9,r3,ror#16; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r10,r10,r0; eors r10,r10,r3,ror#16; adds r2,r2,#1 - ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eors r11,r11,r0; eors r11,r11,r3,ror#16 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r8,r8,r3,ror#16; adds r2,r2,#1; eors r8,r8,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r9,r9,r3,ror#16; adds r2,r2,#1; eors r9,r9,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r10,r10,r3,ror#16; adds r2,r2,#1; eors r10,r10,r0 + ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; eors r11,r11,r0 .endif clear03 From 2b6415779e3cae2c0efb7f010666e97671e4f78d Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Wed, 29 Jan 2025 15:10:02 +0000 Subject: [PATCH 06/20] Add hello_encrypted example --- CMakeLists.txt | 1 + encrypted/CMakeLists.txt | 6 + encrypted/hello_encrypted/CMakeLists.txt | 51 +++++++ encrypted/hello_encrypted/hello_encrypted.c | 30 +++++ encrypted/hello_encrypted/otp.json | 141 ++++++++++++++++++++ encrypted/hello_encrypted/private.pem | 8 ++ encrypted/hello_encrypted/privateaes.bin | Bin 0 -> 128 bytes encrypted/hello_encrypted/update-key.cmake | 23 ++++ 8 files changed, 260 insertions(+) create mode 100644 encrypted/CMakeLists.txt create mode 100644 encrypted/hello_encrypted/CMakeLists.txt create mode 100644 encrypted/hello_encrypted/hello_encrypted.c create mode 100644 encrypted/hello_encrypted/otp.json create mode 100644 encrypted/hello_encrypted/private.pem create mode 100644 encrypted/hello_encrypted/privateaes.bin create mode 100644 
encrypted/hello_encrypted/update-key.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index c78ad9449..33992dec8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,7 @@ add_subdirectory(cmake) add_subdirectory(dcp) add_subdirectory(divider) add_subdirectory(dma) +add_subdirectory(encrypted) add_subdirectory(flash) add_subdirectory(gpio) add_subdirectory(hstx) diff --git a/encrypted/CMakeLists.txt b/encrypted/CMakeLists.txt new file mode 100644 index 000000000..c7346d4ea --- /dev/null +++ b/encrypted/CMakeLists.txt @@ -0,0 +1,6 @@ +if (TARGET pico_mbedtls) + add_subdirectory_exclude_platforms(hello_encrypted host rp2040 rp2350-riscv) +else() + # Assume picotool has no signing support, if no pico_mbedtls available + message("Skipping encrypted example as pico_mbedtls unavailable") +endif () diff --git a/encrypted/hello_encrypted/CMakeLists.txt b/encrypted/hello_encrypted/CMakeLists.txt new file mode 100644 index 000000000..78225be50 --- /dev/null +++ b/encrypted/hello_encrypted/CMakeLists.txt @@ -0,0 +1,51 @@ +# Example encrypted binary +add_executable(hello_encrypted + hello_encrypted.c + ) + +# pull in common dependencies +target_link_libraries(hello_encrypted pico_stdlib) + +# enable stdio_usb and stdio_uart +pico_enable_stdio_uart(hello_encrypted 1) +pico_enable_stdio_usb(hello_encrypted 1) + +# set as no_flash binary +pico_set_binary_type(hello_encrypted no_flash) + +# set version (optional) +pico_set_binary_version(hello_encrypted MAJOR 7 MINOR 3) + +# set tbyb (optional) +# target_compile_definitions(hello_encrypted PRIVATE PICO_CRT0_IMAGE_TYPE_TBYB=1) + +# Add command to update otp.json if privateaes.bin changes +add_custom_command(OUTPUT ${CMAKE_CURRENT_LIST_DIR}/otp.json + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_LIST_DIR}/update-key.cmake" + DEPENDS ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin) +# Copy that otp.json file to build directory +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/otp.json + COMMAND ${CMAKE_COMMAND} -E copy 
"${CMAKE_CURRENT_LIST_DIR}/otp.json" "${CMAKE_CURRENT_BINARY_DIR}/otp.json" + DEPENDS ${CMAKE_CURRENT_LIST_DIR}/otp.json) +add_custom_target(hello_encrypted_otp_json DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/otp.json) +add_dependencies(hello_encrypted hello_encrypted_otp_json) + +# configure otp output +pico_set_otp_key_output_file(hello_encrypted ${CMAKE_CURRENT_BINARY_DIR}/otp.json) + +# sign, hash, and encrypt +pico_sign_binary(hello_encrypted ${CMAKE_CURRENT_LIST_DIR}/private.pem) +pico_hash_binary(hello_encrypted) +pico_encrypt_binary(hello_encrypted + ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin + EMBED + OTP_KEY_PAGE 29) + +# package uf2 in flash +pico_package_uf2_output(hello_encrypted 0x10000000) + +# create map/bin/hex/uf2 file etc. +pico_add_extra_outputs(hello_encrypted) + +# add url via pico_set_program_url +example_auto_set_url(hello_encrypted) diff --git a/encrypted/hello_encrypted/hello_encrypted.c b/encrypted/hello_encrypted/hello_encrypted.c new file mode 100644 index 000000000..90a3db199 --- /dev/null +++ b/encrypted/hello_encrypted/hello_encrypted.c @@ -0,0 +1,30 @@ +/** + * Copyright (c) 2020 Raspberry Pi (Trading) Ltd. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include <stdio.h> +#include <stdlib.h> +#include "pico/stdlib.h" +#include "pico/bootrom.h" +#include "hardware/sync.h" + +int main() { + restore_interrupts_from_disabled(0); + stdio_init_all(); + +#if PICO_CRT0_IMAGE_TYPE_TBYB + // If TBYB image, then buy it + uint8_t* buffer = malloc(4096); + rom_explicit_buy(buffer, 4096); + free(buffer); +#endif + + while (true) { + printf("Hello, world!\n"); + printf("I'm a self-decrypting binary\n"); + printf("My secret is...\n"); + sleep_ms(1000); + } +} diff --git a/encrypted/hello_encrypted/otp.json b/encrypted/hello_encrypted/otp.json new file mode 100644 index 000000000..466b7d85d --- /dev/null +++ b/encrypted/hello_encrypted/otp.json @@ -0,0 +1,141 @@ +{ + "29:0" : + { + "ecc" : true, + "value" : + [ + "0x31", + "0xb6", + "0xd8", + "0x18", + "0x23", + "0x2e", + "0x7b", + "0x7c", + "0xa3", + "0xb1", + "0xb7", + "0x90", + "0x7b", + "0x2f", + "0x41", + "0xd2", + "0x51", + "0xb5", + "0x03", + "0x62", + "0xd6", + "0x21", + "0x0c", + "0xb5", + "0x8d", + "0x17", + "0xe6", + "0xd5", + "0x6b", + "0x0d", + "0x87", + "0x8d", + "0x2b", + "0x74", + "0xa4", + "0xba", + "0xb9", + "0x14", + "0x75", + "0x88", + "0x9b", + "0x05", + "0x2d", + "0x32", + "0x51", + "0xc1", + "0x35", + "0x09", + "0x78", + "0xbb", + "0x6d", + "0xc2", + "0xbb", + "0xa6", + "0x5e", + "0x95", + "0xa2", + "0x29", + "0x32", + "0x34", + "0x5b", + "0x2c", + "0xd3", + "0xf8", + "0x5d", + "0xe2", + "0x5f", + "0x23", + "0xeb", + "0x27", + "0xa4", + "0xcd", + "0xb0", + "0x8e", + "0xf4", + "0x6e", + "0x94", + "0x86", + "0x19", + "0x93", + "0x3a", + "0xd8", + "0x97", + "0x65", + "0x29", + "0x25", + "0x57", + "0x65", + "0x49", + "0x03", + "0xfe", + "0xc6", + "0xe9", + "0x8b", + "0xa3", + "0x7e", + "0x2b", + "0x53", + "0x80", + "0x68", + "0xdd", + "0x05", + "0x10", + "0x17", + "0xca", + "0xc3", + "0xa8", + "0x04", + "0x8d", + "0x12", + "0xaf", + "0xd9", + "0x49", + "0xa9", + "0x6d", + "0x90", + "0x7c", + 
"0xb3", + "0x63", + "0x4f", + "0x36", 
+ "0xc5", + "0x00", + "0xb5", + "0x71", + "0x74", + "0xe6", + "0x9a" + ] + }, + "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ], + "OTP_DATA_KEY1_VALID" : "0x010101", + "PAGE29_LOCK0" : "0x494949", + "PAGE29_LOCK1" : "0x3d3d3d" +} \ No newline at end of file diff --git a/encrypted/hello_encrypted/private.pem b/encrypted/hello_encrypted/private.pem new file mode 100644 index 000000000..bf777d897 --- /dev/null +++ b/encrypted/hello_encrypted/private.pem @@ -0,0 +1,8 @@ +-----BEGIN EC PARAMETERS----- +BgUrgQQACg== +-----END EC PARAMETERS----- +-----BEGIN EC PRIVATE KEY----- +MHQCAQEEIAXAdiilH8wT07TESUzWPt+BY9+NcchvYU3xbnpK+CBNoAcGBSuBBAAK +oUQDQgAEYYJtMQFGW4AB94tU3u/Qir5sRcYjBYMqCa+8gxsYd9OwMS3dqWKsnVBz +dyy7bFWdJzXDMb9o20xRRd57Q9xSYw== +-----END EC PRIVATE KEY----- diff --git a/encrypted/hello_encrypted/privateaes.bin b/encrypted/hello_encrypted/privateaes.bin new file mode 100644 index 0000000000000000000000000000000000000000..21a47756d7b947b1e8a7c3a74b0ef5edc3984f87 GIT binary patch literal 128 zcmV-`0Du26w%8aWE_-~VvA2+WFG12#wF6?-Aq=&R7v|M#4Tp^@bfmhu6m^K31uZgB z!8HkZyKTa|re2kzDKa!$EYtX1;$I`{C#21=j`VJnh8dGO*q3D~C0Auh1OCS8i=%!k iQ-EmQ1rQg?!>9y}60g}wscn#avtv&-#Q?Q&bmp4XE<5l5 literal 0 HcmV?d00001 diff --git a/encrypted/hello_encrypted/update-key.cmake b/encrypted/hello_encrypted/update-key.cmake new file mode 100644 index 000000000..40b6750fe --- /dev/null +++ b/encrypted/hello_encrypted/update-key.cmake @@ -0,0 +1,23 @@ +if (CMAKE_VERSION VERSION_LESS 3.19) + # Check if keyfile is not the default, and print warning + file(READ ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin key_file HEX) + if (NOT ${key_file} STREQUAL "31b6d818232e7b7ca3b1b7907b2f41d251b50362d6210cb58d17e6d56b0d878d2b74a4bab91475889b052d3251c1350978bb6dc2bba65e95a22932345b2cd3f85de25f23eb27a4cdb08ef46e948619933ad89765292557654903fec6e98ba37e2b538068dd051017cac3a8048d12afd949a96d907cb3634f36c500b57174e69a") + message(WARNING + "AES key not updated in otp.json file, as CMake version is < 
3.19" + " - you will need to change the key in otp.json manually and re-run the build" + ) + endif() +else() + # Read the JSON file. + file(READ ${CMAKE_CURRENT_LIST_DIR}/otp.json json_string) + # Read the key file + file(READ ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin key_file HEX) + + # adds '0x' prefix, comma suffix, and quotes for every byte + string(REGEX REPLACE "([0-9a-f][0-9a-f])" "\"0x\\1\", " key_file ${key_file}) + set(key_file_json "[${key_file}]") + + string(JSON json_string SET ${json_string} "29:0" "value" ${key_file_json}) + + file(WRITE ${CMAKE_CURRENT_LIST_DIR}/otp.json ${json_string}) +endif() From 805e0071cfd0a575e620640888fb69e0e6d44b30 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Mon, 24 Feb 2025 14:35:51 +0000 Subject: [PATCH 07/20] Use new `enable_interrupts` function --- encrypted/hello_encrypted/hello_encrypted.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encrypted/hello_encrypted/hello_encrypted.c b/encrypted/hello_encrypted/hello_encrypted.c index 90a3db199..e96155745 100644 --- a/encrypted/hello_encrypted/hello_encrypted.c +++ b/encrypted/hello_encrypted/hello_encrypted.c @@ -11,7 +11,7 @@ #include "hardware/sync.h" int main() { - restore_interrupts_from_disabled(0); + enable_interrupts(); stdio_init_all(); #if PICO_CRT0_IMAGE_TYPE_TBYB From d4ed99899eee90954e155895fa6fff4a1b4e624f Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Mon, 24 Feb 2025 15:30:29 +0000 Subject: [PATCH 08/20] Remove update-key.cmake This is not necessary anymore, now picotool writes the AES key to otp json files Fixes #613 --- bootloaders/encrypted/CMakeLists.txt | 6 +- bootloaders/encrypted/otp.json | 139 +------------------- bootloaders/encrypted/update-key.cmake | 23 ---- encrypted/hello_encrypted/CMakeLists.txt | 6 +- encrypted/hello_encrypted/otp.json | 140 +-------------------- encrypted/hello_encrypted/update-key.cmake | 23 ---- 6 files changed, 5 insertions(+), 332 deletions(-) delete mode 100644 
bootloaders/encrypted/update-key.cmake delete mode 100644 encrypted/hello_encrypted/update-key.cmake diff --git a/bootloaders/encrypted/CMakeLists.txt b/bootloaders/encrypted/CMakeLists.txt index 65cf86f78..44180c4f6 100644 --- a/bootloaders/encrypted/CMakeLists.txt +++ b/bootloaders/encrypted/CMakeLists.txt @@ -4,11 +4,7 @@ add_executable(enc_bootloader aes.S ) -# Add command to update otp.json if privateaes.bin changes -add_custom_command(OUTPUT ${CMAKE_CURRENT_LIST_DIR}/otp.json - COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_LIST_DIR}/update-key.cmake" - DEPENDS ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin) -# Copy that otp.json file to build directory +# Copy otp.json file to build directory add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/otp.json COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_LIST_DIR}/otp.json" "${CMAKE_CURRENT_BINARY_DIR}/otp.json" DEPENDS ${CMAKE_CURRENT_LIST_DIR}/otp.json) diff --git a/bootloaders/encrypted/otp.json b/bootloaders/encrypted/otp.json index e6393cfb0..4c671139a 100644 --- a/bootloaders/encrypted/otp.json +++ b/bootloaders/encrypted/otp.json @@ -1,142 +1,5 @@ { - "30:0" : - { - "ecc" : true, - "value" : - [ - "0x31", - "0xb6", - "0xd8", - "0x18", - "0x23", - "0x2e", - "0x7b", - "0x7c", - "0xa3", - "0xb1", - "0xb7", - "0x90", - "0x7b", - "0x2f", - "0x41", - "0xd2", - "0x51", - "0xb5", - "0x03", - "0x62", - "0xd6", - "0x21", - "0x0c", - "0xb5", - "0x8d", - "0x17", - "0xe6", - "0xd5", - "0x6b", - "0x0d", - "0x87", - "0x8d", - "0x2b", - "0x74", - "0xa4", - "0xba", - "0xb9", - "0x14", - "0x75", - "0x88", - "0x9b", - "0x05", - "0x2d", - "0x32", - "0x51", - "0xc1", - "0x35", - "0x09", - "0x78", - "0xbb", - "0x6d", - "0xc2", - "0xbb", - "0xa6", - "0x5e", - "0x95", - "0xa2", - "0x29", - "0x32", - "0x34", - "0x5b", - "0x2c", - "0xd3", - "0xf8", - "0x5d", - "0xe2", - "0x5f", - "0x23", - "0xeb", - "0x27", - "0xa4", - "0xcd", - "0xb0", - "0x8e", - "0xf4", - "0x6e", - "0x94", - "0x86", - "0x19", - "0x93", - "0x3a", - "0xd8", - "0x97", 
- "0x65", - "0x29", - "0x25", - "0x57", - "0x65", - "0x49", - "0x03", - "0xfe", - "0xc6", - "0xe9", - "0x8b", - "0xa3", - "0x7e", - "0x2b", - "0x53", - "0x80", - "0x68", - "0xdd", - "0x05", - "0x10", - "0x17", - "0xca", - "0xc3", - "0xa8", - "0x04", - "0x8d", - "0x12", - "0xaf", - "0xd9", - "0x49", - "0xa9", - "0x6d", - "0x90", - "0x7c", - "0xb3", - "0x63", - "0x4f", - "0x36", - "0xc5", - "0x00", - "0xb5", - "0x71", - "0x74", - "0xe6", - "0x9a" - ] - }, "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ], "OTP_DATA_KEY1_VALID" : "0x010101", - "OTP_DATA_KEY2" : [ 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0 ], - "OTP_DATA_KEY2_VALID" : "0x010101", - "PAGE30_LOCK0" : "0x4a4a4a" + "PAGE30_LOCK0" : "0x494949" } diff --git a/bootloaders/encrypted/update-key.cmake b/bootloaders/encrypted/update-key.cmake deleted file mode 100644 index 2beb8e983..000000000 --- a/bootloaders/encrypted/update-key.cmake +++ /dev/null @@ -1,23 +0,0 @@ -if (CMAKE_VERSION VERSION_LESS 3.19) - # Check if keyfile is not the default, and print warning - file(READ ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin key_file HEX) - if (NOT ${key_file} STREQUAL "31b6d818232e7b7ca3b1b7907b2f41d251b50362d6210cb58d17e6d56b0d878d2b74a4bab91475889b052d3251c1350978bb6dc2bba65e95a22932345b2cd3f85de25f23eb27a4cdb08ef46e948619933ad89765292557654903fec6e98ba37e2b538068dd051017cac3a8048d12afd949a96d907cb3634f36c500b57174e69a") - message(WARNING - "Encrypted bootloader AES key not updated in otp.json file, as CMake version is < 3.19" - " - you will need to change the key in otp.json manually and re-run the build" - ) - endif() -else() - # Read the JSON file. 
- file(READ ${CMAKE_CURRENT_LIST_DIR}/otp.json json_string) - # Read the key file - file(READ ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin key_file HEX) - - # adds '0x' prefix, comma suffix, and quotes for every byte - string(REGEX REPLACE "([0-9a-f][0-9a-f])" "\"0x\\1\", " key_file ${key_file}) - set(key_file_json "[${key_file}]") - - string(JSON json_string SET ${json_string} "30:0" "value" ${key_file_json}) - - file(WRITE ${CMAKE_CURRENT_LIST_DIR}/otp.json ${json_string}) -endif() diff --git a/encrypted/hello_encrypted/CMakeLists.txt b/encrypted/hello_encrypted/CMakeLists.txt index 78225be50..7f72ece66 100644 --- a/encrypted/hello_encrypted/CMakeLists.txt +++ b/encrypted/hello_encrypted/CMakeLists.txt @@ -19,11 +19,7 @@ pico_set_binary_version(hello_encrypted MAJOR 7 MINOR 3) # set tbyb (optional) # target_compile_definitions(hello_encrypted PRIVATE PICO_CRT0_IMAGE_TYPE_TBYB=1) -# Add command to update otp.json if privateaes.bin changes -add_custom_command(OUTPUT ${CMAKE_CURRENT_LIST_DIR}/otp.json - COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_LIST_DIR}/update-key.cmake" - DEPENDS ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin) -# Copy that otp.json file to build directory +# Copy otp.json file to build directory add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/otp.json COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_LIST_DIR}/otp.json" "${CMAKE_CURRENT_BINARY_DIR}/otp.json" DEPENDS ${CMAKE_CURRENT_LIST_DIR}/otp.json) diff --git a/encrypted/hello_encrypted/otp.json b/encrypted/hello_encrypted/otp.json index 466b7d85d..2a4bbe2c4 100644 --- a/encrypted/hello_encrypted/otp.json +++ b/encrypted/hello_encrypted/otp.json @@ -1,141 +1,5 @@ { - "29:0" : - { - "ecc" : true, - "value" : - [ - "0x31", - "0xb6", - "0xd8", - "0x18", - "0x23", - "0x2e", - "0x7b", - "0x7c", - "0xa3", - "0xb1", - "0xb7", - "0x90", - "0x7b", - "0x2f", - "0x41", - "0xd2", - "0x51", - "0xb5", - "0x03", - "0x62", - "0xd6", - "0x21", - "0x0c", - "0xb5", - "0x8d", - "0x17", - "0xe6", - "0xd5", - "0x6b", 
- "0x0d", - "0x87", - "0x8d", - "0x2b", - "0x74", - "0xa4", - "0xba", - "0xb9", - "0x14", - "0x75", - "0x88", - "0x9b", - "0x05", - "0x2d", - "0x32", - "0x51", - "0xc1", - "0x35", - "0x09", - "0x78", - "0xbb", - "0x6d", - "0xc2", - "0xbb", - "0xa6", - "0x5e", - "0x95", - "0xa2", - "0x29", - "0x32", - "0x34", - "0x5b", - "0x2c", - "0xd3", - "0xf8", - "0x5d", - "0xe2", - "0x5f", - "0x23", - "0xeb", - "0x27", - "0xa4", - "0xcd", - "0xb0", - "0x8e", - "0xf4", - "0x6e", - "0x94", - "0x86", - "0x19", - "0x93", - "0x3a", - "0xd8", - "0x97", - "0x65", - "0x29", - "0x25", - "0x57", - "0x65", - "0x49", - "0x03", - "0xfe", - "0xc6", - "0xe9", - "0x8b", - "0xa3", - "0x7e", - "0x2b", - "0x53", - "0x80", - "0x68", - "0xdd", - "0x05", - "0x10", - "0x17", - "0xca", - "0xc3", - "0xa8", - "0x04", - "0x8d", - "0x12", - "0xaf", - "0xd9", - "0x49", - "0xa9", - "0x6d", - "0x90", - "0x7c", - "0xb3", - "0x63", - "0x4f", - "0x36", - "0xc5", - "0x00", - "0xb5", - "0x71", - "0x74", - "0xe6", - "0x9a" - ] - }, "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ], "OTP_DATA_KEY1_VALID" : "0x010101", - "PAGE29_LOCK0" : "0x494949", - "PAGE29_LOCK1" : "0x3d3d3d" -} \ No newline at end of file + "PAGE29_LOCK0" : "0x494949" +} diff --git a/encrypted/hello_encrypted/update-key.cmake b/encrypted/hello_encrypted/update-key.cmake deleted file mode 100644 index 40b6750fe..000000000 --- a/encrypted/hello_encrypted/update-key.cmake +++ /dev/null @@ -1,23 +0,0 @@ -if (CMAKE_VERSION VERSION_LESS 3.19) - # Check if keyfile is not the default, and print warning - file(READ ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin key_file HEX) - if (NOT ${key_file} STREQUAL "31b6d818232e7b7ca3b1b7907b2f41d251b50362d6210cb58d17e6d56b0d878d2b74a4bab91475889b052d3251c1350978bb6dc2bba65e95a22932345b2cd3f85de25f23eb27a4cdb08ef46e948619933ad89765292557654903fec6e98ba37e2b538068dd051017cac3a8048d12afd949a96d907cb3634f36c500b57174e69a") - message(WARNING - "AES key not updated in otp.json file, as CMake version is < 
3.19" - " - you will need to change the key in otp.json manually and re-run the build" - ) - endif() -else() - # Read the JSON file. - file(READ ${CMAKE_CURRENT_LIST_DIR}/otp.json json_string) - # Read the key file - file(READ ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin key_file HEX) - - # adds '0x' prefix, comma suffix, and quotes for every byte - string(REGEX REPLACE "([0-9a-f][0-9a-f])" "\"0x\\1\", " key_file ${key_file}) - set(key_file_json "[${key_file}]") - - string(JSON json_string SET ${json_string} "29:0" "value" ${key_file_json}) - - file(WRITE ${CMAKE_CURRENT_LIST_DIR}/otp.json ${json_string}) -endif() From e8266aa3c5332f3d32cbf8e84db1744bc5a973a8 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Mon, 24 Feb 2025 15:44:51 +0000 Subject: [PATCH 09/20] Add hello_encrypted to readme --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 5a436a348..c4127fc1c 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,12 @@ App|Description [channel_irq](dma/channel_irq) | Use an IRQ handler to reconfigure a DMA channel, in order to continuously drive data through a PIO state machine. [sniff_crc](dma/sniff_crc) | Use the DMA engine's 'sniff' capability to calculate a CRC32 on a data buffer. +### Encrypted + +App|Description +---|--- +[hello_encrypted](encrypted/hello_encrypted) | Create a self-decrypting binary. 
+ ### HSTX (RP235x Only) App|Description From bba9a5ef3c26bde5f8884c8e5a2fe04db69bc721 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Mon, 24 Feb 2025 17:34:56 +0000 Subject: [PATCH 10/20] Update enc_bootloader with latest aes.S (picotool 333d571c) CK_JITTER is removed as the enc_bootloader runs from XOSC not ROSC --- bootloaders/encrypted/aes.S | 681 ++++++++++++++++--------- bootloaders/encrypted/config.h | 46 +- bootloaders/encrypted/enc_bootloader.c | 53 +- 3 files changed, 488 insertions(+), 292 deletions(-) diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S index ad6c448d8..e0d653237 100644 --- a/bootloaders/encrypted/aes.S +++ b/bootloaders/encrypted/aes.S @@ -1,3 +1,13 @@ +/* MEMORY LAYOUT ASSUMPTIONS + +The "chaff" area must be located at the start of Y scratch RAM, 0x20081000: see +the macro getchaffaddress. + +The stack must be located at the end of Y scratch RAM: see the memory +wiping at the end of ctr_crypt_s where memory between the start of Y +scratch RAM and the stack pointer is overwritten. 
+*/ + .syntax unified .cpu cortex-m33 .thumb @@ -5,26 +15,24 @@ #include "config.h" #include "hardware/platform_defs.h" #include "hardware/regs/addressmap.h" +#include "hardware/regs/clocks.h" #include "hardware/regs/sha256.h" +#include "hardware/regs/resets.h" +#include "hardware/regs/rosc.h" +#include "hardware/regs/trng.h" #include "hardware/rcp.h" -.global gen_lut_sbox -.global ctr_crypt_s -.global remap -.global gen_rand_sha -.global init_key +.global decrypt +.global chaff -.global rkey_s -.global lut_a,lut_a_map -.global lut_b,lut_b_map -.global rstate_sha,rstate_lfsr +.extern lock_key @ RCP macros #define CTAG0 0x2a #define CTAG1 0x2b #define CTAG2 0x2c -#define CTAG3 0x2d @ not used +#define CTAG3 0x2d #define CTAG4 0x2e #define CTAG5 0x30 #define CTAG6 0x31 @@ -41,9 +49,13 @@ #define CTAG17 0x3c #define CTAG18 0x3d @ not used -.macro SET_COUNT n +@ number of blocks from the TRNG processed to initialise rstate_sha +#define TRNG_BLOCKS 25 + +@ The lower jitterpriorty is, the more the jitter +.macro SET_COUNT n,jitterpriority .if RC_COUNT -.if RC_JITTER +.if RC_JITTER > \jitterpriority rcp_count_set \n .else rcp_count_set_nodelay \n @@ -51,9 +63,9 @@ .endif .endm -.macro CHK_COUNT n +.macro CHK_COUNT n,jitterpriority .if RC_COUNT -.if RC_JITTER +.if RC_JITTER > \jitterpriority rcp_count_check \n .else rcp_count_check_nodelay \n @@ -61,9 +73,9 @@ .endif .endm -.macro GET_CANARY rx,tag +.macro GET_CANARY rx,tag,jitterpriority .if RC_CANARY -.if RC_JITTER +.if RC_JITTER > \jitterpriority rcp_canary_get \rx,\tag .else rcp_canary_get_nodelay \rx,\tag @@ -71,9 +83,9 @@ .endif .endm -.macro CHK_CANARY rx,tag +.macro CHK_CANARY rx,tag,jitterpriority .if RC_CANARY -.if RC_JITTER +.if RC_JITTER > \jitterpriority rcp_canary_check \rx,\tag .else rcp_canary_check_nodelay \rx,\tag @@ -81,18 +93,6 @@ .endif .endm -.macro GET_CANARY_NJ rx,tag @ with no jitter even if you ask for it (for situations where it would otherwise slow things down a lot) -.if RC_CANARY - 
rcp_canary_get_nodelay \rx,\tag -.endif -.endm - -.macro CHK_CANARY_NJ rx,tag @ with no jitter even if you ask for it -.if RC_CANARY - rcp_canary_check_nodelay \rx,\tag -.endif -.endm - .macro clear03 offset=0 getchaffaddress r0,\offset ldmia r0,{r0-r3} @@ -112,7 +112,9 @@ @ Put workspace in the second scratch area @ The "a"=allocatable attribute (and possibly the %progbits attribute) are necessary to store the murmur3 constants, @ otherwise they may end up silently replaced with 0 or 0xffffffff -.section .scratch_y.aes,"a",%progbits +.section .scratch_y.aes,"aw",%progbits + +workspace_start: @ chaff has to be at the start of scratch_y = 0x20081000 because this is assumed by the following macro, getchaffaddress @ (It seems ADR does not work, nor is it possible to assert that chaff==0x20081000) @@ -126,6 +128,37 @@ chaff: .space 48 +.balign 16 +rkey_s: @ round key shares: 600 bytes = 15 rounds * 2 shares * (4+1) words + @ see comment at init_key_4way for description of layout and meaning of rkey_s +.space 600 +rkey4way: @ scratch area for init_key_4way; could overlap this with other scratch space if need to save space +.space 128 +.if CT_BPERM +bperm_rand: @ 32 half words that define the oblivious permutation of blocks +.space 64 +.endif + +.balign 16 +permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s) +perm16: +.space 16 +@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s +.balign 16 +fourway: @ Must be 0 mod 16 +shareA: @ 0 mod 16 +.space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16 +shareB: @ 4 mod 16 +.space 20 +shareC: @ 8 mod 16 +.space 4 +statevperm: @ 12 mod 16 +.space 4 @ vperm state rotation: only last two bits are operational; other bits random +RKshareC: @ Round key common share C; see comment at init_key_4way for explanation +.space 4 +RKshareCchange: @ Temporary used by ref_roundkey_share_s +.space 4 + @ Regardless of configuration, the code uses a single 256-entry LUT, @ which is a 
simple S-box table. @ The LUT is represented as two shares, lut_a and lut_b, @@ -143,7 +176,22 @@ chaff: @ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁ .balign 16 lut_a: @ LUT share A (must be 0 mod 16 so that init_key_sbox knows how to mask the lookup) -.space 256 +.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 +.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 +.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 +.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 +.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 +.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf +.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 +.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 +.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 +.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb +.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 +.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 +.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a +.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e +.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf +.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 lut_a_map: @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b .space 4 .space 4 @ align to 8 mod 16 @@ -152,38 +200,17 @@ lut_b: @ LUT share B (must be 8 mod 16 so that init_key_sb lut_b_map: .space 4 .space 4 @ align to multiple of 8 -rkey_s: @ round key shares: 600 bytes = 15 rounds * 
2 shares * (4+1) words - @ every fourth word has a word that is used as a vperm count, and also as a spacer to misalign the shares mod 16 -.space 600 -rkey4way: @ scratch area for init_key; could overlap this with other scratch space if need to save space -.space 128 -.if CT_BPERM -bperm_rand: @ 32 half words that define the oblivious permutation of blocks -.space 64 -.endif + .balign 16 +rstate_all_start: @ Mark start of RNG data to allow selective memory wipe rstate_sha: @ 128-bit SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero .space 16 -rstate_lfsr: @ 32-bit LFSR random state and constant used to step it (initialised by C program) -.space 8 -.balign 16 -permscratch: @ Must be 0 mod 16; 16 bytes of scratch space to store permutation(s) -perm16: -.space 16 -@ Scratch space of 32 bytes used both by init_key_sbox and map_sbox_s -.balign 16 -fourway: @ Must be 0 mod 16 -shareA: @ 0 mod 16 -.space 20 @ Only need 16 bytes, but choosing shareB!=shareA mod 16 -shareB: @ 4 mod 16 -.space 20 -shareC: @ 8 mod 16 +jstate: @ 32-bit jitter state .space 4 -statevperm: @ 12 mod 16 -.space 4 @ vperm state rotation: only last two bits are operational; other bits random -RKshareC: +rstate_lfsr: @ 32-bit LFSR random state and constant used to step it .space 4 -.balign 16 +.word 0x1d872b41 @ constant that defines a maximal-length LFSR +rstate_all_end: @ Mark end of RNG data to allow selective memory wipe .if CT_BPERM .balign 16 @@ -195,7 +222,88 @@ murmur3_constants: @ Five constants used in murmur3_32 hash .word 0xc2b2ae35 .endif -@ Put main code in first scratch area +scratch_y_end: + +@ Initialisation code in main .text section +.section .text,"ax",%progbits + +@ The following is copied from the A2 boot ROM code at src/main/arm/varm_boot_path.c with adjustments. +@ We feed a stream of bits from the TRNG into the SHA hardware accelerator to generate some +@ random numbers. 
+@ Trashes r0-r6 +.balign 4 +init_rstate: + CHK_COUNT 24,6 + ldr r4,=TRNG_BASE+TRNG_RNG_IMR_OFFSET + ldr r5,=SHA256_BASE + movs r1,#1 + str r1,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] + ldr r6,[r4,#TRNG_TRNG_SW_RESET_OFFSET -TRNG_RNG_IMR_OFFSET] @ reads as 0 + movw r1,#SHA256_CSR_RESET|SHA256_CSR_START_BITS @ initialise SHA internal state by writing START bit + str r1,[r5,#SHA256_CSR_OFFSET] + str r6,[r4,#TRNG_SAMPLE_CNT1_OFFSET -TRNG_RNG_IMR_OFFSET] + movs r6,#TRNG_BLOCKS*2+1 @ odd so that we break out of the loop half-way through loading the SHA hardware, giving + @ time for previous SHA computation to complete +2: + movs r1,#0xff @ TRNG setup is inside loop in case it is skipped. + str r1,[r4,#TRNG_TRNG_DEBUG_CONTROL_OFFSET-TRNG_RNG_IMR_OFFSET] @ disable checks and bypass decorrelators,to stream raw TRNG ROSC samples + str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] @ start ROSC if it is not already started + str r1,[r4,#TRNG_RNG_ICR_OFFSET -TRNG_RNG_IMR_OFFSET] @ clear all interrupts (including EHR_VLD) + adds r0,r4,#TRNG_EHR_DATA0_OFFSET -TRNG_RNG_IMR_OFFSET + movs r2,#TRNG_TRNG_BUSY_OFFSET -TRNG_RNG_IMR_OFFSET +1: + ldr r1,[r4,r2] @ wait for 192 ROSC samples to fill EHR,should take constant time + cmp r1,#0 + bne 1b + subs r6,#1 @ done? 
+ beq 3f + movs r1,#8 +1: + ldmia r0!,{r2} @ copy 6 EHR words to SHA-256, plus garbage (RND_SOURCE_ENABLE and SAMPLE_CNT1) + str r2,[r5,#SHA256_WDATA_OFFSET] @ for a total of half a SHA-256 block + subs r1,#1 + bne 1b + ldr r2,[r5,#SHA256_SUM0_OFFSET] @ TRNG is now sampling again; use some SHA bits to modulate the chain length + str r2,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] + b.n 2b + +3: + CHK_COUNT 25,6 + str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] @ turn off rand source and wipe SHA bits left in TRNG config; r1=0 + str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] + adds r5,r5,#SHA256_SUM0_OFFSET + ldmia r5!,{r0-r3} + ldr r5,=rstate_sha + stmia r5,{r0-r3} + CHK_COUNT 26,6 + +@ r5=rstate_sha + movs r0,#0 + strb r0,[r5] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data" +@ try to find a non-zero initialiser to create a non-degenerate LFSR + ldr r1,[r5,#4] + cbnz r1,1f @ is word 1 non-zero? then use it + ldr r1,[r5,#8] + cbnz r1,1f @ otherwise, is word 2 non-zero? use it + ldr r1,[r5,#12] + cbnz r1,1f @ otherwise, is word 3 non-zero? 
use it + mov r1,r5 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-96} probability) +1: + str r1,[r5,#rstate_lfsr-rstate_sha] + ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE + str r1,[r2,#0] + CHK_COUNT 27,6 +.if GEN_RAND_SHA +.if SH_JITTER + movs r2,#0 + str r2,[r5,#jstate-rstate_sha] +.endif +.endif + + CHK_COUNT 28,6 + bx r14 + +@ Put AES core code in first scratch area .section .scratch_x.aes,"ax",%progbits .if GEN_RAND_SHA @@ -209,11 +317,26 @@ murmur3_constants: @ Five constants used in murmur3_32 hash .balign 4 gen_rand_sha: push {r14} - GET_CANARY_NJ r14,CTAG1 + GET_CANARY r14,CTAG1,2 push {r1-r3,r14} +.if SH_JITTER + ldr r2,=rstate_sha + ldr r0,[r2,#jstate-rstate_sha] + movs r1,#1 + movs r3,r0,lsl#2 + ands r3,r3,#31 + movs r3,r1,lsl r3 @ 1<<(4*(r0&7)) + udiv r3,r3,r1 @ Takes constant + (r0&7) cycles + lsrs r0,r0,#1 + bne 1f + bl gen_rand_sha_nonpres + ldr r2,=rstate_sha +1: + str r0,[r2,#jstate-rstate_sha] +.endif bl gen_rand_sha_nonpres pop {r1-r3,r14} - CHK_CANARY_NJ r14,CTAG1 + CHK_CANARY r14,CTAG1,0 pop {r15} @ Return single random word in r0 @@ -273,11 +396,11 @@ gen_rand_sha_nonpres: gen_rand_sha: gen_rand_lfsr: @ Not used push {r14} - GET_CANARY_NJ r14,CTAG2 + GET_CANARY r14,CTAG2,2 push {r1,r2,r14} bl gen_rand_lfsr_nonpres pop {r1,r2,r14} - CHK_CANARY_NJ r14,CTAG2 + CHK_CANARY r14,CTAG2,0 pop {r15} .endif @@ -311,6 +434,56 @@ gen_rand_lfsr_nonpres: .ltorg +.balign 4 +.thumb_func +decrypt: + push {r14} + GET_CANARY r14,CTAG3,6 + SET_COUNT 23,6 + push {r0-r12,r14} + bl reset_sha_trng + bl init_rstate +@ randomly re-share the LUT contents + ldr r4,=lut_a + mov r5,#64 @ 64 words = 256 bytes +1: + bl gen_rand_sha_nonpres + ldr r6,[r4,#lut_b-lut_a] @ EOR a random word into both shares + eors r6,r6,r0 + str r6,[r4,#lut_b-lut_a] + ldr r6,[r4] + eors r6,r6,r0 + stmia r4!,{r6} + subs r5,r5,#1 + bne 1b + CHK_COUNT 29,6 + bl remap @ scramble the LUTs + pop {r0} @ pointer to 4way key data + CHK_COUNT 30,6 + bl init_key_4way 
+ CHK_COUNT 31,6 + bl lock_key + pop {r0-r2} + bl ctr_crypt_s + bl randomisechaff + clear03 + pop {r4-r12,r14} + CHK_CANARY r14,CTAG3,6 + pop {r15} + +.balign 4 +.thumb_func +reset_sha_trng: + ldr r1,=RESETS_BASE+RESETS_RESET_OFFSET + ldr r2,[r1] + ldr r3,=#RESETS_RESET_SHA256_BITS|RESETS_RESET_TRNG_BITS + orrs r2,r2,r3 + str r2,[r1] @ reset the SHA hardware and the TRNG hardware + CHK_COUNT 23,6 + bics r2,r2,r3 + str r2,[r1] @ release the reset + bx r14 + .balign 4 .thumb_func makesmallperm: @@ -321,7 +494,7 @@ makesmallperm: @ Trashes r0-r3 push {r14} - GET_CANARY_NJ r14,CTAG4 + GET_CANARY r14,CTAG4,6 push {r4-r6,r14} movs r4,r1 movs r6,r0 @@ -354,7 +527,7 @@ makesmallperm: 2: pop {r4-r6,r14} - CHK_CANARY_NJ r14,CTAG4 + CHK_CANARY r14,CTAG4,6 pop {r15} .balign 4 @@ -365,7 +538,7 @@ makeperm16: @ More efficient than calling makeperm with R0=16, R1=perm16 - fewer calls to gen_rand_sha @ Trashes r0-r5 - GET_CANARY r0,CTAG5 + GET_CANARY r0,CTAG5,1 push {r0,r14} ldr r4,=perm16 bl gen_rand_sha_nonpres @@ -421,7 +594,7 @@ makeperm16: bne 1b pop {r0,r14} - CHK_CANARY r0,CTAG5 + CHK_CANARY r0,CTAG5,4 bx r14 .balign 4 @@ -429,7 +602,7 @@ makeperm16: remap: @ do a random remap of the LUTs @ preserves r0-r11; trashes r12 - GET_CANARY r12,CTAG6 + GET_CANARY r12,CTAG6,6 push {r0-r12,r14} bl gen_rand_sha_nonpres ldr r1,=lut_a @@ -438,15 +611,14 @@ remap: ldr r1,=lut_b bl remap_1 pop {r0-r12,r14} - CHK_CANARY r12,CTAG6 + CHK_CANARY r12,CTAG6,6 bx r14 - remap_1: @ r0: B0:xa B1:xb B2:ya B3:yb @ r1: array of 256 bytes, followed by a 4-byte map @ shuffle LUT share array such that new[i]=old[i^xa^xb]^ya^yb, update map according to r0 - GET_CANARY_NJ r6,CTAG7 + GET_CANARY r6,CTAG7,6 push {r6,r14} mov r14,0x01010101 ubfx r6,r0,#16,#8 @@ -491,7 +663,7 @@ remap_1: subs r2,r2,#4 bpl 1b pop {r6,r14} - CHK_CANARY_NJ r6,CTAG7 + CHK_CANARY r6,CTAG7,6 bx r14 .if RK_ROR @@ -511,7 +683,7 @@ ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rou ldr r4,=rkey_s loadlfsr 
steplfsr @ r0=change in RKshareC - adr r2,RKshareCchange + ldr r2,=RKshareCchange str r0,[r2] ldr r3,=RKshareC ldr r5,[r3] @@ -535,7 +707,8 @@ ref_roundkey_shares_s_loop: steplfsr; eors r7,r7,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; mov r12,r12,ror#8; str r3,[r4,r9,lsl#2]; adds r9,r9,#1 steplfsr; eors r8,r8,r0; ands r9,r9,#3; ldr r3,[r4,r9,lsl#2]; ror r2,r0,r12; eors r3,r3,r2,ror#16; str r3,[r4,r9,lsl#2] - ldr r3,RKshareCchange + ldr r3,=RKshareCchange + ldr r3,[r3] movs r2,#0 usub8 r10,r2,r10 ror r2,r3,r10; mov r10,r10,ror#8; eors r5,r5,r2 @@ -554,9 +727,6 @@ ref_roundkey_shares_s_loop: clear03 24 ref_roundkey_shares_s_exit: bx r14 - .balign 4 -RKshareCchange: - .space 4 .balign 4 .thumb_func @@ -570,7 +740,7 @@ RKshareCchange: ref_roundkey_hvperms_s: movs r7,#30 ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r10,CTAG9 + GET_CANARY r10,CTAG9,6 push {r10,r14} ldr r10,=rkey_s ref_roundkey_hvperms_s_loop: @@ -592,7 +762,7 @@ ref_roundkey_hvperms_s_loop: clear03 28 ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code pop {r10,r14} - CHK_CANARY r10,CTAG9 + CHK_CANARY r10,CTAG9,6 bx r14 .else @@ -604,7 +774,7 @@ ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to ana ref_roundkey_shares_s: mov r11,#15 @ there are 15 expanded keys ref_roundkey_shares_s_test: @ entry point for test code to do fewer than 15 rounds - GET_CANARY r4,CTAG8 + GET_CANARY r4,CTAG8,6 push {r4,r14} ldr r4,=rkey_s loadlfsr @@ -641,7 +811,7 @@ ref_roundkey_shares_s_loop: clear03 24 ref_roundkey_shares_s_exit: pop {r4,r14} - CHK_CANARY r4,CTAG8 + CHK_CANARY r4,CTAG8,6 bx r14 .balign 4 @@ -651,7 +821,7 @@ ref_roundkey_shares_s_exit: ref_roundkey_hvperms_s: movs r7,#30 ref_roundkey_hvperms_s_test: @ entry point for test code to do fewer than 30 key shares - GET_CANARY r0,CTAG9 + GET_CANARY r0,CTAG9,6 push {r0,r14} bl gen_rand_lfsr_nonpres ldr r1,=rkey_s 
@@ -679,11 +849,13 @@ ref_roundkey_hvperms_s_loop: clear03 28 ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to analysis code pop {r0,r14} - CHK_CANARY r0,CTAG9 + CHK_CANARY r0,CTAG9,6 bx r14 .endif +.ltorg + .if ST_VPERM .balign 4 .thumb_func @@ -733,7 +905,7 @@ addstatevperm_exit: @ label exit point to be to able to specify to ana @ Trashes r0-r3,r12 .balign 4 ns_to_s: - GET_CANARY r12,CTAG11 + GET_CANARY r12,CTAG11,6 push {r12,r14} .if ST_SHAREC bl gen_rand_sha_nonpres @ Create state share C; all bytes the same @@ -765,7 +937,7 @@ ns_to_s: bl addstatevperm @ Initialise state vperm with SHA RNG, refresh with LFSR RNG .endif pop {r12,r14} - CHK_CANARY r12,CTAG11 + CHK_CANARY r12,CTAG11,6 bx r14 @ Conjugate lut_a, lut_b with shareC @@ -863,8 +1035,6 @@ shift_rows_s: @ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1 .macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b -@ !!! can probably save some registers, e.g. 
allow trashing of r0x00, r0x1b -@ can possibly also simplify slightly with refactorisation uadd8 \rt,\rx,\rx @ field multiplication by 2 as above sel \rw,\r0x1b,\r0x00 eors \rt,\rt,\rw @ 2x @@ -904,51 +1074,6 @@ mix_cols_s: ldmia r12!,{r0,r1} @ overwrite sensitive shareB-related quantities r0,r1 with random numbers bx r14 -.balign 4 -.thumb_func -gen_lut_sbox: -@ gen_lut_sbox sets both lut_a and lut_b to the S-box table and -@ returns r0=lut_a+256, r1=lut_b+256 -@ first set lut_a to be a table of GF(2⁸) inverses, using lut_b as temporary storage - ldr r0,=lut_a - ldr r1,=lut_b -@ first set lut_a to be a table of antilogarithms, lut_b a table of logarithms - mov r2,#0 - strb r2,[r0] @ (*) - mov r3,#1 @ we maintain invariant that r2=log(r3) -1: - strb r2,[r0,r3] @ log table - strb r3,[r1,r2] @ antilog table - lsls r12,r3,#25 - it cs - eorcs r12,r12,#0x1b000000 @ multiply by x - eor r3,r3,r12,lsr#24 @ multiply by x+1 ("3"), which is a primitive element - add r2,r2,#1 - cmp r2,#255 - bls 1b - movs r2,#255 -1: - ldrb r3,[r0,r2] @ for each i≠0, find log,... - eor r3,r3,#255 @ ... negate... - ldrb r3,[r1,r3] @ ... 
and antilog to get inverse - strb r3,[r0,r2] - subs r2,r2,#1 - bne 1b @ note that inverse(0)=0 by (*) above -@ At this point r0=lut_a, r1=lut_b, lut_a[] contains inverses and lut_b[] contains other stuff - mov r12,#256 -1: - ldrb r2,[r0] - eors r3,r2,r2,lsl#1 @ convolve byte with 0x1f - eors r3,r3,r3,lsl#2 - eors r3,r3,r2,lsl#4 - eors r2,r3,r3,lsr#8 - eor r2,r2,#0x63 @ and add 0x63 - strb r2,[r0],#1 @ let lut_a[i]=sbox[i] - strb r2,[r1],#1 @ let lut_b[i]=sbox[i] - subs r12,r12,#1 - bne 1b - bx r14 - @ Lookup each byte of a word, Rtarg, in a table and replace Rtarg with the result (used for SBOX lookups) .macro subbytes Rtarg,Rtable,Rspare0,Rspare1,Rspare2,Rspare3 ubfx \Rspare0,\Rtarg,#0, #8 @@ -970,79 +1095,106 @@ gen_lut_sbox: .balign 4 .thumb_func map_sbox_s: - GET_CANARY r12,CTAG12 + GET_CANARY r12,CTAG12,3 push {r12,r14} ldr r0,=shareA @ Write out state share A to memory - stmia r0,{r4-r7} - clear03 @ barrier +@ stmia r0,{r4-r7} @ Used to do a STM + getchaffaddress r1 + ldr r2,[r1] + str r4,[r0] @ Interperse with dummy writes to prevent implicit broadcasting of HW(ShareA_word0^ShareA_word1)+cyclic perms, + str r2,[r1] @ which arise due to internal write buffer. Such a quantity could (without such interspersing) be paired + str r5,[r0,#4] @ via 2nd order with its share B counterpart, resulting in broadcasting HW(word0^word1)+cyclic. + str r2,[r1] @ shareC doesn't guard against this, because word0^shareC^word1^shareC=word0^word1. + str r6,[r0,#8] @ Broadcasting of HW(ShareA_word0)+cyclic on the other hand is not prevented by interspersing, but + str r2,[r1] @ it isn't useful at 2nd order because shareC kills its relationship with HW(ShareB_word0)+cyclic. + str r7,[r0,#12] + str r2,[r1] ldr r0,=shareB @ Write out state share B to memory - stmia r0,{r8-r11} - clear03 4 @ barrier + stmia r0,{r8-r11} @ Not essential to intersperse share B too because i0B^i1B etc should have nothing in share A to couple with bl makeperm16 @ Rebuild random 16-way permutation. 
Maybe do this less frequently @ Now combine state shares A and B and apply the split sbox to each byte, in the order given by the above random permutation + bl gen_rand_sha_nonpres + mov r11,r0 ldr r8,=lut_a ldr r9,=lut_b ldr r0,[r8,#0x100] @ R0 = a0 | a1<<8 | c0<<16 | c1<<24 (lut_a_map) - eors r10,r0,r0,lsr#8 - uxtb r10,r10 @ R10 = a0^a1 + eors r3,r0,r0,lsr#8 @ R3 = a0^a1 | junk + uxtb r10,r3 ldr r1,[r9,#0x100] @ R1 = b0 | b1<<8 | d0<<16 | d1<<24 (lut_b_map) eors r1,r0,r1 eors r2,r1,r1,lsr#8 - uxtb r11,r2 @ R11 = a0^a1^b0^b1 movs r12,r1,lsr#16 @ R12 = c0^d0 | (c1^d1)<<8 + bfi r12,r2,#16,#8 @ R12 = c0^d0 | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 ldr r4,=perm16 ldr r5,=shareA ldr r6,=shareB -@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=a0^a1^b0^b1, r12=(c0^d0) | (c1^d1)<<8 + movs r1,#0;movs r2,#0;movs r3,#0 +@ Using r0=loop counter, r4=perm16, r5=shareA, r6=shareB, r8=lut_a, r9=lut_b, r10=a0^a1, r11=Random, r12=(c0^d0) | (c1^d1)<<8 | (a0^a1^b0^b1)<<16 movs r0,#15 1: @ (Ordering instructions to minimise result delays) ldrb r1,[r4,r0] @ r1 = perm[r0] + mov r11,r11,ror#11 @ Rotate random 32 bits to present a new low 8 bits eors r7,r1,#2 @ r7 = perm[r0]^2 ldrb r2,[r5,r1] @ r2 = shareA[perm[r0]] + eor r11,r11,r2,ror#8 @ Transfer some of the share-randomness of the input to the output (the share-randomness would otherwise be lost/wasted) ldrb r3,[r6,r7] @ r3 = shareB[perm[r0]^2] - eors r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1 + eor r2,r2,r10 @ r2 = shareA[perm[r0]]^a0^a1 eors r2,r2,r3 @ r2 = shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2] ldrb r3,[r8,r2] @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]] - eors r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8) - eors r2,r2,r11 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2] - strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 + eor r2,r2,r12,lsr#16 @ r2 = shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2] + eor 
r3,r3,r12 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0 | (junk<<8) + eor r3,r3,r11 @ r3 = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand | (junk<<8) + strb r3,[r5,r1] @ shareA'[perm[r0]] = lut_a[shareA[perm[r0]]^a0^a1^shareB[perm[r0]^2]]^c0^d0^rand ldrb r3,[r9,r2] @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]] subs r0,r0,#1 - eor r3,r3,r12,lsr#8 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^c1^d1 - strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^c1^d1 + eor r3,r3,r11 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand + eor r3,r3,r12,lsr#8 @ r3 = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 | (junk<<8) + strb r3,[r6,r7] @ shareB'[perm[r0]^2] = lut_b[shareA[perm[r0]]^b0^b1^shareB[perm[r0]^2]]^rand^c1^d1 bpl 1b clear03 8 @ barrier ldmia r6,{r8-r11} @ Read state share B back from memory clear03 12 @ barrier - ldmia r5,{r4-r7} @ Read state share A back from memory - clear03 16 @ barrier + getchaffaddress r0,16 + bfi r0,r5,#0,#4 @ match chaff pointer (r0) to share A location (R5) mod 16 + @ldmia r5,{r4-r7} @ Read state share A back from memory + @clear03 16 @ barrier + ldr r4,[r5] @ Intersperse barriers to prevent HW(o0^o1)+cyclic being broadcast; see similar note re stores at the start of map_sbox_s + ldr r1,[r0] + ldr r6,[r5,#8] + ldr r1,[r0,#8] + ldr r7,[r5,#12] + ldr r1,[r0,#12] + ldr r5,[r5,#4] @ Do r5 last because it's the address register + ldr r1,[r0,#4] @ Refresh state shares because luts only give imperfect share-by-value - - loadlfsr - steplfsr; eors r4,r4,r0; mov r12,#0; eors r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc - steplfsr; eors r5,r5,r0; mov r12,#0; eors r9,r9,r0,ror#16 - steplfsr; eors r6,r6,r0; mov r12,#0; eors r10,r10,r0,ror#16 - steplfsr; eors r7,r7,r0; mov r12,#0; eors r11,r11,r0,ror#16 - savelfsr +@ Probably not necessary now that we use R11 and input-reuse random resharing during the sbox 
operation (though the R11 bytes are not fully independent) +@ loadlfsr +@ steplfsr; eors r4,r4,r0; mov r12,#0; eor r8,r8,r0,ror#16 @ Barriers between each pair of eors to prevent implicit r4^r8 etc +@ steplfsr; eors r5,r5,r0; mov r12,#0; eor r9,r9,r0,ror#16 +@ steplfsr; eors r6,r6,r0; mov r12,#0; eor r10,r10,r0,ror#16 +@ steplfsr; eors r7,r7,r0; mov r12,#0; eor r11,r11,r0,ror#16 +@ savelfsr pop {r12,r14} - CHK_CANARY r12,CTAG12 + CHK_CANARY r12,CTAG12,5 bx r14 +.ltorg + .balign 4 .thumb_func randomisechaff: @ Randomise 48 bytes of chaff values (random load values) @ Uses 12 bytes of permscratch @ Trashes r0-3 - GET_CANARY r0,CTAG13 + GET_CANARY r0,CTAG13,6 push {r0,r14} movs r0,#12 ldr r1,=permscratch @@ -1059,19 +1211,32 @@ randomisechaff: subs r1,r1,#1 bpl 1b pop {r0,r14} - CHK_CANARY r0,CTAG13 + CHK_CANARY r0,CTAG13,6 bx r14 .balign 4 -refreshchaff: +refreshchaff_and_lfsr: @ Update 48 bytes of chaff values (random load values) using faster RNG than used for randomisechaff +@ Re-randomise LFSR with SHA @ Uses 12 bytes of permscratch @ Trashes r0-3,12 - GET_CANARY r0,CTAG14 + GET_CANARY r0,CTAG14,6 push {r0,r14} + +@ Refresh LFSR using SHA to make it harder to reverse-engineer LFSR sequence + bl gen_rand_sha_nonpres + ldr r1,=rstate_lfsr + ldr r2,[r1] + adds r2,r2,r0 + beq 1f @ Don't update LFSR state to 0 + str r2,[r1] +1: + +@ Choose a random order to update chaff words to make 2nd order attacks harder movs r0,#12 ldr r1,=permscratch - bl makesmallperm @ Update the random words in a random order to make 2nd order attacks harder + bl makesmallperm + movs r1,#11 1: push {r1} @@ -1086,7 +1251,7 @@ refreshchaff: subs r1,r1,#1 bpl 1b pop {r0,r14} - CHK_CANARY r0,CTAG14 + CHK_CANARY r0,CTAG14,6 bx r14 .balign 4 @@ -1094,7 +1259,7 @@ refreshchaff: @ Do sbox on the four bytes of the 4-way share r4-r7 @ Trashes r0,r8-r12 init_key_sbox: - GET_CANARY r12,CTAG15 + GET_CANARY r12,CTAG15,6 push {r1-r3,r12,r14} bl gen_rand_sha_nonpres; mov r8,r0 bl gen_rand_sha_nonpres; mov 
r9,r0 @@ -1113,16 +1278,16 @@ init_key_sbox: movs r0,#4 ldr r1,=permscratch bl makesmallperm @ Build random 4-way permutation determining order of bytes to be SBOXed - ldr r1,=permscratch @ Write out random addresses in advance to save two registers + ldr r1,=permscratch @ Write out random addresses in advance to save two registers (reusing permscratch) ldr r4,[r1] ldr r0,=fourway uxtab r5,r0,r4 uxtab r6,r0,r4,ror#8 uxtab r7,r0,r4,ror#16 uxtab r8,r0,r4,ror#24 - stmia r1,{r5-r8} @ Store fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3] + stmia r1,{r5-r8} @ Store at r1=permscratch: fourway+perm[0], fourway+perm[1], fourway+perm[2], fourway+perm[3] - bl gen_rand_sha @ Save some randomness for the resharing operation later + bl gen_rand_sha @ Save some randomness for the resharing operation later movs r7,r0 bl gen_rand_sha movs r8,r0 @@ -1148,8 +1313,8 @@ init_key_sbox: ldr r6,[r1],#4 @ r6 = fourway + perm[i] (i=0-3, loop iteration) ldr r5,[r5] @ Random load to mask previous load - ands r9,r6,#12 @ r9 = chaff address aligned to r6 mod 16 - add r9,r11,r9 + ands r9,r6,#12 + add r9,r11,r9 @ r9 = chaff address aligned to (r6 bic 3) mod 16 ldrb r4,[r6,#0] ldr r14,[r9,#0] @ Random load to mask previous load eor r4,r4,r10 @@ -1171,7 +1336,7 @@ init_key_sbox: eor r4,r4,r14,lsl#8 @ Add in some junk in bits 8-31 ands r14,r4,#255 - ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] + ldrb r5,[r2,r14] @ r5 = lut_a[unsharedbyte[perm[i]]^a0^a1] and r14,r4,#15 add r14,r14,#32 ldrb r14,[r11,r14] @ Random load to mask previous load (r2 and r11 are both 0 mod 16) @@ -1207,7 +1372,7 @@ init_key_sbox: ldmia r11,{r8-r12,r14} @ Random load to mask previous load and to obfuscate registers pop {r1-r3,r12,r14} - CHK_CANARY r12,CTAG15 + CHK_CANARY r12,CTAG15,6 bx r14 .balign 4 @@ -1221,7 +1386,7 @@ init_key_sbox: @ roundkey shareA(i) = Aptr[i+vpermA mod 4] ror #((i+vpermA mod 4)^th byte of Aptr[4]) @ roundkey shareB(i) = Bptr[i+vpermB mod 4] ror #((i+vpermB mod 4)^th 
byte of Bptr[4])+16 storeroundkey: - GET_CANARY r8,CTAG16 + GET_CANARY r8,CTAG16,6 push {r2,r8,r14} @ eor two 4-way share components to make a component of a 2-way share @@ -1291,12 +1456,12 @@ storeroundkey: adds r3,r3,#4 @ Set r3 = (r3 on entry) + 40 pop {r2,r8,r14} - CHK_CANARY r8,CTAG16 + CHK_CANARY r8,CTAG16,6 bx r14 .balign 4 .thumb_func -init_key: +init_key_4way: @ On entry, r0 points to 4-way shared raw key data (128 bytes) @ The format is a0 b0 c0 d0 a1 b1 c1 d1 ... a7 b7 c7 d7 @ That is, each word, K, of the original 256-bit key is expanded into four words whose exclusive OR is K. @@ -1312,17 +1477,21 @@ init_key: @ rkb_unrot[i] = rkb[i+vpermB mod 4] ror #((i+vpermB mod 4)^th byte of rkb[4])+16 @ rk[i] = rka_unrot[i] ^ rkb_unrot[i] ^ RKshareC - GET_CANARY r12,CTAG17 - push {r4-r11,r12,r14} + GET_CANARY r12,CTAG17,6 + push {r0-r12,r14} +@ Transfer 4-way key into local workspace, rerandomising the shares mov r5,r0 @ r5=4-way key input bl randomisechaff - ldr r4,=rkey4way - movs r6,#8 + ldr r6,=rkey4way + movs r7,#8 1: - ldmia r5!,{r0-r3} - stmia r4!,{r0-r3} - subs r6,r6,#1 + ldmia r5!,{r1-r4} + bl gen_rand_sha; eors r1,r1,r0; eors r4,r4,r0 + bl gen_rand_sha; eors r2,r2,r0; eors r4,r4,r0 + bl gen_rand_sha; eors r3,r3,r0; eors r4,r4,r0 + stmia r6!,{r1-r4} + subs r7,r7,#1 bne 1b @ Now raw key is stored in rkey4way[], construct 2-way share in rkey_s[] for @@ -1400,10 +1569,12 @@ init_key_expandloop: cmp r2,#52 bne init_key_expandloop - pop {r4-r11,r12,r14} - CHK_CANARY r12,CTAG17 + pop {r0-r12,r14} + CHK_CANARY r12,CTAG17,6 bx r14 +.ltorg + @ Add the round key shares pointed to by r12 into the state shares @ Trashes r0-r3 .balign 4 @@ -1421,7 +1592,7 @@ addrkey_s: ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits ldr r2,[r0,#16] @ barrier load - rsbs r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot + rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot @ Read shareA of roundkey, offset by vpermkeyrot-vpermstaterot, and eor it into shareA of state, offset by 
-vpermstaterot @ r1=rkeyArotdata, r2=vpermkeyrot-vpermstaterot, r3=statevperm, r4-r11=state, r12=roundkeyAptr .if RK_ROR @@ -1444,7 +1615,7 @@ addrkey_s: bfi r0,r12,#0,#4 @ match chaff pointer (r0) to roundkey ptr (r12) mod 16 ldr r1,[r12,#16] @ r1=vperm key rotation in top two bits ldr r2,[r0,#16] @ barrier load - rsbs r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot + rsb r2,r3,r1,lsr#30 @ r2=vpermkeyrot-vpermstaterot ldr r3,=RKshareC @ r3=common round key shareC bfi r0,r3,#0,#4 ldr r3,[r3] @@ -1466,7 +1637,6 @@ addrkey_s: ands r2,r2,#3; ldr r0,[r12,r2,lsl#2]; eor r11,r11,r3,ror#16; eors r11,r11,r0 .endif clear03 - bx r14 .balign 4 @@ -1484,11 +1654,11 @@ addrkey_s: ctr_crypt_s: @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks - GET_CANARY r12,CTAG0 - push {r0,r4-r11,r12,r14} + GET_CANARY r12,CTAG0,6 + push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets push {r0-r2} - SET_COUNT 93 + SET_COUNT 93,6 .if CT_BPERM @ Initialise 32 random numbers (which fit in half-words) @@ -1505,10 +1675,10 @@ ctr_crypt_s: bl randomisechaff pop {r0-r2} movs r3,#0 - CHK_COUNT 93 + CHK_COUNT 93,6 ctr_crypt_mainloop: - SET_COUNT 80 + SET_COUNT 80,6 @ here r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter @ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it) @@ -1517,7 +1687,7 @@ ctr_crypt_mainloop: tst r3,#(REFCHAFF_PERIOD-1) bne 1f - bl refreshchaff + bl refreshchaff_and_lfsr 1: ldr r3,[r13,#12] @ get block count off the stack @@ -1525,7 +1695,7 @@ ctr_crypt_mainloop: bne 1f bl remap @ shuffle the LUTs; this preserves R3 1: - CHK_COUNT 80 + CHK_COUNT 80,6 tst r3,#(REFROUNDKEYSHARES_PERIOD-1) bne 1f @@ -1538,7 +1708,8 @@ ctr_crypt_mainloop: bl ref_roundkey_hvperms_s @ refresh the round key vperms 1: - CHK_COUNT 81 + CHK_COUNT 81,6 + pop {r0-r3} @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter @@ -1586,9 +1757,9 @@ ctr_crypt_mainloop: .else mov 
r12,r3 .endif - CHK_COUNT 82 + CHK_COUNT 82,6 -@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered +@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered) push {r0-r3,r12} processIV: @ non-target label to assist power analysis @@ -1605,36 +1776,36 @@ processIV: @ non-target label to assist power analysis ldr r0,[r13] @ peek at stack to restore r0=IV ptr ldmia r0,{r4-r7} @ load IV clear03 @ barrier to remove traces of IV from internal CPU load registers - push {r0-r3} @ We want to randomise the internal memory registers associated with the above LDM load, but this - pop {r0-r3} @ may come from non-scratch memory and have its own internal registers, so we clear it using a - @ stack save/load. Either R13 is in non-scratch memory, in which case this works, or it isn't, in - @ which case it doesn't matter, because the only subsequent use of non-scratch memory is the stack. @ Add in r9 in byte-big-endian, bit-little-endian (!) fashion, while trying to avoid rev operations @ as far as possible as these tend to expose (via power fluctuations) byte-level hamming weights. -@ It's worth avoiding revs on r6, r5, r4, even at the cost of introducing a small timing dependency. 
- @ First do 128-bit addition of r9 to byte-reversed IV - rev r7,r7; adds r7,r7,r9; bcc 1f - rev r6,r6; adcs r6,r6,#0; rev r6,r6; bcc 1f - rev r5,r5; adcs r5,r5,#0; rev r5,r5; bcc 1f - rev r4,r4; adcs r4,r4,#0; rev r4,r4 + rev r7,r7 + cmn r7,#MAX_NUM_BLOCKS @ Compare against maximum number of blocks + bcs 1f + add r7,r7,r9 @ This can temporarily overflow but it doesn't matter as we know that r7+r12 does not overflow + sub r7,r7,r8 + b 2f 1: -@ At this point, r7 is reversed and r4-r6 are not + adds r7,r7,r9 + rev r6,r6; adcs r6,r6,#0 + rev r5,r5; adcs r5,r5,#0 + rev r4,r4; adcs r4,r4,#0 @ Now do 128-bit subtraction of r8 from byte-reversed IV - subs r7,r7,r8; rev r7,r7; bcs 1f - rev r6,r6; sbcs r6,r6,#0; rev r6,r6; bcs 1f - rev r5,r5; sbcs r5,r5,#0; rev r5,r5; bcs 1f - rev r4,r4; sbcs r4,r4,#0; rev r4,r4 -1: + subs r7,r7,r8 + sbcs r6,r6,#0; rev r6,r6 + sbcs r5,r5,#0; rev r5,r5 + sbcs r4,r4,#0; rev r4,r4 +2: + rev r7,r7 clear01 16 - CHK_COUNT 83 + CHK_COUNT 83,6 @ r4-r7 = IV for the current block bl ns_to_s @ convert IV+x to shares, which includes choosing and incorporating a random shareC - CHK_COUNT 84 + CHK_COUNT 84,6 bl conjshareC @ Add the effect of shareC to lut_a, lut_b - CHK_COUNT 85 + CHK_COUNT 85,6 @ now perform the 15 encryption rounds on (key, state=IV+x) @ here r4-r7, r8-r11: state mov r2,#0 @ round counter @@ -1647,7 +1818,7 @@ rounds_s_mainloop: bl map_sbox_s bl shift_rows_s .if ST_VPERM - ldmia r13,{r2} @ peek at stack to get round count + ldr r2,[r13] @ peek at stack to get round count cmp r2,#NUMREFSTATEVPERM bcs 1f bl gen_rand_lfsr_nonpres @@ -1664,12 +1835,12 @@ rounds_s_mainloop: pop {r2} b rounds_s_mainloop 2: - CHK_COUNT 86 + CHK_COUNT 86,6 ldr r12,=rkey_s+14*40 @ final round key shares bl addrkey_s - CHK_COUNT 87 + CHK_COUNT 87,6 bl conjshareC @ Undo the effect of shareC from lut_a, lut_b - CHK_COUNT 88 + CHK_COUNT 88,6 .if ST_VPERM @ Undo the effects of vperm rotation recorded in statevperm ldr r1,=statevperm @@ -1682,6 +1853,7 @@ 
rounds_s_mainloop: push {r0,r3} @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered +decryption_start: @ Decrypt ciphertext using AES output in shares: r4-r11 .if ST_SHAREC ldr r0,=shareC @@ -1689,38 +1861,79 @@ rounds_s_mainloop: .else movs r0,#0 .endif - CHK_COUNT 89 + ldr r14,=chaff +@ r0=shareC, r1=cipher/plaintext buffer, r2=number of blocks, r3=free, r4-r11=stateA/B, r12=block to be deciphered, r14=chaff + CHK_COUNT 89,6 add r1,r1,r12,lsl#4 @ Temporarily r1 points to block-to-be-deciphered - ldr r3,[r1] - eors r3,r3,r4 - eors r3,r3,r8,ror#16 @ Now r4 and r8 are free - eors r3,r3,r0 - str r3,[r1] - ldr r3,[r1,#4] + ldr r3,[r1] @ r3=ciphertext word + eors r3,r3,r4 @ r3=r3^shareA + ldr r4,[r14] @ barrier load + eor r3,r3,r8,ror#16 @ r3=r3^shareB + eors r3,r3,r0 @ r3=r3^shareC + str r3,[r1] @ plaintext word=r3 + ldr r3,[r1,#4] @ and similarly for words 1,2,3 of block... + ldr r4,[r14,#4] eors r3,r3,r5 - eors r3,r3,r9,ror#16 + eor r3,r3,r9,ror#16 eors r3,r3,r0 str r3,[r1,#4] ldr r3,[r1,#8] + ldr r4,[r14,#8] eors r3,r3,r6 - eors r3,r3,r10,ror#16 + eor r3,r3,r10,ror#16 eors r3,r3,r0 str r3,[r1,#8] ldr r3,[r1,#12] + ldr r4,[r14,#12] eors r3,r3,r7 - eors r3,r3,r11,ror#16 + eor r3,r3,r11,ror#16 eors r3,r3,r0 str r3,[r1,#12] + sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer - CHK_COUNT 90 + CHK_COUNT 90,6 pop {r0,r3} @ Restore IV and block counter @ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter +decryption_end: adds r3,r3,#1 cmp r3,r2 - CHK_COUNT 91 + CHK_COUNT 91,6 bne ctr_crypt_mainloop - pop {r0,r4-r11,r12,r14} - CHK_CANARY r12,CTAG0 + +#if WIPE_MEMORY +@ Wipe memory from workspace_start up to the stack pointer +@ First fill everything (except the RNG state itself) with random numbers to avoid any possibly useful power signals + ldr r4,=workspace_start + ldr r5,=rstate_all_start +1: + bl gen_rand_sha_nonpres + stmia r4!,{r0} + cmp r4,r5 + bcc 1b + ldr r4,=rstate_all_end + mov 
r5,r13 @ gcc arm assembler says cmp r4,r13 is deprecated, so use another register +1: + bl gen_rand_sha_nonpres + stmia r4!,{r0} + cmp r4,r5 + bcc 1b + +@ Then fill everything with zeros so as not to leave behind clues about the RNG state + ldr r4,=workspace_start + movs r0,#0 + mov r5,r13 +1: + stmia r4!,{r0} + cmp r4,r5 + bcc 1b +#endif + +.if GEN_RAND_SHA + SET_COUNT 23,6 + bl reset_sha_trng @ clear out the SHA hardware +.endif + pop {r0-r12,r14} + CHK_CANARY r12,CTAG0,6 bx r14 diff --git a/bootloaders/encrypted/config.h b/bootloaders/encrypted/config.h index dd0c9898e..2c4ce0d03 100644 --- a/bootloaders/encrypted/config.h +++ b/bootloaders/encrypted/config.h @@ -1,6 +1,6 @@ #pragma once -// These options should be enabled because the security risk of not using them is too high +// These options (up to the long /////////////// line) should be enabled because the security risk of not using them is too high // or because the time cost is very low so you may as well have them. // They can be set to 0 for analysis or testing purposes. @@ -22,6 +22,10 @@ #define RK_ROR 1 // store round key shares with random rotations within each word #endif +#ifndef WIPE_MEMORY +#define WIPE_MEMORY 1 // Wipe memory after decryption +#endif + // The following options should be enabled to increase resistance to glitching attacks. #ifndef RC_CANARY @@ -31,34 +35,42 @@ #define RC_COUNT 1 // use rcp_count feature #endif -// Although enabling the following option likely has little theoretical benefit, in -// practice randomising the timing of operations can make side-channel attacks very -// much more effort to carry out. It can be disabled for analysis or testing purposes. +// Although jitter/timing-variation may be circumventable in theory, in practice +// randomising the timing of operations can make side-channel attacks very much more +// effort to carry out. These can be disabled for analysis or testing purposes. +// It is advisable to use at least one form of jitter. 
+// RC_JITTER is quite slow, and is probably the most predictable of the three, so it is disabled by default. +// (Leaving it as an option because it's just possible that the large delays it produces are advantageous in defeating certain side-channel attacks.) #ifndef RC_JITTER -#define RC_JITTER 1 // use random-delay versions of RCP instructions +#define RC_JITTER 0 // 0-7. Higher = more jitter. Governs use of random-delay versions of RCP instructions. #endif +#ifndef SH_JITTER +#define SH_JITTER 1 // Insert random delays, tagged onto SHA RNG +#endif + + //////////////////////////////////////////////////////////////////////////////////////////////////////////// // The following options can be adjusted, affecting the performance/security tradeoff // Period = X means that the operation in question occurs every X blocks, so higher = more performance and lower security. // No point in making them more than 16 or so, since the time taken by the subroutines would be negligible. -// These must be a power of 2. Timings as of commit 24277d13 -// RK_ROR=0 RK_ROR=1 -// Baseline time per 16-byte block = { 14066 14336 } cycles +// These must be a power of 2. 
Timings as of commit 82d31652 +// +// Baseline time per 16-byte block = 14109 cycles (with no jitter) #ifndef REFCHAFF_PERIOD -#define REFCHAFF_PERIOD 1 // Extra cost per 16-byte block = { 462 462 }/REFCHAFF_PERIOD cycles +#define REFCHAFF_PERIOD 1 // Extra cost per 16-byte block = 474/REFCHAFF_PERIOD cycles #endif #ifndef REMAP_PERIOD -#define REMAP_PERIOD 4 // Extra cost per 16-byte block = { 4131 4131 }/REMAP_PERIOD cycles +#define REMAP_PERIOD 4 // Extra cost per 16-byte block = 4148/REMAP_PERIOD cycles #endif #ifndef REFROUNDKEYSHARES_PERIOD -#define REFROUNDKEYSHARES_PERIOD 1 // Extra cost per 16-byte block = { 1107 1212 }/REFROUNDKEYSHARES_PERIOD cycles +#define REFROUNDKEYSHARES_PERIOD 1 // Extra cost per 16-byte block = 1304/REFROUNDKEYSHARES_PERIOD cycles #endif #ifndef REFROUNDKEYHVPERMS_PERIOD -#define REFROUNDKEYHVPERMS_PERIOD 1 // Extra cost per 16-byte block = { 936 1422 }/REFROUnDKEYVPERM_PERIOD cycles +#define REFROUNDKEYHVPERMS_PERIOD 1 // Extra cost per 16-byte block = 1486/REFROUNDKEYHVPERMS_PERIOD cycles #endif // Setting NUMREFSTATEVPERM to X means that state vperm refreshing happens on the first X AES rounds only, @@ -66,5 +78,13 @@ // The rationale for doing it this way is that later rounds should be protected by CT_BPERM. // NUMREFSTATEVPERM can be from 0 to 14. 
#ifndef NUMREFSTATEVPERM -#define NUMREFSTATEVPERM 7 // Extra cost per 16-byte block = 80*NUMREFSTATEVPERM cycles +#define NUMREFSTATEVPERM 7 // Extra cost per 16-byte block = 61*NUMREFSTATEVPERM cycles +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////////////// + +#define MAX_NUM_BLOCKS 32768 + +#if SH_JITTER && !GEN_RAND_SHA +#error GEN_RAND_SHA must be set if you want to use SH_JITTER #endif diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c index 79fb8fb10..814ce70cc 100644 --- a/bootloaders/encrypted/enc_bootloader.c +++ b/bootloaders/encrypted/enc_bootloader.c @@ -16,51 +16,17 @@ #include "config.h" -volatile uint32_t systick_data[18]; // count, R0-R15,RETPSR - -extern void remap(); -extern uint32_t gen_rand_sha(); -extern void init_key(uint8_t *key); -extern void gen_lut_sbox(); -extern int ctr_crypt_s(uint8_t*iv,uint8_t*buf,int nblk); - -extern uint8_t rkey_s[480]; -extern uint8_t lut_a[256]; -extern uint8_t lut_b[256]; -extern uint32_t lut_a_map[1]; -extern uint32_t lut_b_map[1]; -extern uint32_t rstate_sha[4],rstate_lfsr[2]; - -void resetrng() { - uint32_t f0,f1; - do f0=get_rand_32(); while(f0==0); // make sure we don't initialise the LFSR to zero - f1=get_rand_32(); - rstate_sha[0]=f0&0xffffff00; // bottom byte must be zero (or 4) for SHA, representing "out of data" - rstate_sha[1]=f1; - rstate_sha[2]=0x41414141; - rstate_sha[3]=0x41414141; - rstate_lfsr[0]=f0; // must be nonzero for non-degenerate LFSR - rstate_lfsr[1]=0x1d872b41; // constant that defines LFSR -#if GEN_RAND_SHA - reset_block(RESETS_RESET_SHA256_BITS); - unreset_block(RESETS_RESET_SHA256_BITS); -#endif -} +#define OTP_KEY_PAGE 30 -static void init_lut_map() { - int i; - for(i=0;i<256;i++) lut_b[i]=gen_rand_sha()&0xff, lut_a[i]^=lut_b[i]; - lut_a_map[0]=0; - lut_b_map[0]=0; - remap(); -} +extern void decrypt(uint8_t* key4way, uint8_t* iv, uint8_t(*buf)[16], int nblk); -static void 
init_aes() { - resetrng(); - gen_lut_sbox(); - init_lut_map(); +// The function lock_key() is called from decrypt() after key initialisation is complete and before decryption begins. +// That is a suitable point to lock the OTP area where key information is stored. +void lock_key() { + otp_hw->sw_lock[OTP_KEY_PAGE] = 0xf; } + static __attribute__((aligned(4))) uint8_t workarea[4 * 1024]; int main() { @@ -182,13 +148,10 @@ int main() { for (int i=0; i < 4; i++) printf("%08x\n", *(uint32_t*)(SRAM_BASE + i*4)); - init_aes(); // Read key directly from OTP - guarded reads will throw a bus fault if there are any errors uint16_t* otp_data = (uint16_t*)OTP_DATA_GUARDED_BASE; - init_key((uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & 0x780)])); - otp_hw->sw_lock[30] = 0xf; - ctr_crypt_s(iv, (void*)SRAM_BASE, data_size/16); + decrypt((uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & (OTP_KEY_PAGE * 0x40))]), iv, (void*)SRAM_BASE, data_size/16); printf("Post decryption image begins with\n"); for (int i=0; i < 4; i++) From bcce195016584e02b54ec04f99909327e0540e76 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Wed, 26 Feb 2025 12:32:10 +0000 Subject: [PATCH 11/20] Add IV salts --- bootloaders/encrypted/CMakeLists.txt | 2 +- bootloaders/encrypted/ivsalt.bin | 1 + bootloaders/encrypted/otp.json | 3 ++- encrypted/hello_encrypted/CMakeLists.txt | 1 + encrypted/hello_encrypted/ivsalt.bin | 1 + encrypted/hello_encrypted/otp.json | 3 ++- 6 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 bootloaders/encrypted/ivsalt.bin create mode 100644 encrypted/hello_encrypted/ivsalt.bin diff --git a/bootloaders/encrypted/CMakeLists.txt b/bootloaders/encrypted/CMakeLists.txt index 44180c4f6..bd16b34d2 100644 --- a/bootloaders/encrypted/CMakeLists.txt +++ b/bootloaders/encrypted/CMakeLists.txt @@ -85,7 +85,7 @@ add_linker_script(hello_serial_enc "0x20000000" "448k") # sign, hash, and encrypt pico_sign_binary(hello_serial_enc ${CMAKE_CURRENT_LIST_DIR}/private.pem) 
pico_hash_binary(hello_serial_enc) -pico_encrypt_binary(hello_serial_enc ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin) +pico_encrypt_binary(hello_serial_enc ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin ${CMAKE_CURRENT_LIST_DIR}/ivsalt.bin) # package uf2 in flash pico_package_uf2_output(hello_serial_enc 0x10000000) diff --git a/bootloaders/encrypted/ivsalt.bin b/bootloaders/encrypted/ivsalt.bin new file mode 100644 index 000000000..fb9ef50b8 --- /dev/null +++ b/bootloaders/encrypted/ivsalt.bin @@ -0,0 +1 @@ +x%^=TČ \ No newline at end of file diff --git a/bootloaders/encrypted/otp.json b/bootloaders/encrypted/otp.json index 4c671139a..a238c1511 100644 --- a/bootloaders/encrypted/otp.json +++ b/bootloaders/encrypted/otp.json @@ -1,5 +1,6 @@ { "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ], "OTP_DATA_KEY1_VALID" : "0x010101", - "PAGE30_LOCK0" : "0x494949" + "PAGE30_LOCK0" : "0x494949", + "PAGE31_LOCK0" : "0x494949" } diff --git a/encrypted/hello_encrypted/CMakeLists.txt b/encrypted/hello_encrypted/CMakeLists.txt index 7f72ece66..284c9b309 100644 --- a/encrypted/hello_encrypted/CMakeLists.txt +++ b/encrypted/hello_encrypted/CMakeLists.txt @@ -34,6 +34,7 @@ pico_sign_binary(hello_encrypted ${CMAKE_CURRENT_LIST_DIR}/private.pem) pico_hash_binary(hello_encrypted) pico_encrypt_binary(hello_encrypted ${CMAKE_CURRENT_LIST_DIR}/privateaes.bin + ${CMAKE_CURRENT_LIST_DIR}/ivsalt.bin EMBED OTP_KEY_PAGE 29) diff --git a/encrypted/hello_encrypted/ivsalt.bin b/encrypted/hello_encrypted/ivsalt.bin new file mode 100644 index 000000000..fb9ef50b8 --- /dev/null +++ b/encrypted/hello_encrypted/ivsalt.bin @@ -0,0 +1 @@ +x%^=TČ \ No newline at end of file diff --git a/encrypted/hello_encrypted/otp.json b/encrypted/hello_encrypted/otp.json index 2a4bbe2c4..c9a7be2d5 100644 --- a/encrypted/hello_encrypted/otp.json +++ b/encrypted/hello_encrypted/otp.json @@ -1,5 +1,6 @@ { "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ], "OTP_DATA_KEY1_VALID" : "0x010101", 
- "PAGE29_LOCK0" : "0x494949" + "PAGE29_LOCK0" : "0x494949", + "PAGE30_LOCK0" : "0x494949" } From 05557f51940e460d906576e66c7d5c571ec34de5 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Wed, 26 Feb 2025 15:32:12 +0000 Subject: [PATCH 12/20] Update with latest aes.S --- bootloaders/encrypted/aes.S | 204 ++++++++++++------------- bootloaders/encrypted/enc_bootloader.c | 11 +- 2 files changed, 105 insertions(+), 110 deletions(-) diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S index e0d653237..2014b5fd2 100644 --- a/bootloaders/encrypted/aes.S +++ b/bootloaders/encrypted/aes.S @@ -40,7 +40,7 @@ scratch RAM and the stack pointer is overwritten. #define CTAG8 0x33 #define CTAG9 0x34 #define CTAG10 0x35 @ not used -#define CTAG11 0x36 +#define CTAG11 0x36 @ not used #define CTAG12 0x37 #define CTAG13 0x38 #define CTAG14 0x39 @@ -93,6 +93,8 @@ scratch RAM and the stack pointer is overwritten. .endif .endm +@ Clear internal stripe load registers, and r0-r3 +@ 0 <= offset <= 32 .macro clear03 offset=0 getchaffaddress r0,\offset ldmia r0,{r0-r3} @@ -158,6 +160,10 @@ RKshareC: @ Round key common share C; see comment at init_key .space 4 RKshareCchange: @ Temporary used by ref_roundkey_share_s .space 4 +IV0: @ 2-way share of IV for block 0 +.space 36 @ Considering IV0 as a word pointer, the format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16) + @ The gap at IV0[4] is to defeat unsharing by internal striped memory registers + @ I.e., there are implicit XORs IV0[0]^IV0[4], IV0[1]^IV0[5], ..., that the 1 word offset renders useless @ Regardless of configuration, the code uses a single 256-entry LUT, @ which is a simple S-box table. 
@@ -323,11 +329,11 @@ gen_rand_sha: ldr r2,=rstate_sha ldr r0,[r2,#jstate-rstate_sha] movs r1,#1 - movs r3,r0,lsl#2 - ands r3,r3,#31 - movs r3,r1,lsl r3 @ 1<<(4*(r0&7)) - udiv r3,r3,r1 @ Takes constant + (r0&7) cycles - lsrs r0,r0,#1 + ands r3,r0,#3 + movs r3,r3,lsl#2 + movs r3,r1,lsl r3 @ 1<<(4*(r0&3)) + udiv r3,r3,r1 @ Takes constant + (r0&3) cycles + lsrs r0,r0,#2 bne 1f bl gen_rand_sha_nonpres ldr r2,=rstate_sha @@ -352,6 +358,7 @@ gen_rand_sha_nonpres: strb r3,[r2] @ save updated SUM register offset in bottom byte of rstate_sha[] bx r14 1: +@ [CK_JITTER code was here] movs r3,#SHA256_SUM6_OFFSET+1 strb r3,[r2] @ reset word counter: the +1 is compensated for later movw r1,#(1<r5-r6->r7->r4 etc.) by an addtional amount +@ Cycle share registers r4-r7, r8-r11 (r4->r5-r6->r7->r4 etc.) by an addtional amount @ given in the bottom two bits of R0 and update the rotation recorded at statevperm. @ On entry R1 must point to statevperm. @ Trashes r0-r3,r12 @@ -901,46 +911,7 @@ addstatevperm_exit: @ label exit point to be to able to specify to ana bx r14 .endif -@ Switch from non-shared to shared state -@ Trashes r0-r3,r12 -.balign 4 -ns_to_s: - GET_CANARY r12,CTAG11,6 - push {r12,r14} -.if ST_SHAREC - bl gen_rand_sha_nonpres @ Create state share C; all bytes the same - ands r0,r0,#255 - orrs r0,r0,r0,lsl#8 - orrs r12,r0,r0,lsl#16 - ldr r1,=shareC - str r12,[r1] -.else - movs r12,#0 -.endif - bl gen_rand_sha_nonpres - eors r4,r4,r0 - eor r8,r12,r0,ror#16 - bl gen_rand_sha_nonpres - eors r5,r5,r0 - eor r9,r12,r0,ror#16 - bl gen_rand_sha_nonpres - eors r6,r6,r0 - eor r10,r12,r0,ror#16 - bl gen_rand_sha_nonpres - eors r7,r7,r0 - eor r11,r12,r0,ror#16 -.if ST_VPERM - bl gen_rand_sha_nonpres - ldr r1,=statevperm - movs r2,#0 - str r2,[r1] - bl addstatevperm @ Initialise state vperm with SHA RNG, refresh with LFSR RNG -.endif - pop {r12,r14} - CHK_CANARY r12,CTAG11,6 - bx r14 - -@ Conjugate lut_a, lut_b with shareC +@ Conjugate lut_a, lut_b with (state) shareC @ I.e., EOR the 
input and output with shareC. @ We need to pick one input for each share A and B, and one output for ONE of the shares A and B @ Arbitrarily choosing a0, b1 and d0 @@ -1653,36 +1624,57 @@ addrkey_s: .endif ctr_crypt_s: -@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks +@ r0=IV_shareA, r1=IV_shareB, r2=cipher/plaintext buffer, r3=number of blocks GET_CANARY r12,CTAG0,6 push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets - push {r0-r2} + push {r0-r3} + SET_COUNT 93,6 .if CT_BPERM @ Initialise 32 random numbers (which fit in half-words) +@ r3=number of blocks ldr r4,=bperm_rand movs r5,#32 1: bl gen_rand_sha - umull r0,r3,r0,r2 @ Random number between 0 and n-1 (n=#blocks) - strh r3,[r4],#2 + umull r0,r2,r0,r3 @ Random number between 0 and n-1 (n=#blocks) + strh r2,[r4],#2 subs r5,r5,#1 bne 1b .endif bl randomisechaff - pop {r0-r2} + +@ Refresh IVshareA and IVshareB, convert to ror#16 format and store the result at IV0 +@ Not doing shareC or state vperm at this point + pop {r0} + ldmia r0,{r4-r7} @ r4-r7 = IVshareA + clear03 16 + pop {r1} + ldmia r1,{r8-r11} @ r8-r11 = IVshareB + clear03 32 + bl gen_rand_sha_nonpres; eors r4,r4,r0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16 + bl gen_rand_sha_nonpres; eors r5,r5,r0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16 + bl gen_rand_sha_nonpres; eors r6,r6,r0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16 + bl gen_rand_sha_nonpres; eors r7,r7,r0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16 + ldr r0,=IV0 + stmia r0,{r4-r7} + adds r0,r0,#20 + stmia r0,{r8-r11} + pop {r1,r2} +@ r1=cipher/plaintext buffer, r2=number of blocks + movs r3,#0 CHK_COUNT 93,6 ctr_crypt_mainloop: SET_COUNT 80,6 -@ here r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter @ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it) - push {r0-r3} + push {r1-r3} @ It's OK for execution time to 
depend on the block counter r3 ("public"), but not the block number (secret) tst r3,#(REFCHAFF_PERIOD-1) @@ -1690,7 +1682,7 @@ ctr_crypt_mainloop: bl refreshchaff_and_lfsr 1: - ldr r3,[r13,#12] @ get block count off the stack + ldr r3,[r13,#8] @ get block count off the stack tst r3,#(REMAP_PERIOD-1) bne 1f bl remap @ shuffle the LUTs; this preserves R3 @@ -1702,7 +1694,7 @@ ctr_crypt_mainloop: bl ref_roundkey_shares_s @ refresh the round key shares 1: - ldr r3,[r13,#12] @ get block count off the stack + ldr r3,[r13,#8] @ get block count off the stack tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1) bne 1f bl ref_roundkey_hvperms_s @ refresh the round key vperms @@ -1710,13 +1702,13 @@ ctr_crypt_mainloop: CHK_COUNT 81,6 - pop {r0-r3} -@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter + pop {r1-r3} +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter @ Now calculate r12 = block number-to-be-deciphered from r3 = block counter .if CT_BPERM @ Use a "swap-or-not" method to generate an "oblivious" permutation; see makeperm.py version 7 - push {r0,r1} + push {r1} ldr r0,=murmur3_constants ldmia r0,{r9-r12,r14} @ load five murmur3_32 hash constants ldr r0,=bperm_rand @@ -1752,57 +1744,53 @@ ctr_crypt_mainloop: adds r4,r4,r7 @ r4=j if top bit of r6, else i subs r1,r1,#1 bpl 1b - pop {r0,r1} + pop {r1} mov r12,r4 .else mov r12,r3 .endif CHK_COUNT 82,6 -@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered) - push {r0-r3,r12} +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered) + push {r1-r3,r12} +@ r4-r11 = IV0, r12=block number processIV: @ non-target label to assist power analysis - -@ It is not clear if the following addition of the block number in r12 to the IV can usefully -@ be done in terms of shares. 
Instead we do an addition and subtraction whose overall effect -@ is the same, and which provides a small degree of masking. The IV is not traditionally a secret, -@ though it will make it harder for the attacker if it is obscured. - bl gen_rand_sha - movs r8,r0,lsr#16 @ only use 16 low bits so we don't get any overflows in the following, and so that a carry from the first word is rare - add r9,r8,r12 @ "masked" block number -@ r8=random, r9=(block number)+r8, stack=IV,... - - ldr r0,[r13] @ peek at stack to restore r0=IV ptr - ldmia r0,{r4-r7} @ load IV - clear03 @ barrier to remove traces of IV from internal CPU load registers - -@ Add in r9 in byte-big-endian, bit-little-endian (!) fashion, while trying to avoid rev operations -@ as far as possible as these tend to expose (via power fluctuations) byte-level hamming weights. -@ First do 128-bit addition of r9 to byte-reversed IV - rev r7,r7 - cmn r7,#MAX_NUM_BLOCKS @ Compare against maximum number of blocks - bcs 1f - add r7,r7,r9 @ This can temporarily overflow but it doesn't matter as we know that r7+r12 does not overflow - sub r7,r7,r8 - b 2f -1: - adds r7,r7,r9 - rev r6,r6; adcs r6,r6,#0 - rev r5,r5; adcs r5,r5,#0 - rev r4,r4; adcs r4,r4,#0 -@ Now do 128-bit subtraction of r8 from byte-reversed IV - subs r7,r7,r8 - sbcs r6,r6,#0; rev r6,r6 - sbcs r5,r5,#0; rev r5,r5 - sbcs r4,r4,#0; rev r4,r4 -2: - rev r7,r7 - clear01 16 + ldr r8,=IV0 + ldmia r8,{r4-r7} @ load IV0_A + clear03 16 + add r8,r8,#20 + ldmia r8,{r8-r11} @ load IV0_B + clear03 32 + rev r0,r12 + eor r7,r7,r0 @ XOR in block number to IV0. IV(block n) = IV0 ^ n, cf standard CTR mode IV0 + n. 
+ @ XOR (vs addition) is compatible with XOR-shares, so stealthier/simpler because don't have to unshare to work out IV(block n) +@ r4-r11 = IV for the current block CHK_COUNT 83,6 +.if ST_SHAREC + bl gen_rand_sha_nonpres @ Create state share C; all bytes the same + ands r0,r0,#255 + orrs r0,r0,r0,lsl#8 + orrs r12,r0,r0,lsl#16 + ldr r1,=shareC + str r12,[r1] +.else + movs r12,#0 +.endif +@ r4-r11 = IV for the current block w/o shareC, r12=shareC +@ refresh state shares and mix in shareC + bl gen_rand_sha_nonpres; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc + bl gen_rand_sha_nonpres; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16 + bl gen_rand_sha_nonpres; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16 + bl gen_rand_sha_nonpres; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16 +.if ST_VPERM + bl gen_rand_sha_nonpres + ldr r1,=statevperm + movs r2,#0 + str r2,[r1] + bl addstatevperm @ Initialise state vperm (use SHA RNG to start with, later refreshes are with LFSR RNG) +.endif -@ r4-r7 = IV for the current block - bl ns_to_s @ convert IV+x to shares, which includes choosing and incorporating a random shareC CHK_COUNT 84,6 bl conjshareC @ Add the effect of shareC to lut_a, lut_b CHK_COUNT 85,6 @@ -1849,9 +1837,9 @@ rounds_s_mainloop: bl addstatevperm .endif - pop {r0-r3,r12} - push {r0,r3} -@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered + pop {r1-r3,r12} + push {r3} +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered decryption_start: @ Decrypt ciphertext using AES output in shares: r4-r11 @@ -1893,8 +1881,8 @@ decryption_start: sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer CHK_COUNT 90,6 - pop {r0,r3} @ Restore IV and block counter -@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter + pop {r3} @ Restore 
block counter +@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter decryption_end: adds r3,r3,#1 diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c index 814ce70cc..97012f485 100644 --- a/bootloaders/encrypted/enc_bootloader.c +++ b/bootloaders/encrypted/enc_bootloader.c @@ -18,7 +18,7 @@ #define OTP_KEY_PAGE 30 -extern void decrypt(uint8_t* key4way, uint8_t* iv, uint8_t(*buf)[16], int nblk); +extern void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk); // The function lock_key() is called from decrypt() after key initialisation is complete and before decryption begins. // That is a suitable point to lock the OTP area where key information is stored. @@ -151,7 +151,14 @@ int main() { // Read key directly from OTP - guarded reads will throw a bus fault if there are any errors uint16_t* otp_data = (uint16_t*)OTP_DATA_GUARDED_BASE; - decrypt((uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & (OTP_KEY_PAGE * 0x40))]), iv, (void*)SRAM_BASE, data_size/16); + decrypt( + (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & (OTP_KEY_PAGE * 0x40))]), + (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & ((OTP_KEY_PAGE + 1) * 0x40))]), + iv, (void*)SRAM_BASE, data_size/16 + ); + + // Lock the IV salt + otp_hw->sw_lock[OTP_KEY_PAGE + 1] = 0xf; printf("Post decryption image begins with\n"); for (int i=0; i < 4; i++) From ad9842acc9a0dd12e4036d9064028f861848d341 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Wed, 26 Feb 2025 15:39:11 +0000 Subject: [PATCH 13/20] Update readmes This includes the changes from #553 --- bootloaders/encrypted/README.md | 12 +++++++++++- encrypted/hello_encrypted/README.md | 26 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 encrypted/hello_encrypted/README.md diff --git a/bootloaders/encrypted/README.md b/bootloaders/encrypted/README.md index cd909ddff..0ad03bc6d 100644 --- a/bootloaders/encrypted/README.md +++ 
b/bootloaders/encrypted/README.md @@ -1,4 +1,4 @@ -Replace private.pem and privateaes.bin with your own keys - your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with: +For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with: ```bash openssl ecparam -name secp256k1 -genkey -out private.pem @@ -15,6 +15,16 @@ or in Powershell 7 [byte[]] $(Get-SecureRandom -Maximum 256 -Count 128) | Set-Content privateaes.bin -AsByteStream ``` +The IV salt is just a 16 byte binary file - you can create it the same way, replacing `128` with `16` and `privateaes.bin` with `ivsalt.bin` in the commands above. + +You will need to program your OTP using the `otp.json` file generated by the build in your build folder +NOTE: This will enable secure boot on your device, so only correctly signed binaries can then run, and will also lock down the OTP pages the AES key and IV salt are stored in. +```bash +picotool otp load otp.json +``` + +> For more information on security see chapter 10 of the [RP2350 datasheet](https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf), and for information on how to sign other binaries to run on a secure chip see chapter 5.10 + Then either drag & drop the UF2 files to the device in order (enc_bootloader first, then hello_serial_enc) waiting for a reboot in-between, or run ```bash picotool load enc_bootloader.uf2 diff --git a/encrypted/hello_encrypted/README.md b/encrypted/hello_encrypted/README.md new file mode 100644 index 000000000..ea2207c65 --- /dev/null +++ b/encrypted/hello_encrypted/README.md @@ -0,0 +1,26 @@ +For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Your signing key must be for the _secp256k1_ curve, in PEM format. 
You can create a .PEM file with: + +```bash +openssl ecparam -name secp256k1 -genkey -out private.pem +``` + +The AES key is stored as a 4-way share in a 128 byte binary file - you can create one with + +```bash +dd if=/dev/urandom of=privateaes.bin bs=1 count=128 +``` + +or in Powershell 7 +```powershell +[byte[]] $(Get-SecureRandom -Maximum 256 -Count 128) | Set-Content privateaes.bin -AsByteStream +``` + +The IV salt is just a 16 byte binary file - you can create it the same way, replacing `128` with `16` and `privateaes.bin` with `ivsalt.bin` in the commands above. + +You will need to program your OTP using the `otp.json` file generated by the build in your build folder +NOTE: This will enable secure boot on your device, so only correctly signed binaries can then run, and will also lock down the OTP pages the AES key and IV salt are stored in. +```bash +picotool otp load otp.json +``` + +> For more information on security see chapter 10 of the [RP2350 datasheet](https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf), and for information on how to sign other binaries to run on a secure chip see chapter 5.10 From d9337653516f2d82ba547554b1e76ea41316d89a Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Wed, 26 Feb 2025 16:09:39 +0000 Subject: [PATCH 14/20] Add secret file to print out This is useful for testing decryption with large files --- encrypted/hello_encrypted/CMakeLists.txt | 7 +++++++ encrypted/hello_encrypted/hello_encrypted.c | 3 +++ encrypted/hello_encrypted/secret.S | 5 +++++ encrypted/hello_encrypted/secret.txt | 1 + 4 files changed, 16 insertions(+) create mode 100644 encrypted/hello_encrypted/secret.S create mode 100644 encrypted/hello_encrypted/secret.txt diff --git a/encrypted/hello_encrypted/CMakeLists.txt b/encrypted/hello_encrypted/CMakeLists.txt index 284c9b309..2ecfb858c 100644 --- a/encrypted/hello_encrypted/CMakeLists.txt +++ b/encrypted/hello_encrypted/CMakeLists.txt @@ -1,8 +1,15 @@ # Example encrypted binard 
add_executable(hello_encrypted hello_encrypted.c + secret.S ) +# include secret.txt +target_include_directories(hello_encrypted PRIVATE ${CMAKE_CURRENT_LIST_DIR}) + +# add dependency on secret.txt +set_property(SOURCE secret.S APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_LIST_DIR}/secret.txt) + # pull in common dependencies target_link_libraries(hello_encrypted pico_stdlib) diff --git a/encrypted/hello_encrypted/hello_encrypted.c b/encrypted/hello_encrypted/hello_encrypted.c index e96155745..76d40bfe9 100644 --- a/encrypted/hello_encrypted/hello_encrypted.c +++ b/encrypted/hello_encrypted/hello_encrypted.c @@ -20,11 +20,14 @@ int main() { rom_explicit_buy(buffer, 4096); free(buffer); #endif + extern char secret_data[]; while (true) { printf("Hello, world!\n"); printf("I'm a self-decrypting binary\n"); printf("My secret is...\n"); sleep_ms(1000); + printf(secret_data); + sleep_ms(10000); } } diff --git a/encrypted/hello_encrypted/secret.S b/encrypted/hello_encrypted/secret.S new file mode 100644 index 000000000..0014c0d6e --- /dev/null +++ b/encrypted/hello_encrypted/secret.S @@ -0,0 +1,5 @@ +.section .rodata +.global secret_data +secret_data: +.incbin "secret.txt" +.byte 0 \ No newline at end of file diff --git a/encrypted/hello_encrypted/secret.txt b/encrypted/hello_encrypted/secret.txt new file mode 100644 index 000000000..351db192a --- /dev/null +++ b/encrypted/hello_encrypted/secret.txt @@ -0,0 +1 @@ +TODO: Put a funny secret here From 522208e60fcaaa514c8cca78d431b2856d3dfe08 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Wed, 26 Feb 2025 16:19:49 +0000 Subject: [PATCH 15/20] Add notes about unique AES keys, and not losing keys/salts --- bootloaders/encrypted/README.md | 6 ++++-- encrypted/hello_encrypted/README.md | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bootloaders/encrypted/README.md b/bootloaders/encrypted/README.md index 0ad03bc6d..582a379bc 100644 --- a/bootloaders/encrypted/README.md +++ 
b/bootloaders/encrypted/README.md @@ -1,4 +1,6 @@ -For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with: +For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. The AES key should also be different for each device. Make sure you **don't lose your keys and salts**, else you may not be able to boot code on your device. + +Your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with: ```bash openssl ecparam -name secp256k1 -genkey -out private.pem @@ -23,7 +25,7 @@ NOTE: This will enable secure boot on your device, so only correctly signed bina picotool otp load otp.json ``` -> For more information on security see chapter 10 of the [RP2350 datasheet](https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf), and for information on how to sign other binaries to run on a secure chip see chapter 5.10 +> For more information on security see chapter 10 of the [RP2350 datasheet](https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf), and for information on how to sign other binaries to run on a secure chip see section 5.10 Then either drag & drop the UF2 files to the device in order (enc_bootloader first, then hello_serial_enc) waiting for a reboot in-between, or run ```bash diff --git a/encrypted/hello_encrypted/README.md b/encrypted/hello_encrypted/README.md index ea2207c65..c1ce1435d 100644 --- a/encrypted/hello_encrypted/README.md +++ b/encrypted/hello_encrypted/README.md @@ -1,4 +1,6 @@ -For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Your signing key must be for the _secp256k1_ curve, in PEM format. 
You can create a .PEM file with: +For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. The AES key should also be different for each device. Make sure you **don't lose your keys and salts**, else you may not be able to boot code on your device. + +Your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with: ```bash openssl ecparam -name secp256k1 -genkey -out private.pem @@ -23,4 +25,4 @@ NOTE: This will enable secure boot on your device, so only correctly signed bina picotool otp load otp.json ``` -> For more information on security see chapter 10 of the [RP2350 datasheet](https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf), and for information on how to sign other binaries to run on a secure chip see chapter 5.10 +> For more information on security see chapter 10 of the [RP2350 datasheet](https://datasheets.raspberrypi.com/rp2350/rp2350-datasheet.pdf), and for information on how to sign other binaries to run on a secure chip see section 5.10 From c3bc79a80553770fea0c190313f20e1a144e8225 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Wed, 26 Feb 2025 17:07:03 +0000 Subject: [PATCH 16/20] Update readmes --- bootloaders/encrypted/README.md | 2 +- encrypted/hello_encrypted/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bootloaders/encrypted/README.md b/bootloaders/encrypted/README.md index 582a379bc..0e10e5e3d 100644 --- a/bootloaders/encrypted/README.md +++ b/bootloaders/encrypted/README.md @@ -1,4 +1,4 @@ -For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. The AES key should also be different for each device. Make sure you **don't lose your keys and salts**, else you may not be able to boot code on your device. 
+For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Make sure you **don't lose your keys and salts**, else you may not be able to update the code on your device. Your signing key must be for the _secp256k1_ curve, in PEM format. You can create a .PEM file with: diff --git a/encrypted/hello_encrypted/README.md b/encrypted/hello_encrypted/README.md index c1ce1435d..d65a2c50c 100644 --- a/encrypted/hello_encrypted/README.md +++ b/encrypted/hello_encrypted/README.md @@ -1,4 +1,4 @@ -For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. The AES key should also be different for each device. Make sure you **don't lose your keys and salts**, else you may not be able to boot code on your device. +For security you **must** replace private.pem and privateaes.bin with your own keys, and ivsalt.bin with your own per-device salt. Make sure you **don't lose your keys and salts**, else you may not be able to update the code on your device. Your signing key must be for the _secp256k1_ curve, in PEM format. 
You can create a .PEM file with: From bde13d69cbe82fce8124a814241669f41c31f2f9 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Thu, 20 Mar 2025 11:05:58 +0000 Subject: [PATCH 17/20] Fix enc_bootloader example OTP output --- bootloaders/encrypted/CMakeLists.txt | 20 ++++----- bootloaders/encrypted/aes.S | 59 +++++++++++++++++--------- bootloaders/encrypted/enc_bootloader.c | 4 +- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/bootloaders/encrypted/CMakeLists.txt b/bootloaders/encrypted/CMakeLists.txt index bd16b34d2..ba7190b8b 100644 --- a/bootloaders/encrypted/CMakeLists.txt +++ b/bootloaders/encrypted/CMakeLists.txt @@ -4,13 +4,6 @@ add_executable(enc_bootloader aes.S ) -# Copy otp.json file to build directory -add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/otp.json - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_LIST_DIR}/otp.json" "${CMAKE_CURRENT_BINARY_DIR}/otp.json" - DEPENDS ${CMAKE_CURRENT_LIST_DIR}/otp.json) -add_custom_target(otp_json DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/otp.json) -add_dependencies(enc_bootloader otp_json) - # pull in common dependencies target_link_libraries(enc_bootloader pico_stdlib pico_rand) @@ -45,9 +38,6 @@ endfunction() # create linker script to run from 0x20078000 add_linker_script(enc_bootloader "0x20078000" "32k") -# configure otp output -pico_set_otp_key_output_file(enc_bootloader ${CMAKE_CURRENT_BINARY_DIR}/otp.json) - # sign, hash, and clear SRAM pico_sign_binary(enc_bootloader ${CMAKE_CURRENT_LIST_DIR}/private.pem) pico_hash_binary(enc_bootloader) @@ -82,6 +72,16 @@ pico_set_binary_type(hello_serial_enc no_flash) # create linker script to ensure it doesn't overwrite the bootloader at 0x20070000 add_linker_script(hello_serial_enc "0x20000000" "448k") +# Copy otp.json file to build directory +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/otp.json + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_LIST_DIR}/otp.json" "${CMAKE_CURRENT_BINARY_DIR}/otp.json" + DEPENDS 
${CMAKE_CURRENT_LIST_DIR}/otp.json) +add_custom_target(otp_json DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/otp.json) +add_dependencies(hello_serial_enc otp_json) + +# configure otp output +pico_set_otp_key_output_file(hello_serial_enc ${CMAKE_CURRENT_BINARY_DIR}/otp.json) + # sign, hash, and encrypt pico_sign_binary(hello_serial_enc ${CMAKE_CURRENT_LIST_DIR}/private.pem) pico_hash_binary(hello_serial_enc) diff --git a/bootloaders/encrypted/aes.S b/bootloaders/encrypted/aes.S index 2014b5fd2..093c4b0f1 100644 --- a/bootloaders/encrypted/aes.S +++ b/bootloaders/encrypted/aes.S @@ -278,31 +278,39 @@ init_rstate: str r1,[r4,#TRNG_TRNG_CONFIG_OFFSET -TRNG_RNG_IMR_OFFSET] @ turn off rand source and wipe SHA bits left in TRNG config; r1=0 str r1,[r4,#TRNG_RND_SOURCE_ENABLE_OFFSET -TRNG_RNG_IMR_OFFSET] adds r5,r5,#SHA256_SUM0_OFFSET - ldmia r5!,{r0-r3} - ldr r5,=rstate_sha - stmia r5,{r0-r3} +@ r5=SHA256 SUM0 register (r5+4=SUM1, r5+8=SUM2, etc) + ldmia r5,{r0-r3} @ load first 4 words of the 8 word SHA256 output + ldr r6,=rstate_sha +@ r5=SHA256 SUM0 register (r5+4=SUM1, r5+8=SUM2, etc), r6=rstate_sha + stmia r6,{r0-r3} CHK_COUNT 26,6 - -@ r5=rstate_sha movs r0,#0 - strb r0,[r5] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data" -@ try to find a non-zero initialiser to create a non-degenerate LFSR - ldr r1,[r5,#4] - cbnz r1,1f @ is word 1 non-zero? then use it - ldr r1,[r5,#8] - cbnz r1,1f @ otherwise, is word 2 non-zero? use it - ldr r1,[r5,#12] - cbnz r1,1f @ otherwise, is word 3 non-zero? use it - mov r1,r5 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-96} probability) + strb r0,[r6] @ make sure rstate_sha[0] has byte 0 set to 0, representing "out of data" + +@ try to find a non-zero initialiser to create a non-degenerate LFSR random state + ldr r1,[r5,#16] @ SHA SUM4 + cbnz r1,1f @ is word 4 non-zero? then use it + ldr r1,[r5,#20] @ SHA SUM5 + cbnz r1,1f @ otherwise, is word 5 non-zero?
use it + mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) +1: + str r1,[r6,#rstate_lfsr-rstate_sha] + +@ try to find a non-zero initialiser to create a non-degenerate ROSC random state + ldr r1,[r5,#24] @ SHA SUM6 + cbnz r1,1f @ is word 6 non-zero? then use it + ldr r1,[r5,#28] @ SHA SUM7 + cbnz r1,1f @ otherwise, is word 7 non-zero? use it + mov r1,r6 @ give up and use the address of rstate_sha (which is non-zero); this can't really happen (2^{-64} probability) 1: - str r1,[r5,#rstate_lfsr-rstate_sha] ldr r2,=ROSC_RANDOM_OFFSET+ROSC_BASE - str r1,[r2,#0] + str r1,[r2,#0] @ Initialise ROSC LFSR CHK_COUNT 27,6 + .if GEN_RAND_SHA .if SH_JITTER movs r2,#0 - str r2,[r5,#jstate-rstate_sha] + str r2,[r6,#jstate-rstate_sha] .endif .endif @@ -1655,14 +1663,23 @@ ctr_crypt_s: pop {r1} ldmia r1,{r8-r11} @ r8-r11 = IVshareB clear03 32 - bl gen_rand_sha_nonpres; eors r4,r4,r0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16 - bl gen_rand_sha_nonpres; eors r5,r5,r0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16 - bl gen_rand_sha_nonpres; eors r6,r6,r0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16 - bl gen_rand_sha_nonpres; eors r7,r7,r0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16 + bl gen_rand_sha_nonpres; eors r4,r4,r0; movs r1,#0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc + bl gen_rand_sha_nonpres; eors r5,r5,r0; movs r1,#0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16 + bl gen_rand_sha_nonpres; eors r6,r6,r0; movs r1,#0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16 + bl gen_rand_sha_nonpres; eors r7,r7,r0; movs r1,#0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16 ldr r0,=IV0 stmia r0,{r4-r7} adds r0,r0,#20 stmia r0,{r8-r11} +@ "Decommission" IV0 so that it doesn't get stacked + bl gen_rand_sha_nonpres; movs r4,r0 + bl gen_rand_sha_nonpres; movs r5,r0 + bl gen_rand_sha_nonpres; movs r6,r0 + bl gen_rand_sha_nonpres; movs r7,r0 + bl gen_rand_sha_nonpres; mov r8,r0 + bl 
gen_rand_sha_nonpres; mov r9,r0 + bl gen_rand_sha_nonpres; mov r10,r0 + bl gen_rand_sha_nonpres; mov r11,r0 pop {r1,r2} @ r1=cipher/plaintext buffer, r2=number of blocks diff --git a/bootloaders/encrypted/enc_bootloader.c b/bootloaders/encrypted/enc_bootloader.c index 97012f485..54e89d2e5 100644 --- a/bootloaders/encrypted/enc_bootloader.c +++ b/bootloaders/encrypted/enc_bootloader.c @@ -152,8 +152,8 @@ int main() { uint16_t* otp_data = (uint16_t*)OTP_DATA_GUARDED_BASE; decrypt( - (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & (OTP_KEY_PAGE * 0x40))]), - (uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & ((OTP_KEY_PAGE + 1) * 0x40))]), + (uint8_t*)&(otp_data[OTP_KEY_PAGE * 0x40]), + (uint8_t*)&(otp_data[(OTP_KEY_PAGE + 1) * 0x40]), iv, (void*)SRAM_BASE, data_size/16 ); From d0379cb8d96a57989b37abe9cc3cb3a1ec6d6dd5 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Mon, 31 Mar 2025 11:02:41 +0100 Subject: [PATCH 18/20] Remove OTP key locking functionality from encrypted examples --- bootloaders/encrypted/CMakeLists.txt | 7 ------- bootloaders/encrypted/otp.json | 6 ------ encrypted/hello_encrypted/CMakeLists.txt | 7 ------- encrypted/hello_encrypted/otp.json | 6 ------ 4 files changed, 26 deletions(-) delete mode 100644 bootloaders/encrypted/otp.json delete mode 100644 encrypted/hello_encrypted/otp.json diff --git a/bootloaders/encrypted/CMakeLists.txt b/bootloaders/encrypted/CMakeLists.txt index ba7190b8b..2d6d77f0d 100644 --- a/bootloaders/encrypted/CMakeLists.txt +++ b/bootloaders/encrypted/CMakeLists.txt @@ -72,13 +72,6 @@ pico_set_binary_type(hello_serial_enc no_flash) # create linker script to ensure it doesn't overwrite the bootloader at 0x20070000 add_linker_script(hello_serial_enc "0x20000000" "448k") -# Copy otp.json file to build directory -add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/otp.json - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_LIST_DIR}/otp.json" "${CMAKE_CURRENT_BINARY_DIR}/otp.json" - DEPENDS ${CMAKE_CURRENT_LIST_DIR}/otp.json) 
-add_custom_target(otp_json DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/otp.json) -add_dependencies(hello_serial_enc otp_json) - # configure otp output pico_set_otp_key_output_file(hello_serial_enc ${CMAKE_CURRENT_BINARY_DIR}/otp.json) diff --git a/bootloaders/encrypted/otp.json b/bootloaders/encrypted/otp.json deleted file mode 100644 index a238c1511..000000000 --- a/bootloaders/encrypted/otp.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ], - "OTP_DATA_KEY1_VALID" : "0x010101", - "PAGE30_LOCK0" : "0x494949", - "PAGE31_LOCK0" : "0x494949" -} diff --git a/encrypted/hello_encrypted/CMakeLists.txt b/encrypted/hello_encrypted/CMakeLists.txt index 2ecfb858c..6afcb5f32 100644 --- a/encrypted/hello_encrypted/CMakeLists.txt +++ b/encrypted/hello_encrypted/CMakeLists.txt @@ -26,13 +26,6 @@ pico_set_binary_version(hello_encrypted MAJOR 7 MINOR 3) # set tbyb (optional) # target_compile_definitions(hello_encrypted PRIVATE PICO_CRT0_IMAGE_TYPE_TBYB=1) -# Copy otp.json file to build directory -add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/otp.json - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_LIST_DIR}/otp.json" "${CMAKE_CURRENT_BINARY_DIR}/otp.json" - DEPENDS ${CMAKE_CURRENT_LIST_DIR}/otp.json) -add_custom_target(hello_encrypted_otp_json DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/otp.json) -add_dependencies(hello_encrypted hello_encrypted_otp_json) - # configure otp output pico_set_otp_key_output_file(hello_encrypted ${CMAKE_CURRENT_BINARY_DIR}/otp.json) diff --git a/encrypted/hello_encrypted/otp.json b/encrypted/hello_encrypted/otp.json deleted file mode 100644 index c9a7be2d5..000000000 --- a/encrypted/hello_encrypted/otp.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "OTP_DATA_KEY1" : [ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 ], - "OTP_DATA_KEY1_VALID" : "0x010101", - "PAGE29_LOCK0" : "0x494949", - "PAGE30_LOCK0" : "0x494949" -} From 4235e8f8dcd67efde0fa320c595a8c7ae6a39a26 Mon Sep 17 00:00:00 2001 From: William 
Vinnicombe Date: Thu, 3 Apr 2025 18:12:44 +0100 Subject: [PATCH 19/20] Improve TBYB sequence Add self check (1 == 1), which is only performed on first boot --- encrypted/hello_encrypted/hello_encrypted.c | 27 ++++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/encrypted/hello_encrypted/hello_encrypted.c b/encrypted/hello_encrypted/hello_encrypted.c index 76d40bfe9..074f59777 100644 --- a/encrypted/hello_encrypted/hello_encrypted.c +++ b/encrypted/hello_encrypted/hello_encrypted.c @@ -15,10 +15,29 @@ int main() { stdio_init_all(); #if PICO_CRT0_IMAGE_TYPE_TBYB - // If TBYB image, then buy it - uint8_t* buffer = malloc(4096); - rom_explicit_buy(buffer, 4096); - free(buffer); + boot_info_t boot_info = {}; + int ret = rom_get_boot_info(&boot_info); + if (ret) { + // BOOT_TBYB_AND_UPDATE_FLAG_BUY_PENDING will always be set, but check anyway + if (boot_info.tbyb_and_update_info & BOOT_TBYB_AND_UPDATE_FLAG_BUY_PENDING) { + // Need to check flash_update_base is set to see if this is a TBYB update + uint32_t flash_update_base = boot_info.reboot_params[0]; + if (flash_update_base) { + printf("Perform self-check... "); + if (1 == 1) { + printf("passed\n"); + } else { + printf("failed - looping forever\n"); + while (true) sleep_ms(1000); + } + } + uint32_t buf_size = flash_update_base ? 4096 : 0; + uint8_t* buffer = flash_update_base ? 
malloc(buf_size) : NULL; + int ret = rom_explicit_buy(buffer, buf_size); + assert(ret == 0); + if (buffer) free(buffer); + } + } #endif extern char secret_data[]; From 25d5b4330cc39b6a9f12aa5976b66c64f6a9d459 Mon Sep 17 00:00:00 2001 From: William Vinnicombe Date: Thu, 24 Apr 2025 12:49:51 +0100 Subject: [PATCH 20/20] Review fixups Comments and readme --- README.md | 2 +- encrypted/hello_encrypted/CMakeLists.txt | 4 ++-- encrypted/hello_encrypted/hello_encrypted.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c4127fc1c..f718271ea 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ App|Description [channel_irq](dma/channel_irq) | Use an IRQ handler to reconfigure a DMA channel, in order to continuously drive data through a PIO state machine. [sniff_crc](dma/sniff_crc) | Use the DMA engine's 'sniff' capability to calculate a CRC32 on a data buffer. -### Encrypted +### Encrypted (RP235x Only) App|Description ---|--- diff --git a/encrypted/hello_encrypted/CMakeLists.txt b/encrypted/hello_encrypted/CMakeLists.txt index 6afcb5f32..8e400a646 100644 --- a/encrypted/hello_encrypted/CMakeLists.txt +++ b/encrypted/hello_encrypted/CMakeLists.txt @@ -1,10 +1,10 @@ -# Example encrypted binard +# Example encrypted binary add_executable(hello_encrypted hello_encrypted.c secret.S ) -# include secret.txt +# include directory containing secret.txt target_include_directories(hello_encrypted PRIVATE ${CMAKE_CURRENT_LIST_DIR}) # add dependency on secret.txt diff --git a/encrypted/hello_encrypted/hello_encrypted.c b/encrypted/hello_encrypted/hello_encrypted.c index 074f59777..ff578bdcc 100644 --- a/encrypted/hello_encrypted/hello_encrypted.c +++ b/encrypted/hello_encrypted/hello_encrypted.c @@ -24,7 +24,7 @@ int main() { uint32_t flash_update_base = boot_info.reboot_params[0]; if (flash_update_base) { printf("Perform self-check... 
"); - if (1 == 1) { + if (1 == 1) { // replace this with your actual self-check function printf("passed\n"); } else { printf("failed - looping forever\n");