Skip to content

Commit 05557f5

Browse files
committed
Update with latest aes.S
1 parent bcce195 commit 05557f5

File tree

2 files changed

+105
-110
lines changed

2 files changed

+105
-110
lines changed

bootloaders/encrypted/aes.S

+96-108
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ scratch RAM and the stack pointer is overwritten.
4040
#define CTAG8 0x33
4141
#define CTAG9 0x34
4242
#define CTAG10 0x35 @ not used
43-
#define CTAG11 0x36
43+
#define CTAG11 0x36 @ not used
4444
#define CTAG12 0x37
4545
#define CTAG13 0x38
4646
#define CTAG14 0x39
@@ -93,6 +93,8 @@ scratch RAM and the stack pointer is overwritten.
9393
.endif
9494
.endm
9595

96+
@ Clear internal stripe load registers, and r0-r3
97+
@ 0 <= offset <= 32
9698
.macro clear03 offset=0
9799
getchaffaddress r0,\offset
98100
ldmia r0,{r0-r3}
@@ -158,6 +160,10 @@ RKshareC: @ Round key common share C; see comment at init_key
158160
.space 4
159161
RKshareCchange: @ Temporary used by ref_roundkey_share_s
160162
.space 4
163+
IV0: @ 2-way share of IV for block 0
164+
.space 36 @ Considering IV0 as a word pointer, the format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16)
165+
@ The gap at IV0[4] is to defeat unsharing by internal striped memory registers
166+
@ I.e., there are implicit XORs IV0[0]^IV0[4], IV0[1]^IV0[5], ..., that the 1 word offset renders useless
161167

162168
@ Regardless of configuration, the code uses a single 256-entry LUT,
163169
@ which is a simple S-box table.
@@ -323,11 +329,11 @@ gen_rand_sha:
323329
ldr r2,=rstate_sha
324330
ldr r0,[r2,#jstate-rstate_sha]
325331
movs r1,#1
326-
movs r3,r0,lsl#2
327-
ands r3,r3,#31
328-
movs r3,r1,lsl r3 @ 1<<(4*(r0&7))
329-
udiv r3,r3,r1 @ Takes constant + (r0&7) cycles
330-
lsrs r0,r0,#1
332+
ands r3,r0,#3
333+
movs r3,r3,lsl#2
334+
movs r3,r1,lsl r3 @ 1<<(4*(r0&3))
335+
udiv r3,r3,r1 @ Takes constant + (r0&3) cycles
336+
lsrs r0,r0,#2
331337
bne 1f
332338
bl gen_rand_sha_nonpres
333339
ldr r2,=rstate_sha
@@ -352,6 +358,7 @@ gen_rand_sha_nonpres:
352358
strb r3,[r2] @ save updated SUM register offset in bottom byte of rstate_sha[]
353359
bx r14
354360
1:
361+
@ [CK_JITTER code was here]
355362
movs r3,#SHA256_SUM6_OFFSET+1
356363
strb r3,[r2] @ reset word counter: the +1 is compensated for later
357364
movw r1,#(1<<SHA256_CSR_BSWAP_LSB)+(1<<SHA256_CSR_START_LSB)
@@ -437,10 +444,13 @@ gen_rand_lfsr_nonpres:
437444
.balign 4
438445
.thumb_func
439446
decrypt:
447+
@ r0=4-way key, r1=IV_shareA, r2=IV_shareB, r3=message buffer, [r13]=number of blocks
448+
ldr r12,[r13] @ Pop 5th argument in r12 (which we are allowed to treat as scratch according to AAPCS)
440449
push {r14}
441450
GET_CANARY r14,CTAG3,6
442451
SET_COUNT 23,6
443-
push {r0-r12,r14}
452+
push {r4-r11,r14}
453+
push {r0-r3,r12} @ Save the five arguments
444454
bl reset_sha_trng
445455
bl init_rstate
446456
@ randomly re-share the LUT contents
@@ -463,11 +473,11 @@ decrypt:
463473
bl init_key_4way
464474
CHK_COUNT 31,6
465475
bl lock_key
466-
pop {r0-r2}
476+
pop {r0-r3} @ r0=IV_shareA, r1=IV_shareB, r2=message, r3=num blocks
467477
bl ctr_crypt_s
468478
bl randomisechaff
469479
clear03
470-
pop {r4-r12,r14}
480+
pop {r4-r11,r14}
471481
CHK_CANARY r14,CTAG3,6
472482
pop {r15}
473483

@@ -859,7 +869,7 @@ ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to ana
859869
.if ST_VPERM
860870
.balign 4
861871
.thumb_func
862-
@ Rotate share registers r4-r7, r8-r11 (r4->r5-r6->r7->r4 etc.) by an addtional amount
872+
@ Cycle share registers r4-r7, r8-r11 (r4->r5->r6->r7->r4 etc.) by an additional amount
863873
@ given in the bottom two bits of R0 and update the rotation recorded at statevperm.
864874
@ On entry R1 must point to statevperm.
865875
@ Trashes r0-r3,r12
@@ -901,46 +911,7 @@ addstatevperm_exit: @ label exit point to be to able to specify to ana
901911
bx r14
902912
.endif
903913

904-
@ Switch from non-shared to shared state
905-
@ Trashes r0-r3,r12
906-
.balign 4
907-
ns_to_s:
908-
GET_CANARY r12,CTAG11,6
909-
push {r12,r14}
910-
.if ST_SHAREC
911-
bl gen_rand_sha_nonpres @ Create state share C; all bytes the same
912-
ands r0,r0,#255
913-
orrs r0,r0,r0,lsl#8
914-
orrs r12,r0,r0,lsl#16
915-
ldr r1,=shareC
916-
str r12,[r1]
917-
.else
918-
movs r12,#0
919-
.endif
920-
bl gen_rand_sha_nonpres
921-
eors r4,r4,r0
922-
eor r8,r12,r0,ror#16
923-
bl gen_rand_sha_nonpres
924-
eors r5,r5,r0
925-
eor r9,r12,r0,ror#16
926-
bl gen_rand_sha_nonpres
927-
eors r6,r6,r0
928-
eor r10,r12,r0,ror#16
929-
bl gen_rand_sha_nonpres
930-
eors r7,r7,r0
931-
eor r11,r12,r0,ror#16
932-
.if ST_VPERM
933-
bl gen_rand_sha_nonpres
934-
ldr r1,=statevperm
935-
movs r2,#0
936-
str r2,[r1]
937-
bl addstatevperm @ Initialise state vperm with SHA RNG, refresh with LFSR RNG
938-
.endif
939-
pop {r12,r14}
940-
CHK_CANARY r12,CTAG11,6
941-
bx r14
942-
943-
@ Conjugate lut_a, lut_b with shareC
914+
@ Conjugate lut_a, lut_b with (state) shareC
944915
@ I.e., EOR the input and output with shareC.
945916
@ We need to pick one input for each share A and B, and one output for ONE of the shares A and B
946917
@ Arbitrarily choosing a0, b1 and d0
@@ -1653,44 +1624,65 @@ addrkey_s:
16531624
.endif
16541625

16551626
ctr_crypt_s:
1656-
@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks
1627+
@ r0=IV_shareA, r1=IV_shareB, r2=cipher/plaintext buffer, r3=number of blocks
16571628
GET_CANARY r12,CTAG0,6
16581629
push {r0-r12,r14} @ save all registers so that when we restore we overwrite any secrets
16591630

1660-
push {r0-r2}
1631+
push {r0-r3}
1632+
16611633
SET_COUNT 93,6
16621634

16631635
.if CT_BPERM
16641636
@ Initialise 32 random numbers (which fit in half-words)
1637+
@ r3=number of blocks
16651638
ldr r4,=bperm_rand
16661639
movs r5,#32
16671640
1:
16681641
bl gen_rand_sha
1669-
umull r0,r3,r0,r2 @ Random number between 0 and n-1 (n=#blocks)
1670-
strh r3,[r4],#2
1642+
umull r0,r2,r0,r3 @ Random number between 0 and n-1 (n=#blocks)
1643+
strh r2,[r4],#2
16711644
subs r5,r5,#1
16721645
bne 1b
16731646
.endif
16741647

16751648
bl randomisechaff
1676-
pop {r0-r2}
1649+
1650+
@ Refresh IVshareA and IVshareB, convert to ror#16 format and store the result at IV0
1651+
@ Not doing shareC or state vperm at this point
1652+
pop {r0}
1653+
ldmia r0,{r4-r7} @ r4-r7 = IVshareA
1654+
clear03 16
1655+
pop {r1}
1656+
ldmia r1,{r8-r11} @ r8-r11 = IVshareB
1657+
clear03 32
1658+
bl gen_rand_sha_nonpres; eors r4,r4,r0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16
1659+
bl gen_rand_sha_nonpres; eors r5,r5,r0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16
1660+
bl gen_rand_sha_nonpres; eors r6,r6,r0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16
1661+
bl gen_rand_sha_nonpres; eors r7,r7,r0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16
1662+
ldr r0,=IV0
1663+
stmia r0,{r4-r7}
1664+
adds r0,r0,#20
1665+
stmia r0,{r8-r11}
1666+
pop {r1,r2}
1667+
@ r1=cipher/plaintext buffer, r2=number of blocks
1668+
16771669
movs r3,#0
16781670
CHK_COUNT 93,6
16791671

16801672
ctr_crypt_mainloop:
16811673
SET_COUNT 80,6
1682-
@ here r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
1674+
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
16831675

16841676
@ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it)
1685-
push {r0-r3}
1677+
push {r1-r3}
16861678
@ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret)
16871679

16881680
tst r3,#(REFCHAFF_PERIOD-1)
16891681
bne 1f
16901682
bl refreshchaff_and_lfsr
16911683
1:
16921684

1693-
ldr r3,[r13,#12] @ get block count off the stack
1685+
ldr r3,[r13,#8] @ get block count off the stack
16941686
tst r3,#(REMAP_PERIOD-1)
16951687
bne 1f
16961688
bl remap @ shuffle the LUTs; this preserves R3
@@ -1702,21 +1694,21 @@ ctr_crypt_mainloop:
17021694
bl ref_roundkey_shares_s @ refresh the round key shares
17031695
1:
17041696

1705-
ldr r3,[r13,#12] @ get block count off the stack
1697+
ldr r3,[r13,#8] @ get block count off the stack
17061698
tst r3,#(REFROUNDKEYHVPERMS_PERIOD-1)
17071699
bne 1f
17081700
bl ref_roundkey_hvperms_s @ refresh the round key vperms
17091701
1:
17101702

17111703
CHK_COUNT 81,6
17121704

1713-
pop {r0-r3}
1714-
@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
1705+
pop {r1-r3}
1706+
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
17151707

17161708
@ Now calculate r12 = block number-to-be-deciphered from r3 = block counter
17171709
.if CT_BPERM
17181710
@ Use a "swap-or-not" method to generate an "oblivious" permutation; see makeperm.py version 7
1719-
push {r0,r1}
1711+
push {r1}
17201712
ldr r0,=murmur3_constants
17211713
ldmia r0,{r9-r12,r14} @ load five murmur3_32 hash constants
17221714
ldr r0,=bperm_rand
@@ -1752,57 +1744,53 @@ ctr_crypt_mainloop:
17521744
adds r4,r4,r7 @ r4=j if top bit of r6, else i
17531745
subs r1,r1,#1
17541746
bpl 1b
1755-
pop {r0,r1}
1747+
pop {r1}
17561748
mov r12,r4
17571749
.else
17581750
mov r12,r3
17591751
.endif
17601752
CHK_COUNT 82,6
17611753

1762-
@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered)
1763-
push {r0-r3,r12}
1754+
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter (monotonic), r12=block number (block to be deciphered)
1755+
push {r1-r3,r12}
1756+
@ r4-r11 = IV0, r12=block number
17641757

17651758
processIV: @ non-target label to assist power analysis
1766-
1767-
@ It is not clear if the following addition of the block number in r12 to the IV can usefully
1768-
@ be done in terms of shares. Instead we do an addition and subtraction whose overall effect
1769-
@ is the same, and which provides a small degree of masking. The IV is not traditionally a secret,
1770-
@ though it will make it harder for the attacker if it is obscured.
1771-
bl gen_rand_sha
1772-
movs r8,r0,lsr#16 @ only use 16 low bits so we don't get any overflows in the following, and so that a carry from the first word is rare
1773-
add r9,r8,r12 @ "masked" block number
1774-
@ r8=random, r9=(block number)+r8, stack=IV,...
1775-
1776-
ldr r0,[r13] @ peek at stack to restore r0=IV ptr
1777-
ldmia r0,{r4-r7} @ load IV
1778-
clear03 @ barrier to remove traces of IV from internal CPU load registers
1779-
1780-
@ Add in r9 in byte-big-endian, bit-little-endian (!) fashion, while trying to avoid rev operations
1781-
@ as far as possible as these tend to expose (via power fluctuations) byte-level hamming weights.
1782-
@ First do 128-bit addition of r9 to byte-reversed IV
1783-
rev r7,r7
1784-
cmn r7,#MAX_NUM_BLOCKS @ Compare against maximum number of blocks
1785-
bcs 1f
1786-
add r7,r7,r9 @ This can temporarily overflow but it doesn't matter as we know that r7+r12 does not overflow
1787-
sub r7,r7,r8
1788-
b 2f
1789-
1:
1790-
adds r7,r7,r9
1791-
rev r6,r6; adcs r6,r6,#0
1792-
rev r5,r5; adcs r5,r5,#0
1793-
rev r4,r4; adcs r4,r4,#0
1794-
@ Now do 128-bit subtraction of r8 from byte-reversed IV
1795-
subs r7,r7,r8
1796-
sbcs r6,r6,#0; rev r6,r6
1797-
sbcs r5,r5,#0; rev r5,r5
1798-
sbcs r4,r4,#0; rev r4,r4
1799-
2:
1800-
rev r7,r7
1801-
clear01 16
1759+
ldr r8,=IV0
1760+
ldmia r8,{r4-r7} @ load IV0_A
1761+
clear03 16
1762+
add r8,r8,#20
1763+
ldmia r8,{r8-r11} @ load IV0_B
1764+
clear03 32
1765+
rev r0,r12
1766+
eor r7,r7,r0 @ XOR in block number to IV0. IV(block n) = IV0 ^ n, cf standard CTR mode IV0 + n.
1767+
@ XOR (vs addition) is compatible with XOR-shares, so it is stealthier and simpler: we don't have to unshare to work out IV(block n)
1768+
@ r4-r11 = IV for the current block
18021769
CHK_COUNT 83,6
1770+
.if ST_SHAREC
1771+
bl gen_rand_sha_nonpres @ Create state share C; all bytes the same
1772+
ands r0,r0,#255
1773+
orrs r0,r0,r0,lsl#8
1774+
orrs r12,r0,r0,lsl#16
1775+
ldr r1,=shareC
1776+
str r12,[r1]
1777+
.else
1778+
movs r12,#0
1779+
.endif
1780+
@ r4-r11 = IV for the current block w/o shareC, r12=shareC
1781+
@ refresh state shares and mix in shareC
1782+
bl gen_rand_sha_nonpres; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc
1783+
bl gen_rand_sha_nonpres; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16
1784+
bl gen_rand_sha_nonpres; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16
1785+
bl gen_rand_sha_nonpres; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16
1786+
.if ST_VPERM
1787+
bl gen_rand_sha_nonpres
1788+
ldr r1,=statevperm
1789+
movs r2,#0
1790+
str r2,[r1]
1791+
bl addstatevperm @ Initialise state vperm (use SHA RNG to start with, later refreshes are with LFSR RNG)
1792+
.endif
18031793

1804-
@ r4-r7 = IV for the current block
1805-
bl ns_to_s @ convert IV+x to shares, which includes choosing and incorporating a random shareC
18061794
CHK_COUNT 84,6
18071795
bl conjshareC @ Add the effect of shareC to lut_a, lut_b
18081796
CHK_COUNT 85,6
@@ -1849,9 +1837,9 @@ rounds_s_mainloop:
18491837
bl addstatevperm
18501838
.endif
18511839

1852-
pop {r0-r3,r12}
1853-
push {r0,r3}
1854-
@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered
1840+
pop {r1-r3,r12}
1841+
push {r3}
1842+
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter, r12=block to be deciphered
18551843

18561844
decryption_start:
18571845
@ Decrypt ciphertext using AES output in shares: r4-r11
@@ -1893,8 +1881,8 @@ decryption_start:
18931881
sub r1,r1,r12,lsl#4 @ Restore r1 to point to start of buffer
18941882
CHK_COUNT 90,6
18951883

1896-
pop {r0,r3} @ Restore IV and block counter
1897-
@ r0=IV, r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
1884+
pop {r3} @ Restore block counter
1885+
@ r1=cipher/plaintext buffer, r2=number of blocks, r3=block counter
18981886
decryption_end:
18991887

19001888
adds r3,r3,#1

bootloaders/encrypted/enc_bootloader.c

+9-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
#define OTP_KEY_PAGE 30
2020

21-
extern void decrypt(uint8_t* key4way, uint8_t* iv, uint8_t(*buf)[16], int nblk);
21+
extern void decrypt(uint8_t* key4way, uint8_t* IV_OTPsalt, uint8_t* IV_public, uint8_t(*buf)[16], int nblk);
2222

2323
// The function lock_key() is called from decrypt() after key initialisation is complete and before decryption begins.
2424
// That is a suitable point to lock the OTP area where key information is stored.
@@ -151,7 +151,14 @@ int main() {
151151
// Read key directly from OTP - guarded reads will throw a bus fault if there are any errors
152152
uint16_t* otp_data = (uint16_t*)OTP_DATA_GUARDED_BASE;
153153

154-
decrypt((uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & (OTP_KEY_PAGE * 0x40))]), iv, (void*)SRAM_BASE, data_size/16);
154+
decrypt(
155+
(uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & (OTP_KEY_PAGE * 0x40))]),
156+
(uint8_t*)&(otp_data[(OTP_CMD_ROW_BITS & ((OTP_KEY_PAGE + 1) * 0x40))]),
157+
iv, (void*)SRAM_BASE, data_size/16
158+
);
159+
160+
// Lock the IV salt
161+
otp_hw->sw_lock[OTP_KEY_PAGE + 1] = 0xf;
155162

156163
printf("Post decryption image begins with\n");
157164
for (int i=0; i < 4; i++)

0 commit comments

Comments
 (0)