@@ -40,7 +40,7 @@ scratch RAM and the stack pointer is overwritten.
40
40
#define CTAG8 0x33
41
41
#define CTAG9 0x34
42
42
#define CTAG10 0x35 @ not used
43
- #define CTAG11 0x36
43
+ #define CTAG11 0x36 @ not used
44
44
#define CTAG12 0x37
45
45
#define CTAG13 0x38
46
46
#define CTAG14 0x39
@@ -93,6 +93,8 @@ scratch RAM and the stack pointer is overwritten.
93
93
.endif
94
94
.endm
95
95
96
+ @ Clear internal stripe load registers , and r0 - r3
97
+ @ 0 <= offset <= 32
96
98
.macro clear03 offset= 0
97
99
getchaffaddress r0 , \offset
98
100
ldmia r0 , {r0 - r3}
@@ -158,6 +160,10 @@ RKshareC: @ Round key common share C; see comment at init_key
158
160
.space 4
159
161
RKshareCchange: @ Temporary used by ref_roundkey_share_s
160
162
.space 4
163
+ IV0: @ 2 - way share of IV for block 0
164
+ .space 36 @ Considering IV0 as a word pointer , the format is IV = IV0 [ 0 , 1 , 2 , 3 ] ^ (IV0 [ 5 , 6 , 7 , 8 ], ror # 16 )
165
+ @ The gap at IV0 [ 4 ] is to defeat unsharing by internal striped memory registers
166
+ @ I.e. , there are implicit XORs IV0 [ 0 ] ^IV0 [ 4 ], IV0 [ 1 ] ^IV0 [ 5 ], ... , that the 1 word offset renders useless
161
167
162
168
@ Regardless of configuration , the code uses a single 256 - entry LUT ,
163
169
@ which is a simple S - box table.
@@ -323,11 +329,11 @@ gen_rand_sha:
323
329
ldr r2 , =rstate_sha
324
330
ldr r0 ,[ r2 , #jstate - rstate_sha ]
325
331
movs r1 , # 1
326
- movs r3 , r0 , lsl # 2
327
- ands r3 , r3 , # 31
328
- movs r3 , r1 , lsl r3 @ 1 <<( 4 * (r0& 7 ))
329
- udiv r3 , r3 , r1 @ Takes constant + (r0& 7 ) cycles
330
- lsrs r0 , r0 , # 1
332
+ ands r3 , r0 , # 3
333
+ movs r3 , r3 , lsl # 2
334
+ movs r3 , r1 , lsl r3 @ 1 <<( 4 * (r0& 3 ))
335
+ udiv r3 , r3 , r1 @ Takes constant + (r0& 3 ) cycles
336
+ lsrs r0 , r0 , # 2
331
337
bne 1f
332
338
bl gen_rand_sha_nonpres
333
339
ldr r2 , =rstate_sha
@@ -352,6 +358,7 @@ gen_rand_sha_nonpres:
352
358
strb r3 ,[ r2 ] @ save updated SUM register offset in bottom byte of rstate_sha []
353
359
bx r14
354
360
1 :
361
+ @ [ CK_JITTER code was here ]
355
362
movs r3 , #SHA256_SUM6_OFFSET + 1
356
363
strb r3 ,[ r2 ] @ reset word counter: the + 1 is compensated for later
357
364
movw r1 , #( 1 <<SHA256_CSR_BSWAP_LSB) + ( 1 <<SHA256_CSR_START_LSB)
@@ -437,10 +444,13 @@ gen_rand_lfsr_nonpres:
437
444
.balign 4
438
445
.thumb_func
439
446
decrypt:
447
+ @ r0= 4 - way key , r1=IV_shareA , r2=IV_shareB , r3=message buffer , [ r13 ] =number of blocks
448
+ ldr r12 ,[ r13 ] @ Load 5th argument from the stack into r12 (which we are allowed to treat as scratch according to AAPCS) ; r13 is not adjusted
440
449
push { r14 }
441
450
GET_CANARY r14 , CTAG3 , 6
442
451
SET_COUNT 23 , 6
443
- push {r0 - r12 , r14 }
452
+ push {r4 - r11 , r14 }
453
+ push {r0 - r3 , r12 } @ Save the five arguments
444
454
bl reset_sha_trng
445
455
bl init_rstate
446
456
@ randomly re - share the LUT contents
@@ -463,11 +473,11 @@ decrypt:
463
473
bl init_key_4way
464
474
CHK_COUNT 31 , 6
465
475
bl lock_key
466
- pop {r0 - r2}
476
+ pop {r0 - r3} @ r0=IV_shareA , r1=IV_shareB , r2=message , r3=num blocks
467
477
bl ctr_crypt_s
468
478
bl randomisechaff
469
479
clear03
470
- pop {r4 - r12 , r14 }
480
+ pop {r4 - r11 , r14 }
471
481
CHK_CANARY r14 , CTAG3 , 6
472
482
pop { r15 }
473
483
@@ -859,7 +869,7 @@ ref_roundkey_hvperms_s_exit: @ label exit point to be to able to specify to ana
859
869
.if ST_VPERM
860
870
.balign 4
861
871
.thumb_func
862
- @ Rotate share registers r4 - r7 , r8 - r11 (r4 - >r5 - r6 - >r7 - >r4 etc.) by an addtional amount
872
+ @ Cycle share registers r4 - r7 , r8 - r11 (r4 - >r5 - >r6 - >r7 - >r4 etc.) by an additional amount
863
873
@ given in the bottom two bits of R0 and update the rotation recorded at statevperm.
864
874
@ On entry R1 must point to statevperm.
865
875
@ Trashes r0 - r3 , r12
@@ -901,46 +911,7 @@ addstatevperm_exit: @ label exit point to be to able to specify to ana
901
911
bx r14
902
912
.endif
903
913
904
- @ Switch from non - shared to shared state
905
- @ Trashes r0 - r3 , r12
906
- .balign 4
907
- ns_to_s:
908
- GET_CANARY r12 , CTAG11 , 6
909
- push { r12 , r14 }
910
- .if ST_SHAREC
911
- bl gen_rand_sha_nonpres @ Create state share C ; all bytes the same
912
- ands r0 , r0 , # 255
913
- orrs r0 , r0 , r0 , lsl # 8
914
- orrs r12 , r0 , r0 , lsl # 16
915
- ldr r1 , =shareC
916
- str r12 ,[ r1 ]
917
- .else
918
- movs r12 , # 0
919
- .endif
920
- bl gen_rand_sha_nonpres
921
- eors r4 , r4 , r0
922
- eor r8 , r12 , r0 , ror # 16
923
- bl gen_rand_sha_nonpres
924
- eors r5 , r5 , r0
925
- eor r9 , r12 , r0 , ror # 16
926
- bl gen_rand_sha_nonpres
927
- eors r6 , r6 , r0
928
- eor r10 , r12 , r0 , ror # 16
929
- bl gen_rand_sha_nonpres
930
- eors r7 , r7 , r0
931
- eor r11 , r12 , r0 , ror # 16
932
- .if ST_VPERM
933
- bl gen_rand_sha_nonpres
934
- ldr r1 , =statevperm
935
- movs r2 , # 0
936
- str r2 ,[ r1 ]
937
- bl addstatevperm @ Initialise state vperm with SHA RNG , refresh with LFSR RNG
938
- .endif
939
- pop { r12 , r14 }
940
- CHK_CANARY r12 , CTAG11 , 6
941
- bx r14
942
-
943
- @ Conjugate lut_a , lut_b with shareC
914
+ @ Conjugate lut_a , lut_b with (state) shareC
944
915
@ I.e. , EOR the input and output with shareC.
945
916
@ We need to pick one input for each share A and B , and one output for ONE of the shares A and B
946
917
@ Arbitrarily choosing a0 , b1 and d0
@@ -1653,44 +1624,65 @@ addrkey_s:
1653
1624
.endif
1654
1625
1655
1626
ctr_crypt_s:
1656
- @ r0=IV , r1=cipher/plaintext buffer , r2 =number of blocks
1627
+ @ r0=IV_shareA , r1=IV_shareB , r2= cipher/plaintext buffer, r3 =number of blocks
1657
1628
GET_CANARY r12 , CTAG0 , 6
1658
1629
push {r0 - r12 , r14 } @ save all registers so that when we restore we overwrite any secrets
1659
1630
1660
- push {r0 - r2}
1631
+ push {r0 - r3}
1632
+
1661
1633
SET_COUNT 93 , 6
1662
1634
1663
1635
.if CT_BPERM
1664
1636
@ Initialise 32 random numbers (which fit in half - words)
1637
+ @ r3=number of blocks
1665
1638
ldr r4 , =bperm_rand
1666
1639
movs r5 , # 32
1667
1640
1 :
1668
1641
bl gen_rand_sha
1669
- umull r0 , r3 , r0 , r2 @ Random number between 0 and n - 1 (n=#blocks)
1670
- strh r3 ,[ r4 ], # 2
1642
+ umull r0 , r2 , r0 , r3 @ Random number between 0 and n - 1 (n=#blocks)
1643
+ strh r2 ,[ r4 ], # 2
1671
1644
subs r5 , r5 , # 1
1672
1645
bne 1b
1673
1646
.endif
1674
1647
1675
1648
bl randomisechaff
1676
- pop {r0 - r2}
1649
+
1650
+ @ Refresh IVshareA and IVshareB , convert to ror # 16 format and store the result at IV0
1651
+ @ Not doing shareC or state vperm at this point
1652
+ pop {r0}
1653
+ ldmia r0 , {r4 - r7} @ r4 - r7 = IVshareA
1654
+ clear03 16
1655
+ pop {r1}
1656
+ ldmia r1 , { r8 - r11 } @ r8 - r11 = IVshareB
1657
+ clear03 32
1658
+ bl gen_rand_sha_nonpres ; eors r4,r4,r0; mov r8, r8, ror#16; eor r8, r8, r0,ror#16
1659
+ bl gen_rand_sha_nonpres ; eors r5,r5,r0; mov r9, r9, ror#16; eor r9, r9, r0,ror#16
1660
+ bl gen_rand_sha_nonpres ; eors r6,r6,r0; mov r10,r10,ror#16; eor r10,r10,r0,ror#16
1661
+ bl gen_rand_sha_nonpres ; eors r7,r7,r0; mov r11,r11,ror#16; eor r11,r11,r0,ror#16
1662
+ ldr r0 , =IV0
1663
+ stmia r0 , {r4 - r7}
1664
+ adds r0 , r0 , # 20
1665
+ stmia r0 , { r8 - r11 }
1666
+ pop {r1 , r2}
1667
+ @ r1=cipher/plaintext buffer , r2=number of blocks
1668
+
1677
1669
movs r3 , # 0
1678
1670
CHK_COUNT 93 , 6
1679
1671
1680
1672
ctr_crypt_mainloop:
1681
1673
SET_COUNT 80 , 6
1682
- @ here r0=IV , r1=cipher/plaintext buffer, r2=number of blocks , r3=block counter
1674
+ @ r1=cipher/plaintext buffer , r2=number of blocks , r3=block counter
1683
1675
1684
1676
@ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it)
1685
- push {r0 - r3}
1677
+ push {r1 - r3}
1686
1678
@ It's OK for execution time to depend on the block counter r3 ("public") , but not the block number (secret)
1687
1679
1688
1680
tst r3 , #(REFCHAFF_PERIOD - 1 )
1689
1681
bne 1f
1690
1682
bl refreshchaff_and_lfsr
1691
1683
1 :
1692
1684
1693
- ldr r3 ,[ r13 , # 12 ] @ get block count off the stack
1685
+ ldr r3 ,[ r13 , # 8 ] @ get block counter off the stack
1694
1686
tst r3 , #(REMAP_PERIOD - 1 )
1695
1687
bne 1f
1696
1688
bl remap @ shuffle the LUTs ; this preserves R3
@@ -1702,21 +1694,21 @@ ctr_crypt_mainloop:
1702
1694
bl ref_roundkey_shares_s @ refresh the round key shares
1703
1695
1 :
1704
1696
1705
- ldr r3 ,[ r13 , # 12 ] @ get block count off the stack
1697
+ ldr r3 ,[ r13 , # 8 ] @ get block counter off the stack
1706
1698
tst r3 , #(REFROUNDKEYHVPERMS_PERIOD - 1 )
1707
1699
bne 1f
1708
1700
bl ref_roundkey_hvperms_s @ refresh the round key vperms
1709
1701
1 :
1710
1702
1711
1703
CHK_COUNT 81 , 6
1712
1704
1713
- pop {r0 - r3}
1714
- @ r0=IV , r1=cipher/plaintext buffer, r2=number of blocks , r3=block counter
1705
+ pop {r1 - r3}
1706
+ @ r1=cipher/plaintext buffer , r2=number of blocks , r3=block counter
1715
1707
1716
1708
@ Now calculate r12 = block number - to - be - deciphered from r3 = block counter
1717
1709
.if CT_BPERM
1718
1710
@ Use a "swap-or-not" method to generate an "oblivious" permutation ; see makeperm.py version 7
1719
- push {r0 , r1}
1711
+ push {r1}
1720
1712
ldr r0 , =murmur3_constants
1721
1713
ldmia r0 , { r9 - r12 , r14 } @ load five murmur3_32 hash constants
1722
1714
ldr r0 , =bperm_rand
@@ -1752,57 +1744,53 @@ ctr_crypt_mainloop:
1752
1744
adds r4 , r4 , r7 @ r4=j if top bit of r6 , else i
1753
1745
subs r1 , r1 , # 1
1754
1746
bpl 1b
1755
- pop {r0 , r1}
1747
+ pop {r1}
1756
1748
mov r12 , r4
1757
1749
.else
1758
1750
mov r12 , r3
1759
1751
.endif
1760
1752
CHK_COUNT 82 , 6
1761
1753
1762
- @ r0=IV , r1=cipher/plaintext buffer , r2=number of blocks , r3=block counter (monotonic) , r12 =block number (block to be deciphered)
1763
- push {r0 - r3 , r12 }
1754
+ @ r1=cipher/plaintext buffer , r2=number of blocks , r3=block counter (monotonic) , r12 =block number (block to be deciphered)
1755
+ push {r1 - r3 , r12 }
1756
+ @ r4 - r11 = IV0 , r12 =block number
1764
1757
1765
1758
processIV: @ non - target label to assist power analysis
1766
-
1767
- @ It is not clear if the following addition of the block number in r12 to the IV can usefully
1768
- @ be done in terms of shares. Instead we do an addition and subtraction whose overall effect
1769
- @ is the same , and which provides a small degree of masking. The IV is not traditionally a secret ,
1770
- @ though it will make it harder for the attacker if it is obscured.
1771
- bl gen_rand_sha
1772
- movs r8 , r0 , lsr# 16 @ only use 16 low bits so we don't get any overflows in the following , and so th at a carry from the first word is rare
1773
- add r9 , r8 , r12 @ "masked" block number
1774
- @ r8 =random , r9 =(block number) + r8 , stack=IV , ...
1775
-
1776
- ldr r0 ,[ r13 ] @ peek at stack to restore r0=IV ptr
1777
- ldmia r0 , {r4 - r7} @ load IV
1778
- clear03 @ barrier to remove traces of IV from internal CPU load registers
1779
-
1780
- @ Add in r9 in byte - big - endian , bit - little - endian (!) fashion , while trying to avoid rev operations
1781
- @ as far as possible as these tend to expose (via power fluctuations) byte - level hamming weights.
1782
- @ First do 128 - bit addition of r9 to byte - reversed IV
1783
- rev r7 , r7
1784
- cmn r7 , #MAX_NUM_BLOCKS @ Compare against maximum number of blocks
1785
- bcs 1f
1786
- add r7 , r7 , r9 @ This can temporarily overflow but it doesn't matter as we know th at r7 + r12 does not overflow
1787
- sub r7 , r7 , r8
1788
- b 2f
1789
- 1 :
1790
- adds r7 , r7 , r9
1791
- rev r6 , r6 ; adcs r6,r6,#0
1792
- rev r5 , r5 ; adcs r5,r5,#0
1793
- rev r4 , r4 ; adcs r4,r4,#0
1794
- @ Now do 128 - bit subtraction of r8 from byte - reversed IV
1795
- subs r7 , r7 , r8
1796
- sbcs r6 , r6 , # 0 ; rev r6,r6
1797
- sbcs r5 , r5 , # 0 ; rev r5,r5
1798
- sbcs r4 , r4 , # 0 ; rev r4,r4
1799
- 2 :
1800
- rev r7 , r7
1801
- clear01 16
1759
+ ldr r8 , =IV0
1760
+ ldmia r8 , {r4 - r7} @ load IV0_A
1761
+ clear03 16
1762
+ add r8 , r8 , # 20
1763
+ ldmia r8 , { r8 - r11 } @ load IV0_B
1764
+ clear03 32
1765
+ rev r0 , r12
1766
+ eor r7 , r7 , r0 @ XOR in block number to IV0. IV(block n) = IV0 ^ n , cf standard CTR mode IV0 + n.
1767
+ @ XOR (vs addition) is compatible with XOR - shares , so stealthier/simpler because don't have to unshare to work out IV(block n)
1768
+ @ r4 - r11 = IV for the current block
1802
1769
CHK_COUNT 83 , 6
1770
+ .if ST_SHAREC
1771
+ bl gen_rand_sha_nonpres @ Create state share C ; all bytes the same
1772
+ ands r0 , r0 , # 255
1773
+ orrs r0 , r0 , r0 , lsl # 8
1774
+ orrs r12 , r0 , r0 , lsl # 16
1775
+ ldr r1 , =shareC
1776
+ str r12 ,[ r1 ]
1777
+ .else
1778
+ movs r12 , # 0
1779
+ .endif
1780
+ @ r4 - r11 = IV for the current block w/o shareC , r12 =shareC
1781
+ @ refresh state shares and mix in shareC
1782
+ bl gen_rand_sha_nonpres ; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16 @ Barriers between shares to prevent implicit r4^r8 etc
1783
+ bl gen_rand_sha_nonpres ; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16
1784
+ bl gen_rand_sha_nonpres ; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16
1785
+ bl gen_rand_sha_nonpres ; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16
1786
+ .if ST_VPERM
1787
+ bl gen_rand_sha_nonpres
1788
+ ldr r1 , =statevperm
1789
+ movs r2 , # 0
1790
+ str r2 ,[ r1 ]
1791
+ bl addstatevperm @ Initialise state vperm (use SHA RNG to start with , later refreshes are with LFSR RNG)
1792
+ .endif
1803
1793
1804
- @ r4 - r7 = IV for the current block
1805
- bl ns_to_s @ convert IV + x to shares , which includes choosing and incorporating a random shareC
1806
1794
CHK_COUNT 84 , 6
1807
1795
bl conjshareC @ Add the effect of shareC to lut_a , lut_b
1808
1796
CHK_COUNT 85 , 6
@@ -1849,9 +1837,9 @@ rounds_s_mainloop:
1849
1837
bl addstatevperm
1850
1838
.endif
1851
1839
1852
- pop {r0 - r3 , r12 }
1853
- push {r0 , r3}
1854
- @ r0=IV , r1=cipher/plaintext buffer, r2=number of blocks , r3=block counter , r12 =block to be deciphered
1840
+ pop {r1 - r3 , r12 }
1841
+ push {r3}
1842
+ @ r1=cipher/plaintext buffer , r2=number of blocks , r3=block counter , r12 =block to be deciphered
1855
1843
1856
1844
decryption_start:
1857
1845
@ Decrypt ciphertext using AES output in shares: r4 - r11
@@ -1893,8 +1881,8 @@ decryption_start:
1893
1881
sub r1 , r1 , r12 , lsl # 4 @ Restore r1 to point to start of buffer
1894
1882
CHK_COUNT 90 , 6
1895
1883
1896
- pop {r0 , r3} @ Restore IV and block counter
1897
- @ r0=IV , r1=cipher/plaintext buffer, r2=number of blocks , r3=block counter
1884
+ pop {r3} @ Restore block counter
1885
+ @ r1=cipher/plaintext buffer , r2=number of blocks , r3=block counter
1898
1886
decryption_end:
1899
1887
1900
1888
adds r3 , r3 , # 1
0 commit comments