Skip to content

Commit c228650

Browse files
committed
added mulhu CRT routines
1 parent 834875c commit c228650

File tree

9 files changed

+894
-0
lines changed

9 files changed

+894
-0
lines changed

src/crt/i48mulhu.src

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
assume adl=1
2+
3+
section .text
4+
5+
public __i48mulhu
6+
7+
; UDE:UHL = ((uint96_t)UDE:UHL * (uint96_t)UIY:UBC) >> 48
;
; In:   UDE:UHL = x (x_hi:x_lo), UIY:UBC = y (y_hi:y_lo), unsigned 48-bit
; Out:  UDE:UHL = upper 48 bits of the 96-bit product x * y
; UBC, UIX and UIY are pushed on entry and popped before returning.
;
; The product is built from four 24x24 partial products, each obtained by
; calling __i48mulu with the upper halves of both operands set to zero:
;
;   mid    = x_hi*y_lo + x_lo*y_hi + ((x_lo*y_lo) >> 24)     (up to 49 bits)
;   result = x_hi*y_hi + (mid >> 24)
;
; mid does not fit in 48 bits, so every carry out of bit 47 is counted in a
; stack slot and added to the upper half of the result at the very end.
; This routine relies on __i48mulu leaving UBC, UIX and UIY unchanged.
;
; NOTE: the byte/cycle figures of the previous revision no longer apply and
; need to be re-measured.
;
; Frame layout (ix = SP after the three entry pushes):
;   ix+6 : caller's IX
;   ix+3 : y_hi (caller's IY)
;   ix+0 : y_lo (caller's BC)
;   ix-3 : x_hi
;   ix-6 : x_lo
;   ix-9 : number of carries out of bit 47 of mid (starts at 0)
__i48mulhu:
	push	ix
	push	iy
	push	bc
	ld	ix, 0
	lea	iy, ix			; iy = 0: zero upper half for every __i48mulu call
	add	ix, sp
	push	de			; x_hi
	push	hl			; x_lo
	push	iy			; carry counter = 0

	; x_lo * y_lo
	lea	de, iy			; UDE:UHL = 0:x_lo, UIY:UBC = 0:y_lo
	call	__i48mulu
	push	de			; (x_lo * y_lo) >> 24 (low carry)

	; x_hi * y_lo
	lea	de, iy
	ld	hl, (ix - 3)
	call	__i48mulu
	push	de			; hi24
	push	hl			; lo24

	; x_lo * y_hi
	lea	de, iy
	ld	bc, (ix + 3)
	ld	hl, (ix - 6)
	call	__i48mulu
	pop	bc			; lo24 of x_hi * y_lo
	add	hl, bc
	ex	de, hl
	pop	bc			; hi24 of x_hi * y_lo
	adc	hl, bc			; CF = bit 48 of the two cross products
	jr	nc, .no_mid_carry
	inc	(ix - 9)		; remember it; the next add would clobber CF
.no_mid_carry:

	pop	bc			; low carry
	ex	de, hl			; hl = lo24 of mid, de = hi24 of mid
	add	hl, bc			; only the carry out of this sum is needed
	ex	de, hl			; hl = hi24 of mid (ex keeps CF)
	lea	bc, iy			; bc = 0 (lea keeps CF)
	adc	hl, bc			; propagate the carry with overflow detection
	jr	nc, .no_low_wrap
	inc	(ix - 9)
.no_low_wrap:
	push	hl			; bits 24..47 of mid (high carry)

	; x_hi * y_hi
	lea	de, iy
	ld	bc, (ix + 3)
	ld	hl, (ix - 3)
	call	__i48mulu
	pop	bc			; high carry
	add	hl, bc
	jr	nc, .no_high_carry
	inc	de
.no_high_carry:
	ld	bc, (ix - 9)		; bit 48 of mid weighs 2^24 in the result
	ex	de, hl
	add	hl, bc			; cannot overflow: the true result fits in 48 bits
	ex	de, hl
	ld	sp, ix
	pop	bc
	pop	iy
	pop	ix
	ret
69+
70+
extern __i48mulu

src/crt/imulhu.src

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
assume adl=1
2+
3+
section .text
4+
5+
public __imulhu
6+
7+
; UHL = ((uint48_t)UHL * (uint48_t)UBC) >> 24
;
; In:   UBC = a, UHL = b (unsigned 24-bit factors)
; Out:  UHL = bits 24..47 of a * b
; AF, UBC, UDE, UIX and UIY are pushed on entry and popped before returning.
__imulhu:
; TODO: Optimize this routine; it is still the byte-by-byte schoolbook
; multiply taken from __i48mulu with the unused terms removed.
;
; NOTE: one redundant push was dropped relative to the previous revision, so
; its byte/cycle figures need to be re-measured.
	push	de
	; backup af
	push	af
	push	ix
	ld	ix, 0
	add	ix, sp			; 0 + SP never carries, so CF = 0 from here on

	; Operands go on the stack so single bytes (incl. upper bytes) can be read
	push	hl			; b
	push	iy
	push	bc			; a

	ld	a, l			; a = b[0]
	ld	iy, (ix - 2)		; iyl = b[1], iyh = b[2] (iyu is never used)

	; or a, a			; carry is already cleared
	sbc	hl, hl			; hl = 0
	push	hl			; upper bytes of sum at -12
	; Stack Use:
	; ix-1  : hlu	b[2]
	; ix-2  : h	b[1]
	; ix-3  : l	b[0]
	; ix-4  : iyu	(caller's IY)
	; ix-5  : iyh	(caller's IY)
	; ix-6  : iyl	(caller's IY)
	; ix-7  : bcu	a[2]
	; ix-8  : b	a[1]
	; ix-9  : c	a[0]
	; ix-10 : sum[5]
	; ix-11 : sum[4]
	; ix-12 : sum[3]
	; ix-13 : sum[2]
	; ix-14 : sum[1]
	; ix-15 : sum[0]

	; ======================================================================
	; sum[0-1]

	; a[0]*b[0]
	ld	d, c			; d = a[0]
	ld	e, a			; e = b[0]
	mlt	de
	push	de			; lower bytes of sum at -15

	; ======================================================================
	; sum[1-2]
	ld	l, d			; hl will store current partial sum (h, hlu are 0)

	; a[1]*b[0]
	ld	d, b			; d = a[1]
	ld	e, a			; e = b[0]
	mlt	de
	add	hl, de

	; a[0]*b[1]
	ld	d, c			; d = a[0]
	ld	e, iyl			; e = b[1]
	mlt	de
	add	hl, de

	ld	(ix - 14), hl

	; ======================================================================
	; sum[2-3]
	ld	hl, (ix - 13)		; hl will store current partial sum

	; a[0]*b[2]
	ld	d, c			; d = a[0]
	ld	e, iyh			; e = b[2]
	mlt	de
	add	hl, de

	; a[1]*b[1]
	ld	d, b			; d = a[1]
	ld	e, iyl			; e = b[1]
	mlt	de
	add	hl, de

	; a[2]*b[0]
	ld	d, (ix - 7)		; d = a[2]
	ld	e, a			; e = b[0]
	mlt	de
	add	hl, de

	ld	(ix - 13), hl

	; ======================================================================
	; sum[3-4]
	ld	hl, (ix - 12)		; hl will store current partial sum

	; a[1]*b[2]
	ld	d, b			; d = a[1]
	ld	e, iyh			; e = b[2]
	mlt	de
	add	hl, de

	; a[2]*b[1]
	ld	d, (ix - 7)		; d = a[2]
	ld	e, iyl			; e = b[1]
	mlt	de
	add	hl, de

	ld	(ix - 12), hl

	; ======================================================================
	; sum[4-5]
	ld	hl, (ix - 11)		; hlu picks up a[0] from ix-9; it is not stored back

	; a[2]*b[2]
	ld	d, (ix - 7)		; d = a[2]
	ld	e, iyh			; e = b[2]
	mlt	de
	add	hl, de

	ld	(ix - 11), l		; write only the two bytes that belong to sum
	ld	(ix - 10), h

	; clean up stack and restore registers
	pop	de			; sum[0-2], discarded
	pop	hl			; sum[3-5] = result
	pop	bc
	pop	iy

	ld	sp, ix			; drop the saved copy of b
	pop	ix
	pop	af
	pop	de
	ret

src/crt/llmulhu.src

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
assume adl=1
2+
3+
section .text
4+
5+
public __llmulhu
6+
7+
; BC:UDE:UHL = ((uint128_t)BC:UDE:UHL * (uint128_t)(SP64)) >> 64
;
; In:   BC:UDE:UHL = x, 64-bit y on the stack directly above the return address
; Out:  BC:UDE:UHL = upper 64 bits of the 128-bit product x * y
; UIX and UIY are pushed on entry and popped before returning.
;
; The product is built from four 32x32 partial products computed by __llmulu,
; whose stack operand is rebuilt in the scratch area at the bottom of the
; frame:
;
;   mid    = x_hi*y_lo + x_lo*y_hi + ((x_lo*y_lo) >> 32)     (up to 65 bits)
;   result = x_hi*y_hi + (mid >> 32)
;
; mid needs 65 bits. __llmulhu_add leaves a carry out of bit 63 in BCU, so BCU
; and the ninth byte of each addend are cleared before the mid additions and
; BCU is then stored as byte 4 of (mid >> 32).
;
; Frame layout (ix = SP after the two entry pushes):
;   ix+9  .. ix+16 : y
;   ix-9  .. ix-2  : x
;   ix-10          : 0, ninth byte read with the addend at ix-18
;   ix-18 .. ix-11 : 64-bit carry from the previous column
;   ix-19          : 0 during the mid additions, ninth byte of the addend at ix-27
;   ix-27 .. ix-20 : x_hi * y_lo
;   ix-36 .. ix-29 : stack operand for __llmulu (y_lo, later y_hi)
__llmulhu:
	push	ix
	push	iy
	ld	ix, -36
	add	ix, sp
	ld	sp, ix
	lea	ix, ix + 36

	ld	(ix - 3), bc
	ld	(ix - 6), de
	ld	(ix - 9), hl

	ld	bc, 0
	ld	(ix - 10), b		; keeps BCU clean when the addend at ix-18 is added
	ld	(ix - 13), bc
	ld	(ix - 30), bc
	ld	c, (ix + 12)
	ld	(ix - 33), bc
	ld	iy, (ix + 9)
	ld	(ix - 36), iy		; stack operand = (uint64_t)y_lo

	; x_lo * y_lo
	ld	c, b			; bc = 0
	ld	d, b
	inc	de
	dec.s	de			; UDE = 0:0:E, so BC:UDE:UHL = (uint64_t)x_lo
	call	__llmulu
	ld	(ix - 16), bc
	ld	(ix - 19), de
	ld	bc, 0
	ld	(ix - 14), b		; ix-18.. = (uint64_t)((x_lo * y_lo) >> 32)

	; x_hi * y_lo
	inc.s	de
	ld	d, b
	ld	e, (ix - 2)
	ld	hl, (ix - 5)
	call	__llmulu
	ld	(ix - 21), bc
	ld	(ix - 24), de
	ld	(ix - 27), hl

	ld	c, (ix + 16)
	ld	(ix - 33), c
	ld	iy, (ix + 13)
	ld	(ix - 36), iy		; stack operand = (uint64_t)y_hi

	; x_lo * y_hi
	ld	bc, 0
	ld	(ix - 19), b		; drop the stray BCU byte stored with x_hi * y_lo
	inc.s	de
	ld	d, b
	ld	e, (ix - 6)
	ld	hl, (ix - 9)
	call	__llmulu
	inc	bc
	dec.s	bc			; BCU = 0 (same idiom as used for DE above)
	lea	iy, ix - 27
	call	__llmulhu_add		; + x_hi * y_lo
	lea	iy, ix - 18
	call	__llmulhu_add		; + low carry; BCU = bit 64 of mid
	ld	(ix - 16), bc		; also stores bit 64 of mid at ix-14
	ld	(ix - 19), de		; ix-18.. = (uint64_t)(mid >> 32), 33 bits
	ld	bc, 0

	; x_hi * y_hi
	inc.s	de
	ld	d, b
	ld	e, (ix - 2)
	ld	hl, (ix - 5)
	call	__llmulu
	lea	iy, ix - 18
	call	__llmulhu_add		; + high carry
	ld	sp, ix
	pop	iy
	pop	ix
	ret
82+
83+
__llmulhu_add:
; BC:UDE:UHL += 64-bit value at (iy)
;
; Works like __lladd, but the addend is addressed through IY (which is
; destroyed) instead of being taken from above the return address.
; The top word is loaded as three bytes, so the byte at (iy + 8) is added
; into BCU together with any carry out of bit 63.
	push	bc			; keep the top word while BC is used as scratch
	ld	bc, (iy)
	add	hl, bc			; bits 0..23
	ex	de, hl
	ld	bc, (iy + 3)
	adc	hl, bc			; bits 24..47
	ex	de, hl
	pop	bc			; ex and pop leave CF untouched
	jr	nc, .low48_done
	inc	bc			; carry out of bit 47
.low48_done:
	ld	iy, (iy + 6)
	add	iy, bc			; bits 48..63 (and whatever spills into BCU)
	lea	bc, iy
	ret
100+
101+
extern __llmulu

src/crt/lmulhu.src

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
assume adl=1
2+
3+
section .text
4+
5+
public __lmulhu
6+
7+
; E:UHL = ((uint64_t)E:UHL * (uint64_t)A:UBC) >> 32
;
; In:   E:UHL = x, A:UBC = y (unsigned 32-bit factors)
; Out:  E:UHL = upper 32 bits of the 64-bit product
; UBC, UIY and the upper two bytes of UDE are popped back before returning.
;
; Both factors are zero-extended to 64 bits and multiplied by __llmulu; the
; result bytes 4..7 are then shuffled into E:UHL.
__lmulhu:
	push	iy
	push	de			; caller's DE, popped back before E is replaced
	inc	de
	dec.s	de			; DEU = 0, E unchanged
	ld	iy, 0
	push	iy			; zero padding above y
	ld	iyl, a
	push	iy			; y byte 3 = A, bytes 4..5 = 0
	push	bc			; y bytes 0..2 = UBC
	ld	iyl, iyh		; iy = 0 again
	lea	bc, iy			; x bits 48..63 = 0
	ld	d, b			; UDE = 0:0:E, so BC:UDE:UHL = (uint64_t)x
	call	__llmulu
	; Product bytes 4..7 are D, UDE, C, B and have to end up as:
	;   L   = D
	;   H   = UDE
	;   UHL = C
	;   E   = B
	add	iy, sp			; iy = SP (iy is expected to still be 0 here)
	push	de			; DEU lands at iy - 1
	ld	e, (iy - 1)		; e = UDE
	ld	(iy - 1), c		; top byte of the pushed triple becomes C
	pop	hl			; UHL = C
	ld	l, d			; L = D
	ld	h, e			; H = UDE
	ld	iyl, b			; park B until DE has been popped
	pop	bc
	pop	de			; discard the two padding triples
	pop	de
	pop	de			; caller's DE
	ld	e, iyl			; E = B
	pop	iy
	ret
41+
42+
extern __llmulu

src/crt/smulhu.src

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
assume adl=1
2+
3+
section .text
4+
5+
public __smulhu
6+
7+
; HL = ((uint32_t)HL * (uint32_t)BC) >> 16
;
; In:   HL, BC = unsigned 16-bit factors
; Out:  HL = upper 16 bits of the 32-bit product
; AF, BC and DE are pushed on entry and popped before returning.
__smulhu:
; CC: 32 bytes
; ADL = 1: 33F + 12R + 9W + 17
; ADL = 0: 33F + 8R + 6W + 17
	push	af
	push	de
	push	bc
	ld	e, c
	ld	d, l
	mlt	de			; L * C
	ld	a, d			; only its high byte can still reach the result
	ld	e, b
	ld	d, l
	mlt	de			; L * B
	ld	l, b
	ld	b, h
	mlt	hl			; H * B, the base of the result
	mlt	bc			; H * C
	add	a, c			; low byte of H * C
	ld	c, b
	ld	b, 0
	adc	hl, bc			; + high byte of H * C + carry
	add	a, e			; low byte of L * B
	ld	c, d
	adc	hl, bc			; + high byte of L * B + carry; result is [0, $FFFE]
	pop	bc
	pop	de
	pop	af
	ret

0 commit comments

Comments
 (0)