Skip to content

Commit 0bc7f41

Browse files
authored
[libclc] Move all remquo address spaces to CLC library (#140871)
Previously, the OpenCL address-space overloads of remquo all called into the single 'private' CLC remquo, which made remquo an outlier among the maths builtins that take a pointer argument. This commit moves the definitions of all address-space overloads into the CLC library, giving CLC implementers more control over each address space. The generated bytecode changes slightly, but only in that IR instructions are moved around.
1 parent 7a8090c commit 0bc7f41

File tree

7 files changed

+304
-291
lines changed

7 files changed

+304
-291
lines changed

libclc/clc/include/clc/math/clc_remquo.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,10 @@
1010
#define __CLC_MATH_CLC_REMQUO_H__
1111

1212
#define __CLC_FUNCTION __clc_remquo
13-
1413
#define __CLC_BODY <clc/math/remquo_decl.inc>
15-
#define __CLC_ADDRESS_SPACE private
14+
1615
#include <clc/math/gentype.inc>
1716

18-
#undef __CLC_ADDRESS_SPACE
1917
#undef __CLC_FUNCTION
2018

2119
#endif // __CLC_MATH_CLC_REMQUO_H__

libclc/clc/include/clc/math/remquo_decl.inc

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,14 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9-
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(
10-
__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_ADDRESS_SPACE __CLC_INTN *q);
9+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
10+
__CLC_GENTYPE y,
11+
private __CLC_INTN *q);
12+
13+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
14+
__CLC_GENTYPE y,
15+
global __CLC_INTN *q);
16+
17+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
18+
__CLC_GENTYPE y,
19+
local __CLC_INTN *q);

libclc/clc/lib/generic/math/clc_remquo.cl

Lines changed: 9 additions & 257 deletions
Original file line numberDiff line numberDiff line change
@@ -18,262 +18,14 @@
1818
#include <clc/math/math.h>
1919
#include <clc/shared/clc_max.h>
2020

21-
_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
22-
__private int *quo) {
23-
x = __clc_flush_denormal_if_not_supported(x);
24-
y = __clc_flush_denormal_if_not_supported(y);
25-
int ux = __clc_as_int(x);
26-
int ax = ux & EXSIGNBIT_SP32;
27-
float xa = __clc_as_float(ax);
28-
int sx = ux ^ ax;
29-
int ex = ax >> EXPSHIFTBITS_SP32;
21+
#define __CLC_ADDRESS_SPACE private
22+
#include <clc_remquo.inc>
23+
#undef __CLC_ADDRESS_SPACE
3024

31-
int uy = __clc_as_int(y);
32-
int ay = uy & EXSIGNBIT_SP32;
33-
float ya = __clc_as_float(ay);
34-
int sy = uy ^ ay;
35-
int ey = ay >> EXPSHIFTBITS_SP32;
25+
#define __CLC_ADDRESS_SPACE global
26+
#include <clc_remquo.inc>
27+
#undef __CLC_ADDRESS_SPACE
3628

37-
float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff));
38-
float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff));
39-
int c;
40-
int k = ex - ey;
41-
42-
uint q = 0;
43-
44-
while (k > 0) {
45-
c = xr >= yr;
46-
q = (q << 1) | c;
47-
xr -= c ? yr : 0.0f;
48-
xr += xr;
49-
--k;
50-
}
51-
52-
c = xr > yr;
53-
q = (q << 1) | c;
54-
xr -= c ? yr : 0.0f;
55-
56-
int lt = ex < ey;
57-
58-
q = lt ? 0 : q;
59-
xr = lt ? xa : xr;
60-
yr = lt ? ya : yr;
61-
62-
c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
63-
xr -= c ? yr : 0.0f;
64-
q += c;
65-
66-
float s = __clc_as_float(ey << EXPSHIFTBITS_SP32);
67-
xr *= lt ? 1.0f : s;
68-
69-
int qsgn = sx == sy ? 1 : -1;
70-
int quot = (q & 0x7f) * qsgn;
71-
72-
c = ax == ay;
73-
quot = c ? qsgn : quot;
74-
xr = c ? 0.0f : xr;
75-
76-
xr = __clc_as_float(sx ^ __clc_as_int(xr));
77-
78-
c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
79-
ay == 0;
80-
quot = c ? 0 : quot;
81-
xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr;
82-
83-
*quo = quot;
84-
85-
return xr;
86-
}
87-
// The remquo signature is special; we don't have a macro for this.
88-
#define __VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE) \
89-
_CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo( \
90-
TYPE##VEC_SIZE x, TYPE##VEC_SIZE y, __private int##VEC_SIZE *quo) { \
91-
int##HALF_VEC_SIZE lo, hi; \
92-
TYPE##VEC_SIZE ret; \
93-
ret.lo = __clc_remquo(x.lo, y.lo, &lo); \
94-
ret.hi = __clc_remquo(x.hi, y.hi, &hi); \
95-
(*quo).lo = lo; \
96-
(*quo).hi = hi; \
97-
return ret; \
98-
}
99-
100-
#define __VEC3_REMQUO(TYPE) \
101-
_CLC_DEF _CLC_OVERLOAD TYPE##3 __clc_remquo(TYPE##3 x, TYPE##3 y, \
102-
__private int##3 * quo) { \
103-
int2 lo; \
104-
int hi; \
105-
TYPE##3 ret; \
106-
ret.s01 = __clc_remquo(x.s01, y.s01, &lo); \
107-
ret.s2 = __clc_remquo(x.s2, y.s2, &hi); \
108-
(*quo).s01 = lo; \
109-
(*quo).s2 = hi; \
110-
return ret; \
111-
}
112-
__VEC_REMQUO(float, 2, )
113-
__VEC3_REMQUO(float)
114-
__VEC_REMQUO(float, 4, 2)
115-
__VEC_REMQUO(float, 8, 4)
116-
__VEC_REMQUO(float, 16, 8)
117-
118-
#ifdef cl_khr_fp64
119-
120-
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
121-
122-
_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
123-
__private int *pquo) {
124-
ulong ux = __clc_as_ulong(x);
125-
ulong ax = ux & ~SIGNBIT_DP64;
126-
ulong xsgn = ux ^ ax;
127-
double dx = __clc_as_double(ax);
128-
int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64);
129-
int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64);
130-
xexp1 = xexp < 1 ? xexp1 : xexp;
131-
132-
ulong uy = __clc_as_ulong(y);
133-
ulong ay = uy & ~SIGNBIT_DP64;
134-
double dy = __clc_as_double(ay);
135-
int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64);
136-
int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64);
137-
yexp1 = yexp < 1 ? yexp1 : yexp;
138-
139-
int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
140-
141-
// First assume |x| > |y|
142-
143-
// Set ntimes to the number of times we need to do a
144-
// partial remainder. If the exponent of x is an exact multiple
145-
// of 53 larger than the exponent of y, and the mantissa of x is
146-
// less than the mantissa of y, ntimes will be one too large
147-
// but it doesn't matter - it just means that we'll go round
148-
// the loop below one extra time.
149-
int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
150-
double w = __clc_ldexp(dy, ntimes * 53);
151-
w = ntimes == 0 ? dy : w;
152-
double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
153-
154-
// Each time round the loop we compute a partial remainder.
155-
// This is done by subtracting a large multiple of w
156-
// from x each time, where w is a scaled up version of y.
157-
// The subtraction must be performed exactly in quad
158-
// precision, though the result at each stage can
159-
// fit exactly in a double precision number.
160-
int i;
161-
double t, v, p, pp;
162-
163-
for (i = 0; i < ntimes; i++) {
164-
// Compute integral multiplier
165-
t = __clc_trunc(dx / w);
166-
167-
// Compute w * t in quad precision
168-
p = w * t;
169-
pp = __clc_fma(w, t, -p);
170-
171-
// Subtract w * t from dx
172-
v = dx - p;
173-
dx = v + (((dx - v) - p) - pp);
174-
175-
// If t was one too large, dx will be negative. Add back one w.
176-
dx += dx < 0.0 ? w : 0.0;
177-
178-
// Scale w down by 2^(-53) for the next iteration
179-
w *= scale;
180-
}
181-
182-
// One more time
183-
// Variable todd says whether the integer t is odd or not
184-
t = __clc_floor(dx / w);
185-
long lt = (long)t;
186-
int todd = lt & 1;
187-
188-
p = w * t;
189-
pp = __clc_fma(w, t, -p);
190-
v = dx - p;
191-
dx = v + (((dx - v) - p) - pp);
192-
i = dx < 0.0;
193-
todd ^= i;
194-
dx += i ? w : 0.0;
195-
196-
lt -= i;
197-
198-
// At this point, dx lies in the range [0,dy)
199-
200-
// For the remainder function, we need to adjust dx
201-
// so that it lies in the range (-y/2, y/2] by carefully
202-
// subtracting w (== dy == y) if necessary. The rigmarole
203-
// with todd is to get the correct sign of the result
204-
// when x/y lies exactly half way between two integers,
205-
// when we need to choose the even integer.
206-
207-
int al = (2.0 * dx > w) | (todd & (2.0 * dx == w));
208-
double dxl = dx - (al ? w : 0.0);
209-
210-
int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w));
211-
double dxg = dx - (ag ? w : 0.0);
212-
213-
dx = dy < 0x1.0p+1022 ? dxl : dxg;
214-
lt += dy < 0x1.0p+1022 ? al : ag;
215-
int quo = ((int)lt & 0x7f) * qsgn;
216-
217-
double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx));
218-
dx = __clc_as_double(ax);
219-
220-
// Now handle |x| == |y|
221-
int c = dx == dy;
222-
t = __clc_as_double(xsgn);
223-
quo = c ? qsgn : quo;
224-
ret = c ? t : ret;
225-
226-
// Next, handle |x| < |y|
227-
c = dx < dy;
228-
quo = c ? 0 : quo;
229-
ret = c ? x : ret;
230-
231-
c &= (yexp<1023 & 2.0 * dx> dy) | (dx > 0.5 * dy);
232-
quo = c ? qsgn : quo;
233-
// we could use a conversion here instead since qsgn = +-1
234-
p = qsgn == 1 ? -1.0 : 1.0;
235-
t = __clc_fma(y, p, x);
236-
ret = c ? t : ret;
237-
238-
// We don't need anything special for |x| == 0
239-
240-
// |y| is 0
241-
c = dy == 0.0;
242-
quo = c ? 0 : quo;
243-
ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
244-
245-
// y is +-Inf, NaN
246-
c = yexp > BIASEDEMAX_DP64;
247-
quo = c ? 0 : quo;
248-
t = y == y ? x : y;
249-
ret = c ? t : ret;
250-
251-
// x is +-Inf, NaN
252-
c = xexp > BIASEDEMAX_DP64;
253-
quo = c ? 0 : quo;
254-
ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
255-
256-
*pquo = quo;
257-
return ret;
258-
}
259-
__VEC_REMQUO(double, 2, )
260-
__VEC3_REMQUO(double)
261-
__VEC_REMQUO(double, 4, 2)
262-
__VEC_REMQUO(double, 8, 4)
263-
__VEC_REMQUO(double, 16, 8)
264-
#endif
265-
266-
#ifdef cl_khr_fp16
267-
268-
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
269-
270-
_CLC_OVERLOAD _CLC_DEF half __clc_remquo(half x, half y, __private int *pquo) {
271-
return (half)__clc_remquo((float)x, (float)y, pquo);
272-
}
273-
__VEC_REMQUO(half, 2, )
274-
__VEC3_REMQUO(half)
275-
__VEC_REMQUO(half, 4, 2)
276-
__VEC_REMQUO(half, 8, 4)
277-
__VEC_REMQUO(half, 16, 8)
278-
279-
#endif
29+
#define __CLC_ADDRESS_SPACE local
30+
#include <clc_remquo.inc>
31+
#undef __CLC_ADDRESS_SPACE

0 commit comments

Comments
 (0)