Skip to content

Add initial f16 and f128 support to the s390x backend #10691

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cranelift/codegen/src/isa/s390x/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ fn in_int_reg(ty: Type) -> bool {

/// Returns `true` when `ty` is one of the scalar floating-point types
/// `F16`, `F32` or `F64`, and `false` for every other type.
///
/// NOTE(review): judging by the name, callers use this to decide whether a
/// value lives in a floating-point register — confirm against the call sites.
fn in_flt_reg(ty: Type) -> bool {
    // A single pattern covers every accepted type; a separate
    // `F32 | F64` arm ahead of it would make part of this one unreachable.
    matches!(ty, types::F16 | types::F32 | types::F64)
}
Expand Down
20 changes: 19 additions & 1 deletion cranelift/codegen/src/isa/s390x/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,11 @@
(rn Reg)
(rm Reg))

;; Load floating-point constant, half-precision (16 bit).
(LoadFpuConst16
(rd WritableReg)
(const_data u16))

;; Load floating-point constant, single-precision (32 bit).
(LoadFpuConst32
(rd WritableReg)
Expand Down Expand Up @@ -2836,6 +2841,7 @@
(rule (arg_store $I16 reg mem) (store16 reg mem))
(rule (arg_store $I32 reg mem) (store32 reg mem))
(rule (arg_store $I64 reg mem) (store64 reg mem))
(rule (arg_store $F16 reg mem) (vec_store_lane $F16X8 reg mem 0))
(rule (arg_store $F32 reg mem) (vec_store_lane $F32X4 reg mem 0))
(rule (arg_store $F64 reg mem) (vec_store_lane $F64X2 reg mem 0))
(rule -1 (arg_store (vr128_ty ty) reg mem) (vec_store reg mem))
Expand All @@ -2861,6 +2867,7 @@
(rule 5 (abi_vec_elt_rev _ (gpr32_ty ty) reg) reg)
(rule 4 (abi_vec_elt_rev _ (gpr64_ty ty) reg) reg)
(rule 3 (abi_vec_elt_rev _ $I128 reg) reg)
(rule 3 (abi_vec_elt_rev _ $F128 reg) reg)
(rule 2 (abi_vec_elt_rev _ (ty_scalar_float ty) reg) reg)
(rule 0 (abi_vec_elt_rev callee_lane_order _ reg)
(if-let true (lane_order_equal callee_lane_order (lane_order)))
Expand Down Expand Up @@ -2925,7 +2932,7 @@
(decl imm (Type u64) Reg)

;; 16-bit (or smaller) result type, any value
(rule 7 (imm (fits_in_16 ty) n)
(rule 7 (imm (fits_in_16 (ty_int ty)) n)
(let ((dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.Mov32SImm16 dst (u64_as_i16 n)))))
dst))
Expand Down Expand Up @@ -2986,6 +2993,13 @@
(_ Unit (emit (MInst.Insert64UImm32Shifted dst src n))))
dst))

;; 16-bit floating-point type, any value. Loaded from literal pool.
;; TODO: use LZER to load 0.0
(rule 8 (imm $F16 n)
(let ((dst WritableReg (temp_writable_reg $F16))
(_ Unit (emit (MInst.LoadFpuConst16 dst (u64_as_u16 n)))))
dst))

;; 32-bit floating-point type, any value. Loaded from literal pool.
;; TODO: use LZER to load 0.0
(rule 8 (imm $F32 n)
Expand Down Expand Up @@ -3222,6 +3236,10 @@
(let ((dst WritableReg (temp_writable_reg ty))
(inst MInst (MInst.CMov64 dst cond reg_false reg_true)))
(ConsumesFlags.ConsumesFlagsReturnsReg inst dst)))
(rule 3 (cmov_reg_reg $F16 cond reg_true reg_false)
(let ((dst WritableReg (temp_writable_reg $F16))
(inst MInst (MInst.FpuCMov32 dst cond reg_false reg_true)))
(ConsumesFlags.ConsumesFlagsReturnsReg inst dst)))
(rule 3 (cmov_reg_reg $F32 cond reg_true reg_false)
(let ((dst WritableReg (temp_writable_reg $F32))
(inst MInst (MInst.FpuCMov32 dst cond reg_false reg_true)))
Expand Down
74 changes: 36 additions & 38 deletions cranelift/codegen/src/isa/s390x/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ macro_rules! debug_assert_valid_regpair {
};
}

/// BRAS opcode, encoded via `enc_ri_b`. The emission code uses it with the
/// spill temporary register and an offset that jumps over constant data
/// placed inline directly after the instruction; that register is then used
/// as the base address for loading the constant.
const OPCODE_BRAS: u16 = 0xa75;
/// Opcode the emission code labels "BCR", encoded via `enc_ri_c`. It is
/// emitted with the inverted condition and an offset of `4 + 2` or `4 + 6`
/// (presumably this instruction's length plus the length of the move that
/// follows), so the following move is skipped when the condition is false.
/// NOTE(review): the relative-offset RI-c form suggests the mnemonic is BRC
/// rather than BCR — confirm against the z/Architecture reference.
const OPCODE_BCR: u16 = 0xa74;
/// LDR opcode, encoded via `enc_rr`. Used for 64-bit floating-point moves
/// when both the destination and the source are FPRs.
const OPCODE_LDR: u16 = 0x28;
/// VLR opcode, encoded via `enc_vrr_a`. Used for vector register moves and
/// as the fallback for floating-point moves when the operands are not both
/// FPRs.
const OPCODE_VLR: u16 = 0xe756;

/// Type(s) of memory instructions available for mem_finalize.
pub struct MemInstType {
/// True if 12-bit unsigned displacement is supported.
Expand Down Expand Up @@ -2298,9 +2303,8 @@ impl Inst {
rd,
ref symbol_reloc,
} => {
let opcode = 0xa75; // BRAS
let reg = writable_spilltmp_reg().to_reg();
put(sink, &enc_ri_b(opcode, reg, 12));
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 12));
let (reloc, name, offset) = match &**symbol_reloc {
SymbolReloc::Absolute { name, offset } => (Reloc::Abs8, name, *offset),
SymbolReloc::TlsGd { name } => (Reloc::S390xTlsGd64, name, 0),
Expand All @@ -2319,53 +2323,54 @@ impl Inst {
let opcode = 0x38; // LER
put(sink, &enc_rr(opcode, rd.to_reg(), rn));
} else {
let opcode = 0xe756; // VLR
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0));
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rn, 0, 0, 0));
}
}
&Inst::FpuMove64 { rd, rn } => {
if is_fpr(rd.to_reg()) && is_fpr(rn) {
let opcode = 0x28; // LDR
put(sink, &enc_rr(opcode, rd.to_reg(), rn));
put(sink, &enc_rr(OPCODE_LDR, rd.to_reg(), rn));
} else {
let opcode = 0xe756; // VLR
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0));
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rn, 0, 0, 0));
}
}
&Inst::FpuCMov32 { rd, cond, ri, rm } => {
debug_assert_eq!(rd.to_reg(), ri);

if is_fpr(rd.to_reg()) && is_fpr(rm) {
let opcode = 0xa74; // BCR
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 2));
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 2));
let opcode = 0x38; // LER
put(sink, &enc_rr(opcode, rd.to_reg(), rm));
} else {
let opcode = 0xa74; // BCR
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6));
let opcode = 0xe756; // VLR
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0));
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 6));
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rm, 0, 0, 0));
}
}
&Inst::FpuCMov64 { rd, cond, ri, rm } => {
debug_assert_eq!(rd.to_reg(), ri);

if is_fpr(rd.to_reg()) && is_fpr(rm) {
let opcode = 0xa74; // BCR
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 2));
let opcode = 0x28; // LDR
put(sink, &enc_rr(opcode, rd.to_reg(), rm));
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 2));
put(sink, &enc_rr(OPCODE_LDR, rd.to_reg(), rm));
} else {
let opcode = 0xa74; // BCR
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6));
let opcode = 0xe756; // VLR
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0));
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 6));
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rm, 0, 0, 0));
}
}
&Inst::LoadFpuConst16 { rd, const_data } => {
let reg = writable_spilltmp_reg().to_reg();
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 6));
sink.put2(const_data.swap_bytes());
let inst = Inst::VecLoadLaneUndef {
size: 16,
rd,
mem: MemArg::reg(reg, MemFlags::trusted()),
lane_imm: 0,
};
inst.emit(sink, emit_info, state);
}
&Inst::LoadFpuConst32 { rd, const_data } => {
let opcode = 0xa75; // BRAS
let reg = writable_spilltmp_reg().to_reg();
put(sink, &enc_ri_b(opcode, reg, 8));
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 8));
sink.put4(const_data.swap_bytes());
let inst = Inst::VecLoadLaneUndef {
size: 32,
Expand All @@ -2376,9 +2381,8 @@ impl Inst {
inst.emit(sink, emit_info, state);
}
&Inst::LoadFpuConst64 { rd, const_data } => {
let opcode = 0xa75; // BRAS
let reg = writable_spilltmp_reg().to_reg();
put(sink, &enc_ri_b(opcode, reg, 12));
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 12));
sink.put8(const_data.swap_bytes());
let inst = Inst::VecLoadLaneUndef {
size: 64,
Expand Down Expand Up @@ -2780,8 +2784,7 @@ impl Inst {
put(sink, &enc_vrr_a(opcode, rm, rn, m3, 0, 0));

// If CC != 0, we're done, so jump over the next instruction.
let opcode = 0xa74; // BCR
put(sink, &enc_ri_c(opcode, 7, 4 + 6));
put(sink, &enc_ri_c(OPCODE_BCR, 7, 4 + 6));

// Otherwise, use VECTOR COMPARE HIGH LOGICAL.
// Since we already know the high parts are equal, the CC
Expand Down Expand Up @@ -2864,25 +2867,21 @@ impl Inst {
}

&Inst::VecMov { rd, rn } => {
let opcode = 0xe756; // VLR
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0));
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rn, 0, 0, 0));
}
&Inst::VecCMov { rd, cond, ri, rm } => {
debug_assert_eq!(rd.to_reg(), ri);

let opcode = 0xa74; // BCR
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6));
let opcode = 0xe756; // VLR
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0));
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 6));
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rm, 0, 0, 0));
}
&Inst::MovToVec128 { rd, rn, rm } => {
let opcode = 0xe762; // VLVGP
put(sink, &enc_vrr_f(opcode, rd.to_reg(), rn, rm));
}
&Inst::VecLoadConst { rd, const_data } => {
let opcode = 0xa75; // BRAS
let reg = writable_spilltmp_reg().to_reg();
put(sink, &enc_ri_b(opcode, reg, 20));
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 20));
for i in const_data.to_be_bytes().iter() {
sink.put1(*i);
}
Expand All @@ -2897,9 +2896,8 @@ impl Inst {
rd,
const_data,
} => {
let opcode = 0xa75; // BRAS
let reg = writable_spilltmp_reg().to_reg();
put(sink, &enc_ri_b(opcode, reg, (4 + size / 8) as i32));
put(sink, &enc_ri_b(OPCODE_BRAS, reg, (4 + size / 8) as i32));
for i in 0..size / 8 {
sink.put1((const_data >> (size - 8 - 8 * i)) as u8);
}
Expand Down
18 changes: 18 additions & 0 deletions cranelift/codegen/src/isa/s390x/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7817,6 +7817,24 @@ fn test_s390x_binemit() {
"wfcdb %v24, %f12",
));

// FIXME(#8312): Use `1.0_f16.to_bits()` once `f16` is stabilised.
let f16_1_0 = 0x3c00;
insns.push((
Inst::LoadFpuConst16 {
rd: writable_vr(8),
const_data: f16_1_0,
},
"A71500033C00E78010000001",
"bras %r1, 8 ; data.f16 0x1.000p0 ; vleh %v8, 0(%r1), 0",
));
insns.push((
Inst::LoadFpuConst16 {
rd: writable_vr(24),
const_data: f16_1_0,
},
"A71500033C00E78010000801",
"bras %r1, 8 ; data.f16 0x1.000p0 ; vleh %v24, 0(%r1), 0",
));
insns.push((
Inst::LoadFpuConst32 {
rd: writable_vr(8),
Expand Down
38 changes: 33 additions & 5 deletions cranelift/codegen/src/isa/s390x/inst/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! This module defines s390x-specific machine instruction types.

use crate::binemit::{Addend, CodeOffset, Reloc};
use crate::ir::immediates::Ieee16;
use crate::ir::{types, ExternalName, Type};
use crate::isa::s390x::abi::S390xMachineDeps;
use crate::isa::{CallConv, FunctionAlignment};
Expand Down Expand Up @@ -177,6 +178,7 @@ impl Inst {
| Inst::FpuRRRR { .. }
| Inst::FpuCmp32 { .. }
| Inst::FpuCmp64 { .. }
| Inst::LoadFpuConst16 { .. }
| Inst::LoadFpuConst32 { .. }
| Inst::LoadFpuConst64 { .. }
| Inst::VecRRR { .. }
Expand Down Expand Up @@ -324,6 +326,12 @@ impl Inst {
types::I16 => Inst::Load64ZExt16 { rd: into_reg, mem },
types::I32 => Inst::Load64ZExt32 { rd: into_reg, mem },
types::I64 => Inst::Load64 { rd: into_reg, mem },
types::F16 => Inst::VecLoadLaneUndef {
size: 16,
rd: into_reg,
mem,
lane_imm: 0,
},
types::F32 => Inst::VecLoadLaneUndef {
size: 32,
rd: into_reg,
Expand All @@ -336,8 +344,7 @@ impl Inst {
mem,
lane_imm: 0,
},
_ if ty.is_vector() && ty.bits() == 128 => Inst::VecLoad { rd: into_reg, mem },
types::I128 => Inst::VecLoad { rd: into_reg, mem },
_ if ty.bits() == 128 => Inst::VecLoad { rd: into_reg, mem },
_ => unimplemented!("gen_load({})", ty),
}
}
Expand All @@ -349,6 +356,12 @@ impl Inst {
types::I16 => Inst::Store16 { rd: from_reg, mem },
types::I32 => Inst::Store32 { rd: from_reg, mem },
types::I64 => Inst::Store64 { rd: from_reg, mem },
types::F16 => Inst::VecStoreLane {
size: 16,
rd: from_reg,
mem,
lane_imm: 0,
},
types::F32 => Inst::VecStoreLane {
size: 32,
rd: from_reg,
Expand All @@ -361,8 +374,7 @@ impl Inst {
mem,
lane_imm: 0,
},
_ if ty.is_vector() && ty.bits() == 128 => Inst::VecStore { rd: from_reg, mem },
types::I128 => Inst::VecStore { rd: from_reg, mem },
_ if ty.bits() == 128 => Inst::VecStore { rd: from_reg, mem },
_ => unimplemented!("gen_store({})", ty),
}
}
Expand Down Expand Up @@ -646,7 +658,9 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor<impl Ope
collector.reg_use(rn);
collector.reg_use(rm);
}
Inst::LoadFpuConst32 { rd, .. } | Inst::LoadFpuConst64 { rd, .. } => {
Inst::LoadFpuConst16 { rd, .. }
| Inst::LoadFpuConst32 { rd, .. }
| Inst::LoadFpuConst64 { rd, .. } => {
collector.reg_def(rd);
collector.reg_fixed_nonallocatable(gpr_preg(1));
}
Expand Down Expand Up @@ -1119,8 +1133,10 @@ impl MachInst for Inst {
types::I16 => Ok((&[RegClass::Int], &[types::I16])),
types::I32 => Ok((&[RegClass::Int], &[types::I32])),
types::I64 => Ok((&[RegClass::Int], &[types::I64])),
types::F16 => Ok((&[RegClass::Float], &[types::F16])),
types::F32 => Ok((&[RegClass::Float], &[types::F32])),
types::F64 => Ok((&[RegClass::Float], &[types::F64])),
types::F128 => Ok((&[RegClass::Float], &[types::F128])),
types::I128 => Ok((&[RegClass::Float], &[types::I128])),
_ if ty.is_vector() && ty.bits() == 128 => Ok((&[RegClass::Float], &[types::I8X16])),
_ => Err(CodegenError::Unsupported(format!(
Expand Down Expand Up @@ -2267,6 +2283,18 @@ impl Inst {
format!("wfcdb {}, {}", rn_fpr.unwrap_or(rn), rm_fpr.unwrap_or(rm))
}
}
&Inst::LoadFpuConst16 { rd, const_data } => {
let (rd, _rd_fpr) = pretty_print_fpr(rd.to_reg());
let tmp = pretty_print_reg(writable_spilltmp_reg().to_reg());
// FIXME(#8312): Use `f16::from_bits` once it is stabilised.
format!(
"bras {}, 8 ; data.f16 {} ; vleh {}, 0({}), 0",
tmp,
Ieee16::with_bits(const_data),
rd,
tmp
)
}
&Inst::LoadFpuConst32 { rd, const_data } => {
let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg());
let tmp = pretty_print_reg(writable_spilltmp_reg().to_reg());
Expand Down
Loading
Loading