From 4df9b9ac6f1bc708cfaf4fc7f2c6c303f9938194 Mon Sep 17 00:00:00 2001 From: ptitSeb Date: Sun, 16 May 2021 13:43:29 +0200 Subject: [PATCH] [DYNAREC] Force separation of x87 and mmx cache usage --- src/dynarec/dynarec_arm_0f.c | 37 +++++++++++++------------- src/dynarec/dynarec_arm_d8.c | 4 +-- src/dynarec/dynarec_arm_d9.c | 28 ++++++++++---------- src/dynarec/dynarec_arm_da.c | 6 ++--- src/dynarec/dynarec_arm_db.c | 8 +++--- src/dynarec/dynarec_arm_dc.c | 4 +-- src/dynarec/dynarec_arm_dd.c | 12 ++++----- src/dynarec/dynarec_arm_de.c | 18 ++++++------- src/dynarec/dynarec_arm_df.c | 16 +++++------ src/dynarec/dynarec_arm_helper.c | 44 ++++++++++++++++++++++++++----- src/dynarec/dynarec_arm_helper.h | 12 ++++++--- src/dynarec/dynarec_arm_private.h | 2 ++ 12 files changed, 114 insertions(+), 77 deletions(-) diff --git a/src/dynarec/dynarec_arm_0f.c b/src/dynarec/dynarec_arm_0f.c index 4f42375492..d38c26408e 100755 --- a/src/dynarec/dynarec_arm_0f.c +++ b/src/dynarec/dynarec_arm_0f.c @@ -37,10 +37,10 @@ } #define GETGM(a) \ gd = (nextop&0x38)>>3; \ - a = mmx_get_reg(dyn, ninst, x1, gd) + a = mmx_get_reg(dyn, ninst, x1, x2, x3, gd) #define GETEM(a) \ if((nextop&0xC0)==0xC0) { \ - a = mmx_get_reg(dyn, ninst, x1, nextop&7); \ + a = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); \ } else { \ addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); \ a = fpu_get_scratch_double(dyn); \ @@ -284,7 +284,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, gd = (nextop&0x38)>>3; v0 = sse_get_reg(dyn, ninst, x1, gd); if((nextop&0xC0)==0xC0) { - v1 = mmx_get_reg(dyn, ninst, x1, nextop&7); + v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); v1 = fpu_get_scratch_double(dyn); @@ -309,7 +309,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, INST_NAME("CVTTPS2PI Gm, Ex"); nextop = F8; gd = (nextop&0x38)>>3; - v0 = mmx_get_reg_empty(dyn, ninst, x1, gd); + v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd); if((nextop&0xC0)==0xC0) { v1 = sse_get_reg(dyn, ninst, x1, nextop&7); } else { @@ -323,7 +323,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, INST_NAME("CVTPS2PI Gm, Ex"); nextop = F8; gd = (nextop&0x38)>>3; - v0 = mmx_get_reg_empty(dyn, ninst, x1, gd); + v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd); if((nextop&0xC0)==0xC0) { v1 = sse_get_reg(dyn, ninst, x1, nextop&7); } else { @@ -800,7 +800,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, GETGM(d0); q0 = fpu_get_scratch_quad(dyn); if((nextop&0xC0)==0xC0) { - d1 = mmx_get_reg(dyn, ninst, x1, nextop&7); + d1 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); VMOVD(q0+1, d1); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); @@ -861,7 +861,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, nextop = F8; GETED; gd = (nextop&0x38)>>3; - v0 = mmx_get_reg_empty(dyn, ninst, x3, gd); + v0 = mmx_get_reg_empty(dyn, ninst, x2, x3, x14, gd); VEOR(v0, v0, v0); VMOVtoDx_32(v0, 0, ed); break; @@ -870,11 +870,11 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, nextop = F8; gd = (nextop&0x38)>>3; if((nextop&0xC0)==0xC0) { - v1 = mmx_get_reg(dyn, ninst, x1, nextop&7); - v0 = mmx_get_reg_empty(dyn, ninst, x1, gd); + v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); + v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd); VMOVD(v0, v1); } else { - v0 = mmx_get_reg_empty(dyn, ninst, x1, gd); + v0 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, gd); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); VLD1_64(v0, ed); } @@ -884,10 +884,10 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, nextop = F8; gd = (nextop&0x38)>>3; i32 = -1; - v0 = mmx_get_reg(dyn, ninst, x1, gd); + v0 = mmx_get_reg(dyn, ninst, x1, x2, x3, gd); if((nextop&0xC0)==0xC0) { u8 = F8; - v1 = mmx_get_reg(dyn, ninst, x1, nextop&7); + v1 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); // use stack as temporary storage SUB_IMM8(xSP, xSP, 4); if(v1==v0) { @@ -931,7 +931,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, case 2: INST_NAME("PSRLW Em, Ib"); if((nextop&0xC0)==0xC0) { - d0 = mmx_get_reg(dyn, ninst, x1, nextop&7); + d0 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); d0 = fpu_get_scratch_quad(dyn); @@ -952,7 +952,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, case 4: INST_NAME("PSRAW Em, Ib"); if((nextop&0xC0)==0xC0) { - d0 = mmx_get_reg(dyn, ninst, x1, nextop&7); + d0 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); d0 = fpu_get_scratch_quad(dyn); @@ -969,7 +969,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, case 6: INST_NAME("PSLLW Em, Ib"); if((nextop&0xC0)==0xC0) { - d0 = mmx_get_reg(dyn, ninst, x1, nextop&7); + d0 = mmx_get_reg(dyn, ninst, x1, x2, x3, nextop&7); } else { addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); d0 = fpu_get_scratch_quad(dyn); @@ -1085,8 +1085,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, case 0x77: INST_NAME("EMMS"); // empty MMX, FPU now usable - /*emu->top = 0; - emu->fpu_stack = 0;*/ //TODO: Check if something is needed here? + mmx_purgecache(dyn, ninst, x1); break; case 0x7E: @@ -1107,7 +1106,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, nextop = F8; GETGM(v0); if((nextop&0xC0)==0xC0) { - v1 = mmx_get_reg_empty(dyn, ninst, x1, nextop&7); + v1 = mmx_get_reg_empty(dyn, ninst, x1, x2, x3, nextop&7); VMOVD(v1, v0); } else { parity = getedparity(dyn, ninst, addr, nextop, 3); @@ -2189,7 +2188,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, if((nextop&0xC0)==0xC0) { DEFAULT; } else { - v0 = mmx_get_reg(dyn, ninst, x1, gd); + v0 = mmx_get_reg(dyn, ninst, x1, x2, x3, gd); addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); VST1_64(v0, ed); } diff --git a/src/dynarec/dynarec_arm_d8.c b/src/dynarec/dynarec_arm_d8.c index bbf2797603..b487d87de9 100755 --- a/src/dynarec/dynarec_arm_d8.c +++ b/src/dynarec/dynarec_arm_d8.c @@ -94,7 +94,7 @@ uintptr_t dynarecD8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VCMP_F64(v1, v2); FCOM(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xE0: case 0xE1: @@ -216,7 +216,7 @@ uintptr_t dynarecD8(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCVT_F64_F32(d1, s0); VCMP_F64(v1, d1); FCOM(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 4: INST_NAME("FSUB ST0, float[ED]"); diff --git a/src/dynarec/dynarec_arm_d9.c b/src/dynarec/dynarec_arm_d9.c index 218f903d8c..cd690c24fb 100755 --- a/src/dynarec/dynarec_arm_d9.c +++ b/src/dynarec/dynarec_arm_d9.c @@ -58,7 +58,7 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, case 0xC7: INST_NAME("FLD STx"); v1 = x87_get_st(dyn, ninst, x1, x2, nextop&7); - v2 = x87_do_push(dyn, ninst); + v2 = x87_do_push(dyn, ninst, x3); VMOV_64(v2, v1); break; @@ -108,43 +108,43 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, case 0xE8: INST_NAME("FLD1"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); MOV32(x2, (&d_1)); VLDR_64(v1, x2, 0); break; case 0xE9: INST_NAME("FLDL2T"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); MOV32(x2, (&d_l2t)); VLDR_64(v1, x2, 0); break; case 0xEA: INST_NAME("FLDL2E"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); MOV32(x2, (&d_l2e)); VLDR_64(v1, x2, 0); break; case 0xEB: INST_NAME("FLDPI"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); MOV32(x2, (&d_pi)); VLDR_64(v1, x2, 0); break; case 0xEC: INST_NAME("FLDLG2"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); MOV32(x2, (&d_lg2)); VLDR_64(v1, x2, 0); break; case 0xED: INST_NAME("FLDLN2"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); MOV32(x2, (&d_ln2)); VLDR_64(v1, x2, 0); break; case 0xEE: INST_NAME("FLDZ"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); MOV32(x2, (&d_0)); VLDR_64(v1, x2, 0); break; @@ -182,13 +182,13 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, x87_forget(dyn, ninst, x1, x2, 0); x87_forget(dyn, ninst, x1, x2, 1); CALL(arm_fyl2x, -1, 0); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xF2: INST_NAME("FTAN"); x87_forget(dyn, ninst, x1, x2, 0); CALL(arm_ftan, -1, 0); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x3); MOV32(x2, (&d_1)); VLDR_64(v1, x2, 0); break; @@ -197,7 +197,7 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, x87_forget(dyn, ninst, x1, x2, 0); x87_forget(dyn, ninst, x1, x2, 1); CALL(arm_fpatan, -1, 0); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xF4: INST_NAME("FXTRACT"); @@ -238,7 +238,7 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, x87_forget(dyn, ninst, x1, x2, 0); x87_forget(dyn, ninst, x1, x2, 1); CALL(arm_fyl2xp1, -1, 0); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xFB: INST_NAME("FSINCOS"); @@ -289,7 +289,7 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, switch((nextop>>3)&7) { case 0: INST_NAME("FLD ST0, float[ED]"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); s0 = fpu_get_scratch_single(dyn); parity = getedparity(dyn, ninst, addr, nextop, 2); if(parity) { @@ -331,7 +331,7 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VMOVfrV(x2, s0); STR_IMM9(x2, ed, fixedaddress); } - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 4: INST_NAME("FLDENV Ed"); diff --git a/src/dynarec/dynarec_arm_da.c b/src/dynarec/dynarec_arm_da.c index b4b9b653d0..92e828ba6e 100755 --- a/src/dynarec/dynarec_arm_da.c +++ b/src/dynarec/dynarec_arm_da.c @@ -108,8 +108,8 @@ uintptr_t dynarecDA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v2 = x87_get_st(dyn, ninst, x1, x2, 1); VCMP_F64(v1, v2); FCOM(x1, x2); - x87_do_pop(dyn, ninst); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); + x87_do_pop(dyn, ninst, x3); break; case 0xE4: @@ -168,7 +168,7 @@ uintptr_t dynarecDA(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, VCVT_F64_S32(d0, s0); VCMP_F64(v1, d0); FCOM(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 4: INST_NAME("FISUB ST0, Ed"); diff --git a/src/dynarec/dynarec_arm_db.c b/src/dynarec/dynarec_arm_db.c index bb806e0108..8e0bb8e3b8 100755 --- a/src/dynarec/dynarec_arm_db.c +++ b/src/dynarec/dynarec_arm_db.c @@ -159,7 +159,7 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, switch((nextop>>3)&7) { case 0: INST_NAME("FILD ST0, Ed"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); s0 = fpu_get_scratch_single(dyn); parity = getedparity(dyn, ninst, addr, nextop, 2); if(parity) { @@ -194,7 +194,7 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, MOV_IMM_COND(cNE, ed, 0b10, 1); // 0x80000000 WBACK; VMSR(x14); // put back values - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 2: INST_NAME("FIST Ed, ST0"); @@ -244,7 +244,7 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, TSTS_IMM8_ROR(x3, 0b00000001, 0); MOV_IMM_COND(cNE, ed, 0b10, 1); // 0x80000000 WBACK; - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); x87_restoreround(dyn, ninst, u8); break; case 5: @@ -279,7 +279,7 @@ uintptr_t dynarecDB(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, MOV_REG(x1, ed); } CALL(arm_fstp, -1, 0); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; default: DEFAULT; diff --git a/src/dynarec/dynarec_arm_dc.c b/src/dynarec/dynarec_arm_dc.c index 342f61002d..58ffc6bcdd 100755 --- a/src/dynarec/dynarec_arm_dc.c +++ b/src/dynarec/dynarec_arm_dc.c @@ -90,7 +90,7 @@ uintptr_t dynarecDC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VCMP_F64(v1, v2); FCOM(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xE0: case 0xE1: @@ -211,7 +211,7 @@ uintptr_t dynarecDC(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, } VCMP_F64(v1, d1); FCOM(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 4: INST_NAME("FSUB ST0, double[ED]"); diff --git a/src/dynarec/dynarec_arm_dd.c b/src/dynarec/dynarec_arm_dd.c index 492a5c0cd9..ff12586a3b 100755 --- a/src/dynarec/dynarec_arm_dd.c +++ b/src/dynarec/dynarec_arm_dd.c @@ -61,7 +61,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, break; case 0xD8: INST_NAME("FSTP ST0, ST0"); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xD9: case 0xDA: @@ -74,7 +74,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v1 = x87_get_st(dyn, ninst, x1, x2, 0); v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VMOV_64(v2, v1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xE0: @@ -104,7 +104,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VCMP_F64(v1, v2); FCOM(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xC8: @@ -138,7 +138,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, switch((nextop>>3)&7) { case 0: INST_NAME("FLD double"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); #if 0 // can bus error... addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 1023, 3); @@ -162,7 +162,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); if(ed!=x1) {MOV_REG(x1, ed);} CALL(arm_fistt64, -1, 0); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 2: INST_NAME("FST double"); @@ -191,7 +191,7 @@ uintptr_t dynarecDD(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, STR_IMM9(x2, ed, fixedaddress); STR_IMM9(x3, ed, fixedaddress+4); } - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 4: INST_NAME("FRSTOR m108byte"); diff --git a/src/dynarec/dynarec_arm_de.c b/src/dynarec/dynarec_arm_de.c index 6fe0b92f44..62a709546b 100755 --- a/src/dynarec/dynarec_arm_de.c +++ b/src/dynarec/dynarec_arm_de.c @@ -43,7 +43,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v1 = x87_get_st(dyn, ninst, x1, x2, 0); v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VADD_F64(v2, v2, v1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xC8: case 0xC9: @@ -57,7 +57,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v1 = x87_get_st(dyn, ninst, x1, x2, 0); v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VMUL_F64(v2, v2, v1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xD0: case 0xD1: @@ -72,7 +72,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VCMP_F64(v1, v2); FCOM(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xD9: @@ -81,8 +81,8 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v2 = x87_get_st(dyn, ninst, x1, x2, 1); VCMP_F64(v1, v2); FCOM(x1, x2); - x87_do_pop(dyn, ninst); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); + x87_do_pop(dyn, ninst, x3); break; case 0xE0: case 0xE1: @@ -96,7 +96,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v1 = x87_get_st(dyn, ninst, x1, x2, 0); v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VSUB_F64(v2, v1, v2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xE8: case 0xE9: @@ -110,7 +110,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v1 = x87_get_st(dyn, ninst, x1, x2, 0); v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VSUB_F64(v2, v2, v1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xF0: case 0xF1: @@ -124,7 +124,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v1 = x87_get_st(dyn, ninst, x1, x2, 0); v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VDIV_F64(v2, v1, v2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xF8: case 0xF9: @@ -138,7 +138,7 @@ uintptr_t dynarecDE(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v1 = x87_get_st(dyn, ninst, x1, x2, 0); v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VDIV_F64(v2, v2, v1); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xD8: diff --git a/src/dynarec/dynarec_arm_df.c b/src/dynarec/dynarec_arm_df.c index 1f8ab002ec..c8ad7add00 100755 --- a/src/dynarec/dynarec_arm_df.c +++ b/src/dynarec/dynarec_arm_df.c @@ -51,7 +51,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, case 0xC7: INST_NAME("FFREEP STx"); // not handling Tag... - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xE0: @@ -75,7 +75,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VCMP_F64(v1, v2); FCOMI(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xF0: case 0xF1: @@ -91,7 +91,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, v2 = x87_get_st(dyn, ninst, x1, x2, nextop&7); VCMP_F64(v1, v2); FCOMI(x1, x2); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 0xC8: @@ -140,7 +140,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, switch((nextop>>3)&7) { case 0: INST_NAME("FILD ST0, Ew"); - v1 = x87_do_push(dyn, ninst); + v1 = x87_do_push(dyn, ninst, x1); addr = geted(dyn, addr, ninst, nextop, &wback, x3, &fixedaddress, 255, 0); LDRSH_IMM8(x1, wback, fixedaddress); s0 = fpu_get_scratch_single(dyn); @@ -167,7 +167,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, CMPS_REG_LSL_IMM5_COND(cEQ, ed, x3, 0); MOVW_COND(cNE, x3, 0x8000); // saturated STRH_IMM8(x3, wback, fixedaddress); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); VMSR(x14); break; case 2: @@ -212,7 +212,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, CMPS_REG_LSL_IMM5_COND(cEQ, ed, x3, 0); MOVW_COND(cNE, x3, 0x8000); // saturated STRH_IMM8(x3, wback, fixedaddress); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); x87_restoreround(dyn, ninst, u8); break; case 4: @@ -235,7 +235,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); if(ed!=x1) {MOV_REG(x1, ed);} CALL(fpu_fbst, -1, 0); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; case 7: // could be inlined for most thing, but is it usefull? INST_NAME("FISTP i64, ST0"); @@ -243,7 +243,7 @@ uintptr_t dynarecDF(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, addr = geted(dyn, addr, ninst, nextop, &ed, x1, &fixedaddress, 0, 0); if(ed!=x1) {MOV_REG(x1, ed);} CALL(arm_fistp64, -1, 0); - x87_do_pop(dyn, ninst); + x87_do_pop(dyn, ninst, x3); break; default: DEFAULT; diff --git a/src/dynarec/dynarec_arm_helper.c b/src/dynarec/dynarec_arm_helper.c index 43538753b7..492928ebe8 100755 --- a/src/dynarec/dynarec_arm_helper.c +++ b/src/dynarec/dynarec_arm_helper.c @@ -390,6 +390,7 @@ void grab_fsdata(dynarec_arm_t* dyn, uintptr_t addr, int ninst, int reg) static void x87_reset(dynarec_arm_t* dyn, int ninst) { #if STEP > 1 + dyn->x87count = 0; for (int i=0; i<8; ++i) dyn->x87cache[i] = -1; dyn->x87stack = 0; @@ -401,6 +402,8 @@ void x87_stackcount(dynarec_arm_t* dyn, int ninst, int scratch) #if STEP > 1 if(!dyn->x87stack) return; + if(dyn->mmxcount) + mmx_purgecache(dyn, ninst, scratch); MESSAGE(LOG_DUMP, "\tSynch x87 Stackcount (%d)\n", dyn->x87stack); int a = dyn->x87stack; // Add x87stack to emu fpu_stack @@ -426,9 +429,12 @@ void x87_stackcount(dynarec_arm_t* dyn, int ninst, int scratch) #endif } -int x87_do_push(dynarec_arm_t* dyn, int ninst) +int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1) { #if STEP > 1 + if(dyn->mmxcount) + mmx_purgecache(dyn, ninst, s1); + ++dyn->x87count; dyn->x87stack+=1; // move all regs in cache, and find a free one int ret = -1; @@ -447,6 +453,9 @@ int x87_do_push(dynarec_arm_t* dyn, int ninst) void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1) { #if STEP > 1 + if(dyn->mmxcount) + mmx_purgecache(dyn, ninst, s1); + ++dyn->x87count; dyn->x87stack+=1; // move all regs in cache for(int i=0; i<8; ++i) @@ -456,9 +465,11 @@ void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1) x87_stackcount(dyn, ninst, s1); #endif } -void x87_do_pop(dynarec_arm_t* dyn, int ninst) +void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1) { #if STEP > 1 + if(dyn->mmxcount) + mmx_purgecache(dyn, ninst, s1); dyn->x87stack-=1; // move all regs in cache, poping ST0 for(int i=0; i<8; ++i) @@ -469,12 +480,16 @@ void x87_do_pop(dynarec_arm_t* dyn, int ninst) dyn->x87reg[i] = -1; } } + --dyn->x87count; #endif } -static void x87_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) +void x87_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3) { #if STEP > 1 + if(!dyn->x87count) + return; + dyn->x87count = 0; int ret = 0; for (int i=0; i<8 && !ret; ++i) if(dyn->x87cache[i] != -1) @@ -592,6 +607,8 @@ static void x87_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int int x87_get_cache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) { #if STEP > 1 + if(dyn->mmxcount) + mmx_purgecache(dyn, ninst, s1); // search in cache first for (int i=0; i<8; ++i) if(dyn->x87cache[i]==st) @@ -605,6 +622,7 @@ int x87_get_cache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) // found, setup and grab the value dyn->x87cache[ret] = st; dyn->x87reg[ret] = fpu_get_reg_double(dyn); + ++dyn->x87count; if(offsetof(x86emu_t, mmx87)<256) { ADD_IMM8(s1, xEmu, offsetof(x86emu_t, mmx87)); } else { @@ -704,12 +722,15 @@ void x87_forget(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) fpu_free_reg_double(dyn, dyn->x87reg[ret]); dyn->x87cache[ret] = -1; dyn->x87reg[ret] = -1; + --dyn->x87count; #endif } void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) { #if STEP > 1 + if(dyn->mmxcount) + mmx_purgecache(dyn, ninst, s1); // search in cache first for (int i=0; i<8; ++i) if(dyn->x87cache[i]==st) { @@ -745,6 +766,7 @@ void x87_reget_st(dynarec_arm_t* dyn, int ninst, int s1, int s2, int st) // found, setup and grab the value dyn->x87cache[ret] = st; dyn->x87reg[ret] = fpu_get_reg_double(dyn); + ++dyn->x87count; if(offsetof(x86emu_t, mmx87)<256) { ADD_IMM8(s1, xEmu, offsetof(x86emu_t, mmx87)); } else { @@ -805,16 +827,20 @@ void x87_restoreround(dynarec_arm_t* dyn, int ninst, int s1) static void mmx_reset(dynarec_arm_t* dyn, int ninst) { #if STEP > 1 + dyn->mmxcount = 0; for (int i=0; i<8; ++i) dyn->mmxcache[i] = -1; #endif } // get neon register for a MMX reg, create the entry if needed -int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a) +int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a) { #if STEP > 1 + if(dyn->x87count) + x87_purgecache(dyn, ninst, s1, s2, s3); if(dyn->mmxcache[a]!=-1) return dyn->mmxcache[a]; + ++dyn->mmxcount; int ret = dyn->mmxcache[a] = fpu_get_reg_double(dyn); int offs = offsetof(x86emu_t, mmx87[a]); if(!(offs&3) && (offs>>2)<256) { @@ -830,11 +856,14 @@ int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a) #endif } // get neon register for a MMX reg, but don't try to synch it if it needed to be created -int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a) +int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a) { #if STEP > 1 + if(dyn->x87count) + x87_purgecache(dyn, ninst, s1, s2, s3); if(dyn->mmxcache[a]!=-1) return dyn->mmxcache[a]; + ++dyn->mmxcount; int ret = dyn->mmxcache[a] = fpu_get_reg_double(dyn); return ret; #else @@ -842,9 +871,12 @@ int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a) #endif } // purge the MMX cache only(needs 3 scratch registers) -static void mmx_purgecache(dynarec_arm_t* dyn, int ninst, int s1) +void mmx_purgecache(dynarec_arm_t* dyn, int ninst, int s1) { #if STEP > 1 + if(!dyn->mmxcount) + return; + dyn->mmxcount = 0; int old = -1; for (int i=0; i<8; ++i) if(dyn->mmxcache[i]!=-1) { diff --git a/src/dynarec/dynarec_arm_helper.h b/src/dynarec/dynarec_arm_helper.h index 07066205d5..92173c427f 100755 --- a/src/dynarec/dynarec_arm_helper.h +++ b/src/dynarec/dynarec_arm_helper.h @@ -461,6 +461,8 @@ void* arm_next(x86emu_t* emu, uintptr_t addr); #define fpu_popcache STEPNAME(fpu_popcache) #define fpu_reset STEPNAME(fpu_reset) #define fpu_purgecache STEPNAME(fpu_purgecache) +#define x87_purgecache STEPNAME(x87_purgecache) +#define mmx_purgecache STEPNAME(mmx_purgecache) #ifdef HAVE_TRACE #define fpu_reflectcache STEPNAME(fpu_reflectcache) #endif @@ -564,11 +566,11 @@ void emit_pf(dynarec_arm_t* dyn, int ninst, int s1, int s3, int s4); // cache of the local stack counter, to avoid upadte at every call void x87_stackcount(dynarec_arm_t* dyn, int ninst, int scratch); // fpu push. Return the Dd value to be used -int x87_do_push(dynarec_arm_t* dyn, int ninst); +int x87_do_push(dynarec_arm_t* dyn, int ninst, int s1); // fpu push. Do not allocate a cache register. Needs a scratch register to do x87stack synch (or 0 to not do it) void x87_do_push_empty(dynarec_arm_t* dyn, int ninst, int s1); // fpu pop. All previous returned Dd should be considered invalid -void x87_do_pop(dynarec_arm_t* dyn, int ninst); +void x87_do_pop(dynarec_arm_t* dyn, int ninst, int s1); // get cache index for a x87 reg, create the entry if needed int x87_get_cache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int a); // get vfpu register for a x87 reg, create the entry if needed @@ -588,9 +590,9 @@ int sse_setround(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); //MMX helpers // get neon register for a MMX reg, create the entry if needed -int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int a); +int mmx_get_reg(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a); // get neon register for a MMX reg, but don't try to synch it if it needed to be created -int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a); +int mmx_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3, int a); //SSE/SSE2 helpers // get neon register for a SSE reg, create the entry if needed @@ -603,6 +605,8 @@ int sse_get_reg_empty(dynarec_arm_t* dyn, int ninst, int s1, int a); void fpu_reset(dynarec_arm_t* dyn, int ninst); // purge the FPU cache (needs 3 scratch registers) void fpu_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); +void x87_purgecache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); +void mmx_purgecache(dynarec_arm_t* dyn, int ninst, int s1); #ifdef HAVE_TRACE void fpu_reflectcache(dynarec_arm_t* dyn, int ninst, int s1, int s2, int s3); #endif diff --git a/src/dynarec/dynarec_arm_private.h b/src/dynarec/dynarec_arm_private.h index fbd6238064..c00092fa68 100755 --- a/src/dynarec/dynarec_arm_private.h +++ b/src/dynarec/dynarec_arm_private.h @@ -36,6 +36,8 @@ typedef struct dynarec_arm_s { int ssecache[8];// cache status for the 8 SSE(2) registers int fpuused[24];// all 8..31 double reg from fpu, used by x87, sse and mmx int x87stack; // cache stack counter + int x87count; // number of x87 register used + int mmxcount; // number of mmx register used (not both mmx and x87 at the same time) int fpu_scratch;// scratch counter int fpu_extra_qscratch; // some opcode need an extra quad scratch register int fpu_reg; // x87/sse/mmx reg counter