From ac3a7f7493979f8c825d0b19b568185162716671 Mon Sep 17 00:00:00 2001
From: Sean Maas
Date: Thu, 19 Dec 2024 22:49:42 -0500
Subject: [PATCH] Implement the DSP-1 stuff that Mario Kart needs

---
 src/cop_dsp1.S | 392 ++++++++++++++++++++++++++++++++++++++++++++++---
 src/memory.S   |   4 +-
 2 files changed, 375 insertions(+), 21 deletions(-)

diff --git a/src/cop_dsp1.S b/src/cop_dsp1.S
index 267a302..b281b2c 100644
--- a/src/cop_dsp1.S
+++ b/src/cop_dsp1.S
@@ -75,22 +75,22 @@ azsc_bounds:
 
 .align 4
 dsp1_cmds: // Lookup table for DSP1 command functions and parameter counts
-    .word dsp1_unimp, 0x20001, dsp1_unimp, 0x40000, dsp1_params, 0x70004, dsp1_unimp, 0x30003 // 0x00-0x03
-    .word dsp1_unimp, 0x20002, dsp1_unimp, 0x40000, dsp1_unimp, 0x30003, dsp1_unimp, 0x10001 // 0x04-0x07
-    .word dsp1_unimp, 0x30002, dsp1_unimp, 0x30003, dsp1_raster, 0x10004, dsp1_unimp, 0x30001 // 0x08-0x0B
-    .word dsp1_unimp, 0x30002, dsp1_unimp, 0x30003, dsp1_unimp, 0x20002, dsp1_unimp, 0x10001 // 0x0C-0x0F
-    .word dsp1_unimp, 0x20002, dsp1_unimp, 0x40000, dsp1_params, 0x70004, dsp1_unimp, 0x30003 // 0x10-0x13
-    .word dsp1_unimp, 0x60003, dsp1_unimp, 0x40000, dsp1_unimp, 0x30003, dsp1_unimp, 0x10400 // 0x14-0x17
-    .word dsp1_unimp, 0x40001, dsp1_unimp, 0x30003, dsp1_unimp, 0x00000, dsp1_unimp, 0x30001 // 0x18-0x1B
-    .word dsp1_unimp, 0x60003, dsp1_unimp, 0x30003, dsp1_unimp, 0x20002, dsp1_unimp, 0x10400 // 0x1C-0x1F
-    .word dsp1_unimp, 0x20001, dsp1_unimp, 0x40000, dsp1_params, 0x70004, dsp1_unimp, 0x30003 // 0x20-0x23
-    .word dsp1_unimp, 0x20002, dsp1_unimp, 0x40000, dsp1_unimp, 0x30003, dsp1_unimp, 0x10001 // 0x24-0x27
-    .word dsp1_unimp, 0x30001, dsp1_unimp, 0x30003, dsp1_unimp, 0x00000, dsp1_unimp, 0x30001 // 0x28-0x2B
-    .word dsp1_unimp, 0x30002, dsp1_unimp, 0x30003, dsp1_unimp, 0x20002, dsp1_unimp, 0x10001 // 0x2C-0x2F
-    .word dsp1_unimp, 0x20002, dsp1_unimp, 0x40000, dsp1_params, 0x70004, dsp1_unimp, 0x30003 // 0x30-0x33
-    .word dsp1_unimp, 0x60003, dsp1_unimp, 0x40000, dsp1_unimp, 0x30003, dsp1_unimp, 0x10400 // 0x34-0x37
-    .word dsp1_unimp, 0x40001, dsp1_unimp, 0x30003, dsp1_unimp, 0x00000, dsp1_unimp, 0x30001 // 0x38-0x3B
-    .word dsp1_unimp, 0x60003, dsp1_unimp, 0x30003, dsp1_unimp, 0x20002, dsp1_unimp, 0x10400 // 0x3C-0x3F
+    .word dsp1_multip, 0x20001, dsp1_unimpl, 0x40000, dsp1_params, 0x70004, dsp1_unimpl, 0x30003 // 0x00-0x03 (NOTE(review): count word looks like inputs << 16 | outputs; it matches every handler added by this patch - confirm)
+    .word dsp1_triang, 0x20002, dsp1_unimpl, 0x40000, dsp1_projec, 0x30003, dsp1_unimpl, 0x10001 // 0x04-0x07
+    .word dsp1_radius, 0x30002, dsp1_unimpl, 0x30003, dsp1_raster, 0x10004, dsp1_unimpl, 0x30001 // 0x08-0x0B
+    .word dsp1_rotate, 0x30002, dsp1_unimpl, 0x30003, dsp1_unimpl, 0x20002, dsp1_unimpl, 0x10001 // 0x0C-0x0F
+    .word dsp1_unimpl, 0x20002, dsp1_unimpl, 0x40000, dsp1_params, 0x70004, dsp1_unimpl, 0x30003 // 0x10-0x13
+    .word dsp1_unimpl, 0x60003, dsp1_unimpl, 0x40000, dsp1_projec, 0x30003, dsp1_unimpl, 0x10400 // 0x14-0x17
+    .word dsp1_unimpl, 0x40001, dsp1_unimpl, 0x30003, dsp1_unimpl, 0x00000, dsp1_unimpl, 0x30001 // 0x18-0x1B
+    .word dsp1_unimpl, 0x60003, dsp1_unimpl, 0x30003, dsp1_unimpl, 0x20002, dsp1_unimpl, 0x10400 // 0x1C-0x1F
+    .word dsp1_unimpl, 0x20001, dsp1_unimpl, 0x40000, dsp1_params, 0x70004, dsp1_unimpl, 0x30003 // 0x20-0x23
+    .word dsp1_triang, 0x20002, dsp1_unimpl, 0x40000, dsp1_projec, 0x30003, dsp1_unimpl, 0x10001 // 0x24-0x27
+    .word dsp1_distan, 0x30001, dsp1_unimpl, 0x30003, dsp1_unimpl, 0x00000, dsp1_unimpl, 0x30001 // 0x28-0x2B
+    .word dsp1_rotate, 0x30002, dsp1_unimpl, 0x30003, dsp1_unimpl, 0x20002, dsp1_unimpl, 0x10001 // 0x2C-0x2F
+    .word dsp1_unimpl, 0x20002, dsp1_unimpl, 0x40000, dsp1_params, 0x70004, dsp1_unimpl, 0x30003 // 0x30-0x33
+    .word dsp1_unimpl, 0x60003, dsp1_unimpl, 0x40000, dsp1_projec, 0x30003, dsp1_unimpl, 0x10400 // 0x34-0x37
+    .word dsp1_unimpl, 0x40001, dsp1_unimpl, 0x30003, dsp1_unimpl, 0x00000, dsp1_unimpl, 0x30001 // 0x38-0x3B
+    .word dsp1_unimpl, 0x60003, dsp1_unimpl, 0x30003, dsp1_unimpl, 0x20002, dsp1_unimpl, 0x10400 // 0x3C-0x3F
 
 .align 4
 data_rom: // Data ROM 
from bsnes containing lookup tables and constants
@@ -389,22 +389,91 @@ normal_loop:
     bnez t3, normal_coeff
     srl t1, t1, 1
     beqz t1, normal_coeff
-    nop
-    b normal_loop
     addi t0, t0, 1
+    b normal_loop // The addi above now sits in the beqz delay slot, so it also runs on the final exit
+    nop
 
 normal_coeff:
     // Use a lookup table to calculate the coefficient and adjust its exponent
     beqz t0, normal_same
     sll t1, t0, 1
     lh t1, data_rom + 0x42(t1)
-    sll t1, t1, 1
     mult a0, t1
     mflo a0
+    sll a0, a0, 1 // Double the product rather than the table value
 normal_same:
     jr ra
     sub a1, a1, t0
 
+.align 5
+calc_norm32: // a0: number - a0: coefficient, a1: exponent (writes t0-t4 and hi/lo)
+    // Set initial values for normalizing a 32-bit number with range -1 to 1
+    li a1, 0 // Exponent: number of leading bits skipped so far
+    li t1, 0x4000 // Probe mask, starting at bit 14
+    sra t2, a0, 31 // t2 = 0 or -1, taken from the sign bit
+    andi t3, a0, 0x7FFF // t3 = lower 15 bits of the number
+    sra a0, a0, 15 // a0 = upper part (number >> 15)
+    xor t4, t2, a0 // Absolute (upper part, bit-inverted when negative)
+    xor t2, t2, t3 // Absolute (lower part, bit-inverted when negative)
+
+norm32_loop:
+    // Count leading zeros in the absolute upper value of the number
+    and t0, t4, t1 // Test the probed bit
+    bnez t0, norm32_coeff // Stop at the first set bit
+    srl t1, t1, 1 // (delay slot) move the probe down one bit
+    beqz t1, norm32_coeff // Stop once every bit has been probed
+    addi a1, a1, 1 // (delay slot) count the bit that was just skipped
+    b norm32_loop
+    nop
+
+norm32_coeff:
+    // Use a lookup table to calculate the upper coefficient
+    beqz a1, norm32_done // Nothing skipped: return the upper part unchanged
+    sll t1, a1, 1 // (delay slot) exponent -> halfword byte offset
+    lh t1, data_rom + 0x42(t1) // Table halfword at byte offset 0x42 + 2 * exponent
+    sll t1, t1, 1
+    mult a0, t1
+    mflo a0 // a0 = upper part * (table value * 2)
+
+    // Adjust the coefficient for the lower value if upper isn't empty
+    bge a1, 15, norm16_count // All 15 upper bits were skipped: scan the lower part
+    sll t0, a1, 1 // (delay slot) exponent -> halfword byte offset
+    sub t0, zero, t0 // Negate it so the table is indexed downward from 0x80
+    lh t0, data_rom + 0x80(t0) // Table halfword at byte offset 0x80 - 2 * exponent
+    mult t0, t3
+    mflo t0
+    sra t0, t0, 15 // (lower bits * table value) >> 15
+    b norm32_done
+    add a0, a0, t0 // (delay slot) fold the lower contribution into the result
+
+norm16_count:
+    // Count leading zeros in the absolute lower value of the number
+    li t1, 0x4000 // Restart the probe at bit 14; a1 keeps counting from 15
+norm16_loop:
+    and t0, t2, t1 // Test the probed bit of the lower part
+    bnez t0, norm16_coeff // Stop at the first set bit
+    srl t1, t1, 1 // (delay slot) move the probe down one bit
+    beqz t1, norm16_coeff // Stop once every bit has been probed
+    addi a1, a1, 1 // (delay slot) count the bit that was just skipped
+    b norm16_loop
+    nop
+
+norm16_coeff:
+    // Adjust the coefficient for the lower value if upper is empty
+    ble a1, 15, norm32_done // No lower bit was skipped: the plain sum below is the result
+    add a0, a0, t3 // (delay slot) add the raw lower bits; overwritten by mflo when falling through
+    sll t0, a1, 1 // exponent -> halfword byte offset
+    lh t0, data_rom + 0x24(t0) // Table halfword at byte offset 0x24 + 2 * exponent
+    mult t0, t3
+    mflo a0
+    sll a0, a0, 1 // a0 = (lower bits * table value) * 2
+
+norm32_done:
+    // Ensure the 16-bit result is properly sign-extended
+    sll a0, a0, 16
+    jr ra
+    sra a0, a0, 16 // (delay slot) finish the sign extension before returning
+
 .align 5
 calc_denorm: // a0: coefficient, a1: exponent - a0: result
     // Use a lookup table 
to denormalize a 16-bit coefficient with its exponent
@@ -494,6 +563,138 @@ inverse_exp:
     jr ra
     addi a1, a1, 1
 
+.align 5
+dsp1_multip:
+    // Multiply two 16-bit values
+    lh t0, input_buf + 2
+    lh t1, input_buf + 0
+    mult t0, t1
+    mflo t0
+    sra t0, t0, 15 // Scale the 32-bit product back down by 2^15
+    sh t0, output_buf + 0
+    j check_output
+    nop
+
+.align 5
+dsp1_triang:
+    // Calculate the X component of a 2D vector using angle and radius
+    lh t9, input_buf + 2 // t9 = angle, kept across both trig calls
+    lh t8, input_buf + 0 // t8 = radius
+    jal calc_cos
+    move a0, t9 // (delay slot) a0 = angle in, cosine out
+    mult a0, t8
+    mflo t0
+    sra t0, t0, 15 // (cosine * radius) >> 15
+    sh t0, output_buf + 0
+
+    // Calculate the Y component of a 2D vector using angle and radius
+    jal calc_sin
+    move a0, t9 // (delay slot) a0 = angle in, sine out
+    mult a0, t8
+    mflo t0
+    sra t0, t0, 15 // (sine * radius) >> 15
+    sh t0, output_buf + 2
+    j check_output
+    nop
+
+.align 5
+dsp1_rotate:
+    // Get the sine and cosine of the input angle
+    lh a0, input_buf + 4
+    jal calc_sin
+    move t9, a0 // (delay slot) keep the angle for the cosine call
+    move t8, a0 // t8 = sine
+    jal calc_cos
+    move a0, t9 // (delay slot) a0 = angle in, cosine out
+
+    // Calculate the rotated X-coordinate from the input coordinates
+    lh t0, input_buf + 0
+    mult t0, t8
+    lh t1, input_buf + 2
+    mflo t2
+    mult t1, a0
+    sra t2, t2, 15 // (input at +0 * sine) >> 15
+    mflo t3
+    sra t3, t3, 15 // (input at +2 * cosine) >> 15
+    add t2, t2, t3
+    sh t2, output_buf + 2
+
+    // Calculate the rotated Y-coordinate from the input coordinates
+    mult t0, a0
+    mflo t2
+    mult t1, t8
+    sra t2, t2, 15 // (input at +0 * cosine) >> 15
+    mflo t3
+    sra t3, t3, 15 // (input at +2 * sine) >> 15
+    sub t2, t2, t3
+    sh t2, output_buf + 0
+    j check_output
+    nop
+
+.align 5
+dsp1_radius:
+    // Calculate the 32-bit squared length of a 3D vector
+    lh t0, input_buf + 4
+    mult t0, t0
+    lh t1, input_buf + 2
+    mflo t0
+    mult t1, t1
+    lh t2, input_buf + 0
+    mflo t1
+    mult t2, t2
+    addu t0, t0, t1 // addu: the sum of squares can exceed 31 bits and must wrap, not trap
+    mflo t2
+    addu t0, t0, t2 // addu: same reason, e.g. three inputs of 0x7FFF sum to 0xBFFD0003
+    sll t0, t0, 1
+
+    // Output the result as two 16-bit halves
+    sh t0, output_buf + 2 // Lower 16 bits
+    sra t0, t0, 16
+    sh t0, output_buf + 0 // Upper 16 bits
+    j check_output
+    nop
+
+.align 5
+dsp1_distan:
+    // Calculate the 32-bit squared length of a 3D vector
+    lh t0, input_buf + 4
+    mult t0, t0
+    lh t1, input_buf + 2
+    mflo t0
+    mult t1, t1
+    lh t2, input_buf + 0
+    mflo t1
+    mult t2, t2
+    addu t0, t0, t1 // addu: the sum of squares can exceed 31 bits and must wrap, not trap
+    mflo t2
+    addu a0, t0, t2 // addu: same reason; a0 = squared length passed to calc_norm32
+    beqz a0, distan_set // A zero vector has zero length: store a0 = 0 directly
+    nop
+
+    // Normalize and look up points for square root interpolation
+    jal calc_norm32
+    nop
+    andi t0, a1, 0x1
+    sra a0, a0, t0 // Halve the coefficient when the exponent is odd
+    sra t0, a0, 9 // Table position = coefficient >> 9
+    sll t0, t0, 1 // Position -> halfword byte offset
+    lh t1, data_rom + 0x1AC(t0) // Upper interpolation point (next table entry)
+    lh t0, data_rom + 0x1AA(t0) // Lower interpolation point
+
+    // Calculate the square root to get the length of the vector
+    sub t2, t1, t0 // Step between the two points
+    andi a0, a0, 0x1FF // Low 9 bits of the coefficient select the spot between them
+    mult t2, a0
+    mflo t2
+    sra t2, t2, 9
+    add a0, t2, t0 // Interpolated value
+    sra t0, a1, 1
+    sra a0, a0, t0 // Shift right by half of the exponent
+distan_set:
+    sh a0, output_buf + 0
+    j check_output
+    nop
+
 .align 5
 dsp1_params:
     // Save the sine and cosine of the azimuth angle
@@ -789,7 +990,158 @@ dsp1_raster:
     nop
 
 .align 5
-dsp1_unimp:
+dsp1_projec:
+    // Normalize the offset of a 3D point from the screen center
+    lh t0, input_buf + 4
+    lh t1, global_x
+    jal calc_norm32
+    sub a0, t0, t1 // (delay slot) a0 = input - global_x
+    addi t9, a1, -1 // t9 = X exponent, minus one for the halving below
+    sra t8, a0, 1 // t8 = X coefficient, halved
+    lh t0, input_buf + 2
+    lh t1, global_y
+    jal calc_norm32
+    sub a0, t0, t1 // (delay slot) a0 = input - global_y
+    addi t7, a1, -1 // t7 = Y exponent, minus one for the halving below
+    sra t6, a0, 1 // t6 = Y coefficient, halved
+    lh t0, input_buf + 0
+    lh t1, global_z
+    jal calc_norm32
+    sub a0, t0, t1 // (delay slot) a0 = input - global_z
+    addi t5, a1, -1 // t5 = Z exponent, minus one for the halving below
+    sra t4, a0, 1 // t4 = Z coefficient, halved
+
+    // Find the lowest common exponent between the components
+    blt t7, t9, low_exp1
+    move t0, t7 // (delay slot) assume the Y exponent is the smaller one
+    move t0, t9 // Otherwise take the X exponent
+low_exp1:
+    blt t0, t5, low_exp2
+    nop
+    move t0, t5 // The Z exponent is the smallest
+low_exp2:
+    sub t9, t9, t0 // Per-component distance from the common exponent
+    sub t7, t7, t0
+    sub t5, t5, t0
+
+    // Adjust the coefficients so they all use the same exponent
+    sll t9, t9, 1 // Exponent difference -> halfword byte offset
+    lh t9, data_rom + 0x62(t9)
+    mult t8, t9
+    sll t7, t7, 1
+    lh t7, data_rom + 0x62(t7)
+    mflo t8
+    sra t8, t8, 15 // t8 = adjusted X coefficient
+    mult t6, t7
+    sll t5, t5, 1
+    lh t5, data_rom + 0x62(t5)
+    mflo t6
+    mult t4, t5
+    sra t6, t6, 15 // t6 = adjusted Y coefficient
+    mflo t4
+    sra sp, t4, 15 // sp = adjusted Z coefficient; NOTE(review): sp is used as plain scratch here, assumes no stack is in use - confirm
+
+    // Calculate the dot product of the point and the normal vector
+    lh t9, normal_x
+    mult t9, t8
+    lh t7, normal_y
+    mflo t9
+    sra t9, t9, 15
+    mult t7, t6
+    lh t5, normal_z
+    mflo t7
+    sra t7, t7, 15
+    mult t5, sp
+    sub t1, zero, t9 // The three terms are accumulated negated
+    sub t1, t1, t7
+    mflo t5
+    sra t5, t5, 15
+    sub t1, t1, t5 // t1 = -(X term + Y term + Z term)
+
+    // Denormalize the product to a 32-bit value
+    li t9, 16
+    sub t9, t9, t0 // t9 = 16 - common exponent; reused for the outputs below
+    bgez t9, shift_pos
+    sub t0, zero, t9 // (delay slot) t0 = right-shift amount for the negative case
+    b shift_neg
+    sra t1, t1, t0 // (delay slot) negative t9: shift right instead
+shift_pos:
+    sll t1, t1, t9
+shift_neg:
+    addi t1, t1, 1 // These three instructions turn a value of -1 into 0
+    sltu t0, zero, t1 // and leave every other value unchanged
+    sub t1, t1, t0
+    sra t1, t1, 1
+
+    // Calculate the scale factor and denormalize it
+    lhu t0, les
+    lh t7, les_e
+    jal calc_norm32
+    add a0, t0, t1 // (delay slot) a0 = les + denormalized product
+    addi t7, t7, -15
+    add t7, t7, a1 // t7 = les_e - 15 + exponent from calc_norm32
+    jal calc_inverse
+    li a1, 0 // (delay slot) invert the coefficient with a zero exponent
+    lh t0, les_c
+    mult t0, a0
+    mflo t5
+    sra t5, t5, 15 // t5 = scale factor, reused by both projections below
+    jal calc_normal
+    move a0, t5 // (delay slot) a1 still holds the exponent from calc_inverse
+    add a1, a1, t7
+    jal calc_denorm
+    addi a1, a1, -7 // (delay slot) extra -7 applied to the exponent
+    sh a0, output_buf + 0
+
+    // Calculate the vertical projection and denormalize it
+    lh t0, vert_x
+    mult t0, t8
+    lh t1, vert_y
+    mflo t0
+    sra t0, t0, 15
+    mult t1, t6
+    lh t2, vert_z
+    mflo t1
+    mult t2, sp
+    sra t1, t1, 15
+    add t0, t0, t1
+    mflo t2
+    sra t2, t2, 15
+    add t0, t0, t2 // Dot product of the point with the vert_* vector
+    mult t0, t5
+    mflo a0
+    sra a0, a0, 15 // Scaled by the factor in t5
+    jal calc_normal
+    li a1, 0 // (delay slot) start from a zero exponent
+    add a1, a1, t7
+    jal calc_denorm
+    add a1, a1, t9 // (delay slot) add back 16 - common exponent
+    sh a0, output_buf + 2
+
+    // Calculate the horizontal projection and denormalize it
+    lh t0, hori_x
+    mult t0, t8
+    lh t1, hori_y
+    mflo t0
+    mult t1, t6
+    sra t0, t0, 15
+    mflo t1
+    sra t1, t1, 15
+    add t0, t0, t1 // Dot product of the point with the hori_* vector (X and Y only)
+    mult t0, t5
+    mflo a0
+    sra a0, a0, 15 // Scaled by the factor in t5
+    jal calc_normal
+    li a1, 0 // (delay slot) start from a zero exponent
+    add a1, a1, t7
+    jal calc_denorm
+    add a1, a1, t9 // (delay slot) add back 16 - common exponent
+    sh a0, output_buf + 4
+    j check_output
+    nop
+
+.align 5
+dsp1_unimpl:
     // Do nothing for unimplemented DSP1 commands
     la t0, output_buf
     sd zero, 0(t0)
diff --git a/src/memory.S b/src/memory.S
index 6f3cec3..ef01fdb 100644
--- a/src/memory.S
+++ b/src/memory.S
@@ -425,9 +425,11 @@ hirom_area:
     add t3, t8, t4
 
 hisram_area:
-    // Map I/O registers to banks 0x00-0x0F if DSP-1 is enabled
+    // Map I/O registers to banks 0x00-0x0F and 0x20-0x2F if DSP-1 is enabled
     srl t3, t0, 16
+    addi t3, t3, -0x10
     sltiu t3, t3, 0x10
+    xori t3, t3, 0x1
     and t3, t3, t6
     bnez t3, hiio_area
     nop