Port bones.gc math to GOAL (#3425)

Reverse engineer the skinning matrix calculation and port to GOAL. This is about 3x faster than the MIPS2c version. As usual, there is a `*use-new-bones*` flag to go back to the old version. Fix for a bug in the compiler's `.div.vf` implementation (only happens if src/dst are the same), and fix for a typo in the register allocator that would sometimes cause it not to consider xmm8-xmm15.
2024-10-20 00:57:44 -04:00 · 2024-03-15 20:31:11 -04:00 · 2024-03-15 20:31:11 -04:00 · 82fb2cc26a
parent 5a8b4e81f9
commit 82fb2cc26a
6 changed files with 892 additions and 39 deletions
--- a/docs/progress-notes/bones.md
+++ b/docs/progress-notes/bones.md
@ -0,0 +1,500 @@
 The `bones.gc` file computes skinning matrices for foreground rendering.
 Arguments:
 - `a0`: output matrix area, each matrix is 7x quadwords contaning the vertex and normal transformation matrices.
 - `a1`: input joint array. The joints contain the inverse bind pose.
 - `a2`: input bones array. The world space bone transforms
 - `a3`: num bones
 - `vf28, vf29, vf30, vf31` the camera matrix
 - `vf25, vf26, vf27` the camera matrix again
 ```asm
    daddiu sp, sp, -96
    sd ra, 0(sp)
    sq s2, 16(sp)
    sq s3, 32(sp)
    sq s4, 48(sp)
    sq s5, 64(sp)
    sq gp, 80(sp)
    lui v1, 4096
    lui t0, 4096
    ori v1, v1, 54272   ;; v1 = DMA reg addr
    ori t0, t0, 53248   ;; t0 = DMA reg addr
    lui t2, 32767       ;; 0x7fff....
    daddiu t1, a3, -16  ;; bone count - 16 (maybe we do 16 bones at a time?)
    ori t2, t2, 65535   ;; 0x7fff'ffff
    lui at, 28672       ;; scratchpad addr
    addiu t4, r0, 64    ;; t4 = 64 (= 16 bones * 4)
    addiu t5, r0, 1280  ;; t5 = 1280 (= 16 bones * 80)
    bgez t1, L17        ;; more than 16 bones?
    addiu t3, r0, 16    ;; t3 = 16
   ;; if first run is under 16 bones, adjust counts
 B1:
    or t3, a3, r0       ;; t3 = num bones
    sll r0, r0, 0
    dsll t4, t3, 2      ;; t4 = num bones * 4
    dsll a3, t3, 4      ;; a3 = num bones * 16
    dsll t1, t3, 6      ;; t1 = num bones * 64
    sll r0, r0, 0
    daddu t5, t1, a3    ;; t5 = num bones * 80
    addiu t1, r0, 0     ;; t1 = 0 (remaining bones count)
 B2:
 L17:
    addiu a3, r0, 0     ;; a3 = 0
    addiu t6, r0, 1     ;; t6 = 1
    and a1, a1, t2      ;; mask off upper bits of address (not sure why, but they do this sometimes)
    sll r0, r0, 0
    daddiu a1, a1, 12   ;; adjustment of joint pointer for the strided dma stuff.
    or a0, a0, r0
    daddiu a1, a1, -80
    sll r0, r0, 0
    ;; wait for DMA to be free...
    <snip>
 B5:
 L19:
    addiu t6, r0, 80
    addiu t7, r0, 264
    sw t6, 128(v1)   ;; addr in spad = 80 for joints
    sw a1, 16(v1)
    sw t4, 32(v1)    ;; size: num bones * 4
    sw t7, 0(v1)
    daddu a1, a1, t5
    ;; wait for dma to complete
    <snip>
 B8:
 L21:
    and a2, a2, t2  ;; clean up bones addr
    sll r0, r0, 0
    dsll t2, t3, 2    ;; t2 = bones * 4
    addiu t4, r0, 256 ;; t4 = 256
    daddu t2, t2, t3  ;; t2 = bones * 5 (size of the bone)
    addiu t6, r0, 1104  ;; addr in spad = 1104 for bones.
    dsll t5, t2, 4
    sw t6, 128(v1)
    addiu t8, r0, 0
    sw a2, 16(v1)
    daddu a2, a2, t5
    sw t2, 32(v1)
    addiu t2, r0, 1
    sw t4, 0(v1)
 ;; wait for dma
 ;; <snip>
 B11:
 L23:
    dsll t5, t8, 2     ;; ?? not sure what this is, but always zero?
    daddu t9, t5, at   ;; ptr to bone-work
    sll r0, r0, 0
    lwu t5, 0(t9)      ;; t5 = (-> bone-layout joint)
    or t6, t3, r0
    lwu t7, 8(t9)      ;; t7 = (-> bone-layout bone)
    or ra, t3, r0
    lwu t3, 16(t9)     ;; t3 = (-> bone-layout output)
    sll r0, r0, 0
    sw ra, 44(at)      ;; stash sp-size
    beq ra, r0, L36
    sw t8, 48(at)      ;; stash sp-bufnum
 B12:
    daddiu t1, t1, -16   ;; decrement bones count
    addiu t9, r0, 1280   ;; next DMA math stuff
    bgez t1, L24         ;; check if partial bone buffer
    addiu t8, r0, 16     ;; ....
 B13:
    daddiu t8, t1, 16
    addiu t1, r0, 0
    dsll t9, t8, 4
    dsll ra, t8, 6
    beq t8, r0, L25
    daddu t9, ra, t9
 B14:
 L24:
    dsll t4, t8, 2
    dsll ra, t2, 2
    daddu gp, ra, at
    sw a1, 16(v1)
    addiu ra, r0, 264
    lwu gp, 0(gp)
    andi gp, gp, 16383
    sw t4, 32(v1)
    daddu a1, a1, t9
    sw gp, 128(v1)
    addiu t4, r0, 0
    sw ra, 0(v1)
 ;; and now, for the actual bones.
 B15:
 L25:
    sll r0, r0, 0
    sw t8, 40(at)   ;; in-count
    sll r0, r0, 0
    lqc2 vf1, 0(t5)  ;; vf1, vf2, vf3, vf4 = inverse bind pose
    sll r0, r0, 0
    lqc2 vf2, 16(t5)
    sll r0, r0, 0
    lqc2 vf3, 32(t5)
    sll r0, r0, 0
    lqc2 vf4, 48(t5)
    sll r0, r0, 0
    lqc2 vf5, 0(t7)  ;; vf5, vf6, vf7, vf8 = input bone matrix.
    sll r0, r0, 0
    lqc2 vf6, 16(t7)
    sll r0, r0, 0
    lqc2 vf7, 32(t7)
    sll r0, r0, 0
    lqc2 vf8, 48(t7)
    vcallms 0         ;; run bone program
    sll r0, r0, 0
 B16:
 L26:
    sll r0, r0, 0
    sll r0, r0, 0
    daddiu t5, t5, 64  ;; advance joint
    sll r0, r0, 0
    daddiu t7, t7, 80  ;; advance bone.
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    lq t8, 0(t5)   ;; load next joint
    sll r0, r0, 0
    lq t9, 16(t5)
    sll r0, r0, 0
    lq ra, 32(t5)
    sll r0, r0, 0
    lq gp, 48(t5)
    sll r0, r0, 0
    lq s5, 0(t7)   ;; load next bone
    sll r0, r0, 0
    lq s4, 16(t7)
    sll r0, r0, 0
    lq s3, 32(t7)
    sll r0, r0, 0
    lq s2, 48(t7)
    sll r0, r0, 0
    qmtc2.ni vf1, t8 ;; swap in new inputs
    sll r0, r0, 0
    qmtc2.ni vf2, t9
    sll r0, r0, 0
    qmtc2.ni vf3, ra
    sll r0, r0, 0
    qmtc2.ni vf4, gp
    sll r0, r0, 0
    qmtc2.ni vf5, s5
    sll r0, r0, 0
    qmtc2.ni vf6, s4
    sll r0, r0, 0
    qmtc2.ni vf7, s3
    sll r0, r0, 0
    qmtc2.ni vf8, s2
    sll r0, r0, 0
    qmfc2.i t8, vf13   ;; swap out result in (vf13, vf14, vf15, vf16) and (vf9, vf10, vf11)
    sll r0, r0, 0
    qmfc2.ni t9, vf14
    sll r0, r0, 0
    qmfc2.ni ra, vf15
    sll r0, r0, 0
    qmfc2.ni gp, vf16
    sll r0, r0, 0
    qmfc2.ni s5, vf9
    sll r0, r0, 0
    qmfc2.ni s4, vf10
    sll r0, r0, 0
    qmfc2.ni s3, vf11
    vcallms 0
    sll r0, r0, 0
    sll r0, r0, 0
    sq t8, 0(t3)
    sll r0, r0, 0
    sq t9, 16(t3)
    sll r0, r0, 0
    sq ra, 32(t3)
    sll r0, r0, 0
    sq gp, 48(t3)
    sll r0, r0, 0
    sq s5, 64(t3)
    sll r0, r0, 0
    sq s4, 80(t3)
    sll r0, r0, 0
    sq s3, 96(t3)
    sll r0, r0, 0
    sq r0, 112(t3)
    daddiu t3, t3, 128
    daddiu t6, t6, -1
    bgtz t6, L26
    sll r0, r0, 0
 B17:
    sll r0, r0, 0
    lw t3, 40(at)
    beq t3, r0, L29
    sll r0, r0, 0
 B18:
 L27:
    lw t4, 0(v1)
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    andi t4, t4, 256
    sll r0, r0, 0
    beq t4, r0, L28
    sll r0, r0, 0
 B19:
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    beq r0, r0, L27
    sll r0, r0, 0
 B20:
 L28:
    dsll t5, t2, 2
    sll r0, r0, 0
    addiu t4, r0, 1
    daddu t5, t5, at
    sll r0, r0, 0
    lwu t6, 8(t5)
    dsll t5, t3, 2
    andi t6, t6, 16383
    daddu t5, t5, t3
    sw t6, 128(v1)
    dsll t6, t5, 4
    sw a2, 16(v1)
    addiu t7, r0, 256
    sw t5, 32(v1)
    daddu a2, a2, t6
    sw t7, 0(v1)
 B21:
 L29:
    sll r0, r0, 0
    lw t5, 48(at)
    sll r0, r0, 0
    lw t6, 44(at)
 B22:
 L30:
    lw t7, 0(t0)
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    andi t7, t7, 256
    sll r0, r0, 0
    beq t7, r0, L31
    sll r0, r0, 0
 B23:
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    beq r0, r0, L30
    sll r0, r0, 0
 B24:
 L31:
    beq t6, r0, L32
    sll r0, r0, 0
 B25:
    dsll t7, t5, 2
    lui t8, 28672
    daddu t7, t7, t8
    lwu t7, 16(t7)
    andi t7, t7, 16383
    sw t7, 128(t0)
    sw a0, 16(t0)
    dsll t7, t6, 3
    sw t7, 32(t0)
    addiu t7, r0, 256
    sw t7, 0(t0)
    dsll t6, t6, 7
    daddu a0, a0, t6
 B26:
 L32:
    beq t3, r0, L35
    sll r0, r0, 0
 B27:
    bne t4, r0, L35
    sll r0, r0, 0
 B28:
 L33:
    lw t6, 0(v1)
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    andi t6, t6, 256
    sll r0, r0, 0
    beq t6, r0, L34
    sll r0, r0, 0
 B29:
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    sll r0, r0, 0
    beq r0, r0, L33
    sll r0, r0, 0
 B30:
 L34:
    dsll t6, t2, 2
    lui t7, 28672
    daddu t6, t6, t7
    lwu t6, 8(t6)
    andi t6, t6, 16383
    sw t6, 128(v1)
    sw a2, 16(v1)
    addiu t6, r0, 5
    mult3 t6, t6, t3
    sw t6, 32(v1)
    addiu t6, r0, 256
    sw t6, 0(v1)
    addiu t6, r0, 80
    mult3 t6, t6, t3
    daddu a2, a2, t6
 B31:
 L35:
    or t8, t2, r0
    bne t1, r0, L22
    or t2, t5, r0
 B32:
    beq a3, r0, L22
    addiu a3, r0, 1
 B33:
 L36:
    or v0, r0, r0
    ld ra, 0(sp)
    lq gp, 80(sp)
    lq s5, 64(sp)
    lq s4, 48(sp)
    lq s3, 32(sp)
    lq s2, 16(sp)
    jr ra
    daddiu sp, sp, 96
 ```
 # VU0 micoprogram
 - vf1, vf2, vf3, vf4 = inverse bind pose
 - vf5, vf6, vf7, vf8 = input bone matrix.
 - `vf28, vf29, vf30, vf31` the camera matrix
 - `vf25, vf26, vf27` the camera matrix again
 - (vf13, vf14, vf15, vf16) output point transformation
 - (vf09, vf10, vf11) output normal transformation
 ```
 First: multiply bone and bind pose: (vf13, vf14, vf15, vf16) = (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4).
 This is doing a true matrix multiplication.
  nop                        |  mulax.xyzw ACC, vf05, vf01
  nop                        |  madday.xyzw ACC, vf06, vf01
  nop                        |  maddaz.xyzw ACC, vf07, vf01
  nop                        |  maddw.xyzw vf13, vf08, vf01
  nop                        |  mulax.xyzw ACC, vf05, vf02
  nop                        |  madday.xyzw ACC, vf06, vf02
  nop                        |  maddaz.xyzw ACC, vf07, vf02
  nop                        |  maddw.xyzw vf14, vf08, vf02
  nop                        |  mulax.xyzw ACC, vf05, vf03
  nop                        |  madday.xyzw ACC, vf06, vf03
  nop                        |  maddaz.xyzw ACC, vf07, vf03
  nop                        |  maddw.xyzw vf15, vf08, vf03
  nop                        |  mulax.xyzw ACC, vf05, vf04
  nop                        |  madday.xyzw ACC, vf06, vf04
  nop                        |  maddaz.xyzw ACC, vf07, vf04
  nop                        |  maddw.xyzw vf16, vf08, vf04
 ;; vf09 = cross(y, z)
  nop                        |  opmula.xyz ACC, vf14, vf15
  nop                        |  opmsub.xyz vf09, vf15, vf14
 ;; vf10 = cross(z, x)
  nop                        |  opmula.xyz ACC, vf15, vf13
  nop                        |  opmsub.xyz vf10, vf13, vf15
 ;; vf11 = cross(x, y)
  nop                        |  opmula.xyz ACC, vf13, vf14
  nop                        |  opmsub.xyz vf11, vf14, vf13
 ;; vf12 = cross (y, z) * x
  nop                        |  mul.xyz vf12, vf13, vf09
 ;; second multiply: doing (vf13....) = cam * (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4)
  nop                        |  mulax.xyzw ACC, vf28, vf13
  nop                        |  madday.xyzw ACC, vf29, vf13
  nop                        |  maddaz.xyzw ACC, vf30, vf13
  nop                        |  maddw.xyzw vf13, vf31, vf13
  nop                        |  mulax.w ACC, vf00, vf12
  nop                        |  madday.w ACC, vf00, vf12
  nop                        |  maddz.w vf12, vf00, vf12
  vf12.w = dot (cross(y, z), x)  [before the second multiply]
  nop                        |  mulax.xyzw ACC, vf28, vf14
  nop                        |  madday.xyzw ACC, vf29, vf14
  nop                        |  maddaz.xyzw ACC, vf30, vf14
  div Q, vf00.w, vf12.w      |  maddw.xyzw vf14, vf31, vf14  ;; divide
  nop                        |  mulax.xyzw ACC, vf28, vf15
  nop                        |  madday.xyzw ACC, vf29, vf15
  nop                        |  maddaz.xyzw ACC, vf30, vf15
  nop                        |  maddw.xyzw vf15, vf31, vf15
  nop                        |  mulax.xyzw ACC, vf28, vf16
  nop                        |  madday.xyzw ACC, vf29, vf16
  nop                        |  maddaz.xyzw ACC, vf30, vf16
  nop                        |  maddw.xyzw vf16, vf31, vf16
 ;; normal scale
  nop                        |  mul.xyzw vf09, vf09, Q
  nop                        |  mul.xyzw vf10, vf10, Q
  nop                        |  mul.xyzw vf11, vf11, Q
 ;; apply cam to normal matrix too
  nop                        |  mulax.xyzw ACC, vf25, vf09
  nop                        |  madday.xyzw ACC, vf26, vf09
  nop                        |  maddz.xyzw vf09, vf27, vf09
  nop                        |  mulax.xyzw ACC, vf25, vf10
  nop                        |  madday.xyzw ACC, vf26, vf10
  nop                        |  maddz.xyzw vf10, vf27, vf10
  nop                        |  mulax.xyzw ACC, vf25, vf11
  nop                        |  madday.xyzw ACC, vf26, vf11 :e
  nop                        |  maddz.xyzw vf11, vf27, vf11
 ```
--- a/goal_src/jak1/engine/gfx/foreground/bones.gc
+++ b/goal_src/jak1/engine/gfx/foreground/bones.gc
@ -422,6 +422,141 @@
 (def-mips2c bones-mtx-calc (function int pointer pointer int object none))
 (defmacro .cross.vf (out a b)
  `(begin
     (.outer.product.a.vf acc ,a ,b)
     (.outer.product.b.vf ,out ,b ,a acc)
     )
  )
 (defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
  "Compute skinning matrices."
  ;; (declare (print-asm))
  (dotimes (i (- count 1))
    (let ((b (-> bones (+ i 1) transform))
          (j (-> joints i bind-pose))
          (out (-> output (+ i 1)))
          )
      (rlet (
             (tmat0 :class vf)
             (tmat1 :class vf)
             (tmat2 :class vf)
             (tmat3 :class vf)
             (nmat0 :class vf)
             (nmat1 :class vf)
             (nmat2 :class vf)
             (nmat3 :class vf)
             (acc :class vf  )
             (vf0 :class vf  )
             (cam0 :class vf )
             (cam1 :class vf )
             (cam2 :class vf )
             (cam3 :class vf )
             )
            (init-vf0-vector)
            ;; load bind-pose to tmat:
            (.lvf tmat0 (&-> j quad 0))
            (.lvf tmat1 (&-> j quad 1))
            (.lvf tmat2 (&-> j quad 2))
            (.lvf tmat3 (&-> j quad 3))
            ;; load bone to nmat
            (.lvf nmat0 (&-> b quad 0))
            (.lvf nmat1 (&-> b quad 1))
            (.lvf nmat2 (&-> b quad 2))
            (.lvf nmat3 (&-> b quad 3))
            ;; multiply bone and bind pose, store in tmat
            (.mul.x.vf acc nmat0 tmat0)
            (.add.mul.y.vf acc nmat1 tmat0 acc)
            (.add.mul.z.vf acc nmat2 tmat0 acc)
            (.add.mul.w.vf tmat0 nmat3 tmat0 acc)
            (.mul.x.vf acc nmat0 tmat1)
            (.add.mul.y.vf acc nmat1 tmat1 acc)
            (.add.mul.z.vf acc nmat2 tmat1 acc)
            (.add.mul.w.vf tmat1 nmat3 tmat1 acc)
            (.mul.x.vf acc nmat0 tmat2)
            (.add.mul.y.vf acc nmat1 tmat2 acc)
            (.add.mul.z.vf acc nmat2 tmat2 acc)
            (.add.mul.w.vf tmat2 nmat3 tmat2 acc)
            (.mul.x.vf acc nmat0 tmat3)
            (.add.mul.y.vf acc nmat1 tmat3 acc)
            (.add.mul.z.vf acc nmat2 tmat3 acc)
            (.add.mul.w.vf tmat3 nmat3 tmat3 acc)
            ;; compute inverse transpose, storing in nmat.
            (.cross.vf nmat0 tmat1 tmat2)
            (.cross.vf nmat1 tmat2 tmat0)
            (.cross.vf nmat2 tmat0 tmat1)
            ;; dot nmat0 and tmat0
            (.mul.vf acc nmat0 tmat0)
            (.add.y.vf acc acc acc :mask #b1)
            (.add.z.vf acc acc acc :mask #b1)
            ;; divide!
            (.div.vf acc vf0 acc :fsf #b11 :ftf #b0)
            ;; scale nmat:
            (.mul.x.vf nmat0 nmat0 acc)
            (.mul.x.vf nmat1 nmat1 acc)
            (.mul.x.vf nmat2 nmat2 acc)
            ;; load camera
            (.lvf cam0 (&-> cam quad 0))
            (.lvf cam1 (&-> cam quad 1))
            (.lvf cam2 (&-> cam quad 2))
            (.lvf cam3 (&-> cam quad 3))
            ;; multiply tmat by camera
            (.mul.x.vf acc cam0 tmat0)
            (.add.mul.y.vf acc cam1 tmat0 acc)
            (.add.mul.z.vf acc cam2 tmat0 acc)
            (.add.mul.w.vf tmat0 cam3 tmat0 acc)
            (.mul.x.vf acc cam0 tmat1)
            (.add.mul.y.vf acc cam1 tmat1 acc)
            (.add.mul.z.vf acc cam2 tmat1 acc)
            (.add.mul.w.vf tmat1 cam3 tmat1 acc)
            (.mul.x.vf acc cam0 tmat2)
            (.add.mul.y.vf acc cam1 tmat2 acc)
            (.add.mul.z.vf acc cam2 tmat2 acc)
            (.add.mul.w.vf tmat2 cam3 tmat2 acc)
            (.mul.x.vf acc cam0 tmat3)
            (.add.mul.y.vf acc cam1 tmat3 acc)
            (.add.mul.z.vf acc cam2 tmat3 acc)
            (.add.mul.w.vf tmat3 cam3 tmat3 acc)
            ;; store tmat
            (.svf (&-> out t-mtx quad 0) tmat0)
            (.svf (&-> out t-mtx quad 1) tmat1)
            (.svf (&-> out t-mtx quad 2) tmat2)
            (.svf (&-> out t-mtx quad 3) tmat3)
            ;; multiply nmat
            (.mul.x.vf acc cam0 nmat0)
            (.add.mul.y.vf acc cam1 nmat0 acc)
            (.add.mul.z.vf nmat0 cam2 nmat0 acc)
            (.mul.x.vf acc cam0 nmat1)
            (.add.mul.y.vf acc cam1 nmat1 acc)
            (.add.mul.z.vf nmat1 cam2 nmat1 acc)
            (.mul.x.vf acc cam0 nmat2)
            (.add.mul.y.vf acc cam1 nmat2 acc)
            (.add.mul.z.vf nmat2 cam2 nmat2 acc)
            ;; store nmat
            (.svf (&-> out n-mtx quad 0) nmat0)
            (.svf (&-> out n-mtx quad 1) nmat1)
            (.svf (&-> out n-mtx quad 2) nmat2)
            )
      )
    )
  (none)
  )
 (define *use-new-bones* #t)
 (defun bones-mtx-calc-execute ()
  "Do all pending bone calculations"
  (local-vars (v1-14 float))
@ -481,13 +616,20 @@
        (.mov v1-14 vf27)
        ;; hack??
-
+        (if *use-new-bones*
-        (bones-mtx-calc
+          (new-bones-mtx-calc-asm
-          (the-as int (-> s4-0 matrix-area))
+            (the (inline-array pris-mtx) (-> s4-0 matrix-area))
-          (the-as pointer (-> s4-0 joints))
+            (-> s4-0 joints)
-          (the-as pointer (-> s4-0 bones))
+            (-> s4-0 bones)
-          (the-as int (-> s4-0 num-bones))
+            v1-13
-          v1-13 ;; hack, added
+            (the int (-> s4-0 num-bones)))
          (bones-mtx-calc
            (the-as int (-> s4-0 matrix-area))
            (the-as pointer (-> s4-0 joints))
            (the-as pointer (-> s4-0 bones))
            (the-as int (-> s4-0 num-bones))
            v1-13 ;; hack, added
            )
          )
        )
        (when (logtest? (-> s4-0 flags) (bone-calc-flags bncfl00))
--- a/goal_src/jak2/engine/gfx/foreground/bones.gc
+++ b/goal_src/jak2/engine/gfx/foreground/bones.gc
@ -120,16 +120,224 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
 ;; see the C++ code for more details.
 (def-mips2c bones-mtx-calc (function (inline-array pris-mtx) (inline-array joint) (inline-array bone) uint object none))
 (defun matrix-*float! ((output matrix3) (input matrix3) (x float))
  (dotimes (i 12)
    (set! (-> output data i) (* x (-> input data i)))
    )
  output
  )
 (defun matrix*!-first-three ((arg0 matrix3) (arg1 matrix3) (arg2 matrix))
  "Set dst = src1 * src2. It is okay for any arguments to be the same data.
   This is a moderately efficient implementation."
  (rlet ((acc :class vf)
         (vf10 :class vf)
         (vf11 :class vf)
         (vf12 :class vf)
         (vf14 :class vf)
         (vf15 :class vf)
         (vf16 :class vf)
         (vf18 :class vf)
         (vf19 :class vf)
         (vf20 :class vf)
         )
    (.lvf vf10 (&-> arg1 quad 0))
    (.lvf vf14 (&-> arg2 quad 0))
    (.lvf vf15 (&-> arg2 quad 1))
    (.lvf vf16 (&-> arg2 quad 2))
    (.lvf vf11 (&-> arg1 quad 1))
    (.lvf vf12 (&-> arg1 quad 2))
    (.mul.x.vf acc vf14 vf10)
    (.add.mul.y.vf acc vf15 vf10 acc)
    (.add.mul.z.vf vf18 vf16 vf10 acc)
    (.mul.x.vf acc vf14 vf11)
    (.add.mul.y.vf acc vf15 vf11 acc)
    (.add.mul.z.vf vf19 vf16 vf11 acc)
    (.mul.x.vf acc vf14 vf12)
    (.add.mul.y.vf acc vf15 vf12 acc)
    (.add.mul.z.vf vf20 vf16 vf12 acc)
    (.svf (&-> arg0 quad 0) vf18)
    (.svf (&-> arg0 quad 1) vf19)
    (.svf (&-> arg0 quad 2) vf20)
    arg0
    )
  )
 (defun new-bones-mtx-calc ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
  (dotimes (i (- count 1))
    (let ((b (-> bones (+ i 1) transform))
          (j (-> joints i bind-pose))
          (out (-> output (+ i 1)))
          )
      ;; multiply by bind pose
      ;; mult swaps the args
      (matrix*! (-> out t-mtx) j b)
      ;; clever way to compute inverse transpose of a 3x3:
      (vector-cross! (-> out n-mtx vector 0)
                     (-> out t-mtx vector 1)
                     (-> out t-mtx vector 2)
                     )
      (vector-cross! (-> out n-mtx vector 1)
                     (-> out t-mtx vector 2)
                     (-> out t-mtx vector 0)
                     )
      (vector-cross! (-> out n-mtx vector 2)
                     (-> out t-mtx vector 0)
                     (-> out t-mtx vector 1)
                     )
      (let ((scale (/ 1. (vector-dot (-> out n-mtx vector 0) (-> out t-mtx vector 0)))))
        (matrix-*float! (-> out n-mtx) (-> out n-mtx) scale)
        )
      ;; multiply by camera
      (matrix*! (-> out t-mtx) (-> out t-mtx) cam)
      (matrix*!-first-three (-> out n-mtx) (-> out n-mtx) cam) ;; WRONG!!
      )
    )
  )
 (defmacro .cross.vf (out a b)
  `(begin
     (.outer.product.a.vf acc ,a ,b)
     (.outer.product.b.vf ,out ,b ,a acc)
     )
  )
 (defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
  ;; (declare (print-asm))
  (dotimes (i (- count 1))
    (let ((b (-> bones (+ i 1) transform))
          (j (-> joints i bind-pose))
          (out (-> output (+ i 1)))
          )
      (rlet (
             (tmat0 :class vf)
             (tmat1 :class vf)
             (tmat2 :class vf)
             (tmat3 :class vf)
             (nmat0 :class vf)
             (nmat1 :class vf)
             (nmat2 :class vf)
             (nmat3 :class vf)
             (acc :class vf  )
             (vf0 :class vf  )
             (cam0 :class vf )
             (cam1 :class vf )
             (cam2 :class vf )
             (cam3 :class vf )
             )
            (init-vf0-vector)
            ;; load bind-pose to tmat:
            (.lvf tmat0 (&-> j quad 0))
            (.lvf tmat1 (&-> j quad 1))
            (.lvf tmat2 (&-> j quad 2))
            (.lvf tmat3 (&-> j quad 3))
            ;; load bone to nmat
            (.lvf nmat0 (&-> b quad 0))
            (.lvf nmat1 (&-> b quad 1))
            (.lvf nmat2 (&-> b quad 2))
            (.lvf nmat3 (&-> b quad 3))
            ;; multiply, store in tmat
            (.mul.x.vf acc nmat0 tmat0)
            (.add.mul.y.vf acc nmat1 tmat0 acc)
            (.add.mul.z.vf acc nmat2 tmat0 acc)
            (.add.mul.w.vf tmat0 nmat3 tmat0 acc)
            (.mul.x.vf acc nmat0 tmat1)
            (.add.mul.y.vf acc nmat1 tmat1 acc)
            (.add.mul.z.vf acc nmat2 tmat1 acc)
            (.add.mul.w.vf tmat1 nmat3 tmat1 acc)
            (.mul.x.vf acc nmat0 tmat2)
            (.add.mul.y.vf acc nmat1 tmat2 acc)
            (.add.mul.z.vf acc nmat2 tmat2 acc)
            (.add.mul.w.vf tmat2 nmat3 tmat2 acc)
            (.mul.x.vf acc nmat0 tmat3)
            (.add.mul.y.vf acc nmat1 tmat3 acc)
            (.add.mul.z.vf acc nmat2 tmat3 acc)
            (.add.mul.w.vf tmat3 nmat3 tmat3 acc)
            ;; compute inverse transpose, storing in nmat
            (.cross.vf nmat0 tmat1 tmat2)
            (.cross.vf nmat1 tmat2 tmat0)
            (.cross.vf nmat2 tmat0 tmat1)
            ;; dot nmat0 and tmat0
            (.mul.vf acc nmat0 tmat0)
            (.add.y.vf acc acc acc :mask #b1)
            (.add.z.vf acc acc acc :mask #b1)
            ;; divide!
            (.div.vf acc vf0 acc :fsf #b11 :ftf #b0)
            ;; scale nmat:
            (.mul.x.vf nmat0 nmat0 acc)
            (.mul.x.vf nmat1 nmat1 acc)
            (.mul.x.vf nmat2 nmat2 acc)
            ;; load camera
            (.lvf cam0 (&-> cam quad 0))
            (.lvf cam1 (&-> cam quad 1))
            (.lvf cam2 (&-> cam quad 2))
            (.lvf cam3 (&-> cam quad 3))
            ;; multiply tmat by camera
            (.mul.x.vf acc cam0 tmat0)
            (.add.mul.y.vf acc cam1 tmat0 acc)
            (.add.mul.z.vf acc cam2 tmat0 acc)
            (.add.mul.w.vf tmat0 cam3 tmat0 acc)
            (.mul.x.vf acc cam0 tmat1)
            (.add.mul.y.vf acc cam1 tmat1 acc)
            (.add.mul.z.vf acc cam2 tmat1 acc)
            (.add.mul.w.vf tmat1 cam3 tmat1 acc)
            (.mul.x.vf acc cam0 tmat2)
            (.add.mul.y.vf acc cam1 tmat2 acc)
            (.add.mul.z.vf acc cam2 tmat2 acc)
            (.add.mul.w.vf tmat2 cam3 tmat2 acc)
            (.mul.x.vf acc cam0 tmat3)
            (.add.mul.y.vf acc cam1 tmat3 acc)
            (.add.mul.z.vf acc cam2 tmat3 acc)
            (.add.mul.w.vf tmat3 cam3 tmat3 acc)
            ;; store tmat
            (.svf (&-> out t-mtx quad 0) tmat0)
            (.svf (&-> out t-mtx quad 1) tmat1)
            (.svf (&-> out t-mtx quad 2) tmat2)
            (.svf (&-> out t-mtx quad 3) tmat3)
            ;; multiply nmat
            (.mul.x.vf acc cam0 nmat0)
            (.add.mul.y.vf acc cam1 nmat0 acc)
            (.add.mul.z.vf nmat0 cam2 nmat0 acc)
            (.mul.x.vf acc cam0 nmat1)
            (.add.mul.y.vf acc cam1 nmat1 acc)
            (.add.mul.z.vf nmat1 cam2 nmat1 acc)
            (.mul.x.vf acc cam0 nmat2)
            (.add.mul.y.vf acc cam1 nmat2 acc)
            (.add.mul.z.vf nmat2 cam2 nmat2 acc)
            ;; store nmat
            (.svf (&-> out n-mtx quad 0) nmat0)
            (.svf (&-> out n-mtx quad 1) nmat1)
            (.svf (&-> out n-mtx quad 2) nmat2)
            )
      )
    )
  (none)
  )
 (define *use-new-bones* #t)
 (define *display-bone-stats* #f)
 (define *num-bones* 0)
 (defun bones-mtx-calc-execute ()
  "Execute all bone matrix calculations."
  (rlet ((vf1 :class vf)
         (vf25 :class vf)
         (vf26 :class vf)
         (vf27 :class vf)
         (vf28 :class vf)
         (vf29 :class vf)
         (vf30 :class vf)
         (vf31 :class vf)
         (vf4 :class vf)
         (vf5 :class vf)
         (vf6 :class vf)
@ -157,6 +365,7 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
                 (s5-1 (-> *math-camera* camera-rot))
                 (s4-1 (-> v1-26 first))
                 )
            (set! *num-bones* 0)
            (while (nonzero? s4-1) ;; loop
              ;; pick correct matrix. The pris-mtx includes camera rotation, so all that's needed in the final
@ -167,23 +376,19 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
                               )
                           )
                    )
                (.lvf vf28 (&-> v1-29 quad 0))
                (.lvf vf29 (&-> v1-29 quad 1))
                (.lvf vf30 (&-> v1-29 quad 2))
                (.lvf vf31 (&-> v1-29 trans quad))
                (.lvf vf25 (&-> v1-29 quad 0))
                (.lvf vf26 (&-> v1-29 quad 1))
                (.lvf vf27 (&-> v1-29 quad 2))
                ;; calculate!
-                (bones-mtx-calc
+                (if *use-new-bones*
-                  (-> s4-1 matrix-area)
+                    (new-bones-mtx-calc-asm (-> s4-1 matrix-area) (-> s4-1 joints) (-> s4-1 bones) v1-29 (the int (-> s4-1 num-bones)))
-                  (-> s4-1 joints)
+                    (bones-mtx-calc
-                  (-> s4-1 bones)
+                      (-> s4-1 matrix-area)
-                  (-> s4-1 num-bones)
+                      (-> s4-1 joints)
-                  v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way.
+                      (-> s4-1 bones)
-                  )
+                      (-> s4-1 num-bones)
                      v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way.
                      )
                    )
                )
              (+! *num-bones* (-> s4-1 num-bones))
              ;; there is an optional post-processing step for ripple.
              (when (logtest? (-> s4-1 flags) (bone-calc-flags write-ripple-data))
@ -228,6 +433,10 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
              )
            )
          (when *display-bone-stats*
            (format *stdcon* "num-bones: ~D~%" *num-bones*)
            )
          ;; reset sqwc
          ;; (set! (-> (the-as dma-bank-control #x1000e000) sqwc) (new 'static 'dma-sqwc :sqwc #x1 :tqwc #x1))
--- a/goalc/compiler/compilation/Asm.cpp
+++ b/goalc/compiler/compilation/Asm.cpp
@ -1233,17 +1233,17 @@ Val* Compiler::compile_asm_div_vf(const goos::Object& form, const goos::Object&
  // Why do we even bother using VDIVPS instead of FDIV? Because otherwise in x86, you have to use
  // the FPU stack Registers are nicer.
-  // Save one temp reg, use the destination as one
+  auto temp_reg1 = env->make_vfr(dest->type());
-  auto temp_reg = env->make_vfr(dest->type());
+  auto temp_reg2 = env->make_vfr(dest->type());
-  // Splat src1's value into the dest reg, keep it simple, this way no matter which vector component
+  // Splat src1's value into a temp reg, keep it simple, this way no matter which vector component
  // is accessed from the final result will be the correct answer
-  env->emit_ir<IR_SplatVF>(form, color, dest, src1, ftf_fsf_to_vector_element(fsf));
+  env->emit_ir<IR_SplatVF>(form, color, temp_reg1, src1, ftf_fsf_to_vector_element(fsf));
  // Splat src1's value into the the temp reg
-  env->emit_ir<IR_SplatVF>(form, color, temp_reg, src2, ftf_fsf_to_vector_element(ftf));
+  env->emit_ir<IR_SplatVF>(form, color, temp_reg2, src2, ftf_fsf_to_vector_element(ftf));
  // Perform the Division
-  env->emit_ir<IR_VFMath3Asm>(form, color, dest, dest, temp_reg, IR_VFMath3Asm::Kind::DIV);
+  env->emit_ir<IR_VFMath3Asm>(form, color, dest, temp_reg1, temp_reg2, IR_VFMath3Asm::Kind::DIV);
  return get_none();
 }
--- a/goalc/regalloc/Allocator_v2.cpp
+++ b/goalc/regalloc/Allocator_v2.cpp
@ -332,7 +332,7 @@ const std::vector<emitter::Register>& get_alloc_order(int var_idx,
      if (is_gpr) {
        return REG_temp_first_order.gprs;
      } else {
-        return REG_temp_only_order.xmms;
+        return REG_temp_first_order.xmms;
      }
    }
  }
@ -1060,8 +1060,10 @@ bool run_assignment_on_var(const AllocationInput& input,
      for (auto& reg : assign_order) {
        bool worked = check_register_assign(input, *cache, var_idx, reg);
        if (trace) {
-          lg::print("m2 trying var {} in {}: {}\n", cache->iregs.at(var_idx).to_string(),
+          const auto& this_var = cache->vars.at(var_idx);
-                    reg.print(), worked);
+          lg::print("m2 trying var {} in {} (live {} to {}): {}\n",
                    cache->iregs.at(var_idx).to_string(), reg.print(), this_var.first_live(),
                    this_var.last_live(), worked);
        }
        if (worked) {
          var.assign_to_register(reg);
--- a/goalc/regalloc/allocator_interface.cpp
+++ b/goalc/regalloc/allocator_interface.cpp
@ -24,7 +24,7 @@ void print_allocate_input(const AllocationInput& in) {
    }
  } else {
    for (const auto& instruction : in.instructions) {
-      lg::print(" [{:3d}] {}\n", instruction.print());
+      lg::print(" {}\n", instruction.print());
    }
  }
  lg::print("[RegAlloc] Debug Input Constraints:\n");