mirror of
https://github.com/open-goal/jak-project.git
synced 2024-10-20 00:57:44 -04:00
Port bones.gc
math to GOAL (#3425)
Reverse engineer the skinning matrix calculation and port to GOAL. This is about 3x faster than the MIPS2c version. As usual, there is a `*use-new-bones*` flag to go back to the old version. Fix for a bug in the compiler's `.div.vf` implementation (only happens if src/dst are the same), and fix for a typo in the register allocator that would sometimes cause it not to consider xmm8-xmm15.
This commit is contained in:
parent
5a8b4e81f9
commit
82fb2cc26a
500
docs/progress-notes/bones.md
Normal file
500
docs/progress-notes/bones.md
Normal file
|
@ -0,0 +1,500 @@
|
|||
The `bones.gc` file computes skinning matrices for foreground rendering.
|
||||
|
||||
Arguments:
|
||||
- `a0`: output matrix area, each matrix is 7 quadwords containing the vertex and normal transformation matrices.
|
||||
- `a1`: input joint array. The joints contain the inverse bind pose.
|
||||
- `a2`: input bones array. The world space bone transforms
|
||||
- `a3`: num bones
|
||||
|
||||
- `vf28, vf29, vf30, vf31` the camera matrix
|
||||
- `vf25, vf26, vf27` the first three rows of the camera matrix again (applied to the normal matrix)
|
||||
|
||||
```asm
|
||||
daddiu sp, sp, -96
|
||||
sd ra, 0(sp)
|
||||
sq s2, 16(sp)
|
||||
sq s3, 32(sp)
|
||||
sq s4, 48(sp)
|
||||
sq s5, 64(sp)
|
||||
sq gp, 80(sp)
|
||||
|
||||
lui v1, 4096
|
||||
lui t0, 4096
|
||||
ori v1, v1, 54272 ;; v1 = DMA reg addr
|
||||
ori t0, t0, 53248 ;; t0 = DMA reg addr
|
||||
lui t2, 32767 ;; 0x7fff....
|
||||
daddiu t1, a3, -16 ;; bone count - 16 (maybe we do 16 bones at a time?)
|
||||
ori t2, t2, 65535 ;; 0x7fff'ffff
|
||||
lui at, 28672 ;; scratchpad addr
|
||||
addiu t4, r0, 64 ;; t4 = 64 (= 16 bones * 4)
|
||||
addiu t5, r0, 1280 ;; t5 = 1280 (= 16 bones * 80)
|
||||
bgez t1, L17 ;; at least 16 bones? (t1 = bone count - 16)
|
||||
addiu t3, r0, 16 ;; t3 = 16
|
||||
|
||||
;; if first run is under 16 bones, adjust counts
|
||||
B1:
|
||||
or t3, a3, r0 ;; t3 = num bones
|
||||
sll r0, r0, 0
|
||||
dsll t4, t3, 2 ;; t4 = num bones * 4
|
||||
dsll a3, t3, 4 ;; a3 = num bones * 16
|
||||
dsll t1, t3, 6 ;; t1 = num bones * 64
|
||||
sll r0, r0, 0
|
||||
daddu t5, t1, a3 ;; t5 = num bones * 80
|
||||
addiu t1, r0, 0 ;; t1 = 0 (remaining bones count)
|
||||
B2:
|
||||
L17:
|
||||
addiu a3, r0, 0 ;; a3 = 0
|
||||
addiu t6, r0, 1 ;; t6 = 1
|
||||
and a1, a1, t2 ;; mask off upper bits of address (not sure why, but they do this sometimes)
|
||||
sll r0, r0, 0
|
||||
daddiu a1, a1, 12 ;; adjustment of joint pointer for the strided dma stuff.
|
||||
or a0, a0, r0
|
||||
daddiu a1, a1, -80
|
||||
sll r0, r0, 0
|
||||
|
||||
;; wait for DMA to be free...
|
||||
<snip>
|
||||
|
||||
B5:
|
||||
L19:
|
||||
addiu t6, r0, 80
|
||||
addiu t7, r0, 264
|
||||
sw t6, 128(v1) ;; addr in spad = 80 for joints
|
||||
sw a1, 16(v1)
|
||||
sw t4, 32(v1) ;; size: num bones * 4
|
||||
sw t7, 0(v1)
|
||||
daddu a1, a1, t5
|
||||
;; wait for dma to complete
|
||||
<snip>
|
||||
|
||||
B8:
|
||||
L21:
|
||||
and a2, a2, t2 ;; clean up bones addr
|
||||
sll r0, r0, 0
|
||||
dsll t2, t3, 2 ;; t2 = bones * 4
|
||||
addiu t4, r0, 256 ;; t4 = 256
|
||||
daddu t2, t2, t3 ;; t2 = bones * 5 (size of the bone)
|
||||
addiu t6, r0, 1104 ;; addr in spad = 1104 for bones.
|
||||
dsll t5, t2, 4
|
||||
sw t6, 128(v1)
|
||||
addiu t8, r0, 0
|
||||
sw a2, 16(v1)
|
||||
daddu a2, a2, t5
|
||||
sw t2, 32(v1)
|
||||
addiu t2, r0, 1
|
||||
sw t4, 0(v1)
|
||||
;; wait for dma
|
||||
;; <snip>
|
||||
|
||||
B11:
|
||||
L23:
|
||||
dsll t5, t8, 2 ;; ?? not sure what this is, but always zero?
|
||||
daddu t9, t5, at ;; ptr to bone-work
|
||||
sll r0, r0, 0
|
||||
lwu t5, 0(t9) ;; t5 = (-> bone-layout joint)
|
||||
or t6, t3, r0
|
||||
lwu t7, 8(t9) ;; t7 = (-> bone-layout bone)
|
||||
or ra, t3, r0
|
||||
lwu t3, 16(t9) ;; t3 = (-> bone-layout output)
|
||||
sll r0, r0, 0
|
||||
sw ra, 44(at) ;; stash sp-size
|
||||
beq ra, r0, L36
|
||||
sw t8, 48(at) ;; stash sp-bufnum
|
||||
|
||||
B12:
|
||||
daddiu t1, t1, -16 ;; decrement bones count
|
||||
addiu t9, r0, 1280 ;; next DMA math stuff
|
||||
bgez t1, L24 ;; check if partial bone buffer
|
||||
addiu t8, r0, 16 ;; ....
|
||||
|
||||
B13:
|
||||
daddiu t8, t1, 16
|
||||
addiu t1, r0, 0
|
||||
dsll t9, t8, 4
|
||||
dsll ra, t8, 6
|
||||
beq t8, r0, L25
|
||||
daddu t9, ra, t9
|
||||
|
||||
B14:
|
||||
L24:
|
||||
dsll t4, t8, 2
|
||||
dsll ra, t2, 2
|
||||
daddu gp, ra, at
|
||||
sw a1, 16(v1)
|
||||
addiu ra, r0, 264
|
||||
lwu gp, 0(gp)
|
||||
andi gp, gp, 16383
|
||||
sw t4, 32(v1)
|
||||
daddu a1, a1, t9
|
||||
sw gp, 128(v1)
|
||||
addiu t4, r0, 0
|
||||
sw ra, 0(v1)
|
||||
|
||||
;; and now, for the actual bones.
|
||||
B15:
|
||||
L25:
|
||||
sll r0, r0, 0
|
||||
sw t8, 40(at) ;; in-count
|
||||
sll r0, r0, 0
|
||||
lqc2 vf1, 0(t5) ;; vf1, vf2, vf3, vf4 = inverse bind pose
|
||||
sll r0, r0, 0
|
||||
lqc2 vf2, 16(t5)
|
||||
sll r0, r0, 0
|
||||
lqc2 vf3, 32(t5)
|
||||
sll r0, r0, 0
|
||||
lqc2 vf4, 48(t5)
|
||||
sll r0, r0, 0
|
||||
lqc2 vf5, 0(t7) ;; vf5, vf6, vf7, vf8 = input bone matrix.
|
||||
sll r0, r0, 0
|
||||
lqc2 vf6, 16(t7)
|
||||
sll r0, r0, 0
|
||||
lqc2 vf7, 32(t7)
|
||||
sll r0, r0, 0
|
||||
lqc2 vf8, 48(t7)
|
||||
vcallms 0 ;; run bone program
|
||||
sll r0, r0, 0
|
||||
B16:
|
||||
L26:
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
daddiu t5, t5, 64 ;; advance joint
|
||||
sll r0, r0, 0
|
||||
daddiu t7, t7, 80 ;; advance bone.
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
lq t8, 0(t5) ;; load next joint
|
||||
sll r0, r0, 0
|
||||
lq t9, 16(t5)
|
||||
sll r0, r0, 0
|
||||
lq ra, 32(t5)
|
||||
sll r0, r0, 0
|
||||
lq gp, 48(t5)
|
||||
sll r0, r0, 0
|
||||
lq s5, 0(t7) ;; load next bone
|
||||
sll r0, r0, 0
|
||||
lq s4, 16(t7)
|
||||
sll r0, r0, 0
|
||||
lq s3, 32(t7)
|
||||
sll r0, r0, 0
|
||||
lq s2, 48(t7)
|
||||
sll r0, r0, 0
|
||||
qmtc2.ni vf1, t8 ;; swap in new inputs
|
||||
sll r0, r0, 0
|
||||
qmtc2.ni vf2, t9
|
||||
sll r0, r0, 0
|
||||
qmtc2.ni vf3, ra
|
||||
sll r0, r0, 0
|
||||
qmtc2.ni vf4, gp
|
||||
sll r0, r0, 0
|
||||
qmtc2.ni vf5, s5
|
||||
sll r0, r0, 0
|
||||
qmtc2.ni vf6, s4
|
||||
sll r0, r0, 0
|
||||
qmtc2.ni vf7, s3
|
||||
sll r0, r0, 0
|
||||
qmtc2.ni vf8, s2
|
||||
sll r0, r0, 0
|
||||
qmfc2.i t8, vf13 ;; swap out result in (vf13, vf14, vf15, vf16) and (vf9, vf10, vf11)
|
||||
sll r0, r0, 0
|
||||
qmfc2.ni t9, vf14
|
||||
sll r0, r0, 0
|
||||
qmfc2.ni ra, vf15
|
||||
sll r0, r0, 0
|
||||
qmfc2.ni gp, vf16
|
||||
sll r0, r0, 0
|
||||
qmfc2.ni s5, vf9
|
||||
sll r0, r0, 0
|
||||
qmfc2.ni s4, vf10
|
||||
sll r0, r0, 0
|
||||
qmfc2.ni s3, vf11
|
||||
vcallms 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sq t8, 0(t3)
|
||||
sll r0, r0, 0
|
||||
sq t9, 16(t3)
|
||||
sll r0, r0, 0
|
||||
sq ra, 32(t3)
|
||||
sll r0, r0, 0
|
||||
sq gp, 48(t3)
|
||||
sll r0, r0, 0
|
||||
sq s5, 64(t3)
|
||||
sll r0, r0, 0
|
||||
sq s4, 80(t3)
|
||||
sll r0, r0, 0
|
||||
sq s3, 96(t3)
|
||||
sll r0, r0, 0
|
||||
sq r0, 112(t3)
|
||||
daddiu t3, t3, 128
|
||||
daddiu t6, t6, -1
|
||||
bgtz t6, L26
|
||||
sll r0, r0, 0
|
||||
|
||||
B17:
|
||||
sll r0, r0, 0
|
||||
lw t3, 40(at)
|
||||
beq t3, r0, L29
|
||||
sll r0, r0, 0
|
||||
|
||||
B18:
|
||||
L27:
|
||||
lw t4, 0(v1)
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
andi t4, t4, 256
|
||||
sll r0, r0, 0
|
||||
beq t4, r0, L28
|
||||
sll r0, r0, 0
|
||||
|
||||
B19:
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
beq r0, r0, L27
|
||||
sll r0, r0, 0
|
||||
|
||||
B20:
|
||||
L28:
|
||||
dsll t5, t2, 2
|
||||
sll r0, r0, 0
|
||||
addiu t4, r0, 1
|
||||
daddu t5, t5, at
|
||||
sll r0, r0, 0
|
||||
lwu t6, 8(t5)
|
||||
dsll t5, t3, 2
|
||||
andi t6, t6, 16383
|
||||
daddu t5, t5, t3
|
||||
sw t6, 128(v1)
|
||||
dsll t6, t5, 4
|
||||
sw a2, 16(v1)
|
||||
addiu t7, r0, 256
|
||||
sw t5, 32(v1)
|
||||
daddu a2, a2, t6
|
||||
sw t7, 0(v1)
|
||||
B21:
|
||||
L29:
|
||||
sll r0, r0, 0
|
||||
lw t5, 48(at)
|
||||
sll r0, r0, 0
|
||||
lw t6, 44(at)
|
||||
B22:
|
||||
L30:
|
||||
lw t7, 0(t0)
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
andi t7, t7, 256
|
||||
sll r0, r0, 0
|
||||
beq t7, r0, L31
|
||||
sll r0, r0, 0
|
||||
|
||||
B23:
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
beq r0, r0, L30
|
||||
sll r0, r0, 0
|
||||
|
||||
B24:
|
||||
L31:
|
||||
beq t6, r0, L32
|
||||
sll r0, r0, 0
|
||||
|
||||
B25:
|
||||
dsll t7, t5, 2
|
||||
lui t8, 28672
|
||||
daddu t7, t7, t8
|
||||
lwu t7, 16(t7)
|
||||
andi t7, t7, 16383
|
||||
sw t7, 128(t0)
|
||||
sw a0, 16(t0)
|
||||
dsll t7, t6, 3
|
||||
sw t7, 32(t0)
|
||||
addiu t7, r0, 256
|
||||
sw t7, 0(t0)
|
||||
dsll t6, t6, 7
|
||||
daddu a0, a0, t6
|
||||
B26:
|
||||
L32:
|
||||
beq t3, r0, L35
|
||||
sll r0, r0, 0
|
||||
|
||||
B27:
|
||||
bne t4, r0, L35
|
||||
sll r0, r0, 0
|
||||
|
||||
B28:
|
||||
L33:
|
||||
lw t6, 0(v1)
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
andi t6, t6, 256
|
||||
sll r0, r0, 0
|
||||
beq t6, r0, L34
|
||||
sll r0, r0, 0
|
||||
|
||||
B29:
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
sll r0, r0, 0
|
||||
beq r0, r0, L33
|
||||
sll r0, r0, 0
|
||||
|
||||
B30:
|
||||
L34:
|
||||
dsll t6, t2, 2
|
||||
lui t7, 28672
|
||||
daddu t6, t6, t7
|
||||
lwu t6, 8(t6)
|
||||
andi t6, t6, 16383
|
||||
sw t6, 128(v1)
|
||||
sw a2, 16(v1)
|
||||
addiu t6, r0, 5
|
||||
mult3 t6, t6, t3
|
||||
sw t6, 32(v1)
|
||||
addiu t6, r0, 256
|
||||
sw t6, 0(v1)
|
||||
addiu t6, r0, 80
|
||||
mult3 t6, t6, t3
|
||||
daddu a2, a2, t6
|
||||
B31:
|
||||
L35:
|
||||
or t8, t2, r0
|
||||
bne t1, r0, L22
|
||||
or t2, t5, r0
|
||||
|
||||
B32:
|
||||
beq a3, r0, L22
|
||||
addiu a3, r0, 1
|
||||
|
||||
B33:
|
||||
L36:
|
||||
or v0, r0, r0
|
||||
ld ra, 0(sp)
|
||||
lq gp, 80(sp)
|
||||
lq s5, 64(sp)
|
||||
lq s4, 48(sp)
|
||||
lq s3, 32(sp)
|
||||
lq s2, 16(sp)
|
||||
jr ra
|
||||
daddiu sp, sp, 96
|
||||
```
|
||||
|
||||
|
||||
# VU0 microprogram
|
||||
|
||||
- vf1, vf2, vf3, vf4 = inverse bind pose
|
||||
- vf5, vf6, vf7, vf8 = input bone matrix.
|
||||
- `vf28, vf29, vf30, vf31` the camera matrix
|
||||
- `vf25, vf26, vf27` the first three rows of the camera matrix again (applied to the normal matrix)
|
||||
- (vf13, vf14, vf15, vf16) output point transformation
|
||||
- (vf09, vf10, vf11) output normal transformation
|
||||
|
||||
```
|
||||
First: multiply bone and bind pose: (vf13, vf14, vf15, vf16) = (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4).
|
||||
This is doing a true matrix multiplication.
|
||||
nop | mulax.xyzw ACC, vf05, vf01
|
||||
nop | madday.xyzw ACC, vf06, vf01
|
||||
nop | maddaz.xyzw ACC, vf07, vf01
|
||||
nop | maddw.xyzw vf13, vf08, vf01
|
||||
nop | mulax.xyzw ACC, vf05, vf02
|
||||
nop | madday.xyzw ACC, vf06, vf02
|
||||
nop | maddaz.xyzw ACC, vf07, vf02
|
||||
nop | maddw.xyzw vf14, vf08, vf02
|
||||
nop | mulax.xyzw ACC, vf05, vf03
|
||||
nop | madday.xyzw ACC, vf06, vf03
|
||||
nop | maddaz.xyzw ACC, vf07, vf03
|
||||
nop | maddw.xyzw vf15, vf08, vf03
|
||||
nop | mulax.xyzw ACC, vf05, vf04
|
||||
nop | madday.xyzw ACC, vf06, vf04
|
||||
nop | maddaz.xyzw ACC, vf07, vf04
|
||||
nop | maddw.xyzw vf16, vf08, vf04
|
||||
|
||||
;; vf09 = cross(y, z)
|
||||
nop | opmula.xyz ACC, vf14, vf15
|
||||
nop | opmsub.xyz vf09, vf15, vf14
|
||||
|
||||
;; vf10 = cross(z, x)
|
||||
nop | opmula.xyz ACC, vf15, vf13
|
||||
nop | opmsub.xyz vf10, vf13, vf15
|
||||
|
||||
;; vf11 = cross(x, y)
|
||||
nop | opmula.xyz ACC, vf13, vf14
|
||||
nop | opmsub.xyz vf11, vf14, vf13
|
||||
|
||||
;; vf12 = cross (y, z) * x
|
||||
nop | mul.xyz vf12, vf13, vf09
|
||||
|
||||
;; second multiply: doing (vf13....) = cam * (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4)
|
||||
nop | mulax.xyzw ACC, vf28, vf13
|
||||
nop | madday.xyzw ACC, vf29, vf13
|
||||
nop | maddaz.xyzw ACC, vf30, vf13
|
||||
nop | maddw.xyzw vf13, vf31, vf13
|
||||
|
||||
nop | mulax.w ACC, vf00, vf12
|
||||
nop | madday.w ACC, vf00, vf12
|
||||
nop | maddz.w vf12, vf00, vf12
|
||||
vf12.w = dot (cross(y, z), x) [before the second multiply]
|
||||
|
||||
nop | mulax.xyzw ACC, vf28, vf14
|
||||
nop | madday.xyzw ACC, vf29, vf14
|
||||
nop | maddaz.xyzw ACC, vf30, vf14
|
||||
div Q, vf00.w, vf12.w | maddw.xyzw vf14, vf31, vf14 ;; divide
|
||||
nop | mulax.xyzw ACC, vf28, vf15
|
||||
nop | madday.xyzw ACC, vf29, vf15
|
||||
nop | maddaz.xyzw ACC, vf30, vf15
|
||||
nop | maddw.xyzw vf15, vf31, vf15
|
||||
nop | mulax.xyzw ACC, vf28, vf16
|
||||
nop | madday.xyzw ACC, vf29, vf16
|
||||
nop | maddaz.xyzw ACC, vf30, vf16
|
||||
nop | maddw.xyzw vf16, vf31, vf16
|
||||
|
||||
;; normal scale
|
||||
nop | mul.xyzw vf09, vf09, Q
|
||||
nop | mul.xyzw vf10, vf10, Q
|
||||
nop | mul.xyzw vf11, vf11, Q
|
||||
|
||||
;; apply cam to normal matrix too
|
||||
nop | mulax.xyzw ACC, vf25, vf09
|
||||
nop | madday.xyzw ACC, vf26, vf09
|
||||
nop | maddz.xyzw vf09, vf27, vf09
|
||||
nop | mulax.xyzw ACC, vf25, vf10
|
||||
nop | madday.xyzw ACC, vf26, vf10
|
||||
nop | maddz.xyzw vf10, vf27, vf10
|
||||
nop | mulax.xyzw ACC, vf25, vf11
|
||||
nop | madday.xyzw ACC, vf26, vf11 :e
|
||||
nop | maddz.xyzw vf11, vf27, vf11
|
||||
```
|
|
@ -422,6 +422,141 @@
|
|||
|
||||
(def-mips2c bones-mtx-calc (function int pointer pointer int object none))
|
||||
|
||||
;; Expand to the two-instruction outer-product sequence that writes the
;; cross product of `a` and `b` (per the macro name) into `out`.
;; NOTE: the expansion names a register `acc` that is not a macro argument, so
;; this can only be used where the caller already has an `acc` vf register in
;; scope (the users in this file declare it with rlet).
(defmacro .cross.vf (out a b)
|
||||
`(begin
|
||||
;; first half: accumulate partial products of a and b into acc
(.outer.product.a.vf acc ,a ,b)
|
||||
;; second half: operands are swapped (b, a) and combined with acc into out
(.outer.product.b.vf ,out ,b ,a acc)
|
||||
)
|
||||
)
|
||||
|
||||
;; GOAL port of the skinning matrix calculation.
;; For each i in [0, count - 1) this reads bones[i + 1].transform and
;; joints[i].bind-pose and writes output[i + 1]:
;;   t-mtx = bind-pose combined with the bone transform, then with cam (4 rows)
;;   n-mtx = cross products of the first three t-mtx rows (before the camera
;;           multiply), scaled by 1 / dot(nmat0, tmat0), then combined with
;;           cam0..cam2 (3 rows)
;; Returns none.
;; NOTE(review): output[0] and bones[0] are never touched by this loop - confirm
;; that is intended by the caller.
(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
|
||||
"Compute skinning matrices."
|
||||
;; (declare (print-asm))
|
||||
(dotimes (i (- count 1))
|
||||
(let ((b (-> bones (+ i 1) transform))
|
||||
(j (-> joints i bind-pose))
|
||||
(out (-> output (+ i 1)))
|
||||
)
|
||||
(rlet (
|
||||
(tmat0 :class vf)
|
||||
(tmat1 :class vf)
|
||||
(tmat2 :class vf)
|
||||
(tmat3 :class vf)
|
||||
(nmat0 :class vf)
|
||||
(nmat1 :class vf)
|
||||
(nmat2 :class vf)
|
||||
(nmat3 :class vf)
|
||||
(acc :class vf )
|
||||
(vf0 :class vf )
|
||||
(cam0 :class vf )
|
||||
(cam1 :class vf )
|
||||
(cam2 :class vf )
|
||||
(cam3 :class vf )
|
||||
)
|
||||
|
||||
;; presumably sets vf0 to the constant (0, 0, 0, 1) vector; its w is the
;; numerator of the divide below - TODO confirm
(init-vf0-vector)
|
||||
|
||||
;; load bind-pose to tmat:
|
||||
(.lvf tmat0 (&-> j quad 0))
|
||||
(.lvf tmat1 (&-> j quad 1))
|
||||
(.lvf tmat2 (&-> j quad 2))
|
||||
(.lvf tmat3 (&-> j quad 3))
|
||||
|
||||
;; load bone to nmat
|
||||
;; (nmat0..nmat3 are only temporary storage for the bone here; nmat0..nmat2
;; are overwritten with the normal matrix further down)
(.lvf nmat0 (&-> b quad 0))
|
||||
(.lvf nmat1 (&-> b quad 1))
|
||||
(.lvf nmat2 (&-> b quad 2))
|
||||
(.lvf nmat3 (&-> b quad 3))
|
||||
|
||||
;; multiply bone and bind pose, store in tmat
|
||||
;; each group of four builds one output row in acc from the x/y/z/w
;; components of the bind-pose row, and the last instruction of the group
;; overwrites that bind-pose row in place
(.mul.x.vf acc nmat0 tmat0)
|
||||
(.add.mul.y.vf acc nmat1 tmat0 acc)
|
||||
(.add.mul.z.vf acc nmat2 tmat0 acc)
|
||||
(.add.mul.w.vf tmat0 nmat3 tmat0 acc)
|
||||
(.mul.x.vf acc nmat0 tmat1)
|
||||
(.add.mul.y.vf acc nmat1 tmat1 acc)
|
||||
(.add.mul.z.vf acc nmat2 tmat1 acc)
|
||||
(.add.mul.w.vf tmat1 nmat3 tmat1 acc)
|
||||
(.mul.x.vf acc nmat0 tmat2)
|
||||
(.add.mul.y.vf acc nmat1 tmat2 acc)
|
||||
(.add.mul.z.vf acc nmat2 tmat2 acc)
|
||||
(.add.mul.w.vf tmat2 nmat3 tmat2 acc)
|
||||
(.mul.x.vf acc nmat0 tmat3)
|
||||
(.add.mul.y.vf acc nmat1 tmat3 acc)
|
||||
(.add.mul.z.vf acc nmat2 tmat3 acc)
|
||||
(.add.mul.w.vf tmat3 nmat3 tmat3 acc)
|
||||
|
||||
;; compute inverse transpose, storing in nmat.
|
||||
;; (unscaled at this point; the 1/dot scale is applied after the divide)
(.cross.vf nmat0 tmat1 tmat2)
|
||||
(.cross.vf nmat1 tmat2 tmat0)
|
||||
(.cross.vf nmat2 tmat0 tmat1)
|
||||
|
||||
;; dot nmat0 and tmat0
|
||||
;; component-wise product, then y and z are added in under :mask #b1
;; (presumably the x lane only, leaving the dot product in acc.x)
(.mul.vf acc nmat0 tmat0)
|
||||
(.add.y.vf acc acc acc :mask #b1)
|
||||
(.add.z.vf acc acc acc :mask #b1)
|
||||
|
||||
;; divide!
|
||||
;; numerator is element #b11 of vf0, denominator is element #b0 of acc;
;; acc is both a source and the destination of this instruction
(.div.vf acc vf0 acc :fsf #b11 :ftf #b0)
|
||||
|
||||
;; scale nmat:
|
||||
(.mul.x.vf nmat0 nmat0 acc)
|
||||
(.mul.x.vf nmat1 nmat1 acc)
|
||||
(.mul.x.vf nmat2 nmat2 acc)
|
||||
|
||||
;; load camera
|
||||
;; NOTE(review): cam does not change inside the loop, so these four loads
;; are repeated every iteration - confirm whether hoisting is possible
(.lvf cam0 (&-> cam quad 0))
|
||||
(.lvf cam1 (&-> cam quad 1))
|
||||
(.lvf cam2 (&-> cam quad 2))
|
||||
(.lvf cam3 (&-> cam quad 3))
|
||||
|
||||
;; multiply tmat by camera
|
||||
(.mul.x.vf acc cam0 tmat0)
|
||||
(.add.mul.y.vf acc cam1 tmat0 acc)
|
||||
(.add.mul.z.vf acc cam2 tmat0 acc)
|
||||
(.add.mul.w.vf tmat0 cam3 tmat0 acc)
|
||||
(.mul.x.vf acc cam0 tmat1)
|
||||
(.add.mul.y.vf acc cam1 tmat1 acc)
|
||||
(.add.mul.z.vf acc cam2 tmat1 acc)
|
||||
(.add.mul.w.vf tmat1 cam3 tmat1 acc)
|
||||
(.mul.x.vf acc cam0 tmat2)
|
||||
(.add.mul.y.vf acc cam1 tmat2 acc)
|
||||
(.add.mul.z.vf acc cam2 tmat2 acc)
|
||||
(.add.mul.w.vf tmat2 cam3 tmat2 acc)
|
||||
(.mul.x.vf acc cam0 tmat3)
|
||||
(.add.mul.y.vf acc cam1 tmat3 acc)
|
||||
(.add.mul.z.vf acc cam2 tmat3 acc)
|
||||
(.add.mul.w.vf tmat3 cam3 tmat3 acc)
|
||||
|
||||
;; store tmat
|
||||
(.svf (&-> out t-mtx quad 0) tmat0)
|
||||
(.svf (&-> out t-mtx quad 1) tmat1)
|
||||
(.svf (&-> out t-mtx quad 2) tmat2)
|
||||
(.svf (&-> out t-mtx quad 3) tmat3)
|
||||
|
||||
;; multiply nmat
|
||||
;; only cam0..cam2 and the x/y/z components are used here; cam3 is not
(.mul.x.vf acc cam0 nmat0)
|
||||
(.add.mul.y.vf acc cam1 nmat0 acc)
|
||||
(.add.mul.z.vf nmat0 cam2 nmat0 acc)
|
||||
(.mul.x.vf acc cam0 nmat1)
|
||||
(.add.mul.y.vf acc cam1 nmat1 acc)
|
||||
(.add.mul.z.vf nmat1 cam2 nmat1 acc)
|
||||
(.mul.x.vf acc cam0 nmat2)
|
||||
(.add.mul.y.vf acc cam1 nmat2 acc)
|
||||
(.add.mul.z.vf nmat2 cam2 nmat2 acc)
|
||||
|
||||
;; store nmat
|
||||
(.svf (&-> out n-mtx quad 0) nmat0)
|
||||
(.svf (&-> out n-mtx quad 1) nmat1)
|
||||
(.svf (&-> out n-mtx quad 2) nmat2)
|
||||
)
|
||||
)
|
||||
)
|
||||
(none)
|
||||
)
|
||||
|
||||
(define *use-new-bones* #t)
|
||||
|
||||
(defun bones-mtx-calc-execute ()
|
||||
"Do all pending bone calculations"
|
||||
(local-vars (v1-14 float))
|
||||
|
@ -481,7 +616,13 @@
|
|||
(.mov v1-14 vf27)
|
||||
;; hack??
|
||||
|
||||
|
||||
(if *use-new-bones*
|
||||
(new-bones-mtx-calc-asm
|
||||
(the (inline-array pris-mtx) (-> s4-0 matrix-area))
|
||||
(-> s4-0 joints)
|
||||
(-> s4-0 bones)
|
||||
v1-13
|
||||
(the int (-> s4-0 num-bones)))
|
||||
(bones-mtx-calc
|
||||
(the-as int (-> s4-0 matrix-area))
|
||||
(the-as pointer (-> s4-0 joints))
|
||||
|
@ -490,6 +631,7 @@
|
|||
v1-13 ;; hack, added
|
||||
)
|
||||
)
|
||||
)
|
||||
(when (logtest? (-> s4-0 flags) (bone-calc-flags bncfl00))
|
||||
(let ((v1-18 (-> s4-0 matrix-area))
|
||||
(a0-22 (-> s4-0 num-bones))
|
||||
|
|
|
@ -120,16 +120,224 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
|
|||
;; see the C++ code for more details.
|
||||
(def-mips2c bones-mtx-calc (function (inline-array pris-mtx) (inline-array joint) (inline-array bone) uint object none))
|
||||
|
||||
;; Multiply each of the 12 floats in input's data array by x and store the
;; result at the same index of output. Returns output.
;; Each element is read and written independently, so output and input may be
;; the same matrix3 (new-bones-mtx-calc calls it that way).
(defun matrix-*float! ((output matrix3) (input matrix3) (x float))
|
||||
;; 12 = number of data elements scaled; assumes matrix3's data array has
;; exactly 12 floats - TODO confirm against the type definition
(dotimes (i 12)
|
||||
(set! (-> output data i) (* x (-> input data i)))
|
||||
)
|
||||
output
|
||||
)
|
||||
|
||||
;; Combine the three quads of arg1 with the first three quads of arg2 and store
;; three quads to arg0. Returns arg0.
;; In the docstring's terms: dst = arg0, src1 = arg1, src2 = arg2.
;; Quad 3 of arg2 is never loaded, and only the x/y/z components of each arg1
;; quad are used as multipliers.
(defun matrix*!-first-three ((arg0 matrix3) (arg1 matrix3) (arg2 matrix))
|
||||
"Set dst = src1 * src2. It is okay for any arguments to be the same data.
|
||||
This is a moderately efficient implementation."
|
||||
(rlet ((acc :class vf)
|
||||
(vf10 :class vf)
|
||||
(vf11 :class vf)
|
||||
(vf12 :class vf)
|
||||
(vf14 :class vf)
|
||||
(vf15 :class vf)
|
||||
(vf16 :class vf)
|
||||
(vf18 :class vf)
|
||||
(vf19 :class vf)
|
||||
(vf20 :class vf)
|
||||
)
|
||||
;; all six loads happen before any store, which is why the arguments may
;; alias each other
(.lvf vf10 (&-> arg1 quad 0))
|
||||
(.lvf vf14 (&-> arg2 quad 0))
|
||||
(.lvf vf15 (&-> arg2 quad 1))
|
||||
(.lvf vf16 (&-> arg2 quad 2))
|
||||
(.lvf vf11 (&-> arg1 quad 1))
|
||||
(.lvf vf12 (&-> arg1 quad 2))
|
||||
;; output quad 0, built from arg1 quad 0 (vf10)
(.mul.x.vf acc vf14 vf10)
|
||||
(.add.mul.y.vf acc vf15 vf10 acc)
|
||||
(.add.mul.z.vf vf18 vf16 vf10 acc)
|
||||
;; output quad 1, built from arg1 quad 1 (vf11)
(.mul.x.vf acc vf14 vf11)
|
||||
(.add.mul.y.vf acc vf15 vf11 acc)
|
||||
(.add.mul.z.vf vf19 vf16 vf11 acc)
|
||||
;; output quad 2, built from arg1 quad 2 (vf12)
(.mul.x.vf acc vf14 vf12)
|
||||
(.add.mul.y.vf acc vf15 vf12 acc)
|
||||
(.add.mul.z.vf vf20 vf16 vf12 acc)
|
||||
(.svf (&-> arg0 quad 0) vf18)
|
||||
(.svf (&-> arg0 quad 1) vf19)
|
||||
(.svf (&-> arg0 quad 2) vf20)
|
||||
arg0
|
||||
)
|
||||
)
|
||||
|
||||
;; Skinning matrix calculation written with the matrix/vector library functions
;; instead of vf-register assembly.
;; For each i in [0, count - 1) this reads bones[i + 1].transform and
;; joints[i].bind-pose and writes t-mtx and n-mtx of output[i + 1].
;; NOTE(review): the last step carries the author's "WRONG!!" marker, so do not
;; treat this function as a verified reference without checking that step.
(defun new-bones-mtx-calc ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
|
||||
(dotimes (i (- count 1))
|
||||
(let ((b (-> bones (+ i 1) transform))
|
||||
(j (-> joints i bind-pose))
|
||||
(out (-> output (+ i 1)))
|
||||
)
|
||||
|
||||
;; multiply by bind pose
|
||||
;; mult swaps the args
|
||||
(matrix*! (-> out t-mtx) j b)
|
||||
|
||||
;; clever way to compute inverse transpose of a 3x3:
|
||||
;; each n-mtx row is the cross product of the other two t-mtx rows
;; (1x2, 2x0, 0x1); the 1/dot scale below finishes it
(vector-cross! (-> out n-mtx vector 0)
|
||||
(-> out t-mtx vector 1)
|
||||
(-> out t-mtx vector 2)
|
||||
)
|
||||
(vector-cross! (-> out n-mtx vector 1)
|
||||
(-> out t-mtx vector 2)
|
||||
(-> out t-mtx vector 0)
|
||||
)
|
||||
(vector-cross! (-> out n-mtx vector 2)
|
||||
(-> out t-mtx vector 0)
|
||||
(-> out t-mtx vector 1)
|
||||
)
|
||||
;; scale = 1 / dot(n-mtx row 0, t-mtx row 0); n-mtx is scaled in place
(let ((scale (/ 1. (vector-dot (-> out n-mtx vector 0) (-> out t-mtx vector 0)))))
|
||||
(matrix-*float! (-> out n-mtx) (-> out n-mtx) scale)
|
||||
)
|
||||
|
||||
;; multiply by camera
|
||||
(matrix*! (-> out t-mtx) (-> out t-mtx) cam)
|
||||
(matrix*!-first-three (-> out n-mtx) (-> out n-mtx) cam) ;; WRONG!! (author's marker; the reason is not recorded here)
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
;; Expand to the two-instruction outer-product sequence that writes the
;; cross product of `a` and `b` (per the macro name) into `out`.
;; NOTE: the expansion names a register `acc` that is not a macro argument, so
;; this can only be used where the caller already has an `acc` vf register in
;; scope (the users in this file declare it with rlet).
(defmacro .cross.vf (out a b)
|
||||
`(begin
|
||||
;; first half: accumulate partial products of a and b into acc
(.outer.product.a.vf acc ,a ,b)
|
||||
;; second half: operands are swapped (b, a) and combined with acc into out
(.outer.product.b.vf ,out ,b ,a acc)
|
||||
)
|
||||
)
|
||||
|
||||
;; Skinning matrix calculation written with vf-register assembly forms.
;; For each i in [0, count - 1) this reads bones[i + 1].transform and
;; joints[i].bind-pose and writes output[i + 1]:
;;   t-mtx = bind-pose combined with the bone transform, then with cam (4 rows)
;;   n-mtx = cross products of the first three t-mtx rows (before the camera
;;           multiply), scaled by 1 / dot(nmat0, tmat0), then combined with
;;           cam0..cam2 (3 rows)
;; Returns none. bones-mtx-calc-execute calls this when *use-new-bones* is set.
;; NOTE(review): output[0] and bones[0] are never touched by this loop - confirm
;; that is intended by the caller.
(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
|
||||
;; (declare (print-asm))
|
||||
(dotimes (i (- count 1))
|
||||
(let ((b (-> bones (+ i 1) transform))
|
||||
(j (-> joints i bind-pose))
|
||||
(out (-> output (+ i 1)))
|
||||
)
|
||||
(rlet (
|
||||
(tmat0 :class vf)
|
||||
(tmat1 :class vf)
|
||||
(tmat2 :class vf)
|
||||
(tmat3 :class vf)
|
||||
(nmat0 :class vf)
|
||||
(nmat1 :class vf)
|
||||
(nmat2 :class vf)
|
||||
(nmat3 :class vf)
|
||||
(acc :class vf )
|
||||
(vf0 :class vf )
|
||||
(cam0 :class vf )
|
||||
(cam1 :class vf )
|
||||
(cam2 :class vf )
|
||||
(cam3 :class vf )
|
||||
)
|
||||
|
||||
;; presumably sets vf0 to the constant (0, 0, 0, 1) vector; its w is the
;; numerator of the divide below - TODO confirm
(init-vf0-vector)
|
||||
|
||||
;; load bind-pose to tmat:
|
||||
(.lvf tmat0 (&-> j quad 0))
|
||||
(.lvf tmat1 (&-> j quad 1))
|
||||
(.lvf tmat2 (&-> j quad 2))
|
||||
(.lvf tmat3 (&-> j quad 3))
|
||||
|
||||
;; load bone to nmat
|
||||
;; (nmat0..nmat3 are only temporary storage for the bone here; nmat0..nmat2
;; are overwritten with the normal matrix further down)
(.lvf nmat0 (&-> b quad 0))
|
||||
(.lvf nmat1 (&-> b quad 1))
|
||||
(.lvf nmat2 (&-> b quad 2))
|
||||
(.lvf nmat3 (&-> b quad 3))
|
||||
|
||||
;; multiply, store in tmat
|
||||
;; each group of four builds one output row in acc from the x/y/z/w
;; components of the bind-pose row, and the last instruction of the group
;; overwrites that bind-pose row in place
(.mul.x.vf acc nmat0 tmat0)
|
||||
(.add.mul.y.vf acc nmat1 tmat0 acc)
|
||||
(.add.mul.z.vf acc nmat2 tmat0 acc)
|
||||
(.add.mul.w.vf tmat0 nmat3 tmat0 acc)
|
||||
(.mul.x.vf acc nmat0 tmat1)
|
||||
(.add.mul.y.vf acc nmat1 tmat1 acc)
|
||||
(.add.mul.z.vf acc nmat2 tmat1 acc)
|
||||
(.add.mul.w.vf tmat1 nmat3 tmat1 acc)
|
||||
(.mul.x.vf acc nmat0 tmat2)
|
||||
(.add.mul.y.vf acc nmat1 tmat2 acc)
|
||||
(.add.mul.z.vf acc nmat2 tmat2 acc)
|
||||
(.add.mul.w.vf tmat2 nmat3 tmat2 acc)
|
||||
(.mul.x.vf acc nmat0 tmat3)
|
||||
(.add.mul.y.vf acc nmat1 tmat3 acc)
|
||||
(.add.mul.z.vf acc nmat2 tmat3 acc)
|
||||
(.add.mul.w.vf tmat3 nmat3 tmat3 acc)
|
||||
|
||||
;; compute inverse transpose, storing in nmat
|
||||
;; (unscaled at this point; the 1/dot scale is applied after the divide)
(.cross.vf nmat0 tmat1 tmat2)
|
||||
(.cross.vf nmat1 tmat2 tmat0)
|
||||
(.cross.vf nmat2 tmat0 tmat1)
|
||||
|
||||
;; dot nmat0 and tmat0
|
||||
;; component-wise product, then y and z are added in under :mask #b1
;; (presumably the x lane only, leaving the dot product in acc.x)
(.mul.vf acc nmat0 tmat0)
|
||||
(.add.y.vf acc acc acc :mask #b1)
|
||||
(.add.z.vf acc acc acc :mask #b1)
|
||||
|
||||
;; divide!
|
||||
;; numerator is element #b11 of vf0, denominator is element #b0 of acc;
;; acc is both a source and the destination of this instruction
(.div.vf acc vf0 acc :fsf #b11 :ftf #b0)
|
||||
|
||||
;; scale nmat:
|
||||
(.mul.x.vf nmat0 nmat0 acc)
|
||||
(.mul.x.vf nmat1 nmat1 acc)
|
||||
(.mul.x.vf nmat2 nmat2 acc)
|
||||
|
||||
;; load camera
|
||||
;; NOTE(review): cam does not change inside the loop, so these four loads
;; are repeated every iteration - confirm whether hoisting is possible
(.lvf cam0 (&-> cam quad 0))
|
||||
(.lvf cam1 (&-> cam quad 1))
|
||||
(.lvf cam2 (&-> cam quad 2))
|
||||
(.lvf cam3 (&-> cam quad 3))
|
||||
|
||||
;; multiply tmat by camera
|
||||
(.mul.x.vf acc cam0 tmat0)
|
||||
(.add.mul.y.vf acc cam1 tmat0 acc)
|
||||
(.add.mul.z.vf acc cam2 tmat0 acc)
|
||||
(.add.mul.w.vf tmat0 cam3 tmat0 acc)
|
||||
(.mul.x.vf acc cam0 tmat1)
|
||||
(.add.mul.y.vf acc cam1 tmat1 acc)
|
||||
(.add.mul.z.vf acc cam2 tmat1 acc)
|
||||
(.add.mul.w.vf tmat1 cam3 tmat1 acc)
|
||||
(.mul.x.vf acc cam0 tmat2)
|
||||
(.add.mul.y.vf acc cam1 tmat2 acc)
|
||||
(.add.mul.z.vf acc cam2 tmat2 acc)
|
||||
(.add.mul.w.vf tmat2 cam3 tmat2 acc)
|
||||
(.mul.x.vf acc cam0 tmat3)
|
||||
(.add.mul.y.vf acc cam1 tmat3 acc)
|
||||
(.add.mul.z.vf acc cam2 tmat3 acc)
|
||||
(.add.mul.w.vf tmat3 cam3 tmat3 acc)
|
||||
|
||||
;; store tmat
|
||||
(.svf (&-> out t-mtx quad 0) tmat0)
|
||||
(.svf (&-> out t-mtx quad 1) tmat1)
|
||||
(.svf (&-> out t-mtx quad 2) tmat2)
|
||||
(.svf (&-> out t-mtx quad 3) tmat3)
|
||||
|
||||
;; multiply nmat
|
||||
;; only cam0..cam2 and the x/y/z components are used here; cam3 is not
(.mul.x.vf acc cam0 nmat0)
|
||||
(.add.mul.y.vf acc cam1 nmat0 acc)
|
||||
(.add.mul.z.vf nmat0 cam2 nmat0 acc)
|
||||
(.mul.x.vf acc cam0 nmat1)
|
||||
(.add.mul.y.vf acc cam1 nmat1 acc)
|
||||
(.add.mul.z.vf nmat1 cam2 nmat1 acc)
|
||||
(.mul.x.vf acc cam0 nmat2)
|
||||
(.add.mul.y.vf acc cam1 nmat2 acc)
|
||||
(.add.mul.z.vf nmat2 cam2 nmat2 acc)
|
||||
|
||||
;; store nmat
|
||||
(.svf (&-> out n-mtx quad 0) nmat0)
|
||||
(.svf (&-> out n-mtx quad 1) nmat1)
|
||||
(.svf (&-> out n-mtx quad 2) nmat2)
|
||||
)
|
||||
)
|
||||
)
|
||||
(none)
|
||||
)
|
||||
|
||||
|
||||
(define *use-new-bones* #t)
|
||||
(define *display-bone-stats* #f)
|
||||
(define *num-bones* 0)
|
||||
|
||||
(defun bones-mtx-calc-execute ()
|
||||
"Execute all bone matrix calculations."
|
||||
(rlet ((vf1 :class vf)
|
||||
(vf25 :class vf)
|
||||
(vf26 :class vf)
|
||||
(vf27 :class vf)
|
||||
(vf28 :class vf)
|
||||
(vf29 :class vf)
|
||||
(vf30 :class vf)
|
||||
(vf31 :class vf)
|
||||
(vf4 :class vf)
|
||||
(vf5 :class vf)
|
||||
(vf6 :class vf)
|
||||
|
@ -157,6 +365,7 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
|
|||
(s5-1 (-> *math-camera* camera-rot))
|
||||
(s4-1 (-> v1-26 first))
|
||||
)
|
||||
(set! *num-bones* 0)
|
||||
(while (nonzero? s4-1) ;; loop
|
||||
|
||||
;; pick correct matrix. The pris-mtx includes camera rotation, so all that's needed in the final
|
||||
|
@ -167,15 +376,9 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
|
|||
)
|
||||
)
|
||||
)
|
||||
(.lvf vf28 (&-> v1-29 quad 0))
|
||||
(.lvf vf29 (&-> v1-29 quad 1))
|
||||
(.lvf vf30 (&-> v1-29 quad 2))
|
||||
(.lvf vf31 (&-> v1-29 trans quad))
|
||||
(.lvf vf25 (&-> v1-29 quad 0))
|
||||
(.lvf vf26 (&-> v1-29 quad 1))
|
||||
(.lvf vf27 (&-> v1-29 quad 2))
|
||||
|
||||
;; calculate!
|
||||
(if *use-new-bones*
|
||||
(new-bones-mtx-calc-asm (-> s4-1 matrix-area) (-> s4-1 joints) (-> s4-1 bones) v1-29 (the int (-> s4-1 num-bones)))
|
||||
(bones-mtx-calc
|
||||
(-> s4-1 matrix-area)
|
||||
(-> s4-1 joints)
|
||||
|
@ -184,6 +387,8 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
|
|||
v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way.
|
||||
)
|
||||
)
|
||||
)
|
||||
(+! *num-bones* (-> s4-1 num-bones))
|
||||
|
||||
;; there is an optional post-processing step for ripple.
|
||||
(when (logtest? (-> s4-1 flags) (bone-calc-flags write-ripple-data))
|
||||
|
@ -228,6 +433,10 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
|
|||
)
|
||||
)
|
||||
|
||||
(when *display-bone-stats*
|
||||
(format *stdcon* "num-bones: ~D~%" *num-bones*)
|
||||
)
|
||||
|
||||
;; reset sqwc
|
||||
;; (set! (-> (the-as dma-bank-control #x1000e000) sqwc) (new 'static 'dma-sqwc :sqwc #x1 :tqwc #x1))
|
||||
|
||||
|
|
|
@ -1233,17 +1233,17 @@ Val* Compiler::compile_asm_div_vf(const goos::Object& form, const goos::Object&
|
|||
// Why do we even bother using VDIVPS instead of FDIV? Because otherwise in x86, you have to use
|
||||
// the FPU stack. Registers are nicer.
|
||||
|
||||
// Save one temp reg, use the destination as one
|
||||
auto temp_reg = env->make_vfr(dest->type());
|
||||
auto temp_reg1 = env->make_vfr(dest->type());
|
||||
auto temp_reg2 = env->make_vfr(dest->type());
|
||||
|
||||
// Splat src1's value into the dest reg, keep it simple, this way no matter which vector component
|
||||
// Splat src1's value into a temp reg, keep it simple, this way no matter which vector component
|
||||
// is accessed from the final result will be the correct answer
|
||||
env->emit_ir<IR_SplatVF>(form, color, dest, src1, ftf_fsf_to_vector_element(fsf));
|
||||
env->emit_ir<IR_SplatVF>(form, color, temp_reg1, src1, ftf_fsf_to_vector_element(fsf));
|
||||
// Splat src2's value into the other temp reg
|
||||
env->emit_ir<IR_SplatVF>(form, color, temp_reg, src2, ftf_fsf_to_vector_element(ftf));
|
||||
env->emit_ir<IR_SplatVF>(form, color, temp_reg2, src2, ftf_fsf_to_vector_element(ftf));
|
||||
|
||||
// Perform the Division
|
||||
env->emit_ir<IR_VFMath3Asm>(form, color, dest, dest, temp_reg, IR_VFMath3Asm::Kind::DIV);
|
||||
env->emit_ir<IR_VFMath3Asm>(form, color, dest, temp_reg1, temp_reg2, IR_VFMath3Asm::Kind::DIV);
|
||||
return get_none();
|
||||
}
|
||||
|
||||
|
|
|
@ -332,7 +332,7 @@ const std::vector<emitter::Register>& get_alloc_order(int var_idx,
|
|||
if (is_gpr) {
|
||||
return REG_temp_first_order.gprs;
|
||||
} else {
|
||||
return REG_temp_only_order.xmms;
|
||||
return REG_temp_first_order.xmms;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1060,8 +1060,10 @@ bool run_assignment_on_var(const AllocationInput& input,
|
|||
for (auto& reg : assign_order) {
|
||||
bool worked = check_register_assign(input, *cache, var_idx, reg);
|
||||
if (trace) {
|
||||
lg::print("m2 trying var {} in {}: {}\n", cache->iregs.at(var_idx).to_string(),
|
||||
reg.print(), worked);
|
||||
const auto& this_var = cache->vars.at(var_idx);
|
||||
lg::print("m2 trying var {} in {} (live {} to {}): {}\n",
|
||||
cache->iregs.at(var_idx).to_string(), reg.print(), this_var.first_live(),
|
||||
this_var.last_live(), worked);
|
||||
}
|
||||
if (worked) {
|
||||
var.assign_to_register(reg);
|
||||
|
|
|
@ -24,7 +24,7 @@ void print_allocate_input(const AllocationInput& in) {
|
|||
}
|
||||
} else {
|
||||
for (const auto& instruction : in.instructions) {
|
||||
lg::print(" [{:3d}] {}\n", instruction.print());
|
||||
lg::print(" {}\n", instruction.print());
|
||||
}
|
||||
}
|
||||
lg::print("[RegAlloc] Debug Input Constraints:\n");
|
||||
|
|
Loading…
Reference in a new issue