mirror of
https://github.com/open-goal/jak-project.git
synced 2024-10-20 00:57:44 -04:00
Port bones.gc
math to GOAL (#3425)
Reverse engineer the skinning matrix calculation and port to GOAL. This is about 3x faster than the MIPS2c version. As usual, there is a `*use-new-bones*` flag to go back to the old version. Fix for a bug in the compiler's `.div.vf` implementation (only happens if src/dst are the same), and fix for a typo in the register allocator that would sometimes cause it not to consider xmm8-xmm15.
This commit is contained in:
parent
5a8b4e81f9
commit
82fb2cc26a
500
docs/progress-notes/bones.md
Normal file
500
docs/progress-notes/bones.md
Normal file
|
@ -0,0 +1,500 @@
|
||||||
|
The `bones.gc` file computes skinning matrices for foreground rendering.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
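- `a0`: output matrix area. Each output entry holds 7 quadwords containing the vertex (4 quadwords) and normal (3 quadwords) transformation matrices; the code below writes entries at a 128-byte stride and zeroes the eighth quadword.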
|
||||||
|
- `a1`: input joint array. The joints contain the inverse bind pose.
|
||||||
|
- `a2`: input bones array. These are the world space bone transforms.
|
||||||
|
- `a3`: num bones
|
||||||
|
|
||||||
|
- `vf28, vf29, vf30, vf31` the camera matrix
|
||||||
|
- `vf25, vf26, vf27` the first three quadwords of the camera matrix again (these are the copies applied to the normal matrix)
|
||||||
|
|
||||||
|
```asm
|
||||||
|
daddiu sp, sp, -96
|
||||||
|
sd ra, 0(sp)
|
||||||
|
sq s2, 16(sp)
|
||||||
|
sq s3, 32(sp)
|
||||||
|
sq s4, 48(sp)
|
||||||
|
sq s5, 64(sp)
|
||||||
|
sq gp, 80(sp)
|
||||||
|
|
||||||
|
lui v1, 4096
|
||||||
|
lui t0, 4096
|
||||||
|
ori v1, v1, 54272 ;; v1 = DMA reg addr
|
||||||
|
ori t0, t0, 53248 ;; t0 = DMA reg addr
|
||||||
|
lui t2, 32767 ;; 0x7fff....
|
||||||
|
daddiu t1, a3, -16 ;; bone count - 16 (maybe we do 16 bones at a time?)
|
||||||
|
ori t2, t2, 65535 ;; 0x7fff'ffff
|
||||||
|
lui at, 28672 ;; scratchpad addr
|
||||||
|
addiu t4, r0, 64 ;; t4 = 64 (= 16 bones * 4)
|
||||||
|
addiu t5, r0, 1280 ;; t5 = 1280 (= 16 bones * 80)
|
||||||
|
bgez t1, L17 ;; at least 16 bones? (t1 = bone count - 16, branch taken when >= 0)
|
||||||
|
addiu t3, r0, 16 ;; t3 = 16
|
||||||
|
|
||||||
|
;; if first run is under 16 bones, adjust counts
|
||||||
|
B1:
|
||||||
|
or t3, a3, r0 ;; t3 = num bones
|
||||||
|
sll r0, r0, 0
|
||||||
|
dsll t4, t3, 2 ;; t4 = num bones * 4
|
||||||
|
dsll a3, t3, 4 ;; a3 = num bones * 16
|
||||||
|
dsll t1, t3, 6 ;; t1 = num bones * 64
|
||||||
|
sll r0, r0, 0
|
||||||
|
daddu t5, t1, a3 ;; t5 = num bones * 80
|
||||||
|
addiu t1, r0, 0 ;; t1 = 0 (remaining bones count)
|
||||||
|
B2:
|
||||||
|
L17:
|
||||||
|
addiu a3, r0, 0 ;; a3 = 0
|
||||||
|
addiu t6, r0, 1 ;; t6 = 1
|
||||||
|
and a1, a1, t2 ;; mask off upper bits of address (not sure why, but they do this sometimes)
|
||||||
|
sll r0, r0, 0
|
||||||
|
daddiu a1, a1, 12 ;; adjustment of joint pointer for the strided dma stuff.
|
||||||
|
or a0, a0, r0
|
||||||
|
daddiu a1, a1, -80
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
;; wait for DMA to be free...
|
||||||
|
<snip>
|
||||||
|
|
||||||
|
B5:
|
||||||
|
L19:
|
||||||
|
addiu t6, r0, 80
|
||||||
|
addiu t7, r0, 264
|
||||||
|
sw t6, 128(v1) ;; addr in spad = 80 for joints
|
||||||
|
sw a1, 16(v1)
|
||||||
|
sw t4, 32(v1) ;; size: num bones * 4
|
||||||
|
sw t7, 0(v1)
|
||||||
|
daddu a1, a1, t5
|
||||||
|
;; wait for dma to complete
|
||||||
|
<snip>
|
||||||
|
|
||||||
|
B8:
|
||||||
|
L21:
|
||||||
|
and a2, a2, t2 ;; clean up bones addr
|
||||||
|
sll r0, r0, 0
|
||||||
|
dsll t2, t3, 2 ;; t2 = bones * 4
|
||||||
|
addiu t4, r0, 256 ;; t4 = 256
|
||||||
|
daddu t2, t2, t3 ;; t2 = bones * 5 (size of the bone)
|
||||||
|
addiu t6, r0, 1104 ;; addr in spad = 1104 for bones.
|
||||||
|
dsll t5, t2, 4
|
||||||
|
sw t6, 128(v1)
|
||||||
|
addiu t8, r0, 0
|
||||||
|
sw a2, 16(v1)
|
||||||
|
daddu a2, a2, t5
|
||||||
|
sw t2, 32(v1)
|
||||||
|
addiu t2, r0, 1
|
||||||
|
sw t4, 0(v1)
|
||||||
|
;; wait for dma
|
||||||
|
;; <snip>
|
||||||
|
|
||||||
|
B11:
|
||||||
|
L23:
|
||||||
|
dsll t5, t8, 2 ;; ?? not sure what this is, but always zero?
|
||||||
|
daddu t9, t5, at ;; ptr to bone-work
|
||||||
|
sll r0, r0, 0
|
||||||
|
lwu t5, 0(t9) ;; t5 = (-> bone-layout joint)
|
||||||
|
or t6, t3, r0
|
||||||
|
lwu t7, 8(t9) ;; t7 = (-> bone-layout bone)
|
||||||
|
or ra, t3, r0
|
||||||
|
lwu t3, 16(t9) ;; t3 = (-> bone-layout output)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sw ra, 44(at) ;; stash sp-size
|
||||||
|
beq ra, r0, L36
|
||||||
|
sw t8, 48(at) ;; stash sp-bufnum
|
||||||
|
|
||||||
|
B12:
|
||||||
|
daddiu t1, t1, -16 ;; decrement bones count
|
||||||
|
addiu t9, r0, 1280 ;; next DMA math stuff
|
||||||
|
bgez t1, L24 ;; check if partial bone buffer
|
||||||
|
addiu t8, r0, 16 ;; ....
|
||||||
|
|
||||||
|
B13:
|
||||||
|
daddiu t8, t1, 16
|
||||||
|
addiu t1, r0, 0
|
||||||
|
dsll t9, t8, 4
|
||||||
|
dsll ra, t8, 6
|
||||||
|
beq t8, r0, L25
|
||||||
|
daddu t9, ra, t9
|
||||||
|
|
||||||
|
B14:
|
||||||
|
L24:
|
||||||
|
dsll t4, t8, 2
|
||||||
|
dsll ra, t2, 2
|
||||||
|
daddu gp, ra, at
|
||||||
|
sw a1, 16(v1)
|
||||||
|
addiu ra, r0, 264
|
||||||
|
lwu gp, 0(gp)
|
||||||
|
andi gp, gp, 16383
|
||||||
|
sw t4, 32(v1)
|
||||||
|
daddu a1, a1, t9
|
||||||
|
sw gp, 128(v1)
|
||||||
|
addiu t4, r0, 0
|
||||||
|
sw ra, 0(v1)
|
||||||
|
|
||||||
|
;; and now, for the actual bones.
|
||||||
|
B15:
|
||||||
|
L25:
|
||||||
|
sll r0, r0, 0
|
||||||
|
sw t8, 40(at) ;; in-count
|
||||||
|
sll r0, r0, 0
|
||||||
|
lqc2 vf1, 0(t5) ;; vf1, vf2, vf3, vf4 = inverse bind pose
|
||||||
|
sll r0, r0, 0
|
||||||
|
lqc2 vf2, 16(t5)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lqc2 vf3, 32(t5)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lqc2 vf4, 48(t5)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lqc2 vf5, 0(t7) ;; vf5, vf6, vf7, vf8 = input bone matrix.
|
||||||
|
sll r0, r0, 0
|
||||||
|
lqc2 vf6, 16(t7)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lqc2 vf7, 32(t7)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lqc2 vf8, 48(t7)
|
||||||
|
vcallms 0 ;; run bone program
|
||||||
|
sll r0, r0, 0
|
||||||
|
B16:
|
||||||
|
L26:
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
daddiu t5, t5, 64 ;; advance joint
|
||||||
|
sll r0, r0, 0
|
||||||
|
daddiu t7, t7, 80 ;; advance bone.
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
lq t8, 0(t5) ;; load next joint
|
||||||
|
sll r0, r0, 0
|
||||||
|
lq t9, 16(t5)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lq ra, 32(t5)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lq gp, 48(t5)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lq s5, 0(t7) ;; load next bone
|
||||||
|
sll r0, r0, 0
|
||||||
|
lq s4, 16(t7)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lq s3, 32(t7)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lq s2, 48(t7)
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmtc2.ni vf1, t8 ;; swap in new inputs
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmtc2.ni vf2, t9
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmtc2.ni vf3, ra
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmtc2.ni vf4, gp
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmtc2.ni vf5, s5
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmtc2.ni vf6, s4
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmtc2.ni vf7, s3
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmtc2.ni vf8, s2
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmfc2.i t8, vf13 ;; swap out result in (vf13, vf14, vf15, vf16) and (vf9, vf10, vf11)
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmfc2.ni t9, vf14
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmfc2.ni ra, vf15
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmfc2.ni gp, vf16
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmfc2.ni s5, vf9
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmfc2.ni s4, vf10
|
||||||
|
sll r0, r0, 0
|
||||||
|
qmfc2.ni s3, vf11
|
||||||
|
vcallms 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sq t8, 0(t3)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sq t9, 16(t3)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sq ra, 32(t3)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sq gp, 48(t3)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sq s5, 64(t3)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sq s4, 80(t3)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sq s3, 96(t3)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sq r0, 112(t3)
|
||||||
|
daddiu t3, t3, 128
|
||||||
|
daddiu t6, t6, -1
|
||||||
|
bgtz t6, L26
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B17:
|
||||||
|
sll r0, r0, 0
|
||||||
|
lw t3, 40(at)
|
||||||
|
beq t3, r0, L29
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B18:
|
||||||
|
L27:
|
||||||
|
lw t4, 0(v1)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
andi t4, t4, 256
|
||||||
|
sll r0, r0, 0
|
||||||
|
beq t4, r0, L28
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B19:
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
beq r0, r0, L27
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B20:
|
||||||
|
L28:
|
||||||
|
dsll t5, t2, 2
|
||||||
|
sll r0, r0, 0
|
||||||
|
addiu t4, r0, 1
|
||||||
|
daddu t5, t5, at
|
||||||
|
sll r0, r0, 0
|
||||||
|
lwu t6, 8(t5)
|
||||||
|
dsll t5, t3, 2
|
||||||
|
andi t6, t6, 16383
|
||||||
|
daddu t5, t5, t3
|
||||||
|
sw t6, 128(v1)
|
||||||
|
dsll t6, t5, 4
|
||||||
|
sw a2, 16(v1)
|
||||||
|
addiu t7, r0, 256
|
||||||
|
sw t5, 32(v1)
|
||||||
|
daddu a2, a2, t6
|
||||||
|
sw t7, 0(v1)
|
||||||
|
B21:
|
||||||
|
L29:
|
||||||
|
sll r0, r0, 0
|
||||||
|
lw t5, 48(at)
|
||||||
|
sll r0, r0, 0
|
||||||
|
lw t6, 44(at)
|
||||||
|
B22:
|
||||||
|
L30:
|
||||||
|
lw t7, 0(t0)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
andi t7, t7, 256
|
||||||
|
sll r0, r0, 0
|
||||||
|
beq t7, r0, L31
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B23:
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
beq r0, r0, L30
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B24:
|
||||||
|
L31:
|
||||||
|
beq t6, r0, L32
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B25:
|
||||||
|
dsll t7, t5, 2
|
||||||
|
lui t8, 28672
|
||||||
|
daddu t7, t7, t8
|
||||||
|
lwu t7, 16(t7)
|
||||||
|
andi t7, t7, 16383
|
||||||
|
sw t7, 128(t0)
|
||||||
|
sw a0, 16(t0)
|
||||||
|
dsll t7, t6, 3
|
||||||
|
sw t7, 32(t0)
|
||||||
|
addiu t7, r0, 256
|
||||||
|
sw t7, 0(t0)
|
||||||
|
dsll t6, t6, 7
|
||||||
|
daddu a0, a0, t6
|
||||||
|
B26:
|
||||||
|
L32:
|
||||||
|
beq t3, r0, L35
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B27:
|
||||||
|
bne t4, r0, L35
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B28:
|
||||||
|
L33:
|
||||||
|
lw t6, 0(v1)
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
andi t6, t6, 256
|
||||||
|
sll r0, r0, 0
|
||||||
|
beq t6, r0, L34
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B29:
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
sll r0, r0, 0
|
||||||
|
beq r0, r0, L33
|
||||||
|
sll r0, r0, 0
|
||||||
|
|
||||||
|
B30:
|
||||||
|
L34:
|
||||||
|
dsll t6, t2, 2
|
||||||
|
lui t7, 28672
|
||||||
|
daddu t6, t6, t7
|
||||||
|
lwu t6, 8(t6)
|
||||||
|
andi t6, t6, 16383
|
||||||
|
sw t6, 128(v1)
|
||||||
|
sw a2, 16(v1)
|
||||||
|
addiu t6, r0, 5
|
||||||
|
mult3 t6, t6, t3
|
||||||
|
sw t6, 32(v1)
|
||||||
|
addiu t6, r0, 256
|
||||||
|
sw t6, 0(v1)
|
||||||
|
addiu t6, r0, 80
|
||||||
|
mult3 t6, t6, t3
|
||||||
|
daddu a2, a2, t6
|
||||||
|
B31:
|
||||||
|
L35:
|
||||||
|
or t8, t2, r0
|
||||||
|
bne t1, r0, L22
|
||||||
|
or t2, t5, r0
|
||||||
|
|
||||||
|
B32:
|
||||||
|
beq a3, r0, L22
|
||||||
|
addiu a3, r0, 1
|
||||||
|
|
||||||
|
B33:
|
||||||
|
L36:
|
||||||
|
or v0, r0, r0
|
||||||
|
ld ra, 0(sp)
|
||||||
|
lq gp, 80(sp)
|
||||||
|
lq s5, 64(sp)
|
||||||
|
lq s4, 48(sp)
|
||||||
|
lq s3, 32(sp)
|
||||||
|
lq s2, 16(sp)
|
||||||
|
jr ra
|
||||||
|
daddiu sp, sp, 96
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
# VU0 microprogram
|
||||||
|
|
||||||
|
- vf1, vf2, vf3, vf4 = inverse bind pose
|
||||||
|
- vf5, vf6, vf7, vf8 = input bone matrix.
|
||||||
|
- `vf28, vf29, vf30, vf31` the camera matrix
|
||||||
|
- `vf25, vf26, vf27` the first three quadwords of the camera matrix again (applied to the normal matrix at the end of the program)
|
||||||
|
- (vf13, vf14, vf15, vf16) output point transformation
|
||||||
|
- (vf09, vf10, vf11) output normal transformation
|
||||||
|
|
||||||
|
```
|
||||||
|
First: multiply bone and bind pose: (vf13, vf14, vf15, vf16) = (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4).
|
||||||
|
This is doing a true matrix multiplication.
|
||||||
|
nop | mulax.xyzw ACC, vf05, vf01
|
||||||
|
nop | madday.xyzw ACC, vf06, vf01
|
||||||
|
nop | maddaz.xyzw ACC, vf07, vf01
|
||||||
|
nop | maddw.xyzw vf13, vf08, vf01
|
||||||
|
nop | mulax.xyzw ACC, vf05, vf02
|
||||||
|
nop | madday.xyzw ACC, vf06, vf02
|
||||||
|
nop | maddaz.xyzw ACC, vf07, vf02
|
||||||
|
nop | maddw.xyzw vf14, vf08, vf02
|
||||||
|
nop | mulax.xyzw ACC, vf05, vf03
|
||||||
|
nop | madday.xyzw ACC, vf06, vf03
|
||||||
|
nop | maddaz.xyzw ACC, vf07, vf03
|
||||||
|
nop | maddw.xyzw vf15, vf08, vf03
|
||||||
|
nop | mulax.xyzw ACC, vf05, vf04
|
||||||
|
nop | madday.xyzw ACC, vf06, vf04
|
||||||
|
nop | maddaz.xyzw ACC, vf07, vf04
|
||||||
|
nop | maddw.xyzw vf16, vf08, vf04
|
||||||
|
|
||||||
|
;; vf09 = cross(y, z)
|
||||||
|
nop | opmula.xyz ACC, vf14, vf15
|
||||||
|
nop | opmsub.xyz vf09, vf15, vf14
|
||||||
|
|
||||||
|
;; vf10 = cross(z, x)
|
||||||
|
nop | opmula.xyz ACC, vf15, vf13
|
||||||
|
nop | opmsub.xyz vf10, vf13, vf15
|
||||||
|
|
||||||
|
;; vf11 = cross(x, y)
|
||||||
|
nop | opmula.xyz ACC, vf13, vf14
|
||||||
|
nop | opmsub.xyz vf11, vf14, vf13
|
||||||
|
|
||||||
|
;; vf12 = cross (y, z) * x
|
||||||
|
nop | mul.xyz vf12, vf13, vf09
|
||||||
|
|
||||||
|
;; second multiply: doing (vf13....) = cam * (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4)
|
||||||
|
nop | mulax.xyzw ACC, vf28, vf13
|
||||||
|
nop | madday.xyzw ACC, vf29, vf13
|
||||||
|
nop | maddaz.xyzw ACC, vf30, vf13
|
||||||
|
nop | maddw.xyzw vf13, vf31, vf13
|
||||||
|
|
||||||
|
nop | mulax.w ACC, vf00, vf12
|
||||||
|
nop | madday.w ACC, vf00, vf12
|
||||||
|
nop | maddz.w vf12, vf00, vf12
|
||||||
|
vf12.w = dot (cross(y, z), x) [before the second multiply]
|
||||||
|
|
||||||
|
nop | mulax.xyzw ACC, vf28, vf14
|
||||||
|
nop | madday.xyzw ACC, vf29, vf14
|
||||||
|
nop | maddaz.xyzw ACC, vf30, vf14
|
||||||
|
div Q, vf00.w, vf12.w | maddw.xyzw vf14, vf31, vf14 ;; divide
|
||||||
|
nop | mulax.xyzw ACC, vf28, vf15
|
||||||
|
nop | madday.xyzw ACC, vf29, vf15
|
||||||
|
nop | maddaz.xyzw ACC, vf30, vf15
|
||||||
|
nop | maddw.xyzw vf15, vf31, vf15
|
||||||
|
nop | mulax.xyzw ACC, vf28, vf16
|
||||||
|
nop | madday.xyzw ACC, vf29, vf16
|
||||||
|
nop | maddaz.xyzw ACC, vf30, vf16
|
||||||
|
nop | maddw.xyzw vf16, vf31, vf16
|
||||||
|
|
||||||
|
;; normal scale
|
||||||
|
nop | mul.xyzw vf09, vf09, Q
|
||||||
|
nop | mul.xyzw vf10, vf10, Q
|
||||||
|
nop | mul.xyzw vf11, vf11, Q
|
||||||
|
|
||||||
|
;; apply cam to normal matrix too
|
||||||
|
nop | mulax.xyzw ACC, vf25, vf09
|
||||||
|
nop | madday.xyzw ACC, vf26, vf09
|
||||||
|
nop | maddz.xyzw vf09, vf27, vf09
|
||||||
|
nop | mulax.xyzw ACC, vf25, vf10
|
||||||
|
nop | madday.xyzw ACC, vf26, vf10
|
||||||
|
nop | maddz.xyzw vf10, vf27, vf10
|
||||||
|
nop | mulax.xyzw ACC, vf25, vf11
|
||||||
|
nop | madday.xyzw ACC, vf26, vf11 :e
|
||||||
|
nop | maddz.xyzw vf11, vf27, vf11
|
||||||
|
```
|
|
@ -422,6 +422,141 @@
|
||||||
|
|
||||||
(def-mips2c bones-mtx-calc (function int pointer pointer int object none))
|
(def-mips2c bones-mtx-calc (function int pointer pointer int object none))
|
||||||
|
|
||||||
|
;; Expands to the two-instruction outer-product sequence that leaves the
;; cross product of a and b in out (see the opmula/opmsub pairs in the VU0 listing).
;; NOTE: the expansion refers to a register named acc that is NOT a macro
;; parameter - the call site must have a vf-class register called acc in scope,
;; and its contents are overwritten.
(defmacro .cross.vf (out a b)
  `(begin (.outer.product.a.vf acc ,a ,b)
          (.outer.product.b.vf ,out ,b ,a acc)))
|
||||||
|
|
||||||
|
(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
  "Compute skinning matrices.
   For each i in [0, count - 1) this combines (-> joints i bind-pose) with
   (-> bones (+ i 1) transform) and cam, then stores the t-mtx (4 quads) and
   n-mtx (3 quads) of (-> output (+ i 1)). Element 0 of output is never written.
   Nothing is read or written when count is 1 or less."
  ;; (declare (print-asm))
  (when (> count 1)
    (rlet ((tmat0 :class vf)
           (tmat1 :class vf)
           (tmat2 :class vf)
           (tmat3 :class vf)
           (nmat0 :class vf)
           (nmat1 :class vf)
           (nmat2 :class vf)
           (nmat3 :class vf)
           (acc :class vf)   ;; accumulator; also clobbered implicitly by .cross.vf
           (vf0 :class vf)   ;; filled in by init-vf0-vector
           (cam0 :class vf)
           (cam1 :class vf)
           (cam2 :class vf)
           (cam3 :class vf)
           )

      ;; Loop-invariant setup, done once instead of once per bone:
      ;; nothing inside the loop writes vf0 or cam0..cam3.
      (init-vf0-vector)

      ;; load camera
      (.lvf cam0 (&-> cam quad 0))
      (.lvf cam1 (&-> cam quad 1))
      (.lvf cam2 (&-> cam quad 2))
      (.lvf cam3 (&-> cam quad 3))

      (dotimes (i (- count 1))
        (let ((b (-> bones (+ i 1) transform))
              (j (-> joints i bind-pose))
              (out (-> output (+ i 1)))
              )

          ;; load bind-pose to tmat:
          (.lvf tmat0 (&-> j quad 0))
          (.lvf tmat1 (&-> j quad 1))
          (.lvf tmat2 (&-> j quad 2))
          (.lvf tmat3 (&-> j quad 3))

          ;; load bone to nmat
          (.lvf nmat0 (&-> b quad 0))
          (.lvf nmat1 (&-> b quad 1))
          (.lvf nmat2 (&-> b quad 2))
          (.lvf nmat3 (&-> b quad 3))

          ;; multiply bone and bind pose, store in tmat.
          ;; each tmat quad is replaced by the combination of the four bone quads
          ;; weighted by its own x/y/z/w, so it is only overwritten by the final op of its group.
          (.mul.x.vf acc nmat0 tmat0)
          (.add.mul.y.vf acc nmat1 tmat0 acc)
          (.add.mul.z.vf acc nmat2 tmat0 acc)
          (.add.mul.w.vf tmat0 nmat3 tmat0 acc)
          (.mul.x.vf acc nmat0 tmat1)
          (.add.mul.y.vf acc nmat1 tmat1 acc)
          (.add.mul.z.vf acc nmat2 tmat1 acc)
          (.add.mul.w.vf tmat1 nmat3 tmat1 acc)
          (.mul.x.vf acc nmat0 tmat2)
          (.add.mul.y.vf acc nmat1 tmat2 acc)
          (.add.mul.z.vf acc nmat2 tmat2 acc)
          (.add.mul.w.vf tmat2 nmat3 tmat2 acc)
          (.mul.x.vf acc nmat0 tmat3)
          (.add.mul.y.vf acc nmat1 tmat3 acc)
          (.add.mul.z.vf acc nmat2 tmat3 acc)
          (.add.mul.w.vf tmat3 nmat3 tmat3 acc)

          ;; compute inverse transpose, storing in nmat.
          ;; the bone quads in nmat0..2 are no longer needed and are reused here.
          (.cross.vf nmat0 tmat1 tmat2)
          (.cross.vf nmat1 tmat2 tmat0)
          (.cross.vf nmat2 tmat0 tmat1)

          ;; dot nmat0 and tmat0: per-lane products, then y and z are folded into
          ;; the lane selected by :mask #b1 (the lane the divide below reads).
          (.mul.vf acc nmat0 tmat0)
          (.add.y.vf acc acc acc :mask #b1)
          (.add.z.vf acc acc acc :mask #b1)

          ;; divide! acc = (component 3 of vf0) / (component 0 of acc),
          ;; i.e. the reciprocal of the dot product assuming vf0.w is 1.
          (.div.vf acc vf0 acc :fsf #b11 :ftf #b0)

          ;; scale nmat:
          (.mul.x.vf nmat0 nmat0 acc)
          (.mul.x.vf nmat1 nmat1 acc)
          (.mul.x.vf nmat2 nmat2 acc)

          ;; multiply tmat by camera
          (.mul.x.vf acc cam0 tmat0)
          (.add.mul.y.vf acc cam1 tmat0 acc)
          (.add.mul.z.vf acc cam2 tmat0 acc)
          (.add.mul.w.vf tmat0 cam3 tmat0 acc)
          (.mul.x.vf acc cam0 tmat1)
          (.add.mul.y.vf acc cam1 tmat1 acc)
          (.add.mul.z.vf acc cam2 tmat1 acc)
          (.add.mul.w.vf tmat1 cam3 tmat1 acc)
          (.mul.x.vf acc cam0 tmat2)
          (.add.mul.y.vf acc cam1 tmat2 acc)
          (.add.mul.z.vf acc cam2 tmat2 acc)
          (.add.mul.w.vf tmat2 cam3 tmat2 acc)
          (.mul.x.vf acc cam0 tmat3)
          (.add.mul.y.vf acc cam1 tmat3 acc)
          (.add.mul.z.vf acc cam2 tmat3 acc)
          (.add.mul.w.vf tmat3 cam3 tmat3 acc)

          ;; store tmat
          (.svf (&-> out t-mtx quad 0) tmat0)
          (.svf (&-> out t-mtx quad 1) tmat1)
          (.svf (&-> out t-mtx quad 2) tmat2)
          (.svf (&-> out t-mtx quad 3) tmat3)

          ;; multiply nmat by the first three camera quads (cam3 is not used for normals)
          (.mul.x.vf acc cam0 nmat0)
          (.add.mul.y.vf acc cam1 nmat0 acc)
          (.add.mul.z.vf nmat0 cam2 nmat0 acc)
          (.mul.x.vf acc cam0 nmat1)
          (.add.mul.y.vf acc cam1 nmat1 acc)
          (.add.mul.z.vf nmat1 cam2 nmat1 acc)
          (.mul.x.vf acc cam0 nmat2)
          (.add.mul.y.vf acc cam1 nmat2 acc)
          (.add.mul.z.vf nmat2 cam2 nmat2 acc)

          ;; store nmat
          (.svf (&-> out n-mtx quad 0) nmat0)
          (.svf (&-> out n-mtx quad 1) nmat1)
          (.svf (&-> out n-mtx quad 2) nmat2)
          )
        )
      )
    )
  (none)
  )
|
||||||
|
|
||||||
|
;; When true, bones-mtx-calc-execute calls new-bones-mtx-calc-asm;
;; when false it falls back to the mips2c bones-mtx-calc.
(define *use-new-bones* #t)
|
||||||
|
|
||||||
(defun bones-mtx-calc-execute ()
|
(defun bones-mtx-calc-execute ()
|
||||||
"Do all pending bone calculations"
|
"Do all pending bone calculations"
|
||||||
(local-vars (v1-14 float))
|
(local-vars (v1-14 float))
|
||||||
|
@ -481,13 +616,20 @@
|
||||||
(.mov v1-14 vf27)
|
(.mov v1-14 vf27)
|
||||||
;; hack??
|
;; hack??
|
||||||
|
|
||||||
|
(if *use-new-bones*
|
||||||
(bones-mtx-calc
|
(new-bones-mtx-calc-asm
|
||||||
(the-as int (-> s4-0 matrix-area))
|
(the (inline-array pris-mtx) (-> s4-0 matrix-area))
|
||||||
(the-as pointer (-> s4-0 joints))
|
(-> s4-0 joints)
|
||||||
(the-as pointer (-> s4-0 bones))
|
(-> s4-0 bones)
|
||||||
(the-as int (-> s4-0 num-bones))
|
v1-13
|
||||||
v1-13 ;; hack, added
|
(the int (-> s4-0 num-bones)))
|
||||||
|
(bones-mtx-calc
|
||||||
|
(the-as int (-> s4-0 matrix-area))
|
||||||
|
(the-as pointer (-> s4-0 joints))
|
||||||
|
(the-as pointer (-> s4-0 bones))
|
||||||
|
(the-as int (-> s4-0 num-bones))
|
||||||
|
v1-13 ;; hack, added
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
(when (logtest? (-> s4-0 flags) (bone-calc-flags bncfl00))
|
(when (logtest? (-> s4-0 flags) (bone-calc-flags bncfl00))
|
||||||
|
|
|
@ -120,16 +120,224 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
|
||||||
;; see the C++ code for more details.
|
;; see the C++ code for more details.
|
||||||
(def-mips2c bones-mtx-calc (function (inline-array pris-mtx) (inline-array joint) (inline-array bone) uint object none))
|
(def-mips2c bones-mtx-calc (function (inline-array pris-mtx) (inline-array joint) (inline-array bone) uint object none))
|
||||||
|
|
||||||
|
(defun matrix-*float! ((output matrix3) (input matrix3) (x float))
  "Write x times each of the 12 data floats of input into output, and return output.
   output and input may be the same matrix."
  (let ((idx 0))
    (while (< idx 12)
      (set! (-> output data idx) (* x (-> input data idx)))
      (+! idx 1)
      )
    )
  output
  )
|
||||||
|
|
||||||
|
(defun matrix*!-first-three ((arg0 matrix3) (arg1 matrix3) (arg2 matrix))
  "Set arg0 = arg1 * arg2, using only the first three quads of arg2, and return arg0.
   Output quad k is (arg2 quad 0) * x + (arg2 quad 1) * y + (arg2 quad 2) * z,
   where x/y/z are the components of arg1 quad k.
   All inputs are loaded before anything is stored, so the arguments may be the same data."
  (rlet ((acc :class vf)
         (lhs0 :class vf)
         (lhs1 :class vf)
         (lhs2 :class vf)
         (rhs0 :class vf)
         (rhs1 :class vf)
         (rhs2 :class vf)
         (res0 :class vf)
         (res1 :class vf)
         (res2 :class vf)
         )
    ;; left operand: the three quads of arg1
    (.lvf lhs0 (&-> arg1 quad 0))
    (.lvf lhs1 (&-> arg1 quad 1))
    (.lvf lhs2 (&-> arg1 quad 2))

    ;; right operand: the first three quads of arg2
    (.lvf rhs0 (&-> arg2 quad 0))
    (.lvf rhs1 (&-> arg2 quad 1))
    (.lvf rhs2 (&-> arg2 quad 2))

    ;; result quad 0
    (.mul.x.vf acc rhs0 lhs0)
    (.add.mul.y.vf acc rhs1 lhs0 acc)
    (.add.mul.z.vf res0 rhs2 lhs0 acc)

    ;; result quad 1
    (.mul.x.vf acc rhs0 lhs1)
    (.add.mul.y.vf acc rhs1 lhs1 acc)
    (.add.mul.z.vf res1 rhs2 lhs1 acc)

    ;; result quad 2
    (.mul.x.vf acc rhs0 lhs2)
    (.add.mul.y.vf acc rhs1 lhs2 acc)
    (.add.mul.z.vf res2 rhs2 lhs2 acc)

    ;; write back only after every input has been consumed
    (.svf (&-> arg0 quad 0) res0)
    (.svf (&-> arg0 quad 1) res1)
    (.svf (&-> arg0 quad 2) res2)
    arg0
    )
  )
|
||||||
|
|
||||||
|
(defun new-bones-mtx-calc ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
  "Skinning matrix calculation written with ordinary matrix/vector helpers instead of inline vector ops.
   For each idx in [0, count - 1): reads (-> joints idx bind-pose) and (-> bones (+ idx 1) transform),
   and fills in the t-mtx and n-mtx of (-> output (+ idx 1))."
  (dotimes (idx (- count 1))
    (let ((dst (-> output (+ idx 1))))
      (let ((tm (-> dst t-mtx))
            (nm (-> dst n-mtx))
            )

        ;; bind pose times bone transform. matrix*! takes its operands in the
        ;; opposite order from how the product is usually written, hence (bind-pose, bone).
        (matrix*! tm (-> joints idx bind-pose) (-> bones (+ idx 1) transform))

        ;; inverse transpose of the 3x3 part via cross products of the other two vectors...
        (vector-cross! (-> nm vector 0) (-> tm vector 1) (-> tm vector 2))
        (vector-cross! (-> nm vector 1) (-> tm vector 2) (-> tm vector 0))
        (vector-cross! (-> nm vector 2) (-> tm vector 0) (-> tm vector 1))

        ;; ...divided by dot(cross(v1, v2), v0)
        (matrix-*float! nm nm (/ 1. (vector-dot (-> nm vector 0) (-> tm vector 0))))

        ;; apply the camera to both matrices
        (matrix*! tm tm cam)
        ;; NOTE(review): the original author marked this step "WRONG!!"; the reason is not recorded here.
        (matrix*!-first-three nm nm cam)
        )
      )
    )
  )
|
||||||
|
|
||||||
|
;; Expands to the two-instruction outer-product sequence that leaves the
;; cross product of a and b in out.
;; NOTE: the expansion refers to a register named acc that is NOT a macro
;; parameter - the call site must have a vf-class register called acc in scope,
;; and its contents are overwritten.
(defmacro .cross.vf (out a b)
  `(begin (.outer.product.a.vf acc ,a ,b)
          (.outer.product.b.vf ,out ,b ,a acc)))
|
||||||
|
|
||||||
|
(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
  "Compute skinning matrices with inline vector ops.
   For each i in [0, count - 1) this combines (-> joints i bind-pose) with
   (-> bones (+ i 1) transform) and cam, then stores the t-mtx (4 quads) and
   n-mtx (3 quads) of (-> output (+ i 1)). Element 0 of output is never written.
   Nothing is read or written when count is 1 or less."
  ;; (declare (print-asm))
  (when (> count 1)
    (rlet ((tmat0 :class vf)
           (tmat1 :class vf)
           (tmat2 :class vf)
           (tmat3 :class vf)
           (nmat0 :class vf)
           (nmat1 :class vf)
           (nmat2 :class vf)
           (nmat3 :class vf)
           (acc :class vf)   ;; accumulator; also clobbered implicitly by .cross.vf
           (vf0 :class vf)   ;; filled in by init-vf0-vector
           (cam0 :class vf)
           (cam1 :class vf)
           (cam2 :class vf)
           (cam3 :class vf)
           )

      ;; Loop-invariant setup, done once instead of once per bone:
      ;; nothing inside the loop writes vf0 or cam0..cam3.
      (init-vf0-vector)

      ;; load camera
      (.lvf cam0 (&-> cam quad 0))
      (.lvf cam1 (&-> cam quad 1))
      (.lvf cam2 (&-> cam quad 2))
      (.lvf cam3 (&-> cam quad 3))

      (dotimes (i (- count 1))
        (let ((b (-> bones (+ i 1) transform))
              (j (-> joints i bind-pose))
              (out (-> output (+ i 1)))
              )

          ;; load bind-pose to tmat:
          (.lvf tmat0 (&-> j quad 0))
          (.lvf tmat1 (&-> j quad 1))
          (.lvf tmat2 (&-> j quad 2))
          (.lvf tmat3 (&-> j quad 3))

          ;; load bone to nmat
          (.lvf nmat0 (&-> b quad 0))
          (.lvf nmat1 (&-> b quad 1))
          (.lvf nmat2 (&-> b quad 2))
          (.lvf nmat3 (&-> b quad 3))

          ;; multiply bone and bind pose, store in tmat.
          ;; each tmat quad is replaced by the combination of the four bone quads
          ;; weighted by its own x/y/z/w, so it is only overwritten by the final op of its group.
          (.mul.x.vf acc nmat0 tmat0)
          (.add.mul.y.vf acc nmat1 tmat0 acc)
          (.add.mul.z.vf acc nmat2 tmat0 acc)
          (.add.mul.w.vf tmat0 nmat3 tmat0 acc)
          (.mul.x.vf acc nmat0 tmat1)
          (.add.mul.y.vf acc nmat1 tmat1 acc)
          (.add.mul.z.vf acc nmat2 tmat1 acc)
          (.add.mul.w.vf tmat1 nmat3 tmat1 acc)
          (.mul.x.vf acc nmat0 tmat2)
          (.add.mul.y.vf acc nmat1 tmat2 acc)
          (.add.mul.z.vf acc nmat2 tmat2 acc)
          (.add.mul.w.vf tmat2 nmat3 tmat2 acc)
          (.mul.x.vf acc nmat0 tmat3)
          (.add.mul.y.vf acc nmat1 tmat3 acc)
          (.add.mul.z.vf acc nmat2 tmat3 acc)
          (.add.mul.w.vf tmat3 nmat3 tmat3 acc)

          ;; compute inverse transpose, storing in nmat.
          ;; the bone quads in nmat0..2 are no longer needed and are reused here.
          (.cross.vf nmat0 tmat1 tmat2)
          (.cross.vf nmat1 tmat2 tmat0)
          (.cross.vf nmat2 tmat0 tmat1)

          ;; dot nmat0 and tmat0: per-lane products, then y and z are folded into
          ;; the lane selected by :mask #b1 (the lane the divide below reads).
          (.mul.vf acc nmat0 tmat0)
          (.add.y.vf acc acc acc :mask #b1)
          (.add.z.vf acc acc acc :mask #b1)

          ;; divide! acc = (component 3 of vf0) / (component 0 of acc),
          ;; i.e. the reciprocal of the dot product assuming vf0.w is 1.
          (.div.vf acc vf0 acc :fsf #b11 :ftf #b0)

          ;; scale nmat:
          (.mul.x.vf nmat0 nmat0 acc)
          (.mul.x.vf nmat1 nmat1 acc)
          (.mul.x.vf nmat2 nmat2 acc)

          ;; multiply tmat by camera
          (.mul.x.vf acc cam0 tmat0)
          (.add.mul.y.vf acc cam1 tmat0 acc)
          (.add.mul.z.vf acc cam2 tmat0 acc)
          (.add.mul.w.vf tmat0 cam3 tmat0 acc)
          (.mul.x.vf acc cam0 tmat1)
          (.add.mul.y.vf acc cam1 tmat1 acc)
          (.add.mul.z.vf acc cam2 tmat1 acc)
          (.add.mul.w.vf tmat1 cam3 tmat1 acc)
          (.mul.x.vf acc cam0 tmat2)
          (.add.mul.y.vf acc cam1 tmat2 acc)
          (.add.mul.z.vf acc cam2 tmat2 acc)
          (.add.mul.w.vf tmat2 cam3 tmat2 acc)
          (.mul.x.vf acc cam0 tmat3)
          (.add.mul.y.vf acc cam1 tmat3 acc)
          (.add.mul.z.vf acc cam2 tmat3 acc)
          (.add.mul.w.vf tmat3 cam3 tmat3 acc)

          ;; store tmat
          (.svf (&-> out t-mtx quad 0) tmat0)
          (.svf (&-> out t-mtx quad 1) tmat1)
          (.svf (&-> out t-mtx quad 2) tmat2)
          (.svf (&-> out t-mtx quad 3) tmat3)

          ;; multiply nmat by the first three camera quads (cam3 is not used for normals)
          (.mul.x.vf acc cam0 nmat0)
          (.add.mul.y.vf acc cam1 nmat0 acc)
          (.add.mul.z.vf nmat0 cam2 nmat0 acc)
          (.mul.x.vf acc cam0 nmat1)
          (.add.mul.y.vf acc cam1 nmat1 acc)
          (.add.mul.z.vf nmat1 cam2 nmat1 acc)
          (.mul.x.vf acc cam0 nmat2)
          (.add.mul.y.vf acc cam1 nmat2 acc)
          (.add.mul.z.vf nmat2 cam2 nmat2 acc)

          ;; store nmat
          (.svf (&-> out n-mtx quad 0) nmat0)
          (.svf (&-> out n-mtx quad 1) nmat1)
          (.svf (&-> out n-mtx quad 2) nmat2)
          )
        )
      )
    )
  (none)
  )
|
||||||
|
|
||||||
|
|
||||||
|
;; When true, bones-mtx-calc-execute calls new-bones-mtx-calc-asm;
;; when false it falls back to the mips2c bones-mtx-calc.
(define *use-new-bones* #t)
|
||||||
|
;; When true, bones-mtx-calc-execute prints the accumulated *num-bones* to *stdcon*.
(define *display-bone-stats* #f)
|
||||||
|
;; Running bone total: bones-mtx-calc-execute resets it to 0, then adds num-bones of each calculation it processes.
(define *num-bones* 0)
|
||||||
|
|
||||||
(defun bones-mtx-calc-execute ()
|
(defun bones-mtx-calc-execute ()
|
||||||
"Execute all bone matrix calculations."
|
"Execute all bone matrix calculations."
|
||||||
(rlet ((vf1 :class vf)
|
(rlet ((vf1 :class vf)
|
||||||
(vf25 :class vf)
|
|
||||||
(vf26 :class vf)
|
|
||||||
(vf27 :class vf)
|
|
||||||
(vf28 :class vf)
|
|
||||||
(vf29 :class vf)
|
|
||||||
(vf30 :class vf)
|
|
||||||
(vf31 :class vf)
|
|
||||||
(vf4 :class vf)
|
(vf4 :class vf)
|
||||||
(vf5 :class vf)
|
(vf5 :class vf)
|
||||||
(vf6 :class vf)
|
(vf6 :class vf)
|
||||||
|
@ -157,6 +365,7 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
|
||||||
(s5-1 (-> *math-camera* camera-rot))
|
(s5-1 (-> *math-camera* camera-rot))
|
||||||
(s4-1 (-> v1-26 first))
|
(s4-1 (-> v1-26 first))
|
||||||
)
|
)
|
||||||
|
(set! *num-bones* 0)
|
||||||
(while (nonzero? s4-1) ;; loop
|
(while (nonzero? s4-1) ;; loop
|
||||||
|
|
||||||
;; pick correct matrix. The pris-mtx includes camera rotation, so all that's needed in the final
|
;; pick correct matrix. The pris-mtx includes camera rotation, so all that's needed in the final
|
||||||
|
@ -167,23 +376,19 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
(.lvf vf28 (&-> v1-29 quad 0))
|
|
||||||
(.lvf vf29 (&-> v1-29 quad 1))
|
|
||||||
(.lvf vf30 (&-> v1-29 quad 2))
|
|
||||||
(.lvf vf31 (&-> v1-29 trans quad))
|
|
||||||
(.lvf vf25 (&-> v1-29 quad 0))
|
|
||||||
(.lvf vf26 (&-> v1-29 quad 1))
|
|
||||||
(.lvf vf27 (&-> v1-29 quad 2))
|
|
||||||
|
|
||||||
;; calculate!
|
;; calculate!
|
||||||
(bones-mtx-calc
|
(if *use-new-bones*
|
||||||
(-> s4-1 matrix-area)
|
(new-bones-mtx-calc-asm (-> s4-1 matrix-area) (-> s4-1 joints) (-> s4-1 bones) v1-29 (the int (-> s4-1 num-bones)))
|
||||||
(-> s4-1 joints)
|
(bones-mtx-calc
|
||||||
(-> s4-1 bones)
|
(-> s4-1 matrix-area)
|
||||||
(-> s4-1 num-bones)
|
(-> s4-1 joints)
|
||||||
v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way.
|
(-> s4-1 bones)
|
||||||
)
|
(-> s4-1 num-bones)
|
||||||
|
v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way.
|
||||||
|
)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
(+! *num-bones* (-> s4-1 num-bones))
|
||||||
|
|
||||||
;; there is an optional post-processing step for ripple.
|
;; there is an optional post-processing step for ripple.
|
||||||
(when (logtest? (-> s4-1 flags) (bone-calc-flags write-ripple-data))
|
(when (logtest? (-> s4-1 flags) (bone-calc-flags write-ripple-data))
|
||||||
|
@ -228,6 +433,10 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
(when *display-bone-stats*
|
||||||
|
(format *stdcon* "num-bones: ~D~%" *num-bones*)
|
||||||
|
)
|
||||||
|
|
||||||
;; reset sqwc
|
;; reset sqwc
|
||||||
;; (set! (-> (the-as dma-bank-control #x1000e000) sqwc) (new 'static 'dma-sqwc :sqwc #x1 :tqwc #x1))
|
;; (set! (-> (the-as dma-bank-control #x1000e000) sqwc) (new 'static 'dma-sqwc :sqwc #x1 :tqwc #x1))
|
||||||
|
|
||||||
|
|
|
@ -1233,17 +1233,17 @@ Val* Compiler::compile_asm_div_vf(const goos::Object& form, const goos::Object&
|
||||||
// Why do we even bother using VDIVPS instead of FDIV? Because otherwise in x86, you have to use
|
// Why do we even bother using VDIVPS instead of FDIV? Because otherwise in x86, you have to use
|
||||||
// the FPU stack. Registers are nicer.
|
// the FPU stack. Registers are nicer.
|
||||||
|
|
||||||
// Save one temp reg, use the destination as one
|
auto temp_reg1 = env->make_vfr(dest->type());
|
||||||
auto temp_reg = env->make_vfr(dest->type());
|
auto temp_reg2 = env->make_vfr(dest->type());
|
||||||
|
|
||||||
// Splat src1's value into the dest reg, keep it simple, this way no matter which vector component
|
// Splat src1's value into a temp reg, keep it simple, this way no matter which vector component
|
||||||
// is accessed from the final result will be the correct answer
|
// is accessed from the final result will be the correct answer
|
||||||
env->emit_ir<IR_SplatVF>(form, color, dest, src1, ftf_fsf_to_vector_element(fsf));
|
env->emit_ir<IR_SplatVF>(form, color, temp_reg1, src1, ftf_fsf_to_vector_element(fsf));
|
||||||
// Splat src2's value into the temp reg
|
// Splat src2's value into the temp reg
|
||||||
env->emit_ir<IR_SplatVF>(form, color, temp_reg, src2, ftf_fsf_to_vector_element(ftf));
|
env->emit_ir<IR_SplatVF>(form, color, temp_reg2, src2, ftf_fsf_to_vector_element(ftf));
|
||||||
|
|
||||||
// Perform the Division
|
// Perform the Division
|
||||||
env->emit_ir<IR_VFMath3Asm>(form, color, dest, dest, temp_reg, IR_VFMath3Asm::Kind::DIV);
|
env->emit_ir<IR_VFMath3Asm>(form, color, dest, temp_reg1, temp_reg2, IR_VFMath3Asm::Kind::DIV);
|
||||||
return get_none();
|
return get_none();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -332,7 +332,7 @@ const std::vector<emitter::Register>& get_alloc_order(int var_idx,
|
||||||
if (is_gpr) {
|
if (is_gpr) {
|
||||||
return REG_temp_first_order.gprs;
|
return REG_temp_first_order.gprs;
|
||||||
} else {
|
} else {
|
||||||
return REG_temp_only_order.xmms;
|
return REG_temp_first_order.xmms;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1060,8 +1060,10 @@ bool run_assignment_on_var(const AllocationInput& input,
|
||||||
for (auto& reg : assign_order) {
|
for (auto& reg : assign_order) {
|
||||||
bool worked = check_register_assign(input, *cache, var_idx, reg);
|
bool worked = check_register_assign(input, *cache, var_idx, reg);
|
||||||
if (trace) {
|
if (trace) {
|
||||||
lg::print("m2 trying var {} in {}: {}\n", cache->iregs.at(var_idx).to_string(),
|
const auto& this_var = cache->vars.at(var_idx);
|
||||||
reg.print(), worked);
|
lg::print("m2 trying var {} in {} (live {} to {}): {}\n",
|
||||||
|
cache->iregs.at(var_idx).to_string(), reg.print(), this_var.first_live(),
|
||||||
|
this_var.last_live(), worked);
|
||||||
}
|
}
|
||||||
if (worked) {
|
if (worked) {
|
||||||
var.assign_to_register(reg);
|
var.assign_to_register(reg);
|
||||||
|
|
|
@ -24,7 +24,7 @@ void print_allocate_input(const AllocationInput& in) {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (const auto& instruction : in.instructions) {
|
for (const auto& instruction : in.instructions) {
|
||||||
lg::print(" [{:3d}] {}\n", instruction.print());
|
lg::print(" {}\n", instruction.print());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lg::print("[RegAlloc] Debug Input Constraints:\n");
|
lg::print("[RegAlloc] Debug Input Constraints:\n");
|
||||||
|
|
Loading…
Reference in a new issue