jak-project/docs/progress-notes/bones.md

500 lines
12 KiB
Markdown
Raw Permalink Normal View History

The `bones.gc` file computes skinning matrices for foreground rendering.
Arguments:
- `a0`: output matrix area, each matrix is 7x quadwords contaning the vertex and normal transformation matrices.
- `a1`: input joint array. The joints contain the inverse bind pose.
- `a2`: input bones array. The world space bone transforms
- `a3`: num bones
- `vf28, vf29, vf30, vf31` the camera matrix
- `vf25, vf26, vf27` the camera matrix again
```asm
daddiu sp, sp, -96
sd ra, 0(sp)
sq s2, 16(sp)
sq s3, 32(sp)
sq s4, 48(sp)
sq s5, 64(sp)
sq gp, 80(sp)
lui v1, 4096
lui t0, 4096
ori v1, v1, 54272 ;; v1 = DMA reg addr
ori t0, t0, 53248 ;; t0 = DMA reg addr
lui t2, 32767 ;; 0x7fff....
daddiu t1, a3, -16 ;; bone count - 16 (maybe we do 16 bones at a time?)
ori t2, t2, 65535 ;; 0x7fff'ffff
lui at, 28672 ;; scratchpad addr
addiu t4, r0, 64 ;; t4 = 64 (= 16 bones * 4)
addiu t5, r0, 1280 ;; t5 = 1280 (= 16 bones * 80)
bgez t1, L17 ;; more than 16 bones?
addiu t3, r0, 16 ;; t3 = 16
;; if first run is under 16 bones, adjust counts
B1:
or t3, a3, r0 ;; t3 = num bones
sll r0, r0, 0
dsll t4, t3, 2 ;; t4 = num bones * 4
dsll a3, t3, 4 ;; a3 = num bones * 16
dsll t1, t3, 6 ;; t1 = num bones * 64
sll r0, r0, 0
daddu t5, t1, a3 ;; t5 = num bones * 80
addiu t1, r0, 0 ;; t1 = 0 (remaining bones count)
B2:
L17:
addiu a3, r0, 0 ;; a3 = 0
addiu t6, r0, 1 ;; t6 = 1
and a1, a1, t2 ;; mask off upper bits of address (not sure why, but they do this sometimes)
sll r0, r0, 0
daddiu a1, a1, 12 ;; adjustment of joint pointer for the strided dma stuff.
or a0, a0, r0
daddiu a1, a1, -80
sll r0, r0, 0
;; wait for DMA to be free...
<snip>
B5:
L19:
addiu t6, r0, 80
addiu t7, r0, 264
sw t6, 128(v1) ;; addr in spad = 80 for joints
sw a1, 16(v1)
sw t4, 32(v1) ;; size: num bones * 4
sw t7, 0(v1)
daddu a1, a1, t5
;; wait for dma to complete
<snip>
B8:
L21:
and a2, a2, t2 ;; clean up bones addr
sll r0, r0, 0
dsll t2, t3, 2 ;; t2 = bones * 4
addiu t4, r0, 256 ;; t4 = 256
daddu t2, t2, t3 ;; t2 = bones * 5 (size of the bone)
addiu t6, r0, 1104 ;; addr in spad = 1104 for bones.
dsll t5, t2, 4
sw t6, 128(v1)
addiu t8, r0, 0
sw a2, 16(v1)
daddu a2, a2, t5
sw t2, 32(v1)
addiu t2, r0, 1
sw t4, 0(v1)
;; wait for dma
;; <snip>
B11:
L23:
dsll t5, t8, 2 ;; ?? not sure what this is, but always zero?
daddu t9, t5, at ;; ptr to bone-work
sll r0, r0, 0
lwu t5, 0(t9) ;; t5 = (-> bone-layout joint)
or t6, t3, r0
lwu t7, 8(t9) ;; t7 = (-> bone-layout bone)
or ra, t3, r0
lwu t3, 16(t9) ;; t3 = (-> bone-layout output)
sll r0, r0, 0
sw ra, 44(at) ;; stash sp-size
beq ra, r0, L36
sw t8, 48(at) ;; stash sp-bufnum
B12:
daddiu t1, t1, -16 ;; decrement bones count
addiu t9, r0, 1280 ;; next DMA math stuff
bgez t1, L24 ;; check if partial bone buffer
addiu t8, r0, 16 ;; ....
B13:
daddiu t8, t1, 16
addiu t1, r0, 0
dsll t9, t8, 4
dsll ra, t8, 6
beq t8, r0, L25
daddu t9, ra, t9
B14:
L24:
dsll t4, t8, 2
dsll ra, t2, 2
daddu gp, ra, at
sw a1, 16(v1)
addiu ra, r0, 264
lwu gp, 0(gp)
andi gp, gp, 16383
sw t4, 32(v1)
daddu a1, a1, t9
sw gp, 128(v1)
addiu t4, r0, 0
sw ra, 0(v1)
;; and now, for the actual bones.
B15:
L25:
sll r0, r0, 0
sw t8, 40(at) ;; in-count
sll r0, r0, 0
lqc2 vf1, 0(t5) ;; vf1, vf2, vf3, vf4 = inverse bind pose
sll r0, r0, 0
lqc2 vf2, 16(t5)
sll r0, r0, 0
lqc2 vf3, 32(t5)
sll r0, r0, 0
lqc2 vf4, 48(t5)
sll r0, r0, 0
lqc2 vf5, 0(t7) ;; vf5, vf6, vf7, vf8 = input bone matrix.
sll r0, r0, 0
lqc2 vf6, 16(t7)
sll r0, r0, 0
lqc2 vf7, 32(t7)
sll r0, r0, 0
lqc2 vf8, 48(t7)
vcallms 0 ;; run bone program
sll r0, r0, 0
B16:
L26:
sll r0, r0, 0
sll r0, r0, 0
daddiu t5, t5, 64 ;; advance joint
sll r0, r0, 0
daddiu t7, t7, 80 ;; advance bone.
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
lq t8, 0(t5) ;; load next joint
sll r0, r0, 0
lq t9, 16(t5)
sll r0, r0, 0
lq ra, 32(t5)
sll r0, r0, 0
lq gp, 48(t5)
sll r0, r0, 0
lq s5, 0(t7) ;; load next bone
sll r0, r0, 0
lq s4, 16(t7)
sll r0, r0, 0
lq s3, 32(t7)
sll r0, r0, 0
lq s2, 48(t7)
sll r0, r0, 0
qmtc2.ni vf1, t8 ;; swap in new inputs
sll r0, r0, 0
qmtc2.ni vf2, t9
sll r0, r0, 0
qmtc2.ni vf3, ra
sll r0, r0, 0
qmtc2.ni vf4, gp
sll r0, r0, 0
qmtc2.ni vf5, s5
sll r0, r0, 0
qmtc2.ni vf6, s4
sll r0, r0, 0
qmtc2.ni vf7, s3
sll r0, r0, 0
qmtc2.ni vf8, s2
sll r0, r0, 0
qmfc2.i t8, vf13 ;; swap out result in (vf13, vf14, vf15, vf16) and (vf9, vf10, vf11)
sll r0, r0, 0
qmfc2.ni t9, vf14
sll r0, r0, 0
qmfc2.ni ra, vf15
sll r0, r0, 0
qmfc2.ni gp, vf16
sll r0, r0, 0
qmfc2.ni s5, vf9
sll r0, r0, 0
qmfc2.ni s4, vf10
sll r0, r0, 0
qmfc2.ni s3, vf11
vcallms 0
sll r0, r0, 0
sll r0, r0, 0
sq t8, 0(t3)
sll r0, r0, 0
sq t9, 16(t3)
sll r0, r0, 0
sq ra, 32(t3)
sll r0, r0, 0
sq gp, 48(t3)
sll r0, r0, 0
sq s5, 64(t3)
sll r0, r0, 0
sq s4, 80(t3)
sll r0, r0, 0
sq s3, 96(t3)
sll r0, r0, 0
sq r0, 112(t3)
daddiu t3, t3, 128
daddiu t6, t6, -1
bgtz t6, L26
sll r0, r0, 0
B17:
sll r0, r0, 0
lw t3, 40(at)
beq t3, r0, L29
sll r0, r0, 0
B18:
L27:
lw t4, 0(v1)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t4, t4, 256
sll r0, r0, 0
beq t4, r0, L28
sll r0, r0, 0
B19:
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
beq r0, r0, L27
sll r0, r0, 0
B20:
L28:
dsll t5, t2, 2
sll r0, r0, 0
addiu t4, r0, 1
daddu t5, t5, at
sll r0, r0, 0
lwu t6, 8(t5)
dsll t5, t3, 2
andi t6, t6, 16383
daddu t5, t5, t3
sw t6, 128(v1)
dsll t6, t5, 4
sw a2, 16(v1)
addiu t7, r0, 256
sw t5, 32(v1)
daddu a2, a2, t6
sw t7, 0(v1)
B21:
L29:
sll r0, r0, 0
lw t5, 48(at)
sll r0, r0, 0
lw t6, 44(at)
B22:
L30:
lw t7, 0(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t7, t7, 256
sll r0, r0, 0
beq t7, r0, L31
sll r0, r0, 0
B23:
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
beq r0, r0, L30
sll r0, r0, 0
B24:
L31:
beq t6, r0, L32
sll r0, r0, 0
B25:
dsll t7, t5, 2
lui t8, 28672
daddu t7, t7, t8
lwu t7, 16(t7)
andi t7, t7, 16383
sw t7, 128(t0)
sw a0, 16(t0)
dsll t7, t6, 3
sw t7, 32(t0)
addiu t7, r0, 256
sw t7, 0(t0)
dsll t6, t6, 7
daddu a0, a0, t6
B26:
L32:
beq t3, r0, L35
sll r0, r0, 0
B27:
bne t4, r0, L35
sll r0, r0, 0
B28:
L33:
lw t6, 0(v1)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t6, t6, 256
sll r0, r0, 0
beq t6, r0, L34
sll r0, r0, 0
B29:
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
beq r0, r0, L33
sll r0, r0, 0
B30:
L34:
dsll t6, t2, 2
lui t7, 28672
daddu t6, t6, t7
lwu t6, 8(t6)
andi t6, t6, 16383
sw t6, 128(v1)
sw a2, 16(v1)
addiu t6, r0, 5
mult3 t6, t6, t3
sw t6, 32(v1)
addiu t6, r0, 256
sw t6, 0(v1)
addiu t6, r0, 80
mult3 t6, t6, t3
daddu a2, a2, t6
B31:
L35:
or t8, t2, r0
bne t1, r0, L22
or t2, t5, r0
B32:
beq a3, r0, L22
addiu a3, r0, 1
B33:
L36:
or v0, r0, r0
ld ra, 0(sp)
lq gp, 80(sp)
lq s5, 64(sp)
lq s4, 48(sp)
lq s3, 32(sp)
lq s2, 16(sp)
jr ra
daddiu sp, sp, 96
```
# VU0 micoprogram
- vf1, vf2, vf3, vf4 = inverse bind pose
- vf5, vf6, vf7, vf8 = input bone matrix.
- `vf28, vf29, vf30, vf31` the camera matrix
- `vf25, vf26, vf27` the camera matrix again
- (vf13, vf14, vf15, vf16) output point transformation
- (vf09, vf10, vf11) output normal transformation
```
First: multiply bone and bind pose: (vf13, vf14, vf15, vf16) = (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4).
This is doing a true matrix multiplication.
nop | mulax.xyzw ACC, vf05, vf01
nop | madday.xyzw ACC, vf06, vf01
nop | maddaz.xyzw ACC, vf07, vf01
nop | maddw.xyzw vf13, vf08, vf01
nop | mulax.xyzw ACC, vf05, vf02
nop | madday.xyzw ACC, vf06, vf02
nop | maddaz.xyzw ACC, vf07, vf02
nop | maddw.xyzw vf14, vf08, vf02
nop | mulax.xyzw ACC, vf05, vf03
nop | madday.xyzw ACC, vf06, vf03
nop | maddaz.xyzw ACC, vf07, vf03
nop | maddw.xyzw vf15, vf08, vf03
nop | mulax.xyzw ACC, vf05, vf04
nop | madday.xyzw ACC, vf06, vf04
nop | maddaz.xyzw ACC, vf07, vf04
nop | maddw.xyzw vf16, vf08, vf04
;; vf09 = cross(y, z)
nop | opmula.xyz ACC, vf14, vf15
nop | opmsub.xyz vf09, vf15, vf14
;; vf10 = cross(z, x)
nop | opmula.xyz ACC, vf15, vf13
nop | opmsub.xyz vf10, vf13, vf15
;; vf11 = cross(x, y)
nop | opmula.xyz ACC, vf13, vf14
nop | opmsub.xyz vf11, vf14, vf13
;; vf12 = cross (y, z) * x
nop | mul.xyz vf12, vf13, vf09
;; second multiply: doing (vf13....) = cam * (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4)
nop | mulax.xyzw ACC, vf28, vf13
nop | madday.xyzw ACC, vf29, vf13
nop | maddaz.xyzw ACC, vf30, vf13
nop | maddw.xyzw vf13, vf31, vf13
nop | mulax.w ACC, vf00, vf12
nop | madday.w ACC, vf00, vf12
nop | maddz.w vf12, vf00, vf12
vf12.w = dot (cross(y, z), x) [before the second multiply]
nop | mulax.xyzw ACC, vf28, vf14
nop | madday.xyzw ACC, vf29, vf14
nop | maddaz.xyzw ACC, vf30, vf14
div Q, vf00.w, vf12.w | maddw.xyzw vf14, vf31, vf14 ;; divide
nop | mulax.xyzw ACC, vf28, vf15
nop | madday.xyzw ACC, vf29, vf15
nop | maddaz.xyzw ACC, vf30, vf15
nop | maddw.xyzw vf15, vf31, vf15
nop | mulax.xyzw ACC, vf28, vf16
nop | madday.xyzw ACC, vf29, vf16
nop | maddaz.xyzw ACC, vf30, vf16
nop | maddw.xyzw vf16, vf31, vf16
;; normal scale
nop | mul.xyzw vf09, vf09, Q
nop | mul.xyzw vf10, vf10, Q
nop | mul.xyzw vf11, vf11, Q
;; apply cam to normal matrix too
nop | mulax.xyzw ACC, vf25, vf09
nop | madday.xyzw ACC, vf26, vf09
nop | maddz.xyzw vf09, vf27, vf09
nop | mulax.xyzw ACC, vf25, vf10
nop | madday.xyzw ACC, vf26, vf10
nop | maddz.xyzw vf10, vf27, vf10
nop | mulax.xyzw ACC, vf25, vf11
nop | madday.xyzw ACC, vf26, vf11 :e
nop | maddz.xyzw vf11, vf27, vf11
```