jak-project/docs/scratch/tie_ee.asm
water111 a8342aef31
[graphics] TIE extractor (#1026)
* temp

* temp

* wip

* more progress on the instance asm

* first half of tie extraction, up to dma lists

* more tie extraction

* first part figured out maybe

* bp1 loop seems to work, bp2 loop does not

* bp1 and bp2 appear working. sadly ip is needed

* ip1 outline, not working ip2

* just kidding, ip2 seems to work

* extraction seems to work

* basic rendering working

* tie fixes

* performance optimization of tie renderer

* hook up tie to engine

* fix more bugs

* cleanup and perf improvements

* fix tests

* ref tests

* mm256i for gcc

* CLANG

* windows

* more compile fixes

* fix fast time of day

* small fixes

* fix after merge

* clang
2021-12-26 12:33:51 -05:00

1760 lines
42 KiB
NASM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; .function draw-inline-array-instance-tie
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;BAD PROLOGUE
;; Warnings:
;; INFO: Flagged as asm by config
;; INFO: Assembly Function
# This is yet another insane double-buffered DMA function.
# It tries to always keep an SPR TO or SPR FROM DMA transfer going, while processing
# other stuff on the Scratch Pad.
# The upload chunks are 2kB and complete in 128 cycles ideally.
# Most of the function is documented below, except for:
# -tie near (but this is very simple)
# -wind (probably not too hard)
# -generic (for another day)
# args are:
# a0 : vis bits
# a1 : instances
# a2 : num instances
# a3 : output dma buf.
# common vars (valid after the initialization block; note that a1 and a3 are
# repurposed there and no longer hold the arguments):
# t0 work
# t1 SPR to DMA bank
# a3 SPR from DMA bank
# t5 *wind-work* (later overwritten with the wind time loaded out of wind-work)
# t2 bank ptr
# a1 out ptr
# v1 current write pointer in the output dma buffer (stored back to the buffer on exit)
# t3 pointer (in the instance array) to the next group of 32 instances to upload
# t8 write cursor inside the current scratchpad output buffer
# t9 number of quadwords used so far in the current scratchpad output buffer
# work offsets used as stashes in this function:
# 396 dma-buffer, 400 to-spr, 404 from-spr, 408 wind work, 412 vis ptr,
# 416 end vis ptr, 432 flags, 440 from-spr wait counter, 444 to-spr wait counter
B0:
L137:
# Save ra, s0-s5 and gp in a 128-byte stack frame.
daddiu sp, sp, -128
sd ra, 0(sp)
sq s0, 16(sp)
sq s1, 32(sp)
sq s2, 48(sp)
sq s3, 64(sp)
sq s4, 80(sp)
sq s5, 96(sp)
sq gp, 112(sp)
# Initialization block.
# On exit, we set t6 to the first 32-bits of vis data.
lui t4, 28672 # = 0x7000 (t4 = 0x70000000, base used for all spr pointers below)
lw v1, 4(a3) # = dma buf base
lui t1, 4096 # = 0x1000
lui t2, 4096 # = 0x1000
sync.l
cache dxwbin v1, 0
sync.l
cache dxwbin v1, 1
sync.l
lw t0, *instance-tie-work-copy*(s7) # use the work copy.
ori t1, t1, 54272 # SPR TO (t1 = 0x1000d400)
sw a3, 396(t0) # store dma-buffer
ori a3, t2, 53248 # SPR FROM (a3 = 0x1000d000); a3 no longer holds the dma-buffer arg
lw t5, *wind-work*(s7)
lw t6, 0(a0) # load first vis bits (32 bits)
ori t2, t4, 16 # spr + 16 (bank ptr?)
vmaxw.xyzw vf1, vf0, vf0 # vf1 = (1, 1, 1, 1)
addiu t3, a1, -4 # remove basic ptr.
ori a1, t4, 4112 # set out ptr (spr + 0x1010); a1 no longer holds the instances arg
sw t1, 400(t0) # stash to-spr
addiu t9, r0, 0 # t9 = 0 (no quadwords in the output buffer yet)
sw a3, 404(t0) # stash from-spr
or t8, a1, r0 # another copy of out ptr (t8 is the one that gets advanced while writing)
sw t5, 408(t0) # stash wind work
sll r0, r0, 0
lqc2 vf3, 64(t0) # constant (4096., 128., 0., 0.)
sll r0, r0, 0
sw r0, 432(t0) # set the flags to 0.
# Find the first group of 32 with a visible.
# Note: this only runs on the entrance, and this runs before any
# SPR DMA is started.
# Each 32-bit vis word covers 32 instances of 64 bytes, so skipping one vis word
# skips 2048 bytes of the instance array.
B1:
L138:
bne t6, r0, L139 # if we found something advance to 139
sll r0, r0, 0
B2:
addiu a0, a0, 4 # advance vis pointer by 32 bits
addiu t3, t3, 2048 # advance instance ptr by 32 * 64 = 2048
daddiu a2, a2, -32 # decrement remaining instances by 32.
lw t6, 0(a0) # load the next vis 32-bits. (loaded before the end check below)
blez a2, L177 # Return, if we got to the end.
sll r0, r0, 0
B3:
beq r0, r0, L138
sll r0, r0, 0
# Here once we point to a visible thing.
# Now we should begin DMA to SPR.
# But first, we must wait for any in-progress transfer to end:
# (poll the word at 0(t1) until bit 0x100 is clear)
B4:
L139:
lw t4, 0(t1)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t4, t4, 256
sll r0, r0, 0
bne t4, r0, L139
sll r0, r0, 0
# No more in progress transfer.
# Initialize the first transfer (special case)
# Initialize the double buffering.
# The SPR TO channel at t1 is programmed as: +16 = source address, +128 = scratchpad
# destination, +32 = quadword count, and writing 256 to +0 starts the transfer.
B5:
sw t3, 16(t1) # source is the input instance array
xori t4, t2, 2048 # dest is "the other" bank array (in spr)
sw t4, 128(t1)
addiu t4, r0, 128 # copy 128 qw = 2048 bytes = 32 instances
sw t4, 32(t1)
addiu t4, r0, 256 # start!!!
sw t4, 0(t1)
sll r0, r0, 0
# MAIN LOOP TOP
# In here, we have an in progress DMA SPR to transfer.
# we're going to do as much stuff as possible before waiting on the transfer.
# we'll start with finding the address of the next to transfer source.
# (we'll skip groups of 32 that have no visible)
B6:
L140:
or ra, a0, r0 # ra = vis ptr for the SPR to transfer's instances
xori t2, t2, 2048 # toggle spr bank to point to the uploading data
daddiu a0, a0, 4 # increment vis ptr to the next upload's vis data
or t7, a0, r0 # remember this as the end of this group's vis data
or t4, t2, r0 # t4 = the uploading bank
daddiu t6, a2, -32 # t6 = the number remaining after this group.
bgtz t6, L142 # if there's nothing left, fall through (will skip "find next")
lw t6, 0(a0) # t6 is the next 32-bits of vis (delay slot: loaded on both paths)
B7:
beq r0, r0, L145 # nothing left after the current upload, don't look for next
sll r0, r0, 0
B8:
sll r0, r0, 0
lw v1, 400(r0) # ?? probably will crash if we hit this, but it's unreachable.
B9:
L141:
daddiu a2, a2, -32 # skip 32 instances that are invisible
addiu a0, a0, 4 # advance vis ptr
blez a2, L145 # did we reach the end? if so, give up looking for another.
lw t6, 0(a0) # load next vis
B10:
sll r0, r0, 0
sll r0, r0, 0
B11:
L142:
beq t6, r0, L141 # did we find visible?
addiu t3, t3, 2048 # advance input to bank upload ptr. (delay slot: runs on every pass through L142)
# If we reach here, we've found the next SPR to transfer's source
B12:
L143:
# check if in progress upload is done?
lw t6, 0(t1)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t6, t6, 256
sll r0, r0, 0
beq t6, r0, L144
sll r0, r0, 0
B13:
# nope, increment stall counter. (the to-spr wait counter at work+444)
sll r0, r0, 0
lw t6, 444(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu t6, t6, 1
sll r0, r0, 0
sw t6, 444(t0)
beq r0, r0, L143
sll r0, r0, 0
# if we reach here, we've found the next SPR to source, and
# the previous upload is done.
# Kick off the next upload into the bank we are not about to process.
B14:
L144:
sw t3, 16(t1) # source addr
xori t6, t2, 2048 # dest is the other
sw t6, 128(t1)
addiu t6, r0, 128 # 128 qw (see prev)
sw t6, 32(t1)
addiu t6, r0, 256
beq r0, r0, L146 # skip waiting for completion!
sw t6, 0(t1) # start the upload (in the branch delay slot)
# if we reach here, we didn't find the next SPR to source
# because there's nothing to do next. So just wait for
# the current upload to finish
B15:
L145:
lw t6, 0(t1)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t6, t6, 256
sll r0, r0, 0
beq t6, r0, L146
sll r0, r0, 0
B16:
# inc wait counter if we had to wait. (same work+444 counter as the other to-spr wait loop)
sll r0, r0, 0
lw t6, 444(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu t6, t6, 1
sll r0, r0, 0
sw t6, 444(t0)
beq r0, r0, L145
sll r0, r0, 0
# if we reach here, we've got some bank data.
# TOP for per-8 instances
# ra walks the 4 vis bytes of the current group of 32; t7 holds the end of those bytes
# until it is stashed, after which t7 is reused as the per-instance vis bit mask.
B17:
L146:
lb t6, 0(ra) # load 8 bits of vis data for this bank.
addiu ra, ra, 1 # seek to next vis byte
sll r0, r0, 0
sw ra, 412(t0) # stash our vis byte pointer (already advanced to the next byte)
bne t6, r0, L147 # branch if we have at least one visible in this group of 8.
sw t7, 416(t0) # stash end vis ptr.
B18:
# here if the group of 8 was invisible
daddiu a2, a2, -8 # skip 8
addiu t4, t4, 512 # advance input array pointer by 8 instances (8 * 64 bytes)
beq r0, r0, L173 # skip to end of 8-block loop.
sll r0, r0, 0
# we have some visible!
B19:
L147:
addiu t7, r0, 128 # vis mask constant (start at highest bit)
lqc2 vf2, 16(t4) # load the bsphere of the instance.
# Per-instance loop top. The two out buffers are 0x1000 bytes (256 qw) apart, so
# t9 <= 246 means at least 10 qw are still free in the current one.
B20:
L148:
daddiu ra, t9, -246 # do we have room left in the output for up to 8 visible?
sll r0, r0, 0 # note, really 10, not sure why they leave 2 empty.
blez ra, L151 # branch if we have room, vcallms 42 no matter what.
vcallms 42 # these people are insane.
# vi01 = in view frustum result
# vf05 = M(vf28) * bsphere (camera)
# vf06 = M(vf24) * bsphere (camera rot (I think includes trans, but no perspective))
# if we got here, we filled the output buffer.
# we should then copy the output FROM SPR to the dma buffer.
# First wait for any previous SPR FROM transfer (bit 0x100 at 0(a3)), counting the
# stalls in work+440.
B21:
L149:
lw t8, 0(a3) # SPR FROM sync.
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t8, t8, 256
sll r0, r0, 0
beq t8, r0, L150
sll r0, r0, 0
B22:
sll r0, r0, 0
lw t8, 440(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu t8, t8, 1
sll r0, r0, 0
sw t8, 440(t0)
beq r0, r0, L149
sll r0, r0, 0
# Start the SPR FROM transfer of t9 quadwords and switch to the other out buffer.
B23:
L150:
sw a1, 128(a3) # copy from out ptr
xori a1, a1, 12288 # toggle out (0x1010 <-> 0x2010 within spr)
sw v1, 16(a3) # copy to dma buf ptr
sll t8, t9, 4 # bytes being written out = qwc * 16
addu v1, v1, t8 # seek dma buf ptr to the next
or t8, a1, r0 # t8 = the next out ptr
sw t9, 32(a3) # store qwc
addiu t9, r0, 256 # t9 reused as a temp for the start value
sw t9, 0(a3) # start the FROM.
addiu t9, r0, 0 # t9 = 0: the freshly selected out buffer has no quadwords in it yet
# here, we have room in the scratchpad output for up to 10 quadwords.
# t4 points at the current 64-byte instance in the uploaded bank. Fields read here:
# +8 color-indices (read later), +12 prototype bucket, +16 bsphere, +32..+63 origin rows
# as 4 halfwords each. The wind index is read from +62, i.e. the last halfword of origin_3.
B24:
L151:
sll r0, r0, 0
lw ra, 12(t4) # ra = our prototype bucket
and gp, t6, t7 # check if our vis bit is set.
ld s5, 56(t4) # s5 = origin_3
beq gp, r0, L172 # are we even visible?
ld s2, 32(t4) # s2 = origin_0
B25:
sll gp, t9, 4 # gp = output offset
ld s4, 40(t4) # s4 = origin_1
pextlh s3, s5, r0 # s3 = s5 << 16 (now 128-bits, origin3)
ld s5, 48(t4) # s5 = origin_2
psraw s3, s3, 10 # s3 = origin3, now signed (shift of 10, not 16: NOTE(review) leaves it scaled by 64 vs. the other rows)
lq s1, 28(ra) # s1 = our prototype bucket's dists
pextlh s2, s2, r0 # conversion of origin matrix
lq s0, 44(ra) # s0 = [rlength-near, rlength-stiff, rlength-mid, stiffness]
psraw s2, s2, 16 # conversion of origin matrix (see tie format doc)
qmtc2.ni vf14, s1 # vf14 = dists
pextlh s4, s4, r0 # conversion
qmtc2.ni vf15, s0 # vf15 = rlengths
psraw s4, s4, 16 # conversion
qmtc2.ni vf13, s3 # vf13 = origin3
pextlh s5, s5, r0 # conversion
qmtc2.ni vf10, s2 # vf10 = origin0
psraw s3, s5, 16 # conversion
lhu s2, 62(t4) # s2 = wind index
addu gp, gp, v1 # gp = v1 + offset: where this output will land in the dma buffer (v1 is the RAM dma buf ptr, not spr)
qmtc2.ni vf11, s4 # vf11 = origin1
dsll s5, s2, 4 # s5 = wind_idx * 16
qmtc2.ni vf12, s3 # vf12 = origin2
daddu s4, s2, t5 # s4 = wind_idx + time? (on the first time through, seems like it's not time...)
lw s2, 408(t0) # s2 = wind work
andi s4, s4, 63 # truncate upper bits so we fit in the 64 wind array.
lw s3, 384(t0) # s3 = wind vectors
sll s1, s4, 4 # index qw's with our [0, 64) index
lw s4, 4(ra) # flags
daddu s5, s3, s5 # s5 = wind vector pointer (not from wind work, in the static data.)
addu s3, s1, s2 # wind work vector array pointer
andi s1, s4, 1 # s1 = flag1
andi s4, s4, 2 # s4 = flag2
bne s1, r0, L172 # skip if flag1 is set.
cfc2.ni s1, vi1 # get the in-view-frustum result from VU0.
B26:
vitof0.xyzw vf13, vf13 # convert origin matrix row to float.
lw t5, 1324(s2) # load wind time (t5 no longer holds *wind-work* after this)
bne s1, r0, L172 # skip if not in view frustum.
lqc2 vf25, 112(t0) # vf25 = current TIE min distance
# Distance / clipping math for a visible, in-frustum instance.
# Results left for later code: vf14 is stored to work+96 ("dist-test"), the packed
# clip/dist byte flags go to 84(t8), and flag2 (s4) goes to 80(t8).
B27:
sll r0, r0, 0
lqc2 vf16, 16(t0) # vf16 = hmge-d
sll r0, r0, 0
lqc2 vf17, 32(t0) # vf17 = hvdf-offset
vmulaz.xyzw acc, vf1, vf6 # [z, z, z, z] (how in front of camera)
sw gp, 196(t0) # work.upload-color-0.addr = output_ptr
vmsubw.xyzw vf8, vf1, vf2 # vf8 = dist in front of cam (origin - r_bs)
sw gp, 276(t0) # work.generic-color-0.addr = output_ptr
vadd.xyz vf5, vf0, vf0 # vf5 = [0, 0, 0, transformed_w]
sll r0, r0, 0
vadd.xyz vf13, vf13, vf2 # vf13 = (+, +, +) corner of bounding box (outside of bsphere)
sll r0, r0, 0
vmula.xyzw acc, vf1, vf1 # acc = [1, 1, 1, 1]
sll r0, r0, 0
vsub.xyzw vf14, vf8, vf14 # (origin - r_bs) - dists
sll r0, r0, 0
vaddw.w vf5, vf5, vf17 # vf5 = [0, 0, 0, transformed_w + hvdf_offset.w]
sll r0, r0, 0
sll r0, r0, 0
lqc2 vf30, 80(t0) # vf30 = far_morph
vmini.xyzw vf25, vf8, vf25 # update TIE min distance
sll r0, r0, 0
vmsub.xyz vf15, vf14, vf15 # dist weights
sll r0, r0, 0
vminiy.w vf5, vf5, vf16 # clip w min
sll r0, r0, 0
sll r0, r0, 0
lqc2 vf24, 128(t0) # vf24 = guard plane 0 (i think w plane)
sll r0, r0, 0
sqc2 vf25, 112(t0) # store the updated min-dist back to work
vmini.xyz vf15, vf15, vf1 # saturate dist weights
sll r0, r0, 0
vmaxx.w vf5, vf5, vf16 # clip w max
sll r0, r0, 0
vsubz.xyzw vf16, vf8, vf16 # vf16 = dist in front of the camera - some hmge thing.
sll r0, r0, 0
sll r0, r0, 0
lqc2 vf25, 144(t0) # vf25 = some plane
sll r0, r0, 0
lqc2 vf26, 160(t0) # vf26 = another plane
sll r0, r0, 0
lqc2 vf27, 176(t0) # vf27 = yet another plane
vmulax.xyzw acc, vf24, vf2 # perform clipping with this plane.
sll r0, r0, 0
vmadday.xyzw acc, vf25, vf2
sll r0, r0, 0
vmaddaz.xyzw acc, vf26, vf2
sll r0, r0, 0
vmsubaw.xyzw acc, vf27, vf0
sll r0, r0, 0
vmsubw.xyzw vf24, vf1, vf2
sll r0, r0, 0
sll r0, r0, 0
qmfc2.i s2, vf16 # s2 = dists - hmge thing
vmulw.xyzw vf28, vf15, vf30 # vf28 = scaled dist weights
sll r0, r0, 0
vmulw.xyzw vf29, vf15, vf30 # vf29 = scaled dist weights, again?
sll r0, r0, 0
sll r0, r0, 0
lqc2 vf19, 0(t0) # vf19 = wind const (WIND)
vitof12.xyzw vf10, vf10 # convert origin (back to this, I guess)
sll r0, r0, 0
pcgtw s1, r0, s2 # dists check
qmfc2.i s0, vf24 # clip check
vmulx.xyzw vf28, vf1, vf28 # distweights x
sll r0, r0, 0
vmulz.xyzw vf29, vf1, vf29 # distweights z
lw s2, 56(ra) # s2 = stiffness (last word of the quadword loaded from 44(ra))
pcgtw s0, r0, s0 # check clip again
sqc2 vf5, 80(t8) # store magic w.
ppach s0, r0, s0 # s0 = more clip
sw s4, 80(t8) # store some flags (flag2; overwrites the x word of the quadword just stored)
or s1, s0, s1 # more clipping/distance crap
sqc2 vf14, 96(t0) # called "dist-test" (its z and w words are read back from work+104 / work+108)
ppacb s1, r0, s1 # distance stuff.
mfc1 r0, f31
beq s2, r0, L153 # if stiffness == 0, skip ahead
sw s1, 84(t8) # output some clip/dist info.
# apply wind.
# Only reached when the prototype's stiffness word (s2) is nonzero.
# s5 = this instance's entry in the wind vectors array (from work+384),
# s3 = its slot in wind work. The two doublewords at 0(s5)/8(s5) are loaded here and
# only written back (B29) when the word at work+436 equals s7.
# Both exits go to L164 if flag2 (s4) is set, otherwise to L154.
B28:
vftoi0.zw vf28, vf28
ld s1, 8(s5)
vftoi0.zw vf29, vf29
ld s2, 0(s5)
pextlw s1, r0, s1
lqc2 vf16, 12(s3)
pextlw s3, r0, s2
qmtc2.i vf18, s1
sll r0, r0, 0
qmtc2.i vf17, s3
vmula.xyzw acc, vf16, vf1
sll r0, r0, 0
vmsubax.xyzw acc, vf18, vf19
sll r0, r0, 0
vmsuby.xyzw vf16, vf17, vf19
sll r0, r0, 0
vsubx.x vf28, vf30, vf15
sll r0, r0, 0
vsubz.x vf29, vf1, vf15
sll r0, r0, 0
vitof0.zw vf28, vf28
sll r0, r0, 0
vmulaz.xyzw acc, vf16, vf19
sll r0, r0, 0
vmadd.xyzw vf18, vf1, vf18
sll r0, r0, 0
vitof0.zw vf29, vf29
sll r0, r0, 0
vaddy.y vf28, vf0, vf0
sll r0, r0, 0
vaddy.y vf29, vf0, vf0
sll r0, r0, 0
vmulaz.xyzw acc, vf18, vf19
sll r0, r0, 0
vmadd.xyzw vf17, vf17, vf1
sll r0, r0, 0
vitof12.xyzw vf11, vf11
sll r0, r0, 0
vitof12.xyzw vf12, vf12
sll r0, r0, 0
vsubw.w vf28, vf30, vf28
sll r0, r0, 0
vminiw.xyzw vf17, vf17, vf0
sll r0, r0, 0
vsubw.w vf29, vf30, vf29
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
qmfc2.i s3, vf18
vmaxw.xyzw vf27, vf17, vf19
sll r0, r0, 0
ppacw s3, r0, s3
mfc1 r0, f31
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
vmulw.xyzw vf27, vf27, vf15
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
vmulax.yw acc, vf0, vf0
sll r0, r0, 0
vmulay.xz acc, vf27, vf10
sll r0, r0, 0
vmadd.xyzw vf10, vf1, vf10
sll r0, r0, 0
sll r0, r0, 0
qmfc2.i s2, vf27
vmulax.yw acc, vf0, vf0
lw s1, 436(t0)
vmulay.xz acc, vf27, vf11
sll r0, r0, 0
vmadd.xyzw vf11, vf1, vf11
sll r0, r0, 0
bne s1, s7, L152 # skip the write-back of the wind vectors unless work+436 == s7
ppacw s2, r0, s2
# write the updated wind state back to the wind vector entry, then finish origin2.
B29:
vmulax.yw acc, vf0, vf0
sd s3, 8(s5)
vmulay.xz acc, vf27, vf12
sd s2, 0(s5)
bne s4, r0, L164
vmadd.xyzw vf12, vf1, vf12
B30:
beq r0, r0, L154
sll r0, r0, 0
# same as B29, but without storing the wind vectors.
B31:
L152:
vmulax.yw acc, vf0, vf0 # acc = [? 0 ? 0]
sll r0, r0, 0
vmulay.xz acc, vf27, vf12 # acc = [o2.y*gp3.x, 0, o2.y*gp3.z, 0] (wtf is this)
sll r0, r0, 0
bne s4, r0, L164
vmadd.xyzw vf12, vf1, vf12
B32:
beq r0, r0, L154
sll r0, r0, 0
# Don't Apply Wind.
# Taken when stiffness is zero. Performs the same vf28/vf29 dist-weight setup and the
# same float conversion of origin1/origin2 as the wind path, but does not touch the
# wind vectors. Falls through to L154 unless flag2 (s4) is set, in which case it goes
# to L164.
B33:
L153:
vftoi0.zw vf28, vf28 # dist weights
sll r0, r0, 0
vftoi0.zw vf29, vf29 # more dist weights
sll r0, r0, 0
vsubx.x vf28, vf30, vf15
sll r0, r0, 0
vsubz.x vf29, vf1, vf15
sll r0, r0, 0
vitof0.zw vf28, vf28
sll r0, r0, 0
vitof0.zw vf29, vf29
sll r0, r0, 0
vaddy.y vf28, vf0, vf0
sll r0, r0, 0
vaddy.y vf29, vf0, vf0
sll r0, r0, 0
vsubw.w vf28, vf30, vf28
sll r0, r0, 0
vsubw.w vf29, vf30, vf29
sll r0, r0, 0
vitof12.xyzw vf11, vf11 # convert origin1
sll r0, r0, 0
bne s4, r0, L164
vitof12.xyzw vf12, vf12 # convert origin2
# End of stiffness calculation.
# S4 = 0 version. Goto L164 for S4 != 0 version.
# Maybe this is the version for non-generic??
# Picks which geometry to add this instance to:
#   clip/dist flags (84(t8)) nonzero -> L158 (TIE near)
#   dist_test_w > 0                  -> L156 (geom 3)
#   dist_test_z > 0                  -> L155 (geom 2)
#   otherwise                        -> fall through to geom 1
# The dist_test words are float bit patterns tested with integer bgtz.
B34:
L154:
sll r0, r0, 0
lw s5, 84(t8) # s5 = clipping/dist flags
sll r0, r0, 0
lw s4, 108(t0) # s4 = dist_test_w (from work)
addiu t9, t9, 6 # advance output qwc by 6 (96 bytes)
lw s3, 104(t0) # s3 = dist_test_z (from work)
bne s5, r0, L158 # if we clip, go to TIE NEAR.
vsubw.w vf10, vf10, vf10 # clear the w component of origin0
B35:
bgtz s4, L156
sll r0, r0, 0
B36:
bgtz s3, L155
sll r0, r0, 0
# The three geom versions below are identical except for the prototype bucket (ra)
# offsets and the morph register that is stored to 64(t8):
#            count  next  frag-count  base-qw  index-start  morph
#   geom 1     78    64      109        118        113       vf28
#   geom 2     80    68      110        120        114       vf29
#   geom 3     82    72      111        122        115       vf30
# Each one bumps the count, loads the old next into s5, replaces next with gp + 96,
# and starts the matrix multiply of the origin rows by vf20/vf21/vf22.
B37: # GEOM 1 (this is the non-near version) (believed to be the high-LOD one)
sll r0, r0, 0
lh s4, 78(ra) # load count 1
sll r0, r0, 0
lw s5, 64(ra) # load next
daddiu s4, s4, 1 # inc count
sqc2 vf28, 64(t8) # store morph constants
vmulax.xyzw acc, vf20, vf10 # matmul 0/16
addiu gp, gp, 96 # output pointer, I think this is a pointer to the RAM dma buffer, not spr.
vmadday.xyzw acc, vf21, vf10 # matmul 1/16
sw gp, 64(ra) # update next.
vmaddz.xyzw vf10, vf22, vf10 # matmul 2/16 (out vf10)
sh s4, 78(ra) # store updated count
vmulax.xyzw acc, vf20, vf11 # matmul 3/16
lbu s4, 109(ra) # s4 = frag-count 1
vmadday.xyzw acc, vf21, vf11 # matmul 4/16
lhu gp, 118(ra) # gp = base-qw
vmaddz.xyzw vf11, vf22, vf11 # matmul 5/16 (out vf11)
lbu s3, 113(ra) # s3 = index-start
beq r0, r0, L157
sll r0, r0, 0
B38:
L155: # geom 2 version (same as above)
sll r0, r0, 0
lh s4, 80(ra)
sll r0, r0, 0
lw s5, 68(ra)
daddiu s4, s4, 1
sqc2 vf29, 64(t8)
vmulax.xyzw acc, vf20, vf10
addiu gp, gp, 96
vmadday.xyzw acc, vf21, vf10
sw gp, 68(ra)
vmaddz.xyzw vf10, vf22, vf10
sh s4, 80(ra)
vmulax.xyzw acc, vf20, vf11
lbu s4, 110(ra)
vmadday.xyzw acc, vf21, vf11
lhu gp, 120(ra)
vmaddz.xyzw vf11, vf22, vf11
lbu s3, 114(ra)
beq r0, r0, L157
sll r0, r0, 0
B39:
L156: # geom 3 version (same as above)
sll r0, r0, 0
lh s4, 82(ra)
sll r0, r0, 0
lw s5, 72(ra)
daddiu s4, s4, 1
sqc2 vf30, 64(t8)
vmulax.xyzw acc, vf20, vf10
addiu gp, gp, 96
vmadday.xyzw acc, vf21, vf10
sw gp, 72(ra)
vmaddz.xyzw vf10, vf22, vf10
sh s4, 82(ra)
vmulax.xyzw acc, vf20, vf11
lbu s4, 111(ra)
vmadday.xyzw acc, vf21, vf11
lhu gp, 122(ra)
vmaddz.xyzw vf11, vf22, vf11
lbu s3, 115(ra)
# common for 1,2,3 geoms
# Finishes the matrix multiply, stores the 4 matrix rows at 0..48(t8) and advances
# the spr write cursor by 96 bytes (the 6 qw accounted for at L154).
B40:
L157:
vmulax.xyzw acc, vf20, vf12 # matmul 6/16
lq s2, 224(t0) # s2 = upload color 2
vmadday.xyzw acc, vf21, vf12 # matmul 7/16
lq s1, 240(t0) # s1 = upload color ret
vmaddz.xyzw vf12, vf22, vf12 # matmul 8/16 (out vf12)
dsll gp, gp, 4 # base-offset (from base-qw in the prototype)
vmulax.xyzw acc, vf20, vf13 # matmul
daddu s3, s3, ra # s3 = prototype bucket + index zone (not sure what's here yet.)
vmadday.xyzw acc, vf21, vf13 # matmul
sll r0, r0, 0
vmaddaz.xyzw acc, vf22, vf13 # matmul
sll r0, r0, 0
vmaddw.xyzw vf13, vf23, vf0 # matmul (out vf13, we're done)
sll r0, r0, 0
sqc2 vf10, 0(t8) # store matrix
sll r0, r0, 0
sqc2 vf11, 16(t8) # store matrix
movz s2, s1, s5 # if we're the first thing added, we'll be the last in the chain, and should put a upload-color-ret! (s5 = old next; 0 means nothing was added yet)
sqc2 vf12, 32(t8) # store matrix
daddiu t8, t8, 96 # inc SPR ptr.
beq r0, r0, L159
sqc2 vf13, -48(t8) # store matrix (t8 already advanced, so this is offset 48 of the record)
# TIE NEAR DMA generation
# (don't care)
# Same shape as the geom 1/2/3 code plus the common tail, with these differences:
# - the matrix is loaded from work+320..368 into vf24-vf27 instead of using vf20-vf23
# - prototype bucket offsets: count 76, next 60, frag-count 108, base-qw 116, index-start 112
# - it falls through into the color code at L159 instead of branching there
B41:
L158:
sll r0, r0, 0
lqc2 vf24, 320(t0)
sll r0, r0, 0
lqc2 vf25, 336(t0)
sll r0, r0, 0
lqc2 vf26, 352(t0)
sll r0, r0, 0
lqc2 vf27, 368(t0)
sll r0, r0, 0
lh s4, 76(ra)
sll r0, r0, 0
lw s5, 60(ra)
daddiu s4, s4, 1
sqc2 vf28, 64(t8)
vmulax.xyzw acc, vf24, vf10
addiu gp, gp, 96
vmadday.xyzw acc, vf25, vf10
sw gp, 60(ra)
vmaddz.xyzw vf10, vf26, vf10
sh s4, 76(ra)
vmulax.xyzw acc, vf24, vf11
lbu s4, 108(ra)
vmadday.xyzw acc, vf25, vf11
lhu gp, 116(ra)
vmaddz.xyzw vf11, vf26, vf11
lbu s3, 112(ra)
vmulax.xyzw acc, vf24, vf12
lq s2, 224(t0)
vmadday.xyzw acc, vf25, vf12
lq s1, 240(t0)
vmaddz.xyzw vf12, vf26, vf12
dsll gp, gp, 4
vmulax.xyzw acc, vf24, vf13
daddu s3, s3, ra
vmadday.xyzw acc, vf25, vf13
sll r0, r0, 0
vmaddaz.xyzw acc, vf26, vf13
sll r0, r0, 0
vmaddw.xyzw vf13, vf27, vf0
sll r0, r0, 0
sqc2 vf10, 0(t8)
sll r0, r0, 0
sqc2 vf11, 16(t8)
sll r0, r0, 0
sqc2 vf12, 32(t8)
movz s2, s1, s5
sqc2 vf13, 48(t8)
daddiu t8, t8, 96
# And Back to common non-generic TIE.
# it is time for colors. This is a loop over fragments
# s2 = tag2
# s4 = frag-count 1
# gp = base-qw * 16
# s3 = prototype + index-start
# s5 = the previous "next" pointer of the chain this instance was added to
# This block emits the first fragment's 3 quadwords (upload color 0, 1 and 2/ret);
# any remaining fragments are handled by the loop at L160.
B42:
L159:
sll r0, r0, 0
lw ra, 8(t4) # ra = color-indices
sll r0, r0, 0
sq s2, 256(t0) # upload color temp (either ret or 2, we'll refer to it as 2)
sll r0, r0, 0
lbu s2, 144(s3) # load the first index (actually counts)
addu s1, gp, ra # s1 = color-indices + base-offset
sw s5, 260(t0) # store the address of next in the colors2 tag (doesn't do anything if ret)
daddiu t9, t9, 3 # another 3 qw for the colors.
sw s1, 212(t0) # color + base goes in colors 1
sll s1, s2, 2 # s1 = index * 4
sh s2, 208(t0) # color1's qwc = *index
sll s2, s2, 4 # s2 = index * 16
sb s1, 222(t0) # set something in the vif tag of upload color 1
daddu gp, gp, s2 # advance our colors ptrs
lq s2, 192(t0) # s2 = upload color 0
daddiu s5, s5, 48 # inc next
lq s1, 208(t0) # s1 = upload color 1
daddiu t8, t8, 48 # inc output
lq s0, 256(t0) # s0 = upload temp (2 or ret)
daddiu s3, s3, 1 # advance to the next fragment's count byte (s3 = prototype + index, not an instance ptr)
sq s2, -48(t8) # upload color 0 (seems to be, a constant?)
daddiu s4, s4, -1 # decrement frag count.
sq s1, -32(t8) # upload color 1
blez s4, L172 # did we run out of fragments?
sq s0, -16(t8) # upload color 2
# top of fragment loop
# Each additional fragment needs 3 more quadwords. If more than 252 are already used,
# the output buffer is flushed to the dma buffer first (same sequence as the
# per-instance flush: wait for SPR FROM, start the transfer, toggle a1, reset t8/t9).
B43:
L160:
daddiu s2, t9, -252 # did we run out of room in the scratchpad?
sll r0, r0, 0
blez s2, L163
sll r0, r0, 0
# swap output buffer
B44:
L161:
lw t8, 0(a3)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t8, t8, 256
sll r0, r0, 0
beq t8, r0, L162
sll r0, r0, 0
B45:
sll r0, r0, 0
lw t8, 440(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu t8, t8, 1
sll r0, r0, 0
sw t8, 440(t0)
beq r0, r0, L161
sll r0, r0, 0
B46:
L162:
sw a1, 128(a3)
xori a1, a1, 12288
sw v1, 16(a3)
sll t8, t9, 4
addu v1, v1, t8
or t8, a1, r0
sw t9, 32(a3)
addiu t9, r0, 256
sw t9, 0(a3)
addiu t9, r0, 0
# here we have an output buffer that has room for another fragment.
# and add another fragment.
B47:
L163:
sll r0, r0, 0
lbu s2, 144(s3) # load next index
addu s1, gp, ra # s1 = color-indices + base-offset
sw s5, 260(t0) # store the address of next in the colors2 tag (doesn't do anything if ret)
daddiu t9, t9, 3 # another 3 qw for the colors.
sw s1, 212(t0) # color + base goes in colors 1
sll s1, s2, 2 # s1 = index * 4
sh s2, 208(t0) # color1's qwc = *index
sll s2, s2, 4 # s2 = index * 16
sb s1, 222(t0) # set something in the vif tag of upload color 1
daddu gp, gp, s2 # advance our colors ptrs
# same as B42
lq s2, 192(t0)
daddiu s5, s5, 48
lq s1, 208(t0)
daddiu t8, t8, 48
lq s0, 256(t0)
daddiu s3, s3, 1
sq s2, -48(t8)
daddiu s4, s4, -1
sq s1, -32(t8)
bgtz s4, L160 # except we keep looping if there are fragments left.
sq s0, -16(t8)
B48:
beq r0, r0, L172
sll r0, r0, 0
# s4 != 0 version
# I think, for the GENERIC renderer.
# wtf there's a square root...
# As far as the instructions show (NOTE(review): confirm against VU macro-op semantics):
#   d = sqrt(vf6.x^2 + vf6.y^2 + vf6.z^2)
#   x = (vf9.y + vf2.w - d) * vf9.x, clamped to [0, vf3.y] (vf3.y is the 128. constant)
# where vf9 is loaded from 124(ra) in the prototype bucket. If the low 8 bits of the
# integer result are zero we go back to the normal (non-generic) path at L154.
B49:
L164:
vmul.xyz vf16, vf6, vf6
sll r0, r0, 0
sll r0, r0, 0
lqc2 vf9, 124(ra)
vsubw.w vf10, vf10, vf10
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
vadday.x acc, vf16, vf16
sll r0, r0, 0
vmaddz.x vf16, vf1, vf16
sll r0, r0, 0
vsqrt Q, vf16.x
sll r0, r0, 0
vmulay.xyzw acc, vf1, vf9
sll r0, r0, 0
vmaddaw.xyzw acc, vf1, vf2
sll r0, r0, 0
sll r0, r0, 0
vwaitq
vmsubq.xyzw vf16, vf1, Q
sll r0, r0, 0
vmulx.xyzw vf16, vf16, vf9
sll r0, r0, 0
vmaxx.x vf16, vf16, vf0
sll r0, r0, 0
vminiy.x vf16, vf16, vf3
sll r0, r0, 0
vftoi0.xyzw vf16, vf16
sll r0, r0, 0
sll r0, r0, 0
qmfc2.i s5, vf16
sll r0, r0, 0
sll r0, r0, 0
andi s5, s5, 255
sll r0, r0, 0
beq s5, r0, L154
sll r0, r0, 0
# Generic path, taken when the value computed above (s5) is nonzero.
# Records flag2 in work+432 (the flags word cleared at init), stores s5 at 80(t8),
# then picks geom 1/2/3 with the same dist_test_w / dist_test_z checks as L154.
# Differences from the non-generic geom code:
# - prototype bucket offsets: count 86/88/90, next 96/100/104
# - the matrix rows are multiplied by vf24/vf25/vf26 (vf27 in the tail) instead of vf20-vf23
# - register roles are swapped: s3 = frag-count (109/110/111), s4 = index-start (113/114/115)
B50:
vcallms 29
sw s4, 432(t0)
sll r0, r0, 0
lw s4, 108(t0)
addiu t9, t9, 6
lw s3, 104(t0)
sll r0, r0, 0
sw s5, 80(t8)
bgtz s4, L166
sll r0, r0, 0
B51:
bgtz s3, L165
sll r0, r0, 0
B52:
sll r0, r0, 0
lh s4, 86(ra)
sll r0, r0, 0
lw s5, 96(ra)
daddiu s4, s4, 1
sqc2 vf28, 64(t8)
vmulax.xyzw acc, vf24, vf10
addiu gp, gp, 96
vmadday.xyzw acc, vf25, vf10
sw gp, 96(ra)
vmaddz.xyzw vf10, vf26, vf10
sh s4, 86(ra)
vmulax.xyzw acc, vf24, vf11
lbu s3, 109(ra)
vmadday.xyzw acc, vf25, vf11
lhu gp, 118(ra)
vmaddz.xyzw vf11, vf26, vf11
lbu s4, 113(ra)
beq r0, r0, L167
sll r0, r0, 0
B53:
L165:
sll r0, r0, 0
lh s4, 88(ra)
sll r0, r0, 0
lw s5, 100(ra)
daddiu s4, s4, 1
sqc2 vf29, 64(t8)
vmulax.xyzw acc, vf24, vf10
addiu gp, gp, 96
vmadday.xyzw acc, vf25, vf10
sw gp, 100(ra)
vmaddz.xyzw vf10, vf26, vf10
sh s4, 88(ra)
vmulax.xyzw acc, vf24, vf11
lbu s3, 110(ra)
vmadday.xyzw acc, vf25, vf11
lhu gp, 120(ra)
vmaddz.xyzw vf11, vf26, vf11
lbu s4, 114(ra)
beq r0, r0, L167
sll r0, r0, 0
B54:
L166:
sll r0, r0, 0
lh s4, 90(ra)
sll r0, r0, 0
lw s5, 104(ra)
daddiu s4, s4, 1
sqc2 vf30, 64(t8)
vmulax.xyzw acc, vf24, vf10
addiu gp, gp, 96
vmadday.xyzw acc, vf25, vf10
sw gp, 104(ra)
vmaddz.xyzw vf10, vf26, vf10
sh s4, 90(ra)
vmulax.xyzw acc, vf24, vf11
lbu s3, 111(ra)
vmadday.xyzw acc, vf25, vf11
lhu gp, 122(ra)
vmaddz.xyzw vf11, vf26, vf11
lbu s4, 115(ra)
# Common tail for the generic geoms, followed by the generic color fragment loop.
# Mirrors the non-generic matrix store + color code, with these differences:
# - s3 is the remaining fragment count and s4 is the prototype + index pointer
# - the tags come from work+272 / +288 / +304, with next stored at work+284 and the
#   color address at work+292
# - there is no byte store into the tag like the non-generic "sb s1, 222(t0)"
B55:
L167:
vmulax.xyzw acc, vf24, vf12
dsll gp, gp, 4 # gp = base-qw * 16
vmadday.xyzw acc, vf25, vf12
daddu s4, s4, ra # s4 = prototype bucket + index-start
vmaddz.xyzw vf12, vf26, vf12
sll r0, r0, 0
vmulax.xyzw acc, vf24, vf13
sll r0, r0, 0
vmadday.xyzw acc, vf25, vf13
sll r0, r0, 0
vmaddaz.xyzw acc, vf26, vf13
sll r0, r0, 0
vmaddw.xyzw vf13, vf27, vf0
sll r0, r0, 0
sqc2 vf10, 0(t8)
sll r0, r0, 0
sqc2 vf11, 16(t8)
sll r0, r0, 0
sqc2 vf12, 32(t8)
sll r0, r0, 0
sqc2 vf13, 48(t8)
daddiu t8, t8, 96
sll r0, r0, 0
lw ra, 8(t4) # ra = color-indices of the instance (ra no longer points at the prototype)
sll r0, r0, 0
lbu s2, 144(s4) # first fragment's count
addu s1, gp, ra
sw s5, 284(t0)
daddiu t9, t9, 3 # 3 more qw for this fragment's tags
sw s1, 292(t0)
sll s1, s2, 4
sh s2, 288(t0)
daddu gp, gp, s1
lq s2, 272(t0)
daddiu s5, s5, 48
lq s1, 288(t0)
daddiu t8, t8, 48
lq s0, 304(t0)
daddiu s4, s4, 1
sq s2, -48(t8)
daddiu s3, s3, -1 # decrement remaining fragment count
sq s1, -32(t8)
blez s3, L172 # done if that was the only fragment
sq s0, -16(t8)
# top of the generic fragment loop: flush the output buffer first if t9 > 252.
B56:
L168:
daddiu s2, t9, -252
sll r0, r0, 0
blez s2, L171
sll r0, r0, 0
B57:
L169:
lw t8, 0(a3)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t8, t8, 256
sll r0, r0, 0
beq t8, r0, L170
sll r0, r0, 0
B58:
sll r0, r0, 0
lw t8, 440(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu t8, t8, 1
sll r0, r0, 0
sw t8, 440(t0)
beq r0, r0, L169
sll r0, r0, 0
B59:
L170:
sw a1, 128(a3)
xori a1, a1, 12288
sw v1, 16(a3)
sll t8, t9, 4
addu v1, v1, t8
or t8, a1, r0
sw t9, 32(a3)
addiu t9, r0, 256
sw t9, 0(a3)
addiu t9, r0, 0
# add another generic fragment (same stores as the first one above).
B60:
L171:
sll r0, r0, 0
lbu s2, 144(s4)
addu s1, gp, ra
sw s5, 284(t0)
daddiu t9, t9, 3
sw s1, 292(t0)
sll s1, s2, 4
sh s2, 288(t0)
daddu gp, gp, s1
lq s2, 272(t0)
daddiu s5, s5, 48
lq s1, 288(t0)
daddiu t8, t8, 48
lq s0, 304(t0)
daddiu s4, s4, 1
sq s2, -48(t8)
daddiu s3, s3, -1
sq s1, -32(t8)
bgtz s3, L168
sq s0, -16(t8)
B61:
# early exit for 1 in a per-8
# Every per-instance path ends here: move on to the next instance and the next
# (lower) vis bit. When the mask shifts out to zero the group of 8 is done.
L172:
addiu a2, a2, -1 # decrement instance count
srl t7, t7, 1 # update vis mask
daddiu t4, t4, 64 # update instance ptr
sll r0, r0, 0
bne t7, r0, L148 # reloop, if we've got any left in the group of 8.
lqc2 vf2, 16(t4) # load the bsphere.
# early exit for per-8
# Reload the stashed vis byte pointer and end pointer; keep going with the next
# group of 8 until all vis bytes of this bank have been consumed.
B62:
L173:
sll r0, r0, 0
lw ra, 412(t0)
sll r0, r0, 0
lw t7, 416(t0)
bne ra, t7, L146
sll r0, r0, 0
# bank finished: go back to the main loop if there are instances left.
B63:
bgtz a2, L140
sll r0, r0, 0
# All instances processed. If the current output buffer still holds quadwords
# (t9 != 0), send them to the dma buffer with one last SPR FROM transfer.
B64:
beq t9, r0, L176
sll r0, r0, 0
B65:
L174:
lw a0, 0(a3)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi a0, a0, 256
sll r0, r0, 0
beq a0, r0, L175
sll r0, r0, 0
B66:
sll r0, r0, 0
lw a0, 440(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu a0, a0, 1
sll r0, r0, 0
sw a0, 440(t0)
beq r0, r0, L174
sll r0, r0, 0
# Same flush sequence as in the loops, using a0/a1 as temps. The toggled out
# pointer and the zeroed count computed into a0 are not used afterwards.
B67:
L175:
sw a1, 128(a3)
xori a0, a1, 12288
sw v1, 16(a3)
sll a1, t9, 4
addu v1, v1, a1
or a0, a0, r0
sw t9, 32(a3)
addiu a0, r0, 256
sw a0, 0(a3)
addiu a0, r0, 0
# Wait for the last SPR FROM transfer to finish before returning.
B68:
L176:
lw a0, 0(a3)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi a0, a0, 256
sll r0, r0, 0
beq a0, r0, L177
sll r0, r0, 0
B69:
sll r0, r0, 0
lw a0, 440(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu a0, a0, 1
sll r0, r0, 0
sw a0, 440(t0)
beq r0, r0, L176
sll r0, r0, 0
# Final exit
# Write the advanced dma pointer (v1) back into the dma-buffer stashed at work+396,
# return 0 in v0, restore the saved registers and pop the 128-byte frame.
B70:
L177:
lw a0, 396(t0) # a0 = the dma-buffer argument stashed during init
sll r0, r0, 0
sw v1, 4(a0) # update the same field v1 was loaded from at the start
sll r0, r0, 0
or v0, r0, r0 # return value = 0
ld ra, 0(sp)
lq gp, 112(sp)
lq s5, 96(sp)
lq s4, 80(sp)
lq s3, 64(sp)
lq s2, 48(sp)
lq s1, 32(sp)
lq s0, 16(sp)
jr ra
daddiu sp, sp, 128 # delay slot: pop the stack frame
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; .function draw-inline-array-prototype-tie-asm
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;BAD PROLOGUE
;; Warnings:
;; INFO: Flagged as asm by config
;; INFO: Assembly Function
# runs after the instance drawing.
# a0 - dma buffer
# a1 - num prototypes
# a2 - prototype array (a prototype-array-tie)
# t0 = work
# t1 = SPR from DMA bank
# t3 = SPR ptr to input buffer
#
B0:
L84:
daddiu sp, sp, -112
sd ra, 0(sp)
sq s1, 16(sp)
sq s2, 32(sp)
sq s3, 48(sp)
sq s4, 64(sp)
sq s5, 80(sp)
sq gp, 96(sp)
sll r0, r0, 0
# SETUP
lui a3, 28672 # = 0x7000
lw v1, 4(a0) # = dma buf base
lui t1, 4096 # = 0x1000
lui t2, 4096 # = 0x1000
sync.l
cache dxwbin v1, 0
sync.l
cache dxwbin v1, 1
sync.l
lw t0, *prototype-tie-work*(s7)
ori t1, t1, 53248 # spr FROM
ori t4, t2, 54272 # spr TO
ori t3, a3, 16 # setup spr input buffer pointer
ori t2, a3, 2064 # spr output buffer pointer
sw a0, 10260(a3) # stash dma buffer in spad (dma-buffer of prototype-tie-dma)
daddiu t7, a1, -1 # not sure, is the input data length off by one or something?
sll r0, r0, 0
lw t5, 12(a2) # t5 = prototype-bucket-tie
addiu a0, r0, 0 # a0 = 0 (offset into the scratchpad's output offset)
or a1, t2, r0 # a1 = spr output
B1:
L85:
sll r0, r0, 0
lq t6, 60(t5) # load next's for the bucket (start of dma chains per geom, in 0, 1, 2, 3 order)
daddiu t8, a2, 4 # t8 = array pointer (array prottype-bucket-tie), load at +12.
sw t7, 10256(a3) # stash length in spad
dsrl32 a2, t6, 0 # a2 = 1's next
sw t8, 280(t0) # stash prototype-array
pcpyud t8, t6, t6 # [2,3]
lw t7, 140(t5) # tie colors
or t8, a2, t8 # t8 = any next's nonzero?
lw a2, 108(t5) # a2 = frag counts array
beq t8, r0, L99 # skip ahead if 0 prototypes drawn.
lw t8, 4(a3) # load mood off of the terrain-context (spr)
B2:
sll r0, r0, 0
lq t5, 12(t5) # load geom's from bucket
sll r0, r0, 0
sq t6, 10272(a3) # stash next's on spad
sll r0, r0, 0
sw a2, 10304(a3) # stash frag counts on spad
sll r0, r0, 0
sq t5, 10288(a3) # stash geom's on spad
sll r0, r0, 0
ld a2, 272(t0) # clamp constant: #x0080'00ff'00ff'00ff (for time of day color interp)
sll r0, r0, 0
lw t6, 4(t7) # t6 = time of day palette's height
daddiu ra, t7, 12 # ra = time of day palette's data
lq t5, 1852(t8) # t5 = itimes0 from mood
sra t7, t6, 2 # t7 = width / 4
sll r0, r0, 0
addu t7, t7, a0 # some spad out buffer ptr, or something.
addiu t9, r0, 221 # t9 = 221 (I think the max number of qws we can safely have used after colors)
dsubu t7, t9, t7
sll r0, r0, 0
bgez t7, L88 # see if we have too much crap in the DMA output buffer
sll r0, r0, 0
# dma output full, copy from scratchpad.
B3:
L86:
lw a1, 0(t1)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi a1, a1, 256
sll r0, r0, 0
beq a1, r0, L87
sll r0, r0, 0
B4:
sll r0, r0, 0
lw a1, 292(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu a1, a1, 1
sll r0, r0, 0
sw a1, 292(t0)
beq r0, r0, L86
sll r0, r0, 0
B5:
L87:
sw t2, 128(t1)
xori t2, t2, 4096
sw v1, 16(t1)
sll a1, a0, 4
addu v1, v1, a1
or a1, t2, r0
sw a0, 32(t1)
addiu a0, r0, 256
sw a0, 0(t1)
addiu a0, r0, 0
# here it is safe to output colors to the current scratchpad
# output buffer.
B6:
L88:
addiu t7, t6, 31 # width + 31
lq t6, 0(t0) # upload palette 0
sra t7, t7, 5 # (width + 31) / 5
addiu a0, a0, 2 # 2 qw's, I guess for upload palette 0 and upload palette 1
sll t9, t7, 5 # ((width + 31) / 5) * 5
sq t6, 0(a1) # store upload palette 0
sra t6, t9, 2 # aligned width / 4
sb t9, 30(t0) # probably the viftag's unpack count, or something.
addu a0, a0, t6 # going to use up (aligned width / 4) qw's in our dma buffer.
sh t6, 16(t0) # also put this in the giftag for upload1
sll r0, r0, 0
lq t6, 1868(t8) # t6 = itimes1
sll r0, r0, 0
lq gp, 16(t0) # gp = upload1's tag
sll r0, r0, 0
lq t7, 1884(t8) # t7 = itimes2
sll r0, r0, 0
sq gp, 16(a1) # store upload palette 1
addiu a1, a1, 32 # advance output pointer (spr)
lq t8, 1900(t8) # t8 = itimes3
# begin color stuff
# Streams the source color data into scratchpad in 64-qw (1024-byte) pieces through the
# DMA channel at t4, alternating between two buffers (t3 ^ 1024), and blends one piece
# while the next one is being transferred. Every 32 input bytes produce one 4-byte output
# word, so each piece yields 32 output words (128 bytes at a1).
# Register roles, as far as this block shows:
#   t4    = DMA channel base (same +0/+16/+32/+128 layout as the output channel;
#           NOTE(review): set up before this block - confirm)
#   ra    = main-memory source address, advanced by 1024 per transfer (a plain pointer here)
#   t3    = scratchpad buffer the next transfer lands in
#   s5    = scratchpad buffer currently being read
#   t9    = aligned color count, stepped down by 32 per piece
#   t5-t8 = the four weight quadwords (t6-t8 are itimes1-3; t5 is presumably itimes0)
#   a2    = per-halfword upper clamp used by pminh (set before this block)
#   a1    = output write pointer
# "mfc1 r0, f31" targets r0 and so changes no register; presumably pipeline padding like
# the "sll r0, r0, 0" nops.
B7:
L89:
lw gp, 0(t4) # poll the channel: spin until bit 0x100 at +0 is clear
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi gp, gp, 256
sll r0, r0, 0
beq gp, r0, L90
sll r0, r0, 0
B8:
sll r0, r0, 0
lw gp, 296(t0) # still busy: bump the wait counter at 296(t0) and poll again
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu gp, gp, 1
sll r0, r0, 0
sw gp, 296(t0)
beq r0, r0, L89
sll r0, r0, 0
# start the first transfer
B9:
L90:
sw ra, 16(t4) # +16: source address in main memory
daddiu t9, t9, -32 # account for the 32 colors in this piece
sw t3, 128(t4) # +128: destination scratchpad buffer
addiu gp, r0, 64
sw gp, 32(t4) # +32: 64 quadwords = 1024 bytes
addiu gp, r0, 256
sw gp, 0(t4) # +0: write 0x100 to start
daddiu ra, ra, 1024 # next piece of source data
# top of the per-piece loop
B10:
L91:
or s5, t3, r0 # s5 = buffer of the transfer in flight (processed below once it completes)
xori t3, t3, 1024 # t3 = the other buffer
blez t9, L94 # nothing left to fetch after this piece: only wait for it
daddiu t9, t9, -32 # (delay slot, always executed)
B11:
L92:
lw gp, 0(t4) # wait for the transfer into s5 to finish
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi gp, gp, 256
sll r0, r0, 0
beq gp, r0, L93
sll r0, r0, 0
B12:
sll r0, r0, 0
lw gp, 296(t0) # still busy: count the wait and poll again
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu gp, gp, 1
sll r0, r0, 0
sw gp, 296(t0)
beq r0, r0, L92
sll r0, r0, 0
# start fetching the following piece into the other buffer, then go blend s5
B13:
L93:
sw ra, 16(t4)
sll r0, r0, 0
sw t3, 128(t4)
addiu gp, r0, 64
sw gp, 32(t4)
addiu gp, r0, 256
sw gp, 0(t4)
daddiu ra, ra, 1024
beq r0, r0, L95
sll r0, r0, 0
# last piece: wait for it, no further transfer is started
B14:
L94:
lw gp, 0(t4)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi gp, gp, 256
sll r0, r0, 0
beq gp, r0, L95
sll r0, r0, 0
B15:
sll r0, r0, 0
lw gp, 296(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu gp, gp, 1
sll r0, r0, 0
sw gp, 296(t0)
beq r0, r0, L94
sll r0, r0, 0
# Blend the piece at s5. The loop is software pipelined: this preamble starts the
# multiply-accumulate for the first entry, and each L96 iteration finishes one entry
# while starting the next.
B16:
L95:
addiu gp, a1, 128 # stop after 32 output words
lq s2, 12(s5) # first 16 bytes of the entry
sll r0, r0, 0
lq s4, 28(s5) # second 16 bytes of the entry
pextlb s3, r0, s2 # low 8 bytes of s2, zero-extended to halfwords
mfc1 r0, f31
pextub s2, r0, s2 # high 8 bytes of s2, zero-extended to halfwords
mfc1 r0, f31
pmulth r0, s3, t5 # LO/HI  = bytes 0-7   * t5
mfc1 r0, f31
pextlb s3, r0, s4
mfc1 r0, f31
pmaddh r0, s2, t6 # LO/HI += bytes 8-15  * t6
mfc1 r0, f31
pextub s4, r0, s4
mfc1 r0, f31
pmaddh r0, s3, t7 # LO/HI += bytes 16-23 * t7
lq s3, 44(s5) # first 16 bytes of the next entry
addiu s5, s5, 32 # advance to the next entry
sll r0, r0, 0
pmaddh r0, s4, t8 # LO/HI += bytes 24-31 * t8
lq s4, 28(s5) # second 16 bytes of the next entry
pextlb s2, r0, s3
mfc1 r0, f31
B17:
L96:
pextub s3, r0, s3
mfc1 r0, f31
pmfhl.lh s1 # s1 = low 16 bits of each of the 8 accumulated sums
mfc1 r0, f31
pmulth r0, s2, t5 # start accumulating the next entry
mfc1 r0, f31
psrlh s2, s1, 6 # scale each sum down by 64
mfc1 r0, f31
pcpyud s1, s2, s2 # move the upper 4 halfwords down...
mfc1 r0, f31
paddh s2, s1, s2 # ...and add them to the lower 4
mfc1 r0, f31
pminh s2, s2, a2 # clamp each halfword to a2
mfc1 r0, f31
ppacb s1, r0, s2 # pack halfwords to bytes
mfc1 r0, f31
pextlb s2, r0, s4
mfc1 r0, f31
pmaddh r0, s3, t6
sw s1, 0(a1) # store the 4 result bytes (presumably one rgba color)
pextub s4, r0, s4
mfc1 r0, f31
pmaddh r0, s2, t7
lq s3, 44(s5) # preload the entry after next (on the final iteration this reads past the 32nd entry)
addiu s5, s5, 32
addiu a1, a1, 4 # advance output pointer
pmaddh r0, s4, t8
lq s4, 28(s5)
bne a1, gp, L96
pextlb s2, r0, s3 # (delay slot)
B18:
bgez t9, L91 # another piece is pending while t9 >= 0
sll r0, r0, 0
# done with colors
# Up to three (next, geom, frag count) triples are read from a3. For each one whose
# "next" word is nonzero, the subroutine at L100 is called with a2 = next, t6 = geom,
# t5 = frag count. The calls use bgezal, which overwrites ra (ra is only a scratch
# pointer at this point; the real return address is reloaded from the stack at L109).
B19:
sll r0, r0, 0
lw a2, 10276(a3) # next1
sll r0, r0, 0
lw t6, 10292(a3) # geom1
beq a2, r0, L97 # skip the call when next1 is 0
lbu t5, 10305(a3) # frag1
B20:
bgezal r0, L100 # call sub at L100 for adding the geom.
sll r0, r0, 0
B21:
L97:
sll r0, r0, 0
lw a2, 10280(a3) # next2
sll r0, r0, 0
lw t6, 10296(a3) # geom2
beq a2, r0, L98 # skip the call when next2 is 0
lbu t5, 10306(a3) # frag2
B22:
bgezal r0, L100
sll r0, r0, 0
B23:
L98:
sll r0, r0, 0
lw a2, 10284(a3) # next3
sll r0, r0, 0
lw t6, 10300(a3) # geom3
beq a2, r0, L99 # skip the call when next3 is 0
lbu t5, 10307(a3) # frag3
B24:
bgezal r0, L100
sll r0, r0, 0
# early exit if we didn't draw any of this prototype
# The code below reloads a2/t6/t5 and branches back to L85 (outside this excerpt) while
# the word at 10256(a3) is nonzero. Otherwise it falls through to the final flush at L105.
B25:
L99:
sll r0, r0, 0
lw a2, 280(t0)
sll r0, r0, 0
lw t6, 10256(a3)
sll r0, r0, 0
lw t5, 12(a2)
bne t6, r0, L85
daddiu t7, t6, -1 # (delay slot) t7 = t6 - 1
B26:
beq r0, r0, L105
sll r0, r0, 0
# geom upload thing.
# Local subroutine, called with bgezal and returning with jr ra.
# For each fragment it patches the four template quadwords at 32..95(t0) and copies them
# to the scratchpad output buffer, flushing and swapping the buffer first if fewer than
# 4 quadwords are free.
# In:     a2 = next, t6 = geom (the fragments start 32 bytes in), t5 = frag count
#         a0 = quadwords already in the output buffer, a1 = output write pointer
#         t0 = work area, t1 = output DMA channel, t2 = current output buffer, v1 = dma buf ptr
# Out:    a0/a1 (and t2/v1 if a flush happened) updated; a2, t5, t6 advanced
# Clobb:  t7, t8, t9, gp, s4, s5
# The loop test is at the bottom, so one fragment is always emitted, even if t5 is 0 on entry.
B27:
L100:
addiu t6, t6, 32 # offset of first frag within a prototype-tie
sll r0, r0, 0
B28:
L101:
addiu t7, a0, 4 # looks like we're going to use 4 qw's of our buffer
addiu t8, r0, 255 # and we can go all the way to the end this time (last time we did 251)
dsubu t7, t8, t7 # t7 = 255 - (a0 + 4)
lw t8, 0(t6) # t8 = gif ref
bgez t7, L104 # enough room if a0 + 4 <= 255
lhu t7, 30(t6) # (delay slot) t7 = halfword at +30 of the fragment, used below as the gif count
# not enough room: wait for the output channel, send this buffer, and swap
B29:
L102:
lw a1, 0(t1) # spin until bit 0x100 at +0 of the channel is clear
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi a1, a1, 256
sll r0, r0, 0
beq a1, r0, L103
sll r0, r0, 0
B30:
sll r0, r0, 0
lw a1, 292(t0) # still busy: bump the wait counter at 292(t0) and poll again
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu a1, a1, 1
sll r0, r0, 0
sw a1, 292(t0)
beq r0, r0, L102
sll r0, r0, 0
B31:
L103:
sw t2, 128(t1) # scratchpad-side address = buffer we just filled
xori t2, t2, 4096 # switch to the other buffer
sw v1, 16(t1) # main-memory-side address = v1
sll a1, a0, 4 # bytes being sent
addu v1, v1, a1 # advance v1 past them
or a1, t2, r0 # write pointer = start of the new buffer
sw a0, 32(t1) # size in quadwords
addiu a0, r0, 256
sw a0, 0(t1) # write 0x100 to start the transfer
addiu a0, r0, 0 # new buffer is empty
# got enough room, make the output for this frag:
# a2 = next, t6 = tie-fragment, t5 = frag count
B32:
L104:
sll r0, r0, 0
lw s5, 4(t6) # s5 = point ref
sll r0, r0, 0
lhu s4, 28(t6) # s4 = tex-count
sll r0, r0, 0
lhu gp, 32(t6) # gp = vertex count
sll r0, r0, 0
sw t8, 36(t0) # store gif ref ptr in upload model 0
sll r0, r0, 0
sh s4, 32(t0) # store tex count in upload 0
daddiu t9, s4, 16384 # t9 = tex-count + 0x4000, stored at 60(t0) below. Presumably upload 1's unpack destination (= tex-count) with flag bit 14 set - confirm.
sb s4, 46(t0) # store tex count in upload 0's vif tag
dsll s4, s4, 4 # s4 now bytes
sw s5, 68(t0) # point ref in upload 2
daddu t8, t8, s4 # advance gif ref
sh gp, 64(t0) # vertex count in upload 2 dma
dsll gp, gp, 1 # multiply by 2 (I guess unpacks to 2x as big?)
sw a2, 84(t0) # store next in upload 3
sll r0, r0, 0
sb gp, 78(t0) # vertex count * 2 in upload 2 viftag unpack
sll r0, r0, 0
sw t8, 52(t0) # upload 1's address word = gif ref + tex-count * 16 (the data right after what upload 0 covers)
sll r0, r0, 0
sh t7, 48(t0) # upload 1 gets gif-count
dsll t7, t7, 2 # gif-count * 4 (also unpacks)
sh t9, 60(t0) # low halfword of upload 1's last word = tex-count + 0x4000 (see the note at the daddiu above)
sll r0, r0, 0
sb t7, 62(t0) # unpack count for the extra gif stuff.
sll r0, r0, 0
lq t7, 32(t0) # t7 = upload0
sll r0, r0, 0
lq t8, 48(t0) # t8 = upload1
sll r0, r0, 0
lq t9, 64(t0) # t9 = upload2
sll r0, r0, 0
lq gp, 80(t0) # gp = upload3
daddiu a0, a0, 4 # 4 more quadwords used
sq t7, 0(a1)
daddiu t5, t5, -1 # one fewer fragment to go
sq t8, 16(a1)
daddiu a2, a2, 48 # advance next by 48 bytes
sq t9, 32(a1)
sq gp, 48(a1)
daddiu a1, a1, 64 # advance output pointer by the 4 quadwords
bgtz t5, L101
daddiu t6, t6, 64 # (delay slot) next fragment, 64 bytes each
B33:
jr ra
sll r0, r0, 0
# end of geom upload subroutine
# Function exit: send whatever is left in the output buffer, wait for the output channel
# to go idle, write the final output pointer back, and return 0.
B34:
L105:
beq a0, r0, L108 # nothing pending in the output buffer: skip the final send
sll r0, r0, 0
B35:
L106:
lw a1, 0(t1) # spin until bit 0x100 at +0 of the channel is clear
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi a1, a1, 256
sll r0, r0, 0
beq a1, r0, L107
sll r0, r0, 0
B36:
sll r0, r0, 0
lw a1, 292(t0) # still busy: bump the wait counter at 292(t0) and poll again
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu a1, a1, 1
sll r0, r0, 0
sw a1, 292(t0)
beq r0, r0, L106
sll r0, r0, 0
# same send sequence as the other flushes, but without swapping buffers or resetting the
# write pointer, since nothing more will be written
B37:
L107:
sw t2, 128(t1) # scratchpad-side address = current buffer
sll r0, r0, 0
sw v1, 16(t1) # main-memory-side address = v1
sll a1, a0, 4 # bytes being sent
addu v1, v1, a1 # v1 = end of everything written
sll r0, r0, 0
sw a0, 32(t1) # size in quadwords
addiu a0, r0, 256
sw a0, 0(t1) # write 0x100 to start the transfer
sll r0, r0, 0
# wait for the last transfer to complete before returning
B38:
L108:
lw a0, 0(t1)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi a0, a0, 256
sll r0, r0, 0
beq a0, r0, L109
sll r0, r0, 0
B39:
sll r0, r0, 0
lw a0, 292(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
daddiu a0, a0, 1
sll r0, r0, 0
sw a0, 292(t0)
beq r0, r0, L108
sll r0, r0, 0
B40:
L109:
lw a0, 10260(a3) # a0 = pointer saved at 10260(a3) (presumably the output dma buffer object - confirm)
sll r0, r0, 0
sw v1, 4(a0) # write the final output pointer to its +4 field
sll r0, r0, 0
or v0, r0, r0 # return 0
ld ra, 0(sp) # restore the real return address (ra was used as scratch and as the link register for L100)
lq gp, 96(sp) # restore saved registers. NOTE(review): the matching prologue is outside this excerpt.
lq s5, 80(sp)
lq s4, 64(sp)
lq s3, 48(sp)
lq s2, 32(sp)
lq s1, 16(sp)
jr ra
daddiu sp, sp, 112 # (delay slot) release the 112-byte frame
sll r0, r0, 0 # two nops after the return, presumably padding
sll r0, r0, 0