mirror of
https://github.com/open-goal/jak-project.git
synced 2024-10-20 21:27:52 -04:00
678 lines
25 KiB
C++
678 lines
25 KiB
C++
|
|
|
|
#include "background_common.h"
|
|
|
|
#include <immintrin.h>
|
|
|
|
#include "common/util/os.h"
|
|
|
|
#include "game/graphics/opengl_renderer/BucketRenderer.h"
|
|
#include "game/graphics/pipelines/opengl.h"
|
|
|
|
// Configures OpenGL state (depth test, alpha blending, texture wrap/filter, depth write)
// to match a PS2 GS DrawMode, and returns DoubleDraw settings telling the caller whether
// the geometry must be drawn a second time to emulate GS alpha-fail behavior.
//  - mode: the decoded GS draw settings to replicate
//  - tex_unit: GL texture unit to make active (e.g. GL_TEXTURE0)
//  - mipmap: when filtering is enabled, use a mipmapped min filter
DoubleDraw setup_opengl_from_draw_mode(DrawMode mode, u32 tex_unit, bool mipmap) {
  glActiveTexture(tex_unit);

  // depth test: map the GS ZTest function onto the GL depth func.
  // modes not produced by the game hit the ASSERT.
  if (mode.get_zt_enable()) {
    glEnable(GL_DEPTH_TEST);
    switch (mode.get_depth_test()) {
      case GsTest::ZTest::NEVER:
        glDepthFunc(GL_NEVER);
        break;
      case GsTest::ZTest::ALWAYS:
        glDepthFunc(GL_ALWAYS);
        break;
      case GsTest::ZTest::GEQUAL:
        glDepthFunc(GL_GEQUAL);
        break;
      case GsTest::ZTest::GREATER:
        glDepthFunc(GL_GREATER);
        break;
      default:
        ASSERT(false);
    }
  } else {
    glDisable(GL_DEPTH_TEST);
  }

  // alpha blending: translate the GS blend equation (A - B) * C + D into GL blend state.
  if (mode.get_ab_enable() && mode.get_alpha_blend() != DrawMode::AlphaBlend::DISABLED) {
    glEnable(GL_BLEND);
    switch (mode.get_alpha_blend()) {
      case DrawMode::AlphaBlend::SRC_DST_SRC_DST:
        // (Cs - Cd) * As + Cd = standard source-alpha blending
        glBlendEquation(GL_FUNC_ADD);
        glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ZERO);
        break;
      case DrawMode::AlphaBlend::SRC_0_SRC_DST:
        // (Cs - 0) * As + Cd = additive with source alpha
        glBlendEquation(GL_FUNC_ADD);
        glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE, GL_ONE, GL_ZERO);
        break;
      case DrawMode::AlphaBlend::SRC_0_FIX_DST:
        // (Cs - 0) * FIX + Cd, with FIX treated as 1.0 here: plain additive
        glBlendEquation(GL_FUNC_ADD);
        glBlendFuncSeparate(GL_ONE, GL_ONE, GL_ONE, GL_ZERO);
        break;
      case DrawMode::AlphaBlend::SRC_DST_FIX_DST:
        // Cv = (Cs - Cd) * FIX + Cd
        // Cs * FIX * 0.5
        // Cd * FIX * 0.5
        // implemented with a constant blend color of 0.5 on both factors
        glBlendEquation(GL_FUNC_ADD);
        glBlendFuncSeparate(GL_CONSTANT_COLOR, GL_CONSTANT_COLOR, GL_ONE, GL_ZERO);
        glBlendColor(0.5, 0.5, 0.5, 0.5);
        break;
      case DrawMode::AlphaBlend::ZERO_SRC_SRC_DST:
        // (0 - Cs) * As + Cd: subtractive blending via reverse subtract
        glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE, GL_ONE, GL_ZERO);
        glBlendEquation(GL_FUNC_REVERSE_SUBTRACT);
        break;
      default:
        ASSERT(false);
    }
  } else {
    glDisable(GL_BLEND);
  }

  // texture wrap modes (per-axis clamp vs. repeat), applied to the currently bound texture
  if (mode.get_clamp_s_enable()) {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
  } else {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
  }

  if (mode.get_clamp_t_enable()) {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
  } else {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
  }

  // texture filtering: bilinear (optionally trilinear with mipmaps) or nearest
  if (mode.get_filt_enable()) {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER,
                    mipmap ? GL_LINEAR_MIPMAP_LINEAR : GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
  } else {
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
  }

  // for some reason, they set atest NEVER + FB_ONLY to disable depth writes
  bool alpha_hack_to_disable_z_write = false;
  DoubleDraw double_draw;

  // alpha_min becomes the shader's alpha-test threshold (0 = always pass)
  float alpha_min = 0.;
  if (mode.get_at_enable()) {
    switch (mode.get_alpha_test()) {
      case DrawMode::AlphaTest::ALWAYS:
        break;
      case DrawMode::AlphaTest::GEQUAL:
        // GS aref is 0-255-ish; normalized here by 127 to match the shader's alpha range
        alpha_min = mode.get_aref() / 127.f;
        switch (mode.get_alpha_fail()) {
          case GsTest::AlphaFail::KEEP:
            // ok, no need for double draw
            break;
          case GsTest::AlphaFail::FB_ONLY:
            if (mode.get_depth_write_enable()) {
              // darn, we need to draw twice: first pass with depth write,
              // second pass (failing pixels) without depth write
              double_draw.kind = DoubleDrawKind::AFAIL_NO_DEPTH_WRITE;
              double_draw.aref_second = alpha_min;
            } else {
              // depth writes already off, so FB_ONLY is the same as passing everything
              alpha_min = 0.f;
            }
            break;
          default:
            ASSERT(false);
        }
        break;
      case DrawMode::AlphaTest::NEVER:
        if (mode.get_alpha_fail() == GsTest::AlphaFail::FB_ONLY) {
          // the NEVER + FB_ONLY trick mentioned above: draw color but never depth
          alpha_hack_to_disable_z_write = true;
        } else {
          ASSERT(false);
        }
        break;
      default:
        ASSERT(false);
    }
  }

  // depth writes: honor the mode, unless the NEVER+FB_ONLY hack forces them off
  if (mode.get_depth_write_enable() && !alpha_hack_to_disable_z_write) {
    glDepthMask(GL_TRUE);
  } else {
    glDepthMask(GL_FALSE);
  }
  double_draw.aref_first = alpha_min;
  return double_draw;
}
|
|
|
|
// Applies the given draw mode to OpenGL state and uploads the alpha test
// range uniforms for a tfrag-style shader. Returns the DoubleDraw settings
// produced by the mode setup so the caller can issue a second pass if needed.
DoubleDraw setup_tfrag_shader(SharedRenderState* render_state, DrawMode mode, ShaderId shader) {
  const auto settings = setup_opengl_from_draw_mode(mode, GL_TEXTURE0, true);
  const auto program = render_state->shaders[shader].id();
  glUniform1f(glGetUniformLocation(program, "alpha_min"), settings.aref_first);
  glUniform1f(glGetUniformLocation(program, "alpha_max"), 10.f);
  return settings;
}
|
|
|
|
void first_tfrag_draw_setup(const TfragRenderSettings& settings,
|
|
SharedRenderState* render_state,
|
|
ShaderId shader) {
|
|
render_state->shaders[shader].activate();
|
|
auto shid = render_state->shaders[shader].id();
|
|
glUniform1i(glGetUniformLocation(shid, "tex_T0"), 0);
|
|
glUniformMatrix4fv(glGetUniformLocation(shid, "camera"), 1, GL_FALSE,
|
|
settings.math_camera.data());
|
|
glUniform4f(glGetUniformLocation(shid, "hvdf_offset"), settings.hvdf_offset[0],
|
|
settings.hvdf_offset[1], settings.hvdf_offset[2], settings.hvdf_offset[3]);
|
|
glUniform1f(glGetUniformLocation(shid, "fog_constant"), settings.fog.x());
|
|
glUniform1f(glGetUniformLocation(shid, "fog_min"), settings.fog.y());
|
|
glUniform1f(glGetUniformLocation(shid, "fog_max"), settings.fog.z());
|
|
glUniform4f(glGetUniformLocation(shid, "fog_color"), render_state->fog_color[0] / 255.f,
|
|
render_state->fog_color[1] / 255.f, render_state->fog_color[2] / 255.f,
|
|
render_state->fog_intensity / 255);
|
|
}
|
|
|
|
void interp_time_of_day_slow(const float weights[8],
|
|
const std::vector<tfrag3::TimeOfDayColor>& in,
|
|
math::Vector<u8, 4>* out) {
|
|
// Timer interp_timer;
|
|
for (size_t color = 0; color < in.size(); color++) {
|
|
math::Vector4f result = math::Vector4f::zero();
|
|
for (int component = 0; component < 8; component++) {
|
|
result += in[color].rgba[component].cast<float>() * weights[component];
|
|
}
|
|
result[0] = std::min(result[0], 255.f);
|
|
result[1] = std::min(result[1], 255.f);
|
|
result[2] = std::min(result[2], 255.f);
|
|
result[3] = std::min(result[3], 128.f); // note: different for alpha!
|
|
out[color] = result.cast<u8>();
|
|
}
|
|
}
|
|
|
|
// we want to absolutely minimize the number of times we have to "cross lanes" in AVX (meaning X
|
|
// component of one vector interacts with Y component of another). We can make this a lot better by
|
|
// taking groups of 4 time of day colors (each containing 8x RGBAs) and rearranging them with this
|
|
// pattern. We want to compute:
|
|
// [rgba][0][0] * weights[0] + [rgba][0][1] * weights[1] + [rgba][0][2]... + rgba[0][7] * weights[7]
|
|
// RGBA is already a vector of 4 components, but with AVX we have vectors with 32 bytes which fit
|
|
// 16 colors in them.
|
|
|
|
// This makes each vector have:
|
|
// colors0 = [rgba][0][0], [rgba][1][0], [rgba][2][0], [rgba][3][0]
|
|
// colors1 = [rgba][0][1], [rgba][1][1], [rgba][2][1], [rgba][3][1]
|
|
// ...
|
|
// so we can basically add up the columns (multiplying by weights in between)
|
|
// and we'll end up with [final0, final1, final2, final3]
|
|
|
|
// the swizzle function below rearranges to get this pattern.
|
|
// it's not the most efficient way to do it, but it just runs during loading and not on every frame.
|
|
|
|
// Rearranges time-of-day palette data into the SIMD-friendly layout described
// above: colors are processed in quads of 4, and within each 128-byte quad the
// bytes are ordered [component][color][channel] so the fast interpolators can
// add columns without crossing lanes. Colors past the end of the input pad
// out with zeros. Runs at load time only, so clarity beats speed here.
SwizzledTimeOfDay swizzle_time_of_day(const std::vector<tfrag3::TimeOfDayColor>& in) {
  SwizzledTimeOfDay out;
  out.data.resize((in.size() + 3) * 8 * 4);

  // one quad = 4 colors x 8 components x 4 channels = 128 bytes
  const u32 num_quads = u32((in.size() + 3) / 4);
  for (u32 quad = 0; quad < num_quads; quad++) {
    u8* dst = out.data.data() + quad * 128;
    for (u32 component = 0; component < 8; component++) {
      for (u32 color = 0; color < 4; color++) {
        const size_t src = quad * 4 + color;
        for (u32 channel = 0; channel < 4; channel++) {
          *dst = (src < in.size()) ? in.at(src).rgba[component][channel] : u8(0);
          dst++;
        }
      }
    }
  }
  // color count, rounded up to a whole quad
  out.color_count = (in.size() + 3) & (~3);
  return out;
}
|
|
|
|
// This does the same thing as interp_time_of_day_slow, but is faster.
|
|
// Due to using integers instead of floats, it may be a tiny bit different.
|
|
// TODO: it might be possible to reorder the loop into two blocks of loads and avoid spilling xmms.
|
|
// It's ~8x faster than the slow version.
|
|
|
|
// AVX2 implementation of time-of-day interpolation over swizzled palette data.
// Processes one 128-byte quad (4 colors) per iteration: each __m256i holds
// 16x 16-bit values = 4 colors x RGBA for one of the 8 palette components.
void interp_time_of_day_fast_avx2(const float weights[8],
                                  const SwizzledTimeOfDay& in,
                                  math::Vector<u8, 4>* out) {
  // even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
#ifdef __AVX2__
  // weight multipliers, converted to fixed point by scaling with 64 (2^6)
  __m256i weights0 = _mm256_set1_epi16(weights[0] * 64.f);
  __m256i weights1 = _mm256_set1_epi16(weights[1] * 64.f);
  __m256i weights2 = _mm256_set1_epi16(weights[2] * 64.f);
  __m256i weights3 = _mm256_set1_epi16(weights[3] * 64.f);
  __m256i weights4 = _mm256_set1_epi16(weights[4] * 64.f);
  __m256i weights5 = _mm256_set1_epi16(weights[5] * 64.f);
  __m256i weights6 = _mm256_set1_epi16(weights[6] * 64.f);
  __m256i weights7 = _mm256_set1_epi16(weights[7] * 64.f);

  // saturation: note that alpha is saturated to 128 but the rest are 255.
  // TODO: maybe we should saturate to 255 for everybody (can do this using a single packus) and
  // change the shader to deal with this.
  __m256i sat = _mm256_set_epi16(128, 255, 255, 255, 128, 255, 255, 255, 128, 255, 255, 255, 128,
                                 255, 255, 255);

  for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
    // first, load colors. We put 16 bytes / register and don't touch the upper half because we
    // convert u8s to u16s.
    const u8* base = in.data.data() + color_quad * 128;
    __m128i color0_p = _mm_loadu_si128((const __m128i*)(base + 0));
    __m128i color1_p = _mm_loadu_si128((const __m128i*)(base + 16));
    __m128i color2_p = _mm_loadu_si128((const __m128i*)(base + 32));
    __m128i color3_p = _mm_loadu_si128((const __m128i*)(base + 48));
    __m128i color4_p = _mm_loadu_si128((const __m128i*)(base + 64));
    __m128i color5_p = _mm_loadu_si128((const __m128i*)(base + 80));
    __m128i color6_p = _mm_loadu_si128((const __m128i*)(base + 96));
    __m128i color7_p = _mm_loadu_si128((const __m128i*)(base + 112));

    // zero-extend to 16 bits: each 256-bit register now holds 16x 16-bit channel values.
    __m256i color0 = _mm256_cvtepu8_epi16(color0_p);
    __m256i color1 = _mm256_cvtepu8_epi16(color1_p);
    __m256i color2 = _mm256_cvtepu8_epi16(color2_p);
    __m256i color3 = _mm256_cvtepu8_epi16(color3_p);
    __m256i color4 = _mm256_cvtepu8_epi16(color4_p);
    __m256i color5 = _mm256_cvtepu8_epi16(color5_p);
    __m256i color6 = _mm256_cvtepu8_epi16(color6_p);
    __m256i color7 = _mm256_cvtepu8_epi16(color7_p);

    // multiply by fixed-point weights
    color0 = _mm256_mullo_epi16(color0, weights0);
    color1 = _mm256_mullo_epi16(color1, weights1);
    color2 = _mm256_mullo_epi16(color2, weights2);
    color3 = _mm256_mullo_epi16(color3, weights3);
    color4 = _mm256_mullo_epi16(color4, weights4);
    color5 = _mm256_mullo_epi16(color5, weights5);
    color6 = _mm256_mullo_epi16(color6, weights6);
    color7 = _mm256_mullo_epi16(color7, weights7);

    // add as a balanced tree. This order minimizes dependencies.
    color0 = _mm256_add_epi16(color0, color1);
    color2 = _mm256_add_epi16(color2, color3);
    color4 = _mm256_add_epi16(color4, color5);
    color6 = _mm256_add_epi16(color6, color7);

    color0 = _mm256_add_epi16(color0, color2);
    color4 = _mm256_add_epi16(color4, color6);

    color0 = _mm256_add_epi16(color0, color4);

    // shift right by 6, undoing the 64x (2^6) weight scaling above.
    color0 = _mm256_srli_epi16(color0, 6);

    // saturate (255 for rgb, 128 for alpha)
    color0 = _mm256_min_epu16(sat, color0);

    // narrow back to u8s: pack the two 128-bit halves into one register.
    auto hi = _mm256_extracti128_si256(color0, 1);
    auto result = _mm_packus_epi16(_mm256_castsi256_si128(color0), hi);

    // store 4 output colors (16 bytes)
    _mm_storeu_si128((__m128i*)(&out[color_quad * 4]), result);
  }
#else
  // unreachable: callers only dispatch here when AVX2 is available at runtime,
  // and this branch only exists when the compiler can't emit AVX2 at all.
  (void)weights;
  (void)in;
  (void)out;
  ASSERT(false);
#endif
}
|
|
|
|
// SSE fallback for time-of-day interpolation (dispatches to the AVX2 version
// when the CPU supports it). With only 128-bit registers, each 4-color quad is
// processed in two halves: colors 0-1 first, then colors 2-3 at a +8 byte
// offset into each 16-byte component row.
void interp_time_of_day_fast(const float weights[8],
                             const SwizzledTimeOfDay& in,
                             math::Vector<u8, 4>* out) {
  // even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
  if (get_cpu_info().has_avx2) {
    interp_time_of_day_fast_avx2(weights, in, out);
    return;
  }

  // weight multipliers, converted to fixed point by scaling with 64 (2^6)
  __m128i weights0 = _mm_set1_epi16(weights[0] * 64.f);
  __m128i weights1 = _mm_set1_epi16(weights[1] * 64.f);
  __m128i weights2 = _mm_set1_epi16(weights[2] * 64.f);
  __m128i weights3 = _mm_set1_epi16(weights[3] * 64.f);
  __m128i weights4 = _mm_set1_epi16(weights[4] * 64.f);
  __m128i weights5 = _mm_set1_epi16(weights[5] * 64.f);
  __m128i weights6 = _mm_set1_epi16(weights[6] * 64.f);
  __m128i weights7 = _mm_set1_epi16(weights[7] * 64.f);

  // saturation: note that alpha is saturated to 128 but the rest are 255.
  // TODO: maybe we should saturate to 255 for everybody (can do this using a single packus) and
  // change the shader to deal with this.
  __m128i sat = _mm_set_epi16(128, 255, 255, 255, 128, 255, 255, 255);

  for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
    // first half of the quad: colors 0 and 1.
    // each load grabs 8 bytes (2 colors x RGBA) of one component row.
    {
      const u8* base = in.data.data() + color_quad * 128;
      __m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
      __m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
      __m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
      __m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
      __m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
      __m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
      __m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
      __m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));

      // zero-extend to 16 bits: each register now holds 8x 16-bit channel values.
      __m128i color0 = _mm_cvtepu8_epi16(color0_p);
      __m128i color1 = _mm_cvtepu8_epi16(color1_p);
      __m128i color2 = _mm_cvtepu8_epi16(color2_p);
      __m128i color3 = _mm_cvtepu8_epi16(color3_p);
      __m128i color4 = _mm_cvtepu8_epi16(color4_p);
      __m128i color5 = _mm_cvtepu8_epi16(color5_p);
      __m128i color6 = _mm_cvtepu8_epi16(color6_p);
      __m128i color7 = _mm_cvtepu8_epi16(color7_p);

      // multiply by fixed-point weights
      color0 = _mm_mullo_epi16(color0, weights0);
      color1 = _mm_mullo_epi16(color1, weights1);
      color2 = _mm_mullo_epi16(color2, weights2);
      color3 = _mm_mullo_epi16(color3, weights3);
      color4 = _mm_mullo_epi16(color4, weights4);
      color5 = _mm_mullo_epi16(color5, weights5);
      color6 = _mm_mullo_epi16(color6, weights6);
      color7 = _mm_mullo_epi16(color7, weights7);

      // add as a balanced tree. This order minimizes dependencies.
      color0 = _mm_add_epi16(color0, color1);
      color2 = _mm_add_epi16(color2, color3);
      color4 = _mm_add_epi16(color4, color5);
      color6 = _mm_add_epi16(color6, color7);

      color0 = _mm_add_epi16(color0, color2);
      color4 = _mm_add_epi16(color4, color6);

      color0 = _mm_add_epi16(color0, color4);

      // shift right by 6, undoing the 64x (2^6) weight scaling above.
      color0 = _mm_srli_epi16(color0, 6);

      // saturate (255 for rgb, 128 for alpha)
      color0 = _mm_min_epu16(sat, color0);

      // narrow back to u8s (only the low 8 bytes matter).
      auto result = _mm_packus_epi16(color0, color0);

      // store 2 output colors (8 bytes)
      _mm_storel_epi64((__m128i*)(&out[color_quad * 4]), result);
    }

    // second half of the quad: colors 2 and 3, 8 bytes into each component row.
    {
      const u8* base = in.data.data() + color_quad * 128 + 8;
      __m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
      __m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
      __m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
      __m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
      __m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
      __m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
      __m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
      __m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));

      // zero-extend to 16 bits: each register now holds 8x 16-bit channel values.
      __m128i color0 = _mm_cvtepu8_epi16(color0_p);
      __m128i color1 = _mm_cvtepu8_epi16(color1_p);
      __m128i color2 = _mm_cvtepu8_epi16(color2_p);
      __m128i color3 = _mm_cvtepu8_epi16(color3_p);
      __m128i color4 = _mm_cvtepu8_epi16(color4_p);
      __m128i color5 = _mm_cvtepu8_epi16(color5_p);
      __m128i color6 = _mm_cvtepu8_epi16(color6_p);
      __m128i color7 = _mm_cvtepu8_epi16(color7_p);

      // multiply by fixed-point weights
      color0 = _mm_mullo_epi16(color0, weights0);
      color1 = _mm_mullo_epi16(color1, weights1);
      color2 = _mm_mullo_epi16(color2, weights2);
      color3 = _mm_mullo_epi16(color3, weights3);
      color4 = _mm_mullo_epi16(color4, weights4);
      color5 = _mm_mullo_epi16(color5, weights5);
      color6 = _mm_mullo_epi16(color6, weights6);
      color7 = _mm_mullo_epi16(color7, weights7);

      // add as a balanced tree. This order minimizes dependencies.
      color0 = _mm_add_epi16(color0, color1);
      color2 = _mm_add_epi16(color2, color3);
      color4 = _mm_add_epi16(color4, color5);
      color6 = _mm_add_epi16(color6, color7);

      color0 = _mm_add_epi16(color0, color2);
      color4 = _mm_add_epi16(color4, color6);

      color0 = _mm_add_epi16(color0, color4);

      // shift right by 6, undoing the 64x (2^6) weight scaling above.
      color0 = _mm_srli_epi16(color0, 6);

      // saturate (255 for rgb, 128 for alpha)
      color0 = _mm_min_epu16(sat, color0);

      // narrow back to u8s (only the low 8 bytes matter).
      auto result = _mm_packus_epi16(color0, color0);

      // store 2 output colors (8 bytes)
      _mm_storel_epi64((__m128i*)(&out[color_quad * 4 + 2]), result);
    }
  }
}
|
|
|
|
// Frustum check: returns true if the bounding sphere (xyz = center, w = radius)
// is at least partially inside all four planes. Plane distances for all four
// planes are computed at once with vector math.
bool sphere_in_view_ref(const math::Vector4f& sphere, const math::Vector4f* planes) {
  const auto dists =
      planes[0] * sphere.x() + planes[1] * sphere.y() + planes[2] * sphere.z() - planes[3];

  const float neg_radius = -sphere.w();
  return dists.x() > neg_radius && dists.y() > neg_radius && dists.z() > neg_radius &&
         dists.w() > neg_radius;
}
|
|
|
|
// this isn't super efficient, but we spend so little time here it's not worth it to go faster.
|
|
// Computes visibility (0/1 per node) for every BVH node: a node is visible if
// its bounding sphere is in the view frustum and, when an occlusion string is
// provided, its occlusion bit is set. Not very fast, but culling is cheap
// relative to everything else.
void cull_check_all_slow(const math::Vector4f* planes,
                         const std::vector<tfrag3::VisNode>& nodes,
                         const u8* level_occlusion_string,
                         u8* out) {
  for (size_t i = 0; i < nodes.size(); i++) {
    const auto& node = nodes[i];
    bool visible = sphere_in_view_ref(node.bsphere, planes);
    if (visible && level_occlusion_string) {
      const u16 id = node.my_id;
      // occlusion bits are packed MSB-first: bit (7 - id % 8) of byte id / 8.
      // an id of 0xffff means "no occlusion data", treated as occluded here.
      visible = id != 0xffff && (level_occlusion_string[id / 8] & (1 << (7 - (id & 7))));
    }
    out[i] = visible;
  }
}
|
|
|
|
void make_all_visible_multidraws(std::pair<int, int>* draw_ptrs_out,
|
|
GLsizei* counts_out,
|
|
void** index_offsets_out,
|
|
const std::vector<tfrag3::ShrubDraw>& draws) {
|
|
u64 md_idx = 0;
|
|
for (size_t i = 0; i < draws.size(); i++) {
|
|
const auto& draw = draws[i];
|
|
u64 iidx = draw.first_index_index;
|
|
std::pair<int, int> ds;
|
|
ds.first = md_idx;
|
|
ds.second = 1;
|
|
counts_out[md_idx] = draw.num_indices;
|
|
index_offsets_out[md_idx] = (void*)(iidx * sizeof(u32));
|
|
md_idx++;
|
|
draw_ptrs_out[i] = ds;
|
|
}
|
|
}
|
|
|
|
// Builds glMultiDrawElements parameter arrays treating every visibility group
// of every strip draw as visible. draw_ptrs_out[i] gets (first entry, entry
// count) for draw i. Returns the total triangle count across all groups.
u32 make_all_visible_multidraws(std::pair<int, int>* draw_ptrs_out,
                                GLsizei* counts_out,
                                void** index_offsets_out,
                                const std::vector<tfrag3::StripDraw>& draws) {
  u64 slot = 0;
  u32 total_tris = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& d = draws[i];
    // groups are stored back-to-back starting at the draw's first index
    u64 first_index = d.unpacked.idx_of_first_idx_in_full_buffer;
    std::pair<int, int> range(int(slot), 0);
    for (const auto& grp : d.vis_groups) {
      // every group gets its own multidraw entry
      counts_out[slot] = grp.num_inds;
      index_offsets_out[slot] = (void*)(first_index * sizeof(u32));
      range.second++;
      slot++;
      total_tris += grp.num_tris;
      first_index += grp.num_inds;
    }
    draw_ptrs_out[i] = range;
  }
  return total_tris;
}
|
|
|
|
// Copies the complete index range of every shrub draw into a packed output
// index buffer. group_out[i] records (start, count) of draw i within the
// output. Returns the total number of indices written.
u32 make_all_visible_index_list(std::pair<int, int>* group_out,
                                u32* idx_out,
                                const std::vector<tfrag3::ShrubDraw>& draws,
                                const u32* idx_in) {
  int write_ptr = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& d = draws[i];
    const int start = write_ptr;
    memcpy(&idx_out[write_ptr], idx_in + d.first_index_index, d.num_indices * sizeof(u32));
    write_ptr += d.num_indices;
    group_out[i] = {start, write_ptr - start};
  }
  return write_ptr;
}
|
|
|
|
// Builds glMultiDrawElements parameter arrays, skipping visibility groups that
// the vis data marks as hidden. draw_ptrs_out[i] gets (first entry, entry
// count) for draw i. Returns the total visible triangle count.
u32 make_multidraws_from_vis_string(std::pair<int, int>* draw_ptrs_out,
                                    GLsizei* counts_out,
                                    void** index_offsets_out,
                                    const std::vector<tfrag3::StripDraw>& draws,
                                    const std::vector<u8>& vis_data) {
  u64 slot = 0;
  u32 total_tris = 0;
  u32 expected_first_index = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& d = draws[i];
    u64 first_index = d.unpacked.idx_of_first_idx_in_full_buffer;
    // draws should be packed back-to-back in the full index buffer
    ASSERT(expected_first_index == first_index);
    std::pair<int, int> range(int(slot), 0);
    for (const auto& grp : d.vis_groups) {
      expected_first_index += grp.num_inds;
      // 0xffffffff means the group has no vis node and is always drawn
      if (grp.vis_idx_in_pc_bvh == 0xffffffff || vis_data[grp.vis_idx_in_pc_bvh]) {
        // visible: emit a multidraw entry for this group
        counts_out[slot] = grp.num_inds;
        index_offsets_out[slot] = (void*)(first_index * sizeof(u32));
        range.second++;
        slot++;
        total_tris += grp.num_tris;
      }
      first_index += grp.num_inds;
    }
    draw_ptrs_out[i] = range;
  }
  return total_tris;
}
|
|
|
|
// Builds a packed index buffer containing only the visibility groups that the
// vis data marks as visible. Consecutive visible groups within a draw are
// coalesced into a single "run" and copied with one memcpy.
// group_out[i] records (start, count) of draw i in the output buffer;
// *num_tris_out receives the visible triangle count.
// Returns the total number of indices written.
u32 make_index_list_from_vis_string(std::pair<int, int>* group_out,
                                    u32* idx_out,
                                    const std::vector<tfrag3::StripDraw>& draws,
                                    const std::vector<u8>& vis_data,
                                    const u32* idx_in,
                                    u32* num_tris_out) {
  int idx_buffer_ptr = 0;  // write position in the output buffer
  u32 num_tris = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& draw = draws[i];
    int vtx_idx = 0;  // read position within this draw's source indices
    std::pair<int, int> ds;
    ds.first = idx_buffer_ptr;
    // run state: a run is a maximal span of consecutive visible groups.
    // while building a run, we only advance the counters; the actual copy
    // happens when the run ends (or at the end of the draw).
    bool building_run = false;
    int run_start_out = 0;  // output position where the current run began
    int run_start_in = 0;   // source position where the current run began
    for (auto& grp : draw.vis_groups) {
      // 0xffffffff means the group has no vis node and is always drawn
      bool vis = grp.vis_idx_in_pc_bvh == 0xffffffff || vis_data[grp.vis_idx_in_pc_bvh];
      if (vis) {
        num_tris += grp.num_tris;
      }

      if (building_run) {
        if (vis) {
          // extend the current run
          idx_buffer_ptr += grp.num_inds;
        } else {
          // run ended: copy the whole run in one shot
          building_run = false;
          memcpy(&idx_out[run_start_out],
                 idx_in + draw.unpacked.idx_of_first_idx_in_full_buffer + run_start_in,
                 (idx_buffer_ptr - run_start_out) * sizeof(u32));
        }
      } else {
        if (vis) {
          // start a new run at the current read/write positions
          building_run = true;
          run_start_out = idx_buffer_ptr;
          run_start_in = vtx_idx;
          idx_buffer_ptr += grp.num_inds;
        }
      }
      vtx_idx += grp.num_inds;
    }

    // flush a run that reached the end of the draw
    if (building_run) {
      memcpy(&idx_out[run_start_out],
             idx_in + draw.unpacked.idx_of_first_idx_in_full_buffer + run_start_in,
             (idx_buffer_ptr - run_start_out) * sizeof(u32));
    }

    ds.second = idx_buffer_ptr - ds.first;
    group_out[i] = ds;
  }
  *num_tris_out = num_tris;
  return idx_buffer_ptr;
}
|
|
|
|
// Copies the complete index range of every strip draw (all visibility groups)
// into a packed output index buffer. group_out[i] records (start, count) of
// draw i within the output; *num_tris_out receives the total triangle count.
// Returns the total number of indices written.
u32 make_all_visible_index_list(std::pair<int, int>* group_out,
                                u32* idx_out,
                                const std::vector<tfrag3::StripDraw>& draws,
                                const u32* idx_in,
                                u32* num_tris_out) {
  int write_ptr = 0;
  u32 total_tris = 0;
  for (size_t i = 0; i < draws.size(); i++) {
    const auto& d = draws[i];
    const int start = write_ptr;
    // groups are contiguous in the source buffer, so sum their sizes and
    // copy the whole draw with a single memcpy.
    u32 total_inds = 0;
    for (const auto& grp : d.vis_groups) {
      total_inds += grp.num_inds;
      total_tris += grp.num_tris;
    }
    memcpy(&idx_out[write_ptr], idx_in + d.unpacked.idx_of_first_idx_in_full_buffer,
           total_inds * sizeof(u32));
    write_ptr += total_inds;
    group_out[i] = {start, write_ptr - start};
  }
  *num_tris_out = total_tris;
  return write_ptr;
}
|
|
|
|
// Latches camera and fog data from the PC port settings into the shared render
// state. Only takes effect while has_pc_data is false; once set, subsequent
// calls are ignored until the flag is cleared elsewhere.
void update_render_state_from_pc_settings(SharedRenderState* state, const TfragPcPortData& data) {
  if (state->has_pc_data) {
    return;
  }
  for (int i = 0; i < 4; i++) {
    state->camera_planes[i] = data.planes[i];
    state->camera_matrix[i] = data.camera[i];
  }
  state->camera_pos = data.cam_trans;
  state->camera_hvdf_off = data.hvdf_off;
  state->camera_fog = data.fog;
  state->has_pc_data = true;
}
|