[graphics] reduce the size of fr3 files (#1175)

* first pass

* first pass at shrinking fr3s

* only need to load vertices once

* avx2 detect and switch

* fix build

* another fix

* one more

* fix the sky and stupid math bug in size check
Commit 5135ea9659 (parent 74d0025974) by water111, 2022-02-16 22:13:18 -05:00, committed by GitHub
20 changed files with 844 additions and 137 deletions


@ -17,9 +17,8 @@ if(MSVC AND (CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))
"-Xclang -fcxx-exceptions \
-Xclang -fexceptions \
-Xclang -std=c++17 \
-mfma -mavx2 \
-Wno-c++11-narrowing -W3 \
/arch:AVX")
-march=native \
-Wno-c++11-narrowing -W3")
# additional c++ flags for release mode for our projects
if(CMAKE_BUILD_TYPE MATCHES "Release")
@ -47,7 +46,7 @@ elseif(UNIX)
-Wshadow \
-Wsign-promo \
-fdiagnostics-color=always \
-march=haswell")
-march=native")
# additional c++ flags for release mode for our projects
if(CMAKE_BUILD_TYPE MATCHES "Release")


@ -3,14 +3,31 @@
namespace tfrag3 {
void PackedTieVertices::serialize(Serializer& ser) {
ser.from_pod_vector(&color_indices);
ser.from_pod_vector(&matrices);
ser.from_pod_vector(&matrix_groups);
ser.from_pod_vector(&vertices);
}
void StripDraw::serialize(Serializer& ser) {
ser.from_ptr(&mode);
ser.from_ptr(&tree_tex_id);
ser.from_pod_vector(&vertex_index_stream);
ser.from_pod_vector(&runs);
ser.from_pod_vector(&vis_groups);
ser.from_ptr(&num_triangles);
}
void StripDraw::unpack() {
ASSERT(unpacked.vertex_index_stream.empty());
for (auto& r : runs) {
for (int i = 0; i < r.length; i++) {
unpacked.vertex_index_stream.push_back(r.vertex0 + i);
}
unpacked.vertex_index_stream.push_back(UINT32_MAX);
}
}
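For reference, a minimal standalone sketch (not part of the commit) of the run-length scheme above: each VertexRun stores only a starting vertex and a length, and unpacking expands it back into an index stream with a primitive-restart marker after every strip.

// sketch: expand runs back into an OpenGL index stream (illustrative only)
#include <cstdint>
#include <limits>
#include <vector>
struct Run {
  uint32_t vertex0;  // index of the first vertex in the strip
  uint16_t length;   // number of consecutive vertices
};
std::vector<uint32_t> expand_runs(const std::vector<Run>& runs) {
  std::vector<uint32_t> out;
  for (const auto& r : runs) {
    for (uint16_t i = 0; i < r.length; i++) {
      out.push_back(r.vertex0 + i);
    }
    out.push_back(std::numeric_limits<uint32_t>::max());  // primitive restart
  }
  return out;
}
// e.g. runs {{100, 3}, {200, 2}} expand to 100, 101, 102, RESTART, 200, 201, RESTART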
void InstancedStripDraw::serialize(Serializer& ser) {
ser.from_ptr(&mode);
ser.from_ptr(&tree_tex_id);
@ -37,11 +54,71 @@ void TfragTree::serialize(Serializer& ser) {
draw.serialize(ser);
}
ser.from_pod_vector(&vertices);
// ser.from_pod_vector(&vertices);
ser.from_pod_vector(&packed_vertices.vertices);
ser.from_pod_vector(&packed_vertices.cluster_origins);
ser.from_pod_vector(&colors);
bvh.serialize(ser);
}
void TieTree::unpack() {
unpacked.vertices.resize(packed_vertices.color_indices.size());
size_t i = 0;
for (const auto& grp : packed_vertices.matrix_groups) {
if (grp.matrix_idx == -1) {
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
auto& vtx = unpacked.vertices[i];
vtx.color_index = packed_vertices.color_indices[i];
const auto& proto_vtx = packed_vertices.vertices[src_idx];
vtx.x = proto_vtx.x;
vtx.y = proto_vtx.y;
vtx.z = proto_vtx.z;
vtx.q = 1.f;
vtx.s = proto_vtx.s;
vtx.t = proto_vtx.t;
i++;
}
} else {
const auto& mat = packed_vertices.matrices[grp.matrix_idx];
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
auto& vtx = unpacked.vertices[i];
vtx.color_index = packed_vertices.color_indices[i];
const auto& proto_vtx = packed_vertices.vertices[src_idx];
auto temp = mat[0] * proto_vtx.x + mat[1] * proto_vtx.y + mat[2] * proto_vtx.z + mat[3];
vtx.x = temp.x();
vtx.y = temp.y();
vtx.z = temp.z();
vtx.q = 1.f;
vtx.s = proto_vtx.s;
vtx.t = proto_vtx.t;
i++;
}
}
}
}
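A small sketch (with an assumed Vec4 type; the real code uses math::Vector4f and std::array<math::Vector4f, 4>) of the transform applied in the matrix branch above: the instance matrix is stored as four column vectors, so a prototype point is rebuilt as col0*x + col1*y + col2*z + col3, where the last column carries the translation.

#include <array>
struct Vec4 {
  float v[4];
  Vec4 operator*(float s) const { return {v[0] * s, v[1] * s, v[2] * s, v[3] * s}; }
  Vec4 operator+(const Vec4& o) const {
    return {v[0] + o.v[0], v[1] + o.v[1], v[2] + o.v[2], v[3] + o.v[3]};
  }
};
Vec4 transform_tie_point(const std::array<Vec4, 4>& mat, float x, float y, float z) {
  return mat[0] * x + mat[1] * y + mat[2] * z + mat[3];  // affine: translation in mat[3]
}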
void TfragTree::unpack() {
unpacked.vertices.resize(packed_vertices.vertices.size());
for (size_t i = 0; i < unpacked.vertices.size(); i++) {
auto& o = unpacked.vertices[i];
auto& in = packed_vertices.vertices[i];
auto& cluster = packed_vertices.cluster_origins.at(in.cluster_idx);
constexpr float kClusterSize = 4096 * 40; // 100 in-game meters
constexpr float kMasterOffset = 12000 * 4096;
constexpr float rescale = kClusterSize / UINT16_MAX;
float cx = -kMasterOffset + kClusterSize * cluster.x();
float cy = -kMasterOffset + kClusterSize * cluster.y();
float cz = -kMasterOffset + kClusterSize * cluster.z();
o.x = cx + in.xoff * rescale;
o.y = cy + in.yoff * rescale;
o.z = cz + in.zoff * rescale;
o.s = in.s / (1024.f);
o.t = in.t / (1024.f);
o.q = 1.f;
o.color_index = in.color_index;
}
}
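To make the packed position format above concrete, here is a standalone sketch (not part of the commit) of the quantization round trip: a float coordinate becomes a cluster cell plus a u16 offset inside that cell, and unpacking reverses it with a worst-case truncation error of roughly kClusterSize / 65535, about 2.5 game units.

// sketch of pack/unpack for one coordinate (values chosen for illustration)
#include <cstdint>
#include <cstdio>
#include <utility>
constexpr float kClusterSizeSketch = 4096.f * 40;
constexpr float kMasterOffsetSketch = 12000.f * 4096;
std::pair<uint16_t, uint16_t> pack_coord(float pos) {
  float shifted = pos + kMasterOffsetSketch;  // shift so coordinates are non-negative
  uint16_t cell = (uint16_t)(shifted / kClusterSizeSketch);
  float leftover = shifted - cell * kClusterSizeSketch;
  uint16_t off = (uint16_t)((leftover / kClusterSizeSketch) * 65535.f);
  return {cell, off};
}
float unpack_coord(uint16_t cell, uint16_t off) {
  return -kMasterOffsetSketch + cell * kClusterSizeSketch +
         off * (kClusterSizeSketch / 65535.f);
}
int main() {
  float x = 123456.f;
  auto p = pack_coord(x);
  printf("%.1f -> cell %d, off %d -> %.1f\n", x, (int)p.first, (int)p.second,
         unpack_coord(p.first, p.second));
}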
void TieTree::serialize(Serializer& ser) {
if (ser.is_saving()) {
ser.save<size_t>(static_draws.size());
@ -62,15 +139,15 @@ void TieTree::serialize(Serializer& ser) {
}
if (ser.is_saving()) {
ser.save<size_t>(instance_info.size());
ser.save<size_t>(wind_instance_info.size());
} else {
instance_info.resize(ser.load<size_t>());
wind_instance_info.resize(ser.load<size_t>());
}
for (auto& inst : instance_info) {
for (auto& inst : wind_instance_info) {
inst.serialize(ser);
}
ser.from_pod_vector(&vertices);
packed_vertices.serialize(ser);
ser.from_pod_vector(&colors);
bvh.serialize(ser);
}
@ -141,4 +218,58 @@ void Level::serialize(Serializer& ser) {
}
}
std::array<int, MemoryUsageCategory::NUM_CATEGORIES> Level::get_memory_usage() const {
std::array<int, MemoryUsageCategory::NUM_CATEGORIES> result;
result.fill(0);
// textures
for (const auto& tex : textures) {
result[TEXTURE] += tex.data.size() * sizeof(u32);
}
// tfrag
for (const auto& tfrag_tree_geoms : tfrag_trees) {
for (const auto& tfrag_tree : tfrag_tree_geoms) {
for (const auto& draw : tfrag_tree.draws) {
result[TFRAG_INDEX] += draw.runs.size() * sizeof(StripDraw::VertexRun);
result[TFRAG_VIS] += draw.vis_groups.size() * sizeof(StripDraw::VisGroup);
}
result[TFRAG_VERTS] +=
tfrag_tree.packed_vertices.vertices.size() * sizeof(PackedTfragVertices::Vertex);
result[TFRAG_CLUSTER] +=
tfrag_tree.packed_vertices.cluster_origins.size() * sizeof(math::Vector<u16, 3>);
result[TFRAG_TIME_OF_DAY] += tfrag_tree.colors.size() * sizeof(TimeOfDayColor);
result[TFRAG_BVH] += tfrag_tree.bvh.vis_nodes.size() * sizeof(VisNode);
}
}
// tie
for (const auto& tie_tree_geoms : tie_trees) {
for (const auto& tie_tree : tie_tree_geoms) {
result[TIE_BVH] += tie_tree.bvh.vis_nodes.size();
for (const auto& draw : tie_tree.static_draws) {
result[TIE_DEINST_INDEX] += draw.runs.size() * sizeof(StripDraw::VertexRun);
result[TIE_DEINST_VIS] += draw.vis_groups.size() * sizeof(StripDraw::VisGroup);
}
result[TIE_VERTS] +=
tie_tree.packed_vertices.vertices.size() * sizeof(PackedTieVertices::Vertex);
result[TIE_CIDX] += tie_tree.packed_vertices.color_indices.size() * sizeof(u16);
result[TIE_MATRICES] += tie_tree.packed_vertices.matrices.size() * 4 * 4 * 4;
result[TIE_GRPS] +=
tie_tree.packed_vertices.matrix_groups.size() * sizeof(PackedTieVertices::MatrixGroup);
result[TIE_TIME_OF_DAY] += tie_tree.colors.size() * sizeof(TimeOfDayColor);
for (const auto& draw : tie_tree.instanced_wind_draws) {
result[TIE_INST_INDEX] += draw.vertex_index_stream.size() * sizeof(u32);
result[TIE_INST_VIS] +=
draw.instance_groups.size() * sizeof(InstancedStripDraw::InstanceGroup);
}
result[TIE_WIND_INSTANCE_INFO] +=
tie_tree.wind_instance_info.size() * sizeof(TieWindInstance);
}
}
return result;
}
} // namespace tfrag3


@ -11,7 +11,39 @@
namespace tfrag3 {
constexpr int TFRAG3_VERSION = 10;
// NOTE:
// when updating any data structures in this file:
// - change the TFRAG3_VERSION
// - make sure to update the serialize function
// - if changing any large things (vertices, vis, bvh, colors, textures) update get_memory_usage
// - if adding a new category to the memory usage, update extract_level to print it.
enum MemoryUsageCategory {
TEXTURE,
TIE_DEINST_VIS,
TIE_DEINST_INDEX,
TIE_INST_VIS,
TIE_INST_INDEX,
TIE_BVH,
TIE_VERTS,
TIE_TIME_OF_DAY,
TIE_WIND_INSTANCE_INFO,
TIE_CIDX,
TIE_MATRICES,
TIE_GRPS,
TFRAG_VIS,
TFRAG_INDEX,
TFRAG_VERTS,
TFRAG_CLUSTER,
TFRAG_TIME_OF_DAY,
TFRAG_BVH,
NUM_CATEGORIES
};
constexpr int TFRAG3_VERSION = 11;
// These vertices should be uploaded to the GPU at load time and don't change
struct PreloadedVertex {
@ -25,6 +57,55 @@ struct PreloadedVertex {
};
static_assert(sizeof(PreloadedVertex) == 32, "PreloadedVertex size");
struct PackedTieVertices {
struct Vertex {
float x, y, z;
float s, t;
};
struct MatrixGroup {
s32 matrix_idx;
u32 start_vert;
u32 end_vert;
};
std::vector<u16> color_indices;
std::vector<std::array<math::Vector4f, 4>> matrices;
std::vector<MatrixGroup> matrix_groups; // todo pack
std::vector<Vertex> vertices;
float cluster_size = 0;
void serialize(Serializer& ser);
};
struct PackedTfragVertices {
struct Vertex {
u16 xoff, yoff, zoff;
u16 cluster_idx;
u16 s, t;
u16 color_index;
/*
bool operator==(const Vertex& other) const {
return xoff == other.xoff && yoff == other.yoff && zoff == other.zoff &&
cluster_idx == other.cluster_idx && s == other.s && t == other.t &&
color_index == other.color_index;
}
struct hash {
auto operator()(const Vertex& x) const {
return std::hash<uint16_t>()(x.xoff) ^ std::hash<uint16_t>()(x.yoff) ^
std::hash<uint16_t>()(x.zoff) ^ std::hash<uint16_t>()(x.cluster_idx) ^
std::hash<uint16_t>()(x.s) ^ std::hash<uint16_t>()(x.t) ^
std::hash<uint16_t>()(x.color_index);
}
};
*/
};
std::vector<Vertex> vertices;
std::vector<math::Vector<u16, 3>> cluster_origins;
};
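A quick back-of-the-envelope sketch (field layout assumed from the declarations above, not the real headers) of why this packing shrinks the .fr3 files: the on-disk tfrag vertex drops from the 32-byte PreloadedVertex to seven u16 fields, and full-precision positions are rebuilt at load time by unpack().

#include <cstdint>
struct PackedTfragVertexSketch {
  uint16_t xoff, yoff, zoff;  // quantized offset within the cluster
  uint16_t cluster_idx;       // which cluster origin to add back
  uint16_t s, t;              // texture coordinates, fixed point (x1024)
  uint16_t color_index;       // time-of-day palette index
};
static_assert(sizeof(PackedTfragVertexSketch) == 14, "packed tfrag vertex is 14 bytes");
// versus sizeof(PreloadedVertex) == 32, so tfrag vertex data is roughly 2.3x smaller on disk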
// Settings for drawing a group of triangle strips.
// This refers to a group of PreloadedVertices that are already uploaded.
// All triangles here are drawn in the same "mode" (blending, texture, etc)
@ -35,9 +116,20 @@ struct StripDraw {
DrawMode mode; // the OpenGL draw settings.
u32 tree_tex_id = 0; // the texture that should be bound for the draw
// the list of vertices in the draw. This includes the restart code of UINT32_MAX that OpenGL
// will use to start a new strip.
std::vector<u32> vertex_index_stream;
struct {
// the list of vertices in the draw. This includes the restart code of UINT32_MAX that OpenGL
// will use to start a new strip.
std::vector<u32> vertex_index_stream;
} unpacked;
void unpack();
struct VertexRun {
u32 vertex0;
u16 length;
};
std::vector<VertexRun> runs;
// to do culling, the above vertex stream is grouped.
// by following the visgroups and checking the visibility, you can leave out invisible vertices.
@ -129,11 +221,16 @@ constexpr const char* tfrag_tree_names[] = {"normal", "trans", "dirt",
// A tfrag model
struct TfragTree {
TFragmentTreeKind kind; // our tfrag kind
std::vector<StripDraw> draws; // the actual topology and settings
std::vector<PreloadedVertex> vertices; // mesh vertices
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
BVH bvh; // the bvh for frustum culling
TFragmentTreeKind kind; // our tfrag kind
std::vector<StripDraw> draws; // the actual topology and settings
PackedTfragVertices packed_vertices;
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
BVH bvh; // the bvh for frustum culling
struct {
std::vector<PreloadedVertex> vertices; // mesh vertices
} unpacked;
void unpack();
void serialize(Serializer& ser);
};
@ -147,14 +244,20 @@ struct TieWindInstance {
// A tie model
struct TieTree {
BVH bvh;
std::vector<StripDraw> static_draws; // the actual topology and settings
std::vector<PreloadedVertex> vertices; // mesh vertices
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
std::vector<StripDraw> static_draws; // the actual topology and settings
PackedTieVertices packed_vertices;
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
std::vector<InstancedStripDraw> instanced_wind_draws;
std::vector<TieWindInstance> instance_info;
std::vector<TieWindInstance> wind_instance_info;
struct {
std::vector<PreloadedVertex> vertices; // mesh vertices
} unpacked;
void serialize(Serializer& ser);
void unpack();
};
struct Level {
@ -165,6 +268,8 @@ struct Level {
std::array<std::vector<TieTree>, 4> tie_trees;
u16 version2 = TFRAG3_VERSION;
void serialize(Serializer& ser);
std::array<int, MemoryUsageCategory::NUM_CATEGORIES> get_memory_usage() const;
};
} // namespace tfrag3


@ -13,7 +13,12 @@
#include "common/util/BinaryReader.h"
#include "BinaryWriter.h"
#include "common/common_types.h"
// This disables the use of PCLMULQDQ which is probably ok, but let's just be safe and disable it
// because nobody will care if png compression is 10% slower.
#define FPNG_NO_SSE 1
#include "third-party/fpng/fpng.cpp"
#include "third-party/fpng/fpng.h"
#include "third-party/fmt/core.h"
#include "third-party/lzokay/lzokay.hpp"


@ -1,5 +1,7 @@
#include "os.h"
#include "common/common_types.h"
#ifdef __linux__
#include <sys/resource.h>
@ -14,4 +16,72 @@ size_t get_peak_rss() {
size_t get_peak_rss() {
return 0;
}
#endif
#ifdef _WIN32
// windows has a __cpuid
#include <intrin.h>
#else
// using int to be compatible with msvc's intrinsic
void __cpuidex(int result[4], int eax, int ecx) {
asm("cpuid\n\t"
: "=a"(result[0]), "=b"(result[1]), "=c"(result[2]), "=d"(result[3])
: "0"(eax), "2"(ecx));
}
#endif
CpuInfo gCpuInfo;
void setup_cpu_info() {
if (gCpuInfo.initialized) {
return;
}
// as a test, get the brand and model
for (u32 i = 0x80000002; i <= 0x80000004; i++) {
int result[4];
__cpuidex(result, i, 0);
for (auto reg : result) {
for (int c = 0; c < 4; c++) {
gCpuInfo.model.push_back(reg);
reg >>= 8;
}
}
}
{
int result[4];
__cpuidex(result, 0, 0);
for (auto r : {1, 3, 2}) {
for (int c = 0; c < 4; c++) {
gCpuInfo.brand.push_back(result[r]);
result[r] >>= 8;
}
}
}
// check for AVX2
{
int result[4];
__cpuidex(result, 7, 0);
gCpuInfo.has_avx2 = result[1] & (1 << 5);
}
{
int result[4];
__cpuidex(result, 1, 0);
gCpuInfo.has_avx = result[2] & (1 << 28);
}
printf("-------- CPU Information --------\n");
printf(" Brand: %s\n", gCpuInfo.brand.c_str());
printf(" Model: %s\n", gCpuInfo.model.c_str());
printf(" AVX : %s\n", gCpuInfo.has_avx ? "true" : "false");
printf(" AVX2 : %s\n", gCpuInfo.has_avx2 ? "true" : "false");
gCpuInfo.initialized = true;
}
CpuInfo& get_cpu_info() {
return gCpuInfo;
}


@ -1,6 +1,19 @@
#pragma once
#include <cstddef>
#include <string>
// Note: these are not implemented on windows and will return zero.
size_t get_peak_rss();
void setup_cpu_info();
struct CpuInfo {
bool initialized = false;
bool has_avx = false;
bool has_avx2 = false;
std::string brand;
std::string model;
};
CpuInfo& get_cpu_info();
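A hypothetical usage sketch of this API (the blend_* names are placeholders, not from the repo): main() calls setup_cpu_info() once at startup, and code that has both SIMD paths then dispatches on the detected capability. In the real renderer the AVX2 branch is additionally wrapped in #ifdef __AVX2__ / ASSERT(false), and main() clears has_avx2 on builds compiled without AVX2 support.

#include <cstdint>
#include "common/util/os.h"  // setup_cpu_info(), get_cpu_info()
// stand-ins for real SIMD kernels, just to make the dispatch shape concrete
static void blend_avx2(const uint8_t* in, uint8_t* out, uint32_t n) {
  for (uint32_t i = 0; i < n; i++) out[i] = in[i];
}
static void blend_sse(const uint8_t* in, uint8_t* out, uint32_t n) {
  for (uint32_t i = 0; i < n; i++) out[i] = in[i];
}
void blend(const uint8_t* in, uint8_t* out, uint32_t n) {
  if (get_cpu_info().has_avx2) {
    blend_avx2(in, out, n);  // guarded by #ifdef __AVX2__ in the real renderer
  } else {
    blend_sse(in, out, n);
  }
}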


@ -48,6 +48,45 @@ bool is_valid_bsp(const decompiler::LinkedObjectFile& file) {
return true;
}
void print_memory_usage(const tfrag3::Level& lev, int uncompressed_data_size) {
int total_accounted = 0;
auto memory_use_by_category = lev.get_memory_usage();
std::vector<std::pair<std::string, int>> known_categories = {
{"texture", memory_use_by_category[tfrag3::MemoryUsageCategory::TEXTURE]},
{"tie-deinst-vis", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_DEINST_VIS]},
{"tie-deinst-idx", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_DEINST_INDEX]},
{"tie-inst-vis", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_INST_VIS]},
{"tie-inst-idx", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_INST_INDEX]},
{"tie-bvh", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_BVH]},
{"tie-verts", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_VERTS]},
{"tie-colors", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_TIME_OF_DAY]},
{"tie-wind-inst-info",
memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_WIND_INSTANCE_INFO]},
{"tie-cidx", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_CIDX]},
{"tie-mats", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_MATRICES]},
{"tie-grps", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_GRPS]},
{"tfrag-vis", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_VIS]},
{"tfrag-idx", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_INDEX]},
{"tfrag-vert", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_VERTS]},
{"tfrag-colors", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_TIME_OF_DAY]},
{"tfrag-cluster", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_CLUSTER]},
{"tfrag-bvh", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_BVH]}};
for (auto& known : known_categories) {
total_accounted += known.second;
}
known_categories.push_back({"unknown", uncompressed_data_size - total_accounted});
std::sort(known_categories.begin(), known_categories.end(),
[](const auto& a, const auto& b) { return a.second > b.second; });
for (const auto& x : known_categories) {
fmt::print("{:30s} : {:6d} kB {:3.1f}%\n", x.first, x.second / 1024,
100.f * (float)x.second / uncompressed_data_size);
}
}
void extract_from_level(ObjectFileDB& db,
TextureDB& tex_db,
const std::string& dgo_name,
@ -85,7 +124,6 @@ void extract_from_level(ObjectFileDB& db,
for (auto& draw_tree : bsp_header.drawable_tree_array.trees) {
if (tfrag_trees.count(draw_tree->my_type())) {
auto as_tfrag_tree = dynamic_cast<level_tools::DrawableTreeTfrag*>(draw_tree.get());
fmt::print(" extracting tree {}\n", draw_tree->my_type());
ASSERT(as_tfrag_tree);
std::vector<std::pair<int, int>> expected_missing_textures;
auto it = hacks.missing_textures_by_level.find(level_name);
@ -96,13 +134,12 @@ void extract_from_level(ObjectFileDB& db,
bsp_header.texture_remap_table, tex_db, expected_missing_textures, tfrag_level,
dump_level);
} else if (draw_tree->my_type() == "drawable-tree-instance-tie") {
fmt::print(" extracting TIE\n");
auto as_tie_tree = dynamic_cast<level_tools::DrawableTreeInstanceTie*>(draw_tree.get());
ASSERT(as_tie_tree);
extract_tie(as_tie_tree, fmt::format("{}-{}-tie", dgo_name, i++),
bsp_header.texture_remap_table, tex_db, tfrag_level, dump_level);
} else {
fmt::print(" unsupported tree {}\n", draw_tree->my_type());
// fmt::print(" unsupported tree {}\n", draw_tree->my_type());
}
}
@ -110,6 +147,7 @@ void extract_from_level(ObjectFileDB& db,
tfrag_level.serialize(ser);
auto compressed =
compression::compress_zstd(ser.get_save_result().first, ser.get_save_result().second);
print_memory_usage(tfrag_level, ser.get_save_result().second);
fmt::print("compressed: {} -> {} ({:.2f}%)\n", ser.get_save_result().second, compressed.size(),
100.f * compressed.size() / ser.get_save_result().second);
file_util::write_binary_file(file_util::get_file_path({fmt::format(


@ -1975,13 +1975,14 @@ std::map<u32, std::vector<GroupedDraw>> make_draw_groups(std::vector<TFragDraw>&
}
}
fmt::print(" grouped to get {} draw calls\n", dc);
// fmt::print(" grouped to get {} draw calls\n", dc);
return result;
}
void make_tfrag3_data(std::map<u32, std::vector<GroupedDraw>>& draws,
tfrag3::TfragTree& tree_out,
std::vector<tfrag3::PreloadedVertex>& vertices,
std::vector<tfrag3::Texture>& texture_pool,
const TextureDB& tdb,
const std::vector<std::pair<int, int>>& expected_missing_textures) {
@ -2045,6 +2046,9 @@ void make_tfrag3_data(std::map<u32, std::vector<GroupedDraw>>& draws,
vgroup.num = strip.verts.size() + 1; // one for the primitive restart!
tdraw.num_triangles += strip.verts.size() - 2;
tfrag3::StripDraw::VertexRun run;
run.vertex0 = vertices.size();
run.length = strip.verts.size();
for (auto& vert : strip.verts) {
// convert vert.
tfrag3::PreloadedVertex vtx;
@ -2060,12 +2064,10 @@ void make_tfrag3_data(std::map<u32, std::vector<GroupedDraw>>& draws,
// ASSERT((vert.rgba >> 2) < 1024); spider cave has 2048?
ASSERT((vert.rgba & 3) == 0);
size_t vert_idx = tree_out.vertices.size();
tree_out.vertices.push_back(vtx);
tdraw.vertex_index_stream.push_back(vert_idx);
size_t vert_idx = vertices.size();
vertices.push_back(vtx);
}
tdraw.vertex_index_stream.push_back(UINT32_MAX); // prim restart
tdraw.runs.push_back(run);
tdraw.vis_groups.push_back(vgroup);
}
@ -2080,6 +2082,7 @@ void emulate_tfrags(int geom,
const std::vector<level_tools::TextureRemap>& map,
tfrag3::Level& level_out,
tfrag3::TfragTree& tree_out,
std::vector<tfrag3::PreloadedVertex>& vertices,
const TextureDB& tdb,
const std::vector<std::pair<int, int>>& expected_missing_textures,
bool dump_level) {
@ -2101,7 +2104,7 @@ void emulate_tfrags(int geom,
process_draw_mode(all_draws, map, tree_out.kind);
auto groups = make_draw_groups(all_draws);
make_tfrag3_data(groups, tree_out, level_out.textures, tdb, expected_missing_textures);
make_tfrag3_data(groups, tree_out, vertices, level_out.textures, tdb, expected_missing_textures);
if (dump_level) {
auto debug_out = debug_dump_to_obj(all_draws);
@ -2135,6 +2138,85 @@ void merge_groups(std::vector<tfrag3::StripDraw::VisGroup>& grps) {
} // namespace
constexpr float kClusterSize = 4096 * 40; // 100 in-game meters
constexpr float kMasterOffset = 12000 * 4096;
std::pair<u64, u16> position_to_cluster_and_offset(float in) {
in += kMasterOffset;
if (in < 0) {
fmt::print("negative: {}\n", in);
}
ASSERT(in >= 0);
int cluster_cell = (in / kClusterSize);
float leftover = in - (cluster_cell * kClusterSize);
u16 offset = (leftover / kClusterSize) * float(UINT16_MAX);
float recovered = ((float)cluster_cell + ((float)offset / UINT16_MAX)) * kClusterSize;
float diff = std::fabs(recovered - in);
ASSERT(diff < 7);
ASSERT(cluster_cell >= 0);
ASSERT(cluster_cell < UINT16_MAX);
return {cluster_cell, offset};
}
void pack_vertices(tfrag3::PackedTfragVertices* result,
const std::vector<tfrag3::PreloadedVertex>& vertices) {
u32 next_cluster_idx = 0;
std::map<u64, u32> clusters;
for (auto& vtx : vertices) {
auto x = position_to_cluster_and_offset(vtx.x);
auto y = position_to_cluster_and_offset(vtx.y);
auto z = position_to_cluster_and_offset(vtx.z);
u64 cluster_id = 0;
cluster_id |= x.first;
cluster_id |= (y.first << 16);
cluster_id |= (z.first << 32);
auto cluster_it = clusters.find(cluster_id);
u32 my_cluster_idx = 0;
if (cluster_it == clusters.end()) {
// first in cluster
clusters[cluster_id] = next_cluster_idx;
my_cluster_idx = next_cluster_idx;
next_cluster_idx++;
} else {
my_cluster_idx = cluster_it->second;
}
tfrag3::PackedTfragVertices::Vertex out_vtx;
out_vtx.xoff = x.second;
out_vtx.yoff = y.second;
out_vtx.zoff = z.second;
out_vtx.cluster_idx = my_cluster_idx;
// TODO check these
out_vtx.s = vtx.s * 1024;
out_vtx.t = vtx.t * 1024;
out_vtx.color_index = vtx.color_index;
result->vertices.push_back(out_vtx);
}
result->cluster_origins.resize(next_cluster_idx);
for (auto& cluster : clusters) {
auto& res = result->cluster_origins[cluster.second];
res.x() = (u16)cluster.first;
res.y() = (u16)(cluster.first >> 16);
res.z() = (u16)(cluster.first >> 32);
}
/*
std::unordered_set<tfrag3::PackedTfragVertices::Vertex, tfrag3::PackedTfragVertices::Vertex::hash>
a;
for (auto& v : result->vertices) {
a.insert(v);
}
fmt::print("SIZE: {} vs {} {}\n", a.size(), result->vertices.size(),
(float)a.size() / result->vertices.size());
*/
ASSERT(next_cluster_idx < UINT16_MAX);
}
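A small sketch (not from the commit) of the 64-bit map key used above to deduplicate clusters: the three u16 cell coordinates are packed into one u64 and later split back out into cluster_origins.

#include <cstdint>
uint64_t make_cluster_key(uint16_t cx, uint16_t cy, uint16_t cz) {
  return (uint64_t)cx | ((uint64_t)cy << 16) | ((uint64_t)cz << 32);
}
void split_cluster_key(uint64_t key, uint16_t* cx, uint16_t* cy, uint16_t* cz) {
  *cx = (uint16_t)key;          // bits 0-15
  *cy = (uint16_t)(key >> 16);  // bits 16-31
  *cz = (uint16_t)(key >> 32);  // bits 32-47
}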
void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
const std::string& debug_name,
const std::vector<level_tools::TextureRemap>& map,
@ -2142,7 +2224,7 @@ void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
const std::vector<std::pair<int, int>>& expected_missing_textures,
tfrag3::Level& out,
bool dump_level) {
// go through 4 lods(?)
// go through 3 lods(?)
for (int geom = 0; geom < GEOM_MAX; ++geom) {
tfrag3::TfragTree this_tree;
if (tree->my_type() == "drawable-tree-tfrag") {
@ -2176,7 +2258,8 @@ void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
}
bool ok = verify_node_indices(tree);
ASSERT(ok);
fmt::print(" tree has {} arrays and {} tfragments\n", tree->length, as_tfrag_array->length);
// fmt::print(" tree has {} arrays and {} tfragments\n", tree->length,
// as_tfrag_array->length);
auto vis_nodes = extract_vis_data(tree, as_tfrag_array->tfragments.front().id);
this_tree.bvh.first_leaf_node = vis_nodes.first_child_node;
@ -2198,8 +2281,10 @@ void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
}
// ASSERT(result.vis_nodes.last_child_node + 1 == idx);
emulate_tfrags(geom, as_tfrag_array->tfragments, debug_name, map, out, this_tree, tex_db,
expected_missing_textures, dump_level);
std::vector<tfrag3::PreloadedVertex> vertices;
emulate_tfrags(geom, as_tfrag_array->tfragments, debug_name, map, out, this_tree, vertices,
tex_db, expected_missing_textures, dump_level);
pack_vertices(&this_tree.packed_vertices, vertices);
extract_time_of_day(tree, this_tree);
for (auto& draw : this_tree.draws) {


@ -558,9 +558,9 @@ void update_proto_info(std::vector<TieProtoInfo>* out,
adgif.combo_tex = tex_combo;
// and the hidden value in the unused a+d
memcpy(&adgif.second_w, &gif_data.at(16 * (tex_idx * 5 + 1) + 12), 4);
// todo: figure out if this matters
// todo: figure out if this matters. maybe this is decal?
if (ra_tex0_val == 0x800000000) {
fmt::print("texture {} in {} has weird tex setting\n", tex->second.name, proto.name);
// fmt::print("texture {} in {} has weird tex setting\n", tex->second.name, proto.name);
}
// mipmap settings. we ignore, but get the hidden value
@ -2036,19 +2036,40 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
// bool using_wind = true; // hack, for testing
bool using_wind = proto.stiffness != 0.f;
// create the model first
std::vector<std::vector<std::pair<int, int>>> packed_vert_indices;
for (size_t frag_idx = 0; frag_idx < proto.frags.size(); frag_idx++) {
packed_vert_indices.emplace_back();
auto& frag_vert_indices = packed_vert_indices.back();
auto& frag = proto.frags[frag_idx]; // shared info for all instances of this frag
for (auto& strip : frag.strips) {
int start = tree.packed_vertices.vertices.size();
for (auto& vert : strip.verts) {
tree.packed_vertices.vertices.push_back(
{vert.pos.x(), vert.pos.y(), vert.pos.z(), vert.tex.x(), vert.tex.y()});
ASSERT(vert.tex.z() == 1.);
}
int end = tree.packed_vertices.vertices.size();
frag_vert_indices.emplace_back(start, end);
}
}
// loop over instances of the prototypes
for (auto& inst : proto.instances) {
// if we're using wind, we use the instanced renderer, which requires some extra info
// and we should remember which instance ID we are.
// Note: this is different from the game's instance index - we don't draw everything instanced
// so the non-instanced models don't get a C++ renderer instance ID
u32 wind_instance_idx = tree.instance_info.size();
u32 wind_instance_idx = tree.wind_instance_info.size();
u32 matrix_idx = tree.packed_vertices.matrices.size();
if (using_wind) {
tfrag3::TieWindInstance wind_instance_info;
wind_instance_info.wind_idx = inst.wind_index; // which wind value to apply in the table
wind_instance_info.stiffness = proto.stiffness; // wind stiffness (how much we move)
wind_instance_info.matrix = inst.mat; // instance transformation matrix.
tree.instance_info.push_back(wind_instance_info);
tree.wind_instance_info.push_back(wind_instance_info);
} else {
tree.packed_vertices.matrices.push_back(inst.mat);
}
// loop over fragments of the prototype
@ -2056,7 +2077,8 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
auto& frag = proto.frags[frag_idx]; // shared info for all instances of this frag
auto& ifrag = inst.frags.at(frag_idx); // color info for this instance of the frag
// loop over triangle strips within the fragment
for (auto& strip : frag.strips) {
for (size_t strip_idx = 0; strip_idx < frag.strips.size(); strip_idx++) {
auto& strip = frag.strips[strip_idx];
// what texture are we using?
u32 combo_tex = strip.adgif.combo_tex;
@ -2139,31 +2161,30 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
igroup.instance_idx = wind_instance_idx;
draw_to_add_to->num_triangles += strip.verts.size() - 2;
// note: this is a bit wasteful to duplicate the xyz/stq.
tfrag3::PackedTieVertices::MatrixGroup grp;
grp.matrix_idx = -1;
grp.start_vert = packed_vert_indices.at(frag_idx).at(strip_idx).first;
grp.end_vert = packed_vert_indices.at(frag_idx).at(strip_idx).second;
tree.packed_vertices.matrix_groups.push_back(grp);
for (auto& vert : strip.verts) {
tfrag3::PreloadedVertex vtx;
vtx.x = vert.pos.x();
vtx.y = vert.pos.y();
vtx.z = vert.pos.z();
vtx.s = vert.tex.x();
vtx.t = vert.tex.y();
vtx.q = vert.tex.z();
// if this is true, we can remove a divide in the shader
ASSERT(vtx.q == 1.f);
u16 color_index = 0;
if (vert.color_index_index == UINT32_MAX) {
vtx.color_index = 0;
color_index = 0;
} else {
vtx.color_index = ifrag.color_indices.at(vert.color_index_index);
color_index = ifrag.color_indices.at(vert.color_index_index);
ASSERT(vert.color_index_index < ifrag.color_indices.size());
vtx.color_index += ifrag.color_index_offset_in_big_palette;
color_index += ifrag.color_index_offset_in_big_palette;
}
size_t vert_idx = tree.vertices.size();
tree.vertices.push_back(vtx);
size_t vert_idx = tree.packed_vertices.color_indices.size();
tree.packed_vertices.color_indices.push_back(color_index);
draw_to_add_to->vertex_index_stream.push_back(vert_idx);
}
// the primitive restart index
draw_to_add_to->vertex_index_stream.push_back(UINT32_MAX);
draw_to_add_to->instance_groups.push_back(igroup);
} else {
// okay, we now have a texture and draw mode, let's see if we can add to an existing...
auto existing_draws_in_tex = static_draws_by_tex.find(idx_in_lev_data);
@ -2190,31 +2211,30 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
vgroup.vis_idx_in_pc_bvh = inst.vis_id; // associate with the instance for culling
vgroup.num = strip.verts.size() + 1; // one for the primitive restart!
draw_to_add_to->num_triangles += strip.verts.size() - 2;
tfrag3::PackedTieVertices::MatrixGroup grp;
grp.matrix_idx = matrix_idx;
grp.start_vert = packed_vert_indices.at(frag_idx).at(strip_idx).first;
grp.end_vert = packed_vert_indices.at(frag_idx).at(strip_idx).second;
tree.packed_vertices.matrix_groups.push_back(grp);
tfrag3::StripDraw::VertexRun run;
run.vertex0 = tree.packed_vertices.color_indices.size();
run.length = strip.verts.size();
for (auto& vert : strip.verts) {
tfrag3::PreloadedVertex vtx;
// todo fields
auto tf = transform_tie(inst.mat, vert.pos);
vtx.x = tf.x();
vtx.y = tf.y();
vtx.z = tf.z();
vtx.s = vert.tex.x();
vtx.t = vert.tex.y();
vtx.q = vert.tex.z();
// if this is true, we can remove a divide in the shader
ASSERT(vtx.q == 1.f);
u16 color_index = 0;
if (vert.color_index_index == UINT32_MAX) {
vtx.color_index = 0;
color_index = 0;
} else {
vtx.color_index = ifrag.color_indices.at(vert.color_index_index);
color_index = ifrag.color_indices.at(vert.color_index_index);
ASSERT(vert.color_index_index < ifrag.color_indices.size());
vtx.color_index += ifrag.color_index_offset_in_big_palette;
color_index += ifrag.color_index_offset_in_big_palette;
}
size_t vert_idx = tree.vertices.size();
tree.vertices.push_back(vtx);
draw_to_add_to->vertex_index_stream.push_back(vert_idx);
size_t vert_idx = tree.packed_vertices.color_indices.size();
tree.packed_vertices.color_indices.push_back(color_index);
// draw_to_add_to->vertex_index_stream.push_back(vert_idx);
}
draw_to_add_to->vertex_index_stream.push_back(UINT32_MAX);
draw_to_add_to->runs.push_back(run);
// draw_to_add_to->vertex_index_stream.push_back(UINT32_MAX);
draw_to_add_to->vis_groups.push_back(vgroup);
}
}
@ -2285,8 +2305,6 @@ void extract_tie(const level_tools::DrawableTreeInstanceTie* tree,
}
bool ok = verify_node_indices(tree);
ASSERT(ok);
fmt::print(" tree has {} arrays and {} instances\n", tree->length,
as_instance_array->length);
// extract the vis tree. Note that this extracts the tree only down to the last draw node, a
// parent of between 1 and 8 instances.
@ -2362,7 +2380,6 @@ void extract_tie(const level_tools::DrawableTreeInstanceTie* tree,
}
this_tree.colors = full_palette.colors;
fmt::print("TIE tree {} has {} draws\n", geo, this_tree.static_draws.size());
out.tie_trees[geo].push_back(std::move(this_tree));
}
}


@ -601,7 +601,11 @@ void DirectRenderer::render_gif(const u8* data,
}
if (size != UINT32_MAX) {
ASSERT((offset + 15) / 16 == size / 16);
if ((offset + 15) / 16 != size / 16) {
fmt::print("DirectRenderer size failed in {}\n", name_and_id());
fmt::print("expected: {}, got: {}\n", size, offset);
ASSERT(false);
}
}
// fmt::print("{}\n", GifTag(data).print());


@ -56,8 +56,27 @@ void Loader::loader_thread() {
Serializer ser(decomp_data.data(), decomp_data.size());
result->serialize(ser);
double import_time = import_timer.getSeconds();
fmt::print("------------> Load from file: {:.3f}s, import {:.3f}s, decomp {:.3f}s\n",
disk_load_time, import_time, decomp_time);
Timer unpack_timer;
for (auto& tie_tree : result->tie_trees) {
for (auto& tree : tie_tree) {
tree.unpack();
for (auto& d : tree.static_draws) {
d.unpack();
}
}
}
for (auto& t_tree : result->tfrag_trees) {
for (auto& tree : t_tree) {
tree.unpack();
for (auto& d : tree.draws) {
d.unpack();
}
}
}
fmt::print(
"------------> Load from file: {:.3f}s, import {:.3f}s, decomp {:.3f}s unpack {:.3f}s\n",
disk_load_time, import_time, decomp_time, unpack_timer.getSeconds());
lk.lock();
m_initializing_tfrag3_levels[lev].data.level = std::move(result);


@ -353,14 +353,14 @@ struct alignas(16) Accumulator {
auto b = _mm_set1_ps(_b);
auto a = _mm_load_ps(_a.data);
auto acc = _mm_load_ps(data);
_mm_store_ps(data, _mm_fmadd_ps(a, b, acc));
_mm_store_ps(data, _mm_add_ps(_mm_mul_ps(a, b), acc));
}
REALLY_INLINE void madda_xyzw(const Vf& _a, const Vf& _b) {
auto b = _mm_load_ps(_b.data);
auto a = _mm_load_ps(_a.data);
auto acc = _mm_load_ps(data);
_mm_store_ps(data, _mm_fmadd_ps(a, b, acc));
_mm_store_ps(data, _mm_add_ps(_mm_mul_ps(a, b), acc));
}
void madd(Mask mask, Vf& dest, const Vf& a, const Vf& b) {
@ -375,14 +375,14 @@ struct alignas(16) Accumulator {
auto b = _mm_set1_ps(_b);
auto a = _mm_load_ps(_a.data);
auto acc = _mm_load_ps(data);
_mm_store_ps(dest.data, _mm_fmadd_ps(a, b, acc));
_mm_store_ps(dest.data, _mm_add_ps(_mm_mul_ps(a, b), acc));
}
REALLY_INLINE void madd_xyz(Vf& dest, const Vf& _a, float _b) {
auto b = _mm_set1_ps(_b);
auto a = _mm_load_ps(_a.data);
auto acc = _mm_load_ps(data);
auto prod = _mm_fmadd_ps(a, b, acc);
auto prod = _mm_add_ps(_mm_mul_ps(a, b), acc);
prod = _mm_blend_ps(prod, _mm_load_ps(dest.data), 0b1000);
_mm_store_ps(dest.data, prod);
}


@ -1,5 +1,6 @@
#include "SkyBlendCPU.h"
#include "game/graphics/opengl_renderer/AdgifHandler.h"
#include "common/util/os.h"
#include <immintrin.h>
@ -18,33 +19,72 @@ SkyBlendCPU::~SkyBlendCPU() {
}
void blend_sky_initial_fast(u8 intensity, u8* out, const u8* in, u32 size) {
__m256i intensity_vec = _mm256_set1_epi16(intensity);
for (u32 i = 0; i < size / 16; i++) {
__m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16)));
__m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8);
tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm256_srli_epi16(tex_data16, 7);
auto hi = _mm256_extracti128_si256(tex_data16, 1);
auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi);
_mm_storeu_si128((__m128i*)(out + (i * 16)), result);
if (get_cpu_info().has_avx2) {
#ifdef __AVX2__
__m256i intensity_vec = _mm256_set1_epi16(intensity);
for (u32 i = 0; i < size / 16; i++) {
__m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16)));
__m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8);
tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm256_srli_epi16(tex_data16, 7);
auto hi = _mm256_extracti128_si256(tex_data16, 1);
auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi);
_mm_storeu_si128((__m128i*)(out + (i * 16)), result);
}
#else
ASSERT(false);
#endif
} else {
__m128i intensity_vec = _mm_set1_epi16(intensity);
for (u32 i = 0; i < size / 8; i++) {
__m128i tex_data8 = _mm_loadu_si64((const __m128i*)(in + (i * 8)));
__m128i tex_data16 = _mm_cvtepu8_epi16(tex_data8);
tex_data16 = _mm_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm_srli_epi16(tex_data16, 7);
auto result = _mm_packus_epi16(tex_data16, tex_data16);
_mm_storeu_si64((__m128i*)(out + (i * 8)), result);
}
}
}
void blend_sky_fast(u8 intensity, u8* out, const u8* in, u32 size) {
__m256i intensity_vec = _mm256_set1_epi16(intensity);
__m256i max_intensity = _mm256_set1_epi16(255);
for (u32 i = 0; i < size / 16; i++) {
__m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16)));
__m128i out_val = _mm_loadu_si128((const __m128i*)(out + (i * 16)));
__m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8);
tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm256_srli_epi16(tex_data16, 7);
tex_data16 = _mm256_min_epi16(max_intensity, tex_data16);
auto hi = _mm256_extracti128_si256(tex_data16, 1);
auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi);
out_val = _mm_adds_epu8(out_val, result);
_mm_storeu_si128((__m128i*)(out + (i * 16)), out_val);
if (get_cpu_info().has_avx2) {
#ifdef __AVX2__
__m256i intensity_vec = _mm256_set1_epi16(intensity);
__m256i max_intensity = _mm256_set1_epi16(255);
for (u32 i = 0; i < size / 16; i++) {
__m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16)));
__m128i out_val = _mm_loadu_si128((const __m128i*)(out + (i * 16)));
__m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8);
tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm256_srli_epi16(tex_data16, 7);
tex_data16 = _mm256_min_epi16(max_intensity, tex_data16);
auto hi = _mm256_extracti128_si256(tex_data16, 1);
auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi);
out_val = _mm_adds_epu8(out_val, result);
_mm_storeu_si128((__m128i*)(out + (i * 16)), out_val);
}
#else
ASSERT(false);
#endif
} else {
__m128i intensity_vec = _mm_set1_epi16(intensity);
__m128i max_intensity = _mm_set1_epi16(255);
for (u32 i = 0; i < size / 8; i++) {
__m128i tex_data8 = _mm_loadu_si64((const __m128i*)(in + (i * 8)));
__m128i out_val = _mm_loadu_si64((const __m128i*)(out + (i * 8)));
__m128i tex_data16 = _mm_cvtepu8_epi16(tex_data8);
tex_data16 = _mm_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm_srli_epi16(tex_data16, 7);
tex_data16 = _mm_min_epi16(max_intensity, tex_data16);
auto result = _mm_packus_epi16(tex_data16, tex_data16);
out_val = _mm_adds_epu8(out_val, result);
_mm_storeu_si64((__m128i*)(out + (i * 8)), out_val);
}
}
/*
*/
}
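For reference, a scalar sketch (not part of the commit) of what both SIMD paths in these two functions compute, which is handy for checking the SSE fallback against the AVX2 version:

#include <algorithm>
#include <cstdint>
// blend_sky_initial_fast: out = saturate_u8((in * intensity) >> 7)
void blend_sky_initial_ref(uint8_t intensity, uint8_t* out, const uint8_t* in, uint32_t size) {
  for (uint32_t i = 0; i < size; i++) {
    out[i] = (uint8_t)std::min(255, (in[i] * intensity) >> 7);
  }
}
// blend_sky_fast: out = saturate_u8(out + min(255, (in * intensity) >> 7))
void blend_sky_ref(uint8_t intensity, uint8_t* out, const uint8_t* in, uint32_t size) {
  for (uint32_t i = 0; i < size; i++) {
    int scaled = std::min(255, (in[i] * intensity) >> 7);
    out[i] = (uint8_t)std::min(255, out[i] + scaled);  // saturating add
  }
}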
SkyBlendStats SkyBlendCPU::do_sky_blends(DmaFollower& dma,


@ -70,10 +70,10 @@ bool Tfrag3::update_load(const std::vector<tfrag3::TFragmentTreeKind>& tree_kind
if (std::find(tree_kinds.begin(), tree_kinds.end(), tree.kind) != tree_kinds.end()) {
max_draw = std::max(tree.draws.size(), max_draw);
for (auto& draw : tree.draws) {
idx_buffer_len += draw.vertex_index_stream.size();
idx_buffer_len += draw.unpacked.vertex_index_stream.size();
}
time_of_day_count = std::max(tree.colors.size(), time_of_day_count);
u32 verts = tree.vertices.size();
u32 verts = tree.packed_vertices.vertices.size();
glGenVertexArrays(1, &tree_cache.vao);
glBindVertexArray(tree_cache.vao);
glGenBuffers(1, &tree_cache.vertex_buffer);
@ -148,7 +148,7 @@ bool Tfrag3::update_load(const std::vector<tfrag3::TFragmentTreeKind>& tree_kind
const auto& tree = lev_data->tfrag_trees[geom][tree_idx];
if (std::find(tree_kinds.begin(), tree_kinds.end(), tree.kind) != tree_kinds.end()) {
u32 verts = tree.vertices.size();
u32 verts = tree.unpacked.vertices.size();
u32 start_vert = (m_load_state.vert) * MAX_VERTS;
u32 end_vert = std::min(verts, (m_load_state.vert + 1) * MAX_VERTS);
if (end_vert > start_vert) {
@ -156,7 +156,7 @@ bool Tfrag3::update_load(const std::vector<tfrag3::TFragmentTreeKind>& tree_kind
glBindBuffer(GL_ARRAY_BUFFER, m_cached_trees[geom][tree_idx].vertex_buffer);
glBufferSubData(GL_ARRAY_BUFFER, start_vert * sizeof(tfrag3::PreloadedVertex),
(end_vert - start_vert) * sizeof(tfrag3::PreloadedVertex),
tree.vertices.data() + start_vert);
tree.unpacked.vertices.data() + start_vert);
if (end_vert < verts) {
remaining = true;
}
@ -274,7 +274,7 @@ void Tfrag3::render_tree(int geom,
void* offset = (void*)(indices.first * sizeof(u32));
prof.add_draw_call();
prof.add_tri(draw.num_triangles * (float)draw_size / draw.vertex_index_stream.size());
prof.add_tri(draw.num_triangles * (float)draw_size / draw.unpacked.vertex_index_stream.size());
glDrawElements(GL_TRIANGLE_STRIP, draw_size, GL_UNSIGNED_INT, (void*)offset);


@ -40,18 +40,18 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
const auto& tree = lev_data->tie_trees[geo][tree_idx];
max_draw = std::max(tree.static_draws.size(), max_draw);
for (auto& draw : tree.static_draws) {
idx_buffer_len += draw.vertex_index_stream.size();
max_idx_per_draw = std::max(max_idx_per_draw, draw.vertex_index_stream.size());
idx_buffer_len += draw.unpacked.vertex_index_stream.size();
max_idx_per_draw = std::max(max_idx_per_draw, draw.unpacked.vertex_index_stream.size());
}
for (auto& draw : tree.instanced_wind_draws) {
wind_idx_buffer_len += draw.vertex_index_stream.size();
max_idx_per_draw = std::max(max_idx_per_draw, draw.vertex_index_stream.size());
}
for (auto& inst : tree.instance_info) {
for (auto& inst : tree.wind_instance_info) {
max_wind_idx = std::max(max_wind_idx, inst.wind_idx);
}
time_of_day_count = std::max(tree.colors.size(), time_of_day_count);
u32 verts = tree.vertices.size();
u32 verts = tree.packed_vertices.color_indices.size();
fmt::print(" tree {} has {} verts ({} kB) and {} draws\n", tree_idx, verts,
verts * sizeof(tfrag3::PreloadedVertex) / 1024.f, tree.static_draws.size());
auto& lod_tree = m_trees.at(geo);
@ -62,7 +62,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
lod_tree[tree_idx].draws = &tree.static_draws; // todo - should we just copy this?
lod_tree[tree_idx].colors = &tree.colors;
lod_tree[tree_idx].vis = &tree.bvh;
lod_tree[tree_idx].instance_info = &tree.instance_info;
lod_tree[tree_idx].instance_info = &tree.wind_instance_info;
lod_tree[tree_idx].wind_draws = &tree.instanced_wind_draws;
vis_temp_len = std::max(vis_temp_len, tree.bvh.vis_nodes.size());
lod_tree[tree_idx].tod_cache = swizzle_time_of_day(tree.colors);
@ -107,7 +107,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
lod_tree[tree_idx].index_list.resize(idx_buffer_len);
if (wind_idx_buffer_len > 0) {
lod_tree[tree_idx].wind_matrix_cache.resize(tree.instance_info.size());
lod_tree[tree_idx].wind_matrix_cache.resize(tree.wind_instance_info.size());
lod_tree[tree_idx].has_wind = true;
glGenBuffers(1, &lod_tree[tree_idx].wind_vertex_index_buffer);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, lod_tree[tree_idx].wind_vertex_index_buffer);
@ -158,7 +158,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
for (int geo = 0; geo < 4; ++geo) {
for (size_t tree_idx = 0; tree_idx < lev_data->tie_trees[geo].size(); tree_idx++) {
const auto& tree = lev_data->tie_trees[geo][tree_idx];
u32 verts = tree.vertices.size();
u32 verts = tree.unpacked.vertices.size();
u32 start_vert = (m_load_state.vert) * MAX_VERTS;
u32 end_vert = std::min(verts, (m_load_state.vert + 1) * MAX_VERTS);
if (end_vert > start_vert) {
@ -166,7 +166,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
glBindBuffer(GL_ARRAY_BUFFER, m_trees[geo][tree_idx].vertex_buffer);
glBufferSubData(GL_ARRAY_BUFFER, start_vert * sizeof(tfrag3::PreloadedVertex),
(end_vert - start_vert) * sizeof(tfrag3::PreloadedVertex),
tree.vertices.data() + start_vert);
tree.unpacked.vertices.data() + start_vert);
if (end_vert < verts) {
remaining = true;
}
@ -440,7 +440,6 @@ void Tie3::render(DmaFollower& dma, SharedRenderState* render_state, ScopedProfi
m_has_level = setup_for_level(m_pc_port_data.level_name, render_state);
}
render_all_trees(lod(), settings, render_state, prof);
// todo render all...
}
void Tie3::render_all_trees(int geom,
@ -659,9 +658,9 @@ void Tie3::render_tree(int idx,
void* offset = (void*)(indices.first * sizeof(u32));
prof.add_draw_call();
prof.add_tri(draw.num_triangles * (float)draw_size / draw.vertex_index_stream.size());
prof.add_tri(draw.num_triangles * (float)draw_size / draw.unpacked.vertex_index_stream.size());
bool is_full = draw_size == (int)draw.vertex_index_stream.size();
bool is_full = draw_size == (int)draw.unpacked.vertex_index_stream.size();
tree.perf.draws++;
if (is_full) {


@ -48,8 +48,6 @@ class Tie3 : public BucketRenderer {
SharedRenderState* render_state,
ScopedProfilerNode& prof);
int m_geom = 0;
struct Tree {
GLuint vertex_buffer;
GLuint index_buffer;


@ -3,6 +3,7 @@
#include "tfrag_common.h"
#include "game/graphics/opengl_renderer/BucketRenderer.h"
#include "game/graphics/pipelines/opengl.h"
#include "common/util/os.h"
#include <immintrin.h>
@ -212,11 +213,12 @@ SwizzledTimeOfDay swizzle_time_of_day(const std::vector<tfrag3::TimeOfDayColor>&
// Due to using integers instead of floats, it may be a tiny bit different.
// TODO: it might be possible to reorder the loop into two blocks of loads and avoid spilling xmms.
// It's ~8x faster than the slow version.
void interp_time_of_day_fast(const float weights[8],
const SwizzledTimeOfDay& in,
math::Vector<u8, 4>* out) {
// even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
void interp_time_of_day_fast_avx2(const float weights[8],
const SwizzledTimeOfDay& in,
math::Vector<u8, 4>* out) {
// even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
#ifdef __AVX2__
// weight multipliers
__m256i weights0 = _mm256_set1_epi16(weights[0] * 64.f);
__m256i weights1 = _mm256_set1_epi16(weights[1] * 64.f);
@ -234,7 +236,7 @@ void interp_time_of_day_fast(const float weights[8],
255, 255, 255);
for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
// first, load colors. We put 16 bytes / register and don't touch the upper half because we will
// first, load colors. We put 16 bytes / register and don't touch the upper half because we
// convert u8s to u16s.
const u8* base = in.data.data() + color_quad * 128;
__m128i color0_p = _mm_loadu_si128((const __m128i*)(base + 0));
@ -290,6 +292,149 @@ void interp_time_of_day_fast(const float weights[8],
// store result
_mm_storeu_si128((__m128i*)(&out[color_quad * 4]), result);
}
#else
// unreachable.
ASSERT(false);
#endif
}
void interp_time_of_day_fast(const float weights[8],
const SwizzledTimeOfDay& in,
math::Vector<u8, 4>* out) {
// even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
if (get_cpu_info().has_avx2) {
interp_time_of_day_fast_avx2(weights, in, out);
return;
}
// weight multipliers
__m128i weights0 = _mm_set1_epi16(weights[0] * 64.f);
__m128i weights1 = _mm_set1_epi16(weights[1] * 64.f);
__m128i weights2 = _mm_set1_epi16(weights[2] * 64.f);
__m128i weights3 = _mm_set1_epi16(weights[3] * 64.f);
__m128i weights4 = _mm_set1_epi16(weights[4] * 64.f);
__m128i weights5 = _mm_set1_epi16(weights[5] * 64.f);
__m128i weights6 = _mm_set1_epi16(weights[6] * 64.f);
__m128i weights7 = _mm_set1_epi16(weights[7] * 64.f);
// saturation: note that alpha is saturated to 128 but the rest are 255.
// TODO: maybe we should saturate to 255 for everybody (can do this using a single packus) and
// change the shader to deal with this.
__m128i sat = _mm_set_epi16(128, 255, 255, 255, 128, 255, 255, 255);
for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
// first, load colors. We put 16 bytes / register and don't touch the upper half because we
// convert u8s to u16s.
{
const u8* base = in.data.data() + color_quad * 128;
__m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
__m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
__m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
__m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
__m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
__m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
__m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
__m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));
// unpack to 16-bits. each has 16x 16 bit colors.
__m128i color0 = _mm_cvtepu8_epi16(color0_p);
__m128i color1 = _mm_cvtepu8_epi16(color1_p);
__m128i color2 = _mm_cvtepu8_epi16(color2_p);
__m128i color3 = _mm_cvtepu8_epi16(color3_p);
__m128i color4 = _mm_cvtepu8_epi16(color4_p);
__m128i color5 = _mm_cvtepu8_epi16(color5_p);
__m128i color6 = _mm_cvtepu8_epi16(color6_p);
__m128i color7 = _mm_cvtepu8_epi16(color7_p);
// multiply by weights
color0 = _mm_mullo_epi16(color0, weights0);
color1 = _mm_mullo_epi16(color1, weights1);
color2 = _mm_mullo_epi16(color2, weights2);
color3 = _mm_mullo_epi16(color3, weights3);
color4 = _mm_mullo_epi16(color4, weights4);
color5 = _mm_mullo_epi16(color5, weights5);
color6 = _mm_mullo_epi16(color6, weights6);
color7 = _mm_mullo_epi16(color7, weights7);
// add. This order minimizes dependencies.
color0 = _mm_add_epi16(color0, color1);
color2 = _mm_add_epi16(color2, color3);
color4 = _mm_add_epi16(color4, color5);
color6 = _mm_add_epi16(color6, color7);
color0 = _mm_add_epi16(color0, color2);
color4 = _mm_add_epi16(color4, color6);
color0 = _mm_add_epi16(color0, color4);
// divide, because we multiplied our weights by 2^6.
color0 = _mm_srli_epi16(color0, 6);
// saturate
color0 = _mm_min_epu16(sat, color0);
// back to u8s.
auto result = _mm_packus_epi16(color0, color0);
// store result
_mm_storeu_si64((__m128i*)(&out[color_quad * 4]), result);
}
{
const u8* base = in.data.data() + color_quad * 128 + 8;
__m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
__m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
__m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
__m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
__m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
__m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
__m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
__m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));
// unpack to 16-bits. each has 16x 16 bit colors.
__m128i color0 = _mm_cvtepu8_epi16(color0_p);
__m128i color1 = _mm_cvtepu8_epi16(color1_p);
__m128i color2 = _mm_cvtepu8_epi16(color2_p);
__m128i color3 = _mm_cvtepu8_epi16(color3_p);
__m128i color4 = _mm_cvtepu8_epi16(color4_p);
__m128i color5 = _mm_cvtepu8_epi16(color5_p);
__m128i color6 = _mm_cvtepu8_epi16(color6_p);
__m128i color7 = _mm_cvtepu8_epi16(color7_p);
// multiply by weights
color0 = _mm_mullo_epi16(color0, weights0);
color1 = _mm_mullo_epi16(color1, weights1);
color2 = _mm_mullo_epi16(color2, weights2);
color3 = _mm_mullo_epi16(color3, weights3);
color4 = _mm_mullo_epi16(color4, weights4);
color5 = _mm_mullo_epi16(color5, weights5);
color6 = _mm_mullo_epi16(color6, weights6);
color7 = _mm_mullo_epi16(color7, weights7);
// add. This order minimizes dependencies.
color0 = _mm_add_epi16(color0, color1);
color2 = _mm_add_epi16(color2, color3);
color4 = _mm_add_epi16(color4, color5);
color6 = _mm_add_epi16(color6, color7);
color0 = _mm_add_epi16(color0, color2);
color4 = _mm_add_epi16(color4, color6);
color0 = _mm_add_epi16(color0, color4);
// divide, because we multiplied our weights by 2^6.
color0 = _mm_srli_epi16(color0, 6);
// saturate
color0 = _mm_min_epu16(sat, color0);
// back to u8s.
auto result = _mm_packus_epi16(color0, color0);
// store result
_mm_storeu_si64((__m128i*)(&out[color_quad * 4 + 2]), result);
}
}
}
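A scalar reference sketch of the blend both SIMD paths implement (not part of the commit; the swizzled layout is inferred from the loads above: each group of 4 colors occupies 128 bytes, arranged as 8 palettes x 4 colors x 4 channels):

#include <algorithm>
#include <cstdint>
void interp_time_of_day_ref(const float weights[8],
                            uint32_t color_count,     // assumed multiple of 4
                            const uint8_t* swizzled,  // color_count / 4 groups of 128 bytes
                            uint8_t* out) {           // color_count * 4 bytes, RGBA
  int w[8];
  for (int k = 0; k < 8; k++) {
    w[k] = (int)(weights[k] * 64.f);  // scaled by 64, like the SIMD weights
  }
  for (uint32_t quad = 0; quad < color_count / 4; quad++) {
    const uint8_t* base = swizzled + quad * 128;
    for (int c = 0; c < 4; c++) {      // 4 colors in this group
      for (int ch = 0; ch < 4; ch++) { // r, g, b, a
        int sum = 0;
        for (int k = 0; k < 8; k++) {  // 8 time-of-day palettes
          sum += base[k * 16 + c * 4 + ch] * w[k];
        }
        int sat = (ch == 3) ? 128 : 255;  // alpha saturates at 128, rgb at 255
        out[(quad * 4 + c) * 4 + ch] = (uint8_t)std::min(sum >> 6, sat);
      }
    }
  }
}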
bool sphere_in_view_ref(const math::Vector4f& sphere, const math::Vector4f* planes) {
@ -327,9 +472,9 @@ u32 make_all_visible_index_list(std::pair<int, int>* group_out,
const auto& draw = draws[i];
std::pair<int, int> ds;
ds.first = idx_buffer_ptr;
memcpy(&idx_out[idx_buffer_ptr], draw.vertex_index_stream.data(),
draw.vertex_index_stream.size() * sizeof(u32));
idx_buffer_ptr += draw.vertex_index_stream.size();
memcpy(&idx_out[idx_buffer_ptr], draw.unpacked.vertex_index_stream.data(),
draw.unpacked.vertex_index_stream.size() * sizeof(u32));
idx_buffer_ptr += draw.unpacked.vertex_index_stream.size();
ds.second = idx_buffer_ptr;
group_out[i] = ds;
}
@ -357,7 +502,7 @@ u32 make_index_list_from_vis_string(std::pair<int, int>* group_out,
} else {
building_run = false;
idx_buffer_ptr += grp.num;
memcpy(&idx_out[run_start_out], &draw.vertex_index_stream[run_start_in],
memcpy(&idx_out[run_start_out], &draw.unpacked.vertex_index_stream[run_start_in],
(idx_buffer_ptr - run_start_out) * sizeof(u32));
}
} else {
@ -372,7 +517,7 @@ u32 make_index_list_from_vis_string(std::pair<int, int>* group_out,
vtx_idx += grp.num;
}
if (building_run) {
memcpy(&idx_out[run_start_out], &draw.vertex_index_stream[run_start_in],
memcpy(&idx_out[run_start_out], &draw.unpacked.vertex_index_stream[run_start_in],
(idx_buffer_ptr - run_start_out) * sizeof(u32));
}


@ -159,7 +159,7 @@ std::vector<std::shared_ptr<TextureRecord>> TexturePool::convert_textures(const
// the sizes given aren't the actual sizes in memory, so if you just use that, you get the
// wrong answer. I solved this in the decompiler by using the size of the actual data, but we
// don't really have that here.
u32 size = ((sizes[0] + sizes[1] + sizes[2] + 2047) / 256) * 256;
u32 size = ((sizes[0] + sizes[1] + sizes[2] + 4096) / 256) * 256;
m_tex_converter.upload(memory_base + texture_page.segment[0].block_data_ptr,
texture_page.segment[0].dest, size);


@ -9,6 +9,7 @@
#include "common/log/log.h"
#include "common/util/FileUtil.h"
#include "game/discord.h"
#include "common/util/os.h"
// Discord RPC
extern int64_t gStartTime;
@ -28,17 +29,49 @@ void setup_logging(bool verbose) {
}
int main(int argc, char** argv) {
// do this as soon as possible - stuff like memcpy might use AVX instructions and we want to
// warn the user instead of just crashing.
setup_cpu_info();
if (!get_cpu_info().has_avx) {
printf("Your CPU does not support AVX, which is required for OpenGOAL.\n");
return -1;
}
bool verbose = false;
bool disable_avx2 = false;
for (int i = 1; i < argc; i++) {
if (std::string("-v") == argv[i]) {
verbose = true;
break;
}
if (std::string("-no-avx2") == argv[i]) {
disable_avx2 = true;
}
}
gStartTime = time(0);
init_discord_rpc();
if (disable_avx2) {
// for debugging the non-avx2 code paths, there's a flag to manually disable.
printf("Note: AVX2 code has been manually disabled.\n");
get_cpu_info().has_avx2 = false;
}
#ifndef __AVX2__
if (get_cpu_info().has_avx2) {
printf("Note: your CPU supports AVX2, but this build was not compiled with AVX2 support\n");
get_cpu_info().has_avx2 = false;
}
#endif
if (get_cpu_info().has_avx2) {
printf("AVX2 mode enabled\n");
} else {
printf("AVX2 mode disabled\n");
}
setup_logging(verbose);
while (true) {


@ -9,12 +9,17 @@
#include "gtest/gtest.h"
#include "test/all_jak1_symbols.h"
#include "common/util/json_util.h"
#include "common/util/os.h"
#include "common/util/Range.h"
#include "third-party/fmt/core.h"
#include "common/util/print_float.h"
#include "common/util/CopyOnWrite.h"
#include "common/util/SmallVector.h"
TEST(CommonUtil, CpuInfo) {
setup_cpu_info();
}
TEST(CommonUtil, get_file_path) {
std::vector<std::string> test = {"cabbage", "banana", "apple"};
std::string sampleString = file_util::get_file_path(test);
@ -390,5 +395,6 @@ TEST(SmallVector, Construction) {
TEST(Assert, Death) {
EXPECT_DEATH(private_assert_failed("foo", "bar", 12, "aaa"), "");
}
} // namespace test
} // namespace cu