mirror of
https://github.com/open-goal/jak-project.git
synced 2024-10-20 00:57:44 -04:00
[graphics] reduce the size of fr3 files (#1175)
* first pass * first pass at shrinking fr3s * only need to load vertices once * avx2 detect and switch * fix build * another ifx' * one more * fix the sky and stupid math bug in size check
This commit is contained in:
parent
74d0025974
commit
5135ea9659
|
@ -17,9 +17,8 @@ if(MSVC AND (CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))
|
|||
"-Xclang -fcxx-exceptions \
|
||||
-Xclang -fexceptions \
|
||||
-Xclang -std=c++17 \
|
||||
-mfma -mavx2 \
|
||||
-Wno-c++11-narrowing -W3 \
|
||||
/arch:AVX")
|
||||
-march=native \
|
||||
-Wno-c++11-narrowing -W3")
|
||||
|
||||
# additional c++ flags for release mode for our projects
|
||||
if(CMAKE_BUILD_TYPE MATCHES "Release")
|
||||
|
@ -47,7 +46,7 @@ elseif(UNIX)
|
|||
-Wshadow \
|
||||
-Wsign-promo \
|
||||
-fdiagnostics-color=always \
|
||||
-march=haswell")
|
||||
-march=native")
|
||||
|
||||
# additional c++ flags for release mode for our projects
|
||||
if(CMAKE_BUILD_TYPE MATCHES "Release")
|
||||
|
|
|
@ -3,14 +3,31 @@
|
|||
|
||||
namespace tfrag3 {
|
||||
|
||||
// Save or load every packed TIE vertex array through the serializer.
// The four vectors are visited in a fixed order (color indices, matrices, matrix groups,
// prototype vertices); saving and loading both run this same sequence, so the order must
// not be changed without also changing the file format version.
// Note: cluster_size is not passed to the serializer here.
void PackedTieVertices::serialize(Serializer& ser) {
  ser.from_pod_vector(&color_indices);  // per-output-vertex color index
  ser.from_pod_vector(&matrices);       // instance matrices
  ser.from_pod_vector(&matrix_groups);  // (matrix, vertex range) groups
  ser.from_pod_vector(&vertices);       // prototype vertices
}
|
||||
|
||||
void StripDraw::serialize(Serializer& ser) {
|
||||
ser.from_ptr(&mode);
|
||||
ser.from_ptr(&tree_tex_id);
|
||||
ser.from_pod_vector(&vertex_index_stream);
|
||||
ser.from_pod_vector(&runs);
|
||||
ser.from_pod_vector(&vis_groups);
|
||||
ser.from_ptr(&num_triangles);
|
||||
}
|
||||
|
||||
void StripDraw::unpack() {
|
||||
ASSERT(unpacked.vertex_index_stream.empty());
|
||||
for (auto& r : runs) {
|
||||
for (int i = 0; i < r.length; i++) {
|
||||
unpacked.vertex_index_stream.push_back(r.vertex0 + i);
|
||||
}
|
||||
unpacked.vertex_index_stream.push_back(UINT32_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
void InstancedStripDraw::serialize(Serializer& ser) {
|
||||
ser.from_ptr(&mode);
|
||||
ser.from_ptr(&tree_tex_id);
|
||||
|
@ -37,11 +54,71 @@ void TfragTree::serialize(Serializer& ser) {
|
|||
draw.serialize(ser);
|
||||
}
|
||||
|
||||
ser.from_pod_vector(&vertices);
|
||||
// ser.from_pod_vector(&vertices);
|
||||
ser.from_pod_vector(&packed_vertices.vertices);
|
||||
ser.from_pod_vector(&packed_vertices.cluster_origins);
|
||||
ser.from_pod_vector(&colors);
|
||||
bvh.serialize(ser);
|
||||
}
|
||||
|
||||
void TieTree::unpack() {
|
||||
unpacked.vertices.resize(packed_vertices.color_indices.size());
|
||||
size_t i = 0;
|
||||
for (const auto& grp : packed_vertices.matrix_groups) {
|
||||
if (grp.matrix_idx == -1) {
|
||||
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
|
||||
auto& vtx = unpacked.vertices[i];
|
||||
vtx.color_index = packed_vertices.color_indices[i];
|
||||
const auto& proto_vtx = packed_vertices.vertices[src_idx];
|
||||
vtx.x = proto_vtx.x;
|
||||
vtx.y = proto_vtx.y;
|
||||
vtx.z = proto_vtx.z;
|
||||
vtx.q = 1.f;
|
||||
vtx.s = proto_vtx.s;
|
||||
vtx.t = proto_vtx.t;
|
||||
i++;
|
||||
}
|
||||
} else {
|
||||
const auto& mat = packed_vertices.matrices[grp.matrix_idx];
|
||||
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
|
||||
auto& vtx = unpacked.vertices[i];
|
||||
vtx.color_index = packed_vertices.color_indices[i];
|
||||
const auto& proto_vtx = packed_vertices.vertices[src_idx];
|
||||
auto temp = mat[0] * proto_vtx.x + mat[1] * proto_vtx.y + mat[2] * proto_vtx.z + mat[3];
|
||||
vtx.x = temp.x();
|
||||
vtx.y = temp.y();
|
||||
vtx.z = temp.z();
|
||||
vtx.q = 1.f;
|
||||
vtx.s = proto_vtx.s;
|
||||
vtx.t = proto_vtx.t;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Rebuild the float vertex array (`unpacked.vertices`) from the quantized packed vertices.
// Each packed vertex stores a 16-bit offset per axis inside a "cluster" cell plus the index of
// that cell; the cell's integer coordinates live in packed_vertices.cluster_origins.
// The constants below mirror the ones used by the packing code in the extractor
// (position_to_cluster_and_offset) and must stay in sync with them.
void TfragTree::unpack() {
  // Edge length of one cluster cell.
  // NOTE(review): the original comment called this "100 in-game meters" while the factor is 40
  // - confirm which one is intended.
  constexpr float kClusterSize = 4096 * 40;
  // Offset that was added before quantizing so that all positions became non-negative.
  constexpr float kMasterOffset = 12000 * 4096;
  // Size of one 16-bit offset step.
  constexpr float rescale = kClusterSize / UINT16_MAX;

  const auto& packed = packed_vertices.vertices;
  unpacked.vertices.resize(packed.size());

  for (size_t vi = 0; vi < unpacked.vertices.size(); ++vi) {
    const auto& src = packed[vi];
    auto& dst = unpacked.vertices[vi];

    // bounds-checked: a bad cluster index throws instead of reading garbage
    const auto& origin = packed_vertices.cluster_origins.at(src.cluster_idx);

    // corner of the cluster cell
    const float cell_x = -kMasterOffset + kClusterSize * origin.x();
    const float cell_y = -kMasterOffset + kClusterSize * origin.y();
    const float cell_z = -kMasterOffset + kClusterSize * origin.z();

    // position = cell corner + quantized offset inside the cell
    dst.x = cell_x + src.xoff * rescale;
    dst.y = cell_y + src.yoff * rescale;
    dst.z = cell_z + src.zoff * rescale;

    // texture coordinates are stored multiplied by 1024
    dst.s = src.s / (1024.f);
    dst.t = src.t / (1024.f);
    dst.q = 1.f;
    dst.color_index = src.color_index;
  }
}
|
||||
|
||||
void TieTree::serialize(Serializer& ser) {
|
||||
if (ser.is_saving()) {
|
||||
ser.save<size_t>(static_draws.size());
|
||||
|
@ -62,15 +139,15 @@ void TieTree::serialize(Serializer& ser) {
|
|||
}
|
||||
|
||||
if (ser.is_saving()) {
|
||||
ser.save<size_t>(instance_info.size());
|
||||
ser.save<size_t>(wind_instance_info.size());
|
||||
} else {
|
||||
instance_info.resize(ser.load<size_t>());
|
||||
wind_instance_info.resize(ser.load<size_t>());
|
||||
}
|
||||
for (auto& inst : instance_info) {
|
||||
for (auto& inst : wind_instance_info) {
|
||||
inst.serialize(ser);
|
||||
}
|
||||
|
||||
ser.from_pod_vector(&vertices);
|
||||
packed_vertices.serialize(ser);
|
||||
ser.from_pod_vector(&colors);
|
||||
bvh.serialize(ser);
|
||||
}
|
||||
|
@ -141,4 +218,58 @@ void Level::serialize(Serializer& ser) {
|
|||
}
|
||||
}
|
||||
|
||||
std::array<int, MemoryUsageCategory::NUM_CATEGORIES> Level::get_memory_usage() const {
|
||||
std::array<int, MemoryUsageCategory::NUM_CATEGORIES> result;
|
||||
result.fill(0);
|
||||
|
||||
// textures
|
||||
for (const auto& tex : textures) {
|
||||
result[TEXTURE] += tex.data.size() * sizeof(u32);
|
||||
}
|
||||
|
||||
// tfrag
|
||||
for (const auto& tfrag_tree_geoms : tfrag_trees) {
|
||||
for (const auto& tfrag_tree : tfrag_tree_geoms) {
|
||||
for (const auto& draw : tfrag_tree.draws) {
|
||||
result[TFRAG_INDEX] += draw.runs.size() * sizeof(StripDraw::VertexRun);
|
||||
result[TFRAG_VIS] += draw.vis_groups.size() * sizeof(StripDraw::VisGroup);
|
||||
}
|
||||
result[TFRAG_VERTS] +=
|
||||
tfrag_tree.packed_vertices.vertices.size() * sizeof(PackedTfragVertices::Vertex);
|
||||
result[TFRAG_CLUSTER] +=
|
||||
tfrag_tree.packed_vertices.cluster_origins.size() * sizeof(math::Vector<u16, 3>);
|
||||
result[TFRAG_TIME_OF_DAY] += tfrag_tree.colors.size() * sizeof(TimeOfDayColor);
|
||||
result[TFRAG_BVH] += tfrag_tree.bvh.vis_nodes.size() * sizeof(VisNode);
|
||||
}
|
||||
}
|
||||
|
||||
// tie
|
||||
for (const auto& tie_tree_geoms : tie_trees) {
|
||||
for (const auto& tie_tree : tie_tree_geoms) {
|
||||
result[TIE_BVH] += tie_tree.bvh.vis_nodes.size();
|
||||
for (const auto& draw : tie_tree.static_draws) {
|
||||
result[TIE_DEINST_INDEX] += draw.runs.size() * sizeof(StripDraw::VertexRun);
|
||||
result[TIE_DEINST_VIS] += draw.vis_groups.size() * sizeof(StripDraw::VisGroup);
|
||||
}
|
||||
result[TIE_VERTS] +=
|
||||
tie_tree.packed_vertices.vertices.size() * sizeof(PackedTieVertices::Vertex);
|
||||
result[TIE_CIDX] += tie_tree.packed_vertices.color_indices.size() * sizeof(u16);
|
||||
result[TIE_MATRICES] += tie_tree.packed_vertices.matrices.size() * 4 * 4 * 4;
|
||||
result[TIE_GRPS] +=
|
||||
tie_tree.packed_vertices.matrix_groups.size() * sizeof(PackedTieVertices::MatrixGroup);
|
||||
result[TIE_TIME_OF_DAY] += tie_tree.colors.size() * sizeof(TimeOfDayColor);
|
||||
|
||||
for (const auto& draw : tie_tree.instanced_wind_draws) {
|
||||
result[TIE_INST_INDEX] += draw.vertex_index_stream.size() * sizeof(u32);
|
||||
result[TIE_INST_VIS] +=
|
||||
draw.instance_groups.size() * sizeof(InstancedStripDraw::InstanceGroup);
|
||||
}
|
||||
result[TIE_WIND_INSTANCE_INFO] +=
|
||||
tie_tree.wind_instance_info.size() * sizeof(TieWindInstance);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace tfrag3
|
||||
|
|
|
@ -11,7 +11,39 @@
|
|||
|
||||
namespace tfrag3 {
|
||||
|
||||
constexpr int TFRAG3_VERSION = 10;
|
||||
// NOTE:
|
||||
// when updating any data structures in this file:
|
||||
// - change the TFRAG3_VERSION
|
||||
// - make sure to update the serialize function
|
||||
// - if changing any large things (vertices, vis, bvh, colors, textures) update get_memory_usage
|
||||
// - if adding a new category to the memory usage, update extract_level to print it.
|
||||
|
||||
enum MemoryUsageCategory {
|
||||
TEXTURE,
|
||||
|
||||
TIE_DEINST_VIS,
|
||||
TIE_DEINST_INDEX,
|
||||
TIE_INST_VIS,
|
||||
TIE_INST_INDEX,
|
||||
TIE_BVH,
|
||||
TIE_VERTS,
|
||||
TIE_TIME_OF_DAY,
|
||||
TIE_WIND_INSTANCE_INFO,
|
||||
|
||||
TIE_CIDX,
|
||||
TIE_MATRICES,
|
||||
TIE_GRPS,
|
||||
|
||||
TFRAG_VIS,
|
||||
TFRAG_INDEX,
|
||||
TFRAG_VERTS,
|
||||
TFRAG_CLUSTER,
|
||||
TFRAG_TIME_OF_DAY,
|
||||
TFRAG_BVH,
|
||||
NUM_CATEGORIES
|
||||
};
|
||||
|
||||
constexpr int TFRAG3_VERSION = 11;
|
||||
|
||||
// These vertices should be uploaded to the GPU at load time and don't change
|
||||
struct PreloadedVertex {
|
||||
|
@ -25,6 +57,55 @@ struct PreloadedVertex {
|
|||
};
|
||||
static_assert(sizeof(PreloadedVertex) == 32, "PreloadedVertex size");
|
||||
|
||||
struct PackedTieVertices {
|
||||
struct Vertex {
|
||||
float x, y, z;
|
||||
float s, t;
|
||||
};
|
||||
|
||||
struct MatrixGroup {
|
||||
s32 matrix_idx;
|
||||
u32 start_vert;
|
||||
u32 end_vert;
|
||||
};
|
||||
|
||||
std::vector<u16> color_indices;
|
||||
std::vector<std::array<math::Vector4f, 4>> matrices;
|
||||
std::vector<MatrixGroup> matrix_groups; // todo pack
|
||||
std::vector<Vertex> vertices;
|
||||
float cluster_size = 0;
|
||||
void serialize(Serializer& ser);
|
||||
};
|
||||
|
||||
struct PackedTfragVertices {
|
||||
struct Vertex {
|
||||
u16 xoff, yoff, zoff;
|
||||
u16 cluster_idx;
|
||||
u16 s, t;
|
||||
u16 color_index;
|
||||
|
||||
/*
|
||||
bool operator==(const Vertex& other) const {
|
||||
return xoff == other.xoff && yoff == other.yoff && zoff == other.zoff &&
|
||||
cluster_idx == other.cluster_idx && s == other.s && t == other.t &&
|
||||
color_index == other.color_index;
|
||||
}
|
||||
|
||||
struct hash {
|
||||
auto operator()(const Vertex& x) const {
|
||||
return std::hash<uint16_t>()(x.xoff) ^ std::hash<uint16_t>()(x.yoff) ^
|
||||
std::hash<uint16_t>()(x.zoff) ^ std::hash<uint16_t>()(x.cluster_idx) ^
|
||||
std::hash<uint16_t>()(x.s) ^ std::hash<uint16_t>()(x.t) ^
|
||||
std::hash<uint16_t>()(x.color_index);
|
||||
}
|
||||
};
|
||||
*/
|
||||
};
|
||||
|
||||
std::vector<Vertex> vertices;
|
||||
std::vector<math::Vector<u16, 3>> cluster_origins;
|
||||
};
|
||||
|
||||
// Settings for drawing a group of triangle strips.
|
||||
// This refers to a group of PreloadedVertices that are already uploaded.
|
||||
// All triangles here are drawn in the same "mode" (blending, texture, etc)
|
||||
|
@ -35,9 +116,20 @@ struct StripDraw {
|
|||
DrawMode mode; // the OpenGL draw settings.
|
||||
u32 tree_tex_id = 0; // the texture that should be bound for the draw
|
||||
|
||||
// the list of vertices in the draw. This includes the restart code of UINT32_MAX that OpenGL
|
||||
// will use to start a new strip.
|
||||
std::vector<u32> vertex_index_stream;
|
||||
struct {
|
||||
// the list of vertices in the draw. This includes the restart code of UINT32_MAX that OpenGL
|
||||
// will use to start a new strip.
|
||||
std::vector<u32> vertex_index_stream;
|
||||
} unpacked;
|
||||
|
||||
void unpack();
|
||||
|
||||
struct VertexRun {
|
||||
u32 vertex0;
|
||||
u16 length;
|
||||
};
|
||||
|
||||
std::vector<VertexRun> runs;
|
||||
|
||||
// to do culling, the above vertex stream is grouped.
|
||||
// by following the visgroups and checking the visibility, you can leave out invisible vertices.
|
||||
|
@ -129,11 +221,16 @@ constexpr const char* tfrag_tree_names[] = {"normal", "trans", "dirt",
|
|||
|
||||
// A tfrag model
|
||||
struct TfragTree {
|
||||
TFragmentTreeKind kind; // our tfrag kind
|
||||
std::vector<StripDraw> draws; // the actual topology and settings
|
||||
std::vector<PreloadedVertex> vertices; // mesh vertices
|
||||
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
|
||||
BVH bvh; // the bvh for frustum culling
|
||||
TFragmentTreeKind kind; // our tfrag kind
|
||||
std::vector<StripDraw> draws; // the actual topology and settings
|
||||
PackedTfragVertices packed_vertices;
|
||||
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
|
||||
BVH bvh; // the bvh for frustum culling
|
||||
|
||||
struct {
|
||||
std::vector<PreloadedVertex> vertices; // mesh vertices
|
||||
} unpacked;
|
||||
void unpack();
|
||||
void serialize(Serializer& ser);
|
||||
};
|
||||
|
||||
|
@ -147,14 +244,20 @@ struct TieWindInstance {
|
|||
// A tie model
|
||||
struct TieTree {
|
||||
BVH bvh;
|
||||
std::vector<StripDraw> static_draws; // the actual topology and settings
|
||||
std::vector<PreloadedVertex> vertices; // mesh vertices
|
||||
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
|
||||
std::vector<StripDraw> static_draws; // the actual topology and settings
|
||||
|
||||
PackedTieVertices packed_vertices;
|
||||
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
|
||||
|
||||
std::vector<InstancedStripDraw> instanced_wind_draws;
|
||||
std::vector<TieWindInstance> instance_info;
|
||||
std::vector<TieWindInstance> wind_instance_info;
|
||||
|
||||
struct {
|
||||
std::vector<PreloadedVertex> vertices; // mesh vertices
|
||||
} unpacked;
|
||||
|
||||
void serialize(Serializer& ser);
|
||||
void unpack();
|
||||
};
|
||||
|
||||
struct Level {
|
||||
|
@ -165,6 +268,8 @@ struct Level {
|
|||
std::array<std::vector<TieTree>, 4> tie_trees;
|
||||
u16 version2 = TFRAG3_VERSION;
|
||||
void serialize(Serializer& ser);
|
||||
|
||||
std::array<int, MemoryUsageCategory::NUM_CATEGORIES> get_memory_usage() const;
|
||||
};
|
||||
|
||||
} // namespace tfrag3
|
||||
|
|
|
@ -13,7 +13,12 @@
|
|||
#include "common/util/BinaryReader.h"
|
||||
#include "BinaryWriter.h"
|
||||
#include "common/common_types.h"
|
||||
|
||||
// This disables the use of PCLMULQDQ which is probably ok, but let's just be safe and disable it
|
||||
// because nobody will care if png compression is 10% slower.
|
||||
#define FPNG_NO_SSE 1
|
||||
#include "third-party/fpng/fpng.cpp"
|
||||
|
||||
#include "third-party/fpng/fpng.h"
|
||||
#include "third-party/fmt/core.h"
|
||||
#include "third-party/lzokay/lzokay.hpp"
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
#include "os.h"
|
||||
|
||||
#include "common/common_types.h"
|
||||
|
||||
#ifdef __linux__
|
||||
|
||||
#include <sys/resource.h>
|
||||
|
@ -14,4 +16,72 @@ size_t get_peak_rss() {
|
|||
size_t get_peak_rss() {
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
// windows has a __cpuid
|
||||
#include <intrin.h>
|
||||
#else
|
||||
// using int to be compatible with msvc's intrinsic
|
||||
void __cpuidex(int result[4], int eax, int ecx) {
|
||||
asm("cpuid\n\t"
|
||||
: "=a"(result[0]), "=b"(result[1]), "=c"(result[2]), "=d"(result[3])
|
||||
: "0"(eax), "2"(ecx));
|
||||
}
|
||||
#endif
|
||||
|
||||
CpuInfo gCpuInfo;
|
||||
|
||||
void setup_cpu_info() {
|
||||
if (gCpuInfo.initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
// as a test, get the brand and model
|
||||
for (u32 i = 0x80000002; i <= 0x80000004; i++) {
|
||||
int result[4];
|
||||
__cpuidex(result, i, 0);
|
||||
for (auto reg : result) {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
gCpuInfo.model.push_back(reg);
|
||||
reg >>= 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int result[4];
|
||||
__cpuidex(result, 0, 0);
|
||||
for (auto r : {1, 3, 2}) {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
gCpuInfo.brand.push_back(result[r]);
|
||||
result[r] >>= 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check for AVX2
|
||||
{
|
||||
int result[4];
|
||||
__cpuidex(result, 7, 0);
|
||||
gCpuInfo.has_avx2 = result[1] & (1 << 5);
|
||||
}
|
||||
|
||||
{
|
||||
int result[4];
|
||||
__cpuidex(result, 1, 0);
|
||||
gCpuInfo.has_avx = result[2] & (1 << 28);
|
||||
}
|
||||
|
||||
printf("-------- CPU Information --------\n");
|
||||
printf(" Brand: %s\n", gCpuInfo.brand.c_str());
|
||||
printf(" Model: %s\n", gCpuInfo.model.c_str());
|
||||
printf(" AVX : %s\n", gCpuInfo.has_avx ? "true" : "false");
|
||||
printf(" AVX2 : %s\n", gCpuInfo.has_avx2 ? "true" : "false");
|
||||
|
||||
gCpuInfo.initialized = true;
|
||||
}
|
||||
|
||||
CpuInfo& get_cpu_info() {
|
||||
return gCpuInfo;
|
||||
}
|
|
@ -1,6 +1,19 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <string>
|
||||
|
||||
// Note: these are not implemented on windows and will return zero.
|
||||
size_t get_peak_rss();
|
||||
size_t get_peak_rss();
|
||||
void setup_cpu_info();
|
||||
|
||||
struct CpuInfo {
|
||||
bool initialized = false;
|
||||
bool has_avx = false;
|
||||
bool has_avx2 = false;
|
||||
|
||||
std::string brand;
|
||||
std::string model;
|
||||
};
|
||||
|
||||
CpuInfo& get_cpu_info();
|
||||
|
|
|
@ -48,6 +48,45 @@ bool is_valid_bsp(const decompiler::LinkedObjectFile& file) {
|
|||
return true;
|
||||
}
|
||||
|
||||
// Print a per-category breakdown of where the bytes of a serialized level go.
//  lev:                    the level to inspect (sizes come from Level::get_memory_usage)
//  uncompressed_data_size: total size of the serialized level; anything not attributed to a
//                          known category is reported as "unknown"
// Rows are printed largest first, with the size in kB and as a percentage of the total.
void print_memory_usage(const tfrag3::Level& lev, int uncompressed_data_size) {
  using Cat = tfrag3::MemoryUsageCategory;
  const auto usage = lev.get_memory_usage();

  // display name for every category that get_memory_usage reports
  const std::pair<const char*, Cat> labels[] = {
      {"texture", Cat::TEXTURE},
      {"tie-deinst-vis", Cat::TIE_DEINST_VIS},
      {"tie-deinst-idx", Cat::TIE_DEINST_INDEX},
      {"tie-inst-vis", Cat::TIE_INST_VIS},
      {"tie-inst-idx", Cat::TIE_INST_INDEX},
      {"tie-bvh", Cat::TIE_BVH},
      {"tie-verts", Cat::TIE_VERTS},
      {"tie-colors", Cat::TIE_TIME_OF_DAY},
      {"tie-wind-inst-info", Cat::TIE_WIND_INSTANCE_INFO},
      {"tie-cidx", Cat::TIE_CIDX},
      {"tie-mats", Cat::TIE_MATRICES},
      {"tie-grps", Cat::TIE_GRPS},
      {"tfrag-vis", Cat::TFRAG_VIS},
      {"tfrag-idx", Cat::TFRAG_INDEX},
      {"tfrag-vert", Cat::TFRAG_VERTS},
      {"tfrag-colors", Cat::TFRAG_TIME_OF_DAY},
      {"tfrag-cluster", Cat::TFRAG_CLUSTER},
      {"tfrag-bvh", Cat::TFRAG_BVH},
  };

  std::vector<std::pair<std::string, int>> rows;
  int accounted_bytes = 0;
  for (const auto& [label, category] : labels) {
    const int bytes = usage[category];
    rows.emplace_back(label, bytes);
    accounted_bytes += bytes;
  }

  // whatever the categories above do not explain
  rows.emplace_back("unknown", uncompressed_data_size - accounted_bytes);

  // biggest consumers first
  std::sort(rows.begin(), rows.end(),
            [](const auto& lhs, const auto& rhs) { return lhs.second > rhs.second; });

  for (const auto& row : rows) {
    fmt::print("{:30s} : {:6d} kB {:3.1f}%\n", row.first, row.second / 1024,
               100.f * (float)row.second / uncompressed_data_size);
  }
}
|
||||
|
||||
void extract_from_level(ObjectFileDB& db,
|
||||
TextureDB& tex_db,
|
||||
const std::string& dgo_name,
|
||||
|
@ -85,7 +124,6 @@ void extract_from_level(ObjectFileDB& db,
|
|||
for (auto& draw_tree : bsp_header.drawable_tree_array.trees) {
|
||||
if (tfrag_trees.count(draw_tree->my_type())) {
|
||||
auto as_tfrag_tree = dynamic_cast<level_tools::DrawableTreeTfrag*>(draw_tree.get());
|
||||
fmt::print(" extracting tree {}\n", draw_tree->my_type());
|
||||
ASSERT(as_tfrag_tree);
|
||||
std::vector<std::pair<int, int>> expected_missing_textures;
|
||||
auto it = hacks.missing_textures_by_level.find(level_name);
|
||||
|
@ -96,13 +134,12 @@ void extract_from_level(ObjectFileDB& db,
|
|||
bsp_header.texture_remap_table, tex_db, expected_missing_textures, tfrag_level,
|
||||
dump_level);
|
||||
} else if (draw_tree->my_type() == "drawable-tree-instance-tie") {
|
||||
fmt::print(" extracting TIE\n");
|
||||
auto as_tie_tree = dynamic_cast<level_tools::DrawableTreeInstanceTie*>(draw_tree.get());
|
||||
ASSERT(as_tie_tree);
|
||||
extract_tie(as_tie_tree, fmt::format("{}-{}-tie", dgo_name, i++),
|
||||
bsp_header.texture_remap_table, tex_db, tfrag_level, dump_level);
|
||||
} else {
|
||||
fmt::print(" unsupported tree {}\n", draw_tree->my_type());
|
||||
// fmt::print(" unsupported tree {}\n", draw_tree->my_type());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -110,6 +147,7 @@ void extract_from_level(ObjectFileDB& db,
|
|||
tfrag_level.serialize(ser);
|
||||
auto compressed =
|
||||
compression::compress_zstd(ser.get_save_result().first, ser.get_save_result().second);
|
||||
print_memory_usage(tfrag_level, ser.get_save_result().second);
|
||||
fmt::print("compressed: {} -> {} ({:.2f}%)\n", ser.get_save_result().second, compressed.size(),
|
||||
100.f * compressed.size() / ser.get_save_result().second);
|
||||
file_util::write_binary_file(file_util::get_file_path({fmt::format(
|
||||
|
|
|
@ -1975,13 +1975,14 @@ std::map<u32, std::vector<GroupedDraw>> make_draw_groups(std::vector<TFragDraw>&
|
|||
}
|
||||
}
|
||||
|
||||
fmt::print(" grouped to get {} draw calls\n", dc);
|
||||
// fmt::print(" grouped to get {} draw calls\n", dc);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void make_tfrag3_data(std::map<u32, std::vector<GroupedDraw>>& draws,
|
||||
tfrag3::TfragTree& tree_out,
|
||||
std::vector<tfrag3::PreloadedVertex>& vertices,
|
||||
std::vector<tfrag3::Texture>& texture_pool,
|
||||
const TextureDB& tdb,
|
||||
const std::vector<std::pair<int, int>>& expected_missing_textures) {
|
||||
|
@ -2045,6 +2046,9 @@ void make_tfrag3_data(std::map<u32, std::vector<GroupedDraw>>& draws,
|
|||
vgroup.num = strip.verts.size() + 1; // one for the primitive restart!
|
||||
|
||||
tdraw.num_triangles += strip.verts.size() - 2;
|
||||
tfrag3::StripDraw::VertexRun run;
|
||||
run.vertex0 = vertices.size();
|
||||
run.length = strip.verts.size();
|
||||
for (auto& vert : strip.verts) {
|
||||
// convert vert.
|
||||
tfrag3::PreloadedVertex vtx;
|
||||
|
@ -2060,12 +2064,10 @@ void make_tfrag3_data(std::map<u32, std::vector<GroupedDraw>>& draws,
|
|||
// ASSERT((vert.rgba >> 2) < 1024); spider cave has 2048?
|
||||
ASSERT((vert.rgba & 3) == 0);
|
||||
|
||||
size_t vert_idx = tree_out.vertices.size();
|
||||
tree_out.vertices.push_back(vtx);
|
||||
tdraw.vertex_index_stream.push_back(vert_idx);
|
||||
size_t vert_idx = vertices.size();
|
||||
vertices.push_back(vtx);
|
||||
}
|
||||
tdraw.vertex_index_stream.push_back(UINT32_MAX); // prim restart
|
||||
|
||||
tdraw.runs.push_back(run);
|
||||
tdraw.vis_groups.push_back(vgroup);
|
||||
}
|
||||
|
||||
|
@ -2080,6 +2082,7 @@ void emulate_tfrags(int geom,
|
|||
const std::vector<level_tools::TextureRemap>& map,
|
||||
tfrag3::Level& level_out,
|
||||
tfrag3::TfragTree& tree_out,
|
||||
std::vector<tfrag3::PreloadedVertex>& vertices,
|
||||
const TextureDB& tdb,
|
||||
const std::vector<std::pair<int, int>>& expected_missing_textures,
|
||||
bool dump_level) {
|
||||
|
@ -2101,7 +2104,7 @@ void emulate_tfrags(int geom,
|
|||
process_draw_mode(all_draws, map, tree_out.kind);
|
||||
auto groups = make_draw_groups(all_draws);
|
||||
|
||||
make_tfrag3_data(groups, tree_out, level_out.textures, tdb, expected_missing_textures);
|
||||
make_tfrag3_data(groups, tree_out, vertices, level_out.textures, tdb, expected_missing_textures);
|
||||
|
||||
if (dump_level) {
|
||||
auto debug_out = debug_dump_to_obj(all_draws);
|
||||
|
@ -2135,6 +2138,85 @@ void merge_groups(std::vector<tfrag3::StripDraw::VisGroup>& grps) {
|
|||
|
||||
} // namespace
|
||||
|
||||
constexpr float kClusterSize = 4096 * 40; // 100 in-game meters
|
||||
constexpr float kMasterOffset = 12000 * 4096;
|
||||
|
||||
std::pair<u64, u16> position_to_cluster_and_offset(float in) {
|
||||
in += kMasterOffset;
|
||||
if (in < 0) {
|
||||
fmt::print("negative: {}\n", in);
|
||||
}
|
||||
ASSERT(in >= 0);
|
||||
int cluster_cell = (in / kClusterSize);
|
||||
float leftover = in - (cluster_cell * kClusterSize);
|
||||
u16 offset = (leftover / kClusterSize) * float(UINT16_MAX);
|
||||
|
||||
float recovered = ((float)cluster_cell + ((float)offset / UINT16_MAX)) * kClusterSize;
|
||||
float diff = std::fabs(recovered - in);
|
||||
ASSERT(diff < 7);
|
||||
ASSERT(cluster_cell >= 0);
|
||||
ASSERT(cluster_cell < UINT16_MAX);
|
||||
return {cluster_cell, offset};
|
||||
}
|
||||
|
||||
// Convert float vertices into the compact PackedTfragVertices form.
//  result:   receives one packed vertex per input vertex (appended to result->vertices) and
//            the table of cluster cells they refer to (result->cluster_origins).
//  vertices: the vertices to pack.
// Every axis is split into a cell index and a 16-bit offset (position_to_cluster_and_offset).
// The three cell indices are combined into one 64-bit key (x in bits 0-15, y in 16-31,
// z in 32-47); each distinct key is assigned the next free cluster index.
// Asserts that fewer than UINT16_MAX clusters were needed, since the per-vertex cluster index
// is stored in 16 bits.
void pack_vertices(tfrag3::PackedTfragVertices* result,
                   const std::vector<tfrag3::PreloadedVertex>& vertices) {
  u32 next_cluster_idx = 0;
  std::map<u64, u32> clusters;  // cell key -> cluster index

  for (const auto& vtx : vertices) {
    const auto x = position_to_cluster_and_offset(vtx.x);
    const auto y = position_to_cluster_and_offset(vtx.y);
    const auto z = position_to_cluster_and_offset(vtx.z);
    const u64 cluster_id = x.first | (y.first << 16) | (z.first << 32);

    // look up the cell, registering it under the next index if this is its first vertex
    const auto [cluster_it, is_new] = clusters.try_emplace(cluster_id, next_cluster_idx);
    if (is_new) {
      next_cluster_idx++;
    }
    const u32 my_cluster_idx = cluster_it->second;

    tfrag3::PackedTfragVertices::Vertex out_vtx;
    out_vtx.xoff = x.second;
    out_vtx.yoff = y.second;
    out_vtx.zoff = z.second;
    out_vtx.cluster_idx = my_cluster_idx;
    // TODO check these
    out_vtx.s = vtx.s * 1024;
    out_vtx.t = vtx.t * 1024;
    out_vtx.color_index = vtx.color_index;
    result->vertices.push_back(out_vtx);
  }

  // write each cell's integer coordinates into its slot of the origin table
  result->cluster_origins.resize(next_cluster_idx);
  for (const auto& [key, slot] : clusters) {
    auto& origin = result->cluster_origins[slot];
    origin.x() = (u16)key;
    origin.y() = (u16)(key >> 16);
    origin.z() = (u16)(key >> 32);
  }

  /*
  std::unordered_set<tfrag3::PackedTfragVertices::Vertex, tfrag3::PackedTfragVertices::Vertex::hash>
      a;
  for (auto& v : result->vertices) {
    a.insert(v);
  }
  fmt::print("SIZE: {} vs {} {}\n", a.size(), result->vertices.size(),
             (float)a.size() / result->vertices.size());
  */

  ASSERT(next_cluster_idx < UINT16_MAX);
}
|
||||
|
||||
void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
|
||||
const std::string& debug_name,
|
||||
const std::vector<level_tools::TextureRemap>& map,
|
||||
|
@ -2142,7 +2224,7 @@ void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
|
|||
const std::vector<std::pair<int, int>>& expected_missing_textures,
|
||||
tfrag3::Level& out,
|
||||
bool dump_level) {
|
||||
// go through 4 lods(?)
|
||||
// go through 3 lods(?)
|
||||
for (int geom = 0; geom < GEOM_MAX; ++geom) {
|
||||
tfrag3::TfragTree this_tree;
|
||||
if (tree->my_type() == "drawable-tree-tfrag") {
|
||||
|
@ -2176,7 +2258,8 @@ void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
|
|||
}
|
||||
bool ok = verify_node_indices(tree);
|
||||
ASSERT(ok);
|
||||
fmt::print(" tree has {} arrays and {} tfragments\n", tree->length, as_tfrag_array->length);
|
||||
// fmt::print(" tree has {} arrays and {} tfragments\n", tree->length,
|
||||
// as_tfrag_array->length);
|
||||
|
||||
auto vis_nodes = extract_vis_data(tree, as_tfrag_array->tfragments.front().id);
|
||||
this_tree.bvh.first_leaf_node = vis_nodes.first_child_node;
|
||||
|
@ -2198,8 +2281,10 @@ void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
|
|||
}
|
||||
// ASSERT(result.vis_nodes.last_child_node + 1 == idx);
|
||||
|
||||
emulate_tfrags(geom, as_tfrag_array->tfragments, debug_name, map, out, this_tree, tex_db,
|
||||
expected_missing_textures, dump_level);
|
||||
std::vector<tfrag3::PreloadedVertex> vertices;
|
||||
emulate_tfrags(geom, as_tfrag_array->tfragments, debug_name, map, out, this_tree, vertices,
|
||||
tex_db, expected_missing_textures, dump_level);
|
||||
pack_vertices(&this_tree.packed_vertices, vertices);
|
||||
extract_time_of_day(tree, this_tree);
|
||||
|
||||
for (auto& draw : this_tree.draws) {
|
||||
|
|
|
@ -558,9 +558,9 @@ void update_proto_info(std::vector<TieProtoInfo>* out,
|
|||
adgif.combo_tex = tex_combo;
|
||||
// and the hidden value in the unused a+d
|
||||
memcpy(&adgif.second_w, &gif_data.at(16 * (tex_idx * 5 + 1) + 12), 4);
|
||||
// todo: figure out if this matters
|
||||
// todo: figure out if this matters. maybe this is decal?
|
||||
if (ra_tex0_val == 0x800000000) {
|
||||
fmt::print("texture {} in {} has weird tex setting\n", tex->second.name, proto.name);
|
||||
// fmt::print("texture {} in {} has weird tex setting\n", tex->second.name, proto.name);
|
||||
}
|
||||
|
||||
// mipmap settings. we ignore, but get the hidden value
|
||||
|
@ -2036,19 +2036,40 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
|
|||
// bool using_wind = true; // hack, for testing
|
||||
bool using_wind = proto.stiffness != 0.f;
|
||||
|
||||
// create the model first
|
||||
std::vector<std::vector<std::pair<int, int>>> packed_vert_indices;
|
||||
for (size_t frag_idx = 0; frag_idx < proto.frags.size(); frag_idx++) {
|
||||
packed_vert_indices.emplace_back();
|
||||
auto& frag_vert_indices = packed_vert_indices.back();
|
||||
auto& frag = proto.frags[frag_idx]; // shared info for all instances of this frag
|
||||
for (auto& strip : frag.strips) {
|
||||
int start = tree.packed_vertices.vertices.size();
|
||||
for (auto& vert : strip.verts) {
|
||||
tree.packed_vertices.vertices.push_back(
|
||||
{vert.pos.x(), vert.pos.y(), vert.pos.z(), vert.tex.x(), vert.tex.y()});
|
||||
ASSERT(vert.tex.z() == 1.);
|
||||
}
|
||||
int end = tree.packed_vertices.vertices.size();
|
||||
frag_vert_indices.emplace_back(start, end);
|
||||
}
|
||||
}
|
||||
|
||||
// loop over instances of the prototypes
|
||||
for (auto& inst : proto.instances) {
|
||||
// if we're using wind, we use the instanced renderer, which requires some extra info
|
||||
// and we should remember which instance ID we are.
|
||||
// Note: this is different from the game's instance index - we don't draw everything instanced
|
||||
// so the non-instanced models don't get a C++ renderer instance ID
|
||||
u32 wind_instance_idx = tree.instance_info.size();
|
||||
u32 wind_instance_idx = tree.wind_instance_info.size();
|
||||
u32 matrix_idx = tree.packed_vertices.matrices.size();
|
||||
if (using_wind) {
|
||||
tfrag3::TieWindInstance wind_instance_info;
|
||||
wind_instance_info.wind_idx = inst.wind_index; // which wind value to apply in the table
|
||||
wind_instance_info.stiffness = proto.stiffness; // wind stiffness (how much we move)
|
||||
wind_instance_info.matrix = inst.mat; // instance transformation matrix.
|
||||
tree.instance_info.push_back(wind_instance_info);
|
||||
tree.wind_instance_info.push_back(wind_instance_info);
|
||||
} else {
|
||||
tree.packed_vertices.matrices.push_back(inst.mat);
|
||||
}
|
||||
|
||||
// loop over fragments of the prototype
|
||||
|
@ -2056,7 +2077,8 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
|
|||
auto& frag = proto.frags[frag_idx]; // shared info for all instances of this frag
|
||||
auto& ifrag = inst.frags.at(frag_idx); // color info for this instance of the frag
|
||||
// loop over triangle strips within the fragment
|
||||
for (auto& strip : frag.strips) {
|
||||
for (size_t strip_idx = 0; strip_idx < frag.strips.size(); strip_idx++) {
|
||||
auto& strip = frag.strips[strip_idx];
|
||||
// what texture are we using?
|
||||
u32 combo_tex = strip.adgif.combo_tex;
|
||||
|
||||
|
@ -2139,31 +2161,30 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
|
|||
igroup.instance_idx = wind_instance_idx;
|
||||
draw_to_add_to->num_triangles += strip.verts.size() - 2;
|
||||
// note: this is a bit wasteful to duplicate the xyz/stq.
|
||||
tfrag3::PackedTieVertices::MatrixGroup grp;
|
||||
grp.matrix_idx = -1;
|
||||
grp.start_vert = packed_vert_indices.at(frag_idx).at(strip_idx).first;
|
||||
grp.end_vert = packed_vert_indices.at(frag_idx).at(strip_idx).second;
|
||||
tree.packed_vertices.matrix_groups.push_back(grp);
|
||||
for (auto& vert : strip.verts) {
|
||||
tfrag3::PreloadedVertex vtx;
|
||||
vtx.x = vert.pos.x();
|
||||
vtx.y = vert.pos.y();
|
||||
vtx.z = vert.pos.z();
|
||||
vtx.s = vert.tex.x();
|
||||
vtx.t = vert.tex.y();
|
||||
vtx.q = vert.tex.z();
|
||||
// if this is true, we can remove a divide in the shader
|
||||
ASSERT(vtx.q == 1.f);
|
||||
u16 color_index = 0;
|
||||
if (vert.color_index_index == UINT32_MAX) {
|
||||
vtx.color_index = 0;
|
||||
color_index = 0;
|
||||
} else {
|
||||
vtx.color_index = ifrag.color_indices.at(vert.color_index_index);
|
||||
color_index = ifrag.color_indices.at(vert.color_index_index);
|
||||
ASSERT(vert.color_index_index < ifrag.color_indices.size());
|
||||
vtx.color_index += ifrag.color_index_offset_in_big_palette;
|
||||
color_index += ifrag.color_index_offset_in_big_palette;
|
||||
}
|
||||
|
||||
size_t vert_idx = tree.vertices.size();
|
||||
tree.vertices.push_back(vtx);
|
||||
size_t vert_idx = tree.packed_vertices.color_indices.size();
|
||||
tree.packed_vertices.color_indices.push_back(color_index);
|
||||
draw_to_add_to->vertex_index_stream.push_back(vert_idx);
|
||||
}
|
||||
|
||||
// the primitive restart index
|
||||
draw_to_add_to->vertex_index_stream.push_back(UINT32_MAX);
|
||||
draw_to_add_to->instance_groups.push_back(igroup);
|
||||
|
||||
} else {
|
||||
// okay, we now have a texture and draw mode, let's see if we can add to an existing...
|
||||
auto existing_draws_in_tex = static_draws_by_tex.find(idx_in_lev_data);
|
||||
|
@ -2190,31 +2211,30 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
|
|||
vgroup.vis_idx_in_pc_bvh = inst.vis_id; // associate with the instance for culling
|
||||
vgroup.num = strip.verts.size() + 1; // one for the primitive restart!
|
||||
draw_to_add_to->num_triangles += strip.verts.size() - 2;
|
||||
tfrag3::PackedTieVertices::MatrixGroup grp;
|
||||
grp.matrix_idx = matrix_idx;
|
||||
grp.start_vert = packed_vert_indices.at(frag_idx).at(strip_idx).first;
|
||||
grp.end_vert = packed_vert_indices.at(frag_idx).at(strip_idx).second;
|
||||
tree.packed_vertices.matrix_groups.push_back(grp);
|
||||
tfrag3::StripDraw::VertexRun run;
|
||||
run.vertex0 = tree.packed_vertices.color_indices.size();
|
||||
run.length = strip.verts.size();
|
||||
for (auto& vert : strip.verts) {
|
||||
tfrag3::PreloadedVertex vtx;
|
||||
// todo fields
|
||||
auto tf = transform_tie(inst.mat, vert.pos);
|
||||
vtx.x = tf.x();
|
||||
vtx.y = tf.y();
|
||||
vtx.z = tf.z();
|
||||
vtx.s = vert.tex.x();
|
||||
vtx.t = vert.tex.y();
|
||||
vtx.q = vert.tex.z();
|
||||
// if this is true, we can remove a divide in the shader
|
||||
ASSERT(vtx.q == 1.f);
|
||||
u16 color_index = 0;
|
||||
if (vert.color_index_index == UINT32_MAX) {
|
||||
vtx.color_index = 0;
|
||||
color_index = 0;
|
||||
} else {
|
||||
vtx.color_index = ifrag.color_indices.at(vert.color_index_index);
|
||||
color_index = ifrag.color_indices.at(vert.color_index_index);
|
||||
ASSERT(vert.color_index_index < ifrag.color_indices.size());
|
||||
vtx.color_index += ifrag.color_index_offset_in_big_palette;
|
||||
color_index += ifrag.color_index_offset_in_big_palette;
|
||||
}
|
||||
|
||||
size_t vert_idx = tree.vertices.size();
|
||||
tree.vertices.push_back(vtx);
|
||||
draw_to_add_to->vertex_index_stream.push_back(vert_idx);
|
||||
size_t vert_idx = tree.packed_vertices.color_indices.size();
|
||||
tree.packed_vertices.color_indices.push_back(color_index);
|
||||
// draw_to_add_to->vertex_index_stream.push_back(vert_idx);
|
||||
}
|
||||
draw_to_add_to->vertex_index_stream.push_back(UINT32_MAX);
|
||||
draw_to_add_to->runs.push_back(run);
|
||||
// draw_to_add_to->vertex_index_stream.push_back(UINT32_MAX);
|
||||
draw_to_add_to->vis_groups.push_back(vgroup);
|
||||
}
|
||||
}
|
||||
|
@ -2285,8 +2305,6 @@ void extract_tie(const level_tools::DrawableTreeInstanceTie* tree,
|
|||
}
|
||||
bool ok = verify_node_indices(tree);
|
||||
ASSERT(ok);
|
||||
fmt::print(" tree has {} arrays and {} instances\n", tree->length,
|
||||
as_instance_array->length);
|
||||
|
||||
// extract the vis tree. Note that this extracts the tree only down to the last draw node, a
|
||||
// parent of between 1 and 8 instances.
|
||||
|
@ -2362,7 +2380,6 @@ void extract_tie(const level_tools::DrawableTreeInstanceTie* tree,
|
|||
}
|
||||
|
||||
this_tree.colors = full_palette.colors;
|
||||
fmt::print("TIE tree {} has {} draws\n", geo, this_tree.static_draws.size());
|
||||
out.tie_trees[geo].push_back(std::move(this_tree));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -601,7 +601,11 @@ void DirectRenderer::render_gif(const u8* data,
|
|||
}
|
||||
|
||||
// A size of UINT32_MAX skips this check. Otherwise, the number of bytes consumed (offset, rounded
// up to a whole 16-byte quadword) must equal the given size in quadwords. On a mismatch, print
// both values before asserting so the failing renderer can be identified.
if (size != UINT32_MAX) {
  // compare the quadword counts with != : a leading ! would bind to (offset + 15) alone, before
  // the divide, and the comparison would no longer test for a mismatch.
  if ((offset + 15) / 16 != size / 16) {
    fmt::print("DirectRenderer size failed in {}\n", name_and_id());
    fmt::print("expected: {}, got: {}\n", size, offset);
    ASSERT(false);
  }
}
|
||||
|
||||
// fmt::print("{}\n", GifTag(data).print());
|
||||
|
|
|
@ -56,8 +56,27 @@ void Loader::loader_thread() {
|
|||
Serializer ser(decomp_data.data(), decomp_data.size());
|
||||
result->serialize(ser);
|
||||
double import_time = import_timer.getSeconds();
|
||||
fmt::print("------------> Load from file: {:.3f}s, import {:.3f}s, decomp {:.3f}s\n",
|
||||
disk_load_time, import_time, decomp_time);
|
||||
|
||||
Timer unpack_timer;
|
||||
for (auto& tie_tree : result->tie_trees) {
|
||||
for (auto& tree : tie_tree) {
|
||||
tree.unpack();
|
||||
for (auto& d : tree.static_draws) {
|
||||
d.unpack();
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto& t_tree : result->tfrag_trees) {
|
||||
for (auto& tree : t_tree) {
|
||||
tree.unpack();
|
||||
for (auto& d : tree.draws) {
|
||||
d.unpack();
|
||||
}
|
||||
}
|
||||
}
|
||||
fmt::print(
|
||||
"------------> Load from file: {:.3f}s, import {:.3f}s, decomp {:.3f}s unpack {:.3f}s\n",
|
||||
disk_load_time, import_time, decomp_time, unpack_timer.getSeconds());
|
||||
|
||||
lk.lock();
|
||||
m_initializing_tfrag3_levels[lev].data.level = std::move(result);
|
||||
|
|
|
@ -353,14 +353,14 @@ struct alignas(16) Accumulator {
|
|||
auto b = _mm_set1_ps(_b);
|
||||
auto a = _mm_load_ps(_a.data);
|
||||
auto acc = _mm_load_ps(data);
|
||||
_mm_store_ps(data, _mm_fmadd_ps(a, b, acc));
|
||||
_mm_store_ps(data, _mm_add_ps(_mm_mul_ps(a, b), acc));
|
||||
}
|
||||
|
||||
// Accumulate the element-wise product of _a and _b into this accumulator: data += _a * _b on all
// four lanes. Uses separate multiply and add intrinsics, so no FMA intrinsic is required.
REALLY_INLINE void madda_xyzw(const Vf& _a, const Vf& _b) {
  const __m128 product = _mm_mul_ps(_mm_load_ps(_a.data), _mm_load_ps(_b.data));
  _mm_store_ps(data, _mm_add_ps(product, _mm_load_ps(data)));
}
|
||||
|
||||
void madd(Mask mask, Vf& dest, const Vf& a, const Vf& b) {
|
||||
|
@ -375,14 +375,14 @@ struct alignas(16) Accumulator {
|
|||
auto b = _mm_set1_ps(_b);
|
||||
auto a = _mm_load_ps(_a.data);
|
||||
auto acc = _mm_load_ps(data);
|
||||
_mm_store_ps(dest.data, _mm_fmadd_ps(a, b, acc));
|
||||
_mm_store_ps(dest.data, _mm_add_ps(_mm_mul_ps(a, b), acc));
|
||||
}
|
||||
|
||||
// Write (accumulator + _a * _b) into the low three lanes of dest, with the scalar _b broadcast to
// every lane. The blend mask 0b1000 takes the top lane from dest's current contents, so that lane
// of dest is left unchanged. The accumulator itself is not modified.
REALLY_INLINE void madd_xyz(Vf& dest, const Vf& _a, float _b) {
  const __m128 scaled = _mm_mul_ps(_mm_load_ps(_a.data), _mm_set1_ps(_b));
  const __m128 sum = _mm_add_ps(scaled, _mm_load_ps(data));
  const __m128 merged = _mm_blend_ps(sum, _mm_load_ps(dest.data), 0b1000);
  _mm_store_ps(dest.data, merged);
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#include "SkyBlendCPU.h"
|
||||
#include "game/graphics/opengl_renderer/AdgifHandler.h"
|
||||
#include "common/util/os.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
|
@ -18,33 +19,72 @@ SkyBlendCPU::~SkyBlendCPU() {
|
|||
}
|
||||
|
||||
/*!
 * First sky blend pass: overwrite out with the scaled input texture.
 * For every processed byte: out = min(255, (in * intensity) >> 7).
 * Uses 16 bytes per step when the CPU info reports AVX2, otherwise 8 bytes per step with SSE.
 * Bytes after the last full step (size not a multiple of the step width) are not written.
 */
void blend_sky_initial_fast(u8 intensity, u8* out, const u8* in, u32 size) {
  if (!get_cpu_info().has_avx2) {
    // SSE fallback: widen 8 bytes to 16-bit lanes, scale, then narrow with unsigned saturation.
    const __m128i scale = _mm_set1_epi16(intensity);
    for (u32 chunk = 0; chunk < size / 8; chunk++) {
      const u8* src = in + (chunk * 8);
      u8* dst = out + (chunk * 8);
      __m128i texels = _mm_cvtepu8_epi16(_mm_loadu_si64((const __m128i*)src));
      texels = _mm_srli_epi16(_mm_mullo_epi16(texels, scale), 7);
      _mm_storeu_si64((__m128i*)dst, _mm_packus_epi16(texels, texels));
    }
    return;
  }

#ifdef __AVX2__
  const __m256i scale = _mm256_set1_epi16(intensity);
  for (u32 chunk = 0; chunk < size / 16; chunk++) {
    const u8* src = in + (chunk * 16);
    u8* dst = out + (chunk * 16);
    // widen 16 bytes to sixteen 16-bit lanes so the multiply can't overflow
    __m256i texels = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src));
    texels = _mm256_srli_epi16(_mm256_mullo_epi16(texels, scale), 7);
    // narrow back to bytes: pack the low and high 128-bit halves together
    const __m128i lower = _mm256_castsi256_si128(texels);
    const __m128i upper = _mm256_extracti128_si256(texels, 1);
    _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(lower, upper));
  }
#else
  // the AVX2 flag is set but this build has no AVX2 code.
  ASSERT(false);
#endif
}
|
||||
|
||||
/*!
 * Additive sky blend pass: add the scaled input texture on top of what is already in out.
 * For every processed byte: out = saturating_add(out, min(255, (in * intensity) >> 7)).
 * Uses 16 bytes per step when the CPU info reports AVX2, otherwise 8 bytes per step with SSE.
 * Bytes after the last full step (size not a multiple of the step width) are left untouched.
 */
void blend_sky_fast(u8 intensity, u8* out, const u8* in, u32 size) {
  if (!get_cpu_info().has_avx2) {
    // SSE fallback, 8 bytes at a time.
    const __m128i scale = _mm_set1_epi16(intensity);
    const __m128i limit = _mm_set1_epi16(255);
    for (u32 chunk = 0; chunk < size / 8; chunk++) {
      const u8* src = in + (chunk * 8);
      u8* dst = out + (chunk * 8);
      const __m128i packed_in = _mm_loadu_si64((const __m128i*)src);
      const __m128i existing = _mm_loadu_si64((const __m128i*)dst);
      __m128i texels = _mm_cvtepu8_epi16(packed_in);
      texels = _mm_srli_epi16(_mm_mullo_epi16(texels, scale), 7);
      texels = _mm_min_epi16(limit, texels);
      const __m128i scaled = _mm_packus_epi16(texels, texels);
      _mm_storeu_si64((__m128i*)dst, _mm_adds_epu8(existing, scaled));
    }
    return;
  }

#ifdef __AVX2__
  const __m256i scale = _mm256_set1_epi16(intensity);
  const __m256i limit = _mm256_set1_epi16(255);
  for (u32 chunk = 0; chunk < size / 16; chunk++) {
    const u8* src = in + (chunk * 16);
    u8* dst = out + (chunk * 16);
    const __m128i packed_in = _mm_loadu_si128((const __m128i*)src);
    const __m128i existing = _mm_loadu_si128((const __m128i*)dst);
    // widen to 16-bit lanes, scale, and clamp to the u8 range
    __m256i texels = _mm256_cvtepu8_epi16(packed_in);
    texels = _mm256_srli_epi16(_mm256_mullo_epi16(texels, scale), 7);
    texels = _mm256_min_epi16(limit, texels);
    // narrow back to bytes and accumulate with unsigned saturation
    const __m128i lower = _mm256_castsi256_si128(texels);
    const __m128i upper = _mm256_extracti128_si256(texels, 1);
    const __m128i scaled = _mm_packus_epi16(lower, upper);
    _mm_storeu_si128((__m128i*)dst, _mm_adds_epu8(existing, scaled));
  }
#else
  // the AVX2 flag is set but this build has no AVX2 code.
  ASSERT(false);
#endif
}
|
||||
|
||||
SkyBlendStats SkyBlendCPU::do_sky_blends(DmaFollower& dma,
|
||||
|
|
|
@ -70,10 +70,10 @@ bool Tfrag3::update_load(const std::vector<tfrag3::TFragmentTreeKind>& tree_kind
|
|||
if (std::find(tree_kinds.begin(), tree_kinds.end(), tree.kind) != tree_kinds.end()) {
|
||||
max_draw = std::max(tree.draws.size(), max_draw);
|
||||
for (auto& draw : tree.draws) {
|
||||
idx_buffer_len += draw.vertex_index_stream.size();
|
||||
idx_buffer_len += draw.unpacked.vertex_index_stream.size();
|
||||
}
|
||||
time_of_day_count = std::max(tree.colors.size(), time_of_day_count);
|
||||
u32 verts = tree.vertices.size();
|
||||
u32 verts = tree.packed_vertices.vertices.size();
|
||||
glGenVertexArrays(1, &tree_cache.vao);
|
||||
glBindVertexArray(tree_cache.vao);
|
||||
glGenBuffers(1, &tree_cache.vertex_buffer);
|
||||
|
@ -148,7 +148,7 @@ bool Tfrag3::update_load(const std::vector<tfrag3::TFragmentTreeKind>& tree_kind
|
|||
const auto& tree = lev_data->tfrag_trees[geom][tree_idx];
|
||||
|
||||
if (std::find(tree_kinds.begin(), tree_kinds.end(), tree.kind) != tree_kinds.end()) {
|
||||
u32 verts = tree.vertices.size();
|
||||
u32 verts = tree.unpacked.vertices.size();
|
||||
u32 start_vert = (m_load_state.vert) * MAX_VERTS;
|
||||
u32 end_vert = std::min(verts, (m_load_state.vert + 1) * MAX_VERTS);
|
||||
if (end_vert > start_vert) {
|
||||
|
@ -156,7 +156,7 @@ bool Tfrag3::update_load(const std::vector<tfrag3::TFragmentTreeKind>& tree_kind
|
|||
glBindBuffer(GL_ARRAY_BUFFER, m_cached_trees[geom][tree_idx].vertex_buffer);
|
||||
glBufferSubData(GL_ARRAY_BUFFER, start_vert * sizeof(tfrag3::PreloadedVertex),
|
||||
(end_vert - start_vert) * sizeof(tfrag3::PreloadedVertex),
|
||||
tree.vertices.data() + start_vert);
|
||||
tree.unpacked.vertices.data() + start_vert);
|
||||
if (end_vert < verts) {
|
||||
remaining = true;
|
||||
}
|
||||
|
@ -274,7 +274,7 @@ void Tfrag3::render_tree(int geom,
|
|||
void* offset = (void*)(indices.first * sizeof(u32));
|
||||
|
||||
prof.add_draw_call();
|
||||
prof.add_tri(draw.num_triangles * (float)draw_size / draw.vertex_index_stream.size());
|
||||
prof.add_tri(draw.num_triangles * (float)draw_size / draw.unpacked.vertex_index_stream.size());
|
||||
|
||||
glDrawElements(GL_TRIANGLE_STRIP, draw_size, GL_UNSIGNED_INT, (void*)offset);
|
||||
|
||||
|
|
|
@ -40,18 +40,18 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
|
|||
const auto& tree = lev_data->tie_trees[geo][tree_idx];
|
||||
max_draw = std::max(tree.static_draws.size(), max_draw);
|
||||
for (auto& draw : tree.static_draws) {
|
||||
idx_buffer_len += draw.vertex_index_stream.size();
|
||||
max_idx_per_draw = std::max(max_idx_per_draw, draw.vertex_index_stream.size());
|
||||
idx_buffer_len += draw.unpacked.vertex_index_stream.size();
|
||||
max_idx_per_draw = std::max(max_idx_per_draw, draw.unpacked.vertex_index_stream.size());
|
||||
}
|
||||
for (auto& draw : tree.instanced_wind_draws) {
|
||||
wind_idx_buffer_len += draw.vertex_index_stream.size();
|
||||
max_idx_per_draw = std::max(max_idx_per_draw, draw.vertex_index_stream.size());
|
||||
}
|
||||
for (auto& inst : tree.instance_info) {
|
||||
for (auto& inst : tree.wind_instance_info) {
|
||||
max_wind_idx = std::max(max_wind_idx, inst.wind_idx);
|
||||
}
|
||||
time_of_day_count = std::max(tree.colors.size(), time_of_day_count);
|
||||
u32 verts = tree.vertices.size();
|
||||
u32 verts = tree.packed_vertices.color_indices.size();
|
||||
fmt::print(" tree {} has {} verts ({} kB) and {} draws\n", tree_idx, verts,
|
||||
verts * sizeof(tfrag3::PreloadedVertex) / 1024.f, tree.static_draws.size());
|
||||
auto& lod_tree = m_trees.at(geo);
|
||||
|
@ -62,7 +62,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
|
|||
lod_tree[tree_idx].draws = &tree.static_draws; // todo - should we just copy this?
|
||||
lod_tree[tree_idx].colors = &tree.colors;
|
||||
lod_tree[tree_idx].vis = &tree.bvh;
|
||||
lod_tree[tree_idx].instance_info = &tree.instance_info;
|
||||
lod_tree[tree_idx].instance_info = &tree.wind_instance_info;
|
||||
lod_tree[tree_idx].wind_draws = &tree.instanced_wind_draws;
|
||||
vis_temp_len = std::max(vis_temp_len, tree.bvh.vis_nodes.size());
|
||||
lod_tree[tree_idx].tod_cache = swizzle_time_of_day(tree.colors);
|
||||
|
@ -107,7 +107,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
|
|||
lod_tree[tree_idx].index_list.resize(idx_buffer_len);
|
||||
|
||||
if (wind_idx_buffer_len > 0) {
|
||||
lod_tree[tree_idx].wind_matrix_cache.resize(tree.instance_info.size());
|
||||
lod_tree[tree_idx].wind_matrix_cache.resize(tree.wind_instance_info.size());
|
||||
lod_tree[tree_idx].has_wind = true;
|
||||
glGenBuffers(1, &lod_tree[tree_idx].wind_vertex_index_buffer);
|
||||
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, lod_tree[tree_idx].wind_vertex_index_buffer);
|
||||
|
@ -158,7 +158,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
|
|||
for (int geo = 0; geo < 4; ++geo) {
|
||||
for (size_t tree_idx = 0; tree_idx < lev_data->tie_trees[geo].size(); tree_idx++) {
|
||||
const auto& tree = lev_data->tie_trees[geo][tree_idx];
|
||||
u32 verts = tree.vertices.size();
|
||||
u32 verts = tree.unpacked.vertices.size();
|
||||
u32 start_vert = (m_load_state.vert) * MAX_VERTS;
|
||||
u32 end_vert = std::min(verts, (m_load_state.vert + 1) * MAX_VERTS);
|
||||
if (end_vert > start_vert) {
|
||||
|
@ -166,7 +166,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
|
|||
glBindBuffer(GL_ARRAY_BUFFER, m_trees[geo][tree_idx].vertex_buffer);
|
||||
glBufferSubData(GL_ARRAY_BUFFER, start_vert * sizeof(tfrag3::PreloadedVertex),
|
||||
(end_vert - start_vert) * sizeof(tfrag3::PreloadedVertex),
|
||||
tree.vertices.data() + start_vert);
|
||||
tree.unpacked.vertices.data() + start_vert);
|
||||
if (end_vert < verts) {
|
||||
remaining = true;
|
||||
}
|
||||
|
@ -440,7 +440,6 @@ void Tie3::render(DmaFollower& dma, SharedRenderState* render_state, ScopedProfi
|
|||
m_has_level = setup_for_level(m_pc_port_data.level_name, render_state);
|
||||
}
|
||||
render_all_trees(lod(), settings, render_state, prof);
|
||||
// todo render all...
|
||||
}
|
||||
|
||||
void Tie3::render_all_trees(int geom,
|
||||
|
@ -659,9 +658,9 @@ void Tie3::render_tree(int idx,
|
|||
void* offset = (void*)(indices.first * sizeof(u32));
|
||||
|
||||
prof.add_draw_call();
|
||||
prof.add_tri(draw.num_triangles * (float)draw_size / draw.vertex_index_stream.size());
|
||||
prof.add_tri(draw.num_triangles * (float)draw_size / draw.unpacked.vertex_index_stream.size());
|
||||
|
||||
bool is_full = draw_size == (int)draw.vertex_index_stream.size();
|
||||
bool is_full = draw_size == (int)draw.unpacked.vertex_index_stream.size();
|
||||
|
||||
tree.perf.draws++;
|
||||
if (is_full) {
|
||||
|
|
|
@ -48,8 +48,6 @@ class Tie3 : public BucketRenderer {
|
|||
SharedRenderState* render_state,
|
||||
ScopedProfilerNode& prof);
|
||||
|
||||
int m_geom = 0;
|
||||
|
||||
struct Tree {
|
||||
GLuint vertex_buffer;
|
||||
GLuint index_buffer;
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#include "tfrag_common.h"
|
||||
#include "game/graphics/opengl_renderer/BucketRenderer.h"
|
||||
#include "game/graphics/pipelines/opengl.h"
|
||||
#include "common/util/os.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
|
@ -212,11 +213,12 @@ SwizzledTimeOfDay swizzle_time_of_day(const std::vector<tfrag3::TimeOfDayColor>&
|
|||
// Due to using integers instead of floats, it may be a tiny bit different.
|
||||
// TODO: it might be possible to reorder the loop into two blocks of loads and avoid spilling xmms.
|
||||
// It's ~8x faster than the slow version.
|
||||
void interp_time_of_day_fast(const float weights[8],
|
||||
const SwizzledTimeOfDay& in,
|
||||
math::Vector<u8, 4>* out) {
|
||||
// even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
|
||||
|
||||
void interp_time_of_day_fast_avx2(const float weights[8],
|
||||
const SwizzledTimeOfDay& in,
|
||||
math::Vector<u8, 4>* out) {
|
||||
// even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
|
||||
#ifdef __AVX2__
|
||||
// weight multipliers
|
||||
__m256i weights0 = _mm256_set1_epi16(weights[0] * 64.f);
|
||||
__m256i weights1 = _mm256_set1_epi16(weights[1] * 64.f);
|
||||
|
@ -234,7 +236,7 @@ void interp_time_of_day_fast(const float weights[8],
|
|||
255, 255, 255);
|
||||
|
||||
for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
|
||||
// first, load colors. We put 16 bytes / register and don't touch the upper half because we will
|
||||
// first, load colors. We put 16 bytes / register and don't touch the upper half because we
|
||||
// convert u8s to u16s.
|
||||
const u8* base = in.data.data() + color_quad * 128;
|
||||
__m128i color0_p = _mm_loadu_si128((const __m128i*)(base + 0));
|
||||
|
@ -290,6 +292,149 @@ void interp_time_of_day_fast(const float weights[8],
|
|||
// store result
|
||||
_mm_storeu_si128((__m128i*)(&out[color_quad * 4]), result);
|
||||
}
|
||||
#else
|
||||
// unreachable.
|
||||
ASSERT(false);
|
||||
#endif
|
||||
}
|
||||
|
||||
void interp_time_of_day_fast(const float weights[8],
|
||||
const SwizzledTimeOfDay& in,
|
||||
math::Vector<u8, 4>* out) {
|
||||
// even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
|
||||
if (get_cpu_info().has_avx2) {
|
||||
interp_time_of_day_fast_avx2(weights, in, out);
|
||||
return;
|
||||
}
|
||||
|
||||
// weight multipliers
|
||||
__m128i weights0 = _mm_set1_epi16(weights[0] * 64.f);
|
||||
__m128i weights1 = _mm_set1_epi16(weights[1] * 64.f);
|
||||
__m128i weights2 = _mm_set1_epi16(weights[2] * 64.f);
|
||||
__m128i weights3 = _mm_set1_epi16(weights[3] * 64.f);
|
||||
__m128i weights4 = _mm_set1_epi16(weights[4] * 64.f);
|
||||
__m128i weights5 = _mm_set1_epi16(weights[5] * 64.f);
|
||||
__m128i weights6 = _mm_set1_epi16(weights[6] * 64.f);
|
||||
__m128i weights7 = _mm_set1_epi16(weights[7] * 64.f);
|
||||
|
||||
// saturation: note that alpha is saturated to 128 but the rest are 255.
|
||||
// TODO: maybe we should saturate to 255 for everybody (can do this using a single packus) and
|
||||
// change the shader to deal with this.
|
||||
__m128i sat = _mm_set_epi16(128, 255, 255, 255, 128, 255, 255, 255);
|
||||
|
||||
for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
|
||||
// first, load colors. We put 16 bytes / register and don't touch the upper half because we
|
||||
// convert u8s to u16s.
|
||||
{
|
||||
const u8* base = in.data.data() + color_quad * 128;
|
||||
__m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
|
||||
__m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
|
||||
__m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
|
||||
__m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
|
||||
__m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
|
||||
__m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
|
||||
__m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
|
||||
__m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));
|
||||
|
||||
// unpack to 16-bits. each has 16x 16 bit colors.
|
||||
__m128i color0 = _mm_cvtepu8_epi16(color0_p);
|
||||
__m128i color1 = _mm_cvtepu8_epi16(color1_p);
|
||||
__m128i color2 = _mm_cvtepu8_epi16(color2_p);
|
||||
__m128i color3 = _mm_cvtepu8_epi16(color3_p);
|
||||
__m128i color4 = _mm_cvtepu8_epi16(color4_p);
|
||||
__m128i color5 = _mm_cvtepu8_epi16(color5_p);
|
||||
__m128i color6 = _mm_cvtepu8_epi16(color6_p);
|
||||
__m128i color7 = _mm_cvtepu8_epi16(color7_p);
|
||||
|
||||
// multiply by weights
|
||||
color0 = _mm_mullo_epi16(color0, weights0);
|
||||
color1 = _mm_mullo_epi16(color1, weights1);
|
||||
color2 = _mm_mullo_epi16(color2, weights2);
|
||||
color3 = _mm_mullo_epi16(color3, weights3);
|
||||
color4 = _mm_mullo_epi16(color4, weights4);
|
||||
color5 = _mm_mullo_epi16(color5, weights5);
|
||||
color6 = _mm_mullo_epi16(color6, weights6);
|
||||
color7 = _mm_mullo_epi16(color7, weights7);
|
||||
|
||||
// add. This order minimizes dependencies.
|
||||
color0 = _mm_add_epi16(color0, color1);
|
||||
color2 = _mm_add_epi16(color2, color3);
|
||||
color4 = _mm_add_epi16(color4, color5);
|
||||
color6 = _mm_add_epi16(color6, color7);
|
||||
|
||||
color0 = _mm_add_epi16(color0, color2);
|
||||
color4 = _mm_add_epi16(color4, color6);
|
||||
|
||||
color0 = _mm_add_epi16(color0, color4);
|
||||
|
||||
// divide, because we multiplied our weights by 2^7.
|
||||
color0 = _mm_srli_epi16(color0, 6);
|
||||
|
||||
// saturate
|
||||
color0 = _mm_min_epu16(sat, color0);
|
||||
|
||||
// back to u8s.
|
||||
auto result = _mm_packus_epi16(color0, color0);
|
||||
|
||||
// store result
|
||||
_mm_storeu_si64((__m128i*)(&out[color_quad * 4]), result);
|
||||
}
|
||||
|
||||
{
|
||||
const u8* base = in.data.data() + color_quad * 128 + 8;
|
||||
__m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
|
||||
__m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
|
||||
__m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
|
||||
__m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
|
||||
__m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
|
||||
__m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
|
||||
__m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
|
||||
__m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));
|
||||
|
||||
// unpack to 16-bits. each has 16x 16 bit colors.
|
||||
__m128i color0 = _mm_cvtepu8_epi16(color0_p);
|
||||
__m128i color1 = _mm_cvtepu8_epi16(color1_p);
|
||||
__m128i color2 = _mm_cvtepu8_epi16(color2_p);
|
||||
__m128i color3 = _mm_cvtepu8_epi16(color3_p);
|
||||
__m128i color4 = _mm_cvtepu8_epi16(color4_p);
|
||||
__m128i color5 = _mm_cvtepu8_epi16(color5_p);
|
||||
__m128i color6 = _mm_cvtepu8_epi16(color6_p);
|
||||
__m128i color7 = _mm_cvtepu8_epi16(color7_p);
|
||||
|
||||
// multiply by weights
|
||||
color0 = _mm_mullo_epi16(color0, weights0);
|
||||
color1 = _mm_mullo_epi16(color1, weights1);
|
||||
color2 = _mm_mullo_epi16(color2, weights2);
|
||||
color3 = _mm_mullo_epi16(color3, weights3);
|
||||
color4 = _mm_mullo_epi16(color4, weights4);
|
||||
color5 = _mm_mullo_epi16(color5, weights5);
|
||||
color6 = _mm_mullo_epi16(color6, weights6);
|
||||
color7 = _mm_mullo_epi16(color7, weights7);
|
||||
|
||||
// add. This order minimizes dependencies.
|
||||
color0 = _mm_add_epi16(color0, color1);
|
||||
color2 = _mm_add_epi16(color2, color3);
|
||||
color4 = _mm_add_epi16(color4, color5);
|
||||
color6 = _mm_add_epi16(color6, color7);
|
||||
|
||||
color0 = _mm_add_epi16(color0, color2);
|
||||
color4 = _mm_add_epi16(color4, color6);
|
||||
|
||||
color0 = _mm_add_epi16(color0, color4);
|
||||
|
||||
// divide, because we multiplied our weights by 2^7.
|
||||
color0 = _mm_srli_epi16(color0, 6);
|
||||
|
||||
// saturate
|
||||
color0 = _mm_min_epu16(sat, color0);
|
||||
|
||||
// back to u8s.
|
||||
auto result = _mm_packus_epi16(color0, color0);
|
||||
|
||||
// store result
|
||||
_mm_storeu_si64((__m128i*)(&out[color_quad * 4 + 2]), result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool sphere_in_view_ref(const math::Vector4f& sphere, const math::Vector4f* planes) {
|
||||
|
@ -327,9 +472,9 @@ u32 make_all_visible_index_list(std::pair<int, int>* group_out,
|
|||
const auto& draw = draws[i];
|
||||
std::pair<int, int> ds;
|
||||
ds.first = idx_buffer_ptr;
|
||||
memcpy(&idx_out[idx_buffer_ptr], draw.vertex_index_stream.data(),
|
||||
draw.vertex_index_stream.size() * sizeof(u32));
|
||||
idx_buffer_ptr += draw.vertex_index_stream.size();
|
||||
memcpy(&idx_out[idx_buffer_ptr], draw.unpacked.vertex_index_stream.data(),
|
||||
draw.unpacked.vertex_index_stream.size() * sizeof(u32));
|
||||
idx_buffer_ptr += draw.unpacked.vertex_index_stream.size();
|
||||
ds.second = idx_buffer_ptr;
|
||||
group_out[i] = ds;
|
||||
}
|
||||
|
@ -357,7 +502,7 @@ u32 make_index_list_from_vis_string(std::pair<int, int>* group_out,
|
|||
} else {
|
||||
building_run = false;
|
||||
idx_buffer_ptr += grp.num;
|
||||
memcpy(&idx_out[run_start_out], &draw.vertex_index_stream[run_start_in],
|
||||
memcpy(&idx_out[run_start_out], &draw.unpacked.vertex_index_stream[run_start_in],
|
||||
(idx_buffer_ptr - run_start_out) * sizeof(u32));
|
||||
}
|
||||
} else {
|
||||
|
@ -372,7 +517,7 @@ u32 make_index_list_from_vis_string(std::pair<int, int>* group_out,
|
|||
vtx_idx += grp.num;
|
||||
}
|
||||
if (building_run) {
|
||||
memcpy(&idx_out[run_start_out], &draw.vertex_index_stream[run_start_in],
|
||||
memcpy(&idx_out[run_start_out], &draw.unpacked.vertex_index_stream[run_start_in],
|
||||
(idx_buffer_ptr - run_start_out) * sizeof(u32));
|
||||
}
|
||||
|
||||
|
|
|
@ -159,7 +159,7 @@ std::vector<std::shared_ptr<TextureRecord>> TexturePool::convert_textures(const
|
|||
// the sizes given aren't the actual sizes in memory, so if you just use that, you get the
|
||||
// wrong answer. I solved this in the decompiler by using the size of the actual data, but we
|
||||
// don't really have that here.
|
||||
u32 size = ((sizes[0] + sizes[1] + sizes[2] + 2047) / 256) * 256;
|
||||
u32 size = ((sizes[0] + sizes[1] + sizes[2] + 4096) / 256) * 256;
|
||||
|
||||
m_tex_converter.upload(memory_base + texture_page.segment[0].block_data_ptr,
|
||||
texture_page.segment[0].dest, size);
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include "common/log/log.h"
|
||||
#include "common/util/FileUtil.h"
|
||||
#include "game/discord.h"
|
||||
#include "common/util/os.h"
|
||||
|
||||
// Discord RPC
|
||||
extern int64_t gStartTime;
|
||||
|
@ -28,17 +29,49 @@ void setup_logging(bool verbose) {
|
|||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
// do this as soon as possible - stuff like memcpy might use AVX instructions and we want to
|
||||
// warn the user instead of just crashing.
|
||||
setup_cpu_info();
|
||||
if (!get_cpu_info().has_avx) {
|
||||
printf("Your CPU does not support AVX, which is required for OpenGOAL.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
bool verbose = false;
|
||||
bool disable_avx2 = false;
|
||||
// Scan every argument. The flags are independent of each other, so keep going after a match:
// stopping at "-v" would make "-v -no-avx2" silently ignore the second flag.
for (int i = 1; i < argc; i++) {
  const std::string arg = argv[i];
  if (arg == "-v") {
    verbose = true;
  } else if (arg == "-no-avx2") {
    disable_avx2 = true;
  }
}
|
||||
|
||||
gStartTime = time(0);
|
||||
init_discord_rpc();
|
||||
|
||||
if (disable_avx2) {
|
||||
// for debugging the non-avx2 code paths, there's a flag to manually disable.
|
||||
printf("Note: AVX2 code has been manually disabled.\n");
|
||||
get_cpu_info().has_avx2 = false;
|
||||
}
|
||||
|
||||
#ifndef __AVX2__
|
||||
if (get_cpu_info().has_avx2) {
|
||||
printf("Note: your CPU supports AVX2, but this build was not compiled with AVX2 support\n");
|
||||
get_cpu_info().has_avx2 = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (get_cpu_info().has_avx2) {
|
||||
printf("AVX2 mode enabled\n");
|
||||
} else {
|
||||
printf("AVX2 mode disabled\n");
|
||||
}
|
||||
|
||||
setup_logging(verbose);
|
||||
|
||||
while (true) {
|
||||
|
|
|
@ -9,12 +9,17 @@
|
|||
#include "gtest/gtest.h"
|
||||
#include "test/all_jak1_symbols.h"
|
||||
#include "common/util/json_util.h"
|
||||
#include "common/util/os.h"
|
||||
#include "common/util/Range.h"
|
||||
#include "third-party/fmt/core.h"
|
||||
#include "common/util/print_float.h"
|
||||
#include "common/util/CopyOnWrite.h"
|
||||
#include "common/util/SmallVector.h"
|
||||
|
||||
// Smoke test: only calls setup_cpu_info(). No result is checked, so this passes as long as the
// call returns.
TEST(CommonUtil, CpuInfo) {
|
||||
setup_cpu_info();
|
||||
}
|
||||
|
||||
TEST(CommonUtil, get_file_path) {
|
||||
std::vector<std::string> test = {"cabbage", "banana", "apple"};
|
||||
std::string sampleString = file_util::get_file_path(test);
|
||||
|
@ -390,5 +395,6 @@ TEST(SmallVector, Construction) {
|
|||
TEST(Assert, Death) {
|
||||
EXPECT_DEATH(private_assert_failed("foo", "bar", 12, "aaa"), "");
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace cu
|
Loading…
Reference in a new issue