[graphics] reduce the size of fr3 files (#1175)

* first pass

* first pass at shrinking fr3s

* only need to load vertices once

* avx2 detect and switch

* fix build

* another fix

* one more

* fix the sky and stupid math bug in size check
Commit 5135ea9659 (parent 74d0025974) by water111, 2022-02-16 22:13:18 -05:00, committed by GitHub
20 changed files with 844 additions and 137 deletions


@ -17,9 +17,8 @@ if(MSVC AND (CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))
"-Xclang -fcxx-exceptions \
-Xclang -fexceptions \
-Xclang -std=c++17 \
-mfma -mavx2 \
-Wno-c++11-narrowing -W3 \
/arch:AVX")
-march=native \
-Wno-c++11-narrowing -W3")
# additional c++ flags for release mode for our projects
if(CMAKE_BUILD_TYPE MATCHES "Release")
@ -47,7 +46,7 @@ elseif(UNIX)
-Wshadow \
-Wsign-promo \
-fdiagnostics-color=always \
-march=haswell")
-march=native")
# additional c++ flags for release mode for our projects
if(CMAKE_BUILD_TYPE MATCHES "Release")


@ -3,14 +3,31 @@
namespace tfrag3 {
void PackedTieVertices::serialize(Serializer& ser) {
ser.from_pod_vector(&color_indices);
ser.from_pod_vector(&matrices);
ser.from_pod_vector(&matrix_groups);
ser.from_pod_vector(&vertices);
}
void StripDraw::serialize(Serializer& ser) {
ser.from_ptr(&mode);
ser.from_ptr(&tree_tex_id);
ser.from_pod_vector(&vertex_index_stream);
ser.from_pod_vector(&runs);
ser.from_pod_vector(&vis_groups);
ser.from_ptr(&num_triangles);
}
void StripDraw::unpack() {
ASSERT(unpacked.vertex_index_stream.empty());
for (auto& r : runs) {
for (int i = 0; i < r.length; i++) {
unpacked.vertex_index_stream.push_back(r.vertex0 + i);
}
unpacked.vertex_index_stream.push_back(UINT32_MAX);
}
}
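For reference, a minimal standalone sketch (not part of the commit) of the run-length scheme above: each VertexRun stores only a starting vertex and a length, and unpacking expands it back into an index stream with a primitive-restart marker after every strip.

// sketch: expand runs back into an OpenGL index stream (illustrative only)
#include <cstdint>
#include <limits>
#include <vector>
struct Run {
  uint32_t vertex0;  // index of the first vertex in the strip
  uint16_t length;   // number of consecutive vertices
};
std::vector<uint32_t> expand_runs(const std::vector<Run>& runs) {
  std::vector<uint32_t> out;
  for (const auto& r : runs) {
    for (uint16_t i = 0; i < r.length; i++) {
      out.push_back(r.vertex0 + i);
    }
    out.push_back(std::numeric_limits<uint32_t>::max());  // primitive restart
  }
  return out;
}
// e.g. runs {{100, 3}, {200, 2}} expand to 100, 101, 102, RESTART, 200, 201, RESTART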
void InstancedStripDraw::serialize(Serializer& ser) {
ser.from_ptr(&mode);
ser.from_ptr(&tree_tex_id);
@ -37,11 +54,71 @@ void TfragTree::serialize(Serializer& ser) {
draw.serialize(ser);
}
ser.from_pod_vector(&vertices);
// ser.from_pod_vector(&vertices);
ser.from_pod_vector(&packed_vertices.vertices);
ser.from_pod_vector(&packed_vertices.cluster_origins);
ser.from_pod_vector(&colors);
bvh.serialize(ser);
}
void TieTree::unpack() {
unpacked.vertices.resize(packed_vertices.color_indices.size());
size_t i = 0;
for (const auto& grp : packed_vertices.matrix_groups) {
if (grp.matrix_idx == -1) {
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
auto& vtx = unpacked.vertices[i];
vtx.color_index = packed_vertices.color_indices[i];
const auto& proto_vtx = packed_vertices.vertices[src_idx];
vtx.x = proto_vtx.x;
vtx.y = proto_vtx.y;
vtx.z = proto_vtx.z;
vtx.q = 1.f;
vtx.s = proto_vtx.s;
vtx.t = proto_vtx.t;
i++;
}
} else {
const auto& mat = packed_vertices.matrices[grp.matrix_idx];
for (u32 src_idx = grp.start_vert; src_idx < grp.end_vert; src_idx++) {
auto& vtx = unpacked.vertices[i];
vtx.color_index = packed_vertices.color_indices[i];
const auto& proto_vtx = packed_vertices.vertices[src_idx];
auto temp = mat[0] * proto_vtx.x + mat[1] * proto_vtx.y + mat[2] * proto_vtx.z + mat[3];
vtx.x = temp.x();
vtx.y = temp.y();
vtx.z = temp.z();
vtx.q = 1.f;
vtx.s = proto_vtx.s;
vtx.t = proto_vtx.t;
i++;
}
}
}
}
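A small sketch (with an assumed Vec4 type; the real code uses math::Vector4f and std::array<math::Vector4f, 4>) of the transform applied in the matrix branch above: the instance matrix is stored as four column vectors, so a prototype point is rebuilt as col0*x + col1*y + col2*z + col3, where the last column carries the translation.

#include <array>
struct Vec4 {
  float v[4];
  Vec4 operator*(float s) const { return {v[0] * s, v[1] * s, v[2] * s, v[3] * s}; }
  Vec4 operator+(const Vec4& o) const {
    return {v[0] + o.v[0], v[1] + o.v[1], v[2] + o.v[2], v[3] + o.v[3]};
  }
};
Vec4 transform_tie_point(const std::array<Vec4, 4>& mat, float x, float y, float z) {
  return mat[0] * x + mat[1] * y + mat[2] * z + mat[3];  // affine: translation in mat[3]
}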
void TfragTree::unpack() {
unpacked.vertices.resize(packed_vertices.vertices.size());
for (size_t i = 0; i < unpacked.vertices.size(); i++) {
auto& o = unpacked.vertices[i];
auto& in = packed_vertices.vertices[i];
auto& cluster = packed_vertices.cluster_origins.at(in.cluster_idx);
constexpr float kClusterSize = 4096 * 40; // 100 in-game meters
constexpr float kMasterOffset = 12000 * 4096;
constexpr float rescale = kClusterSize / UINT16_MAX;
float cx = -kMasterOffset + kClusterSize * cluster.x();
float cy = -kMasterOffset + kClusterSize * cluster.y();
float cz = -kMasterOffset + kClusterSize * cluster.z();
o.x = cx + in.xoff * rescale;
o.y = cy + in.yoff * rescale;
o.z = cz + in.zoff * rescale;
o.s = in.s / (1024.f);
o.t = in.t / (1024.f);
o.q = 1.f;
o.color_index = in.color_index;
}
}
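To make the packed position format above concrete, here is a standalone sketch (not part of the commit) of the quantization round trip: a float coordinate becomes a cluster cell plus a u16 offset inside that cell, and unpacking reverses it with a worst-case truncation error of roughly kClusterSize / 65535, about 2.5 game units.

// sketch of pack/unpack for one coordinate (values chosen for illustration)
#include <cstdint>
#include <cstdio>
#include <utility>
constexpr float kClusterSizeSketch = 4096.f * 40;
constexpr float kMasterOffsetSketch = 12000.f * 4096;
std::pair<uint16_t, uint16_t> pack_coord(float pos) {
  float shifted = pos + kMasterOffsetSketch;  // shift so coordinates are non-negative
  uint16_t cell = (uint16_t)(shifted / kClusterSizeSketch);
  float leftover = shifted - cell * kClusterSizeSketch;
  uint16_t off = (uint16_t)((leftover / kClusterSizeSketch) * 65535.f);
  return {cell, off};
}
float unpack_coord(uint16_t cell, uint16_t off) {
  return -kMasterOffsetSketch + cell * kClusterSizeSketch +
         off * (kClusterSizeSketch / 65535.f);
}
int main() {
  float x = 123456.f;
  auto p = pack_coord(x);
  printf("%.1f -> cell %d, off %d -> %.1f\n", x, (int)p.first, (int)p.second,
         unpack_coord(p.first, p.second));
}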
void TieTree::serialize(Serializer& ser) {
if (ser.is_saving()) {
ser.save<size_t>(static_draws.size());
@ -62,15 +139,15 @@ void TieTree::serialize(Serializer& ser) {
}
if (ser.is_saving()) {
ser.save<size_t>(instance_info.size());
ser.save<size_t>(wind_instance_info.size());
} else {
instance_info.resize(ser.load<size_t>());
wind_instance_info.resize(ser.load<size_t>());
}
for (auto& inst : instance_info) {
for (auto& inst : wind_instance_info) {
inst.serialize(ser);
}
ser.from_pod_vector(&vertices);
packed_vertices.serialize(ser);
ser.from_pod_vector(&colors);
bvh.serialize(ser);
}
@ -141,4 +218,58 @@ void Level::serialize(Serializer& ser) {
}
}
std::array<int, MemoryUsageCategory::NUM_CATEGORIES> Level::get_memory_usage() const {
std::array<int, MemoryUsageCategory::NUM_CATEGORIES> result;
result.fill(0);
// textures
for (const auto& tex : textures) {
result[TEXTURE] += tex.data.size() * sizeof(u32);
}
// tfrag
for (const auto& tfrag_tree_geoms : tfrag_trees) {
for (const auto& tfrag_tree : tfrag_tree_geoms) {
for (const auto& draw : tfrag_tree.draws) {
result[TFRAG_INDEX] += draw.runs.size() * sizeof(StripDraw::VertexRun);
result[TFRAG_VIS] += draw.vis_groups.size() * sizeof(StripDraw::VisGroup);
}
result[TFRAG_VERTS] +=
tfrag_tree.packed_vertices.vertices.size() * sizeof(PackedTfragVertices::Vertex);
result[TFRAG_CLUSTER] +=
tfrag_tree.packed_vertices.cluster_origins.size() * sizeof(math::Vector<u16, 3>);
result[TFRAG_TIME_OF_DAY] += tfrag_tree.colors.size() * sizeof(TimeOfDayColor);
result[TFRAG_BVH] += tfrag_tree.bvh.vis_nodes.size() * sizeof(VisNode);
}
}
// tie
for (const auto& tie_tree_geoms : tie_trees) {
for (const auto& tie_tree : tie_tree_geoms) {
result[TIE_BVH] += tie_tree.bvh.vis_nodes.size();
for (const auto& draw : tie_tree.static_draws) {
result[TIE_DEINST_INDEX] += draw.runs.size() * sizeof(StripDraw::VertexRun);
result[TIE_DEINST_VIS] += draw.vis_groups.size() * sizeof(StripDraw::VisGroup);
}
result[TIE_VERTS] +=
tie_tree.packed_vertices.vertices.size() * sizeof(PackedTieVertices::Vertex);
result[TIE_CIDX] += tie_tree.packed_vertices.color_indices.size() * sizeof(u16);
result[TIE_MATRICES] += tie_tree.packed_vertices.matrices.size() * 4 * 4 * 4;
result[TIE_GRPS] +=
tie_tree.packed_vertices.matrix_groups.size() * sizeof(PackedTieVertices::MatrixGroup);
result[TIE_TIME_OF_DAY] += tie_tree.colors.size() * sizeof(TimeOfDayColor);
for (const auto& draw : tie_tree.instanced_wind_draws) {
result[TIE_INST_INDEX] += draw.vertex_index_stream.size() * sizeof(u32);
result[TIE_INST_VIS] +=
draw.instance_groups.size() * sizeof(InstancedStripDraw::InstanceGroup);
}
result[TIE_WIND_INSTANCE_INFO] +=
tie_tree.wind_instance_info.size() * sizeof(TieWindInstance);
}
}
return result;
}
} // namespace tfrag3


@ -11,7 +11,39 @@
namespace tfrag3 {
constexpr int TFRAG3_VERSION = 10;
// NOTE:
// when updating any data structures in this file:
// - change the TFRAG3_VERSION
// - make sure to update the serialize function
// - if changing any large things (vertices, vis, bvh, colors, textures) update get_memory_usage
// - if adding a new category to the memory usage, update extract_level to print it.
enum MemoryUsageCategory {
TEXTURE,
TIE_DEINST_VIS,
TIE_DEINST_INDEX,
TIE_INST_VIS,
TIE_INST_INDEX,
TIE_BVH,
TIE_VERTS,
TIE_TIME_OF_DAY,
TIE_WIND_INSTANCE_INFO,
TIE_CIDX,
TIE_MATRICES,
TIE_GRPS,
TFRAG_VIS,
TFRAG_INDEX,
TFRAG_VERTS,
TFRAG_CLUSTER,
TFRAG_TIME_OF_DAY,
TFRAG_BVH,
NUM_CATEGORIES
};
constexpr int TFRAG3_VERSION = 11;
// These vertices should be uploaded to the GPU at load time and don't change
struct PreloadedVertex {
@ -25,6 +57,55 @@ struct PreloadedVertex {
};
static_assert(sizeof(PreloadedVertex) == 32, "PreloadedVertex size");
struct PackedTieVertices {
struct Vertex {
float x, y, z;
float s, t;
};
struct MatrixGroup {
s32 matrix_idx;
u32 start_vert;
u32 end_vert;
};
std::vector<u16> color_indices;
std::vector<std::array<math::Vector4f, 4>> matrices;
std::vector<MatrixGroup> matrix_groups; // todo pack
std::vector<Vertex> vertices;
float cluster_size = 0;
void serialize(Serializer& ser);
};
struct PackedTfragVertices {
struct Vertex {
u16 xoff, yoff, zoff;
u16 cluster_idx;
u16 s, t;
u16 color_index;
/*
bool operator==(const Vertex& other) const {
return xoff == other.xoff && yoff == other.yoff && zoff == other.zoff &&
cluster_idx == other.cluster_idx && s == other.s && t == other.t &&
color_index == other.color_index;
}
struct hash {
auto operator()(const Vertex& x) const {
return std::hash<uint16_t>()(x.xoff) ^ std::hash<uint16_t>()(x.yoff) ^
std::hash<uint16_t>()(x.zoff) ^ std::hash<uint16_t>()(x.cluster_idx) ^
std::hash<uint16_t>()(x.s) ^ std::hash<uint16_t>()(x.t) ^
std::hash<uint16_t>()(x.color_index);
}
};
*/
};
std::vector<Vertex> vertices;
std::vector<math::Vector<u16, 3>> cluster_origins;
};
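A quick back-of-the-envelope sketch (field layout assumed from the declarations above, not the real headers) of why this packing shrinks the .fr3 files: the on-disk tfrag vertex drops from the 32-byte PreloadedVertex to seven u16 fields, and full-precision positions are rebuilt at load time by unpack().

#include <cstdint>
struct PackedTfragVertexSketch {
  uint16_t xoff, yoff, zoff;  // quantized offset within the cluster
  uint16_t cluster_idx;       // which cluster origin to add back
  uint16_t s, t;              // texture coordinates, fixed point (x1024)
  uint16_t color_index;       // time-of-day palette index
};
static_assert(sizeof(PackedTfragVertexSketch) == 14, "packed tfrag vertex is 14 bytes");
// versus sizeof(PreloadedVertex) == 32, so tfrag vertex data is roughly 2.3x smaller on disk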
// Settings for drawing a group of triangle strips.
// This refers to a group of PreloadedVertices that are already uploaded.
// All triangles here are drawn in the same "mode" (blending, texture, etc)
@ -35,9 +116,20 @@ struct StripDraw {
DrawMode mode; // the OpenGL draw settings.
u32 tree_tex_id = 0; // the texture that should be bound for the draw
// the list of vertices in the draw. This includes the restart code of UINT32_MAX that OpenGL
// will use to start a new strip.
std::vector<u32> vertex_index_stream;
struct {
// the list of vertices in the draw. This includes the restart code of UINT32_MAX that OpenGL
// will use to start a new strip.
std::vector<u32> vertex_index_stream;
} unpacked;
void unpack();
struct VertexRun {
u32 vertex0;
u16 length;
};
std::vector<VertexRun> runs;
// to do culling, the above vertex stream is grouped.
// by following the visgroups and checking the visibility, you can leave out invisible vertices.
@ -129,11 +221,16 @@ constexpr const char* tfrag_tree_names[] = {"normal", "trans", "dirt",
// A tfrag model
struct TfragTree {
TFragmentTreeKind kind; // our tfrag kind
std::vector<StripDraw> draws; // the actual topology and settings
std::vector<PreloadedVertex> vertices; // mesh vertices
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
BVH bvh; // the bvh for frustum culling
TFragmentTreeKind kind; // our tfrag kind
std::vector<StripDraw> draws; // the actual topology and settings
PackedTfragVertices packed_vertices;
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
BVH bvh; // the bvh for frustum culling
struct {
std::vector<PreloadedVertex> vertices; // mesh vertices
} unpacked;
void unpack();
void serialize(Serializer& ser);
};
@ -147,14 +244,20 @@ struct TieWindInstance {
// A tie model
struct TieTree {
BVH bvh;
std::vector<StripDraw> static_draws; // the actual topology and settings
std::vector<PreloadedVertex> vertices; // mesh vertices
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
std::vector<StripDraw> static_draws; // the actual topology and settings
PackedTieVertices packed_vertices;
std::vector<TimeOfDayColor> colors; // vertex colors (pre-interpolation)
std::vector<InstancedStripDraw> instanced_wind_draws;
std::vector<TieWindInstance> instance_info;
std::vector<TieWindInstance> wind_instance_info;
struct {
std::vector<PreloadedVertex> vertices; // mesh vertices
} unpacked;
void serialize(Serializer& ser);
void unpack();
};
struct Level {
@ -165,6 +268,8 @@ struct Level {
std::array<std::vector<TieTree>, 4> tie_trees;
u16 version2 = TFRAG3_VERSION;
void serialize(Serializer& ser);
std::array<int, MemoryUsageCategory::NUM_CATEGORIES> get_memory_usage() const;
};
} // namespace tfrag3


@ -13,7 +13,12 @@
#include "common/util/BinaryReader.h"
#include "BinaryWriter.h"
#include "common/common_types.h"
// This disables the use of PCLMULQDQ which is probably ok, but let's just be safe and disable it
// because nobody will care if png compression is 10% slower.
#define FPNG_NO_SSE 1
#include "third-party/fpng/fpng.cpp"
#include "third-party/fpng/fpng.h"
#include "third-party/fmt/core.h"
#include "third-party/lzokay/lzokay.hpp"


@ -1,5 +1,7 @@
#include "os.h"
#include "common/common_types.h"
#ifdef __linux__
#include <sys/resource.h>
@ -14,4 +16,72 @@ size_t get_peak_rss() {
size_t get_peak_rss() {
return 0;
}
#endif
#ifdef _WIN32
// windows has a __cpuid
#include <intrin.h>
#else
// using int to be compatible with msvc's intrinsic
void __cpuidex(int result[4], int eax, int ecx) {
asm("cpuid\n\t"
: "=a"(result[0]), "=b"(result[1]), "=c"(result[2]), "=d"(result[3])
: "0"(eax), "2"(ecx));
}
#endif
CpuInfo gCpuInfo;
void setup_cpu_info() {
if (gCpuInfo.initialized) {
return;
}
// as a test, get the brand and model
for (u32 i = 0x80000002; i <= 0x80000004; i++) {
int result[4];
__cpuidex(result, i, 0);
for (auto reg : result) {
for (int c = 0; c < 4; c++) {
gCpuInfo.model.push_back(reg);
reg >>= 8;
}
}
}
{
int result[4];
__cpuidex(result, 0, 0);
for (auto r : {1, 3, 2}) {
for (int c = 0; c < 4; c++) {
gCpuInfo.brand.push_back(result[r]);
result[r] >>= 8;
}
}
}
// check for AVX2
{
int result[4];
__cpuidex(result, 7, 0);
gCpuInfo.has_avx2 = result[1] & (1 << 5);
}
{
int result[4];
__cpuidex(result, 1, 0);
gCpuInfo.has_avx = result[2] & (1 << 28);
}
printf("-------- CPU Information --------\n");
printf(" Brand: %s\n", gCpuInfo.brand.c_str());
printf(" Model: %s\n", gCpuInfo.model.c_str());
printf(" AVX : %s\n", gCpuInfo.has_avx ? "true" : "false");
printf(" AVX2 : %s\n", gCpuInfo.has_avx2 ? "true" : "false");
gCpuInfo.initialized = true;
}
CpuInfo& get_cpu_info() {
return gCpuInfo;
}


@ -1,6 +1,19 @@
#pragma once
#include <cstddef>
#include <string>
// Note: these are not implemented on windows and will return zero.
size_t get_peak_rss();
void setup_cpu_info();
struct CpuInfo {
bool initialized = false;
bool has_avx = false;
bool has_avx2 = false;
std::string brand;
std::string model;
};
CpuInfo& get_cpu_info();
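A hypothetical usage sketch of this API (the blend_* names are placeholders, not from the repo): main() calls setup_cpu_info() once at startup, and code that has both SIMD paths then dispatches on the detected capability. In the real renderer the AVX2 branch is additionally wrapped in #ifdef __AVX2__ / ASSERT(false), and main() clears has_avx2 on builds compiled without AVX2 support.

#include <cstdint>
#include "common/util/os.h"  // setup_cpu_info(), get_cpu_info()
// stand-ins for real SIMD kernels, just to make the dispatch shape concrete
static void blend_avx2(const uint8_t* in, uint8_t* out, uint32_t n) {
  for (uint32_t i = 0; i < n; i++) out[i] = in[i];
}
static void blend_sse(const uint8_t* in, uint8_t* out, uint32_t n) {
  for (uint32_t i = 0; i < n; i++) out[i] = in[i];
}
void blend(const uint8_t* in, uint8_t* out, uint32_t n) {
  if (get_cpu_info().has_avx2) {
    blend_avx2(in, out, n);  // guarded by #ifdef __AVX2__ in the real renderer
  } else {
    blend_sse(in, out, n);
  }
}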


@ -48,6 +48,45 @@ bool is_valid_bsp(const decompiler::LinkedObjectFile& file) {
return true;
}
void print_memory_usage(const tfrag3::Level& lev, int uncompressed_data_size) {
int total_accounted = 0;
auto memory_use_by_category = lev.get_memory_usage();
std::vector<std::pair<std::string, int>> known_categories = {
{"texture", memory_use_by_category[tfrag3::MemoryUsageCategory::TEXTURE]},
{"tie-deinst-vis", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_DEINST_VIS]},
{"tie-deinst-idx", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_DEINST_INDEX]},
{"tie-inst-vis", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_INST_VIS]},
{"tie-inst-idx", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_INST_INDEX]},
{"tie-bvh", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_BVH]},
{"tie-verts", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_VERTS]},
{"tie-colors", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_TIME_OF_DAY]},
{"tie-wind-inst-info",
memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_WIND_INSTANCE_INFO]},
{"tie-cidx", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_CIDX]},
{"tie-mats", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_MATRICES]},
{"tie-grps", memory_use_by_category[tfrag3::MemoryUsageCategory::TIE_GRPS]},
{"tfrag-vis", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_VIS]},
{"tfrag-idx", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_INDEX]},
{"tfrag-vert", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_VERTS]},
{"tfrag-colors", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_TIME_OF_DAY]},
{"tfrag-cluster", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_CLUSTER]},
{"tfrag-bvh", memory_use_by_category[tfrag3::MemoryUsageCategory::TFRAG_BVH]}};
for (auto& known : known_categories) {
total_accounted += known.second;
}
known_categories.push_back({"unknown", uncompressed_data_size - total_accounted});
std::sort(known_categories.begin(), known_categories.end(),
[](const auto& a, const auto& b) { return a.second > b.second; });
for (const auto& x : known_categories) {
fmt::print("{:30s} : {:6d} kB {:3.1f}%\n", x.first, x.second / 1024,
100.f * (float)x.second / uncompressed_data_size);
}
}
void extract_from_level(ObjectFileDB& db,
TextureDB& tex_db,
const std::string& dgo_name,
@ -85,7 +124,6 @@ void extract_from_level(ObjectFileDB& db,
for (auto& draw_tree : bsp_header.drawable_tree_array.trees) {
if (tfrag_trees.count(draw_tree->my_type())) {
auto as_tfrag_tree = dynamic_cast<level_tools::DrawableTreeTfrag*>(draw_tree.get());
fmt::print(" extracting tree {}\n", draw_tree->my_type());
ASSERT(as_tfrag_tree);
std::vector<std::pair<int, int>> expected_missing_textures;
auto it = hacks.missing_textures_by_level.find(level_name);
@ -96,13 +134,12 @@ void extract_from_level(ObjectFileDB& db,
bsp_header.texture_remap_table, tex_db, expected_missing_textures, tfrag_level,
dump_level);
} else if (draw_tree->my_type() == "drawable-tree-instance-tie") {
fmt::print(" extracting TIE\n");
auto as_tie_tree = dynamic_cast<level_tools::DrawableTreeInstanceTie*>(draw_tree.get());
ASSERT(as_tie_tree);
extract_tie(as_tie_tree, fmt::format("{}-{}-tie", dgo_name, i++),
bsp_header.texture_remap_table, tex_db, tfrag_level, dump_level);
} else {
fmt::print(" unsupported tree {}\n", draw_tree->my_type());
// fmt::print(" unsupported tree {}\n", draw_tree->my_type());
}
}
@ -110,6 +147,7 @@ void extract_from_level(ObjectFileDB& db,
tfrag_level.serialize(ser);
auto compressed =
compression::compress_zstd(ser.get_save_result().first, ser.get_save_result().second);
print_memory_usage(tfrag_level, ser.get_save_result().second);
fmt::print("compressed: {} -> {} ({:.2f}%)\n", ser.get_save_result().second, compressed.size(),
100.f * compressed.size() / ser.get_save_result().second);
file_util::write_binary_file(file_util::get_file_path({fmt::format(


@ -1975,13 +1975,14 @@ std::map<u32, std::vector<GroupedDraw>> make_draw_groups(std::vector<TFragDraw>&
}
}
fmt::print(" grouped to get {} draw calls\n", dc);
// fmt::print(" grouped to get {} draw calls\n", dc);
return result;
}
void make_tfrag3_data(std::map<u32, std::vector<GroupedDraw>>& draws,
tfrag3::TfragTree& tree_out,
std::vector<tfrag3::PreloadedVertex>& vertices,
std::vector<tfrag3::Texture>& texture_pool,
const TextureDB& tdb,
const std::vector<std::pair<int, int>>& expected_missing_textures) {
@ -2045,6 +2046,9 @@ void make_tfrag3_data(std::map<u32, std::vector<GroupedDraw>>& draws,
vgroup.num = strip.verts.size() + 1; // one for the primitive restart!
tdraw.num_triangles += strip.verts.size() - 2;
tfrag3::StripDraw::VertexRun run;
run.vertex0 = vertices.size();
run.length = strip.verts.size();
for (auto& vert : strip.verts) {
// convert vert.
tfrag3::PreloadedVertex vtx;
@ -2060,12 +2064,10 @@ void make_tfrag3_data(std::map<u32, std::vector<GroupedDraw>>& draws,
// ASSERT((vert.rgba >> 2) < 1024); spider cave has 2048?
ASSERT((vert.rgba & 3) == 0);
size_t vert_idx = tree_out.vertices.size();
tree_out.vertices.push_back(vtx);
tdraw.vertex_index_stream.push_back(vert_idx);
size_t vert_idx = vertices.size();
vertices.push_back(vtx);
}
tdraw.vertex_index_stream.push_back(UINT32_MAX); // prim restart
tdraw.runs.push_back(run);
tdraw.vis_groups.push_back(vgroup);
}
@ -2080,6 +2082,7 @@ void emulate_tfrags(int geom,
const std::vector<level_tools::TextureRemap>& map,
tfrag3::Level& level_out,
tfrag3::TfragTree& tree_out,
std::vector<tfrag3::PreloadedVertex>& vertices,
const TextureDB& tdb,
const std::vector<std::pair<int, int>>& expected_missing_textures,
bool dump_level) {
@ -2101,7 +2104,7 @@ void emulate_tfrags(int geom,
process_draw_mode(all_draws, map, tree_out.kind);
auto groups = make_draw_groups(all_draws);
make_tfrag3_data(groups, tree_out, level_out.textures, tdb, expected_missing_textures);
make_tfrag3_data(groups, tree_out, vertices, level_out.textures, tdb, expected_missing_textures);
if (dump_level) {
auto debug_out = debug_dump_to_obj(all_draws);
@ -2135,6 +2138,85 @@ void merge_groups(std::vector<tfrag3::StripDraw::VisGroup>& grps) {
} // namespace
constexpr float kClusterSize = 4096 * 40; // 100 in-game meters
constexpr float kMasterOffset = 12000 * 4096;
std::pair<u64, u16> position_to_cluster_and_offset(float in) {
in += kMasterOffset;
if (in < 0) {
fmt::print("negative: {}\n", in);
}
ASSERT(in >= 0);
int cluster_cell = (in / kClusterSize);
float leftover = in - (cluster_cell * kClusterSize);
u16 offset = (leftover / kClusterSize) * float(UINT16_MAX);
float recovered = ((float)cluster_cell + ((float)offset / UINT16_MAX)) * kClusterSize;
float diff = std::fabs(recovered - in);
ASSERT(diff < 7);
ASSERT(cluster_cell >= 0);
ASSERT(cluster_cell < UINT16_MAX);
return {cluster_cell, offset};
}
void pack_vertices(tfrag3::PackedTfragVertices* result,
const std::vector<tfrag3::PreloadedVertex>& vertices) {
u32 next_cluster_idx = 0;
std::map<u64, u32> clusters;
for (auto& vtx : vertices) {
auto x = position_to_cluster_and_offset(vtx.x);
auto y = position_to_cluster_and_offset(vtx.y);
auto z = position_to_cluster_and_offset(vtx.z);
u64 cluster_id = 0;
cluster_id |= x.first;
cluster_id |= (y.first << 16);
cluster_id |= (z.first << 32);
auto cluster_it = clusters.find(cluster_id);
u32 my_cluster_idx = 0;
if (cluster_it == clusters.end()) {
// first in cluster
clusters[cluster_id] = next_cluster_idx;
my_cluster_idx = next_cluster_idx;
next_cluster_idx++;
} else {
my_cluster_idx = cluster_it->second;
}
tfrag3::PackedTfragVertices::Vertex out_vtx;
out_vtx.xoff = x.second;
out_vtx.yoff = y.second;
out_vtx.zoff = z.second;
out_vtx.cluster_idx = my_cluster_idx;
// TODO check these
out_vtx.s = vtx.s * 1024;
out_vtx.t = vtx.t * 1024;
out_vtx.color_index = vtx.color_index;
result->vertices.push_back(out_vtx);
}
result->cluster_origins.resize(next_cluster_idx);
for (auto& cluster : clusters) {
auto& res = result->cluster_origins[cluster.second];
res.x() = (u16)cluster.first;
res.y() = (u16)(cluster.first >> 16);
res.z() = (u16)(cluster.first >> 32);
}
/*
std::unordered_set<tfrag3::PackedTfragVertices::Vertex, tfrag3::PackedTfragVertices::Vertex::hash>
a;
for (auto& v : result->vertices) {
a.insert(v);
}
fmt::print("SIZE: {} vs {} {}\n", a.size(), result->vertices.size(),
(float)a.size() / result->vertices.size());
*/
ASSERT(next_cluster_idx < UINT16_MAX);
}
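A small sketch (not from the commit) of the 64-bit map key used above to deduplicate clusters: the three u16 cell coordinates are packed into one u64 and later split back out into cluster_origins.

#include <cstdint>
uint64_t make_cluster_key(uint16_t cx, uint16_t cy, uint16_t cz) {
  return (uint64_t)cx | ((uint64_t)cy << 16) | ((uint64_t)cz << 32);
}
void split_cluster_key(uint64_t key, uint16_t* cx, uint16_t* cy, uint16_t* cz) {
  *cx = (uint16_t)key;          // bits 0-15
  *cy = (uint16_t)(key >> 16);  // bits 16-31
  *cz = (uint16_t)(key >> 32);  // bits 32-47
}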
void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
const std::string& debug_name,
const std::vector<level_tools::TextureRemap>& map,
@ -2142,7 +2224,7 @@ void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
const std::vector<std::pair<int, int>>& expected_missing_textures,
tfrag3::Level& out,
bool dump_level) {
// go through 4 lods(?)
// go through 3 lods(?)
for (int geom = 0; geom < GEOM_MAX; ++geom) {
tfrag3::TfragTree this_tree;
if (tree->my_type() == "drawable-tree-tfrag") {
@ -2176,7 +2258,8 @@ void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
}
bool ok = verify_node_indices(tree);
ASSERT(ok);
fmt::print(" tree has {} arrays and {} tfragments\n", tree->length, as_tfrag_array->length);
// fmt::print(" tree has {} arrays and {} tfragments\n", tree->length,
// as_tfrag_array->length);
auto vis_nodes = extract_vis_data(tree, as_tfrag_array->tfragments.front().id);
this_tree.bvh.first_leaf_node = vis_nodes.first_child_node;
@ -2198,8 +2281,10 @@ void extract_tfrag(const level_tools::DrawableTreeTfrag* tree,
}
// ASSERT(result.vis_nodes.last_child_node + 1 == idx);
emulate_tfrags(geom, as_tfrag_array->tfragments, debug_name, map, out, this_tree, tex_db,
expected_missing_textures, dump_level);
std::vector<tfrag3::PreloadedVertex> vertices;
emulate_tfrags(geom, as_tfrag_array->tfragments, debug_name, map, out, this_tree, vertices,
tex_db, expected_missing_textures, dump_level);
pack_vertices(&this_tree.packed_vertices, vertices);
extract_time_of_day(tree, this_tree);
for (auto& draw : this_tree.draws) {


@ -558,9 +558,9 @@ void update_proto_info(std::vector<TieProtoInfo>* out,
adgif.combo_tex = tex_combo;
// and the hidden value in the unused a+d
memcpy(&adgif.second_w, &gif_data.at(16 * (tex_idx * 5 + 1) + 12), 4);
// todo: figure out if this matters
// todo: figure out if this matters. maybe this is decal?
if (ra_tex0_val == 0x800000000) {
fmt::print("texture {} in {} has weird tex setting\n", tex->second.name, proto.name);
// fmt::print("texture {} in {} has weird tex setting\n", tex->second.name, proto.name);
}
// mipmap settings. we ignore, but get the hidden value
@ -2036,19 +2036,40 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
// bool using_wind = true; // hack, for testing
bool using_wind = proto.stiffness != 0.f;
// create the model first
std::vector<std::vector<std::pair<int, int>>> packed_vert_indices;
for (size_t frag_idx = 0; frag_idx < proto.frags.size(); frag_idx++) {
packed_vert_indices.emplace_back();
auto& frag_vert_indices = packed_vert_indices.back();
auto& frag = proto.frags[frag_idx]; // shared info for all instances of this frag
for (auto& strip : frag.strips) {
int start = tree.packed_vertices.vertices.size();
for (auto& vert : strip.verts) {
tree.packed_vertices.vertices.push_back(
{vert.pos.x(), vert.pos.y(), vert.pos.z(), vert.tex.x(), vert.tex.y()});
ASSERT(vert.tex.z() == 1.);
}
int end = tree.packed_vertices.vertices.size();
frag_vert_indices.emplace_back(start, end);
}
}
// loop over instances of the prototypes
for (auto& inst : proto.instances) {
// if we're using wind, we use the instanced renderer, which requires some extra info
// and we should remember which instance ID we are.
// Note: this is different from the game's instance index - we don't draw everything instanced
// so the non-instanced models don't get a C++ renderer instance ID
u32 wind_instance_idx = tree.instance_info.size();
u32 wind_instance_idx = tree.wind_instance_info.size();
u32 matrix_idx = tree.packed_vertices.matrices.size();
if (using_wind) {
tfrag3::TieWindInstance wind_instance_info;
wind_instance_info.wind_idx = inst.wind_index; // which wind value to apply in the table
wind_instance_info.stiffness = proto.stiffness; // wind stiffness (how much we move)
wind_instance_info.matrix = inst.mat; // instance transformation matrix.
tree.instance_info.push_back(wind_instance_info);
tree.wind_instance_info.push_back(wind_instance_info);
} else {
tree.packed_vertices.matrices.push_back(inst.mat);
}
// loop over fragments of the prototype
@ -2056,7 +2077,8 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
auto& frag = proto.frags[frag_idx]; // shared info for all instances of this frag
auto& ifrag = inst.frags.at(frag_idx); // color info for this instance of the frag
// loop over triangle strips within the fragment
for (auto& strip : frag.strips) {
for (size_t strip_idx = 0; strip_idx < frag.strips.size(); strip_idx++) {
auto& strip = frag.strips[strip_idx];
// what texture are we using?
u32 combo_tex = strip.adgif.combo_tex;
@ -2139,31 +2161,30 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
igroup.instance_idx = wind_instance_idx;
draw_to_add_to->num_triangles += strip.verts.size() - 2;
// note: this is a bit wasteful to duplicate the xyz/stq.
tfrag3::PackedTieVertices::MatrixGroup grp;
grp.matrix_idx = -1;
grp.start_vert = packed_vert_indices.at(frag_idx).at(strip_idx).first;
grp.end_vert = packed_vert_indices.at(frag_idx).at(strip_idx).second;
tree.packed_vertices.matrix_groups.push_back(grp);
for (auto& vert : strip.verts) {
tfrag3::PreloadedVertex vtx;
vtx.x = vert.pos.x();
vtx.y = vert.pos.y();
vtx.z = vert.pos.z();
vtx.s = vert.tex.x();
vtx.t = vert.tex.y();
vtx.q = vert.tex.z();
// if this is true, we can remove a divide in the shader
ASSERT(vtx.q == 1.f);
u16 color_index = 0;
if (vert.color_index_index == UINT32_MAX) {
vtx.color_index = 0;
color_index = 0;
} else {
vtx.color_index = ifrag.color_indices.at(vert.color_index_index);
color_index = ifrag.color_indices.at(vert.color_index_index);
ASSERT(vert.color_index_index < ifrag.color_indices.size());
vtx.color_index += ifrag.color_index_offset_in_big_palette;
color_index += ifrag.color_index_offset_in_big_palette;
}
size_t vert_idx = tree.vertices.size();
tree.vertices.push_back(vtx);
size_t vert_idx = tree.packed_vertices.color_indices.size();
tree.packed_vertices.color_indices.push_back(color_index);
draw_to_add_to->vertex_index_stream.push_back(vert_idx);
}
// the primitive restart index
draw_to_add_to->vertex_index_stream.push_back(UINT32_MAX);
draw_to_add_to->instance_groups.push_back(igroup);
} else {
// okay, we now have a texture and draw mode, let's see if we can add to an existing...
auto existing_draws_in_tex = static_draws_by_tex.find(idx_in_lev_data);
@ -2190,31 +2211,30 @@ void add_vertices_and_static_draw(tfrag3::TieTree& tree,
vgroup.vis_idx_in_pc_bvh = inst.vis_id; // associate with the instance for culling
vgroup.num = strip.verts.size() + 1; // one for the primitive restart!
draw_to_add_to->num_triangles += strip.verts.size() - 2;
tfrag3::PackedTieVertices::MatrixGroup grp;
grp.matrix_idx = matrix_idx;
grp.start_vert = packed_vert_indices.at(frag_idx).at(strip_idx).first;
grp.end_vert = packed_vert_indices.at(frag_idx).at(strip_idx).second;
tree.packed_vertices.matrix_groups.push_back(grp);
tfrag3::StripDraw::VertexRun run;
run.vertex0 = tree.packed_vertices.color_indices.size();
run.length = strip.verts.size();
for (auto& vert : strip.verts) {
tfrag3::PreloadedVertex vtx;
// todo fields
auto tf = transform_tie(inst.mat, vert.pos);
vtx.x = tf.x();
vtx.y = tf.y();
vtx.z = tf.z();
vtx.s = vert.tex.x();
vtx.t = vert.tex.y();
vtx.q = vert.tex.z();
// if this is true, we can remove a divide in the shader
ASSERT(vtx.q == 1.f);
u16 color_index = 0;
if (vert.color_index_index == UINT32_MAX) {
vtx.color_index = 0;
color_index = 0;
} else {
vtx.color_index = ifrag.color_indices.at(vert.color_index_index);
color_index = ifrag.color_indices.at(vert.color_index_index);
ASSERT(vert.color_index_index < ifrag.color_indices.size());
vtx.color_index += ifrag.color_index_offset_in_big_palette;
color_index += ifrag.color_index_offset_in_big_palette;
}
size_t vert_idx = tree.vertices.size();
tree.vertices.push_back(vtx);
draw_to_add_to->vertex_index_stream.push_back(vert_idx);
size_t vert_idx = tree.packed_vertices.color_indices.size();
tree.packed_vertices.color_indices.push_back(color_index);
// draw_to_add_to->vertex_index_stream.push_back(vert_idx);
}
draw_to_add_to->vertex_index_stream.push_back(UINT32_MAX);
draw_to_add_to->runs.push_back(run);
// draw_to_add_to->vertex_index_stream.push_back(UINT32_MAX);
draw_to_add_to->vis_groups.push_back(vgroup);
}
}
@ -2285,8 +2305,6 @@ void extract_tie(const level_tools::DrawableTreeInstanceTie* tree,
}
bool ok = verify_node_indices(tree);
ASSERT(ok);
fmt::print(" tree has {} arrays and {} instances\n", tree->length,
as_instance_array->length);
// extract the vis tree. Note that this extracts the tree only down to the last draw node, a
// parent of between 1 and 8 instances.
@ -2362,7 +2380,6 @@ void extract_tie(const level_tools::DrawableTreeInstanceTie* tree,
}
this_tree.colors = full_palette.colors;
fmt::print("TIE tree {} has {} draws\n", geo, this_tree.static_draws.size());
out.tie_trees[geo].push_back(std::move(this_tree));
}
}


@ -601,7 +601,11 @@ void DirectRenderer::render_gif(const u8* data,
}
if (size != UINT32_MAX) {
ASSERT((offset + 15) / 16 == size / 16);
if ((offset + 15) / 16 != size / 16) {
fmt::print("DirectRenderer size failed in {}\n", name_and_id());
fmt::print("expected: {}, got: {}\n", size, offset);
ASSERT(false);
}
}
// fmt::print("{}\n", GifTag(data).print());


@ -56,8 +56,27 @@ void Loader::loader_thread() {
Serializer ser(decomp_data.data(), decomp_data.size());
result->serialize(ser);
double import_time = import_timer.getSeconds();
fmt::print("------------> Load from file: {:.3f}s, import {:.3f}s, decomp {:.3f}s\n",
disk_load_time, import_time, decomp_time);
Timer unpack_timer;
for (auto& tie_tree : result->tie_trees) {
for (auto& tree : tie_tree) {
tree.unpack();
for (auto& d : tree.static_draws) {
d.unpack();
}
}
}
for (auto& t_tree : result->tfrag_trees) {
for (auto& tree : t_tree) {
tree.unpack();
for (auto& d : tree.draws) {
d.unpack();
}
}
}
fmt::print(
"------------> Load from file: {:.3f}s, import {:.3f}s, decomp {:.3f}s unpack {:.3f}s\n",
disk_load_time, import_time, decomp_time, unpack_timer.getSeconds());
lk.lock();
m_initializing_tfrag3_levels[lev].data.level = std::move(result);


@ -353,14 +353,14 @@ struct alignas(16) Accumulator {
auto b = _mm_set1_ps(_b);
auto a = _mm_load_ps(_a.data);
auto acc = _mm_load_ps(data);
_mm_store_ps(data, _mm_fmadd_ps(a, b, acc));
_mm_store_ps(data, _mm_add_ps(_mm_mul_ps(a, b), acc));
}
REALLY_INLINE void madda_xyzw(const Vf& _a, const Vf& _b) {
auto b = _mm_load_ps(_b.data);
auto a = _mm_load_ps(_a.data);
auto acc = _mm_load_ps(data);
_mm_store_ps(data, _mm_fmadd_ps(a, b, acc));
_mm_store_ps(data, _mm_add_ps(_mm_mul_ps(a, b), acc));
}
void madd(Mask mask, Vf& dest, const Vf& a, const Vf& b) {
@ -375,14 +375,14 @@ struct alignas(16) Accumulator {
auto b = _mm_set1_ps(_b);
auto a = _mm_load_ps(_a.data);
auto acc = _mm_load_ps(data);
_mm_store_ps(dest.data, _mm_fmadd_ps(a, b, acc));
_mm_store_ps(dest.data, _mm_add_ps(_mm_mul_ps(a, b), acc));
}
REALLY_INLINE void madd_xyz(Vf& dest, const Vf& _a, float _b) {
auto b = _mm_set1_ps(_b);
auto a = _mm_load_ps(_a.data);
auto acc = _mm_load_ps(data);
auto prod = _mm_fmadd_ps(a, b, acc);
auto prod = _mm_add_ps(_mm_mul_ps(a, b), acc);
prod = _mm_blend_ps(prod, _mm_load_ps(dest.data), 0b1000);
_mm_store_ps(dest.data, prod);
}


@ -1,5 +1,6 @@
#include "SkyBlendCPU.h"
#include "game/graphics/opengl_renderer/AdgifHandler.h"
#include "common/util/os.h"
#include <immintrin.h>
@ -18,33 +19,72 @@ SkyBlendCPU::~SkyBlendCPU() {
}
void blend_sky_initial_fast(u8 intensity, u8* out, const u8* in, u32 size) {
__m256i intensity_vec = _mm256_set1_epi16(intensity);
for (u32 i = 0; i < size / 16; i++) {
__m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16)));
__m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8);
tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm256_srli_epi16(tex_data16, 7);
auto hi = _mm256_extracti128_si256(tex_data16, 1);
auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi);
_mm_storeu_si128((__m128i*)(out + (i * 16)), result);
if (get_cpu_info().has_avx2) {
#ifdef __AVX2__
__m256i intensity_vec = _mm256_set1_epi16(intensity);
for (u32 i = 0; i < size / 16; i++) {
__m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16)));
__m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8);
tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm256_srli_epi16(tex_data16, 7);
auto hi = _mm256_extracti128_si256(tex_data16, 1);
auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi);
_mm_storeu_si128((__m128i*)(out + (i * 16)), result);
}
#else
ASSERT(false);
#endif
} else {
__m128i intensity_vec = _mm_set1_epi16(intensity);
for (u32 i = 0; i < size / 8; i++) {
__m128i tex_data8 = _mm_loadu_si64((const __m128i*)(in + (i * 8)));
__m128i tex_data16 = _mm_cvtepu8_epi16(tex_data8);
tex_data16 = _mm_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm_srli_epi16(tex_data16, 7);
auto result = _mm_packus_epi16(tex_data16, tex_data16);
_mm_storeu_si64((__m128i*)(out + (i * 8)), result);
}
}
}
void blend_sky_fast(u8 intensity, u8* out, const u8* in, u32 size) {
__m256i intensity_vec = _mm256_set1_epi16(intensity);
__m256i max_intensity = _mm256_set1_epi16(255);
for (u32 i = 0; i < size / 16; i++) {
__m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16)));
__m128i out_val = _mm_loadu_si128((const __m128i*)(out + (i * 16)));
__m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8);
tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm256_srli_epi16(tex_data16, 7);
tex_data16 = _mm256_min_epi16(max_intensity, tex_data16);
auto hi = _mm256_extracti128_si256(tex_data16, 1);
auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi);
out_val = _mm_adds_epu8(out_val, result);
_mm_storeu_si128((__m128i*)(out + (i * 16)), out_val);
if (get_cpu_info().has_avx2) {
#ifdef __AVX2__
__m256i intensity_vec = _mm256_set1_epi16(intensity);
__m256i max_intensity = _mm256_set1_epi16(255);
for (u32 i = 0; i < size / 16; i++) {
__m128i tex_data8 = _mm_loadu_si128((const __m128i*)(in + (i * 16)));
__m128i out_val = _mm_loadu_si128((const __m128i*)(out + (i * 16)));
__m256i tex_data16 = _mm256_cvtepu8_epi16(tex_data8);
tex_data16 = _mm256_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm256_srli_epi16(tex_data16, 7);
tex_data16 = _mm256_min_epi16(max_intensity, tex_data16);
auto hi = _mm256_extracti128_si256(tex_data16, 1);
auto result = _mm_packus_epi16(_mm256_castsi256_si128(tex_data16), hi);
out_val = _mm_adds_epu8(out_val, result);
_mm_storeu_si128((__m128i*)(out + (i * 16)), out_val);
}
#else
ASSERT(false);
#endif
} else {
__m128i intensity_vec = _mm_set1_epi16(intensity);
__m128i max_intensity = _mm_set1_epi16(255);
for (u32 i = 0; i < size / 8; i++) {
__m128i tex_data8 = _mm_loadu_si64((const __m128i*)(in + (i * 8)));
__m128i out_val = _mm_loadu_si64((const __m128i*)(out + (i * 8)));
__m128i tex_data16 = _mm_cvtepu8_epi16(tex_data8);
tex_data16 = _mm_mullo_epi16(tex_data16, intensity_vec);
tex_data16 = _mm_srli_epi16(tex_data16, 7);
tex_data16 = _mm_min_epi16(max_intensity, tex_data16);
auto result = _mm_packus_epi16(tex_data16, tex_data16);
out_val = _mm_adds_epu8(out_val, result);
_mm_storeu_si64((__m128i*)(out + (i * 8)), out_val);
}
}
/*
*/
}
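For reference, a scalar sketch (not part of the commit) of what both SIMD paths in these two functions compute, which is handy for checking the SSE fallback against the AVX2 version:

#include <algorithm>
#include <cstdint>
// blend_sky_initial_fast: out = saturate_u8((in * intensity) >> 7)
void blend_sky_initial_ref(uint8_t intensity, uint8_t* out, const uint8_t* in, uint32_t size) {
  for (uint32_t i = 0; i < size; i++) {
    out[i] = (uint8_t)std::min(255, (in[i] * intensity) >> 7);
  }
}
// blend_sky_fast: out = saturate_u8(out + min(255, (in * intensity) >> 7))
void blend_sky_ref(uint8_t intensity, uint8_t* out, const uint8_t* in, uint32_t size) {
  for (uint32_t i = 0; i < size; i++) {
    int scaled = std::min(255, (in[i] * intensity) >> 7);
    out[i] = (uint8_t)std::min(255, out[i] + scaled);  // saturating add
  }
}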
SkyBlendStats SkyBlendCPU::do_sky_blends(DmaFollower& dma,


@ -70,10 +70,10 @@ bool Tfrag3::update_load(const std::vector<tfrag3::TFragmentTreeKind>& tree_kind
if (std::find(tree_kinds.begin(), tree_kinds.end(), tree.kind) != tree_kinds.end()) {
max_draw = std::max(tree.draws.size(), max_draw);
for (auto& draw : tree.draws) {
idx_buffer_len += draw.vertex_index_stream.size();
idx_buffer_len += draw.unpacked.vertex_index_stream.size();
}
time_of_day_count = std::max(tree.colors.size(), time_of_day_count);
u32 verts = tree.vertices.size();
u32 verts = tree.packed_vertices.vertices.size();
glGenVertexArrays(1, &tree_cache.vao);
glBindVertexArray(tree_cache.vao);
glGenBuffers(1, &tree_cache.vertex_buffer);
@ -148,7 +148,7 @@ bool Tfrag3::update_load(const std::vector<tfrag3::TFragmentTreeKind>& tree_kind
const auto& tree = lev_data->tfrag_trees[geom][tree_idx];
if (std::find(tree_kinds.begin(), tree_kinds.end(), tree.kind) != tree_kinds.end()) {
u32 verts = tree.vertices.size();
u32 verts = tree.unpacked.vertices.size();
u32 start_vert = (m_load_state.vert) * MAX_VERTS;
u32 end_vert = std::min(verts, (m_load_state.vert + 1) * MAX_VERTS);
if (end_vert > start_vert) {
@ -156,7 +156,7 @@ bool Tfrag3::update_load(const std::vector<tfrag3::TFragmentTreeKind>& tree_kind
glBindBuffer(GL_ARRAY_BUFFER, m_cached_trees[geom][tree_idx].vertex_buffer);
glBufferSubData(GL_ARRAY_BUFFER, start_vert * sizeof(tfrag3::PreloadedVertex),
(end_vert - start_vert) * sizeof(tfrag3::PreloadedVertex),
tree.vertices.data() + start_vert);
tree.unpacked.vertices.data() + start_vert);
if (end_vert < verts) {
remaining = true;
}
@ -274,7 +274,7 @@ void Tfrag3::render_tree(int geom,
void* offset = (void*)(indices.first * sizeof(u32));
prof.add_draw_call();
prof.add_tri(draw.num_triangles * (float)draw_size / draw.vertex_index_stream.size());
prof.add_tri(draw.num_triangles * (float)draw_size / draw.unpacked.vertex_index_stream.size());
glDrawElements(GL_TRIANGLE_STRIP, draw_size, GL_UNSIGNED_INT, (void*)offset);


@ -40,18 +40,18 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
const auto& tree = lev_data->tie_trees[geo][tree_idx];
max_draw = std::max(tree.static_draws.size(), max_draw);
for (auto& draw : tree.static_draws) {
idx_buffer_len += draw.vertex_index_stream.size();
max_idx_per_draw = std::max(max_idx_per_draw, draw.vertex_index_stream.size());
idx_buffer_len += draw.unpacked.vertex_index_stream.size();
max_idx_per_draw = std::max(max_idx_per_draw, draw.unpacked.vertex_index_stream.size());
}
for (auto& draw : tree.instanced_wind_draws) {
wind_idx_buffer_len += draw.vertex_index_stream.size();
max_idx_per_draw = std::max(max_idx_per_draw, draw.vertex_index_stream.size());
}
for (auto& inst : tree.instance_info) {
for (auto& inst : tree.wind_instance_info) {
max_wind_idx = std::max(max_wind_idx, inst.wind_idx);
}
time_of_day_count = std::max(tree.colors.size(), time_of_day_count);
u32 verts = tree.vertices.size();
u32 verts = tree.packed_vertices.color_indices.size();
fmt::print(" tree {} has {} verts ({} kB) and {} draws\n", tree_idx, verts,
verts * sizeof(tfrag3::PreloadedVertex) / 1024.f, tree.static_draws.size());
auto& lod_tree = m_trees.at(geo);
@ -62,7 +62,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
lod_tree[tree_idx].draws = &tree.static_draws; // todo - should we just copy this?
lod_tree[tree_idx].colors = &tree.colors;
lod_tree[tree_idx].vis = &tree.bvh;
lod_tree[tree_idx].instance_info = &tree.instance_info;
lod_tree[tree_idx].instance_info = &tree.wind_instance_info;
lod_tree[tree_idx].wind_draws = &tree.instanced_wind_draws;
vis_temp_len = std::max(vis_temp_len, tree.bvh.vis_nodes.size());
lod_tree[tree_idx].tod_cache = swizzle_time_of_day(tree.colors);
@ -107,7 +107,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
lod_tree[tree_idx].index_list.resize(idx_buffer_len);
if (wind_idx_buffer_len > 0) {
lod_tree[tree_idx].wind_matrix_cache.resize(tree.instance_info.size());
lod_tree[tree_idx].wind_matrix_cache.resize(tree.wind_instance_info.size());
lod_tree[tree_idx].has_wind = true;
glGenBuffers(1, &lod_tree[tree_idx].wind_vertex_index_buffer);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, lod_tree[tree_idx].wind_vertex_index_buffer);
@ -158,7 +158,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
for (int geo = 0; geo < 4; ++geo) {
for (size_t tree_idx = 0; tree_idx < lev_data->tie_trees[geo].size(); tree_idx++) {
const auto& tree = lev_data->tie_trees[geo][tree_idx];
u32 verts = tree.vertices.size();
u32 verts = tree.unpacked.vertices.size();
u32 start_vert = (m_load_state.vert) * MAX_VERTS;
u32 end_vert = std::min(verts, (m_load_state.vert + 1) * MAX_VERTS);
if (end_vert > start_vert) {
@ -166,7 +166,7 @@ bool Tie3::update_load(const tfrag3::Level* lev_data) {
glBindBuffer(GL_ARRAY_BUFFER, m_trees[geo][tree_idx].vertex_buffer);
glBufferSubData(GL_ARRAY_BUFFER, start_vert * sizeof(tfrag3::PreloadedVertex),
(end_vert - start_vert) * sizeof(tfrag3::PreloadedVertex),
tree.vertices.data() + start_vert);
tree.unpacked.vertices.data() + start_vert);
if (end_vert < verts) {
remaining = true;
}
@ -440,7 +440,6 @@ void Tie3::render(DmaFollower& dma, SharedRenderState* render_state, ScopedProfi
m_has_level = setup_for_level(m_pc_port_data.level_name, render_state);
}
render_all_trees(lod(), settings, render_state, prof);
// todo render all...
}
void Tie3::render_all_trees(int geom,
@ -659,9 +658,9 @@ void Tie3::render_tree(int idx,
void* offset = (void*)(indices.first * sizeof(u32));
prof.add_draw_call();
prof.add_tri(draw.num_triangles * (float)draw_size / draw.vertex_index_stream.size());
prof.add_tri(draw.num_triangles * (float)draw_size / draw.unpacked.vertex_index_stream.size());
bool is_full = draw_size == (int)draw.vertex_index_stream.size();
bool is_full = draw_size == (int)draw.unpacked.vertex_index_stream.size();
tree.perf.draws++;
if (is_full) {


@ -48,8 +48,6 @@ class Tie3 : public BucketRenderer {
SharedRenderState* render_state,
ScopedProfilerNode& prof);
int m_geom = 0;
struct Tree {
GLuint vertex_buffer;
GLuint index_buffer;


@ -3,6 +3,7 @@
#include "tfrag_common.h"
#include "game/graphics/opengl_renderer/BucketRenderer.h"
#include "game/graphics/pipelines/opengl.h"
#include "common/util/os.h"
#include <immintrin.h>
@ -212,11 +213,12 @@ SwizzledTimeOfDay swizzle_time_of_day(const std::vector<tfrag3::TimeOfDayColor>&
// Due to using integers instead of floats, it may be a tiny bit different.
// TODO: it might be possible to reorder the loop into two blocks of loads and avoid spilling xmms.
// It's ~8x faster than the slow version.
void interp_time_of_day_fast(const float weights[8],
const SwizzledTimeOfDay& in,
math::Vector<u8, 4>* out) {
// even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
void interp_time_of_day_fast_avx2(const float weights[8],
const SwizzledTimeOfDay& in,
math::Vector<u8, 4>* out) {
// even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
#ifdef __AVX2__
// weight multipliers
__m256i weights0 = _mm256_set1_epi16(weights[0] * 64.f);
__m256i weights1 = _mm256_set1_epi16(weights[1] * 64.f);
@ -234,7 +236,7 @@ void interp_time_of_day_fast(const float weights[8],
255, 255, 255);
for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
// first, load colors. We put 16 bytes / register and don't touch the upper half because we will
// first, load colors. We put 16 bytes / register and don't touch the upper half because we
// convert u8s to u16s.
const u8* base = in.data.data() + color_quad * 128;
__m128i color0_p = _mm_loadu_si128((const __m128i*)(base + 0));
@ -290,6 +292,149 @@ void interp_time_of_day_fast(const float weights[8],
// store result
_mm_storeu_si128((__m128i*)(&out[color_quad * 4]), result);
}
#else
// unreachable.
ASSERT(false);
#endif
}
void interp_time_of_day_fast(const float weights[8],
const SwizzledTimeOfDay& in,
math::Vector<u8, 4>* out) {
// even though the colors are 8 bits, we'll use 16 bits so we can saturate correctly
if (get_cpu_info().has_avx2) {
interp_time_of_day_fast_avx2(weights, in, out);
return;
}
// weight multipliers
__m128i weights0 = _mm_set1_epi16(weights[0] * 64.f);
__m128i weights1 = _mm_set1_epi16(weights[1] * 64.f);
__m128i weights2 = _mm_set1_epi16(weights[2] * 64.f);
__m128i weights3 = _mm_set1_epi16(weights[3] * 64.f);
__m128i weights4 = _mm_set1_epi16(weights[4] * 64.f);
__m128i weights5 = _mm_set1_epi16(weights[5] * 64.f);
__m128i weights6 = _mm_set1_epi16(weights[6] * 64.f);
__m128i weights7 = _mm_set1_epi16(weights[7] * 64.f);
// saturation: note that alpha is saturated to 128 but the rest are 255.
// TODO: maybe we should saturate to 255 for everybody (can do this using a single packus) and
// change the shader to deal with this.
__m128i sat = _mm_set_epi16(128, 255, 255, 255, 128, 255, 255, 255);
for (u32 color_quad = 0; color_quad < in.color_count / 4; color_quad++) {
// first, load colors. We put 16 bytes / register and don't touch the upper half because we
// convert u8s to u16s.
{
const u8* base = in.data.data() + color_quad * 128;
__m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
__m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
__m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
__m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
__m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
__m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
__m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
__m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));
// unpack to 16-bits. each has 16x 16 bit colors.
__m128i color0 = _mm_cvtepu8_epi16(color0_p);
__m128i color1 = _mm_cvtepu8_epi16(color1_p);
__m128i color2 = _mm_cvtepu8_epi16(color2_p);
__m128i color3 = _mm_cvtepu8_epi16(color3_p);
__m128i color4 = _mm_cvtepu8_epi16(color4_p);
__m128i color5 = _mm_cvtepu8_epi16(color5_p);
__m128i color6 = _mm_cvtepu8_epi16(color6_p);
__m128i color7 = _mm_cvtepu8_epi16(color7_p);
// multiply by weights
color0 = _mm_mullo_epi16(color0, weights0);
color1 = _mm_mullo_epi16(color1, weights1);
color2 = _mm_mullo_epi16(color2, weights2);
color3 = _mm_mullo_epi16(color3, weights3);
color4 = _mm_mullo_epi16(color4, weights4);
color5 = _mm_mullo_epi16(color5, weights5);
color6 = _mm_mullo_epi16(color6, weights6);
color7 = _mm_mullo_epi16(color7, weights7);
// add. This order minimizes dependencies.
color0 = _mm_add_epi16(color0, color1);
color2 = _mm_add_epi16(color2, color3);
color4 = _mm_add_epi16(color4, color5);
color6 = _mm_add_epi16(color6, color7);
color0 = _mm_add_epi16(color0, color2);
color4 = _mm_add_epi16(color4, color6);
color0 = _mm_add_epi16(color0, color4);
// divide, because we multiplied our weights by 2^6.
color0 = _mm_srli_epi16(color0, 6);
// saturate
color0 = _mm_min_epu16(sat, color0);
// back to u8s.
auto result = _mm_packus_epi16(color0, color0);
// store result
_mm_storeu_si64((__m128i*)(&out[color_quad * 4]), result);
}
{
const u8* base = in.data.data() + color_quad * 128 + 8;
__m128i color0_p = _mm_loadu_si64((const __m128i*)(base + 0));
__m128i color1_p = _mm_loadu_si64((const __m128i*)(base + 16));
__m128i color2_p = _mm_loadu_si64((const __m128i*)(base + 32));
__m128i color3_p = _mm_loadu_si64((const __m128i*)(base + 48));
__m128i color4_p = _mm_loadu_si64((const __m128i*)(base + 64));
__m128i color5_p = _mm_loadu_si64((const __m128i*)(base + 80));
__m128i color6_p = _mm_loadu_si64((const __m128i*)(base + 96));
__m128i color7_p = _mm_loadu_si64((const __m128i*)(base + 112));
// unpack to 16-bits. each has 16x 16 bit colors.
__m128i color0 = _mm_cvtepu8_epi16(color0_p);
__m128i color1 = _mm_cvtepu8_epi16(color1_p);
__m128i color2 = _mm_cvtepu8_epi16(color2_p);
__m128i color3 = _mm_cvtepu8_epi16(color3_p);
__m128i color4 = _mm_cvtepu8_epi16(color4_p);
__m128i color5 = _mm_cvtepu8_epi16(color5_p);
__m128i color6 = _mm_cvtepu8_epi16(color6_p);
__m128i color7 = _mm_cvtepu8_epi16(color7_p);
// multiply by weights
color0 = _mm_mullo_epi16(color0, weights0);
color1 = _mm_mullo_epi16(color1, weights1);
color2 = _mm_mullo_epi16(color2, weights2);
color3 = _mm_mullo_epi16(color3, weights3);
color4 = _mm_mullo_epi16(color4, weights4);
color5 = _mm_mullo_epi16(color5, weights5);
color6 = _mm_mullo_epi16(color6, weights6);
color7 = _mm_mullo_epi16(color7, weights7);
// add. This order minimizes dependencies.
color0 = _mm_add_epi16(color0, color1);
color2 = _mm_add_epi16(color2, color3);
color4 = _mm_add_epi16(color4, color5);
color6 = _mm_add_epi16(color6, color7);
color0 = _mm_add_epi16(color0, color2);
color4 = _mm_add_epi16(color4, color6);
color0 = _mm_add_epi16(color0, color4);
// divide, because we multiplied our weights by 2^6.
color0 = _mm_srli_epi16(color0, 6);
// saturate
color0 = _mm_min_epu16(sat, color0);
// back to u8s.
auto result = _mm_packus_epi16(color0, color0);
// store result
_mm_storeu_si64((__m128i*)(&out[color_quad * 4 + 2]), result);
}
}
}
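A scalar reference sketch of the blend both SIMD paths implement (not part of the commit; the swizzled layout is inferred from the loads above: each group of 4 colors occupies 128 bytes, arranged as 8 palettes x 4 colors x 4 channels):

#include <algorithm>
#include <cstdint>
void interp_time_of_day_ref(const float weights[8],
                            uint32_t color_count,     // assumed multiple of 4
                            const uint8_t* swizzled,  // color_count / 4 groups of 128 bytes
                            uint8_t* out) {           // color_count * 4 bytes, RGBA
  int w[8];
  for (int k = 0; k < 8; k++) {
    w[k] = (int)(weights[k] * 64.f);  // scaled by 64, like the SIMD weights
  }
  for (uint32_t quad = 0; quad < color_count / 4; quad++) {
    const uint8_t* base = swizzled + quad * 128;
    for (int c = 0; c < 4; c++) {      // 4 colors in this group
      for (int ch = 0; ch < 4; ch++) { // r, g, b, a
        int sum = 0;
        for (int k = 0; k < 8; k++) {  // 8 time-of-day palettes
          sum += base[k * 16 + c * 4 + ch] * w[k];
        }
        int sat = (ch == 3) ? 128 : 255;  // alpha saturates at 128, rgb at 255
        out[(quad * 4 + c) * 4 + ch] = (uint8_t)std::min(sum >> 6, sat);
      }
    }
  }
}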
bool sphere_in_view_ref(const math::Vector4f& sphere, const math::Vector4f* planes) {
@ -327,9 +472,9 @@ u32 make_all_visible_index_list(std::pair<int, int>* group_out,
const auto& draw = draws[i];
std::pair<int, int> ds;
ds.first = idx_buffer_ptr;
memcpy(&idx_out[idx_buffer_ptr], draw.vertex_index_stream.data(),
draw.vertex_index_stream.size() * sizeof(u32));
idx_buffer_ptr += draw.vertex_index_stream.size();
memcpy(&idx_out[idx_buffer_ptr], draw.unpacked.vertex_index_stream.data(),
draw.unpacked.vertex_index_stream.size() * sizeof(u32));
idx_buffer_ptr += draw.unpacked.vertex_index_stream.size();
ds.second = idx_buffer_ptr;
group_out[i] = ds;
}
@ -357,7 +502,7 @@ u32 make_index_list_from_vis_string(std::pair<int, int>* group_out,
} else {
building_run = false;
idx_buffer_ptr += grp.num;
memcpy(&idx_out[run_start_out], &draw.vertex_index_stream[run_start_in],
memcpy(&idx_out[run_start_out], &draw.unpacked.vertex_index_stream[run_start_in],
(idx_buffer_ptr - run_start_out) * sizeof(u32));
}
} else {
@ -372,7 +517,7 @@ u32 make_index_list_from_vis_string(std::pair<int, int>* group_out,
vtx_idx += grp.num;
}
if (building_run) {
memcpy(&idx_out[run_start_out], &draw.vertex_index_stream[run_start_in],
memcpy(&idx_out[run_start_out], &draw.unpacked.vertex_index_stream[run_start_in],
(idx_buffer_ptr - run_start_out) * sizeof(u32));
}


@ -159,7 +159,7 @@ std::vector<std::shared_ptr<TextureRecord>> TexturePool::convert_textures(const
// the sizes given aren't the actual sizes in memory, so if you just use that, you get the
// wrong answer. I solved this in the decompiler by using the size of the actual data, but we
// don't really have that here.
u32 size = ((sizes[0] + sizes[1] + sizes[2] + 2047) / 256) * 256;
u32 size = ((sizes[0] + sizes[1] + sizes[2] + 4096) / 256) * 256;
m_tex_converter.upload(memory_base + texture_page.segment[0].block_data_ptr,
texture_page.segment[0].dest, size);


@ -9,6 +9,7 @@
#include "common/log/log.h"
#include "common/util/FileUtil.h"
#include "game/discord.h"
#include "common/util/os.h"
// Discord RPC
extern int64_t gStartTime;
@ -28,17 +29,49 @@ void setup_logging(bool verbose) {
}
int main(int argc, char** argv) {
// do this as soon as possible - stuff like memcpy might use AVX instructions and we want to
// warn the user instead of just crashing.
setup_cpu_info();
if (!get_cpu_info().has_avx) {
printf("Your CPU does not support AVX, which is required for OpenGOAL.\n");
return -1;
}
bool verbose = false;
bool disable_avx2 = false;
for (int i = 1; i < argc; i++) {
if (std::string("-v") == argv[i]) {
verbose = true;
break;
}
if (std::string("-no-avx2") == argv[i]) {
disable_avx2 = true;
}
}
gStartTime = time(0);
init_discord_rpc();
if (disable_avx2) {
// for debugging the non-avx2 code paths, there's a flag to manually disable.
printf("Note: AVX2 code has been manually disabled.\n");
get_cpu_info().has_avx2 = false;
}
#ifndef __AVX2__
if (get_cpu_info().has_avx2) {
printf("Note: your CPU supports AVX2, but this build was not compiled with AVX2 support\n");
get_cpu_info().has_avx2 = false;
}
#endif
if (get_cpu_info().has_avx2) {
printf("AVX2 mode enabled\n");
} else {
printf("AVX2 mode disabled\n");
}
setup_logging(verbose);
while (true) {


@ -9,12 +9,17 @@
#include "gtest/gtest.h"
#include "test/all_jak1_symbols.h"
#include "common/util/json_util.h"
#include "common/util/os.h"
#include "common/util/Range.h"
#include "third-party/fmt/core.h"
#include "common/util/print_float.h"
#include "common/util/CopyOnWrite.h"
#include "common/util/SmallVector.h"
TEST(CommonUtil, CpuInfo) {
setup_cpu_info();
}
TEST(CommonUtil, get_file_path) {
std::vector<std::string> test = {"cabbage", "banana", "apple"};
std::string sampleString = file_util::get_file_path(test);
@ -390,5 +395,6 @@ TEST(SmallVector, Construction) {
TEST(Assert, Death) {
EXPECT_DEATH(private_assert_failed("foo", "bar", 12, "aaa"), "");
}
} // namespace test
} // namespace cu