[jak2] Floating point blerc (#2715)

This moves the blerc math from mips2c to the Merc2 renderer, and uses floats instead. We could potentially do this on the GPU, which would be even faster, but this isn't that slow in the first place.
2024-10-20 00:57:44 -04:00 · 2023-06-11 12:35:08 -04:00 · 2023-06-11 12:35:08 -04:00 · ad5cec1bb4
parent a88c2d2460
commit ad5cec1bb4
20 changed files with 1166 additions and 259 deletions
--- a/common/custom_data/TFrag3Data.cpp
+++ b/common/custom_data/TFrag3Data.cpp
@ -398,6 +398,11 @@ void MercDraw::serialize(Serializer& ser) {
  ser.from_ptr(&num_triangles);
 }

+void Blerc::serialize(Serializer& ser) {  // pass both blerc arrays through the serializer
+  ser.from_pod_vector(&float_data);  // base and per-target position/normal vectors
+  ser.from_pod_vector(&int_data);    // target indices, terminators, and dest vertex indices
+}
+
 void MercModifiableDrawGroup::serialize(Serializer& ser) {
  if (ser.is_saving()) {
    ser.save<size_t>(mod_draw.size());
@ -420,6 +425,8 @@ void MercModifiableDrawGroup::serialize(Serializer& ser) {
  ser.from_pod_vector(&vertex_lump4_addr);
  ser.from_pod_vector(&fragment_mask);
  ser.from_ptr(&expect_vidx_end);
+
+  blerc.serialize(ser);
 }

 void MercEffect::serialize(Serializer& ser) {
@ -537,6 +544,8 @@ void MercModifiableDrawGroup::memory_usage(MemoryUsageTracker* tracker) const {
  tracker->add(MemoryUsageCategory::MERC_MOD_DRAW_1, sizeof(MercDraw) * fix_draw.size());
  tracker->add(MemoryUsageCategory::MERC_MOD_DRAW_2, sizeof(MercDraw) * mod_draw.size());
  tracker->add(MemoryUsageCategory::MERC_MOD_TABLE, sizeof(u16) * vertex_lump4_addr.size());
+  tracker->add(MemoryUsageCategory::BLERC, sizeof(BlercFloatData) * blerc.float_data.size());
+  tracker->add(MemoryUsageCategory::BLERC, sizeof(u32) * blerc.int_data.size());
 }

 void MercEffect::memory_usage(MemoryUsageTracker* tracker) const {
@ -688,6 +697,7 @@ void print_memory_usage(const tfrag3::Level& lev, int uncompressed_data_size) {
      {"merc-mod-table", mem_use.data[tfrag3::MemoryUsageCategory::MERC_MOD_TABLE]},
      {"merc-mod-draw-1", mem_use.data[tfrag3::MemoryUsageCategory::MERC_MOD_DRAW_1]},
      {"merc-mod-draw-2", mem_use.data[tfrag3::MemoryUsageCategory::MERC_MOD_DRAW_2]},
+      {"blerc", mem_use.data[tfrag3::MemoryUsageCategory::BLERC]},
  };
  for (auto& known : known_categories) {
    total_accounted += known.second;
--- a/common/custom_data/Tfrag3Data.h
+++ b/common/custom_data/Tfrag3Data.h
@ -18,6 +18,8 @@ namespace tfrag3 {
 // - if changing any large things (vertices, vis, bvh, colors, textures) update get_memory_usage
 // - if adding a new category to the memory usage, update extract_level to print it.

+constexpr int TFRAG3_VERSION = 36;
+
 enum MemoryUsageCategory {
  TEXTURE,

@ -55,6 +57,7 @@ enum MemoryUsageCategory {
  MERC_MOD_VERT,
  MERC_MOD_IND,
  MERC_MOD_TABLE,
+  BLERC,

  COLLISION,

@ -73,8 +76,6 @@ struct MemoryUsageTracker {
  void add(MemoryUsageCategory category, u32 size_bytes) { data[category] += size_bytes; }
 };

-constexpr int TFRAG3_VERSION = 35;
-
 // These vertices should be uploaded to the GPU at load time and don't change
 struct PreloadedVertex {
  // the vertex position
@ -437,7 +438,7 @@ struct CollisionMesh {
 // MERC

 struct MercVertex {
-  float pos[3];
+  alignas(32) float pos[3];
  float pad0;

  float normal[3];
@ -464,12 +465,38 @@ struct MercDraw {
  void serialize(Serializer& ser);
 };

+struct BlercFloatData {
+  // one position/normal pair: [x, y, z, pad, nx, ny, nz, pad]
+  // note that this should match the layout of the pos/pad0/normal fields of the merc vertex above
+  alignas(32) float v[8];  // 8 floats, 32-byte aligned
+};
+
+/*!
+ * Data to modify vertices based on blend shapes.
+ * Both arrays hold one variable-length record per blended vertex, written in the same vertex
+ * order, so they are meant to be walked together from start to end.
+ */
+struct Blerc {
+  std::vector<BlercFloatData> float_data;  // per vertex: base, then one offset per listed target
+  std::vector<u32> int_data;               // per vertex: target indices, terminator, dest
+  static constexpr u32 kTargetIdxTerminator = UINT32_MAX;  // ends a vertex's target index list
+  void serialize(Serializer& ser);
+
+  // int data, per vertex:
+  // [tgt0_idx, tgt1_idx, ..., terminator, dest]  (dest = index of the vertex to write)
+  // float data, per vertex:
+  // [base, tgt0, tgt1, ...]  (one BlercFloatData each, holding a position and a normal)
+
+  // final vertex position (the normal is combined the same way) is:
+  // base + sum(tgtn * weights[tgtn_idx])
+};
+
 struct MercModifiableDrawGroup {
  std::vector<MercVertex> vertices;          // the modifiable vertices of this effect
  std::vector<u16> vertex_lump4_addr;        // per vertex: its index in the EE-layout (lump4) data
  std::vector<MercDraw> fix_draw, mod_draw;  // draws of unmodifiable / modifiable vertices
  std::vector<u8> fragment_mask;             // per fragment: set if it has modifiable vertices
+  Blerc blerc;                               // blend shape data targeting `vertices`
  u32 expect_vidx_end = 0;
+
  void serialize(Serializer& ser);
  void memory_usage(MemoryUsageTracker* tracker) const;
 };
--- a/common/serialization/subtitles2/subtitles2_ser.cpp
+++ b/common/serialization/subtitles2/subtitles2_ser.cpp
@ -48,6 +48,7 @@ const std::vector<std::string> get_speaker_names(GameVersion version) {
  switch (version) {
    case GameVersion::Jak2:
      return s_speakers_jak2;
+    default:
      break;
  }
  throw std::runtime_error(
--- a/decompiler/level_extractor/MercData.cpp
+++ b/decompiler/level_extractor/MercData.cpp
@ -1,6 +1,7 @@
 #include "MercData.h"

 #include "common/dma/gs.h"
+#include "common/util/BitUtils.h"

 #include "decompiler/ObjectFile/LinkedObjectFile.h"
 #include "decompiler/util/DecompilerTypeSystem.h"
@ -385,12 +386,24 @@ void MercEffect::from_ref(TypedRef tr,
    f = frag_geo.emplace_back().from_ref(f, dts, frag_ctrl.at(i), main_control);
  }

-  // do blend ctrls
+  // do blend ctrls/data
  if (blend_frag_count) {
+    // each fragment has a blend-ctrl and a blend-data.
+
    TypedRef bc(deref_label(get_field_ref(tr, "blend-ctrl", dts)),
                dts.ts.lookup_type("merc-blend-ctrl"));
+    Ref bd(deref_label(get_field_ref(tr, "blend-data", dts)));
+
    for (u32 i = 0; i < blend_frag_count; i++) {
      bc = blend_ctrl.emplace_back().from_ref(bc, dts, main_control.blend_target_count);
+      const auto& ctrl = blend_ctrl.back();
+      // the order of the data is [target][vtx]
+      // Each target is 16 bytes aligned because it gets dma'd to the scratchpad separately.
+      // Each vertex uses 6 bytes (1 byte for each of x,y,z,nx,ny,nz).
+      int stride = align16(6 * ctrl.blend_vtx_count);
+      // add an additional target for the "base" position.
+      int data_size = stride * (1 + ctrl.nonzero_index_count);
+      bd = blend_data.emplace_back().from_ref(bd, data_size);
    }
  }

@ -439,43 +452,6 @@ void MercCtrl::from_ref(TypedRef tr, const DecompilerTypeSystem& dts, GameVersio
    effects.emplace_back().from_ref(eff_ref, dts, header);
    eff_ref.ref.byte_offset += 32;  //
  }
-
-  // debug_print_blerc();
-}
-
-void MercCtrl::debug_print_blerc() {
-  int total_verts = 0;
-  int blerc_verts = 0;
-  int total_frags = 0;
-  int blerc_frags = 0;
-  int total_effects = effects.size();
-  int blerc_effects = 0;
-
-  for (auto& effect : effects) {
-    bool effect_has_blerc = false;
-    for (size_t frag_idx = 0; frag_idx < effect.frag_count; frag_idx++) {
-      total_frags++;
-      auto& fc = effect.frag_ctrl.at(frag_idx);
-      total_verts += fc.lump_four_count;
-
-      if (frag_idx < effect.blend_ctrl.size()) {
-        auto& bfc = effect.blend_ctrl.at(frag_idx);
-        if (bfc.blend_vtx_count) {
-          effect_has_blerc = true;
-          blerc_frags++;
-          blerc_verts += fc.lump_four_count;
-        }
-      }
-    }
-
-    if (effect_has_blerc) {
-      blerc_effects++;
-    }
-  }
-  if (blerc_effects) {
-    fmt::print("BLERC: {}, {}/{} e, {}/{} f, {}/{} v\n", name, blerc_effects, total_effects,
-               blerc_frags, total_frags, blerc_verts, total_verts);
-  }
 }

 TypedRef MercBlendCtrl::from_ref(TypedRef tr,
@ -491,6 +467,14 @@ TypedRef MercBlendCtrl::from_ref(TypedRef tr,
  return tr;
 }

+/*!
+ * Append num_bytes bytes, read one at a time starting at ref, to u8_data.
+ * Returns the ref advanced past the bytes that were read, so the blend data of consecutive
+ * fragments can be read back-to-back. A non-positive num_bytes reads nothing.
+ */
+Ref MercBlendData::from_ref(Ref ref, int num_bytes) {
+  if (num_bytes > 0) {
+    // the final size is known up front, so grow the vector once instead of during the loop.
+    u8_data.reserve(u8_data.size() + num_bytes);
+  }
+  for (int i = 0; i < num_bytes; i++) {
+    u8_data.push_back(deref_u8(ref, 0));
+    ref.byte_offset += 1;
+  }
+  return ref;
+}
+
 std::string MercCtrl::print() {
  std::string result;
  result += fmt::format("name: {}\n", name);
--- a/decompiler/level_extractor/MercData.h
+++ b/decompiler/level_extractor/MercData.h
@ -170,12 +170,38 @@ struct MercFragment {
 };

 struct MercBlendCtrl {
-  u8 blend_vtx_count;
-  u8 nonzero_index_count;
+  u8 blend_vtx_count;  // number of blended vertices in this fragment
+
+  // if a fragment is not influenced by a target, the offsets would all be zero, and those
+  // offsets aren't stored. The format works like this:
+  // if bt_index[tgt_idx] == 0:
+  //   the target doesn't influence this fragment
+  // else:
+  //   the bt_index[tgt_idx] group of offsets is for tgt_idx.
+
+  // All the nonzero entries of bt_index are increasing.
+
+  // For example:
+  // 0, 0, 1, 0, 2, 3
+  // indicates that this blend fragment is used in targets 2, 4, and 5.
+  // group 1 is the offsets for target 2, group 2 for 4, and group 3 for 5.
+
+  // the group 0 offsets are actually the vertex base position, and should be treated as
+  // unsigned. All other offsets are signed offsets.
+
+  u8 nonzero_index_count;  // number of nonzeros in the bt_index table
+
+  // which groups correspond to which targets (see comment above)
+  // the length of this array is always the number of blend targets for the effect.
  std::vector<u8> bt_index;
  TypedRef from_ref(TypedRef tr, const DecompilerTypeSystem& dts, int blend_target_count);
 };

+struct MercBlendData {  // the raw blend-data bytes of one blend fragment
+  std::vector<u8> u8_data;  // [group][vtx] order, 6 bytes per vertex, groups padded to 16 bytes
+  Ref from_ref(Ref ref, int num_bytes);  // append num_bytes bytes from ref, return advanced ref
+};
+
 struct MercExtraInfo {
  std::optional<MercShader> shader;
 };
@ -190,8 +216,9 @@ struct MercEffect {
  // (frag-ctrl        merc-fragment-control  :offset-assert 4)
  std::vector<MercFragmentControl> frag_ctrl;
  // (blend-data       merc-blend-data        :offset-assert 8) ??
-  std::vector<MercBlendCtrl> blend_ctrl;
+  std::vector<MercBlendData> blend_data;
  // (blend-ctrl       merc-blend-ctrl        :offset-assert 12) ??
+  std::vector<MercBlendCtrl> blend_ctrl;
  // (dummy0           uint8                  :offset-assert 16) ??
  u8 effect_bits;
  u16 frag_count;
@ -216,7 +243,6 @@ struct MercCtrl {
  std::vector<MercEffect> effects;

  void from_ref(TypedRef tr, const DecompilerTypeSystem& dts, GameVersion version);
-  void debug_print_blerc();
  std::string print();
 };
 }  // namespace decompiler
--- a/decompiler/level_extractor/extract_merc.cpp
+++ b/decompiler/level_extractor/extract_merc.cpp
@ -1,6 +1,7 @@
 #include "extract_merc.h"

 #include "common/log/log.h"
+#include "common/util/BitUtils.h"
 #include "common/util/FileUtil.h"
 #include "common/util/colors.h"
 #include "common/util/string_util.h"
@ -103,6 +104,40 @@ struct MercUnpackedVtx {
  int frag = -1;
 };

+/*!
+ * A single vertex in a single blend shape target.
+ * This is used to store both the base position and offsets.
+ * These integers are for the EE-format merc int data, before VIF/VU processing.
+ * The bytes are always held as s8 here; when this is a base position they are reinterpreted as
+ * unsigned during the conversion to float.
+ */
+struct BlercVtxIntTarget {
+  math::Vector<s8, 3> pos;
+  math::Vector<s8, 3> nrm;
+  u8 idx = 0;  // if this is a non-base vertex (an offset), the target index
+
+  /*!
+   * Are all components zero? (if so, this vertex is not affected by this target).
+   * Checks both the position and the normal.
+   */
+  bool all_zero_data() const {
+    return pos == math::Vector<s8, 3>::zero() && nrm == math::Vector<s8, 3>::zero();
+  }
+};
+
+/*!
+ * A single vertex and all of its blend targets.
+ * One of these is created for every blend vertex of every blend fragment in an effect.
+ */
+struct BlercVtxInt {
+  // the base position. If blend shapes are off, or weights all 0, then this position is used.
+  BlercVtxIntTarget base;
+  // the offsets for each target. Stored sparsely so targets that don't move the vertex
+  // from the base position aren't stored.
+  std::vector<BlercVtxIntTarget> targets;
+  // the index of this vertex in the "lump4" EE-format merc data, before samecopy/crosscopy
+  // processing. This is the only key for matching a blerc vertex to a merc vertex.
+  u32 lump4_addr = 0;
+  // the floating point offset specified in the fp_header that will be applied during rendering.
+  math::Vector3f pos_offset;
+};
+
 /*!
 * An entire merc-effect, split into draws.
 * Note that copied or multiply-placed vertices will be de-deduplicated, but not identical vertices
@ -119,6 +154,12 @@ struct ConvertedMercEffect {
  DrawMode envmap_mode;
  u32 envmap_texture;
  std::optional<s8> eye_slot;
+  float pos_scale = 0;
+
+  // note: these vertices are _not_ in the same order as vertices.
+  // these are in the order they appeared in EE-memory. Some vertices may not use blerc.
+  // the only way to map these to other vertices is to use lump4_addrs
+  std::vector<BlercVtxInt> blerc_vertices_i;
 };

 /*!
@ -767,6 +808,8 @@ ConvertedMercEffect convert_merc_effect(const MercEffect& input_effect,
  ConvertedMercEffect result;
  result.ctrl_idx = ctrl_idx;
  result.effect_idx = effect_idx;
+  result.pos_scale = ctrl_header.xyz_scale;
+
  if (ctrl_header.eye_ctrl) {
    result.eye_slot = ctrl_header.eye_ctrl->eye_slot;
  }
@ -1012,6 +1055,82 @@ ConvertedMercEffect convert_merc_effect(const MercEffect& input_effect,
    memory_buffer_toggle ^= 1;
  }

+  // process blend fragments. this loop combines all fragments in this effect, and unpacks the int
+  // data.
+  // each blend fragment corresponds to a normal fragment.
+  // the vertices also match up, but the blend fragment may be smaller, allowing them to skip
+  // vertices at the end of fragments.
+  // (also, there can be fewer blend frags, and the remaining frags just don't have blerc)
+  int base_blend_out = 0;
+  for (size_t i = 0; i < input_effect.blend_frag_count; i++) {
+    // these three structures are all associated:
+    auto& bc = input_effect.blend_ctrl.at(i);
+    auto& bd = input_effect.blend_data.at(i);
+    auto& f = input_effect.frag_geo.at(i).fp_header;
+
+    // index in blend int data.
+    int bdi = 0;
+
+    size_t original_size = result.blerc_vertices_i.size();
+    result.blerc_vertices_i.resize(original_size + bc.blend_vtx_count);
+    auto* out_vertices = &result.blerc_vertices_i[original_size];
+
+    // the base position of this vertex.
+    for (int vi = 0; vi < bc.blend_vtx_count; vi++) {
+      auto& out_vertex = out_vertices[vi];
+      out_vertex.lump4_addr = base_blend_out + vi;
+      out_vertex.pos_offset = math::Vector3f(f.x_add, f.y_add, f.z_add);
+      auto& vc = out_vertex.base;
+      vc.nrm.x() = bd.u8_data.at(bdi++);
+      vc.pos.x() = bd.u8_data.at(bdi++);
+
+      vc.nrm.y() = bd.u8_data.at(bdi++);
+      vc.pos.y() = bd.u8_data.at(bdi++);
+
+      vc.nrm.z() = bd.u8_data.at(bdi++);
+      vc.pos.z() = bd.u8_data.at(bdi++);
+    }
+    // align16 for DMA (transferred per group)
+    bdi = align16(bdi);
+
+    // next, add the offsets for each target that influences this fragment.
+    for (size_t ti = 0; ti < bc.bt_index.size(); ti++) {
+      if (bc.bt_index[ti] == 0) {
+        // this fragment isn't used by this target, skip it.
+        // (they also don't store the offsets for this)
+        continue;
+      }
+
+      for (int vi = 0; vi < bc.blend_vtx_count; vi++) {
+        BlercVtxIntTarget vc;
+        vc.idx = ti;
+
+        vc.nrm.x() = bd.u8_data.at(bdi++);
+        vc.pos.x() = bd.u8_data.at(bdi++);
+
+        vc.nrm.y() = bd.u8_data.at(bdi++);
+        vc.pos.y() = bd.u8_data.at(bdi++);
+
+        vc.nrm.z() = bd.u8_data.at(bdi++);
+        vc.pos.z() = bd.u8_data.at(bdi++);
+
+        // some vertices within a fragment may not use all the targets of that fragment.
+        // detect this and skip adding the 0 offsets, so the downstream stuff can skip adding
+        // this to the file.
+        if (!vc.all_zero_data()) {
+          out_vertices[vi].targets.push_back(vc);
+        }
+      }
+      // for DMA
+      bdi = align16(bdi);
+    }
+    // we should have processed all the u8 data
+    ASSERT((size_t)align16(bdi) == bd.u8_data.size());
+
+    // skip over vertices that don't have blend.
+    base_blend_out += result.verts_per_frag.at(i);
+  }
+
  if (dump) {
    auto file_path = file_util::get_file_path(
        {"debug_out/merc", fmt::format("{}_{}.ply", debug_name, effect_idx)});
@ -1063,6 +1182,149 @@ struct VertexSourceInfo {
  int flump4;
 };

+// ND used VIF to do int->float conversion.
+// This is a little tricky because you can only do an int + int, then interpret that as a float.
+// This resulting conversion is linear (for some range of input):
+// out = in * scale + offset
+
+// where scale and offset are the values returned below (assuming in is small)
+
+float magic_float_scale(u32 base_val) {
+  // reinterpret the bits of base_val and of base_val + 1 as floats. The difference between the
+  // two is how much the float changes per integer step, which is the scale of the conversion.
+  const u32 next_val = base_val + 1;
+  float at_base, at_next;
+  memcpy(&at_base, &base_val, sizeof(float));
+  memcpy(&at_next, &next_val, sizeof(float));
+  return at_next - at_base;
+}
+
+float magic_float_offset(u32 base_val) {
+  // the offset is the bits of base_val reinterpreted as a float: the output for an input of 0.
+  float as_float;
+  memcpy(&as_float, &base_val, sizeof(as_float));
+  return as_float;
+}
+
+/*!
+ * The base position of a vertex, or its offset for a single target, stored as floats.
+ */
+struct BlercVtxFloatTarget {
+  math::Vector3f pos;
+  math::Vector3f nrm;
+  // if an offset, the index of the target we belong to. Defaults to 0 (like the int version)
+  // so a default-constructed value never carries an indeterminate index.
+  u8 idx = 0;
+};
+
+/*!
+ * A single vertex (floating point), and all the offsets for all the targets it uses.
+ * These floats are exactly the floats used by the merc2 renderer (pre-bones), so they can
+ * be uploaded into mod buffers directly.
+ * A vertex whose dest is still negative when packing is dropped from the packed blerc data.
+ */
+struct BlercVtxFloat {
+  BlercVtxFloatTarget base;                  // position/normal with all weights at zero
+  std::vector<BlercVtxFloatTarget> targets;  // sparse: only targets that move this vertex
+  s32 dest = -1;  // the index of this vertex in the mod buffer (PC vertex ordering), -1 if none
+};
+
+/*!
+ * Convert a vertex offset (or base position) from int format (EE lump) to floating point.
+ * - in: the packed bytes for the base position or for one target's offset.
+ * - pos_offset: the floating point offset from the fp header. Only used when is_base is set.
+ * - pos_scale: the xyz-scale of the model. Applied to positions only, never to normals.
+ * - is_base: if set, the bytes are treated as unsigned and the constant offsets are added.
+ *   Otherwise the bytes are treated as signed and only the scale is applied.
+ * The target index of the input is copied to the result unchanged.
+ */
+BlercVtxFloatTarget blerc_vertex_convert(const BlercVtxIntTarget& in,
+                                         const math::Vector<float, 3>& pos_offset,
+                                         float pos_scale,
+                                         bool is_base) {
+  BlercVtxFloatTarget result;
+  result.idx = in.idx;
+
+  // scale factors to apply. these integers match the row value for VIF.
+  float pos_total_scale = magic_float_scale(0x4b010000) * pos_scale;  // float step here is 1.0
+  float nrm_total_scale = magic_float_scale(0x47800000);              // float step here is 1/128
+
+  if (is_base) {
+    // the EE assembly was:
+    // pextlb t6, r0, t6 to get u16's from the packed u8's, so these should be treated as unsigned.
+    result.nrm = in.nrm.cast<u8>().cast<float>() * nrm_total_scale;
+    result.pos = in.pos.cast<u8>().cast<float>() * pos_total_scale;
+
+    // also include the floating point offset from...
+    math::Vector3f post_pos_off;
+    post_pos_off.fill(magic_float_offset(0x4b010000));  // the vif integer add
+    post_pos_off += pos_offset;                         // the offset in the fp header
+    post_pos_off *= pos_scale;                          // and scale this by xyz-scale.
+
+    math::Vector3f nrm_off;
+    nrm_off.fill(magic_float_offset(0x47800000) - 65537.f);  // 65537.f is part of MERC
+
+    result.pos += post_pos_off;
+    result.nrm += nrm_off;
+  } else {
+    // in the target case, the s8's were sign extended, so cast from s8 -> float directly.
+    // all the offsets are applied after the sum, so we don't need to include them here
+    // (we include them once in the base).
+    // pextlb t5, t5, r0
+    // psrah t5, t5, 8
+    result.pos = in.pos.cast<float>() * pos_total_scale;
+    result.nrm = in.nrm.cast<float>() * nrm_total_scale;
+  }
+  return result;
+}
+
+/*!
+ * Convert a vertex and its targets from int format to floating point. The floating point format
+ * here matches the PC merc float format exactly.
+ * The base is converted with the base rules (unsigned bytes, offsets included) and every target
+ * with the offset rules (signed bytes, scale only). The dest of the result is left at its
+ * default and must be filled in by the caller.
+ */
+BlercVtxFloat blerc_vertex_convert(const BlercVtxInt& in, float pos_scale) {
+  BlercVtxFloat result;
+  result.base = blerc_vertex_convert(in.base, in.pos_offset, pos_scale, true);
+  // exactly one output per input target, so size the vector once up front.
+  result.targets.reserve(in.targets.size());
+  for (const auto& t : in.targets) {
+    result.targets.push_back(blerc_vertex_convert(t, in.pos_offset, pos_scale, false));
+  }
+  return result;
+}
+
+/*!
+ * Build the PC blerc representation of one position/normal pair (a base or a target offset).
+ * Both vectors are multiplied by scale and written as [x, y, z, 0, nx, ny, nz, 0]. The zero lanes
+ * are padding, so this matches the GPU vertex format and keeps each vector 16-byte aligned.
+ */
+tfrag3::BlercFloatData to_float_data(const math::Vector3f& pos,
+                                     const math::Vector3f& nrm,
+                                     float scale) {
+  tfrag3::BlercFloatData packed;
+  float* lane = packed.v;
+
+  // scaled position, followed by its padding lane
+  *lane++ = pos.x() * scale;
+  *lane++ = pos.y() * scale;
+  *lane++ = pos.z() * scale;
+  *lane++ = 0;
+
+  // scaled normal, followed by its padding lane
+  *lane++ = nrm.x() * scale;
+  *lane++ = nrm.y() * scale;
+  *lane++ = nrm.z() * scale;
+  *lane++ = 0;
+
+  return packed;
+}
+
+/*!
+ * Pack floating point vertices to the format for PC blerc.
+ * The layout is vertex-major: for each vertex, its base followed by its target offsets go in
+ * the float data, and its target indices, a terminator, and its dest go in the int data.
+ * An outer loop over vertices is probably not the best design, but it still beats the
+ * unoptimized version by a few times.
+ */
+tfrag3::Blerc blerc_pack(const std::vector<BlercVtxFloat>& verts) {
+  tfrag3::Blerc packed;
+  auto& floats_out = packed.float_data;
+  auto& ints_out = packed.int_data;
+
+  for (const auto& vert : verts) {
+    // this check is weird, but it discards blerc vertices that don't map to any PC mod vertex.
+    // why does this happen? I'm not sure. It only happens on 3 vertices in metalkor, crocadog,
+    // and kor. It could be that merc has some extra vertices that are unpacked twice.
+    if (vert.dest < 0) {
+      continue;
+    }
+
+    // base is multiplied by 8192, then left shifted by 13, so it cancels
+    floats_out.push_back(to_float_data(vert.base.pos, vert.base.nrm, 1.f));
+
+    // target is multiplied by "weight", then left shifted by 13. So full weight is 8192:
+    for (const auto& target : vert.targets) {
+      ints_out.push_back(target.idx);
+      floats_out.push_back(to_float_data(target.pos, target.nrm, 1.f / 8192.f));
+    }
+
+    // close this vertex's record: end of the target list, then where the result goes.
+    ints_out.push_back(tfrag3::Blerc::kTargetIdxTerminator);
+    ints_out.push_back(vert.dest);
+  }
+  return packed;
+}
+
 void create_modifiable_vertex_data(
    const std::vector<bool>& vtx_mod_flag,
    const std::vector<VertexSourceInfo>& vtx_srcs,
@ -1081,9 +1343,7 @@ void create_modifiable_vertex_data(
  // In this modifiable draw path, there will be a list of "fixed draws", which draw vertices that
  // cannot be modified. This set is known at build-time.
  // The "mod draws" will draw the modifiable vertices. These use the normal index buffer, but
-  // index into a per-effect modifiable vertex buffer.
-
-  //  std::vector<tfrag3::MercDraw> fixed_draws, mod_draws;
+  // index into a per-effect modifiable vertex buffer, not the giant per-FR3 merc vertex buffer.

  // some stats
  int num_tris = 0;  // all triangles
@ -1100,6 +1360,9 @@ void create_modifiable_vertex_data(

      std::vector<std::vector<u32>> inds_per_mod_draw;

+      // loop over draw calls within this effect, and determine if it's fixed, modifiable, or needs
+      // to be split up. For mod draws, this just figures out which vertices go with the draw,
+      // using the indices into the original vertex buffer.
      for (const auto& draw : effect.all_draws) {
        num_tris += draw.num_triangles;

@ -1123,18 +1386,23 @@ void create_modifiable_vertex_data(
          // nothing found at all, bad
          ASSERT_NOT_REACHED();
        } else if (found_fixed && !found_mod) {
-          // only fixed. can just copy the fixed draw
+          // only fixed. can just copy the fixed draw. This can reuse the index buffer data
+          // we already added for this effect.
          effect.mod.fix_draw.push_back(draw);
        } else if (found_mod && !found_fixed) {
-          // only mod
+          // only mod. Add the entire draw
          effect.mod.mod_draw.push_back(draw);
+          // remember the indices _in the main buffer_ for these vertices.
          auto& inds_out = inds_per_mod_draw.emplace_back();
          for (u32 i = 0; i < draw.index_count; i++) {
            inds_out.push_back(out.indices.at(draw.first_index + i));
          }
          mod_tris += draw.num_triangles;
        } else {
-          // it's a mix...
+          // it's a mix and needs to be split per strip. Strips containing any mod vertices
+          // go in the mod category.
+
+          // build strips as lists of vertex indices.
          std::vector<std::vector<u32>> strips;
          strips.emplace_back();
          for (u32 i = 0; i < draw.index_count; i++) {
@ -1148,9 +1416,11 @@ void create_modifiable_vertex_data(
            }
          }

+          // create the two draws
          tfrag3::MercDraw mod = draw;
          tfrag3::MercDraw fix = draw;
          std::vector<u32> mod_ind, fix_ind;
+          // iterate over strips and add them to the right one
          for (auto& strip : strips) {
            bool strip_has_mod = false;
            for (auto ind : strip) {
@ -1185,27 +1455,49 @@ void create_modifiable_vertex_data(
        effect.mod.fix_draw.clear();
      } else {
        effect.has_mod_draw = true;
-        // need to set up the vertex buffer for the modifiable draws
-        // map of original vertex indices to mod buffer index
+        // In this second pass, we need to build the actual vertex buffer and index buffer for
+        // the mod draws.
+
+        // the renderer has some optimizations where it can decide to use the default value,
+        // instead of reading from the game, for some subset of the mod vertices.
+
+        // map of original vertices to slot in the mod vtx buffer.
        std::unordered_map<u32, u32> vtx_to_mod_vtx;
+        // loop over mod draws
        for (size_t mdi = 0; mdi < effect.mod.mod_draw.size(); mdi++) {
          auto& draw = effect.mod.mod_draw[mdi];
+          // indices into the normal vertex buffer for this draw
          auto& orig_inds = inds_per_mod_draw.at(mdi);
+
+          // we'll be adding indices to the end of the main index buffer.
+          // these never change, so we want them in the big buffer loaded with the fr3.
          u32 new_first_index = out.indices.size();
+
+          // loop over indices into the normal vertex buffer
          for (auto vidx : orig_inds) {
            if (vidx == UINT32_MAX) {
              out.indices.push_back(UINT32_MAX);
              continue;  // strip restart
            }
+
+            // see if we've already got a copy of this vertex in the mod buffer
            const auto& existing = vtx_to_mod_vtx.find(vidx);
            if (existing == vtx_to_mod_vtx.end()) {
-              // add vertex to mod buffer
+              // nope, add vertex to mod buffer
              auto idx = effect.mod.vertices.size();
+              // remember we did this one already
              vtx_to_mod_vtx[vidx] = idx;
+              // add the vertex
              effect.mod.vertices.push_back(out.vertices.at(vidx));
+              // look up where this one came from in the EE memory layout
              auto src = vtx_srcs.at(vidx - first_out_vertex);
              ASSERT(src.combined_lump4 < UINT16_MAX);
+              // add the EE layout index of this vertex to the data, so the runtime
+              // knows how to map from EE data to the mod vertex buffer
              effect.mod.vertex_lump4_addr.push_back(src.combined_lump4);
+
+              // also flag that this fragment has modifiable vertices: we want to know
+              // which ones are safe to skip.
              u32 frag_idx = src.frag;
              if (frag_idx >= effect.mod.fragment_mask.size()) {
                effect.mod.fragment_mask.resize(frag_idx + 1);
@ -1213,14 +1505,56 @@ void create_modifiable_vertex_data(
              effect.mod.fragment_mask[frag_idx] = true;
              out.indices.push_back(idx);
            } else {
+              // already added this vertex, just reuse it.
              out.indices.push_back(existing->second);
            }
          }
          draw.first_index = new_first_index;
        }

-        // splice out masked fragments, the renderer won't index them
-        const auto& frag_counts = all_effects.at(mi - first_out_model).at(ei).verts_per_frag;
+        const auto& og_effect = all_effects.at(mi - first_out_model).at(ei);
+
+        // blerc! The blerc vertex indexing is totally different, so track which blerc vertex
+        // goes with the lump4 (ee layout addr).
+        std::vector<s32> which_blerc_is_at_this_lump4;
+        std::vector<BlercVtxFloat> blerc_floats;
+
+        // convert blerc from int to float, and fill out the lump4 map.
+        for (size_t i = 0; i < og_effect.blerc_vertices_i.size(); i++) {
+          auto& bvi = og_effect.blerc_vertices_i[i];
+          if (bvi.lump4_addr >= which_blerc_is_at_this_lump4.size()) {
+            which_blerc_is_at_this_lump4.resize(bvi.lump4_addr + 1, -1);
+          }
+          which_blerc_is_at_this_lump4[bvi.lump4_addr] = i;
+          blerc_floats.push_back(blerc_vertex_convert(bvi, og_effect.pos_scale));
+        }
+
+        // the second part of blerc mapping - tell each vertex where it goes in the
+        // mod vertex buffer. This way we don't really care about the order of blerc vertices,
+        // and we don't have to consider lump4 at all in the renderer.
+
+        // loop over all mod vertices
+        for (u32 vi = 0; vi < effect.mod.vertices.size(); vi++) {
+          // figure out its lump4.
+          u16 la = effect.mod.vertex_lump4_addr[vi];
+          ASSERT(la < UINT16_MAX);
+          // check if there's a blerc modifier for this vertex
+          if (la < which_blerc_is_at_this_lump4.size()) {
+            s32 bi = which_blerc_is_at_this_lump4.at(la);
+            if (bi >= 0) {
+              // there is! remember this dest.
+              blerc_floats[bi].dest = vi;
+            }
+          }
+        }
+        effect.mod.blerc = blerc_pack(blerc_floats);
+
+        // this next section is a bit of a hack: the renderer loops over fragments,
+        // we know ahead of time that some fragments have no modifiable vertices.
+        // we'd like the renderer to just skip over this fragment, and not worry about
+        // how many vertices it has. So we effectively splice out the disabled fragments indices.
+        // this means that the "combined lump4" counter will skip over these fragments.
+        const auto& frag_counts = og_effect.verts_per_frag;
        std::unordered_map<u32, u32> old_to_new;
        u32 old_idx = 0;
        u32 new_idx = 0;
--- a/game/common/vu.h
+++ b/game/common/vu.h
@ -461,6 +461,10 @@ struct alignas(16) Vf {
 struct alignas(16) Accumulator {
  float data[4];

+  std::string print() const {  // format the four elements of data, separated by spaces
+    return fmt::format("{} {} {} {}", data[0], data[1], data[2], data[3]);
+  }
+
  void adda(Mask mask, const Vf& a, float b) {
    for (int i = 0; i < 4; i++) {
      if ((u64)mask & (1 << i)) {
--- a/game/graphics/opengl_renderer/foreground/Merc2.cpp
+++ b/game/graphics/opengl_renderer/foreground/Merc2.cpp
@ -1,5 +1,7 @@
 #include "Merc2.h"

+#include <xmmintrin.h>
+
 #include "common/global_profiler/GlobalProfiler.h"

 #include "game/graphics/opengl_renderer/EyeRenderer.h"
@ -103,11 +105,280 @@ Merc2::~Merc2() {
  glDeleteVertexArrays(1, &m_vao);
 }

+/*!
+ * Apply blerc (blend shape) target offsets to the modifiable vertices, on the CPU.
+ * Despite the name, this uses 128-bit SSE intrinsics (__m128, _mm_*), not AVX.
+ *
+ * The int stream [i_data, i_data_end) holds one record per vertex:
+ *   zero or more target indices, kTargetIdxTerminator, then the destination index into out.
+ * The float stream holds, per vertex, one base entry followed by one entry per target index.
+ * In every entry v[0..3] is the position part and v[4..7] is the normal part.
+ * Loads and stores use the aligned intrinsics, so both the float entries and the pos/normal
+ * fields of the output vertices must be 16-byte aligned.
+ *
+ * weights must have Merc2::kMaxBlerc entries. Each is scaled by multiplier before use.
+ */
+void blerc_avx(const u32* i_data,
+               const u32* i_data_end,
+               const tfrag3::BlercFloatData* floats,
+               const float* weights,
+               tfrag3::MercVertex* out,
+               float multiplier) {
+  // Broadcast each scaled weight to all four lanes once, up front. Loading 16 bytes from this
+  // table in the inner loop is faster than loading a float and broadcasting it there.
+  __m128 splat_weights[Merc2::kMaxBlerc];
+  for (int target = 0; target < Merc2::kMaxBlerc; target++) {
+    splat_weights[target] = _mm_set1_ps(weights[target] * multiplier);
+  }
+
+  // one outer iteration per vertex record
+  for (const u32* cursor = i_data; cursor != i_data_end; cursor++) {
+    // start from the base position/normal
+    __m128 pos_sum = _mm_load_ps(floats->v);
+    __m128 nrm_sum = _mm_load_ps(floats->v + 4);
+    floats++;
+
+    // accumulate weight * offset for each target listed for this vertex
+    for (; *cursor != tfrag3::Blerc::kTargetIdxTerminator; cursor++, floats++) {
+      const __m128 weight = splat_weights[*cursor];
+      const __m128 pos_offset = _mm_mul_ps(_mm_load_ps(floats->v), weight);
+      const __m128 nrm_offset = _mm_mul_ps(_mm_load_ps(floats->v + 4), weight);
+      pos_sum = _mm_add_ps(pos_sum, pos_offset);
+      nrm_sum = _mm_add_ps(nrm_sum, nrm_offset);
+    }
+
+    // step over the terminator; the next word is the destination vertex index.
+    cursor++;
+    tfrag3::MercVertex& dest = out[*cursor];
+    _mm_store_ps(dest.pos, pos_sum);
+    _mm_store_ps(dest.normal, nrm_sum);
+    // the loop increment then steps past the destination index to the next record.
+  }
+}
+namespace {
+float blerc_multiplier = 1.f;  // debug scale applied to every blerc weight in blerc_avx; set from the "blerc-nightmare" slider in the debug window.
+}
+
+/*!
+ * Build and upload the blerc-modified vertices for every effect of a model that has mod draws.
+ * For each such effect: grab a mod vertex buffer, copy the model's base mod vertices into the
+ * temp buffer, apply the blerc offsets using blerc_weights, then upload the result to OpenGL.
+ * The buffer used for effect i is recorded in mod_opengl_buffers[i].
+ */
+void Merc2::model_mod_blerc_draws(int num_effects,
+                                  const tfrag3::MercModel* model,
+                                  const LevelData* lev,
+                                  ModBuffers* mod_opengl_buffers,
+                                  const float* blerc_weights) {
+  for (int effect_idx = 0; effect_idx < num_effects; effect_idx++) {
+    const auto& mod = model->effects[effect_idx].mod;
+    if (mod.mod_draw.empty()) {
+      // nothing modifiable in this effect.
+      continue;
+    }
+
+    // reserve an opengl buffer and remember it for the draw setup later.
+    const auto gl_buffers = alloc_mod_vtx_buffer(lev);
+    mod_opengl_buffers[effect_idx] = gl_buffers;
+
+    // the temp buffer is fixed size, make sure the result fits.
+    const auto vertex_count = mod.vertices.size();
+    if (vertex_count > MAX_MOD_VTX) {
+      fmt::print("More mod vertices than MAX_MOD_VTX. {} > {}\n", vertex_count, MAX_MOD_VTX);
+      ASSERT_NOT_REACHED();
+    }
+    const auto vertex_bytes = sizeof(tfrag3::MercVertex) * vertex_count;
+
+    // seed with the unmodified vertices; blerc only overwrites position and normal.
+    memcpy(m_mod_vtx_temp.data(), mod.vertices.data(), vertex_bytes);
+
+    // apply the blend shape offsets in place.
+    const auto& blerc = mod.blerc;
+    const u32* ints_begin = blerc.int_data.data();
+    const u32* ints_end = ints_begin + blerc.int_data.size();
+    blerc_avx(ints_begin, ints_end, blerc.float_data.data(), blerc_weights, m_mod_vtx_temp.data(),
+              blerc_multiplier);
+
+    // send to the GPU and count it in the stats.
+    m_stats.num_uploads++;
+    m_stats.num_upload_bytes += vertex_bytes;
+    glBindBuffer(GL_ARRAY_BUFFER, gl_buffers.vertex);
+    glBufferData(GL_ARRAY_BUFFER, vertex_bytes, m_mod_vtx_temp.data(), GL_DYNAMIC_DRAW);
+  }
+}
+
 // We can run into a problem where adding a PC model would overflow the
 // preallocated draw/bone buffers.
 // So we break this part into two functions:
 // - init_pc_model, which doesn't allocate bones/draws

+/*!
+ * Rebuild the modifiable vertices of each effect from the merc fragment data that the game
+ * has (possibly) modified in its own memory, and upload them to OpenGL.
+ *
+ * input_data holds one u32 GOAL address per effect, pointing at the merc effect. setup is used
+ * to convert GOAL addresses to host pointers. The buffer used for effect i is recorded in
+ * mod_opengl_buffers[i]. Effects with no mod draws are skipped.
+ */
+void Merc2::model_mod_draws(int num_effects,
+                            const tfrag3::MercModel* model,
+                            const LevelData* lev,
+                            const u8* input_data,
+                            const DmaTransfer& setup,
+                            ModBuffers* mod_opengl_buffers) {
+  auto p = scoped_prof("update-verts");
+
+  // loop over effects. Mod vertices are done per effect (possibly a bad idea?)
+  for (int ei = 0; ei < num_effects; ei++) {
+    const auto& effect = model->effects[ei];
+    // some effects might have no mod draw info, and no modifiable vertices
+    if (effect.mod.mod_draw.empty()) {
+      continue;
+    }
+
+    prof().begin_event("start1");
+    // grab opengl buffer
+    auto opengl_buffers = alloc_mod_vtx_buffer(lev);
+    mod_opengl_buffers[ei] = opengl_buffers;
+
+    // check that we have enough room for the finished thing.
+    if (effect.mod.vertices.size() > MAX_MOD_VTX) {
+      fmt::print("More mod vertices than MAX_MOD_VTX. {} > {}\n", effect.mod.vertices.size(),
+                 MAX_MOD_VTX);
+      ASSERT_NOT_REACHED();
+    }
+
+    // check that we have enough room for unpack
+    if (effect.mod.expect_vidx_end > MAX_MOD_VTX) {
+      fmt::print("More mod vertices (temp) than MAX_MOD_VTX. {} > {}\n", effect.mod.expect_vidx_end,
+                 MAX_MOD_VTX);
+      ASSERT_NOT_REACHED();
+    }
+
+    // start with the "correct" vertices from the model data:
+    memcpy(m_mod_vtx_temp.data(), effect.mod.vertices.data(),
+           sizeof(tfrag3::MercVertex) * effect.mod.vertices.size());
+
+    // get pointers to the fragment and fragment control data.
+    // ee0 is the host pointer corresponding to GOAL address 0.
+    u32 goal_addr;
+    memcpy(&goal_addr, input_data + 4 * ei, 4);
+    const u8* ee0 = setup.data - setup.data_offset;
+    const u8* merc_effect = ee0 + goal_addr;
+    u16 frag_cnt;
+    memcpy(&frag_cnt, merc_effect + 18, 2);
+    ASSERT(frag_cnt >= effect.mod.fragment_mask.size());
+    u32 frag_goal;
+    memcpy(&frag_goal, merc_effect, 4);
+    u32 frag_ctrl_goal;
+    memcpy(&frag_ctrl_goal, merc_effect + 4, 4);
+    const u8* frag = ee0 + frag_goal;
+    const u8* frag_ctrl = ee0 + frag_ctrl_goal;
+
+    // loop over frags
+    u32 vidx = 0;
+    // u32 st_vif_add = model->st_vif_add;
+    float xyz_scale = model->xyz_scale;
+    prof().end_event();
+    {
+      // we're going to look at data that the game may be modifying.
+      // in the original game, they didn't have any lock, but I think that the
+      // scratchpad access from the EE would effectively block the VIF1 DMA, so you'd
+      // hopefully never get a partially updated model (which causes obvious holes).
+      // this lock is not ideal, and can block the rendering thread while blerc_execute runs,
+      // which can take up to 2ms on really blerc-heavy scenes
+      std::unique_lock<std::mutex> lk(g_merc_data_mutex);
+      // named differently from the outer profiler scope so it doesn't shadow it.
+      auto math_prof = scoped_prof("vert-math");
+
+      // loop over fragments
+      for (u32 fi = 0; fi < effect.mod.fragment_mask.size(); fi++) {
+        u8 mat_xfer_count = frag_ctrl[3];
+
+        // we create a mask of fragments to skip because they have no vertices.
+        // the indexing data assumes that we skip the other fragments.
+        if (effect.mod.fragment_mask[fi]) {
+          // read fragment metadata
+          u8 unsigned_four_count = frag_ctrl[0];
+          u8 lump_four_count = frag_ctrl[1];
+          u32 mm_qwc_off = frag[10];
+          float float_offsets[3];
+          memcpy(float_offsets, &frag[mm_qwc_off * 16], 12);
+          u32 my_u4_count = ((unsigned_four_count + 3) / 4) * 16;
+          u32 my_l4_count = my_u4_count + ((lump_four_count + 3) / 4) * 16;
+
+          // loop over vertices in the fragment and unpack. Each vertex uses 3 words.
+          // The bound is written as "w + 2 < end" rather than "w < end - 2" so that an empty
+          // fragment (end == 0) does not wrap around in unsigned math and run off the buffers.
+          for (u32 w = my_u4_count / 4; w + 2 < my_l4_count / 4; w += 3) {
+            // positions
+            u32 q0w = 0x4b010000 + frag[w * 4 + (0 * 4) + 3];
+            u32 q1w = 0x4b010000 + frag[w * 4 + (1 * 4) + 3];
+            u32 q2w = 0x4b010000 + frag[w * 4 + (2 * 4) + 3];
+
+            // normals
+            u32 q0z = 0x47800000 + frag[w * 4 + (0 * 4) + 2];
+            u32 q1z = 0x47800000 + frag[w * 4 + (1 * 4) + 2];
+            u32 q2z = 0x47800000 + frag[w * 4 + (2 * 4) + 2];
+
+            // uvs
+            u32 q2x = model->st_vif_add + frag[w * 4 + (2 * 4) + 0];
+            u32 q2y = model->st_vif_add + frag[w * 4 + (2 * 4) + 1];
+
+            // the integers above are float bit patterns: reinterpret, then offset/scale.
+            auto* pos_array = m_mod_vtx_unpack_temp[vidx].pos;
+            memcpy(&pos_array[0], &q0w, 4);
+            memcpy(&pos_array[1], &q1w, 4);
+            memcpy(&pos_array[2], &q2w, 4);
+            pos_array[0] += float_offsets[0];
+            pos_array[1] += float_offsets[1];
+            pos_array[2] += float_offsets[2];
+            pos_array[0] *= xyz_scale;
+            pos_array[1] *= xyz_scale;
+            pos_array[2] *= xyz_scale;
+
+            auto* nrm_array = m_mod_vtx_unpack_temp[vidx].nrm;
+            memcpy(&nrm_array[0], &q0z, 4);
+            memcpy(&nrm_array[1], &q1z, 4);
+            memcpy(&nrm_array[2], &q2z, 4);
+            nrm_array[0] += -65537;
+            nrm_array[1] += -65537;
+            nrm_array[2] += -65537;
+
+            auto* uv_array = m_mod_vtx_unpack_temp[vidx].uv;
+            memcpy(&uv_array[0], &q2x, 4);
+            memcpy(&uv_array[1], &q2y, 4);
+            uv_array[0] += model->st_magic;
+            uv_array[1] += model->st_magic;
+
+            vidx++;
+          }
+        }
+
+        // next control
+        frag_ctrl += 4 + 2 * mat_xfer_count;
+
+        // next frag
+        u32 mm_qwc_count = frag[11];
+        frag += mm_qwc_count * 16;
+      }
+
+      // sanity check
+      if (effect.mod.expect_vidx_end != vidx) {
+        fmt::print("---------- BAD {}/{}\n", effect.mod.expect_vidx_end, vidx);
+        ASSERT(false);
+      }
+    }
+
+    {
+      auto pp = scoped_prof("copy");
+      // now copy the data in merc original vertex order to the output.
+      // vertices whose lump4 address is outside the unpacked range keep their model data.
+      for (u32 vi = 0; vi < effect.mod.vertices.size(); vi++) {
+        u32 addr = effect.mod.vertex_lump4_addr[vi];
+        if (addr < vidx) {
+          memcpy(&m_mod_vtx_temp[vi], &m_mod_vtx_unpack_temp[addr], 32);
+          m_mod_vtx_temp[vi].st[0] = m_mod_vtx_unpack_temp[addr].uv[0];
+          m_mod_vtx_temp[vi].st[1] = m_mod_vtx_unpack_temp[addr].uv[1];
+        }
+      }
+    }
+
+    // and upload to GPU
+    m_stats.num_uploads++;
+    m_stats.num_upload_bytes += effect.mod.vertices.size() * sizeof(tfrag3::MercVertex);
+    {
+      auto pp = scoped_prof("update-verts-upload");
+      glBindBuffer(GL_ARRAY_BUFFER, opengl_buffers.vertex);
+      glBufferData(GL_ARRAY_BUFFER, effect.mod.vertices.size() * sizeof(tfrag3::MercVertex),
+                   m_mod_vtx_temp.data(), GL_DYNAMIC_DRAW);
+    }
+  }
+}
+
 /*!
 * Setup draws for a model, given the DMA data generated by the GOAL code.
 */
@ -256,8 +527,15 @@ void Merc2::handle_pc_model(const DmaTransfer& setup,
  u64 current_effect_enable_bits = flags->enable_mask;       // mask for game to disable an effect
  bool model_uses_mod = flags->bitflags & 1;  // if we should update vertices from game.
  bool model_disables_fog = (flags->bitflags & 2);
+  bool model_uses_pc_blerc = flags->bitflags & 4;
  input_data += 32;

+  float blerc_weights[kMaxBlerc];
+  if (model_uses_pc_blerc) {
+    memcpy(blerc_weights, input_data, kMaxBlerc * sizeof(float));
+    input_data += kMaxBlerc * sizeof(float);
+  }
+
  // Next is "fade data", indicating the color/intensity of envmap effect
  u8 fade_buffer[4 * kMaxEffect];
  for (int ei = 0; ei < num_effects; ei++) {
@ -271,171 +549,10 @@ void Merc2::handle_pc_model(const DmaTransfer& setup,

  // will hold opengl buffers for the updated vertices
  ModBuffers mod_opengl_buffers[kMaxEffect];
-  if (model_uses_mod) {  // only if we've enabled, this path is slow.
-    auto p = scoped_prof("update-verts");
-
-    // loop over effects. Mod vertices are done per effect (possibly a bad idea?)
-    for (int ei = 0; ei < num_effects; ei++) {
-      const auto& effect = model_ref->model->effects[ei];
-      // some effects might have no mod draw info, and no modifiable vertices
-      if (effect.mod.mod_draw.empty()) {
-        continue;
-      }
-
-      prof().begin_event("start1");
-      // grab opengl buffer
-      auto opengl_buffers = alloc_mod_vtx_buffer(model_ref->level);
-      mod_opengl_buffers[ei] = opengl_buffers;
-
-      // check that we have enough room for the finished thing.
-      if (effect.mod.vertices.size() > MAX_MOD_VTX) {
-        fmt::print("More mod vertices than MAX_MOD_VTX. {} > {}\n", effect.mod.vertices.size(),
-                   MAX_MOD_VTX);
-        ASSERT_NOT_REACHED();
-      }
-
-      // check that we have enough room for unpack
-      if (effect.mod.expect_vidx_end > MAX_MOD_VTX) {
-        fmt::print("More mod vertices (temp) than MAX_MOD_VTX. {} > {}\n",
-                   effect.mod.expect_vidx_end, MAX_MOD_VTX);
-        ASSERT_NOT_REACHED();
-      }
-
-      // start with the "correct" vertices from the model data:
-      memcpy(m_mod_vtx_temp.data(), effect.mod.vertices.data(),
-             sizeof(tfrag3::MercVertex) * effect.mod.vertices.size());
-
-      // get pointers to the fragment and fragment control data
-      u32 goal_addr;
-      memcpy(&goal_addr, input_data + 4 * ei, 4);
-      const u8* ee0 = setup.data - setup.data_offset;
-      const u8* merc_effect = ee0 + goal_addr;
-      u16 frag_cnt;
-      memcpy(&frag_cnt, merc_effect + 18, 2);
-      ASSERT(frag_cnt >= effect.mod.fragment_mask.size());
-      u32 frag_goal;
-      memcpy(&frag_goal, merc_effect, 4);
-      u32 frag_ctrl_goal;
-      memcpy(&frag_ctrl_goal, merc_effect + 4, 4);
-      const u8* frag = ee0 + frag_goal;
-      const u8* frag_ctrl = ee0 + frag_ctrl_goal;
-
-      // loop over frags
-      u32 vidx = 0;
-      // u32 st_vif_add = model->st_vif_add;
-      float xyz_scale = model->xyz_scale;
-      prof().end_event();
-      {
-        // we're going to look at data that the game may be modifying.
-        // in the original game, they didn't have any lock, but I think that the
-        // scratchpad access from the EE would effectively block the VIF1 DMA, so you'd
-        // hopefully never get a partially updated model (which causes obvious holes).
-        // this lock is not ideal, and can block the rendering thread while blerc_execute runs,
-        // which can take up to 2ms on really blerc-heavy scenes
-        std::unique_lock<std::mutex> lk(g_merc_data_mutex);
-        int frags_done = 0;
-        auto p = scoped_prof("vert-math");
-
-        // loop over fragments
-        for (u32 fi = 0; fi < effect.mod.fragment_mask.size(); fi++) {
-          frags_done++;
-          u8 mat_xfer_count = frag_ctrl[3];
-
-          // we create a mask of fragments to skip because they have no vertices.
-          // the indexing data assumes that we skip the other fragments.
-          if (effect.mod.fragment_mask[fi]) {
-            // read fragment metadata
-            u8 unsigned_four_count = frag_ctrl[0];
-            u8 lump_four_count = frag_ctrl[1];
-            u32 mm_qwc_off = frag[10];
-            float float_offsets[3];
-            memcpy(float_offsets, &frag[mm_qwc_off * 16], 12);
-            u32 my_u4_count = ((unsigned_four_count + 3) / 4) * 16;
-            u32 my_l4_count = my_u4_count + ((lump_four_count + 3) / 4) * 16;
-
-            // loop over vertices in the fragment and unpack
-            for (u32 w = my_u4_count / 4; w < (my_l4_count / 4) - 2; w += 3) {
-              // positions
-              u32 q0w = 0x4b010000 + frag[w * 4 + (0 * 4) + 3];
-              u32 q1w = 0x4b010000 + frag[w * 4 + (1 * 4) + 3];
-              u32 q2w = 0x4b010000 + frag[w * 4 + (2 * 4) + 3];
-
-              // normals
-              u32 q0z = 0x47800000 + frag[w * 4 + (0 * 4) + 2];
-              u32 q1z = 0x47800000 + frag[w * 4 + (1 * 4) + 2];
-              u32 q2z = 0x47800000 + frag[w * 4 + (2 * 4) + 2];
-
-              // uvs
-              u32 q2x = model->st_vif_add + frag[w * 4 + (2 * 4) + 0];
-              u32 q2y = model->st_vif_add + frag[w * 4 + (2 * 4) + 1];
-
-              auto* pos_array = m_mod_vtx_unpack_temp[vidx].pos;
-              memcpy(&pos_array[0], &q0w, 4);
-              memcpy(&pos_array[1], &q1w, 4);
-              memcpy(&pos_array[2], &q2w, 4);
-              pos_array[0] += float_offsets[0];
-              pos_array[1] += float_offsets[1];
-              pos_array[2] += float_offsets[2];
-              pos_array[0] *= xyz_scale;
-              pos_array[1] *= xyz_scale;
-              pos_array[2] *= xyz_scale;
-
-              auto* nrm_array = m_mod_vtx_unpack_temp[vidx].nrm;
-              memcpy(&nrm_array[0], &q0z, 4);
-              memcpy(&nrm_array[1], &q1z, 4);
-              memcpy(&nrm_array[2], &q2z, 4);
-              nrm_array[0] += -65537;
-              nrm_array[1] += -65537;
-              nrm_array[2] += -65537;
-
-              auto* uv_array = m_mod_vtx_unpack_temp[vidx].uv;
-              memcpy(&uv_array[0], &q2x, 4);
-              memcpy(&uv_array[1], &q2y, 4);
-              uv_array[0] += model->st_magic;
-              uv_array[1] += model->st_magic;
-
-              vidx++;
-            }
-          }
-
-          // next control
-          frag_ctrl += 4 + 2 * mat_xfer_count;
-
-          // next frag
-          u32 mm_qwc_count = frag[11];
-          frag += mm_qwc_count * 16;
-        }
-
-        // sanity check
-        if (effect.mod.expect_vidx_end != vidx) {
-          fmt::print("---------- BAD {}/{}\n", effect.mod.expect_vidx_end, vidx);
-          ASSERT(false);
-        }
-      }
-
-      {
-        auto pp = scoped_prof("copy");
-        // now copy the data in merc original vertex order to the output.
-        for (u32 vi = 0; vi < effect.mod.vertices.size(); vi++) {
-          u32 addr = effect.mod.vertex_lump4_addr[vi];
-          if (addr < vidx) {
-            memcpy(&m_mod_vtx_temp[vi], &m_mod_vtx_unpack_temp[addr], 32);
-            m_mod_vtx_temp[vi].st[0] = m_mod_vtx_unpack_temp[addr].uv[0];
-            m_mod_vtx_temp[vi].st[1] = m_mod_vtx_unpack_temp[addr].uv[1];
-          }
-        }
-      }
-
-      // and upload to GPU
-      m_stats.num_uploads++;
-      m_stats.num_upload_bytes += effect.mod.vertices.size() * sizeof(tfrag3::MercVertex);
-      {
-        auto pp = scoped_prof("update-verts-upload");
-        glBindBuffer(GL_ARRAY_BUFFER, opengl_buffers.vertex);
-        glBufferData(GL_ARRAY_BUFFER, effect.mod.vertices.size() * sizeof(tfrag3::MercVertex),
-                     m_mod_vtx_temp.data(), GL_DYNAMIC_DRAW);
-      }
-    }
+  if (model_uses_pc_blerc) {
+    model_mod_blerc_draws(num_effects, model, lev, mod_opengl_buffers, blerc_weights);
+  } else if (model_uses_mod) {  // only if we've enabled, this path is slow.
+    model_mod_draws(num_effects, model, lev, input_data, setup, mod_opengl_buffers);
  }

  // stats
@ -494,7 +611,7 @@ void Merc2::handle_pc_model(const DmaTransfer& setup,
    auto& effect = model->effects[ei];

    bool should_envmap = effect.has_envmap;
-    bool should_mod = model_uses_mod && effect.has_mod_draw;
+    bool should_mod = (model_uses_pc_blerc || model_uses_mod) && effect.has_mod_draw;

    if (should_mod) {
      // draw as two parts, fixed and mod
@ -557,6 +674,8 @@ void Merc2::draw_debug_window() {

  ImGui::Checkbox("Debug", &m_debug_mode);

+  ImGui::SliderFloat("blerc-nightmare", &blerc_multiplier, -3, 3);
+
  if (m_debug_mode) {
    for (int i = 0; i < kMaxEffect; i++) {
      ImGui::Checkbox(fmt::format("e{:02d}", i).c_str(), &m_effect_debug_mask[i]);
--- a/game/graphics/opengl_renderer/foreground/Merc2.h
+++ b/game/graphics/opengl_renderer/foreground/Merc2.h
@ -8,6 +8,7 @@ class Merc2 : public BucketRenderer {
  void draw_debug_window() override;
  void init_shaders(ShaderLibrary& shaders) override;
  void render(DmaFollower& dma, SharedRenderState* render_state, ScopedProfilerNode& prof) override;
+  static constexpr int kMaxBlerc = 40;

 private:
  bool m_debug_mode = false;
@ -230,4 +231,15 @@ class Merc2 : public BucketRenderer {
  size_t m_opengl_buffer_alignment = 0;

  void flush_draw_buckets(SharedRenderState* render_state, ScopedProfilerNode& prof);
+  void model_mod_draws(int num_effects,
+                       const tfrag3::MercModel* model,
+                       const LevelData* lev,
+                       const u8* input_data,
+                       const DmaTransfer& setup,
+                       ModBuffers* mod_opengl_buffers);
+  void model_mod_blerc_draws(int num_effects,
+                             const tfrag3::MercModel* model,
+                             const LevelData* lev,
+                             ModBuffers* mod_opengl_buffers,
+                             const float* blerc_weights);
 };
--- a/game/graphics/pipelines/opengl.cpp
+++ b/game/graphics/pipelines/opengl.cpp
@ -91,9 +91,12 @@ std::unique_ptr<GraphicsData> g_gfx_data;
 static bool gl_inited = false;
 static int gl_init(GfxGlobalSettings& settings) {
  prof().instant_event("ROOT");
+  Timer gl_init_timer;
  // Initialize SDL
  {
    auto p = scoped_prof("startup::sdl::init_sdl");
+    // remove SDL garbage from hooking signal handler.
+    SDL_SetHint(SDL_HINT_NO_SIGNAL_HANDLERS, "1");
    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_GAMECONTROLLER | SDL_INIT_HAPTIC) != 0) {
      sdl_util::log_error("Could not initialize SDL, exiting");
      return 1;
@ -127,7 +130,7 @@ static int gl_init(GfxGlobalSettings& settings) {
    SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 8);
    SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 8);
  }
-
+  lg::info("gl init took {:.3f}s\n", gl_init_timer.getSeconds());
  return 0;
 }

--- a/game/mips2c/jak2_functions/bones.cpp
+++ b/game/mips2c/jak2_functions/bones.cpp
@ -64,7 +64,7 @@ void exec_mpg(ExecutionContext* c) {
  // nop                        |  mulax.w ACC, vf00, vf12        27
  c->acc.vf.mula(Mask::w, c->vf_src(vf00).vf, c->vf_src(vf12).vf.x());
  // nop                        |  madday.w ACC, vf00, vf12       28
-  c->acc.vf.madda(Mask::w, c->vfs[vf00].vf, c->vfs[vf12].vf.y());
+  c->acc.vf.madda(Mask::w, c->vf_src(vf00).vf, c->vfs[vf12].vf.y());
  // nop                        |  maddz.w vf12, vf00, vf12       29
  c->acc.vf.madd(Mask::w, c->vfs[vf12].vf, c->vf_src(vf00).vf, c->vf_src(vf12).vf.z());
  // nop                        |  mulax.xyzw ACC, vf28, vf14     30
--- a/game/mips2c/jak2_functions/merc_blend_shape.cpp
+++ b/game/mips2c/jak2_functions/merc_blend_shape.cpp
@ -1,10 +1,106 @@
 //--------------------------MIPS2C---------------------
-// clang-format off
-#include "game/mips2c/mips2c_private.h"
-#include "game/kernel/jak2/kscheme.h"
-#include "common/global_profiler/GlobalProfiler.h"
+
 #include <mutex>

+#include "common/global_profiler/GlobalProfiler.h"
+
+#include "game/kernel/jak2/kscheme.h"
+#include "game/mips2c/mips2c_private.h"
+
+// I've rewritten the math part in C here:
+
+struct ChunkHeader {  // 16-byte header that simplified1 reads at the start of each chunk in BlercContext::dummy
+  s8 num_entries;  // not including this header
+  s8 unk[11];  // not read by the C rewrite in this file
+  s16 overlap_val;  // simplified1 copies this into the EE buffer and multiplies the chunk's signed bytes by it
+  s16 pad;
+};
+static_assert(sizeof(ChunkHeader) == 16);
+
+struct S16_8 {  // eight s16 values filling one 16-byte slot; simplified1 views the EE buffer as an array of these
+  s16 vals[8];
+};
+static_assert(sizeof(S16_8) == 16);
+
+struct BlercBlockHeader {  // 32 bytes of metadata stored right after BlercBlock::output
+  u8 tag_bytes[16];  // not read by the C rewrite in this file
+  u32 vtx_count;  // not read by the C rewrite in this file
+  u32 overlap;  // number of chunks after the first that simplified1 blends in
+  u32 lump_dst;  // not read by the C rewrite in this file
+  u32 lump_qwc;  // not read by the C rewrite in this file
+};
+
+struct BlercBlock {  // output area followed by its header
+  u8 output[848];  // simplified1 writes the blended result here
+  BlercBlockHeader header;
+};
+
+struct BlercContext {  // the memory block that blerc_c casts its first argument to
+  BlercBlock block;
+  s8 dummy[7312];  // chunk data, starting with a ChunkHeader; begins at byte 880 (848 + 32), the offset the asm adds to a2
+};
+
+namespace {
+
+// Map a source byte index to its offset in the output: within each group of 8 inputs
+// (16 output bytes), inputs land on offsets 0, 1, 4, 5, 8, 9, 12, 13.
+int og_load_skip_pattern(int in) {
+  static constexpr int kSkipOffsets[8] = {0, 1, 4, 5, 8, 9, 12, 13};
+  const int group_start = (in >> 3) << 3;  // index rounded down to a multiple of 8
+  const int slot = in & 0b111;             // position inside the group
+  return 2 * group_start + kSkipOffsets[slot];
+}
+
+// C version of the blerc math. context->dummy is treated as a sequence of equally sized
+// chunks: chunk 0 holds the base bytes, the next `overlap` chunks hold signed per-target
+// bytes plus a weight in their header, and the chunk after those is copied to the output
+// before the blended bytes are written over part of it.
+// ee_buffer receives one S16_8 per target, filled with that target's weight.
+void simplified1(BlercContext* context, u8* ee_buffer) {
+  const s8* chunks = context->dummy;
+  // all chunks share the stride implied by the first header: entries + the header itself.
+  const int chunk_stride = (((const ChunkHeader*)chunks)->num_entries + 1) * 16;
+  const int target_count = context->block.header.overlap;
+  u8* dst = context->block.output;
+  S16_8* weights = (S16_8*)ee_buffer;
+
+  // walk the target chunks, splatting each header's overlap_val across 8 lanes.
+  const s8* cursor = chunks + chunk_stride;  // t2/t3
+  for (int target = 0; target < target_count; target++, cursor += chunk_stride) {
+    const auto* header = (const ChunkHeader*)cursor;
+    for (int lane = 0; lane < 8; lane++) {
+      weights[target].vals[lane] = header->overlap_val;
+    }
+  }
+
+  // cursor now sits on the chunk after the last target.
+  const int entry_count = ((const ChunkHeader*)cursor)->num_entries;  // lb s5, 0(t3)
+  const s8* copy_src = cursor + 16;                                   // now in s4
+  const s8* signed_base = chunks + 16;                                // ra in asm
+  const u8* unsigned_base = (const u8*)signed_base;                   // ra in asm
+
+  memcpy(dst, copy_src, entry_count * 16);
+
+  const int byte_count = entry_count * 8;
+  for (int b = 0; b < byte_count; b++) {
+    // fixed point with 13 fractional bits: base byte * 8192, plus weight * offset per target.
+    s32 acc = unsigned_base[b] * 8192;  // ld t6 grabs 8 at a time
+    for (int target = 0; target < target_count; target++) {
+      acc += signed_base[b + (target + 1) * chunk_stride] * weights[target].vals[0];  // ld t5
+    }
+    acc >>= 13;
+
+    // clamp to the u8 range before storing.
+    acc = acc < 0 ? 0 : (acc > 255 ? 255 : acc);
+
+    dst[og_load_skip_pattern(b) + 2] = acc;
+  }
+}
+
+// Untyped entry point for the C blerc math: a is the BlercContext, b is the EE weight buffer.
+void blerc_c(void* a, void* b) {
+  auto* context = static_cast<BlercContext*>(a);
+  auto* ee_buffer = static_cast<u8*>(b);
+  simplified1(context, ee_buffer);
+}
+}  // namespace
+
+// clang-format off
+
+
 extern std::mutex g_merc_data_mutex;

 using ::jak2::intern_from_c;
@ -19,6 +115,8 @@ struct Cache {
 } cache;

 u64 execute(void* ctxt) {
+  bool hit18 = false;
+  bool hit19 = false;
  auto pp = scoped_prof("blerc-exec");
  std::unique_lock<std::mutex> lk(g_merc_data_mutex);
  auto* c = (ExecutionContext*)ctxt;
@ -168,10 +266,14 @@ block_13:
  // tadr here is bogus, it's reading something uploaded by the other transfer.
  spad_to_dma_blerc_chain(cache.fake_scratchpad_data, sadr, tadr);

+
 block_16:
  c->gprs[a1].du64[0] = 0;                          // or a1, r0, r0
  c->mov64(a2, a0);                                 // or a2, a0, r0
  c->load_symbol2(a3, cache.gsf_buffer);            // lw a3, *gsf-buffer*(s7)
+
+  // blerc_c(g_ee_main_mem + c->sgpr64(a2), g_ee_main_mem + c->sgpr64(a3));
+
  c->daddiu(t2, a2, 880);                           // daddiu t2, a2, 880
  c->lb(t1, 0, t2);                                 // lb t1, 0(t2)
  // nop                                            // sll r0, r0, 0
@ -195,12 +297,14 @@ block_16:


 block_18:
+hit18 = true;
  c->lh(t5, 12, t2);                                // lh t5, 12(t2)
  c->daddu(t2, t2, t8);                             // daddu t2, t2, t8
  c->sq(t6, 0, t1);                                 // sq t6, 0(t1)
  c->daddiu(t1, t1, 16);                            // daddiu t1, t1, 16

 block_19:
+  hit19 = true;
  c->pcpyh(t5, t5);                                 // pcpyh t5, t5
  c->mfc1(r0, f31);                                 // mfc1 r0, f31
  bc = c->sgpr64(t1) != c->sgpr64(t9);              // bne t1, t9, L47
@ -209,8 +313,8 @@ block_19:

  c->dsubu(t3, t2, t8);                             // dsubu t3, t2, t8
  // nop                                            // sll r0, r0, 0
-
 block_21:
+
  c->addiu(t1, r0, 255);                            // addiu t1, r0, 255
  c->addiu(t2, r0, 8192);                           // addiu t2, r0, 8192
  c->lb(s5, 0, t3);                                 // lb s5, 0(t3)
@ -283,11 +387,14 @@ block_24:
  c->mfc1(r0, f31);                                 // mfc1 r0, f31
  c->pextlh(t5, t5, t7);                            // pextlh t5, t5, t7
  c->mfc1(r0, f31);                                 // mfc1 r0, f31
+  // store modified vertex
  c->sq(t5, 0, t0);                                 // sq t5, 0(t0)
+
  c->daddiu(t0, t0, 16);                            // daddiu t0, t0, 16
  bc = c->sgpr64(s5) != 0;                          // bne s5, r0, L50
  c->daddiu(s4, s4, 16);                            // daddiu s4, s4, 16
  if (bc) {goto block_22;}                          // branch non-likely
+  // end of blerc_c stuff

  c->load_symbol2(a3, cache.stats_blerc);           // lw a3, *stats-blerc*(s7)
  bc = c->sgpr64(a3) == c->sgpr64(s7);              // beq a3, s7, L53
--- a/game/system/hid/devices/game_controller.cpp
+++ b/game/system/hid/devices/game_controller.cpp
@ -28,10 +28,6 @@ GameController::GameController(int sdl_device_id,
    return;
  }
  const auto controller_guid = SDL_JoystickGetGUID(joystick);
-  if (controller_guid.data == 0) {
-    sdl_util::log_error(fmt::format("Could not get contoller guid with id: {}", sdl_device_id));
-    return;
-  }
  char guidStr[33];
  SDL_JoystickGetGUIDString(controller_guid, guidStr, sizeof(guidStr));
  m_guid = guidStr;
--- a/game/tools/subtitles2/subtitle2_editor.cpp
+++ b/game/tools/subtitles2/subtitle2_editor.cpp
@ -13,7 +13,7 @@
 #include "third-party/imgui/imgui.h"
 #include "third-party/imgui/imgui_stdlib.h"

-static constexpr size_t LINE_DISPLAY_MAX_LEN = 38;
+// static constexpr size_t LINE_DISPLAY_MAX_LEN = 38;

 Subtitle2Editor::Subtitle2Editor(GameVersion version)
    : db_loaded(true),
--- a/goal_src/jak1/engine/game/main.gc
+++ b/goal_src/jak1/engine/game/main.gc
@ -121,7 +121,7 @@

       ;; allow the menu to run.
       (logclear! (-> *setting-control* default process-mask) (process-mask menu))
-       
+
       ;; modified for PC port - show hidden speedrun progress menu if L1+R1+X are held
       (if (and PC_PORT (-> *pc-settings* speedrunner-mode?) (cpad-hold? 0 l1) (cpad-hold? 0 r1) (cpad-hold? 0 x))
        (activate-progress *dproc* (progress-screen speedrun-options))
@ -356,7 +356,7 @@

 (defun scf-get-territory ()
  "this overrides the kernel version which usually has a hardcoded value."
-  
+
  (if (not *debug-segment*)
      (return *jak1-territory*))
  (case (-> *setting-control* default language)
@ -595,7 +595,11 @@

      ;; drawing effects to be used in foreground drawing.
      (with-profiler "foreground-effects"
-        (blerc-execute)
+        ;; with FP blerc, the vertices are modified in the PC renderer, so we can just skip
+        ;; this call to save time.
+        (unless *use-fp-blerc*
+          (blerc-execute)
+          )
        (blerc-init)
        (texscroll-execute)
        (ripple-execute)
@ -849,7 +853,7 @@
      (with-profiler "process-particles" (process-particles))

      ;; vif0 collide
-      
+
      (with-profiler "sound-update"
                     (swap-sound-buffers (ear-trans) (camera-pos) (camera-angle))
                     (str-play-kick)
--- a/goal_src/jak1/engine/gfx/foreground/bones.gc
+++ b/goal_src/jak1/engine/gfx/foreground/bones.gc
@ -937,6 +937,71 @@
  `(set! (-> (the-as (pointer uint32) ,addr)) ,val)
  )

+(defun pc-merc-blend-shape ((pd process-drawable) (blerc-weights-out (pointer float)))
+  "PC implementation to get blerc weights as floats and avoid the u16 rounding.
+  Returns #f if there is no blerc running, which means that we should use base positions.
+  If things work correctly, the original implementation would try to restore weights of 0
+  after the animation finishes - it leaves `blend-shape-valid` set after `blend-shape` is cleared
+  causing one more round of merc-blend-shape to run with hardcoded weights of 0.
+  "
+  (when (or (not (-> pd skel)) ;; no skeleton (either #f or 0): nothing to blend.
+            (zero? (-> pd skel))
+            )
+    (return #f)
+    )
+
+  (let* ((jc-channel (-> pd skel root-channel 0)) ;; only the first root channel is consulted.
+         (anim (-> jc-channel frame-group))
+         (got-weights #f) ;; return value: becomes #t once blerc-weights-out has been written.
+         )
+    (when (and anim ;; all of these must hold, otherwise we return #f and write nothing.
+               (> (-> pd skel active-channels) 0)
+               (zero? (-> pd draw cur-lod))
+               (logtest? (-> pd skel status) (janim-status blerc))
+               )
+      (let ((shape-anim (-> anim blerc-data)))
+        (when shape-anim
+          (let* ((mctrl (-> pd draw mgeo))
+                 (num-targets (-> mctrl header blend-target-count))
+                 (frame-f (-> jc-channel frame-num))
+                 (frame-i (the int frame-f)) ;; integer part of the current frame.
+                 (frame-1-data (&+ shape-anim (* (the-as uint frame-i) num-targets))) ;; one entry per target per frame.
+                 ;(a1-5 (new 'stack-no-clear 'array 'int16 128))
+                 )
+            (let ((a2-1 (-> mctrl header blend-target-count))) ;; same field as num-targets.
+              (cond
+                ((< frame-i (+ (-> anim data 0 length) -1)) ;; not on the last frame: blend this frame and the next.
+                 (let* ((frame-2-data (&+ frame-1-data a2-1)) ;; the next frame's entries.
+                        (frame-2-mult (* 64.0 (- frame-f (the float frame-i)))) ;; fractional part of the frame, scaled by 64.
+                        (frame-1-mult (- 64.0 frame-2-mult)) ;; the two multipliers always sum to 64.
+                        )
+                   (set! got-weights #t)
+                   (dotimes (i (the-as int a2-1))
+                     (set! (-> blerc-weights-out i)
+                           (+ (* (the float (- (-> frame-1-data i) 64)) frame-1-mult) ;; stored values are offset by 64.
+                              (* (the float (- (-> frame-2-data i) 64)) frame-2-mult)
+                              )
+                           )
+                     )
+                   )
+                 )
+                (else ;; last frame: no next frame to blend with, so use a fixed multiplier of 64.
+                  (set! got-weights #t)
+                      (dotimes (a3-7 num-targets)
+                        (set! (-> blerc-weights-out a3-7) (the float (* (+ (-> (the-as (pointer uint8) (&+ frame-1-data a3-7))) -64) 64)))
+                        )
+                  )
+                )
+              )
+            )
+
+          )
+        )
+      )
+    got-weights
+    )
+  )
+
 ;; name   (128 char, 8 qw)
 ;; lights (7 qw x 1)
 ;; matrix slot string (128 char, 8 qw)
@ -949,6 +1014,7 @@
  :bitfield #t
  (update-verts 0)
  (disable-fog 1)
+  (pc-blerc 2)
  )

 (deftype pc-merc-flags (structure)
@ -975,7 +1041,7 @@
    )
  )

-(defun pc-merc-draw-request ((dc draw-control) (dma-buf pointer) (matrix-buf pointer) (update-verts symbol))
+(defun pc-merc-draw-request ((dc draw-control) (dma-buf pointer) (matrix-buf pointer) (update-verts symbol) (blercs (pointer float)))
  (let ((start-packet (the-as dma-packet dma-buf))
        (qwc-total 0))
    ;; merc draw asm will check this.
@ -1064,7 +1130,10 @@
            (set! (-> flags effect-count) (-> merc-ctrl header effect-count))
            (set! (-> flags bit-flags) (the pc-merc-bits 0))
            (when update-verts
-              (logior! (-> flags bit-flags) (pc-merc-bits update-verts))
+              (if (= update-verts 'blerc)
+                  (logior! (-> flags bit-flags) (pc-merc-bits pc-blerc))
+                  (logior! (-> flags bit-flags) (pc-merc-bits update-verts))
+                  )
              )
            (set! (-> flags enable-mask) enable-mask)
            (set! (-> flags ignore-alpha-mask) ignore-alpha-mask)
@ -1072,6 +1141,13 @@
          (&+! dma-buf (* 16 2))
          (+! qwc-total 2)

+          ;; include blerc weights.
+          (when (= update-verts 'blerc)
+            (mem-copy! dma-buf blercs (* 40 4))
+            (&+! dma-buf (* 40 4))
+            (+! qwc-total 10)
+            )
+
          ;; fades
          (let ((fades (the (pointer uint32) dma-buf)))
            (dotimes (i (-> merc-ctrl header effect-count))
@ -1119,6 +1195,9 @@
 ;; when set, use merc for blerc instead of generic.
 (define *blerc-hack* #t)

+;; when true, uses the PC float blerc implementation.
+(define *use-fp-blerc* #t)
+
 (define *texscroll-force-generic* #f)
 (define *ripple-force-generic* #f)

@ -1336,6 +1415,7 @@
                  (pc-force-mercneric #f)
                  ;; if pc rendering code needs to update merc vertices
                  (pc-merc-vtx-update #f)
+                  (blerc-weights (new 'stack-no-clear 'array 'float 40))
                  )
              (when (logtest? (-> arg0 global-effect) (draw-effect title))
                (set! pc-force-mercneric #t)
@ -1415,12 +1495,22 @@
                    (when (nonzero? jc)
                      (when (logtest? (-> jc status) (janim-status blerc))
                        (if *blerc-hack*
-                            (set! pc-merc-vtx-update #t)
-                            (set! pc-force-mercneric #t)
+                            (if *use-fp-blerc*
+                                (set! pc-merc-vtx-update 'blerc) ;; C++ blerc
+                                (set! pc-merc-vtx-update #t)     ;; GOAL blerc + merc
+                                )
+                            (set! pc-force-mercneric #t)         ;; GOAL blerc + generic
                            )
                        )
                      )
                    )
+
+                  (when (= pc-merc-vtx-update 'blerc)
+                    (unless
+                      (pc-merc-blend-shape (the process-drawable (-> arg0 process)) blerc-weights)
+                      (set! pc-merc-vtx-update #f)
+                      )
+                    )
                  )

                ;; additional in pc to make envmap fade 0 when envmap is not used
@ -1567,7 +1657,7 @@
                        )
                      )
                    )
-                  (set! s2-0 (pc-merc-draw-request arg0 (the pointer s2-0) (the pointer matrix-data) pc-merc-vtx-update))
+                  (set! s2-0 (pc-merc-draw-request arg0 (the pointer s2-0) (the pointer matrix-data) pc-merc-vtx-update blerc-weights))
                  ; (if (nonzero? (-> *merc-bucket-info* need-mercprime-if-merc))
                  ;     (set! (-> dma-buf base) (draw-bones-merc arg0 matrix-data s2-0 32 17))
                  ;     (set! (-> dma-buf base) (draw-bones-merc arg0 matrix-data s2-0 35 20))
--- a/goal_src/jak2/engine/game/main.gc
+++ b/goal_src/jak2/engine/game/main.gc
@ -1483,7 +1483,11 @@

    ; ;; Run blerc to modify foreground models
    (with-profiler 'merc *profile-merc-color*
-      (blerc-execute)
+      ;; with FP blerc, the vertices are modified in the PC renderer, so we can just skip
+      ;; this call to save time.
+      (unless *use-fp-blerc*
+        (blerc-execute)
+        )
      (blerc-init)
      )

--- a/goal_src/jak2/engine/gfx/foreground/foreground.gc
+++ b/goal_src/jak2/engine/gfx/foreground/foreground.gc
@ -876,11 +876,101 @@
    )
  )

+
+(defun pc-merc-blend-shape ((pd process-drawable) (blerc-weights-out (pointer float)))
+  "PC implementation to get blerc weights as floats and avoid the u16 rounding.
+   Returns #f if there is no blerc running, which means that we should use base positions.
+   If things work correctly, the original implementation would try to restore weights of 0
+   after the animation finishes - it leaves `blend-shape-valid` set after `blend-shape` is cleared
+   causing one more round of merc-blend-shape to run with hardcoded weights of 0.
+   "
+  ;; Writes one float weight per blend target into blerc-weights-out and returns #t when
+  ;; weights were produced, #f otherwise.
+  ;; NOTE(review): the callers visible in this change pass a 40-float buffer, and nothing
+  ;; here bounds num-targets against it - confirm blend-target-count never exceeds 40.
+
+  ;; no joint control (field is #f or 0): nothing can be blended.
+  (when (or (not (-> pd skel))
+            (zero? (-> pd skel))
+            )
+    (return #f)
+    )
+
+  ;; grab the currently playing animation
+  (let* ((jc-channel (-> pd skel root-channel 0))
+         (anim (-> jc-channel frame-group))
+         ;; flipped to #t by whichever branch below fills blerc-weights-out.
+         (got-weights #f)
+         )
+    (when (and anim ;; we have an anim
+               (> (-> pd skel active-channels) 0) ;; there are channels running
+               (zero? (-> pd draw cur-lod))       ;; using high lod
+               (logtest? (-> pd skel status) (joint-control-status blend-shape)) ;; blend shape is on
+               )
+      (cond
+        ;; first, see if we have an override:
+        ;; slot 0 of the override array is the enable value (nonzero means "use override").
+        ((and (-> pd skel override) (!= (-> pd skel override 0) 0.0))
+         ;; we do! copy from there.
+         (let* ((mctrl (-> pd draw mgeo))
+                (num-targets (-> mctrl header blend-target-count))
+                (override-array (-> pd skel override))
+                )
+           (set! got-weights #t)
+           ;; per-target override values start at slot 1 and are scaled by 8192.0.
+           (dotimes (i num-targets)
+             (set! (-> blerc-weights-out i) (* 8192.0 (-> override-array (+ i 1))))
+             )
+           )
+         )
+        (else
+          ;; otherwise, do the animation
+          (let ((shape-anim (-> anim blend-shape-anim)))
+            ;; if the anim has no blend shape data, got-weights stays #f.
+            (when shape-anim
+              (let ((mctrl (-> pd draw mgeo)))
+                (let* ((num-targets (-> mctrl header blend-target-count))
+                       (frame-f (-> jc-channel frame-num))
+                       ;; round down to the integer frame
+                       (frame-i (the int frame-f))
+                       ;; one byte per target per frame, so frame N starts at byte N * num-targets.
+                       (frame-1-data (the (pointer uint8) (&+ shape-anim (* frame-i num-targets))))
+                       )
+                  (cond
+                    ;; check if there's a frame after this
+                    ((< frame-i (the-as int (+ (-> anim frames num-frames) -1)))
+                     ;; there is, interpolate between them.
+                     ;; this is rewritten to use floats, but still use the same scaling as the s16 weights.
+                     ;; the two multipliers always sum to 64.0; frame-2 gets the fractional part of the frame.
+                     (let* ((frame-2-data (&+ frame-1-data num-targets))
+                            (frame-2-mult (* 64.0 (- frame-f (the float frame-i))))
+                            (frame-1-mult (- 64.0 frame-2-mult))
+                            )
+                       (set! got-weights #t)
+                       (dotimes (i num-targets)
+                         ;; key difference: this is floats
+                         ;; stored bytes are biased by 64, so subtract it before scaling.
+                         (set! (-> blerc-weights-out i)
+                               (+ (* (the float (- (-> frame-1-data i) 64)) frame-1-mult)
+                                  (* (the float (- (-> frame-2-data i) 64)) frame-2-mult)
+                                  )
+                               )
+                         )
+                       )
+                     )
+                    (else
+                      ;; at the last frame, nothing to interpolate.
+                      ;; same bias and scale with all of the weight on this frame: (byte - 64) * 64.
+                      (set! got-weights #t)
+                      (dotimes (a3-7 num-targets)
+                        (set! (-> blerc-weights-out a3-7) (the float (* (+ (-> (the-as (pointer uint8) (&+ frame-1-data a3-7))) -64) 64)))
+                        )
+                      )
+                    )
+                  )
+                )
+              )
+            )
+          )
+        )
+      )
+  ;; result of the let*: #t only if one of the branches above wrote the weights.
+  got-weights
+
+    )
+  )
+
 (defenum pc-merc-bits
   :type uint8
   :bitfield #t
   (update-verts 0)
   (disable-fog 1)
+  (pc-blerc 2) ;; set when pc-merc-draw-request is given 'blerc: float blerc weights follow the flags in the packet
   )

 (deftype pc-merc-flags (structure)
@ -891,10 +981,13 @@
   )
  )

-(defun pc-merc-draw-request ((dc draw-control) (dma-buf pointer) (matrix-buf pointer) (tex-idx int) (update-verts symbol))
+(defun pc-merc-draw-request ((dc draw-control) (dma-buf pointer) (matrix-buf pointer) (tex-idx int) (update-verts symbol) (blercs (pointer float)))
  "Send a request to PC Merc2 to draw the given object.
   Only draws the effects which match this texture index.
-   Just places a single big dma packet, you have to patch the end yourself."
+   Just places a single big dma packet, you have to patch the end yourself.
+   If update-verts is set to #t, tell the PC renderer to use EE version of vertices.
+   If update-verts is set to 'blerc, tell the PC renderer to use the included blerc weights
+   to modify the vertices."
  (let ((start-packet (the-as dma-packet dma-buf))
        (qwc-total 0))
    (set! (-> start-packet dma) (new 'static 'dma-tag :id (dma-tag-id cnt)))
@ -984,7 +1077,10 @@
            (set! (-> flags effect-count) (-> merc-ctrl header effect-count))
            (set! (-> flags bit-flags) (the pc-merc-bits 0))
            (when update-verts
-              (logior! (-> flags bit-flags) (pc-merc-bits update-verts))
+              (if (= update-verts 'blerc)
+                  (logior! (-> flags bit-flags) (pc-merc-bits pc-blerc))
+                  (logior! (-> flags bit-flags) (pc-merc-bits update-verts))
+                  )
              )
            (when (logtest? (-> dc status) (draw-control-status disable-fog))
              (logior! (-> flags bit-flags) (pc-merc-bits disable-fog))
@ -995,6 +1091,13 @@
          (&+! dma-buf (* 16 2))
          (+! qwc-total 2)

+          ;; include blerc weights.
+          (when (= update-verts 'blerc)
+            (mem-copy! dma-buf blercs (* 40 4))
+            (&+! dma-buf (* 40 4))
+            (+! qwc-total 10)
+            )
+
          ;; fades
          (let ((fades (the (pointer uint32) dma-buf)))
            (dotimes (i (-> merc-ctrl header effect-count))
@ -1028,11 +1131,13 @@

 (defun pc-draw-bones ((dc draw-control) (dma-buf pointer) (matrix-buf pointer))
  "Add a dma packet to tell the PC renderer which model we are renderering."
-  (let ((use-flags (new 'stack-no-clear 'array 'uint8 7))
-        (mctrl (-> dc mgeo))
-        (buckets (-> (scratchpad-object foreground-work) grid level-buckets (-> (scratchpad-object foreground-work) draw-index-map (-> dc level-index))))
-        (has-ripple-or-texscroll #f)
-        )
+  (let* ((use-flags (new 'stack-no-clear 'array 'uint8 7))
+         (blerc-weights (new 'stack-no-clear 'array 'float 40))
+         (mctrl (-> dc mgeo))
+         (buckets (-> (scratchpad-object foreground-work) grid level-buckets (-> (scratchpad-object foreground-work) draw-index-map (-> dc level-index))))
+         (has-ripple-or-texscroll #f)
+         (uses-fp-blerc (and *use-fp-blerc* (pc-merc-blend-shape (the process-drawable (-> dc process)) blerc-weights)))
+         )
    ;; mark all as unused, until we see a use
    (dotimes (i 7) (set! (-> use-flags i) 0))

@ -1058,15 +1163,25 @@
          (when has-ripple-or-texscroll
            (set! vertex-update #t)
            )
-          (let* ((pd (the process-drawable (-> dc process)))
-                 (jc (-> pd skel)))
-            (when (nonzero? jc)
-              (when (logtest? (-> jc status) (joint-control-status blend-shape-valid))
-                (set! vertex-update #t)
+          (when uses-fp-blerc
+            (set! vertex-update 'blerc)
+            )
+
+          ;; if the fp blerc is disabled, handle blerc with normal modified vertices.
+          (unless *use-fp-blerc*
+            (let* ((pd (the process-drawable (-> dc process)))
+                   (jc (-> pd skel)))
+              (when (nonzero? jc)
+                (when (logtest? (-> jc status) (joint-control-status blend-shape-valid))
+                  (set! vertex-update #t)
+                  )
                )
              )
            )
-          (set! dma-buf (pc-merc-draw-request dc dma-buf matrix-buf i vertex-update))
+
+         ; (format 0 "~D weights: ~X~%" i blerc-weights)
+
+          (set! dma-buf (pc-merc-draw-request dc dma-buf matrix-buf i vertex-update blerc-weights))

          ;; create a patch packet
          (let ((patch-packet (the-as dma-packet dma-buf)))
--- a/goal_src/jak2/engine/gfx/merc/merc-blend-shape.gc
+++ b/goal_src/jak2/engine/gfx/merc/merc-blend-shape.gc
@ -11,6 +11,10 @@

 (define *stats-blerc* #f)

+;; added:
+;; when true, uses the PC float blerc implementation.
+(define *use-fp-blerc* #t)
+
 (deftype blerc-block-header (structure)
  ((tag       generic-merc-tag :inline :offset-assert   0)
   (vtx-count uint32                   :offset-assert  16)
@ -310,3 +314,70 @@
  0
  (none)
  )
+
+; (defun setup-blerc-chains ((mc merc-ctrl) (blend-shape-coeffs (pointer int16)) (dma-buf dma-buffer))
+;   (local-vars
+;     (effect merc-effect)
+;     (blend-frag-count uint)
+;     (blend-ctrl object)
+;     (sv-48 int)
+;     (sv-64 int)
+;     )
+;   (let* ((num-effects (-> mc header effect-count))
+;         (num-targets (-> mc header blend-target-count))
+;         (dma-mem (-> dma-buf base))
+;         (dma-tag-mem (&+ dma-mem 0))
+;         (dma-mem-ptr (the-as object (&+ dma-mem 16)))
+;         )
+;     (if (zero? (-> *blerc-globals* first))
+;         (set! (-> *blerc-globals* first) (the-as uint dma-mem-ptr))
+;         )
+;     (dotimes (effect-idx (the-as int num-effects))
+;       (set! effect (-> mc effect effect-idx))
+;       (set! blend-frag-count (-> effect blend-frag-count))
+;       (when (nonzero? blend-frag-count)
+;         (let ((v1-15 (the-as object (-> effect frag-geo)))
+;               (s1-0 (the-as structure (-> effect frag-ctrl)))
+;               (s0-0 (the-as object (-> effect blend-data)))
+;               )
+;           (set! blend-ctrl (-> effect blend-ctrl))
+;           (set! sv-48 0)
+;           (while (< sv-48 (the-as int blend-frag-count))
+;             (set! sv-64 (+ (the-as int v1-15)
+;                            (logand (* (+ (-> (the-as merc-fragment-control s1-0) unsigned-four-count) 3) 4) #xfff0)
+;                            )
+;                   )
+;             (if (nonzero? (-> (the-as (pointer uint8) blend-ctrl) 0))
+;                 (set! dma-mem-ptr (setup-blerc-chains-for-one-fragment num-targets blend-shape-coeffs dma-mem-ptr s0-0 blend-ctrl sv-64))
+;                 )
+;             (let ((a0-14 (logand (+ (* (the-as uint 6)
+;                                        (-> (the-as merc-blend-ctrl blend-ctrl) blend-vtx-count))
+;                                      15)
+;                                  #xfff0)))
+;               (set! v1-15
+;                     (+ sv-64
+;                        (logand (* (+ (-> (the-as merc-fragment-control s1-0) lump-four-count) 3) 4) #xfff0)
+;                        (* (-> (the-as merc-fragment-control s1-0) fp-qwc) 16)
+;                        )
+;                     )
+;               (set! s1-0 (&+ s1-0 (* (-> (the-as merc-fragment-control s1-0) mat-xfer-count) 2) 4))
+;               (set! s0-0
+;                     (+ (the-as int s0-0) (* (the-as uint a0-14) (+ (-> (the-as merc-blend-ctrl blend-ctrl) nonzero-index-count) 1)))
+;                     )
+;               )
+;             (set! blend-ctrl (+ (the-as int blend-ctrl) num-targets 2))
+;             (the-as int blend-ctrl)
+;             (set! sv-48 (+ sv-48 1))
+;             )
+;           )
+;         )
+;       )
+;     (set! (-> (the-as (pointer int64) dma-tag-mem)) (logior #x20000000 (shr (shl (the-as int dma-mem-ptr) 33) 1)))
+;     (set! (-> (the-as (pointer uint32) dma-tag-mem) 2) (the-as uint 0))
+;     (set! (-> (the-as (pointer uint32) dma-tag-mem) 3) (the-as uint 0))
+;     (set! (-> dma-buf base) (the-as pointer dma-mem-ptr))
+
+;     )
+;   0
+;   (none)
+;   )
--- a/goalc/data_compiler/game_text_common.cpp
+++ b/goalc/data_compiler/game_text_common.cpp
@ -160,7 +160,7 @@ void compile_subtitle2(GameSubtitle2DB& db, const std::string& output_prefix) {
    int speaker_array_link = gen.add_word(0);  // speaker array (dummy for now)

    auto speaker_index_by_name = [&speaker_names](const std::string& name) {
-      for (int i = 0; i < speaker_names.size(); ++i) {
+      for (int i = 0; i < (int)speaker_names.size(); ++i) {
        if (speaker_names.at(i) == name) {
          return i + 1;
        }