From cfce5e5916d55e21dd7da96a97a908aa0c9e767f Mon Sep 17 00:00:00 2001
From: water111 <48171810+water111@users.noreply.github.com>
Date: Tue, 10 Oct 2023 18:56:03 -0700
Subject: [PATCH] [decompiler] Support v5 data file link data (#3076)

Fix the implementation of `link_v5` so it works on "data" files for jak
3.
---
 common/audio/audio_formats.cpp                |   5 +-
 common/goos/Interpreter.cpp                   |   2 +-
 common/goos/Object.cpp                        |   1 -
 decompiler/ObjectFile/LinkedObjectFile.cpp    |   7 +-
 .../ObjectFile/LinkedObjectFileCreation.cpp   | 147 ++++++++++--------
 .../analysis/analyze_inspect_method.cpp       |   2 +-
 lsp/transport/stdio.cpp                       |   2 +-
 7 files changed, 89 insertions(+), 77 deletions(-)
diff --git a/common/audio/audio_formats.cpp b/common/audio/audio_formats.cpp
index 83201b65f..25cdbf61d 100644
--- a/common/audio/audio_formats.cpp
+++ b/common/audio/audio_formats.cpp
@@ -43,7 +43,7 @@ void write_wave_file(const std::vector<s16>& left_samples,
       writer.add(sample);
     }
   } else {
-    for (int i = 0; i < left_samples.size(); i++) {
+    for (size_t i = 0; i < left_samples.size(); i++) {
       writer.add(left_samples.at(i));
       if (i < right_samples.size()) {
         writer.add(right_samples.at(i));
@@ -86,8 +86,9 @@ std::pair<std::vector<s16>, std::vector<s16>> decode_adpcm(BinaryReader& reader,
     u8 shift = shift_filter & 0b1111;
     u8 filter = shift_filter >> 4;
     u8 flags = reader.read<u8>();
+    (void)flags;
 
-    // removed assertions here
+    // removed assertions here (and that's probably why the audio doesn't sound right)
 
     u8 input_buffer[14];
 
diff --git a/common/goos/Interpreter.cpp b/common/goos/Interpreter.cpp
index b7d50df30..b6c627a6e 100644
--- a/common/goos/Interpreter.cpp
+++ b/common/goos/Interpreter.cpp
@@ -521,7 +521,7 @@ void Interpreter::vararg_check(
 /*!
  * Evaluate a list and return the result of the last evaluation.
  */
-Object Interpreter::eval_list_return_last(const Object& form,
+Object Interpreter::eval_list_return_last(const Object& /*form*/,
                                           Object rest,
                                           const std::shared_ptr<EnvironmentObject>& env) {
   if (rest.is_empty_list()) {
diff --git a/common/goos/Object.cpp b/common/goos/Object.cpp
index 9bb5b3a2a..2d9119478 100644
--- a/common/goos/Object.cpp
+++ b/common/goos/Object.cpp
@@ -66,7 +66,6 @@ SymbolTable::~SymbolTable() {
 }
 
 InternedSymbolPtr SymbolTable::intern(const char* str) {
-  InternedSymbolPtr result;
   size_t string_len = strlen(str);
   u32 hash = crc32((const u8*)str, string_len);
 
diff --git a/decompiler/ObjectFile/LinkedObjectFile.cpp b/decompiler/ObjectFile/LinkedObjectFile.cpp
index 531fcfc2f..fb431b2ea 100644
--- a/decompiler/ObjectFile/LinkedObjectFile.cpp
+++ b/decompiler/ObjectFile/LinkedObjectFile.cpp
@@ -113,7 +113,7 @@ Function* LinkedObjectFile::try_get_function_at_label(int label_id) {
 
 Function* LinkedObjectFile::try_get_function_at_label(const DecompilerLabel& label) {
   for (auto& func : functions_by_seg.at(label.target_segment)) {
-    // + 4 to skip past type tag to the first word, which is were the label points.
+    // + 4 to skip past type tag to the first word, which is where the label points.
     if (func.start_word * 4 + 4 == label.offset) {
       return &func;
     }
@@ -128,7 +128,7 @@ const Function* LinkedObjectFile::try_get_function_at_label(int label_id) const
 
 const Function* LinkedObjectFile::try_get_function_at_label(const DecompilerLabel& label) const {
   for (auto& func : functions_by_seg.at(label.target_segment)) {
-    // + 4 to skip past type tag to the first word, which is were the label points.
+    // + 4 to skip past type tag to the first word, which is where the label points.
     if (func.start_word * 4 + 4 == label.offset) {
       return &func;
     }
@@ -156,7 +156,8 @@ bool LinkedObjectFile::pointer_link_word(int source_segment,
   ASSERT(word.kind() == LinkedWord::PLAIN_DATA);
 
   if (dest_offset / 4 > (int)words_by_seg.at(dest_segment).size()) {
-    //    printf("HACK bad link ignored!\n");
+    //    printf("HACK bad link ignored src %d, %d vs %d!\n", source_offset, dest_offset / 4,
+    //           int(words_by_seg.at(dest_segment).size()));
     return false;
   }
   ASSERT(dest_offset / 4 <= (int)words_by_seg.at(dest_segment).size());
diff --git a/decompiler/ObjectFile/LinkedObjectFileCreation.cpp b/decompiler/ObjectFile/LinkedObjectFileCreation.cpp
index 9312f0e3d..62c4736cd 100644
--- a/decompiler/ObjectFile/LinkedObjectFileCreation.cpp
+++ b/decompiler/ObjectFile/LinkedObjectFileCreation.cpp
@@ -55,11 +55,10 @@ struct LinkHeaderV5 {
   uint32_t length_to_get_to_code;  // 4 length.. of link data?
   uint16_t version;                // 8
   uint16_t unknown;                // 10
-  uint32_t pad;                    // 12
+  uint32_t length_to_get_to_link;  // 12
   uint32_t link_length;            // 16
   uint8_t n_segments;              // 20
   char name[59];                   // 21 (really??)
-  SegmentInfo segment_info[3];
 };
 
 // The types of symbol links
@@ -421,72 +420,83 @@ static void link_v5(LinkedObjectFile& f,
                     const std::string& name,
                     DecompilerTypeSystem& dts) {
   auto header = (const LinkHeaderV5*)(&data.at(0));
-  if (header->n_segments == 1) {
-    printf("abandon %s!\n", name.c_str());
-    return;
+
+  // for jak 3, both code and data use a "v5" format for linking.
+  // code has 3 segments (top-level, main, debug), and data has just 1.
+  // they appear to be generated by different programs, so there's some hard-coded checks for
+  // each.
+
+  // the "v5" format allows for multiple segments (like v3), "split-pointer" linking to support
+  // splitting a pointer link between a lui/ori instruction (needed for code), but uses "v2"
+  // symbol linking. For a reason that I don't understand, "v3" symlinks use a less space-efficient
+  // encoding of large integers.
+
+  static_assert(0x50 == sizeof(LinkHeaderV5));
+
+  if (header->n_segments == 3) {
+    ASSERT(header->type_tag == 0);
+    ASSERT(name == header->name);
+    // the linker for code placed the link data at the beginning.
+    // but we expect the link data to start just after the object file header
+    ASSERT(header->length_to_get_to_link == sizeof(LinkHeaderV5));
+    // and then the code should come after that
+    ASSERT(header->length_to_get_to_code == sizeof(LinkHeaderV5) + header->link_length);
+  } else if (header->n_segments == 1) {
+    ASSERT(header->type_tag == UINT32_MAX);
+    // name is inconsistent, so don't check it
+    // data files have the data first, which is good, as the last object in a DGO gets loaded
+    // directly to the heap, and putting the data first means that we can "free" the link data just
+    // by bumping the heap pointer back, rather than memcpy the code back to cover the hole if link
+    // data came first.
+    // the offset is always 0x80, which is bigger than the header, but is needed to make data
+    // aligned with the PS2's cache line size (64 bytes), which makes sense.
+    ASSERT(header->length_to_get_to_code == 0x80);
+  } else {
+    lg::die("bad segment count {}", header->n_segments);
   }
-  ASSERT(header->type_tag == 0);
-  ASSERT(name == header->name);
-  ASSERT(header->n_segments == 3);
-  ASSERT(header->pad == 0x50);
-  ASSERT(header->length_to_get_to_code - header->link_length == 0x50);
-
-  f.set_segment_count(3);
-
-  // link v3's data size is data.size() - link_length
-  // link v5's data size is data.size() - new_link_length - 0x50.
-
-  // lbp + 4 points to version?
-  // lbp points to 4 past start of header.
-
-  // lbp[1] = version + unknown 16 bit thing.
-  // lbp[3] = link block length (minus 0x50)
+  f.set_segment_count(header->n_segments);
 
   // todo - check this against the code size we actually got.
   //  size_t expected_code_size = data.size() - (header->link_length + 0x50);
 
-  uint32_t data_ptr_offset = header->length_to_get_to_code;
+  const int n_segs = header->n_segments;
 
+  // the first thing in the link data is the segment info array, which we need to find stuff.
+  const SegmentInfo* seg_info_array =
+      (const SegmentInfo*)(data.data() + header->length_to_get_to_link);
+
+  // for convenience, we'll find the data/link offsets for each segment.
   uint32_t segment_data_offsets[3];
   uint32_t segment_link_offsets[3];
-  uint32_t segment_link_ends[3];
-  for (int i = 0; i < 3; i++) {
-    segment_data_offsets[i] = data_ptr_offset + header->segment_info[i].data;
-    segment_link_offsets[i] = header->segment_info[i].relocs + 0x50;
-    ASSERT(header->segment_info[i].magic == 1);
+  uint32_t segment_link_ends[3];  // set in linking, once we get to the end.
+  for (int i = 0; i < n_segs; i++) {
+    segment_data_offsets[i] = header->length_to_get_to_code + seg_info_array[i].data;
+    segment_link_offsets[i] = header->length_to_get_to_link + seg_info_array[i].relocs;
+    ASSERT(seg_info_array[i].magic == 1);
   }
 
   // check that the data region is filled
-  for (int i = 0; i < 2; i++) {
-    ASSERT(align16(segment_data_offsets[i] + header->segment_info[i].size) ==
+  for (int i = 0; i < n_segs - 1; i++) {
+    ASSERT(align16(segment_data_offsets[i] + seg_info_array[i].size) ==
            segment_data_offsets[i + 1]);
   }
-  ASSERT(align16(segment_data_offsets[2] + header->segment_info[2].size) == data.size());
+  if (n_segs == 3) {
+    ASSERT(align16(segment_data_offsets[2] + seg_info_array[2].size) == data.size());
+  }
 
-  // loop over segments (reverse order for now)
-  for (int seg_id = 3; seg_id-- > 0;) {
-    // ?? is this right?
-    if (header->segment_info[seg_id].size == 0)
+  // loop over segments
+  for (int seg_id = n_segs; seg_id-- > 0;) {
+    int segment_size = seg_info_array[seg_id].size;
+    if (segment_size == 0) {
       continue;
+    }
 
-    auto segment_size = header->segment_info[seg_id].size;
-    f.stats.v3_code_bytes += segment_size;
-
-    //    if(gGameVersion == JAK2) {
-    bool adjusted = false;
+    // the decompiler uses 4-byte words, so pad to 4-bytes.
     while (segment_size % 4) {
       segment_size++;
-      adjusted = true;
     }
 
-    if (adjusted) {
-      printf(
-          "Adjusted the size of segment %d in %s, this is fine, but rare (and may indicate a "
-          "bigger problem if it happens often)\n",
-          seg_id, name.c_str());
-    }
-    //    }
-
+    // set up pointers for linker.
     auto base_ptr = segment_data_offsets[seg_id];
     auto data_ptr = base_ptr - 4;
     auto link_ptr = segment_link_offsets[seg_id];
@@ -494,13 +504,15 @@ static void link_v5(LinkedObjectFile& f,
     ASSERT((data_ptr % 4) == 0);
     ASSERT((segment_size % 4) == 0);
 
+    // add data to the decompiler.
     auto code_start = (const uint32_t*)(&data.at(data_ptr + 4));
     auto code_end = ((const uint32_t*)(&data.at(data_ptr + segment_size))) + 1;
     for (auto x = code_start; x < code_end; x++) {
       f.push_back_word_to_segment(*((const uint32_t*)x), seg_id);
     }
-    bool fixing = false;
 
+    // pointer linking.
+    bool fixing = false;
     if (data.at(link_ptr)) {
       // we have pointers
       while (true) {
@@ -517,7 +529,8 @@ static void link_v5(LinkedObjectFile& f,
               if ((old_code >> 24) == 0) {
                 f.stats.v3_word_pointers++;
                 if (!f.pointer_link_word(seg_id, data_ptr - base_ptr, seg_id, old_code)) {
-                  printf("WARNING bad pointer_link_word (2) in %s\n", name.c_str());
+                  // the art groups just have bogus links. we ignored them in jak 2, so do the same
+                  // here. The joint-anim-compressed-control's have a few bogus frames at the end.
                 }
               } else {
                 f.stats.v3_split_pointers++;
@@ -526,12 +539,9 @@ static void link_v5(LinkedObjectFile& f,
                 ASSERT(lo_hi_offset);
                 ASSERT(dest_seg < 3);
                 auto offset_upper = old_code & 0xff;
-                //                ASSERT(offset_upper == 0);
                 uint32_t low_code = *(const uint32_t*)(&data.at(data_ptr + 4 * lo_hi_offset));
                 uint32_t offset = low_code & 0xffff;
                 if (offset_upper) {
-                  // seems to work fine, no need to warn.
-                  //                  printf("WARNING - offset upper is set in %s\n", name.c_str());
                   offset += (offset_upper << 16);
                 }
                 f.pointer_link_split_word(seg_id, data_ptr - base_ptr,
@@ -558,6 +568,7 @@ static void link_v5(LinkedObjectFile& f,
     }
     link_ptr++;
 
+    // symbol linking.
     if (data.at(link_ptr)) {
       auto sub_link_ptr = link_ptr;
 
@@ -604,21 +615,23 @@ static void link_v5(LinkedObjectFile& f,
     segment_link_ends[seg_id] = link_ptr;
   }
 
-  ASSERT(segment_link_offsets[0] == 128);
+  if (n_segs == 3) {
+    ASSERT(segment_link_offsets[0] == 128);
 
-  if (header->segment_info[0].size) {
-    ASSERT(segment_link_ends[0] + 1 == segment_link_offsets[1]);
-  } else {
-    ASSERT(segment_link_offsets[0] + 2 == segment_link_offsets[1]);
+    if (seg_info_array[0].size) {
+      ASSERT(segment_link_ends[0] + 1 == segment_link_offsets[1]);
+    } else {
+      ASSERT(segment_link_offsets[0] + 2 == segment_link_offsets[1]);
+    }
+
+    if (seg_info_array[1].size) {
+      ASSERT(segment_link_ends[1] + 1 == segment_link_offsets[2]);
+    } else {
+      ASSERT(segment_link_offsets[1] + 2 == segment_link_offsets[2]);
+    }
+
+    ASSERT(align16(segment_link_ends[2] + 2) == segment_data_offsets[0]);
   }
-
-  if (header->segment_info[1].size) {
-    ASSERT(segment_link_ends[1] + 1 == segment_link_offsets[2]);
-  } else {
-    ASSERT(segment_link_offsets[1] + 2 == segment_link_offsets[2]);
-  }
-
-  ASSERT(align16(segment_link_ends[2] + 2) == segment_data_offsets[0]);
 }
 
 static void link_v3(LinkedObjectFile& f,
@@ -677,10 +690,8 @@ static void link_v3(LinkedObjectFile& f,
     }
 
     if (game_version == GameVersion::Jak2) {
-      [[maybe_unused]] bool adjusted = false;
       while (segment_size % 4) {
         segment_size++;
-        adjusted = true;
       }
     }
 
diff --git a/decompiler/analysis/analyze_inspect_method.cpp b/decompiler/analysis/analyze_inspect_method.cpp
index d877ba5f8..9ec704a6b 100644
--- a/decompiler/analysis/analyze_inspect_method.cpp
+++ b/decompiler/analysis/analyze_inspect_method.cpp
@@ -294,7 +294,7 @@ struct FieldPrint {
 
 // if a field has a weird inspect, just return the FieldPrint instead of asserting,
 // there's too many edge cases in custom prints to account for all of them
-FieldPrint handle_custom_prints(FieldPrint& fp, const std::string& str) {
+FieldPrint handle_custom_prints(FieldPrint& fp, const std::string& /*str*/) {
   return fp;
 }
 
diff --git a/lsp/transport/stdio.cpp b/lsp/transport/stdio.cpp
index 632284b8d..0c7f2764e 100644
--- a/lsp/transport/stdio.cpp
+++ b/lsp/transport/stdio.cpp
@@ -61,7 +61,7 @@ void MessageBuffer::handle_char(char c) {
     // we reach the length of the body as provided in the Content-Length
     // header.
     auto content_length = std::stoi(m_headers["Content-Length"]);
-    if (m_raw_message.length() == content_length) {
+    if (m_raw_message.length() == (size_t)content_length) {
       m_body = json::parse(m_raw_message);
       m_reading_content = false;
     }