From cfce5e5916d55e21dd7da96a97a908aa0c9e767f Mon Sep 17 00:00:00 2001 From: water111 <48171810+water111@users.noreply.github.com> Date: Tue, 10 Oct 2023 18:56:03 -0700 Subject: [PATCH] [decompiler] Support v5 data file link data (#3076) Fix the implementation of `link_v5` so it works on "data" files for jak 3. --- common/audio/audio_formats.cpp | 5 +- common/goos/Interpreter.cpp | 2 +- common/goos/Object.cpp | 1 - decompiler/ObjectFile/LinkedObjectFile.cpp | 7 +- .../ObjectFile/LinkedObjectFileCreation.cpp | 147 ++++++++++-------- .../analysis/analyze_inspect_method.cpp | 2 +- lsp/transport/stdio.cpp | 2 +- 7 files changed, 89 insertions(+), 77 deletions(-) diff --git a/common/audio/audio_formats.cpp b/common/audio/audio_formats.cpp index 83201b65f..25cdbf61d 100644 --- a/common/audio/audio_formats.cpp +++ b/common/audio/audio_formats.cpp @@ -43,7 +43,7 @@ void write_wave_file(const std::vector& left_samples, writer.add(sample); } } else { - for (int i = 0; i < left_samples.size(); i++) { + for (size_t i = 0; i < left_samples.size(); i++) { writer.add(left_samples.at(i)); if (i < right_samples.size()) { writer.add(right_samples.at(i)); @@ -86,8 +86,9 @@ std::pair, std::vector> decode_adpcm(BinaryReader& reader, u8 shift = shift_filter & 0b1111; u8 filter = shift_filter >> 4; u8 flags = reader.read(); + (void)flags; - // removed assertions here + // removed assertions here (and that's probably why the audio doesn't sound right) u8 input_buffer[14]; diff --git a/common/goos/Interpreter.cpp b/common/goos/Interpreter.cpp index b7d50df30..b6c627a6e 100644 --- a/common/goos/Interpreter.cpp +++ b/common/goos/Interpreter.cpp @@ -521,7 +521,7 @@ void Interpreter::vararg_check( /*! * Evaluate a list and return the result of the last evaluation. 
*/ -Object Interpreter::eval_list_return_last(const Object& form, +Object Interpreter::eval_list_return_last(const Object& /*form*/, Object rest, const std::shared_ptr& env) { if (rest.is_empty_list()) { diff --git a/common/goos/Object.cpp b/common/goos/Object.cpp index 9bb5b3a2a..2d9119478 100644 --- a/common/goos/Object.cpp +++ b/common/goos/Object.cpp @@ -66,7 +66,6 @@ SymbolTable::~SymbolTable() { } InternedSymbolPtr SymbolTable::intern(const char* str) { - InternedSymbolPtr result; size_t string_len = strlen(str); u32 hash = crc32((const u8*)str, string_len); diff --git a/decompiler/ObjectFile/LinkedObjectFile.cpp b/decompiler/ObjectFile/LinkedObjectFile.cpp index 531fcfc2f..fb431b2ea 100644 --- a/decompiler/ObjectFile/LinkedObjectFile.cpp +++ b/decompiler/ObjectFile/LinkedObjectFile.cpp @@ -113,7 +113,7 @@ Function* LinkedObjectFile::try_get_function_at_label(int label_id) { Function* LinkedObjectFile::try_get_function_at_label(const DecompilerLabel& label) { for (auto& func : functions_by_seg.at(label.target_segment)) { - // + 4 to skip past type tag to the first word, which is were the label points. + // + 4 to skip past type tag to the first word, which is where the label points. if (func.start_word * 4 + 4 == label.offset) { return &func; } @@ -128,7 +128,7 @@ const Function* LinkedObjectFile::try_get_function_at_label(int label_id) const const Function* LinkedObjectFile::try_get_function_at_label(const DecompilerLabel& label) const { for (auto& func : functions_by_seg.at(label.target_segment)) { - // + 4 to skip past type tag to the first word, which is were the label points. + // + 4 to skip past type tag to the first word, which is where the label points. 
if (func.start_word * 4 + 4 == label.offset) { return &func; } @@ -156,7 +156,8 @@ bool LinkedObjectFile::pointer_link_word(int source_segment, ASSERT(word.kind() == LinkedWord::PLAIN_DATA); if (dest_offset / 4 > (int)words_by_seg.at(dest_segment).size()) { - // printf("HACK bad link ignored!\n"); + // printf("HACK bad link ignored src %d, %d vs %d!\n", source_offset, dest_offset / 4, + // int(words_by_seg.at(dest_segment).size())); return false; } ASSERT(dest_offset / 4 <= (int)words_by_seg.at(dest_segment).size()); diff --git a/decompiler/ObjectFile/LinkedObjectFileCreation.cpp b/decompiler/ObjectFile/LinkedObjectFileCreation.cpp index 9312f0e3d..62c4736cd 100644 --- a/decompiler/ObjectFile/LinkedObjectFileCreation.cpp +++ b/decompiler/ObjectFile/LinkedObjectFileCreation.cpp @@ -55,11 +55,10 @@ struct LinkHeaderV5 { uint32_t length_to_get_to_code; // 4 length.. of link data? uint16_t version; // 8 uint16_t unknown; // 10 - uint32_t pad; // 12 + uint32_t length_to_get_to_link; // 12 uint32_t link_length; // 16 uint8_t n_segments; // 20 char name[59]; // 21 (really??) - SegmentInfo segment_info[3]; }; // The types of symbol links @@ -421,72 +420,83 @@ static void link_v5(LinkedObjectFile& f, const std::string& name, DecompilerTypeSystem& dts) { auto header = (const LinkHeaderV5*)(&data.at(0)); - if (header->n_segments == 1) { - printf("abandon %s!\n", name.c_str()); - return; + + // for jak 3, both code and data use a "v5" format for linking. + // code has 3 segments (top-level, main, debug), and data has just 1. + // they appear to be generated by different programs, so there's some hard-coded checks for + // each. + + // the "v5" format allows for multiple segments (like v3), "split-pointer" linking to support + // splitting a pointer link between a lui/ori instruction (needed for code), but uses "v2" + // symbol linking. For a reason that I don't understand, "v3" symlinks uses a less-space efficient + // encoding of large integers. 
+ static_assert(0x50 == sizeof(LinkHeaderV5)); + + if (header->n_segments == 3) { + ASSERT(header->type_tag == 0); + ASSERT(name == header->name); + // the linker for code placed the link data at the beginning. + // but we expect the link data to start just after the object file header + ASSERT(header->length_to_get_to_link == sizeof(LinkHeaderV5)); + // and then the code should come after that + ASSERT(header->length_to_get_to_code == sizeof(LinkHeaderV5) + header->link_length); + } else if (header->n_segments == 1) { + ASSERT(header->type_tag == UINT32_MAX); + // name is inconsistent, so don't check it + // data files have the data first, which is good, as the last object in a DGO gets loaded + // directly to the heap, and putting the data first means that we can "free" the link data just + // by bumping the heap pointer back, rather than memcpy the code back to cover the hole if link + // data came first. + // the offset is always 0x80, which is bigger than the header, but is needed to make data + // aligned with the PS2's cache line size (64 bytes), which makes sense. + ASSERT(header->length_to_get_to_code == 0x80); + } else { + lg::die("bad segment count {}", header->n_segments); } - ASSERT(header->type_tag == 0); - ASSERT(name == header->name); - ASSERT(header->n_segments == 3); - ASSERT(header->pad == 0x50); - ASSERT(header->length_to_get_to_code - header->link_length == 0x50); - - f.set_segment_count(3); - - // link v3's data size is data.size() - link_length - // link v5's data size is data.size() - new_link_length - 0x50. - - // lbp + 4 points to version? - // lbp points to 4 past start of header. - - // lbp[1] = version + unknown 16 bit thing. - // lbp[3] = link block length (minus 0x50) + f.set_segment_count(header->n_segments); // todo - check this against the code size we actually got. 
// size_t expected_code_size = data.size() - (header->link_length + 0x50); - uint32_t data_ptr_offset = header->length_to_get_to_code; + const int n_segs = header->n_segments; + // the first thing in the link data is the segment info array, which we need to find stuff. + const SegmentInfo* seg_info_array = + (const SegmentInfo*)(data.data() + header->length_to_get_to_link); + + // for convenience, we'll find the data/link offsets for each segment. uint32_t segment_data_offsets[3]; uint32_t segment_link_offsets[3]; - uint32_t segment_link_ends[3]; - for (int i = 0; i < 3; i++) { - segment_data_offsets[i] = data_ptr_offset + header->segment_info[i].data; - segment_link_offsets[i] = header->segment_info[i].relocs + 0x50; - ASSERT(header->segment_info[i].magic == 1); + uint32_t segment_link_ends[3]; // set in linking, once we get to the end. + for (int i = 0; i < n_segs; i++) { + segment_data_offsets[i] = header->length_to_get_to_code + seg_info_array[i].data; + segment_link_offsets[i] = header->length_to_get_to_link + seg_info_array[i].relocs; + ASSERT(seg_info_array[i].magic == 1); } // check that the data region is filled - for (int i = 0; i < 2; i++) { - ASSERT(align16(segment_data_offsets[i] + header->segment_info[i].size) == + for (int i = 0; i < n_segs - 1; i++) { + ASSERT(align16(segment_data_offsets[i] + seg_info_array[i].size) == segment_data_offsets[i + 1]); } - ASSERT(align16(segment_data_offsets[2] + header->segment_info[2].size) == data.size()); + if (n_segs == 3) { + ASSERT(align16(segment_data_offsets[2] + seg_info_array[2].size) == data.size()); + } - // loop over segments (reverse order for now) - for (int seg_id = 3; seg_id-- > 0;) { - // ?? is this right? 
- if (header->segment_info[seg_id].size == 0) + // loop over segments + for (int seg_id = n_segs; seg_id-- > 0;) { + int segment_size = seg_info_array[seg_id].size; + if (segment_size == 0) { continue; + } - auto segment_size = header->segment_info[seg_id].size; - f.stats.v3_code_bytes += segment_size; - - // if(gGameVersion == JAK2) { - bool adjusted = false; + // the decompiler uses 4-byte words, so pad to 4-bytes. while (segment_size % 4) { segment_size++; - adjusted = true; } - if (adjusted) { - printf( - "Adjusted the size of segment %d in %s, this is fine, but rare (and may indicate a " - "bigger problem if it happens often)\n", - seg_id, name.c_str()); - } - // } - + // set up pointers for linker. auto base_ptr = segment_data_offsets[seg_id]; auto data_ptr = base_ptr - 4; auto link_ptr = segment_link_offsets[seg_id]; @@ -494,13 +504,15 @@ static void link_v5(LinkedObjectFile& f, ASSERT((data_ptr % 4) == 0); ASSERT((segment_size % 4) == 0); + // add data to the decompiler. auto code_start = (const uint32_t*)(&data.at(data_ptr + 4)); auto code_end = ((const uint32_t*)(&data.at(data_ptr + segment_size))) + 1; for (auto x = code_start; x < code_end; x++) { f.push_back_word_to_segment(*((const uint32_t*)x), seg_id); } - bool fixing = false; + // pointer linking. + bool fixing = false; if (data.at(link_ptr)) { // we have pointers while (true) { @@ -517,7 +529,8 @@ static void link_v5(LinkedObjectFile& f, if ((old_code >> 24) == 0) { f.stats.v3_word_pointers++; if (!f.pointer_link_word(seg_id, data_ptr - base_ptr, seg_id, old_code)) { - printf("WARNING bad pointer_link_word (2) in %s\n", name.c_str()); + // the art groups just have bogus links. we ignored them in jak 2, so do the same + // here. The joint-anim-compressed-control's have a few bogus frames at the end. 
} } else { f.stats.v3_split_pointers++; @@ -526,12 +539,9 @@ static void link_v5(LinkedObjectFile& f, ASSERT(lo_hi_offset); ASSERT(dest_seg < 3); auto offset_upper = old_code & 0xff; - // ASSERT(offset_upper == 0); uint32_t low_code = *(const uint32_t*)(&data.at(data_ptr + 4 * lo_hi_offset)); uint32_t offset = low_code & 0xffff; if (offset_upper) { - // seems to work fine, no need to warn. - // printf("WARNING - offset upper is set in %s\n", name.c_str()); offset += (offset_upper << 16); } f.pointer_link_split_word(seg_id, data_ptr - base_ptr, @@ -558,6 +568,7 @@ static void link_v5(LinkedObjectFile& f, } link_ptr++; + // symbol linking. if (data.at(link_ptr)) { auto sub_link_ptr = link_ptr; @@ -604,21 +615,23 @@ static void link_v5(LinkedObjectFile& f, segment_link_ends[seg_id] = link_ptr; } - ASSERT(segment_link_offsets[0] == 128); + if (n_segs == 3) { + ASSERT(segment_link_offsets[0] == 128); - if (header->segment_info[0].size) { - ASSERT(segment_link_ends[0] + 1 == segment_link_offsets[1]); - } else { - ASSERT(segment_link_offsets[0] + 2 == segment_link_offsets[1]); + if (seg_info_array[0].size) { + ASSERT(segment_link_ends[0] + 1 == segment_link_offsets[1]); + } else { + ASSERT(segment_link_offsets[0] + 2 == segment_link_offsets[1]); + } + + if (seg_info_array[1].size) { + ASSERT(segment_link_ends[1] + 1 == segment_link_offsets[2]); + } else { + ASSERT(segment_link_offsets[1] + 2 == segment_link_offsets[2]); + } + + ASSERT(align16(segment_link_ends[2] + 2) == segment_data_offsets[0]); } - - if (header->segment_info[1].size) { - ASSERT(segment_link_ends[1] + 1 == segment_link_offsets[2]); - } else { - ASSERT(segment_link_offsets[1] + 2 == segment_link_offsets[2]); - } - - ASSERT(align16(segment_link_ends[2] + 2) == segment_data_offsets[0]); } static void link_v3(LinkedObjectFile& f, @@ -677,10 +690,8 @@ static void link_v3(LinkedObjectFile& f, } if (game_version == GameVersion::Jak2) { - [[maybe_unused]] bool adjusted = false; while (segment_size % 4) { 
segment_size++; - adjusted = true; } } diff --git a/decompiler/analysis/analyze_inspect_method.cpp b/decompiler/analysis/analyze_inspect_method.cpp index d877ba5f8..9ec704a6b 100644 --- a/decompiler/analysis/analyze_inspect_method.cpp +++ b/decompiler/analysis/analyze_inspect_method.cpp @@ -294,7 +294,7 @@ struct FieldPrint { // if a field has a weird inspect, just return the FieldPrint instead of asserting, // there's too many edge cases in custom prints to account for all of them -FieldPrint handle_custom_prints(FieldPrint& fp, const std::string& str) { +FieldPrint handle_custom_prints(FieldPrint& fp, const std::string& /*str*/) { return fp; } diff --git a/lsp/transport/stdio.cpp b/lsp/transport/stdio.cpp index 632284b8d..0c7f2764e 100644 --- a/lsp/transport/stdio.cpp +++ b/lsp/transport/stdio.cpp @@ -61,7 +61,7 @@ void MessageBuffer::handle_char(char c) { // we reach the length of the body as provided in the Content-Length // header. auto content_length = std::stoi(m_headers["Content-Length"]); - if (m_raw_message.length() == content_length) { + if (m_raw_message.length() == (size_t)content_length) { m_body = json::parse(m_raw_message); m_reading_content = false; }