diff --git a/decompiler/CMakeLists.txt b/decompiler/CMakeLists.txt index 91c2051e2..5d4865823 100644 --- a/decompiler/CMakeLists.txt +++ b/decompiler/CMakeLists.txt @@ -34,7 +34,9 @@ add_library( IR2/AtomicOpTypeAnalysis.cpp IR2/cfg_builder.cpp IR2/Env.cpp + IR2/expression_build.cpp IR2/Form.cpp + IR2/FormStack.cpp IR2/reg_usage.cpp IR2/variable_naming.cpp diff --git a/decompiler/Function/Function.h b/decompiler/Function/Function.h index 6d3ca4075..9e78dbabd 100644 --- a/decompiler/Function/Function.h +++ b/decompiler/Function/Function.h @@ -169,6 +169,8 @@ class Function { Env env; FormPool form_pool; Form* top_form = nullptr; + std::string debug_form_string; + bool print_debug_forms = false; } ir2; private: diff --git a/decompiler/IR2/Form.cpp b/decompiler/IR2/Form.cpp index 8b9406d89..5d16b1048 100644 --- a/decompiler/IR2/Form.cpp +++ b/decompiler/IR2/Form.cpp @@ -20,6 +20,18 @@ FormPool::~FormPool() { } } +/////////////////// +// FormElement +/////////////////// + +std::string FormElement::to_string(const Env& env) const { + return to_form(env).print(); +} + +void FormElement::push_to_stack(const Env& env, FormStack&) { + throw std::runtime_error("push_to_stack not implemented for " + to_string(env)); +} + /////////////////// // Form ////////////////// @@ -38,6 +50,10 @@ goos::Object Form::to_form(const Env& env) const { } } +std::string Form::to_string(const Env& env) const { + return to_form(env).print(); +} + void Form::inline_forms(std::vector& forms, const Env& env) const { for (auto& x : m_elements) { forms.push_back(x->to_form(env)); diff --git a/decompiler/IR2/Form.h b/decompiler/IR2/Form.h index 572bc3ac9..8980d8333 100644 --- a/decompiler/IR2/Form.h +++ b/decompiler/IR2/Form.h @@ -11,7 +11,7 @@ namespace decompiler { class Form; class Env; -class IR2_Stack; +class FormStack; /*! * A "FormElement" represents a single LISP form that's not a begin. 
@@ -27,14 +27,10 @@ class FormElement { virtual void apply_form(const std::function& f) = 0; virtual bool is_sequence_point() const { return true; } virtual void collect_vars(VariableSet& vars) const = 0; + std::string to_string(const Env& env) const; - // // push the result of this operation to the operation stack - // // this is used for the forms that aren't last in a multi-form. - // virtual void push_to_stack(const Env& env, IR2_Stack& stack) = 0; - // - // // this is used for the final of a multi-form only. - // // using the current expressions on the stack, simplify myself. - // virtual FormElement* simplify(const Env& env, FormPool& pool, IR2_Stack& stack) = 0; + // push the result of this operation to the operation stack + virtual void push_to_stack(const Env& env, FormStack& stack); protected: friend class Form; @@ -138,6 +134,10 @@ class SetVarElement : public FormElement { bool m_is_sequence_point = true; }; +/*! + * A wrapper around a single AtomicOp. + * The "important" special AtomicOps have their own Form type, like FunctionCallElement. + */ class AtomicOpElement : public FormElement { public: explicit AtomicOpElement(const AtomicOp* op); @@ -150,6 +150,14 @@ class AtomicOpElement : public FormElement { const AtomicOp* m_op; }; +/*! + * A "condition" like (< a b). This can be used as a boolean value directly: (set! a (< b c)) + * or it can be used as a branch condition: (if (< a b)). + * + * In the first case, it can be either a conditional move or actually branching. GOAL seems to use + * the branching when sometimes it could have used the conditional move, and for now, we don't + * care about the difference. + */ class ConditionElement : public FormElement { public: ConditionElement(IR2_Condition::Kind kind, Form* src0, Form* src1); @@ -164,6 +172,9 @@ class ConditionElement : public FormElement { Form* m_src[2] = {nullptr, nullptr}; }; +/*! + * Wrapper around an AtomicOp call. 
+ */ class FunctionCallElement : public FormElement { public: explicit FunctionCallElement(const CallOp* op); @@ -176,6 +187,10 @@ class FunctionCallElement : public FormElement { const CallOp* m_op; }; +/*! + * Wrapper around an AtomicOp branch. These are inserted when directly converting blocks to Form, + * but should be eliminated after the cfg_builder pass completes. + */ class BranchElement : public FormElement { public: explicit BranchElement(const BranchOp* op); @@ -189,6 +204,10 @@ class BranchElement : public FormElement { const BranchOp* m_op; }; +/*! + * Represents a (return-from #f x) form, which immediately returns from the function. + * This always has some "dead code" after it that can't be reached, which is the "dead_code". + */ class ReturnElement : public FormElement { public: Form* return_code = nullptr; @@ -201,6 +220,27 @@ class ReturnElement : public FormElement { void collect_vars(VariableSet& vars) const override; }; +/*! + * Represents a (return-from Lxxx x) form, which returns from a block which ends before the end + * of the function. These are used pretty rarely. As a result, I'm not planning to allow these to + * nest within other expressions. This means that the following code: + * + * (set! x (block my-block + * (if (condition?) + * (return-from my-block 12)) + * 2)) + * + * Would become + * + * (block my-block + * (when (condition?) + * (set! x 12) + * (return-from my-block none)) + * (set! x 2) + * ) + * + * which seems fine to me. + */ class BreakElement : public FormElement { public: Form* return_code = nullptr; @@ -213,6 +253,21 @@ class BreakElement : public FormElement { void collect_vars(VariableSet& vars) const override; }; +/*! + * Condition (cond, if, when, unless) which has an "else" case. + * The condition of the first entry may contain too much and will need to be adjusted later. + * Example: + * + * (set! x 10) + * (if (something?) ... ) + * + * might become + * (if (begin (set! x 10) (something?)) ... 
) + * + * We want to wait until after expressions are built to move the extra stuff up to avoid splitting + * up a complicated expression used as the condition. But this should happen before variable + * scoping. + */ class CondWithElseElement : public FormElement { public: struct Entry { @@ -230,6 +285,14 @@ class CondWithElseElement : public FormElement { void collect_vars(VariableSet& vars) const override; }; +/*! + * An empty element. This is used to fill the body of control forms with nothing in them. + * For example, I believe that (cond ((x y) (else none))) will generate an else case with an + * "empty" and looks different from (cond ((x y))). + * + * We _could_ simplify out the use of empty, but I think it's more "authentic" to leave them in, and + * might give us more clues about how the code was originally written + */ class EmptyElement : public FormElement { public: EmptyElement() = default; @@ -239,6 +302,11 @@ class EmptyElement : public FormElement { void collect_vars(VariableSet& vars) const override; }; +/*! + * Represents a GOAL while loop and more complicated loops which have the "while" format of checking + * the condition before the first loop. This will not include infinite while loops. + * Unlike CondWithElseElement, this will correctly identify the start and end of the condition. + */ class WhileElement : public FormElement { public: WhileElement(Form* _condition, Form* _body) : condition(_condition), body(_body) {} @@ -251,6 +319,11 @@ class WhileElement : public FormElement { bool cleaned = false; }; +/*! + * Represents a GOAL until loop and more complicated loops which use the "until" format of checking + * the condition after the first iteration. Has the same limitation as CondWithElseElement for the + * condition. 
+ */ class UntilElement : public FormElement { public: UntilElement(Form* _condition, Form* _body) : condition(_condition), body(_body) {} @@ -262,6 +335,11 @@ class UntilElement : public FormElement { Form* body = nullptr; }; +/*! + * Represents a GOAL short-circuit expression, either AND or OR. + * The first "element" in ShortCircuitElement may be too large, see the comment on + * CondWithElseElement + */ class ShortCircuitElement : public FormElement { public: struct Entry { @@ -286,6 +364,11 @@ class ShortCircuitElement : public FormElement { void collect_vars(VariableSet& vars) const override; }; +/*! + * Represents a GOAL cond/if/when/unless statement which does not have an explicit else case. The + * compiler will then move #f into the result register in the delay slot. The first condition may be + * too large at first, see CondWithElseElement + */ class CondNoElseElement : public FormElement { public: struct Entry { @@ -305,6 +388,9 @@ class CondNoElseElement : public FormElement { void collect_vars(VariableSet& vars) const override; }; +/*! + * Represents a (abs x) expression. + */ class AbsElement : public FormElement { public: explicit AbsElement(Form* _source); @@ -315,6 +401,11 @@ class AbsElement : public FormElement { Form* source = nullptr; }; +/*! + * Represents an (ash x y) expression. There is also an "unsigned" version of this using logical + * shifts. This only recognizes the fancy version where the shift amount isn't known at compile time + * and the compiler emits code that branches depending on the sign of the shift amount. + */ class AshElement : public FormElement { public: Form* shift_amount = nullptr; @@ -328,6 +419,10 @@ class AshElement : public FormElement { void collect_vars(VariableSet& vars) const override; }; +/*! + * Represents a form which gets the runtime type of a boxed object. This is for the most general + * "object" case where we check for pair, binteger, or basic and there's actually branching. 
+ */ class TypeOfElement : public FormElement { public: Form* value; @@ -339,6 +434,24 @@ class TypeOfElement : public FormElement { void collect_vars(VariableSet& vars) const override; }; +/*! + * Represents an unpaired cmove #f. GOAL may emit code like + * (set! x #t) + * (... evaluate something) + * (cmov x y #f) + * where the stuff in between is potentially very large. + * GOAL has no "conditional move" keyword available to the programmer - this would only happen when + * doing something like (set! x (zero? y)), in the code for creating a GOAL boolean. + * + * Code like (if x (set! y z)) will branch, the compiler isn't smart enough to use movn/movz here. + * + * These cannot be compacted into a single form until expression building, so we leave these + * placeholders in. + * + * Note - some conditionals put the (set! x #t) immediately before the cmove, but not all. Those + * that do will be correctly recognized and will be a ConditionElement. zero! seems to be the most + * common one that's split, and it happens reasonably often, so I will try to actually correct it. + */ class ConditionalMoveFalseElement : public FormElement { public: Variable dest; @@ -351,6 +464,37 @@ class ConditionalMoveFalseElement : public FormElement { void collect_vars(VariableSet& vars) const override; }; +///*! +// * A GenericOperator is the head of a GenericElement. +// * It is used for the final output. 
+// */ +// class GenericOperator { +// public: +// enum class Kind { +// FIXED_FUNCTION_CALL, +// VAR_FUNCTION_CALL, +// FIXED_OPERATOR +// }; +// +// private: +// // if we're a VAR_FUNCTION_CALL, this should contain the expression to get the function +// Form* m_function_val; +// +// //std::string +// +//}; +// +// class GenericElement : public FormElement { +// public: +// goos::Object to_form(const Env& env) const override; +// void apply(const std::function& f) override; +// void apply_form(const std::function& f) override; +// void collect_vars(VariableSet& vars) const override; +// private: +// GenericOperator m_head; +// std::vector m_elts; +//}; + /*! * A Form is a wrapper around one or more FormElements. * This is done for two reasons: @@ -401,9 +545,15 @@ class Form { const std::vector& elts() const { return m_elements; } std::vector& elts() { return m_elements; } - void push_back(FormElement* elt) { m_elements.push_back(elt); } + void push_back(FormElement* elt) { + elt->parent_form = this; + m_elements.push_back(elt); + } + + void clear() { m_elements.clear(); } goos::Object to_form(const Env& env) const; + std::string to_string(const Env& env) const; void inline_forms(std::vector& forms, const Env& env) const; void apply(const std::function& f); void apply_form(const std::function& f); diff --git a/decompiler/IR2/FormStack.cpp b/decompiler/IR2/FormStack.cpp new file mode 100644 index 000000000..ecabb02b5 --- /dev/null +++ b/decompiler/IR2/FormStack.cpp @@ -0,0 +1,91 @@ +#include "FormStack.h" +#include "Form.h" + +namespace decompiler { +std::string FormStack::StackEntry::print(const Env& env) const { + if (destination.has_value()) { + assert(source && !elt); + return fmt::format("d: {} s: {} | {} <- {}", active, sequence_point, + destination.value().reg().to_charp(), source->to_string(env)); + } else { + assert(elt && !source); + return fmt::format("d: {} s: {} | {}", active, sequence_point, elt->to_string(env)); + } +} + +std::string 
FormStack::print(const Env& env) const { + std::string result; + for (const auto& x : m_stack) { + result += x.print(env); + result += '\n'; + } + return result; +} + +void FormStack::push_value_to_reg(Variable var, Form* value, bool sequence_point) { + StackEntry entry; + entry.active = true; // by default, we should display everything! + entry.sequence_point = sequence_point; + entry.destination = var; + entry.source = value; + m_stack.push_back(entry); +} + +bool FormStack::is_single_expression() const { + int count = 0; + for (const auto& e : m_stack) { + if (e.active) { + count++; + } + } + return count == 1; +} + +void FormStack::push_form_element(FormElement* elt, bool sequence_point) { + StackEntry entry; + entry.active = true; + entry.elt = elt; + entry.sequence_point = sequence_point; + m_stack.push_back(entry); +} + +Form* FormStack::pop_reg(const Variable& var) { + for (size_t i = m_stack.size(); i-- > 0;) { + auto& entry = m_stack.at(i); + if (entry.active) { + if (entry.destination == var) { + entry.active = false; + assert(entry.source); + return entry.source; + } else { + // we didn't match + if (entry.sequence_point) { + // and it's a sequence point! can't look any more back than this. + return nullptr; + } + } + } + } + // we didn't have it... 
+ return nullptr; +} + +std::vector FormStack::rewrite(FormPool& pool) { + std::vector result; + + for (auto& e : m_stack) { + if (!e.active) { + continue; + } + + if (e.destination.has_value()) { + auto elt = pool.alloc_element(*e.destination, e.source, e.sequence_point); + e.source->parent_element = elt; + result.push_back(elt); + } else { + result.push_back(e.elt); + } + } + return result; +} +} // namespace decompiler \ No newline at end of file diff --git a/decompiler/IR2/FormStack.h b/decompiler/IR2/FormStack.h new file mode 100644 index 000000000..fa03a2bd1 --- /dev/null +++ b/decompiler/IR2/FormStack.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include "decompiler/Disasm/Register.h" +#include "decompiler/IR2/AtomicOp.h" + +namespace decompiler { +class Form; +/*! + * A FormStack is used to track partial expressions when rebuilding the tree structure of + * GOAL code. Linear sequences of operations are added onto the expression stack. + */ +class FormStack { + public: + FormStack() = default; + void push_value_to_reg(Variable var, Form* value, bool sequence_point); + void push_form_element(FormElement* elt, bool sequence_point); + Form* pop_reg(const Variable& var); // nullptr if not found or blocked by a sequence point + bool is_single_expression() const; // true when exactly one entry is still active + std::vector rewrite(FormPool& pool); + std::string print(const Env& env) const; // one text line per stack entry + + private: + struct StackEntry { + bool active = true; // should this appear in the output? + std::optional destination; // what register we are setting (or nullopt if no dest.) + Form* source = nullptr; // the value we are setting the register to. 
+ + FormElement* elt = nullptr; + bool sequence_point = false; + TP_Type type; + std::string print(const Env& env) const; + }; + std::vector m_stack; +}; +} // namespace decompiler diff --git a/decompiler/IR2/expression_build.cpp b/decompiler/IR2/expression_build.cpp new file mode 100644 index 000000000..43f1a4c36 --- /dev/null +++ b/decompiler/IR2/expression_build.cpp @@ -0,0 +1,27 @@ +#include "expression_build.h" +#include "decompiler/Function/Function.h" +#include "decompiler/IR2/Form.h" +#include "decompiler/IR2/FormStack.h" + +namespace decompiler { +bool convert_to_expressions(Form* top_level_form, FormPool& pool, const Function& f) { + assert(top_level_form); + + try { // push_to_stack may throw for elements that do not implement it + top_level_form->apply_form([&](Form* form) { + FormStack stack; + for (auto& entry : form->elts()) { + entry->push_to_stack(f.ir2.env, stack); + } + auto new_entries = stack.rewrite(pool); // only the still-active stack entries + form->clear(); + for (auto x : new_entries) { + form->push_back(x); + } + }); + } catch (const std::exception&) { // error is dropped on purpose; caller only gets success/failure + return false; + } + return true; +} +} // namespace decompiler diff --git a/decompiler/IR2/expression_build.h b/decompiler/IR2/expression_build.h new file mode 100644 index 000000000..227489a9a --- /dev/null +++ b/decompiler/IR2/expression_build.h @@ -0,0 +1,8 @@ +#pragma once + +namespace decompiler { +class Form; +class Function; +class FormPool; +bool convert_to_expressions(Form* top_level_form, FormPool& pool, const Function& f); +} // namespace decompiler \ No newline at end of file diff --git a/decompiler/ObjectFile/ObjectFileDB.h b/decompiler/ObjectFile/ObjectFileDB.h index c6e50eeae..a05f641ab 100644 --- a/decompiler/ObjectFile/ObjectFileDB.h +++ b/decompiler/ObjectFile/ObjectFileDB.h @@ -74,6 +74,8 @@ class ObjectFileDB { void ir2_register_usage_pass(); void ir2_variable_pass(); void ir2_cfg_build_pass(); + void ir2_store_current_forms(); + void ir2_build_expressions(); void ir2_write_results(const std::string& output_dir); std::string 
ir2_function_to_string(ObjectFileData& data, Function& function, int seg); diff --git a/decompiler/ObjectFile/ObjectFileDB_IR2.cpp b/decompiler/ObjectFile/ObjectFileDB_IR2.cpp index 22d80a779..07f72ccff 100644 --- a/decompiler/ObjectFile/ObjectFileDB_IR2.cpp +++ b/decompiler/ObjectFile/ObjectFileDB_IR2.cpp @@ -11,6 +11,7 @@ #include "decompiler/IR2/reg_usage.h" #include "decompiler/IR2/variable_naming.h" #include "decompiler/IR2/cfg_builder.h" +#include "decompiler/IR2/expression_build.h" #include "common/goos/PrettyPrinter.h" namespace decompiler { @@ -34,8 +35,12 @@ void ObjectFileDB::analyze_functions_ir2(const std::string& output_dir) { ir2_register_usage_pass(); lg::info("Variable analysis..."); ir2_variable_pass(); - lg::info("Initial conversion to Form..."); + lg::info("Initial structuring.."); ir2_cfg_build_pass(); + lg::info("Storing temporary form result..."); + ir2_store_current_forms(); + lg::info("Expression building..."); + ir2_build_expressions(); lg::info("Writing results..."); ir2_write_results(output_dir); } @@ -349,6 +354,45 @@ void ObjectFileDB::ir2_cfg_build_pass() { lg::info("{}/{}/{} cfg build in {:.2f} ms\n", successful, attempted, total, timer.getMs()); } +void ObjectFileDB::ir2_store_current_forms() { + Timer timer; + int total = 0; + + for_each_function_def_order([&](Function& func, int segment_id, ObjectFileData& data) { + (void)segment_id; + (void)data; + + if (func.ir2.top_form) { + total++; + func.ir2.debug_form_string = + pretty_print::to_string(func.ir2.top_form->to_form(func.ir2.env)); + } + }); + + lg::info("Stored debug forms for {} functions in {:.2f} ms\n", total, timer.getMs()); +} + +void ObjectFileDB::ir2_build_expressions() { + Timer timer; + int total = 0; + int attempted = 0; + int successful = 0; + for_each_function_def_order([&](Function& func, int segment_id, ObjectFileData& data) { + (void)segment_id; + (void)data; + total++; + if (func.ir2.top_form) { + attempted++; + if (convert_to_expressions(func.ir2.top_form, 
func.ir2.form_pool, func)) { + successful++; + func.ir2.print_debug_forms = true; + } + } + }); + + lg::info("{}/{}/{} expression build in {:.2f} ms\n", successful, attempted, total, timer.getMs()); +} + void ObjectFileDB::ir2_write_results(const std::string& output_dir) { Timer timer; lg::info("Writing IR2 results to file..."); @@ -388,6 +432,12 @@ std::string ObjectFileDB::ir2_to_file(ObjectFileData& data) { result += pretty_print::to_string(func.ir2.top_form->to_form(func.ir2.env)); result += '\n'; } + + if (func.ir2.print_debug_forms) { + result += '\n'; + result += func.ir2.debug_form_string; + result += '\n'; + } } // print data