jak-project/common/formatter/formatter_tree.cpp
Tyler Wilding c162c66118
g/j1: Cleanup all main issues in the formatter and format all of goal_src/jak1 (#3535)
This PR does two main things:
1. Work through the main low-hanging fruit issues in the formatter
keeping it from feeling mature and usable
2. Iterate and prove that point by formatting all of the Jak 1 code
base. **This has removed around 100K lines in total.**
- The decompiler will now format it's results for jak 1 to keep things
from drifting back to where they were. This is controlled by a new
config flag `format_code`.

How am I confident this hasn't broken anything?:
- I compiled the entire project and stored it's `out/jak1/obj` files
separately
- I then recompiled the project after formatting and wrote a script that
md5's each file and compares it (`compare-compilation-outputs.py`
- The results (eventually) were the same:

![Screenshot 2024-05-25
132900](https://github.com/open-goal/jak-project/assets/13153231/015e6f20-8d19-49b7-9951-97fa88ddc6c2)
> This proves that the only difference before and after is non-critical
whitespace for all code/macros that is actually in use.

I'm still aware of improvements that could be made to the formatter, as
well as general optimization of it's performance. But in general these
are for rare or non-critical situations in my opinion and I'll work
through them before doing Jak 2. The vast majority looks great and is
working properly at this point. Those known issues are the following if
you are curious:

![image](https://github.com/open-goal/jak-project/assets/13153231/0edfaba1-6d36-40f5-ab23-0642209867c4)
2024-06-05 22:17:31 -04:00

178 lines
6.5 KiB
C++

#include "formatter_tree.h"
#include "common/util/string_util.h"
#include "fmt/core.h"
std::string get_source_code(const std::string& source, const TSNode& node) {
uint32_t start = ts_node_start_byte(node);
uint32_t end = ts_node_end_byte(node);
return source.substr(start, end - start);
}
int num_blank_lines_following_node(const std::string& source, const TSNode& node) {
int num_lines = -1; // The first new-line encountered is not a blank line
uint32_t cursor = ts_node_end_byte(node);
// TODO - this breaks on lines with whitespace as well, should probably seek past that!
while (cursor < source.length() && source.at(cursor) == '\n') {
num_lines++;
cursor++;
}
return num_lines;
}
// Check if the original source only has whitespace up to a new-line before it's token
bool node_preceeded_by_only_whitespace(const std::string& source, const TSNode& node) {
// NOTE - this returns incorrectly because we skip brackets in lists, we'll see if that matters
int32_t pos = ts_node_start_byte(node) - 1;
while (pos > 0) {
const auto& c = source.at(pos);
if (c == '\n') {
return true;
} else if (c == ' ' || c == '\t') {
pos--;
continue;
}
return false;
}
return true;
}
FormatterTreeNode::FormatterTreeNode(const std::string& source, const TSNode& node)
: token(get_source_code(source, node)) {
metadata.node_type = ts_node_type(node);
metadata.is_comment = str_util::starts_with(str_util::ltrim(token.value()), ";") ||
str_util::starts_with(str_util::ltrim(token.value()), "#|");
// Set any metadata based on the value of the token
metadata.num_blank_lines_following = num_blank_lines_following_node(source, node);
metadata.is_inline = !node_preceeded_by_only_whitespace(source, node);
};
// Check if the original source only has whitespace up to a new-line after it's token
bool node_followed_by_only_whitespace(const std::string& source, const TSNode& node) {
uint32_t pos = ts_node_end_byte(node);
while (pos < source.length()) {
const auto& c = source.at(pos);
if (c == '\n') {
return true;
} else if (c == ' ' || c == '\t') {
pos++;
continue;
}
return false;
}
return true;
}
bool nodes_on_same_line(const std::string& source, const TSNode& n1, const TSNode& n2) {
// Get the source between the two lines, if there are any new-lines, the answer is NO
uint32_t start = ts_node_start_byte(n1);
uint32_t end = ts_node_end_byte(n2);
const auto code_between = source.substr(start, end - start);
return !str_util::contains(code_between, "\n");
}
FormatterTree::FormatterTree(const std::string& source, const TSNode& root_node) {
root = FormatterTreeNode();
root.metadata.is_top_level = true;
construct_formatter_tree_recursive(source, root_node, root);
}
const std::unordered_map<std::string, std::vector<std::string>> node_type_ignorable_contents = {
{"list_lit", {"(", ")"}}};
// TODO make an imperative version eventually
// TODO - cleanup duplication
void FormatterTree::construct_formatter_tree_recursive(const std::string& source,
TSNode curr_node,
FormatterTreeNode& tree_node,
std::optional<std::string> node_prefix) {
if (ts_node_child_count(curr_node) == 0) {
auto new_node = FormatterTreeNode(source, curr_node);
new_node.node_prefix = node_prefix;
tree_node.refs.push_back(new_node);
return;
}
const std::string curr_node_type = ts_node_type(curr_node);
FormatterTreeNode list_node;
std::optional<std::string> next_node_prefix;
if (curr_node_type == "list_lit") {
list_node = FormatterTreeNode();
} else if (curr_node_type == "str_lit") {
// Strings are a special case, they are literals and essentially tokens but the grammar can
// detect formatter identifiers, this is useful for semantic highlighting but doesn't matter for
// formatting So for strings, we treat them as if they should be a single token
tree_node.refs.push_back(FormatterTreeNode(source, curr_node));
return;
} else if (curr_node_type == "quoting_lit") {
if (node_prefix) {
node_prefix.value() += "'";
} else {
node_prefix = "'";
}
construct_formatter_tree_recursive(source, ts_node_child(curr_node, 1), tree_node, node_prefix);
return;
} else if (curr_node_type == "unquoting_lit") {
if (node_prefix) {
node_prefix.value() += ",";
} else {
node_prefix = ",";
}
construct_formatter_tree_recursive(source, ts_node_child(curr_node, 1), tree_node, node_prefix);
return;
} else if (curr_node_type == "quasi_quoting_lit") {
if (node_prefix) {
node_prefix.value() += "`";
} else {
node_prefix = "`";
}
construct_formatter_tree_recursive(source, ts_node_child(curr_node, 1), tree_node, node_prefix);
return;
} else if (curr_node_type == "unquote_splicing_lit") {
if (node_prefix) {
node_prefix.value() += ",@";
} else {
node_prefix = ",@";
}
construct_formatter_tree_recursive(source, ts_node_child(curr_node, 1), tree_node, node_prefix);
return;
}
std::vector<std::string> skippable_nodes = {};
if (node_type_ignorable_contents.find(curr_node_type) != node_type_ignorable_contents.end()) {
skippable_nodes = node_type_ignorable_contents.at(curr_node_type);
}
for (size_t i = 0; i < ts_node_child_count(curr_node); i++) {
const auto child_node = ts_node_child(curr_node, i);
auto debug_child = ts_node_string(child_node);
const auto contents = get_source_code(source, child_node);
bool skip_node = false;
for (const auto& skippable_content : skippable_nodes) {
if (skippable_content == contents) {
skip_node = true;
break;
}
}
if (skip_node) {
continue;
}
if (curr_node_type == "list_lit") {
construct_formatter_tree_recursive(source, child_node, list_node, {});
if (node_prefix) {
list_node.node_prefix = node_prefix;
}
} else {
construct_formatter_tree_recursive(source, child_node, tree_node, node_prefix);
if (node_prefix && !tree_node.refs.empty()) {
tree_node.refs.at(tree_node.refs.size() - 1).node_prefix = node_prefix;
}
}
}
if (curr_node_type == "list_lit") {
// special case for empty lists
if (node_prefix && !list_node.node_prefix) {
list_node.node_prefix = node_prefix;
}
tree_node.refs.push_back(list_node);
}
}