formatter: initial and basic indentation/alignment and expose the formatting via the LSP (#2673)

This commit is contained in:
Tyler Wilding 2023-05-28 12:22:00 -05:00 committed by GitHub
parent ad0b3297ca
commit 4c6982b0ec
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 447 additions and 166 deletions

View file

@ -33,6 +33,7 @@ add_library(common
dma/dma.cpp
dma/gs.cpp
formatter/formatter.cpp
formatter/formatter_tree.cpp
global_profiler/GlobalProfiler.cpp
goos/Interpreter.cpp
goos/Object.cpp
@ -78,7 +79,7 @@ add_library(common
util/Timer.cpp
util/unicode_util.cpp
versions/versions.cpp
)
)
target_link_libraries(common fmt lzokay replxx libzstd_static tree-sitter)

View file

@ -1,5 +1,7 @@
#include "formatter.h"
#include "formatter_tree.h"
#include "common/util/FileUtil.h"
#include "common/util/string_util.h"
@ -13,134 +15,74 @@ extern "C" {
extern const TSLanguage* tree_sitter_opengoal();
}
void walk_tree(TSTreeCursor* cursor, std::string& output, const std::string& source_code) {
// an imperative breadth-first-search
while (true) {
// Process the node
const auto curr_node = ts_tree_cursor_current_node(cursor);
const std::string curr_node_type = ts_node_type(curr_node);
std::string curr_node_field_name;
if (ts_tree_cursor_current_field_name(cursor)) {
curr_node_field_name = ts_tree_cursor_current_field_name(cursor);
}
if (curr_node_field_name == "open") {
output += "(";
} else if (curr_node_field_name == "close") {
output.pop_back();
output += ") ";
}
if (curr_node_type == "sym_name" || curr_node_type == "num_lit" ||
curr_node_type == "str_lit") {
uint32_t start = ts_node_start_byte(curr_node);
uint32_t end = ts_node_end_byte(curr_node);
const char* type = ts_node_type(curr_node);
(void)type;
// TODO - if it's a string literal, take out any newlines and reflow the string to the
// line-length
const auto contents = source_code.substr(start, end - start);
output += contents + " ";
// Prefixes every line of `form` with `alignment_width` spaces, so that a nested multi-line form
// can be lined up underneath the element it is being aligned with.
//
// - form: the already formatted text, broken into lines via str_util::split
// - alignment_width: number of spaces to put in front of each line, values <= 0 add no padding
//
// Returns the padded lines joined with "\n", without a trailing new-line
std::string align_form(const std::string& form, int alignment_width) {
  const auto lines = str_util::split(form);
  // The padding is the same for every line, so build it once instead of once per line.
  // Guard against a negative width, which would otherwise wrap around to a gigantic size_t
  const std::string padding =
      alignment_width > 0 ? str_util::repeat(static_cast<size_t>(alignment_width), " ") : "";
  std::string aligned_form;
  for (size_t i = 0; i < lines.size(); i++) {
    // Separate lines with a new-line, but never emit one after the last line
    if (i > 0) {
      aligned_form += "\n";
    }
    aligned_form += padding;
    aligned_form += lines.at(i);
  }
  return aligned_form;
}
if (ts_tree_cursor_goto_first_child(cursor)) {
continue;
}
if (ts_tree_cursor_goto_next_sibling(cursor)) {
continue;
}
while (true) {
if (!ts_tree_cursor_goto_parent(cursor)) {
if (output.at(output.length() - 1) == ' ') {
output.pop_back();
std::string apply_formatting(const FormatterTree::Node& curr_node,
std::string output,
int tree_depth = 0) {
if (!curr_node.token && curr_node.refs.empty()) {
return output;
}
std::string curr_form = "";
if (curr_node.token) {
curr_form += curr_node.token.value();
return curr_form;
}
if (!curr_node.metadata.is_root) {
curr_form += "(";
}
for (int i = 0; i < curr_node.refs.size(); i++) {
const auto& ref = curr_node.refs.at(i);
// TODO - abstract these into formatting rules
if (!curr_node.metadata.is_root && curr_node.metadata.multiple_elements_first_line) {
if (i > 1) {
// TODO - kinda unsafe
// Trim the current form before applying a new-line
curr_form = str_util::rtrim(curr_form) + "\n";
if (ref.token) {
curr_form += str_util::repeat(curr_node.refs.at(0).token.value().length() + 2, " ");
}
return;
}
if (ts_tree_cursor_goto_next_sibling(cursor)) {
break;
} else if (!curr_node.metadata.is_root) {
if (i > 0) {
// Trim the current form before applying a new-line
curr_form = str_util::rtrim(curr_form) + "\n";
curr_form += str_util::repeat(tree_depth, " ");
}
}
}
}
// TODO - move this to str_util
// Builds a string made of `str` concatenated with itself `n` times.
// Returns an empty string if `n` is 0 or `str` is empty
std::string repeat(size_t n, const std::string& str) {
  std::string repeated;
  if (n == 0 || str.empty()) {
    return repeated;
  }
  // Allocate the final size up-front so the appends below never reallocate
  repeated.reserve(str.size() * n);
  for (size_t copies = 0; copies < n; ++copies) {
    repeated += str;
  }
  return repeated;
}
// It's possible to walk a tree-sitter tree imperatively with a cursor
// but the code for that is more verbose and less intuitive and I'm not sure how much
// of a benefit I'd get out of it since for formatting i basically have to convert every
// cursor to it's fat node
//
// But in any case, do it the easy way first and refactor later
void format_code(const std::string& source,
TSNode curr_node,
std::string& output,
std::string curr_form_head = "",
int indent = 0) {
if (ts_node_child_count(curr_node) == 0) {
uint32_t start = ts_node_start_byte(curr_node);
uint32_t end = ts_node_end_byte(curr_node);
// TODO - if it's a string literal, take out any newlines and reflow the string to the
// line-length
const auto contents = source.substr(start, end - start);
if (contents == ")") {
output.pop_back();
output += ") ";
} else if (contents == "(") {
output += "(";
if (ref.token) {
curr_form += ref.token.value() + " ";
} else {
output += contents + " ";
if (!curr_node.metadata.is_root && curr_node.metadata.multiple_elements_first_line) {
// align returned form's lines with this forms lines
// TODO - kinda unsafe
curr_form += align_form(apply_formatting(ref, "", tree_depth + 1),
curr_node.refs.at(0).token.value().length() + 2);
} else {
curr_form += apply_formatting(ref, "", tree_depth + 1);
}
}
return;
}
const std::string curr_node_type = ts_node_type(curr_node);
for (size_t i = 0; i < ts_node_child_count(curr_node); i++) {
auto child_node = ts_node_child(curr_node, i);
// If we are opening a list, peek at the first element in the list
// this is so we can properly handle indentation based on different forms
if (curr_node_type == "list_lit" && i == 1) {
uint32_t start = ts_node_start_byte(child_node);
uint32_t end = ts_node_end_byte(child_node);
// TODO - if it's a string literal, take out any newlines and reflow the string to the
// line-length
curr_form_head = source.substr(start, end - start);
}
std::string curr_node_field_name;
auto curr_field_name_raw = ts_node_field_name_for_child(
curr_node, i); // TODO - why is this always returning `close` for the opening paren..
if (curr_field_name_raw) {
curr_node_field_name = curr_field_name_raw;
}
if (curr_form_head == "defun" && i == 4) {
indent += 2;
output += "\n" + repeat(indent, " ");
} else if (curr_form_head == "defun" && i == 5) {
output += "\n" + repeat(indent, " ");
}
format_code(source, child_node, output, curr_form_head, indent);
if (curr_node_type == "source") {
output += "\n\n";
if (curr_node.metadata.is_root && i < curr_node.refs.size() - 1) {
curr_form += "\n\n";
}
}
if (!curr_node.metadata.is_root) {
curr_form = str_util::rtrim(curr_form) + ")";
}
return curr_form;
}
std::string formatter::format_code(const std::string& source) {
std::optional<std::string> formatter::format_code(const std::string& source) {
// Create a parser.
std::shared_ptr<TSParser> parser(ts_parser_new(), TreeSitterParserDeleter());
@ -154,9 +96,12 @@ std::string formatter::format_code(const std::string& source) {
// Get the root node of the syntax tree.
TSNode root_node = ts_tree_root_node(tree.get());
if (ts_node_is_null(root_node) || ts_node_has_error(root_node)) {
return std::nullopt;
}
std::string output = "";
format_code(source, root_node, output, "", 0);
const auto formatting_tree = FormatterTree(source, root_node);
std::string formatted_code = apply_formatting(formatting_tree.root, "");
return str_util::trim(output);
return formatted_code;
}

View file

@ -1,5 +1,6 @@
#pragma once
#include <optional>
#include <string>
#include "tree_sitter/api.h"
@ -13,5 +14,5 @@ struct TreeSitterTreeDeleter {
void operator()(TSTree* ptr) const { ts_tree_delete(ptr); }
};
std::string format_code(const std::string& source);
std::optional<std::string> format_code(const std::string& source);
} // namespace formatter

View file

@ -0,0 +1,65 @@
#include "formatter_tree.h"
// Checks whether, in the original source, nothing but horizontal whitespace follows the node's
// token before the next new-line (or before the end of the source).
//
// - source: the source text that `node`'s byte offsets refer to
// - node: the node whose end byte offset is where the scan starts
//
// Carriage returns are skipped just like spaces and tabs, so sources with CRLF line endings
// are classified the same way as sources with LF line endings.
bool node_followed_by_only_whitespace(const std::string& source, const TSNode& node) {
  for (size_t pos = ts_node_end_byte(node); pos < source.length(); pos++) {
    const char c = source[pos];
    if (c == '\n') {
      return true;
    }
    // Anything that isn't skippable whitespace means another element shares the line
    if (c != ' ' && c != '\t' && c != '\r') {
      return false;
    }
  }
  // Reached the end of the source without encountering anything else
  return true;
}
// Returns the slice of `source` spanned by `node`, based on the node's start and end byte offsets
std::string get_source_code(const std::string& source, const TSNode& node) {
  const uint32_t first_byte = ts_node_start_byte(node);
  const uint32_t byte_length = ts_node_end_byte(node) - first_byte;
  return source.substr(first_byte, byte_length);
}
// Builds the simplified formatting tree for `source`, starting from the tree-sitter `root_node`.
// The resulting `root` node is flagged as the root and is filled in recursively
FormatterTree::FormatterTree(const std::string& source, const TSNode& root_node) {
  // Start from a value-initialized node, then mark it as the root before installing it
  Node fresh_root = FormatterTree::Node();
  fresh_root.metadata.is_root = true;
  root = fresh_root;
  construct_formatter_tree_recursive(source, root_node, root);
}
// TODO make an imperative version eventually
void FormatterTree::construct_formatter_tree_recursive(const std::string& source,
TSNode curr_node,
Node& tree_node) {
if (ts_node_child_count(curr_node) == 0) {
tree_node.refs.push_back(FormatterTree::Node(get_source_code(source, curr_node)));
return;
}
const std::string curr_node_type = ts_node_type(curr_node);
FormatterTree::Node list_node;
if (curr_node_type == "list_lit") {
list_node = FormatterTree::Node();
}
for (size_t i = 0; i < ts_node_child_count(curr_node); i++) {
const auto child_node = ts_node_child(curr_node, i);
// We skip parens
const auto contents = get_source_code(source, child_node);
if (contents == "(" || contents == ")") {
continue;
}
if (curr_node_type == "list_lit") {
// Check to see if the first line of the form has more than 1 element
if (i == 1) {
list_node.metadata.multiple_elements_first_line =
!node_followed_by_only_whitespace(source, child_node);
}
construct_formatter_tree_recursive(source, child_node, list_node);
} else {
construct_formatter_tree_recursive(source, child_node, tree_node);
}
}
if (curr_node_type == "list_lit") {
tree_node.refs.push_back(list_node);
}
}

View file

@ -0,0 +1,60 @@
#pragma once
#include <optional>
#include <string>
#include <vector>
#include "tree_sitter/api.h"
// Treesitter is fantastic for validating and parsing our code into a structured tree format without
// whitespace, so that we can handle the whitespace (i.e. the formatting) ourselves. However, the
// treesitter AST is a bit too detailed for the purposes of formatting.
//
// When formatting there is no need to know things like are we
// in a function, or a symbol, etc. This extra information is fantastic for parsing or manipulating
// the language, but becomes burdensome when just trying to write elegant formatting code when all
// we really care about is:
// - getting all the text tokens for the source code
// - having them in a proper, nested format
// The treesitter format is complicated and highly nested, leading to some very hard to understand
// code. So my solution is a 2-pass format.
//
// Pass 1 - convert the AST into a simplified FormatterTree
// Pass 2 - use the simplified tree to output the final code
// A FormatterTree has a very simple and crude tree structure where:
// Nodes are essentially forms, which contain in-order tokens or references to nested forms
// Nodes can have associated metadata, often related to their context in the original code
// A simplified tree representation of the source code, built from a tree-sitter AST.
// The tree is constructed eagerly by the constructor and exposed through `root`
class FormatterTree {
 public:
  // Context about a form, gathered from the original source while building the tree
  struct NodeMetadata {
    // Set only on the top-level node that holds all of the source's forms
    bool is_root = false;
    // Whether the form had more than 1 element on the first line
    //   (println
    //     "test")
    // vs
    //   (println "test")
    //
    // Defaults to false so that nodes which never have it assigned (for example token nodes)
    // do not carry an indeterminate value around when they are copied or inspected
    bool multiple_elements_first_line = false;
  };
  // Either a token (leaf) or a form, which holds its elements in `refs` in source order
  class Node {
   public:
    // The elements of this form, empty for token nodes
    std::vector<Node> refs;
    NodeMetadata metadata;
    // The token is optional because list nodes do not contain a token, they just contain a bunch of
    // eventually token node refs
    std::optional<std::string> token;

    Node() = default;
    // Creates a token node
    Node(const std::string& _token) : token(_token) {}
    // Creates a token-less node with the given metadata
    Node(const NodeMetadata& _metadata) : metadata(_metadata) {}
  };

  // Builds the whole tree for `source`, `root_node` being the root of its tree-sitter AST
  FormatterTree(const std::string& source, const TSNode& root_node);
  Node root;

 private:
  // Converts `curr_node` (and everything below it) into nodes appended to `tree_node`
  void construct_formatter_tree_recursive(const std::string& source,
                                          TSNode curr_node,
                                          Node& tree_node);
};

View file

@ -121,4 +121,22 @@ std::string uuid() {
}
return res;
}
// Returns `str` concatenated with itself `n` times.
// An empty string is returned when `n` is 0 or `str` is empty
std::string repeat(size_t n, const std::string& str) {
  const std::size_t unit_length = str.size();
  if (n == 0 || unit_length == 0) {
    return std::string();
  }
  // A single character can be handled directly by the fill constructor
  if (unit_length == 1) {
    return std::string(n, str[0]);
  }
  std::string repeated;
  repeated.reserve(unit_length * n);
  repeated = str;
  // Double the buffer for as long as the result still fits within `n` copies...
  std::size_t copies = 1;
  while (copies * 2 <= n) {
    repeated += repeated;
    copies *= 2;
  }
  // ...then top it up with the copies that are still missing (always fewer than `copies`)
  repeated.append(repeated, 0, (n - copies) * unit_length);
  return repeated;
}
} // namespace str_util

View file

@ -22,4 +22,5 @@ std::string join(const std::vector<std::string>& strs, const std::string& join_w
std::vector<std::string> regex_get_capture_groups(const std::string& str, const std::string& regex);
bool replace(std::string& str, const std::string& from, const std::string& to);
std::string uuid();
std::string repeat(size_t n, const std::string& str);
} // namespace str_util

View file

@ -7,13 +7,13 @@ add_executable(lsp
protocol/document_diagnostics.cpp
protocol/document_symbols.cpp
protocol/document_synchronization.cpp
protocol/formatting.cpp
protocol/hover.cpp
protocol/progress_report.cpp
state/data/mips_instruction.cpp
state/lsp_requester.cpp
state/workspace.cpp
transport/stdio.cpp
)
transport/stdio.cpp)
target_compile_definitions(lsp PRIVATE -DJSON_DIAGNOSTICS=1)

View file

@ -8,6 +8,7 @@
#include "text_document/document_color.h"
#include "text_document/document_symbol.h"
#include "text_document/document_synchronization.h"
#include "text_document/formatting.h"
#include "text_document/go_to.h"
#include "text_document/hover.h"
@ -46,6 +47,7 @@ void LSPRouter::init_routes() {
m_routes["textDocument/definition"] = LSPRoute(go_to_definition_handler);
m_routes["textDocument/completion"] = LSPRoute(get_completions_handler);
m_routes["textDocument/documentColor"] = LSPRoute(document_color_handler);
m_routes["textDocument/formatting"] = LSPRoute(formatting_handler);
// TODO - m_routes["textDocument/signatureHelp"] = LSPRoute(get_completions_handler);
// Not Yet Supported Routes, noops
m_routes["$/cancelRequest"] = LSPRoute();

View file

@ -0,0 +1,37 @@
#pragma once
#include <optional>
#include "common/formatter/formatter.h"
#include "lsp/protocol/common_types.h"
#include "lsp/protocol/formatting.h"
#include "lsp/state/data/mips_instructions.h"
#include "lsp/state/workspace.h"
// Handler for the `textDocument/formatting` route.
//
// Formats the tracked OpenGOAL file referenced by the request and responds with a single text
// edit holding the formatted code. A null result is returned when the document is not an
// OpenGOAL source file, is not tracked by the workspace, or could not be formatted.
std::optional<json> formatting_handler(Workspace& workspace, int id, json raw_params) {
  auto params = raw_params.get<LSPSpec::DocumentFormattingParams>();
  auto& document_uri = params.textDocument.m_uri;

  // IR files (and any other file type) are not formatted
  const auto file_type = workspace.determine_filetype_from_uri(document_uri);
  if (file_type != Workspace::FileType::OpenGOAL) {
    return nullptr;
  }

  auto tracked_file = workspace.get_tracked_og_file(document_uri);
  if (!tracked_file) {
    return nullptr;
  }

  // TODO move away from holding the content directly
  const auto formatted = formatter::format_code(tracked_file->m_content);
  if (!formatted) {
    return nullptr;
  }

  // One edit, ranging from the very start of the document to the line after the last tracked one
  LSPSpec::TextEdit whole_document_edit{};
  whole_document_edit.range = {{0, 0}, {static_cast<uint32_t>(tracked_file->m_lines.size()), 0}};
  whole_document_edit.newText = formatted.value();

  json edits = json::array();
  edits.push_back(whole_document_edit);
  return edits;
}

View file

@ -90,3 +90,13 @@ void LSPSpec::from_json(const json& j, Color& obj) {
json_deserialize_if_exists(blue);
json_deserialize_if_exists(alpha);
}
// Serializes the `range` and `newText` fields of a TextEdit into `j`
// NOTE(review): json_serialize is a macro defined elsewhere, it presumably relies on the
// parameters being named `j` and `obj` - confirm before renaming them
void LSPSpec::to_json(json& j, const TextEdit& obj) {
json_serialize(range);
json_serialize(newText);
}
// Populates the `range` and `newText` fields of a TextEdit from `j`
// NOTE(review): json_deserialize_if_exists is a macro defined elsewhere, going by its name a
// field that is missing from `j` is presumably left untouched - confirm
void LSPSpec::from_json(const json& j, TextEdit& obj) {
json_deserialize_if_exists(range);
json_deserialize_if_exists(newText);
}

View file

@ -122,4 +122,15 @@ struct Color {
};
void to_json(json& j, const Color& obj);
void from_json(const json& j, Color& obj);
// A textual edit applicable to a text document: the text within `range` is replaced by `newText`
struct TextEdit {
// The range of the text document to be manipulated. To insert
// text into a document create a range where start === end.
Range range;
// The string to be inserted. For delete operations use an
// empty string.
std::string newText;
};
void to_json(json& j, const TextEdit& obj);
void from_json(const json& j, TextEdit& obj);
} // namespace LSPSpec

View file

@ -0,0 +1,27 @@
#include "formatting.h"
// Serializes a FormattingOptions into `j`: `tabSize` and `insertSpaces` go through
// json_serialize, the three std::optional fields go through json_serialize_optional
// NOTE(review): both macros are defined elsewhere, the optional variant presumably skips
// fields that hold no value - confirm
void LSPSpec::to_json(json& j, const FormattingOptions& obj) {
json_serialize(tabSize);
json_serialize(insertSpaces);
json_serialize_optional(trimTrailingWhitespace);
json_serialize_optional(insertFinalNewLine);
json_serialize_optional(trimFinalNewLines);
}
// Populates a FormattingOptions from `j`, field by field
// NOTE(review): the macros are defined elsewhere, going by their names a field that is missing
// from `j` is presumably left as-is on `obj` - confirm
void LSPSpec::from_json(const json& j, FormattingOptions& obj) {
json_deserialize_if_exists(tabSize);
json_deserialize_if_exists(insertSpaces);
json_deserialize_optional_if_exists(trimTrailingWhitespace);
json_deserialize_optional_if_exists(insertFinalNewLine);
json_deserialize_optional_if_exists(trimFinalNewLines);
}
// Serializes the `textDocument` and `options` fields of a DocumentFormattingParams into `j`
// NOTE(review): json_serialize is a macro defined elsewhere, it presumably relies on the
// parameters being named `j` and `obj` - confirm before renaming them
void LSPSpec::to_json(json& j, const DocumentFormattingParams& obj) {
json_serialize(textDocument);
json_serialize(options);
}
// Populates the `textDocument` and `options` fields of a DocumentFormattingParams from `j`
// NOTE(review): json_deserialize_if_exists is a macro defined elsewhere, going by its name a
// field that is missing from `j` is presumably left untouched - confirm
void LSPSpec::from_json(const json& j, DocumentFormattingParams& obj) {
json_deserialize_if_exists(textDocument);
json_deserialize_if_exists(options);
}

42
lsp/protocol/formatting.h Normal file
View file

@ -0,0 +1,42 @@
#pragma once
#include "common_types.h"
namespace LSPSpec {
// Value-object describing what options formatting should use.
//
// The two required fields are given explicit defaults so that an instance which has not
// (or only partially) been populated never exposes indeterminate values.
struct FormattingOptions {
  // Size of a tab in spaces.
  uint32_t tabSize = 0;
  // Prefer spaces over tabs.
  bool insertSpaces = false;
  // Trim trailing whitespace on a line.
  //
  // @since 3.15.0
  std::optional<bool> trimTrailingWhitespace;
  // Insert a newline character at the end of the file if one does not exist.
  //
  // @since 3.15.0
  std::optional<bool> insertFinalNewLine;
  // Trim all newlines after the final newline at the end of the file.
  //
  // @since 3.15.0
  std::optional<bool> trimFinalNewLines;
  // NOTE - the spec also allows a signature for further, dynamic properties. These are
  // omitted here as they are not standardized anyway
};
void to_json(json& j, const FormattingOptions& obj);
void from_json(const json& j, FormattingOptions& obj);
// The parameters of a document formatting request
struct DocumentFormattingParams {
// The document to format.
TextDocumentIdentifier textDocument;
// The format options.
FormattingOptions options;
};
void to_json(json& j, const DocumentFormattingParams& obj);
void from_json(const json& j, DocumentFormattingParams& obj);
} // namespace LSPSpec

View file

@ -49,7 +49,7 @@ class InitializeResult {
{"workspaceSymbolProvider", false},
{"codeActionProvider", false},
{"codeLensProvider", code_lens_provider},
{"documentFormattingProvider", false},
{"documentFormattingProvider", true},
{"documentRangeFormattingProvider", false},
{"documentOnTypeFormattingProvider", document_on_type_formatting_provider},
{"renameProvider", false},

View file

@ -73,7 +73,11 @@ LSPSpec::DocumentUri uri_from_path(fs::path path) {
// Converts a document URI into a path: the URI is url-decoded and, if the result starts with
// `file:///`, the scheme prefix is cut off. Anything else is returned decoded but otherwise as-is
std::string uri_to_path(LSPSpec::DocumentUri uri) {
  // On Windows the entire `file:///` is dropped (8 characters), elsewhere only `file://`
  // (7 characters) so that the path keeps its leading `/`
#ifdef _WIN32
  constexpr std::size_t scheme_prefix_length = 8;
#else
  constexpr std::size_t scheme_prefix_length = 7;
#endif
  auto path = url_decode(uri);
  if (!str_util::starts_with(path, "file:///")) {
    return path;
  }
  path = path.substr(scheme_prefix_length);
  return path;
}
@ -290,7 +294,7 @@ void Workspace::stop_tracking_file(const LSPSpec::DocumentUri& file_uri) {
}
WorkspaceOGFile::WorkspaceOGFile(const std::string& content, const GameVersion& game_version)
: m_game_version(game_version) {
: m_content(content), m_game_version(game_version) {
m_lines = str_util::split(content);
lg::info("Added new OG file. {} lines with {} symbols and {} diagnostics", m_lines.size(),
m_symbols.size(), m_diagnostics.size());

View file

@ -21,6 +21,7 @@ class WorkspaceOGFile {
// TODO - make private
int32_t version;
// TODO - keep an AST of the file instead
std::string m_content;
std::vector<std::string> m_lines;
std::vector<LSPSpec::DocumentSymbol> m_symbols;
std::vector<LSPSpec::Diagnostic> m_diagnostics;

View file

@ -0,0 +1,64 @@
===
Multiple Top Level Forms
===
(println "hello" "world")(println "hello" "world")
---
(println "hello"
"world")
(println "hello"
"world")
===
All Alignment
===
(println "hello" (println "world" "world2"))
---
(println "hello"
(println "world"
"world2"))
===
All Indented
===
(println
"hello" (println
"world"))
---
(println
"hello"
(println
"world"))
===
Mixed
===
(println
"hello" (println "world"))
---
(println
"hello"
(println "world"))
===
Single Item Form
===
(println)
---
(println)

View file

@ -0,0 +1,19 @@
===
Unbalanced Parens
===
(println "hello" "world"(println "hello" "world")
---
__THROWS__
===
Unbalanced Quotes
===
(println "hello" "world)(println "hello" "world")
---
__THROWS__

View file

@ -1,34 +0,0 @@
===
Basic Function
===
(defun test-function ((hello string))
"world hello"
(+ 1 1))
---
(defun test-function ((hello string))
"world hello"
(+ 1 1))
===
Two Functions
===
(defun test-function ((hello string))
"world hello"
(+ 1 1))
(defun test-function ((hello string))
"world hello"
(+ 1 1))
---
(defun test-function ((hello string))
"world hello"
(+ 1 1))
(defun test-function ((hello string))
"world hello"
(+ 1 1))

View file

@ -78,9 +78,18 @@ bool run_tests(fs::path file_path) {
fmt::print("{}:\n", file_util::base_name(file_path.string()));
for (const auto& test : tests) {
const auto formatted_result = formatter::format_code(test.input);
if (formatted_result != test.output) {
if (!formatted_result) {
// Unable to parse, was that expected?
if (test.output == "__THROWS__") {
fmt::print(" ✅ - {}\n", test.name);
} else {
fmt::print(" ❌ - {}\n", test.name);
fmt::print("Unable to Format\n");
test_failed = true;
}
} else if (formatted_result != test.output) {
fmt::print(" ❌ - {}\n", test.name);
fmt::print("{}\n", str_util::diff(test.output, formatted_result));
fmt::print("{}\n", str_util::diff(test.output, formatted_result.value()));
test_failed = true;
} else {
fmt::print(" ✅ - {}\n", test.name);
@ -92,7 +101,7 @@ bool run_tests(fs::path file_path) {
bool find_and_run_tests() {
// Enumerate test files
const auto test_files = file_util::find_files_recursively(
file_util::get_file_path({"test/common/formatter/corpus"}), std::regex("^.*\.test$"));
file_util::get_file_path({"test/common/formatter/corpus"}), std::regex("^.*\.test.gc$"));
bool failed = false;
for (const auto& file : test_files) {
failed = run_tests(file);

View file

@ -31,8 +31,6 @@ int main(int argc, char** argv) {
lg::initialize();
// TODO - write a simple test framework for this stuff
CLI::App app{"OpenGOAL Formatter"};
app.add_flag("-c,--check", check,
"If on, will just do a dry-run and fail if something isn't formatted correctly");
@ -52,10 +50,10 @@ int main(int argc, char** argv) {
const auto result = formatter::format_code(source_code);
if (write_newfile) {
if (write_newfile && result) {
// TODO - i don't like this implementation, return a new string instead
if (str_util::replace(file_path, ".gc", ".new.gc")) {
file_util::write_text_file(file_path, result);
file_util::write_text_file(file_path, result.value());
}
}