// jak-project/common/formatter/formatter_tree.h

#pragma once

#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "tree_sitter/api.h"

// Treesitter is fantastic for validating and parsing our code into a structured tree format without
// whitespace, so that we can handle the whitespace (i.e. the formatting) ourselves. However, the
// treesitter AST is a bit too detailed for the purposes of formatting.
//
// When formatting, there is no need to know things like whether we are in a function, or a
// symbol, etc. This extra information is fantastic for parsing or manipulating the language, but
// becomes burdensome when just trying to write elegant formatting code, when all we really care
// about is:
// - getting all the text tokens for the source code
// - having them in a proper, nested format
// The treesitter format is complicated and highly nested, leading to some very hard to understand
// code. So my solution is a 2-pass format.
//
// Pass 1 - convert the AST into a simplified FormatterTree
// Pass 2 - use the simplified tree to output the final code

// A single node of the simplified formatter tree.
//
// A node is one of two shapes:
// - a token node: `token` holds a value
// - a list node: `token` is empty and the nested nodes are stored in `refs`
class FormatterTreeNode {
 public:
  // Extra information recorded about a node alongside its token / nested nodes.
  struct Metadata {
    std::string node_type;
    bool is_top_level = false;
    bool is_comment = false;
    bool is_inline = false;
    int num_blank_lines_following = 0;
    bool is_binding_list = false;  // TODO set this
  };
  // Nested nodes, kept in order.
  std::vector<FormatterTreeNode> refs;
  Metadata metadata;
  // The token is optional because list nodes do not contain a token; they only hold `refs`,
  // which eventually lead to token nodes.
  std::optional<std::string> token;

  FormatterTreeNode() = default;
  // Defined outside this header; takes the source text and a treesitter node.
  FormatterTreeNode(const std::string& source, const TSNode& node);
  // Creates a node that carries only the given metadata (no token, no nested nodes).
  FormatterTreeNode(const Metadata& _metadata) : metadata(_metadata) {}

  // True when this node is a list node, i.e. it has no token of its own.
  // Previously this returned `token.has_value()`, which reported token nodes as lists and
  // contradicted the invariant documented on `token`.
  bool is_list() const { return !token.has_value(); }
};

// FormatterTree is a deliberately simple, crude tree structure:
// - each node is essentially a form, holding in-order tokens or references to nested forms
// - nodes may carry metadata, often describing their context in the original code
// - nodes may also have several formatting rules associated with them; usually this is the
//   default rule, or a pre-configured override chosen from the head of the form, e.g. 'defun'
class FormatterTree {
 public:
  // Root node of the simplified tree.
  FormatterTreeNode root;

  // Takes the source text and the treesitter root node; defined outside this header.
  FormatterTree(const std::string& source, const TSNode& root_node);

 private:
  // Recursive construction helper, defined outside this header.
  // NOTE(review): presumably walks `curr_node` and fills in `tree_node` -- confirm against the
  // definition.
  void construct_formatter_tree_recursive(const std::string& source,
                                          TSNode curr_node,
                                          FormatterTreeNode& tree_node);
};
formatter: initial and basic indentation/alignment and expose the formatting via the LSP (#2673) 2023-05-28 13:22:00 -04:00			`#pragma once`

formatter: extract formatting into a rules abstraction, add `inner` formatting (body vs argument functions) (#2684) 2023-06-04 13:19:29 -04:00			`#include <memory>`
formatter: initial and basic indentation/alignment and expose the formatting via the LSP (#2673) 2023-05-28 13:22:00 -04:00			`#include <optional>`
			`#include <string>`
			`#include <vector>`

			`#include "tree_sitter/api.h"`

			`// Treesitter is fantastic for validating and parsing our code into a structured tree format without`
			`// whitespace so we can do that ourselves (formatting) However, the treesitter AST is a bit too`
			`// detailed for purposes of formatting.`
			`//`
			`// When formatting there is no need to know things like are we`
			`// in a function, or a symbol, etc. This extra information is fantastic for parsing or manipulating`
			`// the language, but becomes burdensome when just trying to write elegant formatting code when all`
			`// we really care about is:`
			`// - getting all the text tokens for the source code`
			`// - having them in a proper, nested format`
			`// The treesitter format is complicated and highly nested, leading to some very hard to understand`
			`// code. So my solution is a 2-pass format.`
			`//`
			`// Pass 1 - convert the AST into a simplified FormatterTree`
			`// Pass 2 - use the simplified tree to output the final code`

formatter: extract formatting into a rules abstraction, add `inner` formatting (body vs argument functions) (#2684) 2023-06-04 13:19:29 -04:00			`class FormatterTreeNode {`
formatter: initial and basic indentation/alignment and expose the formatting via the LSP (#2673) 2023-05-28 13:22:00 -04:00			`public:`
formatter: extract formatting into a rules abstraction, add `inner` formatting (body vs argument functions) (#2684) 2023-06-04 13:19:29 -04:00			`struct Metadata {`
formatter: support comments better (including block comments) and constant pair formatting (#2745) 2023-06-18 17:19:35 -04:00			`std::string node_type;`
formatter: handle top level blank lines and better handle comments (#2702) 2023-06-06 20:34:50 -04:00			`bool is_top_level = false;`
			`bool is_comment = false;`
			`bool is_inline = false;`
			`int num_blank_lines_following = 0;`
formatter: support formatting bindings, for example in a `let` (#2883) 2023-08-05 15:23:09 -04:00			`bool is_binding_list = false; // TODO set this`
formatter: initial and basic indentation/alignment and expose the formatting via the LSP (#2673) 2023-05-28 13:22:00 -04:00			`};`
formatter: extract formatting into a rules abstraction, add `inner` formatting (body vs argument functions) (#2684) 2023-06-04 13:19:29 -04:00			`std::vector<FormatterTreeNode> refs;`
			`Metadata metadata;`
			`// The token is optional because list nodes do not contain a token, they just contain a bunch of`
			`// eventually token node refs`
			`std::optional<std::string> token;`
formatter: initial and basic indentation/alignment and expose the formatting via the LSP (#2673) 2023-05-28 13:22:00 -04:00
formatter: extract formatting into a rules abstraction, add `inner` formatting (body vs argument functions) (#2684) 2023-06-04 13:19:29 -04:00			`FormatterTreeNode() = default;`
formatter: handle top level blank lines and better handle comments (#2702) 2023-06-06 20:34:50 -04:00			`FormatterTreeNode(const std::string& source, const TSNode& node);`
formatter: extract formatting into a rules abstraction, add `inner` formatting (body vs argument functions) (#2684) 2023-06-04 13:19:29 -04:00			`FormatterTreeNode(const Metadata& _metadata) : metadata(_metadata){};`
formatter: support formatting bindings, for example in a `let` (#2883) 2023-08-05 15:23:09 -04:00
			`bool is_list() const { return token.has_value(); }`
formatter: extract formatting into a rules abstraction, add `inner` formatting (body vs argument functions) (#2684) 2023-06-04 13:19:29 -04:00			`};`

			`// A FormatterTree has a very simple and crude tree structure where:`
			`// - Nodes are essentially forms, which contain in-order tokens or references to nested forms`
			`// - Nodes can have associated metadata, often related to their context in the original code`
			`// - Nodes can also have multiple formatting rules associated with them. Often this is the default`
			`// rule or based on pre-configured overrides due to the head of the form, ex. 'defun'`
			`class FormatterTree {`
			`public:`
formatter: initial and basic indentation/alignment and expose the formatting via the LSP (#2673) 2023-05-28 13:22:00 -04:00			`FormatterTree(const std::string& source, const TSNode& root_node);`
formatter: extract formatting into a rules abstraction, add `inner` formatting (body vs argument functions) (#2684) 2023-06-04 13:19:29 -04:00			`FormatterTreeNode root;`
formatter: initial and basic indentation/alignment and expose the formatting via the LSP (#2673) 2023-05-28 13:22:00 -04:00
			`private:`
			`void construct_formatter_tree_recursive(const std::string& source,`
			`TSNode curr_node,`
formatter: extract formatting into a rules abstraction, add `inner` formatting (body vs argument functions) (#2684) 2023-06-04 13:19:29 -04:00			`FormatterTreeNode& tree_node);`
formatter: initial and basic indentation/alignment and expose the formatting via the LSP (#2673) 2023-05-28 13:22:00 -04:00			`};`