From edc5e08a91d61b91a225dc38ff381e1fa0e88d2c Mon Sep 17 00:00:00 2001
From: Nicolas BACQUEY <nicolas.bacquey@tweag.io>
Date: Wed, 24 May 2023 16:50:25 +0200
Subject: [PATCH] Extract and anchor comments from input

---
 topiary-core/src/comments.rs    | 294 ++++++++++++++++++++++++++++++++
 topiary-core/src/lib.rs         |   4 +-
 topiary-core/src/tree_sitter.rs |  47 +++--
 topiary-core/src/types.rs       |  57 +++++++
 4 files changed, 374 insertions(+), 28 deletions(-)
 create mode 100644 topiary-core/src/comments.rs
 create mode 100644 topiary-core/src/types.rs
diff --git a/topiary-core/src/comments.rs b/topiary-core/src/comments.rs
new file mode 100644
index 00000000..9cb1eed4
--- /dev/null
+++ b/topiary-core/src/comments.rs
@@ -0,0 +1,294 @@
+use topiary_tree_sitter_facade::{InputEdit, Language, Node, Parser, Tree};
+
+use crate::{
+    error::FormatterError,
+    types::{Diff, InputSection, Position},
+    FormatterResult,
+};
+
+/// Removing a block of text from the input changes the position of every character that
+/// follows it. This `Diff` instance updates a `Position` accordingly.
+impl Diff<InputSection> for Position {
+    type ErrorType = FormatterError;
+
+    fn subtract(self: &mut Self, other: InputSection) -> FormatterResult<()> {
+        let InputSection { start, end } = other;
+        if *self > start && *self < end {
+            // The point is strictly within the removed block: it would no longer exist.
+            return Err(FormatterError::Internal(
+                "Tried to remove a section from a point it contains".into(),
+                None,
+            ));
+        }
+        if *self <= start {
+            // The point is before the removed block: nothing happens.
+            return Ok(());
+        }
+        // The point is after the removed block: its new coordinates depend on whether it
+        // was on the same row as the last point of the removed block.
+        //
+        // See in the following example how the positions of characters `a` and `b`
+        // change when the bracketed block is removed:
+        //
+        // Before:
+        // ..........
+        // ...[---------
+        // ---------
+        // -------]...a..
+        // ...b......
+        // .............
+        //
+        // After:
+        // ..........
+        // ......a..
+        // ...b......
+        // .............
+        let column = if self.row == end.row {
+            self.column + start.column - end.column
+        } else {
+            self.column
+        };
+        let row = self.row + start.row - end.row;
+        *self = Position { row, column };
+        Ok(())
+    }
+}
+
+impl Diff<InputSection> for InputSection {
+    type ErrorType = FormatterError;
+
+    fn subtract(self: &mut Self, other: Self) -> FormatterResult<()> {
+        // Shift the start first; the end is only touched when that succeeded.
+        self.start.subtract(other).and_then(|()| self.end.subtract(other))
+    }
+}
+
+/// Tells whether `node` is an extra node whose kind contains "comment".
+fn is_comment(node: &Node) -> bool {
+    node.is_extra() && node.kind().to_string().contains("comment")
+}
+
+/// Collects into `comments` every comment node under `node`, with its text and anchor.
+fn find_comments<'a>(
+    node: Node<'a>,
+    input: &str,
+    comments: &mut Vec<(Node<'a>, AnchoredComment)>,
+) -> FormatterResult<()> {
+    if is_comment(&node) {
+        let commented = find_anchor(&node, input)?;
+        let comment_text = node.utf8_text(input.as_bytes())?.to_string();
+        let anchored = AnchoredComment {
+            comment_text,
+            commented,
+        };
+        comments.push((node.clone(), anchored));
+        return Ok(());
+    }
+    let mut cursor = node.walk();
+    for child in node.children(&mut cursor) {
+        find_comments(child, input, comments)?;
+    }
+    Ok(())
+}
+
+/// The section of code to which a comment refers. We also remember whether the comment
+/// is positioned before or after the section.
+#[derive(Copy, Clone, Debug)]
+pub enum Commented {
+    /// The code section is before the comment, as in:
+    /// ```ignore
+    /// struct Foo {
+    ///     baz: Baz, // this is baz
+    ///     qux: Qux, // this is qux
+    /// }
+    /// ```
+    CommentedBefore(InputSection),
+    /// The code section is after the comment, as in:
+    /// ```ignore
+    /// struct Foo {
+    ///     // let's have a baz
+    ///     baz: Baz,
+    ///     // and a qux
+    ///     qux: Qux,
+    /// }
+    /// ```
+    CommentedAfter(InputSection),
+}
+
+impl Diff<InputSection> for Commented {
+    type ErrorType = FormatterError;
+
+    fn subtract(self: &mut Self, other: InputSection) -> FormatterResult<()> {
+        // Whichever side of the comment the section is on, it is shifted the same way;
+        // the variant itself is left untouched.
+        let (Commented::CommentedBefore(section) | Commented::CommentedAfter(section)) = self;
+        section.subtract(other)
+    }
+}
+
+/// Returns the closest sibling following `node` that is not a comment,
+/// or `None` if only comments (or nothing at all) follow it.
+fn next_non_comment<'tree>(node: Node<'tree>) -> Option<Node<'tree>> {
+    let mut current = node;
+    while let Some(sibling) = current.next_sibling() {
+        if !is_comment(&sibling) {
+            return Some(sibling);
+        }
+        // A comment: step over it and keep looking.
+        current = sibling;
+    }
+    // Ran out of siblings.
+    None
+}
+
+/// Returns the closest sibling preceding `node` that is not a comment,
+/// or `None` if only comments (or nothing at all) precede it.
+fn previous_non_comment<'tree>(node: Node<'tree>) -> Option<Node<'tree>> {
+    let mut current = node;
+    while let Some(sibling) = current.prev_sibling() {
+        if !is_comment(&sibling) {
+            return Some(sibling);
+        }
+        // A comment: step over it and keep looking.
+        current = sibling;
+    }
+    // Ran out of siblings.
+    None
+}
+
+/// Parses `content` with `grammar`, handing `old_tree` to the parser as the previous tree.
+/// Fails if the parser cannot be set up, or if parsing yields no tree.
+fn reparse(
+    old_tree: Tree,
+    content: &str,
+    grammar: &topiary_tree_sitter_facade::Language,
+) -> FormatterResult<Tree> {
+    let mut parser = Parser::new()?;
+    parser.set_language(grammar)?;
+    let parsed = parser.parse(content, Some(&old_tree))?;
+    parsed.ok_or_else(|| FormatterError::Internal("Could not parse input".into(), None))
+}
+
+// Use the following heuristics to find a comment's anchor:
+// If the comment is only preceded by whitespace on its line, then the anchor is the
+// next non-comment sibling node.
+// Otherwise, the anchor is the previous non-comment sibling node.
+// If there is no such node, we anchor to the first non-comment sibling node
+// in the other direction.
+// Fails if the start position of the comment does not exist in `input`, or if the
+// comment has no non-comment sibling at all.
+fn find_anchor<'tree>(node: &'tree Node<'tree>, input: &str) -> FormatterResult<Commented> {
+    let point = node.start_position();
+    // Use `get` rather than indexing, so that a column which does not fall within the
+    // line is reported as an error instead of causing a panic.
+    let prefix = input
+        .lines()
+        .nth(point.row() as usize)
+        .and_then(|line| line.get(..point.column() as usize))
+        .ok_or_else(|| {
+            FormatterError::Internal(
+                format!(
+                    "Trying to access nonexistent line {} in text:\n{}",
+                    point.row(),
+                    input,
+                ),
+                None,
+            )
+        })?;
+    // Each lookup is lazy, so that the fallback direction is only explored when needed.
+    let anchor_after = || {
+        let sibling = next_non_comment(node.clone())?;
+        Some(Commented::CommentedAfter(sibling.into()))
+    };
+    let anchor_before = || {
+        let sibling = previous_non_comment(node.clone())?;
+        Some(Commented::CommentedBefore(sibling.into()))
+    };
+    // A comment with only whitespace before it on its line is attached to what follows;
+    // any other comment is attached to what precedes it.
+    let anchor = if prefix.trim_start().is_empty() {
+        anchor_after().or_else(anchor_before)
+    } else {
+        anchor_before().or_else(anchor_after)
+    };
+    anchor.ok_or_else(|| {
+        FormatterError::Internal(format!("Could find no anchor for comment {node:?}"), None)
+    })
+}
+
+/// A comment's text, together with the section of code it is anchored to.
+#[derive(Clone, Debug)]
+pub struct AnchoredComment {
+    pub comment_text: String,
+    pub commented: Commented,
+}
+
+/// The result of `extract_comments`: the comment-free input, its syntax tree,
+/// and the comments that were removed from it.
+pub struct SeparatedInput {
+    pub input_tree: Tree,
+    pub input_string: String,
+    pub comments: Vec<AnchoredComment>,
+}
+
+/// Removes every comment found in `tree` from `input`.
+///
+/// Returns the comment-free text, the tree obtained by reparsing it with `grammar`,
+/// and the removed comments (last comment of the input first). Each comment's anchor
+/// is expressed as a position in the comment-free text.
+///
+/// # Errors
+///
+/// Fails if a comment has no anchor or its text cannot be read, if an anchor bound lies
+/// inside a removed comment, or if the comment-free text cannot be reparsed.
+pub fn extract_comments<'a>(
+    tree: &'a Tree,
+    input: &'a str,
+    grammar: &Language,
+) -> FormatterResult<SeparatedInput> {
+    let mut anchors: Vec<(Node, AnchoredComment)> = Vec::new();
+    find_comments(tree.root_node(), input, &mut anchors)?;
+    anchors.sort_by_key(|(node, _)| node.start_byte());
+
+    let mut anchored_comments: Vec<AnchoredComment> = Vec::with_capacity(anchors.len());
+    let mut new_input: String = input.to_string();
+    let mut new_tree: Tree = tree.clone();
+    // For each (comment, anchor) pair in reverse order, we:
+    // 1) remove the comment from the input,
+    // 2) apply the matching InputEdit to the tree,
+    // 3) shift every anchor to account for the removed comment.
+    //
+    // The order is reversed so that the byte offsets and positions of the comments
+    // still to be removed, which all come earlier in the input, remain valid.
+    while let Some((comment, anchored_comment)) = anchors.pop() {
+        let removed: InputSection = comment.clone().into();
+        // 1)
+        new_input.replace_range(
+            (comment.start_byte() as usize)..(comment.end_byte() as usize),
+            "",
+        );
+        // 2)
+        new_tree.edit(&InputEdit::new(
+            comment.start_byte(),
+            comment.end_byte(),
+            comment.start_byte(),
+            &comment.start_position(),
+            &comment.end_position(),
+            &comment.start_position(),
+        ));
+        // 3) The anchors of the comments not yet removed must be shifted too: such an
+        // anchor may enclose or follow the current comment (e.g. when several comments
+        // in a row share the same anchor), even though its own comment comes earlier.
+        anchored_comments.push(anchored_comment);
+        let pending = anchors.iter_mut().map(|(_, anchored)| anchored);
+        for anchored in anchored_comments.iter_mut().chain(pending) {
+            anchored.commented.subtract(removed)?;
+        }
+    }
+    let input_tree = reparse(new_tree, new_input.as_str(), grammar)?;
+    Ok(SeparatedInput {
+        input_tree,
+        input_string: new_input,
+        comments: anchored_comments,
+    })
+}
diff --git a/topiary-core/src/lib.rs b/topiary-core/src/lib.rs
index e050515f..f30c95b0 100644
--- a/topiary-core/src/lib.rs
+++ b/topiary-core/src/lib.rs
@@ -14,20 +14,22 @@ use std::io;
 
 use itertools::Itertools;
 use pretty_assertions::StrComparison;
-use tree_sitter::Position;
 
 pub use crate::{
     error::{FormatterError, IoError},
     language::Language,
     tree_sitter::{apply_query, CoverageData, SyntaxNode, TopiaryQuery, Visualisation},
+    types::Position,
 };
 
 mod atom_collection;
+mod comments;
 mod error;
 mod graphviz;
 mod language;
 mod pretty;
 mod tree_sitter;
+mod types;
 
 #[doc(hidden)]
 pub mod test_utils;
diff --git a/topiary-core/src/tree_sitter.rs b/topiary-core/src/tree_sitter.rs
index 5b75d4d7..dc5129b8 100644
--- a/topiary-core/src/tree_sitter.rs
+++ b/topiary-core/src/tree_sitter.rs
@@ -14,7 +14,9 @@ use streaming_iterator::StreamingIterator;
 
 use crate::{
     atom_collection::{AtomCollection, QueryPredicates},
+    comments::{extract_comments, AnchoredComment, SeparatedInput},
     error::FormatterError,
+    types::Position,
     FormatterResult,
 };
 
@@ -25,21 +27,6 @@ pub enum Visualisation {
     Json,
 }
 
-/// Refers to a position within the code. Used for error reporting, and for
-/// comparing input with formatted output. The numbers are 1-based, because that
-/// is how editors usually refer to a position. Derived from tree_sitter::Point.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
-pub struct Position {
-    pub row: u32,
-    pub column: u32,
-}
-
-impl Display for Position {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
-        write!(f, "({},{})", self.row, self.column)
-    }
-}
-
 /// Topiary often needs both the tree-sitter `Query` and the original content
 /// beloging to the file from which the query was parsed. This struct is a simple
 /// convenience wrapper that combines the `Query` with its original string.
@@ -97,15 +84,6 @@ impl TopiaryQuery {
     }
 }
 
-impl From<Point> for Position {
-    fn from(point: Point) -> Self {
-        Self {
-            row: point.row() + 1,
-            column: point.column() + 1,
-        }
-    }
-}
-
 // Simplified syntactic node struct, for the sake of serialisation.
 #[derive(Serialize)]
 pub struct SyntaxNode {
@@ -233,9 +211,24 @@ pub fn apply_query(
     grammar: &topiary_tree_sitter_facade::Language,
     tolerate_parsing_errors: bool,
 ) -> FormatterResult<AtomCollection> {
-    let (tree, _grammar) = parse(input_content, grammar, tolerate_parsing_errors)?;
-    let root = tree.root_node();
-    let source = input_content.as_bytes();
+    let (tree, grammar) = parse(input_content, grammar, tolerate_parsing_errors)?;
+
+    // Remove comments in a separate stream before applying queries
+    let SeparatedInput {
+        input_string,
+        input_tree,
+        comments,
+    } = extract_comments(&tree, input_content, grammar)?;
+    let source = input_string.as_bytes();
+    let root = input_tree.root_node();
+
+    for AnchoredComment {
+        comment_text,
+        commented,
+    } in comments
+    {
+        log::debug!("Found comment \"{comment_text}\" with anchor {commented:?}");
+    }
 
     // Match queries
     let mut cursor = QueryCursor::new();
diff --git a/topiary-core/src/types.rs b/topiary-core/src/types.rs
new file mode 100644
index 00000000..00978fbd
--- /dev/null
+++ b/topiary-core/src/types.rs
@@ -0,0 +1,57 @@
+use std::{cmp::Ord, fmt::Display};
+
+use serde::Serialize;
+use topiary_tree_sitter_facade::{Node, Point};
+
+// A module for common, low-level types in the topiary-core crate.
+
+/// Refers to a position within the code. Used for error reporting, and for
+/// comparing input with formatted output. The numbers are 1-based, because that
+/// is how editors usually refer to a position. Derived from tree_sitter::Point.
+/// Positions are ordered by row first, then by column, i.e. in reading order.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)]
+pub struct Position {
+    pub row: u32,
+    pub column: u32,
+}
+
+impl Display for Position {
+    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(out, "({},{})", self.row, self.column)
+    }
+}
+
+impl From<Point> for Position {
+    fn from(point: Point) -> Self {
+        // Shift both coordinates by one to obtain the 1-based numbers
+        // that `Position` works with.
+        let (row, column) = (point.row() + 1, point.column() + 1);
+        Self { row, column }
+    }
+}
+
+/// A section of contiguous characters in the input, delimited by two positions.
+/// It is assumed that `start <= end`, according to the order on `Position`.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize)]
+pub struct InputSection {
+    pub start: Position,
+    pub end: Position,
+}
+
+impl From<Node<'_>> for InputSection {
+    fn from(value: Node) -> Self {
+        // Both bounds go through `Position::from`, like any other point of the tree.
+        let start = Position::from(value.start_position());
+        let end = Position::from(value.end_position());
+        InputSection { start, end }
+    }
+}
+
+/// A generic trait for subtracting a value of type `T` from `self`, in place. The
+/// operation can be partial: it returns an `ErrorType` when it is undefined. In practice,
+/// it is used to update text positions within the input, when removing parts of it.
+pub trait Diff<T> {
+    type ErrorType;
+
+    fn subtract(self: &mut Self, other: T) -> Result<(), Self::ErrorType>;
+}