From edc5e08a91d61b91a225dc38ff381e1fa0e88d2c Mon Sep 17 00:00:00 2001
From: Nicolas BACQUEY
Date: Wed, 24 May 2023 16:50:25 +0200
Subject: [PATCH] Extract and anchor comments from input

---
 topiary-core/src/comments.rs    | 290 ++++++++++++++++++++++++++++++++
 topiary-core/src/lib.rs         |   4 +-
 topiary-core/src/tree_sitter.rs |  47 +++--
 topiary-core/src/types.rs       |  59 +++++++
 4 files changed, 372 insertions(+), 28 deletions(-)
 create mode 100644 topiary-core/src/comments.rs
 create mode 100644 topiary-core/src/types.rs

diff --git a/topiary-core/src/comments.rs b/topiary-core/src/comments.rs
new file mode 100644
index 00000000..9cb1eed4
--- /dev/null
+++ b/topiary-core/src/comments.rs
@@ -0,0 +1,290 @@
+use topiary_tree_sitter_facade::{InputEdit, Language, Node, Parser, Tree};
+
+use crate::{
+    error::FormatterError,
+    types::{Diff, InputSection, Position},
+    FormatterResult,
+};
+
+/// When you remove a block of text from the input, it changes the positions of every subsequent character.
+/// This is what this Diff instance does.
+impl Diff<InputSection> for Position {
+    type ErrorType = FormatterError;
+
+    fn subtract(&mut self, other: InputSection) -> FormatterResult<()> {
+        if *self <= other.start {
+            // The point is before the removed block: nothing happens.
+            Ok(())
+        } else if other.end <= *self {
+            // The point is after the removed block: its new coordinates depend on whether it was
+            // on the same row as the last point of the removed block.
+            //
+            // See in the following example how the positions of characters `a` and `b`
+            // change when the bracketed block is removed:
+            //
+            // Before:
+            // ..........
+            // ...[---------
+            // ---------
+            // -------]...a..
+            // ...b......
+            // .............
+            //
+            // After:
+            // ..........
+            // ......a..
+            // ...b......
+            // .............
+            let mut row = self.row;
+            let mut column = self.column;
+            if row == other.end.row {
+                column = column + other.start.column - other.end.column
+            }
+            row = row + other.start.row - other.end.row;
+            *self = Position { row, column };
+            Ok(())
+        } else {
+            // The point is within the removed block:
+            // fail, because the point can't be accessed after the subtraction
+            Err(FormatterError::Internal(
+                "Tried to remove a section from a point it contains".into(),
+                None,
+            ))
+        }
+    }
+}
+
+/// Removing a block of text from a section moves both of its ends.
+/// This fails if either end lies strictly inside the removed block.
+impl Diff<InputSection> for InputSection {
+    type ErrorType = FormatterError;
+
+    fn subtract(&mut self, other: Self) -> FormatterResult<()> {
+        self.start.subtract(other)?;
+        self.end.subtract(other)
+    }
+}
+
+/// A node is a comment if the grammar marks it as an extra node, and the name of its
+/// kind contains `comment`.
+fn is_comment(node: &Node) -> bool {
+    node.is_extra() && node.kind().contains("comment")
+}
+
+/// Collect all the comments found in the subtree rooted at `node` into `comments`.
+/// Each comment node is stored along with its text and its anchor (see `find_anchor`).
+/// The children of a comment node are not visited.
+fn find_comments<'a>(
+    node: Node<'a>,
+    input: &str,
+    comments: &mut Vec<(Node<'a>, AnchoredComment)>,
+) -> FormatterResult<()> {
+    if is_comment(&node) {
+        let commented = find_anchor(&node, input)?;
+        comments.push((
+            node.clone(),
+            AnchoredComment {
+                comment_text: node.utf8_text(input.as_bytes())?.to_string(),
+                commented,
+            },
+        ));
+        Ok(())
+    } else {
+        let mut walker = node.walk();
+        for child in node.children(&mut walker) {
+            find_comments(child, input, comments)?;
+        }
+        Ok(())
+    }
+}
+
+/// The section of code to which a comment refers. We also remember whether the comment
+/// is positioned before or after the section.
+#[derive(Copy, Clone, Debug)]
+pub enum Commented {
+    /// The code section is before the comment, as in:
+    /// ```ignore
+    /// struct Foo {
+    ///     baz: Baz, // this is baz
+    ///     qux: Qux, // this is qux
+    /// }
+    /// ```
+    CommentedBefore(InputSection),
+    /// The code section is after the comment, as in:
+    /// ```ignore
+    /// struct Foo {
+    ///     // let's have a baz
+    ///     baz: Baz,
+    ///     // and a qux
+    ///     qux: Qux,
+    /// }
+    /// ```
+    CommentedAfter(InputSection),
+}
+
+/// Removing a block of text moves the anchored section, whichever side of the comment it is on.
+impl Diff<InputSection> for Commented {
+    type ErrorType = FormatterError;
+
+    fn subtract(&mut self, other: InputSection) -> FormatterResult<()> {
+        match self {
+            Commented::CommentedBefore(section) => section.subtract(other),
+            Commented::CommentedAfter(section) => section.subtract(other),
+        }
+    }
+}
+
+/// Return the closest sibling after `node` that is not a comment, if there is one.
+fn next_non_comment<'tree>(node: Node<'tree>) -> Option<Node<'tree>> {
+    let mut temp_node: Node<'tree> = node;
+    loop {
+        match temp_node.next_sibling() {
+            Some(sibling) => {
+                if !is_comment(&sibling) {
+                    return Some(sibling);
+                }
+                temp_node = sibling;
+            }
+            None => return None,
+        }
+    }
+}
+
+/// Return the closest sibling before `node` that is not a comment, if there is one.
+fn previous_non_comment<'tree>(node: Node<'tree>) -> Option<Node<'tree>> {
+    let mut temp_node = node;
+    loop {
+        match temp_node.prev_sibling() {
+            Some(sibling) => {
+                if !is_comment(&sibling) {
+                    return Some(sibling);
+                }
+                temp_node = sibling;
+            }
+            None => return None,
+        }
+    }
+}
+
+/// Parse `content` with `grammar`, giving `old_tree` to the parser as the previous version
+/// of the tree. Fails if the parser does not return a tree.
+fn reparse(
+    old_tree: Tree,
+    content: &str,
+    grammar: &Language,
+) -> FormatterResult<Tree> {
+    let mut parser = Parser::new()?;
+    parser.set_language(grammar)?;
+    let tree = parser
+        .parse(content, Some(&old_tree))?
+        .ok_or_else(|| FormatterError::Internal("Could not parse input".into(), None))?;
+    Ok(tree)
+}
+
+/// Use the following heuristics to find a comment's anchor:
+/// If the comment is only prefixed by blank symbols on its line, then the anchor is the
+/// next non-comment sibling node.
+/// Otherwise, the anchor is the previous non-comment sibling node.
+/// If there is no such node, we anchor to the first non-comment sibling node
+/// in the other direction.
+fn find_anchor<'tree>(node: &'tree Node<'tree>, input: &str) -> FormatterResult<Commented> {
+    let point = node.start_position();
+    let mut lines = input.lines();
+    let prefix = lines
+        .nth(point.row() as usize)
+        .map(|line| &line[..point.column() as usize])
+        .ok_or_else(|| {
+            FormatterError::Internal(
+                format!(
+                    "Trying to access nonexistent line {} in text:\n{}",
+                    point.row(),
+                    input,
+                ),
+                None,
+            )
+        })?;
+    let following =
+        || next_non_comment(node.clone()).map(|anchor| Commented::CommentedAfter(anchor.into()));
+    let preceding = || {
+        previous_non_comment(node.clone()).map(|anchor| Commented::CommentedBefore(anchor.into()))
+    };
+    let anchor = if prefix.trim_start().is_empty() {
+        following().or_else(preceding)
+    } else {
+        preceding().or_else(following)
+    };
+    anchor.ok_or_else(|| {
+        FormatterError::Internal(format!("Could find no anchor for comment {node:?}",), None)
+    })
+}
+
+/// The text of a comment, together with the section of code it is anchored to.
+#[derive(Clone, Debug)]
+pub struct AnchoredComment {
+    pub comment_text: String,
+    pub commented: Commented,
+}
+
+/// The result of `extract_comments`: the tree and the text of the input once its comments
+/// have been removed, and the comments that were removed from it.
+pub struct SeparatedInput {
+    pub input_tree: Tree,
+    pub input_string: String,
+    pub comments: Vec<AnchoredComment>,
+}
+
+/// Remove every comment from `input`.
+///
+/// Returns the comment-free text, the tree obtained by re-parsing it, and the removed
+/// comments with their anchors, expressed in the coordinates of the comment-free text.
+/// The comments are listed from the last one in the input to the first one.
+pub fn extract_comments<'a>(
+    tree: &'a Tree,
+    input: &'a str,
+    grammar: &Language,
+) -> FormatterResult<SeparatedInput> {
+    let mut anchors: Vec<(Node, AnchoredComment)> = Vec::new();
+    find_comments(tree.root_node(), input, &mut anchors)?;
+    anchors.sort_by_key(|(node, _)| node.start_byte());
+    let (comments, mut anchored_comments): (Vec<Node>, Vec<AnchoredComment>) =
+        anchors.into_iter().unzip();
+    let mut new_input: String = input.to_string();
+    let mut new_tree: Tree = tree.clone();
+    // for each comment in reverse order, we:
+    // 1) remove the comment from the input,
+    // 2) edit the tree to account for the removed comment,
+    // 3) edit all the anchors to account for the removed comment.
+    //
+    // The order is reversed so that all removals can be applied in succession:
+    // one will not affect the positions of the comments that remain to be removed.
+    for comment in comments.iter().rev() {
+        // 1)
+        new_input.replace_range(
+            (comment.start_byte() as usize)..(comment.end_byte() as usize),
+            "",
+        );
+        // 2)
+        new_tree.edit(&InputEdit::new(
+            comment.start_byte(),
+            comment.end_byte(),
+            comment.start_byte(),
+            &comment.start_position(),
+            &comment.end_position(),
+            &comment.start_position(),
+        ));
+        // 3) Every anchor is updated, not only those of the comments removed so far:
+        // the anchor of an earlier comment can lie after this comment (two comments in a
+        // row share the same anchor), or even contain it.
+        let removed: InputSection = comment.clone().into();
+        for anchored_comment in anchored_comments.iter_mut() {
+            anchored_comment.commented.subtract(removed)?;
+        }
+    }
+    anchored_comments.reverse();
+    new_tree = reparse(new_tree, new_input.as_str(), grammar)?;
+    Ok(SeparatedInput {
+        input_tree: new_tree,
+        input_string: new_input,
+        comments: anchored_comments,
+    })
+}
diff --git a/topiary-core/src/lib.rs b/topiary-core/src/lib.rs
index e050515f..f30c95b0 100644
--- a/topiary-core/src/lib.rs
+++ b/topiary-core/src/lib.rs
@@ -14,20 +14,22 @@ use std::io;
 
 use itertools::Itertools;
 use pretty_assertions::StrComparison;
-use tree_sitter::Position;
 
 pub use crate::{
     error::{FormatterError, IoError},
     language::Language,
     tree_sitter::{apply_query, CoverageData, SyntaxNode, TopiaryQuery, Visualisation},
+    types::Position,
 };
 
 mod atom_collection;
+mod comments;
 mod error;
 mod graphviz;
 mod language;
 mod pretty;
 mod tree_sitter;
+mod types;
 
 #[doc(hidden)]
 pub mod test_utils;
diff --git a/topiary-core/src/tree_sitter.rs b/topiary-core/src/tree_sitter.rs
index 5b75d4d7..dc5129b8 100644
--- a/topiary-core/src/tree_sitter.rs
+++ b/topiary-core/src/tree_sitter.rs
@@ -14,7 +14,9 @@ use streaming_iterator::StreamingIterator;
 
 use crate::{
     atom_collection::{AtomCollection, QueryPredicates},
+    comments::{extract_comments, AnchoredComment, SeparatedInput},
     error::FormatterError,
+    types::Position,
     FormatterResult,
 };
 
@@ -25,21 +27,6 @@ pub enum Visualisation {
     Json,
 }
 
-/// Refers to a position within the code. Used for error reporting, and for
-/// comparing input with formatted output. The numbers are 1-based, because that
-/// is how editors usually refer to a position. Derived from tree_sitter::Point.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
-pub struct Position {
-    pub row: u32,
-    pub column: u32,
-}
-
-impl Display for Position {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
-        write!(f, "({},{})", self.row, self.column)
-    }
-}
-
 /// Topiary often needs both the tree-sitter `Query` and the original content
 /// beloging to the file from which the query was parsed. This struct is a simple
 /// convenience wrapper that combines the `Query` with its original string.
@@ -97,15 +84,6 @@ impl TopiaryQuery {
     }
 }
 
-impl From<Point> for Position {
-    fn from(point: Point) -> Self {
-        Self {
-            row: point.row() + 1,
-            column: point.column() + 1,
-        }
-    }
-}
-
 // Simplified syntactic node struct, for the sake of serialisation.
 #[derive(Serialize)]
 pub struct SyntaxNode {
@@ -233,9 +211,24 @@ pub fn apply_query(
     grammar: &topiary_tree_sitter_facade::Language,
     tolerate_parsing_errors: bool,
 ) -> FormatterResult<AtomCollection> {
-    let (tree, _grammar) = parse(input_content, grammar, tolerate_parsing_errors)?;
-    let root = tree.root_node();
-    let source = input_content.as_bytes();
+    let (tree, grammar) = parse(input_content, grammar, tolerate_parsing_errors)?;
+
+    // Remove comments in a separate stream before applying queries
+    let SeparatedInput {
+        input_string,
+        input_tree,
+        comments,
+    } = extract_comments(&tree, input_content, grammar)?;
+    let source = input_string.as_bytes();
+    let root = input_tree.root_node();
+
+    for AnchoredComment {
+        comment_text,
+        commented,
+    } in comments
+    {
+        log::debug!("Found comment \"{comment_text}\" with anchor {commented:?}");
+    }
 
     // Match queries
     let mut cursor = QueryCursor::new();
diff --git a/topiary-core/src/types.rs b/topiary-core/src/types.rs
new file mode 100644
index 00000000..00978fbd
--- /dev/null
+++ b/topiary-core/src/types.rs
@@ -0,0 +1,59 @@
+//! A module for common, low-level types in the topiary-core crate
+
+use std::{cmp::Ord, fmt::Display};
+
+use serde::Serialize;
+use topiary_tree_sitter_facade::{Node, Point};
+
+/// Refers to a position within the code. Used for error reporting, and for
+/// comparing input with formatted output. The numbers are 1-based, because that
+/// is how editors usually refer to a position. Derived from tree_sitter::Point.
+/// Note that the order is the standard western reading order.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize)]
+pub struct Position {
+    pub row: u32,
+    pub column: u32,
+}
+
+impl Display for Position {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
+        write!(f, "({},{})", self.row, self.column)
+    }
+}
+
+impl From<Point> for Position {
+    fn from(point: Point) -> Self {
+        Self {
+            row: point.row() + 1,
+            column: point.column() + 1,
+        }
+    }
+}
+
+/// Some section of contiguous characters in the input.
+/// It is assumed that `start <= end`, according to the order on `Position`.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize)]
+pub struct InputSection {
+    pub start: Position,
+    pub end: Position,
+}
+
+impl From<Node<'_>> for InputSection {
+    fn from(value: Node) -> Self {
+        InputSection {
+            start: value.start_position().into(),
+            end: value.end_position().into(),
+        }
+    }
+}
+
+/// A generic trait to subtract stuff from other stuff. The function can be partial.
+/// In practice, it will be used to update text positions within the input,
+/// when removing parts of it.
+pub trait Diff<T> {
+    /// The error returned when the subtraction is not defined.
+    type ErrorType;
+
+    /// Subtract `other` from `self`, in place.
+    fn subtract(&mut self, other: T) -> Result<(), Self::ErrorType>;
+}