Skip to content

Commit

Permalink
Extract and anchor comments from input
Browse files Browse the repository at this point in the history
  • Loading branch information
nbacquey committed Dec 4, 2024
1 parent 932cb22 commit edc5e08
Show file tree
Hide file tree
Showing 4 changed files with 374 additions and 28 deletions.
294 changes: 294 additions & 0 deletions topiary-core/src/comments.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
use topiary_tree_sitter_facade::{InputEdit, Language, Node, Parser, Tree};

use crate::{
error::FormatterError,
types::{Diff, InputSection, Position},
FormatterResult,
};

/// Adjusts a position to account for a block of text being removed from the input.
///
/// Removing a block shifts every position located after it; positions located before it
/// are unaffected, and positions strictly inside it no longer exist.
impl Diff<InputSection> for Position {
    type ErrorType = FormatterError;

    /// Rewrites `self` into the coordinates of the text obtained by removing `other`.
    ///
    /// # Errors
    ///
    /// Returns `FormatterError::Internal` when `self` lies strictly inside `other`.
    fn subtract(&mut self, other: InputSection) -> FormatterResult<()> {
        // At or before the start of the removed block: the position does not move.
        if *self <= other.start {
            return Ok(());
        }

        // At or after the end of the removed block: the new coordinates depend on whether
        // the position shares its row with the last point of the removed block.
        //
        // See in the following example how the positions of characters `a` and `b`
        // change when the bracketed block is removed:
        //
        // Before:
        // ..........
        // ...[---------
        // ---------
        // -------]...a..
        // ...b......
        // .............
        //
        // After:
        // ..........
        // ......a..
        // ...b......
        // .............
        if other.end <= *self {
            // Only a position on the block's last row is pulled back horizontally, so that
            // it ends up right after whatever preceded the block on its first row.
            let column = if self.row == other.end.row {
                self.column + other.start.column - other.end.column
            } else {
                self.column
            };
            // Every row spanned by the block beyond its first one disappears.
            let row = self.row + other.start.row - other.end.row;
            *self = Position { row, column };
            return Ok(());
        }

        // Strictly inside the removed block: the position cannot be expressed any more.
        Err(FormatterError::Internal(
            "Tried to remove a section from a point it contains".into(),
            None,
        ))
    }
}

/// Adjusts a section to account for another section being removed from the input.
impl Diff<InputSection> for InputSection {
    type ErrorType = FormatterError;

    /// Subtracts `other` from the start of the section, then from its end.
    ///
    /// # Errors
    ///
    /// Forwards the error of the first endpoint whose subtraction fails. If the start
    /// succeeds and the end fails, the start has already been updated.
    fn subtract(&mut self, other: Self) -> FormatterResult<()> {
        for endpoint in [&mut self.start, &mut self.end] {
            endpoint.subtract(other)?;
        }
        Ok(())
    }
}

/// Tells whether `node` should be treated as a comment.
///
/// A node qualifies when the grammar marks it as "extra" and its kind name contains the
/// substring `comment`.
fn is_comment(node: &Node) -> bool {
    // `contains` is called directly on the kind name: building an owned `String` first
    // would allocate once per visited node for no benefit.
    node.is_extra() && node.kind().contains("comment")
}

/// Walks the subtree rooted at `node` and records every comment it contains.
///
/// Each comment found is appended to `comments` as a pair made of the comment node and
/// its text together with the anchor computed by `find_anchor`. The children of a comment
/// node are not visited.
///
/// # Errors
///
/// Fails if an anchor cannot be found for a comment, or if the text of a comment cannot
/// be extracted from `input`.
fn find_comments<'a>(
    node: Node<'a>,
    input: &str,
    comments: &mut Vec<(Node<'a>, AnchoredComment)>,
) -> FormatterResult<()> {
    // Not a comment: keep looking in the children.
    if !is_comment(&node) {
        let mut cursor = node.walk();
        for child in node.children(&mut cursor) {
            find_comments(child, input, comments)?;
        }
        return Ok(());
    }

    let commented = find_anchor(&node, input)?;
    let comment_text = node.utf8_text(input.as_bytes())?.to_string();
    let anchored = AnchoredComment {
        comment_text,
        commented,
    };
    comments.push((node.clone(), anchored));
    Ok(())
}

/// The section of code to which a comment refers (its anchor). We also remember whether
/// the comment is positioned before or after the section.
///
/// The examples below are marked `ignore` because they are illustrations only: they
/// refer to undefined types and must not be compiled as documentation tests.
#[derive(Copy, Clone, Debug)]
pub enum Commented {
    /// The code section is before the comment, as in:
    /// ```ignore
    /// struct Foo {
    ///     baz: Baz, // this is baz
    ///     qux: Qux, // this is qux
    /// }
    /// ```
    CommentedBefore(InputSection),
    /// The code section is after the comment, as in:
    /// ```ignore
    /// struct Foo {
    ///     // let's have a baz
    ///     baz: Baz,
    ///     // and a qux
    ///     qux: Qux,
    /// }
    /// ```
    CommentedAfter(InputSection),
}

/// Adjusts the anchor of a comment to account for a section being removed from the input.
impl Diff<InputSection> for Commented {
    type ErrorType = FormatterError;

    /// Subtracts `other` from the anchored section, whichever side of the comment it is on.
    fn subtract(&mut self, other: InputSection) -> FormatterResult<()> {
        match self {
            Commented::CommentedBefore(anchor) | Commented::CommentedAfter(anchor) => {
                anchor.subtract(other)
            }
        }
    }
}

/// Returns the closest sibling following `node` that is not a comment, if there is one.
fn next_non_comment<'tree>(node: Node<'tree>) -> Option<Node<'tree>> {
    let mut candidate = node.next_sibling();
    while let Some(sibling) = candidate {
        if !is_comment(&sibling) {
            return Some(sibling);
        }
        // Skip over the comment and keep looking further along.
        candidate = sibling.next_sibling();
    }
    None
}

/// Returns the closest sibling preceding `node` that is not a comment, if there is one.
fn previous_non_comment<'tree>(node: Node<'tree>) -> Option<Node<'tree>> {
    let mut candidate = node.prev_sibling();
    while let Some(sibling) = candidate {
        if !is_comment(&sibling) {
            return Some(sibling);
        }
        // Skip over the comment and keep looking further back.
        candidate = sibling.prev_sibling();
    }
    None
}

/// Parses `content` with `grammar`, handing `old_tree` to the parser as the previous tree.
///
/// # Errors
///
/// Fails if the parser cannot be created or configured, if parsing itself reports an
/// error, or if the parser yields no tree.
fn reparse(
    old_tree: Tree,
    content: &str,
    grammar: &topiary_tree_sitter_facade::Language,
) -> FormatterResult<Tree> {
    let mut parser = Parser::new()?;
    parser.set_language(grammar)?;
    match parser.parse(content, Some(&old_tree))? {
        Some(tree) => Ok(tree),
        None => Err(FormatterError::Internal(
            "Could not parse input".into(),
            None,
        )),
    }
}

/// Finds the section of code (the anchor) that the comment `node` refers to.
///
/// The following heuristics are used:
/// - If the comment is only prefixed by blank symbols on its line, the anchor is the
///   next non-comment sibling node.
/// - Otherwise, the anchor is the previous non-comment sibling node.
/// - If there is no such node, the comment is anchored to the first non-comment sibling
///   node in the other direction.
///
/// # Errors
///
/// Returns `FormatterError::Internal` when the line on which the comment starts cannot be
/// found in `input`, when the start column of the comment is not a valid offset in that
/// line, or when the comment has no non-comment sibling at all.
fn find_anchor<'tree>(node: &'tree Node<'tree>, input: &str) -> FormatterResult<Commented> {
    let point = node.start_position();
    let line = input.lines().nth(point.row() as usize).ok_or_else(|| {
        FormatterError::Internal(
            format!(
                "Trying to access nonexistent line {} in text:\n{}",
                point.row(),
                input,
            ),
            None,
        )
    })?;
    // Checked slicing: a column past the end of the line, or one that does not fall on a
    // character boundary, is reported as an error instead of panicking.
    let prefix = line.get(..point.column() as usize).ok_or_else(|| {
        FormatterError::Internal(
            format!(
                "Column {} is not a valid offset in line {} of the input",
                point.column(),
                point.row(),
            ),
            None,
        )
    })?;

    // Both lookups are lazy, so the fallback direction is only explored when needed.
    let following =
        || next_non_comment(node.clone()).map(|anchor| Commented::CommentedAfter(anchor.into()));
    let preceding = || {
        previous_non_comment(node.clone()).map(|anchor| Commented::CommentedBefore(anchor.into()))
    };

    let anchor = if prefix.trim_start().is_empty() {
        // The comment is alone at the beginning of its line: it describes what follows.
        following().or_else(preceding)
    } else {
        // The comment trails some code: it describes what precedes it.
        preceding().or_else(following)
    };

    anchor.ok_or_else(|| {
        FormatterError::Internal(
            format!("Could find no anchor for comment {node:?}",),
            None,
        )
    })
}

/// A comment extracted from the input, together with the code section it refers to.
#[derive(Clone, Debug)]
pub struct AnchoredComment {
    /// The text of the comment, as it appears in the input.
    pub comment_text: String,
    /// The code section the comment is anchored to, and on which side of the comment
    /// that section lies.
    pub commented: Commented,
}

/// The result of `extract_comments`: the input stripped of its comments, and the comments
/// that were removed from it.
pub struct SeparatedInput {
    /// The syntax tree obtained by parsing `input_string`.
    pub input_tree: Tree,
    /// The input text with every comment removed.
    pub input_string: String,
    /// The removed comments with their anchors, in reverse document order (the last
    /// comment of the input comes first).
    pub comments: Vec<AnchoredComment>,
}

pub fn extract_comments<'a>(
tree: &'a Tree,
input: &'a str,
grammar: &Language,
) -> FormatterResult<SeparatedInput> {
let mut anchors: Vec<(Node, AnchoredComment)> = Vec::new();
let mut anchored_comments: Vec<AnchoredComment> = Vec::new();
let mut new_input: String = input.to_string();
let mut new_tree: Tree = tree.clone();
find_comments(tree.root_node(), input, &mut anchors)?;
anchors.sort_by_key(|(node, _)| node.start_byte());
let mut edits: Vec<InputEdit> = Vec::new();
// for each (comment, anchor) pair in reverse order, we:
// 1) remove the comment from the input,
// 2) register an InputEdit to modify the tree,
// 3) edit the following anchors to account for the removed comment.
//
// The order is reversed so that all InputEdits can be applied in succession:
// one will not affect the others.
while let Some((comment, anchored_comment)) = anchors.pop() {
// 1)
new_input.replace_range(
(comment.start_byte() as usize)..(comment.end_byte() as usize),
"",
);
// 2)
let edit = InputEdit::new(
comment.start_byte(),
comment.end_byte(),
comment.start_byte(),
&comment.start_position(),
&comment.end_position(),
&comment.start_position(),
);
edits.push(edit);
// 3)
anchored_comments.push(anchored_comment);
anchored_comments = anchored_comments
.iter()
.map(
|AnchoredComment {
mut commented,
comment_text,
}|
-> FormatterResult<AnchoredComment> {
commented.subtract(comment.clone().into())?;
Ok(AnchoredComment {
commented,
comment_text: comment_text.to_string(),
})
},
)
.collect::<FormatterResult<Vec<_>>>()?;
}
for edit in edits {
new_tree.edit(&edit);
}
new_tree = reparse(new_tree, new_input.as_str(), grammar)?;
Ok(SeparatedInput {
input_tree: new_tree,
input_string: new_input,
comments: anchored_comments,
})
}
4 changes: 3 additions & 1 deletion topiary-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,22 @@ use std::io;

use itertools::Itertools;
use pretty_assertions::StrComparison;
use tree_sitter::Position;

pub use crate::{
error::{FormatterError, IoError},
language::Language,
tree_sitter::{apply_query, CoverageData, SyntaxNode, TopiaryQuery, Visualisation},
types::Position,
};

mod atom_collection;
mod comments;
mod error;
mod graphviz;
mod language;
mod pretty;
mod tree_sitter;
mod types;

#[doc(hidden)]
pub mod test_utils;
Expand Down
47 changes: 20 additions & 27 deletions topiary-core/src/tree_sitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ use streaming_iterator::StreamingIterator;

use crate::{
atom_collection::{AtomCollection, QueryPredicates},
comments::{extract_comments, AnchoredComment, SeparatedInput},
error::FormatterError,
types::Position,
FormatterResult,
};

Expand All @@ -25,21 +27,6 @@ pub enum Visualisation {
Json,
}

/// Refers to a position within the code. Used for error reporting, and for
/// comparing input with formatted output. The numbers are 1-based, because that
/// is how editors usually refer to a position. Derived from tree_sitter::Point.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
pub struct Position {
    /// Row (line) number, starting at 1.
    pub row: u32,
    /// Column number within the row, starting at 1.
    pub column: u32,
}

/// Displays a position as `(row,column)`.
impl Display for Position {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
        let Self { row, column } = self;
        write!(f, "({row},{column})")
    }
}

/// Topiary often needs both the tree-sitter `Query` and the original content
/// belonging to the file from which the query was parsed. This struct is a simple
/// convenience wrapper that combines the `Query` with its original string.
Expand Down Expand Up @@ -97,15 +84,6 @@ impl TopiaryQuery {
}
}

/// Converts a `Point` into a `Position` by adding one to both of its coordinates.
impl From<Point> for Position {
    fn from(point: Point) -> Self {
        let (row, column) = (point.row() + 1, point.column() + 1);
        Self { row, column }
    }
}

// Simplified syntactic node struct, for the sake of serialisation.
#[derive(Serialize)]
pub struct SyntaxNode {
Expand Down Expand Up @@ -233,9 +211,24 @@ pub fn apply_query(
grammar: &topiary_tree_sitter_facade::Language,
tolerate_parsing_errors: bool,
) -> FormatterResult<AtomCollection> {
let (tree, _grammar) = parse(input_content, grammar, tolerate_parsing_errors)?;
let root = tree.root_node();
let source = input_content.as_bytes();
let (tree, grammar) = parse(input_content, grammar, tolerate_parsing_errors)?;

// Remove comments in a separate stream before applying queries
let SeparatedInput {
input_string,
input_tree,
comments,
} = extract_comments(&tree, input_content, grammar)?;
let source = input_string.as_bytes();
let root = input_tree.root_node();

for AnchoredComment {
comment_text,
commented,
} in comments
{
log::debug!("Found comment \"{comment_text}\" with anchor {commented:?}");
}

// Match queries
let mut cursor = QueryCursor::new();
Expand Down
Loading

0 comments on commit edc5e08

Please sign in to comment.