Skip to content

Commit

Permalink
Identify comments with independent query files
Browse files Browse the repository at this point in the history
Instead of using ad hoc logic to identify comment nodes, this commit
uses a separate query file.
  • Loading branch information
nbacquey committed Jan 10, 2025
1 parent e965cd7 commit 019848b
Show file tree
Hide file tree
Showing 18 changed files with 254 additions and 40 deletions.
78 changes: 70 additions & 8 deletions topiary-cli/src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ pub struct InputFile<'cfg> {
source: InputSource,
language: &'cfg topiary_config::language::Language,
query: QuerySource,
comment_query: Option<QuerySource>,
}

impl<'cfg> InputFile<'cfg> {
Expand All @@ -125,9 +126,21 @@ impl<'cfg> InputFile<'cfg> {
};
let query = TopiaryQuery::new(&grammar, &contents)?;

// Can't use `map` because of closures, async, and Result.
let comment_contents = match &self.comment_query {
Some(QuerySource::Path(query)) => Some(tokio::fs::read_to_string(query).await?),
Some(QuerySource::BuiltIn(contents)) => Some(contents.to_owned()),
None => None,
};
let comment_query = match comment_contents {
Some(c) => Some(TopiaryQuery::new(&grammar, &c)?),
None => None,
};

Ok(Language {
name: self.language.name.clone(),
query,
comment_query,
grammar,
indent: self.language().config.indent.clone(),
})
Expand Down Expand Up @@ -178,17 +191,18 @@ impl<'cfg, 'i> Inputs<'cfg> {
InputFrom::Stdin(language_name, query) => {
vec![(|| {
let language = config.get_language(&language_name)?;
let query_source: QuerySource = match query {
let (query, comment_query) = match query {
// The user specified a query file
Some(p) => p,
Some(p) => (p, None),
// The user did not specify a file, try the default locations
None => to_query_from_language(language)?,
};

Ok(InputFile {
source: InputSource::Stdin,
language,
query: query_source,
query,
comment_query,
})
})()]
}
Expand All @@ -197,12 +211,13 @@ impl<'cfg, 'i> Inputs<'cfg> {
.into_iter()
.map(|path| {
let language = config.detect(&path)?;
let query: QuerySource = to_query_from_language(language)?;
let (query, comment_query) = to_query_from_language(language)?;

Ok(InputFile {
source: InputSource::Disk(path, None),
language,
query,
comment_query,
})
})
.collect(),
Expand All @@ -212,16 +227,21 @@ impl<'cfg, 'i> Inputs<'cfg> {
}
}

fn to_query_from_language(language: &topiary_config::language::Language) -> CLIResult<QuerySource> {
let query: QuerySource = match language.find_query_file() {
Ok(p) => p.into(),
fn to_query_from_language(
language: &topiary_config::language::Language,
) -> CLIResult<(QuerySource, Option<QuerySource>)> {
let query: (QuerySource, Option<QuerySource>) = match language.find_query_file() {
Ok((path, comment_path)) => (path.into(), comment_path.map(|p| p.into())),
// For some reason, Topiary could not find any
// matching file in a default location. As a final attempt, try the
// builtin ones. Store the error, return that if we
// fail to find anything, because the builtin error might be unexpected.
Err(e) => {
log::warn!("No query files found in any of the expected locations. Falling back to compile-time included files.");
to_query(&language.name).map_err(|_| e)?
(
to_query(&language.name).map_err(|_| e)?,
to_comment_query(&language.name)?,
)
}
};
Ok(query)
Expand Down Expand Up @@ -364,3 +384,45 @@ where
)),
}
}

fn to_comment_query<T>(name: T) -> CLIResult<Option<QuerySource>>
where
T: AsRef<str> + fmt::Display,
{
match name.as_ref() {
#[cfg(feature = "bash")]
"bash" => Ok(Some(topiary_queries::bash_comment().into())),

#[cfg(feature = "css")]
"css" => Ok(Some(topiary_queries::css_comment().into())),

#[cfg(feature = "json")]
"json" => Ok(None),

#[cfg(feature = "nickel")]
"nickel" => Ok(Some(topiary_queries::nickel_comment().into())),

#[cfg(feature = "ocaml")]
"ocaml" => Ok(Some(topiary_queries::ocaml_comment().into())),

#[cfg(feature = "ocaml_interface")]
"ocaml_interface" => Ok(Some(topiary_queries::ocaml_interface_comment().into())),

#[cfg(feature = "ocamllex")]
"ocamllex" => Ok(Some(topiary_queries::ocamllex_comment().into())),

#[cfg(feature = "rust")]
"rust" => Ok(Some(topiary_queries::rust_comment().into())),

#[cfg(feature = "toml")]
"toml" => Ok(Some(topiary_queries::toml_comment().into())),

#[cfg(feature = "tree_sitter_query")]
"tree_sitter_query" => Ok(Some(topiary_queries::tree_sitter_query_comment().into())),

name => Err(TopiaryError::Bin(
format!("The specified language is unsupported: {}", name),
Some(CLIError::UnsupportedLanguage(name.to_string())),
)),
}
}
18 changes: 14 additions & 4 deletions topiary-config/src/language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ impl Language {
}

#[cfg(not(target_arch = "wasm32"))]
pub fn find_query_file(&self) -> TopiaryConfigResult<PathBuf> {
let basename = PathBuf::from(self.name.as_str()).with_extension("scm");
pub fn find_query_file(&self) -> TopiaryConfigResult<(PathBuf, Option<PathBuf>)> {
let name = self.name.clone();
let basename = PathBuf::from(&name).with_extension("scm"); // "clang.scm"
let comment_basename = PathBuf::from(&name).with_extension("comment.scm"); // "clang.comment.scm"

#[rustfmt::skip]
let potentials: [Option<PathBuf>; 4] = [
Expand All @@ -92,12 +94,20 @@ impl Language {
Some(PathBuf::from("../topiary-queries/queries")),
];

potentials
let query_file = potentials
.into_iter()
.flatten()
.map(|path| path.join(&basename))
.find(|path| path.exists())
.ok_or_else(|| TopiaryConfigError::QueryFileNotFound(basename))
.ok_or_else(|| TopiaryConfigError::QueryFileNotFound(basename))?;

let comment_query_file = query_file.parent().unwrap().join(comment_basename);

if comment_query_file.exists() {
Ok((query_file, Some(comment_query_file)))
} else {
Ok((query_file, None))
}
}

#[cfg(not(target_arch = "wasm32"))]
Expand Down
5 changes: 5 additions & 0 deletions topiary-core/benches/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ use topiary_core::{formatter, Language, Operation, TopiaryQuery};
async fn format() {
let input = fs::read_to_string("../topiary-cli/tests/samples/input/ocaml.ml").unwrap();
let query_content = fs::read_to_string("../topiary-queries/queries/ocaml.scm").unwrap();
let comment_query_content =
fs::read_to_string("../topiary-queries/queries/ocaml.comment.scm").unwrap();
let ocaml = tree_sitter_ocaml::LANGUAGE_OCAML;

let mut input = input.as_bytes();
Expand All @@ -15,6 +17,9 @@ async fn format() {
let language: Language = Language {
name: "ocaml".to_owned(),
query: TopiaryQuery::new(&ocaml.clone().into(), &query_content).unwrap(),
comment_query: Some(
TopiaryQuery::new(&ocaml.clone().into(), &comment_query_content).unwrap(),
),
grammar: ocaml.into(),
indent: None,
};
Expand Down
53 changes: 31 additions & 22 deletions topiary-core/src/comments.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::collections::HashSet;

use topiary_tree_sitter_facade::{InputEdit, Language, Node, Tree};

use crate::{
Expand Down Expand Up @@ -63,11 +65,6 @@ impl Diff<InputSection> for InputSection {
}
}

// TODO: allow the users to manually identify comments. Maybe with a separate query file?
fn is_comment(node: &Node) -> bool {
node.is_extra() && node.kind().to_string().contains("comment")
}

fn into_edit(node: &Node<'_>) -> InputEdit {
InputEdit::new(
node.start_byte(),
Expand All @@ -82,10 +79,11 @@ fn into_edit(node: &Node<'_>) -> InputEdit {
fn find_comments(
node: Node,
input: &str,
comment_ids: &HashSet<usize>,
comments: &mut Vec<(InputEdit, AnchoredComment)>,
) -> FormatterResult<()> {
if is_comment(&node) {
let commented = find_anchor(&node, input)?;
if comment_ids.contains(&node.id()) {
let commented = find_anchor(&node, input, comment_ids)?;
// Build the corresponding InputEdit:
// - If the comment is not alone on its line, return its bounds
// - If the comment is alone on its line, return the bounds of all its line
Expand Down Expand Up @@ -150,7 +148,7 @@ fn find_comments(
} else {
let mut walker = node.walk();
for child in node.children(&mut walker) {
find_comments(child, input, comments)?;
find_comments(child, input, comment_ids, comments)?;
}
Ok(())
}
Expand Down Expand Up @@ -259,7 +257,10 @@ fn previous_disjoint_node<'tree>(starting_node: &'tree Node<'tree>) -> Option<No
}

// TODO: if performance is an issue, use TreeCursor to navigate the tree
fn next_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree>> {
fn next_non_comment_leaf<'tree>(
starting_node: Node<'tree>,
comment_ids: &HashSet<usize>,
) -> Option<Node<'tree>> {
let mut node: Node<'tree> = starting_node;
loop {
// get the next leaf:
Expand All @@ -275,7 +276,7 @@ fn next_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree
}
Some(sibling) => {
node = sibling;
if is_comment(&node) {
if comment_ids.contains(&node.id()) {
// get the following sibling
continue;
} else {
Expand All @@ -287,14 +288,14 @@ fn next_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree
// 2) get the leftmost leaf of the sibling.
// If we encounter a comment, we stop. We'll get back to 1) after the loop
while let Some(child) = node.child(0) {
if is_comment(&child) {
if comment_ids.contains(&child.id()) {
break;
} else {
node = child
}
}
// check if the leaf is a comment. If it is not, start over again.
if is_comment(&node) {
if comment_ids.contains(&node.id()) {
continue;
} else {
return Some(node);
Expand All @@ -303,7 +304,10 @@ fn next_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree
}

// TODO: if performance is an issue, use TreeCursor to navigate the tree
fn previous_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'tree>> {
fn previous_non_comment_leaf<'tree>(
starting_node: Node<'tree>,
comment_ids: &HashSet<usize>,
) -> Option<Node<'tree>> {
let mut node: Node<'tree> = starting_node;
loop {
// get the previous leaf:
Expand All @@ -320,7 +324,7 @@ fn previous_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'
}
Some(sibling) => {
node = sibling;
if is_comment(&node) {
if comment_ids.contains(&node.id()) {
// get the previous sibling
continue;
} else {
Expand All @@ -338,14 +342,14 @@ fn previous_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'
node.child(node.child_count() - 1)
}
} {
if is_comment(&child) {
if comment_ids.contains(&child.id()) {
break;
} else {
node = child
}
}
// check if the leaf is a comment. If it is not, start over again.
if is_comment(&node) {
if comment_ids.contains(&node.id()) {
continue;
} else {
return Some(node);
Expand All @@ -360,7 +364,11 @@ fn previous_non_comment_leaf<'tree>(starting_node: Node<'tree>) -> Option<Node<'
// If there is no such node, we anchor to the first non-comment sibling node
// in the other direction.
#[allow(clippy::collapsible_else_if)]
fn find_anchor<'tree>(node: &'tree Node<'tree>, input: &str) -> FormatterResult<Commented> {
fn find_anchor<'tree>(
node: &'tree Node<'tree>,
input: &str,
comment_ids: &HashSet<usize>,
) -> FormatterResult<Commented> {
let point = node.start_position();
let mut lines = input.lines();
let prefix = lines
Expand All @@ -377,7 +385,7 @@ fn find_anchor<'tree>(node: &'tree Node<'tree>, input: &str) -> FormatterResult<
)
})?;
if prefix.trim_start() == "" {
if let Some(anchor) = next_non_comment_leaf(node.clone()) {
if let Some(anchor) = next_non_comment_leaf(node.clone(), comment_ids) {
let prev = previous_disjoint_node(node);
let next = next_disjoint_node(node);
Ok(Commented::CommentedAfter {
Expand All @@ -389,17 +397,17 @@ fn find_anchor<'tree>(node: &'tree Node<'tree>, input: &str) -> FormatterResult<
.map(|prev| prev.end_position().row() + 1 < node.start_position().row())
.unwrap_or(false),
})
} else if let Some(anchor) = previous_non_comment_leaf(node.clone()) {
} else if let Some(anchor) = previous_non_comment_leaf(node.clone(), comment_ids) {
Ok(Commented::CommentedBefore((&anchor).into()))
} else {
Err(FormatterError::CommentOrphaned(
node.utf8_text(input.as_bytes())?.to_string(),
))
}
} else {
if let Some(anchor) = previous_non_comment_leaf(node.clone()) {
if let Some(anchor) = previous_non_comment_leaf(node.clone(), comment_ids) {
Ok(Commented::CommentedBefore((&anchor).into()))
} else if let Some(anchor) = next_non_comment_leaf(node.clone()) {
} else if let Some(anchor) = next_non_comment_leaf(node.clone(), comment_ids) {
let prev = previous_disjoint_node(node);
let next = next_disjoint_node(node);
Ok(Commented::CommentedAfter {
Expand Down Expand Up @@ -445,14 +453,15 @@ pub struct SeparatedInput {
pub fn extract_comments<'a>(
tree: &'a Tree,
input: &'a str,
comment_ids: HashSet<usize>,
grammar: &Language,
tolerate_parsing_errors: bool,
) -> FormatterResult<SeparatedInput> {
let mut anchors: Vec<(InputEdit, AnchoredComment)> = Vec::new();
let mut anchored_comments: Vec<AnchoredComment> = Vec::new();
let mut new_input: String = input.to_string();
let mut new_tree: Tree = tree.clone();
find_comments(tree.root_node(), input, &mut anchors)?;
find_comments(tree.root_node(), input, &comment_ids, &mut anchors)?;
anchors.sort_by_key(|(node, _)| node.start_byte());
let mut edits: Vec<InputEdit> = Vec::new();
// for each (comment, anchor) pair in reverse order, we:
Expand Down
3 changes: 3 additions & 0 deletions topiary-core/src/language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ pub struct Language {
/// The Query Topiary will use to get the formating captures, must be
/// present. The topiary engine does not include any formatting queries.
pub query: TopiaryQuery,
/// The Query Topiary will use to determine which nodes are comments.
/// When missing, ther ewill be no separate comment processing.
pub comment_query: Option<TopiaryQuery>,
/// The tree-sitter Language. Topiary will use this Language for parsing.
pub grammar: topiary_tree_sitter_facade::Language,
/// The indentation string used for that particular language. Defaults to " "
Expand Down
Loading

0 comments on commit 019848b

Please sign in to comment.