From 5984134e6c76d49c62182b9006187da171cc1364 Mon Sep 17 00:00:00 2001 From: Lucas Satabin Date: Mon, 22 Jan 2024 22:10:09 +0100 Subject: [PATCH 1/3] Add support for pretty printing XML --- xml/src/main/scala/fs2/data/xml/Attr.scala | 13 ++ .../fs2/data/xml/internals/Renderer.scala | 141 +++++++++++++++++ xml/src/main/scala/fs2/data/xml/package.scala | 41 ++++- .../scala/fs2/data/xml/XmlRenderTest.scala | 142 +++++++++++++++++- 4 files changed, 330 insertions(+), 7 deletions(-) create mode 100644 xml/src/main/scala/fs2/data/xml/internals/Renderer.scala diff --git a/xml/src/main/scala/fs2/data/xml/Attr.scala b/xml/src/main/scala/fs2/data/xml/Attr.scala index 1c151aa7..9e606fd9 100644 --- a/xml/src/main/scala/fs2/data/xml/Attr.scala +++ b/xml/src/main/scala/fs2/data/xml/Attr.scala @@ -18,4 +18,17 @@ package fs2 package data package xml +import cats.Show +import cats.syntax.all._ + +import scala.runtime.AbstractFunction2 + case class Attr(name: QName, value: List[XmlEvent.XmlTexty]) + +object Attr extends AbstractFunction2[QName, List[XmlEvent.XmlTexty], Attr] { + + implicit val show: Show[Attr] = Show.show { case Attr(name, value) => + show"""$name="${value.foldMap[String](_.render)}"""" + } + +} diff --git a/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala b/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala new file mode 100644 index 00000000..b6d03a1b --- /dev/null +++ b/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala @@ -0,0 +1,141 @@ +/* + * Copyright 2024 fs2-data Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package fs2 +package data +package xml +package internals + +import cats.syntax.all._ + +private[xml] class Renderer(collapseEmpty: Boolean, resetOnChunk: Boolean, indent: String, attributeThreshold: Int) + extends Collector.Builder[XmlEvent, String] { + + private val builder = new StringBuilder + + private var level = 0 + + private var newline = false + + private var skipClose = false + + private def indentation(): Unit = + if (newline) { + builder.append('\n') + builder.append(indent * level) + } + + override def +=(chunk: Chunk[XmlEvent]): Unit = { + if (resetOnChunk) + builder.setLength(0) + chunk.foreach { + case e @ (XmlEvent.XmlDecl(_, _, _) | XmlEvent.XmlPI(_, _)) => + indentation() + builder ++= e.show + newline = true + + case XmlEvent.Comment(content) => + newline = true + indentation() + builder ++= "" + + case XmlEvent.StartTag(name, attributes, isEmpty) => + indentation() + val renderedName = name.show + builder ++= show"<$renderedName" + + attributes match { + case a :: as => + val exceedThreshold = as.size > attributeThreshold - 1 + builder ++= show" $a" + as.foreach { a => + if (exceedThreshold) { + builder += '\n' + builder ++= " " * (renderedName.length() + 2) + } else { + builder += ' ' + } + builder ++= a.show + } + case Nil => // do nothing + } + + if (isEmpty && collapseEmpty) { + builder ++= " />" + skipClose = true + } else { + builder += '>' + level += 1 + } + newline = true + + case XmlEvent.EndTag(name) => + level -= 1 + newline = true + if (!skipClose) { + indentation() + builder ++= show"" + } + skipClose = false + + case XmlEvent.XmlString(content, true) => + indentation() + builder ++= show"" + newline = true + + case XmlEvent.XmlString(content, false) => + content.linesIterator.foreach { line => + indentation() + if (newline) + builder ++= line.stripLeading() + else + builder ++= line + newline = true + } + newline = 
content.matches("^.*\n\\s*$") + + case XmlEvent.StartDocument | XmlEvent.EndDocument => + // do nothing + case e => + indentation() + builder ++= e.show + newline = false + } + } + + override def result: String = builder.result() + +} + +private[xml] object Renderer { + + def pipe[F[_]](collapseEmpty: Boolean, indent: String, attributeThreshold: Int): Pipe[F, XmlEvent, String] = + in => + Stream.suspend(Stream.emit(new Renderer(collapseEmpty, true, indent, attributeThreshold))).flatMap { builder => + in.mapChunks { chunk => + builder += chunk + Chunk.singleton(builder.result) + } + + } + +} diff --git a/xml/src/main/scala/fs2/data/xml/package.scala b/xml/src/main/scala/fs2/data/xml/package.scala index 21920d49..75222df6 100644 --- a/xml/src/main/scala/fs2/data/xml/package.scala +++ b/xml/src/main/scala/fs2/data/xml/package.scala @@ -80,12 +80,43 @@ package object xml { * without additional (or original) whitespace and with empty tags being collapsed to the short self-closed form * if collapseEmpty is true. Preserves chunking, each String in the output will correspond to one event in the input. */ + @deprecated(message = "Use `fs2.data.xml.render.raw() instead.`", since = "fs2-data 1.11.0") def render[F[_]](collapseEmpty: Boolean = true): Pipe[F, XmlEvent, String] = - _.zipWithPrevious.map { - case (_, st: XmlEvent.StartTag) => st.render(collapseEmpty) - case (Some(XmlEvent.StartTag(_, _, true)), XmlEvent.EndTag(_)) if collapseEmpty => "" - case (_, event) => event.show - } + render.raw(collapseEmpty) + + object render { + + /** + * Render the incoming xml events to their string representation. The output will be concise, + * without additional (or original) whitespace and with empty tags being collapsed to the short self-closed form + * if collapseEmpty is true. Preserves chunking, each String in the output will correspond to one event in the input. 
+ */ + def raw[F[_]](collapseEmpty: Boolean = true): Pipe[F, XmlEvent, String] = + _.zipWithPrevious.map { + case (_, st: XmlEvent.StartTag) => st.render(collapseEmpty) + case (Some(XmlEvent.StartTag(_, _, true)), XmlEvent.EndTag(_)) if collapseEmpty => "" + case (_, event) => event.show + } + + /** + * Render the incoming xml events intot a prettified string representation. + * _Prettified_ means that nested tags will be indented as per `indent` parameter + * and text data (except for `CDATA`, which remains untouched) is indented to the current + * indentation level after each new line. + * + * This pipe can be used when whitespace characters are not relevant to the application + * and to make it more readable to human beings. + * + * @param collapseEmpty Whether empty tags are collapsed in a single self closing tag + * @param indent THe indentation string + * @param attributeThreshold Number of attributes above which each attribute is rendered on a new line + */ + def pretty[F[_]](collapseEmpty: Boolean = true, + indent: String = " ", + attributeThreshold: Int = 3): Pipe[F, XmlEvent, String] = + Renderer.pipe(collapseEmpty, indent, attributeThreshold) + + } val ncNameStart = CharRanges.fromRanges( ('A', 'Z'), diff --git a/xml/src/test/scala/fs2/data/xml/XmlRenderTest.scala b/xml/src/test/scala/fs2/data/xml/XmlRenderTest.scala index b6ac15c9..99f7e052 100644 --- a/xml/src/test/scala/fs2/data/xml/XmlRenderTest.scala +++ b/xml/src/test/scala/fs2/data/xml/XmlRenderTest.scala @@ -24,18 +24,156 @@ object XmlRenderTest extends SimpleIOSuite { test("renders xml with self-closing tags") { val result = - xml"""""".through(render()).compile.string + xml"""""".through(render.raw()).compile.string result.liftTo[IO].map { result => expect.eql("""""", result) } } + test("renders xml with self-closing tags prettily") { + val result = + xml"""""".through(render.pretty()).compile.string + result.liftTo[IO].map { result => + expect.eql( + """ + | + | + |""".stripMargin, + result + 
) + } + } + test("renders xml without self-closing tags if disabled") { val result = - xml"""""".through(render(false)).compile.string + xml"""""".through(render.raw(false)).compile.string result.liftTo[IO].map { result => expect.eql("""""", result) } } + test("renders xml without self-closing tags prettily") { + val result = + xml"""""" + .through(render.pretty(false)) + .compile + .string + result.liftTo[IO].map { result => + expect.eql( + """ + | + | + | + |""".stripMargin, + result + ) + } + } + + test("renders xml with attributes prettily if below threshold") { + val result = + xml"""""" + .through(render.pretty()) + .compile + .string + result.liftTo[IO].map { result => + expect.eql( + """ + | + | + |""".stripMargin, + result + ) + } + } + + test("renders xml with attributes prettily if above threshold") { + val result = + xml"""""" + .through(render.pretty()) + .compile + .string + result.liftTo[IO].map { result => + expect.eql( + """ + | + | + |""".stripMargin, + result + ) + } + } + + test("renders text prettily") { + val result = + xml"""This is a test. +The text is not originally formatted.""".through(render.pretty()).compile.string + result.liftTo[IO].map { result => + expect.eql( + """ + | + | This is a test. + | The text is not originally formatted. + |""".stripMargin, + result + ) + } + } + + test("renders text with entities prettily") { + val result = + xml"""This is a test. +The text is not originally formatted but contains & and +´ as entities.""".through(render.pretty()).compile.string + result.liftTo[IO].map { result => + expect.eql( + """ + | + | This is a test. + | The text is not originally formatted but contains & and + | ´ as entities. 
+ |""".stripMargin, + result + ) + } + } + + test("renders CDATA as-is") { + val result = + xml"""""".through(render.pretty()).compile.string + result.liftTo[IO].map { result => + expect.eql( + """ + | + | + |""".stripMargin, + result + ) + } + } + + test("renders comments prettily") { + val result = + rawxml"""""" + .through(render.pretty()) + .compile + .string + result.liftTo[IO].map { result => + expect.eql( + """ + | + | + |""".stripMargin, + result + ) + } + } + } From d8a1610fb53aaeea1531b161ff86a1fa8c8268f4 Mon Sep 17 00:00:00 2001 From: Lucas Satabin Date: Tue, 23 Jan 2024 20:04:59 +0100 Subject: [PATCH 2/3] Add XML rendering collectors --- .../fs2/data/xml/internals/Renderer.scala | 31 +++++++++++----- xml/src/main/scala/fs2/data/xml/package.scala | 37 +++++++++++++++---- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala b/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala index b6d03a1b..8ddc41ef 100644 --- a/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala +++ b/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala @@ -21,7 +21,11 @@ package internals import cats.syntax.all._ -private[xml] class Renderer(collapseEmpty: Boolean, resetOnChunk: Boolean, indent: String, attributeThreshold: Int) +private[xml] class Renderer(pretty: Boolean, + collapseEmpty: Boolean, + resetOnChunk: Boolean, + indent: String, + attributeThreshold: Int) extends Collector.Builder[XmlEvent, String] { private val builder = new StringBuilder @@ -33,7 +37,7 @@ private[xml] class Renderer(collapseEmpty: Boolean, resetOnChunk: Boolean, inden private var skipClose = false private def indentation(): Unit = - if (newline) { + if (pretty && newline) { builder.append('\n') builder.append(indent * level) } @@ -80,7 +84,10 @@ private[xml] class Renderer(collapseEmpty: Boolean, resetOnChunk: Boolean, inden } if (isEmpty && collapseEmpty) { - builder ++= " />" + if (pretty) + builder ++= " />" + else + 
builder ++= "/>" skipClose = true } else { builder += '>' @@ -102,7 +109,7 @@ private[xml] class Renderer(collapseEmpty: Boolean, resetOnChunk: Boolean, inden builder ++= show"" newline = true - case XmlEvent.XmlString(content, false) => + case XmlEvent.XmlString(content, false) if pretty => content.linesIterator.foreach { line => indentation() if (newline) @@ -128,13 +135,17 @@ private[xml] class Renderer(collapseEmpty: Boolean, resetOnChunk: Boolean, inden private[xml] object Renderer { - def pipe[F[_]](collapseEmpty: Boolean, indent: String, attributeThreshold: Int): Pipe[F, XmlEvent, String] = + def pipe[F[_]](pretty: Boolean, + collapseEmpty: Boolean, + indent: String, + attributeThreshold: Int): Pipe[F, XmlEvent, String] = in => - Stream.suspend(Stream.emit(new Renderer(collapseEmpty, true, indent, attributeThreshold))).flatMap { builder => - in.mapChunks { chunk => - builder += chunk - Chunk.singleton(builder.result) - } + Stream.suspend(Stream.emit(new Renderer(pretty, collapseEmpty, true, indent, attributeThreshold))).flatMap { + builder => + in.mapChunks { chunk => + builder += chunk + Chunk.singleton(builder.result) + } } diff --git a/xml/src/main/scala/fs2/data/xml/package.scala b/xml/src/main/scala/fs2/data/xml/package.scala index 75222df6..e0563c3b 100644 --- a/xml/src/main/scala/fs2/data/xml/package.scala +++ b/xml/src/main/scala/fs2/data/xml/package.scala @@ -78,25 +78,22 @@ package object xml { /** * Render the incoming xml events to their string representation. The output will be concise, * without additional (or original) whitespace and with empty tags being collapsed to the short self-closed form - * if collapseEmpty is true. Preserves chunking, each String in the output will correspond to one event in the input. + * if collapseEmpty is true. 
*/ @deprecated(message = "Use `fs2.data.xml.render.raw() instead.`", since = "fs2-data 1.11.0") def render[F[_]](collapseEmpty: Boolean = true): Pipe[F, XmlEvent, String] = render.raw(collapseEmpty) + /** XML Event stream pipes to render XML values. */ object render { /** * Render the incoming xml events to their string representation. The output will be concise, * without additional (or original) whitespace and with empty tags being collapsed to the short self-closed form - * if collapseEmpty is true. Preserves chunking, each String in the output will correspond to one event in the input. + * if collapseEmpty is true. */ def raw[F[_]](collapseEmpty: Boolean = true): Pipe[F, XmlEvent, String] = - _.zipWithPrevious.map { - case (_, st: XmlEvent.StartTag) => st.render(collapseEmpty) - case (Some(XmlEvent.StartTag(_, _, true)), XmlEvent.EndTag(_)) if collapseEmpty => "" - case (_, event) => event.show - } + Renderer.pipe(false, collapseEmpty, "", 0) /** * Render the incoming xml events intot a prettified string representation. @@ -114,7 +111,7 @@ package object xml { def pretty[F[_]](collapseEmpty: Boolean = true, indent: String = " ", attributeThreshold: Int = 3): Pipe[F, XmlEvent, String] = - Renderer.pipe(collapseEmpty, indent, attributeThreshold) + Renderer.pipe(true, collapseEmpty, indent, attributeThreshold) } @@ -152,6 +149,7 @@ package object xml { object collector { /** Renders all events using the `Show` instance and build the result string. */ + @deprecated(message = "Use `fs2.data.xml.collector.raw(false)` instead", since = "fs2-data 1.11.0") object show extends Collector[XmlEvent] { type Out = String def newBuilder: Collector.Builder[XmlEvent, Out] = @@ -168,6 +166,29 @@ package object xml { } } + /** Renders all events without extra formatting. 
+ */ + def raw(collapseEmpty: Boolean = true): Collector[XmlEvent] = + new Collector[XmlEvent] { + type Out = String + def newBuilder: Collector.Builder[XmlEvent, Out] = + new Renderer(false, collapseEmpty, false, "", 0) + } + + /** Renders all events, trying to make the output more readable. + * This collector should only be used if whitespace is not relevant to the application; + * it results in more human-readable XML. + * + * @param collapseEmpty Whether empty tags are collapsed into a single self-closing tag + * @param indent THe indentation string + * @param attributeThreshold Number of attributes above which each attribute is rendered on a new line + */ + def pretty(collapseEmpty: Boolean = true, indent: String = " ", attributeThreshold: Int = 3): Collector[XmlEvent] = + new Collector[XmlEvent] { + type Out = String + def newBuilder: Collector.Builder[XmlEvent, Out] = + new Renderer(true, collapseEmpty, false, indent, attributeThreshold) + } + } implicit class XmlInterpolators(val sc: StringContext) extends AnyVal { From b8354b9b904cb012c2f4da3b647cd5a35685f922 Mon Sep 17 00:00:00 2001 From: Lucas Satabin Date: Tue, 23 Jan 2024 20:42:52 +0100 Subject: [PATCH 3/3] Add XML renderer documentation --- site/documentation/xml/index.md | 35 ++++++++++++++++--- site/documentation/xml/xpath.md | 7 ++-- .../fs2/data/xml/internals/Renderer.scala | 18 ++++++---- xml/src/main/scala/fs2/data/xml/package.scala | 6 ++-- 4 files changed, 49 insertions(+), 17 deletions(-) diff --git a/site/documentation/xml/index.md b/site/documentation/xml/index.md index 4b1b48e4..d646234f 100644 --- a/site/documentation/xml/index.md +++ b/site/documentation/xml/index.md @@ -8,7 +8,7 @@ The `fs2-data-xml` module provides tools to parse XML data in a streaming manner To create a stream of XML events from an input stream, use the `events` pipe in `fs2.data.xml` package. 
-```scala mdoc +```scala mdoc:height=500 import cats.effect._ import cats.effect.unsafe.implicits.global @@ -33,14 +33,14 @@ The pipe validates the XML structure while parsing. It reads all the XML element Namespace can be resolved by using the `namespaceResolver` pipe. -```scala mdoc +```scala mdoc:height=500 val nsResolved = stream.through(namespaceResolver[IO]) nsResolved.compile.toList.unsafeRunSync() ``` Using the `referenceResolver` pipe, entity and character references can be resolved. By defaut the standard `xmlEntities` mapping is used, but it can be replaced by any mapping you see fit. -```scala mdoc +```scala mdoc:height=500 val entityResolved = stream.through(referenceResolver[IO]()) entityResolved.compile.toList.unsafeRunSync() ``` @@ -49,7 +49,7 @@ entityResolved.compile.toList.unsafeRunSync() Once entites and namespaces are resolved, the events might be numerous and can be normalized to avoid emitting too many of them. For instance, after reference resolution, consecutive text events can be merged. This is achieved by using the `normalize` pipe. -```scala mdoc +```scala mdoc:height=500 val normalized = entityResolved.through(normalize) normalized.compile.toList.unsafeRunSync() ``` @@ -82,3 +82,30 @@ implicit val eventifier: DocumentEventifier[SomeDocType] = ??? stream.through(documents[IO, SomeDocType]) .through(eventify[IO, SomeDocType]) ``` + +## XML Renderers + +Once you have an XML event stream and have selected and transformed what you need from it, you can write the resulting event stream to some storage. This can be achieved using renderers. + +For instance, let's say you want to write the resulting XML stream to a file in raw form (i.e. 
without trying to format the nested tags and text), you can do: + +```scala mdoc:compile-only +import fs2.io.file.{Files, Flags, Path} + +stream + .through(render.raw()) + .through(text.utf8.encode) + .through(Files[IO].writeAll(Path("/some/path/to/file.xml"), Flags.Write)) + .compile + .drain +``` + +There is also a `pretty()` renderer, which indents inner tags and text by the given indent string. + +If you are interested in the String rendering as a value, the library also provides `Collector`s: + +```scala mdoc +stream.compile.to(collector.raw()).unsafeRunSync() + +stream.compile.to(collector.pretty()).unsafeRunSync() +``` diff --git a/site/documentation/xml/xpath.md b/site/documentation/xml/xpath.md index 77c7b0f4..1240b84c 100644 --- a/site/documentation/xml/xpath.md +++ b/site/documentation/xml/xpath.md @@ -84,14 +84,13 @@ The `filter.raw` emits a stream of all matches. Each match is represented as a nested stream of XML events which must be consumed. ```scala mdoc -import cats.Show import cats.effect._ import cats.effect.unsafe.implicits.global stream .lift[IO] .through(filter.raw(path)) - .parEvalMapUnbounded(_.map(Show[XmlEvent].show(_)).compile.foldMonoid) + .parEvalMapUnbounded(_.through(render.raw()).compile.foldMonoid) .compile .toList .unsafeRunSync() @@ -105,7 +104,7 @@ The library offers `filter.collect` to collect each match for any collector. 
```scala mdoc stream .lift[IO] - .through(filter.collect(path, collector.show)) + .through(filter.collect(path, collector.raw())) .compile .toList .unsafeRunSync() @@ -116,7 +115,7 @@ If you want to have results emitted as early as possible instead of in order, yo ```scala mdoc stream .lift[IO] - .through(filter.collect(path, collector.show, deterministic = false)) + .through(filter.collect(path, collector.raw(), deterministic = false)) .compile .toList .unsafeRunSync() diff --git a/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala b/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala index 8ddc41ef..5d027ddb 100644 --- a/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala +++ b/xml/src/main/scala/fs2/data/xml/internals/Renderer.scala @@ -96,9 +96,9 @@ private[xml] class Renderer(pretty: Boolean, newline = true case XmlEvent.EndTag(name) => - level -= 1 newline = true if (!skipClose) { + level -= 1 indentation() builder ++= show"" } @@ -111,12 +111,16 @@ private[xml] class Renderer(pretty: Boolean, case XmlEvent.XmlString(content, false) if pretty => content.linesIterator.foreach { line => - indentation() - if (newline) - builder ++= line.stripLeading() - else - builder ++= line - newline = true + if (line.matches("\\s*")) { + // empty line, ignore it + } else { + indentation() + if (newline) + builder ++= line.stripLeading() + else + builder ++= line + newline = true + } } newline = content.matches("^.*\n\\s*$") diff --git a/xml/src/main/scala/fs2/data/xml/package.scala b/xml/src/main/scala/fs2/data/xml/package.scala index e0563c3b..5035c3ab 100644 --- a/xml/src/main/scala/fs2/data/xml/package.scala +++ b/xml/src/main/scala/fs2/data/xml/package.scala @@ -167,7 +167,7 @@ package object xml { } /** Renders all events without extra formatting. 
*/ - def raw(collapseEmpty: Boolean = true): Collector[XmlEvent] = + def raw(collapseEmpty: Boolean = true): Collector.Aux[XmlEvent, String] = new Collector[XmlEvent] { type Out = String def newBuilder: Collector.Builder[XmlEvent, Out] = @@ -182,7 +182,9 @@ package object xml { * @param indent THe indentation string * @param attributeThreshold Number of attributes above which each attribute is rendered on a new line */ - def pretty(collapseEmpty: Boolean = true, indent: String = " ", attributeThreshold: Int = 3): Collector[XmlEvent] = + def pretty(collapseEmpty: Boolean = true, + indent: String = " ", + attributeThreshold: Int = 3): Collector.Aux[XmlEvent, String] = new Collector[XmlEvent] { type Out = String def newBuilder: Collector.Builder[XmlEvent, Out] =