From 9491813ca58dca608ae7c4d2c2dc3b21e118343d Mon Sep 17 00:00:00 2001
From: Donald Gray
Date: Wed, 15 Nov 2023 14:50:39 +0000
Subject: [PATCH 1/2] Add SanitiseHtml helper method
---
readme.md | 15 ++
.../Presentation/HtmlSanitiserTests.cs | 200 ++++++++++++++++++
src/IIIF/IIIF/IIIF.csproj | 1 +
src/IIIF/IIIF/Presentation/HtmlSanitiser.cs | 76 +++++++
4 files changed, 292 insertions(+)
create mode 100644 src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs
create mode 100644 src/IIIF/IIIF/Presentation/HtmlSanitiser.cs
diff --git a/readme.md b/readme.md
index 252e959..ccc0a1d 100644
--- a/readme.md
+++ b/readme.md
@@ -85,6 +85,8 @@ The `.Serialisation` namespace contains a number of custom `JsonConverter` imple
### Helpers
+#### Serialisation
+
`IIIFSerialiserX` contains 2 extension methods for `JsonLdBase` that help with serialising / deserialising models.
For string serialisation these are `AsJson` and `FromJson`:
@@ -110,6 +112,19 @@ Manifest deserialisedManifest = streamContainingManifest.FromJsonStream Note: full object deserialisation is incomplete - open an issue or PR if you find an issue.
+#### HTML Markup Handling
+
+`HtmlSanitiser` contains a `SanitiseHtml()` extension method on `string` to help sanitise HTML.
+
+```cs
+string original = "my markup
invalid
";
+string safe = original.SanitiseHtml();
+```
+
+See [IIIF Presentation 3.0 docs](https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values) for details on html markup.
+
+> Note: The rules around markup differs between Presentation 2.1 and 3.0. This method uses 3.0 which permits a couple of tags not mentioned in 2.1 (`small`, `sub` and `sup`).
+
## Local Build
The `local_build.sh` bash script will build/test/pack for ease of testing.
diff --git a/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs
new file mode 100644
index 0000000..4a931a6
--- /dev/null
+++ b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs
@@ -0,0 +1,200 @@
+using IIIF.Presentation;
+
+namespace IIIF.Tests.Presentation;
+
+public class HtmlSanitiserTests
+{
+ [Theory]
+ [InlineData(null)]
+ [InlineData("")]
+ public void SanitiseHtml_ReturnsGivenString_IfNullOrEmpty(string val)
+ => val.SanitiseHtml().Should().Be(val);
+
+ [Fact]
+ public void SanitiseHtml_Trims_Whitespace_From_Beginning_And_End()
+ {
+ const string input = "
valid html
";
+ const string expected = "valid html
";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Fact]
+ public void SanitiseHtml_AutoCloses_ValidTags()
+ {
+ const string input = " valid html
";
+ const string expected = "valid html
";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Fact]
+ public void SanitiseHtml_RemovesInvalidTags_IncludingChildElements()
+ {
+ const string input =
+ "
hiTest
valid paragraph
";
+ const string expected = "
hivalid paragraph
";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Theory]
+ [InlineData("http://localhost")]
+ [InlineData("https://localhost")]
+ [InlineData("mailto://test@example")]
+ public void SanitiseHtml_AllowsSpecifiedSchemesForHref(string href)
+ {
+ var input = $"test";
+ var expected = $"test";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Fact]
+ public void SanitiseHtml_StripsInvalidSchemesFromHref()
+ {
+ var input = "test";
+ const string expected = "test";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Fact]
+ public void SanitiseHtml_RemovesComments()
+ {
+ const string input = "valid html
";
+ const string expected = "valid html
";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Fact]
+ public void SanitiseHtml_RemovesCdata()
+ {
+ const string input = "valid html
";
+ const string expected = "valid html
";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Fact]
+ public void SanitiseHtml_RemovesProcessingInstructions()
+ {
+ const string input = "valid html
";
+ const string expected = "valid html
";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Theory]
+ [InlineData("html", "
html
")]
+ [InlineData("html
html
")]
+ [InlineData("p>html", "p>html")]
+ [InlineData("html", "html")]
+ public void SanitiseHtml_HandlesInvalidHtml(string input, string expected)
+ {
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Theory]
+ [InlineData("html", "
html
")]
+ [InlineData("html
html")]
+ [InlineData("p>html", "p>html
")]
+ [InlineData("html", "html
")]
+ public void SanitiseHtml_HandlesInvalidHtml_WithCustomWrapperTag(string input, string expected)
+ {
+ var actual = input.SanitiseHtml("p");
+
+ actual.Should().Be(expected);
+ }
+
+ [Theory]
+ [InlineData("a")]
+ [InlineData("b")]
+ [InlineData("i")]
+ [InlineData("p")]
+ [InlineData("small")]
+ [InlineData("span")]
+ [InlineData("sub")]
+ [InlineData("sup")]
+ public void SanitiseHtml_RemovesSrcAndAltAttributes_FromNonImgTag(string tag)
+ {
+ var input = $"<{tag} alt=\"alt\" src=\"http://foo\">x{tag}>";
+ var expected = $"<{tag}>x{tag}>";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Theory]
+ [InlineData("b")]
+ [InlineData("i")]
+ [InlineData("p")]
+ [InlineData("small")]
+ [InlineData("span")]
+ [InlineData("sub")]
+ [InlineData("sup")]
+ public void SanitiseHtml_RemovesHrefAttribute_FromNonAnchorTag(string tag)
+ {
+ var input = $"<{tag} href=\"http://foo\">x{tag}>";
+ var expected = $"<{tag}>x{tag}>";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Fact]
+ public void SanitiseHtml_RemovesHrefAttribute_FromImageTag()
+ {
+ // NOTE: this is excluded from above as it has no closing tag so avoids logic in tests
+ const string input = "";
+ const string expected = "";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Fact]
+ public void SanitiseHtml_RemovesHref_Src_AndAltAttributes_FromLineBreak()
+ {
+ // NOTE: this is excluded from above as it has no closing tag so avoids logic in tests
+ const string input = "
";
+ const string expected = "
";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+
+ [Fact]
+ public void SanitiseHtml_AllowsSrcAndAltAttributes_OnImgTag()
+ {
+ const string input = "";
+ const string expected = "";
+
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(expected);
+ }
+}
\ No newline at end of file
diff --git a/src/IIIF/IIIF/IIIF.csproj b/src/IIIF/IIIF/IIIF.csproj
index 6e86e84..d8cb475 100644
--- a/src/IIIF/IIIF/IIIF.csproj
+++ b/src/IIIF/IIIF/IIIF.csproj
@@ -19,6 +19,7 @@
+
diff --git a/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs
new file mode 100644
index 0000000..7049266
--- /dev/null
+++ b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs
@@ -0,0 +1,76 @@
+using System.Collections.Generic;
+using Ganss.Xss;
+
+namespace IIIF.Presentation;
+
+///
+/// Class to help in sanitising HTML markup for use in IIIF property values
+///
+/// See https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values
+public static class HtmlSanitiser
+{
+ private static readonly HtmlSanitizerOptions HtmlSanitizerOptions = new()
+ {
+ AllowedTags = new HashSet { "a", "b", "br", "i", "img", "p", "small", "span", "sub", "sup" },
+ AllowedAttributes = new HashSet(0),
+ AllowedSchemes = new HashSet { "http", "https", "mailto" },
+ UriAttributes = new HashSet { "href" }
+ };
+
+ private static readonly HtmlSanitizer Sanitizer = new(HtmlSanitizerOptions);
+
+ private static readonly Dictionary> ValidAttributesPerTag
+ = new()
+ {
+ ["a"] = new HashSet { "href" },
+ ["img"] = new HashSet { "src", "alt" },
+ };
+
+ static HtmlSanitiser()
+ {
+ // NOTE - used HTML sanitiser lib doesn't allow tag-specific attributes so subscribe to RemovingAttribute
+ // events and cancel those that should be allowed
+ Sanitizer.RemovingAttribute += (sender, args) =>
+ {
+ // Attribute can also be removed if scheme isn't allowed
+ if (args.Reason != RemoveReason.NotAllowedAttribute) return;
+ args.Cancel = ValidAttributesPerTag.TryGetValue(args.Tag.TagName.ToLower(), out var allowedAttributes)
+ && allowedAttributes.Contains(args.Attribute.Name.ToLower());
+ };
+ }
+
+
+ ///
+ /// Sanitise markup to meet requirements in IIIF spec. This will
+ ///
+ /// * Remove all tags except: a, b, br, i, img, p, small, span, sub and sup
+ /// * Remove all attributes other than href on the a tag, src and alt on the img tag
+ /// * Remove all href attributes that start with the strings other than “http:”, “https:”, and “mailto:”
+ /// * CData sections
+ /// * XML comments
+ /// * Processing instructions
+ /// * Strip whitespace from either side of HTML string
+ ///
+ /// see https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values
+ ///
+ /// Value to be sanitised
+ ///
+ /// Tag to wrap value in if it is not currently an HTML string (starts with < and ends with >)
+ ///
+ /// Sanitised markup value
+ public static string SanitiseHtml(this string propertyValue, string nonHtmlWrappingTag = "span")
+ {
+ if (string.IsNullOrEmpty(propertyValue)) return propertyValue;
+
+ var workingString = Sanitizer.Sanitize(propertyValue.Trim());
+
+ if (IsHtmlString(workingString))
+ {
+ workingString = $"<{nonHtmlWrappingTag}>{workingString}{nonHtmlWrappingTag}>";
+ }
+
+ return workingString;
+ }
+
+ private static bool IsHtmlString(string workingString) => workingString[0] != '<' || workingString[^1] != '>';
+}
\ No newline at end of file
From 8f0035e826f1058f272879be6f3ef9f242e4c797 Mon Sep 17 00:00:00 2001
From: Donald Gray
Date: Wed, 15 Nov 2023 15:52:48 +0000
Subject: [PATCH 2/2] Add 'ignoreNonHtml' to SanitiseHtml
---
.../Presentation/HtmlSanitiserTests.cs | 48 +++++++++++++++----
src/IIIF/IIIF/Presentation/HtmlSanitiser.cs | 27 ++++++++---
2 files changed, 58 insertions(+), 17 deletions(-)
diff --git a/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs
index 4a931a6..37b3d23 100644
--- a/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs
+++ b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs
@@ -1,4 +1,5 @@
-using IIIF.Presentation;
+using System;
+using IIIF.Presentation;
namespace IIIF.Tests.Presentation;
@@ -11,23 +12,23 @@ public void SanitiseHtml_ReturnsGivenString_IfNullOrEmpty(string val)
=> val.SanitiseHtml().Should().Be(val);
[Fact]
- public void SanitiseHtml_Trims_Whitespace_From_Beginning_And_End()
+ public void SanitiseHtml_Trims_Whitespace_FromBeginningAndEnd_IfIgnoreNonHtmlFalse()
{
const string input = " valid html
";
const string expected = "valid html
";
- var actual = input.SanitiseHtml();
+ var actual = input.SanitiseHtml(false);
actual.Should().Be(expected);
}
[Fact]
- public void SanitiseHtml_AutoCloses_ValidTags()
+ public void SanitiseHtml_AutoCloses_ValidTags_IfIgnoreNonHtmlFalse()
{
const string input = " valid html
";
const string expected = "valid html
";
- var actual = input.SanitiseHtml();
+ var actual = input.SanitiseHtml(false);
actual.Should().Be(expected);
}
@@ -103,28 +104,55 @@ public void SanitiseHtml_RemovesProcessingInstructions()
}
[Theory]
+ [InlineData("html")]
+ [InlineData("
html
html")]
+ [InlineData("html")]
+ [InlineData(" valid html
")]
+ public void SanitiseHtml_ReturnsInvalidHtml_IfIgnoreNonHtmlTrue(string input)
+ {
+ var actual = input.SanitiseHtml();
+
+ actual.Should().Be(input);
+ }
+
+ [Theory]
+ [InlineData("html", "html")]
+ [InlineData(" html ", "html")]
[InlineData("html", "
html
")]
[InlineData("html
html")]
[InlineData("p>html", "p>html")]
[InlineData("html", "html")]
- public void SanitiseHtml_HandlesInvalidHtml(string input, string expected)
+ public void SanitiseHtml_HandlesInvalidHtml_IfIgnoreNonHtmlFalse(string input, string expected)
{
- var actual = input.SanitiseHtml();
+ var actual = input.SanitiseHtml(false);
actual.Should().Be(expected);
}
[Theory]
+ [InlineData("html", "html
")]
+ [InlineData(" html ", "html
")]
[InlineData("html", "
html
")]
[InlineData("html
html")]
- [InlineData("p>html", "p>html
")]
- [InlineData("html", "html
")]
+ [InlineData("p>html", "p>html
")]
+ [InlineData("html", "html
")]
public void SanitiseHtml_HandlesInvalidHtml_WithCustomWrapperTag(string input, string expected)
{
- var actual = input.SanitiseHtml("p");
+ var actual = input.SanitiseHtml(false, "p");
actual.Should().Be(expected);
}
+
+ [Fact]
+ public void SanitiseHtml_Throws_IfCustomWrapperTagNotAllowed()
+ {
+ Action action = () => "html".SanitiseHtml(false, "div");
+
+ action.Should()
+ .ThrowExactly()
+ .WithMessage("Tag provided is not allowed. Must be one of: a,b,br,i,img,p,small,span,sub,sup (Parameter 'nonHtmlWrappingTag')");
+ }
[Theory]
[InlineData("a")]
diff --git a/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs
index 7049266..a4ab7b6 100644
--- a/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs
+++ b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs
@@ -1,4 +1,5 @@
-using System.Collections.Generic;
+using System;
+using System.Collections.Generic;
using Ganss.Xss;
namespace IIIF.Presentation;
@@ -54,23 +55,35 @@ static HtmlSanitiser()
/// see https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values
///
/// Value to be sanitised
+ ///
+ /// If true, any strings that don't start/end with </> are returned as-is. If false non-html strings will be
+ /// wrapped
+ ///
///
- /// Tag to wrap value in if it is not currently an HTML string (starts with < and ends with >)
+ /// Tag to wrap value in if it is not currently an HTML string (starts with < and ends with >). Only used if
+ /// is true
///
/// Sanitised markup value
- public static string SanitiseHtml(this string propertyValue, string nonHtmlWrappingTag = "span")
+ public static string SanitiseHtml(this string propertyValue, bool ignoreNonHtml = true,
+ string nonHtmlWrappingTag = "span")
{
if (string.IsNullOrEmpty(propertyValue)) return propertyValue;
+ if (ignoreNonHtml && !IsHtmlString(propertyValue)) return propertyValue;
var workingString = Sanitizer.Sanitize(propertyValue.Trim());
+
+ if (IsHtmlString(workingString)) return workingString;
- if (IsHtmlString(workingString))
+ if (!HtmlSanitizerOptions.AllowedTags.Contains(nonHtmlWrappingTag))
{
- workingString = $"<{nonHtmlWrappingTag}>{workingString}{nonHtmlWrappingTag}>";
+ throw new ArgumentException(
+ $"Tag provided is not allowed. Must be one of: {string.Join(",", HtmlSanitizerOptions.AllowedTags)}",
+ nameof(nonHtmlWrappingTag));
}
+ workingString = $"<{nonHtmlWrappingTag}>{workingString}{nonHtmlWrappingTag}>";
- return workingString;
+ return Sanitizer.Sanitize(workingString);
}
- private static bool IsHtmlString(string workingString) => workingString[0] != '<' || workingString[^1] != '>';
+ private static bool IsHtmlString(string candidate) => candidate[0] == '<' && candidate[^1] == '>';
}
\ No newline at end of file