From 9491813ca58dca608ae7c4d2c2dc3b21e118343d Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Wed, 15 Nov 2023 14:50:39 +0000 Subject: [PATCH 1/2] Add SanitiseHtml helper method --- readme.md | 15 ++ .../Presentation/HtmlSanitiserTests.cs | 200 ++++++++++++++++++ src/IIIF/IIIF/IIIF.csproj | 1 + src/IIIF/IIIF/Presentation/HtmlSanitiser.cs | 76 +++++++ 4 files changed, 292 insertions(+) create mode 100644 src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs create mode 100644 src/IIIF/IIIF/Presentation/HtmlSanitiser.cs diff --git a/readme.md b/readme.md index 252e959..ccc0a1d 100644 --- a/readme.md +++ b/readme.md @@ -85,6 +85,8 @@ The `.Serialisation` namespace contains a number of custom `JsonConverter` imple ### Helpers +#### Serialisation + `IIIFSerialiserX` contains 2 extension methods for `JsonLdBase` that help with serialising / deserialising models. For string serialisation these are `AsJson` and `FromJson`: @@ -110,6 +112,19 @@ Manifest deserialisedManifest = streamContainingManifest.FromJsonStream Note: full object deserialisation is incomplete - open an issue or PR if you find an issue. +#### HTML Markup Handling + +`HtmlSanitiser` contains a `SanitiseHtml()` extension method on `string` to help sanitise HTML. + +```cs +string original = "

my markup

invalid

"; +string safe = original.SanitiseHtml(); +``` + +See [IIIF Presentation 3.0 docs](https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values) for details on html markup. + +> Note: The rules around markup differs between Presentation 2.1 and 3.0. This method uses 3.0 which permits a couple of tags not mentioned in 2.1 (`small`, `sub` and `sup`). + ## Local Build The `local_build.sh` bash script will build/test/pack for ease of testing. diff --git a/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs new file mode 100644 index 0000000..4a931a6 --- /dev/null +++ b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs @@ -0,0 +1,200 @@ +using IIIF.Presentation; + +namespace IIIF.Tests.Presentation; + +public class HtmlSanitiserTests +{ + [Theory] + [InlineData(null)] + [InlineData("")] + public void SanitiseHtml_ReturnsGivenString_IfNullOrEmpty(string val) + => val.SanitiseHtml().Should().Be(val); + + [Fact] + public void SanitiseHtml_Trims_Whitespace_From_Beginning_And_End() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_AutoCloses_ValidTags() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesInvalidTags_IncludingChildElements() + { + const string input = + "
hi

child paragraph

Test

    1. foo

valid paragraph

"; + const string expected = "
hi

valid paragraph

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Theory] + [InlineData("http://localhost")] + [InlineData("https://localhost")] + [InlineData("mailto://test@example")] + public void SanitiseHtml_AllowsSpecifiedSchemesForHref(string href) + { + var input = $"test"; + var expected = $"test"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_StripsInvalidSchemesFromHref() + { + var input = "test"; + const string expected = "test"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesComments() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesCdata() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesProcessingInstructions() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Theory] + [InlineData("

html", "

html

")] + [InlineData("

htmlhtml

")] + [InlineData("p>html

", "p>html

")] + [InlineData("html

", "html

")] + public void SanitiseHtml_HandlesInvalidHtml(string input, string expected) + { + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Theory] + [InlineData("

html", "

html

")] + [InlineData("

htmlhtml

")] + [InlineData("p>html

", "

p>html

")] + [InlineData("html

", "

html

")] + public void SanitiseHtml_HandlesInvalidHtml_WithCustomWrapperTag(string input, string expected) + { + var actual = input.SanitiseHtml("p"); + + actual.Should().Be(expected); + } + + [Theory] + [InlineData("a")] + [InlineData("b")] + [InlineData("i")] + [InlineData("p")] + [InlineData("small")] + [InlineData("span")] + [InlineData("sub")] + [InlineData("sup")] + public void SanitiseHtml_RemovesSrcAndAltAttributes_FromNonImgTag(string tag) + { + var input = $"<{tag} alt=\"alt\" src=\"http://foo\">x"; + var expected = $"<{tag}>x"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Theory] + [InlineData("b")] + [InlineData("i")] + [InlineData("p")] + [InlineData("small")] + [InlineData("span")] + [InlineData("sub")] + [InlineData("sup")] + public void SanitiseHtml_RemovesHrefAttribute_FromNonAnchorTag(string tag) + { + var input = $"<{tag} href=\"http://foo\">x"; + var expected = $"<{tag}>x"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesHrefAttribute_FromImageTag() + { + // NOTE: this is excluded from above as it has no closing tag so avoids logic in tests + const string input = ""; + const string expected = ""; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesHref_Src_AndAltAttributes_FromLineBreak() + { + // NOTE: this is excluded from above as it has no closing tag so avoids logic in tests + const string input = "
"; + const string expected = "
"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_AllowsSrcAndAltAttributes_OnImgTag() + { + const string input = "\"alt\""; + const string expected = "\"alt\""; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } +} \ No newline at end of file diff --git a/src/IIIF/IIIF/IIIF.csproj b/src/IIIF/IIIF/IIIF.csproj index 6e86e84..d8cb475 100644 --- a/src/IIIF/IIIF/IIIF.csproj +++ b/src/IIIF/IIIF/IIIF.csproj @@ -19,6 +19,7 @@ + diff --git a/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs new file mode 100644 index 0000000..7049266 --- /dev/null +++ b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs @@ -0,0 +1,76 @@ +using System.Collections.Generic; +using Ganss.Xss; + +namespace IIIF.Presentation; + +/// +/// Class to help in sanitising HTML markup for use in IIIF property values +/// +/// See https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values +public static class HtmlSanitiser +{ + private static readonly HtmlSanitizerOptions HtmlSanitizerOptions = new() + { + AllowedTags = new HashSet { "a", "b", "br", "i", "img", "p", "small", "span", "sub", "sup" }, + AllowedAttributes = new HashSet(0), + AllowedSchemes = new HashSet { "http", "https", "mailto" }, + UriAttributes = new HashSet { "href" } + }; + + private static readonly HtmlSanitizer Sanitizer = new(HtmlSanitizerOptions); + + private static readonly Dictionary> ValidAttributesPerTag + = new() + { + ["a"] = new HashSet { "href" }, + ["img"] = new HashSet { "src", "alt" }, + }; + + static HtmlSanitiser() + { + // NOTE - used HTML sanitiser lib doesn't allow tag-specific attributes so subscribe to RemovingAttribute + // events and cancel those that should be allowed + Sanitizer.RemovingAttribute += (sender, args) => + { + // Attribute can also be removed if scheme isn't allowed + if (args.Reason != RemoveReason.NotAllowedAttribute) return; + args.Cancel = ValidAttributesPerTag.TryGetValue(args.Tag.TagName.ToLower(), out var allowedAttributes) + && allowedAttributes.Contains(args.Attribute.Name.ToLower()); + }; + } + + + /// + /// Sanitise markup to meet requirements in IIIF spec. This will + /// + /// * Remove all tags except: a, b, br, i, img, p, small, span, sub and sup + /// * Remove all attributes other than href on the a tag, src and alt on the img tag + /// * Remove all href attributes that start with the strings other than “http:”, “https:”, and “mailto:” + /// * CData sections + /// * XML comments + /// * Processing instructions + /// * Strip whitespace from either side of HTML string + /// + /// see https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values + /// + /// Value to be sanitised + /// + /// Tag to wrap value in if it is not currently an HTML string (starts with < and ends with >) + /// + /// Sanitised markup value + public static string SanitiseHtml(this string propertyValue, string nonHtmlWrappingTag = "span") + { + if (string.IsNullOrEmpty(propertyValue)) return propertyValue; + + var workingString = Sanitizer.Sanitize(propertyValue.Trim()); + + if (IsHtmlString(workingString)) + { + workingString = $"<{nonHtmlWrappingTag}>{workingString}"; + } + + return workingString; + } + + private static bool IsHtmlString(string workingString) => workingString[0] != '<' || workingString[^1] != '>'; +} \ No newline at end of file From 8f0035e826f1058f272879be6f3ef9f242e4c797 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Wed, 15 Nov 2023 15:52:48 +0000 Subject: [PATCH 2/2] Add 'ignoreNonHtml' to SanitiseHtml --- .../Presentation/HtmlSanitiserTests.cs | 48 +++++++++++++++---- src/IIIF/IIIF/Presentation/HtmlSanitiser.cs | 27 ++++++++--- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs index 4a931a6..37b3d23 100644 --- a/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs +++ b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs @@ -1,4 +1,5 @@ -using IIIF.Presentation; +using System; +using IIIF.Presentation; namespace IIIF.Tests.Presentation; @@ -11,23 +12,23 @@ public void SanitiseHtml_ReturnsGivenString_IfNullOrEmpty(string val) => val.SanitiseHtml().Should().Be(val); [Fact] - public void SanitiseHtml_Trims_Whitespace_From_Beginning_And_End() + public void SanitiseHtml_Trims_Whitespace_FromBeginningAndEnd_IfIgnoreNonHtmlFalse() { const string input = "

valid html

"; const string expected = "

valid html

"; - var actual = input.SanitiseHtml(); + var actual = input.SanitiseHtml(false); actual.Should().Be(expected); } [Fact] - public void SanitiseHtml_AutoCloses_ValidTags() + public void SanitiseHtml_AutoCloses_ValidTags_IfIgnoreNonHtmlFalse() { const string input = "

valid html

"; const string expected = "

valid html

"; - var actual = input.SanitiseHtml(); + var actual = input.SanitiseHtml(false); actual.Should().Be(expected); } @@ -103,28 +104,55 @@ public void SanitiseHtml_RemovesProcessingInstructions() } [Theory] + [InlineData("

html")] + [InlineData("

htmlhtml

")] + [InlineData("html

")] + [InlineData("

valid html

")] + public void SanitiseHtml_ReturnsInvalidHtml_IfIgnoreNonHtmlTrue(string input) + { + var actual = input.SanitiseHtml(); + + actual.Should().Be(input); + } + + [Theory] + [InlineData("html", "html")] + [InlineData(" html ", "html")] [InlineData("

html", "

html

")] [InlineData("

htmlhtml

")] [InlineData("p>html

", "p>html

")] [InlineData("html

", "html

")] - public void SanitiseHtml_HandlesInvalidHtml(string input, string expected) + public void SanitiseHtml_HandlesInvalidHtml_IfIgnoreNonHtmlFalse(string input, string expected) { - var actual = input.SanitiseHtml(); + var actual = input.SanitiseHtml(false); actual.Should().Be(expected); } [Theory] + [InlineData("html", "

html

")] + [InlineData(" html ", "

html

")] [InlineData("

html", "

html

")] [InlineData("

htmlhtml

")] - [InlineData("p>html

", "

p>html

")] - [InlineData("html

", "

html

")] + [InlineData("p>html

", "

p>html

")] + [InlineData("html

", "

html

")] public void SanitiseHtml_HandlesInvalidHtml_WithCustomWrapperTag(string input, string expected) { - var actual = input.SanitiseHtml("p"); + var actual = input.SanitiseHtml(false, "p"); actual.Should().Be(expected); } + + [Fact] + public void SanitiseHtml_Throws_IfCustomWrapperTagNotAllowed() + { + Action action = () => "html".SanitiseHtml(false, "div"); + + action.Should() + .ThrowExactly() + .WithMessage("Tag provided is not allowed. Must be one of: a,b,br,i,img,p,small,span,sub,sup (Parameter 'nonHtmlWrappingTag')"); + } [Theory] [InlineData("a")] diff --git a/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs index 7049266..a4ab7b6 100644 --- a/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs +++ b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using Ganss.Xss; namespace IIIF.Presentation; @@ -54,23 +55,35 @@ static HtmlSanitiser() /// see https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values /// /// Value to be sanitised + /// + /// If true, any strings that don't start/end with </> are returned as-is. If false non-html strings will be + /// wrapped + /// /// - /// Tag to wrap value in if it is not currently an HTML string (starts with < and ends with >) + /// Tag to wrap value in if it is not currently an HTML string (starts with < and ends with >). Only used if + /// is true /// /// Sanitised markup value - public static string SanitiseHtml(this string propertyValue, string nonHtmlWrappingTag = "span") + public static string SanitiseHtml(this string propertyValue, bool ignoreNonHtml = true, + string nonHtmlWrappingTag = "span") { if (string.IsNullOrEmpty(propertyValue)) return propertyValue; + if (ignoreNonHtml && !IsHtmlString(propertyValue)) return propertyValue; var workingString = Sanitizer.Sanitize(propertyValue.Trim()); + + if (IsHtmlString(workingString)) return workingString; - if (IsHtmlString(workingString)) + if (!HtmlSanitizerOptions.AllowedTags.Contains(nonHtmlWrappingTag)) { - workingString = $"<{nonHtmlWrappingTag}>{workingString}"; + throw new ArgumentException( + $"Tag provided is not allowed. Must be one of: {string.Join(",", HtmlSanitizerOptions.AllowedTags)}", + nameof(nonHtmlWrappingTag)); } + workingString = $"<{nonHtmlWrappingTag}>{workingString}"; - return workingString; + return Sanitizer.Sanitize(workingString); } - private static bool IsHtmlString(string workingString) => workingString[0] != '<' || workingString[^1] != '>'; + private static bool IsHtmlString(string candidate) => candidate[0] == '<' && candidate[^1] == '>'; } \ No newline at end of file