diff --git a/readme.md b/readme.md index 252e959..ccc0a1d 100644 --- a/readme.md +++ b/readme.md @@ -85,6 +85,8 @@ The `.Serialisation` namespace contains a number of custom `JsonConverter` imple ### Helpers +#### Serialisation + `IIIFSerialiserX` contains 2 extension methods for `JsonLdBase` that help with serialising / deserialising models. For string serialisation these are `AsJson` and `FromJson`: @@ -110,6 +112,19 @@ Manifest deserialisedManifest = streamContainingManifest.FromJsonStream Note: full object deserialisation is incomplete - open an issue or PR if you find an issue. +#### HTML Markup Handling + +`HtmlSanitiser` contains a `SanitiseHtml()` extension method on `string` to help sanitise HTML. + +```cs +string original = "

my markup

invalid

"; +string safe = original.SanitiseHtml(); +``` + +See [IIIF Presentation 3.0 docs](https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values) for details on html markup. + +> Note: The rules around markup differs between Presentation 2.1 and 3.0. This method uses 3.0 which permits a couple of tags not mentioned in 2.1 (`small`, `sub` and `sup`). + ## Local Build The `local_build.sh` bash script will build/test/pack for ease of testing. diff --git a/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs new file mode 100644 index 0000000..37b3d23 --- /dev/null +++ b/src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs @@ -0,0 +1,228 @@ +using System; +using IIIF.Presentation; + +namespace IIIF.Tests.Presentation; + +public class HtmlSanitiserTests +{ + [Theory] + [InlineData(null)] + [InlineData("")] + public void SanitiseHtml_ReturnsGivenString_IfNullOrEmpty(string val) + => val.SanitiseHtml().Should().Be(val); + + [Fact] + public void SanitiseHtml_Trims_Whitespace_FromBeginningAndEnd_IfIgnoreNonHtmlFalse() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(false); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_AutoCloses_ValidTags_IfIgnoreNonHtmlFalse() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(false); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesInvalidTags_IncludingChildElements() + { + const string input = + "
hi

child paragraph

Test

    1. foo

valid paragraph

"; + const string expected = "
hi

valid paragraph

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Theory] + [InlineData("http://localhost")] + [InlineData("https://localhost")] + [InlineData("mailto://test@example")] + public void SanitiseHtml_AllowsSpecifiedSchemesForHref(string href) + { + var input = $"test"; + var expected = $"test"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_StripsInvalidSchemesFromHref() + { + var input = "test"; + const string expected = "test"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesComments() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesCdata() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesProcessingInstructions() + { + const string input = "

valid html

"; + const string expected = "

valid html

"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Theory] + [InlineData("

html")] + [InlineData("

htmlhtml

")] + [InlineData("html

")] + [InlineData("

valid html

")] + public void SanitiseHtml_ReturnsInvalidHtml_IfIgnoreNonHtmlTrue(string input) + { + var actual = input.SanitiseHtml(); + + actual.Should().Be(input); + } + + [Theory] + [InlineData("html", "html")] + [InlineData(" html ", "html")] + [InlineData("

html", "

html

")] + [InlineData("

htmlhtml

")] + [InlineData("p>html

", "p>html

")] + [InlineData("html

", "html

")] + public void SanitiseHtml_HandlesInvalidHtml_IfIgnoreNonHtmlFalse(string input, string expected) + { + var actual = input.SanitiseHtml(false); + + actual.Should().Be(expected); + } + + [Theory] + [InlineData("html", "

html

")] + [InlineData(" html ", "

html

")] + [InlineData("

html", "

html

")] + [InlineData("

htmlhtml

")] + [InlineData("p>html

", "

p>html

")] + [InlineData("html

", "

html

")] + public void SanitiseHtml_HandlesInvalidHtml_WithCustomWrapperTag(string input, string expected) + { + var actual = input.SanitiseHtml(false, "p"); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_Throws_IfCustomWrapperTagNotAllowed() + { + Action action = () => "html".SanitiseHtml(false, "div"); + + action.Should() + .ThrowExactly() + .WithMessage("Tag provided is not allowed. Must be one of: a,b,br,i,img,p,small,span,sub,sup (Parameter 'nonHtmlWrappingTag')"); + } + + [Theory] + [InlineData("a")] + [InlineData("b")] + [InlineData("i")] + [InlineData("p")] + [InlineData("small")] + [InlineData("span")] + [InlineData("sub")] + [InlineData("sup")] + public void SanitiseHtml_RemovesSrcAndAltAttributes_FromNonImgTag(string tag) + { + var input = $"<{tag} alt=\"alt\" src=\"http://foo\">x"; + var expected = $"<{tag}>x"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Theory] + [InlineData("b")] + [InlineData("i")] + [InlineData("p")] + [InlineData("small")] + [InlineData("span")] + [InlineData("sub")] + [InlineData("sup")] + public void SanitiseHtml_RemovesHrefAttribute_FromNonAnchorTag(string tag) + { + var input = $"<{tag} href=\"http://foo\">x"; + var expected = $"<{tag}>x"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesHrefAttribute_FromImageTag() + { + // NOTE: this is excluded from above as it has no closing tag so avoids logic in tests + const string input = ""; + const string expected = ""; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_RemovesHref_Src_AndAltAttributes_FromLineBreak() + { + // NOTE: this is excluded from above as it has no closing tag so avoids logic in tests + const string input = "
"; + const string expected = "
"; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } + + [Fact] + public void SanitiseHtml_AllowsSrcAndAltAttributes_OnImgTag() + { + const string input = "\"alt\""; + const string expected = "\"alt\""; + + var actual = input.SanitiseHtml(); + + actual.Should().Be(expected); + } +} \ No newline at end of file diff --git a/src/IIIF/IIIF/IIIF.csproj b/src/IIIF/IIIF/IIIF.csproj index 6e86e84..d8cb475 100644 --- a/src/IIIF/IIIF/IIIF.csproj +++ b/src/IIIF/IIIF/IIIF.csproj @@ -19,6 +19,7 @@ + diff --git a/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs new file mode 100644 index 0000000..a4ab7b6 --- /dev/null +++ b/src/IIIF/IIIF/Presentation/HtmlSanitiser.cs @@ -0,0 +1,89 @@ +using System; +using System.Collections.Generic; +using Ganss.Xss; + +namespace IIIF.Presentation; + +/// +/// Class to help in sanitising HTML markup for use in IIIF property values +/// +/// See https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values +public static class HtmlSanitiser +{ + private static readonly HtmlSanitizerOptions HtmlSanitizerOptions = new() + { + AllowedTags = new HashSet { "a", "b", "br", "i", "img", "p", "small", "span", "sub", "sup" }, + AllowedAttributes = new HashSet(0), + AllowedSchemes = new HashSet { "http", "https", "mailto" }, + UriAttributes = new HashSet { "href" } + }; + + private static readonly HtmlSanitizer Sanitizer = new(HtmlSanitizerOptions); + + private static readonly Dictionary> ValidAttributesPerTag + = new() + { + ["a"] = new HashSet { "href" }, + ["img"] = new HashSet { "src", "alt" }, + }; + + static HtmlSanitiser() + { + // NOTE - used HTML sanitiser lib doesn't allow tag-specific attributes so subscribe to RemovingAttribute + // events and cancel those that should be allowed + Sanitizer.RemovingAttribute += (sender, args) => + { + // Attribute can also be removed if scheme isn't allowed + if (args.Reason != RemoveReason.NotAllowedAttribute) return; + args.Cancel = ValidAttributesPerTag.TryGetValue(args.Tag.TagName.ToLower(), out var allowedAttributes) + && allowedAttributes.Contains(args.Attribute.Name.ToLower()); + }; + } + + + /// + /// Sanitise markup to meet requirements in IIIF spec. This will + /// + /// * Remove all tags except: a, b, br, i, img, p, small, span, sub and sup + /// * Remove all attributes other than href on the a tag, src and alt on the img tag + /// * Remove all href attributes that start with the strings other than “http:”, “https:”, and “mailto:” + /// * CData sections + /// * XML comments + /// * Processing instructions + /// * Strip whitespace from either side of HTML string + /// + /// see https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values + /// + /// Value to be sanitised + /// + /// If true, any strings that don't start/end with </> are returned as-is. If false non-html strings will be + /// wrapped + /// + /// + /// Tag to wrap value in if it is not currently an HTML string (starts with < and ends with >). Only used if + /// is true + /// + /// Sanitised markup value + public static string SanitiseHtml(this string propertyValue, bool ignoreNonHtml = true, + string nonHtmlWrappingTag = "span") + { + if (string.IsNullOrEmpty(propertyValue)) return propertyValue; + if (ignoreNonHtml && !IsHtmlString(propertyValue)) return propertyValue; + + var workingString = Sanitizer.Sanitize(propertyValue.Trim()); + + if (IsHtmlString(workingString)) return workingString; + + if (!HtmlSanitizerOptions.AllowedTags.Contains(nonHtmlWrappingTag)) + { + throw new ArgumentException( + $"Tag provided is not allowed. Must be one of: {string.Join(",", HtmlSanitizerOptions.AllowedTags)}", + nameof(nonHtmlWrappingTag)); + } + workingString = $"<{nonHtmlWrappingTag}>{workingString}"; + + return Sanitizer.Sanitize(workingString); + } + + private static bool IsHtmlString(string candidate) => candidate[0] == '<' && candidate[^1] == '>'; +} \ No newline at end of file