Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SanitiseHtml helper method #43

Merged
merged 2 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ The `.Serialisation` namespace contains a number of custom `JsonConverter` imple

### Helpers

#### Serialisation

`IIIFSerialiserX` contains 2 extension methods for `JsonLdBase` that help with serialising / deserialising models.

For string serialisation these are `AsJson` and `FromJson<TTarget>`:
Expand All @@ -110,6 +112,19 @@ Manifest deserialisedManifest = streamContainingManifest.FromJsonStream<Manifest

> Note: full object deserialisation is incomplete - open an issue or PR if you find an issue.

#### HTML Markup Handling

`HtmlSanitiser` contains a `SanitiseHtml()` extension method on `string` to help sanitise HTML.

```cs
string original = "<p>my markup<div>invalid</div><p>";
string safe = original.SanitiseHtml();
```

See the [IIIF Presentation 3.0 docs](https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values) for details on HTML markup.

> Note: The rules around markup differ between Presentation 2.1 and 3.0. This method uses 3.0, which permits three tags not mentioned in 2.1 (`small`, `sub` and `sup`).

## Local Build

The `local_build.sh` bash script will build/test/pack for ease of testing.
Expand Down
228 changes: 228 additions & 0 deletions src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
using System;
using IIIF.Presentation;

namespace IIIF.Tests.Presentation;

/// <summary>
/// Tests for the <c>SanitiseHtml</c> string extension: allowed tags, per-tag attributes, href schemes,
/// removal of comments / CDATA / processing instructions, and wrapping of non-HTML values.
/// </summary>
public class HtmlSanitiserTests
{
    [Theory]
    [InlineData(null)]
    [InlineData("")]
    public void SanitiseHtml_ReturnsGivenString_IfNullOrEmpty(string val)
    {
        var sanitised = val.SanitiseHtml();

        sanitised.Should().Be(val);
    }

    [Fact]
    public void SanitiseHtml_Trims_Whitespace_FromBeginningAndEnd_IfIgnoreNonHtmlFalse()
    {
        const string markup = " <p>valid html</p> ";
        const string wanted = "<p>valid html</p>";

        markup.SanitiseHtml(false).Should().Be(wanted);
    }

    [Fact]
    public void SanitiseHtml_AutoCloses_ValidTags_IfIgnoreNonHtmlFalse()
    {
        const string markup = " <p>valid html</p><span> ";
        const string wanted = "<p>valid html</p><span></span>";

        markup.SanitiseHtml(false).Should().Be(wanted);
    }

    [Fact]
    public void SanitiseHtml_RemovesInvalidTags_IncludingChildElements()
    {
        // div, h1, ul/ol/li and script are not permitted; they are dropped together with their content
        const string markup =
            "<br><span><small><i>hi</i></small></span><div><p>child paragraph</p></div><h1>Test</h1><ul><ol><li>foo</li></ol></ul><p><script>alert('hi');</script><sub>valid</sub> <sup>paragraph</sup></p>";
        const string wanted = "<br><span><small><i>hi</i></small></span><p><sub>valid</sub> <sup>paragraph</sup></p>";

        markup.SanitiseHtml().Should().Be(wanted);
    }

    [Theory]
    [InlineData("http://localhost")]
    [InlineData("https://localhost")]
    [InlineData("mailto://test@example")]
    public void SanitiseHtml_AllowsSpecifiedSchemesForHref(string href)
    {
        // An anchor with a permitted scheme must come back untouched
        var anchor = $"<a href=\"{href}\">test</a>";

        anchor.SanitiseHtml().Should().Be(anchor);
    }

    [Fact]
    public void SanitiseHtml_StripsInvalidSchemesFromHref()
    {
        const string markup = "<a href=\"other://foo-bar\">test</a>";
        const string wanted = "<a>test</a>";

        markup.SanitiseHtml().Should().Be(wanted);
    }

    [Fact]
    public void SanitiseHtml_RemovesComments()
    {
        const string markup = "<!--This will be removed--><p><!--as will this-->valid html</p>";
        const string wanted = "<p>valid html</p>";

        markup.SanitiseHtml().Should().Be(wanted);
    }

    [Fact]
    public void SanitiseHtml_RemovesCdata()
    {
        const string markup = "<![CDATA[This will be removed]]><p>valid html</p>";
        const string wanted = "<p>valid html</p>";

        markup.SanitiseHtml().Should().Be(wanted);
    }

    [Fact]
    public void SanitiseHtml_RemovesProcessingInstructions()
    {
        const string markup = "<?xml version=\"1.0\"?><p>valid html</p>";
        const string wanted = "<p>valid html</p>";

        markup.SanitiseHtml().Should().Be(wanted);
    }

    [Theory]
    [InlineData("<p>html")]
    [InlineData("<p>html</p")]
    [InlineData("p>html</p>")]
    [InlineData("html</p>")]
    [InlineData(" <p>valid html</p> ")]
    public void SanitiseHtml_ReturnsInvalidHtml_IfIgnoreNonHtmlTrue(string input)
    {
        // None of these both start with '<' and end with '>', so the default mode leaves them alone
        var sanitised = input.SanitiseHtml();

        sanitised.Should().Be(input);
    }

    [Theory]
    [InlineData("html", "<span>html</span>")]
    [InlineData(" html ", "<span>html</span>")]
    [InlineData("<p>html", "<p>html</p>")]
    [InlineData("<p>html</p", "<p>html</p>")]
    [InlineData("p>html</p>", "<span>p&gt;html<p></p></span>")]
    [InlineData("html</p>", "<span>html<p></p></span>")]
    public void SanitiseHtml_HandlesInvalidHtml_IfIgnoreNonHtmlFalse(string input, string expected)
    {
        var sanitised = input.SanitiseHtml(false);

        sanitised.Should().Be(expected);
    }

    [Theory]
    [InlineData("html", "<p>html</p>")]
    [InlineData(" html ", "<p>html</p>")]
    [InlineData("<p>html", "<p>html</p>")]
    [InlineData("<p>html</p", "<p>html</p>")]
    [InlineData("p>html</p>", "<p>p&gt;html</p><p></p><p></p>")]
    [InlineData("html</p>", "<p>html</p><p></p><p></p>")]
    public void SanitiseHtml_HandlesInvalidHtml_WithCustomWrapperTag(string input, string expected)
    {
        var sanitised = input.SanitiseHtml(false, "p");

        sanitised.Should().Be(expected);
    }

    [Fact]
    public void SanitiseHtml_Throws_IfCustomWrapperTagNotAllowed()
    {
        // "div" is not in the permitted tag list so cannot be used as the wrapper
        Action wrapInDiv = () => "html".SanitiseHtml(false, "div");

        wrapInDiv.Should()
            .ThrowExactly<ArgumentException>()
            .WithMessage("Tag provided is not allowed. Must be one of: a,b,br,i,img,p,small,span,sub,sup (Parameter 'nonHtmlWrappingTag')");
    }

    [Theory]
    [InlineData("a")]
    [InlineData("b")]
    [InlineData("i")]
    [InlineData("p")]
    [InlineData("small")]
    [InlineData("span")]
    [InlineData("sub")]
    [InlineData("sup")]
    public void SanitiseHtml_RemovesSrcAndAltAttributes_FromNonImgTag(string tag)
    {
        var withAttributes = $"<{tag} alt=\"alt\" src=\"http://foo\">x</{tag}>";
        var bare = $"<{tag}>x</{tag}>";

        withAttributes.SanitiseHtml().Should().Be(bare);
    }

    [Theory]
    [InlineData("b")]
    [InlineData("i")]
    [InlineData("p")]
    [InlineData("small")]
    [InlineData("span")]
    [InlineData("sub")]
    [InlineData("sup")]
    public void SanitiseHtml_RemovesHrefAttribute_FromNonAnchorTag(string tag)
    {
        var withHref = $"<{tag} href=\"http://foo\">x</{tag}>";
        var bare = $"<{tag}>x</{tag}>";

        withHref.SanitiseHtml().Should().Be(bare);
    }

    [Fact]
    public void SanitiseHtml_RemovesHrefAttribute_FromImageTag()
    {
        // NOTE: img has no closing tag, so it is kept out of the theory above to avoid branching in the test
        const string markup = "<img href=\"http://foo\">";
        const string wanted = "<img>";

        markup.SanitiseHtml().Should().Be(wanted);
    }

    [Fact]
    public void SanitiseHtml_RemovesHref_Src_AndAltAttributes_FromLineBreak()
    {
        // NOTE: br has no closing tag, so it is kept out of the theories above to avoid branching in the test
        const string markup = "<br alt=\"alt\" src=\"http://foo\" href=\"http://foo\">";
        const string wanted = "<br>";

        markup.SanitiseHtml().Should().Be(wanted);
    }

    [Fact]
    public void SanitiseHtml_AllowsSrcAndAltAttributes_OnImgTag()
    {
        // src and alt are the only attributes img may keep, so the markup must round-trip unchanged
        const string markup = "<img alt=\"alt\" src=\"http://img.jpg\">";

        markup.SanitiseHtml().Should().Be(markup);
    }
}
1 change: 1 addition & 0 deletions src/IIIF/IIIF/IIIF.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="HtmlSanitizer" Version="8.0.746" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.2"/>
<PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.1.1" PrivateAssets="All"/>
</ItemGroup>
Expand Down
89 changes: 89 additions & 0 deletions src/IIIF/IIIF/Presentation/HtmlSanitiser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
using System;
using System.Collections.Generic;
using Ganss.Xss;

namespace IIIF.Presentation;

/// <summary>
/// Class to help in sanitising HTML markup for use in IIIF property values
/// </summary>
/// <remarks>See https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values</remarks>
public static class HtmlSanitiser
{
    // Options handed to the underlying sanitiser. No attribute is allowed globally; the per-tag exceptions are
    // applied via the RemovingAttribute handler registered in the static constructor.
    private static readonly HtmlSanitizerOptions HtmlSanitizerOptions = new()
    {
        AllowedTags = new HashSet<string> { "a", "b", "br", "i", "img", "p", "small", "span", "sub", "sup" },
        AllowedAttributes = new HashSet<string>(0),
        AllowedSchemes = new HashSet<string> { "http", "https", "mailto" },
        // Only href values are scheme-checked
        UriAttributes = new HashSet<string> { "href" }
    };

    private static readonly HtmlSanitizer Sanitizer = new(HtmlSanitizerOptions);

    // Attributes that may be kept, keyed by tag. Lookups are ordinal + case-insensitive so that matching does not
    // depend on the current culture (a culture-sensitive ToLower() turns "IMG" into "ımg" under tr-TR).
    private static readonly Dictionary<string, ISet<string>> ValidAttributesPerTag
        = new(StringComparer.OrdinalIgnoreCase)
        {
            ["a"] = new HashSet<string>(StringComparer.OrdinalIgnoreCase) { "href" },
            ["img"] = new HashSet<string>(StringComparer.OrdinalIgnoreCase) { "src", "alt" },
        };

    static HtmlSanitiser()
    {
        // NOTE - used HTML sanitiser lib doesn't allow tag-specific attributes so subscribe to RemovingAttribute
        // events and cancel those that should be allowed
        Sanitizer.RemovingAttribute += (sender, args) =>
        {
            // Attribute can also be removed if scheme isn't allowed - never cancel those removals
            if (args.Reason != RemoveReason.NotAllowedAttribute) return;

            args.Cancel = ValidAttributesPerTag.TryGetValue(args.Tag.TagName, out var allowedAttributes)
                          && allowedAttributes.Contains(args.Attribute.Name);
        };
    }

    /// <summary>
    /// Sanitise markup to meet requirements in IIIF spec. This will
    ///
    /// * Remove all tags except: a, b, br, i, img, p, small, span, sub and sup
    /// * Remove all attributes other than href on the a tag, src and alt on the img tag
    /// * Remove all href attributes that start with strings other than “http:”, “https:”, and “mailto:”
    /// * Remove CData sections
    /// * Remove XML comments
    /// * Remove processing instructions
    /// * Strip whitespace from either side of HTML string
    ///
    /// see https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values
    /// </summary>
    /// <param name="propertyValue">Value to be sanitised</param>
    /// <param name="ignoreNonHtml">
    /// If true, any strings that don't start/end with &lt;/&gt; are returned as-is. If false non-html strings will be
    /// sanitised and wrapped in <paramref name="nonHtmlWrappingTag"/>
    /// </param>
    /// <param name="nonHtmlWrappingTag">
    /// Tag to wrap the sanitised value in if it is not an HTML string (one that starts with &lt; and ends with &gt;).
    /// This applies when <paramref name="ignoreNonHtml"/> is false, and also when sanitising removes the leading or
    /// trailing tag of a value that originally looked like HTML. Must be one of the allowed tags.
    /// </param>
    /// <returns>
    /// Sanitised markup value. The given value is returned unchanged if it is null or empty; an empty string is
    /// returned if nothing remains after sanitising.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// Thrown if the value needs wrapping and <paramref name="nonHtmlWrappingTag"/> is not an allowed tag
    /// </exception>
    public static string SanitiseHtml(this string propertyValue, bool ignoreNonHtml = true,
        string nonHtmlWrappingTag = "span")
    {
        if (string.IsNullOrEmpty(propertyValue)) return propertyValue;
        if (ignoreNonHtml && !IsHtmlString(propertyValue)) return propertyValue;

        var workingString = Sanitizer.Sanitize(propertyValue.Trim());

        // Sanitising can leave nothing behind (whitespace-only input, or markup made up solely of disallowed
        // elements such as <script>). There is nothing to wrap, so return the empty result.
        if (workingString.Length == 0) return workingString;

        if (IsHtmlString(workingString)) return workingString;

        if (!HtmlSanitizerOptions.AllowedTags.Contains(nonHtmlWrappingTag))
        {
            throw new ArgumentException(
                $"Tag provided is not allowed. Must be one of: {string.Join(",", HtmlSanitizerOptions.AllowedTags)}",
                nameof(nonHtmlWrappingTag));
        }

        workingString = $"<{nonHtmlWrappingTag}>{workingString}</{nonHtmlWrappingTag}>";

        // Sanitise again so the wrapped value is normalised by the same rules
        return Sanitizer.Sanitize(workingString);
    }

    // True if candidate starts with '<' and ends with '>'. The length check guards the indexers against an empty
    // string.
    private static bool IsHtmlString(string candidate)
        => candidate.Length > 0 && candidate[0] == '<' && candidate[^1] == '>';
}
Loading