Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SanitiseHtml helper method #43

Merged
merged 2 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ The `.Serialisation` namespace contains a number of custom `JsonConverter` imple

### Helpers

#### Serialisation

`IIIFSerialiserX` contains 2 extension methods for `JsonLdBase` that help with serialising / deserialising models.

For string serialisation these are `AsJson` and `FromJson<TTarget>`:
Expand All @@ -110,6 +112,19 @@ Manifest deserialisedManifest = streamContainingManifest.FromJsonStream<Manifest

> Note: full object deserialisation is incomplete - open an issue or PR if you find an issue.

#### HTML Markup Handling

`HtmlSanitiser` contains a `SanitiseHtml()` extension method on `string` to help sanitise HTML.

```cs
string original = "<p>my markup<div>invalid</div></p>";
string safe = original.SanitiseHtml();
```

See the [IIIF Presentation 3.0 docs](https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values) for details on HTML markup.

> Note: The rules around markup differ between Presentation 2.1 and 3.0. This method follows 3.0, which permits a few tags not mentioned in 2.1 (`small`, `sub` and `sup`).

## Local Build

The `local_build.sh` bash script will build/test/pack for ease of testing.
Expand Down
200 changes: 200 additions & 0 deletions src/IIIF/IIIF.Tests/Presentation/HtmlSanitiserTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
using IIIF.Presentation;

namespace IIIF.Tests.Presentation;

/// <summary>
/// Tests for the <c>SanitiseHtml</c> string extension method.
/// </summary>
public class HtmlSanitiserTests
{
    [Theory]
    [InlineData(null)]
    [InlineData("")]
    public void SanitiseHtml_ReturnsGivenString_IfNullOrEmpty(string val)
    {
        var result = val.SanitiseHtml();

        result.Should().Be(val);
    }

    [Fact]
    public void SanitiseHtml_Trims_Whitespace_From_Beginning_And_End()
        => " <p>valid html</p> ".SanitiseHtml().Should().Be("<p>valid html</p>");

    [Fact]
    public void SanitiseHtml_AutoCloses_ValidTags()
        => " <p>valid html</p><span> ".SanitiseHtml().Should().Be("<p>valid html</p><span></span>");

    [Fact]
    public void SanitiseHtml_RemovesInvalidTags_IncludingChildElements()
    {
        // Mix of permitted tags with disallowed ones (div, h1, ul/ol/li, script) and their children
        const string markup =
            "<br><span><small><i>hi</i></small></span><div><p>child paragraph</p></div><h1>Test</h1><ul><ol><li>foo</li></ol></ul><p><script>alert('hi');</script><sub>valid</sub> <sup>paragraph</sup></p>";
        const string sanitised = "<br><span><small><i>hi</i></small></span><p><sub>valid</sub> <sup>paragraph</sup></p>";

        var result = markup.SanitiseHtml();

        result.Should().Be(sanitised);
    }

    [Theory]
    [InlineData("http://localhost")]
    [InlineData("https://localhost")]
    [InlineData("mailto://test@example")]
    public void SanitiseHtml_AllowsSpecifiedSchemesForHref(string href)
    {
        // The anchor is expected to come back untouched
        var anchor = $"<a href=\"{href}\">test</a>";

        var result = anchor.SanitiseHtml();

        result.Should().Be(anchor);
    }

    [Fact]
    public void SanitiseHtml_StripsInvalidSchemesFromHref()
        => "<a href=\"other://foo-bar\">test</a>".SanitiseHtml().Should().Be("<a>test</a>");

    [Fact]
    public void SanitiseHtml_RemovesComments()
    {
        const string markup = "<!--This will be removed--><p><!--as will this-->valid html</p>";

        var result = markup.SanitiseHtml();

        result.Should().Be("<p>valid html</p>");
    }

    [Fact]
    public void SanitiseHtml_RemovesCdata()
    {
        const string markup = "<![CDATA[This will be removed]]><p>valid html</p>";

        var result = markup.SanitiseHtml();

        result.Should().Be("<p>valid html</p>");
    }

    [Fact]
    public void SanitiseHtml_RemovesProcessingInstructions()
    {
        const string markup = "<?xml version=\"1.0\"?><p>valid html</p>";

        var result = markup.SanitiseHtml();

        result.Should().Be("<p>valid html</p>");
    }

    [Theory]
    [InlineData("<p>html", "<p>html</p>")]
    [InlineData("<p>html</p", "<p>html</p>")]
    [InlineData("p>html</p>", "<span>p&gt;html<p></p></span>")]
    [InlineData("html</p>", "<span>html<p></p></span>")]
    public void SanitiseHtml_HandlesInvalidHtml(string input, string expected)
        => input.SanitiseHtml().Should().Be(expected);

    [Theory]
    [InlineData("<p>html", "<p>html</p>")]
    [InlineData("<p>html</p", "<p>html</p>")]
    [InlineData("p>html</p>", "<p>p&gt;html<p></p></p>")]
    [InlineData("html</p>", "<p>html<p></p></p>")]
    public void SanitiseHtml_HandlesInvalidHtml_WithCustomWrapperTag(string input, string expected)
        => input.SanitiseHtml("p").Should().Be(expected);

    [Theory]
    [InlineData("a")]
    [InlineData("b")]
    [InlineData("i")]
    [InlineData("p")]
    [InlineData("small")]
    [InlineData("span")]
    [InlineData("sub")]
    [InlineData("sup")]
    public void SanitiseHtml_RemovesSrcAndAltAttributes_FromNonImgTag(string tag)
    {
        var markup = $"<{tag} alt=\"alt\" src=\"http://foo\">x</{tag}>";
        var withoutAttributes = $"<{tag}>x</{tag}>";

        var result = markup.SanitiseHtml();

        result.Should().Be(withoutAttributes);
    }

    [Theory]
    [InlineData("b")]
    [InlineData("i")]
    [InlineData("p")]
    [InlineData("small")]
    [InlineData("span")]
    [InlineData("sub")]
    [InlineData("sup")]
    public void SanitiseHtml_RemovesHrefAttribute_FromNonAnchorTag(string tag)
    {
        var markup = $"<{tag} href=\"http://foo\">x</{tag}>";
        var withoutAttributes = $"<{tag}>x</{tag}>";

        var result = markup.SanitiseHtml();

        result.Should().Be(withoutAttributes);
    }

    // NOTE: img has no closing tag, so it is tested separately to keep logic out of the theory above
    [Fact]
    public void SanitiseHtml_RemovesHrefAttribute_FromImageTag()
        => "<img href=\"http://foo\">".SanitiseHtml().Should().Be("<img>");

    // NOTE: br has no closing tag, so it is tested separately to keep logic out of the theories above
    [Fact]
    public void SanitiseHtml_RemovesHref_Src_AndAltAttributes_FromLineBreak()
        => "<br alt=\"alt\" src=\"http://foo\" href=\"http://foo\">".SanitiseHtml().Should().Be("<br>");

    [Fact]
    public void SanitiseHtml_AllowsSrcAndAltAttributes_OnImgTag()
    {
        // The image tag is expected to come back untouched
        const string image = "<img alt=\"alt\" src=\"http://img.jpg\">";

        var result = image.SanitiseHtml();

        result.Should().Be(image);
    }
}
1 change: 1 addition & 0 deletions src/IIIF/IIIF/IIIF.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="HtmlSanitizer" Version="8.0.746" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.2"/>
<PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.1.1" PrivateAssets="All"/>
</ItemGroup>
Expand Down
76 changes: 76 additions & 0 deletions src/IIIF/IIIF/Presentation/HtmlSanitiser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
using System.Collections.Generic;
using Ganss.Xss;

namespace IIIF.Presentation;

/// <summary>
/// Class to help in sanitising HTML markup for use in IIIF property values
/// </summary>
/// <remarks>See https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values</remarks>
public static class HtmlSanitiser
{
    private static readonly HtmlSanitizerOptions HtmlSanitizerOptions = new()
    {
        AllowedTags = new HashSet<string> { "a", "b", "br", "i", "img", "p", "small", "span", "sub", "sup" },
        // No attribute is allowed globally; tag-specific exceptions are granted in the RemovingAttribute handler
        AllowedAttributes = new HashSet<string>(0),
        AllowedSchemes = new HashSet<string> { "http", "https", "mailto" },
        UriAttributes = new HashSet<string> { "href" }
    };

    private static readonly HtmlSanitizer Sanitizer = new(HtmlSanitizerOptions);

    // Attributes that may be kept, keyed by the tag they are allowed on. Ordinal case-insensitive comparers are
    // used so lookups are independent of the casing of the parsed markup and of the current culture
    // (a culture-sensitive lower-casing maps "I" to a dotless "ı" under Turkish cultures, which would miss "img").
    private static readonly Dictionary<string, ISet<string>> ValidAttributesPerTag
        = new(System.StringComparer.OrdinalIgnoreCase)
        {
            ["a"] = new HashSet<string>(System.StringComparer.OrdinalIgnoreCase) { "href" },
            ["img"] = new HashSet<string>(System.StringComparer.OrdinalIgnoreCase) { "src", "alt" },
        };

    static HtmlSanitiser()
    {
        // NOTE - used HTML sanitiser lib doesn't allow tag-specific attributes so subscribe to RemovingAttribute
        // events and cancel those that should be allowed
        Sanitizer.RemovingAttribute += (sender, args) =>
        {
            // Attribute can also be removed if scheme isn't allowed - never cancel those removals
            if (args.Reason != RemoveReason.NotAllowedAttribute) return;

            args.Cancel = ValidAttributesPerTag.TryGetValue(args.Tag.TagName, out var allowedAttributes)
                          && allowedAttributes.Contains(args.Attribute.Name);
        };
    }


    /// <summary>
    /// Sanitise markup to meet requirements in IIIF spec. This will:
    ///
    /// * Remove all tags except: a, b, br, i, img, p, small, span, sub and sup
    /// * Remove all attributes other than href on the a tag, src and alt on the img tag
    /// * Remove all href attributes that start with strings other than “http:”, “https:”, and “mailto:”
    /// * Remove CData sections
    /// * Remove XML comments
    /// * Remove processing instructions
    /// * Strip whitespace from either side of HTML string
    ///
    /// see https://iiif.io/api/presentation/3.0/#45-html-markup-in-property-values
    /// </summary>
    /// <param name="propertyValue">Value to be sanitised</param>
    /// <param name="nonHtmlWrappingTag">
    /// Tag to wrap value in if it is not currently an HTML string (starts with &lt; and ends with &gt;)
    /// </param>
    /// <returns>
    /// Sanitised markup value. A null or empty <paramref name="propertyValue"/> is returned unchanged. If nothing
    /// is left after trimming and sanitising (for example whitespace-only input) an empty string is returned and
    /// no wrapping tag is added.
    /// </returns>
    public static string SanitiseHtml(this string propertyValue, string nonHtmlWrappingTag = "span")
    {
        if (string.IsNullOrEmpty(propertyValue)) return propertyValue;

        var sanitised = Sanitizer.Sanitize(propertyValue.Trim());

        // Nothing survived trimming + sanitising: there is nothing to wrap, and inspecting the first/last
        // character of an empty string would throw
        if (sanitised.Length == 0) return sanitised;

        return IsHtmlString(sanitised)
            ? sanitised
            : $"<{nonHtmlWrappingTag}>{sanitised}</{nonHtmlWrappingTag}>";
    }

    // A value counts as an HTML string when it starts with '<' and ends with '>'
    private static bool IsHtmlString(string value)
        => value.Length > 0 && value[0] == '<' && value[^1] == '>';
}
Loading