Skip to main content

C# method to strip HTML tags from a string, leaving its text content in place, safely and accurately.

using System.Diagnostics.CodeAnalysis;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using Ganss.Xss;

/// <summary>
/// Removes HTML tags from a string, keeping the text between them.
/// Uses the AngleSharp package (also used by Ganss.Xss.HtmlSanitizer) to parse the markup.
/// </summary>
/// <remarks>
/// See https://code-maze.com/csharp-remove-html-tags-from-a-string/
/// </remarks>
/// <param name="html">The HTML to strip; may be <c>null</c>.</param>
/// <returns>
/// <c>null</c> when <paramref name="html"/> is <c>null</c>; otherwise the text content
/// of the parsed document body (an empty string if the document has no body).
/// </returns>
[return: NotNullIfNotNull(nameof(html))]
public string? StripHtml(in string? html)
{
    if (html is null)
    {
        return null;
    }

    var parser = new HtmlParser();
    var document = parser.ParseDocument(html);

    // TextContent is returned as-is, with no further HtmlDecode pass: the HTML parser
    // has already resolved character references while building the DOM. Decoding a
    // second time would turn escaped literal text such as "&amp;lt;b&amp;gt;" into
    // "<b>", corrupting the text and re-introducing markup into the "stripped" result.
    //
    // Body is declared nullable, so fall back to an empty string instead of
    // suppressing the warning; this keeps the NotNullIfNotNull contract intact.
    return document.Body?.TextContent ?? string.Empty;
}

// -----------------------------------------------------------------------------
// A few simple unit tests
// -----------------------------------------------------------------------------

/// <summary>
/// Unit tests describing how <c>StripHtml</c> treats well-formed markup,
/// malformed markup, and text that merely contains an angle bracket.
/// </summary>
public class StripHtmlTests
{
    /// <summary>The text inside a well-formed element is kept once the tags are removed.</summary>
    [Fact]
    public void Text_Content_Is_Preserved()
    {
        var stripped = StripHtml("<p>hello world</p>");

        Assert.Equal("hello world", stripped);
    }

    /// <summary>
    /// An opening tag that never gets its closing bracket consumes the text that
    /// follows it, so nothing is left over as content.
    /// </summary>
    [Fact]
    public void Missing_Closing_Bracket_Assumes_Is_Part_of_Tag()
    {
        var stripped = StripHtml("<phello world</p>");

        Assert.Equal("", stripped);
    }

    /// <summary>A stray bracket that is not part of any tag is left untouched.</summary>
    [Fact]
    public void Leaves_Lonely_Brackets_Alone()
    {
        var stripped = StripHtml("hello > world");

        Assert.Equal("hello > world", stripped);
    }
}