C# method to strip HTML tags from a string leaving its contents in place, safely and accurately.
using System.Diagnostics.CodeAnalysis;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using Ganss.Xss;
/// <summary>
/// Removes HTML tags from a string while leaving the text between the tags in place.
/// Uses the AngleSharp package (also a dependency of Ganss.Xss.HtmlSanitizer) to parse the markup.
/// </summary>
/// <param name="html">The HTML fragment or document to strip; may be <c>null</c>.</param>
/// <returns>
/// <c>null</c> when <paramref name="html"/> is <c>null</c>; otherwise the text content of the
/// parsed document's body, with character references (e.g. <c>&amp;lt;</c>) decoded exactly once.
/// </returns>
/// <remarks>
/// See https://code-maze.com/csharp-remove-html-tags-from-a-string/
/// The result is plain text, not markup: encode it again before embedding it in HTML.
/// </remarks>
[return: NotNullIfNotNull(nameof(html))]
public string? StripHtml(in string? html)
{
    if (html is null)
    {
        return null;
    }

    // hat tip: https://code-maze.com/csharp-remove-html-tags-from-a-string/
    var parser = new HtmlParser();
    var document = parser.ParseDocument(html);

    // TextContent already holds the text with character references decoded by the parser.
    // Running it through HtmlDecode as well would decode a second time and turn escaped
    // text such as "&amp;lt;b&amp;gt;" (displayed as "&lt;b&gt;") into "<b>".
    // Fall back to an empty string instead of dereferencing a missing body element.
    return document.Body?.TextContent ?? string.Empty;
}
// -----------------------------------------------------------------------------
// A few simple unit tests
// -----------------------------------------------------------------------------
/// <summary>
/// Unit tests pinning the observable behaviour of <c>StripHtml</c> for well-formed markup,
/// a malformed opening tag, and a stray angle bracket.
/// </summary>
public class StripHtmlTests
{
    /// <summary>Text wrapped in a well-formed element is returned without the tags.</summary>
    [Fact]
    public void Text_Content_Is_Preserved()
    {
        var stripped = StripHtml("<p>hello world</p>");

        Assert.Equal("hello world", stripped);
    }

    /// <summary>
    /// An opening tag that never closes its bracket swallows the following text,
    /// so nothing is left over as content.
    /// </summary>
    [Fact]
    public void Missing_Closing_Bracket_Assumes_Is_Part_of_Tag()
    {
        var stripped = StripHtml("<phello world</p>");

        Assert.Equal("", stripped);
    }

    /// <summary>A lone '>' that is not part of any tag is kept verbatim.</summary>
    [Fact]
    public void Leaves_Lonely_Brackets_Alone()
    {
        var stripped = StripHtml("hello > world");

        Assert.Equal("hello > world", stripped);
    }
}