Skip to main content

A C# Punycode encoding helper class. Punycode is a special encoding used to convert Unicode characters to ASCII, which is a smaller, restricted character set. Punycode is used to encode internationalized domain names (IDN).

namespace AngleSharp.Text
{
    using System;
    using System.Collections.Generic;
    using System.Text;

    /// <summary>
    /// Represents a Punycode encoding helper class.
    /// </summary>
    public static class Punycode
    {
        #region Constants

        // Radix of the variable-length integers written by Encode; also the
        // step by which the digit position k advances there and in AdaptChar.
        private const Int32 PunycodeBase = 36;
        // Lower clamp of the per-digit threshold t computed in Encode.
        private const Int32 Tmin = 1;
        // Upper clamp of the per-digit threshold t computed in Encode.
        private const Int32 Tmax = 26;

        // Prefix Encode writes in front of a label that contains non-basic code points.
        private static readonly String acePrefix = "xn--";
        // Characters Encode and IsDot accept as label separators; Encode always
        // writes the first entry ('.') to the output.
        private static readonly Char[] possibleDots = { '.', '\u3002', '\uFF0E', '\uFF61' };

        /// <summary>
        /// A list of available punycode character mappings. Every key is
        /// mapped to the basic (ASCII) character that stands in for it,
        /// e.g., the ideographic full stop to '.', superscript and subscript
        /// digits to plain digits, and superscript letters to plain letters.
        /// </summary>
        /// <remarks>
        /// The Encode method of this class does not appear to read this
        /// table; it is presumably applied by callers before encoding.
        /// NOTE(review): the field is public, static and not readonly, so
        /// any caller can modify or replace the table - confirm that this
        /// is intended.
        /// </remarks>
        public static IDictionary<Char, Char> Symbols = new Dictionary<Char, Char>
        {
            { '。', '.' },
            // NOTE(review): the keys of the next nine entries render as plain
            // ASCII characters here; presumably they are full-width forms in
            // the original file - verify the file encoding before editing.
            { '.', '.' },
            { 'G', 'g' },
            { 'o', 'o' },
            { 'c', 'c' },
            { 'X', 'x' },
            { '0', '0' },
            { '1', '1' },
            { '2', '2' },
            { '5', '5' },
            // Superscript digits
            { '⁰', '0' },
            { '¹', '1' },
            { '²', '2' },
            { '³', '3' },
            { '⁴', '4' },
            { '⁵', '5' },
            { '⁶', '6' },
            { '⁷', '7' },
            { '⁸', '8' },
            { '⁹', '9' },
            // Subscript digits
            { '₀', '0' },
            { '₁', '1' },
            { '₂', '2' },
            { '₃', '3' },
            { '₄', '4' },
            { '₅', '5' },
            { '₆', '6' },
            { '₇', '7' },
            { '₈', '8' },
            { '₉', '9' },
            // Superscript small letters (there is no entry for 'q')
            { 'ᵃ', 'a' },
            { 'ᵇ', 'b' },
            { 'ᶜ', 'c' },
            { 'ᵈ', 'd' },
            { 'ᵉ', 'e' },
            { 'ᶠ', 'f' },
            { 'ᵍ', 'g' },
            { 'ʰ', 'h' },
            { 'ⁱ', 'i' },
            { 'ʲ', 'j' },
            { 'ᵏ', 'k' },
            { 'ˡ', 'l' },
            { 'ᵐ', 'm' },
            { 'ⁿ', 'n' },
            { 'ᵒ', 'o' },
            { 'ᵖ', 'p' },
            { 'ʳ', 'r' },
            { 'ˢ', 's' },
            { 'ᵗ', 't' },
            { 'ᵘ', 'u' },
            { 'ᵛ', 'v' },
            { 'ʷ', 'w' },
            { 'ˣ', 'x' },
            { 'ʸ', 'y' },
            { 'ᶻ', 'z' },
            // Superscript capital letters; these keep their upper case
            { 'ᴬ', 'A' },
            { 'ᴮ', 'B' },
            { 'ᴰ', 'D' },
            { 'ᴱ', 'E' },
            { 'ᴳ', 'G' },
            { 'ᴴ', 'H' },
            { 'ᴵ', 'I' },
            { 'ᴶ', 'J' },
            { 'ᴷ', 'K' },
            { 'ᴸ', 'L' },
            { 'ᴹ', 'M' },
            { 'ᴺ', 'N' },
            { 'ᴼ', 'O' },
            { 'ᴾ', 'P' },
            { 'ᴿ', 'R' },
            { 'ᵀ', 'T' },
            { 'ᵁ', 'U' },
            { 'ⱽ', 'V' },
            { 'ᵂ', 'W' },
        };

        #endregion

        #region Methods

        /// <summary>
        /// Encodes the given text (e.g., a host name) using Punycode. The
        /// text is split into labels at any of the recognized dot characters
        /// and every label is converted on its own: basic (ASCII) code
        /// points are lower-cased, and a label that contains any other code
        /// point is written as "xn--" followed by its Punycode form.
        /// </summary>
        /// <param name="text">The text to encode.</param>
        /// <returns>
        /// The encoded text with '.' as the only separator, cut off after
        /// 255 characters (254 if the text does not end with a dot).
        /// Encoding stops silently, returning what was encoded so far, at
        /// the first empty label and at the first label that starts with
        /// "xn--" but contains non-basic code points.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// The text is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// An encoded label is longer than 63 characters, a label is too
        /// long to be encoded without overflow, or the text contains an
        /// unpaired surrogate.
        /// </exception>
        public static String Encode(String text)
        {
            const Int32 InitialBias = 72;
            const Int32 InitialNumber = 0x80;
            const Int32 MaxIntValue = 0x7ffffff;
            const Int32 LabelLimit = 63;
            const Int32 DefaultNameLimit = 255;

            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            // An empty string has no labels, hand it back unchanged
            if (text.Length == 0)
            {
                return text;
            }

            var output = new StringBuilder(text.Length);
            var iNextDot = 0;
            var iAfterLastDot = 0;
            var iOutputAfterLastDot = 0;

            // Handle one label (the text between two dots) per iteration
            while (iNextDot < text.Length)
            {
                // Find end of this segment
                iNextDot = text.IndexOfAny(possibleDots, iAfterLastDot);

                if (iNextDot < 0)
                {
                    iNextDot = text.Length;
                }

                // An empty label is only expected after a trailing dot
                // (www.microsoft.com.); wherever it occurs, encoding stops
                if (iNextDot == iAfterLastDot)
                {
                    break;
                }

                // Assume that an ACE prefix is needed; it is taken out again
                // below if the label turns out to be purely basic
                output.Append(acePrefix);

                var numProcessed = 0;

                // Copy all basic code points (lower-cased) to the output
                for (var i = iAfterLastDot; i < iNextDot; i++)
                {
                    if (text[i] < 0x80)
                    {
                        output.Append(EncodeBasic(text[i]));
                        numProcessed++;
                    }
                    else if (Char.IsSurrogatePair(text, i))
                    {
                        // Step over the low surrogate as well
                        i++;
                    }
                }

                var numBasicCodePoints = numProcessed;

                if (numBasicCodePoints == iNextDot - iAfterLastDot)
                {
                    // Nothing but basic code points, no ACE prefix required
                    output.Remove(iOutputAfterLastDot, acePrefix.Length);
                }
                else
                {
                    // If it has some non-basic code points the input cannot start with xn--
                    if (text.Length - iAfterLastDot >= acePrefix.Length && text.Substring(iAfterLastDot, acePrefix.Length).Equals(acePrefix, StringComparison.OrdinalIgnoreCase))
                    {
                        // Discard the prefix and the basic code points that
                        // were already written for this label; otherwise the
                        // result would end in a half-encoded fragment
                        output.Length = iOutputAfterLastDot;
                        break;
                    }

                    // Need to do ACE encoding
                    var numSurrogatePairs = 0;

                    // Add a delimiter (-) if we had any basic code points (between basic and encoded pieces)
                    if (numBasicCodePoints > 0)
                    {
                        output.Append(Text.Symbols.Minus);
                    }

                    // Initialize the state
                    var n = InitialNumber;
                    var delta = 0;
                    var bias = InitialBias;

                    // Main loop: every pass encodes all occurrences of the
                    // smallest code point that has not been handled yet
                    while (numProcessed < (iNextDot - iAfterLastDot))
                    {
                        var j = iAfterLastDot;
                        var m = MaxIntValue;
                        var test = 0;

                        // Find the smallest code point that is >= n
                        for (; j < iNextDot; j += IsSupplementary(test) ? 2 : 1)
                        {
                            test = Char.ConvertToUtf32(text, j);

                            if (test >= n && test < m)
                            {
                                m = test;
                            }
                        }

                        // Number of code points (not UTF-16 units) handled so far
                        var handledPoints = numProcessed - numSurrogatePairs;

                        // Guard against overflow. This can only trigger for
                        // labels with far more code points than would fit
                        // into the label limit that is checked below.
                        if (m - n > (MaxIntValue - delta) / (handledPoints + 1))
                        {
                            throw new ArgumentException("The text contains a label that is too long to be encoded.", "text");
                        }

                        // Increase delta enough to advance the decoder's
                        // <n,i> state to <m,0>
                        delta += (m - n) * (handledPoints + 1);
                        n = m;

                        for (j = iAfterLastDot; j < iNextDot; j += IsSupplementary(test) ? 2 : 1)
                        {
                            // Make sure we're aware of surrogates
                            test = Char.ConvertToUtf32(text, j);

                            // Code points below n are in the decoder's string
                            // already, so each of them shifts the position
                            if (test < n)
                            {
                                delta++;
                            }
                            else if (test == n)
                            {
                                // Represent delta as a generalized variable-length integer:
                                int q, k;

                                for (q = delta, k = PunycodeBase; ; k += PunycodeBase)
                                {
                                    var t = k <= bias ? Tmin : k >= bias + Tmax ? Tmax : k - bias;

                                    if (q < t)
                                    {
                                        break;
                                    }

                                    output.Append(EncodeDigit(t + (q - t) % (PunycodeBase - t)));
                                    q = (q - t) / (PunycodeBase - t);
                                }

                                output.Append(EncodeDigit(q));
                                bias = AdaptChar(delta, (numProcessed - numSurrogatePairs) + 1, numProcessed == numBasicCodePoints);
                                delta = 0;
                                numProcessed++;

                                // A supplementary code point occupies two chars of the input
                                if (IsSupplementary(m))
                                {
                                    numProcessed++;
                                    numSurrogatePairs++;
                                }
                            }
                        }

                        ++delta;
                        ++n;
                    }
                }

                // Make sure its not too big
                if (output.Length - iOutputAfterLastDot > LabelLimit)
                {
                    throw new ArgumentException("The text contains a label that exceeds 63 characters once encoded.", "text");
                }

                // Done with this segment, add dot if necessary
                if (iNextDot != text.Length)
                {
                    output.Append(possibleDots[0]);
                }

                iAfterLastDot = iNextDot + 1;
                iOutputAfterLastDot = output.Length;
            }

            // A name without a trailing dot gets one character less
            var rest = IsDot(text[text.Length - 1]) ? 0 : 1;
            var maxlength = DefaultNameLimit - rest;

            // Cut off (rather than reject) everything beyond the name limit
            if (output.Length > maxlength)
            {
                output.Remove(maxlength, output.Length - maxlength);
            }

            return output.ToString();
        }

        #endregion

        #region Helpers

        /// <summary>
        /// Checks if the given code point lies beyond U+FFFF. Encode advances
        /// by two chars instead of one over such a code point.
        /// </summary>
        private static Boolean IsSupplementary(Int32 test)
        {
            const Int32 LastSingleUnitCodePoint = 0xFFFF;

            return test > LastSingleUnitCodePoint;
        }

        /// <summary>
        /// Checks if the given character is one of the characters listed
        /// in possibleDots.
        /// </summary>
        private static Boolean IsDot(Char c)
        {
            foreach (var dot in possibleDots)
            {
                if (dot == c)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Converts a digit value to the character that represents it:
        /// values up to 25 are offset from 'a', larger values are offset
        /// from '0' (so that 26 becomes '0').
        /// </summary>
        private static Char EncodeDigit(Int32 digit)
        {
            const Int32 LetterCount = 26;

            // 0-25 map to a-z
            if (digit < LetterCount)
            {
                return (Char)('a' + digit);
            }

            // 26-35 map to 0-9
            return (Char)('0' + (digit - LetterCount));
        }

        /// <summary>
        /// Returns the given character shifted by the distance between 'A'
        /// and 'a' if Char.IsUpper reports it as upper case; any other
        /// character is returned as it is.
        /// </summary>
        private static Char EncodeBasic(Char character)
        {
            if (!Char.IsUpper(character))
            {
                return character;
            }

            return (Char)(character + ('a' - 'A'));
        }

        /// <summary>
        /// Computes the bias that Encode uses for the next delta.
        /// </summary>
        /// <param name="delta">The delta that has just been written.</param>
        /// <param name="numPoints">The divisor applied to the damped delta; Encode passes the number of code points handled so far plus one.</param>
        /// <param name="firstTime">True to damp the delta by 700, false to halve it.</param>
        /// <returns>The new bias.</returns>
        private static Int32 AdaptChar(Int32 delta, Int32 numPoints, Boolean firstTime)
        {
            const Int32 Skew = 38;
            const Int32 Damp = 700;
            const Int32 DigitRange = PunycodeBase - Tmin;
            const Int32 Threshold = (DigitRange * Tmax) / 2;

            // Damp the delta, then grow it in proportion to the string length
            var scaled = delta / (firstTime ? Damp : 2);
            scaled += scaled / numPoints;

            var offset = 0;

            // Shrink the delta until it is within the threshold, counting
            // one base step per division
            while (scaled > Threshold)
            {
                scaled /= DigitRange;
                offset += PunycodeBase;
            }

            return offset + PunycodeBase * scaled / (scaled + Skew);
        }

        #endregion
    }
}