Skip to main content

C# class to convert the textual content of any PDF file to TXT or HTML format using the iTextSharp PDF library.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

//
// ASP.NET - Convert PDF to TXT (Plain-Text) or HTML in C# with iTextSharp
//
// A practical C# class to convert the textual content of any PDF file to TXT
// or HTML format with the iTextSharp PDF library for ASP.NET.
//
// Requires iTextSharp (https://www.nuget.org/packages/iTextSharp/)
//
// https://www.ryadel.com/convertire-pdf-txt-plain-text-testo-html-c-sharp-itextsharp-itext-asp-net/
//

namespace PDF
{
    /// <summary>
    /// Extracts the textual content of a PDF file, either as plain text or as
    /// minimal HTML, using the iTextSharp parser.
    /// </summary>
    public static class PDFParser
    {
        /// <summary>
        /// Extracts the text of every page of a PDF file.
        /// </summary>
        /// <remarks>
        /// Page texts are concatenated without any separator. A page whose extracted
        /// text is identical to that of the page immediately before it is skipped.
        /// Errors raised while opening or parsing the file propagate to the caller.
        /// </remarks>
        /// <param name="filePath">the full path to the pdf file.</param>
        /// <returns>the extracted text</returns>
        /// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null.</exception>
        public static string GetText(string filePath)
        {
            if (filePath == null)
            {
                throw new ArgumentNullException("filePath");
            }

            var sb = new StringBuilder();

            // No try/catch here on purpose: the previous "catch (Exception e) { throw e; }"
            // handled nothing and only reset the stack trace of the original error.
            // The using block disposes the reader, so no explicit Close() call is needed.
            using (PdfReader reader = new PdfReader(filePath))
            {
                string prevPage = "";
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // A new extraction strategy instance is used for each page.
                    ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
                    string s = PdfTextExtractor.GetTextFromPage(reader, page, its);

                    // Skip a page whose text equals the previous page's text.
                    if (prevPage != s)
                    {
                        sb.Append(s);
                    }

                    prevPage = s;
                }
            }

            return sb.ToString();
        }

        /// <summary>
        /// Extracts the text of a PDF file and wraps each line of it in an HTML paragraph.
        /// </summary>
        /// <remarks>
        /// The text is obtained from <see cref="GetText(string)"/> and split on '\n'.
        /// Each line is HTML-encoded before being wrapped, so characters such as
        /// '&lt;' and '&amp;' coming from the PDF cannot inject markup into the output.
        /// </remarks>
        /// <param name="sourceFilePath">the full path to the pdf file.</param>
        /// <returns>the extracted text as a sequence of &lt;p&gt; elements, one per line</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sourceFilePath"/> is null.</exception>
        public static string GetHTMLText(string sourceFilePath)
        {
            string txt = GetText(sourceFilePath);
            var sb = new StringBuilder();

            foreach (string line in txt.Split('\n'))
            {
                // PDF content is untrusted input: encode it before embedding it in markup.
                sb.Append("<p>")
                  .Append(System.Net.WebUtility.HtmlEncode(line))
                  .Append("</p>");
            }

            return sb.ToString();
        }
    }
}