// C# class to convert the textual content of any PDF file to TXT or HTML format using the iTextSharp PDF library.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
//
// ASP.NET - Convert PDF to TXT (plain text) or HTML in C# with iTextSharp
//
// A practical C# class to convert the textual content of any PDF file to TXT
// or HTML format with the iTextSharp PDF library for ASP.NET.
//
// Requires iTextSharp (https://www.nuget.org/packages/iTextSharp/)
//
// https://www.ryadel.com/convertire-pdf-txt-plain-text-testo-html-c-sharp-itextsharp-itext-asp-net/
//
namespace PDF
{
/// <summary>
/// Parses a PDF file and extracts its text, either as plain text or as
/// minimal HTML (one <c>&lt;p&gt;</c> element per extracted line).
/// </summary>
public static class PDFParser
{
    /// <summary>
    /// Extracts the text of every page of a PDF file.
    /// </summary>
    /// <param name="filePath">the full path to the pdf file.</param>
    /// <returns>
    /// the extracted text; the page texts are appended in page order with no
    /// separator added between them.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null.</exception>
    public static string GetText(string filePath)
    {
        if (filePath == null)
        {
            throw new ArgumentNullException(nameof(filePath));
        }

        var sb = new StringBuilder();

        // Deliberately no try/catch: the previous `catch (Exception e) { throw e; }`
        // added nothing except resetting the stack trace of the original error.
        // Exceptions raised while reading the file now propagate untouched.
        // The using block disposes the reader, so no explicit Close() call is needed.
        using (PdfReader reader = new PdfReader(filePath))
        {
            string prevPage = "";
            for (int page = 1; page <= reader.NumberOfPages; page++)
            {
                // A fresh strategy per page so no text carries over between pages.
                ITextExtractionStrategy its = new SimpleTextExtractionStrategy();
                string pageText = PdfTextExtractor.GetTextFromPage(reader, page, its);

                // Skip a page whose text is identical to the immediately preceding page.
                if (prevPage != pageText)
                {
                    sb.Append(pageText);
                }
                prevPage = pageText;
            }
        }

        return sb.ToString();
    }

    /// <summary>
    /// Extracts the text of a PDF file and wraps every line in an HTML paragraph.
    /// </summary>
    /// <param name="sourceFilePath">the full path to the pdf file.</param>
    /// <returns>
    /// the concatenation of one <c>&lt;p&gt;...&lt;/p&gt;</c> element per
    /// '\n'-separated line of the extracted text, with the line content HTML-encoded.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="sourceFilePath"/> is null.</exception>
    public static string GetHTMLText(string sourceFilePath)
    {
        if (sourceFilePath == null)
        {
            throw new ArgumentNullException(nameof(sourceFilePath));
        }

        string txt = GetText(sourceFilePath);
        var sb = new StringBuilder();

        foreach (string line in txt.Split('\n'))
        {
            // The text comes from an arbitrary document: encode it so characters
            // such as '<', '>' and '&' are rendered literally instead of being
            // interpreted as markup.
            sb.Append("<p>")
              .Append(System.Net.WebUtility.HtmlEncode(line))
              .Append("</p>");
        }

        return sb.ToString();
    }
}
}