Python script to convert plain text files into PDF files.
#!/usr/bin/env python
"""
pyText2Pdf - Python script to convert plain text files into Adobe
Acrobat PDF files with support for arbitrary page breaks etc.
Version 2.0
Author: Anand B Pillai <abpillai at gmail dot com>
Derived from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/189858
"""
import io
import optparse
import os
import re
import string
import sys
import time
# Extra bytes added to the tracked output offset for every "\n" written
# (see PyText2Pdf.writestr); 0 means a newline is counted as one byte.
LF_EXTRA = 0
# Terminator appended to each cross-reference table entry (octal 015 is a
# carriage return).
LINE_END = '\015'
# form feed character (^L)
FF = chr(12)
# PDF /Encoding dictionary text that writeheader() emits into the font object
# when the ISO Latin-1 flag is set. The /Differences array starts at code 0
# and lists one glyph name per character code.
ENCODING_STR = """\
/Encoding <<
/Differences [ 0 /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /space /exclam
/quotedbl /numbersign /dollar /percent /ampersand
/quoteright /parenleft /parenright /asterisk /plus /comma
/hyphen /period /slash /zero /one /two /three /four /five
/six /seven /eight /nine /colon /semicolon /less /equal
/greater /question /at /A /B /C /D /E /F /G /H /I /J /K /L
/M /N /O /P /Q /R /S /T /U /V /W /X /Y /Z /bracketleft
/backslash /bracketright /asciicircum /underscore
/quoteleft /a /b /c /d /e /f /g /h /i /j /k /l /m /n /o /p
/q /r /s /t /u /v /w /x /y /z /braceleft /bar /braceright
/asciitilde /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
/dotlessi /grave /acute /circumflex /tilde /macron /breve
/dotaccent /dieresis /.notdef /ring /cedilla /.notdef
/hungarumlaut /ogonek /caron /space /exclamdown /cent
/sterling /currency /yen /brokenbar /section /dieresis
/copyright /ordfeminine /guillemotleft /logicalnot /hyphen
/registered /macron /degree /plusminus /twosuperior
/threesuperior /acute /mu /paragraph /periodcentered
/cedilla /onesuperior /ordmasculine /guillemotright
/onequarter /onehalf /threequarters /questiondown /Agrave
/Aacute /Acircumflex /Atilde /Adieresis /Aring /AE
/Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave
/Iacute /Icircumflex /Idieresis /Eth /Ntilde /Ograve
/Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash
/Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn
/germandbls /agrave /aacute /acircumflex /atilde /adieresis
/aring /ae /ccedilla /egrave /eacute /ecircumflex
/edieresis /igrave /iacute /icircumflex /idieresis /eth
/ntilde /ograve /oacute /ocircumflex /otilde /odieresis
/divide /oslash /ugrave /uacute /ucircumflex /udieresis
/yacute /thorn /ydieresis ]
>>
"""
# Usage text handed to optparse.OptionParser(usage=...) in
# PyText2Pdf.parse_args; optparse substitutes the program name for %prog.
INTRO = """\
%prog [options] filename
PyText2Pdf makes a 7-bit clean PDF file from any input file.
It reads from a named file, and writes the PDF file to a file specified by
the user, otherwise to a file with '.pdf' appended to the input file.
Author: Anand B Pillai."""
class PyText2Pdf(object):
    """Text2pdf converter in pure Python.

    Typical use: create an instance, call parse_args() to configure it from
    the command line, then call convert() to write the PDF.

    The PDF is produced sequentially through writestr(), which also tracks
    the byte offset of everything written so that the cross-reference table
    can be emitted at the end. Objects 1-5 are fixed (info, catalog, page
    tree, font, resources); each page then adds three objects (page,
    content stream, stream length).
    """

    def __init__(self):
        """Set up the defaults: letter-size page, 10pt Courier, one column."""
        # version number (embedded in the Creator/Producer strings)
        self._version = "1.3"
        # iso encoding flag
        self._IsoEnc = False
        # formfeeds flag: when set, a form feed in the input ends the page
        self._doFFs = False
        self._progname = "PyText2Pdf"
        self._appname = " ".join((self._progname, str(self._version)))
        # default font (PDF name syntax, hence the leading slash)
        self._font = "/Courier"
        # default font size
        self._ptSize = 10
        # default vert space
        self._vertSpace = 12
        # lines per page; 0 means "derive from the page height" in convert()
        self._lines = 0
        # number of characters in a row
        self._cols = 80
        # number of text columns per page
        self._columns = 1
        # page ht
        self._pageHt = 792
        # page wd
        self._pageWd = 612
        # input file
        self._ifile = ""
        # output file
        self._ofile = ""
        # default tab width
        self._tab = 4
        # input file object
        self._ifs = None
        # output file object
        self._ofs = None
        # landscape flag
        self._landscape = False
        # Subject
        self._subject = ''
        # Author
        self._author = ''
        # Keywords
        self._keywords = []
        # Custom regexp for page breaks
        self._pagebreakre = None
        # number of the most recently allocated PDF object
        self._curobj = 5
        # object number of each page, indexed by page number (index 0 unused)
        self._pageObs = [0]
        # byte offset of each object, indexed by object number (index 0 unused)
        self._locations = [0, 0, 0, 0, 0, 0]
        self._pageNo = 0
        # file position marker
        self._fpos = 0
        # single input character pushed back by _unreadch(), if any
        self._pushback = None

    def parse_args(self):
        """Configure the converter from sys.argv.

        Appends '-h' when no arguments were given so that the usage text is
        shown. Exits the process via sys.exit() when the input file argument
        is missing, when more than one positional argument is given, or when
        the page-break regular expression does not compile. Numeric options
        are validated by optparse and clamped to their minimum values.
        """
        if len(sys.argv) < 2:
            sys.argv.append('-h')
        parser = optparse.OptionParser(usage=INTRO)
        add = parser.add_option
        add('-o', '--output', dest='outfile', metavar='OUTFILE',
            help='Direct output to file OUTFILE')
        add('-f', '--font', dest='font', default='Courier',
            help='Use Postscript font FONT (must be in standard 14, default: Courier)')
        add('-I', '--isolatin', dest='isolatin', default=False,
            action='store_true', help='Use ISO latin-1 encoding')
        add('-s', '--size', dest='fontsize', type='int', default=10,
            metavar='PTSIZE', help='Use font at PTSIZE points (default=>10)')
        add('-v', '--linespace', dest='linespace', type='int', default=12,
            metavar='LINESPACE',
            help='Use line spacing LINESPACE (default 12)')
        # No default here: an unspecified value keeps self._lines at 0 so that
        # convert() derives the line count from the page height, as the help
        # text promises.
        add('-l', '--lines', dest='lines', type='int', default=None,
            help='Lines per page (default 60, determined automatically if unspecified)')
        add('-c', '--chars', dest='chars', type='int', default=80,
            help='Maximum characters per line (default 80)')
        add('-t', '--tab', dest='tabspace', type='int', default=4,
            help='Spaces per tab character (default 4)')
        add('-F', '--ignoreff', dest='formfeed', default=False,
            action='store_true',
            help='Ignore formfeed character ^L (i.e, accept formfeed characters as pagebreaks)')
        add('-P', '--papersize', dest='papersize',
            help='Set paper size (default is letter, accepted values are "A4" or "A3")')
        # No defaults for -W/-H either, otherwise they would always override
        # the dimensions selected through --papersize.
        add('-W', '--width', dest='width', type='int', default=None,
            help='Independent paper width in points')
        add('-H', '--height', dest='height', type='int', default=None,
            help='Independent paper height in points')
        add('-2', '--twocolumns', dest='twocolumns', default=False,
            action='store_true', help='Format as two columns')
        add('-L', '--landscape', dest='landscape', default=False,
            action='store_true', help='Format in landscape mode')
        add('-R', '--regexp', dest='pageregexp',
            help='Regular expression string to determine page breaks (if supplied, this will be used to split text into pages, instead of using line count)')
        add('-S', '--subject', dest='subject',
            help='Optional subject for the document')
        add('-A', '--author', dest='author',
            help='Optional author for the document')
        add('-K', '--keywords', dest='keywords',
            help='Optional list of keywords for the document (separated by commas)')
        optlist, args = parser.parse_args()

        if len(args) == 0:
            sys.exit('Error: input file argument missing')
        elif len(args) > 1:
            sys.exit('Error: Too many arguments')
        self._ifile = args[0]

        if optlist.isolatin:
            self._IsoEnc = True
        if optlist.formfeed:
            self._doFFs = True
        if optlist.twocolumns:
            self._columns = 2
        if optlist.landscape:
            self._landscape = True
        self._font = '/' + optlist.font

        # Named paper size first; explicit -W/-H then override it.
        if optlist.papersize == 'A4':
            self._pageWd, self._pageHt = 595, 842
        elif optlist.papersize == 'A3':
            self._pageWd, self._pageHt = 842, 1190
        if optlist.width is not None:
            self._pageWd = max(72, optlist.width)
        if optlist.height is not None:
            self._pageHt = max(72, optlist.height)

        self._ptSize = max(1, optlist.fontsize)
        self._vertSpace = max(1, optlist.linespace)
        if optlist.lines is not None:
            self._lines = max(1, optlist.lines)
        self._cols = max(4, optlist.chars)
        self._tab = max(1, optlist.tabspace)

        # Very optional args
        if optlist.author:
            self._author = optlist.author
        if optlist.subject:
            self._subject = optlist.subject
        if optlist.keywords:
            self._keywords = optlist.keywords.split(',')
        if optlist.pageregexp:
            try:
                self._pagebreakre = re.compile(
                    optlist.pageregexp, re.UNICODE | re.IGNORECASE)
            except re.error as e:
                sys.exit('Error: invalid page break regexp: %s' % e)
        if optlist.outfile:
            self._ofile = optlist.outfile

        if self._landscape:
            print('Landscape option on...')
        if self._columns == 2:
            print('Printing in two columns...')
        if self._doFFs:
            print('Ignoring form feed character...')
        if self._IsoEnc:
            print('Using ISO Latin Encoding...')
        print('Using font %s size = %s' % (self._font[1:], self._ptSize))

    @staticmethod
    def _pdf_escape(text):
        """Return text with backslash and parentheses escaped for a PDF literal string."""
        return (text.replace('\\', '\\\\')
                    .replace('(', '\\(')
                    .replace(')', '\\)'))

    def _readch(self):
        """Return the next input character ('' at end of file)."""
        if self._pushback is not None:
            ch = self._pushback
            self._pushback = None
            return ch
        return self._ifs.read(1)

    def _unreadch(self, ch):
        """Push one character back so the next _readch() returns it."""
        self._pushback = ch

    def writestr(self, str):
        """Write a string to the output file and advance the file position.

        All output goes through this function so that self._fpos always holds
        the byte offset of the next write. Text that is not already a byte
        string is encoded as Latin-1 (unencodable characters become '?').

        Returns 0 on success, or -1 if the write raised IOError (the error is
        printed). The position is advanced even when the write fails.
        """
        # NOTE: the parameter name shadows the builtin; it is kept because it
        # is part of the method's existing signature.
        if isinstance(str, bytes):
            data = str
        else:
            data = str.encode('latin-1', 'replace')
        # update current file position
        self._fpos += len(data) + LF_EXTRA * data.count(b'\n')
        try:
            self._ofs.write(data)
        except IOError as e:
            print(e)
            return -1
        return 0

    def convert(self):
        """Perform the actual conversion.

        Opens the input file (read as Latin-1 text so every byte maps to one
        character) and the output file, writes the whole PDF, and closes both
        files even if writing fails. When no output name was set, the input
        name with its extension replaced by '.pdf' is used.

        Returns 0. Calls sys.exit(3) if either file cannot be opened.
        """
        if self._landscape:
            # swap page width & height
            self._pageHt, self._pageWd = self._pageWd, self._pageHt
        if self._lines == 0:
            self._lines = (self._pageHt - 72) // self._vertSpace
        if self._lines < 1:
            self._lines = 1

        try:
            self._ifs = io.open(self._ifile, 'r', encoding='latin-1')
        except IOError:
            print('Error: Could not open file to read ---> %s' % self._ifile)
            sys.exit(3)
        self._pushback = None

        if self._ofile == "":
            self._ofile = os.path.splitext(self._ifile)[0] + '.pdf'
        try:
            self._ofs = open(self._ofile, 'wb')
        except IOError:
            self._ifs.close()
            print('Error: Could not open file to write ---> %s' % self._ofile)
            sys.exit(3)

        print('Input file=> %s' % self._ifile)
        print('Writing pdf file %s ...' % self._ofile)
        try:
            self.writeheader()
            self.writepages()
            self.writerest()
        finally:
            self._ifs.close()
            self._ofs.close()
        print('Wrote file %s' % self._ofile)
        return 0

    def writeheader(self):
        """Write the PDF header and the fixed objects 1, 2, 4 and 5.

        Object 1 is the document information dictionary, 2 the catalog,
        4 the font and 5 the resource dictionary. Object 3 (the page tree)
        is written by writerest() once the pages are known.
        """
        ws = self.writestr
        esc = self._pdf_escape
        title = self._ifile
        timestr = time.strftime("D:%Y%m%d%H%M%S", time.localtime())

        ws("%PDF-1.4\n")

        self._locations[1] = self._fpos
        ws("1 0 obj\n")
        ws("<<\n")
        ws("/Creator (%s By Anand B Pillai )\n" % self._appname)
        ws("/CreationDate (%s)\n" % timestr)
        ws("/Producer (%s(\\251 Anand B Pillai))\n" % self._appname)
        # User-supplied text is escaped so that parentheses or backslashes
        # (e.g. in a Windows path used as title) cannot break the string.
        if self._subject:
            title = self._subject
            ws("/Subject (%s)\n" % esc(self._subject))
        if self._author:
            ws("/Author (%s)\n" % esc(self._author))
        if self._keywords:
            ws("/Keywords (%s)\n" % esc(' '.join(self._keywords)))
        if title:
            ws("/Title (%s)\n" % esc(title))
        ws(">>\n")
        ws("endobj\n")

        self._locations[2] = self._fpos
        ws("2 0 obj\n")
        ws("<<\n")
        ws("/Type /Catalog\n")
        ws("/Pages 3 0 R\n")
        ws(">>\n")
        ws("endobj\n")

        self._locations[4] = self._fpos
        ws("4 0 obj\n")
        ws("<<\n")
        ws("/Type /Font\n")
        ws("/Subtype /Type1\n")
        ws("/Name /F1\n")
        ws("/BaseFont %s\n" % self._font)
        # Exactly one /Encoding entry, inside the font dictionary.
        if self._IsoEnc:
            ws(ENCODING_STR)
        else:
            ws("/Encoding /WinAnsiEncoding\n")
        ws(">>\n")
        ws("endobj\n")

        self._locations[5] = self._fpos
        ws("5 0 obj\n")
        ws("<<\n")
        ws(" /Font << /F1 4 0 R >>\n")
        ws(" /ProcSet [ /PDF /Text ]\n")
        ws(">>\n")
        ws("endobj\n")

    def startpage(self):
        """Start a page of data.

        Writes the page object and opens its content stream (font, start
        position and leading). Returns the file position at which the stream
        data begins, to be passed to endpage().
        """
        ws = self.writestr
        self._pageNo += 1

        # Page object. _locations and _pageObs grow by exactly one entry per
        # allocated object/page, so append() lands on the matching index.
        self._curobj += 1
        self._locations.append(self._fpos)
        self._pageObs.append(self._curobj)
        ws("%d 0 obj\n" % self._curobj)
        ws("<<\n")
        ws("/Type /Page\n")
        ws("/Parent 3 0 R\n")
        ws("/Resources 5 0 R\n")
        self._curobj += 1
        ws("/Contents %d 0 R\n" % self._curobj)
        ws(">>\n")
        ws("endobj\n")

        # Content stream object; its length lives in the following object,
        # which endpage() writes once the stream size is known.
        self._locations.append(self._fpos)
        ws("%d 0 obj\n" % self._curobj)
        ws("<<\n")
        ws("/Length %d 0 R\n" % (self._curobj + 1))
        ws(">>\n")
        ws("stream\n")
        strmPos = self._fpos

        ws("BT\n")
        ws("/F1 %s Tf\n" % self._ptSize)
        ws("1 0 0 1 50 %s Tm\n" % (self._pageHt - 40))
        ws("%s TL\n" % self._vertSpace)
        return strmPos

    def endpage(self, streamStart):
        """End a page of data.

        Closes the content stream opened by startpage() and writes the object
        holding the stream length. streamStart is the value startpage()
        returned.
        """
        ws = self.writestr
        ws("ET\n")
        streamEnd = self._fpos
        ws("endstream\n")
        ws("endobj\n")

        self._curobj += 1
        self._locations.append(self._fpos)
        ws("%d 0 obj\n" % self._curobj)
        ws("%d\n" % (streamEnd - streamStart))
        ws('endobj\n')

    def _writeline(self):
        """Write one output line as a PDF string followed by the ' operator.

        Reads at most self._cols characters, stopping early at a newline, at
        end of file, or at a form feed when form feeds act as page breaks.
        Returns (last_char_read, pagebreak) where pagebreak is True when the
        text of this line matched the page-break regexp.
        """
        ws = self.writestr
        linebuf = []
        pagebreak = False
        ch = ''
        ws("(")
        charNo = 0
        while charNo < self._cols:
            charNo += 1
            ch = self._readch()
            if ch == '' or ch == '\n' or (ch == FF and self._doFFs):
                # See if this line matches the pagebreak regexp
                if self._pagebreakre and self._pagebreakre.search(
                        ''.join(linebuf).strip()):
                    pagebreak = True
                break
            linebuf.append(ch)
            code = ord(ch)
            if 32 <= code <= 127:
                if ch in '()\\':
                    ws("\\")
                ws(ch)
            elif code == 9:
                # expand the tab to the next tab stop
                padding = self._tab - ((charNo - 1) % self._tab)
                ws(" " * padding)
                charNo += padding - 1
            elif ch != FF:
                # write \ooo form for dodgy character (keeps output 7-bit)
                ws("\\%03o" % code)
            else:
                # dont print anything for a FF
                charNo -= 1
        ws(")'\n")
        return ch, pagebreak

    def _check_eof(self, skip_ff):
        """Peek at the input and return True when it is exhausted.

        When skip_ff is true a single form feed at the current position is
        consumed first. Any other character read is pushed back.
        """
        ch = self._readch()
        if skip_ff and ch == FF:
            ch = self._readch()
        # python's EOF signature
        if ch == '':
            return True
        self._unreadch(ch)
        return False

    def writepages(self):
        """Write pages as PDF.

        Fills each page column by column with up to self._lines lines,
        starting a new column/page early on a form feed (when enabled) or on
        a line matching the page-break regexp, until the input is exhausted.
        """
        ws = self.writestr
        atEOF = False
        while not atEOF:
            beginstream = self.startpage()
            for column in range(1, self._columns + 1):
                atFF = False
                # Special flag for regexp page break
                pagebreak = False
                lineNo = 0
                while (lineNo < self._lines and not atFF
                       and not atEOF and not pagebreak):
                    lineNo += 1
                    ch, pagebreak = self._writeline()
                    if ch == FF:
                        atFF = True

                if lineNo == self._lines:
                    # Column is full: a form feed right here would only
                    # produce an empty page, so swallow it.
                    atEOF = self._check_eof(True)
                elif atFF:
                    atEOF = self._check_eof(False)

                if column < self._columns:
                    # move the text position to the top of the next column
                    ws("1 0 0 1 %s %s Tm\n" % (self._pageWd // 2 + 25,
                                               self._pageHt - 40))
            self.endpage(beginstream)

    def writerest(self):
        """Finish the file.

        Writes the page tree (object 3), the cross-reference table built from
        the recorded object offsets, and the trailer.
        """
        ws = self.writestr
        self._locations[3] = self._fpos
        ws("3 0 obj\n")
        ws("<<\n")
        ws("/Type /Pages\n")
        ws("/Count %d\n" % self._pageNo)
        ws("/MediaBox [ 0 0 %s %s ]\n" % (self._pageWd, self._pageHt))
        ws("/Kids [ ")
        for i in range(1, self._pageNo + 1):
            ws("%d 0 R " % self._pageObs[i])
        ws("]\n")
        ws(">>\n")
        ws("endobj\n")

        xref = self._fpos
        ws("xref\n")
        ws("0 %d\n" % (self._curobj + 1))
        ws("0000000000 65535 f " + LINE_END)
        for i in range(1, self._curobj + 1):
            ws("%010d 00000 n %s" % (self._locations[i], LINE_END))

        ws("trailer\n")
        ws("<<\n")
        ws("/Size %d\n" % (self._curobj + 1))
        ws("/Root 2 0 R\n")
        ws("/Info 1 0 R\n")
        ws(">>\n")
        ws("startxref\n")
        ws("%d\n" % xref)
        ws("%%EOF\n")
def main():
    """Build a converter, configure it from the command line and run it."""
    converter = PyText2Pdf()
    converter.parse_args()
    converter.convert()


if __name__ == "__main__":
    main()