reingart · Lucas-C · Aug 26, 2020 · Sep 5, 2020
diff --git a/fpdf/fpdf.py b/fpdf/fpdf.py
@@ -15,17 +15,18 @@
 
 from __future__ import division, with_statement
 
+from contextlib import contextmanager
 from datetime import datetime
 from functools import wraps
-import math
-import errno
-import os, sys, zlib, struct, re, tempfile, struct
+import errno, logging, math, os, re, struct, sys, tempfile, zlib
 
 from .ttfonts import TTFontFile
 from .fonts import fpdf_charwidths
 from .php import substr, sprintf, print_r, UTF8ToUTF16BE, UTF8StringToArray
 from .py3k import PY3K, pickle, urlopen, BytesIO, Image, basestring, unicode, exception, b, hashpath
 
+LOGGER = logging.getLogger(__name__)
+
 # Global variables
 FPDF_VERSION = '1.7.2'
 FPDF_FONT_DIR = os.path.join(os.path.dirname(__file__),'font')
@@ -64,7 +65,7 @@ def __init__(self, orientation = 'P', unit = 'mm', format = 'A4'):
         self.offsets = {}               # array of object offsets
         self.page = 0                   # current page number
         self.n = 2                      # current object number
-        self.buffer = ''                # buffer holding in-memory PDF
+        self.buffer = bytearray()       # buffer holding in-memory PDF
         self.pages = {}                 # array containing pages and metadata
         self.state = 0                  # current document state
         self.fonts = {}                 # array of used fonts
@@ -498,7 +499,6 @@ def add_font(self, family, style='', fname='', uni=False):
                 ttffilename = os.path.join(SYSTEM_TTFONTS, fname)
             else:
                 raise RuntimeError("TTF Font file not found: %s" % fname)
-            name = ''
             if FPDF_CACHE_MODE == 0:
                 unifilename = os.path.splitext(ttffilename)[0] + '.pkl'
             elif FPDF_CACHE_MODE == 2:
@@ -523,7 +523,7 @@ def add_font(self, family, style='', fname='', uni=False):
                     'ItalicAngle': int(ttf.italicAngle),
                     'StemV': int(round(ttf.stemV, 0)),
                     'MissingWidth': int(round(ttf.defaultWidth, 0)),
-                    }
+                }
                 # Generate metrics .pkl file
                 font_dict = {
                     'name': re.sub('[ ()]', '', ttf.fullName),
@@ -678,16 +678,16 @@ def text(self, x, y, txt=''):
         "Output a string"
         txt = self.normalize_text(txt)
         if (self.unifontsubset):
-            txt2 = self._escape(UTF8ToUTF16BE(txt, False))
+            txt2 = UTF8ToUTF16BE(self._escape(txt), False)
             for uni in UTF8StringToArray(txt):
                 self.current_font['subset'].append(uni)
         else:
             txt2 = self._escape(txt)
-        s=sprintf('BT %.2f %.2f Td (%s) Tj ET',x*self.k,(self.h-y)*self.k, txt2)
+        s=sprintf(b'BT %.2f %.2f Td (%s) Tj ET',x*self.k,(self.h-y)*self.k, txt2)
         if(self.underline and txt!=''):
-            s+=' '+self._dounderline(x,y,txt)
+            s+=b' '+self._dounderline(x,y,txt)
         if(self.color_flag):
-            s='q '+self.text_color+' '+s+' Q'
+            s=b'q '+self.text_color.encode()+b' '+s+b' Q'
         self._out(s)
 
     @check_page
@@ -1115,22 +1115,17 @@ def output(self, name='',dest=''):
                 dest='I'
             else:
                 dest='F'
-        if PY3K:
-            # manage binary data as latin1 until PEP461 or similar is implemented
-            buffer = self.buffer.encode("latin1")
-        else:
-            buffer = self.buffer
         if dest in ('I', 'D'):
             # Python < 3 writes byte data transparently without "buffer"
             stdout = getattr(sys.stdout, 'buffer', sys.stdout)
-            stdout.write(buffer)
+            stdout.write(self.buffer)
         elif dest=='F':
             #Save to local file
             with open(name,'wb') as f:
-                f.write(buffer)
+                f.write(self.buffer)
         elif dest=='S':
             #Return as a byte string
-            return buffer
+            return self.buffer
         else:
             self.error('Incorrect output destination: '+dest)
 
@@ -1172,8 +1167,8 @@ def _putpages(self):
             # Now repeat for no pages in non-subset fonts
             for n in range(1,nb + 1):
                 self.pages[n]["content"] = \
-                    self.pages[n]["content"].replace(self.str_alias_nb_pages,
-                        str(nb))
+                    self.pages[n]["content"].replace(self.str_alias_nb_pages.encode(),
+                        str(nb).encode())
         if self.def_orientation == 'P':
             dw_pt = self.dw_pt
             dh_pt = self.dh_pt
@@ -1206,6 +1201,7 @@ def _putpages(self):
                         annots += '/A <</S /URI /URI ' + \
                             self._textstring(pl[4]) + '>>>>'
                     else:
+                        assert pl[4] in self.links, f'Page {n} has a link with an invalid index: {pl[4]} (doc #links={len(self.links)})'
                         l = self.links[pl[4]]
                         if l[0] in self.orientation_changes:
                             h = w_pt
@@ -1222,9 +1218,7 @@ def _putpages(self):
             # Page content
             content = self.pages[n]["content"]
             if self.compress:
-                # manage binary data as latin1 until PEP461 or similar is implemented
-                p = content.encode("latin1") if PY3K else content
-                p = zlib.compress(p)
+                p = zlib.compress(content)
             else:
                 p = content
             self._newobj()
@@ -1633,15 +1627,18 @@ def _putresourcedict(self):
         self._out('>>')
 
     def _putresources(self):
-        self._putfonts()
-        self._putimages()
+        with self._trace_size('resources.fonts'):  # logs (DEBUG) how much self.buffer grew during _putfonts()
+            self._putfonts()
+        with self._trace_size('resources.images'):  # same measurement around _putimages()
+            self._putimages()
         #Resource dictionary
-        self.offsets[2]=len(self.buffer)
-        self._out('2 0 obj')
-        self._out('<<')
-        self._putresourcedict()
-        self._out('>>')
-        self._out('endobj')
+        with self._trace_size('resources.dict'):  # measures the '2 0 obj' ... 'endobj' lines written below
+            self.offsets[2]=len(self.buffer)  # current buffer length is recorded as the offset of object 2
+            self._out('2 0 obj')
+            self._out('<<')
+            self._putresourcedict()
+            self._out('>>')
+            self._out('endobj')
 
     def _putinfo(self):
         self._out('/Producer '+self._textstring('PyFPDF '+FPDF_VERSION+' http://pyfpdf.googlecode.com/'))
@@ -1684,41 +1681,48 @@ def _puttrailer(self):
         self._out('/Info '+str(self.n-1)+' 0 R')
 
     def _enddoc(self):
-        self._putheader()
-        self._putpages()
-        self._putresources()
+        LOGGER.debug('Final doc sections size summary:')  # heading for the per-section '- <label>.size' debug lines
+        with self._trace_size('header'):  # each _trace_size block logs the growth of self.buffer across its body
+            self._putheader()
+        with self._trace_size('pages'):
+            self._putpages()
+        self._putresources()  # sizes are traced per sub-section inside _putresources()
         #Info
-        self._newobj()
-        self._out('<<')
-        self._putinfo()
-        self._out('>>')
-        self._out('endobj')
+        with self._trace_size('info'):
+            self._newobj()
+            self._out('<<')
+            self._putinfo()
+            self._out('>>')
+            self._out('endobj')
         #Catalog
-        self._newobj()
-        self._out('<<')
-        self._putcatalog()
-        self._out('>>')
-        self._out('endobj')
+        with self._trace_size('catalog'):
+            self._newobj()
+            self._out('<<')
+            self._putcatalog()
+            self._out('>>')
+            self._out('endobj')
         #Cross-ref
-        o=len(self.buffer)
-        self._out('xref')
-        self._out('0 '+(str(self.n+1)))
-        self._out('0000000000 65535 f ')
-        for i in range(1,self.n+1):
-            self._out(sprintf('%010d 00000 n ',self.offsets[i]))
+        with self._trace_size('xref'):
+            o=len(self.buffer)  # buffer length just before 'xref'; written out after 'startxref' below
+            self._out('xref')
+            self._out('0 '+(str(self.n+1)))
+            self._out('0000000000 65535 f ')
+            for i in range(1,self.n+1):  # one zero-padded offset line per object number 1..self.n
+                self._out(sprintf('%010d 00000 n ',self.offsets[i]))
         #Trailer
-        self._out('trailer')
-        self._out('<<')
-        self._puttrailer()
-        self._out('>>')
-        self._out('startxref')
-        self._out(o)
+        with self._trace_size('trailer'):  # note: the final '%%EOF' line is written outside this traced block
+            self._out('trailer')
+            self._out('<<')
+            self._puttrailer()
+            self._out('>>')
+            self._out('startxref')
+            self._out(o)  # the value captured at the start of the cross-ref section
         self._out('%%EOF')
         self.state=3
 
     def _beginpage(self, orientation, format, same):
         self.page += 1
-        self.pages[self.page] = {"content": ""}
+        self.pages[self.page] = {"content": bytearray()}
         self.state = 2
         self.x = self.l_margin
         self.y = self.t_margin
@@ -1984,9 +1988,9 @@ def _out(self, s):
         elif not isinstance(s, basestring):
             s = str(s)
         if(self.state == 2):
-            self.pages[self.page]["content"] += (s + "\n")
+            self.pages[self.page]["content"] += (s.encode("latin1") + b"\n")
         else:
-            self.buffer += (s + "\n")
+            self.buffer += (s.encode("latin1") + b"\n")
 
     @check_page
     def interleaved2of5(self, txt, x, y, w=1.0, h=10.0):
@@ -2067,3 +2071,18 @@ def code39(self, txt, x, y, w=1.5, h=5.0):
                     self.rect(x, y, dim[d], h, 'F')
                 x += dim[d]
             x += dim['n']
+
+    @contextmanager
+    def _trace_size(self, label):  # context manager: logs at DEBUG level how much self.buffer grew inside the `with` body
+        prev_size = len(self.buffer)  # buffer length on entry
+        yield
+        LOGGER.debug('- %s.size: %s', label, _sizeof_fmt(len(self.buffer) - prev_size))  # not reached if the body raised (no try/finally around the yield)
+
+
+def _sizeof_fmt(num, suffix='B'):
+    # Format num as a human-readable size with 1024-based unit prefixes. Recipe from: https://stackoverflow.com/a/1094933/636849
+    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
+        if abs(num) < 1024.0:  # first unit in which the magnitude drops below 1024 wins
+            return "%3.1f%s%s" % (num, unit, suffix)
+        num /= 1024.0  # otherwise rescale to the next larger unit
+    return "%.1f%s%s" % (num, 'Yi', suffix)  # still >= 1024 after all eight units: report in 'Yi'
diff --git a/fpdf/php.py b/fpdf/php.py
@@ -25,11 +25,7 @@ def UTF8ToUTF16BE(instr, setbom=True):
         outstr += "\xFE\xFF".encode("latin1")
     if not isinstance(instr, unicode):
         instr = instr.decode('UTF-8')
-    outstr += instr.encode('UTF-16BE')
-    # convert bytes back to fake unicode string until PEP461-like is implemented
-    if PY3K:
-        outstr = outstr.decode("latin1")
-    return outstr
+    return outstr + instr.encode('UTF-16BE')
 
 def UTF8StringToArray(instr):
     "Converts UTF-8 strings to codepoints array"