Improve PDFObjRef resolution and object conversion

More robust solutions to:
- #77
- #90
jsvine · Nov 12, 2018 · 5843cd7 · 5843cd7
1 parent 416fda3
commit 5843cd7
Show file tree

Hide file tree

Showing 4 changed files with 65 additions and 15 deletions.
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -95,13 +95,48 @@ def point2coord(pt):
             "pts",
         ]
 
-        NON_DECIMALIZE = [
-            "fontname", "name", "upright",
-            "stroking_color", "non_stroking_color",
-        ]
+        # Per-attribute converters, looked up by attribute name in
+        # process_object() below. An attribute that is neither listed here
+        # nor in IGNORE raises KeyError at that lookup.
+        # NOTE(review): `d` is defined outside this hunk -- presumably the
+        # Decimal converter (utils.decimalize); confirm.
+
+        # Identity: hand the (already resolved) value back untouched.
+        noop = lambda x: x
+        # Any falsy value (None, "", 0, ...) becomes ""; everything else
+        # goes through str().
+        str_conv = lambda x: str(x or "")
+
+        CONVERSIONS = {
+            # Decimals
+            "adv": d,
+            "height": d,
+            "linewidth": d,
+            "pts": d,
+            "size": d,
+            "srcsize": d,
+            "width": d,
+            "x0": d,
+            "x1": d,
+            "y0": d,
+            "y1": d,
+
+            # Integer
+            "bits": int,
+            "upright": int,
+
+            # Strings
+            "font": str_conv,
+            "fontname": str_conv,
+            "imagemask": noop,  # listed under "Strings" but passed through as-is
+            "name": str_conv,
+            "object_type": str_conv,
+            "text": str_conv,
+
+            # No conversion
+            "colorspace": noop,
+            "evenodd": noop,
+            "fill": noop,
+            "non_stroking_color": noop,
+            "path": noop,
+            "stream": noop,
+            "stroke": noop,
+            "stroking_color": noop,
+        }
 
         def process_object(obj):
-            attr = dict((k, (v if (k in NON_DECIMALIZE or v == None) else d(v)))
+            attr = dict((k, CONVERSIONS[k](resolve_all(v)))
                 for k, v in obj.__dict__.items()
                     if k not in IGNORE)
 

diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py
@@ -1,9 +1,10 @@
 from pdfminer.utils import PDFDocEncoding
-from pdfminer.pdftypes import PDFObjRef
+from pdfminer.psparser import PSLiteral
 from decimal import Decimal, ROUND_HALF_UP
 import numbers
 from operator import itemgetter
 import itertools
+from functools import lru_cache as cache
 import six
 
 DEFAULT_X_TOLERANCE = 3
@@ -69,11 +70,12 @@ def decode_text(s):
         ords = (ord(c) if type(c) == str else c for c in s)
         return ''.join(PDFDocEncoding[o] for o in ords)
 
-def decimalize(v, q=None):
-    # If PDFObjRef, first resolve
-    if isinstance(v, PDFObjRef):
-        return decimalize(v.resolve(), q)
+# Return a new list in which every PSLiteral found in `_list` is replaced
+# by decode_text() of its .name; all other values are kept unchanged.
+def decode_psl_list(_list):
+    return [ decode_text(value.name) if isinstance(value, PSLiteral) else value
+        for value in _list ]
 
+@cache(maxsize = int(10e4))
+def _decimalize(v, q = None):
     # If already a decimal, just return itself
     if isinstance(v, Decimal):
         return v
@@ -96,6 +98,13 @@ def decimalize(v, q=None):
     else:
         raise ValueError("Cannot convert {0} to Decimal.".format(v))
 
+# Convert `v` to Decimal, accepting either a single value or a tuple/list.
+#   v: a single value, or a tuple/list of values (nesting is handled by
+#      the recursive call).
+#   q: passed through unchanged to _decimalize().
+# A tuple/list is converted element by element and rebuilt by calling
+# type(v) on the converted items. Anything else is delegated to
+# _decimalize(), which is wrapped in lru_cache, so on that path both
+# `v` and `q` must be hashable.
+def decimalize(v, q = None):
+    # If tuple/list passed, bulk-convert
+    if isinstance(v, (tuple, list)):
+        return type(v)(decimalize(x, q) for x in v)
+    else:
+        return _decimalize(v, q)
+
 def is_dataframe(collection):
     cls = collection.__class__
     name = ".".join([ cls.__module__, cls.__name__ ])

diff --git a/tests/pdfs/issue-90-example.pdf b/tests/pdfs/issue-90-example.pdf
diff --git a/tests/test-issues.py b/tests/test-issues.py
@@ -102,19 +102,25 @@ def test_issue_67(self):
             os.path.join(HERE, "pdfs/issue-67-example.pdf")
         )
         assert len(pdf.metadata.keys())
-        
+
     def test_pr_77(self):
         # via https://github.com/jsvine/pdfplumber/pull/77
         path = os.path.join(HERE, "pdfs/pr-77-example.pdf")
         with pdfplumber.open(path) as pdf:
             first_page = pdf.pages[0]
             # No assertion: the test passes if reading .objects does not raise.
             first_page.objects
-        
+
     def test_pr_88(self):
         # via https://github.com/jsvine/pdfplumber/pull/88
         path = os.path.join(HERE, "pdfs/pr-88-example.pdf")
         with pdfplumber.open(path) as pdf:
-            first_page = pdf.pages[0]
-            words = first_page.extract_words()
+            page = pdf.pages[0]
+            words = page.extract_words()
             assert len(words) == 25
-
+
+    def test_issue_90(self):
+        # Smoke test for the issue-90 example PDF: there are no assertions,
+        # so this passes as long as extract_words() does not raise.
+        path = os.path.join(HERE, "pdfs/issue-90-example.pdf")
+        with pdfplumber.open(path) as pdf:
+            page = pdf.pages[0]
+            words = page.extract_words()
+