Skip to content

Commit

Permalink
Improve PDFObjRef resolution and object conversion
Browse files Browse the repository at this point in the history
More robust solutions to:

- #77
- #90
  • Loading branch information
jsvine committed Nov 12, 2018
1 parent 416fda3 commit 5843cd7
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 15 deletions.
45 changes: 40 additions & 5 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,48 @@ def point2coord(pt):
"pts",
]

# Attribute names whose values are kept as-is instead of being passed to d():
# the `k in NON_DECIMALIZE` test in process_object short-circuits the d(v) call.
NON_DECIMALIZE = [
"fontname", "name", "upright",
"stroking_color", "non_stroking_color",
]
def noop(x):
    """Return *x* unchanged; the converter for attributes needing no conversion."""
    return x


def str_conv(x):
    """Return ``str(x)``, except that any falsy *x* (None, 0, "") becomes ""."""
    # `x or ""` is deliberate: a missing/None value must yield an empty string
    # rather than the literal text "None".
    return str(x or "")

# Maps an object attribute name to the callable that normalizes its value.
# process_object looks each attribute up with CONVERSIONS[k], so an attribute
# that is neither listed here nor in IGNORE raises KeyError.
# `d` is defined outside this excerpt (presumably the decimalize helper — confirm).
CONVERSIONS = {
# Decimals
"adv": d,
"height": d,
"linewidth": d,
"pts": d,
"size": d,
"srcsize": d,
"width": d,
"x0": d,
"x1": d,
"y0": d,
"y1": d,

# Integer
"bits": int,
"upright": int,

# Strings
"font": str_conv,
"fontname": str_conv,
# NOTE(review): "imagemask" sits in the Strings group for alphabetical order
# but uses noop, i.e. its value is passed through unconverted.
"imagemask": noop,
"name": str_conv,
"object_type": str_conv,
"text": str_conv,

# No conversion
"colorspace": noop,
"evenodd": noop,
"fill": noop,
"non_stroking_color": noop,
"path": noop,
"stream": noop,
"stroke": noop,
"stroking_color": noop,
}

def process_object(obj):
attr = dict((k, (v if (k in NON_DECIMALIZE or v == None) else d(v)))
attr = dict((k, CONVERSIONS[k](resolve_all(v)))
for k, v in obj.__dict__.items()
if k not in IGNORE)

Expand Down
19 changes: 14 additions & 5 deletions pdfplumber/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from pdfminer.utils import PDFDocEncoding
from pdfminer.pdftypes import PDFObjRef
from pdfminer.psparser import PSLiteral
from decimal import Decimal, ROUND_HALF_UP
import numbers
from operator import itemgetter
import itertools
from functools import lru_cache as cache
import six

DEFAULT_X_TOLERANCE = 3
Expand Down Expand Up @@ -69,11 +70,12 @@ def decode_text(s):
ords = (ord(c) if type(c) == str else c for c in s)
return ''.join(PDFDocEncoding[o] for o in ords)

def decimalize(v, q=None):
# If PDFObjRef, first resolve
if isinstance(v, PDFObjRef):
return decimalize(v.resolve(), q)
def decode_psl_list(_list):
    """Return a new list with every PSLiteral element decoded to text.

    Each element that is a PSLiteral is replaced by
    ``decode_text(element.name)``; all other elements are copied over
    unchanged, in their original order.
    """
    decoded = []
    for item in _list:
        if isinstance(item, PSLiteral):
            decoded.append(decode_text(item.name))
        else:
            decoded.append(item)
    return decoded

@cache(maxsize = int(10e4))
def _decimalize(v, q = None):
# If already a decimal, just return itself
if isinstance(v, Decimal):
return v
Expand All @@ -96,6 +98,13 @@ def decimalize(v, q=None):
else:
raise ValueError("Cannot convert {0} to Decimal.".format(v))

def decimalize(v, q = None):
    """Convert *v* via _decimalize, bulk-converting tuples and lists.

    A tuple or list is converted element by element (recursively, so nested
    containers are handled too) and rebuilt as the same container type.
    Any other value is handed straight to ``_decimalize``. *q* is forwarded
    unchanged at every level.
    """
    # Scalars (anything that is not a tuple/list) go directly to the
    # single-value converter.
    if not isinstance(v, (tuple, list)):
        return _decimalize(v, q)

    container = type(v)
    return container(decimalize(item, q) for item in v)

def is_dataframe(collection):
cls = collection.__class__
name = ".".join([ cls.__module__, cls.__name__ ])
Expand Down
Binary file added tests/pdfs/issue-90-example.pdf
Binary file not shown.
16 changes: 11 additions & 5 deletions tests/test-issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,19 +102,25 @@ def test_issue_67(self):
os.path.join(HERE, "pdfs/issue-67-example.pdf")
)
assert len(pdf.metadata.keys())

def test_pr_77(self):
    # via https://github.com/jsvine/pdfplumber/pull/77
    # Smoke test: there is no assertion; the test passes as long as reading
    # the first page's objects does not raise.
    pdf_path = os.path.join(HERE, "pdfs/pr-77-example.pdf")
    with pdfplumber.open(pdf_path) as document:
        document.pages[0].objects

def test_pr_88(self):
# via https://github.com/jsvine/pdfplumber/pull/88
# Checks that word extraction on the example PDF's first page yields exactly 25 words.
path = os.path.join(HERE, "pdfs/pr-88-example.pdf")
with pdfplumber.open(path) as pdf:
# NOTE(review): the first_page/page pairs below perform the same lookup and
# extraction twice; this looks like removed and added diff lines shown
# together — confirm which pair is current.
first_page = pdf.pages[0]
words = first_page.extract_words()
page = pdf.pages[0]
words = page.extract_words()
assert len(words) == 25


def test_issue_90(self):
    # Smoke test: there is no assertion on the result; the test passes as
    # long as word extraction on the first page does not raise.
    pdf_path = os.path.join(HERE, "pdfs/issue-90-example.pdf")
    with pdfplumber.open(pdf_path) as document:
        document.pages[0].extract_words()

0 comments on commit 5843cd7

Please sign in to comment.