diff --git a/CHANGELOG.md b/CHANGELOG.md index 86bcad4e..7c071f3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). ## [0.5.25] — Unreleased +### Added +- Add new boolean argument `strict_metadata` (default `False`) to `pdfplumber.open(...)` method for handling metadata resolve failures ([#320](https://github.com/jsvine/pdfplumber/pull/320)) + ### Fixed - Fix metadata extraction to handle integer/floating-point values ([#297](https://github.com/jsvine/pdfplumber/issues/297)) - Explicitly load text as utf-8 in `setup.py` ([#304](https://github.com/jsvine/pdfplumber/issues/304)) - Fix `pdfplumber.open(...)` so that it does not close file objects passed to it ([#312](https://github.com/jsvine/pdfplumber/issues/312)) +### Changed +- Extend metadata resolver to handle more data types ([#320](https://github.com/jsvine/pdfplumber/pull/320)) + ## [0.5.24] — 2020-10-20 ### Added - Added `extra_attrs=[...]` parameter to `.extract_text(...)` ([c8b200e](https://github.com/jsvine/pdfplumber/commit/c8b200e)) ([#28](https://github.com/jsvine/pdfplumber/issues/28)) diff --git a/README.md b/README.md index 9e97f90a..a2095bf8 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ The `open` method returns an instance of the `pdfplumber.PDF` class. To load a password-protected PDF, pass the `password` keyword argument, e.g., `pdfplumber.open("file.pdf", password = "test")`. +Invalid metadata values are treated as a warning by default. If that is not intended, pass `strict_metadata=True` to the `open` method and `pdfplumber.open` will raise an exception if it is unable to parse the metadata. 
+ ### The `pdfplumber.PDF` class The top-level `pdfplumber.PDF` class represents a single PDF and has two main properties: diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index b541c510..640d6a6e 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -1,7 +1,8 @@ from .container import Container from .page import Page -from .utils import decode_text +from .utils import resolve_and_decode +import logging import pathlib import itertools from pdfminer.pdfparser import PDFParser @@ -10,13 +11,22 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.layout import LAParams from pdfminer.converter import PDFPageAggregator -from pdfminer.psparser import PSLiteral + +logger = logging.getLogger(__name__) class PDF(Container): cached_properties = Container.cached_properties + ["_pages"] - def __init__(self, stream, pages=None, laparams=None, precision=0.001, password=""): + def __init__( + self, + stream, + pages=None, + laparams=None, + precision=0.001, + password="", + strict_metadata=False, + ): self.laparams = None if laparams is None else LAParams(**laparams) self.stream = stream self.pages_to_parse = pages @@ -27,16 +37,18 @@ def __init__(self, stream, pages=None, laparams=None, precision=0.001, password= for info in self.doc.info: self.metadata.update(info) for k, v in self.metadata.items(): - if hasattr(v, "resolve"): - v = v.resolve() - if type(v) == list: - self.metadata[k] = list(map(decode_text, v)) - elif isinstance(v, PSLiteral): - self.metadata[k] = decode_text(v.name) - elif isinstance(v, (str, bytes)): - self.metadata[k] = decode_text(v) - else: - self.metadata[k] = v + try: + self.metadata[k] = resolve_and_decode(v) + except Exception as e: + if strict_metadata: + # Raise an exception since unable to resolve the metadata value. + raise + # This metadata value could not be parsed. Instead of failing the PDF + # read, treat it as a warning only if ``strict_metadata=False``. 
+ logger.warning( + f'[WARNING] Metadata key "{k}" could not be parsed due to ' + f"exception: {str(e)}" + ) self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams) self.interpreter = PDFPageInterpreter(rsrcmgr, self.device) diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py index f71d3c6c..05a4fa56 100644 --- a/pdfplumber/utils.py +++ b/pdfplumber/utils.py @@ -78,6 +78,30 @@ def decode_text(s): return "".join(PDFDocEncoding[o] for o in ords) +def resolve_and_decode(obj): + """Recursively resolve and decode a metadata value. + + Objects exposing ``resolve()`` are resolved first. Lists and dicts are + rebuilt with every item processed recursively, ``PSLiteral`` names and + ``str``/``bytes`` values go through ``decode_text``, and anything else + is returned unchanged. The input container is never modified. + """ + if hasattr(obj, "resolve"): + obj = obj.resolve() + if isinstance(obj, list): + return list(map(resolve_and_decode, obj)) + elif isinstance(obj, PSLiteral): + return decode_text(obj.name) + elif isinstance(obj, (str, bytes)): + return decode_text(obj) + elif isinstance(obj, dict): + # Build a new dict rather than assigning into ``obj``: it may be the + # very object returned by ``resolve()``, still shared with its owner. + return {k: resolve_and_decode(v) for k, v in obj.items()} + + return obj + + def decode_psl_list(_list): return [ decode_text(value.name) if isinstance(value, PSLiteral) else value diff --git a/requirements-dev.txt b/requirements-dev.txt index 3ab2792e..74996a2b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ -pytest -pytest-cov -pytest-parallel +pytest==6.1.2 +pytest-cov==2.10.1 +pytest-parallel==0.1.0 flake8==3.8.3 black==20.8b0 diff --git a/tests/pdfs/issue-316-example.pdf b/tests/pdfs/issue-316-example.pdf new file mode 100644 index 00000000..2a4c3ee2 Binary files /dev/null and b/tests/pdfs/issue-316-example.pdf differ diff --git a/tests/test_display.py b/tests/test_display.py index b0742a37..bc1598d7 100644 --- a/tests/test_display.py +++ b/tests/test_display.py @@ -22,9 +22,11 @@ def teardown_class(self): def test_basic_conversion(self): self.im.reset() - self.im.draw_rect(self.im.page.rects[0]) + self.im.draw_rects(self.im.page.rects) self.im.draw_circle(self.im.page.chars[0]) self.im.draw_line(self.im.page.edges[0]) + self.im.draw_vlines([10]) + self.im.draw_hlines([10]) def test_debug_tablefinder(self): 
self.im.reset() diff --git a/tests/test_issues.py b/tests/test_issues.py index 198e338b..51613ce0 100644 --- a/tests/test_issues.py +++ b/tests/test_issues.py @@ -1,8 +1,7 @@ #!/usr/bin/env python import unittest import pdfplumber -import sys, os -import six +import os import logging logging.disable(logging.ERROR) @@ -168,3 +167,11 @@ def test_issue_297(self): path = os.path.join(HERE, "pdfs/issue-297-example.pdf") with pdfplumber.open(path) as pdf: assert isinstance(pdf.metadata["Copies"], int) + + def test_issue_316(self): + """ + Handle invalid metadata: nested list/dict values are resolved and decoded + """ + path = os.path.join(HERE, "pdfs/issue-316-example.pdf") + with pdfplumber.open(path) as pdf: + assert pdf.metadata["Changes"][0]["CreationDate"] == "D:20061207105020Z00'00'"