Skip to content

Commit

Permalink
Fix bug that crashed table extraction when null value provided for `(…
Browse files Browse the repository at this point in the history
…text|intersection)_(x|y)_tolerance` keys

Fixes #539
Thanks to @yoavxyoav for reporting
  • Loading branch information
samkit-jain committed Nov 22, 2021
1 parent c915a00 commit 608db02
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ All notable changes to this project will be documented in this file. The format
### Fixed
- Fix slowdown in `.extract_words(...)`/`WordExtractor.iter_chars_to_words(...)` on very long words, caused by repeatedly re-calculating bounding box. ([#483](https://github.com/jsvine/pdfplumber/discussions/483))
- Handle `UnicodeDecodeError` when trying to decode utf-16-encoded annotations ([#463](https://github.com/jsvine/pdfplumber/issues/463)) [h/t @tungph]
- Fix crash when extracting tables with null values in `(text|intersection)_(x|y)_tolerance` settings. ([#539](https://github.com/jsvine/pdfplumber/discussions/539)) [h/t @yoavxyoav]

### Development Changes
- Add `CONTRIBUTING.md` ([#428](https://github.com/jsvine/pdfplumber/pull/428))
Expand Down
2 changes: 2 additions & 0 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ def find_tables(self, table_settings={}):
return TableFinder(self, table_settings).tables

def extract_tables(self, table_settings={}):
table_settings = TableFinder.resolve_table_settings(table_settings)
tables = self.find_tables(table_settings)

extract_kwargs = dict(
Expand All @@ -230,6 +231,7 @@ def extract_tables(self, table_settings={}):
return [table.extract(**extract_kwargs) for table in tables]

def extract_table(self, table_settings={}):
table_settings = TableFinder.resolve_table_settings(table_settings)
tables = self.find_tables(table_settings)

if len(tables) == 0:
Expand Down
48 changes: 35 additions & 13 deletions pdfplumber/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,20 +438,8 @@ class TableFinder(object):
"""

def __init__(self, page, settings={}):
for k in settings.keys():
if k not in DEFAULT_TABLE_SETTINGS:
raise ValueError(f"Unrecognized table setting: '{k}'")
self.page = page
self.settings = dict(DEFAULT_TABLE_SETTINGS)
self.settings.update(settings)
for var, fallback in [
("text_x_tolerance", "text_tolerance"),
("text_y_tolerance", "text_tolerance"),
("intersection_x_tolerance", "intersection_tolerance"),
("intersection_y_tolerance", "intersection_tolerance"),
]:
if self.settings[var] is None:
self.settings.update({var: self.settings[fallback]})
self.settings = self.resolve_table_settings(settings)
self.edges = self.get_edges()
self.intersections = edges_to_intersections(
self.edges,
Expand All @@ -461,6 +449,40 @@ def __init__(self, page, settings={}):
self.cells = intersections_to_cells(self.intersections)
self.tables = [Table(self.page, t) for t in cells_to_tables(self.cells)]

@staticmethod
def resolve_table_settings(table_settings={}):
    """Validate user-supplied table settings and fill in the gaps.

    Every key of ``table_settings`` must also be a key of
    ``DEFAULT_TABLE_SETTINGS``. The result starts as a copy of the defaults,
    is overlaid with the user-supplied values, and finally has each per-axis
    tolerance that is still ``None`` replaced by its shared counterpart
    (``text_tolerance`` or ``intersection_tolerance``).

    The argument itself is never modified; a new dict is returned.

    TODO: Could additionally validate value types, e.g. raise a ValueError
    when a non-boolean is given for ``keep_blank_chars``.

    :param table_settings: User-provided table settings.
    :returns: A new dict holding the fully resolved settings.
    :raises ValueError: When an unrecognised key is provided.
    """
    # (per-axis setting, shared setting used when the per-axis one is None)
    axis_fallbacks = (
        ("text_x_tolerance", "text_tolerance"),
        ("text_y_tolerance", "text_tolerance"),
        ("intersection_x_tolerance", "intersection_tolerance"),
        ("intersection_y_tolerance", "intersection_tolerance"),
    )

    # Reject unknown keys before building anything.
    for key in table_settings.keys():
        if key not in DEFAULT_TABLE_SETTINGS:
            raise ValueError(f"Unrecognized table setting: '{key}'")

    # Defaults first, then the caller's overrides on top.
    resolved = dict(DEFAULT_TABLE_SETTINGS)
    resolved.update(table_settings)

    for axis_key, shared_key in axis_fallbacks:
        if resolved[axis_key] is not None:
            continue
        # An explicit (or default) None means "inherit the shared tolerance".
        resolved[axis_key] = resolved[shared_key]

    return resolved

def get_edges(self):
settings = self.settings
for name in ["vertical", "horizontal"]:
Expand Down
28 changes: 28 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,31 @@ def test_issue_466_mixed_strategy(self):
# Verify that all cells contain real data
for cell in t[3]:
assert "last" in cell

def test_discussion_539_null_value(self):
    """
    See discussion #539

    Passes a full settings dict in which the four per-axis tolerance keys
    are explicitly ``None`` and checks that both ``extract_table`` and
    ``extract_tables`` return a truthy result.
    """
    settings = dict(
        vertical_strategy="lines",
        horizontal_strategy="lines",
        explicit_vertical_lines=[],
        explicit_horizontal_lines=[],
        snap_tolerance=3,
        join_tolerance=3,
        edge_min_length=3,
        min_words_vertical=3,
        min_words_horizontal=1,
        keep_blank_chars=False,
        text_tolerance=3,
        # Null per-axis values, as in the reported scenario.
        text_x_tolerance=None,
        text_y_tolerance=None,
        intersection_tolerance=3,
        intersection_x_tolerance=None,
        intersection_y_tolerance=None,
    )
    pdf_path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[0]
        assert first_page.extract_table(settings)
        assert first_page.extract_tables(settings)

0 comments on commit 608db02

Please sign in to comment.