Skip to content

Commit

Permalink
Merge d3b8848 into 002500a
Browse files Browse the repository at this point in the history
  • Loading branch information
bobluda authored Jul 11, 2021
2 parents 002500a + d3b8848 commit 1f296af
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 10 deletions.
26 changes: 16 additions & 10 deletions pdfplumber/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def words_to_edges_h(words, word_threshold=DEFAULT_MIN_WORDS_HORIZONTAL):
return []
min_x0 = min(map(itemgetter("x0"), rects))
max_x1 = max(map(itemgetter("x1"), rects))
max_bottom = max(map(itemgetter("bottom"), rects))

edges = [
{
"x0": min_x0,
Expand All @@ -95,17 +95,23 @@ def words_to_edges_h(words, word_threshold=DEFAULT_MIN_WORDS_HORIZONTAL):
"orientation": "h",
}
for r in rects
] + [
{
"x0": min_x0,
"x1": max_x1,
"top": max_bottom,
"bottom": max_bottom,
"width": max_x1 - min_x0,
"orientation": "h",
}
]

# For each detected row, we also add its 'bottom' line.
# This generates extra edges (some are redundant with the next row's
# 'top' line), but it ensures the last row of every table is caught.
for r in rects:
edges.append(
{
"x0": min_x0,
"x1": max_x1,
"top": r["bottom"],
"bottom": r["bottom"],
"width": max_x1 - min_x0,
"orientation": "h",
}
)

return edges


Expand Down
Binary file added tests/pdfs/issue-466-example.pdf
Binary file not shown.
29 changes: 29 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,32 @@ def test_order(self):
assert len(tables[0]) == 8
assert len(tables[1]) == 11
assert len(tables[2]) == 2

def test_issue_466_mixed_strategy(self):
    """
    Check table extraction with mixed strategies (see issue #466).

    Vertical edges are taken from drawn lines ("lines" strategy) while
    horizontal edges are inferred from text ("text" strategy). Every
    extracted table must keep its last row.
    """
    path = os.path.join(HERE, "pdfs/issue-466-example.pdf")
    with pdfplumber.open(path) as pdf:
        tables = pdf.pages[0].extract_tables(
            {
                "vertical_strategy": "lines",
                "horizontal_strategy": "text",
                "snap_tolerance": 8,
                "intersection_tolerance": 4,
            }
        )

        # The engine only extracts the tables which have drawn horizontal
        # lines.
        # Check the count explicitly: without it, an empty result would make
        # the loop below pass without asserting anything.
        assert len(tables) == 3

        # For the 3 extracted tables, some common properties are expected:
        # - 4 rows
        # - 3 columns
        # - Data in last row contains the string 'last'
        for t in tables:
            assert len(t) == 4
            assert len(t[0]) == 3

            # Verify that all cells of the last row contain real data
            for cell in t[3]:
                assert "last" in cell

0 comments on commit 1f296af

Please sign in to comment.