From 901985407b857a9734983170ea1d67e18cd33287 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Thu, 15 Jul 2021 10:00:07 -0400 Subject: [PATCH] Tweak #467 code slightly, update CHANGELOG/thanks h/t @bobluda + @samkit-jain --- CHANGELOG.md | 3 +++ README.md | 1 + pdfplumber/table.py | 34 ++++++++++++++++------------------ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 10f8892f..20ba266e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,9 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). ## [0.5.29] - [unreleased] +### Changed +- Change behavior of horizontal `text_strategy`, so that it uses the top and bottom of *every* word, not just the top of every word and the bottom of the last. ([#467](https://github.com/jsvine/pdfplumber/pull/467) + [#466](https://github.com/jsvine/pdfplumber/issues/466) + [#265](https://github.com/jsvine/pdfplumber/issues/265)) [h/t @bobluda + @samkit-jain] + ### Development Changes - Add `CONTRIBUTING.md` ([#428](https://github.com/jsvine/pdfplumber/pull/428)) diff --git a/README.md b/README.md index 0e06b9de..ac9b0f3a 100644 --- a/README.md +++ b/README.md @@ -431,6 +431,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes - [xv44586](https://github.com/xv44586) - [Alexander Regueiro](https://github.com/alexreg) - [Daniel Peña](https://github.com/trifling) +- [bobluda](https://github.com/bobluda) ## Contributing diff --git a/pdfplumber/table.py b/pdfplumber/table.py index 74edc840..af5593db 100644 --- a/pdfplumber/table.py +++ b/pdfplumber/table.py @@ -85,23 +85,21 @@ def words_to_edges_h(words, word_threshold=DEFAULT_MIN_WORDS_HORIZONTAL): min_x0 = min(map(itemgetter("x0"), rects)) max_x1 = max(map(itemgetter("x1"), rects)) - edges = [ - { - "x0": min_x0, - "x1": max_x1, - "top": r["top"], - "bottom": r["top"], - "width": max_x1 - min_x0, - 
"orientation": "h", - } - for r in rects - ] - - # For each detected row, we also add the 'bottom' line. - # This will generate extra edges, (some will be redundant with the next row - # 'top' line), but this catches the last row of every table. + edges = [] for r in rects: - edges.append( + edges += [ + # Top of text + { + "x0": min_x0, + "x1": max_x1, + "top": r["top"], + "bottom": r["top"], + "width": max_x1 - min_x0, + "orientation": "h", + }, + # For each detected row, we also add the 'bottom' line. This will + # generate extra edges, (some will be redundant with the next row + # 'top' line), but this catches the last row of every table. { "x0": min_x0, "x1": max_x1, @@ -109,8 +107,8 @@ def words_to_edges_h(words, word_threshold=DEFAULT_MIN_WORDS_HORIZONTAL): "bottom": r["bottom"], "width": max_x1 - min_x0, "orientation": "h", - } - ) + }, + ] return edges