Fix slowdown in extract_words on long words (#483)

This commit fixes the slowdown in `.extract_words(...)` and `WordExtractor.iter_chars_to_words(...)` that occurred for very long "words". The slowdown was caused by re-calculating the word's bounding box from all of its characters each time a new character was considered. See #483 for discussion and an example. The commit also adds `utils.merge_bboxes(bboxes)`, which returns the smallest bounding box that contains all bounding boxes in the `bboxes` argument, and which the fix uses to update the bounding box incrementally.
jsvine · Sep 2, 2021 · f8d5e70 · f8d5e70
1 parent 9019854
commit f8d5e70
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,9 +3,15 @@
 All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
 ## [0.5.29] - [unreleased]
+### Added
+- Add `utils.merge_bboxes(bboxes)`, which returns the smallest bounding box that contains all bounding boxes in the `bboxes` argument.
+
 ### Changed
 - Change behavior of horizontal `text_strategy`, so that it uses the top and bottom of *every* word, not just the top of every word and the bottom of the last. ([#467](https://github.com/jsvine/pdfplumber/pull/467) + [#466](https://github.com/jsvine/pdfplumber/issues/466) + [#265](https://github.com/jsvine/pdfplumber/issues/265)) [h/t @bobluda + @samkit-jain]
 
+### Fixed
+- Fix slowdown in `.extract_words(...)`/`WordExtractor.iter_chars_to_words(...)` on very long words, caused by repeatedly re-calculating the word's bounding box. ([#483](https://github.com/jsvine/pdfplumber/discussions/483))
+
 ### Development Changes
 - Add `CONTRIBUTING.md` ([#428](https://github.com/jsvine/pdfplumber/pull/428))
 

diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py
@@ -243,6 +243,19 @@ def bbox_to_rect(bbox):
     return {"x0": bbox[0], "top": bbox[1], "x1": bbox[2], "bottom": bbox[3]}
 
 
+def merge_bboxes(bboxes):
+    """
+    Given a set of bounding boxes, return the smallest bounding box that
+    contains them all.
+    """
+    return (
+        min(map(itemgetter(0), bboxes)),
+        min(map(itemgetter(1), bboxes)),
+        max(map(itemgetter(2), bboxes)),
+        max(map(itemgetter(3), bboxes)),
+    )
+
+
 DEFAULT_WORD_EXTRACTION_SETTINGS = dict(
     x_tolerance=DEFAULT_X_TOLERANCE,
     y_tolerance=DEFAULT_Y_TOLERANCE,
@@ -286,12 +299,12 @@ def merge_chars(self, ordered_chars):
 
         return word
 
-    def char_begins_new_word(self, current_chars, next_char):
+    def char_begins_new_word(self, current_chars, current_bbox, next_char):
         upright = current_chars[0]["upright"]
         intraline_tol = self.x_tolerance if upright else self.y_tolerance
         interline_tol = self.y_tolerance if upright else self.x_tolerance
 
-        word_x0, word_top, word_x1, word_bottom = objects_to_bbox(current_chars)
+        word_x0, word_top, word_x1, word_bottom = current_bbox
 
         return (
             (next_char["x0"] > word_x1 + intraline_tol)
@@ -302,19 +315,28 @@ def char_begins_new_word(self, current_chars, next_char):
 
     def iter_chars_to_words(self, chars):
         current_word = []
+        current_bbox = None
 
         for char in chars:
             if not self.keep_blank_chars and char["text"].isspace():
                 if current_word:
                     yield current_word
                     current_word = []
+                    current_bbox = None
 
-            elif current_word and self.char_begins_new_word(current_word, char):
+            elif current_word and self.char_begins_new_word(
+                current_word, current_bbox, char
+            ):
                 yield current_word
                 current_word = [char]
+                current_bbox = obj_to_bbox(char)
 
             else:
                 current_word.append(char)
+                if current_bbox is None:
+                    current_bbox = obj_to_bbox(char)
+                else:
+                    current_bbox = merge_bboxes([current_bbox, obj_to_bbox(char)])
 
         if current_word:
             yield current_word