Refactor text-extraction utilities

jsvine · Feb 7, 2023 · 3424b57 · 3424b57
1 parent 8ed2c7e
commit 3424b57
Show file tree

Hide file tree

Showing 6 changed files with 205 additions and 250 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file. The format
 
 - Convert `utils.py` into `utils/` submodules. Retains same interface, just an improvement in organization. ([6351d97](https://github.com/jsvine/pdfplumber/commit/6351d97))
 - Fix typing hints to include io.BytesIO. ([d4107f6](https://github.com/jsvine/pdfplumber/commit/d4107f6)) [h/t @conitrade-as]
+- Refactor text-extraction utilities, paving the way for better consistency across various entrypoints to text extraction (e.g., via `utils.extract_text(...)`, via `Page.extract_text(...)`, via `Page.extract_table(...)`).
 
 ## [0.7.6] - 2022-11-22
 

diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -30,6 +30,7 @@
 from .container import Container
 from .table import T_table_settings, Table, TableFinder, TableSettings
 from .utils import resolve_all
+from .utils.text import TextMap
 
 lt_pat = re.compile(r"^LT")
 
@@ -116,7 +117,7 @@ def __init__(
         )
 
         # https://rednafi.github.io/reflections/dont-wrap-instance-methods-with-functoolslru_cache-decorator-in-python.html
-        self.get_text_layout = lru_cache()(self._get_text_layout)
+        self.get_textmap = lru_cache()(self._get_textmap)
 
     @property
     def width(self) -> T_num:
@@ -300,10 +301,10 @@ def sorter(x: Table) -> Tuple[int, T_num, T_num]:
 
         return largest.extract(**extract_kwargs)
 
-    def _get_text_layout(self, **kwargs: Any) -> utils.TextLayout:
+    def _get_textmap(self, **kwargs: Any) -> TextMap:
         defaults = dict(x_shift=self.bbox[0], y_shift=self.bbox[1])
         full_kwargs: Dict[str, Any] = {**defaults, **kwargs}
-        return utils.chars_to_layout(self.chars, **full_kwargs)
+        return utils.chars_to_textmap(self.chars, **full_kwargs)
 
     def search(
         self,
@@ -312,16 +313,14 @@ def search(
         case: bool = True,
         **kwargs: Any,
     ) -> List[Dict[str, Any]]:
-        text_layout = self.get_text_layout(**kwargs)
-        return text_layout.search(pattern, regex=regex, case=case)
+        textmap = self.get_textmap(**kwargs)
+        return textmap.search(pattern, regex=regex, case=case)
 
     def extract_text(self, **kwargs: Any) -> str:
-        if kwargs.get("layout") is True:
-            del kwargs["layout"]
-            text_layout = self.get_text_layout(**kwargs)
-            return text_layout.to_string()
-        else:
-            return utils.extract_text(self.chars, **kwargs)
+        return self.get_textmap(**kwargs).as_string
+
+    def extract_text_simple(self, **kwargs: Any) -> str:
+        return utils.extract_text_simple(self.chars, **kwargs)
 
     def extract_words(self, **kwargs: Any) -> T_obj_list:
         return utils.extract_words(self.chars, **kwargs)
@@ -410,7 +409,7 @@ def __init__(self, parent_page: Page):
         self.page_obj = parent_page.page_obj
         self.page_number = parent_page.page_number
         self.flush_cache(Container.cached_properties)
-        self.get_text_layout = lru_cache()(self._get_text_layout)
+        self.get_textmap = lru_cache()(self._get_textmap)
 
 
 def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:

diff --git a/pdfplumber/utils/__init__.py b/pdfplumber/utils/__init__.py
@@ -34,12 +34,10 @@
     DEFAULT_X_TOLERANCE,
     DEFAULT_Y_DENSITY,
     DEFAULT_Y_TOLERANCE,
-    LayoutEngine,
-    TextLayout,
-    WordExtractor,
-    chars_to_layout,
+    chars_to_textmap,
     collate_line,
     dedupe_chars,
     extract_text,
+    extract_text_simple,
     extract_words,
 )