Skip to content

Commit

Permalink
Refactor text-extraction utilities
Browse files Browse the repository at this point in the history
  • Loading branch information
jsvine committed Feb 7, 2023
1 parent 8ed2c7e commit 3424b57
Show file tree
Hide file tree
Showing 6 changed files with 205 additions and 250 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file. The format

- Convert `utils.py` into `utils/` submodules. Retains same interface, just an improvement in organization. ([6351d97](https://github.com/jsvine/pdfplumber/commit/6351d97))
- Fix typing hints to include io.BytesIO. ([d4107f6](https://github.com/jsvine/pdfplumber/commit/d4107f6)) [h/t @conitrade-as]
- Refactor text-extraction utilities, paving way for better consistency across various entrypoints to text extraction (e.g., via `utils.extract_text(...)`, via `Page.extract_text(...)`, via `Page.extract_table(...)`).

## [0.7.6] - 2022-11-22

Expand Down
23 changes: 11 additions & 12 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from .container import Container
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils import resolve_all
from .utils.text import TextMap

lt_pat = re.compile(r"^LT")

Expand Down Expand Up @@ -116,7 +117,7 @@ def __init__(
)

# https://rednafi.github.io/reflections/dont-wrap-instance-methods-with-functoolslru_cache-decorator-in-python.html
self.get_text_layout = lru_cache()(self._get_text_layout)
self.get_textmap = lru_cache()(self._get_textmap)

@property
def width(self) -> T_num:
Expand Down Expand Up @@ -300,10 +301,10 @@ def sorter(x: Table) -> Tuple[int, T_num, T_num]:

return largest.extract(**extract_kwargs)

def _get_text_layout(self, **kwargs: Any) -> utils.TextLayout:
def _get_textmap(self, **kwargs: Any) -> TextMap:
defaults = dict(x_shift=self.bbox[0], y_shift=self.bbox[1])
full_kwargs: Dict[str, Any] = {**defaults, **kwargs}
return utils.chars_to_layout(self.chars, **full_kwargs)
return utils.chars_to_textmap(self.chars, **full_kwargs)

def search(
self,
Expand All @@ -312,16 +313,14 @@ def search(
case: bool = True,
**kwargs: Any,
) -> List[Dict[str, Any]]:
text_layout = self.get_text_layout(**kwargs)
return text_layout.search(pattern, regex=regex, case=case)
textmap = self.get_textmap(**kwargs)
return textmap.search(pattern, regex=regex, case=case)

def extract_text(self, **kwargs: Any) -> str:
if kwargs.get("layout") is True:
del kwargs["layout"]
text_layout = self.get_text_layout(**kwargs)
return text_layout.to_string()
else:
return utils.extract_text(self.chars, **kwargs)
return self.get_textmap(**kwargs).as_string

def extract_text_simple(self, **kwargs: Any) -> str:
return utils.extract_text_simple(self.chars, **kwargs)

def extract_words(self, **kwargs: Any) -> T_obj_list:
return utils.extract_words(self.chars, **kwargs)
Expand Down Expand Up @@ -410,7 +409,7 @@ def __init__(self, parent_page: Page):
self.page_obj = parent_page.page_obj
self.page_number = parent_page.page_number
self.flush_cache(Container.cached_properties)
self.get_text_layout = lru_cache()(self._get_text_layout)
self.get_textmap = lru_cache()(self._get_textmap)


def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
Expand Down
6 changes: 2 additions & 4 deletions pdfplumber/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,10 @@
DEFAULT_X_TOLERANCE,
DEFAULT_Y_DENSITY,
DEFAULT_Y_TOLERANCE,
LayoutEngine,
TextLayout,
WordExtractor,
chars_to_layout,
chars_to_textmap,
collate_line,
dedupe_chars,
extract_text,
extract_text_simple,
extract_words,
)
Loading

0 comments on commit 3424b57

Please sign in to comment.