Reorganize .utils

jsvine · Jan 24, 2023 · 6351d97 · 6351d97
1 parent 4b7f3b5
commit 6351d97
Show file tree

Hide file tree

Showing 9 changed files with 514 additions and 457 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -36,7 +36,7 @@ jobs:
       run: python -m flake8 pdfplumber tests
 
     - name: Check type annotations via mypy
-      run: python -m mypy --strict pdfplumber
+      run: python -m mypy --strict --implicit-reexport pdfplumber
 
   test:
     needs: lint

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
+## [Unreleased]
+
+### Development Changes
+
+- Converted `utils.py` into `utils/` submodules. The public interface is unchanged; this is purely an organizational improvement.
+
 ## [0.7.6] - 2022-11-22
 
 ### Changed

diff --git a/Makefile b/Makefile
@@ -23,7 +23,7 @@ check-flake:
 	${PYTHON} -m flake8 pdfplumber tests
 
 check-mypy:
-	${PYTHON} -m mypy --strict pdfplumber
+	${PYTHON} -m mypy --strict --implicit-reexport pdfplumber
 
 lint: check-flake check-mypy check-black check-isort
 

diff --git a/pdfplumber/utils/__init__.py b/pdfplumber/utils/__init__.py
@@ -0,0 +1,45 @@
+from .clustering import cluster_list, cluster_objects, make_cluster_dict  # noqa: F401
+from .generic import to_list  # noqa: F401
+from .geometry import (  # noqa: F401
+    bbox_to_rect,
+    calculate_area,
+    clip_obj,
+    crop_to_bbox,
+    curve_to_edges,
+    filter_edges,
+    get_bbox_overlap,
+    intersects_bbox,
+    line_to_edge,
+    merge_bboxes,
+    move_object,
+    obj_to_bbox,
+    obj_to_edges,
+    objects_to_bbox,
+    objects_to_rect,
+    outside_bbox,
+    rect_to_edges,
+    resize_object,
+    snap_objects,
+    within_bbox,
+)
+from .pdfinternals import (  # noqa: F401
+    decode_psl_list,
+    decode_text,
+    resolve,
+    resolve_all,
+    resolve_and_decode,
+)
+from .text import (  # noqa: F401
+    DEFAULT_X_DENSITY,
+    DEFAULT_X_TOLERANCE,
+    DEFAULT_Y_DENSITY,
+    DEFAULT_Y_TOLERANCE,
+    LayoutEngine,
+    TextLayout,
+    WordExtractor,
+    chars_to_layout,
+    collate_line,
+    dedupe_chars,
+    extract_text,
+    extract_words,
+)
diff --git a/pdfplumber/utils/clustering.py b/pdfplumber/utils/clustering.py
@@ -0,0 +1,58 @@
+import itertools
+from collections.abc import Hashable
+from operator import itemgetter
+from typing import Callable, Dict, Iterable, List, TypeVar, Union
+
+from .._typing import T_num
+
+
+def cluster_list(xs: List[T_num], tolerance: T_num = 0) -> List[List[T_num]]:
+    """Group numbers into clusters of closely spaced neighbours.
+
+    The values are sorted in ascending order. A value joins the current
+    cluster when it is no more than `tolerance` above the value immediately
+    before it; otherwise it starts a new cluster. With a tolerance of zero,
+    or with fewer than two values, every value ends up in its own
+    single-item cluster.
+    """
+    ordered = sorted(xs)
+    if tolerance == 0 or len(xs) < 2:
+        return [[value] for value in ordered]
+
+    clusters = [[ordered[0]]]
+    # Compare each value against its immediate predecessor, not against the
+    # start of the cluster, so a cluster may span more than `tolerance`.
+    for previous, value in zip(ordered, ordered[1:]):
+        if value <= previous + tolerance:
+            clusters[-1].append(value)
+        else:
+            clusters.append([value])
+    return clusters
+
+
+def make_cluster_dict(values: Iterable[T_num], tolerance: T_num) -> Dict[T_num, int]:
+    """Map each distinct value to the index of the cluster that contains it.
+
+    Duplicates are removed first, the remaining values are clustered with
+    `cluster_list`, and each value is paired with the position of its
+    cluster in that result.
+    """
+    distinct_values = list(set(values))
+    clusters = cluster_list(distinct_values, tolerance)
+
+    return {
+        member: cluster_index
+        for cluster_index, cluster in enumerate(clusters)
+        for member in cluster
+    }
+
+
+R = TypeVar("R")
+
+
+def cluster_objects(
+    xs: List[R], key_fn: Union[Hashable, Callable[[R], T_num]], tolerance: T_num
+) -> List[List[R]]:
+    """Partition objects into groups whose key values cluster together.
+
+    `key_fn` is either a callable that returns a number for an object or,
+    when it is not callable, a key/index that is looked up on each object
+    via `operator.itemgetter`. The key values are clustered with
+    `make_cluster_dict`, and the objects are returned as one list per
+    cluster, ordered by cluster index.
+    """
+    if not callable(key_fn):
+        key_fn = itemgetter(key_fn)
+
+    cluster_index_by_value = make_cluster_dict(map(key_fn, xs), tolerance)
+
+    by_cluster_index = itemgetter(1)
+
+    # Tag every object with its cluster index; the sort is stable, so the
+    # objects keep their input order within a cluster.
+    tagged = [(obj, cluster_index_by_value.get(key_fn(obj))) for obj in xs]
+    tagged.sort(key=by_cluster_index)
+
+    return [
+        [obj for obj, _ in members]
+        for _, members in itertools.groupby(tagged, key=by_cluster_index)
+    ]
diff --git a/pdfplumber/utils/generic.py b/pdfplumber/utils/generic.py
@@ -0,0 +1,21 @@
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Any, Dict, List, Union
+
+from .._typing import T_seq
+
+if TYPE_CHECKING:  # pragma: nocover
+    from pandas.core.frame import DataFrame
+
+
+def to_list(collection: Union[T_seq[Any], "DataFrame"]) -> List[Any]:
+    """Return `collection` as a list.
+
+    A list is returned as-is, without copying. An object that is not a
+    `Sequence` but has a `to_dict` attribute is converted by calling
+    `to_dict("records")`. Everything else is passed to `list()`.
+    """
+    if isinstance(collection, list):
+        return collection
+
+    if not isinstance(collection, Sequence) and hasattr(collection, "to_dict"):
+        records: List[Dict[Union[str, int], Any]] = collection.to_dict(
+            "records"
+        )  # pragma: nocover
+        return records
+
+    return list(collection)