Skip to content

Commit

Permalink
Reorganize .utils
Browse files Browse the repository at this point in the history
  • Loading branch information
jsvine committed Jan 24, 2023
1 parent 4b7f3b5 commit 6351d97
Show file tree
Hide file tree
Showing 9 changed files with 514 additions and 457 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
run: python -m flake8 pdfplumber tests

- name: Check type annotations via mypy
run: python -m mypy --strict pdfplumber
run: python -m mypy --strict --implicit-reexport pdfplumber

test:
needs: lint
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/).

## [Unreleased]

### Development Changes

- Converted `utils.py` into a `utils/` package of submodules. The public interface is unchanged; this is purely an organizational improvement.

## [0.7.6] - 2022-11-22

### Changed
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ check-flake:
${PYTHON} -m flake8 pdfplumber tests

check-mypy:
${PYTHON} -m mypy --strict pdfplumber
${PYTHON} -m mypy --strict --implicit-reexport pdfplumber

lint: check-flake check-mypy check-black check-isort

Expand Down
45 changes: 45 additions & 0 deletions pdfplumber/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from .clustering import cluster_list, cluster_objects, make_cluster_dict # noqa: F401
from .generic import to_list # noqa: F401
from .geometry import ( # noqa: F401
bbox_to_rect,
calculate_area,
clip_obj,
crop_to_bbox,
curve_to_edges,
filter_edges,
get_bbox_overlap,
intersects_bbox,
line_to_edge,
merge_bboxes,
move_object,
obj_to_bbox,
obj_to_edges,
objects_to_bbox,
objects_to_rect,
outside_bbox,
rect_to_edges,
resize_object,
snap_objects,
within_bbox,
)
from .pdfinternals import ( # noqa: F401
decode_psl_list,
decode_text,
resolve,
resolve_all,
resolve_and_decode,
)
from .text import ( # noqa: F401
DEFAULT_X_DENSITY,
DEFAULT_X_TOLERANCE,
DEFAULT_Y_DENSITY,
DEFAULT_Y_TOLERANCE,
LayoutEngine,
TextLayout,
WordExtractor,
chars_to_layout,
collate_line,
dedupe_chars,
extract_text,
extract_words,
)
58 changes: 58 additions & 0 deletions pdfplumber/utils/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import itertools
from collections.abc import Hashable
from operator import itemgetter
from typing import Callable, Dict, Iterable, List, TypeVar, Union

from .._typing import T_num


def cluster_list(xs: List[T_num], tolerance: T_num = 0) -> List[List[T_num]]:
    """Group numbers into clusters of neighbouring values.

    The values are sorted, then walked in order: a value joins the current
    cluster when it is no more than ``tolerance`` above the value just before
    it, and otherwise starts a new cluster. Because each value is compared
    with its immediate predecessor (not with the first member), a cluster may
    span more than ``tolerance`` overall.

    With a ``tolerance`` of 0, or fewer than two values, every value is
    returned as its own single-element cluster.
    """
    # Trivial cases: nothing can merge, so each value stands alone.
    if tolerance == 0 or len(xs) < 2:
        return [[value] for value in sorted(xs)]

    ordered = sorted(xs)
    previous = ordered[0]
    cluster = [previous]
    clusters = []
    for value in ordered[1:]:
        if value <= (previous + tolerance):
            cluster.append(value)
        else:
            # Gap is too wide: close the running cluster and open a new one.
            clusters.append(cluster)
            cluster = [value]
        previous = value
    # The final cluster is still open once the loop ends.
    clusters.append(cluster)
    return clusters


def make_cluster_dict(values: Iterable[T_num], tolerance: T_num) -> Dict[T_num, int]:
    """Map each distinct value to the index of the cluster it belongs to.

    Duplicates are removed from ``values`` before clustering with
    ``cluster_list``; the index assigned to a value is the position of its
    cluster in the list that ``cluster_list`` returns.
    """
    distinct = list(set(values))
    clusters = cluster_list(distinct, tolerance)

    return {
        member: index
        for index, members in enumerate(clusters)
        for member in members
    }


R = TypeVar("R")


def cluster_objects(
    xs: List[R], key_fn: Union[Hashable, Callable[[R], T_num]], tolerance: T_num
) -> List[List[R]]:
    """Group objects whose key values fall into the same cluster.

    ``key_fn`` is either a callable that extracts a number from an object, or
    a non-callable key that is looked up on each object via ``itemgetter``.
    The extracted values are clustered with ``make_cluster_dict`` using
    ``tolerance``, and the objects are returned grouped by cluster index in
    ascending order. Objects within a group keep their relative order from
    ``xs`` because the sort is stable.
    """
    if not callable(key_fn):
        key_fn = itemgetter(key_fn)

    cluster_dict = make_cluster_dict(map(key_fn, xs), tolerance)

    # Pair every object with its cluster index, then order the pairs by index
    # so that groupby sees each cluster as one contiguous run.
    by_cluster = itemgetter(1)
    tagged = [(obj, cluster_dict.get(key_fn(obj))) for obj in xs]
    tagged.sort(key=by_cluster)

    return [
        [obj for obj, _ in members]
        for _, members in itertools.groupby(tagged, key=by_cluster)
    ]
21 changes: 21 additions & 0 deletions pdfplumber/utils/generic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from collections.abc import Sequence
from typing import TYPE_CHECKING, Any, Dict, List, Union

from .._typing import T_seq

if TYPE_CHECKING: # pragma: nocover
from pandas.core.frame import DataFrame


def to_list(collection: Union[T_seq[Any], "DataFrame"]) -> List[Any]:
    """Return ``collection`` as a list.

    - A ``list`` is returned as-is (the same object, not a copy).
    - Any other ``Sequence`` is copied into a new list.
    - An object exposing ``to_dict`` is converted by calling
      ``to_dict("records")`` and returning that result.
    - Anything else is passed to ``list()``.
    """
    if isinstance(collection, list):
        return collection

    if isinstance(collection, Sequence):
        return list(collection)

    if hasattr(collection, "to_dict"):
        # The annotated local gives the return statement a concrete type.
        records: List[Dict[Union[str, int], Any]] = collection.to_dict(
            "records"
        )  # pragma: nocover
        return records

    return list(collection)
Loading

0 comments on commit 6351d97

Please sign in to comment.