Skip to content

Commit

Permalink
Add relative param to .crop & err on invalid boxes
Browse files Browse the repository at this point in the history
Addresses #245

Also adds `relative` param to .within_bbox, and adds a new
utils.calculate_area(bbox) method.
  • Loading branch information
jsvine committed Aug 14, 2020
1 parent 6c9f8db commit 047ad34
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 9 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d

| Method | Description |
|--------|-------------|
|`.crop(bounding_box)`| Returns a version of the page cropped to the bounding box, which should be expressed as 4-tuple with the values `(x0, top, x1, bottom)`. Cropped pages retain objects that fall at least partly within the bounding box. If an object falls only partly within the box, its dimensions are sliced to fit the bounding box.|
|`.within_bbox(bounding_box)`| Similar to `.crop`, but only retains objects that fall *entirely* within the bounding box.|
|`.crop(bounding_box, relative=False)`| Returns a version of the page cropped to the bounding box, which should be expressed as a 4-tuple with the values `(x0, top, x1, bottom)`. Cropped pages retain objects that fall at least partly within the bounding box. If an object falls only partly within the box, its dimensions are sliced to fit the bounding box. If `relative=True`, the bounding box is interpreted as an offset from the top-left of the page's bounding box, rather than as absolute coordinates.|
|`.within_bbox(bounding_box, relative=False)`| Similar to `.crop`, but only retains objects that fall *entirely* within the bounding box.|
|`.filter(test_function)`| Returns a version of the page with only the `.objects` for which `test_function(obj)` returns `True`.|
|`.extract_text(x_tolerance=3, y_tolerance=3)`| Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`.|
|`.extract_words(x_tolerance=3, y_tolerance=3, horizontal_ltr=True, vertical_ttb=True)`| Returns a list of all word-looking things and their bounding boxes. Words are considered to be sequences of characters where (for "upright" characters) the difference between the `x1` of one character and the `x0` of the next is less than or equal to `x_tolerance` *and* where the `doctop` of one character and the `doctop` of the next is less than or equal to `y_tolerance`. A similar approach is taken for non-upright characters, but instead measuring the vertical, rather than horizontal, distances between them. The parameters `horizontal_ltr` and `vertical_ttb` indicate whether the words should be read from left-to-right (for horizontal words) / top-to-bottom (for vertical words).|
Expand Down
33 changes: 27 additions & 6 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,14 +236,14 @@ def extract_text(self, **kwargs):
def extract_words(self, **kwargs):
    """
    Extract words from this page's characters.

    All keyword arguments are forwarded unchanged to utils.extract_words,
    which receives self.chars as its input.
    """
    words = utils.extract_words(self.chars, **kwargs)
    return words

def crop(self, bbox, relative=False):
    """
    Return a CroppedPage for this page, limited to bbox.

    bbox is passed through self.decimalize before being handed to
    CroppedPage. When relative is True, CroppedPage treats the box as an
    offset from the top-left corner of this page's own bounding box instead
    of as absolute coordinates.
    """
    decimal_bbox = self.decimalize(bbox)
    return CroppedPage(self, decimal_bbox, relative=relative)

def within_bbox(self, bbox, relative=False):
    """
    Same as .crop, except only includes objects fully within the bbox.

    The only difference from .crop in this method is that
    utils.within_bbox is supplied as the CroppedPage crop function.
    """
    decimal_bbox = self.decimalize(bbox)
    return CroppedPage(
        self,
        decimal_bbox,
        relative=relative,
        crop_fn=utils.within_bbox,
    )

def filter(self, test_function):
    """
    Return a FilteredPage built from this page and test_function.
    """
    filtered_page = FilteredPage(self, test_function)
    return filtered_page
Expand Down Expand Up @@ -280,9 +280,30 @@ def __init__(self, parent_page):
self.root_page = parent_page.root_page


def test_proposed_bbox(bbox, parent_bbox):
    """
    Validate a proposed crop box against the parent page's bounding box.

    Both arguments are 4-tuples of (x0, top, x1, bottom). Returns None when
    the box is acceptable.

    Raises ValueError when the box has zero area, does not overlap the
    parent box at all, or is only partly inside the parent box.
    utils.calculate_area additionally raises ValueError for a box with a
    negative width or height.
    """
    bbox_area = utils.calculate_area(bbox)
    if bbox_area == 0:
        raise ValueError(f"Bounding box {bbox} has an area of zero.")

    overlap = utils.get_bbox_overlap(bbox, parent_bbox)
    # get_bbox_overlap can return None (presumably when the two boxes do not
    # intersect). Passing None on to calculate_area would fail with a
    # TypeError while unpacking, so report it as the ValueError callers expect.
    if overlap is None:
        raise ValueError(
            f"Bounding box {bbox} is entirely outside "
            f"parent page bounding box {parent_bbox}"
        )

    overlap_area = utils.calculate_area(overlap)
    if overlap_area < bbox_area:
        raise ValueError(
            f"Bounding box {bbox} is not fully within "
            f"parent page bounding box {parent_bbox}"
        )

class CroppedPage(DerivedPage):
def __init__(
    self, parent_page, bbox, crop_fn=utils.crop_to_bbox, relative=False
):
    """
    Create a cropped view of parent_page.

    bbox is a 4-tuple of (x0, top, x1, bottom). When relative is True it is
    interpreted as an offset from the top-left corner of parent_page.bbox
    and converted to absolute coordinates; otherwise it is stored as given.
    crop_fn is stored on the instance as self.crop_fn.

    Raises ValueError (via test_proposed_bbox) when the resulting box is
    not a valid crop of the parent page.
    """
    if relative:
        # Shift both corners by the parent's top-left corner so that
        # self.bbox is always expressed in absolute coordinates.
        o_x0, o_top, _, _ = parent_page.bbox
        x0, top, x1, bottom = bbox
        self.bbox = (x0 + o_x0, top + o_top, x1 + o_x0, bottom + o_top)
    else:
        self.bbox = bbox

    # Validate before calling the parent constructor, so an invalid box
    # fails fast with a ValueError.
    test_proposed_bbox(self.bbox, parent_page.bbox)
    self.crop_fn = crop_fn
    super().__init__(parent_page)

Expand Down
5 changes: 5 additions & 0 deletions pdfplumber/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,11 @@ def get_bbox_overlap(a, b):
else:
return None

def calculate_area(bbox):
    """
    Return the area of a bounding box given as (left, top, right, bottom).

    Raises ValueError when the right edge lies before the left edge or the
    bottom edge lies before the top edge.
    """
    x0, y0, x1, y1 = bbox
    if x1 < x0 or y1 < y0:
        raise ValueError(f"{bbox} has a negative width or height.")
    width = x1 - x0
    height = y1 - y0
    return width * height

def clip_obj(obj, bbox):
bbox = decimalize(bbox)
Expand Down
28 changes: 28 additions & 0 deletions tests/test_basics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python
import unittest
import pytest
import pandas as pd
import pdfplumber
import sys, os
Expand Down Expand Up @@ -72,6 +73,33 @@ def test(obj):
assert id(filtered.chars) == id(filtered._objects["char"])
assert len(filtered.rects) == 0

def test_relative_crop(self):
    """
    A relative crop of an already-cropped page is offset from the cropped
    page's top-left corner, for both .crop and .within_bbox.
    """
    first_page = self.pdf.pages[0]
    outer = first_page.crop((10, 10, 40, 40))
    relative_box = (10, 15, 20, 25)

    via_crop = outer.crop(relative_box, relative=True)
    # The outer crop starts at (10, 10), so the relative box shifts by that.
    expected = pdfplumber.utils.decimalize((20, 25, 30, 35))
    assert via_crop.bbox == expected

    via_within = outer.within_bbox(relative_box, relative=True)
    assert via_within.bbox == expected

def test_invalid_crops(self):
    """
    Every invalid crop box must be rejected with a ValueError.
    """
    page = self.pdf.pages[0]
    invalid_boxes = [
        (0, 0, 0, 0),  # zero area
        (0, 0, 10000, 10),  # right edge at x=10000
        (-10, 0, 10, 10),  # left edge at a negative x
        (100, 0, 0, 100),  # x0 greater than x1
        (0, 100, 100, 0),  # top greater than bottom
    ]
    for box in invalid_boxes:
        with pytest.raises(ValueError):
            page.crop(box)

def test_rotation(self):
assert(self.pdf.pages[0].width == 1008)
assert(self.pdf.pages[0].height == 612)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def test_extract_text(self):
])

assert text == goal
assert self.pdf.pages[0].crop((0, 0, 0, 0)).extract_text() == None
assert self.pdf.pages[0].crop((0, 0, 1, 1)).extract_text() == None

def test_resize_object(self):
obj = {
Expand Down

0 comments on commit 047ad34

Please sign in to comment.