Skip to content

Commit

Permalink
feat: speed up dataset enumeration
Browse files Browse the repository at this point in the history
Rather than attempting to load every image, by default we now
only check whether the file extension is known to Pillow, the
image library. The enhanced validation (which checks that each
image can actually be loaded) is still available by passing
enhanced_validation=True to `enumerate_images()`.
  • Loading branch information
bencevans committed Jun 21, 2022
1 parent 5d0c265 commit 8410eea
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions camtrapml/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from pathlib import Path
from os import walk

from PIL import Image
from camtrapml.image.utils import is_image


Expand All @@ -24,14 +24,26 @@ def __init__(self, name: str, path: Path):
if not self.path.exists() or not self.path.is_dir():
raise ValueError(f"{self.path} is not a directory")

def enumerate_images(self):
def enumerate_images(self, enhanced_validation: bool = False):
"""
Enumerates all images in the dataset.

Walks ``self.path`` recursively and yields the path of each file
that passes the selected check.

By default a file is accepted when its lower-cased suffix is one of
the extensions Pillow has registered for a format listed in
``Image.OPEN``; the file itself is not opened in this mode.

When ``enhanced_validation`` is true, each file is instead passed to
``is_image`` (imported from ``camtrapml.image.utils``, not shown here;
presumably it tries to load the image, so expect it to be slower).

Yields one ``Path`` per accepted file.
"""

# Build the set of accepted extensions once, before walking the tree.
exts = Image.registered_extensions()
# Keep only extensions whose format appears in Image.OPEN.
supported_extensions = {ex.lower() for ex, f in exts.items() if f in Image.OPEN}

for root, _, files in walk(self.path):
for file in files:
if is_image(Path(root) / file):
yield Path(root) / file
file_path = Path(root) / file

if enhanced_validation:
# Opt-in path: delegate the decision to is_image.
if is_image(file_path):
yield file_path

else:
# Default path: suffix comparison only, case-insensitive.
if file_path.suffix.lower() in supported_extensions:
yield file_path


@staticmethod
def from_coco(source):
Expand Down

0 comments on commit 8410eea

Please sign in to comment.