From c099967fb0b9d361af4ed8363188a88f7810f6ac Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Fri, 14 Mar 2025 10:29:23 +0100 Subject: [PATCH 1/8] tar index as json file --- fsspec/implementations/tar.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py index 412e5ba4d..3e0410164 100644 --- a/fsspec/implementations/tar.py +++ b/fsspec/implementations/tar.py @@ -1,4 +1,6 @@ +import json import logging +import pathlib import tarfile import fsspec @@ -89,16 +91,23 @@ def __init__( self._index() def _index(self): - # TODO: load and set saved index, if exists - out = {} - for ti in self.tar: - info = ti.get_info() - info["type"] = typemap.get(info["type"], "file") - name = ti.get_info()["name"].rstrip("/") - out[name] = (info, ti.offset_data) - - self.index = out - # TODO: save index to self.index_store here, if set + if self.index_store is not None and pathlib(self.index_store).exists(): + # NOTE(PG): Not sure if JSON is the best way to go here, but it's + # simple and human-readable. + with self.index_store.open("r") as f: + self.index = json.load(f) + else: + out = {} + for ti in self.tar: + info = ti.get_info() + info["type"] = typemap.get(info["type"], "file") + name = ti.get_info()["name"].rstrip("/") + out[name] = (info, ti.offset_data) + + self.index = out + if self.index_store is not None: + with self.index_store.open("w") as f: + json.dump(out, f) def _get_dirs(self): if self.dir_cache is not None: From 8e0ff3cd88421bed3eee7b57b86dcf42fefa5aab Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Fri, 14 Mar 2025 10:41:28 +0100 Subject: [PATCH 2/8] fix: forgot Path in pathlib --- fsspec/implementations/tar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py index 3e0410164..b59ec9894 100644 --- a/fsspec/implementations/tar.py +++ b/fsspec/implementations/tar.py @@ -91,7 +91,7 @@ def __init__( self._index() def _index(self): - if self.index_store is not None and pathlib(self.index_store).exists(): + if self.index_store is not None and pathlib.Path(self.index_store).exists(): # NOTE(PG): Not sure if JSON is the best way to go here, but it's # simple and human-readable. with self.index_store.open("r") as f: From 28e27fbbef4bbd20e1b1b5b4c1aa496e56365615 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Fri, 14 Mar 2025 10:48:30 +0100 Subject: [PATCH 3/8] wip: forgot another pathlib, sorry... --- fsspec/implementations/tar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py index b59ec9894..8e9e541b5 100644 --- a/fsspec/implementations/tar.py +++ b/fsspec/implementations/tar.py @@ -106,7 +106,7 @@ def _index(self): self.index = out if self.index_store is not None: - with self.index_store.open("w") as f: + with pathlib.Path(self.index_store).open("w") as f: json.dump(out, f) def _get_dirs(self): From d7ea0a7ec1abafd8aa1682d16dbe60d69a4d748f Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Fri, 14 Mar 2025 10:53:16 +0100 Subject: [PATCH 4/8] wip(tar): support for booleans in index_store --- fsspec/implementations/tar.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py index 8e9e541b5..f9afb1509 100644 --- a/fsspec/implementations/tar.py +++ b/fsspec/implementations/tar.py @@ -86,7 +86,12 @@ def __init__( self.tar = tarfile.TarFile(fileobj=self.fo) self.dir_cache = None - self.index_store = index_store + if isinstance(index_store, (str, pathlib.Path)): + self.index_store = pathlib.Path(index_store) + elif isinstance(index_store, bool): + self.index_store = pathlib.Path(f"{name}.index.json") + else: + self.index_store = index_store self.index = None self._index() From afccb5ed530971447dc87157b804dc773722b5a5 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Fri, 14 Mar 2025 10:56:56 +0100 Subject: [PATCH 5/8] wip(tar): support for booleans in index_store, 2 --- fsspec/implementations/tar.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py index f9afb1509..768ba7367 100644 --- a/fsspec/implementations/tar.py +++ b/fsspec/implementations/tar.py @@ -88,7 +88,7 @@ def __init__( if isinstance(index_store, (str, pathlib.Path)): self.index_store = pathlib.Path(index_store) - elif isinstance(index_store, bool): + elif bool(index_store) is True: self.index_store = pathlib.Path(f"{name}.index.json") else: self.index_store = index_store @@ -96,7 +96,7 @@ def __init__( self._index() def _index(self): - if self.index_store is not None and pathlib.Path(self.index_store).exists(): + if self.index_store is not None and self.index_store.exists(): # NOTE(PG): Not sure if JSON is the best way to go here, but it's # simple and human-readable. with self.index_store.open("r") as f: @@ -111,7 +111,7 @@ def _index(self): self.index = out if self.index_store is not None: - with pathlib.Path(self.index_store).open("w") as f: + with self.index_store.open("w") as f: json.dump(out, f) def _get_dirs(self): From ea524c3e7fa7263fa77c54d4b92afdd627ce4b57 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Fri, 14 Mar 2025 11:22:15 +0100 Subject: [PATCH 6/8] feat: doesn't crash if you can't write the index --- fsspec/implementations/tar.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py index 768ba7367..f30039ae9 100644 --- a/fsspec/implementations/tar.py +++ b/fsspec/implementations/tar.py @@ -112,7 +112,10 @@ def _index(self): self.index = out if self.index_store is not None: with self.index_store.open("w") as f: - json.dump(out, f) + try: + json.dump(out, f) + except Exception as e: + logger.warning(f"Failed to write index: {e}") def _get_dirs(self): if self.dir_cache is not None: From e04b178bb75c829095d3a01fee95e6ac993c47f4 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Fri, 14 Mar 2025 11:23:39 +0100 Subject: [PATCH 7/8] wip: adds note about handling cached names, this isn't handled yet --- fsspec/implementations/tar.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py index f30039ae9..d84da49f0 100644 --- a/fsspec/implementations/tar.py +++ b/fsspec/implementations/tar.py @@ -89,6 +89,7 @@ def __init__( if isinstance(index_store, (str, pathlib.Path)): self.index_store = pathlib.Path(index_store) elif bool(index_store) is True: + # TODO: How to handle a hashed filename from FileCache? self.index_store = pathlib.Path(f"{name}.index.json") else: self.index_store = index_store From b8173bc8c4da27f478a3ff5aa46eb45a09d37157 Mon Sep 17 00:00:00 2001 From: Paul Gierz Date: Mon, 17 Mar 2025 13:48:09 +0100 Subject: [PATCH 8/8] feat(tar.py): better usage of index for dircache --- fsspec/implementations/tar.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fsspec/implementations/tar.py b/fsspec/implementations/tar.py index d84da49f0..147db28ca 100644 --- a/fsspec/implementations/tar.py +++ b/fsspec/implementations/tar.py @@ -100,14 +100,16 @@ def _index(self): if self.index_store is not None and self.index_store.exists(): # NOTE(PG): Not sure if JSON is the best way to go here, but it's # simple and human-readable. + logger.debug(f"Reloading from {self.index_store}") with self.index_store.open("r") as f: self.index = json.load(f) else: + logger.debug(f"Populating {self.index_store}") out = {} for ti in self.tar: info = ti.get_info() info["type"] = typemap.get(info["type"], "file") - name = ti.get_info()["name"].rstrip("/") + info["name"] = name = info["name"].rstrip("/") out[name] = (info, ti.offset_data) self.index = out @@ -125,13 +127,10 @@ def _get_dirs(self): # This enables ls to get directories as children as well as files self.dir_cache = { dirname: {"name": dirname, "size": 0, "type": "directory"} - for dirname in self._all_dirnames(self.tar.getnames()) + for dirname in self._all_dirnames(self.index.keys()) } - for member in self.tar.getmembers(): - info = member.get_info() - info["name"] = info["name"].rstrip("/") - info["type"] = typemap.get(info["type"], "file") - self.dir_cache[info["name"]] = info + for name, (info, _) in self.index.items(): + self.dir_cache[name] = info def _open(self, path, mode="rb", **kwargs): if mode != "rb":