Skip to content

Use lsjson for searches #551

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions datashuttle/configs/config_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,10 @@ def get_base_folder(

return base_folder

def get_rclone_config_name(
def get_rclone_config_name_central(
self, connection_method: Optional[str] = None
) -> str:
"""Generate the rclone configuration name for the project.
"""Generate the rclone configuration name for the central project.

These configs are created by datashuttle but managed and stored by rclone.
"""
Expand All @@ -218,6 +218,10 @@ def get_rclone_config_name(

return f"central_{self.project_name}_{connection_method}"

def get_rclone_config_name_central_local(self):
"""Generate the rclone configuration name for the local project."""
return f"local_{self.project_name}_local_filesystem"

def make_rclone_transfer_options(
self, overwrite_existing_files: OverwriteExistingFiles, dry_run: bool
) -> Dict:
Expand Down
4 changes: 2 additions & 2 deletions datashuttle/datashuttle_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -1399,14 +1399,14 @@ def _make_project_metadata_if_does_not_exist(self) -> None:
def _setup_rclone_central_ssh_config(self, log: bool) -> None:
rclone.setup_rclone_config_for_ssh(
self.cfg,
self.cfg.get_rclone_config_name("ssh"),
self.cfg.get_rclone_config_name_central("ssh"),
self.cfg.ssh_key_path,
log=log,
)

def _setup_rclone_central_local_filesystem_config(self) -> None:
rclone.setup_rclone_config_for_local_filesystem(
self.cfg.get_rclone_config_name("local_filesystem"),
self.cfg.get_rclone_config_name_central("local_filesystem"),
)

# Persistent settings
Expand Down
124 changes: 76 additions & 48 deletions datashuttle/utils/folders.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
from datashuttle.configs.config_class import Configs
from datashuttle.utils.custom_types import TopLevelFolder

import glob
import fnmatch
import json
from pathlib import Path

from datashuttle.configs import canonical_folders, canonical_tags
from datashuttle.utils import ssh, utils, validation
from datashuttle.utils import utils, validation
from datashuttle.utils.custom_exceptions import NeuroBlueprintError

# -----------------------------------------------------------------------------
Expand Down Expand Up @@ -598,67 +599,94 @@ def search_for_folders(
Discovered folders (`all_folder_names`) and files (`all_filenames`).

"""
if local_or_central == "central" and cfg["connection_method"] == "ssh":
all_folder_names, all_filenames = ssh.search_ssh_central_for_folders(
search_path,
search_prefix,
cfg,
verbose,
return_full_path,
)
if (
local_or_central == "local"
or cfg["connection_method"] == "local_filesystem"
) and not search_path.exists():
if verbose:
utils.log_and_message(f"No file found at {search_path.as_posix()}")
return [], []

if local_or_central == "local":
rclone_config_name = None
else:
if not search_path.exists():
if verbose:
utils.log_and_message(
f"No file found at {search_path.as_posix()}"
)
return [], []

all_folder_names, all_filenames = search_filesystem_path_for_folders(
search_path / search_prefix, return_full_path
rclone_config_name = cfg.get_rclone_config_name_central(
cfg["connection_method"]
)

all_folder_names, all_filenames = search_local_or_remote(
search_path,
search_prefix,
rclone_config_name,
return_full_path,
)

return all_folder_names, all_filenames


# Actual function implementation
def search_filesystem_path_for_folders(
search_path_with_prefix: Path, return_full_path: bool = False
) -> Tuple[List[Path | str], List[Path | str]]:
r"""Search a folder through the local filesystem.
def search_local_or_remote(
search_path: Path,
search_prefix: str,
rclone_config_name: str | None,
return_full_path: bool = False,
) -> Tuple[List[Any], List[Any]]:
"""Search for files and folders in central path using `rclone lsjson` command.

Use glob to search the full search path (including prefix) with glob.
Files are filtered out of results, returning folders only.
`rclone lsjson` lists all the files and folders in the search path (local or central) in JSON format.
The JSON contains an entry for each file/folder with info such as its name and whether it is a directory.

Parameters
----------
search_path_with_prefix
Path to search along with search prefix e.g. "C:\drive\project\sub-*"

search_path
The path to search (relative to the local or remote drive). For example,
for "local_filesystem" this is the path on the local machine. For "ssh", this
is the path on the machine that has been connected to.
search_prefix
The search string e.g. "sub-*".
rclone_config_name
Name of the rclone config for the remote (not set for local). `rclone config`
can be used in the terminal to see how rclone has stored these. In datashuttle,
these are managed by `Configs`.
return_full_path
If `True` returns the path to the discovered folder or file,
otherwise just the name.

Returns
-------
Discovered folders (`all_folder_names`) and files (`all_filenames`).
If `True`, return the full filepath, otherwise return only the folder/file name.

"""
all_folder_names = []
all_filenames = []
from datashuttle.utils import (
rclone, # imported here to avoid circular reference
)

all_files_and_folders = list(glob.glob(search_path_with_prefix.as_posix()))
sorter_files_and_folders = sorted(all_files_and_folders)
config_prefix = "" if not rclone_config_name else f"{rclone_config_name}:"

for file_or_folder_str in sorter_files_and_folders:
file_or_folder = Path(file_or_folder_str)
output = rclone.call_rclone(
f'lsjson {config_prefix}"{search_path.as_posix()}"',
pipe_std=True,
)

if file_or_folder.is_dir():
all_folder_names.append(
file_or_folder if return_full_path else file_or_folder.name
)
all_folder_names: List[str] = []
all_filenames: List[str] = []

if output.returncode != 0:
utils.log_and_message(
f"Error searching files at {search_path.as_posix()}\n"
f"{output.stderr.decode('utf-8') if output.stderr else ''}"
)
return all_folder_names, all_filenames

files_and_folders = json.loads(output.stdout)

for file_or_folder in files_and_folders:
name = file_or_folder["Name"]

if not fnmatch.fnmatch(name, search_prefix):
continue

is_dir = file_or_folder.get("IsDir", False)

to_append = search_path / name if return_full_path else name

if is_dir:
all_folder_names.append(to_append)
else:
all_filenames.append(
file_or_folder if return_full_path else file_or_folder.name
)
all_filenames.append(to_append)

return all_folder_names, all_filenames
8 changes: 7 additions & 1 deletion datashuttle/utils/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,13 @@ def check_and_format_names(
names_to_format, reserved_keywords = [], []
for name in names:
if name in canonical_reserved_keywords() or tags("*") in name:
reserved_keywords.append(name)
if tags("to") in name:
# handle an edge case where the user searches with both the wildcard and range ("to") tags
reserved_keywords += update_names_with_range_to_flag(
[name], prefix
)
else:
reserved_keywords.append(name)
else:
names_to_format.append(name)

Expand Down
6 changes: 3 additions & 3 deletions datashuttle/utils/getters.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,9 @@ def get_existing_project_paths() -> List[Path]:
"""
datashuttle_path = canonical_folders.get_datashuttle_path()

all_folders, _ = folders.search_filesystem_path_for_folders(
datashuttle_path / "*"
)
all_folders = [
path_ for path_ in datashuttle_path.glob("*") if path_.is_dir()
]

existing_project_paths = []
for folder_name in all_folders:
Expand Down
10 changes: 5 additions & 5 deletions datashuttle/utils/rclone.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def setup_rclone_config_for_local_filesystem(
----------
rclone_config_name
canonical config name, generated by
datashuttle.cfg.get_rclone_config_name()
datashuttle.cfg.get_rclone_config_name_central()

log
whether to log, if True logger must already be initialised.
Expand Down Expand Up @@ -146,7 +146,7 @@ def setup_rclone_config_for_ssh(

rclone_config_name
canonical config name, generated by
datashuttle.cfg.get_rclone_config_name()
datashuttle.cfg.get_rclone_config_name_central()

ssh_key_path
path to the ssh key used for connecting to
Expand Down Expand Up @@ -252,14 +252,14 @@ def transfer_data(
if upload_or_download == "upload":
output = call_rclone_through_script(
f"{rclone_args('copy')} "
f'"{local_filepath}" "{cfg.get_rclone_config_name()}:'
f'"{local_filepath}" "{cfg.get_rclone_config_name_central()}:'
f'{central_filepath}" {extra_arguments}',
)

elif upload_or_download == "download":
output = call_rclone_through_script(
f"{rclone_args('copy')} "
f'"{cfg.get_rclone_config_name()}:'
f'"{cfg.get_rclone_config_name_central()}:'
f'{central_filepath}" "{local_filepath}" {extra_arguments}',
)

Expand Down Expand Up @@ -364,7 +364,7 @@ def perform_rclone_check(
output = call_rclone(
f"{rclone_args('check')} "
f'"{local_filepath}" '
f'"{cfg.get_rclone_config_name()}:{central_filepath}"'
f'"{cfg.get_rclone_config_name_central()}:{central_filepath}"'
f" --combined -",
pipe_std=True,
)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def setup_project_default_configs(

rclone.setup_rclone_config_for_ssh(
project.cfg,
project.cfg.get_rclone_config_name("ssh"),
project.cfg.get_rclone_config_name_central("ssh"),
project.cfg.ssh_key_path,
)

Expand Down
Loading
Loading