ShellJob: Add support for FolderData in nodes input

The `nodes` input accepts any `Data` type that implements a `value` property returning a value that can be cast to a string. In addition, a special case was added for `SinglefileData` nodes, which allows users to have files written to the working directory. Here the logic is extended to also allow `FolderData` nodes. By default all contents are written to the root of the working directory. The `filenames` input namespace can be used to write the contents to a subdirectory instead. For now it is not possible to write to nested directories through this interface. However, in this case the user can create the `FolderData` with the desired target hierarchy.
sphuber · Jun 12, 2023 · 9587c33 · 9587c33
1 parent ad2ec0a
commit 9587c33
Show file tree

Hide file tree

Showing 3 changed files with 155 additions and 4 deletions.
diff --git a/docs/source/howto.rst b/docs/source/howto.rst
@@ -114,6 +114,91 @@ Any parent directories in the filepath, for example ``some/nested/path`` in the
 The output filename can be anything except for ``stdout``, ``stderr`` and ``status``, which are reserved filenames.
 
 
+Running a shell command with folders as arguments
+=================================================
+
+Certain commands might require the presence of a folder of files in the working directory.
+Just like a file is modeled in AiiDA's provenance graph by a ``SinglefileData`` node, a folder is represented by a ``FolderData`` node.
+The following example shows how a ``FolderData`` can be created to contain multiple files and how it can be passed to ``launch_shell_job`` using the ``nodes`` argument:
+
+.. code-block:: python
+
+    import pathlib
+    import tempfile
+    from aiida.orm import FolderData
+    from aiida_shell import launch_shell_job
+
+    # First create a ``FolderData`` node with some arbitrary files
+    with tempfile.TemporaryDirectory() as tmpdir:
+        dirpath = pathlib.Path(tmpdir)
+        (dirpath / 'file_a.txt').write_text('content a')
+        (dirpath / 'file_b.txt').write_text('content b')
+        folder_data = FolderData(tree=dirpath.absolute())
+
+    results, node = launch_shell_job(
+        'ls',
+        nodes={
+            'directory': folder_data,
+        }
+    )
+    print(results['stdout'].get_content())
+
+which prints:
+
+.. code-block:: console
+
+    _aiidasubmit.sh
+    file_a.txt
+    file_b.txt
+    _scheduler-stderr.txt
+    _scheduler-stdout.txt
+    stderr
+    stdout
+
+The contents of the ``folder_data`` node, the ``file_a.txt`` and ``file_b.txt`` files, were copied to the working directory.
+
+Note that by default, the contents of the ``FolderData`` are copied to the root of the working directory, as shown in the example above.
+If the contents should be written to a directory inside the working directory, use the ``filenames`` argument, as is done for copying ``SinglefileData`` nodes.
+Take for example the ``zip`` command, which can create a zip archive from one or more files and folders.
+
+.. code-block:: python
+
+    import pathlib
+    import tempfile
+    from aiida.orm import FolderData
+    from aiida_shell import launch_shell_job
+
+    # First create a ``FolderData`` node with some arbitrary files
+    with tempfile.TemporaryDirectory() as tmpdir:
+        dirpath = pathlib.Path(tmpdir)
+        (dirpath / 'file_a.txt').write_text('content a')
+        (dirpath / 'file_b.txt').write_text('content b')
+        folder_data = FolderData(tree=dirpath.absolute())
+
+    results, node = launch_shell_job(
+        'zip',
+        arguments='-r archive.zip {folder}',
+        outputs=['archive.zip'],
+        nodes={
+            'folder': folder_data,
+        },
+        filenames={
+            'folder': 'directory'
+        }
+    )
+
+In this example, the contents of the ``folder_data`` node were copied to the ``directory`` folder in the working directory.
+The ``results`` dictionary contains the ``archive_zip`` output which is a ``SinglefileData`` node containing the zip archive.
+It can be unzipped as follows: ``verdi node repo cat <IDENTIFIER> | unzip``, where ``<IDENTIFIER>`` should be replaced with the pk or UUID of the ``archive_zip`` node.
+The original files ``file_a.txt`` and ``file_b.txt`` are now written to the ``directory`` folder inside the current working directory, since the archive stores them under that relative path.
+
+.. note::
+
+    A ``FolderData`` node that is specified in the ``nodes`` input is not required to have a corresponding placeholder in the ``arguments``.
+    Just as with ``SinglefileData`` input nodes, if there is no corresponding placeholder, the contents of the folder are simply written to the working directory where the shell command is executed.
+    This is useful for commands that expect a folder to be present in the working directory but whose name is not explicitly defined through a command line argument.
+
+
 Passing other ``Data`` types as input
 =====================================
 

diff --git a/src/aiida_shell/calculations/shell.py b/src/aiida_shell/calculations/shell.py
@@ -9,7 +9,7 @@
 from aiida.common.datastructures import CalcInfo, CodeInfo
 from aiida.common.folders import Folder
 from aiida.engine import CalcJob, CalcJobProcessSpec
-from aiida.orm import Data, Dict, List, SinglefileData, to_aiida_type
+from aiida.orm import Data, Dict, FolderData, List, SinglefileData, to_aiida_type
 
 from aiida_shell.data import PickledData
 
@@ -133,7 +133,7 @@ def validate_nodes(cls, value: t.Mapping[str, Data], _) -> str | None:
         """Validate the ``nodes`` input."""
         for key, node in value.items():
 
-            if isinstance(node, SinglefileData):
+            if isinstance(node, (FolderData, SinglefileData)):
                 continue
 
             try:
@@ -289,15 +289,23 @@ def process_arguments_and_nodes(
             if isinstance(node, SinglefileData):
                 filename = self.write_single_file_data(dirpath, node, placeholder, filenames)
                 argument_interpolated = argument.format(**{placeholder: filename})
+            elif isinstance(node, FolderData):
+                filename = self.write_folder_data(dirpath, node, placeholder, filenames)
+                argument_interpolated = argument.format(**{placeholder: filename})
             else:
                 argument_interpolated = argument.format(**{placeholder: str(node.value)})
 
             processed_nodes.append(placeholder)
             processed_arguments.append(argument_interpolated)
 
         for key, node in nodes.items():
-            if key not in processed_nodes and isinstance(node, SinglefileData):
+            if key in processed_nodes:
+                continue
+
+            if isinstance(node, SinglefileData):
                 self.write_single_file_data(dirpath, node, key, filenames)
+            elif isinstance(node, FolderData):
+                self.write_folder_data(dirpath, node, key, filenames)
 
         return processed_arguments
 
@@ -321,3 +329,25 @@ def write_single_file_data(dirpath: pathlib.Path, node: SinglefileData, key: str
             filepath.write_bytes(handle.read())
 
         return filename
+
+    @staticmethod
+    def write_folder_data(dirpath: pathlib.Path, node: FolderData, key: str, filenames: dict[str, str]) -> str:
+        """Write the content of a ``FolderData`` node to ``dirpath``.
+
+        :param dirpath: A temporary folder on the local file system.
+        :param node: The node whose content to write.
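+        :param key: The key of the node, used to look up an explicit target directory in ``filenames``.
+        :param filenames: Mapping of keys to the subdirectory of ``dirpath`` to write to; root of ``dirpath`` if absent.
+        :returns: The entry of ``filenames`` for ``key`` if it is defined, otherwise ``key`` itself.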
+        """
+        if key in filenames:
+            filename = filenames[key]
+            filepath = dirpath / filename
+        else:
+            filename = key
+            filepath = dirpath
+
+        filepath.parent.mkdir(parents=True, exist_ok=True)
+        node.base.repository.copy_tree(filepath)
+
+        return filename
diff --git a/tests/calculations/test_shell.py b/tests/calculations/test_shell.py
@@ -5,7 +5,7 @@
 import pathlib
 
 from aiida.common.datastructures import CodeInfo
-from aiida.orm import Data, Float, Int, List, SinglefileData, Str
+from aiida.orm import Data, Float, FolderData, Int, List, SinglefileData, Str
 import pytest
 
 from aiida_shell.calculations.shell import ShellJob
@@ -44,6 +44,42 @@ def test_nodes_single_file_data(generate_calc_job, generate_code):
     assert sorted([p.name for p in dirpath.iterdir()]) == ['xa', 'xb']
 
 
+def test_nodes_folder_data(generate_calc_job, generate_code, tmp_path):
+    """Test the ``nodes`` input with ``FolderData`` nodes ."""
+    (tmp_path / 'file_a.txt').write_text('content a')
+    (tmp_path / 'file_b.txt').write_text('content b')
+
+    folder_flat = FolderData(tree=tmp_path.absolute())
+    folder_nested = FolderData()
+    folder_nested.base.repository.put_object_from_tree(tmp_path.absolute(), 'dir')
+    inputs = {
+        'code': generate_code(),
+        'arguments': ['{nested}', '{nested_explicit}'],
+        'nodes': {
+            'flat': folder_flat,
+            'nested': folder_nested,
+            'flat_explicit': folder_flat,
+            'nested_explicit': folder_nested,
+        },
+        'filenames': {
+            'flat_explicit': 'sub',
+            'nested_explicit': 'sub'
+        }
+    }
+    dirpath, calc_info = generate_calc_job('core.shell', inputs)
+    code_info = calc_info.codes_info[0]
+
+    assert code_info.cmdline_params == ['nested', 'sub']
+    assert code_info.stdout_name == ShellJob.FILENAME_STDOUT
+    assert calc_info.retrieve_temporary_list == ShellJob.DEFAULT_RETRIEVED_TEMPORARY
+    assert sorted([p.name for p in dirpath.iterdir()]) == ['dir', 'file_a.txt', 'file_b.txt', 'sub']
+    assert sorted([p.name for p in (dirpath / 'dir').iterdir()]) == ['file_a.txt', 'file_b.txt']
+    assert sorted([p.name for p in (dirpath / 'sub').iterdir()]) == ['dir', 'file_a.txt', 'file_b.txt']
+    assert sorted([p.name for p in (dirpath / 'sub' / 'dir').iterdir()]) == ['file_a.txt', 'file_b.txt']
+    assert (dirpath / 'file_a.txt').read_text() == 'content a'
+    assert (dirpath / 'file_b.txt').read_text() == 'content b'
+
+
 def test_nodes_base_types(generate_calc_job, generate_code):
     """Test the ``nodes`` input with ``BaseType`` nodes ."""
     inputs = {