From 4f481e271a39793e13ced1443a5a866631a52e5e Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Fri, 11 Nov 2022 12:13:58 +0100 Subject: [PATCH 01/16] Add aiolancium dependency --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f4205726..7fdfbaf7 100644 --- a/setup.py +++ b/setup.py @@ -98,6 +98,7 @@ def get_cryptography_version(): "python-auditor>=0.0.5", "pytz", "tzlocal", + "aiolancium", ], extras_require={ "docs": [ From 20faad8a4cc35f2c382a73f0c49fa6a88b9a000f Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Fri, 11 Nov 2022 15:58:31 +0100 Subject: [PATCH 02/16] Add first version of lancium site adapter --- tardis/adapters/sites/lancium.py | 127 +++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 tardis/adapters/sites/lancium.py diff --git a/tardis/adapters/sites/lancium.py b/tardis/adapters/sites/lancium.py new file mode 100644 index 00000000..f1022c4b --- /dev/null +++ b/tardis/adapters/sites/lancium.py @@ -0,0 +1,127 @@ +from aiolancium.client import Authenticator, LanciumClient + +from ...exceptions.tardisexceptions import TardisError, TardisResourceStatusUpdateFailed +from ...interfaces.siteadapter import SiteAdapter, ResourceStatus +from ...utilities.attributedict import AttributeDict, convert_to_attribute_dict +from ...utilities.asynccachemap import AsyncCacheMap +from ...utilities.staticmapping import StaticMapping + +from contextlib import contextmanager +from datetime import datetime +from functools import partial +from typing import Dict + +import logging + +logger = logging.getLogger("cobald.runtime.tardis.adapters.sites.lancium") + + +async def lancium_status_updater(client: LanciumClient) -> Dict: + response = client.jobs.show_jobs() + logger.debug(f"Show jobs returned {response}") + return {job["id"]: job for job in response["jobs"]} + + +class LanciumAdapter(SiteAdapter): + # space in last key requires dict expansion in `__init__` `translation_functions` + 
resource_status_translation = { + "created": ResourceStatus.Booting, + "submitted": ResourceStatus.Booting, + "queued": ResourceStatus.Booting, + "ready": ResourceStatus.Booting, + "running": ResourceStatus.Running, + "error": ResourceStatus.Error, + "finished": ResourceStatus.Stopped, + "delete pending": ResourceStatus.Stopped, + "deleted": ResourceStatus.Deleted, + } + + def __init__(self, machine_type: str, site_name: str): + self._machine_type = machine_type + self._site_name = site_name + + auth = Authenticator(api_key=self.configuration.api_key) + self.client = LanciumClient(api_url=self.configuration.api_url, auth=auth) + + key_translator = StaticMapping( + remote_resource_uuid="id", + drone_uuid="name", + resource_status="status", + created="created_at", + updated="updated_at", + ) + + translator_functions = StaticMapping( + created=lambda date: datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ"), + updated=lambda date: datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ"), + status=lambda x, translator=StaticMapping( + **self.resource_status_translation + ): translator[x], + ) + + self.handle_response = partial( + self.handle_response, + key_translator=key_translator, + translator_functions=translator_functions, + ) + + self._lancium_status = AsyncCacheMap( + update_coroutine=partial(lancium_status_updater, self.client), + max_age=self.configuration.max_age * 60, + ) + + async def deploy_resource( + self, resource_attributes: AttributeDict + ) -> AttributeDict: + create_response = await self.client.create_job( + job=self.machine_type_configuration + ) + logger.debug(f"{self.site_name} create job returned {create_response}") + submit_response = await self.client.submit_job(id=create_response["job"]["id"]) + logger.debug(f"{self.site_name} submit job returned {submit_response}") + return self.handle_response(create_response) + + async def resource_status( + self, resource_attributes: AttributeDict + ) -> AttributeDict: + await self._lancium_status.update_status() + # In 
case the created timestamp is after last update timestamp of the + # asynccachemap, no decision about the current state can be given, + # since map is updated asynchronously. + try: + resource_uuid = resource_attributes.remote_resource_uuid + resource_status = self._lancium_status[str(resource_uuid)] + except KeyError as err: + if ( + self._lancium_status._last_update - resource_attributes.created + ).total_seconds() < 0: + raise TardisResourceStatusUpdateFailed from err + else: + resource_status = { + "id": resource_attributes.remote_resource_uuid, + "status": "deleted", + } + logger.debug(f"{self.site_name} has status {resource_status}.") + resource_attributes.update(updated=datetime.now()) + return convert_to_attribute_dict( + {**resource_attributes, **self.handle_response(resource_status)} + ) + + async def stop_resource(self, resource_attributes: AttributeDict): + response = self.client.terminate_job( + id=resource_attributes.remote_resource_uuid + ) + logger.debug(f"{self.site_name} stop resource returned {response}") + return response + + async def terminate_resource(self, resource_attributes: AttributeDict): + response = self.client.delete_job(id=resource_attributes.remote_resource_uuid) + logger.debug(f"{self.site_name} terminate resource returned {response}") + return response + + @contextmanager + def handle_exceptions(self): + try: + yield + except Exception as ex: + raise TardisError from ex From b318e01ba6952a78c92520e49084e53df0ffdcec Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Fri, 11 Nov 2022 16:08:21 +0100 Subject: [PATCH 03/16] Added lancium adapter test template --- tests/adapters_t/sites_t/test_lancium.py | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/adapters_t/sites_t/test_lancium.py diff --git a/tests/adapters_t/sites_t/test_lancium.py b/tests/adapters_t/sites_t/test_lancium.py new file mode 100644 index 00000000..8d5788da --- /dev/null +++ b/tests/adapters_t/sites_t/test_lancium.py @@ -0,0 
+1,26 @@ +from tardis.adapters.sites.lancium import LanciumAdapter + +from unittest import TestCase +from unittest.mock import patch + + +class TestLanciumAdapter(TestCase): + mock_config_patcher = None + mock_lancium_api_patcher = None + + @classmethod + def setUpClass(cls) -> None: + cls.mock_config_patcher = patch("tardis.interfaces.siteadapter.Configuration") + cls.mock_config = cls.mock_config_patcher.start() + cls.mock_lancium_api_patcher = patch( + "tardis.adapters.sites.lancium.LanciumClient" + ) + cls.mock_openstack_api = cls.mock_lancium_api_patcher.start() + + @classmethod + def tearDownClass(cls): + cls.mock_config_patcher.stop() + cls.mock_lancium_api_patcher.stop() + + def setUp(self) -> None: + self.adapter = LanciumAdapter(machine_type="test2large", site_name="TestSite") From e9c89f8764bc1eeab1f09bf9d330413ac182cbd8 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Fri, 11 Nov 2022 16:18:59 +0100 Subject: [PATCH 04/16] Exclude cobald*.yml for git --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b6ca5262..8b76bbe2 100644 --- a/.gitignore +++ b/.gitignore @@ -113,7 +113,7 @@ test_scripts/ *.db # Ignore configurations -cobald.yml +cobald*.yml *tardis.yml #Ignore cloudinit files From 38a1cd76a36d0d45ea2a076014dac6562a9e16b7 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Fri, 11 Nov 2022 16:36:36 +0100 Subject: [PATCH 05/16] Set name and resource specs in adapter code --- tardis/adapters/sites/lancium.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tardis/adapters/sites/lancium.py b/tardis/adapters/sites/lancium.py index f1022c4b..116c4605 100644 --- a/tardis/adapters/sites/lancium.py +++ b/tardis/adapters/sites/lancium.py @@ -73,9 +73,14 @@ def __init__(self, machine_type: str, site_name: str): async def deploy_resource( self, resource_attributes: AttributeDict ) -> AttributeDict: - create_response = await self.client.create_job( - 
job=self.machine_type_configuration + specs = dict(name=resource_attributes.drone_uuid) + specs["resources"] = dict( + core_count=self.machine_meta_data.Cores, + memory=self.machine_meta_data.Memory, + scratch=self.machine_meta_data.Disk, ) + specs.update(self.machine_type_configuration) + create_response = await self.client.create_job(job=specs) logger.debug(f"{self.site_name} create job returned {create_response}") submit_response = await self.client.submit_job(id=create_response["job"]["id"]) logger.debug(f"{self.site_name} submit job returned {submit_response}") From c92f7e70bbffb2aaa23fcf665c82820a9ed7a673 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Fri, 11 Nov 2022 18:16:37 +0100 Subject: [PATCH 06/16] Fixes for lancium adapter --- tardis/adapters/sites/lancium.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/tardis/adapters/sites/lancium.py b/tardis/adapters/sites/lancium.py index 116c4605..5bfc63ca 100644 --- a/tardis/adapters/sites/lancium.py +++ b/tardis/adapters/sites/lancium.py @@ -17,7 +17,7 @@ async def lancium_status_updater(client: LanciumClient) -> Dict: - response = client.jobs.show_jobs() + response = await client.jobs.show_jobs() logger.debug(f"Show jobs returned {response}") return {job["id"]: job for job in response["jobs"]} @@ -47,16 +47,14 @@ def __init__(self, machine_type: str, site_name: str): remote_resource_uuid="id", drone_uuid="name", resource_status="status", - created="created_at", - updated="updated_at", ) translator_functions = StaticMapping( - created=lambda date: datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ"), - updated=lambda date: datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ"), status=lambda x, translator=StaticMapping( **self.resource_status_translation ): translator[x], + id=int, + name=str, ) self.handle_response = partial( @@ -80,11 +78,13 @@ async def deploy_resource( scratch=self.machine_meta_data.Disk, ) specs.update(self.machine_type_configuration) - create_response 
= await self.client.create_job(job=specs) + create_response = await self.client.jobs.create_job(job=specs) logger.debug(f"{self.site_name} create job returned {create_response}") - submit_response = await self.client.submit_job(id=create_response["job"]["id"]) + submit_response = await self.client.jobs.submit_job( + id=create_response["job"]["id"] + ) logger.debug(f"{self.site_name} submit job returned {submit_response}") - return self.handle_response(create_response) + return self.handle_response(create_response["job"]) async def resource_status( self, resource_attributes: AttributeDict @@ -95,7 +95,7 @@ async def resource_status( # since map is updated asynchronously. try: resource_uuid = resource_attributes.remote_resource_uuid - resource_status = self._lancium_status[str(resource_uuid)] + resource_status = self._lancium_status[resource_uuid] except KeyError as err: if ( self._lancium_status._last_update - resource_attributes.created @@ -113,14 +113,16 @@ async def resource_status( ) async def stop_resource(self, resource_attributes: AttributeDict): - response = self.client.terminate_job( + response = await self.client.jobs.terminate_job( id=resource_attributes.remote_resource_uuid ) logger.debug(f"{self.site_name} stop resource returned {response}") return response async def terminate_resource(self, resource_attributes: AttributeDict): - response = self.client.delete_job(id=resource_attributes.remote_resource_uuid) + response = await self.client.jobs.delete_job( + id=resource_attributes.remote_resource_uuid + ) logger.debug(f"{self.site_name} terminate resource returned {response}") return response From 9c8d4aeea3049968211db100b43f3a0a379d0235 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Fri, 11 Nov 2022 19:17:41 +0100 Subject: [PATCH 07/16] Add Tardis environment to job --- tardis/adapters/sites/lancium.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tardis/adapters/sites/lancium.py b/tardis/adapters/sites/lancium.py index 5bfc63ca..1a0d9bf3 
100644 --- a/tardis/adapters/sites/lancium.py +++ b/tardis/adapters/sites/lancium.py @@ -77,6 +77,13 @@ async def deploy_resource( memory=self.machine_meta_data.Memory, scratch=self.machine_meta_data.Disk, ) + specs["environment"] = [ + {"variable": f"TardisDrone{key}", "value": str(value)} + for key, value in self.drone_environment( + resource_attributes.drone_uuid, + resource_attributes.obs_machine_meta_data_translation_mapping, + ).items() + ] specs.update(self.machine_type_configuration) create_response = await self.client.jobs.create_job(job=specs) logger.debug(f"{self.site_name} create job returned {create_response}") From c2fb54c9bfee81e04adf153f62f0ddbaae5eb4b3 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Thu, 17 Nov 2022 11:19:06 +0100 Subject: [PATCH 08/16] Add lancium deploy resource test --- docs/source/changelog.rst | 4 +- tests/adapters_t/sites_t/test_lancium.py | 112 ++++++++++++++++++++++- 2 files changed, 113 insertions(+), 3 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 89dbab8b..a930df9c 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,4 +1,4 @@ -.. Created by changelog.py at 2022-11-11, command +.. 
Created by changelog.py at 2022-11-17, command '/Users/giffler/.cache/pre-commit/repor6pnmwlm/py_env-python3.10/bin/changelog docs/source/changes compile --output=docs/source/changelog.rst' based on the format of 'https://keepachangelog.com/' @@ -6,7 +6,7 @@ CHANGELOG ######### -[Unreleased] - 2022-11-11 +[Unreleased] - 2022-11-17 ========================= Added diff --git a/tests/adapters_t/sites_t/test_lancium.py b/tests/adapters_t/sites_t/test_lancium.py index 8d5788da..d989913d 100644 --- a/tests/adapters_t/sites_t/test_lancium.py +++ b/tests/adapters_t/sites_t/test_lancium.py @@ -1,4 +1,10 @@ from tardis.adapters.sites.lancium import LanciumAdapter +from tardis.interfaces.siteadapter import ResourceStatus +from tardis.utilities.attributedict import AttributeDict + +from simple_rest_client.exceptions import AuthError + +from ...utilities.utilities import run_async, set_awaitable_return_value from unittest import TestCase from unittest.mock import patch @@ -15,7 +21,7 @@ def setUpClass(cls) -> None: cls.mock_lancium_api_patcher = patch( "tardis.adapters.sites.lancium.LanciumClient" ) - cls.mock_openstack_api = cls.mock_lancium_api_patcher.start() + cls.mock_lancium_api = cls.mock_lancium_api_patcher.start() @classmethod def tearDownClass(cls): @@ -23,4 +29,108 @@ def tearDownClass(cls): cls.mock_lancium_api_patcher.stop() def setUp(self) -> None: + self.mock_configuration() + self.mock_lancium_adapter() self.adapter = LanciumAdapter(machine_type="test2large", site_name="TestSite") + + def mock_configuration(self): + config = self.mock_config.return_value + test_site_config = config.TestSite + test_site_config.api_url = "https://test.site.api" + test_site_config.api_key = "top_secret_test" + test_site_config.max_age = 1 + test_site_config.MachineTypeConfiguration = AttributeDict( + test2large=AttributeDict( + qos="high", + image="lancium/ubuntu", + command_line="sleep 500", + max_run_time=600, + ) + ) + test_site_config.MachineMetaData = AttributeDict( + 
test2large=AttributeDict(Cores=8, Memory=20, Disk=20) + ) + + def mock_lancium_adapter(self): + self.mocked_lancium_api = self.mock_lancium_api.return_value + set_awaitable_return_value( + self.mocked_lancium_api.jobs.create_job, + {"job": {"id": 123, "status": "created", "name": "testsite-089123"}}, + ) + set_awaitable_return_value(self.mocked_lancium_api.jobs.submit_job, {}) + + def test_deploy_resource(self): + self.assertEqual( + AttributeDict( + drone_uuid="testsite-089123", + remote_resource_uuid=123, + resource_status=ResourceStatus.Booting, + ), + run_async( + self.adapter.deploy_resource, + resource_attributes=AttributeDict( + drone_uuid="testsite-089123", + obs_machine_meta_data_translation_mapping=AttributeDict( + Cores=1, + Memory=1, + Disk=1, + ), + ), + ), + ) + + self.assertDictEqual( + { + "name": "testsite-089123", + "qos": "high", + "image": "lancium/ubuntu", + "command_line": "sleep 500", + "max_run_time": 600, + "resources": {"core_count": 8, "memory": 20, "scratch": 20}, + "environment": [ + {"variable": "TardisDroneCores", "value": "8"}, + {"variable": "TardisDroneMemory", "value": "20"}, + {"variable": "TardisDroneDisk", "value": "20"}, + {"variable": "TardisDroneUuid", "value": "testsite-089123"}, + ], + }, + self.mocked_lancium_api.jobs.create_job.call_args.kwargs["job"], + ) + self.mocked_lancium_api.jobs.submit_job.assert_called_with(id=123) + + self.mocked_lancium_api.jobs.create_job.side_effect = AuthError( + "operation=auth_error", {} + ) + with self.assertRaises(AuthError): + run_async( + self.adapter.deploy_resource, + resource_attributes=AttributeDict( + drone_uuid="testsite-089123", + obs_machine_meta_data_translation_mapping=AttributeDict( + Cores=1, + Memory=1, + Disk=1, + ), + ), + ) + + def test_machine_meta_data(self): + ... + + def test_machine_type(self): + ... + + def test_site_name(self): + ... + + def test_resource_status(self): + ... + + def test_stop_resource(self): + ... + + def test_terminate_resource(self): + ... 
+ + def test_exception_handling(self): + ... From 3dd0641931f4d29413aad4727bf68246c5685b40 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Thu, 17 Nov 2022 13:29:28 +0100 Subject: [PATCH 09/16] Add tests for machine meta data, machine type, site name and resource status --- tests/adapters_t/sites_t/test_lancium.py | 95 +++++++++++++++++++++++- 1 file changed, 91 insertions(+), 4 deletions(-) diff --git a/tests/adapters_t/sites_t/test_lancium.py b/tests/adapters_t/sites_t/test_lancium.py index d989913d..fa0b29ea 100644 --- a/tests/adapters_t/sites_t/test_lancium.py +++ b/tests/adapters_t/sites_t/test_lancium.py @@ -1,4 +1,5 @@ from tardis.adapters.sites.lancium import LanciumAdapter +from tardis.exceptions.tardisexceptions import TardisResourceStatusUpdateFailed from tardis.interfaces.siteadapter import ResourceStatus from tardis.utilities.attributedict import AttributeDict @@ -6,6 +7,7 @@ from ...utilities.utilities import run_async, set_awaitable_return_value +from datetime import datetime from unittest import TestCase from unittest.mock import patch @@ -58,6 +60,22 @@ def mock_lancium_adapter(self): {"job": {"id": 123, "status": "created", "name": "testsite-089123"}}, ) set_awaitable_return_value(self.mocked_lancium_api.jobs.submit_job, {}) + set_awaitable_return_value( + self.mocked_lancium_api.jobs.show_jobs, + { + "jobs": [ + {"id": 123, "status": "created", "name": "testsite-089123"}, + {"id": 124, "status": "submitted", "name": "testsite-089124"}, + {"id": 125, "status": "queued", "name": "testsite-089125"}, + {"id": 126, "status": "ready", "name": "testsite-089126"}, + {"id": 127, "status": "running", "name": "testsite-089127"}, + {"id": 128, "status": "error", "name": "testsite-089128"}, + {"id": 129, "status": "finished", "name": "testsite-089129"}, + {"id": 130, "status": "delete pending", "name": "testsite-089130"}, + {"id": 131, "status": "deleted", "name": "testsite-089131"}, + ] + }, + ) def test_deploy_resource(self): self.assertEqual( @@ -115,16 
+133,85 @@ def test_deploy_resource(self): ) def test_machine_meta_data(self): - ... + self.assertEqual( + self.adapter.machine_meta_data, AttributeDict(Cores=8, Memory=20, Disk=20) + ) def test_machine_type(self): - ... + self.assertEqual(self.adapter.machine_type, "test2large") def test_site_name(self): - ... + self.assertEqual(self.adapter.site_name, "TestSite") def test_resource_status(self): - ... + test_matrix = [ + (123, ResourceStatus.Booting), + (124, ResourceStatus.Booting), + (125, ResourceStatus.Booting), + (126, ResourceStatus.Booting), + (127, ResourceStatus.Running), + (128, ResourceStatus.Error), + (129, ResourceStatus.Stopped), + (130, ResourceStatus.Stopped), + (131, ResourceStatus.Deleted), + ] + for job_id, resource_status in test_matrix: + response = { + key: value + for key, value in run_async( + self.adapter.resource_status, + resource_attributes=AttributeDict( + remote_resource_uuid=job_id, + drone_uuid=f"testsite-089{job_id}", + ), + ).items() + if key not in ["created", "updated"] + } + self.assertEqual( + { + "remote_resource_uuid": job_id, + "drone_uuid": f"testsite-089{job_id}", + "resource_status": resource_status, + }, + response, + ) + + # check that resource not in the show_job list and older than maxAge + # have status deleted + response = { + key: value + for key, value in run_async( + self.adapter.resource_status, + resource_attributes=AttributeDict( + remote_resource_uuid=999, + drone_uuid="testsite-089999", + created=datetime.fromtimestamp(0), + ), + ).items() + if key not in ["created", "updated"] + } + self.assertEqual( + { + "remote_resource_uuid": 999, + "drone_uuid": "testsite-089999", + "resource_status": ResourceStatus.Deleted, + }, + response, + ) + + # check that resources not in the show_job list and younger than maxAge + # raise TardisResourceStatusUpdateFailed + with self.assertRaises(TardisResourceStatusUpdateFailed): + run_async( + self.adapter.resource_status, + resource_attributes=AttributeDict( + 
remote_resource_uuid=999, + drone_uuid="testsite-089999", + created=datetime.now(), + ), + ) + + self.mocked_lancium_api.jobs.show_jobs.assert_called_once() def test_stop_resource(self): ... From 91a17b9034b77faf1f77f02fa182c35a2d525a67 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Thu, 17 Nov 2022 13:49:39 +0100 Subject: [PATCH 10/16] Refactor lancium unittests --- tests/adapters_t/sites_t/test_lancium.py | 98 +++++++++++++----------- 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/tests/adapters_t/sites_t/test_lancium.py b/tests/adapters_t/sites_t/test_lancium.py index fa0b29ea..60fece74 100644 --- a/tests/adapters_t/sites_t/test_lancium.py +++ b/tests/adapters_t/sites_t/test_lancium.py @@ -76,15 +76,12 @@ def mock_lancium_adapter(self): ] }, ) + set_awaitable_return_value(self.mocked_lancium_api.jobs.terminate_job, {}) + set_awaitable_return_value(self.mocked_lancium_api.jobs.delete_job, {}) def test_deploy_resource(self): - self.assertEqual( - AttributeDict( - drone_uuid="testsite-089123", - remote_resource_uuid=123, - resource_status=ResourceStatus.Booting, - ), - run_async( + def run_it(): + return run_async( self.adapter.deploy_resource, resource_attributes=AttributeDict( drone_uuid="testsite-089123", @@ -94,7 +91,15 @@ def test_deploy_resource(self): Disk=1, ), ), + ) + + self.assertEqual( + AttributeDict( + drone_uuid="testsite-089123", + remote_resource_uuid=123, + resource_status=ResourceStatus.Booting, ), + run_it(), ) self.assertDictEqual( @@ -120,17 +125,7 @@ def test_deploy_resource(self): "operation=auth_error", {} ) with self.assertRaises(AuthError): - run_async( - self.adapter.deploy_resource, - resource_attributes=AttributeDict( - drone_uuid="testsite-089123", - obs_machine_meta_data_translation_mapping=AttributeDict( - Cores=1, - Memory=1, - Disk=1, - ), - ), - ) + run_it() def test_machine_meta_data(self): self.assertEqual( @@ -144,6 +139,16 @@ def test_site_name(self): self.assertEqual(self.adapter.site_name, "TestSite") 
def test_resource_status(self): + def run_it(job_id, created=datetime.now()): + return run_async( + self.adapter.resource_status, + resource_attributes=AttributeDict( + remote_resource_uuid=job_id, + drone_uuid=f"testsite-089{job_id}", + created=created, + ), + ) + test_matrix = [ (123, ResourceStatus.Booting), (124, ResourceStatus.Booting), @@ -158,13 +163,7 @@ def test_resource_status(self): for job_id, resource_status in test_matrix: response = { key: value - for key, value in run_async( - self.adapter.resource_status, - resource_attributes=AttributeDict( - remote_resource_uuid=job_id, - drone_uuid=f"testsite-089{job_id}", - ), - ).items() + for key, value in run_it(job_id).items() if key not in ["created", "updated"] } self.assertEqual( @@ -180,14 +179,7 @@ def test_resource_status(self): # have status deleted response = { key: value - for key, value in run_async( - self.adapter.resource_status, - resource_attributes=AttributeDict( - remote_resource_uuid=999, - drone_uuid="testsite-089999", - created=datetime.fromtimestamp(0), - ), - ).items() + for key, value in run_it(999, datetime.fromtimestamp(0)).items() if key not in ["created", "updated"] } self.assertEqual( @@ -202,22 +194,42 @@ def test_resource_status(self): # check that resources not in the show_job list and younger than maxAge # raise TardisResourceStatusUpdateFailed with self.assertRaises(TardisResourceStatusUpdateFailed): - run_async( - self.adapter.resource_status, - resource_attributes=AttributeDict( - remote_resource_uuid=999, - drone_uuid="testsite-089999", - created=datetime.now(), - ), - ) + run_it(999, datetime.now()) self.mocked_lancium_api.jobs.show_jobs.assert_called_once() def test_stop_resource(self): - ... 
+ def run_it(): + return run_async( + self.adapter.stop_resource, + resource_attributes=AttributeDict(remote_resource_uuid=123), + ) + + run_it() + + self.mocked_lancium_api.jobs.terminate_job.assert_called_with(id=123) + + self.mocked_lancium_api.jobs.terminate_job.side_effect = AuthError( + "operation=auth_error", {} + ) + with self.assertRaises(AuthError): + run_it() def test_terminate_resource(self): - ... + def run_it(): + return run_async( + self.adapter.terminate_resource, + resource_attributes=AttributeDict(remote_resource_uuid=123), + ) + + run_it() + self.mocked_lancium_api.jobs.delete_job.assert_called_with(id=123) + + self.mocked_lancium_api.jobs.delete_job.side_effect = AuthError( + "operation=auth_error", {} + ) + with self.assertRaises(AuthError): + run_it() def test_exception_handling(self): ... From b8662019d0362707761e7094cc93f7d062cf8aef Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Thu, 17 Nov 2022 13:58:36 +0100 Subject: [PATCH 11/16] Test exception handling --- tests/adapters_t/sites_t/test_lancium.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/adapters_t/sites_t/test_lancium.py b/tests/adapters_t/sites_t/test_lancium.py index 60fece74..c48f317d 100644 --- a/tests/adapters_t/sites_t/test_lancium.py +++ b/tests/adapters_t/sites_t/test_lancium.py @@ -1,5 +1,8 @@ from tardis.adapters.sites.lancium import LanciumAdapter -from tardis.exceptions.tardisexceptions import TardisResourceStatusUpdateFailed +from tardis.exceptions.tardisexceptions import ( + TardisResourceStatusUpdateFailed, + TardisError, +) from tardis.interfaces.siteadapter import ResourceStatus from tardis.utilities.attributedict import AttributeDict @@ -198,6 +201,19 @@ def run_it(job_id, created=datetime.now()): self.mocked_lancium_api.jobs.show_jobs.assert_called_once() + def test_resource_status_failed(self): + self.mocked_lancium_api.jobs.show_jobs.side_effect = AuthError( + "operation=auth_error", {} + ) + with 
self.assertRaises(AuthError): + run_async( + self.adapter.resource_status, + resource_attributes=AttributeDict( + remote_resource_uuid=123, + drone_uuid="testsite-089123", + ), + ) + def test_stop_resource(self): def run_it(): return run_async( @@ -232,4 +248,6 @@ def run_it(): run_it() def test_exception_handling(self): - ... + with self.assertRaises(TardisError): + with self.adapter.handle_exceptions(): + raise AuthError("test", "test") From b37ce325ab732bdfc6d5c8bebea03205c753d66a Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Thu, 17 Nov 2022 14:33:48 +0100 Subject: [PATCH 12/16] Add documentation for lancium adapter plus additional updates --- docs/source/adapters/site.rst | 63 ++++++++++++++++++- .../api/tardis.adapters.sites.lancium.rst | 7 +++ docs/source/api/tardis.adapters.sites.rst | 1 + docs/source/api/tardis.plugins.auditor.rst | 7 +++ docs/source/api/tardis.plugins.rst | 1 + docs/source/api/tardis.rest.app.routers.rst | 3 +- .../api/tardis.rest.app.routers.types.rst | 7 +++ .../api/tardis.rest.app.routers.user.rst | 7 +++ docs/source/api/tardis.rest.app.rst | 1 + docs/source/api/tardis.rest.app.scopes.rst | 7 +++ docs/source/api/tardis.rest.rst | 1 - docs/source/plugins/plugins.rst | 3 +- 12 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 docs/source/api/tardis.adapters.sites.lancium.rst create mode 100644 docs/source/api/tardis.plugins.auditor.rst create mode 100644 docs/source/api/tardis.rest.app.routers.types.rst create mode 100644 docs/source/api/tardis.rest.app.routers.user.rst create mode 100644 docs/source/api/tardis.rest.app.scopes.rst diff --git a/docs/source/adapters/site.rst b/docs/source/adapters/site.rst index ce9094a5..e6adf157 100644 --- a/docs/source/adapters/site.rst +++ b/docs/source/adapters/site.rst @@ -298,6 +298,66 @@ Available adapter configuration options The ``Arguments`` contains the following command line arguments, ``--cores``. ``--memory``. ``--disk`` and ``--uuid``. 
+Lancium Site Adapter +-------------------- + +.. content-tabs:: left-col + + The :py:class:`~tardis.adapters.sites.lancium.LanciumAdapter` implements an interface to the `Lancium`_ Compute API. + The following general adapter configuration options are available. + + .. _Lancium: https://lancium.github.io + +Available adapter configuration options +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. content-tabs:: left-col + + +---------------------+------------------------------------------------------------------------+-----------------+ + | Option | Short Description | Requirement | + +=====================+========================================================================+=================+ + | api_url | The end point of the Lancium API to contact. | **Required** | + +---------------------+------------------------------------------------------------------------+-----------------+ + | api_key | API Token to access the Lancium API. | **Required** | + +---------------------+------------------------------------------------------------------------+-----------------+ + | max_age | The output of the `show_jobs` API call is cached for `max_age` minutes | **Required** | + +---------------------+------------------------------------------------------------------------+-----------------+ + + All configuration entries in the `MachineTypeConfiguration` section of the machine types are + directly added to the body of the Lancium API `create_job` call. All available options are + described in the `Lancium documentation`_. + + .. _Lancium documentation: https://lancium.github.io/compute-api-docs/api.html#tag/Jobs/operation/create_job + +.. content-tabs:: right-col + + .. rubric:: Example configuration + + .. 
code-block:: yaml + + Sites: + - name: Lancium + adapter: Lancium + quota: 1 # CPU core quota + + Lancium: + api_url: https://portal.lancium.com/api/v1/ + api_key: "top_secret" + max_age: 1 + MachineTypes: + - m1.small + MachineTypeConfiguration: + m1.small: + qos: "high" + image: "lancium/ubuntu" + command_line: "sleep 500" + max_run_time: 600 + MachineMetaData: + m1.small: + Cores: 2 + Memory: 4 + Disk: 20 + Moab Site Adapter ----------------- @@ -621,5 +681,4 @@ Available machine type configuration options .. content-tabs:: left-col Your favorite site is currently not supported? - Please, have a look at - :ref:`how to contribute.` + Please, have a look at how to contribute. diff --git a/docs/source/api/tardis.adapters.sites.lancium.rst b/docs/source/api/tardis.adapters.sites.lancium.rst new file mode 100644 index 00000000..9077ac92 --- /dev/null +++ b/docs/source/api/tardis.adapters.sites.lancium.rst @@ -0,0 +1,7 @@ +tardis.adapters.sites.lancium module +==================================== + +.. automodule:: tardis.adapters.sites.lancium + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/tardis.adapters.sites.rst b/docs/source/api/tardis.adapters.sites.rst index 736cdb19..91937280 100644 --- a/docs/source/api/tardis.adapters.sites.rst +++ b/docs/source/api/tardis.adapters.sites.rst @@ -16,6 +16,7 @@ Submodules tardis.adapters.sites.fakesite tardis.adapters.sites.htcondor tardis.adapters.sites.kubernetes + tardis.adapters.sites.lancium tardis.adapters.sites.moab tardis.adapters.sites.openstack tardis.adapters.sites.slurm diff --git a/docs/source/api/tardis.plugins.auditor.rst b/docs/source/api/tardis.plugins.auditor.rst new file mode 100644 index 00000000..6eb20f0d --- /dev/null +++ b/docs/source/api/tardis.plugins.auditor.rst @@ -0,0 +1,7 @@ +tardis.plugins.auditor module +============================= + +.. 
automodule:: tardis.plugins.auditor + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/tardis.plugins.rst b/docs/source/api/tardis.plugins.rst index f7ede22a..a72d239f 100644 --- a/docs/source/api/tardis.plugins.rst +++ b/docs/source/api/tardis.plugins.rst @@ -12,6 +12,7 @@ Submodules .. toctree:: :maxdepth: 4 + tardis.plugins.auditor tardis.plugins.elasticsearchmonitoring tardis.plugins.prometheusmonitoring tardis.plugins.sqliteregistry diff --git a/docs/source/api/tardis.rest.app.routers.rst b/docs/source/api/tardis.rest.app.routers.rst index 49d6a6cc..845a8e70 100644 --- a/docs/source/api/tardis.rest.app.routers.rst +++ b/docs/source/api/tardis.rest.app.routers.rst @@ -12,5 +12,6 @@ Submodules .. toctree:: :maxdepth: 4 - tardis.rest.app.routers.login tardis.rest.app.routers.resources + tardis.rest.app.routers.types + tardis.rest.app.routers.user diff --git a/docs/source/api/tardis.rest.app.routers.types.rst b/docs/source/api/tardis.rest.app.routers.types.rst new file mode 100644 index 00000000..e8db8660 --- /dev/null +++ b/docs/source/api/tardis.rest.app.routers.types.rst @@ -0,0 +1,7 @@ +tardis.rest.app.routers.types module +==================================== + +.. automodule:: tardis.rest.app.routers.types + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/tardis.rest.app.routers.user.rst b/docs/source/api/tardis.rest.app.routers.user.rst new file mode 100644 index 00000000..9dfc9ab2 --- /dev/null +++ b/docs/source/api/tardis.rest.app.routers.user.rst @@ -0,0 +1,7 @@ +tardis.rest.app.routers.user module +=================================== + +.. 
automodule:: tardis.rest.app.routers.user + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/tardis.rest.app.rst b/docs/source/api/tardis.rest.app.rst index 21d6c406..3d98c52d 100644 --- a/docs/source/api/tardis.rest.app.rst +++ b/docs/source/api/tardis.rest.app.rst @@ -23,4 +23,5 @@ Submodules tardis.rest.app.crud tardis.rest.app.database tardis.rest.app.main + tardis.rest.app.scopes tardis.rest.app.security diff --git a/docs/source/api/tardis.rest.app.scopes.rst b/docs/source/api/tardis.rest.app.scopes.rst new file mode 100644 index 00000000..1a6aed2a --- /dev/null +++ b/docs/source/api/tardis.rest.app.scopes.rst @@ -0,0 +1,7 @@ +tardis.rest.app.scopes module +============================= + +.. automodule:: tardis.rest.app.scopes + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/tardis.rest.rst b/docs/source/api/tardis.rest.rst index 32ccadea..9fc0cce2 100644 --- a/docs/source/api/tardis.rest.rst +++ b/docs/source/api/tardis.rest.rst @@ -14,7 +14,6 @@ Subpackages tardis.rest.app tardis.rest.hash_credentials - tardis.rest.token_generator Submodules ---------- diff --git a/docs/source/plugins/plugins.rst b/docs/source/plugins/plugins.rst index 77393d40..20ff0c13 100644 --- a/docs/source/plugins/plugins.rst +++ b/docs/source/plugins/plugins.rst @@ -220,5 +220,4 @@ Available configuration options .. content-tabs:: left-col Your favorite monitoring is currently not supported? - Please, have a look at - :ref:`how to contribute.` + Please, have a look at how to contribute. 
From a1ec767db02024eda47be33a092885e20b8619e6 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Thu, 17 Nov 2022 14:39:33 +0100 Subject: [PATCH 13/16] Add change log for lancium compute adapter --- docs/source/changelog.rst | 1 + docs/source/changes/267.add_lancium_site_adapter.yaml | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 docs/source/changes/267.add_lancium_site_adapter.yaml diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index a930df9c..92eeee14 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -15,6 +15,7 @@ Added * Introduce a TARDIS REST API to query the state of resources from SqlRegistry * Added support for manual draining of drones using the REST API * Add support for passing environment variables as executable arguments to support HTCondor grid universe +* Added a new site adapter to use Lancium compute as resource provider Changed ------- diff --git a/docs/source/changes/267.add_lancium_site_adapter.yaml b/docs/source/changes/267.add_lancium_site_adapter.yaml new file mode 100644 index 00000000..90c4dd70 --- /dev/null +++ b/docs/source/changes/267.add_lancium_site_adapter.yaml @@ -0,0 +1,6 @@ +category: added +summary: "Added a new site adapter to use Lancium compute as resource provider" +description: | + A new Lancium compute site adapter has been added to `TARDIS` to use resources provided by the Lancium compute cluster. 
+pull requests: +- 267 From aa48d4bfd668b1cf7c0ea8259f74e10cfe91f3c5 Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Thu, 17 Nov 2022 14:53:36 +0100 Subject: [PATCH 14/16] Satisfy Python 3.7 unittest --- tests/adapters_t/sites_t/test_lancium.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/adapters_t/sites_t/test_lancium.py b/tests/adapters_t/sites_t/test_lancium.py index c48f317d..3cce7bd7 100644 --- a/tests/adapters_t/sites_t/test_lancium.py +++ b/tests/adapters_t/sites_t/test_lancium.py @@ -120,7 +120,7 @@ def run_it(): {"variable": "TardisDroneUuid", "value": "testsite-089123"}, ], }, - self.mocked_lancium_api.jobs.create_job.call_args.kwargs["job"], + self.mocked_lancium_api.jobs.create_job.call_args[1]["job"], ) self.mocked_lancium_api.jobs.submit_job.assert_called_with(id=123) From f04cb6d77ed2ef89dfb56ed027f744a3c9ac960a Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Wed, 23 Nov 2022 16:54:39 +0100 Subject: [PATCH 15/16] Apply suggestions from code review Co-authored-by: Max Fischer --- tardis/adapters/sites/lancium.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tardis/adapters/sites/lancium.py b/tardis/adapters/sites/lancium.py index 1a0d9bf3..fa7c6d2f 100644 --- a/tardis/adapters/sites/lancium.py +++ b/tardis/adapters/sites/lancium.py @@ -105,7 +105,7 @@ async def resource_status( resource_status = self._lancium_status[resource_uuid] except KeyError as err: if ( - self._lancium_status._last_update - resource_attributes.created + self._lancium_status.last_update - resource_attributes.created ).total_seconds() < 0: raise TardisResourceStatusUpdateFailed from err else: @@ -114,7 +114,7 @@ async def resource_status( "status": "deleted", } logger.debug(f"{self.site_name} has status {resource_status}.") - resource_attributes.update(updated=datetime.now()) + resource_attributes["updated"]=datetime.now() return convert_to_attribute_dict( {**resource_attributes, **self.handle_response(resource_status)} ) 
From 077c62170abfeefaf677a132924364f3bb7cec2e Mon Sep 17 00:00:00 2001 From: Manuel Giffels Date: Wed, 23 Nov 2022 16:58:14 +0100 Subject: [PATCH 16/16] Fix flake8 issue --- CONTRIBUTORS | 1 + docs/source/changelog.rst | 4 ++-- tardis/adapters/sites/lancium.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 44bc88f5..ba50a8c6 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -15,6 +15,7 @@ Alexander Haas <104835302+haasal@users.noreply.github.com> mschnepf Matthias J. Schnepf Matthias Schnepf +LGTM Migrator Matthias Schnepf PSchuhmacher Peter Wienemann diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 92eeee14..a8d3564c 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,4 +1,4 @@ -.. Created by changelog.py at 2022-11-17, command +.. Created by changelog.py at 2022-11-23, command '/Users/giffler/.cache/pre-commit/repor6pnmwlm/py_env-python3.10/bin/changelog docs/source/changes compile --output=docs/source/changelog.rst' based on the format of 'https://keepachangelog.com/' @@ -6,7 +6,7 @@ CHANGELOG ######### -[Unreleased] - 2022-11-17 +[Unreleased] - 2022-11-23 ========================= Added diff --git a/tardis/adapters/sites/lancium.py b/tardis/adapters/sites/lancium.py index fa7c6d2f..e9b36e9b 100644 --- a/tardis/adapters/sites/lancium.py +++ b/tardis/adapters/sites/lancium.py @@ -114,7 +114,7 @@ async def resource_status( "status": "deleted", } logger.debug(f"{self.site_name} has status {resource_status}.") - resource_attributes["updated"]=datetime.now() + resource_attributes["updated"] = datetime.now() return convert_to_attribute_dict( {**resource_attributes, **self.handle_response(resource_status)} )