From 8aa49cce68551ef0372c20dd9984db98a5d39cde Mon Sep 17 00:00:00 2001 From: Connor Osborn Date: Fri, 13 Apr 2018 13:02:04 -0700 Subject: [PATCH 1/4] Revert "Fix unshelved instance failing to get fixed ip" We are no longer removing ports. After upgrading Marana to Pike we found that ports which were attached post boot, would never successfully attach. A workaround was discovered to enable allow-hotplug for the given interface in cloud-init, but it was deemed more complicated than simply preserving the ports. We were deleting/recreating the ports in the first place, because of a bug which affected j7m https://bugs.launchpad.net/nova/+bug/1759924. This reverts commit af002eb03892374eda4654f9218bb6f8f3ad9973. --- service/instance.py | 17 ++++++----- service/tasks/driver.py | 67 ++++++++++++++++++++++++++--------------- 2 files changed, 51 insertions(+), 33 deletions(-) diff --git a/service/instance.py b/service/instance.py index 7a9016721..6b907547b 100644 --- a/service/instance.py +++ b/service/instance.py @@ -470,7 +470,7 @@ def restore_ip_chain(esh_driver, esh_instance, deploy=True, start with: task.apply_async() """ from service.tasks.driver import ( - wait_for_instance, restore_port, add_floating_ip, + wait_for_instance, add_fixed_ip, add_floating_ip, deploy_init_to, update_metadata ) init_task = wait_for_instance.s( @@ -478,19 +478,20 @@ def restore_ip_chain(esh_driver, esh_instance, deploy=True, esh_driver.identity, "active", # TODO: DELETEME below. no_tasks=True) - + # Step 1: Set metadata to initializing metadata = {'tmp_status': 'initializing'} metadata_update_task = update_metadata.si( esh_driver.__class__, esh_driver.provider, esh_driver.identity, esh_instance.id, metadata, replace_metadata=False) - restore_port_task = restore_port.si( + # Step 2: Add fixed + fixed_ip_task = add_fixed_ip.si( esh_driver.__class__, esh_driver.provider, esh_driver.identity, esh_instance.id, core_identity_uuid) init_task.link(metadata_update_task) - metadata_update_task.link(restore_port_task) - + metadata_update_task.link(fixed_ip_task) + # Add float and re-deploy OR just add floating IP... if deploy: core_identity = CoreIdentity.objects.get(uuid=core_identity_uuid) deploy_task = deploy_init_to.si( @@ -500,16 +501,16 @@ def restore_ip_chain(esh_driver, esh_instance, deploy=True, esh_instance.id, core_identity, redeploy=True) - restore_port_task.link(deploy_task) + fixed_ip_task.link(deploy_task) else: + logger.info("Skip deployment, Add floating IP only") floating_ip_task = add_floating_ip.si( esh_driver.__class__, esh_driver.provider, esh_driver.identity, str(core_identity_uuid), esh_instance.id) - restore_port_task.link(floating_ip_task) - + fixed_ip_task.link(floating_ip_task) return init_task diff --git a/service/tasks/driver.py b/service/tasks/driver.py index cc22ce83b..22c956c37 100644 --- a/service/tasks/driver.py +++ b/service/tasks/driver.py @@ -168,35 +168,40 @@ def _is_instance_ready(instance, status_query, return instance.id return True -@task(name="restore_port", + +@task(name="add_fixed_ip", ignore_result=True, default_retry_delay=15, - max_retries=5, - bind=True) -def restore_port(self, - driverCls, - provider, - identity, - instance_id, - core_identity_uuid=None): - from service.instance import _to_network_driver, _get_network_id + max_retries=15) +def add_fixed_ip( + driverCls, + provider, + identity, + instance_id, + core_identity_uuid=None): + from service import instance as instance_service try: + celery_logger.debug("add_fixed_ip task started at %s." % datetime.now()) driver = get_driver(driverCls, provider, identity) - core_identity = Identity.objects.get(uuid=core_identity_uuid) - network_driver = _to_network_driver(core_identity) instance = driver.get_instance(instance_id) - assert instance, "Instance {} no longer exists".format(instance_id) - - ports = network_driver.list_ports(device_id=instance.id) - for port in ports: - network_driver.delete_port(port) - - network_id = _get_network_id(driver, instance) - driver._connection.ex_attach_interface( - instance.id, network_id=network_id) + if not instance: + celery_logger.debug("Instance has been teminated: %s." % instance_id) + return None + if instance._node.private_ips: + # TODO: Attempt to rescue + celery_logger.info("Instance has fixed IP: %s" % instance_id) + return instance + + network_id = instance_service._get_network_id(driver, instance) + fixed_ip = driver._connection.ex_add_fixed_ip(instance, network_id) + celery_logger.debug("add_fixed_ip task finished at %s." % datetime.now()) + return fixed_ip except Exception as exc: - celery_logger.exception(exc) - self.retry(exc=exc) + if "Not Ready" not in str(exc): + # Ignore 'normal' errors. + celery_logger.exception(exc) + add_fixed_ip.retry(exc=exc) + def current_openstack_identities(): identities = Identity.objects.filter( @@ -1192,8 +1197,20 @@ def add_floating_ip(driverCls, provider, identity, core_identity_uuid, countdown = min(2**current.request.retries, 128) add_floating_ip.retry(exc=floating_ip_err, countdown=countdown) - except NeutronBadRequest: - # This is an error on our end, we want it to surface + except NeutronBadRequest as bad_request: + # NOTE: 'Neutron Bad Request' is a good message to 'catch and fix' + # because its a user-supplied problem. + # Here we will attempt to 'fix' requests and put the 'add_floating_ip' + # task back on the queue after we're done. + celery_logger.exception("Neutron did not accept request - %s." + % bad_request.message) + if 'no fixed ip' in bad_request.message.lower(): + fixed_ip = add_fixed_ip(driverCls, provider, identity, + instance_alias) + if fixed_ip: + celery_logger.debug("Fixed IP %s has been added to Instance %s." + % (fixed_ip, instance_alias)) + # let the exception bubble-up for a retry.. raise except (BaseException, Exception) as exc: celery_logger.exception("Error occurred while assigning a floating IP") From c1385fa96a29d900ded988af34f0f42ae438ef76 Mon Sep 17 00:00:00 2001 From: Connor Osborn Date: Fri, 13 Apr 2018 13:15:51 -0700 Subject: [PATCH 2/4] Try different approach to working around openstack port availability zone issue Problem: After upgrading Marana to Pike we found that ports which were attached post boot, would never successfully attach. A workaround was discovered to enable allow-hotplug for the given interface in cloud-init, but it was deemed more complicated than simply preserving the ports. We were deleting/recreating the ports in the first place, because of a bug which affected j7m https://bugs.launchpad.net/nova/+bug/1759924. Solution Rather than add/delete ports, add/remove fixed ips, but explicitly check if we're dealing with the availability zone issue and then try deleting/recreating the port. For some reason j7m is able to attach ports post boot. If its discovered that this is because of some race condition, we will need another workaround. --- service/tasks/driver.py | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/service/tasks/driver.py b/service/tasks/driver.py index 22c956c37..78eb851f2 100644 --- a/service/tasks/driver.py +++ b/service/tasks/driver.py @@ -179,23 +179,44 @@ def add_fixed_ip( identity, instance_id, core_identity_uuid=None): - from service import instance as instance_service + from service.instance import _to_network_driver, _get_network_id try: celery_logger.debug("add_fixed_ip task started at %s." % datetime.now()) + core_identity = Identity.objects.get(uuid=core_identity_uuid) + network_driver = _to_network_driver(core_identity) driver = get_driver(driverCls, provider, identity) instance = driver.get_instance(instance_id) if not instance: celery_logger.debug("Instance has been teminated: %s." % instance_id) return None - if instance._node.private_ips: - # TODO: Attempt to rescue - celery_logger.info("Instance has fixed IP: %s" % instance_id) - return instance - network_id = instance_service._get_network_id(driver, instance) - fixed_ip = driver._connection.ex_add_fixed_ip(instance, network_id) + ports = network_driver.list_ports(device_id=instance.id) + # Catch a common scenario that breaks networking + assert len(ports) == 1, "Attaching a fixed ip requires a single port" + port = ports[0] + port_zone = instance_zone = None + try: + port_zone = port['device_owner'].split(":")[1] + instance_zone = instance.extra['availability_zone'] + except: + pass + + network_id = _get_network_id(driver, instance) + + if port_zone and instance_zone and port_zone != instance_zone: + # If the port and instance are in different zones, delete the old + # port and attach a new one, this only occurs in narrow scenarios + # documented in the following ticket: + # https://bugs.launchpad.net/nova/+bug/1759924 + network_driver.delete_port(port) + driver._connection.ex_attach_interface( + instance.id, network_id=network_id) + + elif not instance._node.private_ips: + # Only add fixed ip if the instance doesn't already have one + driver._connection.ex_add_fixed_ip(instance, network_id) + celery_logger.debug("add_fixed_ip task finished at %s." % datetime.now()) - return fixed_ip except Exception as exc: if "Not Ready" not in str(exc): # Ignore 'normal' errors. From 127770216e53ee691d925ebfaf9847411ffe813c Mon Sep 17 00:00:00 2001 From: Connor Osborn Date: Fri, 13 Apr 2018 13:33:47 -0700 Subject: [PATCH 3/4] Remove fragile error handling --- service/tasks/driver.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/service/tasks/driver.py b/service/tasks/driver.py index 78eb851f2..c6ea68fd6 100644 --- a/service/tasks/driver.py +++ b/service/tasks/driver.py @@ -1218,20 +1218,8 @@ def add_floating_ip(driverCls, provider, identity, core_identity_uuid, countdown = min(2**current.request.retries, 128) add_floating_ip.retry(exc=floating_ip_err, countdown=countdown) - except NeutronBadRequest as bad_request: - # NOTE: 'Neutron Bad Request' is a good message to 'catch and fix' - # because its a user-supplied problem. - # Here we will attempt to 'fix' requests and put the 'add_floating_ip' - # task back on the queue after we're done. - celery_logger.exception("Neutron did not accept request - %s." - % bad_request.message) - if 'no fixed ip' in bad_request.message.lower(): - fixed_ip = add_fixed_ip(driverCls, provider, identity, - instance_alias) - if fixed_ip: - celery_logger.debug("Fixed IP %s has been added to Instance %s." - % (fixed_ip, instance_alias)) - # let the exception bubble-up for a retry.. + except NeutronBadRequest: + # This is an error on our end, we want it to surface raise except (BaseException, Exception) as exc: celery_logger.exception("Error occurred while assigning a floating IP") From 563d6a1cd86e94d4c942a8f264404aa704a524fd Mon Sep 17 00:00:00 2001 From: Connor Osborn Date: Tue, 17 Apr 2018 10:45:44 -0700 Subject: [PATCH 4/4] Update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60e4b039a..603bf47ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) ### Added - Support multiple hostnames for Atmosphere(1) server ([#602](https://github.com/cyverse/atmosphere/pull/602)) +### Fixed + - On start/unshelve instances would fail to be reachable because ports added + post boot ([#604](https://github.com/cyverse/atmosphere/pull/604)) + ## [v32-0](https://github.com/cyverse/atmosphere/compare/v31-1...v32-0) 2018-04-03 ### Changed ### Added