|
| 1 | +from dart.context.locator import injectable |
| 2 | +from dart.model.workflow import WorkflowState, WorkflowInstanceState |
| 3 | +from dart.model.action import ActionState |
| 4 | + |
| 5 | +import logging |
| 6 | +import boto3 |
| 7 | + |
| 8 | +_logger = logging.getLogger(__name__) |
| 9 | + |
@injectable
class PendingActionsCheck(object):
    """"Zombie check": reconcile dart actions with the AWS Batch jobs that ran them.

    For a given workflow, the check collects the workflow instances that are
    still QUEUED/RUNNING, looks up the Batch jobs referenced by their actions
    and, when a job has already finished while its action has not, pushes the
    final state onto the action and its workflow instance.
    """

    # Workflow-instance states that mean "still in flight".
    _NOT_COMPLETE_STATES = ('QUEUED', 'RUNNING')
    # Action states that are already final and must not be overwritten.
    _DONE_ACTION_STATES = ('FAILED', 'COMPLETED')
    # AWS Batch DescribeJobs accepts at most 100 job ids per request.
    _MAX_JOBS_PER_DESCRIBE = 100

    def __init__(self, action_service):
        """Store the action service and create the boto3 Batch client."""
        self._action_service = action_service
        self._batch_client = boto3.client('batch')

    def get_not_completed_workflow_instances(self, workflow_id, workflow_service):
        """Return the QUEUED/RUNNING instances of a workflow.

        :param workflow_id: id of the workflow to inspect.
        :param workflow_service: service used to load the workflow and its
            instances (passed in by the caller rather than injected).
        :return: list of workflow instances whose state is QUEUED or RUNNING,
            or ``None`` when the workflow does not exist.
        """
        wf = workflow_service.get_workflow(workflow_id, raise_when_missing=False)
        if not wf:
            _logger.info('Zombie Check: workflow (id={wf_id}) not found.'.format(wf_id=workflow_id))
            return None

        if wf.data.state != WorkflowState.ACTIVE:
            # Informational only: instances of a non-ACTIVE workflow are still inspected.
            _logger.info('Zombie Check: expected workflow (id={wf_id}) to be in ACTIVE state.'.format(
                wf_id=workflow_id))

        all_wf_instances = workflow_service.find_workflow_instances(workflow_id)
        # Use a dedicated loop variable: on Python 2 a comprehension variable
        # leaks into the enclosing scope and would clobber `wf`.
        current_wf_instances = [
            instance for instance in all_wf_instances
            if instance.data.state in self._NOT_COMPLETE_STATES
        ]
        _logger.info('Zombie Check: Found workflow instance ids (workflow_id={0}) instances = {1}'.format(
            workflow_id, [instance.id for instance in current_wf_instances]))

        return current_wf_instances

    def get_instance_actions(self, current_wf_instances):
        """Collect the actions of the given workflow instances.

        :param current_wf_instances: iterable of workflow instances.
        :return: tuple ``(actions, jobs_2_actions, action_2_wf_instance)``:
            ``actions`` is every action found for the instances,
            ``jobs_2_actions`` maps a Batch job id to its action (actions
            without a ``batch_job_id`` are left out) and
            ``action_2_wf_instance`` maps an action id to its workflow instance.
        """
        incomplete_actions = []
        action_2_wf_instance = {}
        jobs_2_actions = {}
        for wf_instance in current_wf_instances:
            wf_instance_actions = self._action_service.find_actions(workflow_instance_id=wf_instance.id)
            for action in wf_instance_actions:
                incomplete_actions.append(action)
                action_2_wf_instance[action.id] = wf_instance
                if action.data.batch_job_id:
                    jobs_2_actions[action.data.batch_job_id] = action

        return incomplete_actions, jobs_2_actions, action_2_wf_instance

    def handle_done_batch_jobs_with_not_complete_wf_instances(self, batch_jobs, jobs_2_actions,
                                                              action_2_wf_instance, workflow_service):
        """Propagate finished Batch job states to actions and workflow instances.

        :param batch_jobs: response dict of ``describe_jobs``; its ``'jobs'``
            entry is a list of dicts carrying ``'jobId'`` and ``'status'``.
        :param jobs_2_actions: mapping of Batch job id to action.
        :param action_2_wf_instance: mapping of action id to workflow instance.
        :param workflow_service: service used to update the workflow instance.
        """
        for job in batch_jobs.get('jobs') or []:
            job_id = job.get('jobId')
            # Batch may describe a job we hold no action for; skip it instead of raising KeyError.
            action = jobs_2_actions.get(job_id)
            if not action:
                continue
            if action.data.state in self._DONE_ACTION_STATES:
                # The action already reached a final state; nothing to reconcile.
                continue

            wf_instance = action_2_wf_instance[action.id]
            status = job.get('status')

            if status == 'FAILED':
                # Job failed + action not final => fail the action and its workflow instance.
                _logger.info("Zombie Check: Job {0} is failed but action {1} is not failed/completed. "
                             "Updating action and workflow_instance to FAILED".format(job_id, action.id))
                self._action_service.update_action_state(action, ActionState.FAILED, action.data.error_message)
                workflow_service.update_workflow_instance_state(wf_instance, WorkflowInstanceState.FAILED)
            elif status == 'SUCCEEDED':
                # AWS Batch reports a successful job as SUCCEEDED (it has no COMPLETED status).
                # Job succeeded + action not final => complete the action and its workflow instance.
                # NOTE(review): the instance is completed as soon as one of its jobs succeeds;
                # confirm this is intended for workflow instances that own several actions.
                _logger.info("Zombie Check: Job {0} is succeeded but action {1} is not failed/completed. "
                             "Updating action and workflow_instance to COMPLETED".format(job_id, action.id))
                self._action_service.update_action_state(action, ActionState.COMPLETED, action.data.error_message)
                workflow_service.update_workflow_instance_state(wf_instance, WorkflowInstanceState.COMPLETED)

    def find_pending_dart_actions(self, workflow_id, workflow_service):
        """Run the zombie check for one workflow.

        We send workflow_service to avoid cyclical injection from workflow_service.

        :param workflow_id: id of the workflow to check.
        :param workflow_service: workflow service instance supplied by the caller.
        """
        current_wf_instances = self.get_not_completed_workflow_instances(workflow_id, workflow_service)
        if not current_wf_instances:
            return

        incomplete_actions, jobs_2_actions, action_2_wf_instance = self.get_instance_actions(current_wf_instances)
        # Actions that were never submitted to Batch have no job id; describe_jobs rejects None entries.
        batch_job_ids = [action.data.batch_job_id for action in incomplete_actions if action.data.batch_job_id]
        _logger.info("Zombie Check: extract job_ids {0} form incomplete actions {1}".format(
            batch_job_ids, [act.id for act in incomplete_actions]))

        # Query Batch in slices because DescribeJobs is limited to 100 ids per call.
        # An empty id list results in no call at all.
        for start in range(0, len(batch_job_ids), self._MAX_JOBS_PER_DESCRIBE):
            job_ids_chunk = batch_job_ids[start:start + self._MAX_JOBS_PER_DESCRIBE]
            try:
                batch_jobs = self._batch_client.describe_jobs(jobs=job_ids_chunk)
            except Exception as err:
                # Best effort: a failed lookup must not break the caller; the next run retries.
                _logger.error("Zombie Check: failed to execute batch's describe_jobs. err = {0}".format(err))
            else:
                self.handle_done_batch_jobs_with_not_complete_wf_instances(
                    batch_jobs, jobs_2_actions, action_2_wf_instance, workflow_service)
| 83 | + |
| 84 | + |
0 commit comments