Skip to content

Commit

Permalink
Added Openkruise workload integration health check scripts (#16238)
Browse files Browse the repository at this point in the history
Signed-off-by: Mahesh <maheshkasbe010@gmail.com>
Co-authored-by: Ishita Sequeira <46771830+ishitasequeira@users.noreply.github.com>
  • Loading branch information
maheshkasabe and ishitasequeira authored Jan 10, 2024
1 parent 9b27aeb commit d6da9f2
Show file tree
Hide file tree
Showing 39 changed files with 1,174 additions and 0 deletions.
36 changes: 36 additions & 0 deletions resource_customizations/apps.kruise.io/AdvancedCronJob/health.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
-- Health check for the OpenKruise AdvancedCronJob resource.
--
-- Input:   the global `obj` (the resource being assessed, supplied by the caller).
-- Returns: a table { status = <string>, message = <string> }.
--
-- Rules, evaluated in order (first match wins):
--   1. never scheduled and spec.paused == true  -> Suspended
--   2. status.active has at least one entry     -> Progressing
--   3. never scheduled                          -> Degraded
--   4. a lastScheduleTime is recorded           -> Healthy
local hs = { status = "Progressing", message = "AdvancedCronJobs has active jobs" }

-- spec/status may be absent; fall back to empty tables so the field lookups
-- below yield nil instead of raising an "index a nil value" error.
local spec = obj.spec or {}
local status = obj.status or {}

-- Only the presence of lastScheduleTime influences the verdict, so the
-- timestamp is deliberately not parsed: a value in an unexpected format
-- (for example one with fractional seconds) must not abort the health check.
local lastScheduleTime = status.lastScheduleTime
local scheduled = lastScheduleTime ~= nil and lastScheduleTime ~= ""

-- NOTE(review): a paused job that has been scheduled before is NOT reported
-- as Suspended; it falls through to the rules below. Confirm this is intended.
if not scheduled and spec.paused == true then
  hs.status = "Suspended"
  hs.message = "AdvancedCronJob is Paused"
  return hs
end

-- AdvancedCronJobs are progressing if they have any object in the "active" state
if status.active ~= nil and #status.active > 0 then
  hs.status = "Progressing"
  hs.message = "AdvancedCronJobs has active jobs"
  return hs
end

-- AdvancedCronJobs are Degraded if they don't have lastScheduleTime
if not scheduled then
  hs.status = "Degraded"
  hs.message = "AdvancedCronJobs has not run successfully"
  return hs
end

-- AdvancedCronJobs are healthy if they have lastScheduleTime
hs.status = "Healthy"
hs.message = "AdvancedCronJobs has run successfully"
return hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Expected results for the AdvancedCronJob health script.
# Each entry names a fixture (inputPath) together with the status and message
# the script should return for it; the messages match the strings set in the
# script. Presumably consumed by the repository's health-check test harness.
tests:
- healthStatus:
status: Healthy
message: AdvancedCronJobs has run successfully
inputPath: testdata/lastScheduleTime.yaml
- healthStatus:
status: Degraded
message: AdvancedCronJobs has not run successfully
inputPath: testdata/notScheduled.yaml
- healthStatus:
status: Progressing
message: AdvancedCronJobs has active jobs
inputPath: testdata/activeJobs.yaml
- healthStatus:
status: Suspended
message: AdvancedCronJob is Paused
inputPath: testdata/suspended.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Fixture: AdvancedCronJob whose status lists an active BroadcastJob and also
# carries a lastScheduleTime (presumably testdata/activeJobs.yaml).
# The health script checks for active jobs before it looks at lastScheduleTime,
# so this input is expected to be reported as Progressing.
apiVersion: apps.kruise.io/v1alpha1
kind: AdvancedCronJob
metadata:
name: acj-test
spec:
schedule: "*/1 * * * *"
template:
broadcastJobTemplate:
spec:
template:
spec:
containers:
- name: pi
image: perl
command: ["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"]
restartPolicy: Never
completionPolicy:
type: Always
ttlSecondsAfterFinished: 30

status:
active:
- apiVersion: apps.kruise.io/v1alpha1
kind: BroadcastJob
name: acj-test-1694882400
namespace: default
resourceVersion: '4012'
uid: 2b08a429-a43b-4382-8e5d-3db0c72b5b13
lastScheduleTime: '2023-09-16T16:40:00Z'
type: BroadcastJob
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Fixture: AdvancedCronJob with a lastScheduleTime and no active jobs
# (presumably testdata/lastScheduleTime.yaml).
# With nothing active and a schedule time recorded, the health script is
# expected to report Healthy.
apiVersion: apps.kruise.io/v1alpha1
kind: AdvancedCronJob
metadata:
name: acj-test
spec:
schedule: "*/1 * * * *"
template:
broadcastJobTemplate:
spec:
template:
spec:
containers:
- name: pi
image: perl
command: ["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"]
restartPolicy: Never
completionPolicy:
type: Always
ttlSecondsAfterFinished: 30

status:
lastScheduleTime: "2023-09-16T16:29:00Z"
type: BroadcastJob
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Fixture: AdvancedCronJob whose lastScheduleTime is explicitly null, with no
# active jobs and no paused flag (presumably testdata/notScheduled.yaml).
# The health script treats a missing lastScheduleTime as Degraded.
apiVersion: apps.kruise.io/v1alpha1
kind: AdvancedCronJob
metadata:
name: acj-test
spec:
schedule: "*/1 * * * *"
template:
broadcastJobTemplate:
spec:
template:
spec:
containers:
- name: pi
image: perl
command: ["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"]
restartPolicy: Never
completionPolicy:
type: Always
ttlSecondsAfterFinished: 30

status:
lastScheduleTime: null
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Fixture: AdvancedCronJob with paused: true and no lastScheduleTime
# (presumably testdata/suspended.yaml).
# The health script reports Suspended when the job is paused and has never
# been scheduled.
apiVersion: apps.kruise.io/v1alpha1
kind: AdvancedCronJob
metadata:
name: acj-test
spec:
schedule: "*/1 * * * *"
template:
broadcastJobTemplate:
spec:
template:
spec:
containers:
- name: pi
image: perl
command: ["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"]
restartPolicy: Never
completionPolicy:
type: Always
ttlSecondsAfterFinished: 30
paused: true

status:
type: BroadcastJob
32 changes: 32 additions & 0 deletions resource_customizations/apps.kruise.io/BroadcastJob/health.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
-- Health check for the OpenKruise BroadcastJob resource.
--
-- Input:   the global `obj` (the resource being assessed, supplied by the caller).
-- Returns: a table { status = <string>, message = <string> }.
--
-- The result defaults to Progressing and is overridden by the first matching
-- rule below; a resource without a status table stays Progressing.
local hs = { status = "Progressing", message = "BroadcastJob is still running" }

local status = obj.status
if status == nil then
  return hs
end

local phase = status.phase

-- BroadcastJob is healthy if the desired and succeeded counts are equal and
-- the phase is "completed"
if status.desired == status.succeeded and phase == "completed" then
  hs.status = "Healthy"
  hs.message = "BroadcastJob is completed successfully"
  return hs
end

-- BroadcastJob is progressing if active is not equal to 0 while running
if status.active ~= 0 and phase == "running" then
  hs.status = "Progressing"
  hs.message = "BroadcastJob is still running"
  return hs
end

-- BroadcastJob is degraded if failed is not equal to 0 and the phase is "failed"
if status.failed ~= 0 and phase == "failed" then
  hs.status = "Degraded"
  hs.message = "BroadcastJob failed"
  return hs
end

-- BroadcastJob is suspended only when both the reported phase and the spec
-- say it is paused; guard obj.spec so a missing spec cannot raise.
if phase == "paused" and obj.spec ~= nil and obj.spec.paused == true then
  hs.status = "Suspended"
  hs.message = "BroadcastJob is Paused"
  return hs
end

return hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Expected results for the BroadcastJob health script.
# Each entry names a fixture (inputPath) together with the status and message
# the script should return for it; the messages match the strings set in the
# script. Presumably consumed by the repository's health-check test harness.
tests:
- healthStatus:
status: Healthy
message: "BroadcastJob is completed successfully"
inputPath: testdata/succeeded.yaml
- healthStatus:
status: Degraded
message: "BroadcastJob failed"
inputPath: testdata/failed.yaml
- healthStatus:
status: Progressing
message: "BroadcastJob is still running"
inputPath: testdata/running.yaml
- healthStatus:
status: Suspended
message: "BroadcastJob is Paused"
inputPath: testdata/suspended.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Fixture: BroadcastJob that ended in failure (presumably testdata/failed.yaml).
# status has failed: 1 and phase: failed, which the health script maps to
# Degraded.
apiVersion: apps.kruise.io/v1alpha1
kind: BroadcastJob
metadata:
name: failed-job
spec:
template:
spec:
containers:
- name: guestbook
image: openkruise/guestbook:v3
command: ["exit", "1"] # a dummy command to fail
restartPolicy: Never
completionPolicy:
type: Always
ttlSecondsAfterFinished: 60 # the job will be deleted after 60 seconds

status:
active: 0
completionTime: '2023-09-17T14:31:38Z'
conditions:
- lastProbeTime: '2023-09-17T14:31:38Z'
lastTransitionTime: '2023-09-17T14:31:38Z'
message: failure policy is FailurePolicyTypeFailFast and failed pod is found
reason: Failed
status: 'True'
type: Failed
desired: 1
failed: 1
phase: failed
startTime: '2023-09-17T14:31:32Z'
succeeded: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Fixture: BroadcastJob that is still executing (presumably testdata/running.yaml).
# status has active: 1 and phase: running, which the health script maps to
# Progressing.
apiVersion: apps.kruise.io/v1alpha1
kind: BroadcastJob
metadata:
name: download-image
spec:
template:
spec:
containers:
- name: guestbook
image: openkruise/guestbook:v3
command: ["echo", "started"] # a dummy command to do nothing
restartPolicy: Never
completionPolicy:
type: Always
ttlSecondsAfterFinished: 60 # the job will be deleted after 60 seconds
status:
active: 1
desired: 1
failed: 0
phase: running
startTime: '2023-09-17T14:43:30Z'
succeeded: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Fixture: BroadcastJob that finished successfully (presumably testdata/succeeded.yaml).
# desired and succeeded are both 1 and phase is completed, which the health
# script maps to Healthy.
apiVersion: apps.kruise.io/v1alpha1
kind: BroadcastJob
metadata:
name: download-image
spec:
template:
spec:
containers:
- name: guestbook
image: openkruise/guestbook:v3
command: ["echo", "started"] # a dummy command to do nothing
restartPolicy: Never
completionPolicy:
type: Always
ttlSecondsAfterFinished: 60 # the job will be deleted after 60 seconds
status:
active: 0
completionTime: '2023-09-17T14:35:14Z'
conditions:
- lastProbeTime: '2023-09-17T14:35:14Z'
lastTransitionTime: '2023-09-17T14:35:14Z'
message: Job completed, 1 pods succeeded, 0 pods failed
reason: Complete
status: 'True'
type: Complete
desired: 1
failed: 0
phase: completed
startTime: '2023-09-17T14:35:07Z'
succeeded: 1

Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Fixture: paused BroadcastJob (presumably testdata/suspended.yaml).
# It sets paused: true and phase: paused, which the health script maps to
# Suspended.
# NOTE(review): completionTime and the "Complete" condition look carried over
# from the succeeded fixture; no branch of the health script reads them.
apiVersion: apps.kruise.io/v1alpha1
kind: BroadcastJob
metadata:
name: download-image
spec:
template:
spec:
containers:
- name: guestbook
image: openkruise/guestbook:v3
command: ["echo", "started"] # a dummy command to do nothing
restartPolicy: Never
paused: true
completionPolicy:
type: Always
ttlSecondsAfterFinished: 60 # the job will be deleted after 60 seconds
status:
active: 0
completionTime: '2023-09-17T14:35:14Z'
conditions:
- lastProbeTime: '2023-09-17T14:35:14Z'
lastTransitionTime: '2023-09-17T14:35:14Z'
message: Job completed, 1 pods succeeded, 0 pods failed
reason: Complete
status: 'True'
type: Complete
desired: 1
failed: 0
phase: paused
startTime: '2023-09-17T14:35:07Z'
succeeded: 0
33 changes: 33 additions & 0 deletions resource_customizations/apps.kruise.io/CloneSet/health.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Health check for the OpenKruise CloneSet resource.
--
-- Input:   the global `obj` (the resource being assessed, supplied by the caller).
-- Returns: a table { status = <string>, message = <string> }.
--
-- The result stays at the default Progressing / "Waiting for initialization"
-- when there is no status, when the observed generation lags behind
-- metadata.generation, or when a partitioned rollout has not yet updated the
-- expected number of replicas.
local hs = { status = "Progressing", message = "Waiting for initialization" }

local status = obj.status
if status == nil then
  return hs
end

local generation = obj.metadata.generation
if generation ~= status.observedGeneration then
  -- The reported status does not describe the current spec yet.
  return hs
end

-- spec.updateStrategy may be omitted; use an empty table so the lookups below
-- yield nil instead of raising an "index a nil value" error.
local updateStrategy = (obj.spec or {}).updateStrategy or {}

-- An unset partition means "no partition". Without this default, nil ~= 0
-- would be true and an unpartitioned CloneSet would take the partition branch.
local partition = updateStrategy.partition
if partition == nil then
  partition = 0
end

-- NOTE(review): a missing status.updatedAvailableReplicas is also reported as
-- "paused" here; kept as-is, but confirm that this is the intended meaning.
if updateStrategy.paused == true or not status.updatedAvailableReplicas then
  hs.status = "Suspended"
  hs.message = "Cloneset is paused"
  return hs
end

if partition ~= 0 and (generation or 0) > 1 then
  -- Absent counters are treated as 0 so the comparison cannot raise on nil.
  local updated = status.updatedReplicas or 0
  local expected = status.expectedUpdatedReplicas or 0
  if updated >= expected then
    hs.status = "Suspended"
    hs.message = "Cloneset needs manual intervention"
    return hs
  end
  -- Still updating towards the partition target: keep the default Progressing.
  return hs
end

if status.updatedAvailableReplicas == status.replicas then
  hs.status = "Healthy"
  hs.message = "All Cloneset workloads are ready and updated"
  return hs
end

hs.status = "Degraded"
hs.message = "Some replicas are not ready or available"
return hs
21 changes: 21 additions & 0 deletions resource_customizations/apps.kruise.io/CloneSet/health_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Expected results for the CloneSet health script.
# Each entry names a fixture (inputPath) together with the status and message
# the script should return for it.
# The key is spelled `inputPath` in every entry, matching the other
# health_test.yaml files; two entries previously used `inputpath`.
tests:
  - healthStatus:
      status: Healthy
      message: "All Cloneset workloads are ready and updated"
    inputPath: testdata/healthy.yaml
  - healthStatus:
      status: Degraded
      message: "Some replicas are not ready or available"
    inputPath: testdata/degraded.yaml
  - healthStatus:
      status: Progressing
      message: "Waiting for initialization"
    inputPath: testdata/unknown.yaml
  - healthStatus:
      status: Suspended
      message: "Cloneset is paused"
    inputPath: testdata/suspended.yaml
  - healthStatus:
      status: Suspended
      message: "Cloneset needs manual intervention"
    inputPath: testdata/partition_suspended.yaml
Loading

0 comments on commit d6da9f2

Please sign in to comment.