chore: infer managed resources health from redis instead of storing it in CRD #10191

Merged · 2 commits · Aug 17, 2022
4 changes: 4 additions & 0 deletions assets/swagger.json
@@ -5306,6 +5306,10 @@
"reconciledAt": {
"$ref": "#/definitions/v1Time"
},
"resourceHealthSource": {
"type": "string",
"title": "ResourceHealthSource indicates where the resource health status is stored: inline if not set or appTree"
},
"resources": {
"type": "array",
"title": "Resources is a list of Kubernetes resources managed by this application",
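
The new `resourceHealthSource` field tells API consumers where per-resource health now lives. Below is a minimal sketch of the check a consumer might perform, assuming only what the field description above states (unset means inline, `appTree` means the cached resource tree); the helper name is hypothetical:

```go
// healthIsInline reports whether per-resource health is persisted directly in
// the Application CRD (.status.resources[].health). Per the field description,
// an unset resourceHealthSource means inline; "appTree" means health must be
// looked up from the application's cached resource tree instead.
func healthIsInline(resourceHealthSource string) bool {
	return resourceHealthSource != "appTree"
}
```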
@@ -61,6 +61,7 @@ func NewCommand() *cobra.Command {
repoServerStrictTLS bool
otlpAddress string
applicationNamespaces []string
persistResourceHealth bool
)
var command = cobra.Command{
Use: cliName,
@@ -149,6 +150,7 @@
metricsCacheExpiration,
metricsAplicationLabels,
kubectlParallelismLimit,
persistResourceHealth,
clusterFilter,
applicationNamespaces)
errors.CheckError(err)
@@ -192,6 +194,7 @@
command.Flags().StringSliceVar(&metricsAplicationLabels, "metrics-application-labels", []string{}, "List of Application labels that will be added to the argocd_application_labels metric")
command.Flags().StringVar(&otlpAddress, "otlp-address", env.StringFromEnv("ARGOCD_APPLICATION_CONTROLLER_OTLP_ADDRESS", ""), "OpenTelemetry collector address to send traces to")
command.Flags().StringSliceVar(&applicationNamespaces, "application-namespaces", env.StringsFromEnv("ARGOCD_APPLICATION_NAMESPACES", []string{}, ","), "List of additional namespaces that applications are allowed to be reconciled from")
command.Flags().BoolVar(&persistResourceHealth, "persist-resource-health", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_PERSIST_RESOURCE_HEALTH", true), "Enables storing the managed resources health in the Application CRD")
cacheSrc = appstatecache.AddCacheFlagsToCmd(&command, func(client *redis.Client) {
redisClient = client
})
2 changes: 1 addition & 1 deletion cmd/argocd/commands/admin/app.go
@@ -371,7 +371,7 @@ func reconcileApplications(
)

appStateManager := controller.NewAppStateManager(
argoDB, appClientset, repoServerClient, namespace, kubeutil.NewKubectl(), settingsMgr, stateCache, projInformer, server, cache, time.Second, argo.NewResourceTracking())
argoDB, appClientset, repoServerClient, namespace, kubeutil.NewKubectl(), settingsMgr, stateCache, projInformer, server, cache, time.Second, argo.NewResourceTracking(), false)

appsList, err := appClientset.ArgoprojV1alpha1().Applications(namespace).List(ctx, v1.ListOptions{LabelSelector: selector})
if err != nil {
3 changes: 2 additions & 1 deletion controller/appcontroller.go
@@ -132,6 +132,7 @@ func NewApplicationController(
metricsCacheExpiration time.Duration,
metricsApplicationLabels []string,
kubectlParallelismLimit int64,
persistResourceHealth bool,
clusterFilter func(cluster *appv1.Cluster) bool,
applicationNamespaces []string,
) (*ApplicationController, error) {
@@ -209,7 +210,7 @@
}
}
stateCache := statecache.NewLiveStateCache(db, appInformer, ctrl.settingsMgr, kubectl, ctrl.metricsServer, ctrl.handleObjectUpdated, clusterFilter, argo.NewResourceTracking())
appStateManager := NewAppStateManager(db, applicationClientset, repoClientset, namespace, kubectl, ctrl.settingsMgr, stateCache, projInformer, ctrl.metricsServer, argoCache, ctrl.statusRefreshTimeout, argo.NewResourceTracking())
appStateManager := NewAppStateManager(db, applicationClientset, repoClientset, namespace, kubectl, ctrl.settingsMgr, stateCache, projInformer, ctrl.metricsServer, argoCache, ctrl.statusRefreshTimeout, argo.NewResourceTracking(), persistResourceHealth)
ctrl.appInformer = appInformer
ctrl.appLister = appLister
ctrl.projInformer = projInformer
1 change: 1 addition & 0 deletions controller/appcontroller_test.go
@@ -109,6 +109,7 @@ func newFakeController(data *fakeData) *ApplicationController {
data.metricsCacheExpiration,
[]string{},
0,
true,
nil,
[]string{},
)
15 changes: 12 additions & 3 deletions controller/health.go
@@ -13,7 +13,7 @@ import (
)

// setApplicationHealth updates the health statuses of all resources performed in the comparison
func setApplicationHealth(resources []managedResource, statuses []appv1.ResourceStatus, resourceOverrides map[string]appv1.ResourceOverride, app *appv1.Application) (*appv1.HealthStatus, error) {
func setApplicationHealth(resources []managedResource, statuses []appv1.ResourceStatus, resourceOverrides map[string]appv1.ResourceOverride, app *appv1.Application, persistResourceHealth bool) (*appv1.HealthStatus, error) {
var savedErr error
appHealth := appv1.HealthStatus{Status: health.HealthStatusHealthy}
for i, res := range resources {
@@ -42,8 +42,12 @@ func setApplicationHealth(resources []managedResource, statuses []appv1.Resource
}
}
if healthStatus != nil {
resHealth := appv1.HealthStatus{Status: healthStatus.Status, Message: healthStatus.Message}
statuses[i].Health = &resHealth
if persistResourceHealth {
resHealth := appv1.HealthStatus{Status: healthStatus.Status, Message: healthStatus.Message}
statuses[i].Health = &resHealth
} else {
statuses[i].Health = nil
}

// If health status is missing but the resource has no built-in/custom health check then it should not affect parent app health
if _, hasOverride := healthOverrides[lua.GetConfigMapKey(gvk)]; healthStatus.Status == health.HealthStatusMissing && !hasOverride && health.GetHealthCheckFunc(gvk) == nil {
@@ -60,5 +64,10 @@ func setApplicationHealth(resources []managedResource, statuses []appv1.Resource
}
}
}
if persistResourceHealth {
app.Status.ResourceHealthSource = appv1.ResourceHealthLocationInline
} else {
app.Status.ResourceHealthSource = appv1.ResourceHealthLocationAppTree
}
return &appHealth, savedErr
}
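
This is the write side of the change: with `persistResourceHealth` disabled, the controller nils out inline health and marks the app with `ResourceHealthLocationAppTree`, and readers are expected to infer health from the resource tree the controller already caches in Redis. Below is a rough sketch of that read-side inference; the stand-in types and the `mergeHealthFromTree` helper are assumptions for illustration, not the PR's actual code:

```go
package main

import "fmt"

// Simplified stand-ins for the real API types; shapes are assumptions.
type healthStatus struct{ Status, Message string }

type treeNode struct {
	Group, Kind, Namespace, Name string
	Health                       *healthStatus
}

type resourceStatus struct {
	Group, Kind, Namespace, Name string
	Health                       *healthStatus // nil when not persisted inline
}

// mergeHealthFromTree copies health from the cached resource tree onto the
// resource statuses, matching nodes by group/kind/namespace/name. This is the
// kind of lookup a reader needs once health is no longer stored in the CRD.
func mergeHealthFromTree(statuses []resourceStatus, nodes []treeNode) {
	key := func(g, k, ns, n string) string { return g + "/" + k + "/" + ns + "/" + n }
	byKey := make(map[string]*healthStatus, len(nodes))
	for _, n := range nodes {
		byKey[key(n.Group, n.Kind, n.Namespace, n.Name)] = n.Health
	}
	for i, s := range statuses {
		statuses[i].Health = byKey[key(s.Group, s.Kind, s.Namespace, s.Name)] // nil if absent
	}
}

func main() {
	nodes := []treeNode{{Kind: "Deployment", Namespace: "default", Name: "app",
		Health: &healthStatus{Status: "Healthy"}}}
	statuses := []resourceStatus{{Kind: "Deployment", Namespace: "default", Name: "app"}}
	mergeHealthFromTree(statuses, nodes)
	fmt.Println(statuses[0].Health.Status) // prints: Healthy
}
```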
32 changes: 25 additions & 7 deletions controller/health_test.go
@@ -51,25 +51,43 @@ func TestSetApplicationHealth(t *testing.T) {
}}
resourceStatuses := initStatuses(resources)

healthStatus, err := setApplicationHealth(resources, resourceStatuses, lua.ResourceHealthOverrides{}, app)
healthStatus, err := setApplicationHealth(resources, resourceStatuses, lua.ResourceHealthOverrides{}, app, true)
assert.NoError(t, err)
assert.Equal(t, health.HealthStatusDegraded, healthStatus.Status)

assert.Equal(t, resourceStatuses[0].Health.Status, health.HealthStatusHealthy)
assert.Equal(t, resourceStatuses[1].Health.Status, health.HealthStatusDegraded)

// now mark the job as a hook and retry. it should ignore the hook and consider the app healthy
failedJob.SetAnnotations(map[string]string{synccommon.AnnotationKeyHook: "PreSync"})
healthStatus, err = setApplicationHealth(resources, resourceStatuses, nil, app)
healthStatus, err = setApplicationHealth(resources, resourceStatuses, nil, app, true)
assert.NoError(t, err)
assert.Equal(t, health.HealthStatusHealthy, healthStatus.Status)
}

func TestSetApplicationHealth_ResourceHealthNotPersisted(t *testing.T) {
failedJob := resourceFromFile("./testdata/job-failed.yaml")

resources := []managedResource{{
Group: "batch", Version: "v1", Kind: "Job", Live: &failedJob,
}}
resourceStatuses := initStatuses(resources)

healthStatus, err := setApplicationHealth(resources, resourceStatuses, lua.ResourceHealthOverrides{}, app, false)
assert.NoError(t, err)
assert.Equal(t, health.HealthStatusDegraded, healthStatus.Status)

assert.Nil(t, resourceStatuses[0].Health)
}

func TestSetApplicationHealth_MissingResource(t *testing.T) {
pod := resourceFromFile("./testdata/pod-running-restart-always.yaml")

resources := []managedResource{{
Group: "", Version: "v1", Kind: "Pod", Target: &pod}, {}}
resourceStatuses := initStatuses(resources)

healthStatus, err := setApplicationHealth(resources, resourceStatuses, lua.ResourceHealthOverrides{}, app)
healthStatus, err := setApplicationHealth(resources, resourceStatuses, lua.ResourceHealthOverrides{}, app, true)
assert.NoError(t, err)
assert.Equal(t, health.HealthStatusMissing, healthStatus.Status)
}
@@ -82,7 +100,7 @@ func TestSetApplicationHealth_MissingResourceNoBuiltHealthCheck(t *testing.T) {
resourceStatuses := initStatuses(resources)

t.Run("NoOverride", func(t *testing.T) {
healthStatus, err := setApplicationHealth(resources, resourceStatuses, lua.ResourceHealthOverrides{}, app)
healthStatus, err := setApplicationHealth(resources, resourceStatuses, lua.ResourceHealthOverrides{}, app, true)
assert.NoError(t, err)
assert.Equal(t, health.HealthStatusHealthy, healthStatus.Status)
assert.Equal(t, resourceStatuses[0].Health.Status, health.HealthStatusMissing)
@@ -93,7 +111,7 @@ func TestSetApplicationHealth_MissingResourceNoBuiltHealthCheck(t *testing.T) {
lua.GetConfigMapKey(schema.GroupVersionKind{Version: "v1", Kind: "ConfigMap"}): appv1.ResourceOverride{
HealthLua: "some health check",
},
}, app)
}, app, true)
assert.NoError(t, err)
assert.Equal(t, health.HealthStatusMissing, healthStatus.Status)
})
@@ -143,7 +161,7 @@ return hs`,
Group: application.Group, Version: "v1alpha1", Kind: application.ApplicationKind, Live: degradedApp}, {}}
resourceStatuses := initStatuses(resources)

healthStatus, err := setApplicationHealth(resources, resourceStatuses, overrides, app)
healthStatus, err := setApplicationHealth(resources, resourceStatuses, overrides, app, true)
assert.NoError(t, err)
assert.Equal(t, health.HealthStatusDegraded, healthStatus.Status)
})
@@ -154,7 +172,7 @@ return hs`,
Group: application.Group, Version: "v1alpha1", Kind: application.ApplicationKind, Live: degradedApp}, {}}
resourceStatuses := initStatuses(resources)

healthStatus, err := setApplicationHealth(resources, resourceStatuses, overrides, app)
healthStatus, err := setApplicationHealth(resources, resourceStatuses, overrides, app, true)
assert.NoError(t, err)
assert.Equal(t, health.HealthStatusHealthy, healthStatus.Status)
})
53 changes: 28 additions & 25 deletions controller/state.go
@@ -89,18 +89,19 @@ func (res *comparisonResult) GetHealthStatus() *v1alpha1.HealthStatus {

// appStateManager allows to compare applications to git
type appStateManager struct {
metricsServer *metrics.MetricsServer
db db.ArgoDB
settingsMgr *settings.SettingsManager
appclientset appclientset.Interface
projInformer cache.SharedIndexInformer
kubectl kubeutil.Kubectl
repoClientset apiclient.Clientset
liveStateCache statecache.LiveStateCache
cache *appstatecache.Cache
namespace string
statusRefreshTimeout time.Duration
resourceTracking argo.ResourceTracking
metricsServer *metrics.MetricsServer
db db.ArgoDB
settingsMgr *settings.SettingsManager
appclientset appclientset.Interface
projInformer cache.SharedIndexInformer
kubectl kubeutil.Kubectl
repoClientset apiclient.Clientset
liveStateCache statecache.LiveStateCache
cache *appstatecache.Cache
namespace string
statusRefreshTimeout time.Duration
resourceTracking argo.ResourceTracking
persistResourceHealth bool
}

func (m *appStateManager) getRepoObjs(app *v1alpha1.Application, source v1alpha1.ApplicationSource, appLabelKey, revision string, noCache, noRevisionCache, verifySignature bool, proj *v1alpha1.AppProject) ([]*unstructured.Unstructured, *apiclient.ManifestResponse, error) {
@@ -588,7 +589,7 @@ func (m *appStateManager) CompareAppState(app *v1alpha1.Application, project *ap
}
ts.AddCheckpoint("sync_ms")

healthStatus, err := setApplicationHealth(managedResources, resourceSummaries, resourceOverrides, app)
healthStatus, err := setApplicationHealth(managedResources, resourceSummaries, resourceOverrides, app, m.persistResourceHealth)
if err != nil {
conditions = append(conditions, appv1.ApplicationCondition{Type: v1alpha1.ApplicationConditionComparisonError, Message: err.Error(), LastTransitionTime: &now})
}
@@ -664,20 +665,22 @@ func NewAppStateManager(
cache *appstatecache.Cache,
statusRefreshTimeout time.Duration,
resourceTracking argo.ResourceTracking,
persistResourceHealth bool,
) AppStateManager {
return &appStateManager{
liveStateCache: liveStateCache,
cache: cache,
db: db,
appclientset: appclientset,
kubectl: kubectl,
repoClientset: repoClientset,
namespace: namespace,
settingsMgr: settingsMgr,
projInformer: projInformer,
metricsServer: metricsServer,
statusRefreshTimeout: statusRefreshTimeout,
resourceTracking: resourceTracking,
liveStateCache: liveStateCache,
cache: cache,
db: db,
appclientset: appclientset,
kubectl: kubectl,
repoClientset: repoClientset,
namespace: namespace,
settingsMgr: settingsMgr,
projInformer: projInformer,
metricsServer: metricsServer,
statusRefreshTimeout: statusRefreshTimeout,
resourceTracking: resourceTracking,
persistResourceHealth: persistResourceHealth,
}
}

5 changes: 5 additions & 0 deletions docs/operator-manual/argocd-cmd-params-cm.yaml
@@ -50,6 +50,11 @@ data:
controller.self.heal.timeout.seconds: "5"
# Cache expiration for app state (default 1h0m0s)
controller.app.state.cache.expiration: "1h0m0s"
# Specifies whether resource health should be persisted in the Application CRD (default true)
# Changing this to `false` significantly reduces the number of Application CRD updates and improves controller performance.
# However, disabling persisted resource health might affect tools that read the Application CRD directly,
# so switching the default to `false` is deferred until the v3.0 release.
controller.resource.health.persist: "true"
# Cache expiration default (default 24h0m0s)
controller.default.cache.expiration: "24h0m0s"

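
As the caveat above notes, integrations that read the Application CRD directly can no longer assume `.status.resources[].health` is populated once this setting is `false`. A hedged sketch of the guard such a consumer might add (simplified field shapes, not the real client-go types):

```go
package main

import "fmt"

// Simplified view of the relevant status fields; shapes are illustrative only.
type resource struct {
	Name   string
	Health *string // nil when health is not persisted inline
}

type appStatus struct {
	ResourceHealthSource string // "" (inline, the default) or "appTree"
	Resources            []resource
}

func main() {
	st := appStatus{
		ResourceHealthSource: "appTree",
		Resources:            []resource{{Name: "my-deployment"}},
	}
	for _, r := range st.Resources {
		if st.ResourceHealthSource == "appTree" || r.Health == nil {
			// Health is not inline: fetch it via the Argo CD API, which can
			// read the cached resource tree, rather than assuming a status.
			fmt.Printf("%s: health not persisted inline; query the API\n", r.Name)
			continue
		}
		fmt.Printf("%s: %s\n", r.Name, *r.Health)
	}
}
```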
@@ -40,6 +40,7 @@ argocd-application-controller [flags]
--operation-processors int Number of application operation processors (default 10)
--otlp-address string OpenTelemetry collector address to send traces to
--password string Password for basic authentication to the API server
--persist-resource-health Enables storing the managed resources health in the Application CRD (default true)
--proxy-url string If provided, this URL will be used to connect via proxy
--redis string Redis server hostname and port (e.g. argocd-redis:6379).
--redis-ca-certificate string Path to Redis server CA certificate (e.g. /etc/certs/redis/ca.crt). If not specified, system trusted CAs will be used for server certificate validation.
@@ -95,6 +95,12 @@ spec:
name: argocd-cmd-params-cm
key: controller.repo.server.strict.tls
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_PERSIST_RESOURCE_HEALTH
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.resource.health.persist
optional: true
- name: ARGOCD_APP_STATE_CACHE_EXPIRATION
valueFrom:
configMapKeyRef:
10 changes: 10 additions & 0 deletions manifests/core-install.yaml
@@ -1806,6 +1806,10 @@ spec:
reconciled using the latest git version
format: date-time
type: string
resourceHealthSource:
description: 'ResourceHealthSource indicates where the resource health
status is stored: inline if not set or appTree'
type: string
resources:
description: Resources is a list of Kubernetes resources managed by
this application
@@ -10080,6 +10084,12 @@
key: controller.repo.server.strict.tls
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_PERSIST_RESOURCE_HEALTH
valueFrom:
configMapKeyRef:
key: controller.resource.health.persist
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APP_STATE_CACHE_EXPIRATION
valueFrom:
configMapKeyRef:
4 changes: 4 additions & 0 deletions manifests/crds/application-crd.yaml
@@ -1805,6 +1805,10 @@ spec:
reconciled using the latest git version
format: date-time
type: string
resourceHealthSource:
description: 'ResourceHealthSource indicates where the resource health
status is stored: inline if not set or appTree'
type: string
resources:
description: Resources is a list of Kubernetes resources managed by
this application
10 changes: 10 additions & 0 deletions manifests/ha/install.yaml
@@ -1806,6 +1806,10 @@ spec:
reconciled using the latest git version
format: date-time
type: string
resourceHealthSource:
description: 'ResourceHealthSource indicates where the resource health
status is stored: inline if not set or appTree'
type: string
resources:
description: Resources is a list of Kubernetes resources managed by
this application
@@ -11835,6 +11839,12 @@
key: controller.repo.server.strict.tls
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_PERSIST_RESOURCE_HEALTH
valueFrom:
configMapKeyRef:
key: controller.resource.health.persist
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APP_STATE_CACHE_EXPIRATION
valueFrom:
configMapKeyRef:
6 changes: 6 additions & 0 deletions manifests/ha/namespace-install.yaml
@@ -2524,6 +2524,12 @@ spec:
key: controller.repo.server.strict.tls
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_PERSIST_RESOURCE_HEALTH
valueFrom:
configMapKeyRef:
key: controller.resource.health.persist
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APP_STATE_CACHE_EXPIRATION
valueFrom:
configMapKeyRef:
10 changes: 10 additions & 0 deletions manifests/install.yaml
@@ -1806,6 +1806,10 @@ spec:
reconciled using the latest git version
format: date-time
type: string
resourceHealthSource:
description: 'ResourceHealthSource indicates where the resource health
status is stored: inline if not set or appTree'
type: string
resources:
description: Resources is a list of Kubernetes resources managed by
this application
@@ -10852,6 +10856,12 @@
key: controller.repo.server.strict.tls
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_PERSIST_RESOURCE_HEALTH
valueFrom:
configMapKeyRef:
key: controller.resource.health.persist
name: argocd-cmd-params-cm
optional: true
- name: ARGOCD_APP_STATE_CACHE_EXPIRATION
valueFrom:
configMapKeyRef: