Skip to content

Commit

Permalink
Prometheus and grafana improvements based on load testing
Browse files Browse the repository at this point in the history
- made prometheus PVC size configurable on the command line
- moved majority of Prometheus config overrides to separate yaml file.
- removed scraping of container stats from the Prometheus config; otherwise
clusters of 10K pods very quickly consume large amounts of storage
- added taints and tolerations to prometheus and grafana

They will now prefer (but not require) being scheduled on nodes labeled
with `stable.agones.dev/agones-metrics: true`. They will also tolerate
taint `stable.agones.dev/agones-metrics=true:NoExecute`.

Creating a node pool dedicated to monitoring is as simple as:

```
gcloud container node-pools create agones-metrics ... \
  --node-taints stable.agones.dev/agones-metrics=true:NoExecute \
  --node-labels stable.agones.dev/agones-metrics=true \
  --num-nodes=1
```
  • Loading branch information
jkowalski committed Jan 24, 2019
1 parent c3bbe54 commit cc29603
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 8 deletions.
13 changes: 7 additions & 6 deletions build/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -389,23 +389,24 @@ pprof-web:

# Set up Prometheus in the current cluster as Helm release "prom" in the
# "metrics" namespace. Chart overrides (disabled components, server resources,
# scheduling and scrape configs) are read from build/prometheus.yaml; only the
# values that are configurable per invocation are passed with --set.
#
# Overridable variables:
#   PVC             - whether the server requests a Persistent Volume Claim (default: true)
#   PV_SIZE         - size of the server's persistent volume (default: 64Gi)
#   SCRAPE_INTERVAL - global scrape interval (default: 30s)
setup-prometheus: PVC ?= true
setup-prometheus: PV_SIZE ?= 64Gi
setup-prometheus: SCRAPE_INTERVAL ?= 30s
setup-prometheus:
	$(DOCKER_RUN) \
		helm upgrade --install --wait prom stable/prometheus --namespace metrics \
		--set server.global.scrape_interval=$(SCRAPE_INTERVAL),server.persistentVolume.enabled=$(PVC),server.persistentVolume.size=$(PV_SIZE) \
		-f $(mount_path)/build/prometheus.yaml

# Set up Grafana in the current cluster with datasource and dashboards ready
# for use with Agones. The manifests under build/grafana/ are applied first,
# then the chart is installed as Helm release "grafana" in the "metrics"
# namespace with overrides from build/grafana.yaml.
#
# Overridable variables:
#   PVC      - whether a Persistent Volume Claim is requested (default: true)
#   PV_SIZE  - size of the persistent volume (default: 64Gi)
#   PASSWORD - Grafana admin password (default: admin)
setup-grafana: PVC ?= true
setup-grafana: PV_SIZE ?= 64Gi
setup-grafana: PASSWORD ?= admin
setup-grafana:
	$(DOCKER_RUN) kubectl apply -f $(mount_path)/build/grafana/
	# The Grafana chart sizes its volume via persistence.size
	# (server.persistentVolume.size is a Prometheus chart key and is ignored here).
	$(DOCKER_RUN) \
		helm upgrade --install --wait grafana stable/grafana --namespace metrics \
		--set persistence.enabled=$(PVC),persistence.size=$(PV_SIZE) \
		--set adminPassword=$(PASSWORD) -f $(mount_path)/build/grafana.yaml

# generate a changelog using github-changelog-generator
Expand Down Expand Up @@ -700,4 +701,4 @@ kind-grafana-portforward:
# Run the prometheus-portforward target against the kind cluster: KUBECONFIG
# is set to the kubeconfig path reported by kind for $(KIND_PROFILE), and the
# Docker container is started with host networking.
kind-prometheus-portforward:
	$(MAKE) prometheus-portforward \
		KUBECONFIG="$(shell kind get kubeconfig-path --name="$(KIND_PROFILE)")" \
		DOCKER_RUN_ARGS="--network=host"
30 changes: 30 additions & 0 deletions build/gke-test-cluster/cluster.yml.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,36 @@ resources:
- https://www.googleapis.com/auth/devstorage.read_only
- https://www.googleapis.com/auth/logging.write
- https://www.googleapis.com/auth/monitoring
# Single-node pool labeled and tainted with
# stable.agones.dev/agones-system=true (NO_EXECUTE), so only pods that
# tolerate the taint run on it.
# NOTE(review): presumably reserved for the Agones system components - confirm.
- name: "agones-system"
  initialNodeCount: 1
  config:
    machineType: n1-standard-4
    oauthScopes:
      - https://www.googleapis.com/auth/compute
      - https://www.googleapis.com/auth/devstorage.read_only
      - https://www.googleapis.com/auth/logging.write
      - https://www.googleapis.com/auth/monitoring
    labels:
      stable.agones.dev/agones-system: "true"
    taints:
      - key: stable.agones.dev/agones-system
        value: "true"
        effect: "NO_EXECUTE"
# Single-node pool dedicated to monitoring. It is labeled and tainted with
# stable.agones.dev/agones-metrics=true (NO_EXECUTE); the Prometheus and
# Grafana values files tolerate this taint and prefer nodes carrying the label.
- name: "agones-metrics"
  initialNodeCount: 1
  config:
    machineType: n1-standard-4
    oauthScopes:
      - https://www.googleapis.com/auth/compute
      - https://www.googleapis.com/auth/devstorage.read_only
      - https://www.googleapis.com/auth/logging.write
      - https://www.googleapis.com/auth/monitoring
    labels:
      stable.agones.dev/agones-metrics: "true"
    taints:
      - key: stable.agones.dev/agones-metrics
        value: "true"
        effect: "NO_EXECUTE"
masterAuth:
username: admin
password: supersecretpassword
Expand Down
17 changes: 15 additions & 2 deletions build/grafana.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
# Port exposed by the Grafana service.
service:
  port: 3000
# Allow Grafana to run on nodes tainted with
# stable.agones.dev/agones-metrics=true:NoExecute.
tolerations:
  - key: "stable.agones.dev/agones-metrics"
    operator: "Equal"
    value: "true"
    effect: "NoExecute"
# Prefer (but do not require) nodes that carry the
# stable.agones.dev/agones-metrics label; `Exists` matches any label value.
affinity:
  nodeAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 1
        preference:
          matchExpressions:
            - key: stable.agones.dev/agones-metrics
              operator: Exists
sidecar:
dashboards:
enabled: true
Expand All @@ -12,4 +25,4 @@ datasources:
type: prometheus
url: http://prom-prometheus-server.metrics.svc.cluster.local:80/
access: proxy
isDefault: true
isDefault: true
120 changes: 120 additions & 0 deletions build/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Components of the Prometheus chart that are disabled for this install.
alertmanager:
  enabled: false
nodeExporter:
  enabled: false
kubeStateMetrics:
  enabled: false
pushgateway:
  enabled: false
# Prometheus server pod: resource requests and scheduling.
server:
  resources:
    requests:
      memory: 4Gi
      cpu: 2
  # Allow the server to run on nodes tainted with
  # stable.agones.dev/agones-metrics=true:NoExecute.
  tolerations:
    - key: "stable.agones.dev/agones-metrics"
      operator: "Equal"
      value: "true"
      effect: "NoExecute"
  # Prefer (but do not require) nodes that carry the
  # stable.agones.dev/agones-metrics label; `Exists` matches any label value.
  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 1
          preference:
            matchExpressions:
              - key: stable.agones.dev/agones-metrics
                operator: Exists
# Contents of the Prometheus server configuration file (prometheus.yml).
# Three jobs are defined: Prometheus itself, the Kubernetes API servers, and
# pods that opt in via `prometheus.io/*` annotations.
serverFiles:
  prometheus.yml:
    rule_files:
      - /etc/config/rules
      - /etc/config/alerts

    scrape_configs:
      # Prometheus scrapes its own metrics endpoint.
      - job_name: prometheus
        static_configs:
          - targets:
              - localhost:9090

      # A scrape configuration for running Prometheus on a Kubernetes cluster.
      # This uses separate scrape configs for cluster components (i.e. API server, node)
      # and services to allow each to use different authentication configs.
      #
      # Kubernetes labels will be added as Prometheus labels on metrics via the
      # `labelmap` relabeling action.

      # Scrape config for API servers.
      #
      # Kubernetes exposes API servers as endpoints to the default/kubernetes
      # service so this uses `endpoints` role and uses relabelling to only keep
      # the endpoints associated with the default/kubernetes service using the
      # default named port `https`. This works for single API server deployments as
      # well as HA API server deployments.
      - job_name: 'kubernetes-apiservers'

        kubernetes_sd_configs:
          - role: endpoints

        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https

        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # Certificate verification is DISABLED by the line below. That is only
          # appropriate when node certificates are self-signed or use a different
          # CA to the master CA, and only in a controlled environment, because
          # certificate verification is an integral part of a secure
          # infrastructure. Remove the line to re-enable verification.
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # Keep only the default/kubernetes service endpoints for the https port. This
        # will add targets for each API server which Kubernetes adds an endpoint to
        # the default/kubernetes service.
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https

      # Example scrape config for pods
      #
      # The relabeling allows the actual pod scrape endpoint to be configured via the
      # following annotations:
      #
      # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
      # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
      - job_name: 'kubernetes-pods'

        kubernetes_sd_configs:
          - role: pod

        relabel_configs:
          # Keep only pods annotated with prometheus.io/scrape: "true".
          # The regex is quoted so it stays a string rather than a YAML boolean.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          # Use the prometheus.io/path annotation, when non-empty, as the metrics path.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Rewrite the target address to use the port from the prometheus.io/port annotation.
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          # Copy every Kubernetes pod label onto the scraped metrics.
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name

0 comments on commit cc29603

Please sign in to comment.