From 04b4a35cf973bfe317bca44edc55dc432aa69816 Mon Sep 17 00:00:00 2001
From: Jarek Kowalski
Date: Thu, 24 Jan 2019 16:28:18 -0800
Subject: [PATCH] Prometheus and grafana improvements based on load testing

- made prometheus PVC size configurable on the command line
- moved majority of Prometheus config overrides to separate yaml file.
- removed scraping of container stats from prometheus config, otherwise
  clusters of 10K pods are very quickly consuming tons of space
- added taints and tolerations to prometheus and grafana

They will now prefer (but not require) to be scheduled on nodes labeled
with `stable.agones.dev/agones-metrics: true`.

They will also tolerate taint `stable.agones.dev/agones-metrics=true:NoExecute`.

Creating a node pool dedicated for monitoring is as simple as:

```
gcloud container node-pools create agones-metrics ... \
  --node-taints stable.agones.dev/agones-metrics=true:NoExecute \
  --node-labels stable.agones.dev/agones-metrics=true \
  --num-nodes=1
```
---
 build/Makefile                           |  11 ++-
 build/gke-test-cluster/cluster.yml.jinja |  13 +++
 build/grafana.yaml                       |  15 ++-
 build/prometheus.yaml                    | 120 +++++++++++++++++++++++
 site/content/en/docs/Guides/metrics.md   |  19 +++-
 5 files changed, 169 insertions(+), 9 deletions(-)
 create mode 100644 build/prometheus.yaml

diff --git a/build/Makefile b/build/Makefile
index b90f63f9ab..54d9bcc40a 100644
--- a/build/Makefile
+++ b/build/Makefile
@@ -389,23 +389,24 @@ pprof-web:
 # setup prometheus in the current cluster by default Persistent Volume Claims are requested.
 setup-prometheus: PVC ?= true
+setup-prometheus: PV_SIZE ?= 64Gi
+setup-prometheus: SCRAPE_INTERVAL=30s
 setup-prometheus:
 	$(DOCKER_RUN) \
 		helm upgrade --install --wait prom stable/prometheus --namespace metrics \
-		--set alertmanager.enabled=false,pushgateway.enabled=false \
-		--set kubeStateMetrics.enabled=false,nodeExporter.enabled=false \
-		--set pushgateway.enabled=false \
-		--set server.global.scrape_interval=30s,server.persistentVolume.enabled=$(PVC)
+		--set server.global.scrape_interval=$(SCRAPE_INTERVAL),server.persistentVolume.enabled=$(PVC),server.persistentVolume.size=$(PV_SIZE) \
+		-f $(mount_path)/build/prometheus.yaml

 # setup grafana in the current cluster with datasource and dashboards ready for use with agones
 # by default Persistent Volume Claims are requested.
 setup-grafana: PVC ?= true
+setup-grafana: PV_SIZE ?= 64Gi
 setup-grafana: PASSWORD ?= admin
 setup-grafana:
 	$(DOCKER_RUN) kubectl apply -f $(mount_path)/build/grafana/
 	$(DOCKER_RUN) \
 		helm upgrade --install --wait grafana stable/grafana --namespace metrics \
-		--set persistence.enabled=$(PVC) \
+		--set persistence.enabled=$(PVC),persistence.size=$(PV_SIZE) \
 		--set adminPassword=$(PASSWORD) -f $(mount_path)/build/grafana.yaml

 # generate a changelog using github-changelog-generator
diff --git a/build/gke-test-cluster/cluster.yml.jinja b/build/gke-test-cluster/cluster.yml.jinja
index 48af537274..d133980843 100644
--- a/build/gke-test-cluster/cluster.yml.jinja
+++ b/build/gke-test-cluster/cluster.yml.jinja
@@ -47,6 +47,19 @@ resources:
             stable.agones.dev/agones-system: "true"
           taints:
           - key: stable.agones.dev/agones-system
+      - name: "agones-metrics"
+        initialNodeCount: 1
+        config:
+          machineType: n1-standard-4
+          oauthScopes:
+          - https://www.googleapis.com/auth/compute
+          - https://www.googleapis.com/auth/devstorage.read_only
+          - https://www.googleapis.com/auth/logging.write
+          - https://www.googleapis.com/auth/monitoring
+          labels:
+            stable.agones.dev/agones-metrics: "true"
+          taints:
+          - 
key: stable.agones.dev/agones-metrics value: "true" effect: "NO_EXECUTE" masterAuth: diff --git a/build/grafana.yaml b/build/grafana.yaml index cb344284e4..9a1557972e 100644 --- a/build/grafana.yaml +++ b/build/grafana.yaml @@ -1,5 +1,18 @@ service: - port: 3000 + port: 3000 +tolerations: +- key: "stable.agones.dev/agones-metrics" + operator: "Equal" + value: "true" + effect: "NoExecute" +affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: stable.agones.dev/agones-metrics + operator: Exists sidecar: dashboards: enabled: true diff --git a/build/prometheus.yaml b/build/prometheus.yaml new file mode 100644 index 0000000000..221b42e655 --- /dev/null +++ b/build/prometheus.yaml @@ -0,0 +1,120 @@ +alertmanager: + enabled: false +nodeExporter: + enabled: false +kubeStateMetrics: + enabled: false +pushgateway: + enabled: false +server: + resources: + requests: + memory: 4Gi + cpu: 2 + tolerations: + - key: "stable.agones.dev/agones-metrics" + operator: "Equal" + value: "true" + effect: "NoExecute" + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: stable.agones.dev/agones-metrics + operator: Exists +serverFiles: + prometheus.yml: + rule_files: + - /etc/config/rules + - /etc/config/alerts + + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + + # A scrape configuration for running Prometheus on a Kubernetes cluster. + # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + + # Scrape config for API servers. 
+ # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'kubernetes-apiservers' + + kubernetes_sd_configs: + - role: endpoints + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. 
+ relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. + - job_name: 'kubernetes-pods' + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name \ No newline at end of file diff --git a/site/content/en/docs/Guides/metrics.md b/site/content/en/docs/Guides/metrics.md index 6f615ca3fd..c1adb77ee1 100644 --- a/site/content/en/docs/Guides/metrics.md +++ b/site/content/en/docs/Guides/metrics.md @@ -118,15 +118,28 @@ Prometheus is an open source monitoring solution, we will use it to store Agones Let's install Prometheus using the [helm stable](https://github.com/helm/charts/tree/master/stable/prometheus) repository. 
```bash
-helm install --wait --name prom stable/prometheus --namespace metrics \
-    --set pushgateway.enabled=false \
-    --set kubeStateMetrics.enabled=false,nodeExporter.enabled=false
+helm upgrade --install --wait prom stable/prometheus --namespace metrics \
+    --set server.global.scrape_interval=30s \
+    --set server.persistentVolume.enabled=true \
+    --set server.persistentVolume.size=64Gi \
+    -f ./build/prometheus.yaml
 ```
 
 > You can also run our {{< ghlink href="/build/Makefile" branch="master" >}}Makefile{{< /ghlink >}} target `make setup-prometheus` or `make kind-setup-prometheus` and `make minikube-setup-prometheus` for {{< ghlink href="/build/README.md#running-a-test-kind-cluster" branch="master" >}}Kind{{< /ghlink >}} and {{< ghlink href="/build/README.md#running-a-test-minikube-cluster" branch="master" >}}Minikube{{< /ghlink >}}.
 
+For resiliency it is recommended to run Prometheus on a dedicated node which is separate from nodes where Game Servers are scheduled. If you use `make setup-prometheus` to set up Prometheus, it will schedule Prometheus pods on nodes tainted with `stable.agones.dev/agones-metrics=true:NoExecute` and labeled with `stable.agones.dev/agones-metrics=true` if available.
+
+As an example, to set up a dedicated node pool for Prometheus on GKE, run the following command before installing Prometheus. Alternatively you can taint and label nodes manually.
+
+```
+gcloud container node-pools create agones-metrics --cluster=... --zone=... \
+  --node-taints stable.agones.dev/agones-metrics=true:NoExecute \
+  --node-labels stable.agones.dev/agones-metrics=true \
+  --num-nodes=1
+```
+
 By default we will disable the push gateway (we don't need it for Agones) and other exporters.
The helm [chart](https://github.com/helm/charts/tree/master/stable/prometheus) supports [nodeSelector](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector), [affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity) and [toleration](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/); you can use them to schedule Prometheus deployments on an isolated node(s) to have a homogeneous game server workload.