11#! /bin/bash
2- set -eux
3- oc patch Scheduler cluster --type=' json' -p ' [{ "op": "replace", "path": "/spec/mastersSchedulable", "value": true }]'
2+ # set -eux
3+ set -x
4+ # This script will install ACM, MCH, MCO, and other test resources.
5+ # The script skips installation when the MCO CR already exists.
46
# Pre-flight check: when a MultiClusterObservability CR is already present,
# leave the cluster alone and stop here with a success status.
echo "[INFO] Checking for existing MultiClusterObservability CR..."
MCO_NAMESPACE="open-cluster-management-observability"
MCO_NAME="observability"
# 'oc get' exits non-zero when the resource is not found; any non-zero exit
# sends the script down the installation path.
if oc get multiclusterobservability "${MCO_NAME}" -n "${MCO_NAMESPACE}" > /dev/null 2>&1; then
  echo "[INFO] MultiClusterObservability CR '${MCO_NAME}' already exists in '${MCO_NAMESPACE}'."
  echo "[INFO] Skipping installation to avoid conflicts and assuming a previous step is managing it."
  exit 0
else
  echo "[INFO] No existing MultiClusterObservability CR found. Proceeding with installation."
fi
# patch node: set spec.mastersSchedulable to true on the cluster Scheduler.
# The patch type must be exactly 'json' (a JSON Patch operation list).
oc patch Scheduler cluster --type='json' -p '[{ "op": "replace", "path": "/spec/mastersSchedulable", "value": true }]'
20+ # install acm
521oc apply -f - << EOF
622apiVersion: v1
723kind: Namespace
@@ -43,6 +59,7 @@ while [[ $tries -gt 0 ]] &&
4359 (( tries-- ))
4460done
4561oc wait -n open-cluster-management --for=condition=Available deploy/multiclusterhub-operator --timeout=300s
62+ # install mch
4663oc apply -f - << EOF
4764apiVersion: operator.open-cluster-management.io/v1
4865kind: MultiClusterHub
@@ -56,7 +73,7 @@ oc wait -n open-cluster-management --for=condition=Available deploy/search-api -
5673oc wait -n open-cluster-management --for=condition=Available deploy/search-collector --timeout=300s
5774oc wait -n open-cluster-management --for=condition=Available deploy/search-indexer --timeout=300s
5875oc -n open-cluster-management get pod
59- # create multi-cluster
76+ # create mco
6077if ! oc get ns open-cluster-management-observability > /dev/null 2>&1 ; then
6178 echo " [INFO] Creating namespace open-cluster-management-observability"
6279 oc create ns open-cluster-management-observability
@@ -168,5 +185,64 @@ spec:
168185EOF
169186sleep 1m
170187oc wait --for=condition=Ready pod -l alertmanager=observability,app=multicluster-observability-alertmanager -n open-cluster-management-observability --timeout=300s
171- oc -n open-cluster-management-observability get pod
172- oc -n open-cluster-management-observability get svc | grep -E ' alertmanager|rbac-query'
# enable UIPlugin: create/update the 'monitoring' UIPlugin with the ACM
# integration enabled, pointing it at the alertmanager and rbac-query-proxy
# services in the open-cluster-management-observability namespace.
# The delimiter is quoted so the manifest is passed to 'oc' verbatim.
oc apply -f - << 'EOF'
apiVersion: observability.openshift.io/v1alpha1
kind: UIPlugin
metadata:
  name: monitoring
spec:
  monitoring:
    acm:
      enabled: true
      alertmanager:
        url: 'https://alertmanager.open-cluster-management-observability.svc:9095'
      thanosQuerier:
        url: 'https://rbac-query-proxy.open-cluster-management-observability.svc:8443'
  type: Monitoring
EOF
# apply custom-rules

#######################################
# Print the thanos-ruler-custom-rules ConfigMap manifest.
# The here-document delimiter is quoted on purpose: the alert label
# templates contain '$labels', which an unquoted here-document would
# expand as a shell variable (to an empty string) before 'oc' sees it.
# Arguments: none
# Outputs:   writes the YAML manifest to stdout
#######################################
thanos_custom_rules_manifest() {
  cat << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: thanos-ruler-custom-rules
  namespace: open-cluster-management-observability
data:
  custom_rules.yaml: |
    groups:
      - name: alertrule-testing
        rules:
        - alert: Watchdog
          annotations:
            summary: An alert that should always be firing to certify that Alertmanager is working properly.
            description: This is an alert meant to ensure that the entire alerting pipeline is functional.
          expr: vector(1)
          labels:
            instance: "local"
            cluster: "local"
            clusterID: "111111111"
            severity: info
        - alert: Watchdog-spoke
          annotations:
            summary: An alert that should always be firing to certify that Alertmanager is working properly.
            description: This is an alert meant to ensure that the entire alerting pipeline is functional.
          expr: vector(1)
          labels:
            instance: "spoke"
            cluster: "spoke"
            clusterID: "22222222"
            severity: warn
      - name: cluster-health
        rules:
        - alert: ClusterCPUHealth-jb
          annotations:
            summary: Notify when CPU utilization on a cluster is greater than the defined utilization limit
            description: "The cluster has a high CPU usage: core for"
          expr: |
            max(cluster:cpu_usage_cores:sum) by (clusterID, cluster, prometheus) > 0
          labels:
            cluster: "{{ $labels.cluster }}"
            prometheus: "{{ $labels.prometheus }}"
            severity: critical
EOF
}

# Feed the manifest to 'oc' on stdin, as the original here-document did.
thanos_custom_rules_manifest | oc apply -f -
0 commit comments