Created attachment 1806170 [details]
oc adm inspect co/monitoring

Description of problem:
Enable UWM and deploy two targets in a user namespace, then set enforcedTargetLimit to 1; user metrics can not be found, which is expected. After deleting the user-workload-monitoring-config configmap, user metrics still can not be found even though enforcedTargetLimit is no longer set.
*******************************************
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-monitoring-config
  namespace: openshift-monitoring
data:
  config.yaml: |
    enableUserWorkload: true
*******************************************
apiVersion: v1
kind: Namespace
metadata:
  name: ns1
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: prometheus-example-app
  name: prometheus-example-app
  namespace: ns1
spec:
  replicas: 2
  selector:
    matchLabels:
      app: prometheus-example-app
  template:
    metadata:
      labels:
        app: prometheus-example-app
    spec:
      containers:
      - image: ghcr.io/rhobs/prometheus-example-app:0.3.0
        imagePullPolicy: IfNotPresent
        name: prometheus-example-app
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: prometheus-example-app
  name: prometheus-example-app
  namespace: ns1
spec:
  ports:
  - port: 8080
    protocol: TCP
    targetPort: 8080
    name: web
  selector:
    app: prometheus-example-app
  type: ClusterIP
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: prometheus-example-monitor
  namespace: ns1
spec:
  endpoints:
  - interval: 5s
    port: web
  selector:
    matchLabels:
      app: prometheus-example-app
*******************************************
# oc -n ns1 get pod
NAME                                     READY   STATUS    RESTARTS   AGE
prometheus-example-app-d748cfb54-h27vb   1/1     Running   0          81s
prometheus-example-app-d748cfb54-wpjw9   1/1     Running   0          81s

User metrics can be found via thanos-querier:
# token=`oc sa get-token prometheus-k8s -n openshift-monitoring`
# oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?query=version' | jq
{
  "status": "success",
  "data": {
    "resultType": "vector",
    "result": [
      {
        "metric": {
          "__name__": "version",
          "endpoint": "web",
          "instance": "10.129.2.12:8080",
          "job": "prometheus-example-app",
          "namespace": "ns1",
          "pod": "prometheus-example-app-d748cfb54-h27vb",
          "prometheus": "openshift-user-workload-monitoring/user-workload",
          "service": "prometheus-example-app",
          "version": "v0.3.0"
        },
        "value": [
          1627356612.269,
          "1"
        ]
      },
      {
        "metric": {
          "__name__": "version",
          "endpoint": "web",
          "instance": "10.131.0.91:8080",
          "job": "prometheus-example-app",
          "namespace": "ns1",
          "pod": "prometheus-example-app-d748cfb54-wpjw9",
          "prometheus": "openshift-user-workload-monitoring/user-workload",
          "service": "prometheus-example-app",
          "version": "v0.3.0"
        },
        "value": [
          1627356612.269,
          "1"
        ]
      }
    ]
  }
}

Set enforcedTargetLimit to 1:
******************************
apiVersion: v1
kind: ConfigMap
metadata:
  name: user-workload-monitoring-config
  namespace: openshift-user-workload-monitoring
data:
  config.yaml: |
    prometheus:
      enforcedTargetLimit: 1
******************************
# oc -n openshift-user-workload-monitoring get prometheus user-workload -oyaml | grep enforcedTargetLimit
  enforcedTargetLimit: 1

User metrics can not be found because enforcedTargetLimit is set to 1; this is expected:
# oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?query=version' | jq
{
  "status": "success",
  "data": {
    "resultType": "vector",
    "result": []
  }
}

Delete the user-workload-monitoring-config configmap:
# oc -n openshift-user-workload-monitoring delete cm user-workload-monitoring-config
configmap "user-workload-monitoring-config" deleted
# oc -n openshift-user-workload-monitoring get prometheus user-workload -oyaml | grep enforcedTargetLimit
(no result)

User metrics still can not be found, even though enforcedTargetLimit is no longer set:
# oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?query=version' | jq
{
  "status": "success",
  "data": {
    "resultType": "vector",
    "result": []
  }
}

# oc -n openshift-user-workload-monitoring exec -c prometheus prometheus-user-workload-0 -- cat /etc/prometheus/config_out/prometheus.env.yaml
global:
  evaluation_interval: 30s
  scrape_interval: 30s
  external_labels:
    prometheus: openshift-user-workload-monitoring/user-workload
    prometheus_replica: prometheus-user-workload-0
rule_files:
- /etc/prometheus/rules/prometheus-user-workload-rulefiles-0/*.yaml
scrape_configs:
- job_name: serviceMonitor/ns1/prometheus-example-monitor/0
  honor_labels: false
  honor_timestamps: false
  kubernetes_sd_configs:
  - role: endpoints
    namespaces:
      names:
      - ns1
  scrape_interval: 5s
  relabel_configs:
  - source_labels:
    - job
    target_label: __tmp_prometheus_job_name
  - action: keep
    source_labels:
    - __meta_kubernetes_service_label_app
    regex: prometheus-example-app
  - action: keep
    source_labels:
    - __meta_kubernetes_endpoint_port_name
    regex: web
  - source_labels:
    - __meta_kubernetes_endpoint_address_target_kind
    - __meta_kubernetes_endpoint_address_target_name
    separator: ;
    regex: Node;(.*)
    replacement: ${1}
    target_label: node
  - source_labels:
    - __meta_kubernetes_endpoint_address_target_kind
    - __meta_kubernetes_endpoint_address_target_name
    separator: ;
    regex: Pod;(.*)
    replacement: ${1}
    target_label: pod
  - source_labels:
    - __meta_kubernetes_namespace
    target_label: namespace
  - source_labels:
    - __meta_kubernetes_service_name
    target_label: service
  - source_labels:
    - __meta_kubernetes_pod_name
    target_label: pod
  - source_labels:
    - __meta_kubernetes_pod_container_name
    target_label: container
  - source_labels:
    - __meta_kubernetes_service_name
    target_label: job
    replacement: ${1}
  - target_label: endpoint
    replacement: web
  - target_label: namespace
    replacement: ns1
  - source_labels:
    - __address__
    target_label: __tmp_hash
    modulus: 1
    action: hashmod
  - source_labels:
    - __tmp_hash
    regex: 0
    action: keep
alerting:
  alert_relabel_configs:
  - action: labeldrop
    regex: prometheus_replica
  alertmanagers:
  - path_prefix: /
    scheme: https
    tls_config:
      insecure_skip_verify: false
      server_name: alertmanager-main.openshift-monitoring.svc
      ca_file: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
    kubernetes_sd_configs:
    - role: endpoints
      namespaces:
        names:
        - openshift-monitoring
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    api_version: v2
    relabel_configs:
    - action: keep
      source_labels:
      - __meta_kubernetes_service_name
      regex: alertmanager-main
    - action: keep
      source_labels:
      - __meta_kubernetes_endpoint_port_name
      regex: web

Version-Release number of selected component (if applicable):
4.9.0-0.nightly-2021-07-26-071921

How reproducible:
always

Steps to Reproduce:
1. See the description.

Actual results:
User metrics can not be found even though enforcedTargetLimit is no longer set.

Expected results:
Deleting the user-workload-monitoring-config configmap removes the enforced target limit, and user metrics can be found again.

Additional info:
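One additional diagnostic (a sketch, not part of the original report): the scrape health of the ns1 targets can be checked with the `up` series through thanos-querier, reusing the $token from above; while the stale limit is enforced, both targets should report up == 0. The -G/--data-urlencode form is used because the braces in the PromQL selector would otherwise be mangled by curl's URL globbing.

# oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -G -H "Authorization: Bearer $token" --data-urlencode 'query=up{namespace="ns1"}' 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query' | jq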
from "lastError": "target_limit exceeded (number of targets: 2, limit: 0)" reason maybe, if we delete user-workload-monitoring-config configmap, the target limit value is 0, 0 should mean no limit (as for Prometheus), but it's treated as number 0 here, see bug 1982931 # oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/targets' | jq | grep prometheus-example-monitor -C10 "__meta_kubernetes_pod_name": "prometheus-example-app-d748cfb54-h27vb", "__meta_kubernetes_pod_node_name": "ip-10-0-177-101.us-east-2.compute.internal", "__meta_kubernetes_pod_phase": "Running", "__meta_kubernetes_pod_ready": "true", "__meta_kubernetes_pod_uid": "fa7aa1b4-f4d2-4880-b5ac-9d239787bc72", "__meta_kubernetes_service_label_app": "prometheus-example-app", "__meta_kubernetes_service_labelpresent_app": "true", "__meta_kubernetes_service_name": "prometheus-example-app", "__metrics_path__": "/metrics", "__scheme__": "http", "job": "serviceMonitor/ns1/prometheus-example-monitor/0", "prometheus": "openshift-user-workload-monitoring/user-workload" }, "labels": { "endpoint": "web", "instance": "10.129.2.12:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-d748cfb54-h27vb", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app" }, "scrapePool": "serviceMonitor/ns1/prometheus-example-monitor/0", "scrapeUrl": "http://10.129.2.12:8080/metrics", "globalUrl": "http://10.129.2.12:8080/metrics", "lastError": "target_limit exceeded (number of targets: 2, limit: 0)", "lastScrape": "2021-07-27T04:19:26.780629467Z", "lastScrapeDuration": 3.5823e-05, "health": "down" }, { "discoveredLabels": { "__address__": "10.131.0.91:8080", -- "__meta_kubernetes_pod_name": "prometheus-example-app-d748cfb54-wpjw9", "__meta_kubernetes_pod_node_name": "ip-10-0-199-189.us-east-2.compute.internal", "__meta_kubernetes_pod_phase": "Running", "__meta_kubernetes_pod_ready": "true", "__meta_kubernetes_pod_uid": "eead5b2d-8bea-43e0-bff5-a9a2c493c052", "__meta_kubernetes_service_label_app": "prometheus-example-app", "__meta_kubernetes_service_labelpresent_app": "true", "__meta_kubernetes_service_name": "prometheus-example-app", "__metrics_path__": "/metrics", "__scheme__": "http", "job": "serviceMonitor/ns1/prometheus-example-monitor/0", "prometheus": "openshift-user-workload-monitoring/user-workload" }, "labels": { "endpoint": "web", "instance": "10.131.0.91:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-d748cfb54-wpjw9", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app" }, "scrapePool": "serviceMonitor/ns1/prometheus-example-monitor/0", "scrapeUrl": "http://10.131.0.91:8080/metrics", "globalUrl": "http://10.131.0.91:8080/metrics", "lastError": "target_limit exceeded (number of targets: 2, limit: 0)", "lastScrape": "2021-07-27T04:19:26.501800903Z", "lastScrapeDuration": 1.7929e-05, "health": "down" }, { "discoveredLabels": { "__address__": "10.0.129.103:10250",
I could reproduce this issue on upstream Prometheus 2.28.1 by setting `target_limit: 1` and then changing it back to `target_limit: 0`, using SIGHUP to reload the config.
```
scrape_configs:
  - job_name: 'prometheus-k8s'
    target_limit: 0
    static_configs:
      - targets:
          - 'localhost:9090'
        labels:
          pod: prometheus-k8s-0
          service: prometheus-k8s
      - targets:
          - 'localhost:9090'
        labels:
          pod: prometheus-k8s-1
          service: prometheus-k8s
```
1. Start Prometheus with the following config:

$ ./prometheus --config.file=./config.yaml
```
scrape_configs:
  - job_name: 'prometheus-k8s'
    target_limit: 1
    static_configs:
      - targets:
          - 'localhost:9090'
        labels:
          pod: prometheus-k8s-0
          service: prometheus-k8s
      - targets:
          - 'localhost:9090'
        labels:
          pod: prometheus-k8s-1
          service: prometheus-k8s
```

2. Change to `target_limit: 0` and send a signal to Prometheus to reload the config:

$ kill -SIGHUP $(ps | awk -F ' ' '!/awk/ && /prometheus/{print $1}')

3. Check the targets status; it shows the error "target_limit exceeded (number of targets: 2, limit: 0)".
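One way to perform the check in step 3 (a sketch, assuming the default listen address localhost:9090):
```
$ curl -s 'http://localhost:9090/api/v1/targets' | jq '.data.activeTargets[] | {scrapeUrl, health, lastError}'
```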
The fix will be available in the upstream Prometheus 2.29.0 release.
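To confirm which Prometheus version a given cluster ships (a sketch, assuming the prometheus binary is on PATH in the container image):

# oc -n openshift-user-workload-monitoring exec -c prometheus prometheus-user-workload-0 -- prometheus --version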
Tested with 4.9.0-0.nightly-2021-08-19-184748, following the steps in Comment 0. After deleting the user-workload-monitoring-config configmap, the user metrics can be seen now.
Since the problem described in this bug report should be resolved in a recent advisory, it has been closed with a resolution of ERRATA. For information on the advisory (Moderate: OpenShift Container Platform 4.9.0 bug fix and security update), and where to find the updated files, follow the link below. If the solution does not work for you, open a new bug report. https://access.redhat.com/errata/RHSA-2021:3759