Created attachment 1862500 [details] sample user project/servicemonitor/svc Description of problem: tested https://issues.redhat.com/browse/MON-1708 with PR https://github.com/openshift/cluster-monitoring-operator/pull/1350 enable UWM ******************* apiVersion: v1 kind: ConfigMap metadata: name: cluster-monitoring-config namespace: openshift-monitoring data: config.yaml: | enableUserWorkload: true ******************* created user project/servicemonitor with the attached file, the user endpoint exposes only one metrics: version{version="v0.4.0"} with value 1 # token=`oc sa get-token prometheus-k8s -n openshift-monitoring` # oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?query=version' | jq { "status": "success", "data": { "resultType": "vector", "result": [ { "metric": { "__name__": "version", "endpoint": "web", "instance": "10.128.2.20:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-fllq8", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app", "version": "v0.4.0" }, "value": [ 1645500602.075, "1" ] } ] } } metrics name: version label name: version, label name length: 7 label value: v0.4.0, label value length: 6 there are also other labels added by thanos-querier set value for "enforcedLabelLimit|enforcedLabelNameLengthLimit|enforcedLabelValueLengthLimit" ******************* apiVersion: v1 kind: ConfigMap metadata: name: user-workload-monitoring-config namespace: openshift-user-workload-monitoring data: config.yaml: | prometheus: enforcedLabelLimit: 1 enforcedLabelNameLengthLimit: 1 enforcedLabelValueLengthLimit: 1 ******************* the settings loaded to prometheus configuration, waited for a while, and checked, still saw the user metrics, which should not # oc -n openshift-user-workload-monitoring get prometheus user-workload 
-oyaml | grep -E "enforcedLabelLimit|enforcedLabelNameLengthLimit|enforcedLabelValueLengthLimit" enforcedLabelLimit: 1 enforcedLabelNameLengthLimit: 1 enforcedLabelValueLengthLimit: 1 # oc -n openshift-user-workload-monitoring exec -c prometheus prometheus-user-workload-0 -- cat /etc/prometheus/config_out/prometheus.env.yaml global: evaluation_interval: 30s scrape_interval: 30s external_labels: prometheus: openshift-user-workload-monitoring/user-workload prometheus_replica: prometheus-user-workload-0 rule_files: - /etc/prometheus/rules/prometheus-user-workload-rulefiles-0/*.yaml scrape_configs: - job_name: serviceMonitor/ns1/prometheus-example-monitor/0 honor_labels: false honor_timestamps: false kubernetes_sd_configs: - role: endpoints namespaces: names: - ns1 scrape_interval: 30s scheme: http relabel_configs: - source_labels: - job target_label: __tmp_prometheus_job_name - action: keep source_labels: - __meta_kubernetes_service_label_app - __meta_kubernetes_service_labelpresent_app regex: (prometheus-example-app);true - action: keep source_labels: - __meta_kubernetes_endpoint_port_name regex: web - source_labels: - __meta_kubernetes_endpoint_address_target_kind - __meta_kubernetes_endpoint_address_target_name separator: ; regex: Node;(.*) replacement: ${1} target_label: node - source_labels: - __meta_kubernetes_endpoint_address_target_kind - __meta_kubernetes_endpoint_address_target_name separator: ; regex: Pod;(.*) replacement: ${1} target_label: pod - source_labels: - __meta_kubernetes_namespace target_label: namespace - source_labels: - __meta_kubernetes_service_name target_label: service - source_labels: - __meta_kubernetes_pod_name target_label: pod - source_labels: - __meta_kubernetes_pod_container_name target_label: container - source_labels: - __meta_kubernetes_service_name target_label: job replacement: ${1} - target_label: endpoint replacement: web - target_label: namespace replacement: ns1 - source_labels: - __address__ target_label: __tmp_hash 
modulus: 1 action: hashmod - source_labels: - __tmp_hash regex: 0 action: keep label_limit: 1 label_name_length_limit: 1 label_value_length_limit: 1 metric_relabel_configs: - target_label: namespace replacement: ns1 alerting: alert_relabel_configs: - action: labeldrop regex: prometheus_replica alertmanagers: - path_prefix: / scheme: https tls_config: insecure_skip_verify: false server_name: alertmanager-main.openshift-monitoring.svc ca_file: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt kubernetes_sd_configs: - role: endpoints namespaces: names: - openshift-monitoring bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token api_version: v2 relabel_configs: - action: keep source_labels: - __meta_kubernetes_service_name regex: alertmanager-main - action: keep source_labels: - __meta_kubernetes_endpoint_port_name regex: web # oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?query=version' | jq { "status": "success", "data": { "resultType": "vector", "result": [ { "metric": { "__name__": "version", "endpoint": "web", "instance": "10.128.2.20:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-fllq8", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app", "version": "v0.4.0" }, "value": [ 1645503437.278, "1" ] } ] } } # oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/targets' | jq ... 
{ "discoveredLabels": { "__address__": "10.128.2.20:8080", "__meta_kubernetes_endpoint_address_target_kind": "Pod", "__meta_kubernetes_endpoint_address_target_name": "prometheus-example-app-676776dcb9-fllq8", "__meta_kubernetes_endpoint_node_name": "ci-ln-gx602xt-72292-tsptp-worker-c-l8mhr", "__meta_kubernetes_endpoint_port_name": "web", "__meta_kubernetes_endpoint_port_protocol": "TCP", "__meta_kubernetes_endpoint_ready": "true", "__meta_kubernetes_endpoints_label_app": "prometheus-example-app", "__meta_kubernetes_endpoints_labelpresent_app": "true", "__meta_kubernetes_endpoints_name": "prometheus-example-app", "__meta_kubernetes_namespace": "ns1", "__meta_kubernetes_pod_annotation_k8s_v1_cni_cncf_io_network_status": "[{\n \"name\": \"openshift-sdn\",\n \"interface\": \"eth0\",\n \"ips\": [\n \"10.128.2.20\"\n ],\n \"default\": true,\n \"dns\": {}\n}]", "__meta_kubernetes_pod_annotation_k8s_v1_cni_cncf_io_networks_status": "[{\n \"name\": \"openshift-sdn\",\n \"interface\": \"eth0\",\n \"ips\": [\n \"10.128.2.20\"\n ],\n \"default\": true,\n \"dns\": {}\n}]", "__meta_kubernetes_pod_annotation_openshift_io_scc": "restricted", "__meta_kubernetes_pod_annotationpresent_k8s_v1_cni_cncf_io_network_status": "true", "__meta_kubernetes_pod_annotationpresent_k8s_v1_cni_cncf_io_networks_status": "true", "__meta_kubernetes_pod_annotationpresent_openshift_io_scc": "true", "__meta_kubernetes_pod_controller_kind": "ReplicaSet", "__meta_kubernetes_pod_controller_name": "prometheus-example-app-676776dcb9", "__meta_kubernetes_pod_host_ip": "10.0.128.4", "__meta_kubernetes_pod_ip": "10.128.2.20", "__meta_kubernetes_pod_label_app": "prometheus-example-app", "__meta_kubernetes_pod_label_pod_template_hash": "676776dcb9", "__meta_kubernetes_pod_labelpresent_app": "true", "__meta_kubernetes_pod_labelpresent_pod_template_hash": "true", "__meta_kubernetes_pod_name": "prometheus-example-app-676776dcb9-fllq8", "__meta_kubernetes_pod_node_name": "ci-ln-gx602xt-72292-tsptp-worker-c-l8mhr", 
"__meta_kubernetes_pod_phase": "Running", "__meta_kubernetes_pod_ready": "true", "__meta_kubernetes_pod_uid": "01aab948-f38e-4310-b616-ea0372015865", "__meta_kubernetes_service_label_app": "prometheus-example-app", "__meta_kubernetes_service_labelpresent_app": "true", "__meta_kubernetes_service_name": "prometheus-example-app", "__metrics_path__": "/metrics", "__scheme__": "http", "__scrape_interval__": "30s", "__scrape_timeout__": "10s", "job": "serviceMonitor/ns1/prometheus-example-monitor/0", "prometheus": "openshift-user-workload-monitoring/user-workload" }, "labels": { "endpoint": "web", "instance": "10.128.2.20:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-fllq8", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app" }, "scrapePool": "serviceMonitor/ns1/prometheus-example-monitor/0", "scrapeUrl": "http://10.128.2.20:8080/metrics", "globalUrl": "http://10.128.2.20:8080/metrics", "lastError": "", "lastScrape": "2022-02-22T04:25:22.19938364Z", "lastScrapeDuration": 0.003441224, "health": "up" }, Version-Release number of selected component (if applicable): tested with PR How reproducible: always Steps to Reproduce: 1. see the description 2. 3. Actual results: Expected results: Additional info:
Tested with PR launch openshift/cluster-monitoring-operator#1350, using the same steps as in comment 0; the result is still the same as described in the bug.
launched with launch 4.11.0-0.ci-2022-03-07-013215,openshift/cluster-monitoring-operator#1350 https://github.com/openshift/prometheus/pull/121 is in this cluster 1. the user endpoint exposes only one metrics: version{version="v0.4.0"} with value 1, label number is 1, checked from thanos-querier API, there are 9 labels in total from thanos-querier(include "__name__") # oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://thanos-querier.openshift-monitoring.svc:9091/api/v1/query?query=version' | jq { "status": "success", "data": { "resultType": "vector", "result": [ { "metric": { "__name__": "version", "endpoint": "web", "instance": "10.128.2.20:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-fllq8", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app", "version": "v0.4.0" }, "value": [ 1645500602.075, "1" ] } ] } } tested with each parameter, setting see below # oc -n openshift-user-workload-monitoring get prometheus user-workload -oyaml | grep -E "enforcedLabelLimit|enforcedLabelNameLengthLimit|enforcedLabelValueLengthLimit" enforcedLabelLimit: 1 enforcedLabelNameLengthLimit: 1 enforcedLabelValueLengthLimit: 1 error in /targets api shows label_limit exceeded (metric: version, number of label: 8, limit: 1) , the msg is wrong, since there are 9 labels in total, should throw out error "label_limit exceeded (metric: version, number of label: 9, limit: 1)", it does not count label "version" error in /targets api "labels": { "endpoint": "web", "instance": "10.131.0.21:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-llg28", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app" }, "scrapePool": "serviceMonitor/ns1/prometheus-example-monitor/0", "scrapeUrl": "http://10.131.0.21:8080/metrics", "globalUrl": 
"http://10.131.0.21:8080/metrics", "lastError": "label_limit exceeded (metric: version, number of label: 8, limit: 1)", "lastScrape": "2022-03-07T07:44:20.443436825Z", "lastScrapeDuration": 0.002481978, "health": "down" }, 2. update enforcedLabelLimit to 8, also error in targets API # oc -n openshift-user-workload-monitoring get prometheus user-workload -oyaml | grep -E "enforcedLabelLimit|enforcedLabelNameLengthLimit|enforcedLabelValueLengthLimit" enforcedLabelLimit: 8 enforcedLabelNameLengthLimit: 1 enforcedLabelValueLengthLimit: 1 "labels": { "endpoint": "web", "instance": "10.131.0.21:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-llg28", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app" }, "scrapePool": "serviceMonitor/ns1/prometheus-example-monitor/0", "scrapeUrl": "http://10.131.0.21:8080/metrics", "globalUrl": "http://10.131.0.21:8080/metrics", "lastError": "label_name_length_limit exceeded (metric: version, label: {__name__ version}, name length: 8, limit: 1)", "lastScrape": "2022-03-07T09:40:20.443332988Z", "lastScrapeDuration": 0.003412566, "health": "down" }, error should be "label_limit exceeded (metric: version, number of label: 9, limit: 1)", 3. 
update enforcedLabelLimit to 9 # oc -n openshift-user-workload-monitoring get prometheus user-workload -oyaml | grep -E "enforcedLabelLimit|enforcedLabelNameLengthLimit|enforcedLabelValueLengthLimit" enforcedLabelLimit: 9 enforcedLabelNameLengthLimit: 1 enforcedLabelValueLengthLimit: 1 error in targets API is "label_name_length_limit exceeded (metric: version, label: {__name__ version}, name length: 8, limit: 1)", change to "label_name_length_limit exceeded (metric: version, label: {__name__}, name length: 8, limit: 1)" is better "labels": { "endpoint": "web", "instance": "10.131.0.21:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-llg28", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app" }, "scrapePool": "serviceMonitor/ns1/prometheus-example-monitor/0", "scrapeUrl": "http://10.131.0.21:8080/metrics", "globalUrl": "http://10.131.0.21:8080/metrics", "lastError": "label_name_length_limit exceeded (metric: version, label: {__name__ version}, name length: 8, limit: 1)", "lastScrape": "2022-03-07T09:20:50.44352258Z", "lastScrapeDuration": 0.003263413, "health": "down" }, 4. 
update enforcedLabelNameLengthLimit to 8 # oc -n openshift-user-workload-monitoring get prometheus user-workload -oyaml | grep -E "enforcedLabelLimit|enforcedLabelNameLengthLimit|enforcedLabelValueLengthLimit" enforcedLabelLimit: 9 enforcedLabelNameLengthLimit: 8 enforcedLabelValueLengthLimit: 1 error "label_value_length_limit exceeded (metric: version, label: {__name__ version}, value length: 7, limit: 1)", change to "label_value_length_limit exceeded (metric: version, label: {__name__}, value:{version}, value length: 7, limit: 1)" is better "labels": { "endpoint": "web", "instance": "10.131.0.21:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-llg28", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app" }, "scrapePool": "serviceMonitor/ns1/prometheus-example-monitor/0", "scrapeUrl": "http://10.131.0.21:8080/metrics", "globalUrl": "http://10.131.0.21:8080/metrics", "lastError": "label_value_length_limit exceeded (metric: version, label: {__name__ version}, value length: 7, limit: 1)", "lastScrape": "2022-03-07T09:27:37.12488813Z", "lastScrapeDuration": 0.003259243, "health": "down" } 5. 
update enforcedLabelValueLengthLimit to 7 # oc -n openshift-user-workload-monitoring get prometheus user-workload -oyaml | grep -E "enforcedLabelLimit|enforcedLabelNameLengthLimit|enforcedLabelValueLengthLimit" enforcedLabelLimit: 9 enforcedLabelNameLengthLimit: 8 enforcedLabelValueLengthLimit: 7 the error msg "label: {instance 10.131.0.21:8080}" is also confusing, "label: {instance}, value: {10.131.0.21:8080}" is better "labels": { "endpoint": "web", "instance": "10.131.0.21:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-llg28", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app" }, "scrapePool": "serviceMonitor/ns1/prometheus-example-monitor/0", "scrapeUrl": "http://10.131.0.21:8080/metrics", "globalUrl": "http://10.131.0.21:8080/metrics", "lastError": "label_value_length_limit exceeded (metric: version, label: {instance 10.131.0.21:8080}, value length: 16, limit: 7)", "lastScrape": "2022-03-07T09:32:50.443931585Z", "lastScrapeDuration": 0.003819909, "health": "down" }, 6. 
the longest label value is "openshift-user-workload-monitoring/user-workload", length is 48 update enforcedLabelValueLengthLimit to 48 # oc -n openshift-user-workload-monitoring get prometheus user-workload -oyaml | grep -E "enforcedLabelLimit|enforcedLabelNameLengthLimit|enforcedLabelValueLengthLimit" enforcedLabelLimit: 9 enforcedLabelNameLengthLimit: 8 enforcedLabelValueLengthLimit: 48 error is "labels": { "endpoint": "web", "instance": "10.131.0.21:8080", "job": "prometheus-example-app", "namespace": "ns1", "pod": "prometheus-example-app-676776dcb9-llg28", "prometheus": "openshift-user-workload-monitoring/user-workload", "service": "prometheus-example-app" }, "scrapePool": "serviceMonitor/ns1/prometheus-example-monitor/0", "scrapeUrl": "http://10.131.0.21:8080/metrics", "globalUrl": "http://10.131.0.21:8080/metrics", "lastError": "label_name_length_limit exceeded (metric: version, label: {namespace ns1}, name length: 9, limit: 8)", "lastScrape": "2022-03-07T09:36:07.125599191Z", "lastScrapeDuration": 0.001726719, "health": "down" }, since the enforcedLabelNameLengthLimit and enforcedLabelValueLengthLimit are checked for each metrics sequentially, the error "(metric: version, label: {namespace ns1}, name length: 9, limit: 8)" makes sense, but "(metric: version, label: {namespace}, name length: 9, limit: 8)" is better 7. the longest label name is prometheus, length is 9, update enforcedLabelNameLengthLimit to 9, # oc -n openshift-user-workload-monitoring get prometheus user-workload -oyaml | grep -E "enforcedLabelLimit|enforcedLabelNameLengthLimit|enforcedLabelValueLengthLimit" enforcedLabelLimit: 9 enforcedLabelNameLengthLimit: 9 enforcedLabelValueLengthLimit: 48 no error, this is expected. in summary: 1. we should confirm if the enforcedLabelLimit setting in only applied for the labels added by thanos-querier, does not include the labels exposed by user metrics 2. should change the error msg for easy understanding
Since the problem described in this bug report should be resolved in a recent advisory, it has been closed with a resolution of ERRATA. For information on the advisory (Important: OpenShift Container Platform 4.11.0 bug fix and security update), and where to find the updated files, follow the link below. If the solution does not work for you, open a new bug report. https://access.redhat.com/errata/RHSA-2022:5069