Bug 1955051

Summary: metrics "kube_node_status_capacity_cpu_cores" does not exist
Product: OpenShift Container Platform Reporter: Junqi Zhao <juzhao>
Component: Monitoring Assignee: Jayapriya Pai <janantha>
Status: CLOSED ERRATA QA Contact: Junqi Zhao <juzhao>
Severity: medium Docs Contact:
Priority: medium    
Version: 4.8 CC: alegrand, anpicker, erooth, kakkoyun, lcosic, pkrupa, spasquie, xueli
Target Milestone: --- Keywords: EasyFix, Regression
Target Release: 4.8.0   
Hardware: Unspecified   
OS: Unspecified   
Whiteboard:
Fixed In Version: Doc Type: No Doc Update
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2021-07-27 23:04:52 UTC Type: Bug
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:

Description Junqi Zhao 2021-04-29 11:06:13 UTC
Description of problem:
# oc get node
NAME                                                   STATUS   ROLES    AGE     VERSION
 **-48b-tmvdj-master-0.c.openshift-qe.internal         Ready    master   4h6m    v1.21.0-rc.0+3bed56d
 **-48b-tmvdj-master-1.c.openshift-qe.internal         Ready    master   4h6m    v1.21.0-rc.0+3bed56d
 **-48b-tmvdj-master-2.c.openshift-qe.internal         Ready    master   4h6m    v1.21.0-rc.0+3bed56d
 **-48b-tmvdj-worker-a-k5shb.c.openshift-qe.internal   Ready    worker   3h58m   v1.21.0-rc.0+3bed56d
 **-48b-tmvdj-worker-b-bpnfx.c.openshift-qe.internal   Ready    worker   3h58m   v1.21.0-rc.0+3bed56d
 **-48b-tmvdj-worker-c-nrgp6.c.openshift-qe.internal   Ready    worker   3h58m   v1.21.0-rc.0+3bed56d

"cluster:capacity_cpu_cores:sum" is in telemetry-config configmap, but no such metrics from prometheus search
# oc -n openshift-monitoring get cm telemetry-config -o jsonpath="{.data.metrics\.yaml}" | grep {__name__= | grep "cluster:capacity_cpu_cores:sum"
- '{__name__="cluster:capacity_cpu_cores:sum"}'


# token=`oc sa get-token prometheus-k8s -n openshift-monitoring`
# oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://prometheus-k8s.openshift-monitoring.svc:9091/api/v1/query?query=cluster:capacity_cpu_cores:sum' | jq
{
  "status": "success",
  "data": {
    "resultType": "vector",
    "result": []
  }
}

# oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://prometheus-k8s.openshift-monitoring.svc:9091/api/v1/label/__name__/values' | jq | grep "cluster:capacity_cpu_cores:sum"
no result

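Checked the "cluster:capacity_cpu_cores:sum" recording rule: the metric "kube_node_status_capacity_cpu_cores" that it depends on does not exist (only "kube_node_status_capacity" is exposed; see the metric name list below).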
*********************************************
      - expr: |
          sum by(label_beta_kubernetes_io_instance_type, label_node_role_kubernetes_io, label_kubernetes_io_arch, label_node_openshift_io_os_id) (
            (
              cluster:master_nodes
              * on(node) group_left() max by(node)
              (
                kube_node_status_capacity_cpu_cores
              )
            )
            or on(node) (
              max without(endpoint, instance, job, pod, service)
              (
                kube_node_labels
              ) * on(node) group_left() max by(node)
              (
                kube_node_status_capacity_cpu_cores
              )
            )
          )
        record: cluster:capacity_cpu_cores:sum
*********************************************
# oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://prometheus-k8s.openshift-monitoring.svc:9091/api/v1/label/__name__/values' | jq | grep "kube_node"
    "cluster:usage:kube_node_ready:avg5m",
    "kube_node_created",
    "kube_node_info",
    "kube_node_labels",
    "kube_node_role",
    "kube_node_spec_taint",
    "kube_node_spec_unschedulable",
    "kube_node_status_allocatable",
    "kube_node_status_capacity",
    "kube_node_status_condition",

cluster:master_nodes result
********************************************************
Element	Value
cluster:master_nodes{container="kube-rbac-proxy-main",label_beta_kubernetes_io_arch="amd64",label_beta_kubernetes_io_instance_type="n1-standard-4",label_beta_kubernetes_io_os="linux",label_failure_domain_beta_kubernetes_io_region="us-central1",label_failure_domain_beta_kubernetes_io_zone="us-central1-a",label_kubernetes_io_arch="amd64",label_kubernetes_io_hostname=" **-48b-tmvdj-master-0.c.openshift-qe.internal",label_kubernetes_io_os="linux",label_node_kubernetes_io_instance_type="n1-standard-4",label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_node_role_kubernetes_io_master="true",label_topology_gke_io_zone="us-central1-a",label_topology_kubernetes_io_region="us-central1",label_topology_kubernetes_io_zone="us-central1-a",namespace="openshift-monitoring",node=" **-48b-tmvdj-master-0.c.openshift-qe.internal",prometheus="openshift-monitoring/k8s"}	1
cluster:master_nodes{container="kube-rbac-proxy-main",label_beta_kubernetes_io_arch="amd64",label_beta_kubernetes_io_instance_type="n1-standard-4",label_beta_kubernetes_io_os="linux",label_failure_domain_beta_kubernetes_io_region="us-central1",label_failure_domain_beta_kubernetes_io_zone="us-central1-b",label_kubernetes_io_arch="amd64",label_kubernetes_io_hostname=" **-48b-tmvdj-master-1.c.openshift-qe.internal",label_kubernetes_io_os="linux",label_node_kubernetes_io_instance_type="n1-standard-4",label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_node_role_kubernetes_io_master="true",label_topology_gke_io_zone="us-central1-b",label_topology_kubernetes_io_region="us-central1",label_topology_kubernetes_io_zone="us-central1-b",namespace="openshift-monitoring",node=" **-48b-tmvdj-master-1.c.openshift-qe.internal",prometheus="openshift-monitoring/k8s"}	1
cluster:master_nodes{container="kube-rbac-proxy-main",label_beta_kubernetes_io_arch="amd64",label_beta_kubernetes_io_instance_type="n1-standard-4",label_beta_kubernetes_io_os="linux",label_failure_domain_beta_kubernetes_io_region="us-central1",label_failure_domain_beta_kubernetes_io_zone="us-central1-c",label_kubernetes_io_arch="amd64",label_kubernetes_io_hostname=" **-48b-tmvdj-master-2.c.openshift-qe.internal",label_kubernetes_io_os="linux",label_node_kubernetes_io_instance_type="n1-standard-4",label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_node_role_kubernetes_io_master="true",label_topology_gke_io_zone="us-central1-c",label_topology_kubernetes_io_region="us-central1",label_topology_kubernetes_io_zone="us-central1-c",namespace="openshift-monitoring",node=" **-48b-tmvdj-master-2.c.openshift-qe.internal",prometheus="openshift-monitoring/k8s"}     1
********************************************************

kube_node_labels result
********************************************************
Element	Value
kube_node_labels{container="kube-rbac-proxy-main",endpoint="https-main",job="kube-state-metrics",label_beta_kubernetes_io_arch="amd64",label_beta_kubernetes_io_instance_type="n1-standard-4",label_beta_kubernetes_io_os="linux",label_failure_domain_beta_kubernetes_io_region="us-central1",label_failure_domain_beta_kubernetes_io_zone="us-central1-a",label_kubernetes_io_arch="amd64",label_kubernetes_io_hostname=" **-48b-tmvdj-master-0.c.openshift-qe.internal",label_kubernetes_io_os="linux",label_node_kubernetes_io_instance_type="n1-standard-4",label_node_openshift_io_os_id="rhcos",label_topology_gke_io_zone="us-central1-a",label_topology_kubernetes_io_region="us-central1",label_topology_kubernetes_io_zone="us-central1-a",namespace="openshift-monitoring",node=" **-48b-tmvdj-master-0.c.openshift-qe.internal",prometheus="openshift-monitoring/k8s",service="kube-state-metrics"}	1
kube_node_labels{container="kube-rbac-proxy-main",endpoint="https-main",job="kube-state-metrics",label_beta_kubernetes_io_arch="amd64",label_beta_kubernetes_io_instance_type="n1-standard-4",label_beta_kubernetes_io_os="linux",label_failure_domain_beta_kubernetes_io_region="us-central1",label_failure_domain_beta_kubernetes_io_zone="us-central1-a",label_kubernetes_io_arch="amd64",label_kubernetes_io_hostname=" **-48b-tmvdj-worker-a-k5shb.c.openshift-qe.internal",label_kubernetes_io_os="linux",label_node_kubernetes_io_instance_type="n1-standard-4",label_node_openshift_io_os_id="rhcos",label_topology_gke_io_zone="us-central1-a",label_topology_kubernetes_io_region="us-central1",label_topology_kubernetes_io_zone="us-central1-a",namespace="openshift-monitoring",node=" **-48b-tmvdj-worker-a-k5shb.c.openshift-qe.internal",prometheus="openshift-monitoring/k8s",service="kube-state-metrics"}	1
kube_node_labels{container="kube-rbac-proxy-main",endpoint="https-main",job="kube-state-metrics",label_beta_kubernetes_io_arch="amd64",label_beta_kubernetes_io_instance_type="n1-standard-4",label_beta_kubernetes_io_os="linux",label_failure_domain_beta_kubernetes_io_region="us-central1",label_failure_domain_beta_kubernetes_io_zone="us-central1-b",label_kubernetes_io_arch="amd64",label_kubernetes_io_hostname=" **-48b-tmvdj-master-1.c.openshift-qe.internal",label_kubernetes_io_os="linux",label_node_kubernetes_io_instance_type="n1-standard-4",label_node_openshift_io_os_id="rhcos",label_topology_gke_io_zone="us-central1-b",label_topology_kubernetes_io_region="us-central1",label_topology_kubernetes_io_zone="us-central1-b",namespace="openshift-monitoring",node=" **-48b-tmvdj-master-1.c.openshift-qe.internal",prometheus="openshift-monitoring/k8s",service="kube-state-metrics"}	1
kube_node_labels{container="kube-rbac-proxy-main",endpoint="https-main",job="kube-state-metrics",label_beta_kubernetes_io_arch="amd64",label_beta_kubernetes_io_instance_type="n1-standard-4",label_beta_kubernetes_io_os="linux",label_failure_domain_beta_kubernetes_io_region="us-central1",label_failure_domain_beta_kubernetes_io_zone="us-central1-b",label_kubernetes_io_arch="amd64",label_kubernetes_io_hostname=" **-48b-tmvdj-worker-b-bpnfx.c.openshift-qe.internal",label_kubernetes_io_os="linux",label_node_kubernetes_io_instance_type="n1-standard-4",label_node_openshift_io_os_id="rhcos",label_topology_gke_io_zone="us-central1-b",label_topology_kubernetes_io_region="us-central1",label_topology_kubernetes_io_zone="us-central1-b",namespace="openshift-monitoring",node=" **-48b-tmvdj-worker-b-bpnfx.c.openshift-qe.internal",prometheus="openshift-monitoring/k8s",service="kube-state-metrics"}	1
kube_node_labels{container="kube-rbac-proxy-main",endpoint="https-main",job="kube-state-metrics",label_beta_kubernetes_io_arch="amd64",label_beta_kubernetes_io_instance_type="n1-standard-4",label_beta_kubernetes_io_os="linux",label_failure_domain_beta_kubernetes_io_region="us-central1",label_failure_domain_beta_kubernetes_io_zone="us-central1-c",label_kubernetes_io_arch="amd64",label_kubernetes_io_hostname=" **-48b-tmvdj-master-2.c.openshift-qe.internal",label_kubernetes_io_os="linux",label_node_kubernetes_io_instance_type="n1-standard-4",label_node_openshift_io_os_id="rhcos",label_topology_gke_io_zone="us-central1-c",label_topology_kubernetes_io_region="us-central1",label_topology_kubernetes_io_zone="us-central1-c",namespace="openshift-monitoring",node=" **-48b-tmvdj-master-2.c.openshift-qe.internal",prometheus="openshift-monitoring/k8s",service="kube-state-metrics"}	1
kube_node_labels{container="kube-rbac-proxy-main",endpoint="https-main",job="kube-state-metrics",label_beta_kubernetes_io_arch="amd64",label_beta_kubernetes_io_instance_type="n1-standard-4",label_beta_kubernetes_io_os="linux",label_failure_domain_beta_kubernetes_io_region="us-central1",label_failure_domain_beta_kubernetes_io_zone="us-central1-c",label_kubernetes_io_arch="amd64",label_kubernetes_io_hostname=" **-48b-tmvdj-worker-c-nrgp6.c.openshift-qe.internal",label_kubernetes_io_os="linux",label_node_kubernetes_io_instance_type="n1-standard-4",label_node_openshift_io_os_id="rhcos",label_topology_gke_io_zone="us-central1-c",label_topology_kubernetes_io_region="us-central1",label_topology_kubernetes_io_zone="us-central1-c",namespace="openshift-monitoring",node=" **-48b-tmvdj-worker-c-nrgp6.c.openshift-qe.internal",prometheus="openshift-monitoring/k8s",service="kube-state-metrics"}	1
********************************************************

Version-Release number of selected component (if applicable):
4.8.0-0.nightly-2021-04-29-063720

How reproducible:
always

Steps to Reproduce:
1.
2.
3.

Actual results:


Expected results:


Additional info:

Comment 5 Junqi Zhao 2021-05-06 07:06:13 UTC
The issue is fixed in the following build:
# oc get clusterversion
NAME      VERSION                             AVAILABLE   PROGRESSING   SINCE   STATUS
version   4.8.0-0.nightly-2021-05-06-003426   True        False         3h5m    Cluster version is 4.8.0-0.nightly-2021-05-06-003426

# token=`oc sa get-token prometheus-k8s -n openshift-monitoring`
# oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://prometheus-k8s.openshift-monitoring.svc:9091/api/v1/query?query=cluster:capacity_cpu_cores:sum' | jq
{
  "status": "success",
  "data": {
    "resultType": "vector",
    "result": [
      {
        "metric": {
          "__name__": "cluster:capacity_cpu_cores:sum",
          "label_beta_kubernetes_io_instance_type": "m5.xlarge",
          "label_kubernetes_io_arch": "amd64",
          "label_node_openshift_io_os_id": "rhcos"
        },
        "value": [
          1620284381.421,
          "12"
        ]
      },
      {
        "metric": {
          "__name__": "cluster:capacity_cpu_cores:sum",
          "label_beta_kubernetes_io_instance_type": "m5.xlarge",
          "label_kubernetes_io_arch": "amd64",
          "label_node_openshift_io_os_id": "rhcos",
          "label_node_role_kubernetes_io": "master"
        },
        "value": [
          1620284381.421,
          "12"
        ]
      }
    ]
  }
}
# oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -k -H "Authorization: Bearer $token" 'https://prometheus-k8s.openshift-monitoring.svc:9091/api/v1/query?query=cluster:capacity_memory_bytes:sum' | jq
{
  "status": "success",
  "data": {
    "resultType": "vector",
    "result": [
      {
        "metric": {
          "__name__": "cluster:capacity_memory_bytes:sum",
          "label_beta_kubernetes_io_instance_type": "m5.xlarge"
        },
        "value": [
          1620284381.812,
          "49478766592"
        ]
      },
      {
        "metric": {
          "__name__": "cluster:capacity_memory_bytes:sum",
          "label_beta_kubernetes_io_instance_type": "m5.xlarge",
          "label_node_role_kubernetes_io": "master"
        },
        "value": [
          1620284381.812,
          "49302614016"
        ]
      }
    ]
  }
}

Comment 6 Junqi Zhao 2021-05-19 03:22:45 UTC
*** Bug 1961908 has been marked as a duplicate of this bug. ***

Comment 9 errata-xmlrpc 2021-07-27 23:04:52 UTC
Since the problem described in this bug report should be
resolved in a recent advisory, it has been closed with a
resolution of ERRATA.

For information on the advisory (Moderate: OpenShift Container Platform 4.8.2 bug fix and security update), and where to find the updated
files, follow the link below.

If the solution does not work for you, open a new bug report.

https://access.redhat.com/errata/RHSA-2021:2438