This bug was initially created as a copy of Bug #1781708 Description of problem: A bunch of pull-ci-* jobs are failing with: level=error msg="Cluster operator machine-config Degraded is True with RequiredPoolsFailed: Failed to resync 0.0.1-2019-12-10-092928 because: timed out waiting for the condition during syncRequiredMachineConfigPools: pool master has not progressed to latest configuration: configuration status for pool master is empty: pool is degraded because nodes fail with \"3 nodes are reporting degraded status on sync\": \"Node ip-10-0-129-171.ec2.internal is reporting: \\\"machineconfig.machineconfiguration.openshift.io \\\\\\\"rendered-master-9ffdae4ce3763dbc967f7e9e041d4de1\\\\\\\" not found\\\", Node ip-10-0-145-21.ec2.internal is reporting: \\\"machineconfig.machineconfiguration.openshift.io \\\\\\\"rendered-master-9ffdae4ce3763dbc967f7e9e041d4de1\\\\\\\" not found\\\", Node ip-10-0-140-8.ec2.internal is reporting: \\\"machineconfig.machineconfiguration.openshift.io \\\\\\\"rendered-master-9ffdae4ce3763dbc967f7e9e041d4de1\\\\\\\" not found\\\"\", retrying" Additionally, the machine-config-daemon (logs at https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_installer/2765/pull-ci-openshift-installer-master-e2e-aws/8952/artifacts/e2e-aws/pods/openshift-machine-config-operator_machine-config-daemon-6lqbw_machine-config-daemon.log) complains about a content mismatch (the storage.conf file is too long to share in full): ``` E1210 09:19:15.517411 13932 daemon.go:1350] content mismatch for file /etc/containers/storage.conf: # A: This file is is the configuration file for all tools # that use the containers/storage library. # See man 5 containers-storage.conf for more information # The "container storage" table contains all of the server options. [storage] ... 
``` The error message is repeated every minute until 09:50:16.231021. Known jobs: - https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test/pr-logs/pull/openshift_cluster-api-provider-aws/280/pull-ci-openshift-cluster-api-provider-aws-master-e2e-aws-operator/775 - https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test/pr-logs/pull/openshift_installer/2765/pull-ci-openshift-installer-master-e2e-aws/8952 - https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test/pr-logs/pull/openshift_console/3559/pull-ci-openshift-console-master-e2e-gcp-console/5884 - https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test/pr-logs/pull/openshift_telemeter/273/pull-ci-openshift-telemeter-master-e2e-aws/517 You can find the same error message in the remaining five daemon logs. Version-Release number of selected component (if applicable): Master branch of installer: registry.svc.ci.openshift.org/ci-op-0l9nfi34/release@sha256:1dc8db6e093e8484d0a75ad69be3d617b0d7502cdebdd892d0119cac43150cc9 How reproducible: Always Steps to Reproduce: - https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test/pr-logs/pull/openshift_installer/2765/pull-ci-openshift-installer-master-e2e-aws/8952 Actual results: - MCO is Degraded Expected results: - MCO is not Degraded and the cluster is installed successfully Additional info:
Verified that /etc/containers/storage.conf is now the same file for both the master and worker pools. Also verified that the content of the file matches the PR. CI tests no longer show the described error, starting from https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test/pr-logs/pull/openshift_installer/2765/pull-ci-openshift-installer-master-e2e-aws/9009 $ oc get clusterversion NAME VERSION AVAILABLE PROGRESSING SINCE STATUS version 4.3.0-0.nightly-2019-12-18-145749 True False 54m Cluster version is 4.3.0-0.nightly-2019-12-18-145749 $ oc debug node/ip-10-0-141-196.ec2.internal -- chroot /host cat /etc/containers/storage.conf > master.conf Starting pod/ip-10-0-141-196ec2internal-debug ... To use host binaries, run `chroot /host` Removing debug pod ... $ oc debug node/ip-10-0-130-157.ec2.internal -- chroot /host cat /etc/containers/storage.conf > worker.conf Starting pod/ip-10-0-130-157ec2internal-debug ... To use host binaries, run `chroot /host` Removing debug pod ... $ diff master.conf worker.conf $ cat master.conf # This file is generated by the Machine Config Operator's containerruntimeconfig controller. # # storage.conf is the configuration file for all tools # that share the containers/storage libraries # See man 5 containers-storage.conf for more information # The "container storage" table contains all of the server options. [storage] # Default Storage Driver driver = "overlay" # Temporary storage location runroot = "/var/run/containers/storage" # Primary Read/Write location of container storage graphroot = "/var/lib/containers/storage" [storage.options] # Storage options to be passed to underlying storage drivers # AdditionalImageStores is used to pass paths to additional Read/Only image stores # Must be comma separated list. additionalimagestores = [ ] # Size is used to set a maximum size of the container image. Only supported by # certain container storage drivers. 
size = "" # OverrideKernelCheck tells the driver to ignore kernel checks based on kernel version override_kernel_check = "true" # Remap-UIDs/GIDs is the mapping from UIDs/GIDs as they should appear inside of # a container, to UIDs/GIDs as they should appear outside of the container, and # the length of the range of UIDs/GIDs. Additional mapped sets can be listed # and will be heeded by libraries, but there are limits to the number of # mappings which the kernel will allow when you later attempt to run a # container. # # remap-uids = 0:1668442479:65536 # remap-gids = 0:1668442479:65536 # Remap-User/Group is a name which can be used to look up one or more UID/GID # ranges in the /etc/subuid or /etc/subgid file. Mappings are set up starting # with an in-container ID of 0 and the a host-level ID taken from the lowest # range that matches the specified name, and using the length of that range. # Additional ranges are then assigned, using the ranges which specify the # lowest host-level IDs first, to the lowest not-yet-mapped container-level ID, # until all of the entries have been used for maps. # # remap-user = "storage" # remap-group = "storage" [storage.options.thinpool] # Storage Options for thinpool # autoextend_percent determines the amount by which pool needs to be # grown. This is specified in terms of % of pool size. So a value of 20 means # that when threshold is hit, pool will be grown by 20% of existing # pool size. # autoextend_percent = "20" # autoextend_threshold determines the pool extension threshold in terms # of percentage of pool size. For example, if threshold is 60, that means when # pool is 60% full, threshold has been hit. # autoextend_threshold = "80" # basesize specifies the size to use when creating the base device, which # limits the size of images and containers. # basesize = "10G" # blocksize specifies a custom blocksize to use for the thin pool. 
# blocksize="64k" # directlvm_device specifies a custom block storage device to use for the # thin pool. Required if you setup devicemapper # directlvm_device = "" # directlvm_device_force wipes device even if device already has a filesystem # directlvm_device_force = "True" # fs specifies the filesystem type to use for the base device. # fs="xfs" # log_level sets the log level of devicemapper. # 0: LogLevelSuppress 0 (Default) # 2: LogLevelFatal # 3: LogLevelErr # 4: LogLevelWarn # 5: LogLevelNotice # 6: LogLevelInfo # 7: LogLevelDebug # log_level = "7" # min_free_space specifies the min free space percent in a thin pool require for # new device creation to succeed. Valid values are from 0% - 99%. # Value 0% disables # min_free_space = "10%" # mkfsarg specifies extra mkfs arguments to be used when creating the base # device. # mkfsarg = "" # mountopt specifies extra mount options used when mounting the thin devices. # mountopt = "" # use_deferred_removal Marking device for deferred removal # use_deferred_removal = "True" # use_deferred_deletion Marking device for deferred deletion # use_deferred_deletion = "True" # xfs_nospace_max_retries specifies the maximum number of retries XFS should # attempt to complete IO when ENOSPC (no space) error is returned by # underlying storage device. # xfs_nospace_max_retries = "0"
Since the problem described in this bug report should be resolved in a recent advisory, it has been closed with a resolution of ERRATA. For information on the advisory, and where to find the updated files, follow the link below. If the solution does not work for you, open a new bug report. https://access.redhat.com/errata/RHBA-2020:0062