### Description of problem: OCP ansible installation fails when at: TASK [openshift_hosted : Poll for OpenShift pod deployment success] ************************************ Fails on this command (as the output is status '1'): # oc get deploymentconfig docker-registry --namespace default --config /etc/origin/master/admin.kubeconfig -o jsonpath='{ .status.latestVersion }' ### Version-Release number of the following components: # rpm -q openshift-ansible openshift-ansible-3.7.64-1.git.0.cf13b6c.el7.noarch # rpm -q ansible ansible-2.3.2.0-2.el7.noarch # ansible --version ansible 2.3.2.0 config file = /etc/ansible/ansible.cfg configured module search path = Default w/o overrides python version = 2.7.5 (default, May 31 2018, 09:41:32) [GCC 4.8.5 20150623 (Red Hat 4.8.5-28)] ### How reproducible: Always ### Steps to Reproduce: Setting up a 3.7 cluster similar to customer's inventory file. ### Actual results: TASK [openshift_hosted : Poll for OpenShift pod deployment success] ************************************ task path: /usr/share/ansible/openshift-ansible/roles/openshift_hosted/tasks/wait_for_pod.yml:23 Using module file /usr/lib/python2.7/site-packages/ansible/modules/commands/command.py <ocp37m1.lab.msp.redhat.com> ESTABLISH SSH CONNECTION FOR USER: root <ocp37m1.lab.msp.redhat.com> SSH: EXEC ssh -C -o ControlMaster=auto -o ControlPersist=60s -o KbdInteractiveAuthentication=no -o PreferredAuthentications=gssapi-with-mic,gssapi-keyex,hostbased,publickey -o PasswordAuthentication=no -o User=root -o ConnectTimeout=10 -o ControlPath=/root/.ansible/cp/ea8eb347dd ocp37m1.lab.msp.redhat.com '/bin/sh -c '"'"'echo ~ && sleep 0'"'"'' <ocp37m1.lab.msp.redhat.com> (0, '/root\n', '') <ocp37m1.lab.msp.redhat.com> ESTABLISH SSH CONNECTION FOR USER: root <ocp37m1.lab.msp.redhat.com> SSH: EXEC ssh -C -o ControlMaster=auto -o ControlPersist=60s -o KbdInteractiveAuthentication=no -o PreferredAuthentications=gssapi-with-mic,gssapi-keyex,hostbased,publickey -o PasswordAuthentication=no 
-o User=root -o ConnectTimeout=10 -o ControlPath=/root/.ansible/cp/ea8eb347dd ocp37m1.lab.msp.redhat.com '/bin/sh -c '"'"'( umask 77 && mkdir -p "` echo /root/.ansible/tmp/ansible-tmp-1543346692.23-281458549249719 `" && echo ansible-tmp-1543346692.23-281458549249719="` echo /root/.ansible/tmp/ansible-tmp-1543346692.23-281458549249719 `" ) && sleep 0'"'"'' <ocp37m1.lab.msp.redhat.com> (0, 'ansible-tmp-1543346692.23-281458549249719=/root/.ansible/tmp/ansible-tmp-1543346692.23-281458549249719\n', '') <ocp37m1.lab.msp.redhat.com> PUT /tmp/tmpQH8A4M TO /root/.ansible/tmp/ansible-tmp-1543346692.23-281458549249719/command.py <ocp37m1.lab.msp.redhat.com> SSH: EXEC sftp -b - -C -o ControlMaster=auto -o ControlPersist=60s -o KbdInteractiveAuthentication=no -o PreferredAuthentications=gssapi-with-mic,gssapi-keyex,hostbased,publickey -o PasswordAuthentication=no -o User=root -o ConnectTimeout=10 -o ControlPath=/root/.ansible/cp/ea8eb347dd '[ocp37m1.lab.msp.redhat.com]' <ocp37m1.lab.msp.redhat.com> (0, 'sftp> put /tmp/tmpQH8A4M /root/.ansible/tmp/ansible-tmp-1543346692.23-281458549249719/command.py\n', '') <ocp37m1.lab.msp.redhat.com> ESTABLISH SSH CONNECTION FOR USER: root <ocp37m1.lab.msp.redhat.com> SSH: EXEC ssh -C -o ControlMaster=auto -o ControlPersist=60s -o KbdInteractiveAuthentication=no -o PreferredAuthentications=gssapi-with-mic,gssapi-keyex,hostbased,publickey -o PasswordAuthentication=no -o User=root -o ConnectTimeout=10 -o ControlPath=/root/.ansible/cp/ea8eb347dd ocp37m1.lab.msp.redhat.com '/bin/sh -c '"'"'chmod u+x /root/.ansible/tmp/ansible-tmp-1543346692.23-281458549249719/ /root/.ansible/tmp/ansible-tmp-1543346692.23-281458549249719/command.py && sleep 0'"'"'' <ocp37m1.lab.msp.redhat.com> (0, '', '') <ocp37m1.lab.msp.redhat.com> ESTABLISH SSH CONNECTION FOR USER: root <ocp37m1.lab.msp.redhat.com> SSH: EXEC ssh -C -o ControlMaster=auto -o ControlPersist=60s -o KbdInteractiveAuthentication=no -o 
PreferredAuthentications=gssapi-with-mic,gssapi-keyex,hostbased,publickey -o PasswordAuthentication=no -o User=root -o ConnectTimeout=10 -o ControlPath=/root/.ansible/cp/ea8eb347dd -tt ocp37m1.lab.msp.redhat.com '/bin/sh -c '"'"'/usr/bin/python /root/.ansible/tmp/ansible-tmp-1543346692.23-281458549249719/command.py; rm -rf "/root/.ansible/tmp/ansible-tmp-1543346692.23-281458549249719/" > /dev/null 2>&1 && sleep 0'"'"'' <ocp37m1.lab.msp.redhat.com> (0, '\r\n{"changed": true, "end": "2018-11-27 13:24:53.670782", "stdout": "Failed", "cmd": ["oc", "get", "replicationcontroller", "docker-registry-1", "--namespace", "default", "--config", "/etc/origin/master/admin.kubeconfig", "-o", "jsonpath={ .metadata.annotations.openshift\\\\.io/deployment\\\\.phase }"], "rc": 0, "start": "2018-11-27 13:24:53.000765", "stderr": "", "delta": "0:00:00.670017", "invocation": {"module_args": {"warn": true, "executable": null, "_uses_shell": false, "_raw_params": "oc get replicationcontroller docker-registry-1 --namespace default --config /etc/origin/master/admin.kubeconfig -o jsonpath=\'{ .metadata.annotations.openshift\\\\.io/deployment\\\\.phase }\'", "removes": null, "creates": null, "chdir": null}}, "warnings": []}\r\n', 'Shared connection to ocp37m1.lab.msp.redhat.com closed.\r\n') failed: [ocp37m1.lab.msp.redhat.com] (item=[{u'namespace': u'default', u'name': u'docker-registry'}, {'_ansible_parsed': True, 'stderr_lines': [], '_ansible_item_result': True, u'end': u'2018-11-27 13:24:51.931503', '_ansible_no_log': False, u'stdout': u'1', u'changed': True, u'start': u'2018-11-27 13:24:51.280685', u'delta': u'0:00:00.650818', u'cmd': [u'oc', u'get', u'deploymentconfig', u'docker-registry', u'--namespace', u'default', u'--config', u'/etc/origin/master/admin.kubeconfig', u'-o', u'jsonpath={ .status.latestVersion }'], 'item': {u'namespace': u'default', u'name': u'docker-registry'}, u'rc': 0, u'invocation': {u'module_args': {u'warn': True, u'executable': None, u'_uses_shell': False, 
u'_raw_params': u"oc get deploymentconfig docker-registry --namespace default --config /etc/origin/master/admin.kubeconfig -o jsonpath='{ .status.latestVersion }'", u'removes': None, u'creates': None, u'chdir': None}}, 'stdout_lines': [u'1'], u'stderr': u''}]) => { "attempts": 1, "changed": true, "cmd": [ "oc", "get", "replicationcontroller", "docker-registry-1", "--namespace", "default", "--config", "/etc/origin/master/admin.kubeconfig", "-o", "jsonpath={ .metadata.annotations.openshift\\.io/deployment\\.phase }" ], "delta": "0:00:00.670017", "end": "2018-11-27 13:24:53.670782", "failed": true, "failed_when_result": true, "invocation": { "module_args": { "_raw_params": "oc get replicationcontroller docker-registry-1 --namespace default --config /etc/origin/master/admin.kubeconfig -o jsonpath='{ .metadata.annotations.openshift\\.io/deployment\\.phase }'", "_uses_shell": false, "chdir": null, "creates": null, "executable": null, "removes": null, "warn": true } }, "item": [ { "name": "docker-registry", "namespace": "default" }, { "_ansible_item_result": true, "_ansible_no_log": false, "_ansible_parsed": true, "changed": true, "cmd": [ "oc", "get", "deploymentconfig", "docker-registry", "--namespace", "default", "--config", "/etc/origin/master/admin.kubeconfig", "-o", "jsonpath={ .status.latestVersion }" ], "delta": "0:00:00.650818", "end": "2018-11-27 13:24:51.931503", "invocation": { "module_args": { "_raw_params": "oc get deploymentconfig docker-registry --namespace default --config /etc/origin/master/admin.kubeconfig -o jsonpath='{ .status.latestVersion }'", "_uses_shell": false, "chdir": null, "creates": null, "executable": null, "removes": null, "warn": true } }, "item": { "name": "docker-registry", "namespace": "default" }, "rc": 0, "start": "2018-11-27 13:24:51.280685", "stderr": "", "stderr_lines": [], "stdout": "1", "stdout_lines": [ "1" ] } ], "rc": 0, "start": "2018-11-27 13:24:53.000765", "stderr": "", "stderr_lines": [], "stdout": "Failed", 
"stdout_lines": [ "Failed" ] } # oc get pods -o wide NAME READY STATUS RESTARTS AGE IP NODE docker-registry-1-deploy 0/1 Error 0 3d 10.130.2.2 ocp37node4.lab.msp.redhat.com registry-console-1-rvsz5 1/1 Running 1 3d 10.129.0.3 ocp37app2.lab.msp.redhat.com router-1-deploy 0/1 Error 0 3d 10.129.4.2 ocp37node5.lab.msp.redhat.com router-5-8m2lj 1/1 Running 0 10m 10.15.108.88 ocp37node3.lab.msp.redhat.com router-5-gsnlk 1/1 Running 0 12m 10.15.108.89 ocp37node4.lab.msp.redhat.com router-5-j8r2l 1/1 Running 0 11m 10.15.108.86 ocp37node1.lab.msp.redhat.com router-5-mdvqf 1/1 Running 0 11m 10.15.108.90 ocp37node5.lab.msp.redhat.com router-5-rgq5w 1/1 Running 0 12m 10.15.108.87 ocp37node2.lab.msp.redhat.com ### Expected results: registry pod should not be in status error ### Additional info: Playbook with -vvv output to be attached
Created attachment 1509208 [details] ansible installation output with -vvv
We need to look at the pod logs from those deploy pods that are in Error state, there should be indication there as to why the deployment was considered a failure even though those pods actually appear to be running in the case of the router. Also, the router deploy pod is version 1 of the DC but it's now running version 5 of the DC, what's changed?
(In reply to Scott Dodson from comment #4) > We need to look at the pod logs from those deploy pods that are in Error > state, there should be indication there as to why the deployment was > considered a failure even though those pods actually appear to be running in > the case of the router. Can you provide the commands you need exactly that I should run on the pods/nodes? I just want to make sure I don't miss anything. > Also, the router deploy pod is version 1 of the DC > but it's now running version 5 of the DC, what's changed? It might've been because I tried the 'oc rollout latest router' command a few times to test that part. Thanks!
(In reply to Sam Yangsao from comment #5) > (In reply to Scott Dodson from comment #4) > > We need to look at the pod logs from those deploy pods that are in Error > > state, there should be indication there as to why the deployment was > > considered a failure even though those pods actually appear to be running in > > the case of the router. > > Can you provide the commands you need exactly that I should run on the > pods/nodes? oc logs -n default docker-registry-1-deploy oc logs -n default router-1-deploy
[root@ocp37m1 ~]# oc logs -n default docker-registry-1-deploy --> Scaling docker-registry-1 to 5 error: update acceptor rejected docker-registry-1: pods for rc 'default/docker-registry-1' took longer than 600 seconds to become available [root@ocp37m1 ~]# oc logs -n default router-1-deploy --> Scaling router-1 to 5 error: update acceptor rejected router-1: pods for rc 'default/router-1' took longer than 600 seconds to become available [root@ocp37m1 ~]# oc get pods -o wide NAME READY STATUS RESTARTS AGE IP NODE docker-registry-1-deploy 0/1 Error 0 5d 10.130.2.2 ocp37node4.lab.msp.redhat.com registry-console-1-rvsz5 1/1 Running 1 5d 10.129.0.3 ocp37app2.lab.msp.redhat.com router-1-deploy 0/1 Error 0 5d 10.129.4.2 ocp37node5.lab.msp.redhat.com router-8-2nf6t 1/1 Running 0 1d 10.15.108.87 ocp37node2.lab.msp.redhat.com router-8-5grrv 1/1 Running 0 1d 10.15.108.88 ocp37node3.lab.msp.redhat.com router-8-9nvnt 1/1 Running 0 1d 10.15.108.86 ocp37node1.lab.msp.redhat.com router-8-rxlzg 1/1 Running 0 1d 10.15.108.89 ocp37node4.lab.msp.redhat.com router-8-sc7vt 1/1 Running 0 1d 10.15.108.90 ocp37node5.lab.msp.redhat.com [root@ocp37m1 ~]# oc get events No resources found.
[root@ocp37m1 ~]# oc describe po/router-1-deploy Name: router-1-deploy Namespace: default Node: ocp37node5.lab.msp.redhat.com/10.15.108.90 Start Time: Fri, 23 Nov 2018 14:08:45 -0600 Labels: openshift.io/deployer-pod-for.name=router-1 Annotations: openshift.io/deployment.name=router-1 openshift.io/scc=restricted Status: Failed IP: 10.129.4.2 Containers: deployment: Container ID: docker://8f2643c08a56961a9bfd391fc6d08511b5193b32ee2b8432741eab290e2c692b Image: sat6.lab.msp.redhat.com:5000/syangsao-ocp-openshift3_ose-deployer:v3.7.44 Image ID: docker-pullable://sat6.lab.msp.redhat.com:5000/syangsao-ocp-openshift3_ose-deployer@sha256:7866af10cee87798436b8391703901cfa8f058728c1ba01e5d7230f508c41972 Port: <none> State: Terminated Reason: Error Exit Code: 1 Started: Fri, 23 Nov 2018 14:10:28 -0600 Finished: Fri, 23 Nov 2018 14:20:30 -0600 Ready: False Restart Count: 0 Environment: KUBERNETES_MASTER: https://ocp37m1.lab.msp.redhat.com OPENSHIFT_MASTER: https://ocp37m1.lab.msp.redhat.com BEARER_TOKEN_FILE: /var/run/secrets/kubernetes.io/serviceaccount/token OPENSHIFT_CA_DATA: -----BEGIN CERTIFICATE----- MIIC6jCCAdKgAwIBAgIBATANBgkqhkiG9w0BAQsFADAmMSQwIgYDVQQDDBtvcGVu c2hpZnQtc2lnbmVyQDE1NDMwMDEwNjIwHhcNMTgxMTIzMTkyNDIxWhcNMjMxMTIy MTkyNDIyWjAmMSQwIgYDVQQDDBtvcGVuc2hpZnQtc2lnbmVyQDE1NDMwMDEwNjIw ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDbesBI8KLapzVgQ7IgJIPQ +aIgolUl4lDWtXxIxgA0nJcVPMo8py2XUXM/4WNtBj15djpfTrYaqEnouLo7RLni Uv94nib+pFUib2/8+483jv3qDxzSKIWertGe48Kfjweys8m7v9XEmlx3jndMZdWu JSNoZQMM0nby98pLqXzUG7ErCx7rZNYRBJ0MOCAMUW+F1qUYedaCTHEUgnc+KJF3 GzSzUengfI1Jy85q6OmdA5UaECz9wc3rOXjNfJhasdYxcM161OHG9Jweqnf2zJ6e 0LXEDWsJQyhpcGeonIEhkaLCvv2mBKFZvwJbL+BKdw2ieaGNBpEb+PekiTzoIeH7 AgMBAAGjIzAhMA4GA1UdDwEB/wQEAwICpDAPBgNVHRMBAf8EBTADAQH/MA0GCSqG SIb3DQEBCwUAA4IBAQBApRN13VMsUyP3ial9AmukV3jfvm/Em86+GMhaRiTQGSin CUUIoTeD7A37ak5neIeVdRVi/34a9+YjRV7uYt0VXDMm5u1CIA2VLIsXziWBB/kt OSaKfGhMpPpskjhx/TOv9xLUeIQYEBpAarm74M6hyw/WaakZSRcWCY0rLk99etlj 
d/NhJLBPVO7brYHpoTFbQJnZATDVXGpa5RpS4sVkKGu2OEss2hPI+CIm6S5HNh38 /3QZjPihzI/AZNJ3YpalBy1rfOHl2WGsXPh9hKstDJOYLL2XbFUCAR/nhenLCd7h T+IP1ekpIHe/wVc/e4flVcOY4i5eoygrXOTN4Q2O -----END CERTIFICATE----- OPENSHIFT_DEPLOYMENT_NAME: router-1 OPENSHIFT_DEPLOYMENT_NAMESPACE: default Mounts: /var/run/secrets/kubernetes.io/serviceaccount from deployer-token-l2mzs (ro) Conditions: Type Status Initialized True Ready False PodScheduled True Volumes: deployer-token-l2mzs: Type: Secret (a volume populated by a Secret) SecretName: deployer-token-l2mzs Optional: false QoS Class: BestEffort Node-Selectors: region=infra Tolerations: <none> Events: <none> [root@ocp37m1 ~]# oc describe po/router-1-deploy Name: router-1-deploy Namespace: default Node: ocp37node5.lab.msp.redhat.com/10.15.108.90 Start Time: Fri, 23 Nov 2018 14:08:45 -0600 Labels: openshift.io/deployer-pod-for.name=router-1 Annotations: openshift.io/deployment.name=router-1 openshift.io/scc=restricted Status: Failed IP: 10.129.4.2 Containers: deployment: Container ID: docker://8f2643c08a56961a9bfd391fc6d08511b5193b32ee2b8432741eab290e2c692b Image: sat6.lab.msp.redhat.com:5000/syangsao-ocp-openshift3_ose-deployer:v3.7.44 Image ID: docker-pullable://sat6.lab.msp.redhat.com:5000/syangsao-ocp-openshift3_ose-deployer@sha256:7866af10cee87798436b8391703901cfa8f058728c1ba01e5d7230f508c41972 Port: <none> State: Terminated Reason: Error Exit Code: 1 Started: Fri, 23 Nov 2018 14:10:28 -0600 Finished: Fri, 23 Nov 2018 14:20:30 -0600 Ready: False Restart Count: 0 Environment: KUBERNETES_MASTER: https://ocp37m1.lab.msp.redhat.com OPENSHIFT_MASTER: https://ocp37m1.lab.msp.redhat.com BEARER_TOKEN_FILE: /var/run/secrets/kubernetes.io/serviceaccount/token OPENSHIFT_CA_DATA: -----BEGIN CERTIFICATE----- MIIC6jCCAdKgAwIBAgIBATANBgkqhkiG9w0BAQsFADAmMSQwIgYDVQQDDBtvcGVu c2hpZnQtc2lnbmVyQDE1NDMwMDEwNjIwHhcNMTgxMTIzMTkyNDIxWhcNMjMxMTIy MTkyNDIyWjAmMSQwIgYDVQQDDBtvcGVuc2hpZnQtc2lnbmVyQDE1NDMwMDEwNjIw 
ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDbesBI8KLapzVgQ7IgJIPQ +aIgolUl4lDWtXxIxgA0nJcVPMo8py2XUXM/4WNtBj15djpfTrYaqEnouLo7RLni Uv94nib+pFUib2/8+483jv3qDxzSKIWertGe48Kfjweys8m7v9XEmlx3jndMZdWu JSNoZQMM0nby98pLqXzUG7ErCx7rZNYRBJ0MOCAMUW+F1qUYedaCTHEUgnc+KJF3 GzSzUengfI1Jy85q6OmdA5UaECz9wc3rOXjNfJhasdYxcM161OHG9Jweqnf2zJ6e 0LXEDWsJQyhpcGeonIEhkaLCvv2mBKFZvwJbL+BKdw2ieaGNBpEb+PekiTzoIeH7 AgMBAAGjIzAhMA4GA1UdDwEB/wQEAwICpDAPBgNVHRMBAf8EBTADAQH/MA0GCSqG SIb3DQEBCwUAA4IBAQBApRN13VMsUyP3ial9AmukV3jfvm/Em86+GMhaRiTQGSin CUUIoTeD7A37ak5neIeVdRVi/34a9+YjRV7uYt0VXDMm5u1CIA2VLIsXziWBB/kt OSaKfGhMpPpskjhx/TOv9xLUeIQYEBpAarm74M6hyw/WaakZSRcWCY0rLk99etlj d/NhJLBPVO7brYHpoTFbQJnZATDVXGpa5RpS4sVkKGu2OEss2hPI+CIm6S5HNh38 /3QZjPihzI/AZNJ3YpalBy1rfOHl2WGsXPh9hKstDJOYLL2XbFUCAR/nhenLCd7h T+IP1ekpIHe/wVc/e4flVcOY4i5eoygrXOTN4Q2O -----END CERTIFICATE----- OPENSHIFT_DEPLOYMENT_NAME: router-1 OPENSHIFT_DEPLOYMENT_NAMESPACE: default Mounts: /var/run/secrets/kubernetes.io/serviceaccount from deployer-token-l2mzs (ro) Conditions: Type Status Initialized True Ready False PodScheduled True Volumes: deployer-token-l2mzs: Type: Secret (a volume populated by a Secret) SecretName: deployer-token-l2mzs Optional: false QoS Class: BestEffort Node-Selectors: region=infra Tolerations: <none> Events: <none>
> Also, the router deploy pod is version 1 of the DC > but it's now running version 5 of the DC, what's changed? I just thought about this, I believe it may be related to the amount of times I've attempted this installation via ansible-playbook, already. I was thinking about just cleaning out these VM's and reinstalling fresh again (since we've made a few mods to the inventory file to get past other issues). Just let me know if I should go that route or provide more logs. Thanks!
If you start fresh and it works, I don't think we'd have time to look any further, so I don't think we need any additional logs now. If the fresh install works then just close this bug at that time.
Fresh installation, 2 attempts, it just hangs waiting on the router to come up on all 5 nodes. At this TASK, I'll log the next install this time to a file. TASK [openshift_hosted : Create the router service account(s)] I noticed that the container was being created at this time on the last 2 nodes, but the install just failed (even though it was at about 23 attempts and had 18 tries left). [root@ocp37m1 ~]# oc get pods -o wide NAME READY STATUS RESTARTS AGE IP NODE router-1-deploy 0/1 Error 0 1h 10.131.2.2 ocp37node5.lab.msp.redhat.com router-2-59nv9 0/1 ContainerCreating 0 8m 10.15.108.88 ocp37node3.lab.msp.redhat.com router-2-66t8s 1/1 Running 0 8m 10.15.108.89 ocp37node4.lab.msp.redhat.com router-2-deploy 1/1 Running 0 8m 10.131.2.3 ocp37node5.lab.msp.redhat.com router-2-gz52m 1/1 Running 0 8m 10.15.108.87 ocp37node2.lab.msp.redhat.com router-2-nt2h2 1/1 Running 0 8m 10.15.108.90 ocp37node5.lab.msp.redhat.com router-2-t8xlp 0/1 ContainerCreating 0 8m 10.15.108.86 ocp37node1.lab.msp.redhat.com [root@ocp37m1 ~]# oc projects You have access to the following projects and can switch between them with 'oc project <projectname>': * default kube-public kube-system logging management-infra openshift openshift-infra openshift-node Using project "default" on server "https://ocp37master.lab.msp.redhat.com". 
[root@ocp37m1 ~]# oc get pods NAME READY STATUS RESTARTS AGE router-1-deploy 0/1 Error 0 1h router-2-deploy 0/1 Error 0 16m [root@ocp37m1 ~]# oc logs router-1-deploy --> Scaling router-1 to 5 error: update acceptor rejected router-1: pods for rc 'default/router-1' took longer than 600 seconds to become available [root@ocp37m1 ~]# oc logs router-2-deploy --> Scaling router-2 to 5 error: update acceptor rejected router-2: pods for rc 'default/router-2' took longer than 600 seconds to become available [root@ocp37m1 ~]# oc get pods -o wide NAME READY STATUS RESTARTS AGE IP NODE router-1-deploy 0/1 Error 0 1h 10.131.2.2 ocp37node5.lab.msp.redhat.com router-2-deploy 0/1 Error 0 17m 10.131.2.3 I'm assuming that it didn't finish the task because the router went over 600 seconds before becoming available. Is there a way to up the 600 seconds? (knowing that we're just waiting for the container image to be created, mainly due to resource contention... vCPU, etc.)
Encountered the error alot sooner with the default registry service not completing and timing out. ## ansible installer from control node TASK [openshift_hosted : create the default registry service] ****************************************** [...] fatal: [ocp37m1.lab.msp.redhat.com]: FAILED! => { "changed": false, "failed": true, "invocation": { "module_args": { "annotations": null, "clusterip": "", "debug": false, "external_ips": null, "kubeconfig": "/etc/origin/master/admin.kubeconfig", "labels": null, "name": "docker-registry", "namespace": "default", "portalip": null, "ports": [ { "name": "5000-tcp", "port": 5000, "protocol": "TCP", "targetPort": 5000 } ], "selector": { "docker-registry": "default" }, "service_type": "ClusterIP", "session_affinity": "ClientIP", "state": "present" } }, "msg": { "cmd": "/usr/bin/oc create -f /tmp/docker-registry-MP1zs5 -n default", "results": {}, "returncode": 1, "stderr": "Error from server (Timeout): error when creating \"/tmp/docker-registry-MP1zs5\": Timeout: request did not complete within allowed duration\n", "stdout": "" } } PLAY RECAP ********************************************************************************************* localhost : ok=12 changed=0 unreachable=0 failed=0 ocp37app1.lab.msp.redhat.com : ok=193 changed=67 unreachable=0 failed=0 ocp37app2.lab.msp.redhat.com : ok=193 changed=67 unreachable=0 failed=0 ocp37m1.lab.msp.redhat.com : ok=548 changed=211 unreachable=0 failed=1 ocp37m2.lab.msp.redhat.com : ok=427 changed=163 unreachable=0 failed=0 ocp37m3.lab.msp.redhat.com : ok=427 changed=163 unreachable=0 failed=0 ocp37node1.lab.msp.redhat.com : ok=193 changed=67 unreachable=0 failed=0 ocp37node2.lab.msp.redhat.com : ok=193 changed=67 unreachable=0 failed=0 ocp37node3.lab.msp.redhat.com : ok=193 changed=67 unreachable=0 failed=0 ocp37node4.lab.msp.redhat.com : ok=193 changed=67 unreachable=0 failed=0 ocp37node5.lab.msp.redhat.com : ok=193 changed=67 unreachable=0 failed=0 INSTALLER STATUS 
*************************************************************************************** Initialization : Complete Health Check : Complete etcd Install : Complete Master Install : Complete Master Additional Install : Complete Node Install : Complete Hosted Install : In Progress This phase can be restarted by running: playbooks/byo/openshift-cluster/openshift-hosted.yml ## one of the masters [root@ocp37m1 dnsmasq.d]# oc get all NAME REVISION DESIRED CURRENT TRIGGERED BY deploymentconfigs/router 1 5 0 config NAME READY STATUS RESTARTS AGE po/router-1-deploy 0/1 Error 0 1h NAME DESIRED CURRENT READY AGE rc/router-1 0 0 0 1h NAME CLUSTER-IP EXTERNAL-IP PORT(S) AGE svc/docker-registry 172.30.244.68 <none> 5000/TCP 1h svc/kubernetes 172.30.0.1 <none> 443/TCP,53/UDP,53/TCP 1h svc/router 172.30.179.48 <none> 80/TCP,443/TCP,1936/TCP 1h [root@ocp37m1 dnsmasq.d]# oc logs -n default router-1-deploy --> Scaling router-1 to 5 error: update acceptor rejected router-1: pods for rc 'default/router-1' took longer than 600 seconds to become available [root@ocp37m1 dnsmasq.d]# oc logs deploymentconfigs/router --> Scaling router-1 to 5 error: update acceptor rejected router-1: pods for rc 'default/router-1' took longer than 600 seconds to become available [root@ocp37m1 dnsmasq.d]# oc logs po/router-1-deploy --> Scaling router-1 to 5 error: update acceptor rejected router-1: pods for rc 'default/router-1' took longer than 600 seconds to become available [root@ocp37m1 dnsmasq.d]# oc status -v In project default on server https://ocp37master.lab.msp.redhat.com svc/docker-registry - 172.30.244.68:5000 svc/kubernetes - 172.30.0.1 ports 443, 53->8053, 53->8053 svc/router - 172.30.179.48 ports 80, 443, 1936 dc/router deploys sat6.lab.msp.redhat.com:5000/syangsao-ocp-openshift3_ose-haproxy-router:v3.7.44 deployment #1 failed about an hour ago: config change Info: * pod/router-1-deploy has no liveness probe to verify pods are still running. try: oc set probe pod/router-1-deploy --liveness ... 
View details with 'oc describe <resource>/<name>' or list everything with 'oc get all'. [root@ocp37m1 dnsmasq.d]# oc set probe pod/router-1-deploy --liveness error: Pod "router-1-deploy" is invalid: [spec.containers[0].livenessProbe: Required value: must specify a handler type, spec: Forbidden: pod updates may not change fields other than `spec.containers[*].image`, `spec.initContainers[*].image`, `spec.activeDeadlineSeconds` or `spec.tolerations` (only additions to existing tolerations) {"Volumes":[{"Name":"deployer-token-t2vdp","HostPath":null,"EmptyDir":null,"GCEPersistentDisk":null,"AWSElasticBlockStore":null,"GitRepo":null,"Secret":{"SecretName":"deployer-token-t2vdp","Items":null,"DefaultMode":420,"Optional":null},"NFS":null,"ISCSI":null,"Glusterfs":null,"PersistentVolumeClaim":null,"RBD":null,"Quobyte":null,"FlexVolume":null,"Cinder":null,"CephFS":null,"Flocker":null,"DownwardAPI":null,"FC":null,"AzureFile":null,"ConfigMap":null,"VsphereVolume":null,"AzureDisk":null,"PhotonPersistentDisk":null,"Projected":null,"PortworxVolume":null,"ScaleIO":null,"StorageOS":null}],"InitContainers":null,"Containers":[{"Name":"deployment","Image":"sat6.lab.msp.redhat.com:5000/syangsao-ocp-openshift3_ose-deployer:v3.7.44","Command":null,"Args":null,"WorkingDir":"","Ports":null,"EnvFrom":null,"Env":[{"Name":"KUBERNETES_MASTER","Value":"https://ocp37m2.lab.msp.redhat.com","ValueFrom":null},{"Name":"OPENSHIFT_MASTER","Value":"https://ocp37m2.lab.msp.redhat.com","ValueFrom":null},{"Name":"BEARER_TOKEN_FILE","Value":"/var/run/secrets/kubernetes.io/serviceaccount/token","ValueFrom":null},{"Name":"OPENSHIFT_CA_DATA","Value":"-----BEGIN 
CERTIFICATE-----\nMIIC6jCCAdKgAwIBAgIBATANBgkqhkiG9w0BAQsFADAmMSQwIgYDVQQDDBtvcGVu\nc2hpZnQtc2lnbmVyQDE1NDM1MzcxMzUwHhcNMTgxMTMwMDAxODU0WhcNMjMxMTI5\nMDAxODU1WjAmMSQwIgYDVQQDDBtvcGVuc2hpZnQtc2lnbmVyQDE1NDM1MzcxMzUw\nggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDk88vTUrslJiwBPv+SNnAE\n0hE9o9raVv5W9AueW+37iUWXrM9uzd3hi9vOXGHahKjsrO6+/92ZxtyWwERWiTcL\nOWti7YrnAhB8pfPl5nfcNFS7y1mW4go2yJeJbnWQ1V7vfSXUmZ+KF/RKWgmsUdni\nVcfzb0qk8UoamTxAkyLS32CwDaIzvqVaiAiui2iw9eS2lRZpS52lrZe4JfID1Hoc\nlt3JvyrYxuoQSHKCFt8txVIf12wmJx/3UVE/fF8A2kkNPrfQRrQTvagaTD5PdSuW\nhZi1zXuEqe2QFvEMK04g6S2Ovon0Qsu1a8EXHABwz/hLQpUA40HHa7NaLtlimWe3\nAgMBAAGjIzAhMA4GA1UdDwEB/wQEAwICpDAPBgNVHRMBAf8EBTADAQH/MA0GCSqG\nSIb3DQEBCwUAA4IBAQA4P/9MVWHM40GpQsPo5rG/R38EuB3kH5U7dm9Ti26gKypl\nLRe4Qa3tUm1LAapdagB8oXBwDUvn9BBXp9dGQBZ04z44ctpLD8KMckDfWs4oUyc7\nWB/DAuQ4y+p5khMHa1F9X1/lWRIlrbUqVVUNoFUhzKiia2vVX5EF2lblJL3gvSrF\nE+xOXhQ08LMlVnf8urefpDy/dAYthOkOtbeIQeOo4o92PztXl7ARdIHMmPQxe2+U\n2goP5FWjceRAo5nEE7zQJcECQa6hAP3YS0rr2PFTLirdnrnREyZker9tf2tY4mZP\nMqBKgSTo1wlB6AMFcmU3SsN2XGkorrttW3JTPbz0\n-----END CERTIFICATE-----\n","ValueFrom":null},{"Name":"OPENSHIFT_DEPLOYMENT_NAME","Value":"router-1","ValueFrom":null},{"Name":"OPENSHIFT_DEPLOYMENT_NAMESPACE","Value":"default","ValueFrom":null}],"Resources":{"Limits":null,"Requests":null},"VolumeMounts":[{"Name":"deployer-token-t2vdp","ReadOnly":true,"MountPath":"/var/run/secrets/kubernetes.io/serviceaccount","SubPath":""}],"LivenessProbe": A: 
{"Exec":null,"HTTPGet":null,"TCPSocket":null,"InitialDelaySeconds":0,"TimeoutSeconds":1,"PeriodSeconds":10,"SuccessThreshold":1,"FailureThreshold":3},"ReadinessProbe":null,"Lifecycle":null,"TerminationMessagePath":"/dev/termination-log","TerminationMessagePolicy":"File","ImagePullPolicy":"IfNotPresent","SecurityContext":{"Capabilities":{"Add":null,"Drop":["KILL","MKNOD","SETGID","SETUID"]},"Privileged":false,"SELinuxOptions":{"User":"","Role":"","Type":"","Level":"s0:c1,c0"},"RunAsUser":1000000000,"RunAsNonRoot":null,"ReadOnlyRootFilesystem":null},"Stdin":false,"StdinOnce":false,"TTY":false}],"RestartPolicy":"Never","TerminationGracePeriodSeconds":10,"ActiveDeadlineSeconds":21600,"DNSPolicy":"ClusterFirst","NodeSelector":{"region":"infra"},"ServiceAccountName":"deployer","AutomountServiceAccountToken":null,"NodeName":"ocp37node5.lab.msp.redhat.com","SecurityContext":{"HostNetwork":false,"HostPID":false,"HostIPC":false,"SELinuxOptions":{"User":"","Role":"","Type":"","Level":"s0:c1,c0"},"RunAsUser":null,"RunAsNonRoot":null,"SupplementalGroups":[],"FSGroup":1000000000},"ImagePullSecrets":[{"Name":"deployer-dockercfg-hpdmn"}],"Hostname":"","Subdomain":"","Affinity":null,"SchedulerName":"default-scheduler","Tolerations":null,"HostAliases":null} B: 
null,"ReadinessProbe":null,"Lifecycle":null,"TerminationMessagePath":"/dev/termination-log","TerminationMessagePolicy":"File","ImagePullPolicy":"IfNotPresent","SecurityContext":{"Capabilities":{"Add":null,"Drop":["KILL","MKNOD","SETGID","SETUID"]},"Privileged":false,"SELinuxOptions":{"User":"","Role":"","Type":"","Level":"s0:c1,c0"},"RunAsUser":1000000000,"RunAsNonRoot":null,"ReadOnlyRootFilesystem":null},"Stdin":false,"StdinOnce":false,"TTY":false}],"RestartPolicy":"Never","TerminationGracePeriodSeconds":10,"ActiveDeadlineSeconds":21600,"DNSPolicy":"ClusterFirst","NodeSelector":{"region":"infra"},"ServiceAccountName":"deployer","AutomountServiceAccountToken":null,"NodeName":"ocp37node5.lab.msp.redhat.com","SecurityContext":{"HostNetwork":false,"HostPID":false,"HostIPC":false,"SELinuxOptions":{"User":"","Role":"","Type":"","Level":"s0:c1,c0"},"RunAsUser":null,"RunAsNonRoot":null,"SupplementalGroups":null,"FSGroup":1000000000},"ImagePullSecrets":[{"Name":"deployer-dockercfg-hpdmn"}],"Hostname":"","Subdomain":"","Affinity":null,"SchedulerName":"default-scheduler","Tolerations":null,"HostAliases":null} ]
Created attachment 1510181 [details] ansible installation output with -vvv for comment#12's installation
Seems to be DNS related, why isn't dnsmasq going to the upstream DNS server when looking up the master hostname during the installation? It's attempting to look internally and can not find the master hostname. [root@ocp37m1 ~]# cat /etc/resolv.conf # nameserver updated by /etc/NetworkManager/dispatcher.d/99-origin-dns.sh # Generated by NetworkManager search cluster.local lab.msp.redhat.com redhat.com nameserver 10.15.108.83 [root@ocp37m1 ~]# oc get pods -o wide NAME READY STATUS RESTARTS AGE IP NODE docker-registry-1-deploy 0/1 ContainerCreating 0 15m <none> ocp37node4.lab.msp.redhat.com registry-console-1-deploy 0/1 ContainerCreating 0 13m <none> ocp37app1.lab.msp.redhat.com router-1-deploy 0/1 Error 0 18m 10.128.4.2 ocp37node5.lab.msp.redhat.com [root@ocp37m1 ~]# oc logs docker-registry-1-deploy Unable to connect to the server: dial tcp: lookup ocp37master.lab.msp.redhat.com on 10.15.108.83:53: no such host [root@ocp37m1 ~]# ping ocp37master PING ocp37master.lab.msp.redhat.com (10.15.108.91) 56(84) bytes of data. 64 bytes from ocp37master.lab.msp.redhat.com (10.15.108.91): icmp_seq=1 ttl=64 time=0.498 ms 64 bytes from ocp37master.lab.msp.redhat.com (10.15.108.91): icmp_seq=2 ttl=64 time=0.427 ms ^C --- ocp37master.lab.msp.redhat.com ping statistics --- 2 packets transmitted, 2 received, 0% packet loss, time 1000ms rtt min/avg/max/mdev = 0.427/0.462/0.498/0.041 ms [root@ocp37m1 ~]# ping ocp37master.lab.msp.redhat.com PING ocp37master.lab.msp.redhat.com (10.15.108.91) 56(84) bytes of data. 
64 bytes from ocp37master.lab.msp.redhat.com (10.15.108.91): icmp_seq=1 ttl=64 time=0.513 ms 64 bytes from ocp37master.lab.msp.redhat.com (10.15.108.91): icmp_seq=2 ttl=64 time=0.593 ms 64 bytes from ocp37master.lab.msp.redhat.com (10.15.108.91): icmp_seq=3 ttl=64 time=0.753 ms ^C --- ocp37master.lab.msp.redhat.com ping statistics --- 3 packets transmitted, 3 received, 0% packet loss, time 2001ms rtt min/avg/max/mdev = 0.513/0.619/0.753/0.103 ms [root@ocp37m1 ~]# dig ocp37master.lab.msp.redhat.com ; <<>> DiG 9.9.4-RedHat-9.9.4-61.el7_5.1 <<>> ocp37master.lab.msp.redhat.com ;; global options: +cmd ;; Got answer: ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 12080 ;; flags: qr aa rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 0, ADDITIONAL: 0 ;; QUESTION SECTION: ;ocp37master.lab.msp.redhat.com. IN A ;; ANSWER SECTION: ocp37master.lab.msp.redhat.com. 0 IN A 10.15.108.91 ;; Query time: 2 msec ;; SERVER: 10.15.108.83#53(10.15.108.83) ;; WHEN: Mon Dec 03 11:44:36 CST 2018 ;; MSG SIZE rcvd: 64 [root@ocp37m1 ~]# oc logs registry-console-1-deploy --> Scaling registry-console-1 to 1
The install fails at this step: TASK [openshift_hosted : Create default projects] ****************************************************** failed: [ocp37m1.lab.msp.redhat.com] (item={'key': u'openshift-infra', 'value': {u'default_node_selector': u''}}) => { "failed": true, "invocation": { "module_args": { "admin": null, "admin_role": "admin", "debug": false, "description": null, "display_name": null, "kubeconfig": "/etc/origin/master/admin.kubeconfig", "name": "openshift-infra", "node_selector": [ "" ], "state": "present" } }, "item": { "key": "openshift-infra", "value": { "default_node_selector": "" } }, "msg": { "cmd": "/usr/bin/oc adm new-project openshift-infra --admin-role=admin --node-selector=", "results": {}, "returncode": 1, "stderr": "Unable to connect to the server: dial tcp: lookup ocp37master.lab.msp.redhat.com on 10.15.108.83:53: no such host\n", "stdout": "" } } When checking the master, it fails looking up the master's hostname against DNS: <snip> [root@ocp37m1 ~]# oc get events Unable to connect to the server: dial tcp: lookup ocp37master.lab.msp.redhat.com on 10.15.108.83:53: no such host [root@ocp37m1 ~]# ping ocp37master.lab.msp.redhat.com ping: ocp37master.lab.msp.redhat.com: Name or service not known [root@ocp37m1 ~]# ping ocp37master.lab.msp.redhat.com ping: ocp37master.lab.msp.redhat.com: Name or service not known [root@ocp37m1 ~]# dig ocp37master.lab.msp.redhat.com ; <<>> DiG 9.9.4-RedHat-9.9.4-61.el7_5.1 <<>> ocp37master.lab.msp.redhat.com ;; global options: +cmd ;; Got answer: ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 48626 ;; flags: qr aa rd ra; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1 ;; OPT PSEUDOSECTION: ; EDNS: version: 0, flags:; udp: 4096 ;; QUESTION SECTION: ;ocp37master.lab.msp.redhat.com. IN A ;; AUTHORITY SECTION: lab.msp.redhat.com. 86400 IN SOA buffett.lab.msp.redhat.com. cfeist.redhat.com. 
2018111401 3600 1800 604800 86400 ;; Query time: 6 msec ;; SERVER: 10.15.108.83#53(10.15.108.83) ;; WHEN: Mon Dec 03 14:36:34 CST 2018 ;; MSG SIZE rcvd: 110 [root@ocp37m1 ~]# ping ocp37master.lab.msp.redhat.com ping: ocp37master.lab.msp.redhat.com: Name or service not known [root@ocp37m1 ~]# ping ocp37master.lab.msp.redhat.com PING ocp37master.lab.msp.redhat.com (10.15.108.91) 56(84) bytes of data. 64 bytes from ocp37master.lab.msp.redhat.com (10.15.108.91): icmp_seq=1 ttl=64 time=0.704 ms 64 bytes from ocp37master.lab.msp.redhat.com (10.15.108.91): icmp_seq=2 ttl=64 time=0.474 ms 64 bytes from ocp37master.lab.msp.redhat.com (10.15.108.91): icmp_seq=3 ttl=64 time=0.424 ms ^C --- ocp37master.lab.msp.redhat.com ping statistics --- 3 packets transmitted, 3 received, 0% packet loss, time 2001ms rtt min/avg/max/mdev = 0.424/0.534/0.704/0.121 ms [root@ocp37m1 ~]# dig ocp37master.lab.msp.redhat.com ; <<>> DiG 9.9.4-RedHat-9.9.4-61.el7_5.1 <<>> ocp37master.lab.msp.redhat.com ;; global options: +cmd ;; Got answer: ;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 45744 ;; flags: qr aa rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 0, ADDITIONAL: 0 ;; QUESTION SECTION: ;ocp37master.lab.msp.redhat.com. IN A ;; ANSWER SECTION: ocp37master.lab.msp.redhat.com. 0 IN A 10.15.108.91 ;; Query time: 6 msec ;; SERVER: 10.15.108.83#53(10.15.108.83) ;; WHEN: Mon Dec 03 14:37:16 CST 2018 ;; MSG SIZE rcvd: 64 </snip> Then all of a sudden it picks up the right upstream server again. Just odd, seems like dnsmasq is doing some sort of loop around the upstream DNS nameservers. The only thing I'm thinking about doing now is adding the hostname to the local /etc/hosts file next as a workaround, not sure why dnsmasq is not finding the master server name, is it trying to valid it via all the nameservers (just wondering since this hostname only exists on one of the nameservers in the upstream list)? 
<snip> [root@ocp37m1 ~]# cat /etc/resolv.conf # nameserver updated by /etc/NetworkManager/dispatcher.d/99-origin-dns.sh # Generated by NetworkManager search cluster.local lab.msp.redhat.com redhat.com nameserver 10.15.108.83 [root@ocp37m1 ~]# cat /etc/dnsmasq.d/origin-upstream-dns.conf server=10.15.108.22 server=10.15.84.128 </snip>
Created attachment 1511104 [details] — ansible playbook output for comment #15.
Setting needinfo for comment #15.
Closing this: I found an issue with the upstream nameservers that is related to this problem, and I'll file a new bz describing what I found. Thanks!