Bug 2088627
| Summary: | [OVN] EgressIP NATs are not being cleared correctly from the logical router | |||
|---|---|---|---|---|
| Product: | OpenShift Container Platform | Reporter: | ffernand <ffernand> | |
| Component: | Networking | Assignee: | ffernand <ffernand> | |
| Networking sub component: | ovn-kubernetes | QA Contact: | jechen <jechen> | |
| Status: | CLOSED ERRATA | Docs Contact: | ||
| Severity: | high | |||
| Priority: | high | CC: | andcosta, anusaxen, bpickard, cldavey, ffernand, jechen, openshift-bugs-escalate, pdiak, rravaiol, surya, vlours | |
| Version: | 4.8 | Keywords: | FastFix | |
| Target Milestone: | --- | |||
| Target Release: | 4.10.z | |||
| Hardware: | x86_64 | |||
| OS: | Linux | |||
| Whiteboard: | ||||
| Fixed In Version: | Doc Type: | If docs needed, set a value | ||
| Doc Text: | Story Points: | --- | ||
| Clone Of: | 2088626 | |||
| : | 2088630 (view as bug list) | Environment: | ||
| Last Closed: | 2022-06-07 13:24:31 UTC | Type: | --- | |
| Regression: | --- | Mount Type: | --- | |
| Documentation: | --- | CRM: | ||
| Verified Versions: | Category: | --- | ||
| oVirt Team: | --- | RHEL 7.3 requirements from Atomic Host: | ||
| Cloudforms Team: | --- | Target Upstream Version: | ||
| Embargoed: | ||||
| Bug Depends On: | 2088626 | |||
| Bug Blocks: | 2088630 | |||
|
Comment 1
jechen
2022-05-25 02:17:13 UTC
Verified with pre-merged image 4.10.0-0.ci.test-2022-05-24-200227-ci-ln-tyxls9k-latest
$ oc get clusterversion
NAME VERSION AVAILABLE PROGRESSING SINCE STATUS
version 4.10.0-0.ci.test-2022-05-24-200227-ci-ln-tyxls9k-latest True False 33m Cluster version is 4.10.0-0.ci.test-2022-05-24-200227-ci-ln-tyxls9k-latest
$ oc get node
NAME STATUS ROLES AGE VERSION
jechen-0525a-h29xb-master-0.c.openshift-qe.internal Ready master 154m v1.23.5+3afdacb
jechen-0525a-h29xb-master-1.c.openshift-qe.internal Ready master 153m v1.23.5+3afdacb
jechen-0525a-h29xb-master-2.c.openshift-qe.internal Ready master 154m v1.23.5+3afdacb
jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal Ready worker 142m v1.23.5+3afdacb
jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal Ready worker 142m v1.23.5+3afdacb
jechen-0525a-h29xb-worker-c-rwzmj.c.openshift-qe.internal Ready worker 142m v1.23.5+3afdacb
# label two nodes as egress nodes
$ oc label node jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal "k8s.ovn.org/egress-assignable"=""
node/jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal labeled
$ oc label node jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal "k8s.ovn.org/egress-assignable"=""
node/jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal labeled
# create egressIP object
$ oc create -f config_egressip1_ovn_ns_team_red.yaml
egressip.k8s.ovn.org/egressip1 created
$ oc get egressip
NAME EGRESSIPS ASSIGNED NODE ASSIGNED EGRESSIPS
egressip1 10.0.128.101 jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal 10.0.128.101
$ oc get egressip egressip1 -oyaml
apiVersion: k8s.ovn.org/v1
kind: EgressIP
metadata:
creationTimestamp: "2022-05-25T14:29:30Z"
generation: 2
name: egressip1
resourceVersion: "45680"
uid: 39997df6-c31a-49b6-bc0a-9151c9d26840
spec:
egressIPs:
- 10.0.128.101
namespaceSelector:
matchLabels:
team: red
status:
items:
- egressIP: 10.0.128.101
node: jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal
# create a namespace, label the namespace with team=red, create some test pods in it
$ oc new-project test
$ oc label ns test team=red
namespace/test labeled
$ oc create -f ./SDN-1332-test/list_for_pods.json
replicationcontroller/test-rc created
service/test-service created
$ oc get pod
NAME READY STATUS RESTARTS AGE
test-rc-62sdg 1/1 Running 0 9m38s
test-rc-74xbp 1/1 Running 0 9m38s
test-rc-j4jbx 1/1 Running 0 9m38s
test-rc-slbqz 1/1 Running 0 9m38s
test-rc-v47xm 1/1 Running 0 9m38s
$ oc rsh test-rc-62sdg
~ $ curl 10.0.0.2:9152
10.0.128.101~ $
# check NAT
$ oc get -o jsonpath='{.metadata.annotations.control-plane\.alpha\.kubernetes\.io/leader}' -n openshift-ovn-kubernetes cm ovn-kubernetes-master
{"holderIdentity":"jechen-0525a-h29xb-master-1.c.openshift-qe.internal","leaseDurationSeconds":60,"acquireTime":"2022-05-25T13:24:27Z","renewTime":"2022-05-25T14:30:08Z","leaderTransitions":0}
$ oc get pod -n openshift-ovn-kubernetes -l app=ovnkube-master --field-selector=spec.nodeName=jechen-0525a-h29xb-master-1.c.openshift-qe.internal -o jsonpath={.items[*].metadata.name}
ovnkube-master-fp652
$ oc -n openshift-ovn-kubernetes rsh ovnkube-master-fp652
Defaulted container "northd" out of: northd, nbdb, kube-rbac-proxy, sbdb, ovnkube-master, ovn-dbchecker
sh-4.4# ps auxwww | grep ssl
root 1 0.0 0.0 11920 2748 ? Ss 13:23 0:00 /bin/bash -c set -xem if [[ -f /env/_master ]]; then set -o allexport source /env/_master set +o allexport fi quit() { echo "$(date -Iseconds) - stopping ovn-northd" OVN_MANAGE_OVSDB=no /usr/share/ovn/scripts/ovn-ctl stop_northd echo "$(date -Iseconds) - ovn-northd stopped" rm -f /var/run/ovn/ovn-northd.pid exit 0 } # end of quit trap quit TERM INT echo "$(date -Iseconds) - starting ovn-northd" exec ovn-northd \ --no-chdir "-vconsole:${OVN_LOG_LEVEL}" -vfile:off "-vPATTERN:console:%D{%Y-%m-%dT%H:%M:%S.###Z}|%05N|%c%T|%p|%m" \ --ovnnb-db "ssl:10.0.0.5:9641,ssl:10.0.0.6:9641,ssl:10.0.0.7:9641" \ --ovnsb-db "ssl:10.0.0.5:9642,ssl:10.0.0.6:9642,ssl:10.0.0.7:9642" \ --pidfile /var/run/ovn/ovn-northd.pid \ -p /ovn-cert/tls.key \ -c /ovn-cert/tls.crt \ -C /ovn-ca/ca-bundle.crt & wait $!
root 9 0.0 0.1 205668 22712 ? Sl 13:23 0:02 ovn-northd --no-chdir -vconsole:info -vfile:off -vPATTERN:console:%D{%Y-%m-%dT%H:%M:%S.###Z}|%05N|%c%T|%p|%m --ovnnb-db ssl:10.0.0.5:9641,ssl:10.0.0.6:9641,ssl:10.0.0.7:9641 --ovnsb-db ssl:10.0.0.5:9642,ssl:10.0.0.6:9642,ssl:10.0.0.7:9642 --pidfile /var/run/ovn/ovn-northd.pid -p /ovn-cert/tls.key -c /ovn-cert/tls.crt -C /ovn-ca/ca-bundle.crt
root 19 0.0 0.0 9208 1104 pts/0 S+ 14:31 0:00 grep ssl
sh-4.4#
sh-4.4#
sh-4.4# alias ovn-nbctl='ovn-nbctl --db ssl:10.0.0.5:9641,ssl:10.0.0.6:9641,ssl:10.0.0.7:9641 -p /ovn-cert/tls.key -c /ovn-cert/tls.crt -C /ovn-ca/ca-bundle.crt'
sh-4.4#
sh-4.4# ovn-nbctl find nat external-ids:\"name\"!=\"\"
_uuid : a5643002-ef4d-49bf-aa1b-8bfebe0e2f30
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.128.2.15"
logical_port : k8s-jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : c91a8b1b-2935-40f9-a4b9-b1a245f36892
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.129.2.14"
logical_port : k8s-jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : 366781cb-8058-4d52-9e6c-0e61434bc9d4
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.129.2.13"
logical_port : k8s-jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : 4f2735cd-e8e2-4f1e-b7a2-f7d62bbe2c85
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.128.2.16"
logical_port : k8s-jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : 3ab5f178-1d67-4982-88ce-7b6859f74e2c
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.131.0.30"
logical_port : k8s-jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal
options : {stateless="false"}
type : snat
# shut down the egress node that has the egressIP assigned, wait until the egressIP is moved to a second egress node
$ oc debug node/jechen-0525a-h29xb-worker-a-vjj7f.c.openshift-qe.internal
Starting pod/jechen-0525a-h29xb-worker-a-vjj7fcopenshift-qeinternal-debug ...
To use host binaries, run `chroot /host`
Pod IP: 10.0.128.4
If you don't see a command prompt, try pressing enter.
sh-4.4# chroot /host
sh-4.4#
sh-4.4# shutdown
Shutdown scheduled for Wed 2022-05-25 15:58:12 UTC, use 'shutdown -c' to cancel.
sh-4.4#
sh-4.4# exit
sh-4.4#
Removing debug pod ...
$ oc get egressip egressip1 -oyaml
apiVersion: k8s.ovn.org/v1
kind: EgressIP
metadata:
creationTimestamp: "2022-05-25T14:29:30Z"
generation: 4
name: egressip1
resourceVersion: "76171"
uid: 39997df6-c31a-49b6-bc0a-9151c9d26840
spec:
egressIPs:
- 10.0.128.101
namespaceSelector:
matchLabels:
team: red
status:
items:
- egressIP: 10.0.128.101
node: jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal
# check NAT again
sh-4.4#
sh-4.4# ovn-nbctl find nat external-ids:\"name\"!=\"\"
_uuid : e92efa15-d734-4c96-8c94-2fa13aefc849
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.128.2.18"
logical_port : k8s-jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : bd865116-ebd3-4c13-a2e0-bb161cff6fb7
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.131.0.30"
logical_port : k8s-jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : 2e41b0a1-bb6b-4345-af45-57d56834e465
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.128.2.15"
logical_port : k8s-jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : 908df626-d49d-46b9-9b8f-bb5a3320a6cd
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.131.0.34"
logical_port : k8s-jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : db0304ab-852d-473f-b0d6-f59abd463c12
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.128.2.16"
logical_port : k8s-jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : 4071bc3a-fbcd-4b56-a0f1-1cad78c529dd
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.129.2.13"
logical_port : k8s-jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal
options : {stateless="false"}
type : snat
_uuid : 5c3217bb-1505-49c2-b0bd-572616ad16ff
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {name=egressip1}
external_ip : "10.0.128.101"
external_mac : []
external_port_range : ""
logical_ip : "10.129.2.14"
logical_port : k8s-jechen-0525a-h29xb-worker-b-xt8zm.c.openshift-qe.internal
options : {stateless="false"}
type : snat
==> NAT entries are updated with new egressIP node
Since the problem described in this bug report should be resolved in a recent advisory, it has been closed with a resolution of ERRATA. For information on the advisory (OpenShift Container Platform 4.10.17 bug fix update), and where to find the updated files, follow the link below. If the solution does not work for you, open a new bug report. https://access.redhat.com/errata/RHBA-2022:4882