Created attachment 1937554 [details] SBDB Created attachment 1937554 [details] SBDB Description of problem: On a KIND cluster, out of three nodes one of them is not having the right SNAT zone being created, so the ovn-controller has done something wrong somewhere: router for which wrong zone id was created: sh-5.2# ovn-nbctl list logical-router GR_ovn-control-plane _uuid : c4d4770d-7a19-46f6-bac3-bd72d685eb6f copp : ff481dec-8d8b-4db3-bb16-159d116d190f enabled : [] external_ids : {physical_ip="172.19.0.3", physical_ips="172.19.0.3"} load_balancer : [a2ffeffb-b8f6-48b4-bbfa-fe58bd9c46b4, fea94d30-e345-4701-a772-735bc250a457] load_balancer_group : [7e14296e-428c-44fd-8e4b-31d9c79af7a8] name : GR_ovn-control-plane nat : [13f19514-3ada-48d6-a802-2328a39c2083, 1a2e9bd4-11e4-4b11-8fba-b972a7d31d61, 77daa738-38ad-4ace-a55f-3e044db6aa3e] options : {always_learn_from_arp_request="false", chassis="42ff75c4-bea7-49fa-a311-2e7e408950fd", dynamic_neigh_routers="true", lb_force_snat_ip=router_ip, snat-ct-zone="0"} policies : [] ports : [80b8f8fc-0d35-4d74-8cb5-64790bf3f9b2, b1ddee97-5291-4de6-94b0-f20d21cb9153] static_routes : [49cb42f0-6197-44ec-86b4-275081fb8ed6, 753ecea6-cdf6-4184-a805-be208e06b8b5] sh-5.2# ovn-sbctl list datapath_binding GR_ovn-control-plane _uuid : 3acc2a68-4767-4b74-adec-5648d4889d3f external_ids : {always_learn_from_arp_request="false", logical-router="c4d4770d-7a19-46f6-bac3-bd72d685eb6f", name=GR_ovn-control-plane, snat-ct-zone="0"} load_balancers : [] tunnel_key : 10 ovn-controller on the wrong node: sh-5.2# ovs-appctl -t ovn-controller ct-zone-list 8bb103d2-854c-420a-87cd-7525751740fd_dnat 1 8bb103d2-854c-420a-87cd-7525751740fd_snat 2 01e8a93a-ff57-4d6a-8390-2140c5fe9b50_dnat 3 3acc2a68-4767-4b74-adec-5648d4889d3f_dnat 8 f12103c8-3115-4c2e-8e09-d6677c7296bf_snat 4 01e8a93a-ff57-4d6a-8390-2140c5fe9b50_snat 6 3acc2a68-4767-4b74-adec-5648d4889d3f_snat 9 ----> wrong! 
7c11dfeb-393a-4020-8861-00718efd1a9a_dnat 10 7c11dfeb-393a-4020-8861-00718efd1a9a_snat 11 kube-system_coredns-6d4b75cb6d-lptr5 14 f12103c8-3115-4c2e-8e09-d6677c7296bf_dnat 5 k8s-ovn-control-plane 7 local-path-storage_local-path-provisioner-9cd9bd544-k2d22 13 kube-system_coredns-6d4b75cb6d-tm4ng 12 sh-5.2# routers for which correct zone id was created: sh-5.2# ovn-nbctl list logical-router GR_ovn-worker _uuid : e7a1568a-2598-4b55-8251-b76ca0e29032 copp : ff481dec-8d8b-4db3-bb16-159d116d190f enabled : [] external_ids : {physical_ip="172.19.0.2", physical_ips="172.19.0.2"} load_balancer : [18b78948-c62d-40c9-938d-5435b85951ee, 199fd4f3-f745-41f4-bce0-50a80dc2bca8] load_balancer_group : [7e14296e-428c-44fd-8e4b-31d9c79af7a8] name : GR_ovn-worker nat : [] options : {always_learn_from_arp_request="false", chassis="0195f08e-4a98-41c2-9496-cd391f414ea2", dynamic_neigh_routers="true", lb_force_snat_ip=router_ip, snat-ct-zone="0"} policies : [] ports : [295367d1-e617-4f90-a477-ef47d160099f, 38e921c4-6346-42ae-8f7e-fb8188688e0a] static_routes : [168c453e-43d8-4e9c-acfa-e05b2335f524, ca2191d4-5ba3-47c7-a3ae-c90f41e6ef06] sh-5.2# ovn-sbctl list datapath_binding GR_ovn-worker _uuid : 38c1d94c-c990-4fc8-af32-5ed1acdf2f5e external_ids : {always_learn_from_arp_request="false", logical-router="e7a1568a-2598-4b55-8251-b76ca0e29032", name=GR_ovn-worker, snat-ct-zone="0"} load_balancers : [] tunnel_key : 8 ovn-controller on ovn-worker: sh-5.2# ovs-appctl -t ovn-controller ct-zone-list 86b2f86b-2d64-41e8-b68b-4d9184a95163_snat 2 8bb103d2-854c-420a-87cd-7525751740fd_dnat 1 8bb103d2-854c-420a-87cd-7525751740fd_snat 3 38c1d94c-c990-4fc8-af32-5ed1acdf2f5e_snat 0 f12103c8-3115-4c2e-8e09-d6677c7296bf_snat 6 56998d8a-c461-4a3d-a63f-03d9095d323e_dnat 8 86b2f86b-2d64-41e8-b68b-4d9184a95163_dnat 4 38c1d94c-c990-4fc8-af32-5ed1acdf2f5e_dnat 11 56998d8a-c461-4a3d-a63f-03d9095d323e_snat 10 f12103c8-3115-4c2e-8e09-d6677c7296bf_dnat 5 k8s-ovn-worker 7 sh-5.2# sh-5.2# ovn-nbctl list logical-router 
GR_ovn-worker2 _uuid : 94b9cf6f-0f6a-4270-bf98-f65574a4529c copp : ff481dec-8d8b-4db3-bb16-159d116d190f enabled : [] external_ids : {physical_ip="172.19.0.4", physical_ips="172.19.0.4"} load_balancer : [199fd4f3-f745-41f4-bce0-50a80dc2bca8, b8f064d6-aff6-4db3-935d-0cac2ad4745a] load_balancer_group : [7e14296e-428c-44fd-8e4b-31d9c79af7a8] name : GR_ovn-worker2 nat : [] options : {always_learn_from_arp_request="false", chassis="892c718a-d153-4692-85a2-219f379f2aac", dynamic_neigh_routers="true", lb_force_snat_ip=router_ip, snat-ct-zone="0"} policies : [] ports : [47bb80c8-52a7-4071-9256-4b5575a10b54, af0b8ddb-484d-4699-b17f-dfb544c40df2] static_routes : [708c7945-730c-4e69-9dd0-a9795dac20bd, b1095ea7-6f9d-4bdd-850d-cd1eb2483765] sh-5.2# ovn-sbctl list datapath_binding GR_ovn-worker2 _uuid : 8b11dddd-e65c-43cb-ae5f-dbd038e6f2cd external_ids : {always_learn_from_arp_request="false", logical-router="94b9cf6f-0f6a-4270-bf98-f65574a4529c", name=GR_ovn-worker2, snat-ct-zone="0"} load_balancers : [] tunnel_key : 6 ovn-controller on ovn-worker2: [surya@hidden-temple yaml_debugging]$ oc rsh -n ovn-kubernetes ovs-node-wk7nv sh-5.2# ovs-appctl -t ovn-controller ct-zone-list 208fb755-f761-40e8-a1ec-f061462ecdb6_dnat 2 8bb103d2-854c-420a-87cd-7525751740fd_dnat 3 8b11dddd-e65c-43cb-ae5f-dbd038e6f2cd_dnat 8 8bb103d2-854c-420a-87cd-7525751740fd_snat 4 b128d3a2-6b7e-4617-bfea-db19395ac218_snat 9 f12103c8-3115-4c2e-8e09-d6677c7296bf_snat 6 b128d3a2-6b7e-4617-bfea-db19395ac218_dnat 11 8b11dddd-e65c-43cb-ae5f-dbd038e6f2cd_snat 0 208fb755-f761-40e8-a1ec-f061462ecdb6_snat 1 k8s-ovn-worker2 7 f12103c8-3115-4c2e-8e09-d6677c7296bf_dnat 5 sh-5.2# Version-Release number of selected component (if applicable): 2023-01-11T10:57:50.696Z|00004|main|INFO|OVN internal version is : [22.12.0-20.27.0-70.6] How reproducible: not always happened by chance on a latest kind install Steps to Reproduce: 1. 2. 3. 
Actual results: Expected results: Additional info: Attaching the NB/SB DBs, the control-plane OVS DB, and a live core of ovn-controller.
Created attachment 1937555 [details] NBDB
Created attachment 1937556 [details] OVSDB from wrong node
Attaching gcore: root@ovn-control-plane:/# gcore -o /tmp/ovn-controller.live.core 1866 [New LWP 1867] [New LWP 1868] [New LWP 1870] warning: Expected absolute pathname for libpthread in the inferior, but got target:/lib64/libc.so.6. warning: Unable to find libthread_db matching inferior's thread library, thread debugging will not be available. 0x00007fbd4e48390f in poll () from target:/lib64/libc.so.6 warning: target file /proc/1866/cmdline contained unexpected null characters warning: Memory read failed for corefile section, 4096 bytes at 0xffffffffff600000. Saved corefile /tmp/ovn-controller.live.core.1866 [Inferior 1 (process 1866) detached]
sh-5.2# ovs-appctl -t ovn-controller ct-zone-list 8bb103d2-854c-420a-87cd-7525751740fd_dnat 1 8bb103d2-854c-420a-87cd-7525751740fd_snat 2 01e8a93a-ff57-4d6a-8390-2140c5fe9b50_dnat 3 3acc2a68-4767-4b74-adec-5648d4889d3f_dnat 8 f12103c8-3115-4c2e-8e09-d6677c7296bf_snat 4 01e8a93a-ff57-4d6a-8390-2140c5fe9b50_snat 6 3acc2a68-4767-4b74-adec-5648d4889d3f_snat 9 7c11dfeb-393a-4020-8861-00718efd1a9a_dnat 10 7c11dfeb-393a-4020-8861-00718efd1a9a_snat 11 kube-system_coredns-6d4b75cb6d-lptr5 14 f12103c8-3115-4c2e-8e09-d6677c7296bf_dnat 5 k8s-ovn-control-plane 7 local-path-storage_local-path-provisioner-9cd9bd544-k2d22 13 kube-system_coredns-6d4b75cb6d-tm4ng 12 sh-5.2# ovn-appctl -t ovn-controller recompute sh-5.2# ovs-appctl -t ovn-controller ct-zone-list 8bb103d2-854c-420a-87cd-7525751740fd_dnat 1 8bb103d2-854c-420a-87cd-7525751740fd_snat 2 01e8a93a-ff57-4d6a-8390-2140c5fe9b50_dnat 3 3acc2a68-4767-4b74-adec-5648d4889d3f_dnat 8 f12103c8-3115-4c2e-8e09-d6677c7296bf_snat 4 01e8a93a-ff57-4d6a-8390-2140c5fe9b50_snat 6 3acc2a68-4767-4b74-adec-5648d4889d3f_snat 0 7c11dfeb-393a-4020-8861-00718efd1a9a_dnat 10 7c11dfeb-393a-4020-8861-00718efd1a9a_snat 11 kube-system_coredns-6d4b75cb6d-lptr5 14 f12103c8-3115-4c2e-8e09-d6677c7296bf_dnat 5 k8s-ovn-control-plane 7 local-path-storage_local-path-provisioner-9cd9bd544-k2d22 13 kube-system_coredns-6d4b75cb6d-tm4ng 12
This was spotted with ovn-22.12.0-0.fc36.x86_64 (in kind). Debug symbols from this version are needed to decode the live core.
I have figured out the issue and reproduced it locally. As strange as it may seem, the bug is in the recompute code for CT zones, not in the incremental code. The issue only manifests if the logical router and all of its settings are created in a single transaction and no further changes are then made to the logical router. The problem is that the requested SNAT zone only takes effect if a CT zone is already assigned to the logical router. When everything is created in one transaction, the logical router has no existing CT zone assigned to it, so the requested zone is ignored. Since no further changes are made to the logical router, the incremental code cannot detect the mismatch between the assigned and requested zones and correct the issue. I have a fix in the works that corrects the issue in a sandbox. What I still need to do is write a formal test that proves it before I submit a patch to the mailing list.
I have posted a patch to fix this: https://patchwork.ozlabs.org/project/ovn/patch/20230118133113.1253910-1-mmichels@redhat.com/
ovn22.12 fast-datapath-rhel-9 clone created at https://bugzilla.redhat.com/show_bug.cgi?id=2162817 ovn22.09 fast-datapath-rhel-8 clone created at https://bugzilla.redhat.com/show_bug.cgi?id=2162818 ovn22.09 fast-datapath-rhel-9 clone created at https://bugzilla.redhat.com/show_bug.cgi?id=2162819 ovn22.06 fast-datapath-rhel-8 clone created at https://bugzilla.redhat.com/show_bug.cgi?id=2162820 ovn22.06 fast-datapath-rhel-9 clone created at https://bugzilla.redhat.com/show_bug.cgi?id=2162821 ovn22.03 fast-datapath-rhel-8 clone created at https://bugzilla.redhat.com/show_bug.cgi?id=2162822 ovn22.03 fast-datapath-rhel-9 clone created at https://bugzilla.redhat.com/show_bug.cgi?id=2162823
reproducer: systemctl start openvswitch systemctl start ovn-northd ovn-nbctl set-connection ptcp:6641 ovn-sbctl set-connection ptcp:6642 ovs-vsctl set open . external_ids:system-id=hv1 external_ids:ovn-remote=tcp:127.0.0.1:6642 external_ids:ovn-encap-type=geneve external_ids:ovn-encap-ip=127.0.0.1 systemctl restart ovn-controller ovn-nbctl --wait=hv sync ovn-sbctl list datapath_binding ovn-nbctl lr-add lr0 -- set Logical_Router lr0 options:snat-ct-zone=666 -- lrp-add lr0 lrp-gw 01:00:00:00:00:01 172.16.0.1 -- lrp-set-gateway-chassis lrp-gw hv1 ovn-nbctl --wait=hv sync lr_uuid=$(ovn-sbctl find datapath_bind external_ids:name=lr0 | awk '/_uuid/{print $3}') ct_zones=$(ovn-appctl -t ovn-controller ct-zone-list) zone_num=$(printf "$ct_zones" | grep ${lr_uuid}_snat | cut -d ' ' -f 2) test "$zone_num" -eq 666 echo $? reproduced on ovn22.12-22.12.0-4.el8: [root@wsfd-advnetlab16 bz2160403]# rpm -qa | grep -E "ovn22.12|openvswitch2.17" python3-openvswitch2.17-2.17.0-60.el8fdp.x86_64 ovn22.12-host-22.12.0-4.el8fdp.x86_64 ovn22.12-22.12.0-4.el8fdp.x86_64 openvswitch2.17-2.17.0-60.el8fdp.x86_64 ovn22.12-central-22.12.0-4.el8fdp.x86_64 + ovn-nbctl --wait=hv sync ++ ovn-sbctl find datapath_bind external_ids:name=lr0 ++ awk '/_uuid/{print $3}' + lr_uuid=66e6a0f9-826d-49dd-a404-61ed463f7114 ++ ovn-appctl -t ovn-controller ct-zone-list + ct_zones='66e6a0f9-826d-49dd-a404-61ed463f7114_snat 1 66e6a0f9-826d-49dd-a404-61ed463f7114_dnat 2' ++ printf '66e6a0f9-826d-49dd-a404-61ed463f7114_snat 1 66e6a0f9-826d-49dd-a404-61ed463f7114_dnat 2' ++ cut -d ' ' -f 2 ++ grep 66e6a0f9-826d-49dd-a404-61ed463f7114_snat + zone_num=1 + test 1 -eq 666 <=== snat ct zone is not the snat-ct-zone + echo 1 1 Verified on ovn22.12-22.12.0-20.el8: [root@wsfd-advnetlab16 bz2160403]# rpm -qa | grep -E "ovn22.12|openvswitch2.17" python3-openvswitch2.17-2.17.0-60.el8fdp.x86_64 ovn22.12-central-22.12.0-20.el8fdp.x86_64 openvswitch2.17-2.17.0-60.el8fdp.x86_64 ovn22.12-host-22.12.0-20.el8fdp.x86_64 
ovn22.12-22.12.0-20.el8fdp.x86_64 + ovn-nbctl --wait=hv sync ++ ovn-sbctl find datapath_bind external_ids:name=lr0 ++ awk '/_uuid/{print $3}' + lr_uuid=db0defa1-ff88-4252-be9a-afba5261fe70 ++ ovn-appctl -t ovn-controller ct-zone-list + ct_zones='db0defa1-ff88-4252-be9a-afba5261fe70_dnat 1 db0defa1-ff88-4252-be9a-afba5261fe70_snat 666' ++ printf 'db0defa1-ff88-4252-be9a-afba5261fe70_dnat 1 db0defa1-ff88-4252-be9a-afba5261fe70_snat 666' ++ grep db0defa1-ff88-4252-be9a-afba5261fe70_snat ++ cut -d ' ' -f 2 + zone_num=666 + test 666 -eq 666 <=== snat ct zone is the snat-ct-zone + echo 0 0