Description of problem: Cluster install failed on IBM Cloud IPI DaemonSet "openshift-ovn-kubernetes/ovnkube-master" rollout is not making progress - pod ovnkube-master-xcv8v is in CrashLoopBackOff State Version-Release number of selected component (if applicable): 4.10.0-0.ci-2021-12-16-184808 How reproducible: Once Steps to Reproduce: 1. install cluster 2. check OVN pods 3. Actual results: ovnkube-master-xcv8v ovn-dbchecker log + [[ -f /env/_master ]] ++ date '+%m%d %H:%M:%S.%N' + echo 'I1217 01:18:20.422223585 - ovn-dbchecker - start ovn-dbchecker' I1217 01:18:20.422223585 - ovn-dbchecker - start ovn-dbchecker + exec /usr/bin/ovndbchecker --config-file=/run/ovnkube-config/ovnkube.conf --loglevel 4 --sb-address ssl:10.240.1.5:9642,ssl:10.240.129.6:9642,ssl:10.240.65.6:9642 --sb-client-privkey /ovn-cert/tls.key --sb-client-cert /ovn-cert/tls.crt --sb-client-cacert /ovn-ca/ca-bundle.crt --sb-cert-common-name ovn --sb-raft-election-timer 16 --nb-address ssl:10.240.1.5:9641,ssl:10.240.129.6:9641,ssl:10.240.65.6:9641 --nb-client-privkey /ovn-cert/tls.key --nb-client-cert /ovn-cert/tls.crt --nb-client-cacert /ovn-ca/ca-bundle.crt --nb-cert-common-name ovn --nb-raft-election-timer 10 I1217 01:18:20.549618 1 ovs.go:95] Maximum command line arguments set to: 191102 I1217 01:18:20.550561 1 config.go:1615] Parsed config file /run/ovnkube-config/ovnkube.conf I1217 01:18:20.550592 1 config.go:1616] Parsed config: {Default:{MTU:1400 RoutableMTU:0 ConntrackZone:64000 EncapType:geneve EncapIP: EncapPort:6081 InactivityProbe:100000 OpenFlowProbe:180 MonitorAll:true LFlowCacheEnable:true LFlowCacheLimit:0 LFlowCacheLimitKb:1048576 RawClusterSubnets:10.128.0.0/14/23 ClusterSubnets:[]} Logging:{File: CNIFile: Level:4 LogFileMaxSize:100 LogFileMaxBackups:5 LogFileMaxAge:5 ACLLoggingRateLimit:20} Monitoring:{RawNetFlowTargets: RawSFlowTargets: RawIPFIXTargets: NetFlowTargets:[] SFlowTargets:[] IPFIXTargets:[]} CNI:{ConfDir:/etc/cni/net.d Plugin:ovn-k8s-cni-overlay} OVNKubernetesFeature:{EnableEgressIP:true EnableEgressFirewall:true} Kubernetes:{Kubeconfig: CACert: CAData:[] APIServer:https://api-int.rbrattai-o410i46.ibmcloud.qe.devcluster.openshift.com:6443 Token: CompatServiceCIDR: RawServiceCIDRs:172.30.0.0/16 ServiceCIDRs:[] OVNConfigNamespace:openshift-ovn-kubernetes MetricsBindAddress: OVNMetricsBindAddress: MetricsEnablePprof:false OVNEmptyLbEvents:false PodIP: RawNoHostSubnetNodes: NoHostSubnetNodes:nil HostNetworkNamespace:openshift-host-network PlatformType:} OvnNorth:{Address: PrivKey: Cert: CACert: CertCommonName: Scheme: ElectionTimer:0 northbound:false exec:<nil>} OvnSouth:{Address: PrivKey: Cert: CACert: CertCommonName: Scheme: ElectionTimer:0 northbound:false exec:<nil>} Gateway:{Mode:shared Interface: EgressGWInterface: NextHop: VLANID:0 NodeportEnable:true DisableSNATMultipleGWs:false V4JoinSubnet:100.64.0.0/16 V6JoinSubnet:fd98::/64 DisablePacketMTUCheck:false RouterSubnet:} MasterHA:{ElectionLeaseDuration:60 ElectionRenewDeadline:30 ElectionRetryPeriod:20} HybridOverlay:{Enabled:false RawClusterSubnets: ClusterSubnets:[] VXLANPort:4789} OvnKubeNode:{Mode:full MgmtPortNetdev: DisableOVNIfaceIdVer:false}} I1217 01:18:20.561110 1 ovndbmanager.go:43] Starting DB Checker to ensure cluster membership and DB consistency F1217 01:18:20.570344 1 ovndbmanager.go:49] NBDB Upgrade failed: %!w(*fmt.wrapError=&{failed to get schema version for NBDB, stderr: "ovsdb-client: transaction returned error: {\"details\":\"get_schema request specifies database OVN_Northbound which is not yet available because it 
has not completed joining its cluster\",\"error\":\"database not available\"}\n", error: OVN command '/usr/bin/ovsdb-client -t 10 get-schema-version unix:/var/run/ovn/ovnnb_db.sock OVN_Northbound' failed: exit status 1 0xc0004a62f0}) goroutine 52 [running]: k8s.io/klog/v2.stacks(0xc000012001, 0xc000134000, 0x20a, 0x35d) /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:1038 +0xb9 k8s.io/klog/v2.(*loggingT).output(0x2653e20, 0xc000000003, 0x0, 0xc0004aecb0, 0x0, 0x1f1314f, 0xf, 0x31, 0x0) /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:987 +0x1ff k8s.io/klog/v2.(*loggingT).printf(0x2653e20, 0xc000000003, 0x0, 0x0, 0x0, 0x199f9e8, 0x17, 0xc0004a6310, 0x1, 0x1) /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:753 +0x18d k8s.io/klog/v2.Fatalf(...) /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:1532 github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker.func1(0xc000329870, 0x1bcf158, 0xc000479640, 0xc000476480) /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:49 +0x157 created by github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:46 +0x150 goroutine 1 [chan receive]: main.runOvnKubeDBChecker(0xc000229680, 0x0, 0x0) /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:198 +0x3f5 main.main.func1(0xc000229680, 0x0, 0x80) /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:90 +0x2b github.com/urfave/cli/v2.(*App).RunContext(0xc000364300, 0x1bb1d18, 0xc0003ae000, 0xc00003e1c0, 0x1c, 0x1c, 0x0, 0x0) /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/github.com/urfave/cli/v2/app.go:315 +0x6fe main.main() /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:117 +0x8a7 goroutine 6 [chan receive]: k8s.io/klog/v2.(*loggingT).flushDaemon(0x2653e20) /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:1181 +0x8b created by k8s.io/klog/v2.init.0 /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:420 +0xdf goroutine 20 [syscall]: os/signal.signal_recv(0x0) /usr/lib/golang/src/runtime/sigqueue.go:168 +0xa5 os/signal.loop() /usr/lib/golang/src/os/signal/signal_unix.go:23 +0x25 created by os/signal.Notify.func1.1 /usr/lib/golang/src/os/signal/signal.go:151 +0x45 goroutine 11 [select]: main.main.func3(0xc0003ba060, 0xc0004a6150, 0x1bb1d18, 0xc0003ae000) /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:109 +0xb3 created by main.main /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:108 +0x858 goroutine 51 [chan receive]: github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker(0x1bcf158, 0xc000479640, 0xc000476480) /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:62 +0x1ca created by main.runOvnKubeDBChecker /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:190 +0x3c5 goroutine 53 [syscall]: syscall.Syscall6(0xf7, 0x1, 0x1b, 0xc000070948, 0x1000004, 0x0, 0x0, 0xc00004c4e0, 0x190e680, 0xc000060c00) /usr/lib/golang/src/syscall/asm_linux_amd64.s:43 +0x5 os.(*Process).blockUntilWaitable(0xc0004bca20, 
0x6, 0x6, 0x4fc328) /usr/lib/golang/src/os/wait_waitid.go:32 +0x9e os.(*Process).wait(0xc0004bca20, 0x10000c000070b40, 0x1a5ebd8, 0x1a5ebe0) /usr/lib/golang/src/os/exec_unix.go:22 +0x39 os.(*Process).Wait(...) /usr/lib/golang/src/os/exec.go:129 os/exec.(*Cmd).Wait(0xc0003b6c60, 0x0, 0x0) /usr/lib/golang/src/os/exec/exec.go:507 +0x65 os/exec.(*Cmd).Run(0xc0003b6c60, 0xc000070b40, 0x414418) /usr/lib/golang/src/os/exec/exec.go:341 +0x5f k8s.io/utils/exec.(*cmdWrapper).Run(0xc0003b6c60, 0xc00004c2c0, 0xc0000ccc80) /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/utils/exec/exec.go:161 +0x2f github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util.(*defaultExecRunner).RunCmd(0x2683660, 0x1bc8e40, 0xc0003b6c60, 0xc0004cb428, 0x15, 0x0, 0x0, 0x0, 0xc0004236d0, 0x5, ...) /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/util/ovs.go:208 +0x344 github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util.runWithEnvVars(0xc0004cb428, 0x15, 0x0, 0x0, 0x0, 0xc0004236d0, 0x5, 0x5, 0x1, 0x7, ...) /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/util/ovs.go:356 +0x10e github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util.runOVNretry(0xc0004cb428, 0x15, 0x0, 0x0, 0x0, 0xc0004236d0, 0x5, 0x5, 0x7, 0xc000060c00, ...) /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/util/ovs.go:416 +0xbe github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util.RunOVSDBClient(0xc0004236d0, 0x5, 0x5, 0x7, 0x7, 0x0, 0x0, 0x0, 0x0) /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/util/ovs.go:608 +0x8a github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.upgradeDBSchema(0x19a93ae, 0x1f, 0x19aa76b, 0x1f, 0x1993f53, 0xe, 0x0, 0x0) /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:416 +0x2ad github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.upgradeSBDBSchema(...) /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:402 github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker.func2(0xc000329870, 0x1bcf158, 0xc000479640, 0xc000476480) /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:57 +0x8f created by github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:55 +0x1b0 goroutine 56 [IO wait]: internal/poll.runtime_pollWait(0x7f5de02b3570, 0x72, 0xffffffffffffffff) /usr/lib/golang/src/runtime/netpoll.go:222 +0x55 internal/poll.(*pollDesc).wait(0xc00048f458, 0x72, 0x201, 0x200, 0xffffffffffffffff) /usr/lib/golang/src/internal/poll/fd_poll_runtime.go:87 +0x45 internal/poll.(*pollDesc).waitRead(...) /usr/lib/golang/src/internal/poll/fd_poll_runtime.go:92 internal/poll.(*FD).Read(0xc00048f440, 0xc0004f2a00, 0x200, 0x200, 0x0, 0x0, 0x0) /usr/lib/golang/src/internal/poll/fd_unix.go:166 +0x1d5 os.(*File).read(...) /usr/lib/golang/src/os/file_posix.go:31 os.(*File).Read(0xc000462188, 0xc0004f2a00, 0x200, 0x200, 0x0, 0x0, 0x0) /usr/lib/golang/src/os/file.go:117 +0x77 bytes.(*Buffer).ReadFrom(0xc000313770, 0x1b83ce0, 0xc000462188, 0x7f5dd90f0278, 0xc000313770, 0x1) /usr/lib/golang/src/bytes/buffer.go:204 +0xbe io.copyBuffer(0x1b818e0, 0xc000313770, 0x1b83ce0, 0xc000462188, 0x0, 0x0, 0x0, 0x404040404030302, 0x0, 0x0) /usr/lib/golang/src/io/io.go:409 +0x357 io.Copy(...) 
/usr/lib/golang/src/io/io.go:382 os/exec.(*Cmd).writerDescriptor.func1(0x0, 0x0) /usr/lib/golang/src/os/exec/exec.go:311 +0x65 os/exec.(*Cmd).Start.func1(0xc0003b6c60, 0xc00004c300) /usr/lib/golang/src/os/exec/exec.go:441 +0x27 created by os/exec.(*Cmd).Start /usr/lib/golang/src/os/exec/exec.go:440 +0x630 goroutine 57 [IO wait]: internal/poll.runtime_pollWait(0x7f5de02b3000, 0x72, 0xffffffffffffffff) /usr/lib/golang/src/runtime/netpoll.go:222 +0x55 internal/poll.(*pollDesc).wait(0xc00048f518, 0x72, 0x201, 0x200, 0xffffffffffffffff) /usr/lib/golang/src/internal/poll/fd_poll_runtime.go:87 +0x45 internal/poll.(*pollDesc).waitRead(...) /usr/lib/golang/src/internal/poll/fd_poll_runtime.go:92 internal/poll.(*FD).Read(0xc00048f500, 0xc0004f2c00, 0x200, 0x200, 0x0, 0x0, 0x0) /usr/lib/golang/src/internal/poll/fd_unix.go:166 +0x1d5 os.(*File).read(...) /usr/lib/golang/src/os/file_posix.go:31 os.(*File).Read(0xc0004621a0, 0xc0004f2c00, 0x200, 0x200, 0x0, 0x0, 0x0) /usr/lib/golang/src/os/file.go:117 +0x77 bytes.(*Buffer).ReadFrom(0xc0003137a0, 0x1b83ce0, 0xc0004621a0, 0x7f5dd90f0278, 0xc0003137a0, 0x1) /usr/lib/golang/src/bytes/buffer.go:204 +0xbe io.copyBuffer(0x1b818e0, 0xc0003137a0, 0x1b83ce0, 0xc0004621a0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0) /usr/lib/golang/src/io/io.go:409 +0x357 io.Copy(...) /usr/lib/golang/src/io/io.go:382 os/exec.(*Cmd).writerDescriptor.func1(0x0, 0x0) /usr/lib/golang/src/os/exec/exec.go:311 +0x65 os/exec.(*Cmd).Start.func1(0xc0003b6c60, 0xc00004c4c0) /usr/lib/golang/src/os/exec/exec.go:441 +0x27 created by os/exec.(*Cmd).Start /usr/lib/golang/src/os/exec/exec.go:440 +0x630 sbdb: 2021-12-17T01:14:00.918Z|00670|reconnect|WARN|unix#1920: connection dropped (Connection reset by peer) 2021-12-17T01:14:30.946Z|00671|jsonrpc|WARN|unix#1926: receive error: Connection reset by peer 2021-12-17T01:14:30.946Z|00672|reconnect|WARN|unix#1926: connection dropped (Connection reset by peer) 2021-12-17T01:15:00.966Z|00673|jsonrpc|WARN|unix#1932: receive error: Connection reset by peer 2021-12-17T01:15:00.966Z|00674|reconnect|WARN|unix#1932: connection dropped (Connection reset by peer) 2021-12-17T01:15:30.985Z|00675|jsonrpc|WARN|unix#1938: receive error: Connection reset by peer 2021-12-17T01:15:30.985Z|00676|reconnect|WARN|unix#1938: connection dropped (Connection reset by peer) 2021-12-17T01:16:01.005Z|00677|jsonrpc|WARN|unix#1944: receive error: Connection reset by peer 2021-12-17T01:16:01.006Z|00678|reconnect|WARN|unix#1944: connection dropped (Connection reset by peer) 2021-12-17T01:16:31.025Z|00679|jsonrpc|WARN|unix#1950: receive error: Connection reset by peer 2021-12-17T01:16:31.025Z|00680|reconnect|WARN|unix#1950: connection dropped (Connection reset by peer) 2021-12-17T01:17:01.049Z|00681|jsonrpc|WARN|unix#1956: receive error: Connection reset by peer 2021-12-17T01:17:01.050Z|00682|reconnect|WARN|unix#1956: connection dropped (Connection reset by peer) 2021-12-17T01:18:01.107Z|00683|jsonrpc|WARN|unix#1968: receive error: Connection reset by peer 2021-12-17T01:18:01.107Z|00684|reconnect|WARN|unix#1968: connection dropped (Connection reset by peer) Expected results: CNO not degraded.
oc rsh -c nbdb ovnkube-master-xcv8v sh-4.4# ovn-nbctl -v show 2021-12-17T04:53:00Z|00001|reconnect|DBG|unix:/var/run/ovn/ovnnb_db.sock: entering BACKOFF 2021-12-17T04:53:00Z|00002|ovn_dbctl|DBG|Called as ovn-nbctl -v show 2021-12-17T04:53:00Z|00003|reconnect|INFO|unix:/var/run/ovn/ovnnb_db.sock: connecting... 2021-12-17T04:53:00Z|00004|reconnect|DBG|unix:/var/run/ovn/ovnnb_db.sock: entering CONNECTING 2021-12-17T04:53:00Z|00005|ovsdb_cs|DBG|unix:/var/run/ovn/ovnnb_db.sock: SERVER_SCHEMA_REQUESTED -> SERVER_SCHEMA_REQUESTED at lib/ovsdb-cs.c:419 2021-12-17T04:53:00Z|00006|poll_loop|DBG|wakeup due to [POLLOUT] on fd 3 (<->/var/run/ovn/ovnnb_db.sock) at lib/stream-fd.c:153 2021-12-17T04:53:00Z|00007|reconnect|INFO|unix:/var/run/ovn/ovnnb_db.sock: connected 2021-12-17T04:53:00Z|00008|reconnect|DBG|unix:/var/run/ovn/ovnnb_db.sock: entering ACTIVE 2021-12-17T04:53:00Z|00009|jsonrpc|DBG|unix:/var/run/ovn/ovnnb_db.sock: send request, method="get_schema", params=["_Server"], id=1 2021-12-17T04:53:00Z|00010|ovsdb_cs|DBG|unix:/var/run/ovn/ovnnb_db.sock: SERVER_SCHEMA_REQUESTED -> SERVER_SCHEMA_REQUESTED at lib/ovsdb-cs.c:419 2021-12-17T04:53:00Z|00011|poll_loop|DBG|wakeup due to [POLLIN] on fd 3 (<->/var/run/ovn/ovnnb_db.sock) at lib/stream-fd.c:157 2021-12-17T04:53:00Z|00012|jsonrpc|DBG|unix:/var/run/ovn/ovnnb_db.sock: received reply, result={"cksum":"3009684573 744","name":"_Server","version":"1.2.0","tables":{"Database":{"columns":{"model":{"type":{"key":{"type":"string","enum":["set",["clustered","relay","standalone"]]}}},"name":{"type":"string"},"connected":{"type":"boolean"},"leader":{"type":"boolean"},"schema":{"type":{"min":0,"key":"string"}},"sid":{"type":{"min":0,"key":"uuid"}},"cid":{"type":{"min":0,"key":"uuid"}},"index":{"type":{"min":0,"key":"integer"}}}}}}, id=1 2021-12-17T04:53:00Z|00013|jsonrpc|DBG|unix:/var/run/ovn/ovnnb_db.sock: send request, method="monitor_cond", params=["_Server",["monid","_Server"],{"Database":[{"columns":["name","model","connected","leader","schema","cid","index"]}]}], id=2 2021-12-17T04:53:00Z|00014|ovsdb_cs|DBG|unix:/var/run/ovn/ovnnb_db.sock: SERVER_SCHEMA_REQUESTED -> SERVER_MONITOR_REQUESTED at lib/ovsdb-cs.c:452 2021-12-17T04:53:00Z|00015|poll_loop|DBG|wakeup due to [POLLIN] on fd 3 (<->/var/run/ovn/ovnnb_db.sock) at lib/stream-fd.c:157 2021-12-17T04:53:00Z|00016|jsonrpc|DBG|unix:/var/run/ovn/ovnnb_db.sock: received reply, result={"Database":{"a42cfa87-ffd9-4aae-8e99-a103b50cd9cd":{"initial":{"name":"OVN_Northbound","model":"clustered","cid":["uuid","2f6f1fd2-8c24-4b23-9329-c4914a6c2a68"]}},"fee7b314-c69e-47f3-a83a-e768cedb503d":{"initial":{"schema":"{\"cksum\":\"3009684573 744\",\"name\":\"_Server\",\"tables\":{\"Database\":{\"columns\":{\"cid\":{\"type\":{\"key\":\"uuid\",\"min\":0}},\"connected\":{\"type\":\"boolean\"},\"index\":{\"type\":{\"key\":\"integer\",\"min\":0}},\"leader\":{\"type\":\"boolean\"},\"model\":{\"type\":{\"key\":{\"enum\":[\"set\",[\"clustered\",\"relay\",\"standalone\"]],\"type\":\"string\"}}},\"name\":{\"type\":\"string\"},\"schema\":{\"type\":{\"key\":\"string\",\"min\":0}},\"sid\":{\"type\":{\"key\":\"uuid\",\"min\":0}}}}},\"version\":\"1.2.0\"}","name":"_Server","model":"standalone","connected":true,"leader":true}}}}, id=2 2021-12-17T04:53:00Z|00017|ovsdb_cs|INFO|unix:/var/run/ovn/ovnnb_db.sock: clustered database server has not yet joined cluster; trying another server 2021-12-17T04:53:00Z|00018|reconnect|DBG|unix:/var/run/ovn/ovnnb_db.sock: entering RECONNECT 
2021-12-17T04:53:00Z|00019|ovsdb_cs|DBG|unix:/var/run/ovn/ovnnb_db.sock: SERVER_MONITOR_REQUESTED -> RETRY at lib/ovsdb-cs.c:1994 2021-12-17T04:53:00Z|00020|poll_loop|DBG|wakeup due to 0-ms timeout at lib/reconnect.c:664 2021-12-17T04:53:00Z|00021|reconnect|INFO|unix:/var/run/ovn/ovnnb_db.sock: connection attempt timed out 2021-12-17T04:53:00Z|00022|reconnect|DBG|unix:/var/run/ovn/ovnnb_db.sock: entering BACKOFF 2021-12-17T04:53:00Z|00023|ovsdb_cs|DBG|unix:/var/run/ovn/ovnnb_db.sock: RETRY -> SERVER_SCHEMA_REQUESTED at lib/ovsdb-cs.c:419 ovn-nbctl: unix:/var/run/ovn/ovnnb_db.sock: database connection failed ()
Occured again on 4.10.0-0.nightly-2022-01-25-023600 2022-01-26T15:27:18.555295002Z + [[ -f /env/_master ]] 2022-01-26T15:27:18.555666191Z ++ date '+%m%d %H:%M:%S.%N' 2022-01-26T15:27:18.557786986Z + echo 'I0126 15:27:18.557140368 - ovn-dbchecker - start ovn-dbchecker' 2022-01-26T15:27:18.557840431Z I0126 15:27:18.557140368 - ovn-dbchecker - start ovn-dbchecker 2022-01-26T15:27:18.558035891Z + exec /usr/bin/ovndbchecker --config-file=/run/ovnkube-config/ovnkube.conf --loglevel 4 --sb-address ssl:10.242.0.7:9642,ssl:10.242.128.6:9642,ssl:10.242.65.4:9642 --sb-client-privkey /ovn-cert/tls.key --sb-client-cert /ovn-cert/tls.crt --sb-client-cacert /ovn-ca/ca-bundle.crt --sb-cert-common-name ovn --sb-raft-election-timer 16 --nb-address ssl:10.242.0.7:9641,ssl:10.242.128.6:9641,ssl:10.242.65.4:9641 --nb-client-privkey /ovn-cert/tls.key --nb-client-cert /ovn-cert/tls.crt --nb-client-cacert /ovn-ca/ca-bundle.crt --nb-cert-common-name ovn --nb-raft-election-timer 10 2022-01-26T15:27:18.682485667Z I0126 15:27:18.682401 1 ovs.go:93] Maximum command line arguments set to: 191102 2022-01-26T15:27:18.683531314Z I0126 15:27:18.683500 1 config.go:1674] Parsed config file /run/ovnkube-config/ovnkube.conf 2022-01-26T15:27:18.683610331Z I0126 15:27:18.683518 1 config.go:1675] Parsed config: {Default:{MTU:1400 RoutableMTU:0 ConntrackZone:64000 EncapType:geneve EncapIP: EncapPort:6081 InactivityProbe:100000 OpenFlowProbe:180 MonitorAll:true LFlowCacheEnable:true LFlowCacheLimit:0 LFlowCacheLimitKb:1048576 RawClusterSubnets:10.128.0.0/14/23 ClusterSubnets:[]} Logging:{File: CNIFile: Level:4 LogFileMaxSize:100 LogFileMaxBackups:5 LogFileMaxAge:5 ACLLoggingRateLimit:20} Monitoring:{RawNetFlowTargets: RawSFlowTargets: RawIPFIXTargets: NetFlowTargets:[] SFlowTargets:[] IPFIXTargets:[]} IPFIX:{Sampling:400 CacheActiveTimeout:60 CacheMaxFlows:0} CNI:{ConfDir:/etc/cni/net.d Plugin:ovn-k8s-cni-overlay} OVNKubernetesFeature:{EnableEgressIP:true EnableEgressFirewall:true} Kubernetes:{Kubeconfig: CACert: CAData:[] APIServer:https://api-int.rbrattai-o410i37.ibmcloud.qe.devcluster.openshift.com:6443 Token: CompatServiceCIDR: RawServiceCIDRs:172.30.0.0/16 ServiceCIDRs:[] OVNConfigNamespace:openshift-ovn-kubernetes MetricsBindAddress: OVNMetricsBindAddress: MetricsEnablePprof:false OVNEmptyLbEvents:false PodIP: RawNoHostSubnetNodes: NoHostSubnetNodes:nil HostNetworkNamespace:openshift-host-network PlatformType:IBMCloud} OvnNorth:{Address: PrivKey: Cert: CACert: CertCommonName: Scheme: ElectionTimer:0 northbound:false exec:<nil>} OvnSouth:{Address: PrivKey: Cert: CACert: CertCommonName: Scheme: ElectionTimer:0 northbound:false exec:<nil>} Gateway:{Mode:shared Interface: EgressGWInterface: NextHop: VLANID:0 NodeportEnable:true DisableSNATMultipleGWs:false V4JoinSubnet:100.64.0.0/16 V6JoinSubnet:fd98::/64 DisablePacketMTUCheck:false RouterSubnet:} MasterHA:{ElectionLeaseDuration:60 ElectionRenewDeadline:30 ElectionRetryPeriod:20} HybridOverlay:{Enabled:false RawClusterSubnets: ClusterSubnets:[] VXLANPort:4789} OvnKubeNode:{Mode:full MgmtPortNetdev: DisableOVNIfaceIdVer:false}} 2022-01-26T15:27:18.694365062Z I0126 15:27:18.694320 1 ovndbmanager.go:43] Starting DB Checker to ensure cluster membership and DB consistency 2022-01-26T15:27:18.705020053Z F0126 15:27:18.704960 1 ovndbmanager.go:49] NBDB Upgrade failed: %!w(*fmt.wrapError=&{failed to get schema version for NBDB, stderr: "ovsdb-client: transaction returned error: {\"details\":\"get_schema request specifies database OVN_Northbound which is not yet available because it has 
not completed joining its cluster\",\"error\":\"database not available\"}\n", error: OVN command '/usr/bin/ovsdb-client -t 10 get-schema-version unix:/var/run/ovn/ovnnb_db.sock OVN_Northbound' failed: exit status 1 0xc0001d2020}) 2022-01-26T15:27:18.705213984Z goroutine 52 [running]: 2022-01-26T15:27:18.705213984Z k8s.io/klog/v2.stacks(0x1) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:1038 +0x8a 2022-01-26T15:27:18.705213984Z k8s.io/klog/v2.(*loggingT).output(0x2503620, 0x3, 0x0, 0xc00053cf50, 0x0, {0x1c93ac6, 0x1}, 0xc0001d2040, 0x0) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:987 +0x5fd 2022-01-26T15:27:18.705213984Z k8s.io/klog/v2.(*loggingT).printf(0x1f, 0x1701748, 0x0, {0x0, 0x0}, {0x170cce0, 0x17}, {0xc0001d2040, 0x1, 0x1}) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:753 +0x1c5 2022-01-26T15:27:18.705213984Z k8s.io/klog/v2.Fatalf(...) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:1532 2022-01-26T15:27:18.705213984Z github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker.func1() 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:49 +0x14e 2022-01-26T15:27:18.705213984Z created by github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:46 +0x195 2022-01-26T15:27:18.705213984Z 2022-01-26T15:27:18.705213984Z goroutine 1 [chan receive]: 2022-01-26T15:27:18.705213984Z main.runOvnKubeDBChecker(0xc00047bb40) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:198 +0x37a 2022-01-26T15:27:18.705213984Z main.main.func1(0xc000332800) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:90 +0x19 2022-01-26T15:27:18.705213984Z github.com/urfave/cli/v2.(*App).RunContext(0xc00046a000, {0x190e878, 0xc00047a000}, {0xc0001ac000, 0x1c, 0x1c}) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/github.com/urfave/cli/v2/app.go:315 +0x8ac 2022-01-26T15:27:18.705213984Z main.main() 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:117 +0xad8 2022-01-26T15:27:18.705213984Z 2022-01-26T15:27:18.705213984Z goroutine 19 [chan receive]: 2022-01-26T15:27:18.705213984Z k8s.io/klog/v2.(*loggingT).flushDaemon(0x0) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:1181 +0x6a 2022-01-26T15:27:18.705213984Z created by k8s.io/klog/v2.init.0 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/klog/v2/klog.go:420 +0xfb 2022-01-26T15:27:18.705213984Z 2022-01-26T15:27:18.705213984Z goroutine 35 [syscall]: 2022-01-26T15:27:18.705213984Z os/signal.signal_recv() 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/runtime/sigqueue.go:169 +0x98 2022-01-26T15:27:18.705213984Z os/signal.loop() 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/signal/signal_unix.go:24 +0x19 2022-01-26T15:27:18.705213984Z created by 
os/signal.Notify.func1.1 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/signal/signal.go:151 +0x2c 2022-01-26T15:27:18.705213984Z 2022-01-26T15:27:18.705213984Z goroutine 11 [select]: 2022-01-26T15:27:18.705213984Z main.main.func3() 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:109 +0xa7 2022-01-26T15:27:18.705213984Z created by main.main 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:108 +0xaa6 2022-01-26T15:27:18.705213984Z 2022-01-26T15:27:18.705213984Z goroutine 51 [chan receive]: 2022-01-26T15:27:18.705213984Z github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker({0x194b9f8, 0xc000337cc0}, 0xc000182d80) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:62 +0x245 2022-01-26T15:27:18.705213984Z created by main.runOvnKubeDBChecker 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/cmd/ovndbchecker/ovndbchecker.go:190 +0x35e 2022-01-26T15:27:18.705213984Z 2022-01-26T15:27:18.705213984Z goroutine 53 [syscall]: 2022-01-26T15:27:18.705213984Z syscall.Syscall6(0xf7, 0x1, 0x1a, 0xc0001bc9d0, 0x1000004, 0x0, 0x0) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/syscall/asm_linux_amd64.s:43 +0x5 2022-01-26T15:27:18.705213984Z os.(*Process).blockUntilWaitable(0xc0004c6c30) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/wait_waitid.go:33 +0x9c 2022-01-26T15:27:18.705213984Z os.(*Process).wait(0xc0004c6c30) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec_unix.go:23 +0x28 2022-01-26T15:27:18.705213984Z os.(*Process).Wait(...) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec.go:132 2022-01-26T15:27:18.705213984Z os/exec.(*Cmd).Wait(0xc0004c1080) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec/exec.go:507 +0x54 2022-01-26T15:27:18.705213984Z os/exec.(*Cmd).Run(0xc0001bcb90) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec/exec.go:341 +0x39 2022-01-26T15:27:18.705213984Z k8s.io/utils/exec.(*cmdWrapper).Run(0xc000182de0) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/vendor/k8s.io/utils/exec/exec.go:161 +0x1d 2022-01-26T15:27:18.705213984Z github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util.(*defaultExecRunner).RunCmd(0xc0005434a0, {0x193f580, 0xc0004c1080}, {0xc00029be90, 0x15}, {0x0, 0x4, 0x2e}, {0xc0005434a0, 0x5, ...}) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/util/ovs.go:206 +0x312 2022-01-26T15:27:18.705213984Z github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util.runWithEnvVars({0xc00029be90, 0x15}, {0x0, 0x0, 0x0}, {0xc0005434a0, 0x5, 0x5}) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/util/ovs.go:354 +0xde 2022-01-26T15:27:18.705213984Z github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util.runOVNretry({0xc00029be90, 0x15}, {0x0, 0x0, 0x0}, {0xc0005434a0, 0x5, 0x5}) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/util/ovs.go:415 +0xcc 2022-01-26T15:27:18.705213984Z github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util.RunOVSDBClient({0xc0005434a0, 0x1f, 0x0}) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/util/ovs.go:570 +0x4d 2022-01-26T15:27:18.705213984Z 
github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.upgradeDBSchema({0x171679d, 0x0}, {0x1717b5a, 0x1f}, {0x1701756, 0x0}) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:416 +0x233 2022-01-26T15:27:18.705213984Z github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.upgradeSBDBSchema(...) 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:402 2022-01-26T15:27:18.705213984Z github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker.func2() 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:57 +0xb6 2022-01-26T15:27:18.705213984Z created by github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovndbmanager.RunDBChecker 2022-01-26T15:27:18.705213984Z /go/src/github.com/openshift/ovn-kubernetes/go-controller/pkg/ovndbmanager/ovndbmanager.go:55 +0x235 2022-01-26T15:27:18.705213984Z 2022-01-26T15:27:18.705213984Z goroutine 56 [IO wait]: 2022-01-26T15:27:18.705213984Z internal/poll.runtime_pollWait(0x7f034abce5d8, 0x72) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/runtime/netpoll.go:229 +0x89 2022-01-26T15:27:18.705213984Z internal/poll.(*pollDesc).wait(0xc0004cd8c0, 0xc000547000, 0x1) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/internal/poll/fd_poll_runtime.go:84 +0x32 2022-01-26T15:27:18.705213984Z internal/poll.(*pollDesc).waitRead(...) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/internal/poll/fd_poll_runtime.go:89 2022-01-26T15:27:18.705213984Z internal/poll.(*FD).Read(0xc0004cd8c0, {0xc000547000, 0x200, 0x200}) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/internal/poll/fd_unix.go:167 +0x25a 2022-01-26T15:27:18.705213984Z os.(*File).read(...) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/file_posix.go:32 2022-01-26T15:27:18.705213984Z os.(*File).Read(0xc000012178, {0xc000547000, 0x4687ce, 0xc0000c96a0}) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/file.go:119 +0x5e 2022-01-26T15:27:18.705213984Z bytes.(*Buffer).ReadFrom(0xc000319fb0, {0x18e8ae0, 0xc000012178}) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/bytes/buffer.go:204 +0x98 2022-01-26T15:27:18.705213984Z io.copyBuffer({0x18e66e0, 0xc000319fb0}, {0x18e8ae0, 0xc000012178}, {0x0, 0x0, 0x0}) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/io/io.go:409 +0x14b 2022-01-26T15:27:18.705213984Z io.Copy(...) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/io/io.go:382 2022-01-26T15:27:18.705213984Z os/exec.(*Cmd).writerDescriptor.func1() 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec/exec.go:311 +0x3a 2022-01-26T15:27:18.705213984Z os/exec.(*Cmd).Start.func1(0x0) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec/exec.go:441 +0x25 2022-01-26T15:27:18.705213984Z created by os/exec.(*Cmd).Start 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec/exec.go:440 +0x80d 2022-01-26T15:27:18.705213984Z 2022-01-26T15:27:18.705213984Z goroutine 57 [IO wait]: 2022-01-26T15:27:18.705213984Z internal/poll.runtime_pollWait(0x7f034abcdf80, 0x72) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/runtime/netpoll.go:229 +0x89 2022-01-26T15:27:18.705213984Z internal/poll.(*pollDesc).wait(0xc0004cd980, 0xc000547200, 0x1) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/internal/poll/fd_poll_runtime.go:84 +0x32 2022-01-26T15:27:18.705213984Z internal/poll.(*pollDesc).waitRead(...) 
2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/internal/poll/fd_poll_runtime.go:89 2022-01-26T15:27:18.705213984Z internal/poll.(*FD).Read(0xc0004cd980, {0xc000547200, 0x200, 0x200}) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/internal/poll/fd_unix.go:167 +0x25a 2022-01-26T15:27:18.705213984Z os.(*File).read(...) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/file_posix.go:32 2022-01-26T15:27:18.705213984Z os.(*File).Read(0xc000012190, {0xc000547200, 0x0, 0xc0000c9ea0}) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/file.go:119 +0x5e 2022-01-26T15:27:18.705213984Z bytes.(*Buffer).ReadFrom(0xc0003ffb00, {0x18e8ae0, 0xc000012190}) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/bytes/buffer.go:204 +0x98 2022-01-26T15:27:18.705213984Z io.copyBuffer({0x18e66e0, 0xc0003ffb00}, {0x18e8ae0, 0xc000012190}, {0x0, 0x0, 0x0}) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/io/io.go:409 +0x14b 2022-01-26T15:27:18.705213984Z io.Copy(...) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/io/io.go:382 2022-01-26T15:27:18.705213984Z os/exec.(*Cmd).writerDescriptor.func1() 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec/exec.go:311 +0x3a 2022-01-26T15:27:18.705213984Z os/exec.(*Cmd).Start.func1(0x0) 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec/exec.go:441 +0x25 2022-01-26T15:27:18.705213984Z created by os/exec.(*Cmd).Start 2022-01-26T15:27:18.705213984Z /usr/lib/golang/src/os/exec/exec.go:440 +0x80d
Error is happening in the db checker pod for /home/surya/Downloads/must-gather.local.8304822870521033907/quay-io-openshift-release-dev-ocp-v4-0-art-dev-sha256-a4cd0fe78299d11381355f89526887610c4b4c0b7b529ca4addfdfef14ff6f57/namespaces/openshift-ovn-kubernetes/pods/ovnkube-master-nwj4r:

2022-01-26T13:59:59.829099353Z Joining /etc/ovn/ovnnb_db.db to cluster.

It says the northbound database joined the raft cluster at 13:59. It is strange that the DBChecker is crashing repeatedly and not picking this up:

2022-01-26T16:08:27.776353754Z I0126 16:08:27.776312 1 ovndbmanager.go:43] Starting DB Checker to ensure cluster membership and DB consistency
2022-01-26T16:08:27.790410111Z F0126 16:08:27.790352 1 ovndbmanager.go:49] NBDB Upgrade failed: %!w(*fmt.wrapError=&{failed to get schema version for NBDB, stderr: "ovsdb-client: transaction returned error: {\"details\":\"get_schema request specifies database OVN_Northbound which is not yet available because it has not completed joining its cluster\",\"error\":\"database not available\"}\n", error: OVN command '/usr/bin/ovsdb-client -t 10 get-schema-version unix:/var/run/ovn/ovnnb_db.sock OVN_Northbound' failed: exit status 1 0xc0008ac0b0})
2022-01-26T16:08:27.790603029Z goroutine 38 [running]:
2022-01-26T16:08:27.790603029Z k8s.io/klog/v2.stacks(0x1)

It's complaining it hasn't joined the raft cluster? The other two pods look good:

======/home/surya/Downloads/must-gather.local.8304822870521033907/quay-io-openshift-release-dev-ocp-v4-0-art-dev-sha256-a4cd0fe78299d11381355f89526887610c4b4c0b7b529ca4addfdfef14ff6f57/namespaces/openshift-ovn-kubernetes/pods/ovnkube-master-tcgcc/ovn-dbchecker/ovn-dbchecker/logs/current.log:
2022-01-26T14:00:24.301416231Z I0126 14:00:24.301387 1 ovndbmanager.go:43] Starting DB Checker to ensure cluster membership and DB consistency
2022-01-26T14:00:24.312682512Z I0126 14:00:24.312646 1 ovndbmanager.go:424] No OVN_Southbound DB schema upgrade is required. Current version: 20.21.0, target: 20.21.0
2022-01-26T14:00:24.312682512Z I0126 14:00:24.312668 1 ovndbmanager.go:70] Starting ensure routine for Raft db: /etc/ovn/ovnsb_db.db
2022-01-26T14:00:24.312964280Z I0126 14:00:24.312938 1 ovndbmanager.go:424] No OVN_Northbound DB schema upgrade is required. Current version: 5.34.1, target: 5.34.1
2022-01-26T14:00:24.312964280Z I0126 14:00:24.312954 1 ovndbmanager.go:70] Starting ensure routine for Raft db: /etc/ovn/ovnnb_db.db

=====/home/surya/Downloads/must-gather.local.8304822870521033907/quay-io-openshift-release-dev-ocp-v4-0-art-dev-sha256-a4cd0fe78299d11381355f89526887610c4b4c0b7b529ca4addfdfef14ff6f57/namespaces/openshift-ovn-kubernetes/pods/ovnkube-master-c2mpn/ovn-dbchecker/ovn-dbchecker/logs/current.log:
2022-01-26T14:00:33.748569663Z I0126 14:00:33.748528 1 ovndbmanager.go:43] Starting DB Checker to ensure cluster membership and DB consistency
2022-01-26T14:00:33.760213597Z I0126 14:00:33.760174 1 ovndbmanager.go:424] No OVN_Southbound DB schema upgrade is required. Current version: 20.21.0, target: 20.21.0
2022-01-26T14:00:33.760237309Z I0126 14:00:33.760210 1 ovndbmanager.go:70] Starting ensure routine for Raft db: /etc/ovn/ovnsb_db.db
2022-01-26T14:00:33.760877151Z I0126 14:00:33.760844 1 ovndbmanager.go:424] No OVN_Northbound DB schema upgrade is required. Current version: 5.34.1, target: 5.34.1
2022-01-26T14:00:33.760877151Z I0126 14:00:33.760860 1 ovndbmanager.go:70] Starting ensure routine for Raft db: /etc/ovn/ovnnb_db.db
2022-01-26T14:08:33.773241042Z E0126 14:08:33.773191 1 ovndbmanager.go:103] unable to get db pod list from kubeclient: the server was unable to return a response in the time allotted, but may still be processing the request (get pods)
2022-01-26T14:08:33.773314249Z E0126 14:08:33.773196 1 ovndbmanager.go:103] unable to get db pod list from kubeclient: the server was unable to return a response in the time allotted, but may still be processing the request (get pods)
When the error happens:

2022-01-26T16:08:27.776353754Z I0126 16:08:27.776312 1 ovndbmanager.go:43] Starting DB Checker to ensure cluster membership and DB consistency
2022-01-26T16:08:27.790410111Z F0126 16:08:27.790352 1 ovndbmanager.go:49] NBDB Upgrade failed: %!w(*fmt.wrapError=&{failed to get schema version for NBDB, stderr: "ovsdb-client: transaction returned error: {\"details\":\"get_schema request specifies database OVN_Northbound which is not yet available because it has not completed joining its cluster\",\"error\":\"database not available\"}\n", error: OVN command '/usr/bin/ovsdb-client -t 10 get-schema-version unix:/var/run/ovn/ovnnb_db.sock OVN_Northbound' failed: exit status 1 0xc0008ac0b0})

I'd expect us to see

    klog.Infof("Starting ensure routine for Raft db: %s", db)

like we see in the other logs. The relevant code is:

    go func() {
        defer wg.Done()
        if err := upgradeNBDBSchema(); err != nil {
            klog.Fatalf("NBDB Upgrade failed: %w", err)
        }
        ensureOvnDBState(util.OvnNbdbLocation, kclient, stopCh)
    }()

After the fatal error is printed we get a traceback and never proceed into ensureOvnDBState.
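For reference, here is a minimal Go sketch of the check that is failing. Per the stack trace, upgradeDBSchema wraps this same ovsdb-client invocation via util.RunOVSDBClient; the helper name and error wrapping below are illustrative only, not the actual ovn-kubernetes code.

    package main

    import (
        "fmt"
        "os/exec"
    )

    // nbdbSchemaVersion runs the same command quoted in the fatal log line.
    // Hypothetical helper, useful only to reproduce the check by hand from
    // inside the nbdb or ovn-dbchecker container.
    func nbdbSchemaVersion() (string, error) {
        out, err := exec.Command("/usr/bin/ovsdb-client", "-t", "10",
            "get-schema-version", "unix:/var/run/ovn/ovnnb_db.sock",
            "OVN_Northbound").CombinedOutput()
        if err != nil {
            // While the local ovsdb-server has not finished joining its raft
            // cluster it answers "database not available" and the command
            // exits non-zero; that is what ends up wrapped into the
            // "NBDB Upgrade failed" fatal above.
            return "", fmt.Errorf("failed to get schema version for NBDB: %q: %w", string(out), err)
        }
        return string(out), nil
    }

    func main() {
        if v, err := nbdbSchemaVersion(); err != nil {
            fmt.Println(err)
        } else {
            fmt.Println("NBDB schema version:", v)
        }
    }

So the fatal is just that exit status 1 bubbling up through the wrappers.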
Maybe Fatal is not really the right level here? What we actually want is to run ensureOvnDBState, which runs ensureClusterRaftMembership to ensure this pod has become a raft member, and then retry the upgrade afterwards.
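A rough sketch of what that could look like, rewriting the snippet above (assumed code, not the actual fix; wg, stopCh, kclient, util, and klog come from the surrounding RunDBChecker context, and the time import and 10-second retry interval are made up):

    go func() {
        defer wg.Done()
        // Retry the NBDB schema check/upgrade instead of fataling: while the
        // local ovsdb-server has not finished joining its raft cluster the
        // check fails with "database not available", which is transient and
        // not a reason to kill the whole ovn-dbchecker container.
        for {
            err := upgradeNBDBSchema()
            if err == nil {
                break
            }
            klog.Warningf("NBDB schema upgrade not possible yet, will retry: %v", err)
            select {
            case <-stopCh:
                return
            case <-time.After(10 * time.Second):
            }
        }
        // Once the DB is reachable, fall through to the membership and
        // consistency checks as before.
        ensureOvnDBState(util.OvnNbdbLocation, kclient, stopCh)
    }()

The exact backoff and logging are secondary; the point is that a transient "database not available" should not be terminal for the checker.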
Eventually the dbchecker should crash, restart, and come up fine once the member has joined. It's really confusing that this is not happening.

sh-4.4# ovn-appctl -t /var/run/ovn/ovnnb_db.ctl --timeout=3 cluster/status OVN_Northbound
2f2a
Name: OVN_Northbound
Cluster ID: e3aa (e3aa2751-43a4-4c4b-a51a-a5c376f98da6)
Server ID: 2f2a (2f2abbf7-7019-456b-a503-e59b1d525280)
Address: ssl:10.242.128.6:9643
Status: cluster member
Role: follower
Term: 1
Leader: 5754
Vote: unknown
Election timer: 10000
Log: [2, 7586]
Entries not yet committed: 0
Entries not yet applied: 0
Connections: ->0000 <-5754 <-7f29
Disconnections: 3
Servers:
    5754 (5754 at ssl:10.242.0.7:9643) last msg 3314 ms ago
    2f2a (2f2a at ssl:10.242.128.6:9643) (self)

sh-4.4# ovn-appctl -t /var/run/ovn/ovnnb_db.ctl --timeout=3 cluster/status OVN_Northbound
7f29
Name: OVN_Northbound
Cluster ID: e3aa (e3aa2751-43a4-4c4b-a51a-a5c376f98da6)
Server ID: 7f29 (7f29330f-8ea4-4d37-81d8-4bc5810e8a67)
Address: ssl:10.242.65.4:9643
Status: joining cluster
Remotes for joining: ssl:10.242.0.7:9643 ssl:10.242.128.6:9643
Role: follower
Term: 1
Leader: 5754
Vote: unknown
Election timer: 1000
Log: [2, 2]
Entries not yet committed: 0
Entries not yet applied: 1
Connections: ->0000 ->0000
Disconnections: 0
Servers:
    5754 (5754 at ssl:10.242.0.7:9643) last msg 991 ms ago

sh-4.4# ovn-appctl -t /var/run/ovn/ovnnb_db.ctl --timeout=3 cluster/status OVN_Northbound
5754
Name: OVN_Northbound
Cluster ID: e3aa (e3aa2751-43a4-4c4b-a51a-a5c376f98da6)
Server ID: 5754 (5754c1cd-3adf-4294-a4e9-4d1a6a45b69f)
Address: ssl:10.242.0.7:9643
Status: cluster member
Adding server 7f29 (7f29 at ssl:10.242.65.4:9643) (adding: catchup)
Role: leader
Term: 1
Leader: self
Vote: self
Last Election started 83001818 ms ago, reason: timeout
Last Election won: 83001818 ms ago
Election timer: 10000
Log: [2, 7588]
Entries not yet committed: 0
Entries not yet applied: 0
Connections: <-2f2a ->2f2a <-7f29
Disconnections: 3
Servers:
    5754 (5754 at ssl:10.242.0.7:9643) (self) next_index=2 match_index=7587
    2f2a (2f2a at ssl:10.242.128.6:9643) next_index=7588 match_index=7587 last msg 1888 ms ago

Ah ok, that explains it! The real issue is: why isn't this 7f29 member joining the cluster? The fact that the db checker crashes is legit in that case!
Logs for NBDB from the error pod (follower that refuses to join the cluster and is stuck): [surya@hidden-temple ovn-org]$ oc logs -n openshift-ovn-kubernetes ovnkube-master-nwj4r -c nbdb Joining the nbdb cluster with init_ip=10.242.0.7... + [[ 10.242.65.4 == \1\0\.\2\4\2\.\0\.\7 ]] + echo 'Joining the nbdb cluster with init_ip=10.242.0.7...' + wait 128 + exec /usr/share/ovn/scripts/ovn-ctl --db-nb-cluster-local-port=9643 --db-nb-cluster-local-addr=10.242.65.4 --no-monitor --db-nb-cluster-local-proto=ssl --ovn-nb-db-ssl-key=/ovn-cert/tls.key --ovn-nb-db-ssl-cert=/ovn-cert/tls.crt --ovn-nb-db-ssl-ca-cert=/ovn-ca/ca-bundle.crt --db-nb-cluster-remote-port=9643 --db-nb-cluster-remote-addr=10.242.0.7 --db-nb-cluster-remote-proto=ssl '--ovn-nb-log=-vconsole:info -vfile:off -vPATTERN:console:%D{%Y-%m-%dT%H:%M:%S.###Z}|%05N|%c%T|%p|%m' run_nb_ovsdb Joining /etc/ovn/ovnnb_db.db to cluster. 2022-01-26T13:59:59.865Z|00001|vlog|INFO|opened log file /var/log/ovn/ovsdb-server-nb.log 2022-01-26T13:59:59.866Z|00002|raft|INFO|local server ID is 7f29 2022-01-26T13:59:59.868Z|00003|ovsdb_server|INFO|ovsdb-server (Open vSwitch) 2.16.2 2022-01-26T13:59:59.868Z|00004|reconnect|INFO|ssl:10.242.0.7:9643: connecting... 2022-01-26T13:59:59.875Z|00005|reconnect|INFO|ssl:10.242.0.7:9643: connected 2022-01-26T13:59:59.877Z|00006|raft_rpc|INFO|learned cluster ID e3aa 2022-01-26T13:59:59.877Z|00007|raft|INFO|ssl:10.242.0.7:9643: learned server ID 5754 2022-01-26T13:59:59.880Z|00008|raft|INFO|server 5754 is leader for term 1 2022-01-26T13:59:59.880Z|00009|raft|INFO|rejecting append_request because previous entry 1,5 not in local log (mismatch past end of log) 2022-01-26T14:00:04.955Z|00010|raft|INFO|server 5754 added to configuration 2022-01-26T14:00:04.955Z|00011|timeval|WARN|Unreasonably long 4813ms poll interval (2ms user, 0ms system) 2022-01-26T14:00:04.955Z|00012|timeval|WARN|faults: 55 minor, 0 major 2022-01-26T14:00:04.955Z|00013|timeval|WARN|disk: 0 reads, 32 writes 2022-01-26T14:00:04.956Z|00014|timeval|WARN|context switches: 13 voluntary, 1 involuntary 2022-01-26T14:00:04.956Z|00015|coverage|INFO|Event coverage, avg rate over last: 5 seconds, last minute, last hour, hash=ee7028dd: 2022-01-26T14:00:04.956Z|00016|coverage|INFO|raft_entry_serialize 0.0/sec 0.000/sec 0.0000/sec total: 1 2022-01-26T14:00:04.956Z|00017|coverage|INFO|hmap_pathological 0.0/sec 0.000/sec 0.0000/sec total: 1 2022-01-26T14:00:04.956Z|00018|coverage|INFO|hmap_expand 0.0/sec 0.000/sec 0.0000/sec total: 308 2022-01-26T14:00:04.956Z|00019|coverage|INFO|lockfile_lock 0.0/sec 0.000/sec 0.0000/sec total: 1 2022-01-26T14:00:04.956Z|00020|coverage|INFO|poll_create_node 0.0/sec 0.000/sec 0.0000/sec total: 45 2022-01-26T14:00:04.956Z|00021|coverage|INFO|poll_zero_timeout 0.0/sec 0.000/sec 0.0000/sec total: 3 2022-01-26T14:00:04.956Z|00022|coverage|INFO|seq_change 0.0/sec 0.000/sec 0.0000/sec total: 12 2022-01-26T14:00:04.956Z|00023|coverage|INFO|pstream_open 0.0/sec 0.000/sec 0.0000/sec total: 3 2022-01-26T14:00:04.956Z|00024|coverage|INFO|stream_open 0.0/sec 0.000/sec 0.0000/sec total: 1 2022-01-26T14:00:04.956Z|00025|coverage|INFO|util_xalloc 0.0/sec 0.000/sec 0.0000/sec total: 6419 2022-01-26T14:00:04.956Z|00026|coverage|INFO|101 events never hit 2022-01-26T14:00:04.956Z|00027|reconnect|INFO|ssl:10.242.0.7:9643: connection closed by peer 2022-01-26T14:00:05.956Z|00028|reconnect|INFO|ssl:10.242.0.7:9643: connecting... 
2022-01-26T14:00:05.962Z|00029|reconnect|INFO|ssl:10.242.0.7:9643: connected 2022-01-26T14:00:05.965Z|00030|raft|INFO|ssl:10.242.128.6:9643: learned new server address for joining cluster 2022-01-26T14:00:05.965Z|00031|reconnect|INFO|ssl:10.242.128.6:9643: connecting... 2022-01-26T14:00:05.970Z|00032|reconnect|INFO|ssl:10.242.128.6:9643: connected 2022-01-26T14:00:05.972Z|00033|raft|INFO|ssl:10.242.128.6:9643: learned server ID 2f2a 2022-01-26T14:00:09.957Z|00034|memory|INFO|9156 kB peak resident set size after 10.1 seconds 2022-01-26T14:00:09.957Z|00035|memory|INFO|atoms:15 cells:20 monitors:0 2022-01-26T14:00:13.802Z|00036|ovsdb_server|INFO|memory trimming after compaction enabled. 2022-01-26T14:00:40.778Z|00037|ovsdb_server|INFO|memory trimming after compaction enabled. ============== logs for NBDB from correct pod1 (follower that correctly joins the cluster): [surya@hidden-temple ovn-org]$ oc logs -n openshift-ovn-kubernetes ovnkube-master-c2mpn -c nbdb + [[ 10.242.128.6 == \1\0\.\2\4\2\.\0\.\7 ]] + echo 'Joining the nbdb cluster with init_ip=10.242.0.7...' Joining the nbdb cluster with init_ip=10.242.0.7... + wait 128 + exec /usr/share/ovn/scripts/ovn-ctl --db-nb-cluster-local-port=9643 --db-nb-cluster-local-addr=10.242.128.6 --no-monitor --db-nb-cluster-local-proto=ssl --ovn-nb-db-ssl-key=/ovn-cert/tls.key --ovn-nb-db-ssl-cert=/ovn-cert/tls.crt --ovn-nb-db-ssl-ca-cert=/ovn-ca/ca-bundle.crt --db-nb-cluster-remote-port=9643 --db-nb-cluster-remote-addr=10.242.0.7 --db-nb-cluster-remote-proto=ssl '--ovn-nb-log=-vconsole:info -vfile:off -vPATTERN:console:%D{%Y-%m-%dT%H:%M:%S.###Z}|%05N|%c%T|%p|%m' run_nb_ovsdb Joining /etc/ovn/ovnnb_db.db to cluster. 2022-01-26T13:59:54.573Z|00001|vlog|INFO|opened log file /var/log/ovn/ovsdb-server-nb.log 2022-01-26T13:59:54.573Z|00002|raft|INFO|local server ID is 2f2a 2022-01-26T13:59:54.575Z|00003|ovsdb_server|INFO|ovsdb-server (Open vSwitch) 2.16.2 2022-01-26T13:59:54.575Z|00004|reconnect|INFO|ssl:10.242.0.7:9643: connecting... 2022-01-26T13:59:54.582Z|00005|reconnect|INFO|ssl:10.242.0.7:9643: connected 2022-01-26T13:59:54.584Z|00006|raft_rpc|INFO|learned cluster ID e3aa 2022-01-26T13:59:54.584Z|00007|raft|INFO|ssl:10.242.0.7:9643: learned server ID 5754 2022-01-26T13:59:54.584Z|00008|raft|INFO|server 5754 is leader for term 1 2022-01-26T13:59:54.584Z|00009|raft|INFO|rejecting append_request because previous entry 1,4 not in local log (mismatch past end of log) 2022-01-26T13:59:54.591Z|00010|raft|INFO|server 5754 added to configuration 2022-01-26T13:59:54.596Z|00011|raft|INFO|server 2f2a added to configuration 2022-01-26T13:59:54.603Z|00012|raft|INFO|ssl:10.242.0.7:42974: learned server ID 5754 2022-01-26T13:59:54.604Z|00013|raft|INFO|ssl:10.242.0.7:42974: learned remote address ssl:10.242.0.7:9643 2022-01-26T13:59:55.370Z|00014|ovsdb_server|INFO|memory trimming after compaction enabled. 2022-01-26T13:59:56.150Z|00015|ovsdb_server|INFO|memory trimming after compaction enabled. 2022-01-26T14:00:04.577Z|00016|memory|INFO|11532 kB peak resident set size after 10.0 seconds 2022-01-26T14:00:04.577Z|00017|memory|INFO|atoms:35 cells:43 monitors:0 raft-connections:2 raft-log:4 2022-01-26T14:00:05.968Z|00018|raft|INFO|ssl:10.242.65.4:48880: learned server ID 7f29 2022-01-26T14:00:05.968Z|00019|raft|INFO|ssl:10.242.65.4:48880: learned remote address ssl:10.242.65.4:9643 ============ logs for NBDB from correct pod1 (leader of the cluster): + [[ 10.242.128.6 == \1\0\.\2\4\2\.\0\.\7 ]] + echo 'Joining the nbdb cluster with init_ip=10.242.0.7...' 
Joining the nbdb cluster with init_ip=10.242.0.7... + wait 128 + exec /usr/share/ovn/scripts/ovn-ctl --db-nb-cluster-local-port=9643 --db-nb-cluster-local-addr=10.242.128.6 --no-monitor --db-nb-cluster-local-proto=ssl --ovn-nb-db-ssl-key=/ovn-cert/tls.key --ovn-nb-db-ssl-cert=/ovn-cert/tls.crt --ovn-nb-db-ssl-ca-cert=/ovn-ca/ca-bundle.crt --db-nb-cluster-remote-port=9643 --db-nb-cluster-remote-addr=10.242.0.7 --db-nb-cluster-remote-proto=ssl '--ovn-nb-log=-vconsole:info -vfile:off -vPATTERN:console:%D{%Y-%m-%dT%H:%M:%S.###Z}|%05N|%c%T|%p|%m' run_nb_ovsdb Joining /etc/ovn/ovnnb_db.db to cluster. 2022-01-26T13:59:54.573Z|00001|vlog|INFO|opened log file /var/log/ovn/ovsdb-server-nb.log 2022-01-26T13:59:54.573Z|00002|raft|INFO|local server ID is 2f2a 2022-01-26T13:59:54.575Z|00003|ovsdb_server|INFO|ovsdb-server (Open vSwitch) 2.16.2 2022-01-26T13:59:54.575Z|00004|reconnect|INFO|ssl:10.242.0.7:9643: connecting... 2022-01-26T13:59:54.582Z|00005|reconnect|INFO|ssl:10.242.0.7:9643: connected 2022-01-26T13:59:54.584Z|00006|raft_rpc|INFO|learned cluster ID e3aa 2022-01-26T13:59:54.584Z|00007|raft|INFO|ssl:10.242.0.7:9643: learned server ID 5754 2022-01-26T13:59:54.584Z|00008|raft|INFO|server 5754 is leader for term 1 2022-01-26T13:59:54.584Z|00009|raft|INFO|rejecting append_request because previous entry 1,4 not in local log (mismatch past end of log) 2022-01-26T13:59:54.591Z|00010|raft|INFO|server 5754 added to configuration 2022-01-26T13:59:54.596Z|00011|raft|INFO|server 2f2a added to configuration 2022-01-26T13:59:54.603Z|00012|raft|INFO|ssl:10.242.0.7:42974: learned server ID 5754 2022-01-26T13:59:54.604Z|00013|raft|INFO|ssl:10.242.0.7:42974: learned remote address ssl:10.242.0.7:9643 2022-01-26T13:59:55.370Z|00014|ovsdb_server|INFO|memory trimming after compaction enabled. 2022-01-26T13:59:56.150Z|00015|ovsdb_server|INFO|memory trimming after compaction enabled. 2022-01-26T14:00:04.577Z|00016|memory|INFO|11532 kB peak resident set size after 10.0 seconds 2022-01-26T14:00:04.577Z|00017|memory|INFO|atoms:35 cells:43 monitors:0 raft-connections:2 raft-log:4 2022-01-26T14:00:05.968Z|00018|raft|INFO|ssl:10.242.65.4:48880: learned server ID 7f29 2022-01-26T14:00:05.968Z|00019|raft|INFO|ssl:10.242.65.4:48880: learned remote address ssl:10.242.65.4:9643 2022-01-26T14:00:05.968Z|00020|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:06.955Z|00021|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:07.955Z|00022|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:08.956Z|00023|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:09.956Z|00024|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:10.726Z|00025|ovsdb_server|INFO|memory trimming after compaction enabled. 
2022-01-26T14:00:10.957Z|00026|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:11.959Z|00027|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:12.959Z|00028|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:13.959Z|00029|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:14.959Z|00030|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:15.961Z|00031|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:18.965Z|00032|raft|INFO|Dropped 2 log messages in last 2 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:00:18.965Z|00033|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:24.966Z|00034|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:00:24.966Z|00035|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:29.970Z|00036|raft|INFO|Dropped 4 log messages in last 4 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:00:29.970Z|00037|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:33.745Z|00038|ovsdb_server|INFO|memory trimming after compaction enabled. 2022-01-26T14:00:35.978Z|00039|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:00:35.978Z|00040|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:41.983Z|00041|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:00:41.983Z|00042|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:47.988Z|00043|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:00:47.988Z|00044|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:53.994Z|00045|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:00:53.994Z|00046|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:00:59.995Z|00047|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:00:59.995Z|00048|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:01:05.999Z|00049|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:01:05.999Z|00050|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:01:12.003Z|00051|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:01:12.003Z|00052|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:01:18.008Z|00053|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:01:18.008Z|00054|raft|INFO|adding 7f29 (7f29 
at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:01:24.015Z|00055|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:01:24.015Z|00056|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:01:24.361Z|00057|raft|INFO|Election timer changed from 1000 to 2000 2022-01-26T14:01:30.019Z|00058|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:01:30.019Z|00059|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:01:36.023Z|00060|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:01:36.023Z|00061|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:01:42.026Z|00062|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:01:42.026Z|00063|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:01:48.031Z|00064|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:01:48.031Z|00065|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) <snip> 2022-01-26T14:03:18.105Z|00095|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:03:18.106Z|00096|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:03:24.109Z|00097|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:03:24.109Z|00098|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:03:24.349Z|00099|raft|INFO|Election timer changed from 4000 to 8000 2022-01-26T14:03:30.110Z|00100|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:03:30.110Z|00101|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:03:36.121Z|00102|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:03:36.121Z|00103|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:03:42.125Z|00104|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:03:42.125Z|00105|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:03:48.132Z|00106|raft|INFO|Dropped 5 log messages in last 5 seconds (most recently, 1 seconds ago) due to excessive rate 2022-01-26T14:03:48.132Z|00107|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (not leader) 2022-01-26T14:03:53.581Z|00001|timeval(log_fsync3)|WARN|Unreasonably long 2920ms poll interval (0ms user, 0ms system) 2022-01-26T14:03:53.581Z|00002|timeval(log_fsync3)|WARN|context switches: 5 voluntary, 0 involuntary 2022-01-26T14:03:53.581Z|00003|coverage(log_fsync3)|INFO|Event coverage, avg rate over last: 5 seconds, last minute, last hour, hash=ed913496: 2022-01-26T14:03:53.581Z|00004|coverage(log_fsync3)|INFO|raft_entry_serialize 0.6/sec 1.900/sec 0.1217/sec total: 439
[surya@hidden-temple ovn-org]$ oc get pods -n openshift-ovn-kubernetes -owide
NAME                   READY   STATUS             RESTARTS          AGE   IP             NODE                                    NOMINATED NODE   READINESS GATES
ovnkube-master-c2mpn   6/6     Running            6 (23h ago)       23h   10.242.128.6   rbrattai-o410i37-xp2gr-master-2         <none>           <none>
ovnkube-master-nwj4r   5/6     CrashLoopBackOff   283 (4m32s ago)   23h   10.242.65.4    rbrattai-o410i37-xp2gr-master-1         <none>           <none>
ovnkube-master-tcgcc   6/6     Running            1 (23h ago)       23h   10.242.0.7     rbrattai-o410i37-xp2gr-master-0         <none>           <none>
ovnkube-node-h964n     5/5     Running            0                 23h   10.242.64.5    rbrattai-o410i37-xp2gr-worker-2-p9xgs   <none>           <none>
ovnkube-node-mwlzb     5/5     Running            0                 23h   10.242.1.4     rbrattai-o410i37-xp2gr-worker-1-nz2g6   <none>           <none>
ovnkube-node-pwm27     5/5     Running            0                 23h   10.242.129.4   rbrattai-o410i37-xp2gr-worker-3-4jd78   <none>           <none>
ovnkube-node-qx8vr     5/5     Running            0                 23h   10.242.0.7     rbrattai-o410i37-xp2gr-master-0         <none>           <none>
ovnkube-node-wxm2b     5/5     Running            0                 23h   10.242.128.6   rbrattai-o410i37-xp2gr-master-2         <none>           <none>
ovnkube-node-zs5xv     5/5     Running            0                 23h   10.242.65.4    rbrattai-o410i37-xp2gr-master-1         <none>           <none>
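For anyone hitting the same symptom, a quick way to narrow down which container in the crashlooping master pod keeps restarting, and to capture its last logs, is shown below. The pod name is the one from the listing above and the container flag is only an example; use whichever container `oc describe` reports as restarting.

oc describe pod ovnkube-master-nwj4r -n openshift-ovn-kubernetes
oc logs ovnkube-master-nwj4r -n openshift-ovn-kubernetes -c nbdb --previous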
oc logs -n openshift-ovn-kubernetes ovnkube-master-tcgcc -c nbdb

There was an error pasting the master logs in https://bugzilla.redhat.com/show_bug.cgi?id=2033514#c16. Re-pasting:

exec /usr/share/ovn/scripts/ovn-ctl --db-nb-cluster-local-port=9643 --db-nb-cluster-local-addr=10.242.0.7 --no-monitor --db-nb-cluster-local-proto=ssl --ovn-nb-db-ssl-key=/ovn-cert/tls.key --ovn-nb-db-ssl-cert=/ovn-cert/tls.crt --ovn-nb-db-ssl-ca-cert=/ovn-ca/ca-bundle.crt '--ovn-nb-log=-vconsole:info -vfile:off -vPATTERN:console:%D{%Y-%m-%dT%H:%M:%S.###Z}|%05N|%c%T|%p|%m' run_nb_ovsdb
Creating cluster database /etc/ovn/ovnnb_db.db.
2022-01-26T13:59:53.434Z|00001|vlog|INFO|opened log file /var/log/ovn/ovsdb-server-nb.log
2022-01-26T13:59:53.435Z|00002|raft|INFO|term 1: 292895 ms timeout expired, starting election
2022-01-26T13:59:53.435Z|00003|raft|INFO|term 1: elected leader by 1+ of 1 servers
2022-01-26T13:59:53.435Z|00004|raft|INFO|local server ID is 5754
ovn-nbctl: unix:/var/run/ovn/ovnnb_db.sock: database connection failed (No such file or directory)
2022-01-26T13:59:53.438Z|00005|ovsdb_server|INFO|ovsdb-server (Open vSwitch) 2.16.2
2022-01-26T13:59:53Z|00001|reconnect|INFO|unix:/var/run/ovn/ovnnb_db.sock: connecting...
2022-01-26T13:59:53Z|00002|reconnect|INFO|unix:/var/run/ovn/ovnnb_db.sock: connected
Waiting for OVN_Northbound to come up.
2022-01-26T13:59:54.584Z|00006|raft|INFO|ssl:10.242.128.6:35570: learned server ID 2f2a
2022-01-26T13:59:54.584Z|00007|raft|INFO|ssl:10.242.128.6:35570: learned remote address ssl:10.242.128.6:9643
2022-01-26T13:59:54.584Z|00008|raft|INFO|starting to add server 2f2a (2f2a at ssl:10.242.128.6:9643) to cluster e3aa
2022-01-26T13:59:54.588Z|00009|raft|INFO|sending snapshot to server 2f2a, 1:1.
2022-01-26T13:59:54.593Z|00010|raft|INFO|cluster e3aa: installed snapshot on server 2f2a up to 1:1
2022-01-26T13:59:54.596Z|00011|reconnect|INFO|ssl:10.242.128.6:9643: connecting...
2022-01-26T13:59:54.600Z|00012|raft|INFO|adding 2f2a (2f2a at ssl:10.242.128.6:9643) to cluster e3aa succeeded (completed)
2022-01-26T13:59:54.603Z|00013|reconnect|INFO|ssl:10.242.128.6:9643: connected
2022-01-26T13:59:54.973Z|00014|stream_ssl|WARN|SSL_read: unexpected SSL connection close
2022-01-26T13:59:54.973Z|00015|jsonrpc|WARN|ssl:10.242.65.4:53994: receive error: Protocol error
2022-01-26T13:59:54.973Z|00016|reconnect|WARN|ssl:10.242.65.4:53994: connection dropped (Protocol error)
2022-01-26T13:59:54.977Z|00017|stream_ssl|WARN|SSL_read: unexpected SSL connection close
2022-01-26T13:59:54.977Z|00018|jsonrpc|WARN|ssl:10.242.65.4:53992: receive error: Protocol error
2022-01-26T13:59:54.977Z|00019|reconnect|WARN|ssl:10.242.65.4:53992: connection dropped (Protocol error)
2022-01-26T13:59:55.306Z|00020|stream_ssl|WARN|SSL_read: unexpected SSL connection close
2022-01-26T13:59:55.306Z|00021|jsonrpc|WARN|ssl:10.242.128.6:33274: receive error: Protocol error
2022-01-26T13:59:55.306Z|00022|reconnect|WARN|ssl:10.242.128.6:33274: connection dropped (Protocol error)
2022-01-26T13:59:55.326Z|00023|stream_ssl|WARN|SSL_read: unexpected SSL connection close
2022-01-26T13:59:55.326Z|00024|jsonrpc|WARN|ssl:10.242.128.6:33272: receive error: Protocol error
2022-01-26T13:59:55.326Z|00025|reconnect|WARN|ssl:10.242.128.6:33272: connection dropped (Protocol error)
2022-01-26T13:59:55.982Z|00026|stream_ssl|WARN|SSL_read: unexpected SSL connection close
2022-01-26T13:59:55.982Z|00027|jsonrpc|WARN|ssl:10.242.65.4:54006: receive error: Protocol error
2022-01-26T13:59:55.982Z|00028|reconnect|WARN|ssl:10.242.65.4:54006: connection dropped (Protocol error)
2022-01-26T13:59:55.995Z|00029|stream_ssl|WARN|SSL_read: unexpected SSL connection close
2022-01-26T13:59:55.995Z|00030|reconnect|WARN|ssl:10.242.65.4:54004: connection dropped (Protocol error)
2022-01-26T13:59:56.039Z|00031|stream_ssl|WARN|SSL_read: unexpected SSL connection close
2022-01-26T13:59:56.039Z|00032|reconnect|WARN|ssl:10.242.128.6:33288: connection dropped (Protocol error)
2022-01-26T13:59:56.043Z|00033|stream_ssl|WARN|SSL_read: unexpected SSL connection close
2022-01-26T13:59:56.043Z|00034|reconnect|WARN|ssl:10.242.128.6:33286: connection dropped (Protocol error)
2022-01-26T13:59:59.874Z|00035|raft|INFO|ssl:10.242.65.4:54390: learned server ID 7f29
2022-01-26T13:59:59.874Z|00036|raft|INFO|ssl:10.242.65.4:54390: learned remote address ssl:10.242.65.4:9643
2022-01-26T13:59:59.874Z|00037|raft|INFO|starting to add server 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa
2022-01-26T14:00:00.138Z|00038|raft|INFO|sending snapshot to server 7f29, 1:1.
2022-01-26T14:00:03.440Z|00039|memory|INFO|10288 kB peak resident set size after 10.0 seconds
2022-01-26T14:00:03.440Z|00040|memory|INFO|atoms:35 cells:43 monitors:2 raft-connections:3 raft-log:4 sessions:1
2022-01-26T14:00:04.269Z|00041|reconnect|ERR|ssl:10.242.65.4:54390: no response to inactivity probe after 2.05 seconds, disconnecting
2022-01-26T14:00:05.961Z|00042|raft|INFO|ssl:10.242.65.4:54400: learned server ID 7f29
2022-01-26T14:00:05.961Z|00043|raft|INFO|ssl:10.242.65.4:54400: learned remote address ssl:10.242.65.4:9643
2022-01-26T14:00:05.961Z|00044|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (in progress)
2022-01-26T14:00:06.956Z|00045|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (in progress)
2022-01-26T14:00:07.956Z|00046|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (in progress)
2022-01-26T14:00:08.957Z|00047|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (in progress)
2022-01-26T14:00:09.958Z|00048|raft|INFO|adding 7f29 (7f29 at ssl:10.242.65.4:9643) to cluster e3aa failed (in progress)
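To see the same situation from the raft side, the join state can also be queried directly from the nbdb container. A sketch, assuming the standard OVN control socket path and using the leader pod from this cluster (both may differ elsewhere):

oc exec -n openshift-ovn-kubernetes ovnkube-master-tcgcc -c nbdb -- \
    ovs-appctl -t /var/run/ovn/ovnnb_db.ctl cluster/status OVN_Northbound

On the leader this shows its role, the current term, and the servers it currently considers part of the cluster, which helps confirm that the third member never finished joining.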
Hey Ross, closing this as a dupe of https://bugzilla.redhat.com/show_bug.cgi?id=2041546

*** This bug has been marked as a duplicate of bug 2041546 ***
This is not a dupe of the election timer bug; the reason this bug happened is different. Quoting Ilya from the OVS team:

The series of events:
1) New follower sent a join request.
2) Leader accepted it.
3) Leader sent an append request with all the latest raft log entries.
4) Follower has no log at all, so it declines the append request. (That's a standard procedure; log entries are sent for the case where a previous joining session was interrupted by the then-leader losing its leadership, basically to avoid step 5 in some cases.)
5) Leader sends a snapshot installation request.
6) At the same time the follower gets preempted for 5 seconds without a chance to actually receive the data.
7) After some time the leader closes the connection due to inactivity.
8) The follower finally gets some CPU time and discovers that the connection is broken.
9) The follower re-connects and tries to re-join.
10) The leader still thinks that the previous join is in progress, so it declines.
11) The leader still waits for the reply to the never-received snapshot installation request and keeps declining new join requests, thinking that the follower is just not patient enough.

The solution should be:
1) If the joining server reconnects while still in the CATCHUP phase, the leader should re-start the process from step 3, i.e. assume that the append request or snapshot was lost and needs to be re-sent.
2) Additionally, the leader should time out requests from servers if they make no progress within the election timeout (paragraph 4.2.1 of the raft paper). The current implementation doesn't do that.

Workarounds (see the sketch after this list):
1) Re-start the leader. Pending join requests are not part of the configuration, so they will be cleared on re-start of the leader. Joining should succeed after the restart, or it will succeed on the second follower that takes over leadership while the current leader is re-starting.
2) (If for some reason we cannot re-start the leader) Destroy the joining server and re-create it to change the server ID. Since the leader doesn't time out join requests and doesn't re-start the process on re-connection, the server with the current server ID will never be able to join that cluster until the leader is re-started.

Reopening this bug.
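In OpenShift terms, workaround 1 boils down to bouncing whichever ovnkube-master pod currently hosts the NB raft leader. A rough sketch with the pod names from this cluster (purely illustrative; identify the actual leader first, e.g. with cluster/status as shown earlier):

# Restart the NB leader so its stale "in progress" join record is dropped.
oc delete pod -n openshift-ovn-kubernetes ovnkube-master-tcgcc

# After a new leader is elected, the stuck member (ovnkube-master-nwj4r here)
# should be able to complete its join.
oc get pods -n openshift-ovn-kubernetes -w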
Moving the bz to the OVS dbserver team.
Solution 1 posted for review: https://patchwork.ozlabs.org/project/openvswitch/patch/20220128185121.434055-1-i.maximets@ovn.org/ Timeouts are still nice to have, but can be implemented as a separate RFE.
Thanks Ilya! Will use this version of OVS and close the other bug as well.
Marking Verified, SanityOnly as no reliable reproducer is available.

Fixed in Version: openvswitch2.16-2.16.0-57.el8fdp
Errata build: openvswitch2.16-2.16.0-63.el8fdp

SanityOnly details:

[ralongi@ralongi openvswitch2.16]$ rhpkg switch-branch fast-datapath-rhel-8
Switched to branch 'fast-datapath-rhel-8'
[ralongi@ralongi openvswitch2.16]$ git log --oneline --grep=2033514
897937f6d3 (tag: openvswitch2.16-2.16.0-57.el8fdp) Merging upstream branch-2.16
9598f0529c ovsdb: raft: Fix inability to join the cluster after interrupted attempt.
[ralongi@ralongi openvswitch2.16]$ git show 9598f0529c
commit 9598f0529c433217d9d39abcef12730c97b6e9d1
Author: Ilya Maximets <i.maximets>
Date:   Fri Jan 28 19:51:21 2022 +0100

    ovsdb: raft: Fix inability to join the cluster after interrupted attempt.

    If the joining server re-connects while catching up (e.g. if it crashed
    or connection got closed due to inactivity), the data we sent might be
    lost, so the server will never reply to append request or a snapshot
    installation request.  At the same time, leader will decline all the
    subsequent requests to join from that server with the 'in progress'
    resolution.  At this point the new server will never be able to join
    the cluster, because it will never receive the raft log while leader
    thinks that it was already sent.

    This happened in practice when one of the servers got preempted for a
    few seconds, so the leader closed connection due to inactivity.

    Destroying the joining server if disconnection detected.  This will
    allow to start the joining from scratch when the server re-connects
    and sends the new join request.

    We can't track re-connection in the raft_conn_run(), because it's
    incoming connection and the jsonrpc will not keep it alive or try to
    reconnect.  Next time the server re-connects it will be an entirely
    new raft conn.

    Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.")
    Reported-at: https://bugzilla.redhat.com/2033514
    Signed-off-by: Ilya Maximets <i.maximets>
    Acked-by: Dumitru Ceara <dceara>

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 2fb5156519..a0474be59c 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -74,6 +74,7 @@ enum raft_failure_test {
     FT_CRASH_BEFORE_SEND_EXEC_REQ,
     FT_CRASH_AFTER_SEND_EXEC_REQ,
     FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE,
+    FT_CRASH_BEFORE_SEND_SNAPSHOT_REP,
     FT_DELAY_ELECTION,
     FT_DONT_SEND_VOTE_REQUEST,
     FT_STOP_RAFT_RPC,
@@ -379,12 +380,19 @@ static bool raft_handle_write_error(struct raft *,
                                     struct ovsdb_error *);
 static void raft_run_reconfigure(struct raft *);
 static void raft_set_leader(struct raft *, const struct uuid *sid);
+
 static struct raft_server *
 raft_find_server(const struct raft *raft, const struct uuid *sid)
 {
     return raft_server_find(&raft->servers, sid);
 }

+static struct raft_server *
+raft_find_new_server(struct raft *raft, const struct uuid *uuid)
+{
+    return raft_server_find(&raft->add_servers, uuid);
+}
+
 static char *
 raft_make_address_passive(const char *address_)
 {
@@ -1864,6 +1872,8 @@ raft_open_conn(struct raft *raft, const char *address, const struct uuid *sid)
 static void
 raft_conn_close(struct raft_conn *conn)
 {
+    VLOG_DBG("closing connection to server %s (%s)",
+             conn->nickname, jsonrpc_session_get_name(conn->js));
     jsonrpc_session_close(conn->js);
     ovs_list_remove(&conn->list_node);
     free(conn->nickname);
@@ -1954,16 +1964,30 @@ raft_run(struct raft *raft)
     }

     /* Close unneeded sessions. */
+    struct raft_server *server;
     struct raft_conn *next;
     LIST_FOR_EACH_SAFE (conn, next, list_node, &raft->conns) {
         if (!raft_conn_should_stay_open(raft, conn)) {
+            server = raft_find_new_server(raft, &conn->sid);
+            if (server) {
+                /* We only have one incoming connection from joining servers,
+                 * so if it's closed, we need to destroy the record about the
+                 * server.  This way the process can be started over on the
+                 * next join request. */
+                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+                VLOG_INFO_RL(&rl, "cluster "CID_FMT": server %s (%s) "
+                                  "disconnected while joining",
+                                  CID_ARGS(&raft->cid),
+                                  server->nickname, server->address);
+                hmap_remove(&raft->add_servers, &server->hmap_node);
+                raft_server_destroy(server);
+            }
             raft->n_disconnections++;
             raft_conn_close(conn);
         }
     }

     /* Open needed sessions. */
-    struct raft_server *server;
     HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
         raft_open_conn(raft, server->address, &server->sid);
     }
@@ -3352,12 +3376,6 @@ raft_find_peer(struct raft *raft, const struct uuid *uuid)
     return s && !uuid_equals(&raft->sid, &s->sid) ? s : NULL;
 }

-static struct raft_server *
-raft_find_new_server(struct raft *raft, const struct uuid *uuid)
-{
-    return raft_server_find(&raft->add_servers, uuid);
-}
-
 /* Figure 3.1: "If there exists an N such that N > commitIndex, a
  * majority of matchIndex[i] >= N, and log[N].term == currentTerm, set
  * commitIndex = N (sections 3.5 and 3.6)." */
@@ -4132,6 +4150,10 @@ static void
 raft_handle_install_snapshot_request(
     struct raft *raft, const struct raft_install_snapshot_request *rq)
 {
+    if (failure_test == FT_CRASH_BEFORE_SEND_SNAPSHOT_REP) {
+        ovs_fatal(0, "Raft test: crash before sending install_snapshot_reply");
+    }
+
     if (raft_handle_install_snapshot_request__(raft, rq)) {
         union raft_rpc rpy = {
             .install_snapshot_reply = {
@@ -4926,6 +4948,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED,
         failure_test = FT_CRASH_AFTER_SEND_EXEC_REQ;
     } else if (!strcmp(test, "crash-after-receiving-append-request-update")) {
         failure_test = FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE;
+    } else if (!strcmp(test, "crash-before-sending-install-snapshot-reply")) {
+        failure_test = FT_CRASH_BEFORE_SEND_SNAPSHOT_REP;
     } else if (!strcmp(test, "delay-election")) {
         failure_test = FT_DELAY_ELECTION;
         struct raft *raft;
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index fc6253cfe9..07af1160fc 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -400,6 +400,61 @@
 done

 AT_CLEANUP

+AT_BANNER([OVSDB - cluster failure while joining])
+AT_SETUP([OVSDB cluster - follower crash while joining])
+AT_KEYWORDS([ovsdb server negative unix cluster join])
+
+n=3
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+ordinal_schema > schema
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl
+          $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
+cid=`ovsdb-tool db-cid s1.db`
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+for i in `seq 2 $n`; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill `cat *.pid`'
+
+dnl Starting followers first, so we can configure them to crash on join.
+for j in `seq $n`; do
+    i=$(($n + 1 - $j))
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off dnl
+              --detach --no-chdir --log-file=s$i.log dnl
+              --pidfile=s$i.pid --unixctl=s$i dnl
+              --remote=punix:s$i.ovsdb s$i.db])
+    if test $i != 1; then
+        OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s$i dnl
+                        cluster/failure-test crash-before-sending-install-snapshot-reply dnl
+                        | grep -q "engaged"])
+    fi
+done
+
+dnl Make sure that followers really crashed.
+for i in `seq 2 $n`; do
+    OVS_WAIT_WHILE([test -s s$i.pid])
+done
+
+dnl Bring them back.
+for i in `seq 2 $n`; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off dnl
+              --detach --no-chdir --log-file=s$i.log dnl
+              --pidfile=s$i.pid --unixctl=s$i dnl
+              --remote=punix:s$i.ovsdb s$i.db])
+done
+
+dnl Make sure that all servers joined the cluster.
+for i in `seq $n`; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+for i in `seq $n`; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
+done
+
+AT_CLEANUP
+
 ^L
 OVS_START_SHELL_HELPERS
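For reference, the new autotest above can be exercised from an OVS source tree with the usual testsuite keyword selection. A sketch; the keyword is taken from the AT_KEYWORDS line in the patch, and the exact invocation may vary with your build setup:

make check TESTSUITEFLAGS='-k join'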
Since the problem described in this bug report should be resolved in a recent advisory, it has been closed with a resolution of ERRATA.

For information on the advisory (openvswitch2.16 bug fix and enhancement update), and where to find the updated files, follow the link below.

If the solution does not work for you, open a new bug report.

https://access.redhat.com/errata/RHBA-2022:1525
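If you need to confirm that a given cluster actually carries the fix, one possible spot check is to compare the openvswitch2.16 build inside the database container against the fixed version noted above. The pod name below is just the one from this report, and the query assumes rpm is available in the nbdb image:

oc exec -n openshift-ovn-kubernetes ovnkube-master-tcgcc -c nbdb -- rpm -q openvswitch2.16
# Expect openvswitch2.16-2.16.0-57.el8fdp or later.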