Ceph health goes to WARN state on an ODF 4.14.7 cluster because one of the daemons is crashing.

ODF version: 4.14.7
OCP version: 4.14.24

[root@rdr-odf4147-bastion-0 scripts]# oc get csv -A
NAMESPACE   NAME   DISPLAY   VERSION   REPLACES   PHASE
openshift-local-storage   local-storage-operator.v4.14.0-202404030309   Local Storage   4.14.0-202404030309      Succeeded
openshift-operator-lifecycle-manager   packageserver   Package Server   0.0.1-snapshot      Succeeded
openshift-storage   mcg-operator.v4.14.7-rhodf   NooBaa Operator   4.14.7-rhodf   mcg-operator.v4.14.6-rhodf   Succeeded
openshift-storage   ocs-operator.v4.14.7-rhodf   OpenShift Container Storage   4.14.7-rhodf   ocs-operator.v4.14.6-rhodf   Succeeded
openshift-storage   odf-csi-addons-operator.v4.14.7-rhodf   CSI Addons   4.14.7-rhodf   odf-csi-addons-operator.v4.14.6-rhodf   Succeeded
openshift-storage   odf-operator.v4.14.7-rhodf   OpenShift Data Foundation   4.14.7-rhodf   odf-operator.v4.14.6-rhodf   Succeeded

[root@rdr-odf4147-bastion-0 scripts]# oc get pods -n openshift-storage
NAME   READY   STATUS   RESTARTS   AGE
csi-addons-controller-manager-689f68486f-5ds88   2/2   Running   0   127m
csi-cephfsplugin-mc47p   2/2   Running   0   4h23m
csi-cephfsplugin-mjhmd   2/2   Running   0   4h23m
csi-cephfsplugin-provisioner-7474765988-52zdq   5/5   Running   0   4h23m
csi-cephfsplugin-provisioner-7474765988-zzkcn   5/5   Running   0   4h23m
csi-cephfsplugin-vvtjr   2/2   Running   0   4h23m
csi-nfsplugin-bxqhm   2/2   Running   0   175m
csi-nfsplugin-jfmld   2/2   Running   0   175m
csi-nfsplugin-provisioner-6c874556f8-5qcgx   5/5   Running   0   175m
csi-nfsplugin-provisioner-6c874556f8-wjtpr   5/5   Running   0   175m
csi-nfsplugin-vxtwm   2/2   Running   0   175m
csi-rbdplugin-lk5sq   3/3   Running   0   4h23m
csi-rbdplugin-provisioner-54464d46f-d6lvz   6/6   Running   0   4h23m
csi-rbdplugin-provisioner-54464d46f-rjcxb   6/6   Running   0   4h23m
csi-rbdplugin-rwpzt   3/3   Running   0   4h23m
csi-rbdplugin-z44b4   3/3   Running   0   4h23m
noobaa-core-0   1/1   Running   0   4h20m
noobaa-db-pg-0   1/1   Running   0   4h20m
noobaa-endpoint-5c5cb4465f-7lqbs   1/1   Running   0   4h19m
noobaa-operator-7d69ff9855-czxql   2/2   Running   0   4h25m
ocs-metrics-exporter-67b9f9855d-w4f5r   1/1   Running   0   4h25m
ocs-operator-c8d7c579f-8sv89   1/1   Running   0   4h25m
odf-console-7888dd6746-c2rdj   1/1   Running   0   4h25m
odf-operator-controller-manager-7749cdb995-hj4px   2/2   Running   0   4h25m
rook-ceph-crashcollector-worker-0-648f4b8788-wxgxd   1/1   Running   0   4h21m
rook-ceph-crashcollector-worker-1-646b58c45f-lzzjz   1/1   Running   0   4h21m
rook-ceph-crashcollector-worker-2-754dbbbfd6-988sn   1/1   Running   0   4h20m
rook-ceph-exporter-worker-0-6b555f4675-vtkrf   1/1   Running   0   4h21m
rook-ceph-exporter-worker-1-6f9d4c69c6-gxscj   1/1   Running   0   4h20m
rook-ceph-exporter-worker-2-54fcd47ccc-dz49p   1/1   Running   0   4h20m
rook-ceph-mds-ocs-storagecluster-cephfilesystem-a-59b657c5n6w8w   2/2   Running   0   4h21m
rook-ceph-mds-ocs-storagecluster-cephfilesystem-b-86bdff9c6rtsp   2/2   Running   0   4h21m
rook-ceph-mgr-a-664998bd55-brfhq   2/2   Running   0   4h21m
rook-ceph-mon-a-78974b5876-f9bn4   2/2   Running   0   4h23m
rook-ceph-mon-b-595cf6c7dd-n9pxk   2/2   Running   0   4h22m
rook-ceph-mon-c-7d677bc87c-cvlqr   2/2   Running   0   4h22m
rook-ceph-nfs-ocs-storagecluster-cephnfs-a-749b7554b9-svlfb   0/2   Pending   0   175m
rook-ceph-operator-857ccc7545-4mxqb   1/1   Running   0   175m
rook-ceph-osd-0-5fd64ff975-ptnzw   2/2   Running   0   4h21m
rook-ceph-osd-1-cf445674b-lzp4q   2/2   Running   0   4h21m
rook-ceph-osd-2-8959656bc-fbxtt   2/2   Running   1 (3h11m ago)   4h21m
rook-ceph-osd-prepare-5895382c9fad168658629556bc2357ec-ql2xb   0/1   Completed   0   4h21m
rook-ceph-osd-prepare-844e4607303c606cfbab4627d669ad7d-2hr8z   0/1   Completed   0   4h21m
rook-ceph-osd-prepare-d28e28ab841253a7fabb443ba93df7e4-b5z9w   0/1   Completed   0   4h21m
rook-ceph-rgw-ocs-storagecluster-cephobjectstore-a-c6df4b4jmsz9   2/2   Running   0   4h20m
rook-ceph-tools-6cb655c7d-nbm9n   1/1   Running   0   4h21m
ux-backend-server-996cffddb-gpznx   2/2   Running   0   4h25m

[root@rdr-odf4147-bastion-0 scripts]# oc -n openshift-storage rsh rook-ceph-tools-6cb655c7d-nbm9n
sh-5.1$ ceph -s
  cluster:
    id:     98eb0784-a614-4a72-894f-5dbd22943b5c
    health: HEALTH_WARN
            1 daemons have recently crashed

  services:
    mon: 3 daemons, quorum a,b,c (age 2h)
    mgr: a(active, since 4h)
    mds: 1/1 daemons up, 1 hot standby
    osd: 3 osds: 3 up (since 3h), 3 in (since 4h)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   13 pools, 201 pgs
    objects: 1.46k objects, 3.6 GiB
    usage:   10 GiB used, 1.5 TiB / 1.5 TiB avail
    pgs:     201 active+clean

  io:
    client: 11 KiB/s rd, 16 KiB/s wr, 13 op/s rd, 2 op/s wr

sh-5.1$ ceph crash ls
ID   ENTITY   NEW
2024-05-17T13:21:56.069393Z_4c403fa3-d584-4cf0-9d01-ccbcdcb8f889   osd.2   *
sh-5.1$

We have tried this on multiple clusters, and every time Ceph health goes to a warning state because a daemon crashes. It can be cleared by executing `ceph crash archive <crash-id>`, but the issue reproduces consistently.
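For reference, the crash-archive workaround is run from the rook-ceph toolbox pod; a minimal sketch is below (the toolbox pod name differs per cluster, and the `app=rook-ceph-tools` label selector is assumed to be the default):

    # from the bastion, open a shell in the rook-ceph toolbox pod
    oc -n openshift-storage rsh $(oc -n openshift-storage get pod -l app=rook-ceph-tools -o name)

    # inside the toolbox: list recent crashes, then acknowledge them
    ceph crash ls                    # shows crash IDs and the daemon (ENTITY) that crashed
    ceph crash archive <crash-id>    # archive one crash so the HEALTH_WARN clears
    ceph crash archive-all           # or archive every unarchived crash at once

This only clears the warning; the underlying OSD crash still needs root-causing.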
ODF must-gather: https://drive.google.com/file/d/1I0lSKhkyFaoydWZywD8ny-AVDQKV4V6A/view?usp=sharing
We tried one more setup, and there too Ceph health went to WARN state.

[root@zstrm4147-36f2-bastion-0 ~]# oc get cephcluster -n openshift-storage
NAME   DATADIRHOSTPATH   MONCOUNT   AGE   PHASE   MESSAGE   HEALTH   EXTERNAL   FSID
ocs-storagecluster-cephcluster   /var/lib/rook   3   20h   Ready   Cluster created successfully   HEALTH_WARN      9906c940-e3f4-4e54-a3b3-344f6bba0e44

[root@zstrm4147-36f2-bastion-0 ~]# oc project openshift-storage
Now using project "openshift-storage" on server "https://api.zstrm4147-36f2.redhat.com:6443".

[root@zstrm4147-36f2-bastion-0 ~]# oc get pods
NAME   READY   STATUS   RESTARTS   AGE
csi-addons-controller-manager-996645669-wdkgh   2/2   Running   0   6h44m
csi-cephfsplugin-7wwx6   2/2   Running   0   20h
csi-cephfsplugin-ntd5f   2/2   Running   0   20h
csi-cephfsplugin-provisioner-689d6c55c8-87m25   5/5   Running   0   20h
csi-cephfsplugin-provisioner-689d6c55c8-pd755   5/5   Running   0   20h
csi-cephfsplugin-x5nj4   2/2   Running   0   20h
csi-rbdplugin-7x8xr   3/3   Running   0   20h
csi-rbdplugin-b5vbp   3/3   Running   0   20h
csi-rbdplugin-lz46c   3/3   Running   0   20h
csi-rbdplugin-provisioner-f55b6764c-bgfgh   6/6   Running   0   20h
csi-rbdplugin-provisioner-f55b6764c-kgwl4   6/6   Running   0   20h
noobaa-core-0   1/1   Running   0   13h
noobaa-db-pg-0   1/1   Running   0   15h
noobaa-endpoint-97d7869fc-lkm8z   1/1   Running   0   13h
noobaa-endpoint-97d7869fc-m668w   1/1   Running   0   13h
noobaa-operator-5b98688968-4m2gg   2/2   Running   0   20h
ocs-metrics-exporter-69fcf78f4f-mvq42   1/1   Running   1 (15h ago)   20h
ocs-operator-795d78977d-gmn9j   1/1   Running   0   20h
odf-console-665c9b8b89-hkj85   1/1   Running   0   20h
odf-operator-controller-manager-64cc78c7fb-vh8fl   2/2   Running   0   20h
rook-ceph-crashcollector-worker-0-648f4b8788-67qg4   1/1   Running   0   20h
rook-ceph-crashcollector-worker-1-58cf4949d5-5lm55   1/1   Running   0   20h
rook-ceph-crashcollector-worker-2-754dbbbfd6-nrst5   1/1   Running   0   20h
rook-ceph-exporter-worker-0-6b555f4675-kqnlc   1/1   Running   0   20h
rook-ceph-exporter-worker-1-6896d48494-r5bvf   1/1   Running   0   20h
rook-ceph-exporter-worker-2-54fcd47ccc-kwdzc   1/1   Running   0   20h
rook-ceph-mds-ocs-storagecluster-cephfilesystem-a-6fd84d547s78s   2/2   Running   0   20h
rook-ceph-mds-ocs-storagecluster-cephfilesystem-b-5f4bd954r2sph   2/2   Running   0   20h
rook-ceph-mgr-a-5f88d4d8c8-sz66q   2/2   Running   0   20h
rook-ceph-mon-a-6bcf64f9c-bf4m2   2/2   Running   0   20h
rook-ceph-mon-b-84c747557b-g4pr9   2/2   Running   0   20h
rook-ceph-mon-c-59d8bdffd6-kgb54   2/2   Running   0   20h
rook-ceph-operator-55f5b798c6-nq6sk   1/1   Running   0   12h
rook-ceph-osd-0-64cc857d47-4bxr7   2/2   Running   30 (7h32m ago)   20h
rook-ceph-osd-1-76b9bd4f88-5xdvh   2/2   Running   28 (7h31m ago)   20h
rook-ceph-osd-2-5bdfc884f6-9zbdr   2/2   Running   1 (17h ago)   20h
rook-ceph-osd-prepare-0585316d9c09d657f184835f73eeb702-bx8qd   0/1   Completed   0   20h
rook-ceph-osd-prepare-0f8262ce950db13b819688fa5932c519-smfbv   0/1   Completed   0   20h
rook-ceph-osd-prepare-bd463f52c4ffcb0dc0c102d520b83a3c-xzglp   0/1   Completed   0   20h
rook-ceph-rgw-ocs-storagecluster-cephobjectstore-a-7494b7dcjqtx   2/2   Running   0   20h
rook-ceph-tools-6cb655c7d-p7z4c   1/1   Running   0   17h
s3cli-0   1/1   Running   0   16h
ux-backend-server-664946c799-89nz2   2/2   Running   0   20h

[root@zstrm4147-36f2-bastion-0 ~]# cd /home/pooja/
[root@zstrm4147-36f2-bastion-0 pooja]# cd ocs-upi-kvm/scripts/
[root@zstrm4147-36f2-bastion-0 scripts]# oc rsh rook-ceph-tools-6cb655c7d-p7z4c
sh-5.1$ ceph -s
  cluster:
    id:     9906c940-e3f4-4e54-a3b3-344f6bba0e44
    health: HEALTH_WARN
            58 daemons have recently crashed

  services:
    mon: 3 daemons, quorum a,b,c (age 20h)
    mgr: a(active, since 20h)
    mds: 1/1 daemons up, 1 hot standby
    osd: 3 osds: 3 up (since 7h), 3 in (since 20h)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   13 pools, 201 pgs
    objects: 856 objects, 678 MiB
    usage:   3.5 GiB used, 1.5 TiB / 1.5 TiB avail
    pgs:     201 active+clean

  io:
    client: 853 B/s rd, 5.0 KiB/s wr, 1 op/s rd, 0 op/s wr

sh-5.1$ ceph crash ls
ID   ENTITY   NEW
2024-05-20T12:07:26.947280Z_98530e32-dee6-4f66-8ae8-85114fd35d97   osd.2
2024-05-20T19:29:24.758645Z_51f769e0-ef71-48b7-a608-2d2e412cbcf4   osd.0   *
2024-05-20T19:29:56.529526Z_ab37a6e2-4fff-4246-87a2-0d4429ea051a   osd.0   *
2024-05-20T19:30:27.156872Z_d501478f-18f6-40fd-af89-fc5223c8b7de   osd.0   *
2024-05-20T19:31:02.783288Z_bbaae5cc-b674-44a3-b79e-5f7a95974afd   osd.0   *
2024-05-20T19:31:58.454879Z_3c7771ab-3077-489f-8064-7430cb01b02d   osd.0   *
2024-05-20T19:33:44.627129Z_a5820177-feb5-438f-8999-3ccc8543e1f7   osd.0   *
2024-05-20T19:36:38.968833Z_37404201-9e3f-4224-8c2a-afe1165ea4bc   osd.0   *
2024-05-20T19:40:59.824158Z_e0ae9b15-4a53-48a6-9425-46dfd392af21   osd.1   *
2024-05-20T19:41:35.736050Z_b5840db9-a431-4391-9af4-ee7147f4cef2   osd.1   *
2024-05-20T19:42:05.208379Z_db6877b4-e9dc-47fe-ba48-76ecb192de62   osd.1   *
2024-05-20T19:42:14.802140Z_a2ea96dc-f475-4107-82f4-404f42f553d0   osd.0   *
2024-05-20T19:42:48.624680Z_cbc0122d-692c-4fe6-812c-4a90dc902390   osd.1   *
2024-05-20T19:43:53.141545Z_e8e4ce86-ea47-4050-9132-989dde758748   osd.1   *
2024-05-20T19:45:26.955786Z_1f86e7e5-15d3-4557-b0df-1a13f4b7937c   osd.1   *
2024-05-20T19:47:39.146466Z_3af69261-44fd-49f4-ac13-31e6b9c42121   osd.0   *
2024-05-20T19:48:26.775745Z_0fa4c6b2-e50a-4c68-aff3-1e8499ea9222   osd.1   *
2024-05-20T19:53:02.656173Z_ebf81eb7-5d42-463e-bdf2-d0c088e85e89   osd.0   *
2024-05-20T19:53:45.443418Z_3db24034-3a78-4564-90e8-4900f91ce3ff   osd.1   *
2024-05-20T19:58:24.668759Z_39db852a-c778-4290-84b9-520cf6aafd8c   osd.0   *
2024-05-20T19:58:59.753079Z_afddde37-7e3b-405c-9772-6b480e3c0240   osd.1   *
2024-05-20T20:03:37.768719Z_98f0deb6-f553-4076-ab9b-f6d1380af308   osd.0   *
2024-05-20T20:04:19.318720Z_a5b37a47-8466-4a6e-81f5-244c26b61edd   osd.1   *
2024-05-20T20:08:58.540910Z_4aca0122-c6c4-46db-bceb-eeae19c3e3b8   osd.0   *
2024-05-20T20:09:38.966684Z_fd180a49-d16b-4542-9f92-2bc4ccd3ad38   osd.1   *
2024-05-20T20:14:12.311070Z_b56f8463-90b8-47ff-bd7c-d64be0ab8609   osd.0   *
2024-05-20T20:14:53.449849Z_d5321e15-ba36-44fe-82be-56c50ac036fe   osd.1   *
2024-05-20T20:19:36.326738Z_820b5d6f-a28e-49b3-ad48-1387842449c3   osd.0   *
2024-05-20T20:20:11.271711Z_382ec56e-895b-427f-8b12-87672b053f8b   osd.1   *
2024-05-20T20:25:01.272700Z_36353d4b-6809-49e5-83dd-97c5102c60d2   osd.0   *
2024-05-20T20:25:28.751136Z_03d53a79-afa0-47b0-a219-87712abb4932   osd.1   *
2024-05-20T20:30:13.421988Z_49ddacb3-a129-4ccc-8fa0-81d0ea4f3ae1   osd.0   *
2024-05-20T20:30:52.083514Z_6937807e-7ac0-4e84-a097-42c7347b8137   osd.1   *
2024-05-20T20:35:31.517824Z_afd13a7a-9362-4612-9b63-04a742dc1989   osd.0   *
2024-05-20T20:36:12.511203Z_f2a690f7-4006-495d-a9cb-153e1150409f   osd.1   *
2024-05-20T20:40:44.987849Z_a71249c6-c0e5-4aaf-a756-81a8de4d03c0   osd.0   *
2024-05-20T20:41:32.120357Z_8fefda75-2160-42f1-9bfb-42ce5b8741f1   osd.1   *
2024-05-20T20:46:15.092657Z_f5b52130-f3df-4103-a181-aef834dbb4e1   osd.0   *
2024-05-20T20:46:48.165170Z_b9068456-6f1e-49fa-9b80-2d40dd0429ec   osd.1   *
2024-05-20T20:51:29.307286Z_6e186a56-aaf6-4677-a970-df992a9bb726   osd.0   *
2024-05-20T20:52:06.096301Z_6cce959e-4bea-4a41-b34e-87eb524d46c3   osd.1   *
2024-05-20T20:56:50.302435Z_0369e319-5a9b-4eed-a025-584414f02151   osd.0   *
2024-05-20T20:57:22.233310Z_514d5bfa-4073-47ba-a824-3e7b2f309bd2   osd.1   *
2024-05-20T21:02:03.688782Z_d40b44a0-f706-4a1e-963e-216d3432b6b5   osd.0   *
2024-05-20T21:02:33.969822Z_5c580fac-dbf6-456c-9cf1-101f45ea37cc   osd.1   *
2024-05-20T21:07:21.407423Z_00e78631-5422-4a74-8060-f4c607d71dec   osd.0   *
2024-05-20T21:07:54.163055Z_6dfeb37d-4c60-49c9-8f06-35589de52070   osd.1   *
2024-05-20T21:12:36.502645Z_497725a3-7df9-464b-8441-8ceeef49b51c   osd.0   *
2024-05-20T21:13:11.475367Z_8058c5a0-7d02-4dab-98ba-222b0815db58   osd.1   *
2024-05-20T21:17:54.499438Z_7db2a5f1-8746-41e9-9309-c21ffea517eb   osd.0   *
2024-05-20T21:18:30.103770Z_bac0bed5-0518-4355-bdbc-d52ea8132861   osd.1   *
2024-05-20T21:23:17.100344Z_6ebfbc60-0c5a-4db0-937f-debbb4e608cf   osd.0   *
2024-05-20T21:23:51.755475Z_753b19d2-9bba-4e3a-a115-6f6d60dd3a75   osd.1   *
2024-05-20T21:28:29.614448Z_07d0b9a1-8f1a-4d41-8a3c-08176bfb05da   osd.0   *
2024-05-20T21:29:09.063590Z_34ed321b-6643-4f3b-8b0f-be400c733ab3   osd.1   *
2024-05-20T21:33:49.817150Z_c46df897-a75f-416e-bb78-52e453b61bcc   osd.0   *
2024-05-20T21:34:32.385479Z_833b12b2-8af2-419c-aed3-ad442f92f5f5   osd.1   *
2024-05-20T21:39:03.385938Z_95a4db27-ae84-4e9d-b57c-c1cd5e0ce256   osd.0   *
2024-05-20T21:39:47.288190Z_8f0d4fc2-3c54-4c96-961e-f0baabc310ee   osd.1   *
sh-5.1$
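To see what osd.0 and osd.1 are actually hitting, the backtrace can be pulled from the crash module inside the toolbox; a minimal sketch (the crash ID below is just one entry taken from the list above):

    # full crash metadata for one entry, including the assert message, backtrace, and ceph version
    ceph crash info 2024-05-20T19:29:24.758645Z_51f769e0-ef71-48b7-a608-2d2e412cbcf4

    # quick count of crashes per daemon, to confirm only the OSDs are affected
    ceph crash ls | tail -n +2 | awk '{print $2}' | sort | uniq -c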
Sunil, we are not facing this issue on ODF 4.16. We have created this BZ for ODF 4.14.7.
We haven't noticed the issue on ODF 4.14.6, ODF 4.15.2, or ODF 4.16.0.
This issue is seen only on ODF 4.14.7; we have tried multiple clusters and it happens every time.
We tested again on 4.14.6 and did not hit this issue; Ceph health is in OK state.
With a custom build of ODF 4.14.7 that carries rhceph 6.1z4, we did not face any issues while running the test cases. Ceph health is OK and all pods are running.
We re-tested ODF 4.15.2 and it also has the same issue that we see on ODF 4.14.7.
I ran the tier1 tests on ODF 4.14.7 after excluding the test_selinux_relabel_for_existing_pvc[5] test case and setting the debug log level to 20. Here is the must-gather for that run: https://drive.google.com/file/d/12coLuTEAZVhI45ZRL6VS4bS06eg4SDKR/view?usp=drive_link
Blaine, I created a fresh setup for ODF 4.14.7 and, after setting the log level to 20, ran the tier1 tests with the selinux test case excluded. While the tests were running, Ceph health went to ERROR state. I have collected a must-gather and core dumps from all three worker nodes.

Regarding your question:
> Did the issue reproduce?
We are hitting this issue every time.

The must-gather contains coredumps from worker-1 and worker-2, but not from worker-0.

[root@rdr-odf4147-bastion-0 ~]# cd odf4147/quay-io-rhceph-dev-ocs-must-gather-sha256-3e50350c978708240e3a3d8e1054c752ce9637e3222de5eb93145a2cac4f8ac8/
[root@rdr-odf4147-bastion-0 quay-io-rhceph-dev-ocs-must-gather-sha256-3e50350c978708240e3a3d8e1054c752ce9637e3222de5eb93145a2cac4f8ac8]# ls
ceph  cluster-scoped-resources  event-filter.html  gather-debug.log  namespaces  noobaa  timestamp
[root@rdr-odf4147-bastion-0 quay-io-rhceph-dev-ocs-must-gather-sha256-3e50350c978708240e3a3d8e1054c752ce9637e3222de5eb93145a2cac4f8ac8]# cd ceph/
[root@rdr-odf4147-bastion-0 ceph]# ls
ceph_daemon_log_worker-1  coredump_worker-1  crash_worker-1  event-filter.html  journal_worker-2  kernel_worker-2  must_gather_commands  namespaces
ceph_daemon_log_worker-2  coredump_worker-2  crash_worker-2  journal_worker-1   kernel_worker-1   logs             must_gather_commands_json_output  timestamp

Must-gather logs: https://drive.google.com/file/d/13PeLkpFSLj9NIx07upX4cJH8qHEpVUxe/view?usp=sharing

I will attach the core dumps and the tier1 log file as well.
Coredumps:
worker-0: https://drive.google.com/file/d/1uHu5D9sCpwHZInH1Iz7C95gGzFFpFkmT/view?usp=sharing
worker-1: https://drive.google.com/file/d/1b2Q_o_f_cLMJxhBMw9gV5UAPjhOXHGa6/view?usp=sharing
worker-2: https://drive.google.com/file/d/1k0NiOSyUfZRuUVsKRiaVEnX-OY326ozy/view?usp=sharing
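In case it helps to reproduce the collection steps: the coredumps above were pulled from the worker nodes directly. A rough sketch, assuming systemd-coredump is recording the OSD crashes on the RHCOS nodes (on this setup the cores may have been gathered differently, and the output path is illustrative):

    # open a debug shell on the node and switch to the host root
    oc debug node/worker-0
    chroot /host

    # list cores recorded for the OSD binary, then export the newest one
    coredumpctl list ceph-osd
    coredumpctl dump ceph-osd --output=/tmp/ceph-osd.core

    # copy /tmp/ceph-osd.core off the node (e.g. scp from the bastion) before it is cleaned up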
[root@rdr-odf4147-bastion-0 scripts]# oc get csv -A
NAMESPACE   NAME   DISPLAY   VERSION   REPLACES   PHASE
openshift-local-storage   local-storage-operator.v4.14.0-202404030309   Local Storage   4.14.0-202404030309      Succeeded
openshift-operator-lifecycle-manager   packageserver   Package Server   0.0.1-snapshot      Succeeded
openshift-storage   mcg-operator.v4.14.7-rhodf   NooBaa Operator   4.14.7-rhodf   mcg-operator.v4.14.6-rhodf   Succeeded
openshift-storage   ocs-operator.v4.14.7-rhodf   OpenShift Container Storage   4.14.7-rhodf   ocs-operator.v4.14.6-rhodf   Succeeded
openshift-storage   odf-csi-addons-operator.v4.14.7-rhodf   CSI Addons   4.14.7-rhodf   odf-csi-addons-operator.v4.14.6-rhodf   Succeeded
openshift-storage   odf-operator.v4.14.7-rhodf   OpenShift Data Foundation   4.14.7-rhodf   odf-operator.v4.14.6-rhodf   Succeeded

[root@rdr-odf4147-bastion-0 scripts]# oc get cephcluster -n openshift-storage
NAME   DATADIRHOSTPATH   MONCOUNT   AGE   PHASE   MESSAGE   HEALTH   EXTERNAL   FSID
ocs-storagecluster-cephcluster   /var/lib/rook   3   10h   Ready   Cluster created successfully   HEALTH_ERR      5af86b47-4a7a-49bf-afc5-9427b31af0f7

[root@rdr-odf4147-bastion-0 scripts]# oc get pods -n openshift-storage
NAME   READY   STATUS   RESTARTS   AGE
csi-addons-controller-manager-7f6c8f9bb5-qprmg   2/2   Running   0   97m
csi-cephfsplugin-22h8v   2/2   Running   0   10h
csi-cephfsplugin-provisioner-569f87bb57-4sppw   5/5   Running   0   10h
csi-cephfsplugin-provisioner-569f87bb57-bmjc8   5/5   Running   0   10h
csi-cephfsplugin-pvmk6   2/2   Running   0   10h
csi-cephfsplugin-x8xjj   2/2   Running   0   10h
csi-nfsplugin-d6sdj   2/2   Running   0   7h6m
csi-nfsplugin-dh9hh   2/2   Running   0   7h6m
csi-nfsplugin-h8kml   2/2   Running   0   7h6m
csi-nfsplugin-provisioner-6c874556f8-8dc6k   5/5   Running   0   7h6m
csi-nfsplugin-provisioner-6c874556f8-c7qzm   5/5   Running   0   7h6m
csi-rbdplugin-j9ssv   3/3   Running   0   10h
csi-rbdplugin-p9vdq   3/3   Running   0   10h
csi-rbdplugin-provisioner-745c4d4c9d-26lwg   6/6   Running   0   10h
csi-rbdplugin-provisioner-745c4d4c9d-qwmm8   6/6   Running   0   10h
csi-rbdplugin-vr2dj   3/3   Running   0   10h
noobaa-core-0   1/1   Running   0   7h5m
noobaa-db-pg-0   1/1   Running   0   7h5m
noobaa-endpoint-6b95b7db94-cg77r   1/1   Running   0   7h6m
noobaa-endpoint-6b95b7db94-xbxdq   1/1   Running   0   8h
noobaa-operator-7d98d9bcd5-dpdgp   2/2   Running   0   10h
ocs-metrics-exporter-67b9f9855d-2tpks   1/1   Running   0   10h
ocs-operator-c8d7c579f-5ds49   1/1   Running   0   10h
odf-console-7888dd6746-mhl6w   1/1   Running   0   10h
odf-operator-controller-manager-7749cdb995-dhd7l   2/2   Running   0   10h
rook-ceph-crashcollector-worker-0-648f4b8788-xkwvs   1/1   Running   0   10h
rook-ceph-crashcollector-worker-1-646b58c45f-d9jjg   1/1   Running   0   10h
rook-ceph-crashcollector-worker-2-69f5449956-vkrjc   1/1   Running   0   10h
rook-ceph-exporter-worker-0-6b555f4675-9jhv4   1/1   Running   0   10h
rook-ceph-exporter-worker-1-6f9d4c69c6-5jmvw   1/1   Running   0   10h
rook-ceph-exporter-worker-2-c45956ff5-6fb75   1/1   Running   0   10h
rook-ceph-mds-ocs-storagecluster-cephfilesystem-a-5dbf95fcl8szg   2/2   Running   0   10h
rook-ceph-mds-ocs-storagecluster-cephfilesystem-b-cdf8f98flckv7   2/2   Running   0   10h
rook-ceph-mgr-a-6c49b69f56-sgc8w   2/2   Running   0   10h
rook-ceph-mon-a-c8855ffbc-qchzx   2/2   Running   0   10h
rook-ceph-mon-b-564f768678-cl6kd   2/2   Running   0   10h
rook-ceph-mon-c-798d6866b-lftwj   2/2   Running   0   10h
rook-ceph-nfs-ocs-storagecluster-cephnfs-a-749b7554b9-trmtp   2/2   Running   0   7h6m
rook-ceph-operator-857ccc7545-7w4h8   1/1   Running   0   7h6m
rook-ceph-osd-0-85d8855d4c-bzlwc   2/2   Running   21 (153m ago)   10h
rook-ceph-osd-1-7f86b5dd68-92htf   2/2   Running   21 (154m ago)   10h
rook-ceph-osd-2-5f66dc8fff-wznr9   2/2   Running   7 (4h6m ago)   10h
rook-ceph-osd-prepare-1c57881a92d512679897c83afc0a1402-9lgc6   0/1   Completed   0   10h
rook-ceph-osd-prepare-80aac1814cd9db2201a61815118b2870-s6h4m   0/1   Completed   0   10h
rook-ceph-osd-prepare-bec33bac20067cf4b8111ca5cf53d962-wj4kz   0/1   Completed   0   10h
rook-ceph-rgw-ocs-storagecluster-cephobjectstore-a-f56d999qkbvv   2/2   Running   0   10h
rook-ceph-tools-6cb655c7d-t8wrd   1/1   Running   0   10h
ux-backend-server-996cffddb-f44jg   2/2   Running   0   10h

[root@rdr-odf4147-bastion-0 scripts]# oc -n openshift-storage rsh rook-ceph-tools-6cb655c7d-t8wrd
sh-5.1$ ceph -s
  cluster:
    id:     5af86b47-4a7a-49bf-afc5-9427b31af0f7
    health: HEALTH_ERR
            1 MDSs report slow metadata IOs
            2 MDSs behind on trimming
            3/3361 objects unfound (0.089%)
            Possible data damage: 3 pgs recovery_unfound
            Degraded data redundancy: 9/10083 objects degraded (0.089%), 3 pgs degraded, 1 pg undersized
            49 daemons have recently crashed
            5 slow ops, oldest one blocked for 8991 sec, osd.2 has slow ops

  services:
    mon: 3 daemons, quorum a,b,c (age 10h)
    mgr: a(active, since 10h)
    mds: 1/1 daemons up, 1 hot standby
    osd: 3 osds: 3 up (since 2h), 3 in (since 10h); 1 remapped pgs
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   13 pools, 201 pgs
    objects: 3.36k objects, 7.9 GiB
    usage:   24 GiB used, 1.4 TiB / 1.5 TiB avail
    pgs:     9/10083 objects degraded (0.089%)
             3/3361 objects unfound (0.089%)
             198 active+clean
             2   active+recovery_unfound+degraded
             1   active+recovery_unfound+undersized+degraded+remapped

  io:
    client: 5.0 KiB/s rd, 5.5 KiB/s wr, 5 op/s rd, 3 op/s wr

sh-5.1$
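Since this run also hit recovery_unfound, it may be worth capturing which PGs and objects are affected before the state changes again; a minimal sketch from the toolbox (the PG ID is a placeholder to be taken from `ceph health detail`):

    ceph health detail              # lists the 3 PGs reported as recovery_unfound
    ceph pg <pgid> list_unfound     # names the unfound objects and which OSDs have been queried
    ceph pg <pgid> query            # full peering/recovery state for that PG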
> Could you retest this with `debug_osd = 20` and `debug_bluestore = 20` both? Please capture the m-g and coredumps again as well.

Setting `debug_osd = 20` and `debug_bluestore = 20`:

[root@rdr-odf414sel-bastion-0 ~]# oc -n openshift-storage rsh rook-ceph-tools-6cb655c7d-mfmt5
sh-5.1$ ceph -s
  cluster:
    id:     3f44f905-74ed-456e-8288-0cea8e7d0c93
    health: HEALTH_OK

  services:
    mon: 3 daemons, quorum a,b,c (age 10h)
    mgr: a(active, since 10h)
    mds: 1/1 daemons up, 1 hot standby
    osd: 3 osds: 3 up (since 10h), 3 in (since 10h)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   12 pools, 169 pgs
    objects: 2.60k objects, 7.9 GiB
    usage:   24 GiB used, 1.4 TiB / 1.5 TiB avail
    pgs:     169 active+clean

  io:
    client: 853 B/s rd, 16 KiB/s wr, 1 op/s rd, 1 op/s wr

sh-5.1$ ceph tell osd.* config get debug_osd
osd.0: { "debug_osd": "1/5" }
osd.1: { "debug_osd": "1/5" }
osd.2: { "debug_osd": "1/5" }
sh-5.1$ ceph tell osd.* config set debug_osd 20
osd.0: { "success": "" }
osd.1: { "success": "" }
osd.2: { "success": "" }
sh-5.1$ ceph tell osd.* config get debug_osd
osd.0: { "debug_osd": "20/20" }
osd.1: { "debug_osd": "20/20" }
osd.2: { "debug_osd": "20/20" }
sh-5.1$ ceph tell osd.* config get debug_bluestore
osd.0: { "debug_bluestore": "1/5" }
osd.1: { "debug_bluestore": "1/5" }
osd.2: { "debug_bluestore": "1/5" }
sh-5.1$ ceph tell osd.* config set debug_bluestore 20
osd.0: { "success": "" }
osd.1: { "success": "" }
osd.2: { "success": "" }
sh-5.1$ ceph tell osd.* config get debug_bluestore
osd.0: { "debug_bluestore": "20/20" }
osd.1: { "debug_bluestore": "20/20" }
osd.2: { "debug_bluestore": "20/20" }
sh-5.1$ exit
exit

I executed the tier1 tests and Ceph health went to WARN state.

Ceph health status:

[root@rdr-odf414sel-bastion-0 ~]# oc -n openshift-storage rsh rook-ceph-tools-6cb655c7d-mfmt5
sh-5.1$ ceph -s
  cluster:
    id:     3f44f905-74ed-456e-8288-0cea8e7d0c93
    health: HEALTH_WARN
            1 daemons have recently crashed

  services:
    mon: 3 daemons, quorum a,b,c (age 13h)
    mgr: a(active, since 13h)
    mds: 1/1 daemons up, 1 hot standby
    osd: 3 osds: 3 up (since 2h), 3 in (since 13h)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   13 pools, 201 pgs
    objects: 3.18k objects, 10 GiB
    usage:   31 GiB used, 1.4 TiB / 1.5 TiB avail
    pgs:     201 active+clean

  io:
    client: 1023 B/s rd, 418 KiB/s wr, 1 op/s rd, 1 op/s wr

sh-5.1$ ceph crash ls
ID   ENTITY   NEW
2024-06-05T07:00:33.318587Z_892175db-bd0a-48cc-a0e4-23da2970608a   osd.2   *
sh-5.1$

Pods:

[root@rdr-odf414sel-bastion-0 ~]# oc get pods -n openshift-storage
NAME   READY   STATUS   RESTARTS   AGE
csi-addons-controller-manager-5f8cf65876-257w4   2/2   Running   0   19m
csi-cephfsplugin-c7wlj   2/2   Running   0   13h
csi-cephfsplugin-k7fg8   2/2   Running   0   13h
csi-cephfsplugin-provisioner-7b6b7c8bbf-4fh97   5/5   Running   0   13h
csi-cephfsplugin-provisioner-7b6b7c8bbf-7stpt   5/5   Running   0   13h
csi-cephfsplugin-rfxpz   2/2   Running   0   13h
csi-nfsplugin-d4m25   2/2   Running   0   67m
csi-nfsplugin-gx78q   2/2   Running   0   67m
csi-nfsplugin-provisioner-6c874556f8-p7n7r   5/5   Running   0   67m
csi-nfsplugin-provisioner-6c874556f8-zf24r   5/5   Running   0   67m
csi-nfsplugin-xb969   2/2   Running   0   67m
csi-rbdplugin-6gxnf   3/3   Running   0   13h
csi-rbdplugin-csqwz   3/3   Running   0   13h
csi-rbdplugin-mm785   3/3   Running   0   13h
csi-rbdplugin-provisioner-6c64c96886-2smdw   6/6   Running   0   13h
csi-rbdplugin-provisioner-6c64c96886-m5tz9   6/6   Running   0   13h
noobaa-core-0   1/1   Running   0   66m
noobaa-db-pg-0   1/1   Running   0   66m
noobaa-endpoint-654d57b548-c4svt   1/1   Running   0   128m
noobaa-endpoint-654d57b548-s8tj4   1/1   Running   0   67m
noobaa-operator-7bdd5cb576-rbh2p   2/2   Running   0   13h
ocs-metrics-exporter-67b9f9855d-f7rdv   1/1   Running   1 (67m ago)   13h
ocs-operator-c8d7c579f-pvfln   1/1   Running   0   13h
odf-console-7888dd6746-rl7n2   1/1   Running   0   13h
odf-operator-controller-manager-7749cdb995-dw7mc   2/2   Running   0   13h
rook-ceph-crashcollector-worker-0-554f85b66-gnh52   1/1   Running   0   13h
rook-ceph-crashcollector-worker-1-646b58c45f-gvm59   1/1   Running   0   13h
rook-ceph-crashcollector-worker-2-754dbbbfd6-b4fsn   1/1   Running   0   13h
rook-ceph-exporter-worker-0-666bd75845-tnn57   1/1   Running   0   13h
rook-ceph-exporter-worker-1-6f9d4c69c6-j6g88   1/1   Running   0   13h
rook-ceph-exporter-worker-2-54fcd47ccc-cspg5   1/1   Running   0   13h
rook-ceph-mds-ocs-storagecluster-cephfilesystem-a-6487686cs8tlz   2/2   Running   0   13h
rook-ceph-mds-ocs-storagecluster-cephfilesystem-b-f8b946dfqqrgm   2/2   Running   0   13h
rook-ceph-mgr-a-744479898c-jnrnx   2/2   Running   0   13h
rook-ceph-mon-a-574b64f99-xkvd7   2/2   Running   0   13h
rook-ceph-mon-b-6dffc99fdd-pf7f9   2/2   Running   0   13h
rook-ceph-mon-c-768c9bd57d-49m6f   2/2   Running   0   13h
rook-ceph-nfs-ocs-storagecluster-cephnfs-a-749b7554b9-8zlpx   2/2   Running   0   67m
rook-ceph-operator-857ccc7545-ns89c   1/1   Running   0   67m
rook-ceph-osd-0-5548bc9796-cbzgl   2/2   Running   0   13h
rook-ceph-osd-1-778c784bd5-jcd2t   2/2   Running   0   13h
rook-ceph-osd-2-8c55ccff8-dtjrm   2/2   Running   1 (131m ago)   13h
rook-ceph-osd-prepare-28a84de044d061da27bf0f4f43d99cab-qcqms   0/1   Completed   0   13h
rook-ceph-osd-prepare-3535e54116e131773d2accb9756b7796-46r9l   0/1   Completed   0   13h
rook-ceph-osd-prepare-92eebb804a903eedb1a420bdd15a845a-28rvv   0/1   Completed   0   13h
rook-ceph-rgw-ocs-storagecluster-cephobjectstore-a-857895bt8bq7   2/2   Running   0   13h
rook-ceph-tools-6cb655c7d-mfmt5   1/1   Running   0   13h
ux-backend-server-996cffddb-6j59g   2/2   Running   0   13h

coredump: https://drive.google.com/file/d/1HQDVMFUPtMCAgUcEOj-tPBeINsH3u5IG/view?usp=sharing
must-gather: https://drive.google.com/file/d/17Xkfp2cwYuSli5SDeLwb6BfWPBkilfCc/view?usp=sharing
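One caveat with `ceph tell osd.* config set ...`: it only changes the value in the currently running daemons, so once an OSD crashes and restarts it comes back at the default 1/5 and the crash itself is not logged at debug 20. If the goal is to capture debug-level logs for the crash, it is probably safer to also persist the options in the mon config database; a minimal sketch from the toolbox (same values as above):

    # persist for all OSDs so the setting survives daemon restarts
    ceph config set osd debug_osd 20/20
    ceph config set osd debug_bluestore 20/20

    # confirm what each running OSD is actually using
    ceph tell osd.* config get debug_osd
    ceph tell osd.* config get debug_bluestore

Remember to revert these afterwards, since debug 20 logging is very verbose.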
Facing the same issue on 4.14.9; Ceph health went into WARN state.