Description of problem (please be as detailed as possible and provide log snippets):

The issue arose while running the script 'tests/e2e/longevity/test_stage4.py' from PR https://github.com/red-hat-storage/ocs-ci/pull/5943. The script performs the following operations:

1. 30 PVCs of different access modes are created.
2. Each of the 30 PVCs is attached to a pod.
3. FIO is started on all pods to utilize 25% of the PVC space.
4. Wait for FIO to complete on all pods.

While executing step 4, FIO completed on 15 pods, but on the 16th pod it failed with the following error:

```
2022-06-17 13:14:14 E ocs_ci.ocs.exceptions.CommandFailed: Error during execution of command: oc -n stage-4-cycle-1-sequential-operation rsh pod-test-rbd-d48ef50e3a49410fa50289248fc fio --name=fio-rand-readwrite --filename=/mnt/fio_25 --readwrite=randrw --bs=4K --direct=0 --numjobs=1 --time_based=1 --runtime=20 --size=500M --iodepth=4 --invalidate=1 --fsync_on_close=1 --rwmixread=75 --ioengine=libaio --rate=1m --rate_process=poisson --end_fsync=1 --output-format=json.
2022-06-17 13:14:14 E Error is fio: pid=0, err=13/file:filesetup.c:174, func=open, error=Permission denied
2022-06-17 13:14:14 E command terminated with exit code 1
```

On analyzing, I found that a Ceph OSD had crashed. Below are the relevant Ceph outputs:

```
sh-4.4$ ceph status
  cluster:
    id:     cde709b2-fec0-4331-bc51-f50a0ee11237
    health: HEALTH_WARN
            1 daemons have recently crashed

  services:
    mon: 3 daemons, quorum a,b,c (age 2d)
    mgr: a(active, since 2d)
    mds: 1/1 daemons up, 1 hot standby
    osd: 3 osds: 3 up (since 2d), 3 in (since 3w)
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    volumes: 1/1 healthy
    pools:   11 pools, 177 pgs
    objects: 1.51k objects, 3.8 GiB
    usage:   9.2 GiB used, 1.5 TiB / 1.5 TiB avail
    pgs:     177 active+clean

  io:
    client: 853 B/s rd, 1 op/s rd, 0 op/s wr
```

```
sh-4.4$ ceph crash ls
ID                                                                 ENTITY  NEW
2022-06-07T18:36:49.236367Z_d50b8013-aa1d-426b-99e0-55046a801216   osd.2    *

sh-4.4$ ceph crash info 2022-06-07T18:36:49.236367Z_d50b8013-aa1d-426b-99e0-55046a801216
{
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12c20) [0x7f986a564c20]",
        "pthread_cond_wait()",
        "(std::condition_variable::wait(std::unique_lock<std::mutex>&)+0x10) [0x7f9869ba78f0]",
        "(BlueStore::Collection::flush()+0x95) [0x55db7bb1b495]",
        "(BlueStore::collection_list(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, ghobject_t const&, ghobject_t const&, int, std::vector<ghobject_t, std::allocator<ghobject_t> >*, ghobject_t*)+0x4e) [0x55db7bb5712e]",
        "(PGBackend::objects_list_partial(hobject_t const&, int, int, std::vector<hobject_t, std::allocator<hobject_t> >*, hobject_t*)+0x8b4) [0x55db7b830a74]",
        "(PgScrubber::select_range()+0x3ce) [0x55db7b9bb7be]",
        "(Scrub::NewChunk::NewChunk(boost::statechart::state<Scrub::NewChunk, Scrub::ActiveScrubbing, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::my_context)+0xdd) [0x55db7b9d3a1d]",
        "(boost::statechart::simple_state<Scrub::PendingTimer, Scrub::ActiveScrubbing, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x103) [0x55db7b9dae23]",
        "(boost::statechart::state_machine<Scrub::ScrubMachine, Scrub::NotActive, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x5b) [0x55db7b9d08ab]",
        "(PgScrubber::send_scrub_resched(unsigned int)+0x28a) [0x55db7b9ca2fa]",
        "(PG::forward_scrub_event(void (ScrubPgIF::*)(unsigned int), unsigned int)+0x72) [0x55db7b70d1e2]",
        "(PG::scrub_send_scrub_resched(unsigned int, ThreadPool::TPHandle&)+0x111) [0x55db7b70db51]",
        "(ceph::osd::scheduler::PGScrubResched::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x16) [0x55db7b8b9016]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0xc28) [0x55db7b67bf28]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x55db7bced894]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55db7bcf0534]",
        "/lib64/libpthread.so.0(+0x817a) [0x7f986a55a17a]",
        "clone()"
    ],
    "ceph_version": "16.2.7-98.el8cp",
    "crash_id": "2022-06-07T18:36:49.236367Z_d50b8013-aa1d-426b-99e0-55046a801216",
    "entity_name": "osd.2",
    "os_id": "rhel",
    "os_name": "Red Hat Enterprise Linux",
    "os_version": "8.5 (Ootpa)",
    "os_version_id": "8.5",
    "process_name": "ceph-osd",
    "stack_sig": "39531eedc6df625aaec9a651806daf4de3ab1822375c20c0159f4bcc6a27ffda",
    "timestamp": "2022-06-07T18:36:49.236367Z",
    "utsname_hostname": "rook-ceph-osd-2-749c95f8c-vn8xw",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-305.45.1.el8_4.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Wed Apr 6 13:48:37 EDT 2022"
}
```

The pods on which FIO is run are created using the following YAML:

```yaml
---
apiVersion: v1
kind: Pod
metadata:
  name: perf-pod
  namespace: default
spec:
  containers:
    - name: performance
      image: quay.io/ocsci/perf:latest
      imagePullPolicy: IfNotPresent
      command: ['/bin/sh']
      stdin: true
      tty: true
      volumeMounts:
        - name: mypvc
          mountPath: /mnt
      securityContext:
        allowPrivilegeEscalation: false
        runAsNonRoot: true
        runAsUser: 1000
        capabilities:
          drop:
            - ALL
        seccompProfile:
          type: RuntimeDefault
  volumes:
    - name: mypvc
      persistentVolumeClaim:
        claimName: pvc
        readOnly: false
```

Run details -> https://ocs4-jenkins-csb-odf-qe.apps.ocp-c1.prod.psi.redhat.com/job/qe-deploy-ocs-cluster/13677/console

Version of all relevant components (if applicable): OCP 4.10.15, ODF 4.10.3

Does this issue impact your ability to continue to work with the product (please explain in detail what is the user impact)?

Is there any workaround available to the best of your knowledge? Yes, we can archive the crash.

Rate from 1 - 5 the complexity of the scenario you performed that caused this bug (1 - very simple, 5 - very complex)? 2

Is this issue reproducible? This is the first time hitting it.

Can this issue be reproduced from the UI? NA

If this is a regression, please provide more details to justify this:

Steps to Reproduce:
The issue arose while running the script 'tests/e2e/longevity/test_stage4.py' from PR https://github.com/red-hat-storage/ocs-ci/pull/5943, which performs the following operations:
1. 30 PVCs of different access modes are created.
2. Each of the 30 PVCs is attached to a pod.
3. FIO is started on all pods to utilize 25% of the PVC space.
4. Wait for FIO to complete on all pods.

Actual results:
A Ceph OSD crashed while running FIO. The details of the crash and backtrace can be found in the description above.

Expected results:
The Ceph OSD should not crash while running FIO, and FIO should complete on all pods.

Additional info:
Must gather: http://magna002.ceph.redhat.com/ocsci-jenkins/openshift-clusters/tdesala-long-testd/tdesala-long-testd_20220525T080711/logs/failed_testcase_ocs_logs_1655449172/test_longevity_stage4_ocs_logs/ocs_must_gather/
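For reference, the workaround mentioned above (archiving the crash so the "1 daemons have recently crashed" warning clears) uses the standard `ceph crash` commands. A minimal sketch, assuming the Rook toolbox deployment (rook-ceph-tools) is available in the openshift-storage namespace:

```sh
# Open a shell in the toolbox pod (deployment name/namespace are assumptions).
oc -n openshift-storage rsh deploy/rook-ceph-tools

# List the recorded crashes and inspect the one reported above.
ceph crash ls
ceph crash info 2022-06-07T18:36:49.236367Z_d50b8013-aa1d-426b-99e0-55046a801216

# Archive the crash so the cluster returns to HEALTH_OK.
ceph crash archive 2022-06-07T18:36:49.236367Z_d50b8013-aa1d-426b-99e0-55046a801216

# Or archive all recorded crashes at once.
ceph crash archive-all
```

Note that archiving only silences the health warning; it does not address the underlying OSD crash.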
Below are the coredump-related outputs for every ODF node:

```
[anamalho@anamalho ~]$ oc debug node/compute-3
Starting pod/compute-3-debug ...
To use host binaries, run chroot /host
Pod IP: 10.1.161.25
If you don't see a command prompt, try pressing enter.
sh-4.4# chroot /host
sh-4.4# ls -ltr /var/lib/systemd/coredump/
total 0
sh-4.4# coredumpctl list
TIME                            PID  UID  GID  SIG  COREFILE  EXE
Tue 2022-06-07 18:36:55 UTC    7522  167  167    6  missing   /usr/bin/ceph-osd
sh-4.4# exit
exit
sh-4.4# exit
exit
Removing debug pod ...

[anamalho@anamalho ~]$ oc debug node/compute-4
Starting pod/compute-4-debug ...
To use host binaries, run chroot /host
Pod IP: 10.1.160.163
If you don't see a command prompt, try pressing enter.
sh-4.4# chroot /host
sh-4.4# ls -ltr /var/lib/systemd/coredump/
total 0
sh-4.4# exit
exit
sh-4.4# exit
exit
Removing debug pod ...

[anamalho@anamalho ~]$ oc debug node/compute-5
Starting pod/compute-5-debug ...
To use host binaries, run chroot /host
Pod IP: 10.1.161.45
If you don't see a command prompt, try pressing enter.
sh-4.4# chroot /host
sh-4.4# ls -ltr /var/lib/systemd/coredump/
total 0
sh-4.4# exit
exit
sh-4.4# exit
exit
Removing debug pod ...
```
It looks like the core dumps are missing or not collected.
Okay. Can you try to generate a coredump for the OSD daemon using the steps below and see if a coredump gets generated? (A consolidated sketch of these steps is shown after the list.)

1. rsh into one of the OSD pods:
   # oc rsh <osd-pod>
2. Get the ceph-osd PID and send a SIGSEGV signal to the ceph-osd daemon:
   # ps -aef | grep ceph-osd
   # kill -11 <pid-of-ceph-osd>
3. Log in to the ODF node hosting the OSD for which we generated the coredump by sending SIGSEGV, and verify that we have a coredump for the OSD daemon:
   # oc debug node/<odf-node>
   sh-4.4# chroot /host
   sh-4.4# ls -ltr /var/lib/systemd/coredump/

If no coredump for the OSD is generated in the /var/lib/systemd/coredump/ directory, then open a BZ for it. Also verify that the OSD pod is sharing the host PID namespace ("oc get pod <osd> -o yaml" should have "hostPID: true" in the pod spec).
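A consolidated sketch of the steps above, assuming the openshift-storage namespace; the placeholders (<osd-pod>, <pid-of-ceph-osd>, <odf-node>) must be filled in from the actual cluster:

```sh
# 1. rsh into one of the OSD pods.
oc -n openshift-storage rsh <osd-pod>

# 2. Inside the OSD pod: find the ceph-osd PID and send it SIGSEGV.
ps -aef | grep ceph-osd
kill -11 <pid-of-ceph-osd>
exit

# 3. On the ODF node hosting that OSD, check for a generated coredump.
oc debug node/<odf-node>
chroot /host
ls -ltr /var/lib/systemd/coredump/
coredumpctl list
```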
The coredump is getting generated using the above steps.

```
[anamalho@anamalho ~]$ oc rsh rook-ceph-osd-0-7bbd558ccd-t56jx
Defaulted container "osd" out of: osd, log-collector, blkdevmapper (init), encryption-open (init), blkdevmapper-encryption (init), encrypted-block-status (init), expand-encrypted-bluefs (init), activate (init), expand-bluefs (init), chown-container-data-dir (init)
sh-4.4# ps -aef|grep ceph-osd
ceph         445       0  1 Jun14 ?      02:54:37 ceph-osd --foreground --id 0 --fsid cde709b2-fec0-4331-bc51-f50a0ee11237 --setuser ceph --setgroup ceph --crush-location=root=default host=ocs-deviceset-thin-0-data-0kjltj rack=rack1 --log-to-stderr=true --err-to-stderr=true --mon-cluster-log-to-stderr=true --log-stderr-prefix=debug --default-log-to-file=false --default-mon-cluster-log-to-file=false --ms-learn-addr-from-peer=false
root         465       0  0 Jun14 pts/0  00:00:00 /bin/bash -x -e -m -c CEPH_CLIENT_ID=ceph-osd.0 PERIODICITY=24h LOG_ROTATE_CEPH_FILE=/etc/logrotate.d/ceph if [ -z "$PERIODICITY" ]; then .PERIODICITY=24h fi # edit the logrotate file to only rotate a specific daemon log # otherwise we will logrotate log files without reloading certain daemons # this might happen when multiple daemons run on the same machine sed -i "s|*.log|$CEPH_CLIENT_ID.log|" "$LOG_ROTATE_CEPH_FILE" while true; do .sleep "$PERIODICITY" .echo "starting log rotation" .logrotate --verbose --force "$LOG_ROTATE_CEPH_FILE" .echo "I am going to sleep now, see you in $PERIODICITY" done
root      529859  529583  0 06:25 pts/0  00:00:00 grep ceph-osd
sh-4.4# kill -11 445
sh-4.4# exit
exit

[anamalho@anamalho ~]$ oc debug node/compute-4
Starting pod/compute-4-debug ...
To use host binaries, run `chroot /host`
Pod IP: 10.1.160.163
If you don't see a command prompt, try pressing enter.
sh-4.4# chroot /host
sh-4.4# ls -ltr /var/lib/systemd/coredump/
total 690236
-rw-r-----. 1 root root 706794214 Jun 23 06:29 core.ceph-osd.167.13e9301b7d5d4c57aff8e735d9c3b64c.4040567.1655965792000000.lz4
sh-4.4# exit
exit
sh-4.4# exit
exit
```

As for the output of `oc get pod rook-ceph-osd-0-7bbd558ccd-t56jx -o yaml` -> there was no hostPID parameter in the yaml output.
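As a side note, hostPID is a pod-level field (spec.hostPID) rather than a per-container one, so a quick way to check it would be something like the sketch below (using the pod name from the output above; namespace is an assumption):

```sh
# Prints "true" if the OSD pod shares the host PID namespace; empty output
# means the field is not set, which matches the observation above.
oc -n openshift-storage get pod rook-ceph-osd-0-7bbd558ccd-t56jx \
  -o jsonpath='{.spec.hostPID}'
```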
Who's looking at the coredump?
(In reply to Yaniv Kaul from comment #7)
> Who's looking at the coredump?

Hi Yaniv, I am investigating this issue. The recent coredump from comment #6 was generated by a manual trigger. The coredump for the OSD crash reported in the BZ description (during the FIO run) is not available:

```
[anamalho@anamalho ~]$ oc debug node/compute-3
Starting pod/compute-3-debug ...
To use host binaries, run chroot /host
Pod IP: 10.1.161.25
If you don't see a command prompt, try pressing enter.
sh-4.4# chroot /host
sh-4.4# ls -ltr /var/lib/systemd/coredump/
total 0
sh-4.4# coredumpctl list
TIME                            PID  UID  GID  SIG  COREFILE  EXE
Tue 2022-06-07 18:36:55 UTC    7522  167  167    6  missing   /usr/bin/ceph-osd   <----- corefile is missing on the ODF node
```

I will be requesting Anant (the BZ reporter) to reproduce this issue again and get us the OSD coredump for further investigation.
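For what it's worth, when `coredumpctl list` shows COREFILE as "missing", the crash metadata was recorded but the core itself is no longer stored on disk (for example, removed by size limits or cleanup). A small sketch of commands that can help confirm this from the node debug shell (after `chroot /host`):

```sh
# Show the recorded metadata for the crashed ceph-osd (PID taken from the
# listing above), including why the core file is unavailable.
coredumpctl info 7522

# Confirm cores are routed to systemd-coredump and check its storage limits.
cat /proc/sys/kernel/core_pattern
cat /etc/systemd/coredump.conf
```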
(In reply to Anant Malhotra from comment #6)
> The coredump is getting generated using the above steps.
>
> ```
> [anamalho@anamalho ~]$ oc rsh rook-ceph-osd-0-7bbd558ccd-t56jx
> Defaulted container "osd" out of: osd, log-collector, blkdevmapper (init),
> encryption-open (init), blkdevmapper-encryption (init),
> encrypted-block-status (init), expand-encrypted-bluefs (init), activate
> (init), expand-bluefs (init), chown-container-data-dir (init)
> sh-4.4# ps -aef|grep ceph-osd
> ceph 445 0 1 Jun14 ? 02:54:37 ceph-osd --foreground
> --id 0 --fsid cde709b2-fec0-4331-bc51-f50a0ee11237 --setuser ceph --setgroup
> ceph --crush-location=root=default host=ocs-deviceset-thin-0-data-0kjltj
> rack=rack1 --log-to-stderr=true --err-to-stderr=true
> --mon-cluster-log-to-stderr=true --log-stderr-prefix=debug
> --default-log-to-file=false --default-mon-cluster-log-to-file=false
> --ms-learn-addr-from-peer=false
> root 465 0 0 Jun14 pts/0 00:00:00 /bin/bash -x -e -m -c
> CEPH_CLIENT_ID=ceph-osd.0 PERIODICITY=24h
> LOG_ROTATE_CEPH_FILE=/etc/logrotate.d/ceph if [ -z "$PERIODICITY" ]; then
> .PERIODICITY=24h fi # edit the logrotate file to only rotate a specific
> daemon log # otherwise we will logrotate log files without reloading certain
> daemons # this might happen when multiple daemons run on the same machine
> sed -i "s|*.log|$CEPH_CLIENT_ID.log|" "$LOG_ROTATE_CEPH_FILE" while true;
> do .sleep "$PERIODICITY" .echo "starting log rotation" .logrotate --verbose
> --force "$LOG_ROTATE_CEPH_FILE" .echo "I am going to sleep now, see you in
> $PERIODICITY" done
> root 529859 529583 0 06:25 pts/0 00:00:00 grep ceph-osd
> sh-4.4# kill -11 445
> sh-4.4# exit
> exit
>
> [anamalho@anamalho ~]$ oc debug node/compute-4
> Starting pod/compute-4-debug ...
> To use host binaries, run `chroot /host`
> Pod IP: 10.1.160.163
> If you don't see a command prompt, try pressing enter.
> sh-4.4# chroot /host
> sh-4.4# ls -ltr /var/lib/systemd/coredump/
> total 690236
> -rw-r-----. 1 root root 706794214 Jun 23 06:29
> core.ceph-osd.167.13e9301b7d5d4c57aff8e735d9c3b64c.4040567.1655965792000000.lz4
> sh-4.4# exit
> exit
> sh-4.4# exit
> exit
> ```

Thanks Anant. This confirms that there is no issue with coredump generation for the OSD daemon in the event of a crash. Can you try to reproduce the issue reported in the BZ description again and get us the coredump if any OSD is still crashing? Also, is this issue consistently reproducible during the FIO run?

> As for the output of `oc get pod rook-ceph-osd-0-7bbd558ccd-t56jx -o yaml`
> -> there was no hostPID parameter in the yaml output.

Okay. I will check it on my end.
Sure Harish.
I tried to reproduce this bug on a VM setup with the same test case used by Anant, but the issue did not reproduce. Now I am trying the same scenario on a bare-metal (BM) cluster, running the IOs for more than a week, to check whether the issue reproduces.
(In reply to avdhoot from comment #22)
> I tried to reproduce this bug on a VM setup with the same test case used by
> Anant, but the issue did not reproduce. Now I am trying the same scenario on
> a bare-metal (BM) cluster, running the IOs for more than a week, to check
> whether the issue reproduces.

Thanks Avdhoot. Is it fine if we close this BZ for now? You can re-open it later if you have a successful reproducer for this issue.
Hi Prashant,

I have checked the same scenario on BM, running the IOs, but I did not find anything crash-related on the cluster. So yes, it is fine if we close this BZ for now.
(In reply to avdhoot from comment #24)
> Hi Prashant,
>
> I have checked the same scenario on BM, running the IOs, but I did not find
> anything crash-related on the cluster. So yes, it is fine if we close this
> BZ for now.

Thanks Avdhoot. I am closing this BZ now. Feel free to reopen it if you encounter this issue again.
https://bugzilla.redhat.com/show_bug.cgi?id=2099463 moved to 7.0
Hi Prashant,

We observed the same issue in the RDR longevity cluster. Workloads have been running for the past 2-3 weeks, and we are hitting an OSD crash.

```
ceph crash ls
ID                                                                 ENTITY  NEW
2023-07-14T20:04:57.339567Z_cc9c176b-3991-4b92-aec1-dbf1dee98520   osd.1    *
2023-07-14T20:05:04.597812Z_42744795-b51a-4169-82f6-f29639d2e150   osd.0    *

ceph crash info 2023-07-14T20:04:57.339567Z_cc9c176b-3991-4b92-aec1-dbf1dee98520
{
    "backtrace": [
        "/lib64/libc.so.6(+0x54df0) [0x7fa4b6eeadf0]",
        "/lib64/libc.so.6(+0x9c560) [0x7fa4b6f32560]",
        "pthread_mutex_lock()",
        "(PG::lock(bool) const+0x2b) [0x559763bee91b]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x45d) [0x559763bc5dfd]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x2a3) [0x5597640e7ad3]",
        "ceph-osd(+0xa89074) [0x5597640e8074]",
        "/lib64/libc.so.6(+0x9f802) [0x7fa4b6f35802]",
        "/lib64/libc.so.6(+0x3f450) [0x7fa4b6ed5450]"
    ],
    "ceph_version": "17.2.6-70.0.TEST.bz2119217.el9cp",
    "crash_id": "2023-07-14T20:04:57.339567Z_cc9c176b-3991-4b92-aec1-dbf1dee98520",
    "entity_name": "osd.1",
    "os_id": "rhel",
    "os_name": "Red Hat Enterprise Linux",
    "os_version": "9.2 (Plow)",
    "os_version_id": "9.2",
    "process_name": "ceph-osd",
    "stack_sig": "5c7afd3067dc17bd22ffd5987b09913e4018bf079244d12c2db1c472317a24d8",
    "timestamp": "2023-07-14T20:04:57.339567Z",
    "utsname_hostname": "rook-ceph-osd-1-5f946675bc-hhjwk",
    "utsname_machine": "x86_64",
    "utsname_release": "5.14.0-284.16.1.el9_2.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP PREEMPT_DYNAMIC Thu May 18 19:03:13 EDT 2023"
}
```

Coredumps for the nodes can be seen in the must gather:
http://rhsqe-repo.lab.eng.blr.redhat.com/OCS/ocs-qe-bugs/keerthana/rbd-mirror/c2/must-gather.local.1781396938003127686/quay-io-rhceph-dev-ocs-must-gather-sha256-9ce39944596cbc4966404fb1ceb24be21093a708b1691e78453ab1b9a7a10f7b/ceph/

Complete must gather logs:
c1 - http://rhsqe-repo.lab.eng.blr.redhat.com/OCS/ocs-qe-bugs/keerthana/rbd-mirror/c1/
c2 - http://rhsqe-repo.lab.eng.blr.redhat.com/OCS/ocs-qe-bugs/keerthana/rbd-mirror/c2/
hub - http://rhsqe-repo.lab.eng.blr.redhat.com/OCS/ocs-qe-bugs/keerthana/rbd-mirror/hub/

The live setup is available for debugging:
c1 - https://ocs4-jenkins-csb-odf-qe.apps.ocp-c1.prod.psi.redhat.com/job/qe-deploy-ocs-cluster/25313/
c2 - https://ocs4-jenkins-csb-odf-qe.apps.ocp-c1.prod.psi.redhat.com/job/qe-deploy-ocs-cluster/25312/
hub - https://ocs4-jenkins-csb-odf-qe.apps.ocp-c1.prod.psi.redhat.com/job/qe-deploy-ocs-cluster/25311/
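In case it helps the follow-up investigation: the coredumps collected in the must gather are lz4-compressed, so a rough sketch for opening one is shown below (file names and the ceph-osd binary path are illustrative, not taken from the logs above):

```sh
# Decompress the collected core and load it in gdb against a ceph-osd binary
# of the matching version (17.2.6 here).
lz4 -d core.ceph-osd.167.<hash>.<pid>.<timestamp>.lz4 ceph-osd.core
gdb /usr/bin/ceph-osd ceph-osd.core

# Inside gdb, dump the backtraces of all threads:
#   (gdb) thread apply all bt
```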
Setting needinfo on Prashant as per comment 71 by Keerthana.
Had an offline discussion with Prashant and opened a new bug to track the issue. Clearing the needinfo on me.