Bug 2037135

Summary: QEMU Core Dumps When Booting from a Local Snapshot Whose Backing File Is Based on RBD
Product: Red Hat Enterprise Linux 8 Reporter: Tingting Mao <timao>
Component: qemu-kvm Assignee: Stefano Garzarella <sgarzare>
qemu-kvm sub component: Ceph QA Contact: Tingting Mao <timao>
Status: CLOSED ERRATA Docs Contact:
Severity: high    
Priority: high CC: aliang, chayang, coli, hhan, idryomov, jinzhao, juzhang, kkiwi, meili, nanli, sgarzare, virt-maint, yicui
Version: 8.6 Keywords: Regression, Triaged
Target Milestone: rc   
Target Release: ---   
Hardware: x86_64   
OS: Linux   
Whiteboard:
Fixed In Version: qemu-kvm-6.2.0-6.module+el8.6.0+14165+5e5e76ac Doc Type: If docs needed, set a value
Doc Text:
Story Points: ---
Clone Of: 2034791 Environment:
Last Closed: 2022-05-10 13:25:20 UTC Type: Bug
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:
Bug Depends On: 2034791    
Bug Blocks:    

Description Tingting Mao 2022-01-05 03:06:39 UTC
+++ This bug was initially created as a clone of Bug #2034791 +++

Description of problem:
As described in the summary.


Version-Release number of selected component (if applicable):
qemu-kvm-6.2.0-2.module+el8.6.0+13738+17338784
kernel-4.18.0-357.el8.x86_64



How reproducible:
3/3


Steps to Reproduce:
1. Configure the RBD environment and create a base image file
cat > /etc/ceph/ceph.conf<<EOF
[global]
	cluster = testcluster
	fsid = d5ce4d4d-6a72-40e8-b59e-57a5b075a50a
 
	mon initial members = ibm-x3650m4-06
	mon host = ${ceph_server_ip}
 
	public network = 10.73.114.0/24
	cluster network = 10.73.114.0/24
 
	auth cluster required = cephx
	auth service required = cephx
	auth client required = cephx
	osd journal size = 1024
	osd pool default size = 1
	osd pool default min size = 1
	osd pool default pg num = 33
	osd pool default pgp num = 33
	osd crush chooseleaf type = 1
EOF

cat > /etc/ceph/ceph.client.admin.keyring<<EOF
[client.admin]
    key = AQBwUmlfB9VwKhAA0HwqNhKCfarcItKtCR9ulg==
    caps mds = "allow *"
    caps mgr = "allow *"
    caps mon = "allow *"
    caps osd = "allow *"
EOF

# qemu-img create -f raw rbd:rbd/base.img 20G

2. Install a RHEL 9 guest OS in the base image file
# /usr/libexec/qemu-kvm \
    -S  \
    -name 'avocado-vt-vm1'  \
    -sandbox on  \
    -machine q35 \
    -device pcie-root-port,id=pcie-root-port-0,multifunction=on,bus=pcie.0,addr=0x1,chassis=1 \
    -device pcie-pci-bridge,id=pcie-pci-bridge-0,addr=0x0,bus=pcie-root-port-0  \
    -nodefaults \
    -device VGA,bus=pcie.0,addr=0x2 \
    -m 15360  \
    -smp 16,maxcpus=16,cores=8,threads=1,dies=1,sockets=2  \
    -cpu 'Haswell-noTSX',+kvm_pv_unhalt \
    -device pcie-root-port,id=pcie-root-port-1,port=0x1,addr=0x1.0x1,bus=pcie.0,chassis=2 \
    -device qemu-xhci,id=usb1,bus=pcie-root-port-1,addr=0x0 \
    -device usb-tablet,id=usb-tablet1,bus=usb1.0,port=1 \
    -object iothread,id=iothread0 \
    -object iothread,id=iothread1 \
    -device pcie-root-port,id=pcie-root-port-3,port=0x3,addr=0x1.0x3,bus=pcie.0,chassis=4 \
    -device virtio-net-pci,mac=9a:1c:0c:0d:e3:4c,id=idjmZXQS,netdev=idEFQ4i1,bus=pcie-root-port-3,addr=0x0  \
    -netdev tap,id=idEFQ4i1,vhost=on  \
    -vnc :0  \
    -rtc base=utc,clock=host,driftfix=slew  \
    -boot menu=off,order=cdn,once=c,strict=off \
    -enable-kvm \
    -monitor stdio \
    -device pcie-root-port,id=pcie-root-port-5,port=0x6,addr=0x1.0x5,bus=pcie.0,chassis=5 \
    -device virtio-scsi-pci,id=virtio_scsi_pci2,bus=pcie-root-port-5,addr=0x0 \
    -blockdev node-name=file_image1,driver=rbd,auto-read-only=on,discard=unmap,pool=rbd,image=base.img,cache.direct=on,cache.no-flush=off \
    -blockdev node-name=drive_image1,driver=raw,read-only=off,cache.direct=on,cache.no-flush=off,file=file_image1 \
    -device scsi-hd,id=image1,drive=drive_image1,write-cache=on \
    -blockdev node-name=file_cd1,driver=file,auto-read-only=on,discard=unmap,aio=threads,filename=/home/kvm_autotest_root/iso/linux/RHEL-9.0.0-20211216.2-x86_64-dvd1.iso,cache.direct=on,cache.no-flush=off \
    -blockdev node-name=drive_cd1,driver=raw,read-only=on,cache.direct=on,cache.no-flush=off,file=file_cd1 \
    -device scsi-cd,id=cd1,drive=drive_cd1,write-cache=on \
    -chardev socket,server=on,path=/var/monitor-qmpmonitor1-20210721-024113-AsZ7KYro,id=qmp_id_qmpmonitor1,wait=off  \
    -mon chardev=qmp_id_qmpmonitor1,mode=control \
                                                    
3. Create a local snapshot file of the base image file over RBD
# qemu-img create -f qcow2 -F raw -b rbd:rbd/base.img sn.qcow2
# qemu-img info sn.qcow2 
image: sn.qcow2
file format: qcow2
virtual size: 20 GiB (21474836480 bytes)
disk size: 196 KiB
cluster_size: 65536
backing file: rbd:rbd/base.img
backing file format: raw
Format specific information:
    compat: 1.1
    compression type: zlib
    lazy refcounts: false
    refcount bits: 16
    corrupt: false
    extended l2: false

4. Boot from the local snapshot file
# /usr/libexec/qemu-kvm \
    -S  \
    -name 'avocado-vt-vm1'  \
    -sandbox on  \
    -machine q35 \
    -device pcie-root-port,id=pcie-root-port-0,multifunction=on,bus=pcie.0,addr=0x1,chassis=1 \
    -device pcie-pci-bridge,id=pcie-pci-bridge-0,addr=0x0,bus=pcie-root-port-0  \
    -nodefaults \
    -device VGA,bus=pcie.0,addr=0x2 \
    -m 15360  \
    -smp 16,maxcpus=16,cores=8,threads=1,dies=1,sockets=2  \
    -cpu 'Haswell-noTSX',+kvm_pv_unhalt \
    -device pcie-root-port,id=pcie-root-port-1,port=0x1,addr=0x1.0x1,bus=pcie.0,chassis=2 \
    -device qemu-xhci,id=usb1,bus=pcie-root-port-1,addr=0x0 \
    -device usb-tablet,id=usb-tablet1,bus=usb1.0,port=1 \
    -object iothread,id=iothread0 \
    -object iothread,id=iothread1 \
    -device pcie-root-port,id=pcie-root-port-3,port=0x3,addr=0x1.0x3,bus=pcie.0,chassis=4 \
    -device virtio-net-pci,mac=9a:1c:0c:0d:e3:4c,id=idjmZXQS,netdev=idEFQ4i1,bus=pcie-root-port-3,addr=0x0  \
    -netdev tap,id=idEFQ4i1,vhost=on  \
    -vnc :0  \
    -rtc base=utc,clock=host,driftfix=slew  \
    -boot menu=off,order=cdn,once=c,strict=off \
    -enable-kvm \
    -monitor stdio \
    -device pcie-root-port,id=pcie-root-port-5,port=0x6,addr=0x1.0x5,bus=pcie.0,chassis=5 \
    -device virtio-scsi-pci,id=virtio_scsi_pci2,bus=pcie-root-port-5,addr=0x0 \
    -blockdev node-name=file_image1,driver=file,auto-read-only=on,discard=unmap,aio=threads,filename=sn.qcow2,cache.direct=on,cache.no-flush=off \
    -blockdev node-name=drive_image1,driver=qcow2,read-only=off,cache.direct=on,cache.no-flush=off,file=file_image1 \
    -device scsi-hd,id=image1,drive=drive_image1,write-cache=on \
    -chardev socket,server=on,path=/var/monitor-qmpmonitor1-20210721-024113-AsZ7KYro,id=qmp_id_qmpmonitor1,wait=off  \
    -mon chardev=qmp_id_qmpmonitor1,mode=control \
QEMU 6.2.0 monitor - type 'help' for more information
(qemu) c
(qemu) qemu-kvm: ../block/rbd.c:1355: int qemu_rbd_co_block_status(BlockDriverState *, _Bool, int64_t, int64_t, int64_t *, int64_t *, BlockDriverState **): Assertion `req.bytes <= bytes' failed.
qemu.sh: line 33: 360208 Aborted                 (core dumped) /usr/libexec/qemu-kvm -S -name 'avocado-vt-vm1' -sandbox on -machine q35 -device pcie-root-port,id=pcie-root-port-0,multifunction=on,bus=pcie.0,addr=0x1,chassis=1 -device pcie-pci-bridge,id=pcie-pci-bridge-0,addr=0x0,bus=pcie-root-port-0 -nodefaults -device VGA,bus=pcie.0,addr=0x2 -m 15360 -smp 16,maxcpus=16,cores=8,threads=1,dies=1,sockets=2 -cpu 'Haswell-noTSX',+kvm_pv_unhalt -device pcie-root-port,id=pcie-root-port-1,port=0x1,addr=0x1.0x1,bus=pcie.0,chassis=2 -device qemu-xhci,id=usb1,bus=pcie-root-port-1,addr=0x0 -device usb-tablet,id=usb-tablet1,bus=usb1.0,port=1 -object iothread,id=iothread0 -object iothread,id=iothread1 -device pcie-root-port,id=pcie-root-port-3,port=0x3,addr=0x1.0x3,bus=pcie.0,chassis=4 -device virtio-net-pci,mac=9a:1c:0c:0d:e3:4c,id=idjmZXQS,netdev=idEFQ4i1,bus=pcie-root-port-3,addr=0x0 -netdev tap,id=idEFQ4i1,vhost=on -vnc :0 -rtc base=utc,clock=host,driftfix=slew -boot menu=off,order=cdn,once=c,strict=off -enable-kvm -monitor stdio -device pcie-root-port,id=pcie-root-port-5,port=0x6,addr=0x1.0x5,bus=pcie.0,chassis=5 -device virtio-scsi-pci,id=virtio_scsi_pci2,bus=pcie-root-port-5,addr=0x0 -blockdev node-name=file_image1,driver=file,auto-read-only=on,discard=unmap,aio=threads,filename=$1,cache.direct=on,cache.no-flush=off -blockdev node-name=drive_image1,driver=qcow2,read-only=off,cache.direct=on,cache.no-flush=off,file=file_image1 -device scsi-hd,id=image1,drive=drive_image1,write-cache=on -chardev socket,server=on,path=/var/monitor-qmpmonitor1-20210721-024113-AsZ7KYro,id=qmp_id_qmpmonitor1,wait=off -mon chardev=qmp_id_qmpmonitor1,mode=control


Actual results:
As shown above, QEMU aborted on an assertion failure and dumped core.


Expected results:
The guest boots from the snapshot file successfully.


Additional info:
(gdb) bt
#0  0x00007fe71641c83c in __pthread_kill_implementation () from /lib64/libc.so.6
#1  0x00007fe7163cf686 in raise () from /lib64/libc.so.6
#2  0x00007fe7163b97d3 in abort () from /lib64/libc.so.6
#3  0x00007fe7163b96fb in __assert_fail_base.cold () from /lib64/libc.so.6
#4  0x00007fe7163c83a6 in __assert_fail () from /lib64/libc.so.6
#5  0x00007fe716bd9021 in qemu_rbd_co_block_status (bs=<optimized out>, want_zero=<optimized out>, offset=806354944, bytes=57344, pnum=0x7fe2b16efd90, map=<optimized out>, file=<optimized out>)
    at ../block/rbd.c:1355
#6  0x0000557d56e5138e in bdrv_co_block_status (bs=0x557d594476d0, want_zero=<optimized out>, offset=<optimized out>, bytes=57344, pnum=0x7fe2b16efd90, map=0x7fe2b16efd00, file=0x7fe2b16efcf8)
    at ../block/io.c:2489
#7  0x0000557d56e51545 in bdrv_co_block_status (bs=0x557d59440390, want_zero=<optimized out>, offset=<optimized out>, bytes=<optimized out>, pnum=0x7fe2b16efd90, map=0x0, file=0x0) at ../block/io.c:2557
#8  0x0000557d56e50eeb in bdrv_co_common_block_status_above (bs=<optimized out>, base=<optimized out>, include_base=false, want_zero=<optimized out>, offset=<optimized out>, bytes=57344, pnum=0x7fe2b16efd90, 
    map=0x0, file=0x0, depth=<optimized out>) at ../block/io.c:2667
#9  0x0000557d56e09530 in bdrv_common_block_status_above (bs=0x5797c, base=0x5797c, include_base=<optimized out>, want_zero=<optimized out>, offset=140611617228576, bytes=0, pnum=<optimized out>, 
    map=<optimized out>, file=<optimized out>, depth=<optimized out>) at block/block-gen.c:444
#10 0x0000557d56e84720 in bdrv_co_is_zero_fast (bs=0x557d59425b30, offset=140611617228576, bytes=57344) at ../block/io.c:2755
#11 is_zero_cow (bs=0x557d59425b30, m=0x557d5a5f6880) at ../block/qcow2.c:2450
#12 handle_alloc_space (bs=0x557d59425b30, l2meta=0x557d5a5f6880) at ../block/qcow2.c:2483
#13 qcow2_co_pwritev_task (bs=0x557d59425b30, host_offset=385024, offset=<optimized out>, bytes=<optimized out>, qiov=0x557d598a9db0, qiov_offset=<optimized out>, l2meta=0x557d5a5f6880) at ../block/qcow2.c:2556
#14 qcow2_co_pwritev_task_entry (task=<optimized out>) at ../block/qcow2.c:2600
#15 0x0000557d56e7f177 in qcow2_add_task (bs=0x557d59425b30, pool=0x0, subcluster_type=QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN, host_offset=385024, offset=806412288, bytes=1024, qiov=0x557d598a9db0, qiov_offset=0, 
    func=<optimized out>, l2meta=<optimized out>) at ../block/qcow2.c:2255
#16 qcow2_co_pwritev_part (bs=<optimized out>, offset=806412288, bytes=1024, qiov=0x557d598a9db0, qiov_offset=0, flags=<optimized out>) at ../block/qcow2.c:2651
#17 0x0000557d56e4e9ed in bdrv_driver_pwritev (bs=0x557d59425b30, offset=<optimized out>, bytes=1024, qiov=0x557d598a9db0, qiov_offset=0, flags=0) at ../block/io.c:1252
#18 0x0000557d56e50360 in bdrv_aligned_pwritev (child=0x557d5ad0bc50, req=0x7fe2b16eff98, offset=806412288, bytes=<optimized out>, align=<optimized out>, qiov=0x557d598a9db0, qiov_offset=0, 
    flags=<optimized out>) at ../block/io.c:2126
#19 0x0000557d56e4f753 in bdrv_co_pwritev_part (child=<optimized out>, offset=<optimized out>, bytes=<optimized out>, qiov=<optimized out>, qiov_offset=<optimized out>, flags=<optimized out>)
    at ../block/io.c:2314
#20 0x0000557d56e3b362 in blk_co_do_pwritev_part (blk=0x557d5ad30ad0, offset=806412288, bytes=1024, qiov=0x7fe71641c83c <__pthread_kill_implementation+284>, qiov_offset=140611617228576, flags=0)
    at ../block/block-backend.c:1283
#21 0x0000557d56e3b7d7 in blk_aio_write_entry (opaque=0x557d5940e580) at ../block/block-backend.c:1467
#22 0x0000557d56ff7016 in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at ../util/coroutine-ucontext.c:173
#23 0x00007fe7163e4820 in ?? () from /lib64/libc.so.6
#24 0x00007fe7153b9980 in ?? ()
#25 0x0000000000000000 in ?? ()

Comment 1 aihua liang 2022-01-06 07:52:53 UTC
In my test scenario (live snapshot), the reproduction rate is 90%.

Comment 4 CongLi 2022-02-08 12:01:56 UTC
*** Bug 2051870 has been marked as a duplicate of this bug. ***

Comment 8 Tingting Mao 2022-02-09 03:11:03 UTC
Verified this bug as follows.


Tested with:
qemu-kvm-6.2.0-6.module+el8.6.0+14165+5e5e76ac
kernel-4.18.0-364.el8.x86_64


Steps:
1. Create snapshot over RBD base image
# qemu-img create -f qcow2 -b rbd:rbd/rhel860-64-virtio-scsi.raw -F raw /home/kvm_autotest_root/images/snA.qcow2

2. Boot the guest from the snapshot
# /usr/libexec/qemu-kvm \
    -S  \
    -name 'avocado-vt-vm1'  \
    -sandbox on  \
    -machine q35,memory-backend=mem-machine_mem \
    -device pcie-root-port,id=pcie-root-port-0,multifunction=on,bus=pcie.0,addr=0x1,chassis=1 \
    -device pcie-pci-bridge,id=pcie-pci-bridge-0,addr=0x0,bus=pcie-root-port-0  \
    -nodefaults \
    -device VGA,bus=pcie.0,addr=0x2 \
    -m 30720 \
    -object memory-backend-ram,size=30720M,id=mem-machine_mem  \
    -smp 20,maxcpus=20,cores=10,threads=1,dies=1,sockets=2  \
    -cpu 'Broadwell',+kvm_pv_unhalt \
    -chardev socket,wait=off,server=on,id=qmp_id_qmpmonitor1,path=/tmp/avocado_fju2jpnh/monitor-qmpmonitor1-20220208-220246-XYgPA3Ks  \
    -mon chardev=qmp_id_qmpmonitor1,mode=control \
    -chardev socket,wait=off,server=on,id=qmp_id_catch_monitor,path=/tmp/avocado_fju2jpnh/monitor-catch_monitor-20220208-220246-XYgPA3Ks  \
    -mon chardev=qmp_id_catch_monitor,mode=control \
    -device pvpanic,ioport=0x505,id=idVRulgI \
    -chardev socket,wait=off,server=on,id=chardev_serial0,path=/tmp/avocado_fju2jpnh/serial-serial0-20220208-220246-XYgPA3Ks \
    -device isa-serial,id=serial0,chardev=chardev_serial0  \
    -chardev socket,id=seabioslog_id_20220208-220246-XYgPA3Ks,path=/tmp/avocado_fju2jpnh/seabios-20220208-220246-XYgPA3Ks,server=on,wait=off \
    -device isa-debugcon,chardev=seabioslog_id_20220208-220246-XYgPA3Ks,iobase=0x402 \
    -device pcie-root-port,id=pcie-root-port-1,port=0x1,addr=0x1.0x1,bus=pcie.0,chassis=2 \
    -device qemu-xhci,id=usb1,bus=pcie-root-port-1,addr=0x0 \
    -device usb-tablet,id=usb-tablet1,bus=usb1.0,port=1 \
    -device pcie-root-port,id=pcie-root-port-2,port=0x2,addr=0x1.0x2,bus=pcie.0,chassis=3 \
    -device virtio-scsi-pci,id=virtio_scsi_pci0,bus=pcie-root-port-2,addr=0x0 \
    -blockdev node-name=file_snA,driver=file,auto-read-only=on,discard=unmap,aio=threads,filename=/home/kvm_autotest_root/images/snA.qcow2,cache.direct=on,cache.no-flush=off \
    -blockdev node-name=drive_snA,driver=qcow2,read-only=off,cache.direct=on,cache.no-flush=off,file=file_snA \
    -device scsi-hd,id=snA,drive=drive_snA,write-cache=on \
    -device pcie-root-port,id=pcie-root-port-3,port=0x3,addr=0x1.0x3,bus=pcie.0,chassis=4 \
    -device virtio-net-pci,mac=9a:58:27:a6:c2:c5,id=idIUrp7T,netdev=ids1Vu7l,bus=pcie-root-port-3,addr=0x0  \
    -netdev tap,id=ids1Vu7l,vhost=on,vhostfd=25,fd=19  \
    -vnc :0  \
    -rtc base=utc,clock=host,driftfix=slew  \
    -boot menu=off,order=cdn,once=c,strict=off \
    -enable-kvm \
    -device pcie-root-port,id=pcie_extra_root_port_0,multifunction=on,bus=pcie.0,addr=0x3,chassis=5


Results:
The guest booted successfully.

Comment 9 Yanan Fu 2022-02-09 06:13:59 UTC
QE bot(pre verify): Set 'Verified:Tested,SanityOnly' as gating/tier1 test pass.

Comment 11 errata-xmlrpc 2022-05-10 13:25:20 UTC
Since the problem described in this bug report should be
resolved in a recent advisory, it has been closed with a
resolution of ERRATA.

For information on the advisory (Moderate: virt:rhel and virt-devel:rhel security, bug fix, and enhancement update), and where to find the updated
files, follow the link below.

If the solution does not work for you, open a new bug report.

https://access.redhat.com/errata/RHSA-2022:1759