Bug 1764120

Summary: [Data plane]virtio_scsi_ctx_check: Assertion `blk_get_aio_context(d->conf.blk) == s->ctx' failed when unplug a device that running block stream on it
Product: Red Hat Enterprise Linux 7
Reporter: aihua liang <aliang>
Component: qemu-kvm-rhev
Assignee: Sergio Lopez <slopezpa>
Status: CLOSED ERRATA
QA Contact: aihua liang <aliang>
Severity: unspecified
Docs Contact:
Priority: medium
Version: 7.8
CC: coli, jinzhao, juzhang, lmiksik, mjankula, ngu, qzhang, redhat-bugzilla, virt-maint
Target Milestone: rc
Target Release: ---
Hardware: Unspecified
OS: Unspecified
Whiteboard:
Fixed In Version: qemu-kvm-rhev-2.12.0-39.el7
Doc Type: If docs needed, set a value
Doc Text:
Story Points: ---
Clone Of:
Environment:
Last Closed: 2020-03-31 14:34:56 UTC
Type: Bug
Regression: ---
Mount Type: ---
Documentation: ---
CRM:
Verified Versions:
Category: ---
oVirt Team: ---
RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: ---
Target Upstream Version:
Embargoed:

Description aihua liang 2019-10-22 09:42:19 UTC
Description of problem:
  virtio_scsi_ctx_check: Assertion `blk_get_aio_context(d->conf.blk) == s->ctx' failed when unplugging a device that has a block stream running on it

Version-Release number of selected component (if applicable):
 qemu-kvm-rhev version: qemu-kvm-rhev-2.12.0-38.el7.x86_64

How reproducible:


Steps to Reproduce:
1. Start the guest with the following qemu command:
   /usr/libexec/qemu-kvm \
    -name 'avocado-vt-vm1' \
    -machine pc  \
    -nodefaults \
    -device VGA,bus=pci.0,addr=0x2  \
    -chardev socket,id=qmp_id_qmp1,path=/var/tmp/monitor-qmp1-20191022-051824-ZDUTGAgt,server,nowait \
    -mon chardev=qmp_id_qmp1,mode=control  \
    -chardev socket,id=qmp_id_catch_monitor,path=/var/tmp/monitor-catch_monitor-20191022-051824-ZDUTGAgt,server,nowait \
    -mon chardev=qmp_id_catch_monitor,mode=control \
    -device pvpanic,ioport=0x505,id=idR4rYF2 \
    -chardev socket,path=/var/tmp/serial-serial0-20191022-051824-ZDUTGAgt,nowait,id=chardev_serial0,server \
    -device isa-serial,id=serial0,chardev=chardev_serial0  \
    -chardev socket,id=seabioslog_id_20191022-051824-ZDUTGAgt,path=/var/tmp/seabios-20191022-051824-ZDUTGAgt,server,nowait \
    -device isa-debugcon,chardev=seabioslog_id_20191022-051824-ZDUTGAgt,iobase=0x402 \
    -device nec-usb-xhci,id=usb1,bus=pci.0,addr=0x3 \
    -object iothread,id=iothread0 \
    -object iothread,id=iothread1 \
    -device virtio-scsi-pci,iothread=iothread0,id=virtio_scsi_pci0,bus=pci.0,addr=0x4 \
    -drive id=drive_image1,if=none,snapshot=off,aio=threads,cache=none,format=qcow2,file=/home/kvm_autotest_root/images/rhel78-64-virtio-scsi.qcow2 \
    -device scsi-hd,id=image1,drive=drive_image1 \
    -device virtio-net-pci,mac=9a:ec:f9:12:10:82,id=idvaJEVE,netdev=idsDQKUV,bus=pci.0,addr=0x5  \
    -netdev tap,id=idsDQKUV,vhost=on \
    -m 5120  \
    -smp 2,maxcpus=2,cores=1,threads=1,sockets=2  \
    -cpu 'Nehalem',+kvm_pv_unhalt \
    -device usb-tablet,id=usb-tablet1,bus=usb1.0,port=1  \
    -vnc :0  \
    -rtc base=utc,clock=host,driftfix=slew  \
    -boot menu=off,strict=off,order=cdn,once=c \
    -enable-kvm \
    -monitor stdio \

2. Write a file in the guest with dd, then create a live snapshot.
   (guest)#dd if=/dev/urandom of=/home/block_stream_test bs=128k count=100
   {'execute': 'blockdev-snapshot-sync', 'arguments': {'device':'drive_image1', 'snapshot-file': '/home/kvm_autotest_root/images/sn1', 'mode': 'absolute-paths', 'format': 'qcow2'}, 'id': 'Ad73GLzA'}
   
3. Start a block-stream job on the device
   {'execute': 'block-stream', 'arguments': {'device':'drive_image1', 'speed': 0}, 'id': 'uEpv2EZY'}
   {"timestamp": {"seconds": 1571735961, "microseconds": 469505}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "drive_image1"}}
   {"timestamp": {"seconds": 1571735961, "microseconds": 469581}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "drive_image1"}}

4. Query block jobs
   {'execute': 'query-block-jobs', 'id': 'mivnJiuq'}
   {"return": [{"auto-finalize": true, "io-status": "ok", "device": "drive_image1", "auto-dismiss": true, "busy": true, "len": 21474836480, "offset": 0, "status": "running", "paused": false, "speed": 0, "ready": false, "type": "stream"}], "id": "mivnJiuq"}

5. Hot-unplug device image1 (backed by drive_image1)
   {'execute': 'device_del', 'arguments': {'id': 'image1'}, 'id': 'tLzJV1Mk'}
   {"timestamp": {"seconds": 1571735961, "microseconds": 563780}, "event": "JOB_STATUS_CHANGE", "data": {"status": "paused", "id": "drive_image1"}}
   {"timestamp": {"seconds": 1571735961, "microseconds": 563977}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "drive_image1"}}
   {"timestamp": {"seconds": 1571735961, "microseconds": 564027}, "event": "JOB_STATUS_CHANGE", "data": {"status": "paused", "id": "drive_image1"}}

Actual results:
After step 5, QEMU aborts and dumps core with:
  qemu-kvm: /builddir/build/BUILD/qemu-2.12.0/hw/scsi/virtio-scsi.c:246: virtio_scsi_ctx_check: Assertion `blk_get_aio_context(d->conf.blk) == s->ctx' failed.

/tmp/aexpect_EKBU7afY/aexpect-WMPzAb.sh: line 1: 27030 Aborted                 (core dumped) MALLOC_PERTURB_=1 /usr/libexec/qemu-kvm -S -name 'avocado-vt-vm1' -machine pc -nodefaults

Core dump backtrace below:
  #gdb -c core.27030
  (gdb) bt
#0  0x00007f533e279387 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:55
#1  0x00007f533e27aa78 in __GI_abort () at abort.c:90
#2  0x00007f533e2721a6 in __assert_fail_base (fmt=0x7f533e3cdce0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=assertion@entry=0x55950e5527f0 "blk_get_aio_context(d->conf.blk) == s->ctx", file=file@entry=0x55950e552718 "/builddir/build/BUILD/qemu-2.12.0/hw/scsi/virtio-scsi.c", line=line@entry=246, function=function@entry=0x55950e552c90 <__PRETTY_FUNCTION__.30121> "virtio_scsi_ctx_check") at assert.c:92
#3  0x00007f533e272252 in __GI___assert_fail (assertion=assertion@entry=0x55950e5527f0 "blk_get_aio_context(d->conf.blk) == s->ctx", file=file@entry=0x55950e552718 "/builddir/build/BUILD/qemu-2.12.0/hw/scsi/virtio-scsi.c", line=line@entry=246, function=function@entry=0x55950e552c90 <__PRETTY_FUNCTION__.30121> "virtio_scsi_ctx_check") at assert.c:101
#4  0x000055950e145dd1 in virtio_scsi_ctx_check (s=<optimized out>, s=<optimized out>, d=0x559511a7c500)
    at /usr/src/debug/qemu-2.12.0/hw/scsi/virtio-scsi.c:246
#5  0x000055950e1e09d6 in virtio_scsi_handle_cmd_vq (s=<optimized out>, s=<optimized out>, d=0x559511a7c500)
    at /usr/src/debug/qemu-2.12.0/hw/scsi/virtio-scsi.c:246
#6  0x000055950e1e09d6 in virtio_scsi_handle_cmd_vq (req=0x5595119423c0, s=0x559512986170)
    at /usr/src/debug/qemu-2.12.0/hw/scsi/virtio-scsi.c:559
#7  0x000055950e1e09d6 in virtio_scsi_handle_cmd_vq (s=s@entry=0x559512986170, vq=vq@entry=0x55951298e100)
    at /usr/src/debug/qemu-2.12.0/hw/scsi/virtio-scsi.c:599
#8  0x000055950e1e151a in virtio_scsi_data_plane_handle_cmd (vdev=<optimized out>, vq=0x55951298e100)
    at /usr/src/debug/qemu-2.12.0/hw/scsi/virtio-scsi-dataplane.c:60
#9  0x000055950e1ef8e6 in virtio_queue_host_notifier_aio_poll (vq=0x55951298e100)
    at /usr/src/debug/qemu-2.12.0/hw/virtio/virtio.c:1520
#10 0x000055950e1ef8e6 in virtio_queue_host_notifier_aio_poll (opaque=0x55951298e168)
    at /usr/src/debug/qemu-2.12.0/hw/virtio/virtio.c:2451
#11 0x000055950e4ad45e in run_poll_handlers_once (ctx=ctx@entry=0x559510b57a40, timeout=timeout@entry=0x7f5335b42ad8)
    at util/aio-posix.c:501
#12 0x000055950e4aded9 in aio_poll (timeout=0x7f5335b42ad8, ctx=0x559510b57a40) at util/aio-posix.c:584
#13 0x000055950e4aded9 in aio_poll (ctx=0x559510b57a40, blocking=blocking@entry=true) at util/aio-posix.c:615
#14 0x000055950e27ae7e in iothread_run (opaque=0x559510b75ea0) at iothread.c:64
#15 0x00007f533e618ea5 in start_thread (arg=0x7f5335b46700) at pthread_create.c:307
#16 0x00007f533e3418dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
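
For reference, the failing check at hw/scsi/virtio-scsi.c:246 looks roughly like the following. This is a sketch of the 2.12-era function, not an exact copy; the comments summarize the analysis in comment 11 below.

    static inline void virtio_scsi_ctx_check(VirtIOSCSI *s, SCSIDevice *d)
    {
        /* Only relevant while the dataplane (iothread) path is active. */
        if (s->dataplane_started && d && blk_is_available(d->conf.blk)) {
            /* The BlockBackend is expected to still live in the controller's
             * iothread AioContext; during unplug it is moved back to the main
             * context while requests are still in flight, so this fails. */
            assert(blk_get_aio_context(d->conf.blk) == s->ctx);
        }
    }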


Expected results:
 The device is unplugged successfully (the running block-stream job may fail with an error), and QEMU does not crash.

Additional info:

Comment 2 John Ferlan 2019-10-22 19:52:22 UTC
Sergio - I believe this is similar to bz 1683937 (perhaps comment 5)

Comment 11 Sergio Lopez 2019-11-20 18:11:40 UTC
Thanks a lot for the access to the machine and the detailed reproducer. With them I've been able to see the error myself and dig into it a bit.

It turns out this is an issue fixed upstream by this commit:

commit 9c5aad84da1c37429d06c193f23a8df6445ed29e
Author: Zhengui Li <lizhengui>
Date:   Mon Jul 22 23:05:20 2019 +0200

    virtio-scsi: fixed virtio_scsi_ctx_check failed when detaching scsi disk
    
    Commit a6f230c moved the BlockBackend back to the main AioContext on unplug. It sets the
    AioContext of the SCSIDevice to the main AioContext, but s->ctx is still the iothread
    AioContext (if the SCSI controller is configured with an iothread). So if there are
    in-flight requests during unplug, the assertion fails. The backtrace is below:
    (gdb) bt
    #0  0x0000ffff86aacbd0 in raise () from /lib64/libc.so.6
    #1  0x0000ffff86aadf7c in abort () from /lib64/libc.so.6
    #2  0x0000ffff86aa6124 in __assert_fail_base () from /lib64/libc.so.6
    #3  0x0000ffff86aa61a4 in __assert_fail () from /lib64/libc.so.6
    #4  0x0000000000529118 in virtio_scsi_ctx_check (d=<optimized out>, s=<optimized out>, s=<optimized out>) at /home/qemu-4.0.0/hw/scsi/virtio-scsi.c:246
    #5  0x0000000000529ec4 in virtio_scsi_handle_cmd_req_prepare (s=0x2779ec00, req=0xffff740397d0) at /home/qemu-4.0.0/hw/scsi/virtio-scsi.c:559
    #6  0x000000000052a228 in virtio_scsi_handle_cmd_vq (s=0x2779ec00, vq=0xffff7c6d7110) at /home/qemu-4.0.0/hw/scsi/virtio-scsi.c:603
    #7  0x000000000052afa8 in virtio_scsi_data_plane_handle_cmd (vdev=<optimized out>, vq=0xffff7c6d7110) at /home/qemu-4.0.0/hw/scsi/virtio-scsi-dataplane.c:59
    #8  0x000000000054d94c in virtio_queue_host_notifier_aio_poll (opaque=<optimized out>) at /home/qemu-4.0.0/hw/virtio/virtio.c:2452
    
    assert(blk_get_aio_context(d->conf.blk) == s->ctx) failed.
    
    To avoid the assertion failure, move the "if" block after qdev_simple_device_unplug_cb().
    
    In addition, to avoid another QEMU crash (shown below), add aio_disable_external() before
    qdev_simple_device_unplug_cb(), which disables further processing of external clients
    while qdev_simple_device_unplug_cb() runs.
    (gdb) bt
    #0  scsi_req_unref (req=0xffff6802c6f0) at hw/scsi/scsi-bus.c:1283
    #1  0x00000000005294a4 in virtio_scsi_handle_cmd_req_submit (req=<optimized out>,
        s=<optimized out>) at /home/qemu-4.0.0/hw/scsi/virtio-scsi.c:589
    #2  0x000000000052a2a8 in virtio_scsi_handle_cmd_vq (s=s@entry=0x9c90e90,
        vq=vq@entry=0xffff7c05f110) at /home/qemu-4.0.0/hw/scsi/virtio-scsi.c:625
    #3  0x000000000052afd8 in virtio_scsi_data_plane_handle_cmd (vdev=<optimized out>,
        vq=0xffff7c05f110) at /home/qemu-4.0.0/hw/scsi/virtio-scsi-dataplane.c:60
    #4  0x000000000054d97c in virtio_queue_host_notifier_aio_poll (opaque=<optimized out>)
        at /home/qemu-4.0.0/hw/virtio/virtio.c:2447
    #5  0x00000000009b204c in run_poll_handlers_once (ctx=ctx@entry=0x6efea40,
        timeout=timeout@entry=0xffff7d7f7308) at util/aio-posix.c:521
    #6  0x00000000009b2b64 in run_poll_handlers (ctx=ctx@entry=0x6efea40,
        max_ns=max_ns@entry=4000, timeout=timeout@entry=0xffff7d7f7308) at util/aio-posix.c:559
    #7  0x00000000009b2ca0 in try_poll_mode (ctx=ctx@entry=0x6efea40, timeout=0xffff7d7f7308,
        timeout@entry=0xffff7d7f7348) at util/aio-posix.c:594
    #8  0x00000000009b31b8 in aio_poll (ctx=0x6efea40, blocking=blocking@entry=true)
        at util/aio-posix.c:636
    #9  0x00000000006973cc in iothread_run (opaque=0x6ebd800) at iothread.c:75
    #10 0x00000000009b592c in qemu_thread_start (args=0x6efef60) at util/qemu-thread-posix.c:502
    #11 0x0000ffff8057f8bc in start_thread () from /lib64/libpthread.so.0
    #12 0x0000ffff804e5f8c in thread_start () from /lib64/libc.so.6
    (gdb) p bus
    $1 = (SCSIBus *) 0x0
    
    Signed-off-by: Zhengui li <lizhengui>
    Message-Id: <1563696502-7972-1-git-send-email-lizhengui>
    Signed-off-by: Paolo Bonzini <pbonzini>
    Message-Id: <1563829520-17525-1-git-send-email-pbonzini>
    Signed-off-by: Paolo Bonzini <pbonzini>

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index d0bdbff090..8b9e5e2b49 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -832,6 +832,7 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
     VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
     VirtIOSCSI *s = VIRTIO_SCSI(vdev);
     SCSIDevice *sd = SCSI_DEVICE(dev);
+    AioContext *ctx = s->ctx ?: qemu_get_aio_context();
 
     if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
         virtio_scsi_acquire(s);
@@ -841,14 +842,16 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
         virtio_scsi_release(s);
     }
 
+    aio_disable_external(ctx);
+    qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
+    aio_enable_external(ctx);
+
     if (s->ctx) {
         virtio_scsi_acquire(s);
         /* If other users keep the BlockBackend in the iothread, that's ok */
         blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL);
         virtio_scsi_release(s);
     }
-
-    qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
 }
 
 static struct SCSIBusInfo virtio_scsi_scsi_info = {

Sadly, we can't do a simple backport of this fix, because the hotplug mechanism has changed significantly from 2.12 to 4.2. In fact, in 2.12, the call to qdev_simple_device_unplug_cb() will cause the underlying device to be immediately released, so the attempt to access sd->conf.blk below will lead to a crash.
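
To illustrate, here is the upstream hunk above annotated as if applied naively to 2.12; this is a sketch of the hazard, not the actual 2.12 code path:

    aio_disable_external(ctx);
    qdev_simple_device_unplug_cb(hotplug_dev, dev, errp); /* in 2.12 this releases the device immediately */
    aio_enable_external(ctx);

    if (s->ctx) {
        virtio_scsi_acquire(s);
        /* 'sd' (and its conf.blk) was already freed by the unplug callback
         * above, so on 2.12 this dereference would be a use-after-free. */
        blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL);
        virtio_scsi_release(s);
    }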

I'm going to look for an alternative solution.

Comment 21 aihua liang 2019-12-04 06:35:14 UTC
Verified on qemu-kvm-rhev-2.12.0-39.el7.x86_64; it works OK, so setting the bug's status to "Verified".

Ran the test case 20 times; all passed.
cmd:
 python ConfigTest.py --guestname=RHEL.7.8 --platform=x86_64 --testcase=block_stream.with_hot_unplug.with_data_plane --nrepeat=20

Comment 23 errata-xmlrpc 2020-03-31 14:34:56 UTC
Since the problem described in this bug report should be
resolved in a recent advisory, it has been closed with a
resolution of ERRATA.

For information on the advisory, and where to find the updated
files, follow the link below.

If the solution does not work for you, open a new bug report.

https://access.redhat.com/errata/RHSA-2020:1216