qemu segfaults in virtio-scsi driver if underlying device returns -EIO

Bug #1804323 reported by Richard Jones
6
This bug affects 1 person
Affects: QEMU
Status: Fix Released
Importance: Undecided
Assigned to: Unassigned
Milestone: (none)

Bug Description

Reported downstream in Fedora: https://bugzilla.redhat.com/show_bug.cgi?id=1650975

Using qemu from git and reasonably recent nbdkit, this command injects -EIO
errors into the block device which virtio-scsi is reading from:

$ nbdkit --filter=error memory size=64M error-rate=100% \
    --run 'x86_64-softmmu/qemu-system-x86_64 -device virtio-scsi,id=scsi -drive file=$nbd,format=raw,id=hd0,if=none -device scsi-hd,drive=hd0'
nbdkit: memory[1]: error: injecting EIO error into pread
nbdkit: memory[1]: error: injecting EIO error into pread
qemu-system-x86_64: hw/scsi/scsi-bus.c:1374: scsi_req_complete: Assertion `req->status == -1' failed.

The stack trace is:

Thread 5 (Thread 0x7f33e1f8b700 (LWP 10474)):
#0 0x00007f33fe0bf371 in __GI___poll (fds=0x559b07199490, nfds=1, timeout=-1)
    at ../sysdeps/unix/sysv/linux/poll.c:29
#1 0x00007f34061df5e6 in () at /lib64/libglib-2.0.so.0
#2 0x00007f34061df710 in g_main_context_iteration ()
    at /lib64/libglib-2.0.so.0
#3 0x00007f34061df761 in () at /lib64/libglib-2.0.so.0
#4 0x00007f34062086ea in () at /lib64/libglib-2.0.so.0
#5 0x00007f33fe19b58e in start_thread (arg=<optimized out>)
    at pthread_create.c:486
#6 0x00007f33fe0ca593 in clone ()
    at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Thread 4 (Thread 0x7f33e3fff700 (LWP 10473)):
#0 0x00007f33fe1a4a8d in __lll_lock_wait ()
    at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:103
#1 0x00007f33fe19ddf8 in __GI___pthread_mutex_lock (mutex=mutex@entry=0x559b054697a0 <qemu_global_mutex>) at ../nptl/pthread_mutex_lock.c:78
#2 0x0000559b04f6b103 in qemu_mutex_lock_impl (mutex=0x559b054697a0 <qemu_global_mutex>, file=0x559b04f87041 "/home/rjones/d/qemu/exec.c", line=3197)
    at util/qemu-thread-posix.c:66
#3 0x0000559b04b722ee in qemu_mutex_lock_iothread_impl (file=file@entry=0x559b04f87041 "/home/rjones/d/qemu/exec.c", line=line@entry=3197)
    at /home/rjones/d/qemu/cpus.c:1845
#4 0x0000559b04b31859 in prepare_mmio_access (mr=<optimized out>, mr=<optimized out>) at /home/rjones/d/qemu/exec.c:3197
#5 0x0000559b04b381d4 in address_space_ldub (as=<optimized out>, addr=<optimized out>, attrs=..., result=result@entry=0x0)
    at /home/rjones/d/qemu/memory_ldst.inc.c:188
#6 0x0000559b04c61cd0 in helper_inb (env=<optimized out>, port=<optimized out>) at /home/rjones/d/qemu/target/i386/cpu.h:1846
#7 0x00007f33e889dc3e in code_gen_buffer ()
#8 0x0000559b04bb3b87 in cpu_tb_exec (itb=<optimized out>, cpu=0x7f33e8876100 <code_gen_buffer+684243>) at /home/rjones/d/qemu/accel/tcg/cpu-exec.c:171
#9 0x0000559b04bb3b87 in cpu_loop_exec_tb (tb_exit=<synthetic pointer>, last_tb=<synthetic pointer>, tb=<optimized out>, cpu=0x7f33e8876100 <code_gen_buffer+684243>) at /home/rjones/d/qemu/accel/tcg/cpu-exec.c:615
#10 0x0000559b04bb3b87 in cpu_exec (cpu=cpu@entry=0x559b05db57a0)
    at /home/rjones/d/qemu/accel/tcg/cpu-exec.c:725
#11 0x0000559b04b7088f in tcg_cpu_exec (cpu=0x559b05db57a0)
    at /home/rjones/d/qemu/cpus.c:1425
#12 0x0000559b04b72c03 in qemu_tcg_cpu_thread_fn (arg=0x559b05db57a0)
    at /home/rjones/d/qemu/cpus.c:1729
#13 0x0000559b04b72c03 in qemu_tcg_cpu_thread_fn (arg=arg@entry=0x559b05db57a0)
    at /home/rjones/d/qemu/cpus.c:1703
#14 0x0000559b04f6afba in qemu_thread_start (args=<optimized out>)
    at util/qemu-thread-posix.c:498
#15 0x00007f33fe19b58e in start_thread (arg=<optimized out>)
    at pthread_create.c:486
#16 0x00007f33fe0ca593 in clone ()
    at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Thread 3 (Thread 0x7f33e178a700 (LWP 10475)):
#0 0x00007f33fe0bf371 in __GI___poll (fds=0x559b071aa760, nfds=2, timeout=-1)
    at ../sysdeps/unix/sysv/linux/poll.c:29
#1 0x00007f34061df5e6 in () at /lib64/libglib-2.0.so.0
#2 0x00007f34061df9a2 in g_main_loop_run () at /lib64/libglib-2.0.so.0
#3 0x00007f34032ca90a in () at /lib64/libgio-2.0.so.0
#4 0x00007f34062086ea in () at /lib64/libglib-2.0.so.0
#5 0x00007f33fe19b58e in start_thread (arg=<optimized out>)
    at pthread_create.c:486
#6 0x00007f33fe0ca593 in clone ()
    at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Thread 2 (Thread 0x7f33eb050700 (LWP 10471)):
#0 0x00007f33fe1a5400 in __GI___nanosleep (requested_time=0x7f33eb04d270, remaining=0x7f33eb04d280) at ../sysdeps/unix/sysv/linux/nanosleep.c:28
#1 0x00007f3406209e17 in g_usleep () at /lib64/libglib-2.0.so.0
#2 0x0000559b04f7cb80 in call_rcu_thread (opaque=opaque@entry=0x0)
    at util/rcu.c:253
#3 0x0000559b04f6afba in qemu_thread_start (args=<optimized out>)
    at util/qemu-thread-posix.c:498
#4 0x00007f33fe19b58e in start_thread (arg=<optimized out>)
    at pthread_create.c:486
#5 0x00007f33fe0ca593 in clone ()
    at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Thread 1 (Thread 0x7f33eb9b42c0 (LWP 10470)):
#0 0x00007f33fe00553f in __GI_raise (sig=sig@entry=6)
    at ../sysdeps/unix/sysv/linux/raise.c:50
#1 0x00007f33fdfef895 in __GI_abort () at abort.c:79
#2 0x00007f33fdfef769 in __assert_fail_base (fmt=0x7f33fe156ea8 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x559b050203cf "req->status == -1", file=0x559b050203aa "hw/scsi/scsi-bus.c", line=1374, function=0x559b05020dc0 <__PRETTY_FUNCTION__.32414> "scsi_req_complete") at assert.c:92
#3 0x00007f33fdffd9f6 in __GI___assert_fail (assertion=assertion@entry=0x559b050203cf "req->status == -1", file=file@entry=0x559b050203aa "hw/scsi/scsi-bus.c", line=line@entry=1374, function=function@entry=0x559b05020dc0 <__PRETTY_FUNCTION__.32414> "scsi_req_complete") at assert.c:101
#4 0x0000559b04da23c0 in scsi_req_complete (req=<optimized out>, status=<optimized out>) at hw/scsi/scsi-bus.c:1374
#5 0x0000559b04d9cc60 in scsi_dma_complete_noio (r=0x559b069c3600, ret=<optimized out>) at hw/scsi/scsi-disk.c:281
#6 0x0000559b04d9cd0f in scsi_dma_complete (opaque=0x559b069c3600, ret=-5)
    at hw/scsi/scsi-disk.c:302
#7 0x0000559b04c91607 in dma_complete (ret=-5, dbs=0x559b07808000)
    at dma-helpers.c:116
#8 0x0000559b04c91607 in dma_blk_cb (opaque=0x559b07808000, ret=-5)
    at dma-helpers.c:138
#9 0x0000559b04ec411e in blk_aio_complete (acb=0x559b07514fa0)
    at block/block-backend.c:1345
#10 0x0000559b04f7e32b in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:116
#11 0x00007f33fe01b200 in __start_context ()
    at ../sysdeps/unix/sysv/linux/x86_64/__start_context.S:91
#12 0x00007ffc0896b040 in ()
#13 0x0000000000000000 in ()

I bisected this to:

40dce4ee61c68395f6d463fae792f61b7c003bce is the first bad commit
commit 40dce4ee61c68395f6d463fae792f61b7c003bce
Author: Paolo Bonzini <email address hidden>
Date: Sat Oct 13 11:52:34 2018 +0200

    scsi-disk: fix rerror/werror=ignore

    rerror=ignore was returning true from scsi_handle_rw_error but the callers were not
    calling scsi_req_complete when rerror=ignore returns true (this is the correct thing
    to do when true is returned after executing a passthrough command). Fix this by
    calling it in scsi_handle_rw_error.

    Signed-off-by: Paolo Bonzini <email address hidden>

:040000 040000 311386b9b91d77840a849459ab6ae41a37fd7f42 8adcda67d7487bcc18966f096c9923da3b8dc0b9 M hw

Revision history for this message
Richard Jones (rjones-redhat) wrote :

Kevin suggested this change, which works for me:

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 6eb258d3f3..0e9027c8f3 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -482,7 +482,7 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed)
     if (action == BLOCK_ERROR_ACTION_STOP) {
         scsi_req_retry(&r->req);
     }
-    return false;
+    return true;
 }

 static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)

Revision history for this message
Peter Maydell (pmaydell) wrote :

The fix as suggested in comment 1 is now in QEMU master as commit 1c7f618f689b0b5; this is in 3.1.0-rc3 and will be in the final 3.1 release.

Changed in qemu:
status: New → Fix Committed
Thomas Huth (th-huth)
Changed in qemu:
status: Fix Committed → Fix Released
To post a comment you must log in.
This report contains Public information  
Everyone can see this information.

Other bug subscribers

Remote bug watches

Bug watches keep track of this bug in other bug trackers.