While committing snapshot qemu crashes with SIGABRT

Bug #1570134 reported by Matthew Schumacher on 2016-04-13
6
This bug affects 1 person
Affects Status Importance Assigned to Milestone
QEMU
Undecided
Unassigned

Bug Description

Information:

OS: Slackware64-Current
Compiled with: gcc version 5.3.0 (GCC) / glibc 2.23
Compiled using:

CFLAGS="-O2 -fPIC" \
CXXFLAGS="-O2 -fPIC" \
LDFLAGS="-L/usr/lib64" \
./configure \
  --prefix=/usr \
  --sysconfdir=/etc \
  --localstatedir=/var \
  --libdir=/usr/lib64 \
  --enable-spice \
  --enable-kvm \
  --enable-glusterfs \
  --enable-libiscsi \
  --enable-libusb \
  --target-list=x86_64-softmmu,i386-softmmu \
  --enable-debug

Source: qemu-2.5.1.tar.bz2

Running as:

/usr/bin/qemu-system-x86_64 -name test1,debug-threads=on -S -machine pc-1.1,accel=kvm,usb=off -m 4096 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1 -uuid 4b30ec13-6609-4a56-8731-d400c38189ef -no-user-config -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-4-test1/monitor.sock,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=localtime,clock=vm,driftfix=slew -global kvm-pit.lost_tick_policy=discard -no-shutdown -boot strict=on -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive file=/datastore/vm/test1/test1.img,format=qcow2,if=none,id=drive-virtio-disk0 -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=2 -drive if=none,id=drive-ide0-1-0,readonly=on -device ide-cd,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0,bootindex=1 -netdev tap,fd=23,id=hostnet0,vhost=on,vhostfd=25 -device virtio-net pci,netdev=hostnet0,id=net0,mac=52:54:00:66:2e:0f,bus=pci.0,addr=0x3 -vnc 0.0.0.0:0 -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on

File system: zfs v0.6.5.6

While running:
virsh blockcommit test1 vda --active --pivot --verbose

VM running very heavy IO load

GDB reporting:

#0 0x00007fd80132c3f8 in raise () at /lib64/libc.so.6
#1 0x00007fd80132dffa in abort () at /lib64/libc.so.6
#2 0x00007fd801324c17 in __assert_fail_base () at /lib64/libc.so.6
#3 0x00007fd801324cc2 in () at /lib64/libc.so.6
#4 0x000055d9918d7572 in bdrv_replace_in_backing_chain (old=0x55d993ed9c10, new=0x55d9931ccc10) at block.c:2096
        __PRETTY_FUNCTION__ = "bdrv_replace_in_backing_chain"
#5 0x000055d991911869 in mirror_exit (job=0x55d993fef830, opaque=0x55d999bbefe0) at block/mirror.c:376
        to_replace = 0x55d993ed9c10
        s = 0x55d993fef830
        data = 0x55d999bbefe0
        replace_aio_context = <optimized out>
        src = 0x55d993ed9c10
#6 0x000055d9918da1dc in block_job_defer_to_main_loop_bh (opaque=0x55d9940ce850) at blockjob.c:481
        data = 0x55d9940ce850
        aio_context = 0x55d9931a2610
#7 0x000055d9918d014b in aio_bh_poll (ctx=ctx@entry=0x55d9931a2610) at async.c:92
        bh = <optimized out>
        bhp = <optimized out>
        next = 0x55d99440f910
        ret = 1
#8 0x000055d9918dc8c0 in aio_dispatch (ctx=0x55d9931a2610) at aio-posix.c:305
        node = <optimized out>
        progress = false
#9 0x000055d9918d000e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at async.c:231
        ctx = <optimized out>
#10 0x00007fd8037cf787 in g_main_context_dispatch () at /usr/lib64/libglib-2.0.so.0
#11 0x000055d9918db03b in main_loop_wait () at main-loop.c:211
        context = 0x55d9931a3200
        pfds = <optimized out>
        ret = 0
        spin_counter = 1
        ret = 0
        timeout = 4294967295
        timeout_ns = <optimized out>
#12 0x000055d9918db03b in main_loop_wait (timeout=<optimized out>) at main-loop.c:256
        ret = 0
        spin_counter = 1
        ret = 0
        timeout = 4294967295
        timeout_ns = <optimized out>
#13 0x000055d9918db03b in main_loop_wait (nonblocking=<optimized out>) at main-loop.c:504
        ret = 0
        timeout = 4294967295
        timeout_ns = <optimized out>
#14 0x000055d991679cc4 in main () at vl.c:1923
        nonblocking = <optimized out>
        last_io = 2
        i = <optimized out>
        snapshot = <optimized out>
        linux_boot = <optimized out>
        initrd_filename = <optimized out>
        kernel_filename = <optimized out>
        kernel_cmdline = <optimized out>
        boot_order = <optimized out>
        boot_once = <optimized out>
        ds = <optimized out>
        cyls = <optimized out>
        heads = <optimized out>
        secs = <optimized out>
        translation = <optimized out>
        hda_opts = <optimized out>
        opts = <optimized out>
        machine_opts = <optimized out>
        icount_opts = <optimized out>
        olist = <optimized out>
        optind = 49
        optarg = 0x7fffc6d27f43 "timestamp=on"
        loadvm = <optimized out>
        machine_class = 0x55d993194d10
        cpu_model = <optimized out>
        vga_model = 0x0
        qtest_chrdev = <optimized out>
        qtest_log = <optimized out>
        pid_file = <optimized out>
        incoming = <optimized out>
        defconfig = <optimized out>
        userconfig = false
        log_mask = <optimized out>
        log_file = <optimized out>
        trace_events = <optimized out>
        trace_file = <optimized out>
        maxram_size = <optimized out>
        ram_slots = <optimized out>
        vmstate_dump_file = <optimized out>
        main_loop_err = 0x0
        err = 0x0
        __func__ = "main"
#15 0x000055d991679cc4 in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4699
        i = <optimized out>
        snapshot = <optimized out>
        linux_boot = <optimized out>
        initrd_filename = <optimized out>
        kernel_filename = <optimized out>
        kernel_cmdline = <optimized out>
        boot_order = <optimized out>
        boot_once = <optimized out>
        ds = <optimized out>
        cyls = <optimized out>
        heads = <optimized out>
        secs = <optimized out>
        translation = <optimized out>
        hda_opts = <optimized out>
        opts = <optimized out>
        machine_opts = <optimized out>
        icount_opts = <optimized out>
        olist = <optimized out>
        optind = 49
        optarg = 0x7fffc6d27f43 "timestamp=on"
        loadvm = <optimized out>
        machine_class = 0x55d993194d10
        cpu_model = <optimized out>
        vga_model = 0x0
        qtest_chrdev = <optimized out>
        qtest_log = <optimized out>
        pid_file = <optimized out>
        incoming = <optimized out>
        defconfig = <optimized out>
        userconfig = false
        log_mask = <optimized out>
        log_file = <optimized out>
        trace_events = <optimized out>
        trace_file = <optimized out>
        maxram_size = <optimized out>
        ram_slots = <optimized out>
        vmstate_dump_file = <optimized out>
        main_loop_err = 0x0
        err = 0x0
        __func__ = "main"

I can reproduce this at will, and can provide more information per a dev's request.

On Wed, 04/13 23:18, Matthew Schumacher wrote:
> I can reproduce this at will, and can provide more information per a
> dev's request.

Could you please try v2.6.0-rc1?

Fam

Matthew Schumacher (schu-schu) wrote :
Download full text (6.2 KiB)

Sure, I did the same test and still got a SIGABRT, but the debug looks a little different:

Backtrace:

#0 0x00007f8f0d46a3f8 in raise () at /lib64/libc.so.6
#1 0x00007f8f0d46bffa in abort () at /lib64/libc.so.6
#2 0x00007f8f0d462c17 in __assert_fail_base () at /lib64/libc.so.6
#3 0x00007f8f0d462cc2 in () at /lib64/libc.so.6
#4 0x000055ff4ce33926 in mirror_run (s=0x55ff4fc00dd0) at block/mirror.c:335
        next_sector = 31174784
        next_chunk = 243553
        nb_chunks = 29
        end = 209715200
        sectors_per_chunk = 128
        source = 0x55ff4e1eb050
        sector_num = 31171072
        delay_ns = 0
        delay_ns = 0
        cnt = 157184
        should_complete = <optimized out>
        s = 0x55ff4fc00dd0
        data = <optimized out>
        bs = 0x55ff4e1eb050
        sector_num = <optimized out>
        end = <optimized out>
        length = <optimized out>
        last_pause_ns = <optimized out>
        bdi = {cluster_size = 65536, vm_state_offset = 107374182400, is_dirty = false, unallocated_blocks_are_zero = true, can_write_zeroes_with_unmap = true, needs_compressed_writes = false}
        backing_filename = "\000\021"
        ret = <optimized out>
        n = 1048576
        target_cluster_size = <optimized out>
        __PRETTY_FUNCTION__ = "mirror_run"
#5 0x000055ff4ce33926 in mirror_run (opaque=0x55ff4fc00dd0) at block/mirror.c:613
        delay_ns = 0
        cnt = 157184
        should_complete = <optimized out>
        s = 0x55ff4fc00dd0
        data = <optimized out>
        bs = 0x55ff4e1eb050
        sector_num = <optimized out>
        end = <optimized out>
        length = <optimized out>
        last_pause_ns = <optimized out>
        bdi = {cluster_size = 65536, vm_state_offset = 107374182400, is_dirty = false, unallocated_blocks_are_zero = true, can_write_zeroes_with_unmap = true, needs_compressed_writes = false}
        backing_filename = "\000\021"
        ret = <optimized out>
        n = 1048576
        target_cluster_size = <optimized out>
        __PRETTY_FUNCTION__ = "mirror_run"
#6 0x000055ff4ce9968a in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:78
        self = 0x55ff4f6c2c80
        co = 0x55ff4f6c2c80
#7 0x00007f8f0d47f560 in __start_context () at /lib64/libc.so.6
#8 0x00007ffc759cb060 in ()
#9 0x0000000000000000 in ()

I get this in the log:

qemu-system-x86_64: block/mirror.c:335: mirror_iteration: Assertion `hbitmap_next == next_sector' failed.

The system was compiled like this:

Install prefix /usr
BIOS directory /usr/share/qemu
binary directory /usr/bin
library directory /usr/lib64
module directory /usr/lib64/qemu
libexec directory /usr/libexec
include directory /usr/include
config directory /etc
local state directory /var
Manual directory /usr/share/man
ELF interp prefix /usr/gnemul/qemu-%M
Source path /tmp/qemu-2.6.0-rc1
C compiler cc
Host C compiler cc
C++ compiler c++
Objective-C compiler clang
ARFLAGS rv
CFLAGS -pthread -I/usr/include/glib-2.0 -I/usr/lib64/glib-2.0/include -g -O2 -fPIC
QEMU_CFLAGS -I/usr/include/pixman-1 -I$(SRC_PATH)/dtc/libfdt -DHAS_LIBSSH2...

Read more...

Matthew Schumacher (schu-schu) wrote :
Download full text (3.3 KiB)

It still fails with ext4:

#0 0x00007fbaa12b33f8 in raise () at /lib64/libc.so.6
#1 0x00007fbaa12b4ffa in abort () at /lib64/libc.so.6
#2 0x00007fbaa12abc17 in __assert_fail_base () at /lib64/libc.so.6
#3 0x00007fbaa12abcc2 in () at /lib64/libc.so.6
#4 0x00005646b990f926 in mirror_run (s=0x5646bc50f480) at block/mirror.c:335
        next_sector = 36659200
        next_chunk = 286400
        nb_chunks = 80
        end = 209715200
        sectors_per_chunk = 128
        source = 0x5646bcb70000
        sector_num = 36648960
        delay_ns = 0
        delay_ns = 0
        cnt = 15360
        should_complete = <optimized out>
        s = 0x5646bc50f480
        data = <optimized out>
        bs = 0x5646bcb70000
        sector_num = <optimized out>
        end = <optimized out>
        length = <optimized out>
        last_pause_ns = <optimized out>
        bdi = {cluster_size = 65536, vm_state_offset = 107374182400, is_dirty = false, unallocated_blocks_are_zero = true, can_write_zeroes_with_unmap = true, needs_compressed_writes = false}
        backing_filename = "\000"
        ret = <optimized out>
        n = 1048576
        target_cluster_size = <optimized out>
        __PRETTY_FUNCTION__ = "mirror_run"
#5 0x00005646b990f926 in mirror_run (opaque=0x5646bc50f480) at block/mirror.c:613
        delay_ns = 0
        cnt = 15360
        should_complete = <optimized out>
        s = 0x5646bc50f480
        data = <optimized out>
        bs = 0x5646bcb70000
        sector_num = <optimized out>
        end = <optimized out>
        length = <optimized out>
        last_pause_ns = <optimized out>
        bdi = {cluster_size = 65536, vm_state_offset = 107374182400, is_dirty = false, unallocated_blocks_are_zero = true, can_write_zeroes_with_unmap = true, needs_compressed_writes = false}
        backing_filename = "\000"
        ret = <optimized out>
        n = 1048576
        target_cluster_size = <optimized out>
        __PRETTY_FUNCTION__ = "mirror_run"
#6 0x00005646b997568a in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:78
        self = 0x5646bc5115b0
        co = 0x5646bc5115b0
#7 0x00007fbaa12c8560 in __start_context () at /lib64/libc.so.6
#8 0x00005646bd2b98b0 in ()
#9 0x0000000000000000 in ()

qemu-system-x86_64: block/mirror.c:335: mirror_iteration: Assertion `hbitmap_next == next_sector' failed.

I can't seem to get stable snapshotting and blockpull with a loaded VM.

Interestingly enough, the last command libvirt passes to qemu is:

2016-04-14 20:47:58.196+0000: 18932: debug : qemuMonitorJSONCommandWithFd:294 : Send command '{"execute":"query-block-jobs","id":"libvirt-69"}' for write with FD -1
2016-04-14 20:47:58.196+0000: 18932: info : qemuMonitorSend:1005 : QEMU_MONITOR_SEND_MSG: mon=0x7f1874001a30 msg={"execute":"query-block-jobs","id":"libvirt-69"}
2016-04-14 20:47:58.197+0000: 18929: info : qemuMonitorIOWrite:529 : QEMU_MONITOR_IO_WRITE: mon=0x7f1874001a30 buf={"execute":"query-block-jobs","id":"libvirt-69"}

Odd that it would SIGABRT on a smile query-block-jobs.

Even more interesting is that it crashes on the first or second or third snapshot/block-commit cycle when using EX...

Read more...

Matthew Schumacher (schu-schu) wrote :

I just tested master, and it does the same as 2.6.0-rc....

The 2.6.0 branch crashes much faster than 2.5.x

Max Reitz (xanclic) wrote :

Hi Matthew,

Thank you for your report! Could you try again with these two patches applied? Alternatively, you may fetch the resulting tree from https://github.com/XanClic/qemu.git, branch lp-1570134-pl (https://github.com/XanClic/qemu/archive/lp-1570134-pl.zip).

Max

Max Reitz (xanclic) wrote :

And the second patch, because I'm either too stupid to make Launchpad attach two files to a single comment, or because Launchpad actually doesn't want me to for some reason.

Matthew Schumacher (schu-schu) wrote :

Thank you for working on this. Super helpful to have someone looking at this issue!

With those two patches applied to 2.6.0-rc2 I still get the following:

qemu-system-x86_64: block/mirror.c:342: mirror_iteration: Assertion `hbitmap_next == next_sector' failed.

The line number confirms that qemu was patched before it was compiled. Here is the full backtrace:

#0 0x00007f4e5aa213f8 in raise () at /lib64/libc.so.6
#1 0x00007f4e5aa22ffa in abort () at /lib64/libc.so.6
#2 0x00007f4e5aa19c17 in __assert_fail_base () at /lib64/libc.so.6
#3 0x00007f4e5aa19cc2 in () at /lib64/libc.so.6
#4 0x0000564d5afc1dab in mirror_run (s=0x564d5eb9c2d0) at block/mirror.c:342
        hbitmap_next = <optimized out>
        next_sector = 29561984
        next_chunk = 230953
        nb_chunks = 4
        end = 209715200
        sectors_per_chunk = 128
        source = 0x564d5d273b00
        sector_num = 29561472
        delay_ns = 0
        delay_ns = 0
        cnt = <optimized out>
        should_complete = <optimized out>
        s = 0x564d5eb9c2d0
        data = <optimized out>
        bs = 0x564d5d273b00
        sector_num = <optimized out>
        end = <optimized out>
        length = <optimized out>
        last_pause_ns = <optimized out>
        bdi =
          {cluster_size = 65536, vm_state_offset = 107374182400, is_dirty = false, unallocated_blocks_are_zero = true, can_write_zeroes_with_unmap = true, needs_compressed_writes = false}
        backing_filename = "\000\060"
        ret = <optimized out>
        n = 1048576
        target_cluster_size = <optimized out>
        __PRETTY_FUNCTION__ = "mirror_run"
#5 0x0000564d5afc1dab in mirror_run (opaque=0x564d5eb9c2d0) at block/mirror.c:619
        delay_ns = 0
        cnt = <optimized out>
        should_complete = <optimized out>
        s = 0x564d5eb9c2d0
        data = <optimized out>
        bs = 0x564d5d273b00
        sector_num = <optimized out>
        end = <optimized out>
        length = <optimized out>
        last_pause_ns = <optimized out>
        bdi =
          {cluster_size = 65536, vm_state_offset = 107374182400, is_dirty = false, unallocated_blocks_are_zero = true, can_write_zeroes_with_unmap = true, needs_compressed_writes = false}
        backing_filename = "\000\060"
        ret = <optimized out>
        n = 1048576
        target_cluster_size = <optimized out>
        __PRETTY_FUNCTION__ = "mirror_run"
#6 0x0000564d5b027e4a in coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at util/coroutine-ucontext.c:78
        self = 0x564d5eacc520
        co = 0x564d5eacc520
#7 0x00007f4e5aa36560 in __start_context () at /lib64/libc.so.6
#8 0x00007ffc151258c0 in ()
#9 0x0000000000000000 in ()

Max Reitz (xanclic) wrote :

Hi Matthew,

I now reproduced the issue myself, and it appears the second patch just missed one little thing. The attached patch (together with patch 1 from above) fixes the problem for me.

(Also available from https://github.com/XanClic/qemu.git, branch lp-1570134-pl2; archive: https://github.com/XanClic/qemu/archive/lp-1570134-pl2.zip)

While it was probably more or less noticed by chance (this is most likely a different issue than the one in 2.5.1), thank you for bringing this up. 2.6.0 is close to release, so it's good that this issue was still found.

Max

Matthew Schumacher (schu-schu) wrote :
Download full text (14.3 KiB)

Max,

Qemu still crashes for me, but the debug is again very different. When I attach to the qemu process from gdb, it is unable to provide a backtrace when it crashes. The log file is different too. Any ideas?

qemu-system-x86_64: block.c:2307: bdrv_replace_in_backing_chain: Assertion `!bdrv_requests_pending(old)' failed.

(gdb) attach 5563
Attaching to process 5563
Reading symbols from /usr/bin/qemu-system-x86_64...cdone.
oReading symbols from /usr/lib64/libepoxy.so.0...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libdrm.so.2...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libgbm.so.1...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libX11.so.6...n(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libz.so.1...(no debugging symbols found)...done.
Reading symbols from /lib64/libaio.so.1...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libiscsi.so.4...done.
Reading symbols from /usr/lib64/libcurl.so.4...(no debugging symbols found)...done.
Reading symbols from /lib64/libacl.so.1...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libgfapi.so.0...done.
Reading symbols from /usr/lib64/libglusterfs.so.0...done.
Reading symbols from /usr/lib64/libgfrpc.so.0...done.
Reading symbols from /usr/lib64/libgfxdr.so.0...done.
Reading symbols from /lib64/libuuid.so.1...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libssh2.so.1...done.
Reading symbols from /lib64/libbz2.so.1...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libpixman-1.so.0...(no debugging symbols found)...done.
Reading symbols from /lib64/libutil.so.1...(no debugging symbols found)...done.
Reading symbols from /lib64/libncurses.so.5...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libpng16.so.16...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libjpeg.so.62...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libsasl2.so.3...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libSDL-1.2.so.0...(no debugging symbols found)...done.
Reading symbols from /lib64/libpthread.so.0...(no debugging symbols found)...done.
[New LWP 5588]
[New LWP 5587]
[New LWP 5586]
[New LWP 5585]
[New LWP 5584]
[New LWP 5583]
[New LWP 5582]
[New LWP 5581]
[New LWP 5580]
[New LWP 5579]
[New LWP 5578]
[New LWP 5577]
[New LWP 5576]
[New LWP 5575]
[New LWP 5574]
[New LWP 5573]
[New LWP 5572]
[New LWP 5571]
[New LWP 5570]
[New LWP 5568]
[New LWP 5567]
[New LWP 5566]
[New LWP 5564]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Reading symbols from /usr/lib64/libvte.so.9...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libgtk-x11-2.0.so.0...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libgdk-x11-2.0.so.0...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libpangocairo-1.0.so.0...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libatk-1.0.so.0...(no debugging symbols found)...done.
Reading symbols from /usr/lib64/libgdk_pixbuf-2.0....

On Wed, 04/20 22:03, Max Reitz wrote:
> On 20.04.2016 20:09, Max Reitz wrote:
> > On 20.04.2016 02:03, Matthew Schumacher wrote:
> >> Max,
> >>
> >> Qemu still crashes for me, but the debug is again very different. When
> >> I attach to the qemu process from gdb, it is unable to provide a
> >> backtrace when it crashes. The log file is different too. Any ideas?
> >>
> >> qemu-system-x86_64: block.c:2307: bdrv_replace_in_backing_chain:
> >> Assertion `!bdrv_requests_pending(old)' failed.
> >
> > This message is exactly the same as you saw in 2.5.1, so I guess we've
> > at least averted a regression in 2.6.0.
>
> I get the same message in 2.5.0, in 2.4.0 it's "Co-routine re-entered
> recursively". 2.3.0 works fine.
>
> Bisecting the regression between 2.3.0 and 2.4.0 interestingly yields
> 48ac0a4df84662f as the problematic commit, but I can't imagine that this
> is the root issue. The effective change it brings is that for active
> commits, the buf_size is no longer the same as the granularity, but the
> default mirror buf_size instead.
>
> When forcing buf_size to the granularity, the issue first appears with
> commit 3f09bfbc7bee812 (after 2.4.0, before 2.5.0), which is much less
> surprising, because this is the one that introduced the assertion in the
> first place.
>
> However, I still don't think the assertion is the problem but the fact
> that the guest device can still send requests after bdrv_drained_begin().

Thanks for debugging this.

bdrv_drained_begin isn't effective because the guest notifier handler is not
registered as "external":

  virtio_queue_set_host_notifier_fd_handler
    event_notifier_set_handler
      qemu_set_fd_handler
        aio_set_fd_handler(ctx, fd,
                           is_external, /* false */
                           ...)

is_external SHOULD be true here.

Fam Zheng (famz) wrote :

On Thu, 04/21 08:34, Fam Zheng wrote:
> On Wed, 04/20 22:03, Max Reitz wrote:
> > On 20.04.2016 20:09, Max Reitz wrote:
> > > On 20.04.2016 02:03, Matthew Schumacher wrote:
> > >> Max,
> > >>
> > >> Qemu still crashes for me, but the debug is again very different. When
> > >> I attach to the qemu process from gdb, it is unable to provide a
> > >> backtrace when it crashes. The log file is different too. Any ideas?
> > >>
> > >> qemu-system-x86_64: block.c:2307: bdrv_replace_in_backing_chain:
> > >> Assertion `!bdrv_requests_pending(old)' failed.
> > >
> > > This message is exactly the same as you saw in 2.5.1, so I guess we've
> > > at least averted a regression in 2.6.0.
> >
> > I get the same message in 2.5.0, in 2.4.0 it's "Co-routine re-entered
> > recursively". 2.3.0 works fine.
> >
> > Bisecting the regression between 2.3.0 and 2.4.0 interestingly yields
> > 48ac0a4df84662f as the problematic commit, but I can't imagine that this
> > is the root issue. The effective change it brings is that for active
> > commits, the buf_size is no longer the same as the granularity, but the
> > default mirror buf_size instead.
> >
> > When forcing buf_size to the granularity, the issue first appears with
> > commit 3f09bfbc7bee812 (after 2.4.0, before 2.5.0), which is much less
> > surprising, because this is the one that introduced the assertion in the
> > first place.
> >
> > However, I still don't think the assertion is the problem but the fact
> > that the guest device can still send requests after bdrv_drained_begin().
>
> Thanks for debugging this.
>
> bdrv_drained_begin isn't effective because the guest notifier handler is not
> registered as "external":
>
> virtio_queue_set_host_notifier_fd_handler
> event_notifier_set_handler
> qemu_set_fd_handler
> aio_set_fd_handler(ctx, fd,
> is_external, /* false */
> ...)
>
>
> is_external SHOULD be true here.
>

This patch survives the reproducer I have on top of master (also submitted to
qemu-devel for 2.6):

---

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index f745c4a..002c2c6 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1829,10 +1829,11 @@ void virtio_queue_set_host_notifier_fd_handler(VirtQueue *vq, bool assign,
                                                bool set_handler)
 {
     if (assign && set_handler) {
- event_notifier_set_handler(&vq->host_notifier,
- virtio_queue_host_notifier_read);
+ aio_set_event_notifier(qemu_get_aio_context(), &vq->host_notifier,
+ true, virtio_queue_host_notifier_read);
     } else {
- event_notifier_set_handler(&vq->host_notifier, NULL);
+ aio_set_event_notifier(qemu_get_aio_context(), &vq->host_notifier,
+ true, NULL);
     }
     if (!assign) {
         /* Test and clear notifier before after disabling event,

Peter Maydell (pmaydell) wrote :

On 20 April 2016 at 19:09, Max Reitz <email address hidden> wrote:
> On 20.04.2016 02:03, Matthew Schumacher wrote:
>> Qemu still crashes for me, but the debug is again very different. When
>> I attach to the qemu process from gdb, it is unable to provide a
>> backtrace when it crashes. The log file is different too. Any ideas?
>>
>> qemu-system-x86_64: block.c:2307: bdrv_replace_in_backing_chain:
>> Assertion `!bdrv_requests_pending(old)' failed.
>
> This message is exactly the same as you saw in 2.5.1, so I guess we've
> at least averted a regression in 2.6.0.

Could somebody summarize for me the state of this bug w.r.t. the
upcoming release? In particular:
 * are there any patches on-list for it which should go into rc3?
 * are there any further problems which we plan to fix for 2.6 but
   which there aren't patches for yet?

thanks
-- PMM

Am 21.04.2016 um 13:35 hat Peter Maydell geschrieben:
> On 20 April 2016 at 19:09, Max Reitz <email address hidden> wrote:
> > On 20.04.2016 02:03, Matthew Schumacher wrote:
> >> Qemu still crashes for me, but the debug is again very different. When
> >> I attach to the qemu process from gdb, it is unable to provide a
> >> backtrace when it crashes. The log file is different too. Any ideas?
> >>
> >> qemu-system-x86_64: block.c:2307: bdrv_replace_in_backing_chain:
> >> Assertion `!bdrv_requests_pending(old)' failed.
> >
> > This message is exactly the same as you saw in 2.5.1, so I guess we've
> > at least averted a regression in 2.6.0.
>
> Could somebody summarize for me the state of this bug w.r.t. the
> upcoming release? In particular:
> * are there any patches on-list for it which should go into rc3?
> * are there any further problems which we plan to fix for 2.6 but
> which there aren't patches for yet?

The first part of the bug (the regression since 2.5) was fixed with the
pull request that I sent you yesterday. For the remaining part, Fam sent
this patch, which hasn't been applied yet:

[PATCH for-2.6] virtio: Register host notifier handler as external

Kevin

Matthew Schumacher (schu-schu) wrote :

Running master as of this morning 4/22 and I'm not getting any more crashes, and I'm flat beating on it. RC3 still crashes on me, so whatever the fix is, came after rc3.

Download full text (8.2 KiB)

On Fri, 04/22 18:55, Matthew Schumacher wrote:
> Running master as of this morning 4/22 and I'm not getting any more
> crashes, and I'm flat beating on it. RC3 still crashes on me, so
> whatever the fix is, came after rc3.

Matthew, It was bcd82a9..ab27c3b from last Friday (yes, after -rc3).

Thank you so much for your reporting and testing.

Fam

>
> --
> You received this bug notification because you are a member of qemu-
> devel-ml, which is subscribed to QEMU.
> https://bugs.launchpad.net/bugs/1570134
>
> Title:
> While committing snapshot qemu crashes with SIGABRT
>
> Status in QEMU:
> New
>
> Bug description:
> Information:
>
> OS: Slackware64-Current
> Compiled with: gcc version 5.3.0 (GCC) / glibc 2.23
> Compiled using:
>
> CFLAGS="-O2 -fPIC" \
> CXXFLAGS="-O2 -fPIC" \
> LDFLAGS="-L/usr/lib64" \
> ./configure \
> --prefix=/usr \
> --sysconfdir=/etc \
> --localstatedir=/var \
> --libdir=/usr/lib64 \
> --enable-spice \
> --enable-kvm \
> --enable-glusterfs \
> --enable-libiscsi \
> --enable-libusb \
> --target-list=x86_64-softmmu,i386-softmmu \
> --enable-debug
>
> Source: qemu-2.5.1.tar.bz2
>
> Running as:
>
> /usr/bin/qemu-system-x86_64 -name test1,debug-threads=on -S -machine
> pc-1.1,accel=kvm,usb=off -m 4096 -realtime mlock=off -smp
> 2,sockets=2,cores=1,threads=1 -uuid
> 4b30ec13-6609-4a56-8731-d400c38189ef -no-user-config -nodefaults
> -chardev
> socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-4-test1/monitor.sock,server,nowait
> -mon chardev=charmonitor,id=monitor,mode=control -rtc
> base=localtime,clock=vm,driftfix=slew -global kvm-
> pit.lost_tick_policy=discard -no-shutdown -boot strict=on -device
> piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive
> file=/datastore/vm/test1/test1.img,format=qcow2,if=none,id=drive-
> virtio-disk0 -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive
> =drive-virtio-disk0,id=virtio-disk0,bootindex=2 -drive if=none,id
> =drive-ide0-1-0,readonly=on -device ide-cd,bus=ide.1,unit=0,drive
> =drive-ide0-1-0,id=ide0-1-0,bootindex=1 -netdev
> tap,fd=23,id=hostnet0,vhost=on,vhostfd=25 -device virtio-net
> pci,netdev=hostnet0,id=net0,mac=52:54:00:66:2e:0f,bus=pci.0,addr=0x3
> -vnc 0.0.0.0:0 -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 -device
> virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -msg timestamp=on
>
> File system: zfs v0.6.5.6
>
> While running:
> virsh blockcommit test1 vda --active --pivot --verbose
>
> VM running very heavy IO load
>
> GDB reporting:
>
> #0 0x00007fd80132c3f8 in raise () at /lib64/libc.so.6
> #1 0x00007fd80132dffa in abort () at /lib64/libc.so.6
> #2 0x00007fd801324c17 in __assert_fail_base () at /lib64/libc.so.6
> #3 0x00007fd801324cc2 in () at /lib64/libc.so.6
> #4 0x000055d9918d7572 in bdrv_replace_in_backing_chain (old=0x55d993ed9c10, new=0x55d9931ccc10) at block.c:2096
> __PRETTY_FUNCTION__ = "bdrv_replace_in_backing_chain"
> #5 0x000055d991911869 in mirror_exit (job=0x55d993fef830, opaque=0x55d999bbefe0) at block/mirror.c:376
> to_replace = 0x55d993ed9c10
> s = 0x55d...

Read more...

Fam Zheng (famz) on 2016-04-26
Changed in qemu:
status: New → Fix Committed
Thomas Huth (th-huth) on 2016-06-21
Changed in qemu:
status: Fix Committed → Fix Released
To post a comment you must log in.
This report contains Public information  Edit
Everyone can see this information.

Other bug subscribers