Qemu hangs during migration
Affects | Status | Importance | Assigned to | Milestone | |
---|---|---|---|---|---|
QEMU |
Expired
|
Undecided
|
Unassigned |
Bug Description
Source server: linux 4.19.5 qemu-3.0.0 from source, libvirt 4.9
Dest server: linux 4.18.19 qemu-3.0.0 from source, libvirt 4.9
When this VM is running on source server:
/usr/bin/
I try to migrate it and the disks to the other side:
virsh migrate --live --undefinesource --persistent --verbose --copy-storage-all testvm qemu+ssh:
We get to 99% then hang with both sides in the pause state.
Source server is stuck here:
(gdb) bt full
#0 0x00007f327994f3c1 in ppoll () at /lib64/libc.so.6
#1 0x000000000086167b in qemu_poll_ns (fds=<optimized out>, nfds=nfds@entry=1, timeout=<optimized out>) at util/qemu-
#2 0x0000000000863302 in aio_poll (ctx=0x21044e0, blocking=
node = <optimized out>
i = <optimized out>
ret = 0
progress = <optimized out>
timeout = <optimized out>
start = <optimized out>
#3 0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-
waited_ = <optimized out>
wait_ = 0x2ba563c
ctx_ = 0x2109bb0
bs_ = 0x2ba2400
client = 0x31287e0
client = <optimized out>
request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
#4 0x00000000007e0d52 in nbd_client_close (bs=0x2ba2400) at block/nbd-
client = <optimized out>
request = {handle = 0, from = 0, len = 0, flags = 0, type = 2}
#5 0x00000000007de5ca in nbd_close (bs=<optimized out>) at block/nbd.c:491
s = 0x31287e0
#6 0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3352
ban = <optimized out>
ban_next = <optimized out>
child = <optimized out>
next = <optimized out>
#7 0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:3560
#8 0x00000000007823d6 in bdrv_unref (bs=0x2ba2400) at block.c:4616
#9 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3359
ban = <optimized out>
ban_next = <optimized out>
child = <optimized out>
next = <optimized out>
#10 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:3560
#11 0x0000000000782403 in bdrv_unref (bs=0x2af96f0) at block.c:4616
#12 0x0000000000785784 in block_job_
c = 0x23bac30
l = 0x20dd330 = {0x23bac30, 0x2b89410}
#13 0x00000000007ceb5f in mirror_exit (job=0x2f32570, opaque=
s = 0x2f32570
bjob = 0x2f32570
data = 0x7f326407a350
bs_opaque = 0x30d5600
src = 0x2131080
target_bs = 0x2af96f0
local_err = 0x0
#14 0x0000000000786452 in job_defer_
data = 0x7f32640786a0
job = <optimized out>
aio_context = 0x2109bb0
#15 0x000000000085fd3f in aio_bh_poll (ctx=ctx@
---Type <return> to continue, or q <return> to quit---
bh = <optimized out>
bhp = <optimized out>
next = 0x2ea86e0
ret = 1
deleted = false
#16 0x00000000008631b0 in aio_dispatch (ctx=0x21044e0) at util/aio-
#17 0x000000000085fc1e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=
ctx = <optimized out>
#18 0x00007f327f17d797 in g_main_
#19 0x00000000008622ed in main_loop_wait () at util/main-
context = 0x2104900
pfds = <optimized out>
context = 0x2104900
ret = 1
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#20 0x00000000008622ed in main_loop_wait (timeout=<optimized out>) at util/main-
context = 0x2104900
ret = 1
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#21 0x00000000008622ed in main_loop_wait (nonblocking=
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#22 0x0000000000595dee in main_loop () at vl.c:1866
#23 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
i = <optimized out>
snapshot = 0
linux_boot = <optimized out>
boot_order = 0x918f44 "cad"
boot_once = 0x0
ds = <optimized out>
opts = <optimized out>
icount_opts = <optimized out>
accel_opts = 0x0
olist = <optimized out>
optind = 71
optarg = 0x7ffdfc94df69 "timestamp=on"
loadvm = 0x0
cpu_model = 0x7ffdfc94d864 "Skylake-
vga_model = 0x0
qtest_log = 0x0
pid_file = <optimized out>
incoming = 0x0
userconfig = <optimized out>
---Type <return> to continue, or q <return> to quit---
nographic = false
log_mask = <optimized out>
log_file = <optimized out>
trace_file = <optimized out>
maxram_size = 4294967296
ram_slots = 0
err = 0x0
dir = <optimized out>
dirs = <optimized out>
bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdfc94c170}
__func__ = "main"
Strace shows:
ppoll([{fd=9, events=
Which points to this:
ls -al /proc/2286/fd/9
lrwx------ 1 root users 64 Dec 5 13:04 /proc/2286/fd/9 -> anon_inode:
The dest side is stuck here:
(gdb) bt full
#0 0x00007f21f070d3c1 in ppoll () at /lib64/libc.so.6
#1 0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=
ts = {tv_sec = 2, tv_nsec = 999926258}
Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.:
#2 0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-
context = 0x2142900
ret = <optimized out>
ret = -1295041038
timeout = 4294967295
timeout_ns = <optimized out>
#3 0x00000000008622a4 in main_loop_wait (nonblocking=
ret = -1295041038
timeout = 4294967295
timeout_ns = <optimized out>
#4 0x0000000000595dee in main_loop () at vl.c:1866
#5 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
i = <optimized out>
snapshot = 0
linux_boot = <optimized out>
boot_order = 0x918f44 "cad"
boot_once = 0x0
ds = <optimized out>
opts = <optimized out>
icount_opts = <optimized out>
accel_opts = 0x0
olist = <optimized out>
optind = 73
optarg = 0x7ffdd6ee8f69 "timestamp=on"
loadvm = 0x0
cpu_model = 0x7ffdd6ee8854 "Skylake-
vga_model = 0x0
qtest_log = 0x0
pid_file = <optimized out>
incoming = 0x7ffdd6ee8f0a "defer"
userconfig = <optimized out>
nographic = false
log_mask = <optimized out>
log_file = <optimized out>
trace_file = <optimized out>
maxram_size = 4294967296
ram_slots = 0
err = 0x0
dir = <optimized out>
dirs = <optimized out>
bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffdd6ee6630}
---Type <return> to continue, or q <return> to quit---
__func__ = "main"
Strace show this over and over
ppoll([{fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=10, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=27, events=POLLIN}], 9, {0, 594527977}, NULL, 8) = 0 (Timeout)
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/10 -> anon_inode:
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/21 -> socket:[42631161]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/22 -> socket:[42631165]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/23 -> socket:[42631167]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/24 -> socket:[42631168]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/27 -> socket:[42690422]
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/6 -> anon_inode:
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/7 -> anon_inode:
lrwx------ 1 root users 64 Dec 5 13:15 /proc/20170/fd/9 -> anon_inode:
If I remote iothreads and writeback caching, it seems more reliable, but I can still get it to hang.
This time the source server shows the VM as running, backtrace looks like:
(gdb) bt full libpthread. so.0 libpthread. so.0 lock_impl (mutex= mutex@entry= 0x115b8e0 <qemu_global_ mutex>, file=file@ entry=0x8fdf14 "/tmp/qemu- 3.0.0/cpus. c", line=line@ entry=1768) thread- posix.c: 66
__PRETTY_ FUNCTION_ _ = "qemu_mutex_ lock_impl" lock_impl" lock_iothread () at /tmp/qemu- 3.0.0/cpus. c:1768 loop.c: 236 nonblocking@ entry=0) at util/main- loop.c: 497
initrd_ filename = 0x0
kernel_ filename = <optimized out>
kernel_ cmdline = <optimized out>
machine_ opts = <optimized out>
machine_ class = 0x0 Server- IBRS,ss= on,hypervisor= on,tsc_ adjust= on,clflushopt= on,umip= on,pku= on,ssbd= on,xsaves= on,topoext= on,hv_time, hv_relaxed, hv_vapic, hv_spinlocks= 0x1fff, hv_vpindex, hv_runtime, hv_synic, hv_stimer" ...
qtest_ chrdev = 0x0
display_ remote = <optimized out>
vmstate_ dump_file = 0x0
main_loop_ err = 0x0
list_data_ dirs = false
#0 0x00007f27eab0028c in __lll_lock_wait () at /lib64/
#1 0x00007f27eaaf9d35 in pthread_mutex_lock () at /lib64/
#2 0x0000000000865419 in qemu_mutex_
at util/qemu-
err = <optimized out>
__func__ = "qemu_mutex_
#3 0x0000000000477578 in qemu_mutex_
#4 0x00000000008622b0 in main_loop_wait (timeout=<optimized out>) at util/main-
context = 0x1e72810
ret = 1
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#5 0x00000000008622b0 in main_loop_wait (nonblocking=
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#6 0x0000000000595dee in main_loop () at vl.c:1866
#7 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
i = <optimized out>
snapshot = 0
linux_boot = <optimized out>
boot_order = 0x918f44 "cad"
boot_once = 0x0
ds = <optimized out>
opts = <optimized out>
icount_opts = <optimized out>
accel_opts = 0x0
olist = <optimized out>
optind = 71
optarg = 0x7fff5edcff69 "timestamp=on"
loadvm = 0x0
cpu_model = 0x7fff5edcf88a "Skylake-
vga_model = 0x0
qtest_log = 0x0
pid_file = <optimized out>
incoming = 0x7fff5edcff0a "defer"
userconfig = <optimized out>
nographic = false
log_mask = <optimized out>
log_file = <optimized out>
trace_file = <optimized out>
maxram_size = 4294967296
ram_slots = 0
---Type <return> to continue, or q <return> to quit---
err = 0x0
dir = <optimized out>
dirs = <optimized out>
bdo_queue = {sqh_first = 0x0, sqh_last = 0x7fff5edcd670}
__func__ = "main"
Dest server is paused, and looks like this:
#0 0x00007f11c48bc3c1 in ppoll () at /lib64/libc.so.6 timeout@ entry=299989238 3) at util/qemu- timer.c: 334
#1 0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=
ts = {tv_sec = 2, tv_nsec = 999892383}
Python Exception <class 'gdb.error'> That operation ...