If I remove iothreads and writeback caching, it seems more reliable, but I can still get it to hang.
This time the source server shows the VM as running, backtrace looks like:
(gdb) bt full #0 0x00007f27eab0028c in __lll_lock_wait () at /lib64/libpthread.so.0 #1 0x00007f27eaaf9d35 in pthread_mutex_lock () at /lib64/libpthread.so.0 #2 0x0000000000865419 in qemu_mutex_lock_impl (mutex=mutex@entry=0x115b8e0 <qemu_global_mutex>, file=file@entry=0x8fdf14 "/tmp/qemu-3.0.0/cpus.c", line=line@entry=1768) at util/qemu-thread-posix.c:66 err = <optimized out> __PRETTY_FUNCTION__ = "qemu_mutex_lock_impl" __func__ = "qemu_mutex_lock_impl" #3 0x0000000000477578 in qemu_mutex_lock_iothread () at /tmp/qemu-3.0.0/cpus.c:1768 #4 0x00000000008622b0 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:236 context = 0x1e72810 ret = 1 ret = 1 timeout = 4294967295 timeout_ns = <optimized out> #5 0x00000000008622b0 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497 ret = 1 timeout = 4294967295 timeout_ns = <optimized out> #6 0x0000000000595dee in main_loop () at vl.c:1866 #7 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644 i = <optimized out> snapshot = 0 linux_boot = <optimized out> initrd_filename = 0x0 kernel_filename = <optimized out> kernel_cmdline = <optimized out> boot_order = 0x918f44 "cad" boot_once = 0x0 ds = <optimized out> opts = <optimized out> machine_opts = <optimized out> icount_opts = <optimized out> accel_opts = 0x0 olist = <optimized out> optind = 71 optarg = 0x7fff5edcff69 "timestamp=on" loadvm = 0x0 machine_class = 0x0 cpu_model = 0x7fff5edcf88a "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"... 
vga_model = 0x0 qtest_chrdev = 0x0 qtest_log = 0x0 pid_file = <optimized out> incoming = 0x7fff5edcff0a "defer" userconfig = <optimized out> nographic = false display_remote = <optimized out> log_mask = <optimized out> log_file = <optimized out> trace_file = <optimized out> maxram_size = 4294967296 ram_slots = 0 vmstate_dump_file = 0x0 main_loop_err = 0x0 ---Type <return> to continue, or q <return> to quit--- err = 0x0 list_data_dirs = false dir = <optimized out> dirs = <optimized out> bdo_queue = {sqh_first = 0x0, sqh_last = 0x7fff5edcd670} __func__ = "main"
Dest server is paused, and looks like this:
#0 0x00007f11c48bc3c1 in ppoll () at /lib64/libc.so.6 #1 0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=timeout@entry=2999892383) at util/qemu-timer.c:334 ts = {tv_sec = 2, tv_nsec = 999892383} Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.: #2 0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-loop.c:233 context = 0x2342810 ret = <optimized out> ret = -1295074913 timeout = 4294967295 timeout_ns = <optimized out> #3 0x00000000008622a4 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497 ret = -1295074913 timeout = 4294967295 timeout_ns = <optimized out> #4 0x0000000000595dee in main_loop () at vl.c:1866 #5 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644 i = <optimized out> snapshot = 0 linux_boot = <optimized out> initrd_filename = 0x0 kernel_filename = <optimized out> kernel_cmdline = <optimized out> boot_order = 0x918f44 "cad" boot_once = 0x0 ds = <optimized out> opts = <optimized out> machine_opts = <optimized out> icount_opts = <optimized out> accel_opts = 0x0 olist = <optimized out> optind = 71 optarg = 0x7ffe6b899f69 "timestamp=on" loadvm = 0x0 machine_class = 0x0 cpu_model = 0x7ffe6b89988a "Skylake-Server-IBRS,ss=on,hypervisor=on,tsc_adjust=on,clflushopt=on,umip=on,pku=on,ssbd=on,xsaves=on,topoext=on,hv_time,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,hv_vpindex,hv_runtime,hv_synic,hv_stimer"... 
vga_model = 0x0 qtest_chrdev = 0x0 qtest_log = 0x0 pid_file = <optimized out> incoming = 0x7ffe6b899f0a "defer" userconfig = <optimized out> nographic = false display_remote = <optimized out> log_mask = <optimized out> log_file = <optimized out> trace_file = <optimized out> maxram_size = 4294967296 ram_slots = 0 vmstate_dump_file = 0x0 main_loop_err = 0x0 err = 0x0 list_data_dirs = false dir = <optimized out> dirs = <optimized out> bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffe6b8988e0} ---Type <return> to continue, or q <return> to quit--- __func__ = "main"
Honestly looks pretty much like the same bug....
If I remove iothreads and writeback caching, it seems more reliable, but I can still get it to hang.
This time the source server shows the VM as running, backtrace looks like:
(gdb) bt full libpthread. so.0 libpthread. so.0 lock_impl (mutex= mutex@entry= 0x115b8e0 <qemu_global_ mutex>, file=file@ entry=0x8fdf14 "/tmp/qemu- 3.0.0/cpus. c", line=line@ entry=1768) thread- posix.c: 66
__PRETTY_ FUNCTION_ _ = "qemu_mutex_ lock_impl" lock_impl" lock_iothread () at /tmp/qemu- 3.0.0/cpus. c:1768 loop.c: 236 nonblocking@ entry=0) at util/main- loop.c: 497
initrd_filename = 0x0
kernel_filename = <optimized out>
kernel_cmdline = <optimized out>
machine_opts = <optimized out>
machine_ class = 0x0 Server- IBRS,ss= on,hypervisor= on,tsc_ adjust= on,clflushopt= on,umip= on,pku= on,ssbd= on,xsaves= on,topoext= on,hv_time, hv_relaxed, hv_vapic, hv_spinlocks= 0x1fff, hv_vpindex, hv_runtime, hv_synic, hv_stimer" ...
qtest_chrdev = 0x0
display_remote = <optimized out>
vmstate_dump_file = 0x0
main_loop_err = 0x0
list_data_dirs = false
#0 0x00007f27eab0028c in __lll_lock_wait () at /lib64/
#1 0x00007f27eaaf9d35 in pthread_mutex_lock () at /lib64/
#2 0x0000000000865419 in qemu_mutex_
at util/qemu-
err = <optimized out>
__func__ = "qemu_mutex_
#3 0x0000000000477578 in qemu_mutex_
#4 0x00000000008622b0 in main_loop_wait (timeout=<optimized out>) at util/main-
context = 0x1e72810
ret = 1
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#5 0x00000000008622b0 in main_loop_wait (nonblocking=
ret = 1
timeout = 4294967295
timeout_ns = <optimized out>
#6 0x0000000000595dee in main_loop () at vl.c:1866
#7 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
i = <optimized out>
snapshot = 0
linux_boot = <optimized out>
boot_order = 0x918f44 "cad"
boot_once = 0x0
ds = <optimized out>
opts = <optimized out>
icount_opts = <optimized out>
accel_opts = 0x0
olist = <optimized out>
optind = 71
optarg = 0x7fff5edcff69 "timestamp=on"
loadvm = 0x0
cpu_model = 0x7fff5edcf88a "Skylake-
vga_model = 0x0
qtest_log = 0x0
pid_file = <optimized out>
incoming = 0x7fff5edcff0a "defer"
userconfig = <optimized out>
nographic = false
log_mask = <optimized out>
log_file = <optimized out>
trace_file = <optimized out>
maxram_size = 4294967296
ram_slots = 0
---Type <return> to continue, or q <return> to quit---
err = 0x0
dir = <optimized out>
dirs = <optimized out>
bdo_queue = {sqh_first = 0x0, sqh_last = 0x7fff5edcd670}
__func__ = "main"
Dest server is paused, and looks like this:
#0 0x00007f11c48bc3c1 in ppoll () at /lib64/libc.so.6 timeout@ entry=299989238 3) at util/qemu- timer.c: 334 loop.c: 233 nonblocking@ entry=0) at util/main- loop.c: 497
initrd_filename = 0x0
kernel_filename = <optimized out>
kernel_cmdline = <optimized out>
machine_opts = <optimized out>
machine_ class = 0x0 Server- IBRS,ss= on,hypervisor= on,tsc_ adjust= on,clflushopt= on,umip= on,pku= on,ssbd= on,xsaves= on,topoext= on,hv_time, hv_relaxed, hv_vapic, hv_spinlocks= 0x1fff, hv_vpindex, hv_runtime, hv_synic, hv_stimer" ...
qtest_chrdev = 0x0
display_remote = <optimized out>
vmstate_dump_file = 0x0
main_loop_err = 0x0
list_data_dirs = false
#1 0x0000000000861659 in qemu_poll_ns (fds=<optimized out>, nfds=<optimized out>, timeout=
ts = {tv_sec = 2, tv_nsec = 999892383}
Python Exception <class 'gdb.error'> That operation is not available on integers of more than 8 bytes.:
#2 0x00000000008622a4 in main_loop_wait (timeout=<optimized out>) at util/main-
context = 0x2342810
ret = <optimized out>
ret = -1295074913
timeout = 4294967295
timeout_ns = <optimized out>
#3 0x00000000008622a4 in main_loop_wait (nonblocking=
ret = -1295074913
timeout = 4294967295
timeout_ns = <optimized out>
#4 0x0000000000595dee in main_loop () at vl.c:1866
#5 0x000000000041f35d in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at vl.c:4644
i = <optimized out>
snapshot = 0
linux_boot = <optimized out>
boot_order = 0x918f44 "cad"
boot_once = 0x0
ds = <optimized out>
opts = <optimized out>
icount_opts = <optimized out>
accel_opts = 0x0
olist = <optimized out>
optind = 71
optarg = 0x7ffe6b899f69 "timestamp=on"
loadvm = 0x0
cpu_model = 0x7ffe6b89988a "Skylake-
vga_model = 0x0
qtest_log = 0x0
pid_file = <optimized out>
incoming = 0x7ffe6b899f0a "defer"
userconfig = <optimized out>
nographic = false
log_mask = <optimized out>
log_file = <optimized out>
trace_file = <optimized out>
maxram_size = 4294967296
ram_slots = 0
err = 0x0
dir = <optimized out>
dirs = <optimized out>
bdo_queue = {sqh_first = 0x0, sqh_last = 0x7ffe6b8988e0}
---Type <return> to continue, or q <return> to quit---
__func__ = "main"
Honestly looks pretty much like the same bug....