On Wed, Sep 11, 2019 at 04:09:25PM -0300, Rafael David Tinoco wrote:
> > Zhengui's theory that notify_me doesn't work properly on ARM is more
> > promising, but he couldn't provide a clear explanation of why he thought
> > notify_me is involved.  In particular, I would have expected notify_me to
> > be wrong if the qemu_poll_ns call came from aio_ctx_dispatch, for example:
> > 
> > 
> >     glib_pollfds_fill
> >       g_main_context_prepare
> >         aio_ctx_prepare
> >           atomic_or(&ctx->notify_me, 1)
> >     qemu_poll_ns
> >     glib_pollfds_poll
> >       g_main_context_check
> >         aio_ctx_check
> >           atomic_and(&ctx->notify_me, ~1)
> >       g_main_context_dispatch
> >         aio_ctx_dispatch
> >           /* do something for event */
> >             qemu_poll_ns 
> > 
> 
> Paolo,
> 
> I tried confining execution in a single NUMA domain (cpu & mem) and
> still faced the issue, then, I added a mutex "ctx->notify_me_lcktest"
> into context to protect "ctx->notify_me", like showed bellow, and it
> seems to have either fixed or mitigated it.
> 
> I was able to cause the hung once every 3 or 4 runs. I have already ran
> qemu-img convert more than 30 times now and couldn't reproduce it again.
> 
> Next step is to play with the barriers and check why existing ones
> aren't enough for ordering access to ctx->notify_me ... or should I
> try/do something else in your opinion ?
> 
> This arch/machine (Huawei D06):
> 
> $ lscpu
> Architecture:        aarch64
> Byte Order:          Little Endian
> CPU(s):              96
> On-line CPU(s) list: 0-95
> Thread(s) per core:  1
> Core(s) per socket:  48
> Socket(s):           2
> NUMA node(s):        4
> Vendor ID:           0x48
> Model:               0
> Stepping:            0x0
> CPU max MHz:         2000.0000
> CPU min MHz:         200.0000
> BogoMIPS:            200.00
> L1d cache:           64K
> L1i cache:           64K
> L2 cache:            512K
> L3 cache:            32768K
> NUMA node0 CPU(s):   0-23
> NUMA node1 CPU(s):   24-47
> NUMA node2 CPU(s):   48-71
> NUMA node3 CPU(s):   72-95
> Flags:               fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics
> cpuid asimdrdm dcpop

Note that I'm also seeing this on a ThunderX2 (same calltrace):

$ lscpu
Architecture:        aarch64
Byte Order:          Little Endian
CPU(s):              224
On-line CPU(s) list: 0-223
Thread(s) per core:  4
Core(s) per socket:  28
Socket(s):           2
NUMA node(s):        2
Vendor ID:           Cavium
Model:               1
Model name:          ThunderX2 99xx
Stepping:            0x1
BogoMIPS:            400.00
L1d cache:           32K
L1i cache:           32K
L2 cache:            256K
L3 cache:            32768K
NUMA node0 CPU(s):   0-111
NUMA node1 CPU(s):   112-223
Flags:               fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm

  -dann

> ----
> 
> diff --git a/include/block/aio.h b/include/block/aio.h
> index 0ca25dfec6..0724086d91 100644
> --- a/include/block/aio.h
> +++ b/include/block/aio.h
> @@ -84,6 +84,7 @@ struct AioContext {
>       * dispatch phase, hence a simple counter is enough for them.
>       */
>      uint32_t notify_me;
> +    QemuMutex notify_me_lcktest;
> 
>      /* A lock to protect between QEMUBH and AioHandler adders and deleter,
>       * and to ensure that no callbacks are removed while we're walking and
> diff --git a/util/aio-posix.c b/util/aio-posix.c
> index 51c41ed3c9..031d6e2997 100644
> --- a/util/aio-posix.c
> +++ b/util/aio-posix.c
> @@ -529,7 +529,9 @@ static bool run_poll_handlers(AioContext *ctx,
> int64_t max_ns, int64_t *timeout)
>      bool progress;
>      int64_t start_time, elapsed_time;
> 
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      assert(ctx->notify_me);
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>      assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
> 
>      trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
> @@ -601,8 +603,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>       * so disable the optimization now.
>       */
>      if (blocking) {
> +        qemu_mutex_lock(&ctx->notify_me_lcktest);
>          assert(in_aio_context_home_thread(ctx));
>          atomic_add(&ctx->notify_me, 2);
> +        qemu_mutex_unlock(&ctx->notify_me_lcktest);
>      }
> 
>      qemu_lockcnt_inc(&ctx->list_lock);
> @@ -647,8 +651,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>      }
> 
>      if (blocking) {
> +        qemu_mutex_lock(&ctx->notify_me_lcktest);
>          atomic_sub(&ctx->notify_me, 2);
>          aio_notify_accept(ctx);
> +        qemu_mutex_unlock(&ctx->notify_me_lcktest);
>      }
> 
>      /* Adjust polling time */
> diff --git a/util/async.c b/util/async.c
> index c10642a385..140e1e86f5 100644
> --- a/util/async.c
> +++ b/util/async.c
> @@ -221,7 +221,9 @@ aio_ctx_prepare(GSource *source, gint    *timeout)
>  {
>      AioContext *ctx = (AioContext *) source;
> 
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      atomic_or(&ctx->notify_me, 1);
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
> 
>      /* We assume there is no timeout already supplied */
>      *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
> @@ -239,8 +241,10 @@ aio_ctx_check(GSource *source)
>      AioContext *ctx = (AioContext *) source;
>      QEMUBH *bh;
> 
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      atomic_and(&ctx->notify_me, ~1);
>      aio_notify_accept(ctx);
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
> 
>      for (bh = ctx->first_bh; bh; bh = bh->next) {
>          if (bh->scheduled) {
> @@ -346,11 +350,13 @@ void aio_notify(AioContext *ctx)
>      /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
>       * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll.
>       */
> -    smp_mb();
> +    //smp_mb();
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      if (ctx->notify_me) {
>          event_notifier_set(&ctx->notifier);
>          atomic_mb_set(&ctx->notified, true);
>      }
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>  }
> 
>  void aio_notify_accept(AioContext *ctx)
> @@ -424,6 +430,8 @@ AioContext *aio_context_new(Error **errp)
>      ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
>      QSLIST_INIT(&ctx->scheduled_coroutines);
> 
> +    qemu_rec_mutex_init(&ctx->notify_me_lcktest);
> +
>      aio_set_event_notifier(ctx, &ctx->notifier,
>                             false,
>                             (EventNotifierHandler *)
>