block.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-)
From: Paolo Bonzini <pbonzini@redhat.com>
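BDRV_POLL_WHILE() does not support recursive AioContext locking. It
only releases the AioContext lock once regardless of how many times the
caller has acquired it. This results in a hang since the IOThread does
not make progress while the AioContext is still locked.
bdrv_inactivate_all() acquires the AioContext of every BlockDriverState,
so it takes the same lock more than once when several nodes share an
AioContext. Track the contexts that have already been acquired in a list
so that each one is acquired, and later released, exactly once.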
The following steps trigger the hang:
$ qemu-system-x86_64 -M accel=kvm -m 1G -cpu host \
-object iothread,id=iothread0 \
-device virtio-scsi-pci,iothread=iothread0 \
-drive if=none,id=drive0,file=test.img,format=raw \
-device scsi-hd,drive=drive0 \
-drive if=none,id=drive1,file=test.img,format=raw \
-device scsi-hd,drive=drive1
$ qemu-system-x86_64 ...same options... \
-incoming tcp::1234
(qemu) migrate tcp:127.0.0.1:1234
...hang...
Tested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
block.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/block.c b/block.c
index 9a1a0d1e73..1c37ce4554 100644
--- a/block.c
+++ b/block.c
@@ -4320,9 +4320,15 @@ int bdrv_inactivate_all(void)
BdrvNextIterator it;
int ret = 0;
int pass;
+ GSList *aio_ctxs = NULL, *ctx;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
- aio_context_acquire(bdrv_get_aio_context(bs));
+ AioContext *aio_context = bdrv_get_aio_context(bs);
+
+ if (!g_slist_find(aio_ctxs, aio_context)) {
+ aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
+ aio_context_acquire(aio_context);
+ }
}
/* We do two passes of inactivation. The first pass calls to drivers'
@@ -4340,9 +4346,11 @@ int bdrv_inactivate_all(void)
}
out:
- for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
- aio_context_release(bdrv_get_aio_context(bs));
+ for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
+ AioContext *aio_context = ctx->data;
+ aio_context_release(aio_context);
}
+ g_slist_free(aio_ctxs);
return ret;
}
--
2.14.3
Am 06.12.2017 um 18:54 hat Stefan Hajnoczi geschrieben:
> From: Paolo Bonzini <pbonzini@redhat.com>
>
> BDRV_POLL_WHILE() does not support recursive AioContext locking. It
> only releases the AioContext lock once regardless of how many times the
> caller has acquired it. This results in a hang since the IOThread does
> not make progress while the AioContext is still locked.
>
> The following steps trigger the hang:
>
> $ qemu-system-x86_64 -M accel=kvm -m 1G -cpu host \
> -object iothread,id=iothread0 \
> -device virtio-scsi-pci,iothread=iothread0 \
> -drive if=none,id=drive0,file=test.img,format=raw \
> -device scsi-hd,drive=drive0 \
> -drive if=none,id=drive1,file=test.img,format=raw \
> -device scsi-hd,drive=drive1
> $ qemu-system-x86_64 ...same options... \
> -incoming tcp::1234
> (qemu) migrate tcp:127.0.0.1:1234
> ...hang...

Please turn this into a test case.

We should probably also update docs/devel/multiple-iothreads.txt.
Currently it says:

aio_context_acquire()/aio_context_release() calls may be nested.
This means you can call them if you're not sure whether #2 applies.

While technically that's still correct as far as the lock is concerned,
the limitations of BDRV_POLL_WHILE() mean that in practice this is not a
viable option any more at least in the context of the block layer.

Kevin

> Tested-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> block.c | 14 +++++++++++---
> 1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/block.c b/block.c
> index 9a1a0d1e73..1c37ce4554 100644
> --- a/block.c
> +++ b/block.c
> @@ -4320,9 +4320,15 @@ int bdrv_inactivate_all(void)
> BdrvNextIterator it;
> int ret = 0;
> int pass;
> + GSList *aio_ctxs = NULL, *ctx;
>
> for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> - aio_context_acquire(bdrv_get_aio_context(bs));
> + AioContext *aio_context = bdrv_get_aio_context(bs);
> +
> + if (!g_slist_find(aio_ctxs, aio_context)) {
> + aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
> + aio_context_acquire(aio_context);
> + }
> }
>
> /* We do two passes of inactivation. The first pass calls to drivers'
> @@ -4340,9 +4346,11 @@ int bdrv_inactivate_all(void)
> }
>
> out:
> - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> - aio_context_release(bdrv_get_aio_context(bs));
> + for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
> + AioContext *aio_context = ctx->data;
> + aio_context_release(aio_context);
> }
> + g_slist_free(aio_ctxs);
>
> return ret;
> }
> --
> 2.14.3
>
>
On Wed, Dec 06, 2017 at 07:40:28PM +0100, Kevin Wolf wrote:
> Am 06.12.2017 um 18:54 hat Stefan Hajnoczi geschrieben:
> > From: Paolo Bonzini <pbonzini@redhat.com>
> >
> > BDRV_POLL_WHILE() does not support recursive AioContext locking. It
> > only releases the AioContext lock once regardless of how many times the
> > caller has acquired it. This results in a hang since the IOThread does
> > not make progress while the AioContext is still locked.
> >
> > The following steps trigger the hang:
> >
> > $ qemu-system-x86_64 -M accel=kvm -m 1G -cpu host \
> > -object iothread,id=iothread0 \
> > -device virtio-scsi-pci,iothread=iothread0 \
> > -drive if=none,id=drive0,file=test.img,format=raw \
> > -device scsi-hd,drive=drive0 \
> > -drive if=none,id=drive1,file=test.img,format=raw \
> > -device scsi-hd,drive=drive1
> > $ qemu-system-x86_64 ...same options... \
> > -incoming tcp::1234
> > (qemu) migrate tcp:127.0.0.1:1234
> > ...hang...
>
> Please turn this into a test case.
>
> We should probably also update docs/devel/multiple-iothreads.txt.
> Currently it says:
>
> aio_context_acquire()/aio_context_release() calls may be nested.
> This means you can call them if you're not sure whether #2 applies.
>
> While technically that's still correct as far as the lock is concerned,
> the limitations of BDRV_POLL_WHILE() mean that in practice this is not a
> viable option any more at least in the context of the block layer.

Good point, will fix both things in v2.

Stefan
© 2016 - 2024 Red Hat, Inc.