From e01bee0881e0f0c8a79555f6729d7238841a5b04 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 16 Jul 2013 21:47:41 +0530 Subject: [PATCH 01/11] gluster: Use pkg-config to configure GlusterFS block driver Use pkg-config to determine the version and library dependency for GlusterFS block driver. Signed-off-by: Bharata B Rao Signed-off-by: Stefan Hajnoczi --- configure | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/configure b/configure index 9e1cd1975a..cf13493135 100755 --- a/configure +++ b/configure @@ -2570,23 +2570,18 @@ fi ########################################## # glusterfs probe if test "$glusterfs" != "no" ; then - cat > $TMPC < -int main(void) { - (void) glfs_new("volume"); - return 0; -} -EOF - glusterfs_libs="-lgfapi -lgfrpc -lgfxdr" - if compile_prog "" "$glusterfs_libs" ; then - glusterfs=yes + if $pkg_config --atleast-version=3 glusterfs-api >/dev/null 2>&1; then + glusterfs="yes" + glusterfs_cflags=`$pkg_config --cflags glusterfs-api 2>/dev/null` + glusterfs_libs=`$pkg_config --libs glusterfs-api 2>/dev/null` + CFLAGS="$CFLAGS $glusterfs_cflags" libs_tools="$glusterfs_libs $libs_tools" libs_softmmu="$glusterfs_libs $libs_softmmu" else if test "$glusterfs" = "yes" ; then feature_not_found "GlusterFS backend support" fi - glusterfs=no + glusterfs="no" fi fi From 0c14fb47ece5ef42d7a0a4b3e8e43e022b375720 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 16 Jul 2013 21:47:42 +0530 Subject: [PATCH 02/11] gluster: Add discard support for GlusterFS block driver. Implement bdrv_aio_discard for gluster. Signed-off-by: Bharata B Rao Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- block/gluster.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ configure | 8 ++++++++ 2 files changed, 53 insertions(+) diff --git a/block/gluster.c b/block/gluster.c index 61424bcb01..6de418c0bd 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -532,6 +532,39 @@ out: return NULL; } +#ifdef CONFIG_GLUSTERFS_DISCARD +static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb, + void *opaque) +{ + int ret; + GlusterAIOCB *acb; + BDRVGlusterState *s = bs->opaque; + size_t size; + off_t offset; + + offset = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + + acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); + acb->size = 0; + acb->ret = 0; + acb->finished = NULL; + s->qemu_aio_count++; + + ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb); + if (ret < 0) { + goto out; + } + return &acb->common; + +out: + s->qemu_aio_count--; + qemu_aio_release(acb); + return NULL; +} +#endif + static int64_t qemu_gluster_getlength(BlockDriverState *bs) { BDRVGlusterState *s = bs->opaque; @@ -602,6 +635,9 @@ static BlockDriver bdrv_gluster = { .bdrv_aio_writev = qemu_gluster_aio_writev, .bdrv_aio_flush = qemu_gluster_aio_flush, .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_aio_discard = qemu_gluster_aio_discard, +#endif .create_options = qemu_gluster_create_options, }; @@ -618,6 +654,9 @@ static BlockDriver bdrv_gluster_tcp = { .bdrv_aio_writev = qemu_gluster_aio_writev, .bdrv_aio_flush = qemu_gluster_aio_flush, .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_aio_discard = qemu_gluster_aio_discard, +#endif .create_options = qemu_gluster_create_options, }; @@ -634,6 +673,9 @@ static BlockDriver bdrv_gluster_unix = { .bdrv_aio_writev = qemu_gluster_aio_writev, .bdrv_aio_flush = qemu_gluster_aio_flush, .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_aio_discard = qemu_gluster_aio_discard, +#endif .create_options = qemu_gluster_create_options, }; @@ -650,6 +692,9 @@ static BlockDriver bdrv_gluster_rdma = { .bdrv_aio_writev = qemu_gluster_aio_writev, .bdrv_aio_flush = qemu_gluster_aio_flush, .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_aio_discard = qemu_gluster_aio_discard, +#endif .create_options = qemu_gluster_create_options, }; diff --git a/configure b/configure index cf13493135..7c45db2fdd 100755 --- a/configure +++ b/configure @@ -237,6 +237,7 @@ libiscsi="" coroutine="" seccomp="" glusterfs="" +glusterfs_discard="no" virtio_blk_data_plane="" gtk="" gtkabi="2.0" @@ -2577,6 +2578,9 @@ if test "$glusterfs" != "no" ; then CFLAGS="$CFLAGS $glusterfs_cflags" libs_tools="$glusterfs_libs $libs_tools" libs_softmmu="$glusterfs_libs $libs_softmmu" + if $pkg_config --atleast-version=5 glusterfs-api >/dev/null 2>&1; then + glusterfs_discard="yes" + fi else if test "$glusterfs" = "yes" ; then feature_not_found "GlusterFS backend support" @@ -3964,6 +3968,10 @@ if test "$glusterfs" = "yes" ; then echo "CONFIG_GLUSTERFS=y" >> $config_host_mak fi +if test "$glusterfs_discard" = "yes" ; then + echo "CONFIG_GLUSTERFS_DISCARD=y" >> $config_host_mak +fi + if test "$libssh2" = "yes" ; then echo "CONFIG_LIBSSH2=y" >> $config_host_mak fi From 9154b02c53bb6685797c973fcdbec51c4714777d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 1 Jul 2013 13:29:17 +0200 Subject: [PATCH 03/11] dataplane: sync virtio.c and vring.c virtqueue state Load the virtio.c state into vring.c when we start dataplane mode and vice versa when stopping dataplane mode. This patch makes it possible to start and stop dataplane any time while the guest is running. This will eventually allow us to go back to QEMU main loop for bdrv_drain_all() and live migration. In the meantime, this patch makes the dataplane lifecycle more robust but should make no visible difference. It may be useful in the virtio-net dataplane effort. Signed-off-by: Stefan Hajnoczi --- hw/block/dataplane/virtio-blk.c | 2 +- hw/virtio/dataplane/vring.c | 8 +++++--- include/hw/virtio/dataplane/vring.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c index 03566650c6..2faed43127 100644 --- a/hw/block/dataplane/virtio-blk.c +++ b/hw/block/dataplane/virtio-blk.c @@ -537,7 +537,7 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s) /* Clean up guest notifier (irq) */ k->set_guest_notifiers(qbus->parent, 1, false); - vring_teardown(&s->vring); + vring_teardown(&s->vring, s->vdev, 0); s->started = false; s->stopping = false; } diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c index e0d6e83625..82cc151b17 100644 --- a/hw/virtio/dataplane/vring.c +++ b/hw/virtio/dataplane/vring.c @@ -39,8 +39,8 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n) vring_init(&vring->vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096); - vring->last_avail_idx = 0; - vring->last_used_idx = 0; + vring->last_avail_idx = virtio_queue_get_last_avail_idx(vdev, n); + vring->last_used_idx = vring->vr.used->idx; vring->signalled_used = 0; vring->signalled_used_valid = false; @@ -49,8 +49,10 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n) return true; } -void vring_teardown(Vring *vring) +void vring_teardown(Vring *vring, VirtIODevice *vdev, int n) { + virtio_queue_set_last_avail_idx(vdev, n, vring->last_avail_idx); + hostmem_finalize(&vring->hostmem); } diff --git a/include/hw/virtio/dataplane/vring.h b/include/hw/virtio/dataplane/vring.h index 9380cb5413..c0b69ff18f 100644 --- a/include/hw/virtio/dataplane/vring.h +++ b/include/hw/virtio/dataplane/vring.h @@ -50,7 +50,7 @@ static inline void vring_set_broken(Vring *vring) } bool vring_setup(Vring *vring, VirtIODevice *vdev, int n); -void vring_teardown(Vring *vring); +void vring_teardown(Vring *vring, VirtIODevice *vdev, int n); void vring_disable_notification(VirtIODevice *vdev, Vring *vring); bool vring_enable_notification(VirtIODevice *vdev, Vring *vring); bool vring_should_notify(VirtIODevice *vdev, Vring *vring); From dcc772e2f2b7c2a68644133fea2b874f6751a57b Mon Sep 17 00:00:00 2001 From: Liu Ping Fan Date: Tue, 16 Jul 2013 12:28:58 +0800 Subject: [PATCH 04/11] QEMUBH: make AioContext's bh re-entrant BH will be used outside big lock, so introduce lock to protect between the writers, ie, bh's adders and deleter. The lock only affects the writers and bh's callback does not take this extra lock. Note that for the same AioContext, aio_bh_poll() can not run in parallel yet. Signed-off-by: Liu Ping Fan Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefan Hajnoczi --- async.c | 33 +++++++++++++++++++++++++++++++-- include/block/aio.h | 7 +++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/async.c b/async.c index 90fe906539..5ce3633010 100644 --- a/async.c +++ b/async.c @@ -47,11 +47,16 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque) bh->ctx = ctx; bh->cb = cb; bh->opaque = opaque; + qemu_mutex_lock(&ctx->bh_lock); bh->next = ctx->first_bh; + /* Make sure that the members are ready before putting bh into list */ + smp_wmb(); ctx->first_bh = bh; + qemu_mutex_unlock(&ctx->bh_lock); return bh; } +/* Multiple occurrences of aio_bh_poll cannot be called concurrently */ int aio_bh_poll(AioContext *ctx) { QEMUBH *bh, **bhp, *next; @@ -61,9 +66,15 @@ int aio_bh_poll(AioContext *ctx) ret = 0; for (bh = ctx->first_bh; bh; bh = next) { + /* Make sure that fetching bh happens before accessing its members */ + smp_read_barrier_depends(); next = bh->next; if (!bh->deleted && bh->scheduled) { bh->scheduled = 0; + /* Paired with write barrier in bh schedule to ensure reading for + * idle & callbacks coming after bh's scheduling. + */ + smp_rmb(); if (!bh->idle) ret = 1; bh->idle = 0; @@ -75,6 +86,7 @@ int aio_bh_poll(AioContext *ctx) /* remove deleted bhs */ if (!ctx->walking_bh) { + qemu_mutex_lock(&ctx->bh_lock); bhp = &ctx->first_bh; while (*bhp) { bh = *bhp; @@ -85,6 +97,7 @@ int aio_bh_poll(AioContext *ctx) bhp = &bh->next; } } + qemu_mutex_unlock(&ctx->bh_lock); } return ret; @@ -94,24 +107,38 @@ void qemu_bh_schedule_idle(QEMUBH *bh) { if (bh->scheduled) return; - bh->scheduled = 1; bh->idle = 1; + /* Make sure that idle & any writes needed by the callback are done + * before the locations are read in the aio_bh_poll. + */ + smp_wmb(); + bh->scheduled = 1; } void qemu_bh_schedule(QEMUBH *bh) { if (bh->scheduled) return; - bh->scheduled = 1; bh->idle = 0; + /* Make sure that idle & any writes needed by the callback are done + * before the locations are read in the aio_bh_poll. + */ + smp_wmb(); + bh->scheduled = 1; aio_notify(bh->ctx); } + +/* This func is async. + */ void qemu_bh_cancel(QEMUBH *bh) { bh->scheduled = 0; } +/* This func is async.The bottom half will do the delete action at the finial + * end. + */ void qemu_bh_delete(QEMUBH *bh) { bh->scheduled = 0; @@ -176,6 +203,7 @@ aio_ctx_finalize(GSource *source) thread_pool_free(ctx->thread_pool); aio_set_event_notifier(ctx, &ctx->notifier, NULL, NULL); event_notifier_cleanup(&ctx->notifier); + qemu_mutex_destroy(&ctx->bh_lock); g_array_free(ctx->pollfds, TRUE); } @@ -211,6 +239,7 @@ AioContext *aio_context_new(void) ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext)); ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD)); ctx->thread_pool = NULL; + qemu_mutex_init(&ctx->bh_lock); event_notifier_init(&ctx->notifier, false); aio_set_event_notifier(ctx, &ctx->notifier, (EventNotifierHandler *) diff --git a/include/block/aio.h b/include/block/aio.h index 183679374f..cc77771c46 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -17,6 +17,7 @@ #include "qemu-common.h" #include "qemu/queue.h" #include "qemu/event_notifier.h" +#include "qemu/thread.h" typedef struct BlockDriverAIOCB BlockDriverAIOCB; typedef void BlockDriverCompletionFunc(void *opaque, int ret); @@ -53,6 +54,8 @@ typedef struct AioContext { */ int walking_handlers; + /* lock to protect between bh's adders and deleter */ + QemuMutex bh_lock; /* Anchor of the list of Bottom Halves belonging to the context */ struct QEMUBH *first_bh; @@ -127,6 +130,8 @@ void aio_notify(AioContext *ctx); * aio_bh_poll: Poll bottom halves for an AioContext. * * These are internal functions used by the QEMU main loop. + * And notice that multiple occurrences of aio_bh_poll cannot + * be called concurrently */ int aio_bh_poll(AioContext *ctx); @@ -163,6 +168,8 @@ void qemu_bh_cancel(QEMUBH *bh); * Deleting a bottom half frees the memory that was allocated for it by * qemu_bh_new. It also implies canceling the bottom half if it was * scheduled. + * This func is async. The bottom half will do the delete action at the finial + * end. * * @bh: The bottom half to be deleted. */ From 78f27bd02ceba4a2f6ac5c725f4d4410eec205ef Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 17 Jul 2013 17:57:37 +0800 Subject: [PATCH 05/11] block: fix vvfat error path for enable_write_target s->qcow and s->qcow_filename are allocated but not freed on error. Fix the possible leaks, remove unnecessary check for bdrv_new(), propagate ret code of bdrv_create() and also the one of enable_write_target(). Signed-off-by: Fam Zheng Reviewed-by: Laszlo Ersek Signed-off-by: Stefan Hajnoczi --- block/vvfat.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/block/vvfat.c b/block/vvfat.c index 87b02799d0..cd3b8edd9f 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1164,8 +1164,8 @@ DLOG(if (stderr == NULL) { s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); if (qemu_opt_get_bool(opts, "rw", false)) { - if (enable_write_target(s)) { - ret = -EIO; + ret = enable_write_target(s); + if (ret < 0) { goto fail; } bs->read_only = 0; @@ -2917,9 +2917,7 @@ static int enable_write_target(BDRVVVFATState *s) s->qcow_filename = g_malloc(1024); ret = get_tmp_filename(s->qcow_filename, 1024); if (ret < 0) { - g_free(s->qcow_filename); - s->qcow_filename = NULL; - return ret; + goto err; } bdrv_qcow = bdrv_find_format("qcow"); @@ -2927,18 +2925,18 @@ static int enable_write_target(BDRVVVFATState *s) set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512); set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:"); - if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0) - return -1; + ret = bdrv_create(bdrv_qcow, s->qcow_filename, options); + if (ret < 0) { + goto err; + } s->qcow = bdrv_new(""); - if (s->qcow == NULL) { - return -1; - } ret = bdrv_open(s->qcow, s->qcow_filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow); if (ret < 0) { - return ret; + bdrv_delete(s->qcow); + goto err; } #ifndef _WIN32 @@ -2951,6 +2949,11 @@ static int enable_write_target(BDRVVVFATState *s) *(void**)s->bs->backing_hd->opaque = s; return 0; + +err: + g_free(s->qcow_filename); + s->qcow_filename = NULL; + return ret; } static void vvfat_close(BlockDriverState *bs) From 4105eaaab9376ea959de711b81bba9e1494c971d Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 11 Jul 2013 14:16:22 +0200 Subject: [PATCH 06/11] block: add bdrv_write_zeroes() Signed-off-by: Peter Lieven Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- block.c | 27 +++++++++++++++++++-------- include/block/block.h | 2 ++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index b56024113b..b05e2d6ce9 100644 --- a/block.c +++ b/block.c @@ -2162,6 +2162,7 @@ typedef struct RwCo { QEMUIOVector *qiov; bool is_write; int ret; + BdrvRequestFlags flags; } RwCo; static void coroutine_fn bdrv_rw_co_entry(void *opaque) @@ -2170,10 +2171,12 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) if (!rwco->is_write) { rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, 0); + rwco->nb_sectors, rwco->qiov, + rwco->flags); } else { rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, 0); + rwco->nb_sectors, rwco->qiov, + rwco->flags); } } @@ -2181,7 +2184,8 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) * Process a vectored synchronous request using coroutines */ static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, bool is_write) + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) { Coroutine *co; RwCo rwco = { @@ -2191,6 +2195,7 @@ static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, .qiov = qiov, .is_write = is_write, .ret = NOT_DONE, + .flags = flags, }; assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0); @@ -2222,7 +2227,7 @@ static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, * Process a synchronous request using coroutines */ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, - int nb_sectors, bool is_write) + int nb_sectors, bool is_write, BdrvRequestFlags flags) { QEMUIOVector qiov; struct iovec iov = { @@ -2231,14 +2236,14 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, }; qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_rwv_co(bs, sector_num, &qiov, is_write); + return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags); } /* return < 0 if error. See bdrv_write() for the return codes */ int bdrv_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false); + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); } /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ @@ -2264,12 +2269,18 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, int bdrv_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true); + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); } int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) { - return bdrv_rwv_co(bs, sector_num, qiov, true); + return bdrv_rwv_co(bs, sector_num, qiov, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE); } int bdrv_pread(BlockDriverState *bs, int64_t offset, diff --git a/include/block/block.h b/include/block/block.h index b6b9014a9c..742fce5f7f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -157,6 +157,8 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors); int bdrv_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors); int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov); int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int count); From 8bf9344ad6883e6d85b69bab36d9d76e4257e9ed Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 11 Jul 2013 14:16:23 +0200 Subject: [PATCH 07/11] block/raw: add bdrv_co_write_zeroes Signed-off-by: Peter Lieven Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- block/raw.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/raw.c b/block/raw.c index ce10422006..8c81de9345 100644 --- a/block/raw.c +++ b/block/raw.c @@ -42,6 +42,13 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum); } +static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors) +{ + return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors); +} + static int64_t raw_getlength(BlockDriverState *bs) { return bdrv_getlength(bs->file); @@ -128,6 +135,7 @@ static BlockDriver bdrv_raw = { .bdrv_co_readv = raw_co_readv, .bdrv_co_writev = raw_co_writev, .bdrv_co_is_allocated = raw_co_is_allocated, + .bdrv_co_write_zeroes = raw_co_write_zeroes, .bdrv_co_discard = raw_co_discard, .bdrv_probe = raw_probe, From 323004a39d4d8d33c744a5b108f80bfe6402fca3 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 18 Jul 2013 09:48:50 +0200 Subject: [PATCH 08/11] block-migration: efficiently encode zero blocks this patch adds a efficient encoding for zero blocks by adding a new flag indicating a block is completely zero. additionally bdrv_write_zeros() is used at the destination to efficiently write these zeroes. depending on the implementation this avoids that the destination target gets fully provisioned. Signed-off-by: Peter Lieven Signed-off-by: Stefan Hajnoczi --- block-migration.c | 32 ++++++++++++++++++++++++++------ include/migration/migration.h | 1 + migration.c | 9 +++++++++ qapi-schema.json | 8 +++++++- 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/block-migration.c b/block-migration.c index 2fd7699794..f803f2006f 100644 --- a/block-migration.c +++ b/block-migration.c @@ -29,6 +29,7 @@ #define BLK_MIG_FLAG_DEVICE_BLOCK 0x01 #define BLK_MIG_FLAG_EOS 0x02 #define BLK_MIG_FLAG_PROGRESS 0x04 +#define BLK_MIG_FLAG_ZERO_BLOCK 0x08 #define MAX_IS_ALLOCATED_SEARCH 65536 @@ -80,6 +81,7 @@ typedef struct BlkMigState { int shared_base; QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list; int64_t total_sector_sum; + bool zero_blocks; /* Protected by lock. */ QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list; @@ -114,16 +116,30 @@ static void blk_mig_unlock(void) static void blk_send(QEMUFile *f, BlkMigBlock * blk) { int len; + uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK; + + if (block_mig_state.zero_blocks && + buffer_is_zero(blk->buf, BLOCK_SIZE)) { + flags |= BLK_MIG_FLAG_ZERO_BLOCK; + } /* sector number and flags */ qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS) - | BLK_MIG_FLAG_DEVICE_BLOCK); + | flags); /* device name */ len = strlen(blk->bmds->bs->device_name); qemu_put_byte(f, len); qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len); + /* if a block is zero we need to flush here since the network + * bandwidth is now a lot higher than the storage device bandwidth. + * thus if we queue zero blocks we slow down the migration */ + if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { + qemu_fflush(f); + return; + } + qemu_put_buffer(f, blk->buf, BLOCK_SIZE); } @@ -344,6 +360,7 @@ static void init_blk_migration(QEMUFile *f) block_mig_state.total_sector_sum = 0; block_mig_state.prev_progress = -1; block_mig_state.bulk_completed = 0; + block_mig_state.zero_blocks = migrate_zero_blocks(); bdrv_iterate(init_blk_migration_it, NULL); } @@ -762,12 +779,15 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; } - buf = g_malloc(BLOCK_SIZE); + if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { + ret = bdrv_write_zeroes(bs, addr, nr_sectors); + } else { + buf = g_malloc(BLOCK_SIZE); + qemu_get_buffer(f, buf, BLOCK_SIZE); + ret = bdrv_write(bs, addr, buf, nr_sectors); + g_free(buf); + } - qemu_get_buffer(f, buf, BLOCK_SIZE); - ret = bdrv_write(bs, addr, buf, nr_sectors); - - g_free(buf); if (ret < 0) { return ret; } diff --git a/include/migration/migration.h b/include/migration/migration.h index bc9fde0b2a..701709a1e9 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -124,6 +124,7 @@ void migrate_add_blocker(Error *reason); void migrate_del_blocker(Error *reason); bool migrate_rdma_pin_all(void); +bool migrate_zero_blocks(void); bool migrate_auto_converge(void); diff --git a/migration.c b/migration.c index 9f5a4230d1..a9c042186d 100644 --- a/migration.c +++ b/migration.c @@ -493,6 +493,15 @@ bool migrate_auto_converge(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; } +bool migrate_zero_blocks(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; +} + int migrate_use_xbzrle(void) { MigrationState *s; diff --git a/qapi-schema.json b/qapi-schema.json index 8d33d527d3..592bb9c7a1 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -613,10 +613,16 @@ # Disabled by default. Experimental: may (or may not) be renamed after # further testing is complete. (since 1.6) # +# @zero-blocks: During storage migration encode blocks of zeroes efficiently. This +# essentially saves 1MB of zeroes per block on the wire. Enabling requires +# source and target VM to support this feature. To enable it is sufficient +# to enable the capability on the source VM. The feature is disabled by +# default. (since 1.6) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge'] } + 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] } ## # @MigrationCapabilityStatus From 594a45ce64dbef1829996403506a1154eb2fd1cc Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 18 Jul 2013 14:52:19 +0200 Subject: [PATCH 09/11] cpus: Let vm_stop[_force_state]() always flush block devices Even if the VM is already stopped, we cannot assume that all data has already been successfully flushed to disk. The flush during the previous vm_stop() could have failed. Run bdrv_flush_all() unconditionally so that we get an error each time if the block device isn't really flushed. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Signed-off-by: Stefan Hajnoczi --- cpus.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpus.c b/cpus.c index 8062cdd57e..2509eb5af3 100644 --- a/cpus.c +++ b/cpus.c @@ -443,11 +443,12 @@ static int do_vm_stop(RunState state) pause_all_vcpus(); runstate_set(state); vm_state_notify(0, state); - bdrv_drain_all(); - ret = bdrv_flush_all(); monitor_protocol_event(QEVENT_STOP, NULL); } + bdrv_drain_all(); + ret = bdrv_flush_all(); + return ret; } @@ -1126,7 +1127,9 @@ int vm_stop_force_state(RunState state) return vm_stop(state); } else { runstate_set(state); - return 0; + /* Make sure to return an error if the flush in a previous vm_stop() + * failed. */ + return bdrv_flush_all(); } } From 4e7395e84fc7534f6a6c6f3d5563e770501dbe2e Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 18 Jul 2013 10:37:32 +0200 Subject: [PATCH 10/11] block: fix bdrv_read_unthrottled() Signed-off-by: Peter Lieven Reviewed-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block.c b/block.c index b05e2d6ce9..6cd39fa146 100644 --- a/block.c +++ b/block.c @@ -2255,7 +2255,7 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, enabled = bs->io_limits_enabled; bs->io_limits_enabled = false; - ret = bdrv_read(bs, 0, buf, 1); + ret = bdrv_read(bs, sector_num, buf, nb_sectors); bs->io_limits_enabled = enabled; return ret; } From a23fdf355969d331f60593fa5b857d43aec25aac Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 15 Jul 2013 12:49:34 +0200 Subject: [PATCH 11/11] block/raw: add .bdrv_get_info Signed-off-by: Peter Lieven Signed-off-by: Stefan Hajnoczi --- block/raw.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/raw.c b/block/raw.c index 8c81de9345..f1682d4f32 100644 --- a/block/raw.c +++ b/block/raw.c @@ -121,6 +121,11 @@ static int raw_has_zero_init(BlockDriverState *bs) return bdrv_has_zero_init(bs->file); } +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + return bdrv_get_info(bs->file, bdi); +} + static BlockDriver bdrv_raw = { .format_name = "raw", @@ -140,6 +145,7 @@ static BlockDriver bdrv_raw = { .bdrv_probe = raw_probe, .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, .bdrv_truncate = raw_truncate, .bdrv_is_inserted = raw_is_inserted,