From e04fb07fd1676e9facd7f3f878c1bbe03bccd26b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Dec 2013 16:38:33 +0100 Subject: [PATCH 01/93] rbd: switch from pipe to QEMUBH completion notification rbd callbacks are called from non-QEMU threads. Up until now a pipe was used to signal completion back to the QEMU iothread. The pipe writer code handles EAGAIN using select(2). The select(2) API is not scalable since fd_set size is static. FD_SET() can write beyond the end of fd_set if the file descriptor number is too high. (QEMU's main loop uses poll(2) to avoid this issue with select(2).) Since the pipe itself is quite clumsy to use and QEMUBH is now thread-safe, just schedule a BH from the rbd callback function. This way we can simplify I/O completion in addition to eliminating the potential FD_SET() crash when file descriptor numbers become too high. Crash scenario: QEMU already has 1024 file descriptors open. Hotplug an rbd drive and get the pipe writer to take the select(2) code path. Reviewed-by: Josh Durgin Tested-by: Josh Durgin Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/rbd.c | 130 +++++++++------------------------------------------- 1 file changed, 22 insertions(+), 108 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index f453f04757..121fae221e 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -95,18 +95,13 @@ typedef struct RADOSCB { #define RBD_FD_WRITE 1 typedef struct BDRVRBDState { - int fds[2]; rados_t cluster; rados_ioctx_t io_ctx; rbd_image_t image; char name[RBD_MAX_IMAGE_NAME_SIZE]; char *snap; - int event_reader_pos; - RADOSCB *event_rcb; } BDRVRBDState; -static void rbd_aio_bh_cb(void *opaque); - static int qemu_rbd_next_tok(char *dst, int dst_len, char *src, char delim, const char *name, @@ -369,9 +364,8 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options, } /* - * This aio completion is being called from qemu_rbd_aio_event_reader() - * and runs in qemu context. 
It schedules a bh, but just in case the aio - * was not cancelled before. + * This aio completion is being called from rbd_finish_bh() and runs in qemu + * BH context. */ static void qemu_rbd_complete_aio(RADOSCB *rcb) { @@ -401,36 +395,19 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) acb->ret = r; } } - /* Note that acb->bh can be NULL in case where the aio was cancelled */ - acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); - qemu_bh_schedule(acb->bh); + g_free(rcb); -} -/* - * aio fd read handler. It runs in the qemu context and calls the - * completion handling of completed rados aio operations. - */ -static void qemu_rbd_aio_event_reader(void *opaque) -{ - BDRVRBDState *s = opaque; + if (acb->cmd == RBD_AIO_READ) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); + acb->status = 0; - ssize_t ret; - - do { - char *p = (char *)&s->event_rcb; - - /* now read the rcb pointer that was sent from a non qemu thread */ - ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos, - sizeof(s->event_rcb) - s->event_reader_pos); - if (ret > 0) { - s->event_reader_pos += ret; - if (s->event_reader_pos == sizeof(s->event_rcb)) { - s->event_reader_pos = 0; - qemu_rbd_complete_aio(s->event_rcb); - } - } - } while (ret < 0 && errno == EINTR); + if (!acb->cancelled) { + qemu_aio_release(acb); + } } /* TODO Convert to fine grained options */ @@ -538,23 +515,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, bs->read_only = (s->snap != NULL); - s->event_reader_pos = 0; - r = qemu_pipe(s->fds); - if (r < 0) { - error_report("error opening eventfd"); - goto failed; - } - fcntl(s->fds[0], F_SETFL, O_NONBLOCK); - fcntl(s->fds[1], F_SETFL, O_NONBLOCK); - qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader, - NULL, s); - - qemu_opts_del(opts); return 0; -failed: - rbd_close(s->image); failed_open: rados_ioctx_destroy(s->io_ctx); 
failed_shutdown: @@ -569,10 +532,6 @@ static void qemu_rbd_close(BlockDriverState *bs) { BDRVRBDState *s = bs->opaque; - close(s->fds[0]); - close(s->fds[1]); - qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL); - rbd_close(s->image); rados_ioctx_destroy(s->io_ctx); g_free(s->snap); @@ -600,34 +559,11 @@ static const AIOCBInfo rbd_aiocb_info = { .cancel = qemu_rbd_aio_cancel, }; -static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb) +static void rbd_finish_bh(void *opaque) { - int ret = 0; - while (1) { - fd_set wfd; - int fd = s->fds[RBD_FD_WRITE]; - - /* send the op pointer to the qemu thread that is responsible - for the aio/op completion. Must do it in a qemu thread context */ - ret = write(fd, (void *)&rcb, sizeof(rcb)); - if (ret >= 0) { - break; - } - if (errno == EINTR) { - continue; - } - if (errno != EAGAIN) { - break; - } - - FD_ZERO(&wfd); - FD_SET(fd, &wfd); - do { - ret = select(fd + 1, NULL, &wfd, NULL, NULL); - } while (ret < 0 && errno == EINTR); - } - - return ret; + RADOSCB *rcb = opaque; + qemu_bh_delete(rcb->acb->bh); + qemu_rbd_complete_aio(rcb); } /* @@ -635,40 +571,18 @@ static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb) * * Note: this function is being called from a non qemu thread so * we need to be careful about what we do here. Generally we only - * write to the block notification pipe, and do the rest of the - * io completion handling from qemu_rbd_aio_event_reader() which - * runs in a qemu context. + * schedule a BH, and do the rest of the io completion handling + * from rbd_finish_bh() which runs in a qemu context. 
*/ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) { - int ret; + RBDAIOCB *acb = rcb->acb; + rcb->ret = rbd_aio_get_return_value(c); rbd_aio_release(c); - ret = qemu_rbd_send_pipe(rcb->s, rcb); - if (ret < 0) { - error_report("failed writing to acb->s->fds"); - g_free(rcb); - } -} -/* Callback when all queued rbd_aio requests are complete */ - -static void rbd_aio_bh_cb(void *opaque) -{ - RBDAIOCB *acb = opaque; - - if (acb->cmd == RBD_AIO_READ) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - acb->status = 0; - - if (!acb->cancelled) { - qemu_aio_release(acb); - } + acb->bh = qemu_bh_new(rbd_finish_bh, rcb); + qemu_bh_schedule(acb->bh); } static int rbd_aio_discard_wrapper(rbd_image_t image, From 2c77f52e39ff2ba071e3b549ad7a3ebea0758edd Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 26 Nov 2013 14:40:32 +0800 Subject: [PATCH 02/93] qemu-iotests: Introduce _unsupported_imgopts Introduce _unsupported_imgopts that causes _notrun for specific image options. 
Signed-off-by: Fam Zheng Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/common.rc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 28ba0d9ad5..2489c437eb 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -406,6 +406,17 @@ _default_cache_mode() fi } +_unsupported_imgopts() +{ + for bad_opt + do + if echo "$IMGOPTS" | grep -q 2>/dev/null "$bad_opt" + then + _notrun "not suitable for image option: $bad_opt" + fi + done +} + # this test requires that a specified command (executable) exists # _require_command() From d2329f27c9c8408d4134c7243313dbaa37270384 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 26 Nov 2013 14:40:33 +0800 Subject: [PATCH 03/93] qemu-iotests: Add _unsupported_imgopts for vmdk subformats Some cases are not applicable to vmdk subformats that don't support certain features, e.g. backing files, and some others can't run on multi-file images, e.g. monolithicFlat. This adds declarations in test cases to skip them automatically, so that iotests on vmdk can go more smoothly (without manually picking cases for each subformat). 
Signed-off-by: Fam Zheng Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/017 | 1 + tests/qemu-iotests/018 | 1 + tests/qemu-iotests/019 | 3 +++ tests/qemu-iotests/020 | 3 +++ tests/qemu-iotests/034 | 3 +++ tests/qemu-iotests/037 | 3 +++ tests/qemu-iotests/059 | 3 +++ tests/qemu-iotests/063 | 3 +++ tests/qemu-iotests/069 | 1 + 9 files changed, 21 insertions(+) diff --git a/tests/qemu-iotests/017 b/tests/qemu-iotests/017 index aba3faf712..3af3cdfbc3 100755 --- a/tests/qemu-iotests/017 +++ b/tests/qemu-iotests/017 @@ -43,6 +43,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow qcow2 vmdk qed _supported_proto generic _supported_os Linux +_unsupported_imgopts "subformat=monolithicFlat" "subformat=twoGbMaxExtentFlat" TEST_OFFSETS="0 4294967296" diff --git a/tests/qemu-iotests/018 b/tests/qemu-iotests/018 index 15fcfe5670..6f7f0545d0 100755 --- a/tests/qemu-iotests/018 +++ b/tests/qemu-iotests/018 @@ -43,6 +43,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow qcow2 vmdk qed _supported_proto generic _supported_os Linux +_unsupported_imgopts "subformat=monolithicFlat" "subformat=twoGbMaxExtentFlat" TEST_OFFSETS="0 4294967296" diff --git a/tests/qemu-iotests/019 b/tests/qemu-iotests/019 index 5bb18d0c0a..b43e70f3cb 100755 --- a/tests/qemu-iotests/019 +++ b/tests/qemu-iotests/019 @@ -47,6 +47,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow qcow2 vmdk qed _supported_proto generic _supported_os Linux +_unsupported_imgopts "subformat=monolithicFlat" \ + "subformat=twoGbMaxExtentFlat" \ + "subformat=twoGbMaxExtentSparse" TEST_OFFSETS="0 4294967296" CLUSTER_SIZE=65536 diff --git a/tests/qemu-iotests/020 b/tests/qemu-iotests/020 index b3c86d844e..73a0429481 100755 --- a/tests/qemu-iotests/020 +++ b/tests/qemu-iotests/020 @@ -45,6 +45,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow qcow2 vmdk qed _supported_proto generic _supported_os Linux +_unsupported_imgopts 
"subformat=monolithicFlat" \ + "subformat=twoGbMaxExtentFlat" \ + "subformat=twoGbMaxExtentSparse" TEST_OFFSETS="0 4294967296" diff --git a/tests/qemu-iotests/034 b/tests/qemu-iotests/034 index 67f1959690..7349789583 100755 --- a/tests/qemu-iotests/034 +++ b/tests/qemu-iotests/034 @@ -41,6 +41,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow qcow2 vmdk qed _supported_proto generic _supported_os Linux +_unsupported_imgopts "subformat=monolithicFlat" \ + "subformat=twoGbMaxExtentFlat" \ + "subformat=twoGbMaxExtentSparse" CLUSTER_SIZE=4k size=128M diff --git a/tests/qemu-iotests/037 b/tests/qemu-iotests/037 index 743bae33d3..e444349e6d 100755 --- a/tests/qemu-iotests/037 +++ b/tests/qemu-iotests/037 @@ -41,6 +41,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow qcow2 vmdk qed _supported_proto generic _supported_os Linux +_unsupported_imgopts "subformat=monolithicFlat" \ + "subformat=twoGbMaxExtentFlat" \ + "subformat=twoGbMaxExtentSparse" CLUSTER_SIZE=4k size=128M diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059 index 65bea1d6c6..d8215ae596 100755 --- a/tests/qemu-iotests/059 +++ b/tests/qemu-iotests/059 @@ -42,6 +42,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt vmdk _supported_proto generic _supported_os Linux +_unsupported_imgopts "subformat=monolithicFlat" \ + "subformat=twoGbMaxExtentFlat" \ + "subformat=twoGbMaxExtentSparse" capacity_offset=16 granularity_offset=20 diff --git a/tests/qemu-iotests/063 b/tests/qemu-iotests/063 index 2ab8f20e02..77503a2984 100755 --- a/tests/qemu-iotests/063 +++ b/tests/qemu-iotests/063 @@ -44,6 +44,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow qcow2 vmdk qed raw _supported_proto generic _supported_os Linux +_unsupported_imgopts "subformat=monolithicFlat" \ + "subformat=twoGbMaxExtentFlat" \ + "subformat=twoGbMaxExtentSparse" _make_test_img 4M diff --git a/tests/qemu-iotests/069 b/tests/qemu-iotests/069 index 3042803a81..50347d91d2 100755 --- 
a/tests/qemu-iotests/069 +++ b/tests/qemu-iotests/069 @@ -41,6 +41,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt cow qed qcow qcow2 vmdk _supported_proto generic _supported_os Linux +_unsupported_imgopts "subformat=monolithicFlat" "subformat=twoGbMaxExtentFlat" IMG_SIZE=128K From 487c1910023c83fa6d550a50c8ad7ee730e60bfa Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 26 Nov 2013 14:40:34 +0800 Subject: [PATCH 04/93] qemu-iotests: Clean up all extents for vmdk This modifies _cleanup_test_img to remove all the extent files listed by "qemu-img info"'s format specific information. Signed-off-by: Fam Zheng Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/common.rc | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 2489c437eb..0f68156400 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -170,6 +170,17 @@ _make_test_img() fi } +_rm_test_img() +{ + local img=$1 + if [ "$IMGFMT" = "vmdk" ]; then + # Remove all the extents for vmdk + $QEMU_IMG info $img 2>/dev/null | grep 'filename:' | cut -f 2 -d: \ + | xargs -I {} rm -f "{}" + fi + rm -f $img +} + _cleanup_test_img() { case "$IMGPROTO" in @@ -179,9 +190,9 @@ _cleanup_test_img() rm -f "$TEST_IMG_FILE" ;; file) - rm -f "$TEST_DIR/t.$IMGFMT" - rm -f "$TEST_DIR/t.$IMGFMT.orig" - rm -f "$TEST_DIR/t.$IMGFMT.base" + _rm_test_img "$TEST_DIR/t.$IMGFMT" + _rm_test_img "$TEST_DIR/t.$IMGFMT.orig" + _rm_test_img "$TEST_DIR/t.$IMGFMT.base" if [ -n "$SAMPLE_IMG_FILE" ] then rm -f "$TEST_DIR/$SAMPLE_IMG_FILE" From 92397116a6789ed4455c6dacea0f378cae096d8d Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Fri, 20 Dec 2013 10:02:47 +0100 Subject: [PATCH 05/93] block/iscsi: return -ENOMEM if an async call fails immediately if an async libiscsi call fails directly it can only be due to an out of memory condition. All other errors are returned through the callback. 
Signed-off-by: Peter Lieven Reviewed-by: Ronnie Sahlberg Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index c0ea0c4543..76b3c96d38 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -308,7 +308,7 @@ retry: iscsi_co_generic_cb, &iTask); if (iTask.task == NULL) { g_free(buf); - return -EIO; + return -ENOMEM; } #if defined(LIBISCSI_FEATURE_IOVECTOR) scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, @@ -376,7 +376,7 @@ retry: break; } if (iTask.task == NULL) { - return -EIO; + return -ENOMEM; } #if defined(LIBISCSI_FEATURE_IOVECTOR) scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); @@ -419,7 +419,7 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) retry: if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { - return -EIO; + return -ENOMEM; } while (!iTask.complete) { @@ -669,7 +669,7 @@ retry: sector_qemu2lun(sector_num, iscsilun), 8 + 16, iscsi_co_generic_cb, &iTask) == NULL) { - ret = -EIO; + ret = -ENOMEM; goto out; } @@ -753,7 +753,7 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, retry: if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, iscsi_co_generic_cb, &iTask) == NULL) { - return -EIO; + return -ENOMEM; } while (!iTask.complete) { @@ -822,7 +822,7 @@ retry: iscsilun->zeroblock, iscsilun->block_size, nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { - return -EIO; + return -ENOMEM; } while (!iTask.complete) { From 15744b0b8f63d624bdd5825011cd201541a62094 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Sat, 21 Dec 2013 14:51:24 +0530 Subject: [PATCH 06/93] gluster: Convert aio routines into coroutines Convert the read, write, flush and discard implementations from aio-based ones to coroutine based ones. 
Signed-off-by: Bharata B Rao Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/gluster.c | 219 ++++++++++++++++-------------------------------- 1 file changed, 73 insertions(+), 146 deletions(-) diff --git a/block/gluster.c b/block/gluster.c index 563d497dc2..f9aea0ea27 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -21,19 +21,15 @@ #include "qemu/uri.h" typedef struct GlusterAIOCB { - BlockDriverAIOCB common; int64_t size; int ret; - bool *finished; QEMUBH *bh; + Coroutine *coroutine; } GlusterAIOCB; typedef struct BDRVGlusterState { struct glfs *glfs; - int fds[2]; struct glfs_fd *fd; - int event_reader_pos; - GlusterAIOCB *event_acb; } BDRVGlusterState; #define GLUSTER_FD_READ 0 @@ -231,46 +227,13 @@ out: return NULL; } -static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s) +static void qemu_gluster_complete_aio(void *opaque) { - int ret; - bool *finished = acb->finished; - BlockDriverCompletionFunc *cb = acb->common.cb; - void *opaque = acb->common.opaque; + GlusterAIOCB *acb = (GlusterAIOCB *)opaque; - if (!acb->ret || acb->ret == acb->size) { - ret = 0; /* Success */ - } else if (acb->ret < 0) { - ret = acb->ret; /* Read/Write failed */ - } else { - ret = -EIO; /* Partial read/write - fail it */ - } - - qemu_aio_release(acb); - cb(opaque, ret); - if (finished) { - *finished = true; - } -} - -static void qemu_gluster_aio_event_reader(void *opaque) -{ - BDRVGlusterState *s = opaque; - ssize_t ret; - - do { - char *p = (char *)&s->event_acb; - - ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos, - sizeof(s->event_acb) - s->event_reader_pos); - if (ret > 0) { - s->event_reader_pos += ret; - if (s->event_reader_pos == sizeof(s->event_acb)) { - s->event_reader_pos = 0; - qemu_gluster_complete_aio(s->event_acb, s); - } - } - } while (ret < 0 && errno == EINTR); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_coroutine_enter(acb->coroutine, NULL); } /* TODO Convert to fine grained options */ @@ -309,7 
+272,6 @@ static int qemu_gluster_open(BlockDriverState *bs, QDict *options, filename = qemu_opt_get(opts, "filename"); - s->glfs = qemu_gluster_init(gconf, filename); if (!s->glfs) { ret = -errno; @@ -329,18 +291,8 @@ static int qemu_gluster_open(BlockDriverState *bs, QDict *options, s->fd = glfs_open(s->glfs, gconf->image, open_flags); if (!s->fd) { ret = -errno; - goto out; } - ret = qemu_pipe(s->fds); - if (ret < 0) { - ret = -errno; - goto out; - } - fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], - qemu_gluster_aio_event_reader, NULL, s); - out: qemu_opts_del(opts); qemu_gluster_gconf_free(gconf); @@ -398,58 +350,37 @@ out: return ret; } -static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)blockacb; - bool finished = false; - - acb->finished = &finished; - while (!finished) { - qemu_aio_wait(); - } -} - -static const AIOCBInfo gluster_aiocb_info = { - .aiocb_size = sizeof(GlusterAIOCB), - .cancel = qemu_gluster_aio_cancel, -}; - +/* + * AIO callback routine called from GlusterFS thread. + */ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) { GlusterAIOCB *acb = (GlusterAIOCB *)arg; - BlockDriverState *bs = acb->common.bs; - BDRVGlusterState *s = bs->opaque; - int retval; - acb->ret = ret; - retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb)); - if (retval != sizeof(acb)) { - /* - * Gluster AIO callback thread failed to notify the waiting - * QEMU thread about IO completion. 
- */ - error_report("Gluster AIO completion failed: %s", strerror(errno)); - abort(); + if (!ret || ret == acb->size) { + acb->ret = 0; /* Success */ + } else if (ret < 0) { + acb->ret = ret; /* Read/Write failed */ + } else { + acb->ret = -EIO; /* Partial read/write - fail it */ } + + acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb); + qemu_bh_schedule(acb->bh); } -static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque, int write) +static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - size_t size; - off_t offset; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = size; acb->ret = 0; - acb->finished = NULL; + acb->coroutine = qemu_coroutine_self(); if (write) { ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, @@ -460,13 +391,16 @@ static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, } if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) @@ -482,71 +416,68 @@ static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) return 0; } -static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, + int64_t sector_num, int 
nb_sectors, QEMUIOVector *qiov) { - return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0); } -static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } -static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = 0; acb->ret = 0; - acb->finished = NULL; + acb->coroutine = qemu_coroutine_self(); ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb); if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } #ifdef CONFIG_GLUSTERFS_DISCARD -static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque) +static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - size_t size; - off_t offset; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - - acb = 
qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = 0; acb->ret = 0; - acb->finished = NULL; + acb->coroutine = qemu_coroutine_self(); ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb); if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } #endif @@ -581,10 +512,6 @@ static void qemu_gluster_close(BlockDriverState *bs) { BDRVGlusterState *s = bs->opaque; - close(s->fds[GLUSTER_FD_READ]); - close(s->fds[GLUSTER_FD_WRITE]); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL); - if (s->fd) { glfs_close(s->fd); s->fd = NULL; @@ -618,12 +545,12 @@ static BlockDriver bdrv_gluster = { .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, #endif .create_options = qemu_gluster_create_options, }; @@ -639,12 +566,12 @@ static BlockDriver bdrv_gluster_tcp = { .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef 
CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, #endif .create_options = qemu_gluster_create_options, }; @@ -660,12 +587,12 @@ static BlockDriver bdrv_gluster_unix = { .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, #endif .create_options = qemu_gluster_create_options, }; @@ -681,12 +608,12 @@ static BlockDriver bdrv_gluster_rdma = { .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, #endif .create_options = qemu_gluster_create_options, }; From 7c815372f3b37754b2a568e82f0521c7f77a6f66 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Sat, 21 Dec 2013 14:51:25 +0530 Subject: [PATCH 07/93] gluster: Implement .bdrv_co_write_zeroes for gluster Support .bdrv_co_write_zeroes() from gluster driver by using GlusterFS API glfs_zerofill() that off-loads the writing of zeroes to GlusterFS server. 
Signed-off-by: Bharata B Rao Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/gluster.c | 79 +++++++++++++++++++++++++++++++++++++------------ configure | 8 +++++ 2 files changed, 68 insertions(+), 19 deletions(-) diff --git a/block/gluster.c b/block/gluster.c index f9aea0ea27..c11f60c671 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -236,6 +236,25 @@ static void qemu_gluster_complete_aio(void *opaque) qemu_coroutine_enter(acb->coroutine, NULL); } +/* + * AIO callback routine called from GlusterFS thread. + */ +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) +{ + GlusterAIOCB *acb = (GlusterAIOCB *)arg; + + if (!ret || ret == acb->size) { + acb->ret = 0; /* Success */ + } else if (ret < 0) { + acb->ret = ret; /* Read/Write failed */ + } else { + acb->ret = -EIO; /* Partial read/write - fail it */ + } + + acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb); + qemu_bh_schedule(acb->bh); +} + /* TODO Convert to fine grained options */ static QemuOptsList runtime_opts = { .name = "gluster", @@ -308,6 +327,35 @@ out: return ret; } +#ifdef CONFIG_GLUSTERFS_ZEROFILL +static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + int ret; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); + BDRVGlusterState *s = bs->opaque; + off_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; + + acb->size = size; + acb->ret = 0; + acb->coroutine = qemu_coroutine_self(); + + ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb); + if (ret < 0) { + ret = -errno; + goto out; + } + + qemu_coroutine_yield(); + ret = acb->ret; + +out: + g_slice_free(GlusterAIOCB, acb); + return ret; +} +#endif + static int qemu_gluster_create(const char *filename, QEMUOptionParameter *options, Error **errp) { @@ -350,25 +398,6 @@ out: return ret; } -/* - * AIO callback routine called from GlusterFS thread. 
- */ -static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)arg; - - if (!ret || ret == acb->size) { - acb->ret = 0; /* Success */ - } else if (ret < 0) { - acb->ret = ret; /* Read/Write failed */ - } else { - acb->ret = -EIO; /* Partial read/write - fail it */ - } - - acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb); - qemu_bh_schedule(acb->bh); -} - static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) { @@ -551,6 +580,9 @@ static BlockDriver bdrv_gluster = { .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -572,6 +604,9 @@ static BlockDriver bdrv_gluster_tcp = { .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -593,6 +628,9 @@ static BlockDriver bdrv_gluster_unix = { .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -614,6 +652,9 @@ static BlockDriver bdrv_gluster_rdma = { .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; diff --git a/configure b/configure index 3782a6a26a..b472694cb2 100755 --- 
a/configure +++ b/configure @@ -256,6 +256,7 @@ coroutine_pool="" seccomp="" glusterfs="" glusterfs_discard="no" +glusterfs_zerofill="no" virtio_blk_data_plane="" gtk="" gtkabi="2.0" @@ -2701,6 +2702,9 @@ if test "$glusterfs" != "no" ; then if $pkg_config --atleast-version=5 glusterfs-api; then glusterfs_discard="yes" fi + if $pkg_config --atleast-version=6 glusterfs-api; then + glusterfs_zerofill="yes" + fi else if test "$glusterfs" = "yes" ; then feature_not_found "GlusterFS backend support" @@ -4229,6 +4233,10 @@ if test "$glusterfs_discard" = "yes" ; then echo "CONFIG_GLUSTERFS_DISCARD=y" >> $config_host_mak fi +if test "$glusterfs_zerofill" = "yes" ; then + echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak +fi + if test "$libssh2" = "yes" ; then echo "CONFIG_LIBSSH2=y" >> $config_host_mak fi From cf7f616b9d846b1cc21c7b692b5c9ff6f757a5e7 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Sat, 21 Dec 2013 14:51:26 +0530 Subject: [PATCH 08/93] gluster: Add support for creating zero-filled image GlusterFS supports creation of zero-filled file on GlusterFS volume by means of an API called glfs_zerofill(). Use this API from QEMU to create an image that is filled with zeroes by using the preallocation option of qemu-img. qemu-img create gluster://server/volume/image -o preallocation=full 10G The allowed values for preallocation are 'full' and 'off'. By default preallocation is off and image is not zero-filled. glfs_zerofill() offloads the writing of zeroes to the server and if the storage supports SCSI WRITESAME, GlusterFS server can issue BLKZEROOUT ioctl to achieve the zeroing. 
Signed-off-by: Bharata B Rao Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/gluster.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/block/gluster.c b/block/gluster.c index c11f60c671..a009b15ded 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -354,6 +354,29 @@ out: g_slice_free(GlusterAIOCB, acb); return ret; } + +static inline bool gluster_supports_zerofill(void) +{ + return 1; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return glfs_zerofill(fd, offset, size); +} + +#else +static inline bool gluster_supports_zerofill(void) +{ + return 0; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return 0; +} #endif static int qemu_gluster_create(const char *filename, @@ -362,6 +385,7 @@ static int qemu_gluster_create(const char *filename, struct glfs *glfs; struct glfs_fd *fd; int ret = 0; + int prealloc = 0; int64_t total_size = 0; GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); @@ -374,6 +398,19 @@ static int qemu_gluster_create(const char *filename, while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { total_size = options->value.n / BDRV_SECTOR_SIZE; + } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { + if (!options->value.s || !strcmp(options->value.s, "off")) { + prealloc = 0; + } else if (!strcmp(options->value.s, "full") && + gluster_supports_zerofill()) { + prealloc = 1; + } else { + error_setg(errp, "Invalid preallocation mode: '%s'" + " or GlusterFS doesn't support zerofill API", + options->value.s); + ret = -EINVAL; + goto out; + } } options++; } @@ -383,9 +420,15 @@ static int qemu_gluster_create(const char *filename, if (!fd) { ret = -errno; } else { - if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { + if (!glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE)) { + if (prealloc && qemu_gluster_zerofill(fd, 
0, + total_size * BDRV_SECTOR_SIZE)) { + ret = -errno; + } + } else { ret = -errno; } + if (glfs_close(fd) != 0) { ret = -errno; } @@ -560,6 +603,11 @@ static QEMUOptionParameter qemu_gluster_create_options[] = { .type = OPT_SIZE, .help = "Virtual disk size" }, + { + .name = BLOCK_OPT_PREALLOC, + .type = OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, { NULL } }; From 9f23fce7b2e78b917f03ccd366e3e151c0a1a419 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Fri, 3 Jan 2014 20:13:12 +0800 Subject: [PATCH 09/93] sheepdog: fix clone operation by 'qemu-img create -b' We should pass base_inode->vdi_id to base_vdi_id of SheepdogVdiReq so that sheep can create a clone instead of a fresh volume. This fixes the following command: qemu-img create -b sheepdog:base sheepdog:clone so users can boot sheepdog:clone as a normal volume. Cc: qemu-devel@nongnu.org Cc: Kevin Wolf Cc: Stefan Hajnoczi Signed-off-by: Liu Yuan Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/sheepdog.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index b94ab6e10a..6088fa5571 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -161,7 +161,7 @@ typedef struct SheepdogVdiReq { uint32_t id; uint32_t data_length; uint64_t vdi_size; - uint32_t vdi_id; + uint32_t base_vdi_id; uint8_t copies; uint8_t copy_policy; uint8_t reserved[2]; @@ -1493,7 +1493,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot) memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_NEW_VDI; - hdr.vdi_id = s->inode.vdi_id; + hdr.base_vdi_id = s->inode.vdi_id; wlen = SD_MAX_VDI_LEN; @@ -1684,7 +1684,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options, if (backing_file) { BlockDriverState *bs; - BDRVSheepdogState *s; + BDRVSheepdogState *base; BlockDriver *drv; /* Currently, only Sheepdog backing image is supported.
*/ @@ -1702,15 +1702,15 @@ static int sd_create(const char *filename, QEMUOptionParameter *options, goto out; } - s = bs->opaque; + base = bs->opaque; - if (!is_snapshot(&s->inode)) { + if (!is_snapshot(&base->inode)) { error_report("cannot clone from a non snapshot vdi"); bdrv_unref(bs); ret = -EINVAL; goto out; } - + s->inode.vdi_id = base->inode.vdi_id; bdrv_unref(bs); } @@ -1743,7 +1743,7 @@ static void sd_close(BlockDriverState *bs) memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_RELEASE_VDI; - hdr.vdi_id = s->inode.vdi_id; + hdr.base_vdi_id = s->inode.vdi_id; wlen = strlen(s->name) + 1; hdr.data_length = wlen; hdr.flags = SD_FLAG_CMD_WRITE; @@ -1846,7 +1846,7 @@ static bool sd_delete(BDRVSheepdogState *s) unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; SheepdogVdiReq hdr = { .opcode = SD_OP_DEL_VDI, - .vdi_id = s->inode.vdi_id, + .base_vdi_id = s->inode.vdi_id, .data_length = wlen, .flags = SD_FLAG_CMD_WRITE, }; From b7fcff01790d25f48d81ef6c8c3399577096a555 Mon Sep 17 00:00:00 2001 From: Kewei Yu Date: Mon, 6 Jan 2014 14:05:24 +0800 Subject: [PATCH 10/93] qtest: Fix the bug about disable vnc causes "make check" fail When we disable vnc from "./configure", QEMU can't use the vnc option. So qtest can't use "-vnc none ", otherwise "make check" fails. If QEMU uses "-display none", "-vnc none" is redundant, so we just need to drop it.
Signed-off-by: Kewei Yu Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- tests/fdc-test.c | 5 +---- tests/ide-test.c | 3 --- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/fdc-test.c b/tests/fdc-test.c index 38b5b178d0..37096dcc13 100644 --- a/tests/fdc-test.c +++ b/tests/fdc-test.c @@ -518,7 +518,6 @@ static void fuzz_registers(void) int main(int argc, char **argv) { const char *arch = qtest_get_arch(); - char *cmdline; int fd; int ret; @@ -538,9 +537,7 @@ int main(int argc, char **argv) /* Run the tests */ g_test_init(&argc, &argv, NULL); - cmdline = g_strdup_printf("-vnc none "); - - qtest_start(cmdline); + qtest_start(NULL); qtest_irq_intercept_in(global_qtest, "ioapic"); qtest_add_func("/fdc/cmos", test_cmos); qtest_add_func("/fdc/no_media_on_start", test_no_media_on_start); diff --git a/tests/ide-test.c b/tests/ide-test.c index d5cec5a1fc..4a0d97f197 100644 --- a/tests/ide-test.c +++ b/tests/ide-test.c @@ -380,7 +380,6 @@ static void test_bmdma_no_busmaster(void) static void test_bmdma_setup(void) { ide_test_start( - "-vnc none " "-drive file=%s,if=ide,serial=%s,cache=writeback " "-global ide-hd.ver=%s", tmp_path, "testdisk", "version"); @@ -410,7 +409,6 @@ static void test_identify(void) int ret; ide_test_start( - "-vnc none " "-drive file=%s,if=ide,serial=%s,cache=writeback " "-global ide-hd.ver=%s", tmp_path, "testdisk", "version"); @@ -455,7 +453,6 @@ static void test_flush(void) uint8_t data; ide_test_start( - "-vnc none " "-drive file=blkdebug::%s,if=ide,cache=writeback", tmp_path); From 7fa9e1f941b4be1f71bb42de2f2ed8805d7e7326 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 6 Jan 2014 12:39:01 +0800 Subject: [PATCH 11/93] docs: qcow2 compat=1.1 is now the default Commit 9117b47717ad208b12786ce88eacb013f9b3dd1c ("qcow2: Change default for new images to compat=1.1") changed the default qcow2 image format version but forgot to update qemu-doc.texi and qemu-img.texi. 
Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- qemu-doc.texi | 8 ++++---- qemu-img.texi | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/qemu-doc.texi b/qemu-doc.texi index 4e9c6e9b6e..ce61f30d6e 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -536,11 +536,11 @@ support of multiple VM snapshots. Supported options: @table @code @item compat -Determines the qcow2 version to use. @code{compat=0.10} uses the traditional -image format that can be read by any QEMU since 0.10 (this is the default). +Determines the qcow2 version to use. @code{compat=0.10} uses the +traditional image format that can be read by any QEMU since 0.10. @code{compat=1.1} enables image format extensions that only QEMU 1.1 and -newer understand. Amongst others, this includes zero clusters, which allow -efficient copy-on-read for sparse images. +newer understand (this is the default). Amongst others, this includes +zero clusters, which allow efficient copy-on-read for sparse images. @item backing_file File name of a base image (see @option{create} subcommand) diff --git a/qemu-img.texi b/qemu-img.texi index 1bba91efde..778e967f39 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -391,11 +391,11 @@ support of multiple VM snapshots. Supported options: @table @code @item compat -Determines the qcow2 version to use. @code{compat=0.10} uses the traditional -image format that can be read by any QEMU since 0.10 (this is the default). +Determines the qcow2 version to use. @code{compat=0.10} uses the +traditional image format that can be read by any QEMU since 0.10. @code{compat=1.1} enables image format extensions that only QEMU 1.1 and -newer understand. Amongst others, this includes zero clusters, which allow -efficient copy-on-read for sparse images. +newer understand (this is the default). Amongst others, this includes zero +clusters, which allow efficient copy-on-read for sparse images. 
@item backing_file File name of a base image (see @option{create} subcommand) From 585ea0c841df47c1542d33e17c5c6d532316ef74 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 8 Jan 2014 09:42:07 +0800 Subject: [PATCH 12/93] vmdk: Fix big flat extent IO Local variable "n" as int64_t avoids overflow with large sector number calculation. See test case change for failure case. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/vmdk.c | 4 +-- tests/qemu-iotests/059 | 7 ++++ tests/qemu-iotests/059.out | 74 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index c6b60b4a91..22b99b04ac 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1325,8 +1325,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, { BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; - int n, ret; - int64_t index_in_cluster; + int ret; + int64_t index_in_cluster, n; uint64_t extent_begin_sector, extent_relative_sector_num; uint64_t cluster_offset; VmdkMetaData m_data; diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059 index d8215ae596..64ed04cfce 100755 --- a/tests/qemu-iotests/059 +++ b/tests/qemu-iotests/059 @@ -102,6 +102,13 @@ echo "=== Testing version 3 ===" _use_sample_img iotest-version3.vmdk.bz2 _img_info +echo +echo "=== Testing 4TB monolithicFlat creation and IO ===" +IMGOPTS="subformat=monolithicFlat" _make_test_img 4T +_img_info +$QEMU_IO -c "write -P 0xa 900G 512" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -v 900G 1024" "$TEST_IMG" | _filter_qemu_io + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out index 16ab7c6c1f..5e30e69bef 100644 --- a/tests/qemu-iotests/059.out +++ b/tests/qemu-iotests/059.out @@ -2047,4 +2047,78 @@ RW 12582912 VMFS "dummy.IMGFMT" 1 image: TEST_DIR/iotest-version3.IMGFMT file format: IMGFMT virtual size: 1.0G (1073741824 bytes) + +=== Testing 4TB monolithicFlat creation 
and IO === +Formatting 'TEST_DIR/iotest-version3.IMGFMT', fmt=IMGFMT size=4398046511104 +image: TEST_DIR/iotest-version3.IMGFMT +file format: IMGFMT +virtual size: 4.0T (4398046511104 bytes) +wrote 512/512 bytes at offset 966367641600 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +e100000000: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000010: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000020: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000030: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000040: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000050: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000060: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000070: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000080: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000090: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000000a0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000000b0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000000c0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000000d0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000000e0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000000f0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000100: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000110: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000120: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000130: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000140: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ 
+e100000150: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000160: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000170: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000180: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000190: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000001a0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000001b0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000001c0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000001d0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000001e0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e1000001f0: 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a ................ +e100000200: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000210: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000220: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000230: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000240: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000250: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000260: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000270: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000280: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000290: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000002a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000002b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000002c0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000002d0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+e1000002e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000002f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000310: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000320: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000330: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000340: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000350: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000360: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000370: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000380: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e100000390: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000003a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000003b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000003c0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000003d0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000003e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +e1000003f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +read 1024/1024 bytes at offset 966367641600 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) *** done From c60bf3391bf4cb79b7adc6650094e21671ddaabd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 14 Nov 2013 11:54:14 +0100 Subject: [PATCH 13/93] readline: decouple readline from the monitor Make the readline.c functionality reusable. Instead of calling monitor_printf() and monitor_flush() directly, invoke function pointers provided by the user. This way readline.c does not know about Monitor and other users will be able to make use of readline.c. 
Note that there is already an "opaque" argument to the ReadLineFunc callback. Consistently call it "readline_opaque" from now on to distinguish from the ReadLinePrintfFunc/ReadLineFlushFunc "opaque" argument. I also dropped the printf macro trickery since it's now highly unlikely that anyone modifying readline.c would call printf(3) directly. We no longer need this protection. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- hmp.c | 6 +++--- include/monitor/readline.h | 20 +++++++++++------ monitor.c | 39 +++++++++++++++++++++++++++------ readline.c | 44 ++++++++++++++++++++------------------ 4 files changed, 71 insertions(+), 38 deletions(-) diff --git a/hmp.c b/hmp.c index 79f9c7d2ce..468f97d176 100644 --- a/hmp.c +++ b/hmp.c @@ -1092,11 +1092,11 @@ void hmp_eject(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } -static void hmp_change_read_arg(Monitor *mon, const char *password, - void *opaque) +static void hmp_change_read_arg(void *opaque, const char *password, + void *readline_opaque) { qmp_change_vnc_password(password, NULL); - monitor_read_command(mon, 1); + monitor_read_command(opaque, 1); } void hmp_change(Monitor *mon, const QDict *qdict) diff --git a/include/monitor/readline.h b/include/monitor/readline.h index 0faf6e1db7..a89fe4a9a9 100644 --- a/include/monitor/readline.h +++ b/include/monitor/readline.h @@ -1,14 +1,15 @@ #ifndef READLINE_H #define READLINE_H -#include "qemu-common.h" - #define READLINE_CMD_BUF_SIZE 4095 #define READLINE_MAX_CMDS 64 #define READLINE_MAX_COMPLETIONS 256 -typedef void ReadLineFunc(Monitor *mon, const char *str, void *opaque); -typedef void ReadLineCompletionFunc(Monitor *mon, +typedef void ReadLinePrintfFunc(void *opaque, const char *fmt, ...); +typedef void ReadLineFlushFunc(void *opaque); +typedef void ReadLineFunc(void *opaque, const char *str, + void *readline_opaque); +typedef void ReadLineCompletionFunc(void *opaque, const char *cmdline); typedef struct ReadLineState { @@ -35,7 +36,10 @@ 
typedef struct ReadLineState { void *readline_opaque; int read_password; char prompt[256]; - Monitor *mon; + + ReadLinePrintfFunc *printf_func; + ReadLineFlushFunc *flush_func; + void *opaque; } ReadLineState; void readline_add_completion(ReadLineState *rs, const char *str); @@ -46,11 +50,13 @@ const char *readline_get_history(ReadLineState *rs, unsigned int index); void readline_handle_byte(ReadLineState *rs, int ch); void readline_start(ReadLineState *rs, const char *prompt, int read_password, - ReadLineFunc *readline_func, void *opaque); + ReadLineFunc *readline_func, void *readline_opaque); void readline_restart(ReadLineState *rs); void readline_show_prompt(ReadLineState *rs); -ReadLineState *readline_init(Monitor *mon, +ReadLineState *readline_init(ReadLinePrintfFunc *printf_func, + ReadLineFlushFunc *flush_func, + void *opaque, ReadLineCompletionFunc *completion_finder); #endif /* !READLINE_H */ diff --git a/monitor.c b/monitor.c index 845f608665..32d02640a0 100644 --- a/monitor.c +++ b/monitor.c @@ -217,8 +217,8 @@ static const mon_cmd_t qmp_cmds[]; Monitor *cur_mon; Monitor *default_mon; -static void monitor_command_cb(Monitor *mon, const char *cmdline, - void *opaque); +static void monitor_command_cb(void *opaque, const char *cmdline, + void *readline_opaque); static inline int qmp_cmd_mode(const Monitor *mon) { @@ -4338,9 +4338,10 @@ static void monitor_find_completion_by_table(Monitor *mon, } } -static void monitor_find_completion(Monitor *mon, +static void monitor_find_completion(void *opaque, const char *cmdline) { + Monitor *mon = opaque; char *args[MAX_ARGS]; int nb_args, len; @@ -4751,8 +4752,11 @@ static void monitor_read(void *opaque, const uint8_t *buf, int size) cur_mon = old_mon; } -static void monitor_command_cb(Monitor *mon, const char *cmdline, void *opaque) +static void monitor_command_cb(void *opaque, const char *cmdline, + void *readline_opaque) { + Monitor *mon = opaque; + monitor_suspend(mon); handle_user_command(mon, cmdline); 
monitor_resume(mon); @@ -4881,6 +4885,22 @@ static void sortcmdlist(void) * End: */ +/* These functions just adapt the readline interface in a typesafe way. We + * could cast function pointers but that discards compiler checks. + */ +static void monitor_readline_printf(void *opaque, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + monitor_vprintf(opaque, fmt, ap); + va_end(ap); +} + +static void monitor_readline_flush(void *opaque) +{ + monitor_flush(opaque); +} + void monitor_init(CharDriverState *chr, int flags) { static int is_first_init = 1; @@ -4898,7 +4918,10 @@ void monitor_init(CharDriverState *chr, int flags) mon->chr = chr; mon->flags = flags; if (flags & MONITOR_USE_READLINE) { - mon->rs = readline_init(mon, monitor_find_completion); + mon->rs = readline_init(monitor_readline_printf, + monitor_readline_flush, + mon, + monitor_find_completion); monitor_read_command(mon, 0); } @@ -4920,9 +4943,11 @@ void monitor_init(CharDriverState *chr, int flags) default_mon = mon; } -static void bdrv_password_cb(Monitor *mon, const char *password, void *opaque) +static void bdrv_password_cb(void *opaque, const char *password, + void *readline_opaque) { - BlockDriverState *bs = opaque; + Monitor *mon = opaque; + BlockDriverState *bs = readline_opaque; int ret = 0; if (bdrv_set_key(bs, password) != 0) { diff --git a/readline.c b/readline.c index abf27ddec3..ca894d1854 100644 --- a/readline.c +++ b/readline.c @@ -21,21 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ + +#include "qemu-common.h" #include "monitor/readline.h" -#include "monitor/monitor.h" #define IS_NORM 0 #define IS_ESC 1 #define IS_CSI 2 #define IS_SS3 3 -#undef printf -#define printf do_not_use_printf - void readline_show_prompt(ReadLineState *rs) { - monitor_printf(rs->mon, "%s", rs->prompt); - monitor_flush(rs->mon); + rs->printf_func(rs->opaque, "%s", rs->prompt); + rs->flush_func(rs->opaque); rs->last_cmd_buf_index = 0; rs->last_cmd_buf_size = 0; rs->esc_state = IS_NORM; @@ -49,17 +47,17 @@ static void readline_update(ReadLineState *rs) if (rs->cmd_buf_size != rs->last_cmd_buf_size || memcmp(rs->cmd_buf, rs->last_cmd_buf, rs->cmd_buf_size) != 0) { for(i = 0; i < rs->last_cmd_buf_index; i++) { - monitor_printf(rs->mon, "\033[D"); + rs->printf_func(rs->opaque, "\033[D"); } rs->cmd_buf[rs->cmd_buf_size] = '\0'; if (rs->read_password) { len = strlen(rs->cmd_buf); for(i = 0; i < len; i++) - monitor_printf(rs->mon, "*"); + rs->printf_func(rs->opaque, "*"); } else { - monitor_printf(rs->mon, "%s", rs->cmd_buf); + rs->printf_func(rs->opaque, "%s", rs->cmd_buf); } - monitor_printf(rs->mon, "\033[K"); + rs->printf_func(rs->opaque, "\033[K"); memcpy(rs->last_cmd_buf, rs->cmd_buf, rs->cmd_buf_size); rs->last_cmd_buf_size = rs->cmd_buf_size; rs->last_cmd_buf_index = rs->cmd_buf_size; @@ -68,17 +66,17 @@ static void readline_update(ReadLineState *rs) delta = rs->cmd_buf_index - rs->last_cmd_buf_index; if (delta > 0) { for(i = 0;i < delta; i++) { - monitor_printf(rs->mon, "\033[C"); + rs->printf_func(rs->opaque, "\033[C"); } } else { delta = -delta; for(i = 0;i < delta; i++) { - monitor_printf(rs->mon, "\033[D"); + rs->printf_func(rs->opaque, "\033[D"); } } rs->last_cmd_buf_index = rs->cmd_buf_index; } - monitor_flush(rs->mon); + rs->flush_func(rs->opaque); } static void readline_insert_char(ReadLineState *rs, int ch) @@ -284,7 +282,7 @@ static void readline_completion(ReadLineState *rs) cmdline = g_malloc(rs->cmd_buf_index + 1); memcpy(cmdline, rs->cmd_buf, 
rs->cmd_buf_index); cmdline[rs->cmd_buf_index] = '\0'; - rs->completion_finder(rs->mon, cmdline); + rs->completion_finder(rs->opaque, cmdline); g_free(cmdline); /* no completion found */ @@ -299,7 +297,7 @@ static void readline_completion(ReadLineState *rs) if (len > 0 && rs->completions[0][len - 1] != '/') readline_insert_char(rs, ' '); } else { - monitor_printf(rs->mon, "\n"); + rs->printf_func(rs->opaque, "\n"); max_width = 0; max_prefix = 0; for(i = 0; i < rs->nb_completions; i++) { @@ -329,9 +327,9 @@ static void readline_completion(ReadLineState *rs) nb_cols = 80 / max_width; j = 0; for(i = 0; i < rs->nb_completions; i++) { - monitor_printf(rs->mon, "%-*s", max_width, rs->completions[i]); + rs->printf_func(rs->opaque, "%-*s", max_width, rs->completions[i]); if (++j == nb_cols || i == (rs->nb_completions - 1)) { - monitor_printf(rs->mon, "\n"); + rs->printf_func(rs->opaque, "\n"); j = 0; } } @@ -365,12 +363,12 @@ void readline_handle_byte(ReadLineState *rs, int ch) rs->cmd_buf[rs->cmd_buf_size] = '\0'; if (!rs->read_password) readline_hist_add(rs, rs->cmd_buf); - monitor_printf(rs->mon, "\n"); + rs->printf_func(rs->opaque, "\n"); rs->cmd_buf_index = 0; rs->cmd_buf_size = 0; rs->last_cmd_buf_index = 0; rs->last_cmd_buf_size = 0; - rs->readline_func(rs->mon, rs->cmd_buf, rs->readline_opaque); + rs->readline_func(rs->opaque, rs->cmd_buf, rs->readline_opaque); break; case 23: /* ^W */ @@ -480,13 +478,17 @@ const char *readline_get_history(ReadLineState *rs, unsigned int index) return rs->history[index]; } -ReadLineState *readline_init(Monitor *mon, +ReadLineState *readline_init(ReadLinePrintfFunc *printf_func, + ReadLineFlushFunc *flush_func, + void *opaque, ReadLineCompletionFunc *completion_finder) { ReadLineState *rs = g_malloc0(sizeof(*rs)); rs->hist_entry = -1; - rs->mon = mon; + rs->opaque = opaque; + rs->printf_func = printf_func; + rs->flush_func = flush_func; rs->completion_finder = completion_finder; return rs; From 
0150cd81cf608b93778a067189829f354fe27e4b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 14 Nov 2013 11:54:15 +0100 Subject: [PATCH 14/93] readline: move readline to a generic location Now that the monitor and readline are decoupled, readline.h no longer belongs in include/monitor/. Put the header into include/qemu/. Move the source file into util/ so it can be linked as part of libqemuutil.a. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- Makefile.objs | 1 - include/monitor/monitor.h | 2 +- include/{monitor => qemu}/readline.h | 0 monitor.c | 2 +- util/Makefile.objs | 1 + readline.c => util/readline.c | 2 +- 6 files changed, 4 insertions(+), 4 deletions(-) rename include/{monitor => qemu}/readline.h (100%) rename readline.c => util/readline.c (99%) diff --git a/Makefile.objs b/Makefile.objs index 857bb53ae4..ac1d0e1c28 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -43,7 +43,6 @@ libcacard-y += libcacard/vcardt.o ifeq ($(CONFIG_SOFTMMU),y) common-obj-y = $(block-obj-y) blockdev.o blockdev-nbd.o block/ common-obj-y += net/ -common-obj-y += readline.o common-obj-y += qdev-monitor.o device-hotplug.o common-obj-$(CONFIG_WIN32) += os-win32.o common-obj-$(CONFIG_POSIX) += os-posix.o diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index 22d8b8f3e0..7e5f752b7a 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -5,7 +5,7 @@ #include "qapi/qmp/qerror.h" #include "qapi/qmp/qdict.h" #include "block/block.h" -#include "monitor/readline.h" +#include "qemu/readline.h" extern Monitor *cur_mon; extern Monitor *default_mon; diff --git a/include/monitor/readline.h b/include/qemu/readline.h similarity index 100% rename from include/monitor/readline.h rename to include/qemu/readline.h diff --git a/monitor.c b/monitor.c index 32d02640a0..80456fbe5b 100644 --- a/monitor.c +++ b/monitor.c @@ -37,7 +37,7 @@ #include "ui/qemu-spice.h" #include "sysemu/sysemu.h" #include "monitor/monitor.h" -#include "monitor/readline.h" 
+#include "qemu/readline.h" #include "ui/console.h" #include "sysemu/blockdev.h" #include "audio/audio.h" diff --git a/util/Makefile.objs b/util/Makefile.objs index af3e5cb157..937376b082 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -13,3 +13,4 @@ util-obj-y += hexdump.o util-obj-y += crc32c.o util-obj-y += throttle.o util-obj-y += getauxval.o +util-obj-y += readline.o diff --git a/readline.c b/util/readline.c similarity index 99% rename from readline.c rename to util/readline.c index ca894d1854..8441be484c 100644 --- a/readline.c +++ b/util/readline.c @@ -23,7 +23,7 @@ */ #include "qemu-common.h" -#include "monitor/readline.h" +#include "qemu/readline.h" #define IS_NORM 0 #define IS_ESC 1 From 13401ba0b982024b62a99388032bbb889dc98b43 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 14 Nov 2013 11:54:16 +0100 Subject: [PATCH 15/93] osdep: add qemu_set_tty_echo() Using stdin with readline.c requires disabling echo and line buffering. Add a portable wrapper to set the terminal attributes under Linux and Windows. 
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- include/qemu/osdep.h | 2 ++ util/oslib-posix.c | 18 ++++++++++++++++++ util/oslib-win32.c | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index b3e2b6d8ea..eac7172bcb 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -240,4 +240,6 @@ static inline void qemu_init_auxval(char **envp) { } void qemu_init_auxval(char **envp); #endif +void qemu_set_tty_echo(int fd, bool echo); + #endif diff --git a/util/oslib-posix.c b/util/oslib-posix.c index e00a44c86f..f5c401646f 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -47,6 +47,9 @@ extern int daemon(int, int); # define QEMU_VMALLOC_ALIGN getpagesize() #endif +#include +#include + #include #include "config-host.h" @@ -251,3 +254,18 @@ qemu_get_local_state_pathname(const char *relative_pathname) return g_strdup_printf("%s/%s", CONFIG_QEMU_LOCALSTATEDIR, relative_pathname); } + +void qemu_set_tty_echo(int fd, bool echo) +{ + struct termios tty; + + tcgetattr(fd, &tty); + + if (echo) { + tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN; + } else { + tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN); + } + + tcsetattr(fd, TCSANOW, &tty); +} diff --git a/util/oslib-win32.c b/util/oslib-win32.c index 776ccfaaf0..50be0440f2 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -189,3 +189,22 @@ qemu_get_local_state_pathname(const char *relative_pathname) return g_strdup_printf("%s" G_DIR_SEPARATOR_S "%s", base_path, relative_pathname); } + +void qemu_set_tty_echo(int fd, bool echo) +{ + HANDLE handle = (HANDLE)_get_osfhandle(fd); + DWORD dwMode = 0; + + if (handle == INVALID_HANDLE_VALUE) { + return; + } + + GetConsoleMode(handle, &dwMode); + + if (echo) { + SetConsoleMode(handle, dwMode | ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT); + } else { + SetConsoleMode(handle, + dwMode & ~(ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT)); + } +} From 0cf17e181798063c3824c8200ba46f25f54faa1a Mon Sep 
17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 14 Nov 2013 11:54:17 +0100 Subject: [PATCH 16/93] qemu-io: use readline.c Use readline.c for command-line history. There was support for GNU Readline and BSD Editline but it was never compiled in. Since QEMU has its own readline.c, just use that when qemu-io runs with stdin attached to a terminal. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- qemu-io.c | 113 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 63 insertions(+), 50 deletions(-) diff --git a/qemu-io.c b/qemu-io.c index 3b3340ab1b..d7c26d3ed0 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -18,6 +18,7 @@ #include "qemu/main-loop.h" #include "qemu/option.h" #include "qemu/config-file.h" +#include "qemu/readline.h" #include "block/block_int.h" #include "trace/control.h" @@ -32,6 +33,8 @@ extern int qemuio_misalign; static int ncmdline; static char **cmdline; +static ReadLineState *readline_state; + static int close_f(BlockDriverState *bs, int argc, char **argv) { bdrv_unref(bs); @@ -203,14 +206,6 @@ static void usage(const char *name) name); } - -#if defined(ENABLE_READLINE) -# include -# include -#elif defined(ENABLE_EDITLINE) -# include -#endif - static char *get_prompt(void) { static char prompt[FILENAME_MAX + 2 /*"> "*/ + 1 /*"\0"*/ ]; @@ -222,52 +217,47 @@ static char *get_prompt(void) return prompt; } -#if defined(ENABLE_READLINE) -static char *fetchline(void) +static void readline_printf_func(void *opaque, const char *fmt, ...) 
{ - char *line = readline(get_prompt()); - if (line && *line) { - add_history(line); + va_list ap; + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + +static void readline_flush_func(void *opaque) +{ + fflush(stdout); +} + +static void readline_func(void *opaque, const char *str, void *readline_opaque) +{ + char **line = readline_opaque; + *line = g_strdup(str); +} + +static void readline_completion_func(void *opaque, const char *str) +{ + /* No command or argument completion implemented yet */ +} + +static char *fetchline_readline(void) +{ + char *line = NULL; + + readline_start(readline_state, get_prompt(), 0, readline_func, &line); + while (!line) { + int ch = getchar(); + if (ch == EOF) { + break; + } + readline_handle_byte(readline_state, ch); } return line; } -#elif defined(ENABLE_EDITLINE) -static char *el_get_prompt(EditLine *e) -{ - return get_prompt(); -} -static char *fetchline(void) -{ - static EditLine *el; - static History *hist; - HistEvent hevent; - char *line; - int count; - - if (!el) { - hist = history_init(); - history(hist, &hevent, H_SETSIZE, 100); - el = el_init(progname, stdin, stdout, stderr); - el_source(el, NULL); - el_set(el, EL_SIGNAL, 1); - el_set(el, EL_PROMPT, el_get_prompt); - el_set(el, EL_HIST, history, (const char *)hist); - } - line = strdup(el_gets(el, &count)); - if (line) { - if (count > 0) { - line[count-1] = '\0'; - } - if (*line) { - history(hist, &hevent, H_ENTER, line); - } - } - return line; -} -#else -# define MAXREADLINESZ 1024 -static char *fetchline(void) +#define MAXREADLINESZ 1024 +static char *fetchline_fgets(void) { char *p, *line = g_malloc(MAXREADLINESZ); @@ -283,7 +273,15 @@ static char *fetchline(void) return line; } -#endif + +static char *fetchline(void) +{ + if (readline_state) { + return fetchline_readline(); + } else { + return fetchline_fgets(); + } +} static void prep_fetchline(void *opaque) { @@ -339,6 +337,11 @@ static void add_user_command(char *optarg) cmdline[ncmdline-1] = optarg; } 
+static void reenable_tty_echo(void) +{ + qemu_set_tty_echo(STDIN_FILENO, true); +} + int main(int argc, char **argv) { int readonly = 0; @@ -435,6 +438,15 @@ int main(int argc, char **argv) qemuio_add_command(&open_cmd); qemuio_add_command(&close_cmd); + if (isatty(STDIN_FILENO)) { + readline_state = readline_init(readline_printf_func, + readline_flush_func, + NULL, + readline_completion_func); + qemu_set_tty_echo(STDIN_FILENO, false); + atexit(reenable_tty_echo); + } + /* open the device */ if (!readonly) { flags |= BDRV_O_RDWR; @@ -453,5 +465,6 @@ int main(int argc, char **argv) if (qemuio_bs) { bdrv_unref(qemuio_bs); } + g_free(readline_state); return 0; } From 4694020d3c0d21f02408d5cc6f44b8fb55b4ee15 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 14 Nov 2013 11:54:18 +0100 Subject: [PATCH 17/93] qemu-io: add command completion Autocomplete qemu-io commands at the interactive prompt. Note this only completes command names and not their options. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- include/qemu-io.h | 3 +++ qemu-io-cmds.c | 15 +++++++++++++++ qemu-io.c | 8 +++++++- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/include/qemu-io.h b/include/qemu-io.h index a418b46a40..7e7c07c09b 100644 --- a/include/qemu-io.h +++ b/include/qemu-io.h @@ -42,5 +42,8 @@ bool qemuio_command(BlockDriverState *bs, const char *cmd); void qemuio_add_command(const cmdinfo_t *ci); int qemuio_command_usage(const cmdinfo_t *ci); +void qemuio_complete_command(const char *input, + void (*fn)(const char *cmd, void *opaque), + void *opaque); #endif /* QEMU_IO_H */ diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 85e4982bd8..6dfb4a51ae 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -94,6 +94,21 @@ static const cmdinfo_t *find_command(const char *cmd) return NULL; } +/* Invoke fn() for commands with a matching prefix */ +void qemuio_complete_command(const char *input, + void (*fn)(const char *cmd, void *opaque), + void *opaque) +{ + cmdinfo_t 
*ct; + size_t input_len = strlen(input); + + for (ct = cmdtab; ct < &cmdtab[ncmds]; ct++) { + if (strncmp(input, ct->name, input_len) == 0) { + fn(ct->name, opaque); + } + } +} + static char **breakline(char *input, int *count) { int c = 0; diff --git a/qemu-io.c b/qemu-io.c index d7c26d3ed0..fdc46a97e8 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -236,9 +236,15 @@ static void readline_func(void *opaque, const char *str, void *readline_opaque) *line = g_strdup(str); } +static void completion_match(const char *cmd, void *opaque) +{ + readline_add_completion(readline_state, cmd); +} + static void readline_completion_func(void *opaque, const char *str) { - /* No command or argument completion implemented yet */ + readline_set_completion_index(readline_state, strlen(str)); + qemuio_complete_command(str, completion_match, NULL); } static char *fetchline_readline(void) From 466b49f276310952ad64485d8b9fa87a5c8a9451 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:01 +0100 Subject: [PATCH 18/93] blkdebug: Use errp for read_config() Use an Error variable in the read_config() function. 
Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/blkdebug.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index ebc5f13464..2eb2e8beb0 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -271,7 +271,7 @@ static void remove_rule(BlkdebugRule *rule) g_free(rule); } -static int read_config(BDRVBlkdebugState *s, const char *filename) +static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp) { FILE *f; int ret; @@ -279,11 +279,14 @@ static int read_config(BDRVBlkdebugState *s, const char *filename) f = fopen(filename, "r"); if (f == NULL) { + error_setg_errno(errp, errno, "Could not read blkdebug config file"); return -errno; } ret = qemu_config_parse(f, config_groups, filename); if (ret < 0) { + error_setg(errp, "Could not parse blkdebug config file"); + ret = -EINVAL; goto fail; } @@ -370,9 +373,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, /* Read rules from config file */ config = qemu_opt_get(opts, "config"); if (config) { - ret = read_config(s, config); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not read blkdebug config file"); + ret = read_config(s, config, errp); + if (ret) { goto fail; } } From d4881b9bcbbadc83ffa5d8e6d2d6deb36cd8faa6 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:02 +0100 Subject: [PATCH 19/93] blkdebug: Don't require sophisticated filename If the filename is not prefixed by "blkdebug:" in blkdebug_parse_filename(), the blkdebug driver was not selected through that protocol prefix, but by an explicit command line option (file.driver=blkdebug or something similar). Contrary to the current reaction, this is not a problem at all; we just need to store the filename (in the x-image option) and can go on; the user just has to manually specify the config option. 
Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/blkdebug.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index 2eb2e8beb0..fab76ceb74 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -313,7 +313,9 @@ static void blkdebug_parse_filename(const char *filename, QDict *options, /* Parse the blkdebug: prefix */ if (!strstart(filename, "blkdebug:", &filename)) { - error_setg(errp, "File name string must start with 'blkdebug:'"); + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); return; } From 05a8c2227157eda2540404999c4615d3bf343c18 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:03 +0100 Subject: [PATCH 20/93] qdict: Add qdict_array_split() This function splits a QDict consisting of entries prefixed by incrementally enumerated indices into a QList of QDicts. 
Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- include/qapi/qmp/qdict.h | 1 + qobject/qdict.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/qapi/qmp/qdict.h b/include/qapi/qmp/qdict.h index 5cefd8022a..1ddf97b1c3 100644 --- a/include/qapi/qmp/qdict.h +++ b/include/qapi/qmp/qdict.h @@ -68,5 +68,6 @@ QDict *qdict_clone_shallow(const QDict *src); void qdict_flatten(QDict *qdict); void qdict_extract_subqdict(QDict *src, QDict **dst, const char *start); +void qdict_array_split(QDict *src, QList **dst); #endif /* QDICT_H */ diff --git a/qobject/qdict.c b/qobject/qdict.c index 17e14f08b1..2d5848d7d7 100644 --- a/qobject/qdict.c +++ b/qobject/qdict.c @@ -554,3 +554,40 @@ void qdict_extract_subqdict(QDict *src, QDict **dst, const char *start) entry = next; } } + +/** + * qdict_array_split(): This function moves array-like elements of a QDict into + * a new QList of QDicts. Every entry in the original QDict with a key prefixed + * "%u.", where %u designates an unsigned integer starting at 0 and + * incrementally counting up, will be moved to a new QDict at index %u in the + * output QList with the key prefix removed. The function terminates when there + * is no entry in the QDict with a prefix directly (incrementally) following the + * last one. 
+ * Example: {"0.a": 42, "0.b": 23, "1.x": 0, "3.y": 1, "o.o": 7} + * (or {"1.x": 0, "3.y": 1, "0.a": 42, "o.o": 7, "0.b": 23}) + * => [{"a": 42, "b": 23}, {"x": 0}] + * and {"3.y": 1, "o.o": 7} (remainder of the old QDict) + */ +void qdict_array_split(QDict *src, QList **dst) +{ + unsigned i; + + *dst = qlist_new(); + + for (i = 0; i < UINT_MAX; i++) { + QDict *subqdict; + char prefix[32]; + size_t snprintf_ret; + + snprintf_ret = snprintf(prefix, 32, "%u.", i); + assert(snprintf_ret < 32); + + qdict_extract_subqdict(src, &subqdict, prefix); + if (!qdict_size(subqdict)) { + QDECREF(subqdict); + break; + } + + qlist_append_obj(*dst, QOBJECT(subqdict)); + } +} From 9f23fc0c23ab16e9c16b41ed300786924f7a7768 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:04 +0100 Subject: [PATCH 21/93] qapi: extend qdict_flatten() for QLists Reversing qdict_array_split(), qdict_flatten() should flatten QLists as well by interpreting them as QDicts where every entry's key is its index. This allows bringing QDicts with QLists from QMP commands to the same form as they would be given as command-line options, thereby allowing them to be parsed the same way. 
Signed-off-by: Max Reitz Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- qobject/qdict.c | 54 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/qobject/qdict.c b/qobject/qdict.c index 2d5848d7d7..a3924f24bd 100644 --- a/qobject/qdict.c +++ b/qobject/qdict.c @@ -477,7 +477,43 @@ static void qdict_destroy_obj(QObject *obj) g_free(qdict); } -static void qdict_do_flatten(QDict *qdict, QDict *target, const char *prefix) +static void qdict_flatten_qdict(QDict *qdict, QDict *target, + const char *prefix); + +static void qdict_flatten_qlist(QList *qlist, QDict *target, const char *prefix) +{ + QObject *value; + const QListEntry *entry; + char *new_key; + int i; + + /* This function is never called with prefix == NULL, i.e., it is always + * called from within qdict_flatten_q(list|dict)(). Therefore, it does not + * need to remove list entries during the iteration (the whole list will be + * deleted eventually anyway from qdict_flatten_qdict()). */ + assert(prefix); + + entry = qlist_first(qlist); + + for (i = 0; entry; entry = qlist_next(entry), i++) { + value = qlist_entry_obj(entry); + new_key = g_strdup_printf("%s.%i", prefix, i); + + if (qobject_type(value) == QTYPE_QDICT) { + qdict_flatten_qdict(qobject_to_qdict(value), target, new_key); + } else if (qobject_type(value) == QTYPE_QLIST) { + qdict_flatten_qlist(qobject_to_qlist(value), target, new_key); + } else { + /* All other types are moved to the target unchanged. */ + qobject_incref(value); + qdict_put_obj(target, new_key, value); + } + + g_free(new_key); + } +} + +static void qdict_flatten_qdict(QDict *qdict, QDict *target, const char *prefix) { QObject *value; const QDictEntry *entry, *next; @@ -500,8 +536,12 @@ static void qdict_do_flatten(QDict *qdict, QDict *target, const char *prefix) if (qobject_type(value) == QTYPE_QDICT) { /* Entries of QDicts are processed recursively, the QDict object * itself disappears. 
*/ - qdict_do_flatten(qobject_to_qdict(value), target, - new_key ? new_key : entry->key); + qdict_flatten_qdict(qobject_to_qdict(value), target, + new_key ? new_key : entry->key); + delete = true; + } else if (qobject_type(value) == QTYPE_QLIST) { + qdict_flatten_qlist(qobject_to_qlist(value), target, + new_key ? new_key : entry->key); delete = true; } else if (prefix) { /* All other objects are moved to the target unchanged. */ @@ -526,12 +566,14 @@ static void qdict_do_flatten(QDict *qdict, QDict *target, const char *prefix) /** * qdict_flatten(): For each nested QDict with key x, all fields with key y - * are moved to this QDict and their key is renamed to "x.y". This operation - * is applied recursively for nested QDicts. + * are moved to this QDict and their key is renamed to "x.y". For each nested + * QList with key x, the field at index y is moved to this QDict with the key + * "x.y" (i.e., the reverse of what qdict_array_split() does). + * This operation is applied recursively for nested QDicts and QLists. */ void qdict_flatten(QDict *qdict) { - qdict_do_flatten(qdict, qdict, NULL); + qdict_flatten_qdict(qdict, qdict, NULL); } /* extract all the src QDict entries starting by start into dst */ From adf5c449e5beb163999e4ba7366d5f9aebb504a1 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:05 +0100 Subject: [PATCH 22/93] qemu-option: Add qemu_config_parse_qdict() This function basically parses command-line options given as a QDict replacing a config file. For instance, the QDict {"section.opt1": 42, "section.opt2": 23} corresponds to the config file: [section] opt1 = 42 opt2 = 23 It is possible to specify multiple sections and also multiple sections of the same type. 
On the command line, this looks like the following: inject-error.0.event=reftable_load,\ inject-error.1.event=l2_load,\ set-state.event=l1_update This would correspond to the following config file: [inject-error "inject-error.0"] event = reftable_load [inject-error "inject-error.1"] event = l2_load [set-state] event = l1_update Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- include/qemu/config-file.h | 6 +++ util/qemu-config.c | 100 +++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/include/qemu/config-file.h b/include/qemu/config-file.h index 508428ff32..dbd97c4bdb 100644 --- a/include/qemu/config-file.h +++ b/include/qemu/config-file.h @@ -4,6 +4,7 @@ #include #include "qemu/option.h" #include "qapi/error.h" +#include "qapi/qmp/qdict.h" QemuOptsList *qemu_find_opts(const char *group); QemuOptsList *qemu_find_opts_err(const char *group, Error **errp); @@ -18,6 +19,11 @@ int qemu_config_parse(FILE *fp, QemuOptsList **lists, const char *fname); int qemu_read_config_file(const char *filename); +/* Parse QDict options as a replacement for a config file (allowing multiple + enumerated (0..(n-1)) configuration "sections") */ +void qemu_config_parse_qdict(QDict *options, QemuOptsList **lists, + Error **errp); + /* Read default QEMU config files */ int qemu_read_default_config_files(bool userconfig); diff --git a/util/qemu-config.c b/util/qemu-config.c index 7973659518..9298f55ecf 100644 --- a/util/qemu-config.c +++ b/util/qemu-config.c @@ -356,3 +356,103 @@ int qemu_read_config_file(const char *filename) return -EINVAL; } } + +static void config_parse_qdict_section(QDict *options, QemuOptsList *opts, + Error **errp) +{ + QemuOpts *subopts; + QDict *subqdict; + QList *list = NULL; + Error *local_err = NULL; + size_t orig_size, enum_size; + char *prefix; + + prefix = g_strdup_printf("%s.", opts->name); + qdict_extract_subqdict(options, &subqdict, prefix); + g_free(prefix); + orig_size = qdict_size(subqdict); + if (!orig_size) { + 
goto out; + } + + subopts = qemu_opts_create(opts, NULL, 0, &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + goto out; + } + + qemu_opts_absorb_qdict(subopts, subqdict, &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + goto out; + } + + enum_size = qdict_size(subqdict); + if (enum_size < orig_size && enum_size) { + error_setg(errp, "Unknown option '%s' for [%s]", + qdict_first(subqdict)->key, opts->name); + goto out; + } + + if (enum_size) { + /* Multiple, enumerated sections */ + QListEntry *list_entry; + unsigned i = 0; + + /* Not required anymore */ + qemu_opts_del(subopts); + + qdict_array_split(subqdict, &list); + if (qdict_size(subqdict)) { + error_setg(errp, "Unused option '%s' for [%s]", + qdict_first(subqdict)->key, opts->name); + goto out; + } + + QLIST_FOREACH_ENTRY(list, list_entry) { + QDict *section = qobject_to_qdict(qlist_entry_obj(list_entry)); + char *opt_name; + + opt_name = g_strdup_printf("%s.%u", opts->name, i++); + subopts = qemu_opts_create(opts, opt_name, 1, &local_err); + g_free(opt_name); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + goto out; + } + + qemu_opts_absorb_qdict(subopts, section, &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + qemu_opts_del(subopts); + goto out; + } + + if (qdict_size(section)) { + error_setg(errp, "[%s] section doesn't support the option '%s'", + opts->name, qdict_first(section)->key); + qemu_opts_del(subopts); + goto out; + } + } + } + +out: + QDECREF(subqdict); + QDECREF(list); +} + +void qemu_config_parse_qdict(QDict *options, QemuOptsList **lists, + Error **errp) +{ + int i; + Error *local_err = NULL; + + for (i = 0; lists[i]; i++) { + config_parse_qdict_section(options, lists[i], &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + return; + } + } +} From 85a040e5485413333da4fcf98bc8b28c92fa623f Mon Sep 17 00:00:00 2001 From: Max Reitz Date: 
Fri, 20 Dec 2013 19:28:06 +0100 Subject: [PATCH 23/93] blkdebug: Always call read_config() Move the check whether there actually is a config file into the read_config() function. Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/blkdebug.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index fab76ceb74..acf23f2f07 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -273,21 +273,23 @@ static void remove_rule(BlkdebugRule *rule) static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp) { - FILE *f; + FILE *f = NULL; int ret; struct add_rule_data d; - f = fopen(filename, "r"); - if (f == NULL) { - error_setg_errno(errp, errno, "Could not read blkdebug config file"); - return -errno; - } + if (filename) { + f = fopen(filename, "r"); + if (f == NULL) { + error_setg_errno(errp, errno, "Could not read blkdebug config file"); + return -errno; + } - ret = qemu_config_parse(f, config_groups, filename); - if (ret < 0) { - error_setg(errp, "Could not parse blkdebug config file"); - ret = -EINVAL; - goto fail; + ret = qemu_config_parse(f, config_groups, filename); + if (ret < 0) { + error_setg(errp, "Could not parse blkdebug config file"); + ret = -EINVAL; + goto fail; + } } d.s = s; @@ -301,7 +303,9 @@ static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp) fail: qemu_opts_reset(&inject_error_opts); qemu_opts_reset(&set_state_opts); - fclose(f); + if (f) { + fclose(f); + } return ret; } @@ -374,11 +378,9 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, /* Read rules from config file */ config = qemu_opt_get(opts, "config"); - if (config) { - ret = read_config(s, config, errp); - if (ret) { - goto fail; - } + ret = read_config(s, config, errp); + if (ret) { + goto fail; } /* Set initial state */ From 89f2b21e36cce948c39fa7cf24226f6e5f042cc8 Mon Sep 
17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:07 +0100 Subject: [PATCH 24/93] blkdebug: Use command-line in read_config() Use qemu_config_parse_qdict() to parse the command-line options in addition to the config file. Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/blkdebug.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index acf23f2f07..0bf3bb518a 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -271,11 +271,13 @@ static void remove_rule(BlkdebugRule *rule) g_free(rule); } -static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp) +static int read_config(BDRVBlkdebugState *s, const char *filename, + QDict *options, Error **errp) { FILE *f = NULL; int ret; struct add_rule_data d; + Error *local_err = NULL; if (filename) { f = fopen(filename, "r"); @@ -292,6 +294,13 @@ static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp) } } + qemu_config_parse_qdict(options, config_groups, &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + d.s = s; d.action = ACTION_INJECT_ERROR; qemu_opts_foreach(&inject_error_opts, add_rule, &d, 0); @@ -376,9 +385,9 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - /* Read rules from config file */ + /* Read rules from config file or command line options */ config = qemu_opt_get(opts, "config"); - ret = read_config(s, config, errp); + ret = read_config(s, config, options, errp); if (ret) { goto fail; } From 72daa72eeecb6b2ee06ab7d836ac3aa01ad7e6df Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:08 +0100 Subject: [PATCH 25/93] block: Allow reference for bdrv_file_open() Allow specifying a reference to an existing block device (by name) for bdrv_file_open() instead of a filename and/or options. 
Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block.c | 25 ++++++++++++++++++++++--- block/blkdebug.c | 2 +- block/blkverify.c | 2 +- block/cow.c | 3 ++- block/qcow.c | 3 ++- block/qcow2.c | 2 +- block/qed.c | 4 ++-- block/sheepdog.c | 4 ++-- block/vhdx.c | 2 +- block/vmdk.c | 8 ++++---- include/block/block.h | 3 ++- qemu-io.c | 2 +- 12 files changed, 41 insertions(+), 19 deletions(-) diff --git a/block.c b/block.c index 64e7d220c6..1e53b3dffd 100644 --- a/block.c +++ b/block.c @@ -858,9 +858,10 @@ free_and_fail: * dictionary, it needs to use QINCREF() before calling bdrv_file_open. */ int bdrv_file_open(BlockDriverState **pbs, const char *filename, - QDict *options, int flags, Error **errp) + const char *reference, QDict *options, int flags, + Error **errp) { - BlockDriverState *bs; + BlockDriverState *bs = NULL; BlockDriver *drv; const char *drvname; bool allow_protocol_prefix = false; @@ -872,6 +873,24 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename, options = qdict_new(); } + if (reference) { + if (filename || qdict_size(options)) { + error_setg(errp, "Cannot reference an existing block device with " + "additional options or a new filename"); + return -EINVAL; + } + QDECREF(options); + + bs = bdrv_find(reference); + if (!bs) { + error_setg(errp, "Cannot find block device '%s'", reference); + return -ENODEV; + } + bdrv_ref(bs); + *pbs = bs; + return 0; + } + bs = bdrv_new(""); bs->options = options; options = qdict_clone_shallow(options); @@ -1124,7 +1143,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, qdict_extract_subqdict(options, &file_options, "file."); - ret = bdrv_file_open(&file, filename, file_options, + ret = bdrv_file_open(&file, filename, NULL, file_options, bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err); if (ret < 0) { goto fail; diff --git a/block/blkdebug.c b/block/blkdebug.c index 0bf3bb518a..21a4931594 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c 
@@ -403,7 +403,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_file_open(&bs->file, filename, NULL, flags, &local_err); + ret = bdrv_file_open(&bs->file, filename, NULL, NULL, flags, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto fail; diff --git a/block/blkverify.c b/block/blkverify.c index 1c1637f55e..e15ac4ca69 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -141,7 +141,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_file_open(&bs->file, raw, NULL, flags, &local_err); + ret = bdrv_file_open(&bs->file, raw, NULL, NULL, flags, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto fail; diff --git a/block/cow.c b/block/cow.c index dc15e46b6c..7fc0b12163 100644 --- a/block/cow.c +++ b/block/cow.c @@ -351,7 +351,8 @@ static int cow_create(const char *filename, QEMUOptionParameter *options, return ret; } - ret = bdrv_file_open(&cow_bs, filename, NULL, BDRV_O_RDWR, &local_err); + ret = bdrv_file_open(&cow_bs, filename, NULL, NULL, BDRV_O_RDWR, + &local_err); if (ret < 0) { qerror_report_err(local_err); error_free(local_err); diff --git a/block/qcow.c b/block/qcow.c index c470e05f60..948b0c5601 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -691,7 +691,8 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options, return ret; } - ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR, &local_err); + ret = bdrv_file_open(&qcow_bs, filename, NULL, NULL, BDRV_O_RDWR, + &local_err); if (ret < 0) { qerror_report_err(local_err); error_free(local_err); diff --git a/block/qcow2.c b/block/qcow2.c index 8ec9db10f8..e15a4dd057 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1483,7 +1483,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, return ret; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err); + ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, 
&local_err); if (ret < 0) { error_propagate(errp, local_err); return ret; diff --git a/block/qed.c b/block/qed.c index 450a1fa2e9..0dd5c5859e 100644 --- a/block/qed.c +++ b/block/qed.c @@ -563,8 +563,8 @@ static int qed_create(const char *filename, uint32_t cluster_size, return ret; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB, - &local_err); + ret = bdrv_file_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB, &local_err); if (ret < 0) { qerror_report_err(local_err); error_free(local_err); diff --git a/block/sheepdog.c b/block/sheepdog.c index 6088fa5571..2ce3d9b9fb 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1534,7 +1534,7 @@ static int sd_prealloc(const char *filename) Error *local_err = NULL; int ret; - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err); + ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err); if (ret < 0) { qerror_report_err(local_err); error_free(local_err); @@ -1695,7 +1695,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options, goto out; } - ret = bdrv_file_open(&bs, backing_file, NULL, 0, &local_err); + ret = bdrv_file_open(&bs, backing_file, NULL, NULL, 0, &local_err); if (ret < 0) { qerror_report_err(local_err); error_free(local_err); diff --git a/block/vhdx.c b/block/vhdx.c index 1995778945..9ee0a612ff 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1797,7 +1797,7 @@ static int vhdx_create(const char *filename, QEMUOptionParameter *options, goto exit; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err); + ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto exit; diff --git a/block/vmdk.c b/block/vmdk.c index 22b99b04ac..599a928545 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -769,8 +769,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, path_combine(extent_path, sizeof(extent_path), desc_file_path, 
fname); - ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags, - errp); + ret = bdrv_file_open(&extent_file, extent_path, NULL, NULL, + bs->open_flags, errp); if (ret) { return ret; } @@ -1469,7 +1469,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, goto exit; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err); + ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto exit; @@ -1807,7 +1807,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options, goto exit; } } - ret = bdrv_file_open(&new_bs, filename, NULL, BDRV_O_RDWR, &local_err); + ret = bdrv_file_open(&new_bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write description"); goto exit; diff --git a/include/block/block.h b/include/block/block.h index 36efaeac2d..e2b2a15f9a 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -184,7 +184,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top); int bdrv_parse_cache_flags(const char *mode, int *flags); int bdrv_parse_discard_flags(const char *mode, int *flags); int bdrv_file_open(BlockDriverState **pbs, const char *filename, - QDict *options, int flags, Error **errp); + const char *reference, QDict *options, int flags, + Error **errp); int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp); int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, int flags, BlockDriver *drv, Error **errp); diff --git a/qemu-io.c b/qemu-io.c index fdc46a97e8..bfb773e70f 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -59,7 +59,7 @@ static int openfile(char *name, int flags, int growable, QDict *opts) } if (growable) { - if (bdrv_file_open(&qemuio_bs, name, opts, flags, &local_err)) { + if (bdrv_file_open(&qemuio_bs, name, NULL, opts, flags, &local_err)) { fprintf(stderr, "%s: can't open device %s: %s\n", 
progname, name, error_get_pretty(local_err)); error_free(local_err); From 2258e3fe20990a13c9aa2c1adccafae073b7ce13 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:09 +0100 Subject: [PATCH 26/93] block: Pass reference to bdrv_file_open() With that now being possible, bdrv_open() should try to extract a block device reference from the options and pass it to bdrv_file_open(). Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 1e53b3dffd..bef4f8232b 100644 --- a/block.c +++ b/block.c @@ -1056,6 +1056,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, char tmp_filename[PATH_MAX + 1]; BlockDriverState *file = NULL; QDict *file_options = NULL; + const char *file_reference; const char *drvname; Error *local_err = NULL; @@ -1142,9 +1143,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, } qdict_extract_subqdict(options, &file_options, "file."); + file_reference = qdict_get_try_str(options, "file"); - ret = bdrv_file_open(&file, filename, NULL, file_options, + ret = bdrv_file_open(&file, filename, file_reference, file_options, bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err); + qdict_del(options, "file"); if (ret < 0) { goto fail; } From 2a05cbe426a7a3ddec63dbc67c9ac93013aebf77 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:10 +0100 Subject: [PATCH 27/93] block: Allow block devices without files blkdebug and blkverify will, in order to retain compatibility, not support the field "file" implicitly through bdrv_open(). In order to be able to use those drivers without giving a filename anyway, it is necessary to be able to have block devices without files implicitly opened by bdrv_open(). This is the case, if there was neither a file name, a reference to an existing block device to use as a file nor options specific to the file. 
Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/block.c b/block.c index bef4f8232b..7464fb237e 100644 --- a/block.c +++ b/block.c @@ -1145,11 +1145,14 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, qdict_extract_subqdict(options, &file_options, "file."); file_reference = qdict_get_try_str(options, "file"); - ret = bdrv_file_open(&file, filename, file_reference, file_options, - bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err); - qdict_del(options, "file"); - if (ret < 0) { - goto fail; + if (filename || file_reference || qdict_size(file_options)) { + ret = bdrv_file_open(&file, filename, file_reference, file_options, + bdrv_open_flags(bs, flags | BDRV_O_UNMAP), + &local_err); + qdict_del(options, "file"); + if (ret < 0) { + goto fail; + } } /* Find the right image format driver */ @@ -1165,7 +1168,13 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, } if (!drv) { - ret = find_image_format(file, filename, &drv, &local_err); + if (file) { + ret = find_image_format(file, filename, &drv, &local_err); + } else { + error_setg(errp, "Must specify either driver or file"); + ret = -EINVAL; + goto unlink_and_fail; + } } if (!drv) { @@ -1178,7 +1187,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, goto unlink_and_fail; } - if (bs->file != file) { + if (file && (bs->file != file)) { bdrv_unref(file); file = NULL; } From da557aac181fa71fde6a2a7c7a1eb2aea20caf64 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:11 +0100 Subject: [PATCH 28/93] block: Add bdrv_open_image() Add a common function for opening images to be used for block drivers specified through BlockdevRefs in an option QDict. 
The difference from bdrv_file_open() is that this function may invoke bdrv_open() instead, allowing auto-detection of the driver to be used; and second, it automatically extracts the BlockdevRef from the option QDict. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 73 +++++++++++++++++++++++++++++++++++++++++++ include/block/block.h | 3 ++ 2 files changed, 76 insertions(+) diff --git a/block.c b/block.c index 7464fb237e..76b6c25d90 100644 --- a/block.c +++ b/block.c @@ -1040,6 +1040,79 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp) return 0; } +/* + * Opens a disk image whose options are given as BlockdevRef in another block + * device's options. + * + * If force_raw is true, bdrv_file_open() will be used, thereby preventing any + * image format auto-detection. If it is false and a filename is given, + * bdrv_open() will be used for auto-detection. + * + * If allow_none is true, no image will be opened if filename is NULL and no + * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned. + * + * bdref_key specifies the key for the image's BlockdevRef in the options QDict. + * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict + * itself, all options starting with "${bdref_key}." are considered part of the + * BlockdevRef. + * + * The BlockdevRef will be removed from the options QDict.
+ */ +int bdrv_open_image(BlockDriverState **pbs, const char *filename, + QDict *options, const char *bdref_key, int flags, + bool force_raw, bool allow_none, Error **errp) +{ + QDict *image_options; + int ret; + char *bdref_key_dot; + const char *reference; + + bdref_key_dot = g_strdup_printf("%s.", bdref_key); + qdict_extract_subqdict(options, &image_options, bdref_key_dot); + g_free(bdref_key_dot); + + reference = qdict_get_try_str(options, bdref_key); + if (!filename && !reference && !qdict_size(image_options)) { + if (allow_none) { + ret = 0; + } else { + error_setg(errp, "A block device must be specified for \"%s\"", + bdref_key); + ret = -EINVAL; + } + goto done; + } + + if (filename && !force_raw) { + /* If a filename is given and the block driver should be detected + automatically (instead of using none), use bdrv_open() in order to do + that auto-detection. */ + BlockDriverState *bs; + + if (reference) { + error_setg(errp, "Cannot reference an existing block device while " + "giving a filename"); + ret = -EINVAL; + goto done; + } + + bs = bdrv_new(""); + ret = bdrv_open(bs, filename, image_options, flags, NULL, errp); + if (ret < 0) { + bdrv_unref(bs); + } else { + *pbs = bs; + } + } else { + ret = bdrv_file_open(pbs, filename, reference, image_options, flags, + errp); + } + +done: + qdict_del(options, bdref_key); + return ret; +} + /* * Opens a disk image (raw, qcow2, vmdk, ...) 
* diff --git a/include/block/block.h b/include/block/block.h index e2b2a15f9a..a47f3d4988 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -186,6 +186,9 @@ int bdrv_parse_discard_flags(const char *mode, int *flags); int bdrv_file_open(BlockDriverState **pbs, const char *filename, const char *reference, QDict *options, int flags, Error **errp); +int bdrv_open_image(BlockDriverState **pbs, const char *filename, + QDict *options, const char *bdref_key, int flags, + bool force_raw, bool allow_none, Error **errp); int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp); int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, int flags, BlockDriver *drv, Error **errp); From 054963f8f082695ecb1f169024c83ce3e4eea3d8 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:12 +0100 Subject: [PATCH 29/93] block: Use bdrv_open_image() in bdrv_open() Using bdrv_open_image() instead of bdrv_file_open() directly in bdrv_open() is easier. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/block.c b/block.c index 76b6c25d90..9e4e85f11f 100644 --- a/block.c +++ b/block.c @@ -1128,8 +1128,6 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. 
*/ char tmp_filename[PATH_MAX + 1]; BlockDriverState *file = NULL; - QDict *file_options = NULL; - const char *file_reference; const char *drvname; Error *local_err = NULL; @@ -1215,17 +1213,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, flags |= BDRV_O_ALLOW_RDWR; } - qdict_extract_subqdict(options, &file_options, "file."); - file_reference = qdict_get_try_str(options, "file"); - - if (filename || file_reference || qdict_size(file_options)) { - ret = bdrv_file_open(&file, filename, file_reference, file_options, - bdrv_open_flags(bs, flags | BDRV_O_UNMAP), - &local_err); - qdict_del(options, "file"); - if (ret < 0) { - goto fail; - } + ret = bdrv_open_image(&file, filename, options, "file", + bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true, + &local_err); + if (ret < 0) { + goto fail; } /* Find the right image format driver */ From 505d758334afcee07eb40aa1b33f2353c612c8ec Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:13 +0100 Subject: [PATCH 30/93] block: Allow recursive "file"s It should be possible to use a format as a driver for a file which in turn requires another file, i.e., nesting file formats. Allowing nested file formats results in e.g. qcow2 BlockDriverStates never being directly passed to bdrv_open_common() from bdrv_file_open(), but instead being handed through bdrv_open(). This changes the error message when trying to give a filename to qcow2, i.e. trying to use it as a driver for the protocol level. Therefore, change the reference output of I/O test 051 accordingly. 
Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 9 +++++++-- tests/qemu-iotests/051.out | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index 9e4e85f11f..6af5f6e39f 100644 --- a/block.c +++ b/block.c @@ -948,14 +948,19 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename, goto fail; } - ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err); + if (!drv->bdrv_file_open) { + ret = bdrv_open(bs, filename, options, flags, drv, &local_err); + options = NULL; + } else { + ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err); + } if (ret < 0) { error_propagate(errp, local_err); goto fail; } /* Check if any unknown options were used */ - if (qdict_size(options) != 0) { + if (options && (qdict_size(options) != 0)) { const QDictEntry *entry = qdict_first(options); error_setg(errp, "Block protocol '%s' doesn't support the option '%s'", drv->format_name, entry->key); diff --git a/tests/qemu-iotests/051.out b/tests/qemu-iotests/051.out index c2cadba2fc..d0c5173626 100644 --- a/tests/qemu-iotests/051.out +++ b/tests/qemu-iotests/051.out @@ -222,7 +222,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) qququiquit Testing: -drive file=TEST_DIR/t.qcow2,file.driver=qcow2 -QEMU_PROG: -drive file=TEST_DIR/t.qcow2,file.driver=qcow2: could not open disk image TEST_DIR/t.qcow2: Can't use 'qcow2' as a block driver for the protocol level +QEMU_PROG: -drive file=TEST_DIR/t.qcow2,file.driver=qcow2: could not open disk image TEST_DIR/t.qcow2: Block format 'qcow2' used by device '' doesn't support the option 'filename' === Parsing protocol from file name === From d095b465339b79929fd2adc25c0ab3598e80fd39 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:14 +0100 Subject: [PATCH 31/93] blockdev: Move "file" to legacy_opts Specifying the image filename through the "file" option is a legacy option and should not be supported by blockdev-add (in that case, giving a string 
for "file" references an existing block device). Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- blockdev.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/blockdev.c b/blockdev.c index e457494342..386109a8d4 100644 --- a/blockdev.c +++ b/blockdev.c @@ -307,12 +307,11 @@ static bool check_throttle_config(ThrottleConfig *cfg, Error **errp) typedef enum { MEDIA_DISK, MEDIA_CDROM } DriveMediaType; /* Takes the ownership of bs_opts */ -static DriveInfo *blockdev_init(QDict *bs_opts, +static DriveInfo *blockdev_init(const char *file, QDict *bs_opts, BlockInterfaceType type, Error **errp) { const char *buf; - const char *file = NULL; const char *serial; int ro = 0; int bdrv_flags = 0; @@ -354,7 +353,6 @@ static DriveInfo *blockdev_init(QDict *bs_opts, ro = qemu_opt_get_bool(opts, "read-only", 0); copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false); - file = qemu_opt_get(opts, "file"); serial = qemu_opt_get(opts, "serial"); if ((buf = qemu_opt_get(opts, "discard")) != NULL) { @@ -599,6 +597,10 @@ QemuOptsList qemu_legacy_drive_opts = { .name = "addr", .type = QEMU_OPT_STRING, .help = "pci address (virtio only)", + },{ + .name = "file", + .type = QEMU_OPT_STRING, + .help = "file name", }, /* Options that are passed on, but have special semantics with -drive */ @@ -629,6 +631,7 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type) const char *devaddr; bool read_only = false; bool copy_on_read; + const char *filename; Error *local_err = NULL; /* Change legacy command line options into QMP ones */ @@ -867,8 +870,10 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type) } } + filename = qemu_opt_get(legacy_opts, "file"); + /* Actual block device init: Functionality shared with blockdev-add */ - dinfo = blockdev_init(bs_opts, type, &local_err); + dinfo = blockdev_init(filename, bs_opts, type, &local_err); if (dinfo == NULL) { if 
(error_is_set(&local_err)) { qerror_report_err(local_err); @@ -2210,7 +2215,7 @@ void qmp_blockdev_add(BlockdevOptions *options, Error **errp) qdict_flatten(qdict); - blockdev_init(qdict, IF_NONE, &local_err); + blockdev_init(NULL, qdict, IF_NONE, &local_err); if (error_is_set(&local_err)) { error_propagate(errp, local_err); goto fail; @@ -2250,10 +2255,6 @@ QemuOptsList qemu_common_drive_opts = { .name = "snapshot", .type = QEMU_OPT_BOOL, .help = "enable/disable snapshot mode", - },{ - .name = "file", - .type = QEMU_OPT_STRING, - .help = "disk image", },{ .name = "discard", .type = QEMU_OPT_STRING, From 4373593d5111a8ed3b6d47ad4a458ee28ec942e3 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:15 +0100 Subject: [PATCH 32/93] blkdebug: Allow command-line file configuration Introduce the "image" option as an alternative to specifying the image through the filename. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block/blkdebug.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index 21a4931594..c8f8d56758 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -374,7 +374,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, BDRVBlkdebugState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename, *config; + const char *config; int ret; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); @@ -396,14 +396,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, s->state = 1; /* Open the backing file */ - filename = qemu_opt_get(opts, "x-image"); - if (filename == NULL) { - error_setg(errp, "Could not retrieve image file name"); - ret = -EINVAL; - goto fail; - } - - ret = bdrv_file_open(&bs->file, filename, NULL, NULL, flags, &local_err); + ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image", + flags, true, false, &local_err); if (ret < 0) { error_propagate(errp, 
local_err); goto fail; From 70b6198acc9643c3ce801e5cf4c24274722f2f4a Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:16 +0100 Subject: [PATCH 33/93] blkverify: Allow command-line configuration Introduce the "test" and "raw" options for specifying images. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block/blkverify.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/block/blkverify.c b/block/blkverify.c index e15ac4ca69..dc14290420 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -122,7 +122,6 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, BDRVBlkverifyState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename, *raw; int ret; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); @@ -133,33 +132,19 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - /* Parse the raw image filename */ - raw = qemu_opt_get(opts, "x-raw"); - if (raw == NULL) { - error_setg(errp, "Could not retrieve raw image filename"); - ret = -EINVAL; - goto fail; - } - - ret = bdrv_file_open(&bs->file, raw, NULL, NULL, flags, &local_err); + /* Open the raw file */ + ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options, + "raw", flags, true, false, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto fail; } /* Open the test file */ - filename = qemu_opt_get(opts, "x-image"); - if (filename == NULL) { - error_setg(errp, "Could not retrieve test image filename"); - ret = -EINVAL; - goto fail; - } - - s->test_file = bdrv_new(""); - ret = bdrv_open(s->test_file, filename, NULL, flags, NULL, &local_err); + ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options, + "test", flags, false, false, &local_err); if (ret < 0) { error_propagate(errp, local_err); - bdrv_unref(s->test_file); s->test_file = NULL; goto fail; } From 22511ad681348cc4e500ebafdc324b0909d41c95 Mon 
Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:17 +0100 Subject: [PATCH 34/93] blkverify: Don't require protocol filename If the filename is not prefixed by "blkverify:" in blkverify_parse_filename(), the blkverify driver was not selected through that protocol prefix, but by an explicit command line (or QMP) option (like driver=blkverify). If blkverify_parse_filename() has been called, a filename has been given. If it is not prefixed, it is probably really just a plain filename. This is no problem, since we can use it as the test image filename and rely on the user to specify the raw image filename through the new corresponding option. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- block/blkverify.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blkverify.c b/block/blkverify.c index dc14290420..a2e8f5f138 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -78,7 +78,9 @@ static void blkverify_parse_filename(const char *filename, QDict *options, /* Parse the blkverify: prefix */ if (!strstart(filename, "blkverify:", &filename)) { - error_setg(errp, "File name string must start with 'blkverify:'"); + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); return; } From 8592a545b61b99114a86ee7cecef7a5f284d1b6c Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:18 +0100 Subject: [PATCH 35/93] qapi: Add "errno" to the list of polluted words Using "errno" directly as an identifier results in various syntax errors; therefore it should be added to the list of polluted words. 
Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- scripts/qapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/qapi.py b/scripts/qapi.py index 750e9fb552..9b3de4c7c3 100644 --- a/scripts/qapi.py +++ b/scripts/qapi.py @@ -247,7 +247,7 @@ def c_var(name, protect=True): 'and', 'and_eq', 'bitand', 'bitor', 'compl', 'not', 'not_eq', 'or', 'or_eq', 'xor', 'xor_eq']) # namespace pollution: - polluted_words = set(['unix']) + polluted_words = set(['unix', 'errno']) if protect and (name in c89_words | c99_words | c11_words | gcc_words | cpp_words | polluted_words): return "q_" + name return name.replace('-', '_').lstrip("*") From 1bf20b8280186299c750018bbfa3b52f4afd71ea Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:19 +0100 Subject: [PATCH 36/93] qapi: QMP interface for blkdebug and blkverify Add structures to support blkdebug and blkverify in blockdev-add. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- qapi-schema.json | 113 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 4 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index f27c48a285..35f7b34410 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4200,6 +4200,113 @@ '*pass-discard-snapshot': 'bool', '*pass-discard-other': 'bool' } } +## +# @BlkdebugEvent +# +# Trigger events supported by blkdebug. 
+## +{ 'enum': 'BlkdebugEvent', + 'data': [ 'l1_update', 'l1_grow.alloc_table', 'l1_grow.write_table', + 'l1_grow.activate_table', 'l2_load', 'l2_update', + 'l2_update_compressed', 'l2_alloc.cow_read', 'l2_alloc.write', + 'read_aio', 'read_backing_aio', 'read_compressed', 'write_aio', + 'write_compressed', 'vmstate_load', 'vmstate_save', 'cow_read', + 'cow_write', 'reftable_load', 'reftable_grow', 'reftable_update', + 'refblock_load', 'refblock_update', 'refblock_update_part', + 'refblock_alloc', 'refblock_alloc.hookup', 'refblock_alloc.write', + 'refblock_alloc.write_blocks', 'refblock_alloc.write_table', + 'refblock_alloc.switch_table', 'cluster_alloc', + 'cluster_alloc_bytes', 'cluster_free', 'flush_to_os', + 'flush_to_disk' ] } + +## +# @BlkdebugInjectErrorOptions +# +# Describes a single error injection for blkdebug. +# +# @event: trigger event +# +# @state: #optional the state identifier blkdebug needs to be in to +# actually trigger the event; defaults to "any" +# +# @errno: #optional error identifier (errno) to be returned; defaults to +# EIO +# +# @sector: #optional specifies the sector index which has to be affected +# in order to actually trigger the event; defaults to "any +# sector" +# +# @once: #optional disables further events after this one has been +# triggered; defaults to false +# +# @immediately: #optional fail immediately; defaults to false +# +# Since: 2.0 +## +{ 'type': 'BlkdebugInjectErrorOptions', + 'data': { 'event': 'BlkdebugEvent', + '*state': 'int', + '*errno': 'int', + '*sector': 'int', + '*once': 'bool', + '*immediately': 'bool' } } + +## +# @BlkdebugSetStateOptions +# +# Describes a single state-change event for blkdebug. 
+# +# @event: trigger event +# +# @state: #optional the current state identifier blkdebug needs to be in; +# defaults to "any" +# +# @new_state: the state identifier blkdebug is supposed to assume if +# this event is triggered +# +# Since: 2.0 +## +{ 'type': 'BlkdebugSetStateOptions', + 'data': { 'event': 'BlkdebugEvent', + '*state': 'int', + 'new_state': 'int' } } + +## +# @BlockdevOptionsBlkdebug +# +# Driver specific block device options for blkdebug. +# +# @image: underlying raw block device (or image file) +# +# @config: #optional filename of the configuration file +# +# @inject-error: #optional array of error injection descriptions +# +# @set-state: #optional array of state-change descriptions +# +# Since: 2.0 +## +{ 'type': 'BlockdevOptionsBlkdebug', + 'data': { 'image': 'BlockdevRef', + '*config': 'str', + '*inject-error': ['BlkdebugInjectErrorOptions'], + '*set-state': ['BlkdebugSetStateOptions'] } } + +## +# @BlockdevOptionsBlkverify +# +# Driver specific block device options for blkverify. +# +# @test: block device to be tested +# +# @raw: raw image used for verification +# +# Since: 2.0 +## +{ 'type': 'BlockdevOptionsBlkverify', + 'data': { 'test': 'BlockdevRef', + 'raw': 'BlockdevRef' } } + ## # @BlockdevOptions # @@ -4224,10 +4331,8 @@ # TODO sheepdog: Wait for structured options # TODO ssh: Should take InetSocketAddress for 'host'? 
'vvfat': 'BlockdevOptionsVVFAT', - -# TODO blkdebug: Wait for structured options -# TODO blkverify: Wait for structured options - + 'blkdebug': 'BlockdevOptionsBlkdebug', + 'blkverify': 'BlockdevOptionsBlkverify', 'bochs': 'BlockdevOptionsGenericFormat', 'cloop': 'BlockdevOptionsGenericFormat', 'cow': 'BlockdevOptionsGenericCOWFormat', From fd0fee34b5ae7699dc558c12ddc3663bdb580060 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:20 +0100 Subject: [PATCH 37/93] qemu-io: Make filename optional Giving a filename is actually not essential, since it can be specified through the options as well - on the contrary: Sometimes a filename must not be given. Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- qemu-io.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/qemu-io.c b/qemu-io.c index bfb773e70f..d6690289b8 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -163,11 +163,13 @@ static int open_f(BlockDriverState *bs, int argc, char **argv) flags |= BDRV_O_RDWR; } - if (optind != argc - 1) { + if (optind == argc - 1) { + return openfile(argv[optind], flags, growable, opts); + } else if (optind == argc) { + return openfile(NULL, flags, growable, opts); + } else { return qemuio_command_usage(&open_cmd); } - - return openfile(argv[optind], flags, growable, opts); } static int quit_f(BlockDriverState *bs, int argc, char **argv) From be331341a1f35c2de2fcc05cc78e0342d2edeb8a Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:21 +0100 Subject: [PATCH 38/93] tests: Add test for qdict_array_split() Add a test case for qdict_array_split() in tests/check-qdict.c. 
Signed-off-by: Max Reitz Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- tests/check-qdict.c | 80 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/check-qdict.c b/tests/check-qdict.c index dc5f05a85f..cab7dd093d 100644 --- a/tests/check-qdict.c +++ b/tests/check-qdict.c @@ -227,6 +227,85 @@ static void qdict_iterapi_test(void) QDECREF(tests_dict); } +static void qdict_array_split_test(void) +{ + QDict *test_dict = qdict_new(); + QDict *dict1, *dict2; + QList *test_list; + + /* + * Test the split of + * + * { + * "1.x": 0, + * "3.y": 1, + * "0.a": 42, + * "o.o": 7, + * "0.b": 23 + * } + * + * to + * + * [ + * { + * "a": 42, + * "b": 23 + * }, + * { + * "x": 0 + * } + * ] + * + * and + * + * { + * "3.y": 1, + * "o.o": 7 + * } + * + * (remaining in the old QDict) + * + * This example is given in the comment of qdict_array_split(). + */ + + qdict_put(test_dict, "1.x", qint_from_int(0)); + qdict_put(test_dict, "3.y", qint_from_int(1)); + qdict_put(test_dict, "0.a", qint_from_int(42)); + qdict_put(test_dict, "o.o", qint_from_int(7)); + qdict_put(test_dict, "0.b", qint_from_int(23)); + + qdict_array_split(test_dict, &test_list); + + dict1 = qobject_to_qdict(qlist_pop(test_list)); + dict2 = qobject_to_qdict(qlist_pop(test_list)); + + g_assert(dict1); + g_assert(dict2); + g_assert(qlist_empty(test_list)); + + QDECREF(test_list); + + g_assert(qdict_get_int(dict1, "a") == 42); + g_assert(qdict_get_int(dict1, "b") == 23); + + g_assert(qdict_size(dict1) == 2); + + QDECREF(dict1); + + g_assert(qdict_get_int(dict2, "x") == 0); + + g_assert(qdict_size(dict2) == 1); + + QDECREF(dict2); + + g_assert(qdict_get_int(test_dict, "3.y") == 1); + g_assert(qdict_get_int(test_dict, "o.o") == 7); + + g_assert(qdict_size(test_dict) == 2); + + QDECREF(test_dict); +} + /* * Errors test-cases */ @@ -365,6 +444,7 @@ int main(int argc, char **argv) g_test_add_func("/public/del", qdict_del_test); g_test_add_func("/public/to_qdict", 
qobject_to_qdict_test); g_test_add_func("/public/iterapi", qdict_iterapi_test); + g_test_add_func("/public/array_split", qdict_array_split_test); g_test_add_func("/errors/put_exists", qdict_put_exists_test); g_test_add_func("/errors/get_not_exists", qdict_get_not_exists_test); From 3fb11779ca5f1d601adeb5870ba79e61e81a4cce Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:22 +0100 Subject: [PATCH 39/93] tests: Add test for qdict_flatten() Add a test case for qdict_flatten() in tests/check-qdict.c. This test case covers the flattening of subordinate QLists as well. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- tests/check-qdict.c | 76 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tests/check-qdict.c b/tests/check-qdict.c index cab7dd093d..7a7461b0b2 100644 --- a/tests/check-qdict.c +++ b/tests/check-qdict.c @@ -227,6 +227,81 @@ static void qdict_iterapi_test(void) QDECREF(tests_dict); } +static void qdict_flatten_test(void) +{ + QList *list1 = qlist_new(); + QList *list2 = qlist_new(); + QDict *dict1 = qdict_new(); + QDict *dict2 = qdict_new(); + QDict *dict3 = qdict_new(); + + /* + * Test the flattening of + * + * { + * "e": [ + * 42, + * [ + * 23, + * 66, + * { + * "a": 0, + * "b": 1 + * } + * ] + * ], + * "f": { + * "c": 2, + * "d": 3, + * }, + * "g": 4 + * } + * + * to + * + * { + * "e.0": 42, + * "e.1.0": 23, + * "e.1.1": 66, + * "e.1.2.a": 0, + * "e.1.2.b": 1, + * "f.c": 2, + * "f.d": 3, + * "g": 4 + * } + */ + + qdict_put(dict1, "a", qint_from_int(0)); + qdict_put(dict1, "b", qint_from_int(1)); + + qlist_append_obj(list1, QOBJECT(qint_from_int(23))); + qlist_append_obj(list1, QOBJECT(qint_from_int(66))); + qlist_append_obj(list1, QOBJECT(dict1)); + qlist_append_obj(list2, QOBJECT(qint_from_int(42))); + qlist_append_obj(list2, QOBJECT(list1)); + + qdict_put(dict2, "c", qint_from_int(2)); + qdict_put(dict2, "d", qint_from_int(3)); + qdict_put_obj(dict3, "e", 
QOBJECT(list2)); + qdict_put_obj(dict3, "f", QOBJECT(dict2)); + qdict_put(dict3, "g", qint_from_int(4)); + + qdict_flatten(dict3); + + g_assert(qdict_get_int(dict3, "e.0") == 42); + g_assert(qdict_get_int(dict3, "e.1.0") == 23); + g_assert(qdict_get_int(dict3, "e.1.1") == 66); + g_assert(qdict_get_int(dict3, "e.1.2.a") == 0); + g_assert(qdict_get_int(dict3, "e.1.2.b") == 1); + g_assert(qdict_get_int(dict3, "f.c") == 2); + g_assert(qdict_get_int(dict3, "f.d") == 3); + g_assert(qdict_get_int(dict3, "g") == 4); + + g_assert(qdict_size(dict3) == 8); + + QDECREF(dict3); +} + static void qdict_array_split_test(void) { QDict *test_dict = qdict_new(); @@ -444,6 +519,7 @@ int main(int argc, char **argv) g_test_add_func("/public/del", qdict_del_test); g_test_add_func("/public/to_qdict", qobject_to_qdict_test); g_test_add_func("/public/iterapi", qdict_iterapi_test); + g_test_add_func("/public/flatten", qdict_flatten_test); g_test_add_func("/public/array_split", qdict_array_split_test); g_test_add_func("/errors/put_exists", qdict_put_exists_test); From 30bd6a4dafe2f79909451ef5769561c9a9d3eaca Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 20 Dec 2013 19:28:23 +0100 Subject: [PATCH 40/93] iotests: Test new blkdebug/blkverify interface Add a test for the new blkdebug/blkverify interface. This test is not written in Python, although it uses QMP. This is because it invokes the qemu-io HMP command, which outputs errors to stderr instead of returning them through QMP. Filtering and testing that output is easier in a shell script than with the Python infrastructure. 
Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- tests/qemu-iotests/071 | 239 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/071.out | 90 ++++++++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 330 insertions(+) create mode 100755 tests/qemu-iotests/071 create mode 100644 tests/qemu-iotests/071.out diff --git a/tests/qemu-iotests/071 b/tests/qemu-iotests/071 new file mode 100755 index 0000000000..2a22546e1a --- /dev/null +++ b/tests/qemu-iotests/071 @@ -0,0 +1,239 @@ +#!/bin/bash +# +# Test case for the QMP blkdebug and blkverify interfaces +# +# Copyright (C) 2013 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +# creator +owner=mreitz@redhat.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +.
./common.filter + +_supported_fmt generic +_supported_proto generic +_supported_os Linux + +function do_run_qemu() +{ + echo Testing: "$@" | _filter_imgfmt + $QEMU -nographic -qmp stdio -serial none "$@" + echo +} + +function run_qemu() +{ + do_run_qemu "$@" 2>&1 | _filter_testdir | _filter_qmp | _filter_qemu_io +} + +IMG_SIZE=64M + +echo +echo "=== Testing blkverify through filename ===" +echo + +TEST_IMG="$TEST_IMG.base" IMGOPTS="" IMGFMT="raw" _make_test_img $IMG_SIZE |\ + _filter_imgfmt +_make_test_img $IMG_SIZE +$QEMU_IO -c "open -o file.driver=blkverify,file.raw.filename=$TEST_IMG.base $TEST_IMG" \ + -c 'read 0 512' -c 'write -P 42 0x38000 512' -c 'read -P 42 0x38000 512' | _filter_qemu_io + +$QEMU_IO -c 'write -P 42 0 512' "$TEST_IMG" | _filter_qemu_io + +$QEMU_IO -c "open -o file.driver=blkverify,file.raw.filename=$TEST_IMG.base $TEST_IMG" \ + -c 'read -P 42 0 512' | _filter_qemu_io + +echo +echo "=== Testing blkverify through file blockref ===" +echo + +TEST_IMG="$TEST_IMG.base" IMGOPTS="" IMGFMT="raw" _make_test_img $IMG_SIZE |\ + _filter_imgfmt +_make_test_img $IMG_SIZE +$QEMU_IO -c "open -o file.driver=blkverify,file.raw.filename=$TEST_IMG.base,file.test.driver=$IMGFMT,file.test.file.filename=$TEST_IMG" \ + -c 'read 0 512' -c 'write -P 42 0x38000 512' -c 'read -P 42 0x38000 512' | _filter_qemu_io + +$QEMU_IO -c 'write -P 42 0 512' "$TEST_IMG" | _filter_qemu_io + +$QEMU_IO -c "open -o file.driver=blkverify,file.raw.filename=$TEST_IMG.base $TEST_IMG" \ + -c 'read -P 42 0 512' | _filter_qemu_io + +echo +echo "=== Testing blkdebug through filename ===" +echo + +$QEMU_IO -c "open -o file.driver=blkdebug,file.inject-error.event=l2_load $TEST_IMG" \ + -c 'read -P 42 0x38000 512' + +echo +echo "=== Testing blkdebug through file blockref ===" +echo + +$QEMU_IO -c "open -o driver=$IMGFMT,file.driver=blkdebug,file.inject-error.event=l2_load,file.image.filename=$TEST_IMG" \ + -c 'read -P 42 0x38000 512' + +echo +echo "=== Testing blkdebug on existing block device 
===" +echo + +run_qemu -drive "file=$TEST_IMG,format=raw,if=none,id=drive0" < Date: Fri, 20 Dec 2013 19:28:24 +0100 Subject: [PATCH 41/93] iotests: Test file format nesting Add a test for nested image formats. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf --- tests/qemu-iotests/072 | 69 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/072.out | 21 ++++++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 91 insertions(+) create mode 100755 tests/qemu-iotests/072 create mode 100644 tests/qemu-iotests/072.out diff --git a/tests/qemu-iotests/072 b/tests/qemu-iotests/072 new file mode 100755 index 0000000000..a3876c2161 --- /dev/null +++ b/tests/qemu-iotests/072 @@ -0,0 +1,69 @@ +#!/bin/bash +# +# Test case for nested image formats +# +# Copyright (C) 2013 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=mreitz@redhat.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. 
./common.filter + +_supported_fmt vpc vmdk vhdx vdi qed qcow2 qcow cow +_supported_proto generic +_supported_os Linux + +IMG_SIZE=64M + +echo +echo "=== Testing nested image formats ===" +echo + +TEST_IMG="$TEST_IMG.base" _make_test_img $IMG_SIZE + +$QEMU_IO -c 'write -P 42 0 512' -c 'write -P 23 512 512' \ + -c 'write -P 66 1024 512' "$TEST_IMG.base" | _filter_qemu_io + +$QEMU_IMG convert -f raw -O $IMGFMT "$TEST_IMG.base" "$TEST_IMG" + +$QEMU_IO -c "open -o driver=$IMGFMT,file.driver=$IMGFMT,file.file.filename=$TEST_IMG" \ + -c 'read -P 42 0 512' -c 'read -P 23 512 512' \ + -c 'read -P 66 1024 512' | _filter_qemu_io + +# When not giving any format, qemu should open only one "layer". Therefore, this +# should not work for any image formats with a header. +$QEMU_IO -c 'read -P 42 0 512' "$TEST_IMG" | _filter_qemu_io + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/072.out b/tests/qemu-iotests/072.out new file mode 100644 index 0000000000..efe577c1c0 --- /dev/null +++ b/tests/qemu-iotests/072.out @@ -0,0 +1,21 @@ +QA output created by 072 + +=== Testing nested image formats === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +wrote 512/512 bytes at offset 0 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 512/512 bytes at offset 512 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 512/512 bytes at offset 1024 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 0 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 512 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 1024 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Pattern verification failed at offset 0, 512 bytes +read 512/512 bytes at offset 0 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 
1194339915..5860c40e09 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -78,5 +78,6 @@ 069 rw auto 070 rw auto 071 rw auto +072 rw auto 073 rw auto 074 rw auto From d80ac658f2caacfb14ea386211c4a9bea0cea280 Mon Sep 17 00:00:00 2001 From: Peter Feiner Date: Wed, 8 Jan 2014 19:43:25 +0000 Subject: [PATCH 42/93] block: fix backing file segfault When a backing file is opened such that (1) a protocol is directly used as the block driver and (2) the block driver has bdrv_file_open, bdrv_open_backing_file segfaults. The problem arises because bdrv_open_common returns without setting bd->backing_hd->file. To effect (1), you seem to have to use the -F flag in qemu-img. There are several block drivers that satisfy (2), such as "file" and "nbd". Here are some concrete examples: #!/bin/bash echo Test file format ./qemu-img create -f file base.file 1m ./qemu-img create -f qcow2 -F file -o backing_file=base.file\ file-overlay.qcow2 ./qemu-img convert -O raw file-overlay.qcow2 file-convert.raw echo Test nbd format SOCK=$PWD/nbd.sock ./qemu-img create -f raw base.raw 1m ./qemu-nbd -t -k $SOCK base.raw & trap "kill $!" EXIT while ! test -e $SOCK; do sleep 1; done ./qemu-img create -f qcow2 -F nbd -o backing_file=nbd:unix:$SOCK\ nbd-overlay.qcow2 ./qemu-img convert -O raw nbd-overlay.qcow2 nbd-convert.raw Without this patch, the two qemu-img convert commands segfault. This is a regression that was introduced in v1.7 by dbecebddfa4932d1c83915bcb9b5ba5984eb91be. 
Signed-off-by: Peter Feiner Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block.c b/block.c index 6af5f6e39f..53cc9e09c2 100644 --- a/block.c +++ b/block.c @@ -1040,8 +1040,12 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp) error_free(local_err); return ret; } - pstrcpy(bs->backing_file, sizeof(bs->backing_file), - bs->backing_hd->file->filename); + + if (bs->backing_hd->file) { + pstrcpy(bs->backing_file, sizeof(bs->backing_file), + bs->backing_hd->file->filename); + } + return 0; } From 385c04d0b66917457b6a12fc2cfd99a6a40b2d89 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 13 Jan 2014 18:47:39 +0800 Subject: [PATCH 43/93] dataplane: fix shadowed return value Propagate the error return value from get_indirect(). This bug was introduced in commit 4d684832 ("vring: create a common function to parse descriptors"). Reviewed-by: Markus Armbruster Reviewed-by: Peter Maydell Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- hw/virtio/dataplane/vring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c index 250d45ec3d..665a1ffcb3 100644 --- a/hw/virtio/dataplane/vring.c +++ b/hw/virtio/dataplane/vring.c @@ -376,7 +376,7 @@ int vring_pop(VirtIODevice *vdev, Vring *vring, barrier(); if (desc.flags & VRING_DESC_F_INDIRECT) { - int ret = get_indirect(vring, elem, &desc); + ret = get_indirect(vring, elem, &desc); if (ret < 0) { goto out; } From 46bae927134468d27f5e2508c3ced67ff58fa45b Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Tue, 21 Jan 2014 11:30:02 +0800 Subject: [PATCH 44/93] qcow2: fix wrong value of L1E_OFFSET_MASK, L2E_OFFSET_MASK and REFT_OFFSET_MASK According to qcow spec, the offset fields in l1e, l2e and ref table entry start at bit 9. The offset is cluster offset, and the smallest possible cluster size is 512 bytes.
Signed-off-by: Hu Tao Signed-off-by: Kevin Wolf --- block/qcow2.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/qcow2.h b/block/qcow2.h index 303eb26629..b5b7d13630 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -340,11 +340,11 @@ typedef enum QCow2MetadataOverlap { #define QCOW2_OL_ALL \ (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) -#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL -#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL +#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL #define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL -#define REFT_OFFSET_MASK 0xffffffffffffff00ULL +#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) { From 34ceed81f9ca31829448276dafe3d9151d66962c Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 21 Jan 2014 15:07:43 +0800 Subject: [PATCH 45/93] vmdk: Check for overhead when opening Report an error if file size is even smaller than metadata. 
Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/vmdk.c | 7 +++++++ tests/qemu-iotests/059 | 6 ++++++ tests/qemu-iotests/059.out | 5 +++++ 3 files changed, 18 insertions(+) diff --git a/block/vmdk.c b/block/vmdk.c index 599a928545..74c44bd5fb 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -640,6 +640,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; } + if (bdrv_getlength(file) < + le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) { + error_report("File truncated, expecting at least %lld bytes", + le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE); + return -EINVAL; + } + ret = vmdk_add_extent(bs, file, false, le64_to_cpu(header.capacity), le64_to_cpu(header.gd_offset) << 9, diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059 index 64ed04cfce..2d604d3a91 100755 --- a/tests/qemu-iotests/059 +++ b/tests/qemu-iotests/059 @@ -97,6 +97,12 @@ RW 12582912 VMFS "dummy.vmdk" 1 EOF _img_info +echo +echo "=== Testing truncated sparse ===" +IMGOPTS="subformat=monolithicSparse" _make_test_img 100G +truncate -s 10M $TEST_IMG +_img_info + echo echo "=== Testing version 3 ===" _use_sample_img iotest-version3.vmdk.bz2 diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out index 5e30e69bef..4ffeb54710 100644 --- a/tests/qemu-iotests/059.out +++ b/tests/qemu-iotests/059.out @@ -2043,6 +2043,11 @@ qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Invalid extent lines: RW 12582912 VMFS "dummy.IMGFMT" 1 +=== Testing truncated sparse === +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=107374182400 +qemu-img: File truncated, expecting at least 13172736 bytes +qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Could not open 'TEST_DIR/t.IMGFMT': Wrong medium type + === Testing version 3 === image: TEST_DIR/iotest-version3.IMGFMT file format: IMGFMT From e69968d472bd020a08c677c814237548090d2e59 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 20 
Jan 2014 15:05:25 +0100 Subject: [PATCH 46/93] qemu-progress: Drop unused include Signed-off-by: Kevin Wolf Reviewed-by: Benoit Canet --- util/qemu-progress.c | 1 - 1 file changed, 1 deletion(-) diff --git a/util/qemu-progress.c b/util/qemu-progress.c index 9a3f96cd47..ad33fee98d 100644 --- a/util/qemu-progress.c +++ b/util/qemu-progress.c @@ -24,7 +24,6 @@ #include "qemu-common.h" #include "qemu/osdep.h" -#include "sysemu/sysemu.h" #include struct progress_state { From 3c4b4e383e82ab3db307ee01f12ab0d4a28584dc Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 20 Jan 2014 15:06:03 +0100 Subject: [PATCH 47/93] qemu-progress: Fix progress printing on SIGUSR1 Since commit a7aae221 ('Switch SIG_IPI to SIGUSR1'), SIGUSR1 is blocked during startup, breaking the progress report in tools. This patch reenables the signal when initialising a progress report. Signed-off-by: Kevin Wolf Reviewed-by: Benoit Canet --- util/qemu-progress.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/util/qemu-progress.c b/util/qemu-progress.c index ad33fee98d..4ee5cd07f2 100644 --- a/util/qemu-progress.c +++ b/util/qemu-progress.c @@ -82,12 +82,22 @@ static void progress_dummy_init(void) { #ifdef CONFIG_POSIX struct sigaction action; + sigset_t set; memset(&action, 0, sizeof(action)); sigfillset(&action.sa_mask); action.sa_handler = sigusr_print; action.sa_flags = 0; sigaction(SIGUSR1, &action, NULL); + + /* + * SIGUSR1 is SIG_IPI and gets blocked in qemu_init_main_loop(). In the + * tools that use the progress report SIGUSR1 isn't used in this meaning + * and instead should print the progress, so reenable it. 
+ */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + pthread_sigmask(SIG_UNBLOCK, &set, NULL); #endif state.print = progress_dummy_print; From 0e3bd9932f862c1c1e4926939b4d0c602ce214ef Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 20 Jan 2014 15:12:16 +0100 Subject: [PATCH 48/93] Documentation: qemu-img: Mention SIGUSR1 progress report Document the SIGUSR1 behaviour of qemu-img. Also, added compare to the list of subcommands that support -p. Signed-off-by: Kevin Wolf Reviewed-by: Benoit Canet --- qemu-img.texi | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qemu-img.texi b/qemu-img.texi index 778e967f39..f86a86d916 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -57,7 +57,9 @@ indicates that target image must be compressed (qcow format only) @item -h with or without a command shows help and lists the supported formats @item -p -display progress bar (convert and rebase commands only) +display progress bar (compare, convert and rebase commands only). +If the @var{-p} option is not used for a command that supports it, the +progress is reported when the process receives a @code{SIGUSR1} signal. @item -q Quiet mode - do not print any output (except errors). There's no progress bar in case both @var{-q} and @var{-p} options are used. From 9cd767376f137918dbe90abb452dfe119ae7d8f3 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Wed, 22 Jan 2014 01:14:11 +0800 Subject: [PATCH 49/93] sheepdog: fix 'qemu-img map' It was muted in the previous commit 4bc74be9. Let's revive it since nothing prevents us to do it. 
With this patch, the following command will work as it does for other formats: $ qemu-img map sheepdog:image Cc: qemu-devel@nongnu.org Cc: Kevin Wolf Cc: Stefan Hajnoczi Signed-off-by: Liu Yuan Signed-off-by: Kevin Wolf --- block/sheepdog.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 2ce3d9b9fb..672b9c97a2 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -2442,11 +2442,12 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; - unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE, + uint64_t offset = sector_num * BDRV_SECTOR_SIZE; + unsigned long start = offset / SD_DATA_OBJ_SIZE, end = DIV_ROUND_UP((sector_num + nb_sectors) * BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE); unsigned long idx; - int64_t ret = BDRV_BLOCK_DATA; + int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; for (idx = start; idx < end; idx++) { if (inode->data_vdi_id[idx] == 0) { From 6df3bf8eb3ed428015c85cfbd554ac9b32164f40 Mon Sep 17 00:00:00 2001 From: Zhang Min Date: Thu, 23 Jan 2014 15:59:16 +0800 Subject: [PATCH 50/93] drive mirror:fix memory leak mirror_iteration() calls qemu_iovec_init(), which allocates memory for op->qiov.iov. When the write request completes, mirror_iteration_done() frees only op and does not free op->qiov.iov, so the memory is leaked. Use qemu_iovec_destroy() to free op->qiov.
Signed-off-by: Zhang Min Signed-off-by: Kevin Wolf --- block/mirror.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/mirror.c b/block/mirror.c index 2932bab27a..05758e5500 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -96,6 +96,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); } + qemu_iovec_destroy(&op->qiov); g_slice_free(MirrorOp, op); qemu_coroutine_enter(s->common.co, NULL); } From d8a7b061ae01e5692cc994f05ad6480d8c170125 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Thu, 23 Jan 2014 15:10:52 +0800 Subject: [PATCH 51/93] vmdk: Fix format specific information (create type) for streamOptimized Previously the field is wrong: $ ./qemu-img create -f vmdk -o subformat=streamOptimized /tmp/a.vmdk 1G $ ./qemu-img info /tmp/a.vmdk image: /tmp/a.vmdk file format: vmdk virtual size: 1.0G (1073741824 bytes) disk size: 12K Format specific information: cid: 1390460459 parent cid: 4294967295 >>> create type: monolithicSparse Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/vmdk.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/vmdk.c b/block/vmdk.c index 74c44bd5fb..67b5f96a19 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -661,6 +661,10 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, } extent->compressed = le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; + if (extent->compressed) { + g_free(s->create_type); + s->create_type = g_strdup("streamOptimized"); + } extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; extent->version = le32_to_cpu(header.version); extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; From c8059b97e1f9b4635b836ee98373a0f72f9fc0b4 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Thu, 23 Jan 2014 10:03:26 +0800 Subject: [PATCH 52/93] qapi: Add "backing" to BlockStats Currently there is no way to query BlockStats of the backing chain. 
This adds "backing" field into BlockStats to make it possible. The comment of "parent" is reworded. Signed-off-by: Fam Zheng Reviewed-by: Benoit Canet Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/qapi.c | 5 +++++ qapi-schema.json | 10 ++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/block/qapi.c b/block/qapi.c index a32cb79db8..98b1b83bd6 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -318,6 +318,11 @@ BlockStats *bdrv_query_stats(const BlockDriverState *bs) s->parent = bdrv_query_stats(bs->file); } + if (bs->backing_hd) { + s->has_backing = true; + s->backing = bdrv_query_stats(bs->backing_hd); + } + return s; } diff --git a/qapi-schema.json b/qapi-schema.json index 35f7b34410..a433869962 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1022,15 +1022,17 @@ # # @stats: A @BlockDeviceStats for the device. # -# @parent: #optional This may point to the backing block device if this is a -# a virtual block device. If it's a backing block, this will point -# to the backing file is one is present. +# @parent: #optional This describes the file block device if it has one. +# +# @backing: #optional This describes the backing block device if it has one. +# (Since 2.0) # # Since: 0.14.0 ## { 'type': 'BlockStats', 'data': {'*device': 'str', 'stats': 'BlockDeviceStats', - '*parent': 'BlockStats'} } + '*parent': 'BlockStats', + '*backing': 'BlockStats'} } ## # @query-blockstats: From dc364f4cdca0c49e37376b16c3ee0bf3b4a96f4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Canet?= Date: Thu, 23 Jan 2014 21:31:32 +0100 Subject: [PATCH 53/93] block: Add bs->node_name to hold the name of a bs node of the bs graph. Add the minimum of code to prepare for the following patches. 
Signed-off-by: Benoit Canet Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block.c | 57 +++++++++++++++++++++++++++++---------- include/block/block.h | 1 + include/block/block_int.h | 9 ++++++- 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/block.c b/block.c index 53cc9e09c2..8562685c17 100644 --- a/block.c +++ b/block.c @@ -90,6 +90,9 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); +static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = + QTAILQ_HEAD_INITIALIZER(graph_bdrv_states); + static QLIST_HEAD(, BlockDriver) bdrv_drivers = QLIST_HEAD_INITIALIZER(bdrv_drivers); @@ -327,7 +330,7 @@ BlockDriverState *bdrv_new(const char *device_name) QLIST_INIT(&bs->dirty_bitmaps); pstrcpy(bs->device_name, sizeof(bs->device_name), device_name); if (device_name[0] != '\0') { - QTAILQ_INSERT_TAIL(&bdrv_states, bs, list); + QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list); } bdrv_iostatus_disable(bs); notifier_list_init(&bs->close_notifiers); @@ -1606,7 +1609,7 @@ void bdrv_close_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bdrv_close(bs); } } @@ -1635,7 +1638,7 @@ static bool bdrv_requests_pending(BlockDriverState *bs) static bool bdrv_requests_pending_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (bdrv_requests_pending(bs)) { return true; } @@ -1662,7 +1665,7 @@ void bdrv_drain_all(void) BlockDriverState *bs; while (busy) { - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bdrv_start_throttled_reqs(bs); } @@ -1671,14 +1674,19 @@ void bdrv_drain_all(void) } } -/* make a BlockDriverState anonymous by removing from bdrv_state list. +/* make a BlockDriverState anonymous by removing from bdrv_state and + * graph_bdrv_state list. 
Also, NULL terminate the device_name to prevent double remove */ void bdrv_make_anon(BlockDriverState *bs) { if (bs->device_name[0] != '\0') { - QTAILQ_REMOVE(&bdrv_states, bs, list); + QTAILQ_REMOVE(&bdrv_states, bs, device_list); } bs->device_name[0] = '\0'; + if (bs->node_name[0] != '\0') { + QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list); + } + bs->node_name[0] = '\0'; } static void bdrv_rebind(BlockDriverState *bs) @@ -1732,7 +1740,12 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, /* keep the same entry in bdrv_states */ pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name), bs_src->device_name); - bs_dest->list = bs_src->list; + bs_dest->device_list = bs_src->device_list; + + /* keep the same entry in graph_bdrv_states + * We do want to swap name but don't want to swap linked list entries + */ + bs_dest->node_list = bs_src->node_list; } /* @@ -2057,7 +2070,7 @@ int bdrv_commit_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (bs->drv && bs->backing_hd) { int ret = bdrv_commit(bs); if (ret < 0) { @@ -3215,11 +3228,12 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name), } } +/* This function is to find block backend bs */ BlockDriverState *bdrv_find(const char *name) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (!strcmp(name, bs->device_name)) { return bs; } @@ -3227,19 +3241,34 @@ BlockDriverState *bdrv_find(const char *name) return NULL; } +/* This function is to find a node in the bs graph */ +BlockDriverState *bdrv_find_node(const char *node_name) +{ + BlockDriverState *bs; + + assert(node_name); + + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + if (!strcmp(node_name, bs->node_name)) { + return bs; + } + } + return NULL; +} + BlockDriverState *bdrv_next(BlockDriverState *bs) { if (!bs) { return QTAILQ_FIRST(&bdrv_states); } - return QTAILQ_NEXT(bs, 
list); + return QTAILQ_NEXT(bs, device_list); } void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { it(opaque, bs); } } @@ -3259,7 +3288,7 @@ int bdrv_flush_all(void) BlockDriverState *bs; int result = 0; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { int ret = bdrv_flush(bs); if (ret < 0 && !result) { result = ret; @@ -4383,7 +4412,7 @@ void bdrv_invalidate_cache_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bdrv_invalidate_cache(bs); } } @@ -4392,7 +4421,7 @@ void bdrv_clear_incoming_migration_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING); } } diff --git a/include/block/block.h b/include/block/block.h index a47f3d4988..501b555885 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -378,6 +378,7 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked); void bdrv_eject(BlockDriverState *bs, bool eject_flag); const char *bdrv_get_format_name(BlockDriverState *bs); BlockDriverState *bdrv_find(const char *name); +BlockDriverState *bdrv_find_node(const char *node_name); BlockDriverState *bdrv_next(BlockDriverState *bs); void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque); diff --git a/include/block/block_int.h b/include/block/block_int.h index 2772f2f1bd..f3f518c4d2 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -325,11 +325,18 @@ struct BlockDriverState { BlockdevOnError on_read_error, on_write_error; bool iostatus_enabled; BlockDeviceIoStatus iostatus; + + /* the following member gives a name to every node on the bs graph. 
*/ + char node_name[32]; + /* element of the list of named nodes building the graph */ + QTAILQ_ENTRY(BlockDriverState) node_list; + /* Device name is the name associated with the "drive" the guest sees */ char device_name[32]; + /* element of the list of "drives" the guest sees */ + QTAILQ_ENTRY(BlockDriverState) device_list; QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; int refcnt; int in_use; /* users other than guest access, eg. block migration */ - QTAILQ_ENTRY(BlockDriverState) list; QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; From 6913c0c2ce00c0e886b2bd20b05073090fa5308a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Canet?= Date: Thu, 23 Jan 2014 21:31:33 +0100 Subject: [PATCH 54/93] block: Allow the user to define "node-name" option both on command line and QMP. Signed-off-by: Benoit Canet Signed-off-by: Kevin Wolf --- block.c | 35 +++++++++++++++++++++++++++++++++++ qapi-schema.json | 2 ++ 2 files changed, 37 insertions(+) diff --git a/block.c b/block.c index 8562685c17..f0436697ff 100644 --- a/block.c +++ b/block.c @@ -735,6 +735,33 @@ static int bdrv_open_flags(BlockDriverState *bs, int flags) return open_flags; } +static int bdrv_assign_node_name(BlockDriverState *bs, + const char *node_name, + Error **errp) +{ + if (!node_name) { + return 0; + } + + /* empty string node name is invalid */ + if (node_name[0] == '\0') { + error_setg(errp, "Empty node name"); + return -EINVAL; + } + + /* takes care of avoiding duplicates node names */ + if (bdrv_find_node(node_name)) { + error_setg(errp, "Duplicate node name"); + return -EINVAL; + } + + /* copy node name into the bs and insert it into the graph list */ + pstrcpy(bs->node_name, sizeof(bs->node_name), node_name); + QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list); + + return 0; +} + /* * Common part for opening disk images and files * @@ -745,6 +772,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, { int ret, open_flags; const char *filename; + const char 
*node_name = NULL; Error *local_err = NULL; assert(drv != NULL); @@ -759,6 +787,13 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name); + node_name = qdict_get_try_str(options, "node-name"); + ret = bdrv_assign_node_name(bs, node_name, errp); + if (ret < 0) { + return ret; + } + qdict_del(options, "node-name"); + /* bdrv_open() with directly using a protocol as drv. This layer is already * opened, so assign it to bs (while file becomes a closed BlockDriverState) * and return immediately. */ diff --git a/qapi-schema.json b/qapi-schema.json index a433869962..26e370b0a0 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4092,6 +4092,7 @@ # @id: #optional id by which the new block device can be referred to. # This is a required option on the top level of blockdev-add, and # currently not allowed on any other level. +# @node-name: #optional the name of a block driver state node (Since 2.0) # @discard: #optional discard-related options (default: ignore) # @cache: #optional cache-related options # @aio: #optional AIO backend (default: threads) @@ -4107,6 +4108,7 @@ { 'type': 'BlockdevOptionsBase', 'data': { 'driver': 'str', '*id': 'str', + '*node-name': 'str', '*discard': 'BlockdevDiscardOptions', '*cache': 'BlockdevCacheOptions', '*aio': 'BlockdevAioOptions', From c13163fba151f0be5176eaf55907bc1dbff3a1d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Canet?= Date: Thu, 23 Jan 2014 21:31:34 +0100 Subject: [PATCH 55/93] qmp: Add QMP query-named-block-nodes to list the named BlockDriverState nodes. 
Signed-off-by: Benoit Canet Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block.c | 18 +++++++ block/qapi.c | 109 +++++++++++++++++++++--------------------- blockdev.c | 5 ++ include/block/block.h | 1 + include/block/qapi.h | 1 + qapi-schema.json | 16 ++++++- qmp-commands.hx | 61 +++++++++++++++++++++++ 7 files changed, 155 insertions(+), 56 deletions(-) diff --git a/block.c b/block.c index f0436697ff..5f6308df67 100644 --- a/block.c +++ b/block.c @@ -32,6 +32,7 @@ #include "sysemu/sysemu.h" #include "qemu/notify.h" #include "block/coroutine.h" +#include "block/qapi.h" #include "qmp-commands.h" #include "qemu/timer.h" @@ -3291,6 +3292,23 @@ BlockDriverState *bdrv_find_node(const char *node_name) return NULL; } +/* Put this QMP function here so it can access the static graph_bdrv_states. */ +BlockDeviceInfoList *bdrv_named_nodes_list(void) +{ + BlockDeviceInfoList *list, *entry; + BlockDriverState *bs; + + list = NULL; + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + entry = g_malloc0(sizeof(*entry)); + entry->value = bdrv_block_device_info(bs); + entry->next = list; + list = entry; + } + + return list; +} + BlockDriverState *bdrv_next(BlockDriverState *bs) { if (!bs) { diff --git a/block/qapi.c b/block/qapi.c index 98b1b83bd6..8f4134b40a 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -29,6 +29,60 @@ #include "qapi/qmp-output-visitor.h" #include "qapi/qmp/types.h" +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) +{ + BlockDeviceInfo *info = g_malloc0(sizeof(*info)); + + info->file = g_strdup(bs->filename); + info->ro = bs->read_only; + info->drv = g_strdup(bs->drv->format_name); + info->encrypted = bs->encrypted; + info->encryption_key_missing = bdrv_key_required(bs); + + if (bs->node_name[0]) { + info->has_node_name = true; + info->node_name = g_strdup(bs->node_name); + } + + if (bs->backing_file[0]) { + info->has_backing_file = true; + info->backing_file = g_strdup(bs->backing_file); + } + + info->backing_file_depth = 
bdrv_get_backing_file_depth(bs); + + if (bs->io_limits_enabled) { + ThrottleConfig cfg; + throttle_get_config(&bs->throttle_state, &cfg); + info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; + info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; + info->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg; + + info->iops = cfg.buckets[THROTTLE_OPS_TOTAL].avg; + info->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg; + info->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg; + + info->has_bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->has_bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->has_bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + info->bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + + info->has_iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->has_iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + info->iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + + info->has_iops_size = cfg.op_size; + info->iops_size = cfg.op_size; + } + + return info; +} + /* * Returns 0 on success, with *p_list either set to describe snapshot * information, or NULL because there are no snapshots. 
Returns -errno on @@ -211,60 +265,7 @@ void bdrv_query_info(BlockDriverState *bs, if (bs->drv) { info->has_inserted = true; - info->inserted = g_malloc0(sizeof(*info->inserted)); - info->inserted->file = g_strdup(bs->filename); - info->inserted->ro = bs->read_only; - info->inserted->drv = g_strdup(bs->drv->format_name); - info->inserted->encrypted = bs->encrypted; - info->inserted->encryption_key_missing = bdrv_key_required(bs); - - if (bs->backing_file[0]) { - info->inserted->has_backing_file = true; - info->inserted->backing_file = g_strdup(bs->backing_file); - } - - info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs); - - if (bs->io_limits_enabled) { - ThrottleConfig cfg; - throttle_get_config(&bs->throttle_state, &cfg); - info->inserted->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; - info->inserted->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; - info->inserted->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg; - - info->inserted->iops = cfg.buckets[THROTTLE_OPS_TOTAL].avg; - info->inserted->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg; - info->inserted->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg; - - info->inserted->has_bps_max = - cfg.buckets[THROTTLE_BPS_TOTAL].max; - info->inserted->bps_max = - cfg.buckets[THROTTLE_BPS_TOTAL].max; - info->inserted->has_bps_rd_max = - cfg.buckets[THROTTLE_BPS_READ].max; - info->inserted->bps_rd_max = - cfg.buckets[THROTTLE_BPS_READ].max; - info->inserted->has_bps_wr_max = - cfg.buckets[THROTTLE_BPS_WRITE].max; - info->inserted->bps_wr_max = - cfg.buckets[THROTTLE_BPS_WRITE].max; - - info->inserted->has_iops_max = - cfg.buckets[THROTTLE_OPS_TOTAL].max; - info->inserted->iops_max = - cfg.buckets[THROTTLE_OPS_TOTAL].max; - info->inserted->has_iops_rd_max = - cfg.buckets[THROTTLE_OPS_READ].max; - info->inserted->iops_rd_max = - cfg.buckets[THROTTLE_OPS_READ].max; - info->inserted->has_iops_wr_max = - cfg.buckets[THROTTLE_OPS_WRITE].max; - info->inserted->iops_wr_max = - cfg.buckets[THROTTLE_OPS_WRITE].max; - - 
info->inserted->has_iops_size = cfg.op_size; - info->inserted->iops_size = cfg.op_size; - } + info->inserted = bdrv_block_device_info(bs); bs0 = bs; p_image_info = &info->inserted->image; diff --git a/blockdev.c b/blockdev.c index 386109a8d4..0bfe38027b 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1952,6 +1952,11 @@ void qmp_drive_backup(const char *device, const char *target, } } +BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp) +{ + return bdrv_named_nodes_list(); +} + #define DEFAULT_MIRROR_BUF_SIZE (10 << 20) void qmp_drive_mirror(const char *device, const char *target, diff --git a/include/block/block.h b/include/block/block.h index 501b555885..13654ede8b 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -379,6 +379,7 @@ void bdrv_eject(BlockDriverState *bs, bool eject_flag); const char *bdrv_get_format_name(BlockDriverState *bs); BlockDriverState *bdrv_find(const char *name); BlockDriverState *bdrv_find_node(const char *node_name); +BlockDeviceInfoList *bdrv_named_nodes_list(void); BlockDriverState *bdrv_next(BlockDriverState *bs); void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque); diff --git a/include/block/qapi.h b/include/block/qapi.h index 9518ee4001..e92c00daf6 100644 --- a/include/block/qapi.h +++ b/include/block/qapi.h @@ -29,6 +29,7 @@ #include "block/block.h" #include "block/snapshot.h" +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs); int bdrv_query_snapshot_info_list(BlockDriverState *bs, SnapshotInfoList **p_list, Error **errp); diff --git a/qapi-schema.json b/qapi-schema.json index 26e370b0a0..b619f019ee 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -810,6 +810,8 @@ # # @file: the filename of the backing device # +# @node-name: #optional the name of the block driver node (Since 2.0) +# # @ro: true if the backing device was open read-only # # @drv: the name of the block format used to open the backing device. 
As of @@ -857,10 +859,9 @@ # # Since: 0.14.0 # -# Notes: This interface is only found in @BlockInfo. ## { 'type': 'BlockDeviceInfo', - 'data': { 'file': 'str', 'ro': 'bool', 'drv': 'str', + 'data': { 'file': 'str', '*node-name': 'str', 'ro': 'bool', 'drv': 'str', '*backing_file': 'str', 'backing_file_depth': 'int', 'encrypted': 'bool', 'encryption_key_missing': 'bool', 'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int', @@ -2010,6 +2011,17 @@ ## { 'command': 'drive-backup', 'data': 'DriveBackup' } +## +# @query-named-block-nodes +# +# Get the named block driver list +# +# Returns: the list of BlockDeviceInfo +# +# Since 2.0 +## +{ 'command': 'query-named-block-nodes', 'returns': [ 'BlockDeviceInfo' ] } + ## # @drive-mirror # diff --git a/qmp-commands.hx b/qmp-commands.hx index 02cc815bc5..11b44c51b7 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3345,4 +3345,65 @@ Example (2): <- { "return": {} } +EQMP + + { + .name = "query-named-block-nodes", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_input_query_named_block_nodes, + }, + +SQMP +@query-named-block-nodes +------------------------ + +Return a list of BlockDeviceInfo for all the named block driver nodes + +Example: + +-> { "execute": "query-named-block-nodes" } +<- { "return": [ { "ro":false, + "drv":"qcow2", + "encrypted":false, + "file":"disks/test.qcow2", + "node-name": "my-node", + "backing_file_depth":1, + "bps":1000000, + "bps_rd":0, + "bps_wr":0, + "iops":1000000, + "iops_rd":0, + "iops_wr":0, + "bps_max": 8000000, + "bps_rd_max": 0, + "bps_wr_max": 0, + "iops_max": 0, + "iops_rd_max": 0, + "iops_wr_max": 0, + "iops_size": 0, + "image":{ + "filename":"disks/test.qcow2", + "format":"qcow2", + "virtual-size":2048000, + "backing_file":"base.qcow2", + "full-backing-filename":"disks/base.qcow2", + "backing-filename-format":"qcow2", + "snapshots":[ + { + "id": "1", + "name": "snapshot1", + "vm-state-size": 0, + "date-sec": 10000200, + "date-nsec": 12, + "vm-clock-sec": 206, + "vm-clock-nsec": 30 + } +
], + "backing-image":{ + "filename":"disks/base.qcow2", + "format":"qcow2", + "virtual-size":2048000 + } + } } ] } + EQMP From 12d3ba821da9f8a039240a8a1bc01e27a12f9c22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Canet?= Date: Thu, 23 Jan 2014 21:31:35 +0100 Subject: [PATCH 56/93] qmp: Allow to change password on named block driver states. Signed-off-by: Benoit Canet Reviewed-by: Fam Zheng There were two candidate ways to implement named node manipulation: 1) { 'command': 'block_passwd', 'data': {'*device': 'str', '*node-name': 'str', 'password': 'str'} } 2) { 'command': 'block_passwd', 'data': {'device': 'str', '*device-is-node': 'bool', 'password': 'str'} } Luiz proposed 1 and says 2 was an abuse of the QMP interface and proposed to rewrite the QMP block interface for 2.0. Luiz does not like in 1 the fact that 2 fields are optional but one of them must be specified leading to an abuse of the QMP semantic. Kevin argued that 2 was a clear abuse of the device field and would not be practical when reading fast some log file because the user would read "device" and think that a device is manipulated when it's in fact a node name. Documentation of 1 makes it pretty clear what to do for the user. Kevin argued that all bs are nodes, including device ones, so 2 does not make sense. Kevin also argued that rewriting the QMP block interface would not make the current one disappear. Kevin pushed the argument that making the QAPI generator compatible with the semantic of the operation would need a rewrite that no one has done yet. A vote has been done on the list to elect the version to use and 1 won. For reference the complete thread is: "[Qemu-devel] [PATCH V4 4/7] qmp: Allow to change password on names block driver states."
Signed-off-by: Benoit Canet Signed-off-by: Kevin Wolf --- block.c | 32 ++++++++++++++++++++++++++++++++ blockdev.c | 13 +++++++++---- hmp.c | 2 +- include/block/block.h | 3 +++ qapi-schema.json | 9 +++++++-- qmp-commands.hx | 3 ++- 6 files changed, 54 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index 5f6308df67..064df90917 100644 --- a/block.c +++ b/block.c @@ -3309,6 +3309,38 @@ BlockDeviceInfoList *bdrv_named_nodes_list(void) return list; } +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp) +{ + BlockDriverState *bs = NULL; + + if ((!device && !node_name) || (device && node_name)) { + error_setg(errp, "Use either device or node-name but not both"); + return NULL; + } + + if (device) { + bs = bdrv_find(device); + + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, device); + return NULL; + } + + return bs; + } + + bs = bdrv_find_node(node_name); + + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, node_name); + return NULL; + } + + return bs; +} + BlockDriverState *bdrv_next(BlockDriverState *bs) { if (!bs) { diff --git a/blockdev.c b/blockdev.c index 0bfe38027b..2f4065e2cd 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1481,14 +1481,19 @@ void qmp_eject(const char *device, bool has_force, bool force, Error **errp) eject_device(bs, force, errp); } -void qmp_block_passwd(const char *device, const char *password, Error **errp) +void qmp_block_passwd(bool has_device, const char *device, + bool has_node_name, const char *node_name, + const char *password, Error **errp) { + Error *local_err = NULL; BlockDriverState *bs; int err; - bs = bdrv_find(device); - if (!bs) { - error_set(errp, QERR_DEVICE_NOT_FOUND, device); + bs = bdrv_lookup_bs(has_device ? device : NULL, + has_node_name ? 
node_name : NULL, + &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); return; } diff --git a/hmp.c b/hmp.c index 468f97d176..5804a5a2ad 100644 --- a/hmp.c +++ b/hmp.c @@ -871,7 +871,7 @@ void hmp_block_passwd(Monitor *mon, const QDict *qdict) const char *password = qdict_get_str(qdict, "password"); Error *errp = NULL; - qmp_block_passwd(device, password, &errp); + qmp_block_passwd(true, device, false, NULL, password, &errp); hmp_handle_error(mon, &errp); } diff --git a/include/block/block.h b/include/block/block.h index 13654ede8b..b4a77e6cff 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -380,6 +380,9 @@ const char *bdrv_get_format_name(BlockDriverState *bs); BlockDriverState *bdrv_find(const char *name); BlockDriverState *bdrv_find_node(const char *node_name); BlockDeviceInfoList *bdrv_named_nodes_list(void); +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp); BlockDriverState *bdrv_next(BlockDriverState *bs); void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque); diff --git a/qapi-schema.json b/qapi-schema.json index b619f019ee..3f48ae9be4 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1678,7 +1678,11 @@ # determine which ones are encrypted, set the passwords with this command, and # then start the guest with the @cont command. # -# @device: the name of the device to set the password on +# Either @device or @node-name must be set but not both. 
+# +# @device: #optional the name of the block backend device to set the password on +# +# @node-name: #optional graph node name to set the password on (Since 2.0) # # @password: the password to use for the device # @@ -1692,7 +1696,8 @@ # # Since: 0.14.0 ## -{ 'command': 'block_passwd', 'data': {'device': 'str', 'password': 'str'} } +{ 'command': 'block_passwd', 'data': {'*device': 'str', + '*node-name': 'str', 'password': 'str'} } ## # @balloon: diff --git a/qmp-commands.hx b/qmp-commands.hx index 11b44c51b7..5492eb061b 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1503,7 +1503,7 @@ EQMP { .name = "block_passwd", - .args_type = "device:B,password:s", + .args_type = "device:s?,node-name:s?,password:s", .mhandler.cmd_new = qmp_marshal_input_block_passwd, }, @@ -1516,6 +1516,7 @@ Set the password of encrypted block devices. Arguments: - "device": device name (json-string) +- "node-name": name in the block driver state graph (json-string) - "password": password (json-string) Example: From 212a5a8f095de9a1624de6b4a589d60688b02747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Canet?= Date: Thu, 23 Jan 2014 21:31:36 +0100 Subject: [PATCH 57/93] block: Create authorizations mechanism for external snapshot and resize. Signed-off-by: Benoit Canet Signed-off-by: Kevin Wolf --- block.c | 65 +++++++++++++++++++++++++++++++++------ block/blkverify.c | 2 +- blockdev.c | 2 +- include/block/block.h | 18 +++++------ include/block/block_int.h | 12 ++++++-- 5 files changed, 76 insertions(+), 23 deletions(-) diff --git a/block.c b/block.c index 064df90917..18c0a8dd94 100644 --- a/block.c +++ b/block.c @@ -5094,21 +5094,68 @@ int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options) return bs->drv->bdrv_amend_options(bs, options); } -ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs) +/* Used to recurse on single child block filters. + * Single child block filter will store their child in bs->file. 
+ */ +bool bdrv_generic_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) { - if (bs->drv->bdrv_check_ext_snapshot) { - return bs->drv->bdrv_check_ext_snapshot(bs); + if (!bs->drv) { + return false; } - if (bs->file && bs->file->drv && bs->file->drv->bdrv_check_ext_snapshot) { - return bs->file->drv->bdrv_check_ext_snapshot(bs); + if (!bs->drv->authorizations[BS_IS_A_FILTER]) { + if (bs == candidate) { + return true; + } else { + return false; + } } - /* external snapshots are allowed by default */ - return EXT_SNAPSHOT_ALLOWED; + if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) { + return false; + } + + if (!bs->file) { + return false; + } + + return bdrv_recurse_is_first_non_filter(bs->file, candidate); } -ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs) +bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) { - return EXT_SNAPSHOT_FORBIDDEN; + if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) { + return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate); + } + + return bdrv_generic_is_first_non_filter(bs, candidate); +} + +/* This function checks if the candidate is the first non filter bs down its + * bs chain. Since we don't have pointers to parents it explores all bs chains + * from the top. Some filters can choose not to pass down the recursion.
+ */ +bool bdrv_is_first_non_filter(BlockDriverState *candidate) +{ + BlockDriverState *bs; + + /* walk down the bs forest recursively */ + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + bool perm; + + if (!bs->file) { + continue; + } + + perm = bdrv_recurse_is_first_non_filter(bs->file, candidate); + + /* candidate is the first non filter */ + if (perm) { + return true; + } + } + + return false; } diff --git a/block/blkverify.c b/block/blkverify.c index a2e8f5f138..cfcbcf41c3 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -404,7 +404,7 @@ static BlockDriver bdrv_blkverify = { .bdrv_aio_writev = blkverify_aio_writev, .bdrv_aio_flush = blkverify_aio_flush, - .bdrv_check_ext_snapshot = bdrv_check_ext_snapshot_forbidden, + .authorizations = { true, false }, }; static void bdrv_blkverify_init(void) diff --git a/blockdev.c b/blockdev.c index 2f4065e2cd..32a356eadc 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1243,7 +1243,7 @@ static void external_snapshot_prepare(BlkTransactionState *common, } } - if (bdrv_check_ext_snapshot(state->old_bs) != EXT_SNAPSHOT_ALLOWED) { + if (!bdrv_is_first_non_filter(state->old_bs)) { error_set(errp, QERR_FEATURE_DISABLED, "snapshot"); return; } diff --git a/include/block/block.h b/include/block/block.h index b4a77e6cff..59d9f12ce4 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -287,16 +287,16 @@ int bdrv_amend_options(BlockDriverState *bs_new, QEMUOptionParameter *options); /* external snapshots */ typedef enum { - EXT_SNAPSHOT_ALLOWED, - EXT_SNAPSHOT_FORBIDDEN, -} ExtSnapshotPerm; + BS_IS_A_FILTER, + BS_FILTER_PASS_DOWN, + BS_AUTHORIZATION_COUNT, +} BsAuthorization; -/* return EXT_SNAPSHOT_ALLOWED if external snapshot is allowed - * return EXT_SNAPSHOT_FORBIDDEN if external snapshot is forbidden - */ -ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs); -/* helper used to forbid external snapshots like in blkverify */ -ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs); 
+bool bdrv_generic_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate); +bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate); +bool bdrv_is_first_non_filter(BlockDriverState *candidate); /* async block I/O */ typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector, diff --git a/include/block/block_int.h b/include/block/block_int.h index f3f518c4d2..611a955712 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -69,10 +69,16 @@ struct BlockDriver { const char *format_name; int instance_size; - /* if not defined external snapshots are allowed - * future block filters will query their children to build the response + /* this table of booleans contains authorizations for the block operations */ + bool authorizations[BS_AUTHORIZATION_COUNT]; + /* for snapshots complex block filter like Quorum can implement the + * following recursive callback instead of BS_IS_A_FILTER. + * Its purpose is to recurse on the filter children while calling + * bdrv_recurse_is_first_non_filter on them. + * For a sample implementation look in the future Quorum block filter. */ - ExtSnapshotPerm (*bdrv_check_ext_snapshot)(BlockDriverState *bs); + bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs, + BlockDriverState *candidate); int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename); int (*bdrv_probe_device)(const char *filename); From 3b1dbd11a60d75e99af5fc9b73c34f4af9d4f510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Canet?= Date: Thu, 23 Jan 2014 21:31:37 +0100 Subject: [PATCH 58/93] qmp: Allow block_resize to manipulate bs graph nodes.
Signed-off-by: Benoit Canet Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- blockdev.c | 18 ++++++++++++++---- hmp.c | 2 +- qapi-schema.json | 10 ++++++++-- qmp-commands.hx | 3 ++- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/blockdev.c b/blockdev.c index 32a356eadc..d192370f9c 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1683,14 +1683,24 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) return 0; } -void qmp_block_resize(const char *device, int64_t size, Error **errp) +void qmp_block_resize(bool has_device, const char *device, + bool has_node_name, const char *node_name, + int64_t size, Error **errp) { + Error *local_err = NULL; BlockDriverState *bs; int ret; - bs = bdrv_find(device); - if (!bs) { - error_set(errp, QERR_DEVICE_NOT_FOUND, device); + bs = bdrv_lookup_bs(has_device ? device : NULL, + has_node_name ? node_name : NULL, + &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + return; + } + + if (!bdrv_is_first_non_filter(bs)) { + error_set(errp, QERR_FEATURE_DISABLED, "resize"); return; } diff --git a/hmp.c b/hmp.c index 5804a5a2ad..66c8d7e6ff 100644 --- a/hmp.c +++ b/hmp.c @@ -893,7 +893,7 @@ void hmp_block_resize(Monitor *mon, const QDict *qdict) int64_t size = qdict_get_int(qdict, "size"); Error *errp = NULL; - qmp_block_resize(device, size, &errp); + qmp_block_resize(true, device, false, NULL, size, &errp); hmp_handle_error(mon, &errp); } diff --git a/qapi-schema.json b/qapi-schema.json index 3f48ae9be4..f21243aba1 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1724,7 +1724,11 @@ # # Resize a block image while a guest is running. # -# @device: the name of the device to get the image resized +# Either @device or @node-name must be set but not both. 
+# +# @device: #optional the name of the device to get the image resized +# +# @node-name: #optional graph node name to get the image resized (Since 2.0) # # @size: new image size in bytes # @@ -1733,7 +1737,9 @@ # # Since: 0.14.0 ## -{ 'command': 'block_resize', 'data': { 'device': 'str', 'size': 'int' }} +{ 'command': 'block_resize', 'data': { '*device': 'str', + '*node-name': 'str', + 'size': 'int' }} ## # @NewImageMode diff --git a/qmp-commands.hx b/qmp-commands.hx index 5492eb061b..a45f26cc48 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -931,7 +931,7 @@ EQMP { .name = "block_resize", - .args_type = "device:B,size:o", + .args_type = "device:s?,node-name:s?,size:o", .mhandler.cmd_new = qmp_marshal_input_block_resize, }, @@ -944,6 +944,7 @@ Resize a block image while a guest is running. Arguments: - "device": the device's ID, must be unique (json-string) +- "node-name": the node name in the block driver state graph (json-string) - "size": new size Example: From 0901f67ecdb74d9ba1451e3b4367194cd43f96b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Canet?= Date: Thu, 23 Jan 2014 21:31:38 +0100 Subject: [PATCH 59/93] qmp: Allow to take external snapshots on bs graphs node. 
Signed-off-by: Benoit Canet Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- blockdev.c | 55 ++++++++++++++++++++++++++++++++++++++++++------ hmp.c | 4 +++- qapi-schema.json | 13 +++++++++--- qmp-commands.hx | 11 +++++++++- 4 files changed, 71 insertions(+), 12 deletions(-) diff --git a/blockdev.c b/blockdev.c index d192370f9c..36ceece9ff 100644 --- a/blockdev.c +++ b/blockdev.c @@ -947,14 +947,22 @@ static void blockdev_do_action(int kind, void *data, Error **errp) qmp_transaction(&list, errp); } -void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file, +void qmp_blockdev_snapshot_sync(bool has_device, const char *device, + bool has_node_name, const char *node_name, + const char *snapshot_file, + bool has_snapshot_node_name, + const char *snapshot_node_name, bool has_format, const char *format, - bool has_mode, enum NewImageMode mode, - Error **errp) + bool has_mode, NewImageMode mode, Error **errp) { BlockdevSnapshot snapshot = { + .has_device = has_device, .device = (char *) device, + .has_node_name = has_node_name, + .node_name = (char *) node_name, .snapshot_file = (char *) snapshot_file, + .has_snapshot_node_name = has_snapshot_node_name, + .snapshot_node_name = (char *) snapshot_node_name, .has_format = has_format, .format = (char *) format, .has_mode = has_mode, @@ -1192,8 +1200,14 @@ static void external_snapshot_prepare(BlkTransactionState *common, { BlockDriver *drv; int flags, ret; + QDict *options = NULL; Error *local_err = NULL; + bool has_device = false; const char *device; + bool has_node_name = false; + const char *node_name; + bool has_snapshot_node_name = false; + const char *snapshot_node_name; const char *new_image_file; const char *format = "qcow2"; enum NewImageMode mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS; @@ -1204,7 +1218,14 @@ static void external_snapshot_prepare(BlkTransactionState *common, /* get parameters */ g_assert(action->kind == TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC); + has_device = 
action->blockdev_snapshot_sync->has_device; device = action->blockdev_snapshot_sync->device; + has_node_name = action->blockdev_snapshot_sync->has_node_name; + node_name = action->blockdev_snapshot_sync->node_name; + has_snapshot_node_name = + action->blockdev_snapshot_sync->has_snapshot_node_name; + snapshot_node_name = action->blockdev_snapshot_sync->snapshot_node_name; + new_image_file = action->blockdev_snapshot_sync->snapshot_file; if (action->blockdev_snapshot_sync->has_format) { format = action->blockdev_snapshot_sync->format; @@ -1220,9 +1241,21 @@ static void external_snapshot_prepare(BlkTransactionState *common, return; } - state->old_bs = bdrv_find(device); - if (!state->old_bs) { - error_set(errp, QERR_DEVICE_NOT_FOUND, device); + state->old_bs = bdrv_lookup_bs(has_device ? device : NULL, + has_node_name ? node_name : NULL, + &local_err); + if (error_is_set(&local_err)) { + error_propagate(errp, local_err); + return; + } + + if (has_node_name && !has_snapshot_node_name) { + error_setg(errp, "New snapshot node name missing"); + return; + } + + if (has_snapshot_node_name && bdrv_find_node(snapshot_node_name)) { + error_setg(errp, "New snapshot node name already existing"); return; } @@ -1262,15 +1295,23 @@ static void external_snapshot_prepare(BlkTransactionState *common, } } + if (has_snapshot_node_name) { + options = qdict_new(); + qdict_put(options, "node-name", + qstring_from_str(snapshot_node_name)); + } + /* We will manually add the backing_hd field to the bs later */ state->new_bs = bdrv_new(""); /* TODO Inherit bs->options or only take explicit options with an * extended QMP command? 
*/ - ret = bdrv_open(state->new_bs, new_image_file, NULL, + ret = bdrv_open(state->new_bs, new_image_file, options, flags | BDRV_O_NO_BACKING, drv, &local_err); if (ret != 0) { error_propagate(errp, local_err); } + + QDECREF(options); } static void external_snapshot_commit(BlkTransactionState *common) diff --git a/hmp.c b/hmp.c index 66c8d7e6ff..1af0809305 100644 --- a/hmp.c +++ b/hmp.c @@ -972,7 +972,9 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict) } mode = reuse ? NEW_IMAGE_MODE_EXISTING : NEW_IMAGE_MODE_ABSOLUTE_PATHS; - qmp_blockdev_snapshot_sync(device, filename, !!format, format, + qmp_blockdev_snapshot_sync(true, device, false, NULL, + filename, false, NULL, + !!format, format, true, mode, &errp); hmp_handle_error(mon, &errp); } diff --git a/qapi-schema.json b/qapi-schema.json index f21243aba1..e04949de52 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1761,18 +1761,25 @@ ## # @BlockdevSnapshot # -# @device: the name of the device to generate the snapshot from. +# Either @device or @node-name must be set but not both. +# +# @device: #optional the name of the device to generate the snapshot from. +# +# @node-name: #optional graph node name to generate the snapshot from (Since 2.0) # # @snapshot-file: the target of the new image. A new file will be created. # +# @snapshot-node-name: #optional the graph node name of the new image (Since 2.0) +# # @format: #optional the format of the snapshot image, default is 'qcow2'. # # @mode: #optional whether and how QEMU should create a new image, default is # 'absolute-paths'. 
## { 'type': 'BlockdevSnapshot', - 'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str', - '*mode': 'NewImageMode' } } + 'data': { '*device': 'str', '*node-name': 'str', + 'snapshot-file': 'str', '*snapshot-node-name': 'str', + '*format': 'str', '*mode': 'NewImageMode' } } ## # @BlockdevSnapshotInternal diff --git a/qmp-commands.hx b/qmp-commands.hx index a45f26cc48..4f2825017c 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1089,7 +1089,9 @@ actions array: - "data": a dictionary. The contents depend on the value of "type". When "type" is "blockdev-snapshot-sync": - "device": device name to snapshot (json-string) + - "node-name": graph node name to snapshot (json-string) - "snapshot-file": name of new image file (json-string) + - "snapshot-node-name": graph node name of the new snapshot (json-string) - "format": format of new image (json-string, optional) - "mode": whether and how QEMU should create the snapshot file (NewImageMode, optional, default "absolute-paths") @@ -1104,6 +1106,11 @@ Example: { 'type': 'blockdev-snapshot-sync', 'data' : { "device": "ide-hd0", "snapshot-file": "/some/place/my-image", "format": "qcow2" } }, + { 'type': 'blockdev-snapshot-sync', 'data' : { "node-name": "myfile", + "snapshot-file": "/some/place/my-image2", + "snapshot-node-name": "node3432", + "mode": "existing", + "format": "qcow2" } }, { 'type': 'blockdev-snapshot-sync', 'data' : { "device": "ide-hd1", "snapshot-file": "/some/place/my-image2", "mode": "existing", @@ -1117,7 +1124,7 @@ EQMP { .name = "blockdev-snapshot-sync", - .args_type = "device:B,snapshot-file:s,format:s?,mode:s?", + .args_type = "device:s?,node-name:s?,snapshot-file:s,snapshot-node-name:s?,format:s?,mode:s?", .mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync, }, @@ -1134,7 +1141,9 @@ snapshot image, default is qcow2. 
Arguments: - "device": device name to snapshot (json-string) +- "node-name": graph node name to snapshot (json-string) - "snapshot-file": name of new image file (json-string) +- "snapshot-node-name": graph node name of the new snapshot (json-string) - "mode": whether and how QEMU should create the snapshot file (NewImageMode, optional, default "absolute-paths") - "format": format of new image (json-string, optional) From 031fd1be5618c347f9aeb44ec294f14a541e42b2 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 24 Jan 2014 14:56:17 +0100 Subject: [PATCH 60/93] block/curl: Implement the libcurl timer callback interface libcurl versions 7.16.0 and later have a timer callback interface which must be implemented in order for libcurl to make forward progress (it will sometimes rely on being called back on the timeout if there are no file descriptors registered). Implement the callback, and use a QEMU AIO timer to ensure we prod libcurl again when it asks us to. Based on Peter's original patch plus my fix to add curl_multi_timeout_do. Should compile just fine even on older versions of libcurl. I also tried copy-on-read and streaming: $ ./qemu-img create -f qcow2 -o \ backing_file=http://download.fedoraproject.org/pub/fedora/linux/releases/20/Live/x86_64/Fedora-Live-Desktop-x86_64-20-1.iso \ foo.qcow2 1G $ x86_64-softmmu/qemu-system-x86_64 \ -drive if=none,file=foo.qcow2,copy-on-read=on,id=cd \ -device ide-cd,drive=cd --enable-kvm -m 1024 Direct http usage is probably too slow, but with copy-on-read ultimately the image does boot! After some time, streaming gets canceled by an EIO, which needs further investigation. 
Signed-off-by: Peter Maydell Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/curl.c | 81 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 11 deletions(-) diff --git a/block/curl.c b/block/curl.c index a6039366da..a8075847b8 100644 --- a/block/curl.c +++ b/block/curl.c @@ -34,6 +34,11 @@ #define DPRINTF(fmt, ...) do { } while (0) #endif +#if LIBCURL_VERSION_NUM >= 0x071000 +/* The multi interface timer callback was introduced in 7.16.0 */ +#define NEED_CURL_TIMER_CALLBACK +#endif + #define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \ CURLPROTO_FTP | CURLPROTO_FTPS | \ CURLPROTO_TFTP) @@ -77,6 +82,7 @@ typedef struct CURLState typedef struct BDRVCURLState { CURLM *multi; + QEMUTimer timer; size_t len; CURLState states[CURL_NUM_STATES]; char *url; @@ -87,6 +93,23 @@ typedef struct BDRVCURLState { static void curl_clean_state(CURLState *s); static void curl_multi_do(void *arg); +#ifdef NEED_CURL_TIMER_CALLBACK +static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque) +{ + BDRVCURLState *s = opaque; + + DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms); + if (timeout_ms == -1) { + timer_del(&s->timer); + } else { + int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000; + timer_mod(&s->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns); + } + return 0; +} +#endif + static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, void *s, void *sp) { @@ -209,20 +232,10 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, return FIND_RET_NONE; } -static void curl_multi_do(void *arg) +static void curl_multi_read(BDRVCURLState *s) { - BDRVCURLState *s = (BDRVCURLState *)arg; - int running; - int r; int msgs_in_queue; - if (!s->multi) - return; - - do { - r = curl_multi_socket_all(s->multi, &running); - } while(r == CURLM_CALL_MULTI_PERFORM); - /* Try to find done transfers, so we can free the easy * handle again. 
*/ do { @@ -266,6 +279,41 @@ static void curl_multi_do(void *arg) } while(msgs_in_queue); } +static void curl_multi_do(void *arg) +{ + BDRVCURLState *s = (BDRVCURLState *)arg; + int running; + int r; + + if (!s->multi) { + return; + } + + do { + r = curl_multi_socket_all(s->multi, &running); + } while(r == CURLM_CALL_MULTI_PERFORM); + + curl_multi_read(s); +} + +static void curl_multi_timeout_do(void *arg) +{ +#ifdef NEED_CURL_TIMER_CALLBACK + BDRVCURLState *s = (BDRVCURLState *)arg; + int running; + + if (!s->multi) { + return; + } + + curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); + + curl_multi_read(s); +#else + abort(); +#endif +} + static CURLState *curl_init_state(BDRVCURLState *s) { CURLState *state = NULL; @@ -473,12 +521,20 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, curl_easy_cleanup(state->curl); state->curl = NULL; + aio_timer_init(bdrv_get_aio_context(bs), &s->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + curl_multi_timeout_do, s); + // Now we know the file exists and its size, so let's // initialize the multi interface! s->multi = curl_multi_init(); curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s); curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); +#ifdef NEED_CURL_TIMER_CALLBACK + curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s); + curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb); +#endif curl_multi_do(s); qemu_opts_del(opts); @@ -597,6 +653,9 @@ static void curl_close(BlockDriverState *bs) } if (s->multi) curl_multi_cleanup(s->multi); + + timer_del(&s->timer); + g_free(s->url); } From 72706ea4cd38bfcb151265df0178ba21863d7518 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Fri, 24 Jan 2014 09:02:35 -0500 Subject: [PATCH 61/93] block: resize backing file image during offline commit, if necessary Currently, if an image file is logically larger than its backing file, committing it via 'qemu-img commit' will fail. 
For instance, if we have a base image with a virtual size 10G, and a snapshot image of size 20G, then committing the snapshot offline with 'qemu-img commit' will likely fail. This will automatically attempt to resize the base image, if the snapshot image to be committed is larger. Signed-off-by: Jeff Cody Reviewed-by: Fam Zheng Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Signed-off-by: Kevin Wolf --- block.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index 18c0a8dd94..72bccb1ea0 100644 --- a/block.c +++ b/block.c @@ -2030,10 +2030,10 @@ int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) int bdrv_commit(BlockDriverState *bs) { BlockDriver *drv = bs->drv; - int64_t sector, total_sectors; + int64_t sector, total_sectors, length, backing_length; int n, ro, open_flags; int ret = 0; - uint8_t *buf; + uint8_t *buf = NULL; char filename[PATH_MAX]; if (!drv) @@ -2058,7 +2058,29 @@ int bdrv_commit(BlockDriverState *bs) } } - total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + length = bdrv_getlength(bs); + if (length < 0) { + ret = length; + goto ro_cleanup; + } + + backing_length = bdrv_getlength(bs->backing_hd); + if (backing_length < 0) { + ret = backing_length; + goto ro_cleanup; + } + + /* If our top snapshot is larger than the backing file image, + * grow the backing file image if possible. 
If not possible, + * we must return an error */ + if (length > backing_length) { + ret = bdrv_truncate(bs->backing_hd, length); + if (ret < 0) { + goto ro_cleanup; + } + } + + total_sectors = length >> BDRV_SECTOR_BITS; buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); for (sector = 0; sector < total_sectors; sector += n) { From 4da83585961631bfc10831dd26c4afda2a8b23e8 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Fri, 24 Jan 2014 09:02:36 -0500 Subject: [PATCH 62/93] block: resize backing image during active layer commit, if needed If the top image to commit is the active layer, and also larger than the base image, then an I/O error will likely be returned during block-commit. For instance, if we have a base image with a virtual size 10G, and an active layer image of size 20G, then committing the snapshot via 'block-commit' will likely fail. This will automatically attempt to resize the base image, if the active layer image to be committed is larger. Signed-off-by: Jeff Cody Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Signed-off-by: Kevin Wolf --- block/mirror.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/block/mirror.c b/block/mirror.c index 05758e5500..2a4333474e 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -631,11 +631,49 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base, BlockDriverCompletionFunc *cb, void *opaque, Error **errp) { + int64_t length, base_length; + int orig_base_flags; + + orig_base_flags = bdrv_get_flags(base); + if (bdrv_reopen(base, bs->open_flags, errp)) { return; } + + length = bdrv_getlength(bs); + if (length < 0) { + error_setg(errp, "Unable to determine length of %s", bs->filename); + goto error_restore_flags; + } + + base_length = bdrv_getlength(base); + if (base_length < 0) { + error_setg(errp, "Unable to determine length of %s", base->filename); + goto error_restore_flags; + } + + if (length > base_length) { + if (bdrv_truncate(base, length) < 0) { 
+ error_setg(errp, "Top image %s is larger than base image %s, and " + "resize of base image failed", + bs->filename, base->filename); + goto error_restore_flags; + } + } + bdrv_ref(base); mirror_start_job(bs, base, speed, 0, 0, on_error, on_error, cb, opaque, errp, &commit_active_job_driver, false, base); + if (error_is_set(errp)) { + goto error_restore_flags; + } + + return; + +error_restore_flags: + /* ignore error and errp for bdrv_reopen, because we want to propagate + * the original error */ + bdrv_reopen(base, orig_base_flags, NULL); + return; } From 37222900743962e146a82b7077a18c3f39859a19 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Fri, 24 Jan 2014 09:02:37 -0500 Subject: [PATCH 63/93] block: update block commit documentation regarding image truncation This updates the documentation for committing snapshot images. Specifically, this highlights what happens when the base image is either smaller or larger than the snapshot image being committed. In the case of the base image being smaller, it is resized to the larger size of the snapshot image. In the case of the base image being larger, it is not resized automatically, but once the commit has completed it is safe for the user to truncate the base image. Signed-off-by: Jeff Cody Reviewed-by: Fam Zheng Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- hmp-commands.hx | 5 +++++ qapi-schema.json | 7 +++++++ qemu-img.texi | 7 ++++++- qmp-commands.hx | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 1 deletion(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index feca0847d0..f3fc514427 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -35,6 +35,11 @@ STEXI @item commit @findex commit Commit changes to the disk images (if -snapshot is used) or backing files. +If the backing file is smaller than the snapshot, then the backing file will be +resized to be the same size as the snapshot. If the snapshot is smaller than +the backing file, the backing file will not be truncated. 
If you want the +backing file to match the size of the smaller snapshot, you can safely truncate +it yourself once the commit operation successfully completes. ETEXI { diff --git a/qapi-schema.json b/qapi-schema.json index e04949de52..1ff607ac3c 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1994,6 +1994,13 @@ # user needs to complete the job with the block-job-complete # command after getting the ready event. (Since 2.0) # +# If the base image is smaller than top, then the base image +# will be resized to be the same size as top. If top is +# smaller than the base image, the base will not be +# truncated. If you want the base image size to match the +# size of the smaller top, you can safely truncate it +# yourself once the commit operation successfully completes. +# # # @speed: #optional the maximum speed, in bytes per second # diff --git a/qemu-img.texi b/qemu-img.texi index f86a86d916..526d56a458 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -142,7 +142,12 @@ it doesn't need to be specified separately in this case. @item commit [-f @var{fmt}] [-t @var{cache}] @var{filename} -Commit the changes recorded in @var{filename} in its base image. +Commit the changes recorded in @var{filename} in its base image or backing file. +If the backing file is smaller than the snapshot, then the backing file will be +resized to be the same size as the snapshot. If the snapshot is smaller than +the backing file, the backing file will not be truncated. If you want the +backing file to match the size of the smaller snapshot, you can safely truncate +it yourself once the commit operation successfully completes. 
@item compare [-f @var{fmt}] [-F @var{fmt}] [-p] [-s] [-q] @var{filename1} @var{filename2} diff --git a/qmp-commands.hx b/qmp-commands.hx index 4f2825017c..cce6b81da4 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -966,6 +966,45 @@ EQMP .mhandler.cmd_new = qmp_marshal_input_block_commit, }, +SQMP +block-commit +------------ + +Live commit of data from overlay image nodes into backing nodes - i.e., writes +data between 'top' and 'base' into 'base'. + +Arguments: + +- "device": The device's ID, must be unique (json-string) +- "base": The file name of the backing image to write data into. + If not specified, this is the deepest backing image + (json-string, optional) +- "top": The file name of the backing image within the image chain, + which contains the topmost data to be committed down. + + If top == base, that is an error. + If top == active, the job will not be completed by itself, + user needs to complete the job with the block-job-complete + command after getting the ready event. (Since 2.0) + + If the base image is smaller than top, then the base image + will be resized to be the same size as top. If top is + smaller than the base image, the base will not be + truncated. If you want the base image size to match the + size of the smaller top, you can safely truncate it + yourself once the commit operation successfully completes. 
+ (json-string) +- "speed": the maximum speed, in bytes per second (json-int, optional) + + +Example: + +-> { "execute": "block-commit", "arguments": { "device": "virtio0", + "top": "/tmp/snap1.qcow2" } } +<- { "return": {} } + +EQMP + { .name = "drive-backup", .args_type = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?," From dabfa6cc2e2a06269026fcb42772894f67bd0c3e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 24 Jan 2014 14:00:43 +0100 Subject: [PATCH 64/93] block: Fix bdrv_commit return value bdrv_commit() could return 0 or 1 on success, depending on whether or not the last sector was allocated in the overlay and whether the overlay format had a .bdrv_make_empty callback. Most callers ignored it, but qemu-img commit would print an error message while the operation actually succeeded. Also clean up the handling of I/O errors to return the real error code instead of -EIO. Signed-off-by: Kevin Wolf Reviewed-by: Benoit Canet --- block.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/block.c b/block.c index 72bccb1ea0..2106ae9cee 100644 --- a/block.c +++ b/block.c @@ -2089,13 +2089,13 @@ int bdrv_commit(BlockDriverState *bs) goto ro_cleanup; } if (ret) { - if (bdrv_read(bs, sector, buf, n) != 0) { - ret = -EIO; + ret = bdrv_read(bs, sector, buf, n); + if (ret < 0) { goto ro_cleanup; } - if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) { - ret = -EIO; + ret = bdrv_write(bs->backing_hd, sector, buf, n); + if (ret < 0) { goto ro_cleanup; } } @@ -2103,6 +2103,9 @@ int bdrv_commit(BlockDriverState *bs) if (drv->bdrv_make_empty) { ret = drv->bdrv_make_empty(bs); + if (ret < 0) { + goto ro_cleanup; + } bdrv_flush(bs); } @@ -2110,9 +2113,11 @@ int bdrv_commit(BlockDriverState *bs) * Make sure all data we wrote to the backing device is actually * stable on disk. 
*/ - if (bs->backing_hd) + if (bs->backing_hd) { bdrv_flush(bs->backing_hd); + } + ret = 0; ro_cleanup: g_free(buf); From d34682cd4a06efe9ee3fc8cb7e8a0ea445299989 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 11 Dec 2013 19:26:16 +0100 Subject: [PATCH 65/93] block: Move initialisation of BlockLimits to bdrv_refresh_limits() This function separates filling the BlockLimits from bdrv_open(), which allows it to call it from other operations which may change the limits (e.g. modifications to the backing file chain or bdrv_reopen) Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 18 +++++++++++++++ block/iscsi.c | 46 ++++++++++++++++++++++++--------------- block/qcow2.c | 11 +++++++++- block/qed.c | 11 +++++++++- block/vmdk.c | 22 +++++++++++++++---- include/block/block_int.h | 2 ++ 6 files changed, 87 insertions(+), 23 deletions(-) diff --git a/block.c b/block.c index 2106ae9cee..0a3d12c328 100644 --- a/block.c +++ b/block.c @@ -483,6 +483,19 @@ int bdrv_create_file(const char* filename, QEMUOptionParameter *options, return ret; } +static int bdrv_refresh_limits(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (drv && drv->bdrv_refresh_limits) { + return drv->bdrv_refresh_limits(bs); + } + + return 0; +} + /* * Create a uniquely-named empty temporary file. * Return 0 upon success, otherwise a negative errno value. 
@@ -872,6 +885,8 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, goto free_and_fail; } + bdrv_refresh_limits(bs); + #ifndef _WIN32 if (bs->is_temporary) { assert(bs->filename[0] != '\0'); @@ -1085,6 +1100,9 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp) bs->backing_hd->file->filename); } + /* Recalculate the BlockLimits with the backing file */ + bdrv_refresh_limits(bs); + return 0; } diff --git a/block/iscsi.c b/block/iscsi.c index 76b3c96d38..c8bf8dc049 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1265,23 +1265,6 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, sizeof(struct scsi_inquiry_block_limits)); scsi_free_scsi_task(task); task = NULL; - - if (iscsilun->bl.max_unmap < 0xffffffff) { - bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, - iscsilun); - } - bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, - iscsilun); - - if (iscsilun->bl.max_ws_len < 0xffffffff) { - bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len, - iscsilun); - } - bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, - iscsilun); - - bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len, - iscsilun); } #if defined(LIBISCSI_FEATURE_NOP_COUNTER) @@ -1326,6 +1309,34 @@ static void iscsi_close(BlockDriverState *bs) memset(iscsilun, 0, sizeof(IscsiLun)); } +static int iscsi_refresh_limits(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + + /* We don't actually refresh here, but just return data queried in + * iscsi_open(): iscsi targets don't change their limits. 
*/ + if (iscsilun->lbp.lbpu || iscsilun->lbp.lbpws) { + if (iscsilun->bl.max_unmap < 0xffffffff) { + bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, + iscsilun); + } + bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, + iscsilun); + + if (iscsilun->bl.max_ws_len < 0xffffffff) { + bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len, + iscsilun); + } + bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, + iscsilun); + + bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len, + iscsilun); + } + + return 0; +} + static int iscsi_truncate(BlockDriverState *bs, int64_t offset) { IscsiLun *iscsilun = bs->opaque; @@ -1438,6 +1449,7 @@ static BlockDriver bdrv_iscsi = { .bdrv_getlength = iscsi_getlength, .bdrv_get_info = iscsi_get_info, .bdrv_truncate = iscsi_truncate, + .bdrv_refresh_limits = iscsi_refresh_limits, #if defined(LIBISCSI_FEATURE_IOVECTOR) .bdrv_co_get_block_status = iscsi_co_get_block_status, diff --git a/block/qcow2.c b/block/qcow2.c index e15a4dd057..2da62b8a90 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -718,7 +718,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, } qemu_opts_del(opts); - bs->bl.write_zeroes_alignment = s->cluster_sectors; if (s->use_lazy_refcounts && s->qcow_version < 3) { error_setg(errp, "Lazy refcounts require a qcow2 image with at least " @@ -751,6 +750,15 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, return ret; } +static int qcow2_refresh_limits(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->cluster_sectors; + + return 0; +} + static int qcow2_set_key(BlockDriverState *bs, const char *key) { BDRVQcowState *s = bs->opaque; @@ -2268,6 +2276,7 @@ static BlockDriver bdrv_qcow2 = { .bdrv_change_backing_file = qcow2_change_backing_file, + .bdrv_refresh_limits = qcow2_refresh_limits, .bdrv_invalidate_cache = qcow2_invalidate_cache, 
.create_options = qcow2_create_options, diff --git a/block/qed.c b/block/qed.c index 0dd5c5859e..694e6e2ee0 100644 --- a/block/qed.c +++ b/block/qed.c @@ -495,7 +495,6 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, } } - bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; s->need_check_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, qed_need_check_timer_cb, s); @@ -507,6 +506,15 @@ out: return ret; } +static int bdrv_qed_refresh_limits(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; + + return 0; +} + /* We have nothing to do for QED reopen, stubs just return * success */ static int bdrv_qed_reopen_prepare(BDRVReopenState *state, @@ -1616,6 +1624,7 @@ static BlockDriver bdrv_qed = { .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, .bdrv_get_info = bdrv_qed_get_info, + .bdrv_refresh_limits = bdrv_qed_refresh_limits, .bdrv_change_backing_file = bdrv_qed_change_backing_file, .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, .bdrv_check = bdrv_qed_check, diff --git a/block/vmdk.c b/block/vmdk.c index 67b5f96a19..99ca60fdb9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -428,10 +428,6 @@ static int vmdk_add_extent(BlockDriverState *bs, extent->l2_size = l2_size; extent->cluster_sectors = flat ? 
sectors : cluster_sectors; - if (!flat) { - bs->bl.write_zeroes_alignment = - MAX(bs->bl.write_zeroes_alignment, cluster_sectors); - } if (s->num_extents > 1) { extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; } else { @@ -902,6 +898,23 @@ fail: return ret; } + +static int vmdk_refresh_limits(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_extents; i++) { + if (!s->extents[i].flat) { + bs->bl.write_zeroes_alignment = + MAX(bs->bl.write_zeroes_alignment, + s->extents[i].cluster_sectors); + } + } + + return 0; +} + static int get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, uint64_t cluster_offset, @@ -2013,6 +2026,7 @@ static BlockDriver bdrv_vmdk = { .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, .bdrv_has_zero_init = vmdk_has_zero_init, .bdrv_get_specific_info = vmdk_get_specific_info, + .bdrv_refresh_limits = vmdk_refresh_limits, .create_options = vmdk_create_options, }; diff --git a/include/block/block_int.h b/include/block/block_int.h index 611a955712..f6fa1f6f36 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -232,6 +232,8 @@ struct BlockDriver { int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag); bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag); + int (*bdrv_refresh_limits)(BlockDriverState *bs); + /* * Returns 1 if newly created images are guaranteed to contain only * zeros, 0 otherwise. From 466ad822deef3a03757d505218a52993c5d56b5d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 11 Dec 2013 19:50:32 +0100 Subject: [PATCH 66/93] block: Inherit opt_transfer_length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When there is a format driver between the backend, it's not guaranteed that exposing the opt_transfer_length for the format driver results in the optimal requests (because of fragmentation etc.), but it can't make things worse, so let's just do it. 
Signed-off-by: Kevin Wolf Reviewed-by: Wenchao Xia Reviewed-by: Max Reitz Reviewed-by: Benoît Canet --- block.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 0a3d12c328..8a6692719a 100644 --- a/block.c +++ b/block.c @@ -489,7 +489,25 @@ static int bdrv_refresh_limits(BlockDriverState *bs) memset(&bs->bl, 0, sizeof(bs->bl)); - if (drv && drv->bdrv_refresh_limits) { + if (!drv) { + return 0; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file); + bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + } + + if (bs->backing_hd) { + bdrv_refresh_limits(bs->backing_hd); + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing_hd->bl.opt_transfer_length); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { return drv->bdrv_refresh_limits(bs); } From 355ef4ac95a7a47d5c7201ccd910056a100d2fdf Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 11 Dec 2013 20:14:09 +0100 Subject: [PATCH 67/93] block: Update BlockLimits when they might have changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When reopening with different flags, or when backing files disappear from the chain, the limits may change. Make sure they get updated in these cases. 
Signed-off-by: Kevin Wolf Reviewed-by: Wenchao Xia Reviewed-by: Max Reitz Reviewed-by: Benoît Canet --- block.c | 5 ++++- block/stream.c | 2 ++ include/block/block.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 8a6692719a..9d0cfc46da 100644 --- a/block.c +++ b/block.c @@ -483,7 +483,7 @@ int bdrv_create_file(const char* filename, QEMUOptionParameter *options, return ret; } -static int bdrv_refresh_limits(BlockDriverState *bs) +int bdrv_refresh_limits(BlockDriverState *bs) { BlockDriver *drv = bs->drv; @@ -1607,6 +1607,8 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state) reopen_state->bs->enable_write_cache = !!(reopen_state->flags & BDRV_O_CACHE_WB); reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR); + + bdrv_refresh_limits(reopen_state->bs); } /* @@ -2441,6 +2443,7 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, } new_top_bs->backing_hd = base_bs; + bdrv_refresh_limits(new_top_bs); QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { /* so that bdrv_close() does not recursively close the chain */ diff --git a/block/stream.c b/block/stream.c index 46bec7d379..dd0b4ac3d2 100644 --- a/block/stream.c +++ b/block/stream.c @@ -75,6 +75,8 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, unused->backing_hd = NULL; bdrv_unref(unused); } + + bdrv_refresh_limits(top); } static void coroutine_fn stream_run(void *opaque) diff --git a/include/block/block.h b/include/block/block.h index 59d9f12ce4..00a8790660 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -253,6 +253,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset); int64_t bdrv_getlength(BlockDriverState *bs); int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr); +int bdrv_refresh_limits(BlockDriverState *bs); int bdrv_commit(BlockDriverState *bs); int 
bdrv_commit_all(void); int bdrv_change_backing_file(BlockDriverState *bs, From e5354657a626b325c31888f33de88ac6d39e2fcb Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 29 Nov 2013 21:29:17 +0100 Subject: [PATCH 68/93] qemu_memalign: Allow small alignments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions used by qemu_memalign() require an alignment that is at least sizeof(void*). Adjust it if it is too small. Signed-off-by: Kevin Wolf Reviewed-by: Wenchao Xia Reviewed-by: Max Reitz Reviewed-by: Benoît Canet --- util/oslib-posix.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/util/oslib-posix.c b/util/oslib-posix.c index f5c401646f..d5dca4729a 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -88,6 +88,11 @@ void *qemu_oom_check(void *ptr) void *qemu_memalign(size_t alignment, size_t size) { void *ptr; + + if (alignment < sizeof(void*)) { + alignment = sizeof(void*); + } + #if defined(_POSIX_C_SOURCE) && !defined(__sun__) int ret; ret = posix_memalign(&ptr, alignment, size); From 1ff735bdc417945bc6df1857861b127644b3f461 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 5 Dec 2013 13:01:46 +0100 Subject: [PATCH 69/93] block: Detect unaligned length in bdrv_qiov_is_aligned() For an O_DIRECT request to succeed, it's not only necessary that all base addresses in the qiov are aligned, but also that each length in it is aligned. 
Signed-off-by: Kevin Wolf Reviewed-by: Wenchao Xia Reviewed-by: Max Reitz --- block.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block.c b/block.c index 9d0cfc46da..275d387354 100644 --- a/block.c +++ b/block.c @@ -4811,6 +4811,9 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) { return false; } + if (qiov->iov[i].iov_len % bs->buffer_alignment) { + return false; + } } return true; From 339064d5063924e5176842abbf6c8089f3479c5b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 28 Nov 2013 10:23:32 +0100 Subject: [PATCH 70/93] block: Don't use guest sector size for qemu_blockalign() bs->buffer_alignment is set by the device emulation and contains the logical block size of the guest device. This isn't something that the block layer should know, and even less something to use for determining the right alignment of buffers to be used for the host. The new BlockLimits field opt_mem_alignment tells the qemu block layer the optimal alignment to be used so that no bounce buffer must be used in the driver. This patch may change the buffer alignment from 4k to 512 for all callers that used qemu_blockalign() with the top-level image format BlockDriverState. The value was never propagated to other levels in the tree, so in particular raw-posix never required anything else than 512. While on disks with 4k sectors direct I/O requires a 4k alignment, memory may still be okay when aligned to 512 byte boundaries. This is what must have happened in practice, because otherwise this would already have failed earlier. Therefore I don't expect regressions even with this intermediate state. Later, raw-posix can implement the hook and expose a different memory alignment requirement. 
Signed-off-by: Kevin Wolf Reviewed-by: Wenchao Xia Reviewed-by: Max Reitz --- block.c | 23 ++++++++++++++++++++--- include/block/block.h | 3 +++ include/block/block_int.h | 3 +++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index 275d387354..e5f214d135 100644 --- a/block.c +++ b/block.c @@ -218,6 +218,16 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs, qemu_co_queue_next(&bs->throttled_reqs[is_write]); } +size_t bdrv_opt_mem_align(BlockDriverState *bs) +{ + if (!bs || !bs->drv) { + /* 4k should be on the safe side */ + return 4096; + } + + return bs->bl.opt_mem_alignment; +} + /* check if the path starts with ":" */ static int path_has_protocol(const char *path) { @@ -497,6 +507,9 @@ int bdrv_refresh_limits(BlockDriverState *bs) if (bs->file) { bdrv_refresh_limits(bs->file); bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + } else { + bs->bl.opt_mem_alignment = 512; } if (bs->backing_hd) { @@ -504,6 +517,9 @@ int bdrv_refresh_limits(BlockDriverState *bs) bs->bl.opt_transfer_length = MAX(bs->bl.opt_transfer_length, bs->backing_hd->bl.opt_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing_hd->bl.opt_mem_alignment); } /* Then let the driver override it */ @@ -4797,7 +4813,7 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int align) void *qemu_blockalign(BlockDriverState *bs, size_t size) { - return qemu_memalign((bs && bs->buffer_alignment) ? 
bs->buffer_alignment : 512, size); + return qemu_memalign(bdrv_opt_mem_align(bs), size); } /* @@ -4806,12 +4822,13 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size) bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) { int i; + size_t alignment = bdrv_opt_mem_align(bs); for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { return false; } - if (qiov->iov[i].iov_len % bs->buffer_alignment) { + if (qiov->iov[i].iov_len % alignment) { return false; } } diff --git a/include/block/block.h b/include/block/block.h index 00a8790660..332ebb94fb 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -428,6 +428,9 @@ void bdrv_img_create(const char *filename, const char *fmt, char *options, uint64_t img_size, int flags, Error **errp, bool quiet); +/* Returns the alignment in bytes that is required so that no bounce buffer + * is required throughout the stack */ +size_t bdrv_opt_mem_align(BlockDriverState *bs); void bdrv_set_buffer_alignment(BlockDriverState *bs, int align); void *qemu_blockalign(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); diff --git a/include/block/block_int.h b/include/block/block_int.h index f6fa1f6f36..74a78a6831 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -258,6 +258,9 @@ typedef struct BlockLimits { /* optimal transfer length in sectors */ int opt_transfer_length; + + /* memory alignment so that no bounce buffer is needed */ + size_t opt_mem_alignment; } BlockLimits; /* From 1b7fd729559c6d3b273303aa48bc653ceef08747 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Nov 2011 11:35:47 +0100 Subject: [PATCH 71/93] block: rename buffer_alignment to guest_block_size The alignment field is now set to the value that is promised to the guest, rather than required by the host. 
The next patches will make QEMU aware of the host-provided values, so make this clear. The alignment is also not about memory buffers, but about the sectors on the disk, change the documentation of the field. At this point, the field is set by the device emulation, but completely ignored by the block layer. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf Reviewed-by: Wenchao Xia Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 10 +++++----- hw/block/virtio-blk.c | 2 +- hw/ide/core.c | 2 +- hw/scsi/scsi-disk.c | 2 +- hw/scsi/scsi-generic.c | 2 +- include/block/block.h | 2 +- include/block/block_int.h | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/block.c b/block.c index e5f214d135..195beff706 100644 --- a/block.c +++ b/block.c @@ -851,7 +851,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, } bs->open_flags = flags; - bs->buffer_alignment = 512; + bs->guest_block_size = 512; bs->zero_beyond_eof = true; open_flags = bdrv_open_flags(bs, flags); bs->read_only = !(open_flags & BDRV_O_RDWR); @@ -1796,7 +1796,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, bs_dest->dev_ops = bs_src->dev_ops; bs_dest->dev_opaque = bs_src->dev_opaque; bs_dest->dev = bs_src->dev; - bs_dest->buffer_alignment = bs_src->buffer_alignment; + bs_dest->guest_block_size = bs_src->guest_block_size; bs_dest->copy_on_read = bs_src->copy_on_read; bs_dest->enable_write_cache = bs_src->enable_write_cache; @@ -1953,7 +1953,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev) bs->dev = NULL; bs->dev_ops = NULL; bs->dev_opaque = NULL; - bs->buffer_alignment = 512; + bs->guest_block_size = 512; } /* TODO change to return DeviceState * when all users are qdevified */ @@ -4806,9 +4806,9 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, return NULL; } -void bdrv_set_buffer_alignment(BlockDriverState *bs, int align) +void bdrv_set_guest_block_size(BlockDriverState *bs, int align) { - bs->buffer_alignment 
= align; + bs->guest_block_size = align; } void *qemu_blockalign(BlockDriverState *bs, size_t size) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 19d0961a47..8a568e5edb 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -731,7 +731,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) register_savevm(dev, "virtio-blk", virtio_blk_id++, 2, virtio_blk_save, virtio_blk_load, s); bdrv_set_dev_ops(s->bs, &virtio_block_ops, s); - bdrv_set_buffer_alignment(s->bs, s->conf->logical_block_size); + bdrv_set_guest_block_size(s->bs, s->conf->logical_block_size); bdrv_iostatus_enable(s->bs); diff --git a/hw/ide/core.c b/hw/ide/core.c index e1f4c33fb8..036cd4a6d1 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -2103,7 +2103,7 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind, s->smart_selftest_count = 0; if (kind == IDE_CD) { bdrv_set_dev_ops(bs, &ide_cd_block_ops, s); - bdrv_set_buffer_alignment(bs, 2048); + bdrv_set_guest_block_size(bs, 2048); } else { if (!bdrv_is_inserted(s->bs)) { error_report("Device needs media, but drive is empty"); diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index bce617cb93..649109150b 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -2254,7 +2254,7 @@ static int scsi_initfn(SCSIDevice *dev) } else { bdrv_set_dev_ops(s->qdev.conf.bs, &scsi_disk_block_ops, s); } - bdrv_set_buffer_alignment(s->qdev.conf.bs, s->qdev.blocksize); + bdrv_set_guest_block_size(s->qdev.conf.bs, s->qdev.blocksize); bdrv_iostatus_enable(s->qdev.conf.bs); add_boot_device_path(s->qdev.conf.bootindex, &dev->qdev, NULL); diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c index 8f195bec00..f08b64e177 100644 --- a/hw/scsi/scsi-generic.c +++ b/hw/scsi/scsi-generic.c @@ -210,7 +210,7 @@ static void scsi_read_complete(void * opaque, int ret) s->blocksize = ldl_be_p(&r->buf[8]); s->max_lba = ldq_be_p(&r->buf[0]); } - bdrv_set_buffer_alignment(s->conf.bs, s->blocksize); + 
bdrv_set_guest_block_size(s->conf.bs, s->blocksize); scsi_req_data(&r->req, len); if (!r->req.io_canceled) { diff --git a/include/block/block.h b/include/block/block.h index 332ebb94fb..a2f5657321 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -431,7 +431,7 @@ void bdrv_img_create(const char *filename, const char *fmt, /* Returns the alignment in bytes that is required so that no bounce buffer * is required throughout the stack */ size_t bdrv_opt_mem_align(BlockDriverState *bs); -void bdrv_set_buffer_alignment(BlockDriverState *bs, int align); +void bdrv_set_guest_block_size(BlockDriverState *bs, int align); void *qemu_blockalign(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); diff --git a/include/block/block_int.h b/include/block/block_int.h index 74a78a6831..ae609bd4d6 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -325,8 +325,8 @@ struct BlockDriverState { /* Whether produces zeros when read beyond eof */ bool zero_beyond_eof; - /* the memory alignment required for the buffers handled by this driver */ - int buffer_alignment; + /* the block size for which the guest device expects atomicity */ + int guest_block_size; /* do we need to tell the quest if we have a volatile write cache? */ int enable_write_cache; From c25f53b06eba1575d5d0e92a0132455c97825b83 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Nov 2011 12:42:20 +0100 Subject: [PATCH 72/93] raw: Probe required direct I/O alignment Add a bs->request_alignment field that contains the required offset/length alignment for I/O requests and fill it in the raw block drivers. Use ioctls if possible, else see what alignment it takes for O_DIRECT to succeed. While at it, also expose the memory alignment requirements, which may be (and in practice are) different from the disk alignment requirements. 
Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block.c | 3 ++ block/raw-posix.c | 102 +++++++++++++++++++++++++++++++------- block/raw-win32.c | 41 +++++++++++++++ include/block/block_int.h | 3 ++ 4 files changed, 132 insertions(+), 17 deletions(-) diff --git a/block.c b/block.c index 195beff706..d4dd7fe172 100644 --- a/block.c +++ b/block.c @@ -852,6 +852,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, bs->open_flags = flags; bs->guest_block_size = 512; + bs->request_alignment = 512; bs->zero_beyond_eof = true; open_flags = bdrv_open_flags(bs, flags); bs->read_only = !(open_flags & BDRV_O_RDWR); @@ -920,6 +921,8 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, } bdrv_refresh_limits(bs); + assert(bdrv_opt_mem_align(bs) != 0); + assert(bs->request_alignment != 0); #ifndef _WIN32 if (bs->is_temporary) { diff --git a/block/raw-posix.c b/block/raw-posix.c index 0676037e13..126a634e45 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -127,6 +127,8 @@ typedef struct BDRVRawState { int fd; int type; int open_flags; + size_t buf_align; + #if defined(__linux__) /* linux floppy specific */ int64_t fd_open_time; @@ -213,6 +215,76 @@ static int raw_normalize_devicepath(const char **filename) } #endif +static void raw_probe_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + char *buf; + unsigned int sector_size; + + /* For /dev/sg devices the alignment is not really used. + With buffered I/O, we don't have any restrictions. 
*/ + if (bs->sg || !(s->open_flags & O_DIRECT)) { + bs->request_alignment = 1; + s->buf_align = 1; + return; + } + + /* Try a few ioctls to get the right size */ + bs->request_alignment = 0; + s->buf_align = 0; + +#ifdef BLKSSZGET + if (ioctl(s->fd, BLKSSZGET, &sector_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef DKIOCGETBLOCKSIZE + if (ioctl(s->fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef DIOCGSECTORSIZE + if (ioctl(s->fd, DIOCGSECTORSIZE, &sector_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef CONFIG_XFS + if (s->is_xfs) { + struct dioattr da; + if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) { + bs->request_alignment = da.d_miniosz; + /* The kernel returns wrong information for d_mem */ + /* s->buf_align = da.d_mem; */ + } + } +#endif + + /* If we could not get the sizes so far, we can only guess them */ + if (!s->buf_align) { + size_t align; + buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); + for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { + s->buf_align = align; + break; + } + } + qemu_vfree(buf); + } + + if (!bs->request_alignment) { + size_t align; + buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); + for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + if (pread(s->fd, buf, align, 0) >= 0) { + bs->request_alignment = align; + break; + } + } + qemu_vfree(buf); + } +} + static void raw_parse_flags(int bdrv_flags, int *open_flags) { assert(open_flags != NULL); @@ -463,7 +535,6 @@ static int raw_reopen_prepare(BDRVReopenState *state, return ret; } - static void raw_reopen_commit(BDRVReopenState *state) { BDRVRawReopenState *raw_s = state->opaque; @@ -499,23 +570,15 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +static int raw_refresh_limits(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; -/* XXX: use host sector size if necessary with:
-#ifdef DIOCGSECTORSIZE - { - unsigned int sectorsize = 512; - if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) && - sectorsize > bufsize) - bufsize = sectorsize; - } -#endif -#ifdef CONFIG_COCOA - uint32_t blockSize = 512; - if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) { - bufsize = blockSize; - } -#endif -*/ + raw_probe_alignment(bs); + bs->bl.opt_mem_alignment = s->buf_align; + + return 0; +} static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) { @@ -1363,6 +1426,7 @@ static BlockDriver bdrv_file = { .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = raw_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1740,6 +1804,7 @@ static BlockDriver bdrv_host_device = { .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = hdev_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1871,6 +1936,7 @@ static BlockDriver bdrv_host_floppy = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1981,6 +2047,7 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -2110,6 +2177,7 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, diff --git a/block/raw-win32.c b/block/raw-win32.c index ce314fd54f..beb7f2395e 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -202,6 +202,35 @@
static int set_sparse(int fd) NULL, 0, NULL, 0, &returned, NULL); } +static void raw_probe_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + DWORD sectorsPerCluster, freeClusters, totalClusters, count; + DISK_GEOMETRY_EX dg; + BOOL status; + + if (s->type == FTYPE_CD) { + bs->request_alignment = 2048; + return; + } + if (s->type == FTYPE_HARDDISK) { + status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, &dg, sizeof(dg), &count, NULL); + if (status != 0) { + bs->request_alignment = dg.Geometry.BytesPerSector; + return; + } + /* try GetDiskFreeSpace too */ + } + + if (s->drive_path[0]) { + GetDiskFreeSpace(s->drive_path, &sectorsPerCluster, + &dg.Geometry.BytesPerSector, + &freeClusters, &totalClusters); + bs->request_alignment = dg.Geometry.BytesPerSector; + } +} + static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) { assert(access_flags != NULL); @@ -269,6 +298,17 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, } } + if (filename[0] && filename[1] == ':') { + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]); + } else if (filename[0] == '\\' && filename[1] == '\\') { + s->drive_path[0] = 0; + } else { + /* Relative path.
*/ + char buf[MAX_PATH]; + GetCurrentDirectory(MAX_PATH, buf); + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]); + } + s->hfile = CreateFile(filename, access_flags, FILE_SHARE_READ, NULL, OPEN_EXISTING, overlapped, NULL); @@ -293,6 +333,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, s->aio = aio; } + raw_probe_alignment(bs); ret = 0; fail: qemu_opts_del(opts); diff --git a/include/block/block_int.h b/include/block/block_int.h index ae609bd4d6..5f9cecd65a 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -325,6 +325,9 @@ struct BlockDriverState { /* Whether produces zeros when read beyond eof */ bool zero_beyond_eof; + /* Alignment requirement for offset/length of I/O requests */ + unsigned int request_alignment; + /* the block size for which the guest device expects atomicity */ int guest_block_size; From d0c7f642f5eb2cb21d0c3acf766cb375eaaf4666 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 2 Dec 2013 15:07:48 +0100 Subject: [PATCH 73/93] block: Introduce bdrv_aligned_preadv() This separates the part of bdrv_co_do_readv() that needs to happen before the request is modified to match the backend alignment, and a part that needs to be executed afterwards and passes the request to the BlockDriver. Signed-off-by: Kevin Wolf Reviewed-by: Wenchao Xia Reviewed-by: Max Reitz --- block.c | 61 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/block.c b/block.c index d4dd7fe172..701e95afb2 100644 --- a/block.c +++ b/block.c @@ -2885,26 +2885,24 @@ err: } /* - * Handle a read request in coroutine context + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. 
*/ -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; BdrvTrackedRequest req; int ret; - if (!drv) { - return -ENOMEDIUM; - } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { - return -EIO; - } + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - if (bs->copy_on_read) { - flags |= BDRV_REQ_COPY_ON_READ; - } + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { bs->copy_on_read_in_flight++; } @@ -2913,11 +2911,6 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, wait_for_overlapping_requests(bs, sector_num, nb_sectors); } - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, nb_sectors, false); - } - tracked_request_begin(&req, bs, sector_num, nb_sectors, false); if (flags & BDRV_REQ_COPY_ON_READ) { @@ -2934,6 +2927,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } } + /* Forward the request to the BlockDriver */ if (!(bs->zero_beyond_eof && bs->growable)) { ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); } else { @@ -2974,6 +2968,37 @@ out: return ret; } +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (bdrv_check_request(bs, sector_num, nb_sectors)) { + return -EIO; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if 
(bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, nb_sectors, false); + } + + ret = bdrv_aligned_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return ret; +} + int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { From 1b0288ae7fc695a8e652973f75e92464bbc13416 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 2 Dec 2013 16:09:46 +0100 Subject: [PATCH 74/93] block: Introduce bdrv_co_do_preadv() Similar to bdrv_pread(), which aligns byte-aligned request to 512 byte sectors, bdrv_co_do_preadv() takes a byte-aligned request and aligns it to the alignment specified in bs->request_alignment. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/block.c b/block.c index 701e95afb2..fa4feb08d1 100644 --- a/block.c +++ b/block.c @@ -2971,17 +2971,23 @@ out: /* * Handle a read request in coroutine context */ -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; int ret; if (!drv) { return -ENOMEDIUM; } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { + if (bdrv_check_byte_request(bs, offset, bytes)) { return -EIO; } @@ -2991,14 +2997,60 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, /* throttling disk I/O */ if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, nb_sectors, false); + /* TODO 
Switch to byte granularity */ + bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + ret = bdrv_aligned_preadv(bs, offset, bytes, + use_local_qiov ? &local_qiov : qiov, + flags); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); } - ret = bdrv_aligned_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); return ret; } +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { From b404f72036716ab8ace04b83a8f0a93be4739a6a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 3 Dec 2013 14:02:23 +0100 Subject: [PATCH 75/93] block: Introduce bdrv_aligned_pwritev() This separates the part of bdrv_co_do_writev() that needs to happen before the request is modified to match the backend alignment, and a part that needs 
to be executed afterwards and passes the request to the BlockDriver. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 62 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/block.c b/block.c index fa4feb08d1..d9c472b2f8 100644 --- a/block.c +++ b/block.c @@ -3144,34 +3144,20 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, } /* - * Handle a write request in coroutine context + * Forwards an already correctly aligned write request to the BlockDriver. */ -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; BdrvTrackedRequest req; int ret; - if (!bs->drv) { - return -ENOMEDIUM; - } - if (bs->read_only) { - return -EACCES; - } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { - return -EIO; - } + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, nb_sectors, true); - } + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); tracked_request_begin(&req, bs, sector_num, nb_sectors, true); @@ -3203,6 +3189,40 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, return ret; } +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + 
if (bdrv_check_request(bs, sector_num, nb_sectors)) { + return -EIO; + } + + if (bs->copy_on_read_in_flight) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, nb_sectors, true); + } + + ret = bdrv_aligned_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + + return ret; +} + int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { From 244eadef5c797c674b0aef96366671be4b33d03a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 3 Dec 2013 14:30:44 +0100 Subject: [PATCH 76/93] block: write: Handle COR dependency after I/O throttling First waiting for all COR requests to complete and calling the throttling function afterwards means that the request could be delayed and we still need to wait for the COR request even if it was issued only after the throttled write request. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block.c b/block.c index d9c472b2f8..20a38533ec 100644 --- a/block.c +++ b/block.c @@ -3159,6 +3159,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + if (bs->copy_on_read_in_flight) { + wait_for_overlapping_requests(bs, sector_num, nb_sectors); + } + tracked_request_begin(&req, bs, sector_num, nb_sectors, true); ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); @@ -3208,10 +3212,6 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, return -EIO; } - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); - } - /* throttling disk I/O */ if (bs->io_limits_enabled) { bdrv_io_limits_intercept(bs, nb_sectors, true); From 
6601553e27091ffe240bea69227adce941fe12e8 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 3 Dec 2013 14:40:18 +0100 Subject: [PATCH 77/93] block: Introduce bdrv_co_do_pwritev() This is going to become the bdrv_co_do_preadv() equivalent for writes. In this patch, however, just a function taking byte offsets is created, it doesn't align anything yet. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/block.c b/block.c index 20a38533ec..576859ea3e 100644 --- a/block.c +++ b/block.c @@ -3196,8 +3196,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, /* * Handle a write request in coroutine context */ -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { int ret; @@ -3208,21 +3208,33 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, if (bs->read_only) { return -EACCES; } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { + if (bdrv_check_byte_request(bs, offset, bytes)) { return -EIO; } /* throttling disk I/O */ if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, nb_sectors, true); + /* TODO Switch to byte granularity */ + bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true); } - ret = bdrv_aligned_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + ret = bdrv_aligned_pwritev(bs, offset, bytes, qiov, flags); return ret; } +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, 
+ nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { From 793ed47a7a2b09b67cb2a8863dff531436532b5c Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 3 Dec 2013 15:31:25 +0100 Subject: [PATCH 78/93] block: Switch BdrvTrackedRequest to byte granularity Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 52 +++++++++++++++++++++++++-------------- block/backup.c | 7 +++++- include/block/block_int.h | 4 +-- 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/block.c b/block.c index 576859ea3e..a32c81d644 100644 --- a/block.c +++ b/block.c @@ -2217,13 +2217,13 @@ static void tracked_request_end(BdrvTrackedRequest *req) */ static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, bool is_write) + int64_t offset, + unsigned int bytes, bool is_write) { *req = (BdrvTrackedRequest){ .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, + .offset = offset, + .bytes = bytes, .is_write = is_write, .co = qemu_coroutine_self(), }; @@ -2254,25 +2254,43 @@ void bdrv_round_to_clusters(BlockDriverState *bs, } } +static void round_bytes_to_clusters(BlockDriverState *bs, + int64_t offset, unsigned int bytes, + int64_t *cluster_offset, + unsigned int *cluster_bytes) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_offset = offset; + *cluster_bytes = bytes; + } else { + *cluster_offset = QEMU_ALIGN_DOWN(offset, bdi.cluster_size); + *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, + bdi.cluster_size); + } +} + static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t sector_num, int nb_sectors) { + int64_t offset, unsigned int bytes) +{ /* aaaa bbbb */ - if (sector_num >= req->sector_num + req->nb_sectors) { + if (offset >= req->offset + req->bytes) { return false; } /* bbbb 
aaaa */ - if (req->sector_num >= sector_num + nb_sectors) { + if (req->offset >= offset + bytes) { return false; } return true; } static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) + int64_t offset, unsigned int bytes) { BdrvTrackedRequest *req; - int64_t cluster_sector_num; - int cluster_nb_sectors; + int64_t cluster_offset; + unsigned int cluster_bytes; bool retry; /* If we touch the same cluster it counts as an overlap. This guarantees @@ -2281,14 +2299,12 @@ static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, * CoR read and write operations are atomic and guest writes cannot * interleave between them. */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + round_bytes_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); do { retry = false; QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (tracked_request_overlaps(req, cluster_sector_num, - cluster_nb_sectors)) { + if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) { /* Hitting this means there was a reentrant request, for * example, a block driver issuing nested requests. This must * never happen since it means deadlock. 
@@ -2908,10 +2924,10 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); + wait_for_overlapping_requests(bs, offset, bytes); } - tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + tracked_request_begin(&req, bs, offset, bytes, false); if (flags & BDRV_REQ_COPY_ON_READ) { int pnum; @@ -3160,10 +3176,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); + wait_for_overlapping_requests(bs, offset, bytes); } - tracked_request_begin(&req, bs, sector_num, nb_sectors, true); + tracked_request_begin(&req, bs, offset, bytes, true); ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); diff --git a/block/backup.c b/block/backup.c index 0198514043..15a2e55e8e 100644 --- a/block/backup.c +++ b/block/backup.c @@ -181,8 +181,13 @@ static int coroutine_fn backup_before_write_notify( void *opaque) { BdrvTrackedRequest *req = opaque; + int64_t sector_num = req->offset >> BDRV_SECTOR_BITS; + int nb_sectors = req->bytes >> BDRV_SECTOR_BITS; - return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL); + assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + return backup_do_cow(req->bs, sector_num, nb_sectors, NULL); } static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) diff --git a/include/block/block_int.h b/include/block/block_int.h index 5f9cecd65a..bcdd98c503 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -57,8 +57,8 @@ typedef struct BdrvTrackedRequest { BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; + int64_t offset; + unsigned int bytes; bool is_write; QLIST_ENTRY(BdrvTrackedRequest) list; Coroutine *co; /* owner, used for deadlock detection 
*/ From 65afd211c71fc91750d8a18f9604c1e57a5202fb Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 3 Dec 2013 14:55:55 +0100 Subject: [PATCH 79/93] block: Allow waiting for overlapping requests between begin/end Previously, it was not possible to use wait_for_overlapping_requests() between tracked_request_begin()/end() because it would wait for itself. Ignore the current request in the overlap check and run more of the bdrv_co_do_preadv/pwritev code with a BdrvTrackedRequest present. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/block.c b/block.c index a32c81d644..1591649731 100644 --- a/block.c +++ b/block.c @@ -2286,7 +2286,7 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req, } static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, - int64_t offset, unsigned int bytes) + BdrvTrackedRequest *self, int64_t offset, unsigned int bytes) { BdrvTrackedRequest *req; int64_t cluster_offset; @@ -2304,6 +2304,9 @@ static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, do { retry = false; QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (req == self) { + continue; + } if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) { /* Hitting this means there was a reentrant request, for * example, a block driver issuing nested requests. This must @@ -2906,10 +2909,10 @@ err: * implemented by the caller. 
*/ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags) + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; int ret; int64_t sector_num = offset >> BDRV_SECTOR_BITS; @@ -2924,11 +2927,9 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, offset, bytes); + wait_for_overlapping_requests(bs, req, offset, bytes); } - tracked_request_begin(&req, bs, offset, bytes, false); - if (flags & BDRV_REQ_COPY_ON_READ) { int pnum; @@ -2975,8 +2976,6 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } out: - tracked_request_end(&req); - if (flags & BDRV_REQ_COPY_ON_READ) { bs->copy_on_read_in_flight--; } @@ -2992,6 +2991,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); uint8_t *head_buf = NULL; @@ -3042,9 +3043,11 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, bytes = ROUND_UP(bytes, align); } - ret = bdrv_aligned_preadv(bs, offset, bytes, + tracked_request_begin(&req, bs, offset, bytes, false); + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, use_local_qiov ? &local_qiov : qiov, flags); + tracked_request_end(&req); if (use_local_qiov) { qemu_iovec_destroy(&local_qiov); @@ -3163,10 +3166,10 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, * Forwards an already correctly aligned write request to the BlockDriver. 
*/ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags) + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; int ret; int64_t sector_num = offset >> BDRV_SECTOR_BITS; @@ -3176,12 +3179,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, offset, bytes); + wait_for_overlapping_requests(bs, req, offset, bytes); } - tracked_request_begin(&req, bs, offset, bytes, true); - - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (ret < 0) { /* Do nothing, write notifier decided to fail this request */ @@ -3204,8 +3205,6 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); } - tracked_request_end(&req); - return ret; } @@ -3216,6 +3215,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { + BdrvTrackedRequest req; int ret; if (!bs->drv) { @@ -3234,7 +3234,9 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true); } - ret = bdrv_aligned_pwritev(bs, offset, bytes, qiov, flags); + tracked_request_begin(&req, bs, offset, bytes, true); + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, qiov, flags); + tracked_request_end(&req); return ret; } From ec746e10cb2e6276a8d2e036454792fe0674864a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 4 Dec 2013 12:13:10 +0100 Subject: [PATCH 80/93] block: Make zero-after-EOF work with larger alignment Odd file sizes could make bdrv_aligned_preadv() shorten the request in 
non-aligned ways. Fix it by rounding to the required alignment instead of 512 bytes. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index 1591649731..0a391bd80f 100644 --- a/block.c +++ b/block.c @@ -2910,7 +2910,7 @@ err: */ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) + int64_t align, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; int ret; @@ -2958,7 +2958,8 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE); - max_nb_sectors = MAX(0, total_sectors - sector_num); + max_nb_sectors = MAX(0, ROUND_UP(total_sectors - sector_num, + align >> BDRV_SECTOR_BITS)); if (max_nb_sectors > 0) { ret = drv->bdrv_co_readv(bs, sector_num, MIN(nb_sectors, max_nb_sectors), qiov); @@ -3044,7 +3045,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, } tracked_request_begin(&req, bs, offset, bytes, false); - ret = bdrv_aligned_preadv(bs, &req, offset, bytes, + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, use_local_qiov ? &local_qiov : qiov, flags); tracked_request_end(&req); From 2dbafdc012d3ea81a97fec6226ca82d644539c9a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 4 Dec 2013 16:43:44 +0100 Subject: [PATCH 81/93] block: Generalise and optimise COR serialisation Change the API so that specific requests can be marked serialising. Only these requests are checked for overlaps then. This means that during a Copy on Read operation, not all requests overlapping other requests are serialised any more, but only those that actually overlap with the specific COR request. Also remove COR from function and variable names because this functionality can be useful in other contexts. 
Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 48 +++++++++++++++++++++++---------------- include/block/block_int.h | 5 ++-- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/block.c b/block.c index 0a391bd80f..493c75f33e 100644 --- a/block.c +++ b/block.c @@ -2208,6 +2208,10 @@ int bdrv_commit_all(void) */ static void tracked_request_end(BdrvTrackedRequest *req) { + if (req->serialising) { + req->bs->serialising_in_flight--; + } + QLIST_REMOVE(req, list); qemu_co_queue_restart_all(&req->wait_queue); } @@ -2222,10 +2226,11 @@ static void tracked_request_begin(BdrvTrackedRequest *req, { *req = (BdrvTrackedRequest){ .bs = bs, - .offset = offset, - .bytes = bytes, - .is_write = is_write, - .co = qemu_coroutine_self(), + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, }; qemu_co_queue_init(&req->wait_queue); @@ -2233,6 +2238,14 @@ static void tracked_request_begin(BdrvTrackedRequest *req, QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); } +static void mark_request_serialising(BdrvTrackedRequest *req) +{ + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } +} + /** * Round a region to cluster boundaries */ @@ -2285,26 +2298,31 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req, return true; } -static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, - BdrvTrackedRequest *self, int64_t offset, unsigned int bytes) +static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) { + BlockDriverState *bs = self->bs; BdrvTrackedRequest *req; int64_t cluster_offset; unsigned int cluster_bytes; bool retry; + if (!bs->serialising_in_flight) { + return; + } + /* If we touch the same cluster it counts as an overlap. This guarantees * that allocating writes will be serialized and not race with each other * for the same cluster. 
For example, in copy-on-read it ensures that the * CoR read and write operations are atomic and guest writes cannot * interleave between them. */ - round_bytes_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); + round_bytes_to_clusters(bs, self->offset, self->bytes, + &cluster_offset, &cluster_bytes); do { retry = false; QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (req == self) { + if (req == self || (!req->serialising && !self->serialising)) { continue; } if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) { @@ -2923,12 +2941,10 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { - bs->copy_on_read_in_flight++; + mark_request_serialising(req); } - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, req, offset, bytes); - } + wait_serialising_requests(req); if (flags & BDRV_REQ_COPY_ON_READ) { int pnum; @@ -2977,10 +2993,6 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, } out: - if (flags & BDRV_REQ_COPY_ON_READ) { - bs->copy_on_read_in_flight--; - } - return ret; } @@ -3179,9 +3191,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, req, offset, bytes); - } + wait_serialising_requests(req); ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); diff --git a/include/block/block_int.h b/include/block/block_int.h index bcdd98c503..c1153cb3ab 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -60,6 +60,7 @@ typedef struct BdrvTrackedRequest { int64_t offset; unsigned int bytes; bool is_write; + bool serialising; QLIST_ENTRY(BdrvTrackedRequest) list; Coroutine *co; /* owner, used for deadlock detection */ CoQueue wait_queue; /* coroutines blocked on this 
request */ @@ -302,8 +303,8 @@ struct BlockDriverState { /* Callback before write request is processed */ NotifierWithReturnList before_write_notifiers; - /* number of in-flight copy-on-read requests */ - unsigned int copy_on_read_in_flight; + /* number of in-flight serialising requests */ + unsigned int serialising_in_flight; /* I/O throttling */ ThrottleState throttle_state; From 7327145f63a224c9ba9c16d0c29781feffef8dc6 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 4 Dec 2013 17:08:50 +0100 Subject: [PATCH 82/93] block: Make overlap range for serialisation dynamic Copy on Read wants to serialise with all requests touching the same cluster, so wait_serialising_requests() rounded to cluster boundaries. Other users like alignment RMW will have different requirements, though (requests touching the same sector), so make it dynamic. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 53 ++++++++++++++++++++------------------- include/block/block_int.h | 4 +++ 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/block.c b/block.c index 493c75f33e..784ff079ed 100644 --- a/block.c +++ b/block.c @@ -2231,6 +2231,8 @@ static void tracked_request_begin(BdrvTrackedRequest *req, .is_write = is_write, .co = qemu_coroutine_self(), .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, }; qemu_co_queue_init(&req->wait_queue); @@ -2238,12 +2240,19 @@ static void tracked_request_begin(BdrvTrackedRequest *req, QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); } -static void mark_request_serialising(BdrvTrackedRequest *req) +static void mark_request_serialising(BdrvTrackedRequest *req, size_t align) { + int64_t overlap_offset = req->offset & ~(align - 1); + int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + if (!req->serialising) { req->bs->serialising_in_flight++; req->serialising = true; } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + 
req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); } /** @@ -2267,20 +2276,16 @@ void bdrv_round_to_clusters(BlockDriverState *bs, } } -static void round_bytes_to_clusters(BlockDriverState *bs, - int64_t offset, unsigned int bytes, - int64_t *cluster_offset, - unsigned int *cluster_bytes) +static int bdrv_get_cluster_size(BlockDriverState *bs) { BlockDriverInfo bdi; + int ret; - if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { - *cluster_offset = offset; - *cluster_bytes = bytes; + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; } else { - *cluster_offset = QEMU_ALIGN_DOWN(offset, bdi.cluster_size); - *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, - bdi.cluster_size); + return bdi.cluster_size; } } @@ -2288,11 +2293,11 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req, int64_t offset, unsigned int bytes) { /* aaaa bbbb */ - if (offset >= req->offset + req->bytes) { + if (offset >= req->overlap_offset + req->overlap_bytes) { return false; } /* bbbb aaaa */ - if (req->offset >= offset + bytes) { + if (req->overlap_offset >= offset + bytes) { return false; } return true; @@ -2302,30 +2307,21 @@ static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) { BlockDriverState *bs = self->bs; BdrvTrackedRequest *req; - int64_t cluster_offset; - unsigned int cluster_bytes; bool retry; if (!bs->serialising_in_flight) { return; } - /* If we touch the same cluster it counts as an overlap. This guarantees - * that allocating writes will be serialized and not race with each other - * for the same cluster. For example, in copy-on-read it ensures that the - * CoR read and write operations are atomic and guest writes cannot - * interleave between them. 
- */ - round_bytes_to_clusters(bs, self->offset, self->bytes, - &cluster_offset, &cluster_bytes); - do { retry = false; QLIST_FOREACH(req, &bs->tracked_requests, list) { if (req == self || (!req->serialising && !self->serialising)) { continue; } - if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) { + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { /* Hitting this means there was a reentrant request, for * example, a block driver issuing nested requests. This must * never happen since it means deadlock. @@ -2941,7 +2937,12 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { - mark_request_serialising(req); + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. 
*/ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); } wait_serialising_requests(req); diff --git a/include/block/block_int.h b/include/block/block_int.h index c1153cb3ab..0ee955cb76 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -60,7 +60,11 @@ typedef struct BdrvTrackedRequest { int64_t offset; unsigned int bytes; bool is_write; + bool serialising; + int64_t overlap_offset; + unsigned int overlap_bytes; + QLIST_ENTRY(BdrvTrackedRequest) list; Coroutine *co; /* owner, used for deadlock detection */ CoQueue wait_queue; /* coroutines blocked on this request */ From 6460440f34c709461b84375cfd8a86b27d433225 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 13 Dec 2013 13:04:35 +0100 Subject: [PATCH 83/93] block: Allow wait_serialising_requests() at any point We can only have a single wait_serialising_requests() call per request because otherwise we can run into deadlocks where requests are waiting for each other. The same is true when wait_serialising_requests() is not at the very beginning of a request, so that other requests can be issued between the start of the tracking and wait_serialising_requests(). Fix this by changing wait_serialising_requests() to ignore requests that are already (directly or indirectly) waiting for the calling request. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 13 ++++++++++--- include/block/block_int.h | 2 ++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index 784ff079ed..96905856a1 100644 --- a/block.c +++ b/block.c @@ -2328,9 +2328,16 @@ static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) */ assert(qemu_coroutine_self() != req->co); - qemu_co_queue_wait(&req->wait_queue); - retry = true; - break; + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). 
*/ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + break; + } } } } while (retry); diff --git a/include/block/block_int.h b/include/block/block_int.h index 0ee955cb76..0bcf1c9b8c 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -68,6 +68,8 @@ typedef struct BdrvTrackedRequest { QLIST_ENTRY(BdrvTrackedRequest) list; Coroutine *co; /* owner, used for deadlock detection */ CoQueue wait_queue; /* coroutines blocked on this request */ + + struct BdrvTrackedRequest *waiting_for; } BdrvTrackedRequest; struct BlockDriver { From 3b8242e0ea2a2c201ef3d1bd24080490dae33080 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 3 Dec 2013 16:34:41 +0100 Subject: [PATCH 84/93] block: Align requests in bdrv_co_do_pwritev() This patch changes bdrv_co_do_pwritev() to actually be what its name promises. If requests aren't properly aligned, it performs a RMW. Requests touching the same block are serialised against the RMW request. Further optimisation of this is possible by differentiating types of requests (concurrent reads should actually be okay here). 
Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: Benoit Canet --- block.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 96905856a1..04a3c5a94c 100644 --- a/block.c +++ b/block.c @@ -3235,6 +3235,12 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, BdrvRequestFlags flags) { BdrvTrackedRequest req; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; int ret; if (!bs->drv) { @@ -3253,10 +3259,88 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true); } + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
+ */ tracked_request_begin(&req, bs, offset, bytes, true); - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, qiov, flags); + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? 
&local_qiov : qiov, + flags); + +fail: tracked_request_end(&req); + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + return ret; } From 28de2dcd88de31f50bbd43d9c2fcb046c3a727cb Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 14 Jan 2014 11:41:35 +0100 Subject: [PATCH 85/93] block: Assert serialisation assumptions in pwritev If a request calls wait_serialising_requests() and actually has to wait in this function (i.e. a coroutine yield), other requests can run and previously read data (like the head or tail buffer) could become outdated. In this case, we would have to restart from the beginning to read in the updated data. However, we're lucky and don't actually need to do that: A request can only wait in the first call of wait_serialising_requests() because we mark it as serialising before that call, so any later requests would wait. So as we don't wait in practice, we don't have to reload the data. This is an important assumption that may not be broken or data corruption will happen. Document it with some assertions. 
Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/block.c b/block.c index 04a3c5a94c..e1b2c8d2df 100644 --- a/block.c +++ b/block.c @@ -2303,14 +2303,15 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req, return true; } -static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) { BlockDriverState *bs = self->bs; BdrvTrackedRequest *req; bool retry; + bool waited = false; if (!bs->serialising_in_flight) { - return; + return false; } do { @@ -2336,11 +2337,14 @@ static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) qemu_co_queue_wait(&req->wait_queue); self->waiting_for = NULL; retry = true; + waited = true; break; } } } } while (retry); + + return waited; } /* @@ -3191,6 +3195,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; + bool waited; int ret; int64_t sector_num = offset >> BDRV_SECTOR_BITS; @@ -3199,7 +3204,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - wait_serialising_requests(req); + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); @@ -3299,9 +3305,11 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, QEMUIOVector tail_qiov; struct iovec tail_iov; size_t tail_bytes; + bool waited; mark_request_serialising(&req, align); - wait_serialising_requests(&req); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); tail_buf = qemu_blockalign(bs, align); tail_iov = (struct iovec) { From 775aa8b6e0ea25f8cca74d0fcb1e30a764cf624f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 5 Dec 2013 
12:09:38 +0100 Subject: [PATCH 86/93] block: Change coroutine wrapper to byte granularity Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/block.c b/block.c index e1b2c8d2df..fefa3f219c 100644 --- a/block.c +++ b/block.c @@ -70,11 +70,11 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, @@ -2554,8 +2554,7 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, typedef struct RwCo { BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; + int64_t offset; QEMUIOVector *qiov; bool is_write; int ret; @@ -2567,34 +2566,32 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) RwCo *rwco = opaque; if (!rwco->is_write) { - rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } } /* * Process a 
vectored synchronous request using coroutines */ -static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) { Coroutine *co; RwCo rwco = { .bs = bs, - .sector_num = sector_num, - .nb_sectors = qiov->size >> BDRV_SECTOR_BITS, + .offset = offset, .qiov = qiov, .is_write = is_write, .ret = NOT_DONE, .flags = flags, }; - assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0); /** * In sync call context, when the vcpu is blocked, this throttling timer @@ -2633,7 +2630,8 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, }; qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); } /* return < 0 if error. See bdrv_write() for the return codes */ @@ -2671,7 +2669,7 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) { - return bdrv_rwv_co(bs, sector_num, qiov, true, 0); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, qiov, true, 0); } int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, @@ -4856,9 +4854,15 @@ int bdrv_flush(BlockDriverState *bs) return rwco.ret; } +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; static void coroutine_fn bdrv_discard_co_entry(void *opaque) { - RwCo *rwco = opaque; + DiscardCo *rwco = opaque; rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); } @@ -4942,7 +4946,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { Coroutine *co; - RwCo rwco = { + DiscardCo rwco = { .bs = bs, .sector_num = 
sector_num, .nb_sectors = nb_sectors, From a3ef65718506fb94cb9e5a903ef9bf9ad8fbe6de Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 5 Dec 2013 12:29:59 +0100 Subject: [PATCH 87/93] block: Make bdrv_pread() a bdrv_prwv_co() wrapper Instead of implementing the alignment adjustment here, use the now existing functionality of bdrv_co_do_preadv(). Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block.c | 49 +++++++++++++------------------------------------ 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/block.c b/block.c index fefa3f219c..07114f3169 100644 --- a/block.c +++ b/block.c @@ -2721,49 +2721,26 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, - void *buf, int count1) +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) { - uint8_t tmp_buf[BDRV_SECTOR_SIZE]; - int len, nb_sectors, count; - int64_t sector_num; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; int ret; - count = count1; - /* first read to align to sector start */ - len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); - if (len > count) - len = count; - sector_num = offset >> BDRV_SECTOR_BITS; - if (len > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len); - count -= len; - if (count == 0) - return count1; - sector_num++; - buf += len; + if (bytes < 0) { + return -EINVAL; } - /* read the sectors "in place" */ - nb_sectors = count >> BDRV_SECTOR_BITS; - if (nb_sectors > 0) { - if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0) - return ret; - sector_num += nb_sectors; - len = nb_sectors << BDRV_SECTOR_BITS; - buf += len; - count -= len; + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; } - /* add data from the last sector */ - if (count > 0) { - if 
((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - memcpy(buf, tmp_buf, count); - } - return count1; + return bytes; } int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) From 8407d5d7e265911b05949ee2ffd9e45c97bf0505 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 5 Dec 2013 12:34:02 +0100 Subject: [PATCH 88/93] block: Make bdrv_pwrite() a bdrv_prwv_co() wrapper Instead of implementing the alignment adjustment here, use the now existing functionality of bdrv_co_do_pwritev(). Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block.c | 64 ++++++------------------------------------- include/block/block.h | 1 - 2 files changed, 9 insertions(+), 56 deletions(-) diff --git a/block.c b/block.c index 07114f3169..179d27ac75 100644 --- a/block.c +++ b/block.c @@ -2667,11 +2667,6 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); } -int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) -{ - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, qiov, true, 0); -} - int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { @@ -2745,70 +2740,29 @@ int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) { - uint8_t tmp_buf[BDRV_SECTOR_SIZE]; - int len, nb_sectors, count; - int64_t sector_num; int ret; - count = qiov->size; - - /* first write to align to sector start */ - len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); - if (len > count) - len = count; - sector_num = offset >> BDRV_SECTOR_BITS; - if (len > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), - len); - if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - count -= len; - if (count == 0) - return 
qiov->size; - sector_num++; + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; } - /* write the sectors "in place" */ - nb_sectors = count >> BDRV_SECTOR_BITS; - if (nb_sectors > 0) { - QEMUIOVector qiov_inplace; - - qemu_iovec_init(&qiov_inplace, qiov->niov); - qemu_iovec_concat(&qiov_inplace, qiov, len, - nb_sectors << BDRV_SECTOR_BITS); - ret = bdrv_writev(bs, sector_num, &qiov_inplace); - qemu_iovec_destroy(&qiov_inplace); - if (ret < 0) { - return ret; - } - - sector_num += nb_sectors; - len = nb_sectors << BDRV_SECTOR_BITS; - count -= len; - } - - /* add data from the last sector */ - if (count > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count); - if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - } return qiov->size; } int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int count1) + const void *buf, int bytes) { QEMUIOVector qiov; struct iovec iov = { .iov_base = (void *) buf, - .iov_len = count1, + .iov_len = bytes, }; + if (bytes < 0) { + return -EINVAL; + } + qemu_iovec_init_external(&qiov, &iov, 1); return bdrv_pwritev(bs, offset, &qiov); } diff --git a/include/block/block.h b/include/block/block.h index a2f5657321..1085992a7a 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -224,7 +224,6 @@ BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, int64_t sector_num int nb_sectors, BdrvRequestFlags flags, BlockDriverCompletionFunc *cb, void *opaque); int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags); -int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov); int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int count); int bdrv_pwrite(BlockDriverState *bs, int64_t offset, From 2c9880c45e2f9a98d11d44ce9966515c23870a86 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Nov 2011 12:41:35 +0100 Subject: [PATCH 89/93] 
iscsi: Set bs->request_alignment The iSCSI backend already gets the block size from the READ CAPACITY command it sends. Save it so that the generic block layer gets it too. Signed-off-by: Paolo Bonzini Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block/iscsi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/iscsi.c b/block/iscsi.c index c8bf8dc049..890bd81336 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1217,6 +1217,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, goto out; } bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); + bs->request_alignment = iscsilun->block_size; /* Medium changer or tape. We dont have any emulation for this so this must * be sg ioctl compatible. We force it to be sg, otherwise qemu will try From b35ee7fb2308e09092488029b5a9e456ce61bbe6 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 14 Jan 2014 13:44:35 +0100 Subject: [PATCH 90/93] blkdebug: Make required alignment configurable The new 'align' option of blkdebug can be used in order to emulate backends with a required 4k alignment on hosts which only really require 512 byte alignment. 
Signed-off-by: Kevin Wolf --- block/blkdebug.c | 16 ++++++++++++++++ qapi-schema.json | 3 +++ 2 files changed, 19 insertions(+) diff --git a/block/blkdebug.c b/block/blkdebug.c index c8f8d56758..2c03698f93 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -364,6 +364,11 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_STRING, .help = "[internal use only, will be removed]", }, + { + .name = "align", + .type = QEMU_OPT_SIZE, + .help = "Required alignment in bytes", + }, { /* end of list */ } }, }; @@ -375,6 +380,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, QemuOpts *opts; Error *local_err = NULL; const char *config; + uint64_t align; int ret; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); @@ -403,6 +409,16 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } + /* Set request alignment */ + align = qemu_opt_get_size(opts, "align", bs->request_alignment); + if (align > 0 && align < INT_MAX && !(align & (align - 1))) { + bs->request_alignment = align; + } else { + error_setg(errp, "Invalid alignment"); + ret = -EINVAL; + goto fail; + } + ret = 0; fail: qemu_opts_del(opts); diff --git a/qapi-schema.json b/qapi-schema.json index 1ff607ac3c..05ced9d572 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4321,6 +4321,8 @@ # # @config: #optional filename of the configuration file # +# @align: #optional required alignment for requests in bytes +# # @inject-error: #optional array of error injection descriptions # # @set-state: #optional array of state-change descriptions @@ -4330,6 +4332,7 @@ { 'type': 'BlockdevOptionsBlkdebug', 'data': { 'image': 'BlockdevRef', '*config': 'str', + '*align': 'int', '*inject-error': ['BlkdebugInjectErrorOptions'], '*set-state': ['BlkdebugSetStateOptions'] } } From cd33d02a1012e58ee0d3c8259159e8c60cfa0a4d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 15 Jan 2014 15:39:10 +0100 Subject: [PATCH 91/93] qemu-io: New command 'sleep' There 
is no easy way to check that a request correctly waits for a different request. With a sleep command we can at least approximate it. Signed-off-by: Kevin Wolf --- qemu-io-cmds.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 6dfb4a51ae..f1de24c91c 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -12,6 +12,7 @@ #include "block/block_int.h" #include "block/qapi.h" #include "qemu/main-loop.h" +#include "qemu/timer.h" #define CMD_NOFILE_OK 0x01 @@ -2053,6 +2054,46 @@ static const cmdinfo_t abort_cmd = { .oneline = "simulate a program crash using abort(3)", }; +static void sleep_cb(void *opaque) +{ + bool *expired = opaque; + *expired = true; +} + +static int sleep_f(BlockDriverState *bs, int argc, char **argv) +{ + char *endptr; + long ms; + struct QEMUTimer *timer; + bool expired = false; + + ms = strtol(argv[1], &endptr, 0); + if (ms < 0 || *endptr != '\0') { + printf("%s is not a valid number\n", argv[1]); + return 0; + } + + timer = timer_new_ns(QEMU_CLOCK_HOST, sleep_cb, &expired); + timer_mod(timer, qemu_clock_get_ns(QEMU_CLOCK_HOST) + SCALE_MS * ms); + + while (!expired) { + main_loop_wait(false); + } + + timer_free(timer); + + return 0; +} + +static const cmdinfo_t sleep_cmd = { + .name = "sleep", + .argmin = 1, + .argmax = 1, + .cfunc = sleep_f, + .flags = CMD_NOFILE_OK, + .oneline = "waits for the given value in milliseconds", +}; + static void help_oneline(const char *cmd, const cmdinfo_t *ct) { if (cmd) { @@ -2166,4 +2207,5 @@ static void __attribute((constructor)) init_qemuio_commands(void) qemuio_add_command(&resume_cmd); qemuio_add_command(&wait_break_cmd); qemuio_add_command(&abort_cmd); + qemuio_add_command(&sleep_cmd); } From 9e1cb96d9a5e434f389a4d7b7ff4dcdd71e8ec0f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 14 Jan 2014 15:37:03 +0100 Subject: [PATCH 92/93] qemu-iotests: Test pwritev RMW logic Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block.c | 
7 + block/blkdebug.c | 8 ++ include/block/block.h | 8 ++ tests/qemu-iotests/077 | 278 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/077.out | 202 +++++++++++++++++++++++++++ tests/qemu-iotests/group | 1 + 6 files changed, 504 insertions(+) create mode 100755 tests/qemu-iotests/077 create mode 100644 tests/qemu-iotests/077.out diff --git a/block.c b/block.c index 179d27ac75..932ce58265 100644 --- a/block.c +++ b/block.c @@ -3141,10 +3141,13 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, if (ret < 0) { /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); if (ret == 0 && !bs->enable_write_cache) { ret = bdrv_co_flush(bs); @@ -3215,11 +3218,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, }; qemu_iovec_init_external(&head_qiov, &head_iov, 1); + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, align, &head_qiov, 0); if (ret < 0) { goto fail; } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); qemu_iovec_init(&local_qiov, qiov->niov + 2); qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); @@ -3247,11 +3252,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, }; qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, align, &tail_qiov, 0); if (ret < 0) { goto fail; } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); if (!use_local_qiov) { qemu_iovec_init(&local_qiov, qiov->niov + 1); diff --git a/block/blkdebug.c b/block/blkdebug.c index 2c03698f93..56c4cd084f 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c 
@@ -186,6 +186,14 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_FLUSH_TO_OS] = "flush_to_os", [BLKDBG_FLUSH_TO_DISK] = "flush_to_disk", + + [BLKDBG_PWRITEV_RMW_HEAD] = "pwritev_rmw.head", + [BLKDBG_PWRITEV_RMW_AFTER_HEAD] = "pwritev_rmw.after_head", + [BLKDBG_PWRITEV_RMW_TAIL] = "pwritev_rmw.tail", + [BLKDBG_PWRITEV_RMW_AFTER_TAIL] = "pwritev_rmw.after_tail", + [BLKDBG_PWRITEV] = "pwritev", + [BLKDBG_PWRITEV_ZERO] = "pwritev_zero", + [BLKDBG_PWRITEV_DONE] = "pwritev_done", }; static int get_event_by_name(const char *name, BlkDebugEvent *event) diff --git a/include/block/block.h b/include/block/block.h index 1085992a7a..963a61fa4c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -527,6 +527,14 @@ typedef enum { BLKDBG_FLUSH_TO_OS, BLKDBG_FLUSH_TO_DISK, + BLKDBG_PWRITEV_RMW_HEAD, + BLKDBG_PWRITEV_RMW_AFTER_HEAD, + BLKDBG_PWRITEV_RMW_TAIL, + BLKDBG_PWRITEV_RMW_AFTER_TAIL, + BLKDBG_PWRITEV, + BLKDBG_PWRITEV_ZERO, + BLKDBG_PWRITEV_DONE, + BLKDBG_EVENT_MAX, } BlkDebugEvent; diff --git a/tests/qemu-iotests/077 b/tests/qemu-iotests/077 new file mode 100755 index 0000000000..bbf7b5145a --- /dev/null +++ b/tests/qemu-iotests/077 @@ -0,0 +1,278 @@ +#!/bin/bash +# +# Test concurrent pread/pwrite +# +# Copyright (C) 2014 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>.
+# + +# creator +owner=kwolf@redhat.com + +seq=`basename $0` +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt generic +_supported_proto generic +_supported_os Linux + +CLUSTER_SIZE=4k +size=128M + +_make_test_img $size + +echo +echo "== Some concurrent requests involving RMW ==" + +function test_io() +{ +echo "open -o file.align=4k blkdebug::$TEST_IMG" +# A simple RMW request +cat < Date: Thu, 16 Jan 2014 13:29:10 +0100 Subject: [PATCH 93/93] block: Switch bdrv_io_limits_intercept() to byte granularity Request sizes used to be rounded down to the next sector boundary, which allowed the I/O limit to be bypassed. Now all requests are accounted for with their exact byte size. Reported-by: Wenchao Xia Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index 932ce58265..cb21a5fa61 100644 --- a/block.c +++ b/block.c @@ -192,7 +192,7 @@ void bdrv_io_limits_enable(BlockDriverState *bs) * @is_write: is the IO a write */ static void bdrv_io_limits_intercept(BlockDriverState *bs, - int nb_sectors, + unsigned int bytes, bool is_write) { /* does this io must wait */ @@ -205,9 +205,8 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs, } /* the IO will be executed, do the accounting */ - throttle_account(&bs->throttle_state, - is_write, - nb_sectors * BDRV_SECTOR_SIZE); + throttle_account(&bs->throttle_state, is_write, bytes); + /* if the next request must wait -> do nothing */ if (throttle_schedule_timer(&bs->throttle_state, is_write)) { @@ -2968,8 +2967,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, /* throttling disk I/O */ if (bs->io_limits_enabled) { - /* TODO Switch to byte granularity */ -
bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, false); + bdrv_io_limits_intercept(bs, bytes, false); } /* Align read if necessary by padding qiov */ @@ -3193,8 +3191,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, /* throttling disk I/O */ if (bs->io_limits_enabled) { - /* TODO Switch to byte granularity */ - bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true); + bdrv_io_limits_intercept(bs, bytes, true); } /*