From c9308314460f46c2bbc4a672a3a39ac842434bf1 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Tue, 2 Feb 2021 12:59:14 -0600 Subject: [PATCH 01/20] iotests: Fix expected whitespace for 185 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit f93e19fb03b adjusted various iotest whitespace discrepancies. But another one snuck in during 61623f82153788e, and we missed the semantic merge conflict at the time because 185 is not run as part of the default 'make check'. Signed-off-by: Eric Blake Message-Id: <20210202185914.614705-1-eblake@redhat.com> [eblake: adjust commit message] Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé --- tests/qemu-iotests/185.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/185.out b/tests/qemu-iotests/185.out index 9dedc8eacb..754a641258 100644 --- a/tests/qemu-iotests/185.out +++ b/tests/qemu-iotests/185.out @@ -89,7 +89,7 @@ Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 cluster_size=65536 extended_l2=off 'format': 'IMGFMT', 'sync': 'full', 'speed': 65536, - 'x-perf': { 'max-chunk': 65536 } } } + 'x-perf': {'max-chunk': 65536} } } Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 cluster_size=65536 extended_l2=off compression_type=zlib size=67108864 lazy_refcounts=off refcount_bits=16 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "disk"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "disk"}} From 69b55e03f7e65a36eb954d0b7d4698b258df2708 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:19 +0300 Subject: [PATCH 02/20] block: refactor bdrv_check_request: add errp It's better to pass &error_abort than just assert that result is 0: on crash, we'll immediately see the reason in the backtrace. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-2-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake [eblake: fix iotest 206 fallout] Signed-off-by: Eric Blake --- block/file-posix.c | 2 +- block/io.c | 29 ++++++++++++++++++++++------- include/block/block_int.h | 2 +- tests/qemu-iotests/206.out | 2 +- tests/test-write-threshold.c | 5 +++-- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 11aafa9d82..05079b40ca 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2969,7 +2969,7 @@ raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, req->bytes = BDRV_MAX_LENGTH - req->offset; - assert(bdrv_check_request(req->offset, req->bytes) == 0); + bdrv_check_request(req->offset, req->bytes, &error_abort); bdrv_make_request_serialising(req, bs->bl.request_alignment); } diff --git a/block/io.c b/block/io.c index d203435a73..23abdae794 100644 --- a/block/io.c +++ b/block/io.c @@ -920,17 +920,34 @@ bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req, return waited; } -int bdrv_check_request(int64_t offset, int64_t bytes) +int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp) { - if (offset < 0 || bytes < 0) { + if (offset < 0) { + error_setg(errp, "offset is negative: %" PRIi64, offset); + return -EIO; + } + + if (bytes < 0) { + error_setg(errp, "bytes is negative: %" PRIi64, bytes); return -EIO; } if (bytes > BDRV_MAX_LENGTH) { + error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")", + bytes, BDRV_MAX_LENGTH); + return -EIO; + } + + if (offset > BDRV_MAX_LENGTH) { + error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")", + offset, BDRV_MAX_LENGTH); return -EIO; } if (offset > BDRV_MAX_LENGTH - bytes) { + error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") " + "exceeds maximum(%" PRIi64 ")", offset, bytes, + BDRV_MAX_LENGTH); return -EIO; } @@ -939,7 +956,7 @@ int bdrv_check_request(int64_t offset, int64_t bytes) static int bdrv_check_request32(int64_t offset, int64_t bytes) { - int ret = bdrv_check_request(offset, bytes); + int ret = bdrv_check_request(offset, bytes, NULL); if (ret < 0) { return ret; } @@ -2847,7 +2864,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, return -EPERM; } - ret = bdrv_check_request(offset, bytes); + ret = bdrv_check_request(offset, bytes, NULL); if (ret < 0) { return ret; } @@ -3249,10 +3266,8 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, return -EINVAL; } - ret = bdrv_check_request(offset, 0); + ret = bdrv_check_request(offset, 0, errp); if (ret < 0) { - error_setg(errp, "Required too big image size, it must be not greater " - "than %" PRId64, BDRV_MAX_LENGTH); return ret; } diff --git a/include/block/block_int.h b/include/block/block_int.h index d01fc23720..5bbbf9ee0a 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -93,7 +93,7 @@ typedef struct BdrvTrackedRequest { struct BdrvTrackedRequest *waiting_for; } BdrvTrackedRequest; -int bdrv_check_request(int64_t offset, int64_t bytes); +int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp); struct BlockDriver { const char *format_name; diff --git a/tests/qemu-iotests/206.out b/tests/qemu-iotests/206.out index e8a36de00b..5dd589d14e 100644 --- a/tests/qemu-iotests/206.out +++ b/tests/qemu-iotests/206.out @@ -180,7 +180,7 @@ Job failed: Could not resize image: Image size cannot be negative {"execute": "blockdev-create", "arguments": {"job-id": "job0", "options": {"driver": "qcow2", "file": "node0", "size": 9223372036854775296}}} {"return": {}} -Job failed: Could not resize image: Required too big image size, it must be not greater than 9223372035781033984 +Job failed: Could not resize image: offset(9223372036854775296) exceeds maximum(9223372035781033984) {"execute": "job-dismiss", "arguments": {"id": "job0"}} {"return": {}} diff --git a/tests/test-write-threshold.c b/tests/test-write-threshold.c index 4cf032652d..fc1c45a2eb 100644 --- a/tests/test-write-threshold.c +++ b/tests/test-write-threshold.c @@ -7,6 +7,7 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "block/block_int.h" #include "block/write-threshold.h" @@ -64,7 +65,7 @@ static void test_threshold_not_trigger(void) req.offset = 1024; req.bytes = 1024; - assert(bdrv_check_request(req.offset, req.bytes) == 0); + bdrv_check_request(req.offset, req.bytes, &error_abort); bdrv_write_threshold_set(&bs, threshold); amount = bdrv_write_threshold_exceeded(&bs, &req); @@ -84,7 +85,7 @@ static void test_threshold_trigger(void) req.offset = (4 * 1024 * 1024) - 1024; req.bytes = 2 * 1024; - assert(bdrv_check_request(req.offset, req.bytes) == 0); + bdrv_check_request(req.offset, req.bytes, &error_abort); bdrv_write_threshold_set(&bs, threshold); amount = bdrv_write_threshold_exceeded(&bs, &req); From 4c002cef0e9abe7135d7916c51abce47f7fc1ee2 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:20 +0300 Subject: [PATCH 03/20] util/iov: make qemu_iovec_init_extended() honest Actually, we can't extend the io vector in all cases. Handle possible MAX_IOV and size_t overflows. For now add assertion to callers (actually they rely on success anyway) and fix them in the following patch. Add also some additional good assertions to qemu_iovec_init_slice() while being here. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-3-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 10 +++++++--- include/qemu/iov.h | 2 +- util/iov.c | 25 +++++++++++++++++++++++-- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/block/io.c b/block/io.c index 23abdae794..ab953bd58f 100644 --- a/block/io.c +++ b/block/io.c @@ -1680,13 +1680,17 @@ static bool bdrv_pad_request(BlockDriverState *bs, int64_t *offset, unsigned int *bytes, BdrvRequestPadding *pad) { + int ret; + if (!bdrv_init_padding(bs, *offset, *bytes, pad)) { return false; } - qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, - *qiov, *qiov_offset, *bytes, - pad->buf + pad->buf_len - pad->tail, pad->tail); + ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, + *qiov, *qiov_offset, *bytes, + pad->buf + pad->buf_len - pad->tail, + pad->tail); + assert(ret == 0); *bytes += pad->head + pad->tail; *offset -= pad->head; *qiov = &pad->local_qiov; diff --git a/include/qemu/iov.h b/include/qemu/iov.h index b6b283a5e5..9330746680 100644 --- a/include/qemu/iov.h +++ b/include/qemu/iov.h @@ -222,7 +222,7 @@ static inline void *qemu_iovec_buf(QEMUIOVector *qiov) void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint); void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov); -void qemu_iovec_init_extended( +int qemu_iovec_init_extended( QEMUIOVector *qiov, void *head_buf, size_t head_len, QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len, diff --git a/util/iov.c b/util/iov.c index f3a9e92a37..58c7b3eeee 100644 --- a/util/iov.c +++ b/util/iov.c @@ -415,7 +415,7 @@ int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len) * Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov, * and @tail_buf buffer into new qiov. */ -void qemu_iovec_init_extended( +int qemu_iovec_init_extended( QEMUIOVector *qiov, void *head_buf, size_t head_len, QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len, @@ -425,12 +425,24 @@ void qemu_iovec_init_extended( int total_niov, mid_niov = 0; struct iovec *p, *mid_iov = NULL; + assert(mid_qiov->niov <= IOV_MAX); + + if (SIZE_MAX - head_len < mid_len || + SIZE_MAX - head_len - mid_len < tail_len) + { + return -EINVAL; + } + if (mid_len) { mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len, &mid_head, &mid_tail, &mid_niov); } total_niov = !!head_len + mid_niov + !!tail_len; + if (total_niov > IOV_MAX) { + return -EINVAL; + } + if (total_niov == 1) { qemu_iovec_init_buf(qiov, NULL, 0); p = &qiov->local_iov; @@ -459,6 +471,8 @@ void qemu_iovec_init_extended( p->iov_base = tail_buf; p->iov_len = tail_len; } + + return 0; } /* @@ -492,7 +506,14 @@ bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes) void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source, size_t offset, size_t len) { - qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0); + int ret; + + assert(source->size >= len); + assert(source->size - len >= offset); + + /* We shrink the request, so we can't overflow neither size_t nor MAX_IOV */ + ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0); + assert(ret == 0); } void qemu_iovec_destroy(QEMUIOVector *qiov) From a56ed80c429610aecd6f74fbd4a9467f5466278a Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:21 +0300 Subject: [PATCH 04/20] block: fix theoretical overflow in bdrv_init_padding() Calculation of sum may theoretically overflow, so use 64bit type and add some good assertions. Use int64_t constantly. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-4-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake [eblake: tweak assertion order] Signed-off-by: Eric Blake --- block/io.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index ab953bd58f..c8c9dea554 100644 --- a/block/io.c +++ b/block/io.c @@ -1565,8 +1565,12 @@ static bool bdrv_init_padding(BlockDriverState *bs, int64_t offset, int64_t bytes, BdrvRequestPadding *pad) { - uint64_t align = bs->bl.request_alignment; - size_t sum; + int64_t align = bs->bl.request_alignment; + int64_t sum; + + bdrv_check_request(offset, bytes, &error_abort); + assert(align <= INT_MAX); /* documented in block/block_int.h */ + assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */ memset(pad, 0, sizeof(*pad)); From f0deecff827c838a71f83539b0572094aefe3d5e Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:22 +0300 Subject: [PATCH 05/20] block/io: refactor bdrv_pad_request(): move bdrv_pad_request() up Prepare for the following patch when bdrv_pad_request() will be able to fail. Update the comments. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-5-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake [eblake: grammar tweak] Signed-off-by: Eric Blake --- block/io.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/block/io.c b/block/io.c index c8c9dea554..3b1aec366e 100644 --- a/block/io.c +++ b/block/io.c @@ -2135,6 +2135,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, uint64_t align = bs->bl.request_alignment; BdrvRequestPadding pad; int ret; + bool padded = false; trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); @@ -2166,20 +2167,32 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, return 0; } + if (!(flags & BDRV_REQ_ZERO_WRITE)) { + /* + * Pad request for following read-modify-write cycle. + * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do + * alignment only if there is no ZERO flag. + */ + padded = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, + &pad); + } + bdrv_inc_in_flight(bs); - /* - * Align write if necessary by performing a read-modify-write cycle. - * Pad qiov with the read parts and be sure to have a tracked request not - * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. - */ tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); if (flags & BDRV_REQ_ZERO_WRITE) { + assert(!padded); ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); goto out; } - if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) { + if (padded) { + /* + * Request was unaligned to request_alignment and therefore + * padded. We are going to do read-modify-write, and must + * serialize the request to prevent interactions of the + * widened region with other transactions. + */ bdrv_make_request_serialising(&req, align); bdrv_padding_rmw_read(child, &req, &pad, false); } From 98ca45494fcd6bf0336ecd559e440b6de6ea4cd3 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:23 +0300 Subject: [PATCH 06/20] block/io: bdrv_pad_request(): support qemu_iovec_init_extended failure Make bdrv_pad_request() honest: return error if qemu_iovec_init_extended() failed. Update also bdrv_padding_destroy() to clean the structure for safety. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-6-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/block/io.c b/block/io.c index 3b1aec366e..39d943c33a 100644 --- a/block/io.c +++ b/block/io.c @@ -1665,6 +1665,7 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad) qemu_vfree(pad->buf); qemu_iovec_destroy(&pad->local_qiov); } + memset(pad, 0, sizeof(*pad)); } /* @@ -1674,33 +1675,42 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad) * read of padding, bdrv_padding_rmw_read() should be called separately if * needed. * - * All parameters except @bs are in-out: they represent original request at - * function call and padded (if padding needed) at function finish. - * - * Function always succeeds. + * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out: + * - on function start they represent original request + * - on failure or when padding is not needed they are unchanged + * - on success when padding is needed they represent padded request */ -static bool bdrv_pad_request(BlockDriverState *bs, - QEMUIOVector **qiov, size_t *qiov_offset, - int64_t *offset, unsigned int *bytes, - BdrvRequestPadding *pad) +static int bdrv_pad_request(BlockDriverState *bs, + QEMUIOVector **qiov, size_t *qiov_offset, + int64_t *offset, unsigned int *bytes, + BdrvRequestPadding *pad, bool *padded) { int ret; if (!bdrv_init_padding(bs, *offset, *bytes, pad)) { - return false; + if (padded) { + *padded = false; + } + return 0; } ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, *qiov, *qiov_offset, *bytes, pad->buf + pad->buf_len - pad->tail, pad->tail); - assert(ret == 0); + if (ret < 0) { + bdrv_padding_destroy(pad); + return ret; + } *bytes += pad->head + pad->tail; *offset -= pad->head; *qiov = &pad->local_qiov; *qiov_offset = 0; + if (padded) { + *padded = true; + } - return true; + return 0; } int coroutine_fn bdrv_co_preadv(BdrvChild *child, @@ -1750,7 +1760,11 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, flags |= BDRV_REQ_COPY_ON_READ; } - bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad); + ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, + NULL); + if (ret < 0) { + return ret; + } tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); ret = bdrv_aligned_preadv(child, &req, offset, bytes, @@ -2173,8 +2187,11 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do * alignment only if there is no ZERO flag. */ - padded = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, - &pad); + ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, + &padded); + if (ret < 0) { + return ret; + } } bdrv_inc_in_flight(bs); From 801625e69d947b0f9291d77bb1deb7545525c66d Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:24 +0300 Subject: [PATCH 07/20] block/throttle-groups: throttle_group_co_io_limits_intercept(): 64bit bytes The function is called from 64bit io handlers, and bytes is just passed to throttle_account() which is 64bit too (unsigned though). So, let's convert intermediate argument to 64bit too. This patch is a first in the 64-bit-blocklayer series, so we are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). Patch-correctness audit by Eric Blake: Caller has 32-bit, this patch now causes widening which is safe: block/block-backend.c: blk_do_preadv() passes 'unsigned int' block/block-backend.c: blk_do_pwritev_part() passes 'unsigned int' block/throttle.c: throttle_co_pwrite_zeroes() passes 'int' block/throttle.c: throttle_co_pdiscard() passes 'int' Caller has 64-bit, this patch fixes potential bug where pre-patch could narrow, except it's easy enough to trace that callers are still capped at 2G actions: block/throttle.c: throttle_co_preadv() passes 'uint64_t' block/throttle.c: throttle_co_pwritev() passes 'uint64_t' Implementation in question: block/throttle-groups.c throttle_group_co_io_limits_intercept() takes 'unsigned int bytes' and uses it: argument to util/throttle.c throttle_account(uint64_t) All safe: it patches a latent bug, and does not introduce any 64-bit gotchas once throttle_co_p{read,write}v are relaxed, and assuming throttle_account() is not buggy. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Reviewed-by: Alberto Garcia Message-Id: <20201211183934.169161-7-vsementsov@virtuozzo.com> Signed-off-by: Eric Blake --- block/throttle-groups.c | 5 ++++- include/block/throttle-groups.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/block/throttle-groups.c b/block/throttle-groups.c index abd16ed9db..fb203c3ced 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -358,12 +358,15 @@ static void schedule_next_request(ThrottleGroupMember *tgm, bool is_write) * @is_write: the type of operation (read/write) */ void coroutine_fn throttle_group_co_io_limits_intercept(ThrottleGroupMember *tgm, - unsigned int bytes, + int64_t bytes, bool is_write) { bool must_wait; ThrottleGroupMember *token; ThrottleGroup *tg = container_of(tgm->throttle_state, ThrottleGroup, ts); + + assert(bytes >= 0); + qemu_mutex_lock(&tg->lock); /* First we check if this I/O has to be throttled. */ diff --git a/include/block/throttle-groups.h b/include/block/throttle-groups.h index 8bf7d233fa..9541b32432 100644 --- a/include/block/throttle-groups.h +++ b/include/block/throttle-groups.h @@ -77,7 +77,7 @@ void throttle_group_unregister_tgm(ThrottleGroupMember *tgm); void throttle_group_restart_tgm(ThrottleGroupMember *tgm); void coroutine_fn throttle_group_co_io_limits_intercept(ThrottleGroupMember *tgm, - unsigned int bytes, + int64_t bytes, bool is_write); void throttle_group_attach_aio_context(ThrottleGroupMember *tgm, AioContext *new_context); From 63f4ad1186b7bdf562193ee79a1f5d7f51343776 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:25 +0300 Subject: [PATCH 08/20] block/io: improve bdrv_check_request: check qiov too Operations with qiov add more restrictions on bytes, let's cover it. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-8-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 46 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/block/io.c b/block/io.c index 39d943c33a..b56db913da 100644 --- a/block/io.c +++ b/block/io.c @@ -920,8 +920,14 @@ bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req, return waited; } -int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp) +static int bdrv_check_qiov_request(int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, + Error **errp) { + /* + * Check generic offset/bytes correctness + */ + if (offset < 0) { error_setg(errp, "offset is negative: %" PRIi64, offset); return -EIO; @@ -951,12 +957,38 @@ int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp) return -EIO; } + if (!qiov) { + return 0; + } + + /* + * Check qiov and qiov_offset + */ + + if (qiov_offset > qiov->size) { + error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)", + qiov_offset, qiov->size); + return -EIO; + } + + if (bytes > qiov->size - qiov_offset) { + error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io " + "vector size(%zu)", bytes, qiov_offset, qiov->size); + return -EIO; + } + return 0; } -static int bdrv_check_request32(int64_t offset, int64_t bytes) +int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp) { - int ret = bdrv_check_request(offset, bytes, NULL); + return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp); +} + +static int bdrv_check_request32(int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset) +{ + int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL); if (ret < 0) { return ret; } @@ -1736,7 +1768,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, return -ENOMEDIUM; } - ret = bdrv_check_request32(offset, bytes); + ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); if (ret < 0) { return ret; } @@ -2157,7 +2189,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, return -ENOMEDIUM; } - ret = bdrv_check_request32(offset, bytes); + ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); if (ret < 0) { return ret; } @@ -3163,7 +3195,7 @@ static int coroutine_fn bdrv_co_copy_range_internal( if (!dst || !dst->bs || !bdrv_is_inserted(dst->bs)) { return -ENOMEDIUM; } - ret = bdrv_check_request32(dst_offset, bytes); + ret = bdrv_check_request32(dst_offset, bytes, NULL, 0); if (ret) { return ret; } @@ -3174,7 +3206,7 @@ static int coroutine_fn bdrv_co_copy_range_internal( if (!src || !src->bs || !bdrv_is_inserted(src->bs)) { return -ENOMEDIUM; } - ret = bdrv_check_request32(src_offset, bytes); + ret = bdrv_check_request32(src_offset, bytes, NULL, 0); if (ret) { return ret; } From 8024726459f37ffcceebfc9e50ff8e5cc362f274 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 3 Feb 2021 08:14:15 -0600 Subject: [PATCH 09/20] block: use int64_t as bytes type in tracked requests We are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). All requests in block/io must not overflow BDRV_MAX_LENGTH, all external users of BdrvTrackedRequest already have corresponding assertions, so we are safe. Add some assertions still. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-9-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 14 +++++++++----- include/block/block_int.h | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/block/io.c b/block/io.c index b56db913da..1c23587d18 100644 --- a/block/io.c +++ b/block/io.c @@ -717,10 +717,10 @@ static void tracked_request_end(BdrvTrackedRequest *req) static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, int64_t offset, - uint64_t bytes, + int64_t bytes, enum BdrvTrackedRequestType type) { - assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes); + bdrv_check_request(offset, bytes, &error_abort); *req = (BdrvTrackedRequest){ .bs = bs, @@ -741,8 +741,10 @@ static void tracked_request_begin(BdrvTrackedRequest *req, } static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t offset, uint64_t bytes) + int64_t offset, int64_t bytes) { + bdrv_check_request(offset, bytes, &error_abort); + /* aaaa bbbb */ if (offset >= req->overlap_offset + req->overlap_bytes) { return false; @@ -810,8 +812,10 @@ static void tracked_request_set_serialising(BdrvTrackedRequest *req, uint64_t align) { int64_t overlap_offset = req->offset & ~(align - 1); - uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - - overlap_offset; + int64_t overlap_bytes = + ROUND_UP(req->offset + req->bytes, align) - overlap_offset; + + bdrv_check_request(req->offset, req->bytes, &error_abort); if (!req->serialising) { qatomic_inc(&req->bs->serialising_in_flight); diff --git a/include/block/block_int.h b/include/block/block_int.h index 5bbbf9ee0a..7f41f0990c 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -79,12 +79,12 @@ enum BdrvTrackedRequestType { typedef struct BdrvTrackedRequest { BlockDriverState *bs; int64_t offset; - uint64_t bytes; + int64_t bytes; enum BdrvTrackedRequestType type; bool serialising; int64_t overlap_offset; - uint64_t overlap_bytes; + int64_t overlap_bytes; QLIST_ENTRY(BdrvTrackedRequest) list; Coroutine *co; /* owner, used for deadlock detection */ From 17abcbeee24edf264af4250b6e06b07c564e8233 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:27 +0300 Subject: [PATCH 10/20] block/io: use int64_t bytes in driver wrappers We are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). So, convert driver wrappers parameters which are already 64bit to signed type. Requests in block/io.c must never exceed BDRV_MAX_LENGTH (which is less than INT64_MAX), which makes the conversion to signed 64bit type safe. Add corresponding assertions. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-10-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/io.c b/block/io.c index 1c23587d18..cb5064155b 100644 --- a/block/io.c +++ b/block/io.c @@ -1125,7 +1125,7 @@ static void bdrv_co_io_em_complete(void *opaque, int ret) } static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, int flags) { @@ -1135,6 +1135,7 @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs, QEMUIOVector local_qiov; int ret; + bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); assert(!(flags & ~BDRV_REQ_MASK)); assert(!(flags & BDRV_REQ_NO_FALLBACK)); @@ -1194,7 +1195,7 @@ out: } static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, int flags) { @@ -1204,6 +1205,7 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, QEMUIOVector local_qiov; int ret; + bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); assert(!(flags & ~BDRV_REQ_MASK)); assert(!(flags & BDRV_REQ_NO_FALLBACK)); @@ -1274,14 +1276,16 @@ emulate_flags: } static int coroutine_fn -bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, +bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset) { BlockDriver *drv = bs->drv; QEMUIOVector local_qiov; int ret; + bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); + if (!drv) { return -ENOMEDIUM; } From 5ae07b141085c9ac0e10b739639e760b71c490f2 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:28 +0300 Subject: [PATCH 11/20] block/io: support int64_t bytes in bdrv_co_do_pwrite_zeroes() We are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). So, prepare bdrv_co_do_pwrite_zeroes() now. Callers are safe, as converting int to int64_t is safe. Concentrate on 'bytes' usage in the function (thx to Eric Blake): compute 'int tail' via % 'int alignment' - safe fragmentation loop 'int num' - still fragments with a cap on max_transfer use of 'num' within the loop MIN(bytes, max_transfer) as well as %alignment - still works, so calculations in if (head) {} are safe clamp size by 'int max_write_zeroes' - safe drv->bdrv_co_pwrite_zeroes(int) - safe because of clamping clamp size by 'int max_transfer' - safe buf allocation is still clamped to max_transfer qemu_iovec_init_buf(size_t) - safe because of clamping bdrv_driver_pwritev(uint64_t) - safe Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-11-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/io.c b/block/io.c index cb5064155b..98d9f5bdf4 100644 --- a/block/io.c +++ b/block/io.c @@ -41,7 +41,7 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs); static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags); + int64_t offset, int64_t bytes, BdrvRequestFlags flags); static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore, bool ignore_bds_parents) @@ -1819,7 +1819,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, } static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags) + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; QEMUIOVector qiov; @@ -1834,6 +1834,8 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, bs->bl.request_alignment); int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); + bdrv_check_request(offset, bytes, &error_abort); + if (!drv) { return -ENOMEDIUM; } @@ -1849,7 +1851,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, assert(max_write_zeroes >= bs->bl.request_alignment); while (bytes > 0 && !ret) { - int num = bytes; + int64_t num = bytes; /* Align request. Block drivers can expect the "bulk" of the request * to be aligned, and that unaligned requests do not cross cluster From fcfd9ade687864dc194bac11150195e5ec884676 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:29 +0300 Subject: [PATCH 12/20] block/io: support int64_t bytes in bdrv_aligned_pwritev() We are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). So, prepare bdrv_aligned_pwritev() now and convert the dependencies: bdrv_co_write_req_prepare() and bdrv_co_write_req_finish() to signed type bytes. Conversion of bdrv_co_write_req_prepare() and bdrv_co_write_req_finish() is definitely safe, as all requests in block/io must not overflow BDRV_MAX_LENGTH. Still add assertions. For bdrv_aligned_pwritev() 'bytes' type is widened, so callers are safe. Let's check usage of the parameter inside the function. Passing to bdrv_co_write_req_prepare() and bdrv_co_write_req_finish() is OK. Passing to qemu_iovec_* is OK after new assertion. All other callees are already updated to int64_t. Checking alignment is not changed, offset + bytes and qiov_offset + bytes calculations are safe (thanks to new assertions). max_transfer is kept to be int for now. It has a default of INT_MAX here, and some drivers may rely on it. It's to be refactored later. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-12-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/block/io.c b/block/io.c index 98d9f5bdf4..59ae0a110d 100644 --- a/block/io.c +++ b/block/io.c @@ -1932,11 +1932,12 @@ fail: } static inline int coroutine_fn -bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, +bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes, BdrvTrackedRequest *req, int flags) { BlockDriverState *bs = child->bs; - int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + + bdrv_check_request(offset, bytes, &error_abort); if (bs->read_only) { return -EPERM; @@ -1963,7 +1964,8 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, assert(req->overlap_offset <= offset); assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); + assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE || + child->perm & BLK_PERM_RESIZE); switch (req->type) { case BDRV_TRACKED_WRITE: @@ -1984,12 +1986,14 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, } static inline void coroutine_fn -bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, +bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes, BdrvTrackedRequest *req, int ret) { int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); BlockDriverState *bs = child->bs; + bdrv_check_request(offset, bytes, &error_abort); + qatomic_inc(&bs->write_gen); /* @@ -2026,16 +2030,18 @@ bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, * after possibly fragmenting it. */ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + BdrvTrackedRequest *req, int64_t offset, int64_t bytes, int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) { BlockDriverState *bs = child->bs; BlockDriver *drv = bs->drv; int ret; - uint64_t bytes_remaining = bytes; + int64_t bytes_remaining = bytes; int max_transfer; + bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); + if (!drv) { return -ENOMEDIUM; } @@ -2047,7 +2053,6 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, assert(is_power_of_2(align)); assert((offset & (align - 1)) == 0); assert((bytes & (align - 1)) == 0); - assert(!qiov || qiov_offset + bytes <= qiov->size); max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), align); @@ -2146,7 +2151,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, assert(!bytes || (offset & (align - 1)) == 0); if (bytes >= align) { /* Write the aligned part in the middle. */ - uint64_t aligned_bytes = bytes & ~(align - 1); + int64_t aligned_bytes = bytes & ~(align - 1); ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, NULL, 0, flags); if (ret < 0) { From 9df5afbdd1ae0d8d5c3fb23e8c219ce7ffddd8f6 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:30 +0300 Subject: [PATCH 13/20] block/io: support int64_t bytes in bdrv_co_do_copy_on_readv() We are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). So, prepare bdrv_co_do_copy_on_readv() now. 'bytes' type widening, so callers are safe. Look at the function itself: bytes, skip_bytes and progress become int64_t. bdrv_round_to_clusters() is OK, cluster_bytes now may be large. trace_bdrv_co_do_copy_on_readv() is OK looping through cluster_bytes is still OK. pnum is still capped to max_transfer, and to MAX_BOUNCE_BUFFER when we are going to do COR operation. Therefor calculations in qemu_iovec_from_buf() and bdrv_driver_preadv() should not change. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-13-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 8 +++++--- block/trace-events | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/block/io.c b/block/io.c index 59ae0a110d..63b0fa0e9e 100644 --- a/block/io.c +++ b/block/io.c @@ -1311,7 +1311,7 @@ bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset, } static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, int flags) { BlockDriverState *bs = child->bs; @@ -1326,13 +1326,15 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, BlockDriver *drv = bs->drv; int64_t cluster_offset; int64_t cluster_bytes; - size_t skip_bytes; + int64_t skip_bytes; int ret; int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, BDRV_REQUEST_MAX_BYTES); - unsigned int progress = 0; + int64_t progress = 0; bool skip_write; + bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); + if (!drv) { return -ENOMEDIUM; } diff --git a/block/trace-events b/block/trace-events index ecbc32a80a..82b5dd7cb6 100644 --- a/block/trace-events +++ b/block/trace-events @@ -14,7 +14,7 @@ blk_root_detach(void *child, void *blk, void *bs) "child %p blk %p bs %p" bdrv_co_preadv(void *bs, int64_t offset, int64_t nbytes, unsigned int flags) "bs %p offset %"PRId64" nbytes %"PRId64" flags 0x%x" bdrv_co_pwritev(void *bs, int64_t offset, int64_t nbytes, unsigned int flags) "bs %p offset %"PRId64" nbytes %"PRId64" flags 0x%x" bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags 0x%x" -bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, int64_t cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %"PRId64 +bdrv_co_do_copy_on_readv(void *bs, int64_t offset, int64_t bytes, int64_t cluster_offset, int64_t cluster_bytes) "bs %p offset %" PRId64 " bytes %" PRId64 " cluster_offset %" PRId64 " cluster_bytes %" PRId64 bdrv_co_copy_range_from(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" bdrv_co_copy_range_to(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" From 8b0c5d7659f0d0c192f78867f4dc6026e9f3506f Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:31 +0300 Subject: [PATCH 14/20] block/io: support int64_t bytes in bdrv_aligned_preadv() We are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). So, prepare bdrv_aligned_preadv() now. Make the bytes variable in bdrv_padding_rmw_read() int64_t, as it is only used for pass-through to bdrv_aligned_preadv(). All bdrv_aligned_preadv() callers are safe as type is widening. Let's look inside: - add a new-style assertion that request is good. - callees bdrv_is_allocated(), bdrv_co_do_copy_on_readv() supports int64_t bytes - conversion of bytes_remaining is OK, as we never have requests overflowing BDRV_MAX_LENGTH - looping through bytes_remaining is ok, num is updated to int64_t - for bdrv_driver_preadv we have same limit of max_transfer - qemu_iovec_memset is OK, as bytes+qiov_offset should not overflow qiov->size anyway (thanks to bdrv_check_qiov_request()) Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-14-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake [eblake: grammar tweak] Signed-off-by: Eric Blake --- block/io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/io.c b/block/io.c index 63b0fa0e9e..cef284e3a1 100644 --- a/block/io.c +++ b/block/io.c @@ -1475,15 +1475,16 @@ err: * reads; any other features must be implemented by the caller. */ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + BdrvTrackedRequest *req, int64_t offset, int64_t bytes, int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) { BlockDriverState *bs = child->bs; int64_t total_bytes, max_bytes; int ret = 0; - uint64_t bytes_remaining = bytes; + int64_t bytes_remaining = bytes; int max_transfer; + bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); assert(is_power_of_2(align)); assert((offset & (align - 1)) == 0); assert((bytes & (align - 1)) == 0); @@ -1545,7 +1546,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, } while (bytes_remaining) { - int num; + int64_t num; if (max_bytes) { num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); @@ -1652,7 +1653,7 @@ static int bdrv_padding_rmw_read(BdrvChild *child, assert(req->serialising && pad->buf); if (pad->head || pad->merge_reads) { - uint64_t bytes = pad->merge_reads ? pad->buf_len : align; + int64_t bytes = pad->merge_reads ? pad->buf_len : align; qemu_iovec_init_buf(&local_qiov, pad->buf, bytes); From 37e9403ea87a473f96744af7583dbb3eaef8d0f6 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:32 +0300 Subject: [PATCH 15/20] block/io: support int64_t bytes in bdrv_co_p{read,write}v_part() We are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). So, prepare bdrv_co_preadv_part() and bdrv_co_pwritev_part() and their remaining dependencies now. bdrv_pad_request() is updated simultaneously, as pointer to bytes passed to it both from bdrv_co_pwritev_part() and bdrv_co_preadv_part(). So, all callers of bdrv_pad_request() are updated to pass 64bit bytes. bdrv_pad_request() is already good for 64bit requests, add corresponding assertion. Look at bdrv_co_preadv_part() and bdrv_co_pwritev_part(). Type is widening, so callers are safe. Let's look inside the functions. In bdrv_co_preadv_part() and bdrv_aligned_pwritev() we only pass bytes to other already int64_t interfaces (and some obviously safe calculations), it's OK. In bdrv_co_do_zero_pwritev() aligned_bytes may become large now, still it's passed to bdrv_aligned_pwritev which supports int64_t bytes. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-15-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 14 ++++++++------ block/trace-events | 4 ++-- include/block/block_int.h | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/block/io.c b/block/io.c index cef284e3a1..7b6b0027bc 100644 --- a/block/io.c +++ b/block/io.c @@ -1725,11 +1725,13 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad) */ static int bdrv_pad_request(BlockDriverState *bs, QEMUIOVector **qiov, size_t *qiov_offset, - int64_t *offset, unsigned int *bytes, + int64_t *offset, int64_t *bytes, BdrvRequestPadding *pad, bool *padded) { int ret; + bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort); + if (!bdrv_init_padding(bs, *offset, *bytes, pad)) { if (padded) { *padded = false; @@ -1764,7 +1766,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child, } int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, - int64_t offset, unsigned int bytes, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags) { @@ -1773,7 +1775,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, BdrvRequestPadding pad; int ret; - trace_bdrv_co_preadv(bs, offset, bytes, flags); + trace_bdrv_co_preadv_part(bs, offset, bytes, flags); if (!bdrv_is_inserted(bs)) { return -ENOMEDIUM; @@ -2117,7 +2119,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, - unsigned int bytes, + int64_t bytes, BdrvRequestFlags flags, BdrvTrackedRequest *req) { @@ -2191,7 +2193,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, } int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags) { BlockDriverState *bs = child->bs; @@ -2201,7 +2203,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, int ret; bool padded = false; - trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); + trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags); if (!bdrv_is_inserted(bs)) { return -ENOMEDIUM; diff --git a/block/trace-events b/block/trace-events index 82b5dd7cb6..b252457d8e 100644 --- a/block/trace-events +++ b/block/trace-events @@ -11,8 +11,8 @@ blk_root_attach(void *child, void *blk, void *bs) "child %p blk %p bs %p" blk_root_detach(void *child, void *blk, void *bs) "child %p blk %p bs %p" # io.c -bdrv_co_preadv(void *bs, int64_t offset, int64_t nbytes, unsigned int flags) "bs %p offset %"PRId64" nbytes %"PRId64" flags 0x%x" -bdrv_co_pwritev(void *bs, int64_t offset, int64_t nbytes, unsigned int flags) "bs %p offset %"PRId64" nbytes %"PRId64" flags 0x%x" +bdrv_co_preadv_part(void *bs, int64_t offset, int64_t bytes, unsigned int flags) "bs %p offset %" PRId64 " bytes %" PRId64 " flags 0x%x" +bdrv_co_pwritev_part(void *bs, int64_t offset, int64_t bytes, unsigned int flags) "bs %p offset %" PRId64 " bytes %" PRId64 " flags 0x%x" bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags 0x%x" bdrv_co_do_copy_on_readv(void *bs, int64_t offset, int64_t bytes, int64_t cluster_offset, int64_t cluster_bytes) "bs %p offset %" PRId64 " bytes %" PRId64 " cluster_offset %" PRId64 " cluster_bytes %" PRId64 bdrv_co_copy_range_from(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" diff --git a/include/block/block_int.h b/include/block/block_int.h index 7f41f0990c..f2ad8aa771 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1035,13 +1035,13 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, - int64_t offset, unsigned int bytes, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); int coroutine_fn bdrv_co_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, - int64_t offset, unsigned int bytes, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); static inline int coroutine_fn bdrv_co_pread(BdrvChild *child, From e9e52efdc53bf7746bdb3c21f1a9ee5da298c6a2 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:33 +0300 Subject: [PATCH 16/20] block/io: support int64_t bytes in read/write wrappers We are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). Now, since bdrv_co_preadv_part() and bdrv_co_pwritev_part() have been updated, update all their wrappers. For all of them type of 'bytes' is widening, so callers are safe. We have update request_fn in blkverify.c simultaneously. Still it's just a pointer to one of bdrv_co_pwritev() or bdrv_co_preadv(), and type is widening for callers of the request_fn anyway. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-16-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake [eblake: grammar tweak] Signed-off-by: Eric Blake --- block/blkverify.c | 2 +- block/io.c | 15 ++++++++------- block/trace-events | 2 +- include/block/block.h | 11 ++++++----- include/block/block_int.h | 4 ++-- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/block/blkverify.c b/block/blkverify.c index 4aed53ab59..943e62be9c 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -31,7 +31,7 @@ typedef struct BlkverifyRequest { uint64_t bytes; int flags; - int (*request_fn)(BdrvChild *, int64_t, unsigned int, QEMUIOVector *, + int (*request_fn)(BdrvChild *, int64_t, int64_t, QEMUIOVector *, BdrvRequestFlags); int ret; /* test image result */ diff --git a/block/io.c b/block/io.c index 7b6b0027bc..8817c84966 100644 --- a/block/io.c +++ b/block/io.c @@ -1005,7 +1005,7 @@ static int bdrv_check_request32(int64_t offset, int64_t bytes, } int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, - int bytes, BdrvRequestFlags flags) + int64_t bytes, BdrvRequestFlags flags) { return bdrv_pwritev(child, offset, bytes, NULL, BDRV_REQ_ZERO_WRITE | flags); @@ -1053,7 +1053,7 @@ int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) } /* See bdrv_pwrite() for the return codes */ -int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) +int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes) { int ret; QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); @@ -1073,7 +1073,8 @@ int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) -EINVAL Invalid offset or number of bytes -EACCES Trying to write a read-only device */ -int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) +int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, + int64_t bytes) { int ret; QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); @@ -1094,7 +1095,7 @@ int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) * Returns 0 on success, -errno in error cases. */ int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, - const void *buf, int count) + const void *buf, int64_t count) { int ret; @@ -1759,7 +1760,7 @@ static int bdrv_pad_request(BlockDriverState *bs, } int coroutine_fn bdrv_co_preadv(BdrvChild *child, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags); @@ -2186,7 +2187,7 @@ out: * Handle a write request in coroutine context */ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); @@ -2279,7 +2280,7 @@ out: } int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, - int bytes, BdrvRequestFlags flags) + int64_t bytes, BdrvRequestFlags flags) { trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); diff --git a/block/trace-events b/block/trace-events index b252457d8e..a8d3fe94e6 100644 --- a/block/trace-events +++ b/block/trace-events @@ -13,7 +13,7 @@ blk_root_detach(void *child, void *blk, void *bs) "child %p blk %p bs %p" # io.c bdrv_co_preadv_part(void *bs, int64_t offset, int64_t bytes, unsigned int flags) "bs %p offset %" PRId64 " bytes %" PRId64 " flags 0x%x" bdrv_co_pwritev_part(void *bs, int64_t offset, int64_t bytes, unsigned int flags) "bs %p offset %" PRId64 " bytes %" PRId64 " flags 0x%x" -bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags 0x%x" +bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int64_t bytes, int flags) "bs %p offset %" PRId64 " bytes %" PRId64 " flags 0x%x" bdrv_co_do_copy_on_readv(void *bs, int64_t offset, int64_t bytes, int64_t cluster_offset, int64_t cluster_bytes) "bs %p offset %" PRId64 " bytes %" PRId64 " cluster_offset %" PRId64 " cluster_bytes %" PRId64 bdrv_co_copy_range_from(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" bdrv_co_copy_range_to(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" diff --git a/include/block/block.h b/include/block/block.h index 81fcaad5ac..5f28d0d33f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -392,12 +392,13 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, void bdrv_reopen_commit(BDRVReopenState *reopen_state); void bdrv_reopen_abort(BDRVReopenState *reopen_state); int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, - int bytes, BdrvRequestFlags flags); + int64_t bytes, BdrvRequestFlags flags); int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags); -int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes); -int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes); +int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes); +int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, + int64_t bytes); int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, - const void *buf, int count); + const void *buf, int64_t bytes); /* * Efficiently zero a region of the disk image. Note that this is a regular * I/O request like read or write and should have a reasonable size. This @@ -405,7 +406,7 @@ int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, * because it may allocate memory for the entire region. */ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, - int bytes, BdrvRequestFlags flags); + int64_t bytes, BdrvRequestFlags flags); BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, const char *backing_file); void bdrv_refresh_filename(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index f2ad8aa771..749d1fb9d0 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1032,13 +1032,13 @@ extern BlockDriver bdrv_raw; extern BlockDriver bdrv_qcow2; int coroutine_fn bdrv_co_preadv(BdrvChild *child, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); int coroutine_fn bdrv_co_pwritev(BdrvChild *child, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, int64_t offset, int64_t bytes, From a5215b8fdf1f8b284b4e5ba4895a4b091b503444 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 11 Dec 2020 21:39:34 +0300 Subject: [PATCH 17/20] block/io: use int64_t bytes in copy_range We are generally moving to int64_t for both offset and bytes parameters on all io paths. Main motivation is realization of 64-bit write_zeroes operation for fast zeroing large disk chunks, up to the whole disk. We chose signed type, to be consistent with off_t (which is signed) and with possibility for signed return type (where negative value means error). So, convert now copy_range parameters which are already 64bit to signed type. It's safe as we don't work with requests overflowing BDRV_MAX_LENGTH (which is less than INT64_MAX), and do check the requests in bdrv_co_copy_range_internal() (by bdrv_check_request32(), which calls bdrv_check_request()). Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201211183934.169161-17-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/io.c | 22 +++++++++++----------- block/trace-events | 4 ++-- include/block/block.h | 6 +++--- include/block/block_int.h | 12 ++++++------ 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/block/io.c b/block/io.c index 8817c84966..b0435ed670 100644 --- a/block/io.c +++ b/block/io.c @@ -3201,8 +3201,8 @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host) } static int coroutine_fn bdrv_co_copy_range_internal( - BdrvChild *src, uint64_t src_offset, BdrvChild *dst, - uint64_t dst_offset, uint64_t bytes, + BdrvChild *src, int64_t src_offset, BdrvChild *dst, + int64_t dst_offset, int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, bool recurse_src) { @@ -3280,9 +3280,9 @@ static int coroutine_fn bdrv_co_copy_range_internal( * * See the comment of bdrv_co_copy_range for the parameter and return value * semantics. */ -int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, +int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { @@ -3296,9 +3296,9 @@ int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, * * See the comment of bdrv_co_copy_range for the parameter and return value * semantics. */ -int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, +int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { @@ -3308,9 +3308,9 @@ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, bytes, read_flags, write_flags, false); } -int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags read_flags, +int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { return bdrv_co_copy_range_from(src, src_offset, diff --git a/block/trace-events b/block/trace-events index a8d3fe94e6..1a12d634e2 100644 --- a/block/trace-events +++ b/block/trace-events @@ -15,8 +15,8 @@ bdrv_co_preadv_part(void *bs, int64_t offset, int64_t bytes, unsigned int flags) bdrv_co_pwritev_part(void *bs, int64_t offset, int64_t bytes, unsigned int flags) "bs %p offset %" PRId64 " bytes %" PRId64 " flags 0x%x" bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int64_t bytes, int flags) "bs %p offset %" PRId64 " bytes %" PRId64 " flags 0x%x" bdrv_co_do_copy_on_readv(void *bs, int64_t offset, int64_t bytes, int64_t cluster_offset, int64_t cluster_bytes) "bs %p offset %" PRId64 " bytes %" PRId64 " cluster_offset %" PRId64 " cluster_bytes %" PRId64 -bdrv_co_copy_range_from(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" -bdrv_co_copy_range_to(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" +bdrv_co_copy_range_from(void *src, int64_t src_offset, void *dst, int64_t dst_offset, int64_t bytes, int read_flags, int write_flags) "src %p offset %" PRId64 " dst %p offset %" PRId64 " bytes %" PRId64 " rw flags 0x%x 0x%x" +bdrv_co_copy_range_to(void *src, int64_t src_offset, void *dst, int64_t dst_offset, int64_t bytes, int read_flags, int write_flags) "src %p offset %" PRId64 " dst %p offset %" PRId64 " bytes %" PRId64 " rw flags 0x%x 0x%x" # stream.c stream_one_iteration(void *s, int64_t offset, uint64_t bytes, int is_allocated) "s %p offset %" PRId64 " bytes %" PRIu64 " is_allocated %d" diff --git a/include/block/block.h b/include/block/block.h index 5f28d0d33f..0a9f2c187c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -845,8 +845,8 @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host); * * Returns: 0 if succeeded; negative error code if failed. **/ -int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags read_flags, +int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags); #endif diff --git a/include/block/block_int.h b/include/block/block_int.h index 749d1fb9d0..22a2789d35 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1357,14 +1357,14 @@ void bdrv_dec_in_flight(BlockDriverState *bs); void blockdev_close_all_bdrv_states(void); -int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, +int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags); -int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, +int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags); From 3b5e4db6734d30e551101c0941b2a6140862ba40 Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Fri, 29 Jan 2021 10:38:57 +0300 Subject: [PATCH 18/20] block/nbd: only detach existing iochannel from aio_context When the reconnect in NBD client is in progress, the iochannel used for NBD connection doesn't exist. Therefore an attempt to detach it from the aio_context of the parent BlockDriverState results in a NULL pointer dereference. The problem is triggerable, in particular, when an outgoing migration is about to finish, and stopping the dataplane tries to move the BlockDriverState from the iothread aio_context to the main loop. If the NBD connection is lost before this point, and the NBD client has entered the reconnect procedure, QEMU crashes: #0 qemu_aio_coroutine_enter (ctx=0x5618056c7580, co=0x0) at /build/qemu-6MF7tq/qemu-5.0.1/util/qemu-coroutine.c:109 #1 0x00005618034b1b68 in nbd_client_attach_aio_context_bh ( opaque=0x561805ed4c00) at /build/qemu-6MF7tq/qemu-5.0.1/block/nbd.c:164 #2 0x000056180353116b in aio_wait_bh (opaque=0x7f60e1e63700) at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-wait.c:55 #3 0x0000561803530633 in aio_bh_call (bh=0x7f60d40a7e80) at /build/qemu-6MF7tq/qemu-5.0.1/util/async.c:136 #4 aio_bh_poll (ctx=ctx@entry=0x5618056c7580) at /build/qemu-6MF7tq/qemu-5.0.1/util/async.c:164 #5 0x0000561803533e5a in aio_poll (ctx=ctx@entry=0x5618056c7580, blocking=blocking@entry=true) at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-posix.c:650 #6 0x000056180353128d in aio_wait_bh_oneshot (ctx=0x5618056c7580, cb=, opaque=) at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-wait.c:71 #7 0x000056180345c50a in bdrv_attach_aio_context (new_context=0x5618056c7580, bs=0x561805ed4c00) at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6172 #8 bdrv_set_aio_context_ignore (bs=bs@entry=0x561805ed4c00, new_context=new_context@entry=0x5618056c7580, ignore=ignore@entry=0x7f60e1e63780) at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6237 #9 0x000056180345c969 in bdrv_child_try_set_aio_context ( bs=bs@entry=0x561805ed4c00, ctx=0x5618056c7580, ignore_child=, errp=) at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6332 #10 0x00005618034957db in blk_do_set_aio_context (blk=0x56180695b3f0, new_context=0x5618056c7580, update_root_node=update_root_node@entry=true, errp=errp@entry=0x0) at /build/qemu-6MF7tq/qemu-5.0.1/block/block-backend.c:1989 #11 0x00005618034980bd in blk_set_aio_context (blk=, new_context=, errp=errp@entry=0x0) at /build/qemu-6MF7tq/qemu-5.0.1/block/block-backend.c:2010 #12 0x0000561803197953 in virtio_blk_data_plane_stop (vdev=) at /build/qemu-6MF7tq/qemu-5.0.1/hw/block/dataplane/virtio-blk.c:292 #13 0x00005618033d67bf in virtio_bus_stop_ioeventfd (bus=0x5618056d9f08) at /build/qemu-6MF7tq/qemu-5.0.1/hw/virtio/virtio-bus.c:245 #14 0x00005618031c9b2e in virtio_vmstate_change (opaque=0x5618056d9f90, running=0, state=) at /build/qemu-6MF7tq/qemu-5.0.1/hw/virtio/virtio.c:3220 #15 0x0000561803208bfd in vm_state_notify (running=running@entry=0, state=state@entry=RUN_STATE_FINISH_MIGRATE) at /build/qemu-6MF7tq/qemu-5.0.1/softmmu/vl.c:1275 #16 0x0000561803155c02 in do_vm_stop (state=RUN_STATE_FINISH_MIGRATE, send_stop=) at /build/qemu-6MF7tq/qemu-5.0.1/cpus.c:1032 #17 0x00005618033e3765 in migration_completion (s=0x5618056e6960) at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:2914 #18 migration_iteration_run (s=0x5618056e6960) at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:3275 #19 migration_thread (opaque=opaque@entry=0x5618056e6960) at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:3439 #20 0x0000561803536ad6 in qemu_thread_start (args=) at /build/qemu-6MF7tq/qemu-5.0.1/util/qemu-thread-posix.c:519 #21 0x00007f61085d06ba in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0 #22 0x00007f610830641d in sysctl () from /lib/x86_64-linux-gnu/libc.so.6 #23 0x0000000000000000 in ?? () Fix it by checking that the iochannel is non-null before trying to detach it from the aio_context. If it is null, no detaching is needed, and it will get reattached in the proper aio_context once the connection is reestablished. Signed-off-by: Roman Kagan Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210129073859.683063-2-rvkagan@yandex-team.ru> Signed-off-by: Eric Blake --- block/nbd.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/nbd.c b/block/nbd.c index 42e10c7c93..bcd6641e90 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -235,7 +235,14 @@ static void nbd_client_detach_aio_context(BlockDriverState *bs) /* Timer is deleted in nbd_client_co_drain_begin() */ assert(!s->reconnect_delay_timer); - qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc)); + /* + * If reconnect is in progress we may have no ->ioc. It will be + * re-instantiated in the proper aio context once the connection is + * reestablished. + */ + if (s->ioc) { + qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc)); + } } static void nbd_client_attach_aio_context_bh(void *opaque) From ddde5ee769fcc84b96f879d7b94f35268f69ca3b Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Fri, 29 Jan 2021 10:38:58 +0300 Subject: [PATCH 19/20] block/nbd: only enter connection coroutine if it's present When an NBD block driver state is moved from one aio_context to another (e.g. when doing a drain in a migration thread), nbd_client_attach_aio_context_bh is executed that enters the connection coroutine. However, the assumption that ->connection_co is always present here appears incorrect: the connection may have encountered an error other than -EIO in the underlying transport, and thus may have decided to quit rather than keep trying to reconnect, and therefore it may have terminated the connection coroutine. As a result an attempt to reassign the client in this state (NBD_CLIENT_QUIT) to a different aio_context leads to a null pointer dereference: #0 qio_channel_detach_aio_context (ioc=0x0) at /build/qemu-gYtjVn/qemu-5.0.1/io/channel.c:452 #1 0x0000562a242824b3 in bdrv_detach_aio_context (bs=0x562a268d6a00) at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6151 #2 bdrv_set_aio_context_ignore (bs=bs@entry=0x562a268d6a00, new_context=new_context@entry=0x562a260c9580, ignore=ignore@entry=0x7feeadc9b780) at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6230 #3 0x0000562a24282969 in bdrv_child_try_set_aio_context (bs=bs@entry=0x562a268d6a00, ctx=0x562a260c9580, ignore_child=, errp=) at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6332 #4 0x0000562a242bb7db in blk_do_set_aio_context (blk=0x562a2735d0d0, new_context=0x562a260c9580, update_root_node=update_root_node@entry=true, errp=errp@entry=0x0) at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:1989 #5 0x0000562a242be0bd in blk_set_aio_context (blk=, new_context=, errp=errp@entry=0x0) at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:2010 #6 0x0000562a23fbd953 in virtio_blk_data_plane_stop (vdev=) at /build/qemu-gYtjVn/qemu-5.0.1/hw/block/dataplane/virtio-blk.c:292 #7 0x0000562a241fc7bf in virtio_bus_stop_ioeventfd (bus=0x562a260dbf08) at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio-bus.c:245 #8 0x0000562a23fefb2e in virtio_vmstate_change (opaque=0x562a260dbf90, running=0, state=) at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio.c:3220 #9 0x0000562a2402ebfd in vm_state_notify (running=running@entry=0, state=state@entry=RUN_STATE_FINISH_MIGRATE) at /build/qemu-gYtjVn/qemu-5.0.1/softmmu/vl.c:1275 #10 0x0000562a23f7bc02 in do_vm_stop (state=RUN_STATE_FINISH_MIGRATE, send_stop=) at /build/qemu-gYtjVn/qemu-5.0.1/cpus.c:1032 #11 0x0000562a24209765 in migration_completion (s=0x562a260e83a0) at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:2914 #12 migration_iteration_run (s=0x562a260e83a0) at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3275 #13 migration_thread (opaque=opaque@entry=0x562a260e83a0) at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3439 #14 0x0000562a2435ca96 in qemu_thread_start (args=) at /build/qemu-gYtjVn/qemu-5.0.1/util/qemu-thread-posix.c:519 #15 0x00007feed31466ba in start_thread (arg=0x7feeadc9c700) at pthread_create.c:333 #16 0x00007feed2e7c41d in __GI___sysctl (name=0x0, nlen=608471908, oldval=0x562a2452b138, oldlenp=0x0, newval=0x562a2452c5e0 <__func__.28102>, newlen=0) at ../sysdeps/unix/sysv/linux/sysctl.c:30 #17 0x0000000000000000 in ?? () Fix it by checking that the connection coroutine is non-null before trying to enter it. If it is null, no entering is needed, as the connection is probably going down anyway. Signed-off-by: Roman Kagan Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210129073859.683063-3-rvkagan@yandex-team.ru> Signed-off-by: Eric Blake --- block/nbd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index bcd6641e90..b3cbbeb4b0 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -250,13 +250,15 @@ static void nbd_client_attach_aio_context_bh(void *opaque) BlockDriverState *bs = opaque; BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - /* - * The node is still drained, so we know the coroutine has yielded in - * nbd_read_eof(), the only place where bs->in_flight can reach 0, or it is - * entered for the first time. Both places are safe for entering the - * coroutine. - */ - qemu_aio_coroutine_enter(bs->aio_context, s->connection_co); + if (s->connection_co) { + /* + * The node is still drained, so we know the coroutine has yielded in + * nbd_read_eof(), the only place where bs->in_flight can reach 0, or + * it is entered for the first time. Both places are safe for entering + * the coroutine. + */ + qemu_aio_coroutine_enter(bs->aio_context, s->connection_co); + } bdrv_dec_in_flight(bs); } From 5082fc82a6bc3fc06a04be47d39777c7cff61e5b Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Fri, 29 Jan 2021 10:38:59 +0300 Subject: [PATCH 20/20] nbd: make nbd_read* return -EIO on error NBD reconnect logic considers the error code from the functions that read NBD messages to tell if reconnect should be attempted or not: it is attempted on -EIO, otherwise the client transitions to NBD_CLIENT_QUIT state (see nbd_channel_error). This error code is propagated from the primitives like nbd_read. The problem, however, is that nbd_read itself turns every error into -1 rather than -EIO. As a result, if the NBD server happens to die while sending the message, the client in QEMU receives less data than it expects, considers it as a fatal error, and wouldn't attempt reestablishing the connection. Fix it by turning every negative return from qio_channel_read_all into -EIO returned from nbd_read. Apparently that was the original behavior, but got broken later. Also adjust nbd_readXX to follow. Fixes: e6798f06a6 ("nbd: generalize usage of nbd_read") Signed-off-by: Roman Kagan Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210129073859.683063-4-rvkagan@yandex-team.ru> Signed-off-by: Eric Blake --- include/block/nbd.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/block/nbd.h b/include/block/nbd.h index 4a52a43ef5..5f34d23bb0 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -364,7 +364,7 @@ static inline int nbd_read(QIOChannel *ioc, void *buffer, size_t size, if (desc) { error_prepend(errp, "Failed to read %s: ", desc); } - return -1; + return ret; } return 0; @@ -375,8 +375,9 @@ static inline int nbd_read##bits(QIOChannel *ioc, \ uint##bits##_t *val, \ const char *desc, Error **errp) \ { \ - if (nbd_read(ioc, val, sizeof(*val), desc, errp) < 0) { \ - return -1; \ + int ret = nbd_read(ioc, val, sizeof(*val), desc, errp); \ + if (ret < 0) { \ + return ret; \ } \ *val = be##bits##_to_cpu(*val); \ return 0; \