From e66e665f15736f5ee1fbd8087926cb0f1e52f61a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 25 Jan 2022 16:15:14 +0100 Subject: [PATCH 01/10] qemu-storage-daemon: Fix typo in vhost-user-blk help The syntax of the fd passing case misses the "addr.type=" key. Add it. Signed-off-by: Kevin Wolf Message-Id: <20220125151514.49035-1-kwolf@redhat.com> Reviewed-by: Hanna Reitz Signed-off-by: Kevin Wolf --- storage-daemon/qemu-storage-daemon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c index 9d76d1114d..ec9aa79b55 100644 --- a/storage-daemon/qemu-storage-daemon.c +++ b/storage-daemon/qemu-storage-daemon.c @@ -111,7 +111,7 @@ static void help(void) " export the specified block node as a\n" " vhost-user-blk device over UNIX domain socket\n" " --export [type=]vhost-user-blk,id=,node-name=,\n" -" fd,addr.str=[,writable=on|off]\n" +" addr.type=fd,addr.str=[,writable=on|off]\n" " [,logical-block-size=][,num-queues=]\n" " export the specified block node as a\n" " vhost-user-blk device over file descriptor\n" From c0829cb1fd5e0b35abfcf9fc3f04502c1ed5d7b6 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 24 Jan 2022 18:37:41 +0100 Subject: [PATCH 02/10] block: bdrv_set_backing_hd(): use drained section Graph modifications should be done in drained section. stream_prepare() handler of block stream job call bdrv_set_backing_hd() without using drained section and it's theoretically possible that some IO request will interleave with graph modification and will use outdated pointers to removed block nodes. Some other callers use bdrv_set_backing_hd() not caring about drained sections too. So it seems good to make a drained section exactly in bdrv_set_backing_hd(). Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20220124173741.2984056-1-vsementsov@virtuozzo.com> Signed-off-by: Kevin Wolf --- block.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block.c b/block.c index 7b3ce415d8..b54d59d1fa 100644 --- a/block.c +++ b/block.c @@ -3341,6 +3341,8 @@ int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, int ret; Transaction *tran = tran_new(); + bdrv_drained_begin(bs); + ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp); if (ret < 0) { goto out; @@ -3350,6 +3352,8 @@ int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, out: tran_finalize(tran, ret); + bdrv_drained_end(bs); + return ret; } From 520d8b40e898158bc9a2b416d1cbdb44d2260bc7 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 25 Jan 2022 16:14:35 +0100 Subject: [PATCH 03/10] block/export: Fix vhost-user-blk shutdown with requests in flight The vhost-user-blk export runs requests asynchronously in their own coroutine. When the vhost connection goes away and we want to stop the vhost-user server, we need to wait for these coroutines to stop before we can unmap the shared memory. Otherwise, they would still access the unmapped memory and crash. This introduces a refcount to VuServer which is increased when spawning a new request coroutine and decreased before the coroutine exits. The memory is only unmapped when the refcount reaches zero. Signed-off-by: Kevin Wolf Message-Id: <20220125151435.48792-1-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- block/export/vhost-user-blk-server.c | 5 +++++ include/qemu/vhost-user-server.h | 5 +++++ util/vhost-user-server.c | 22 ++++++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c index 1862563336..a129204c44 100644 --- a/block/export/vhost-user-blk-server.c +++ b/block/export/vhost-user-blk-server.c @@ -172,6 +172,7 @@ vu_blk_discard_write_zeroes(VuBlkExport *vexp, struct iovec *iov, return VIRTIO_BLK_S_IOERR; } +/* Called with server refcount increased, must decrease before returning */ static void coroutine_fn vu_blk_virtio_process_req(void *opaque) { VuBlkReq *req = opaque; @@ -286,10 +287,12 @@ static void coroutine_fn vu_blk_virtio_process_req(void *opaque) } vu_blk_req_complete(req); + vhost_user_server_unref(server); return; err: free(req); + vhost_user_server_unref(server); } static void vu_blk_process_vq(VuDev *vu_dev, int idx) @@ -310,6 +313,8 @@ static void vu_blk_process_vq(VuDev *vu_dev, int idx) Coroutine *co = qemu_coroutine_create(vu_blk_virtio_process_req, req); + + vhost_user_server_ref(server); qemu_coroutine_enter(co); } } diff --git a/include/qemu/vhost-user-server.h b/include/qemu/vhost-user-server.h index 121ea1dedf..cd43193b80 100644 --- a/include/qemu/vhost-user-server.h +++ b/include/qemu/vhost-user-server.h @@ -42,6 +42,8 @@ typedef struct { const VuDevIface *vu_iface; /* Protected by ctx lock */ + unsigned int refcount; + bool wait_idle; VuDev vu_dev; QIOChannel *ioc; /* The I/O channel with the client */ QIOChannelSocket *sioc; /* The underlying data channel with the client */ @@ -59,6 +61,9 @@ bool vhost_user_server_start(VuServer *server, void vhost_user_server_stop(VuServer *server); +void vhost_user_server_ref(VuServer *server); +void vhost_user_server_unref(VuServer *server); + void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx); void vhost_user_server_detach_aio_context(VuServer *server); diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c index f68287e811..f66fbba710 100644 --- a/util/vhost-user-server.c +++ b/util/vhost-user-server.c @@ -74,6 +74,20 @@ static void panic_cb(VuDev *vu_dev, const char *buf) error_report("vu_panic: %s", buf); } +void vhost_user_server_ref(VuServer *server) +{ + assert(!server->wait_idle); + server->refcount++; +} + +void vhost_user_server_unref(VuServer *server) +{ + server->refcount--; + if (server->wait_idle && !server->refcount) { + aio_co_wake(server->co_trip); + } +} + static bool coroutine_fn vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg) { @@ -177,6 +191,14 @@ static coroutine_fn void vu_client_trip(void *opaque) /* Keep running */ } + if (server->refcount) { + /* Wait for requests to complete before we can unmap the memory */ + server->wait_idle = true; + qemu_coroutine_yield(); + server->wait_idle = false; + } + assert(server->refcount == 0); + vu_deinit(vu_dev); /* vu_deinit() should have called remove_watch() */ From ac50419460cc45a66214e6d1c3e9d0d670522f8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 1 Feb 2022 12:26:54 +0100 Subject: [PATCH 04/10] block/export/fuse: Rearrange if-else-if ladder in fuse_fallocate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to safely maintain a mixture of #ifdef'ry with if-else-if ladder, rearrange the last statement (!mode) first. Since it is mutually exclusive with the other conditions, checking it first doesn't make any logical difference, but allows to add #ifdef'ry around in a more cleanly way. Suggested-by: Kevin Wolf Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20220201112655.344373-2-f4bug@amsat.org> Signed-off-by: Kevin Wolf --- block/export/fuse.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/block/export/fuse.c b/block/export/fuse.c index 6710d8aed8..d25e478c0a 100644 --- a/block/export/fuse.c +++ b/block/export/fuse.c @@ -629,7 +629,26 @@ static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode, length = MIN(length, blk_len - offset); } - if (mode & FALLOC_FL_PUNCH_HOLE) { + if (!mode) { + /* We can only fallocate at the EOF with a truncate */ + if (offset < blk_len) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + + if (offset > blk_len) { + /* No preallocation needed here */ + ret = fuse_do_truncate(exp, offset, true, PREALLOC_MODE_OFF); + if (ret < 0) { + fuse_reply_err(req, -ret); + return; + } + } + + ret = fuse_do_truncate(exp, offset + length, true, + PREALLOC_MODE_FALLOC); + } + else if (mode & FALLOC_FL_PUNCH_HOLE) { if (!(mode & FALLOC_FL_KEEP_SIZE)) { fuse_reply_err(req, EINVAL); return; @@ -665,25 +684,7 @@ static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode, } while (ret == 0 && length > 0); } #endif /* CONFIG_FALLOCATE_ZERO_RANGE */ - else if (!mode) { - /* We can only fallocate at the EOF with a truncate */ - if (offset < blk_len) { - fuse_reply_err(req, EOPNOTSUPP); - return; - } - - if (offset > blk_len) { - /* No preallocation needed here */ - ret = fuse_do_truncate(exp, offset, true, PREALLOC_MODE_OFF); - if (ret < 0) { - fuse_reply_err(req, -ret); - return; - } - } - - ret = fuse_do_truncate(exp, offset + length, true, - PREALLOC_MODE_FALLOC); - } else { + else { ret = -EOPNOTSUPP; } From 3c9c70347b8e636c08035f39288f8cdd2e68bbda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 1 Feb 2022 12:26:55 +0100 Subject: [PATCH 05/10] block/export/fuse: Fix build failure on FreeBSD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building on FreeBSD we get: [816/6851] Compiling C object libblockdev.fa.p/block_export_fuse.c.o ../block/export/fuse.c:628:16: error: use of undeclared identifier 'FALLOC_FL_KEEP_SIZE' if (mode & FALLOC_FL_KEEP_SIZE) { ^ ../block/export/fuse.c:651:16: error: use of undeclared identifier 'FALLOC_FL_PUNCH_HOLE' if (mode & FALLOC_FL_PUNCH_HOLE) { ^ ../block/export/fuse.c:652:22: error: use of undeclared identifier 'FALLOC_FL_KEEP_SIZE' if (!(mode & FALLOC_FL_KEEP_SIZE)) { ^ 3 errors generated. FAILED: libblockdev.fa.p/block_export_fuse.c.o Meson indeed reported FALLOC_FL_PUNCH_HOLE is not available: C compiler for the host machine: cc (clang 10.0.1 "FreeBSD clang version 10.0.1") Checking for function "fallocate" : NO Checking for function "posix_fallocate" : YES Header has symbol "FALLOC_FL_PUNCH_HOLE" : NO Header has symbol "FALLOC_FL_ZERO_RANGE" : NO ... Similarly to commit 304332039 ("block/export/fuse.c: fix musl build"), guard the code requiring FALLOC_FL_KEEP_SIZE / FALLOC_FL_PUNCH_HOLE definitions under CONFIG_FALLOCATE_PUNCH_HOLE #ifdef'ry. Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20220201112655.344373-3-f4bug@amsat.org> Signed-off-by: Kevin Wolf --- block/export/fuse.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/export/fuse.c b/block/export/fuse.c index d25e478c0a..fdda8e3c81 100644 --- a/block/export/fuse.c +++ b/block/export/fuse.c @@ -625,9 +625,11 @@ static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode, return; } +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE if (mode & FALLOC_FL_KEEP_SIZE) { length = MIN(length, blk_len - offset); } +#endif /* CONFIG_FALLOCATE_PUNCH_HOLE */ if (!mode) { /* We can only fallocate at the EOF with a truncate */ @@ -648,6 +650,7 @@ static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode, ret = fuse_do_truncate(exp, offset + length, true, PREALLOC_MODE_FALLOC); } +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE else if (mode & FALLOC_FL_PUNCH_HOLE) { if (!(mode & FALLOC_FL_KEEP_SIZE)) { fuse_reply_err(req, EINVAL); @@ -662,6 +665,7 @@ static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode, length -= size; } while (ret == 0 && length > 0); } +#endif /* CONFIG_FALLOCATE_PUNCH_HOLE */ #ifdef CONFIG_FALLOCATE_ZERO_RANGE else if (mode & FALLOC_FL_ZERO_RANGE) { if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + length > blk_len) { From ef6ec0d7793bec9e529fd967c25c8300ef6b3cd2 Mon Sep 17 00:00:00 2001 From: Emanuele Giuseppe Esposito Date: Mon, 31 Jan 2022 07:56:15 -0500 Subject: [PATCH 06/10] block.h: remove outdated comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment "disk I/O throttling" doesn't make any sense at all any more. It was added in commit 0563e191516 to describe bdrv_io_limits_enable()/disable(), which were removed in commit 97148076, so the comment is just a forgotten leftover. Suggested-by: Kevin Wolf Signed-off-by: Emanuele Giuseppe Esposito Message-Id: <20220131125615.74612-1-eesposit@redhat.com> Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Kevin Wolf --- include/block/block.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/block/block.h b/include/block/block.h index 9d4050220b..e1713ee306 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -344,7 +344,6 @@ typedef unsigned int BdrvChildRole; char *bdrv_perm_names(uint64_t perm); uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm); -/* disk I/O throttling */ void bdrv_init(void); void bdrv_init_with_whitelist(void); bool bdrv_uses_whitelist(void); From cb90ec3a3646a3bdb0b2a157db226db25d470442 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Mon, 31 Jan 2022 11:31:24 +0100 Subject: [PATCH 07/10] qsd: Document fuse's allow-other option We did not add documentation to the storage daemon's man page for fuse's allow-other option when it was introduced, so do that now. Fixes: 8fc54f9428b9763f800 ("export/fuse: Add allow-other option") Signed-off-by: Hanna Reitz Message-Id: <20220131103124.20325-1-hreitz@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- docs/tools/qemu-storage-daemon.rst | 9 +++++++-- storage-daemon/qemu-storage-daemon.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst index 9b0eaba6e5..878e6a5c5c 100644 --- a/docs/tools/qemu-storage-daemon.rst +++ b/docs/tools/qemu-storage-daemon.rst @@ -76,7 +76,7 @@ Standard options: .. option:: --export [type=]nbd,id=,node-name=[,name=][,writable=on|off][,bitmap=] --export [type=]vhost-user-blk,id=,node-name=,addr.type=unix,addr.path=[,writable=on|off][,logical-block-size=][,num-queues=] --export [type=]vhost-user-blk,id=,node-name=,addr.type=fd,addr.str=[,writable=on|off][,logical-block-size=][,num-queues=] - --export [type=]fuse,id=,node-name=,mountpoint=[,growable=on|off][,writable=on|off] + --export [type=]fuse,id=,node-name=,mountpoint=[,growable=on|off][,writable=on|off][,allow-other=on|off|auto] is a block export definition. ``node-name`` is the block node that should be exported. ``writable`` determines whether or not the export allows write @@ -103,7 +103,12 @@ Standard options: mounted). Consequently, applications that have opened the given file before the export became active will continue to see its original content. If ``growable`` is set, writes after the end of the exported file will grow the - block node to fit. + block node to fit. The ``allow-other`` option controls whether users other + than the user running the process will be allowed to access the export. Note + that enabling this option as a non-root user requires enabling the + user_allow_other option in the global fuse.conf configuration file. Setting + ``allow-other`` to auto (the default) will try enabling this option, and on + error fall back to disabling it. .. option:: --monitor MONITORDEF diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c index ec9aa79b55..504d33aa91 100644 --- a/storage-daemon/qemu-storage-daemon.c +++ b/storage-daemon/qemu-storage-daemon.c @@ -100,7 +100,7 @@ static void help(void) "\n" #ifdef CONFIG_FUSE " --export [type=]fuse,id=,node-name=,mountpoint=\n" -" [,growable=on|off][,writable=on|off]\n" +" [,growable=on|off][,writable=on|off][,allow-other=on|off|auto]\n" " export the specified block node over FUSE\n" "\n" #endif /* CONFIG_FUSE */ From 111fbd74f67575c158d9be5363825aab8be50a0a Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Mon, 31 Jan 2022 14:59:08 +0100 Subject: [PATCH 08/10] qemu-img: Unify [-b [-F]] documentation qemu-img convert documents the backing file and backing format options as follows: [-B backing_file [-F backing_fmt]] whereas qemu-img create has this: [-b backing_file] [-F backing_fmt] That is, for convert, we document that -F cannot be given without -B, while for create, way say that they are independent. Indeed, it is technically possible to give -F without -b, because it is left to the block driver to decide whether this is an error or not, so sometimes it is: $ qemu-img create -f qed -F qed test.qed 64M Formatting 'test.qed', fmt=qed size=67108864 backing_fmt=qed [...] And sometimes it is not: $ qemu-img create -f qcow2 -F qcow2 test.qcow2 64M Formatting 'test.qcow2', fmt=qcow2 cluster_size=65536 [...] qemu-img: test.qcow2: Backing format cannot be used without backing file Generally, it does not make much sense, though, and users should only give -F with -b, so document it that way, as we have already done for qemu-img convert (commit 1899bf47375ad40555dcdff12ba49b4b8b82df38). Reported-by: Tingting Mao Signed-off-by: Hanna Reitz Message-Id: <20220131135908.32393-1-hreitz@redhat.com> Signed-off-by: Kevin Wolf --- docs/tools/qemu-img.rst | 2 +- qemu-img-cmds.hx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index d663dd92bd..8885ea11cf 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -463,7 +463,7 @@ Command description: ``--skip-broken-bitmaps`` is also specified to copy only the consistent bitmaps. -.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE] [-F BACKING_FMT] [-u] [-o OPTIONS] FILENAME [SIZE] +.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-o OPTIONS] FILENAME [SIZE] Create the new disk image *FILENAME* of size *SIZE* and format *FMT*. Depending on the file format, you can add one or more *OPTIONS* diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index 72bcdcfbfa..1b1dab5b17 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -52,9 +52,9 @@ SRST ERST DEF("create", img_create, - "create [--object objectdef] [-q] [-f fmt] [-b backing_file] [-F backing_fmt] [-u] [-o options] filename [size]") + "create [--object objectdef] [-q] [-f fmt] [-b backing_file [-F backing_fmt]] [-u] [-o options] filename [size]") SRST -.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE] [-F BACKING_FMT] [-u] [-o OPTIONS] FILENAME [SIZE] +.. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE [-F BACKING_FMT]] [-u] [-o OPTIONS] FILENAME [SIZE] ERST DEF("dd", img_dd, From 9e302f64bb407a9bb097b626da97228c2654cfee Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 13 Jan 2022 15:44:25 +0100 Subject: [PATCH 09/10] block/rbd: fix handling of holes in .bdrv_co_block_status the assumption that we can't hit a hole if we do not diff against a snapshot was wrong. We can see a hole in an image if we diff against base if there exists an older snapshot of the image and we have discarded blocks in the image where the snapshot has data. Fix this by simply handling a hole like an unallocated area. There are no callbacks for unallocated areas so just bail out if we hit a hole. Fixes: 0347a8fd4c3faaedf119be04c197804be40a384b Suggested-by: Ilya Dryomov Cc: qemu-stable@nongnu.org Signed-off-by: Peter Lieven Message-Id: <20220113144426.4036493-2-pl@kamp.de> Reviewed-by: Ilya Dryomov Reviewed-by: Stefano Garzarella Signed-off-by: Kevin Wolf --- block/rbd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index def96292e0..20bb896c4a 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -1279,11 +1279,11 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len, RBDDiffIterateReq *req = opaque; assert(req->offs + req->bytes <= offs); - /* - * we do not diff against a snapshot so we should never receive a callback - * for a hole. - */ - assert(exists); + + /* treat a hole like an unallocated area and bail out */ + if (!exists) { + return 0; + } if (!req->exists && offs > req->offs) { /* From fc176116cdea816ceb8dd969080b2b95f58edbc0 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 13 Jan 2022 15:44:26 +0100 Subject: [PATCH 10/10] block/rbd: workaround for ceph issue #53784 librbd had a bug until early 2022 that affected all versions of ceph that supported fast-diff. This bug results in reporting of incorrect offsets if the offset parameter to rbd_diff_iterate2 is not object aligned. This patch works around this bug for pre Quincy versions of librbd. Fixes: 0347a8fd4c3faaedf119be04c197804be40a384b Cc: qemu-stable@nongnu.org Signed-off-by: Peter Lieven Message-Id: <20220113144426.4036493-3-pl@kamp.de> Reviewed-by: Ilya Dryomov Reviewed-by: Stefano Garzarella Tested-by: Stefano Garzarella Signed-off-by: Kevin Wolf --- block/rbd.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 20bb896c4a..8f183eba2a 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -1320,6 +1320,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs, int status, r; RBDDiffIterateReq req = { .offs = offset }; uint64_t features, flags; + uint64_t head = 0; assert(offset + bytes <= s->image_size); @@ -1347,7 +1348,43 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs, return status; } - r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true, +#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 17, 0) + /* + * librbd had a bug until early 2022 that affected all versions of ceph that + * supported fast-diff. This bug results in reporting of incorrect offsets + * if the offset parameter to rbd_diff_iterate2 is not object aligned. + * Work around this bug by rounding down the offset to object boundaries. + * This is OK because we call rbd_diff_iterate2 with whole_object = true. + * However, this workaround only works for non cloned images with default + * striping. + * + * See: https://tracker.ceph.com/issues/53784 + */ + + /* check if RBD image has non-default striping enabled */ + if (features & RBD_FEATURE_STRIPINGV2) { + return status; + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + /* + * check if RBD image is a clone (= has a parent). + * + * rbd_get_parent_info is deprecated from Nautilus onwards, but the + * replacement rbd_get_parent is not present in Luminous and Mimic. + */ + if (rbd_get_parent_info(s->image, NULL, 0, NULL, 0, NULL, 0) != -ENOENT) { + return status; + } +#pragma GCC diagnostic pop + + head = req.offs & (s->object_size - 1); + req.offs -= head; + bytes += head; +#endif + + r = rbd_diff_iterate2(s->image, NULL, req.offs, bytes, true, true, qemu_rbd_diff_iterate_cb, &req); if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) { return status; @@ -1366,7 +1403,8 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs, status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID; } - *pnum = req.bytes; + assert(req.bytes > head); + *pnum = req.bytes - head; return status; }