From 13a028336f2c05e7ff47dfdaf30dfac7f4883e80 Mon Sep 17 00:00:00 2001 From: Ari Sundholm Date: Tue, 19 Oct 2021 14:09:55 +0300 Subject: [PATCH 01/12] block/file-posix: Fix return value translation for AIO discards AIO discards regressed as a result of the following commit: 0dfc7af2 block/file-posix: Optimize for macOS When trying to run blkdiscard within a Linux guest, the request would fail, with some errors in dmesg: ---- [ snip ] ---- [ 4.010070] sd 2:0:0:0: [sda] tag#0 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [ 4.011061] sd 2:0:0:0: [sda] tag#0 Sense Key : Aborted Command [current] [ 4.011061] sd 2:0:0:0: [sda] tag#0 Add. Sense: I/O process terminated [ 4.011061] sd 2:0:0:0: [sda] tag#0 CDB: Unmap/Read sub-channel 42 00 00 00 00 00 00 00 18 00 [ 4.011061] blk_update_request: I/O error, dev sda, sector 0 ---- [ snip ] ---- This turns out to be a result of a flaw in changes to the error value translation logic in handle_aiocb_discard(). The default return value may be left untranslated in some configurations, and the wrong variable is used in one translation. Fix both issues. Fixes: 0dfc7af2b28 ("block/file-posix: Optimize for macOS") Cc: qemu-stable@nongnu.org Signed-off-by: Ari Sundholm Signed-off-by: Emil Karlson Reviewed-by: Akihiko Odaki Reviewed-by: Stefan Hajnoczi Message-Id: <20211019110954.4170931-1-ari@tuxera.com> Signed-off-by: Kevin Wolf --- block/file-posix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 53be0bdc1b..6def2a4cba 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1807,7 +1807,7 @@ static int handle_aiocb_copy_range(void *opaque) static int handle_aiocb_discard(void *opaque) { RawPosixAIOData *aiocb = opaque; - int ret = -EOPNOTSUPP; + int ret = -ENOTSUP; BDRVRawState *s = aiocb->bs->opaque; if (!s->has_discard) { @@ -1829,7 +1829,7 @@ static int handle_aiocb_discard(void *opaque) #ifdef CONFIG_FALLOCATE_PUNCH_HOLE ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, aiocb->aio_offset, aiocb->aio_nbytes); - ret = translate_err(-errno); + ret = translate_err(ret); #elif defined(__APPLE__) && (__MACH__) fpunchhole_t fpunchhole; fpunchhole.fp_flags = 0; From bfb8aa6d583b09378dcdb85d40c7951e44acd09f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 18 Oct 2021 15:47:14 +0200 Subject: [PATCH 02/12] block: Fail gracefully when blockdev-snapshot creates loops Using blockdev-snapshot to append a node as an overlay to itself, or to any of its parents, causes crashes. Catch the condition and return an error for these cases instead. 
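
The check reused for this is a recursive walk over a node's children. A
minimal sketch of that kind of walk (illustrative only, assuming QEMU's
block-layer types; the patch itself reuses the bdrv_recurse_has_child()
helper that already exists in block.c rather than adding the simplified
version shown here):

    /*
     * Simplified sketch, not the actual block.c implementation: report
     * whether @child is @bs itself or is reachable from @bs through its
     * children. If the prospective parent is reachable from the
     * prospective child this way, attaching them would close a cycle.
     */
    static bool recurse_has_child_sketch(BlockDriverState *bs,
                                         BlockDriverState *child)
    {
        BdrvChild *c;

        if (bs == child) {
            return true;
        }
        QLIST_FOREACH(c, &bs->children, next) {
            if (recurse_has_child_sketch(c->bs, child)) {
                return true;
            }
        }
        return false;
    }
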
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1824363 Signed-off-by: Kevin Wolf Message-Id: <20211018134714.48438-1-kwolf@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Kevin Wolf --- block.c | 10 ++++++++++ tests/qemu-iotests/085 | 31 ++++++++++++++++++++++++++++++- tests/qemu-iotests/085.out | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 70 insertions(+), 4 deletions(-) diff --git a/block.c b/block.c index 45f653a88b..580cb77a70 100644 --- a/block.c +++ b/block.c @@ -84,6 +84,9 @@ static BlockDriverState *bdrv_open_inherit(const char *filename, BdrvChildRole child_role, Error **errp); +static bool bdrv_recurse_has_child(BlockDriverState *bs, + BlockDriverState *child); + static void bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs); static void bdrv_remove_file_or_backing_child(BlockDriverState *bs, @@ -2673,6 +2676,7 @@ static void bdrv_replace_child_noperm(BdrvChild *child, int drain_saldo; assert(!child->frozen); + assert(old_bs != new_bs); if (old_bs && new_bs) { assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs)); @@ -2892,6 +2896,12 @@ static int bdrv_attach_child_noperm(BlockDriverState *parent_bs, assert(parent_bs->drv); + if (bdrv_recurse_has_child(child_bs, parent_bs)) { + error_setg(errp, "Making '%s' a %s child of '%s' would create a cycle", + child_bs->node_name, child_name, parent_bs->node_name); + return -EINVAL; + } + bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm); bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL, perm, shared_perm, &perm, &shared_perm); diff --git a/tests/qemu-iotests/085 b/tests/qemu-iotests/085 index d557522943..de74262a26 100755 --- a/tests/qemu-iotests/085 +++ b/tests/qemu-iotests/085 @@ -103,11 +103,18 @@ do_blockdev_add() } # ${1}: unique identifier for the snapshot filename -add_snapshot_image() +create_snapshot_image() { base_image="${TEST_DIR}/$((${1}-1))-${snapshot_virt0}" snapshot_file="${TEST_DIR}/${1}-${snapshot_virt0}" TEST_IMG=$snapshot_file _make_test_img -u -b "${base_image}" -F $IMGFMT "$size" +} + +# ${1}: unique identifier for the snapshot filename +add_snapshot_image() +{ + snapshot_file="${TEST_DIR}/${1}-${snapshot_virt0}" + create_snapshot_image "$1" do_blockdev_add "$1" "'backing': null, " "${snapshot_file}" } @@ -230,6 +237,28 @@ _make_test_img -b "${TEST_IMG}.base" -F $IMGFMT "$size" do_blockdev_add ${SNAPSHOTS} "" "${TEST_IMG}" blockdev_snapshot ${SNAPSHOTS} error +echo +echo === Invalid command - creating loops === +echo + +SNAPSHOTS=$((${SNAPSHOTS}+1)) +add_snapshot_image ${SNAPSHOTS} + +_send_qemu_cmd $h "{ 'execute': 'blockdev-snapshot', + 'arguments': { 'node':'snap_${SNAPSHOTS}', + 'overlay':'snap_${SNAPSHOTS}' } + }" "error" + +SNAPSHOTS=$((${SNAPSHOTS}+1)) +create_snapshot_image ${SNAPSHOTS} +do_blockdev_add ${SNAPSHOTS} "'backing': 'snap_$((${SNAPSHOTS}-1))', " \ + "${TEST_DIR}/${SNAPSHOTS}-${snapshot_virt0}" + +_send_qemu_cmd $h "{ 'execute': 'blockdev-snapshot', + 'arguments': { 'node':'snap_${SNAPSHOTS}', + 'overlay':'snap_$((${SNAPSHOTS}-1))' } + }" "error" + echo echo === Invalid command - The node does not exist === echo diff --git a/tests/qemu-iotests/085.out b/tests/qemu-iotests/085.out index 1d4c565b6d..b543b992ff 100644 --- a/tests/qemu-iotests/085.out +++ b/tests/qemu-iotests/085.out @@ -217,15 +217,42 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/ 'overlay':'snap_13' } } {"error": {"class": "GenericError", "desc": "The overlay already has a backing image"}} +=== Invalid command 
- creating loops === + +Formatting 'TEST_DIR/14-snapshot-v0.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/13-snapshot-v0.IMGFMT backing_fmt=IMGFMT +{ 'execute': 'blockdev-add', 'arguments': + { 'driver': 'IMGFMT', 'node-name': 'snap_14', 'backing': null, + 'file': + { 'driver': 'file', 'filename': 'TEST_DIR/14-snapshot-v0.IMGFMT', + 'node-name': 'file_14' } } } +{"return": {}} +{ 'execute': 'blockdev-snapshot', + 'arguments': { 'node':'snap_14', + 'overlay':'snap_14' } + } +{"error": {"class": "GenericError", "desc": "Making 'snap_14' a backing child of 'snap_14' would create a cycle"}} +Formatting 'TEST_DIR/15-snapshot-v0.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/14-snapshot-v0.IMGFMT backing_fmt=IMGFMT +{ 'execute': 'blockdev-add', 'arguments': + { 'driver': 'IMGFMT', 'node-name': 'snap_15', 'backing': 'snap_14', + 'file': + { 'driver': 'file', 'filename': 'TEST_DIR/15-snapshot-v0.IMGFMT', + 'node-name': 'file_15' } } } +{"return": {}} +{ 'execute': 'blockdev-snapshot', + 'arguments': { 'node':'snap_15', + 'overlay':'snap_14' } + } +{"error": {"class": "GenericError", "desc": "Making 'snap_15' a backing child of 'snap_14' would create a cycle"}} + === Invalid command - The node does not exist === { 'execute': 'blockdev-snapshot', 'arguments': { 'node': 'virtio0', - 'overlay':'snap_14' } } -{"error": {"class": "GenericError", "desc": "Cannot find device='snap_14' nor node-name='snap_14'"}} + 'overlay':'snap_16' } } +{"error": {"class": "GenericError", "desc": "Cannot find device='snap_16' nor node-name='snap_16'"}} { 'execute': 'blockdev-snapshot', 'arguments': { 'node':'nodevice', - 'overlay':'snap_13' } + 'overlay':'snap_15' } } {"error": {"class": "GenericError", "desc": "Cannot find device='nodevice' nor node-name='nodevice'"}} *** done From 0347a8fd4c3faaedf119be04c197804be40a384b Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Tue, 12 Oct 2021 17:22:31 +0200 Subject: [PATCH 03/12] block/rbd: implement bdrv_co_block_status the qemu rbd driver currently lacks support for bdrv_co_block_status. This results mainly in incorrect progress during block operations (e.g. qemu-img convert with an rbd image as source). This patch utilizes the rbd_diff_iterate2 call from librbd to detect allocated and unallocated (all zero areas). To avoid querying the ceph OSDs for the answer this is only done if the image has the fast-diff feature which depends on the object-map and exclusive-lock features. In this case it is guaranteed that the information is present in memory in the librbd client and thus very fast. If fast-diff is not available all areas are reported to be allocated which is the current behaviour if bdrv_co_block_status is not implemented. 
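
To illustrate the bookkeeping in the rbd_diff_iterate2 callback added
below, here is a small worked example with made-up extents (standalone
sketch; in practice librbd invokes the callback, it is only called by
hand here):

    static void diff_cb_worked_example(void)
    {
        RBDDiffIterateReq req = { .offs = 0 };

        /* allocated extent [0, 4096): req.exists becomes true, req.bytes = 4096 */
        qemu_rbd_diff_iterate_cb(0, 4096, 1, &req);

        /*
         * The next allocated extent starts at 8192, i.e. after a hole, so
         * the callback returns QEMU_RBD_EXIT_DIFF_ITERATE2 and leaves
         * req.bytes at 4096: only the leading allocated area is reported
         * for this query.
         */
        qemu_rbd_diff_iterate_cb(8192, 4096, 1, &req);
    }
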
Signed-off-by: Peter Lieven Message-Id: <20211012152231.24868-1-pl@kamp.de> Reviewed-by: Ilya Dryomov Signed-off-by: Kevin Wolf --- block/rbd.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/block/rbd.c b/block/rbd.c index 701fbf2b0c..def96292e0 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -97,6 +97,12 @@ typedef struct RBDTask { int64_t ret; } RBDTask; +typedef struct RBDDiffIterateReq { + uint64_t offs; + uint64_t bytes; + bool exists; +} RBDDiffIterateReq; + static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx, BlockdevOptionsRbd *opts, bool cache, const char *keypairs, const char *secretid, @@ -1259,6 +1265,111 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs, return spec_info; } +/* + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative + * value in the callback routine. Choose a value that does not conflict with + * an existing exitcode and return it if we want to prematurely stop the + * execution because we detected a change in the allocation status. + */ +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000 + +static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len, + int exists, void *opaque) +{ + RBDDiffIterateReq *req = opaque; + + assert(req->offs + req->bytes <= offs); + /* + * we do not diff against a snapshot so we should never receive a callback + * for a hole. + */ + assert(exists); + + if (!req->exists && offs > req->offs) { + /* + * we started in an unallocated area and hit the first allocated + * block. req->bytes must be set to the length of the unallocated area + * before the allocated area. stop further processing. + */ + req->bytes = offs - req->offs; + return QEMU_RBD_EXIT_DIFF_ITERATE2; + } + + if (req->exists && offs > req->offs + req->bytes) { + /* + * we started in an allocated area and jumped over an unallocated area, + * req->bytes contains the length of the allocated area before the + * unallocated area. stop further processing. + */ + return QEMU_RBD_EXIT_DIFF_ITERATE2; + } + + req->bytes += len; + req->exists = true; + + return 0; +} + +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs, + bool want_zero, int64_t offset, + int64_t bytes, int64_t *pnum, + int64_t *map, + BlockDriverState **file) +{ + BDRVRBDState *s = bs->opaque; + int status, r; + RBDDiffIterateReq req = { .offs = offset }; + uint64_t features, flags; + + assert(offset + bytes <= s->image_size); + + /* default to all sectors allocated */ + status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; + *map = offset; + *file = bs; + *pnum = bytes; + + /* check if RBD image supports fast-diff */ + r = rbd_get_features(s->image, &features); + if (r < 0) { + return status; + } + if (!(features & RBD_FEATURE_FAST_DIFF)) { + return status; + } + + /* check if RBD fast-diff result is valid */ + r = rbd_get_flags(s->image, &flags); + if (r < 0) { + return status; + } + if (flags & RBD_FLAG_FAST_DIFF_INVALID) { + return status; + } + + r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true, + qemu_rbd_diff_iterate_cb, &req); + if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) { + return status; + } + assert(req.bytes <= bytes); + if (!req.exists) { + if (r == 0) { + /* + * rbd_diff_iterate2 does not invoke callbacks for unallocated + * areas. This here catches the case where no callback was + * invoked at all (req.bytes == 0). 
+ */ + assert(req.bytes == 0); + req.bytes = bytes; + } + status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID; + } + + *pnum = req.bytes; + return status; +} + static int64_t qemu_rbd_getlength(BlockDriverState *bs) { BDRVRBDState *s = bs->opaque; @@ -1494,6 +1605,7 @@ static BlockDriver bdrv_rbd = { #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES .bdrv_co_pwrite_zeroes = qemu_rbd_co_pwrite_zeroes, #endif + .bdrv_co_block_status = qemu_rbd_co_block_status, .bdrv_snapshot_create = qemu_rbd_snap_create, .bdrv_snapshot_delete = qemu_rbd_snap_remove, From 46e018e9b741731842b93ce23a86fad60445969b Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Tue, 24 Aug 2021 12:43:44 +0200 Subject: [PATCH 04/12] ide: Cap LBA28 capacity announcement to 2^28-1 The LBA28 capacity (at offsets 60/61 of identification) is supposed to express the maximum size supported by LBA28 commands. If the device is larger than this, we have to cap it to 2^28-1. At least NetBSD happens to be using this value to determine whether to use LBA28 or LBA48 for its commands, using LBA28 for sectors that don't need LBA48. This commit thus fixes NetBSD access to disks larger than 128GiB. Signed-off-by: Samuel Thibault Message-Id: <20210824104344.3878849-1-samuel.thibault@ens-lyon.org> Signed-off-by: Kevin Wolf --- hw/ide/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index fd69ca3167..e28f8aad61 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -98,8 +98,12 @@ static void put_le16(uint16_t *p, unsigned int v) static void ide_identify_size(IDEState *s) { uint16_t *p = (uint16_t *)s->identify_data; - put_le16(p + 60, s->nb_sectors); - put_le16(p + 61, s->nb_sectors >> 16); + int64_t nb_sectors_lba28 = s->nb_sectors; + if (nb_sectors_lba28 >= 1 << 28) { + nb_sectors_lba28 = (1 << 28) - 1; + } + put_le16(p + 60, nb_sectors_lba28); + put_le16(p + 61, nb_sectors_lba28 >> 16); put_le16(p + 100, s->nb_sectors); put_le16(p + 101, s->nb_sectors >> 16); put_le16(p + 102, s->nb_sectors >> 32); From 304332039014679b809f606e2f227ee0fc43a451 Mon Sep 17 00:00:00 2001 From: Fabrice Fontaine Date: Fri, 22 Oct 2021 11:52:09 +0200 Subject: [PATCH 05/12] block/export/fuse.c: fix musl build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include linux/falloc.h if CONFIG_FALLOCATE_ZERO_RANGE is defined to fix https://gitlab.com/qemu-project/qemu/-/commit/50482fda98bd62e072c30b7ea73c985c4e9d9bbb and avoid the following build failure on musl: ../block/export/fuse.c: In function 'fuse_fallocate': ../block/export/fuse.c:643:21: error: 'FALLOC_FL_ZERO_RANGE' undeclared (first use in this function) 643 | else if (mode & FALLOC_FL_ZERO_RANGE) { | ^~~~~~~~~~~~~~~~~~~~ Fixes: - http://autobuild.buildroot.org/results/be24433a429fda681fb66698160132c1c99bc53b Fixes: 50482fda98b ("block/export/fuse.c: fix musl build") Signed-off-by: Fabrice Fontaine Message-Id: <20211022095209.1319671-1-fontaine.fabrice@gmail.com> Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Kevin Wolf --- block/export/fuse.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/export/fuse.c b/block/export/fuse.c index 2e3bf8270b..823c126d23 100644 --- a/block/export/fuse.c +++ b/block/export/fuse.c @@ -31,6 +31,10 @@ #include #include +#if defined(CONFIG_FALLOCATE_ZERO_RANGE) +#include +#endif + #ifdef __linux__ #include #endif From 684960d46267d6e6443b39ee98b3a95632ba8dc6 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 26 Oct 2021 18:23:44 +0200 Subject: [PATCH 06/12] file-posix: 
 add `aio-max-batch` option

Commit d7ddd0a161 ("linux-aio: limit the batch size using `aio-max-batch`
parameter") added a way to limit the batch size of the Linux AIO backend
for the entire AIO context.

The same AIO context can be shared by multiple devices, so
latency-sensitive devices may want to limit the batch size even more to
avoid increasing latency.

For this reason we add the `aio-max-batch` option to the file backend,
which will be used by the next commits to limit the size of batches
including requests generated by this device.

Suggested-by: Kevin Wolf
Reviewed-by: Kevin Wolf
Signed-off-by: Stefano Garzarella
Message-Id: <20211026162346.253081-2-sgarzare@redhat.com>
Reviewed-by: Stefan Hajnoczi
Signed-off-by: Kevin Wolf
---
 block/file-posix.c   | 9 +++++++++
 qapi/block-core.json | 7 +++++++
 2 files changed, 16 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 6def2a4cba..7a289a9481 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -150,6 +150,8 @@ typedef struct BDRVRawState {
     uint64_t locked_perm;
     uint64_t locked_shared_perm;
 
+    uint64_t aio_max_batch;
+
     int perm_change_fd;
     int perm_change_flags;
     BDRVReopenState *reopen_state;
@@ -530,6 +532,11 @@ static QemuOptsList raw_runtime_opts = {
             .type = QEMU_OPT_STRING,
             .help = "host AIO implementation (threads, native, io_uring)",
         },
+        {
+            .name = "aio-max-batch",
+            .type = QEMU_OPT_NUMBER,
+            .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)",
+        },
         {
             .name = "locking",
             .type = QEMU_OPT_STRING,
@@ -609,6 +616,8 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
     s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
 #endif
 
+    s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0);
+
     locking = qapi_enum_parse(&OnOffAuto_lookup,
                               qemu_opt_get(opts, "locking"),
                               ON_OFF_AUTO_AUTO, &local_err);
diff --git a/qapi/block-core.json b/qapi/block-core.json
index ce2c1352cb..ea36e0038c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2939,6 +2939,12 @@
 #              for this device (default: none, forward the commands via SG_IO;
 #              since 2.11)
 # @aio: AIO backend (default: threads) (since: 2.8)
+# @aio-max-batch: maximum number of requests to batch together into a single
+#                 submission in the AIO backend. The smallest value between
+#                 this and the aio-max-batch value of the IOThread object is
+#                 chosen.
+#                 0 means that the AIO backend will handle it automatically.
+#                 (default: 0, since 6.2)
 # @locking: whether to enable file locking. If set to 'auto', only enable
 #           when Open File Descriptor (OFD) locking API is available
 #           (default: auto, since 2.10)
@@ -2968,6 +2974,7 @@
             '*pr-manager': 'str',
             '*locking': 'OnOffAuto',
             '*aio': 'BlockdevAioOptions',
+            '*aio-max-batch': 'int',
             '*drop-cache': {'type': 'bool',
                             'if': 'CONFIG_LINUX'},
             '*x-check-cache-dropped': { 'type': 'bool',

From 512da211010700cdbfaab45c8980ca88958a4ab8 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella
Date: Tue, 26 Oct 2021 18:23:45 +0200
Subject: [PATCH 07/12] linux-aio: add `dev_max_batch` parameter to laio_co_submit()

This new parameter can be used by block devices to limit the Linux AIO
batch size more than the limit set by the AIO context. The file-posix
backend supports this, passing its previously added `aio-max-batch`
option.

Add a helper function to calculate the maximum batch size.
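
For reference, a self-contained sketch of the clamping rule that the new
helper implements (the macro is re-declared here only so that the example
stands alone; QEMU's own MIN_NON_ZERO lives in include/qemu/osdep.h):

    #include <stdint.h>

    /* behaves like QEMU's MIN_NON_ZERO(): a value of 0 means "no limit" */
    #define MIN_NON_ZERO(a, b) \
        ((a) == 0 ? (b) : ((b) == 0 ? (a) : ((a) < (b) ? (a) : (b))))

    static uint64_t effective_max_batch(uint64_t dev_max_batch,
                                        uint64_t ctx_max_batch,
                                        uint64_t free_events)
    {
        /* the stricter of the device limit and the AIO context limit... */
        uint64_t max_batch = MIN_NON_ZERO(dev_max_batch, ctx_max_batch);

        /* ...further capped by the number of still-available events */
        return MIN_NON_ZERO(free_events, max_batch);
    }

    /* e.g. effective_max_batch(8, 32, 128) == 8,
     *      effective_max_batch(0, 32, 128) == 32 */
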
Reviewed-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Signed-off-by: Stefano Garzarella Message-Id: <20211026162346.253081-3-sgarzare@redhat.com> Signed-off-by: Kevin Wolf --- block/file-posix.c | 3 ++- block/linux-aio.c | 30 ++++++++++++++++++++++-------- include/block/raw-aio.h | 3 ++- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 7a289a9481..c2c94fca66 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2066,7 +2066,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, } else if (s->use_linux_aio) { LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); assert(qiov->size == bytes); - return laio_co_submit(bs, aio, s->fd, offset, qiov, type); + return laio_co_submit(bs, aio, s->fd, offset, qiov, type, + s->aio_max_batch); #endif } diff --git a/block/linux-aio.c b/block/linux-aio.c index 0dab507b71..88b44fee72 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -334,6 +334,23 @@ static void ioq_submit(LinuxAioState *s) } } +static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch) +{ + uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH; + + /* + * AIO context can be shared between multiple block devices, so + * `dev_max_batch` allows reducing the batch size for latency-sensitive + * devices. + */ + max_batch = MIN_NON_ZERO(dev_max_batch, max_batch); + + /* limit the batch with the number of available events */ + max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch); + + return max_batch; +} + void laio_io_plug(BlockDriverState *bs, LinuxAioState *s) { s->io_q.plugged++; @@ -349,15 +366,11 @@ void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s) } static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, - int type) + int type, uint64_t dev_max_batch) { LinuxAioState *s = laiocb->ctx; struct iocb *iocbs = &laiocb->iocb; QEMUIOVector *qiov = laiocb->qiov; - int64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH; - - /* limit the batch with the number of available events */ - max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch); switch (type) { case QEMU_AIO_WRITE: @@ -378,7 +391,7 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, s->io_q.in_queue++; if (!s->io_q.blocked && (!s->io_q.plugged || - s->io_q.in_queue >= max_batch)) { + s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) { ioq_submit(s); } @@ -386,7 +399,8 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, } int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, - uint64_t offset, QEMUIOVector *qiov, int type) + uint64_t offset, QEMUIOVector *qiov, int type, + uint64_t dev_max_batch) { int ret; struct qemu_laiocb laiocb = { @@ -398,7 +412,7 @@ int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, .qiov = qiov, }; - ret = laio_do_submit(fd, &laiocb, offset, type); + ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch); if (ret < 0) { return ret; } diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index 251b10d273..ebd042fa27 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -51,7 +51,8 @@ typedef struct LinuxAioState LinuxAioState; LinuxAioState *laio_init(Error **errp); void laio_cleanup(LinuxAioState *s); int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, - uint64_t offset, QEMUIOVector *qiov, int type); + uint64_t offset, 
QEMUIOVector *qiov, int type, + uint64_t dev_max_batch); void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context); void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context); void laio_io_plug(BlockDriverState *bs, LinuxAioState *s); From 68d7946648a5c364a4df187804d37f09a318b50f Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 26 Oct 2021 18:23:46 +0200 Subject: [PATCH 08/12] linux-aio: add `dev_max_batch` parameter to laio_io_unplug() Between the submission of a request and the unplug, other devices with larger limits may have been queued new requests without flushing the batch. Using the new `dev_max_batch` parameter, laio_io_unplug() can check if the batch exceeds the device limit to flush the current batch. Reviewed-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Signed-off-by: Stefano Garzarella Message-Id: <20211026162346.253081-4-sgarzare@redhat.com> Signed-off-by: Kevin Wolf --- block/file-posix.c | 2 +- block/linux-aio.c | 8 +++++--- include/block/raw-aio.h | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index c2c94fca66..7a27c83060 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2125,7 +2125,7 @@ static void raw_aio_unplug(BlockDriverState *bs) #ifdef CONFIG_LINUX_AIO if (s->use_linux_aio) { LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); - laio_io_unplug(bs, aio); + laio_io_unplug(bs, aio, s->aio_max_batch); } #endif #ifdef CONFIG_LINUX_IO_URING diff --git a/block/linux-aio.c b/block/linux-aio.c index 88b44fee72..f53ae72e21 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -356,11 +356,13 @@ void laio_io_plug(BlockDriverState *bs, LinuxAioState *s) s->io_q.plugged++; } -void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s) +void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, + uint64_t dev_max_batch) { assert(s->io_q.plugged); - if (--s->io_q.plugged == 0 && - !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { + if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) || + (--s->io_q.plugged == 0 && + !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) { ioq_submit(s); } } diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index ebd042fa27..21fc10c4c9 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -56,7 +56,8 @@ int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context); void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context); void laio_io_plug(BlockDriverState *bs, LinuxAioState *s); -void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s); +void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, + uint64_t dev_max_batch); #endif /* io_uring.c - Linux io_uring implementation */ #ifdef CONFIG_LINUX_IO_URING From 73d4a11300ee87e04f93fab4380b80379e792a06 Mon Sep 17 00:00:00 2001 From: Hanna Reitz Date: Tue, 26 Oct 2021 11:07:45 +0200 Subject: [PATCH 09/12] block-backend: Silence clang -m32 compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similarly to e7e588d432d31ecebc26358e47201dd108db964c, there is a warning in block/block-backend.c that qiov->size <= INT64_MAX is always true on machines where size_t is narrower than a uint64_t. In said commit, we silenced this warning by casting to uint64_t. 
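
For illustration, the pattern in question boils down to this (minimal
sketch, assuming a 32-bit host where size_t is narrower than int64_t):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    static void check_request_size(size_t size)
    {
        /*
         * With a 32-bit size_t the plain "size <= INT64_MAX" comparison is
         * always true, which is what clang warns about; the explicit cast
         * keeps the intent visible and silences the warning.
         */
        assert((uint64_t)size <= INT64_MAX);
    }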

The commit introducing this warning here
(a93d81c84afa717b0a1a6947524d8d1fbfd6bbf5) anticipated it and so tried
to address it the same way. However, it only did so in one of two
places where this comparison occurs, and so we still need to fix up the
other one.

Fixes: a93d81c84afa717b0a1a6947524d8d1fbfd6bbf5 ("block-backend: convert blk_aio_ functions to int64_t bytes paramter")
Signed-off-by: Hanna Reitz
Message-Id: <20211026090745.30800-1-hreitz@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé
Signed-off-by: Kevin Wolf
---
 block/block-backend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 39cd99df2b..12ef80ea17 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1540,7 +1540,7 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
                             QEMUIOVector *qiov, BdrvRequestFlags flags,
                             BlockCompletionFunc *cb, void *opaque)
 {
-    assert(qiov->size <= INT64_MAX);
+    assert((uint64_t)qiov->size <= INT64_MAX);
     return blk_aio_prwv(blk, offset, qiov->size, qiov,
                         blk_aio_write_entry, flags, cb, opaque);
 }

From 4a613bd86256543b9735307674a84e167a585687 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?=
Date: Wed, 6 Oct 2021 18:49:27 +0200
Subject: [PATCH 10/12] block/nvme: Automatically free qemu_memalign() with QEMU_AUTO_VFREE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 4d324c0bf65 ("introduce QEMU_AUTO_VFREE"), buffers
allocated by qemu_memalign() can automatically be freed when using the
QEMU_AUTO_VFREE macro. Use it to simplify a bit.

Signed-off-by: Philippe Mathieu-Daudé
Message-Id: <20211006164931.172349-2-philmd@redhat.com>
Reviewed-by: Stefan Hajnoczi
Signed-off-by: Kevin Wolf
---
 block/nvme.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 1cc7b62bb4..fefcc04abe 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -514,10 +514,10 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 {
     BDRVNVMeState *s = bs->opaque;
     bool ret = false;
-    union {
+    QEMU_AUTO_VFREE union {
         NvmeIdCtrl ctrl;
         NvmeIdNs ns;
-    } *id;
+    } *id = NULL;
     NvmeLBAF *lbaf;
     uint16_t oncs;
     int r;
@@ -595,7 +595,6 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     s->blkshift = lbaf->ds;
 out:
     qemu_vfio_dma_unmap(s->vfio, id);
-    qemu_vfree(id);
     return ret;
 }
 
@@ -1219,7 +1218,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
 {
     BDRVNVMeState *s = bs->opaque;
     int r;
-    uint8_t *buf = NULL;
+    QEMU_AUTO_VFREE uint8_t *buf = NULL;
     QEMUIOVector local_qiov;
     size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
     assert(QEMU_IS_ALIGNED(offset, s->page_size));
@@ -1246,7 +1245,6 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
     if (!r && !is_write) {
         qemu_iovec_from_buf(qiov, 0, buf, bytes);
     }
-    qemu_vfree(buf);
     return r;
 }
 
@@ -1365,7 +1363,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     BDRVNVMeState *s = bs->opaque;
     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
     NVMeRequest *req;
-    NvmeDsmRange *buf;
+    QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL;
     QEMUIOVector local_qiov;
     int ret;
 
@@ -1440,7 +1438,6 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     trace_nvme_dsm_done(s, offset, bytes, ret);
 out:
     qemu_iovec_destroy(&local_qiov);
-    qemu_vfree(buf);
     return ret;
 }
 

From 53cedeaaee9d585bd07eb921fa5557c64531f69b Mon Sep 17 00:00:00 2001
From:
=?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Wed, 6 Oct 2021 18:49:28 +0200 Subject: [PATCH 11/12] block/nvme: Display CQ/SQ pointer in nvme_free_queue_pair() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For debugging purpose it is helpful to know the CQ/SQ pointers. We already have a trace event in nvme_free_queue_pair(), extend it to report these pointer addresses. Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20211006164931.172349-3-philmd@redhat.com> Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/nvme.c | 2 +- block/trace-events | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index fefcc04abe..0c94799a54 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -185,7 +185,7 @@ static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, static void nvme_free_queue_pair(NVMeQueuePair *q) { - trace_nvme_free_queue_pair(q->index, q); + trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq); if (q->completion_bh) { qemu_bh_delete(q->completion_bh); } diff --git a/block/trace-events b/block/trace-events index ab56edacb4..549090d453 100644 --- a/block/trace-events +++ b/block/trace-events @@ -157,7 +157,7 @@ nvme_dsm_done(void *s, int64_t offset, int64_t bytes, int ret) "s %p offset 0x%" nvme_dma_map_flush(void *s) "s %p" nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u" nvme_create_queue_pair(unsigned q_index, void *q, size_t size, void *aio_context, int fd) "index %u q %p size %zu aioctx %p fd %d" -nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p" +nvme_free_queue_pair(unsigned q_index, void *q, void *cq, void *sq) "index %u q %p cq %p sq %p" nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d" nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d" From a8951438946d72d74c9bdbdb38fce95aa2973a88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Wed, 6 Oct 2021 18:49:29 +0200 Subject: [PATCH 12/12] block/nvme: Extract nvme_free_queue() from nvme_free_queue_pair() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of duplicating code, extract the common helper to free a single queue. Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20211006164931.172349-4-philmd@redhat.com> Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/nvme.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 0c94799a54..e4f336d79c 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -183,15 +183,20 @@ static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, return r == 0; } +static void nvme_free_queue(NVMeQueue *q) +{ + qemu_vfree(q->queue); +} + static void nvme_free_queue_pair(NVMeQueuePair *q) { trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq); if (q->completion_bh) { qemu_bh_delete(q->completion_bh); } + nvme_free_queue(&q->sq); + nvme_free_queue(&q->cq); qemu_vfree(q->prp_list_pages); - qemu_vfree(q->sq.queue); - qemu_vfree(q->cq.queue); qemu_mutex_destroy(&q->lock); g_free(q); }
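
As a closing note on the QEMU_AUTO_VFREE pattern that the block/nvme.c
conversion above builds on, a minimal usage sketch (illustrative only;
the function name and buffer use are made up, and QEMU's usual headers
are assumed):

    static void fill_buffer_example(size_t len)
    {
        /* freed with qemu_vfree() automatically when buf goes out of scope */
        QEMU_AUTO_VFREE uint8_t *buf = qemu_memalign(4096, len);

        memset(buf, 0, len);
        /* ... use buf; no explicit qemu_vfree(buf) on any return path ... */
    }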