From 41abca8c39244b7eeb06378c4aa18b10e0645a1c Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 30 May 2023 14:09:54 -0400 Subject: [PATCH 01/25] block: add blk_io_plug_call() API Introduce a new API for thread-local blk_io_plug() that does not traverse the block graph. The goal is to make blk_io_plug() multi-queue friendly. Instead of having block drivers track whether or not we're in a plugged section, provide an API that allows them to defer a function call until we're unplugged: blk_io_plug_call(fn, opaque). If blk_io_plug_call() is called multiple times with the same fn/opaque pair, then fn() is only called once at the end of the function - resulting in batching. This patch introduces the API and changes blk_io_plug()/blk_io_unplug(). blk_io_plug()/blk_io_unplug() no longer require a BlockBackend argument because the plug state is now thread-local. Later patches convert block drivers to blk_io_plug_call() and then we can finally remove .bdrv_co_io_plug() once all block drivers have been converted. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: Stefano Garzarella Acked-by: Kevin Wolf Message-id: 20230530180959.1108766-2-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- MAINTAINERS | 1 + block/block-backend.c | 22 ----- block/meson.build | 1 + block/plug.c | 159 ++++++++++++++++++++++++++++++ hw/block/dataplane/xen-block.c | 8 +- hw/block/virtio-blk.c | 4 +- hw/scsi/virtio-scsi.c | 6 +- include/sysemu/block-backend-io.h | 13 +-- 8 files changed, 173 insertions(+), 41 deletions(-) create mode 100644 block/plug.c diff --git a/MAINTAINERS b/MAINTAINERS index 4b025a7b63..89f274f85e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2650,6 +2650,7 @@ F: util/aio-*.c F: util/aio-*.h F: util/fdmon-*.c F: block/io.c +F: block/plug.c F: migration/block* F: include/block/aio.h F: include/block/aio-wait.h diff --git a/block/block-backend.c b/block/block-backend.c index 241f643507..4009ed5fed 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2582,28 +2582,6 @@ void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify) notifier_list_add(&blk->insert_bs_notifiers, notify); } -void coroutine_fn blk_co_io_plug(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - IO_CODE(); - GRAPH_RDLOCK_GUARD(); - - if (bs) { - bdrv_co_io_plug(bs); - } -} - -void coroutine_fn blk_co_io_unplug(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - IO_CODE(); - GRAPH_RDLOCK_GUARD(); - - if (bs) { - bdrv_co_io_unplug(bs); - } -} - BlockAcctStats *blk_get_stats(BlockBackend *blk) { IO_CODE(); diff --git a/block/meson.build b/block/meson.build index 486dda8b85..fb4332bd66 100644 --- a/block/meson.build +++ b/block/meson.build @@ -23,6 +23,7 @@ block_ss.add(files( 'mirror.c', 'nbd.c', 'null.c', + 'plug.c', 'qapi.c', 'qcow2-bitmap.c', 'qcow2-cache.c', diff --git a/block/plug.c b/block/plug.c new file mode 100644 index 0000000000..98a155d2f4 --- /dev/null +++ b/block/plug.c @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Block I/O plugging + * + * Copyright Red Hat. + * + * This API defers a function call within a blk_io_plug()/blk_io_unplug() + * section, allowing multiple calls to batch up. This is a performance + * optimization that is used in the block layer to submit several I/O requests + * at once instead of individually: + * + * blk_io_plug(); <-- start of plugged region + * ... + * blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call + * blk_io_plug_call(my_func, my_obj); <-- another + * blk_io_plug_call(my_func, my_obj); <-- another + * ... + * blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once + * + * This code is actually generic and not tied to the block layer. If another + * subsystem needs this functionality, it could be renamed. + */ + +#include "qemu/osdep.h" +#include "qemu/coroutine-tls.h" +#include "qemu/notify.h" +#include "qemu/thread.h" +#include "sysemu/block-backend.h" + +/* A function call that has been deferred until unplug() */ +typedef struct { + void (*fn)(void *); + void *opaque; +} UnplugFn; + +/* Per-thread state */ +typedef struct { + unsigned count; /* how many times has plug() been called? */ + GArray *unplug_fns; /* functions to call at unplug time */ +} Plug; + +/* Use get_ptr_plug() to fetch this thread-local value */ +QEMU_DEFINE_STATIC_CO_TLS(Plug, plug); + +/* Called at thread cleanup time */ +static void blk_io_plug_atexit(Notifier *n, void *value) +{ + Plug *plug = get_ptr_plug(); + g_array_free(plug->unplug_fns, TRUE); +} + +/* This won't involve coroutines, so use __thread */ +static __thread Notifier blk_io_plug_atexit_notifier; + +/** + * blk_io_plug_call: + * @fn: a function pointer to be invoked + * @opaque: a user-defined argument to @fn() + * + * Call @fn(@opaque) immediately if not within a blk_io_plug()/blk_io_unplug() + * section. + * + * Otherwise defer the call until the end of the outermost + * blk_io_plug()/blk_io_unplug() section in this thread. If the same + * @fn/@opaque pair has already been deferred, it will only be called once upon + * blk_io_unplug() so that accumulated calls are batched into a single call. + * + * The caller must ensure that @opaque is not freed before @fn() is invoked. + */ +void blk_io_plug_call(void (*fn)(void *), void *opaque) +{ + Plug *plug = get_ptr_plug(); + + /* Call immediately if we're not plugged */ + if (plug->count == 0) { + fn(opaque); + return; + } + + GArray *array = plug->unplug_fns; + if (!array) { + array = g_array_new(FALSE, FALSE, sizeof(UnplugFn)); + plug->unplug_fns = array; + blk_io_plug_atexit_notifier.notify = blk_io_plug_atexit; + qemu_thread_atexit_add(&blk_io_plug_atexit_notifier); + } + + UnplugFn *fns = (UnplugFn *)array->data; + UnplugFn new_fn = { + .fn = fn, + .opaque = opaque, + }; + + /* + * There won't be many, so do a linear search. If this becomes a bottleneck + * then a binary search (glib 2.62+) or different data structure could be + * used. + */ + for (guint i = 0; i < array->len; i++) { + if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) { + return; /* already exists */ + } + } + + g_array_append_val(array, new_fn); +} + +/** + * blk_io_plug: Defer blk_io_plug_call() functions until blk_io_unplug() + * + * blk_io_plug/unplug are thread-local operations. This means that multiple + * threads can simultaneously call plug/unplug, but the caller must ensure that + * each unplug() is called in the same thread of the matching plug(). + * + * Nesting is supported. blk_io_plug_call() functions are only called at the + * outermost blk_io_unplug(). + */ +void blk_io_plug(void) +{ + Plug *plug = get_ptr_plug(); + + assert(plug->count < UINT32_MAX); + + plug->count++; +} + +/** + * blk_io_unplug: Run any pending blk_io_plug_call() functions + * + * There must have been a matching blk_io_plug() call in the same thread prior + * to this blk_io_unplug() call. + */ +void blk_io_unplug(void) +{ + Plug *plug = get_ptr_plug(); + + assert(plug->count > 0); + + if (--plug->count > 0) { + return; + } + + GArray *array = plug->unplug_fns; + if (!array) { + return; + } + + UnplugFn *fns = (UnplugFn *)array->data; + + for (guint i = 0; i < array->len; i++) { + fns[i].fn(fns[i].opaque); + } + + /* + * This resets the array without freeing memory so that appending is cheap + * in the future. + */ + g_array_set_size(array, 0); +} diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c index 2597f38805..3b6f2b0aa2 100644 --- a/hw/block/dataplane/xen-block.c +++ b/hw/block/dataplane/xen-block.c @@ -537,7 +537,7 @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane) * is below us. */ if (inflight_atstart > IO_PLUG_THRESHOLD) { - blk_io_plug(dataplane->blk); + blk_io_plug(); } while (rc != rp) { /* pull request from ring */ @@ -577,12 +577,12 @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane) if (inflight_atstart > IO_PLUG_THRESHOLD && batched >= inflight_atstart) { - blk_io_unplug(dataplane->blk); + blk_io_unplug(); } xen_block_do_aio(request); if (inflight_atstart > IO_PLUG_THRESHOLD) { if (batched >= inflight_atstart) { - blk_io_plug(dataplane->blk); + blk_io_plug(); batched = 0; } else { batched++; @@ -590,7 +590,7 @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane) } } if (inflight_atstart > IO_PLUG_THRESHOLD) { - blk_io_unplug(dataplane->blk); + blk_io_unplug(); } return done_something; diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 4ca66b5860..39e7f23fab 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -1134,7 +1134,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) bool suppress_notifications = virtio_queue_get_notification(vq); aio_context_acquire(blk_get_aio_context(s->blk)); - blk_io_plug(s->blk); + blk_io_plug(); do { if (suppress_notifications) { @@ -1158,7 +1158,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) virtio_blk_submit_multireq(s, &mrb); } - blk_io_unplug(s->blk); + blk_io_unplug(); aio_context_release(blk_get_aio_context(s->blk)); } diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 4a8849cc7e..9c8ef0aaa6 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -799,7 +799,7 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req) return -ENOBUFS; } scsi_req_ref(req->sreq); - blk_io_plug(d->conf.blk); + blk_io_plug(); object_unref(OBJECT(d)); return 0; } @@ -810,7 +810,7 @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req) if (scsi_req_enqueue(sreq)) { scsi_req_continue(sreq); } - blk_io_unplug(sreq->dev->conf.blk); + blk_io_unplug(); scsi_req_unref(sreq); } @@ -836,7 +836,7 @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) while (!QTAILQ_EMPTY(&reqs)) { req = QTAILQ_FIRST(&reqs); QTAILQ_REMOVE(&reqs, req, next); - blk_io_unplug(req->sreq->dev->conf.blk); + blk_io_unplug(); scsi_req_unref(req->sreq); virtqueue_detach_element(req->vq, &req->elem, 0); virtio_scsi_free_req(req); diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h index d62a7ee773..be4dcef59d 100644 --- a/include/sysemu/block-backend-io.h +++ b/include/sysemu/block-backend-io.h @@ -100,16 +100,9 @@ void blk_iostatus_set_err(BlockBackend *blk, int error); int blk_get_max_iov(BlockBackend *blk); int blk_get_max_hw_iov(BlockBackend *blk); -/* - * blk_io_plug/unplug are thread-local operations. This means that multiple - * IOThreads can simultaneously call plug/unplug, but the caller must ensure - * that each unplug() is called in the same IOThread of the matching plug(). - */ -void coroutine_fn blk_co_io_plug(BlockBackend *blk); -void co_wrapper blk_io_plug(BlockBackend *blk); - -void coroutine_fn blk_co_io_unplug(BlockBackend *blk); -void co_wrapper blk_io_unplug(BlockBackend *blk); +void blk_io_plug(void); +void blk_io_unplug(void); +void blk_io_plug_call(void (*fn)(void *), void *opaque); AioContext *blk_get_aio_context(BlockBackend *blk); BlockAcctStats *blk_get_stats(BlockBackend *blk); From f2e590002bd6765e2b79fcd05992ecf70cc1d229 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 30 May 2023 14:09:55 -0400 Subject: [PATCH 02/25] block/nvme: convert to blk_io_plug_call() API Stop using the .bdrv_co_io_plug() API because it is not multi-queue block layer friendly. Use the new blk_io_plug_call() API to batch I/O submission instead. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: Stefano Garzarella Acked-by: Kevin Wolf Message-id: 20230530180959.1108766-3-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- block/nvme.c | 44 ++++++++++++-------------------------------- block/trace-events | 1 - 2 files changed, 12 insertions(+), 33 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 17937d398d..7ca85bc44a 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -25,6 +25,7 @@ #include "qemu/vfio-helpers.h" #include "block/block-io.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "sysemu/replay.h" #include "trace.h" @@ -119,7 +120,6 @@ struct BDRVNVMeState { int blkshift; uint64_t max_transfer; - bool plugged; bool supports_write_zeroes; bool supports_discard; @@ -282,7 +282,7 @@ static void nvme_kick(NVMeQueuePair *q) { BDRVNVMeState *s = q->s; - if (s->plugged || !q->need_kick) { + if (!q->need_kick) { return; } trace_nvme_kick(s, q->index); @@ -387,10 +387,6 @@ static bool nvme_process_completion(NVMeQueuePair *q) NvmeCqe *c; trace_nvme_process_completion(s, q->index, q->inflight); - if (s->plugged) { - trace_nvme_process_completion_queue_plugged(s, q->index); - return false; - } /* * Support re-entrancy when a request cb() function invokes aio_poll(). @@ -480,6 +476,15 @@ static void nvme_trace_command(const NvmeCmd *cmd) } } +static void nvme_unplug_fn(void *opaque) +{ + NVMeQueuePair *q = opaque; + + QEMU_LOCK_GUARD(&q->lock); + nvme_kick(q); + nvme_process_completion(q); +} + static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, NvmeCmd *cmd, BlockCompletionFunc cb, void *opaque) @@ -496,8 +501,7 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd)); q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE; q->need_kick++; - nvme_kick(q); - nvme_process_completion(q); + blk_io_plug_call(nvme_unplug_fn, q); qemu_mutex_unlock(&q->lock); } @@ -1567,27 +1571,6 @@ static void nvme_attach_aio_context(BlockDriverState *bs, } } -static void coroutine_fn nvme_co_io_plug(BlockDriverState *bs) -{ - BDRVNVMeState *s = bs->opaque; - assert(!s->plugged); - s->plugged = true; -} - -static void coroutine_fn nvme_co_io_unplug(BlockDriverState *bs) -{ - BDRVNVMeState *s = bs->opaque; - assert(s->plugged); - s->plugged = false; - for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) { - NVMeQueuePair *q = s->queues[i]; - qemu_mutex_lock(&q->lock); - nvme_kick(q); - nvme_process_completion(q); - qemu_mutex_unlock(&q->lock); - } -} - static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size, Error **errp) { @@ -1664,9 +1647,6 @@ static BlockDriver bdrv_nvme = { .bdrv_detach_aio_context = nvme_detach_aio_context, .bdrv_attach_aio_context = nvme_attach_aio_context, - .bdrv_co_io_plug = nvme_co_io_plug, - .bdrv_co_io_unplug = nvme_co_io_unplug, - .bdrv_register_buf = nvme_register_buf, .bdrv_unregister_buf = nvme_unregister_buf, }; diff --git a/block/trace-events b/block/trace-events index 32665158d6..048ad27519 100644 --- a/block/trace-events +++ b/block/trace-events @@ -141,7 +141,6 @@ nvme_kick(void *s, unsigned q_index) "s %p q #%u" nvme_dma_flush_queue_wait(void *s) "s %p" nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d" -nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u" nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d" nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d" nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x" From 28ff7b4dfbb5594fff86a2f283a96e0f3ce975d5 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 30 May 2023 14:09:56 -0400 Subject: [PATCH 03/25] block/blkio: convert to blk_io_plug_call() API Stop using the .bdrv_co_io_plug() API because it is not multi-queue block layer friendly. Use the new blk_io_plug_call() API to batch I/O submission instead. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: Stefano Garzarella Acked-by: Kevin Wolf Message-id: 20230530180959.1108766-4-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- block/blkio.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/block/blkio.c b/block/blkio.c index 72117fa005..11be8787a3 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -17,6 +17,7 @@ #include "qemu/error-report.h" #include "qapi/qmp/qdict.h" #include "qemu/module.h" +#include "sysemu/block-backend.h" #include "exec/memory.h" /* for ram_block_discard_disable() */ #include "block/block-io.h" @@ -320,16 +321,30 @@ static void blkio_detach_aio_context(BlockDriverState *bs) NULL, NULL, NULL); } -/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */ -static void blkio_submit_io(BlockDriverState *bs) +/* + * Called by blk_io_unplug() or immediately if not plugged. Called without + * blkio_lock. + */ +static void blkio_unplug_fn(void *opaque) { - if (qatomic_read(&bs->io_plugged) == 0) { - BDRVBlkioState *s = bs->opaque; + BDRVBlkioState *s = opaque; + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { blkioq_do_io(s->blkioq, NULL, 0, 0, NULL); } } +/* + * Schedule I/O submission after enqueuing a new request. Called without + * blkio_lock. + */ +static void blkio_submit_io(BlockDriverState *bs) +{ + BDRVBlkioState *s = bs->opaque; + + blk_io_plug_call(blkio_unplug_fn, s); +} + static int coroutine_fn blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) { @@ -340,9 +355,9 @@ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { blkioq_discard(s->blkioq, offset, bytes, &cod, 0); - blkio_submit_io(bs); } + blkio_submit_io(bs); qemu_coroutine_yield(); return cod.ret; } @@ -373,9 +388,9 @@ blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0); - blkio_submit_io(bs); } + blkio_submit_io(bs); qemu_coroutine_yield(); if (use_bounce_buffer) { @@ -418,9 +433,9 @@ static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset, WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags); - blkio_submit_io(bs); } + blkio_submit_io(bs); qemu_coroutine_yield(); if (use_bounce_buffer) { @@ -439,9 +454,9 @@ static int coroutine_fn blkio_co_flush(BlockDriverState *bs) WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { blkioq_flush(s->blkioq, &cod, 0); - blkio_submit_io(bs); } + blkio_submit_io(bs); qemu_coroutine_yield(); return cod.ret; } @@ -467,22 +482,13 @@ static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs, WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags); - blkio_submit_io(bs); } + blkio_submit_io(bs); qemu_coroutine_yield(); return cod.ret; } -static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs) -{ - BDRVBlkioState *s = bs->opaque; - - WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { - blkio_submit_io(bs); - } -} - typedef enum { BMRR_OK, BMRR_SKIP, @@ -1004,7 +1010,6 @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) .bdrv_co_pwritev = blkio_co_pwritev, \ .bdrv_co_flush_to_disk = blkio_co_flush, \ .bdrv_co_pwrite_zeroes = blkio_co_pwrite_zeroes, \ - .bdrv_co_io_unplug = blkio_co_io_unplug, \ .bdrv_refresh_limits = blkio_refresh_limits, \ .bdrv_register_buf = blkio_register_buf, \ .bdrv_unregister_buf = blkio_unregister_buf, \ From 6a6da231b7076b8ed5409c4745e11cb14d962194 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 30 May 2023 14:09:57 -0400 Subject: [PATCH 04/25] block/io_uring: convert to blk_io_plug_call() API Stop using the .bdrv_co_io_plug() API because it is not multi-queue block layer friendly. Use the new blk_io_plug_call() API to batch I/O submission instead. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: Stefano Garzarella Acked-by: Kevin Wolf Message-id: 20230530180959.1108766-5-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- block/file-posix.c | 10 ---------- block/io_uring.c | 44 ++++++++++++++++------------------------- block/trace-events | 5 ++--- include/block/raw-aio.h | 7 ------- 4 files changed, 19 insertions(+), 47 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 0ab158efba..7baa8491dd 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2558,11 +2558,6 @@ static void coroutine_fn raw_co_io_plug(BlockDriverState *bs) laio_io_plug(); } #endif -#ifdef CONFIG_LINUX_IO_URING - if (s->use_linux_io_uring) { - luring_io_plug(); - } -#endif } static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs) @@ -2573,11 +2568,6 @@ static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs) laio_io_unplug(s->aio_max_batch); } #endif -#ifdef CONFIG_LINUX_IO_URING - if (s->use_linux_io_uring) { - luring_io_unplug(); - } -#endif } static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs) diff --git a/block/io_uring.c b/block/io_uring.c index 3a77480e16..69d9820928 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -16,6 +16,7 @@ #include "block/raw-aio.h" #include "qemu/coroutine.h" #include "qapi/error.h" +#include "sysemu/block-backend.h" #include "trace.h" /* Only used for assertions. */ @@ -41,7 +42,6 @@ typedef struct LuringAIOCB { } LuringAIOCB; typedef struct LuringQueue { - int plugged; unsigned int in_queue; unsigned int in_flight; bool blocked; @@ -267,7 +267,7 @@ static void luring_process_completions_and_submit(LuringState *s) { luring_process_completions(s); - if (!s->io_q.plugged && s->io_q.in_queue > 0) { + if (s->io_q.in_queue > 0) { ioq_submit(s); } } @@ -301,29 +301,17 @@ static void qemu_luring_poll_ready(void *opaque) static void ioq_init(LuringQueue *io_q) { QSIMPLEQ_INIT(&io_q->submit_queue); - io_q->plugged = 0; io_q->in_queue = 0; io_q->in_flight = 0; io_q->blocked = false; } -void luring_io_plug(void) +static void luring_unplug_fn(void *opaque) { - AioContext *ctx = qemu_get_current_aio_context(); - LuringState *s = aio_get_linux_io_uring(ctx); - trace_luring_io_plug(s); - s->io_q.plugged++; -} - -void luring_io_unplug(void) -{ - AioContext *ctx = qemu_get_current_aio_context(); - LuringState *s = aio_get_linux_io_uring(ctx); - assert(s->io_q.plugged); - trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged, - s->io_q.in_queue, s->io_q.in_flight); - if (--s->io_q.plugged == 0 && - !s->io_q.blocked && s->io_q.in_queue > 0) { + LuringState *s = opaque; + trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue, + s->io_q.in_flight); + if (!s->io_q.blocked && s->io_q.in_queue > 0) { ioq_submit(s); } } @@ -370,14 +358,16 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next); s->io_q.in_queue++; - trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged, - s->io_q.in_queue, s->io_q.in_flight); - if (!s->io_q.blocked && - (!s->io_q.plugged || - s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES)) { - ret = ioq_submit(s); - trace_luring_do_submit_done(s, ret); - return ret; + trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue, + s->io_q.in_flight); + if (!s->io_q.blocked) { + if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) { + ret = ioq_submit(s); + trace_luring_do_submit_done(s, ret); + return ret; + } + + blk_io_plug_call(luring_unplug_fn, s); } return 0; } diff --git a/block/trace-events b/block/trace-events index 048ad27519..6f121b7636 100644 --- a/block/trace-events +++ b/block/trace-events @@ -64,9 +64,8 @@ file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) " # io_uring.c luring_init_state(void *s, size_t size) "s %p size %zu" luring_cleanup_state(void *s) "%p freed" -luring_io_plug(void *s) "LuringState %p plug" -luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d" -luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued %d inflight %d" +luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d" +luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d" luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d" luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d" luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d" diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index 0fe85ade77..da60ca13ef 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -81,13 +81,6 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, QEMUIOVector *qiov, int type); void luring_detach_aio_context(LuringState *s, AioContext *old_context); void luring_attach_aio_context(LuringState *s, AioContext *new_context); - -/* - * luring_io_plug/unplug work in the thread's current AioContext, therefore the - * caller must ensure that they are paired in the same IOThread. - */ -void luring_io_plug(void); -void luring_io_unplug(void); #endif #ifdef _WIN32 From 076682885d6fe521ed005b988fde6a67ec6818df Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 30 May 2023 14:09:58 -0400 Subject: [PATCH 05/25] block/linux-aio: convert to blk_io_plug_call() API Stop using the .bdrv_co_io_plug() API because it is not multi-queue block layer friendly. Use the new blk_io_plug_call() API to batch I/O submission instead. Note that a dev_max_batch check is dropped in laio_io_unplug() because the semantics of unplug_fn() are different from .bdrv_co_unplug(): 1. unplug_fn() is only called when the last blk_io_unplug() call occurs, not every time blk_io_unplug() is called. 2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no way to get per-BlockDriverState fields like dev_max_batch. Therefore this condition cannot be moved to laio_unplug_fn(). It is not obvious that this condition affects performance in practice, so I am removing it instead of trying to come up with a more complex mechanism to preserve the condition. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Acked-by: Kevin Wolf Reviewed-by: Stefano Garzarella Message-id: 20230530180959.1108766-6-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- block/file-posix.c | 28 ---------------------------- block/linux-aio.c | 41 +++++++++++------------------------------ include/block/raw-aio.h | 7 ------- 3 files changed, 11 insertions(+), 65 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 7baa8491dd..ac1ed54811 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2550,26 +2550,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); } -static void coroutine_fn raw_co_io_plug(BlockDriverState *bs) -{ - BDRVRawState __attribute__((unused)) *s = bs->opaque; -#ifdef CONFIG_LINUX_AIO - if (s->use_linux_aio) { - laio_io_plug(); - } -#endif -} - -static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs) -{ - BDRVRawState __attribute__((unused)) *s = bs->opaque; -#ifdef CONFIG_LINUX_AIO - if (s->use_linux_aio) { - laio_io_unplug(s->aio_max_batch); - } -#endif -} - static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; @@ -3914,8 +3894,6 @@ BlockDriver bdrv_file = { .bdrv_co_copy_range_from = raw_co_copy_range_from, .bdrv_co_copy_range_to = raw_co_copy_range_to, .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_co_io_plug = raw_co_io_plug, - .bdrv_co_io_unplug = raw_co_io_unplug, .bdrv_attach_aio_context = raw_aio_attach_aio_context, .bdrv_co_truncate = raw_co_truncate, @@ -4286,8 +4264,6 @@ static BlockDriver bdrv_host_device = { .bdrv_co_copy_range_from = raw_co_copy_range_from, .bdrv_co_copy_range_to = raw_co_copy_range_to, .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_co_io_plug = raw_co_io_plug, - .bdrv_co_io_unplug = raw_co_io_unplug, .bdrv_attach_aio_context = raw_aio_attach_aio_context, .bdrv_co_truncate = raw_co_truncate, @@ -4424,8 +4400,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_co_pwritev = raw_co_pwritev, .bdrv_co_flush_to_disk = raw_co_flush_to_disk, .bdrv_refresh_limits = cdrom_refresh_limits, - .bdrv_co_io_plug = raw_co_io_plug, - .bdrv_co_io_unplug = raw_co_io_unplug, .bdrv_attach_aio_context = raw_aio_attach_aio_context, .bdrv_co_truncate = raw_co_truncate, @@ -4552,8 +4526,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_co_pwritev = raw_co_pwritev, .bdrv_co_flush_to_disk = raw_co_flush_to_disk, .bdrv_refresh_limits = cdrom_refresh_limits, - .bdrv_co_io_plug = raw_co_io_plug, - .bdrv_co_io_unplug = raw_co_io_unplug, .bdrv_attach_aio_context = raw_aio_attach_aio_context, .bdrv_co_truncate = raw_co_truncate, diff --git a/block/linux-aio.c b/block/linux-aio.c index 916f001e32..561c71a9ae 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -15,6 +15,7 @@ #include "qemu/event_notifier.h" #include "qemu/coroutine.h" #include "qapi/error.h" +#include "sysemu/block-backend.h" /* Only used for assertions. */ #include "qemu/coroutine_int.h" @@ -46,7 +47,6 @@ struct qemu_laiocb { }; typedef struct { - int plugged; unsigned int in_queue; unsigned int in_flight; bool blocked; @@ -236,7 +236,7 @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s) { qemu_laio_process_completions(s); - if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { + if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } } @@ -277,7 +277,6 @@ static void qemu_laio_poll_ready(EventNotifier *opaque) static void ioq_init(LaioQueue *io_q) { QSIMPLEQ_INIT(&io_q->pending); - io_q->plugged = 0; io_q->in_queue = 0; io_q->in_flight = 0; io_q->blocked = false; @@ -354,31 +353,11 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch) return max_batch; } -void laio_io_plug(void) +static void laio_unplug_fn(void *opaque) { - AioContext *ctx = qemu_get_current_aio_context(); - LinuxAioState *s = aio_get_linux_aio(ctx); + LinuxAioState *s = opaque; - s->io_q.plugged++; -} - -void laio_io_unplug(uint64_t dev_max_batch) -{ - AioContext *ctx = qemu_get_current_aio_context(); - LinuxAioState *s = aio_get_linux_aio(ctx); - - assert(s->io_q.plugged); - s->io_q.plugged--; - - /* - * Why max batch checking is performed here: - * Another BDS may have queued requests with a higher dev_max_batch and - * therefore in_queue could now exceed our dev_max_batch. Re-check the max - * batch so we can honor our device's dev_max_batch. - */ - if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) || - (!s->io_q.plugged && - !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) { + if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } } @@ -410,10 +389,12 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); s->io_q.in_queue++; - if (!s->io_q.blocked && - (!s->io_q.plugged || - s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) { - ioq_submit(s); + if (!s->io_q.blocked) { + if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) { + ioq_submit(s); + } else { + blk_io_plug_call(laio_unplug_fn, s); + } } return 0; diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index da60ca13ef..0f63c2800c 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -62,13 +62,6 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov, void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context); void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context); - -/* - * laio_io_plug/unplug work in the thread's current AioContext, therefore the - * caller must ensure that they are paired in the same IOThread. - */ -void laio_io_plug(void); -void laio_io_unplug(uint64_t dev_max_batch); #endif /* io_uring.c - Linux io_uring implementation */ #ifdef CONFIG_LINUX_IO_URING From 2a0d7cb6b7c1d1f325a81ec9f50a823318b045bb Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 30 May 2023 14:09:59 -0400 Subject: [PATCH 06/25] block: remove bdrv_co_io_plug() API No block driver implements .bdrv_co_io_plug() anymore. Get rid of the function pointers. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: Stefano Garzarella Acked-by: Kevin Wolf Message-id: 20230530180959.1108766-7-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- block/io.c | 37 -------------------------------- include/block/block-io.h | 3 --- include/block/block_int-common.h | 11 ---------- 3 files changed, 51 deletions(-) diff --git a/block/io.c b/block/io.c index 540bf8d26d..f2dfc7c405 100644 --- a/block/io.c +++ b/block/io.c @@ -3223,43 +3223,6 @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) return mem; } -void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs) -{ - BdrvChild *child; - IO_CODE(); - assert_bdrv_graph_readable(); - - QLIST_FOREACH(child, &bs->children, next) { - bdrv_co_io_plug(child->bs); - } - - if (qatomic_fetch_inc(&bs->io_plugged) == 0) { - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_co_io_plug) { - drv->bdrv_co_io_plug(bs); - } - } -} - -void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs) -{ - BdrvChild *child; - IO_CODE(); - assert_bdrv_graph_readable(); - - assert(bs->io_plugged); - if (qatomic_fetch_dec(&bs->io_plugged) == 1) { - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_co_io_unplug) { - drv->bdrv_co_io_unplug(bs); - } - } - - QLIST_FOREACH(child, &bs->children, next) { - bdrv_co_io_unplug(child->bs); - } -} - /* Helper that undoes bdrv_register_buf() when it fails partway through */ static void GRAPH_RDLOCK bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size, diff --git a/include/block/block-io.h b/include/block/block-io.h index a27e471a87..43af816d75 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -259,9 +259,6 @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx); AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c); -void coroutine_fn GRAPH_RDLOCK bdrv_co_io_plug(BlockDriverState *bs); -void coroutine_fn GRAPH_RDLOCK bdrv_co_io_unplug(BlockDriverState *bs); - bool coroutine_fn GRAPH_RDLOCK bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, uint32_t granularity, Error **errp); diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index b1cbc1e00c..74195c3004 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -768,11 +768,6 @@ struct BlockDriver { void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_debug_event)( BlockDriverState *bs, BlkdebugEvent event); - /* io queue for linux-aio */ - void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_plug)(BlockDriverState *bs); - void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_unplug)( - BlockDriverState *bs); - bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs); bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_can_store_new_dirty_bitmap)( @@ -1227,12 +1222,6 @@ struct BlockDriverState { unsigned int in_flight; unsigned int serialising_in_flight; - /* - * counter for nested bdrv_io_plug. - * Accessed with atomic ops. - */ - unsigned io_plugged; - /* do we need to tell the quest if we have a volatile write cache? */ int enable_write_cache; From 2e2097b495990b198c45dd8ecde8cc58ae3fcd1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:53:51 +0100 Subject: [PATCH 07/25] *-user: remove the guest_user_syscall tracepoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is pure duplication now. Both bsd-user and linux-user have builtin strace support and we can also track syscalls via the plugins system. Reviewed-by: Warner Losh Reviewed-by: Stefan Hajnoczi Reviewed-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-2-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-2-alex.bennee@linaro.org> [Remove unused variable in do_freebsd_syscall() reported by Richard Henderson. --Stefan] Signed-off-by: Stefan Hajnoczi --- bsd-user/freebsd/os-syscall.c | 3 --- include/user/syscall-trace.h | 4 ---- trace-events | 19 ------------------- 3 files changed, 26 deletions(-) diff --git a/bsd-user/freebsd/os-syscall.c b/bsd-user/freebsd/os-syscall.c index c8f998ecec..de36c4b71c 100644 --- a/bsd-user/freebsd/os-syscall.c +++ b/bsd-user/freebsd/os-syscall.c @@ -528,10 +528,8 @@ abi_long do_freebsd_syscall(void *cpu_env, int num, abi_long arg1, abi_long arg5, abi_long arg6, abi_long arg7, abi_long arg8) { - CPUState *cpu = env_cpu(cpu_env); abi_long ret; - trace_guest_user_syscall(cpu, num, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); if (do_strace) { print_freebsd_syscall(num, arg1, arg2, arg3, arg4, arg5, arg6); } @@ -541,7 +539,6 @@ abi_long do_freebsd_syscall(void *cpu_env, int num, abi_long arg1, if (do_strace) { print_freebsd_syscall_ret(num, ret); } - trace_guest_user_syscall_ret(cpu, num, ret); return ret; } diff --git a/include/user/syscall-trace.h b/include/user/syscall-trace.h index 90bda7631c..557f881a79 100644 --- a/include/user/syscall-trace.h +++ b/include/user/syscall-trace.h @@ -26,9 +26,6 @@ static inline void record_syscall_start(void *cpu, int num, abi_long arg5, abi_long arg6, abi_long arg7, abi_long arg8) { - trace_guest_user_syscall(cpu, num, - arg1, arg2, arg3, arg4, - arg5, arg6, arg7, arg8); qemu_plugin_vcpu_syscall(cpu, num, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); @@ -36,7 +33,6 @@ static inline void record_syscall_start(void *cpu, int num, static inline void record_syscall_return(void *cpu, int num, abi_long ret) { - trace_guest_user_syscall_ret(cpu, num, ret); qemu_plugin_vcpu_syscall_ret(cpu, num, ret); } diff --git a/trace-events b/trace-events index b6b84b175e..691c3533e4 100644 --- a/trace-events +++ b/trace-events @@ -85,22 +85,3 @@ vcpu guest_cpu_exit(void) # Targets: all vcpu guest_cpu_reset(void) -# include/user/syscall-trace.h - -# @num: System call number. -# @arg*: System call argument value. -# -# Start executing a guest system call in syscall emulation mode. -# -# Mode: user -# Targets: TCG(all) -vcpu guest_user_syscall(uint64_t num, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, uint64_t arg6, uint64_t arg7, uint64_t arg8) "num=0x%016"PRIx64" arg1=0x%016"PRIx64" arg2=0x%016"PRIx64" arg3=0x%016"PRIx64" arg4=0x%016"PRIx64" arg5=0x%016"PRIx64" arg6=0x%016"PRIx64" arg7=0x%016"PRIx64" arg8=0x%016"PRIx64 - -# @num: System call number. -# @ret: System call result value. -# -# Finish executing a guest system call in syscall emulation mode. -# -# Mode: user -# Targets: TCG(all) -vcpu guest_user_syscall_ret(uint64_t num, uint64_t ret) "num=0x%016"PRIx64" ret=0x%016"PRIx64 From 78f314cf8349a76a757dbcc4da88bf9a0b20d3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:53:52 +0100 Subject: [PATCH 08/25] trace-events: remove the remaining vcpu trace events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While these are all in helper functions being designated vcpu events complicates the removal of the dynamic vcpu state code. TCG plugins allow you to instrument vcpu_[init|exit|idle]. We rename cpu_reset and make it a normal trace point. Reviewed-by: Stefan Hajnoczi Reviewed-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-3-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-3-alex.bennee@linaro.org> Signed-off-by: Stefan Hajnoczi --- hw/core/cpu-common.c | 4 ++-- hw/core/trace-events | 3 +++ trace-events | 31 ------------------------------- trace/control-target.c | 1 - trace/control.c | 2 -- 5 files changed, 5 insertions(+), 36 deletions(-) diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c index 5ccc3837b6..951477a7fd 100644 --- a/hw/core/cpu-common.c +++ b/hw/core/cpu-common.c @@ -32,7 +32,7 @@ #include "sysemu/tcg.h" #include "hw/boards.h" #include "hw/qdev-properties.h" -#include "trace/trace-root.h" +#include "trace.h" #include "qemu/plugin.h" CPUState *cpu_by_arch_id(int64_t id) @@ -113,7 +113,7 @@ void cpu_reset(CPUState *cpu) { device_cold_reset(DEVICE(cpu)); - trace_guest_cpu_reset(cpu); + trace_cpu_reset(cpu->cpu_index); } static void cpu_common_reset_hold(Object *obj) diff --git a/hw/core/trace-events b/hw/core/trace-events index 56da55bd71..2cf085ac66 100644 --- a/hw/core/trace-events +++ b/hw/core/trace-events @@ -29,3 +29,6 @@ clock_set(const char *clk, uint64_t old, uint64_t new) "'%s', %"PRIu64"Hz->%"PRI clock_propagate(const char *clk) "'%s'" clock_update(const char *clk, const char *src, uint64_t hz, int cb) "'%s', src='%s', val=%"PRIu64"Hz cb=%d" clock_set_mul_div(const char *clk, uint32_t oldmul, uint32_t mul, uint32_t olddiv, uint32_t div) "'%s', mul: %u -> %u, div: %u -> %u" + +# cpu-common.c +cpu_reset(int cpu_index) "%d" diff --git a/trace-events b/trace-events index 691c3533e4..dd318ed1af 100644 --- a/trace-events +++ b/trace-events @@ -54,34 +54,3 @@ qmp_job_resume(void *job) "job %p" qmp_job_complete(void *job) "job %p" qmp_job_finalize(void *job) "job %p" qmp_job_dismiss(void *job) "job %p" - - -### Guest events, keep at bottom - - -## vCPU - -# trace/control-target.c - -# Hot-plug a new virtual (guest) CPU -# -# Mode: user, softmmu -# Targets: all -vcpu guest_cpu_enter(void) - -# trace/control.c - -# Hot-unplug a virtual (guest) CPU -# -# Mode: user, softmmu -# Targets: all -vcpu guest_cpu_exit(void) - -# hw/core/cpu.c - -# Reset the state of a virtual (guest) CPU -# -# Mode: user, softmmu -# Targets: all -vcpu guest_cpu_reset(void) - diff --git a/trace/control-target.c b/trace/control-target.c index c0c1e2310a..a10752924b 100644 --- a/trace/control-target.c +++ b/trace/control-target.c @@ -144,5 +144,4 @@ void trace_init_vcpu(CPUState *vcpu) } } } - trace_guest_cpu_enter(vcpu); } diff --git a/trace/control.c b/trace/control.c index 6c77cc6318..d24af91004 100644 --- a/trace/control.c +++ b/trace/control.c @@ -277,8 +277,6 @@ void trace_fini_vcpu(CPUState *vcpu) TraceEventIter iter; TraceEvent *ev; - trace_guest_cpu_exit(vcpu); - trace_event_iter_init_all(&iter); while ((ev = trace_event_iter_next(&iter)) != NULL) { if (trace_event_is_vcpu(ev) && From 710320137099b8d86cedb4f58de2455acd58cbde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:53:53 +0100 Subject: [PATCH 09/25] trace: remove vcpu_id from the TraceEvent structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This does involve temporarily stubbing out some helper functions before we excise the rest of the code. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-4-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-4-alex.bennee@linaro.org> Signed-off-by: Stefan Hajnoczi --- scripts/tracetool/format/c.py | 6 ------ scripts/tracetool/format/h.py | 11 +---------- trace/control-internal.h | 4 ++-- trace/control.c | 10 ---------- trace/event-internal.h | 2 -- 5 files changed, 3 insertions(+), 30 deletions(-) diff --git a/scripts/tracetool/format/c.py b/scripts/tracetool/format/c.py index c390c1844a..69edf0d588 100644 --- a/scripts/tracetool/format/c.py +++ b/scripts/tracetool/format/c.py @@ -32,19 +32,13 @@ def generate(events, backend, group): out('uint16_t %s;' % e.api(e.QEMU_DSTATE)) for e in events: - if "vcpu" in e.properties: - vcpu_id = 0 - else: - vcpu_id = "TRACE_VCPU_EVENT_NONE" out('TraceEvent %(event)s = {', ' .id = 0,', - ' .vcpu_id = %(vcpu_id)s,', ' .name = \"%(name)s\",', ' .sstate = %(sstate)s,', ' .dstate = &%(dstate)s ', '};', event = e.api(e.QEMU_EVENT), - vcpu_id = vcpu_id, name = e.name, sstate = "TRACE_%s_ENABLED" % e.name.upper(), dstate = e.api(e.QEMU_DSTATE)) diff --git a/scripts/tracetool/format/h.py b/scripts/tracetool/format/h.py index e94f0be7da..285d7b03a9 100644 --- a/scripts/tracetool/format/h.py +++ b/scripts/tracetool/format/h.py @@ -74,16 +74,7 @@ def generate(events, backend, group): out('}') - # tracer wrapper with checks (per-vCPU tracing) - if "vcpu" in e.properties: - trace_cpu = next(iter(e.args))[1] - cond = "trace_event_get_vcpu_state(%(cpu)s,"\ - " TRACE_%(id)s)"\ - % dict( - cpu=trace_cpu, - id=e.name.upper()) - else: - cond = "true" + cond = "true" out('', 'static inline void %(api)s(%(args)s)', diff --git a/trace/control-internal.h b/trace/control-internal.h index 8b2b50a7cf..0178121720 100644 --- a/trace/control-internal.h +++ b/trace/control-internal.h @@ -27,12 +27,12 @@ static inline uint32_t trace_event_get_id(TraceEvent *ev) static inline uint32_t trace_event_get_vcpu_id(TraceEvent *ev) { - return ev->vcpu_id; + return 0; } static inline bool trace_event_is_vcpu(TraceEvent *ev) { - return ev->vcpu_id != TRACE_VCPU_EVENT_NONE; + return false; } static inline const char * trace_event_get_name(TraceEvent *ev) diff --git a/trace/control.c b/trace/control.c index d24af91004..5dfb609954 100644 --- a/trace/control.c +++ b/trace/control.c @@ -68,16 +68,6 @@ void trace_event_register_group(TraceEvent **events) size_t i; for (i = 0; events[i] != NULL; i++) { events[i]->id = next_id++; - if (events[i]->vcpu_id == TRACE_VCPU_EVENT_NONE) { - continue; - } - - if (likely(next_vcpu_id < CPU_TRACE_DSTATE_MAX_EVENTS)) { - events[i]->vcpu_id = next_vcpu_id++; - } else { - warn_report("too many vcpu trace events; dropping '%s'", - events[i]->name); - } } event_groups = g_renew(TraceEventGroup, event_groups, nevent_groups + 1); event_groups[nevent_groups].events = events; diff --git a/trace/event-internal.h b/trace/event-internal.h index f63500b37e..0c24e01b52 100644 --- a/trace/event-internal.h +++ b/trace/event-internal.h @@ -19,7 +19,6 @@ /** * TraceEvent: * @id: Unique event identifier. - * @vcpu_id: Unique per-vCPU event identifier. * @name: Event name. * @sstate: Static tracing state. * @dstate: Dynamic tracing state @@ -33,7 +32,6 @@ */ typedef struct TraceEvent { uint32_t id; - uint32_t vcpu_id; const char * name; const bool sstate; uint16_t *dstate; From 9deb999305ef11d1f06a909b29ab625476091552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:53:54 +0100 Subject: [PATCH 10/25] scripts/qapi: document the tool that generated the file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes it a little easier for developers to find where things where being generated. Reviewed-by: Richard Henderson Reviewed-by: Stefan Hajnoczi Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Markus Armbruster Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-5-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-5-alex.bennee@linaro.org> Signed-off-by: Stefan Hajnoczi --- scripts/qapi/gen.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/qapi/gen.py b/scripts/qapi/gen.py index 8f8f784f4a..70bc576a10 100644 --- a/scripts/qapi/gen.py +++ b/scripts/qapi/gen.py @@ -13,6 +13,7 @@ from contextlib import contextmanager import os +import sys import re from typing import ( Dict, @@ -162,7 +163,7 @@ class QAPIGenC(QAPIGenCCode): def _top(self) -> str: return mcgen(''' -/* AUTOMATICALLY GENERATED, DO NOT MODIFY */ +/* AUTOMATICALLY GENERATED by %(tool)s DO NOT MODIFY */ /* %(blurb)s @@ -174,6 +175,7 @@ class QAPIGenC(QAPIGenCCode): */ ''', + tool=os.path.basename(sys.argv[0]), blurb=self._blurb, copyright=self._copyright) def _bottom(self) -> str: @@ -195,7 +197,10 @@ class QAPIGenH(QAPIGenC): class QAPIGenTrace(QAPIGen): def _top(self) -> str: - return super()._top() + '# AUTOMATICALLY GENERATED, DO NOT MODIFY\n\n' + return (super()._top() + + '# AUTOMATICALLY GENERATED by ' + + os.path.basename(sys.argv[0]) + + ', DO NOT MODIFY\n\n') @contextmanager From 1a8fc85019a49bdc571f57b597198d7503a273f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:53:55 +0100 Subject: [PATCH 11/25] docs/deprecated: move QMP events bellow QMP command section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also rename the section to make the fact this is part of the management protocol even clearer. Suggested-by: Markus Armbruster Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-6-alex.bennee@linaro.org Signed-off-by: Stefan Hajnoczi --- docs/about/deprecated.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index e934e0a13a..7c45a64363 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -218,6 +218,15 @@ instruction per translated block" mode (which can be set on the command line or via the HMP, but not via QMP). The information remains available via the HMP 'info jit' command. +QEMU Machine Protocol (QMP) events +---------------------------------- + +``MEM_UNPLUG_ERROR`` (since 6.2) +'''''''''''''''''''''''''''''''''''''''''''''''''''''''' + +Use the more generic event ``DEVICE_UNPLUG_GUEST_ERROR`` instead. + + Human Monitor Protocol (HMP) commands ------------------------------------- @@ -251,15 +260,6 @@ it. Since all recent x86 hardware from the past >10 years is capable of the 64-bit x86 extensions, a corresponding 64-bit OS should be used instead. -QEMU API (QAPI) events ----------------------- - -``MEM_UNPLUG_ERROR`` (since 6.2) -'''''''''''''''''''''''''''''''''''''''''''''''''''''''' - -Use the more generic event ``DEVICE_UNPLUG_GUEST_ERROR`` instead. - - System emulator machines ------------------------ From 5485e52a332c3b38e338081a7275b2c7cb4f0813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:53:56 +0100 Subject: [PATCH 12/25] qapi: make the vcpu parameters deprecated for 8.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I don't think I can remove the parameters directly but certainly mark them as deprecated. Reviewed-by: Stefan Hajnoczi Reviewed-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-7-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-6-alex.bennee@linaro.org> Signed-off-by: Stefan Hajnoczi --- docs/about/deprecated.rst | 7 +++++++ qapi/trace.json | 40 +++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index 7c45a64363..0743459862 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -226,6 +226,13 @@ QEMU Machine Protocol (QMP) events Use the more generic event ``DEVICE_UNPLUG_GUEST_ERROR`` instead. +``vcpu`` trace events (since 8.1) +''''''''''''''''''''''''''''''''' + +The ability to instrument QEMU helper functions with vCPU-aware trace +points was removed in 7.0. However QMP still exposed the vcpu +parameter. This argument has now been deprecated and the remaining +remaining trace points that used it are selected just by name. Human Monitor Protocol (HMP) commands ------------------------------------- diff --git a/qapi/trace.json b/qapi/trace.json index 6bf0af0946..39b752fc88 100644 --- a/qapi/trace.json +++ b/qapi/trace.json @@ -37,13 +37,14 @@ # # @vcpu: Whether this is a per-vCPU event (since 2.7). # -# An event is per-vCPU if it has the "vcpu" property in the -# "trace-events" files. +# Features: +# @deprecated: Member @vcpu is deprecated, and always ignored. # # Since: 2.2 ## { 'struct': 'TraceEventInfo', - 'data': {'name': 'str', 'state': 'TraceEventState', 'vcpu': 'bool'} } + 'data': {'name': 'str', 'state': 'TraceEventState', + 'vcpu': { 'type': 'bool', 'features': ['deprecated'] } } } ## # @trace-event-get-state: @@ -52,19 +53,15 @@ # # @name: Event name pattern (case-sensitive glob). # -# @vcpu: The vCPU to query (any by default; since 2.7). +# @vcpu: The vCPU to query (since 2.7). +# +# Features: +# @deprecated: Member @vcpu is deprecated, and always ignored. # # Returns: a list of @TraceEventInfo for the matching events # -# An event is returned if: -# -# - its name matches the @name pattern, and -# - if @vcpu is given, the event has the "vcpu" property. -# -# Therefore, if @vcpu is given, the operation will only match per-vCPU -# events, returning their state on the specified vCPU. Special case: -# if @name is an exact match, @vcpu is given and the event does not -# have the "vcpu" property, an error is returned. +# An event is returned if its name matches the @name pattern +# (There are no longer any per-vCPU events). # # Since: 2.2 # @@ -75,7 +72,8 @@ # <- { "return": [ { "name": "qemu_memalign", "state": "disabled", "vcpu": false } ] } ## { 'command': 'trace-event-get-state', - 'data': {'name': 'str', '*vcpu': 'int'}, + 'data': {'name': 'str', + '*vcpu': {'type': 'int', 'features': ['deprecated'] } }, 'returns': ['TraceEventInfo'] } ## @@ -91,15 +89,11 @@ # # @vcpu: The vCPU to act upon (all by default; since 2.7). # -# An event's state is modified if: +# Features: +# @deprecated: Member @vcpu is deprecated, and always ignored. # -# - its name matches the @name pattern, and -# - if @vcpu is given, the event has the "vcpu" property. -# -# Therefore, if @vcpu is given, the operation will only match per-vCPU -# events, setting their state on the specified vCPU. Special case: if -# @name is an exact match, @vcpu is given and the event does not have -# the "vcpu" property, an error is returned. +# An event is enabled if its name matches the @name pattern +# (There are no longer any per-vCPU events). # # Since: 2.2 # @@ -111,4 +105,4 @@ ## { 'command': 'trace-event-set-state', 'data': {'name': 'str', 'enable': 'bool', '*ignore-unavailable': 'bool', - '*vcpu': 'int'} } + '*vcpu': {'type': 'int', 'features': ['deprecated'] } } } From 89aafcf2a79307b7d658b174e2d649dd63365bd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:53:57 +0100 Subject: [PATCH 13/25] trace: remove code that depends on setting vcpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we no longer have any events that are for vcpus we can start excising the code from the trace control. As the vcpu parameter is encoded as part of QMP we just stub out the has_vcpu/vcpu parameters rather than alter the API. Reviewed-by: Stefan Hajnoczi Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-8-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-7-alex.bennee@linaro.org> Signed-off-by: Stefan Hajnoczi --- hw/core/cpu-common.c | 2 - stubs/trace-control.c | 13 ----- trace/control-internal.h | 10 ---- trace/control-target.c | 108 ++++----------------------------------- trace/control-vcpu.h | 16 ------ trace/control.c | 16 ------ trace/control.h | 48 ----------------- trace/qmp.c | 74 +++------------------------ trace/trace-hmp-cmds.c | 18 ++----- 9 files changed, 20 insertions(+), 285 deletions(-) diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c index 951477a7fd..f4e51c8a1b 100644 --- a/hw/core/cpu-common.c +++ b/hw/core/cpu-common.c @@ -211,7 +211,6 @@ static void cpu_common_realizefn(DeviceState *dev, Error **errp) } /* NOTE: latest generic point where the cpu is fully realized */ - trace_init_vcpu(cpu); } static void cpu_common_unrealizefn(DeviceState *dev) @@ -219,7 +218,6 @@ static void cpu_common_unrealizefn(DeviceState *dev) CPUState *cpu = CPU(dev); /* NOTE: latest generic point before the cpu is fully unrealized */ - trace_fini_vcpu(cpu); cpu_exec_unrealizefn(cpu); } diff --git a/stubs/trace-control.c b/stubs/trace-control.c index 7f856e5c24..b428f34c87 100644 --- a/stubs/trace-control.c +++ b/stubs/trace-control.c @@ -36,16 +36,3 @@ void trace_event_set_state_dynamic(TraceEvent *ev, bool state) } } } - -void trace_event_set_vcpu_state_dynamic(CPUState *vcpu, - TraceEvent *ev, bool state) -{ - /* should never be called on non-target binaries */ - abort(); -} - -void trace_init_vcpu(CPUState *vcpu) -{ - /* should never be called on non-target binaries */ - abort(); -} diff --git a/trace/control-internal.h b/trace/control-internal.h index 0178121720..8d818d359b 100644 --- a/trace/control-internal.h +++ b/trace/control-internal.h @@ -25,16 +25,6 @@ static inline uint32_t trace_event_get_id(TraceEvent *ev) return ev->id; } -static inline uint32_t trace_event_get_vcpu_id(TraceEvent *ev) -{ - return 0; -} - -static inline bool trace_event_is_vcpu(TraceEvent *ev) -{ - return false; -} - static inline const char * trace_event_get_name(TraceEvent *ev) { assert(ev != NULL); diff --git a/trace/control-target.c b/trace/control-target.c index a10752924b..97f21e476d 100644 --- a/trace/control-target.c +++ b/trace/control-target.c @@ -36,112 +36,22 @@ void trace_event_set_state_dynamic_init(TraceEvent *ev, bool state) void trace_event_set_state_dynamic(TraceEvent *ev, bool state) { - CPUState *vcpu; assert(trace_event_get_state_static(ev)); - if (trace_event_is_vcpu(ev) && likely(first_cpu != NULL)) { - CPU_FOREACH(vcpu) { - trace_event_set_vcpu_state_dynamic(vcpu, ev, state); - } - } else { - /* - * Without the "vcpu" property, dstate can only be 1 or 0. With it, we - * haven't instantiated any vCPU yet, so we will set a global state - * instead, and trace_init_vcpu will reconcile it afterwards. - */ - bool state_pre = *ev->dstate; - if (state_pre != state) { - if (state) { - trace_events_enabled_count++; - *ev->dstate = 1; - } else { - trace_events_enabled_count--; - *ev->dstate = 0; - } - } - } -} -static void trace_event_synchronize_vcpu_state_dynamic( - CPUState *vcpu, run_on_cpu_data ignored) -{ - bitmap_copy(vcpu->trace_dstate, vcpu->trace_dstate_delayed, - CPU_TRACE_DSTATE_MAX_EVENTS); - tcg_flush_jmp_cache(vcpu); -} - -void trace_event_set_vcpu_state_dynamic(CPUState *vcpu, - TraceEvent *ev, bool state) -{ - uint32_t vcpu_id; - bool state_pre; - assert(trace_event_get_state_static(ev)); - assert(trace_event_is_vcpu(ev)); - vcpu_id = trace_event_get_vcpu_id(ev); - state_pre = test_bit(vcpu_id, vcpu->trace_dstate); + /* + * There is no longer a "vcpu" property, dstate can only be 1 or + * 0. With it, we haven't instantiated any vCPU yet, so we will + * set a global state instead, and trace_init_vcpu will reconcile + * it afterwards. + */ + bool state_pre = *ev->dstate; if (state_pre != state) { if (state) { trace_events_enabled_count++; - set_bit(vcpu_id, vcpu->trace_dstate_delayed); - (*ev->dstate)++; + *ev->dstate = 1; } else { trace_events_enabled_count--; - clear_bit(vcpu_id, vcpu->trace_dstate_delayed); - (*ev->dstate)--; - } - if (vcpu->created) { - /* - * Delay changes until next TB; we want all TBs to be built from a - * single set of dstate values to ensure consistency of generated - * tracing code. - */ - async_run_on_cpu(vcpu, trace_event_synchronize_vcpu_state_dynamic, - RUN_ON_CPU_NULL); - } else { - trace_event_synchronize_vcpu_state_dynamic(vcpu, RUN_ON_CPU_NULL); - } - } -} - -static bool adding_first_cpu1(void) -{ - CPUState *cpu; - size_t count = 0; - CPU_FOREACH(cpu) { - count++; - if (count > 1) { - return false; - } - } - return true; -} - -static bool adding_first_cpu(void) -{ - QEMU_LOCK_GUARD(&qemu_cpu_list_lock); - - return adding_first_cpu1(); -} - -void trace_init_vcpu(CPUState *vcpu) -{ - TraceEventIter iter; - TraceEvent *ev; - trace_event_iter_init_all(&iter); - while ((ev = trace_event_iter_next(&iter)) != NULL) { - if (trace_event_is_vcpu(ev) && - trace_event_get_state_static(ev) && - trace_event_get_state_dynamic(ev)) { - if (adding_first_cpu()) { - /* check preconditions */ - assert(*ev->dstate == 1); - /* disable early-init state ... */ - *ev->dstate = 0; - trace_events_enabled_count--; - /* ... and properly re-enable */ - trace_event_set_vcpu_state_dynamic(vcpu, ev, true); - } else { - trace_event_set_vcpu_state_dynamic(vcpu, ev, true); - } + *ev->dstate = 0; } } } diff --git a/trace/control-vcpu.h b/trace/control-vcpu.h index 0f98ebe7b5..800fc5a219 100644 --- a/trace/control-vcpu.h +++ b/trace/control-vcpu.h @@ -30,13 +30,6 @@ trace_event_get_vcpu_state_dynamic_by_vcpu_id( \ vcpu, _ ## id ## _EVENT.vcpu_id)) -/** - * trace_event_get_vcpu_state_dynamic: - * - * Get the dynamic tracing state of an event for the given vCPU. - */ -static bool trace_event_get_vcpu_state_dynamic(CPUState *vcpu, TraceEvent *ev); - #include "control-internal.h" static inline bool @@ -51,13 +44,4 @@ trace_event_get_vcpu_state_dynamic_by_vcpu_id(CPUState *vcpu, } } -static inline bool trace_event_get_vcpu_state_dynamic(CPUState *vcpu, - TraceEvent *ev) -{ - uint32_t vcpu_id; - assert(trace_event_is_vcpu(ev)); - vcpu_id = trace_event_get_vcpu_id(ev); - return trace_event_get_vcpu_state_dynamic_by_vcpu_id(vcpu, vcpu_id); -} - #endif diff --git a/trace/control.c b/trace/control.c index 5dfb609954..1a48a7e266 100644 --- a/trace/control.c +++ b/trace/control.c @@ -262,22 +262,6 @@ void trace_init_file(void) #endif } -void trace_fini_vcpu(CPUState *vcpu) -{ - TraceEventIter iter; - TraceEvent *ev; - - trace_event_iter_init_all(&iter); - while ((ev = trace_event_iter_next(&iter)) != NULL) { - if (trace_event_is_vcpu(ev) && - trace_event_get_state_static(ev) && - trace_event_get_vcpu_state_dynamic(vcpu, ev)) { - /* must disable to affect the global counter */ - trace_event_set_vcpu_state_dynamic(vcpu, ev, false); - } - } -} - bool trace_init_backends(void) { #ifdef CONFIG_TRACE_SIMPLE diff --git a/trace/control.h b/trace/control.h index 23b8393b29..dfd209edd8 100644 --- a/trace/control.h +++ b/trace/control.h @@ -89,23 +89,6 @@ static bool trace_event_is_pattern(const char *str); */ static uint32_t trace_event_get_id(TraceEvent *ev); -/** - * trace_event_get_vcpu_id: - * - * Get the per-vCPU identifier of an event. - * - * Special value #TRACE_VCPU_EVENT_NONE means the event is not vCPU-specific - * (does not have the "vcpu" property). - */ -static uint32_t trace_event_get_vcpu_id(TraceEvent *ev); - -/** - * trace_event_is_vcpu: - * - * Whether this is a per-vCPU event. - */ -static bool trace_event_is_vcpu(TraceEvent *ev); - /** * trace_event_get_name: * @@ -172,21 +155,6 @@ static bool trace_event_get_state_dynamic(TraceEvent *ev); */ void trace_event_set_state_dynamic(TraceEvent *ev, bool state); -/** - * trace_event_set_vcpu_state_dynamic: - * - * Set the dynamic tracing state of an event for the given vCPU. - * - * Pre-condition: trace_event_get_vcpu_state_static(ev) == true - * - * Note: Changes for execution-time events with the 'tcg' property will not be - * propagated until the next TB is executed (iff executing in TCG mode). - */ -void trace_event_set_vcpu_state_dynamic(CPUState *vcpu, - TraceEvent *ev, bool state); - - - /** * trace_init_backends: * @@ -205,22 +173,6 @@ bool trace_init_backends(void); */ void trace_init_file(void); -/** - * trace_init_vcpu: - * @vcpu: Added vCPU. - * - * Set initial dynamic event state for a hot-plugged vCPU. - */ -void trace_init_vcpu(CPUState *vcpu); - -/** - * trace_fini_vcpu: - * @vcpu: Removed vCPU. - * - * Disable dynamic event state for a hot-unplugged vCPU. - */ -void trace_fini_vcpu(CPUState *vcpu); - /** * trace_list_events: * @f: Where to send output. diff --git a/trace/qmp.c b/trace/qmp.c index 3b4f4702b4..aa760f1fc4 100644 --- a/trace/qmp.c +++ b/trace/qmp.c @@ -13,20 +13,7 @@ #include "control-vcpu.h" -static CPUState *get_cpu(bool has_vcpu, int vcpu, Error **errp) -{ - if (has_vcpu) { - CPUState *cpu = qemu_get_cpu(vcpu); - if (cpu == NULL) { - error_setg(errp, "invalid vCPU index %u", vcpu); - } - return cpu; - } else { - return NULL; - } -} - -static bool check_events(bool has_vcpu, bool ignore_unavailable, bool is_pattern, +static bool check_events(bool ignore_unavailable, bool is_pattern, const char *name, Error **errp) { if (!is_pattern) { @@ -38,12 +25,6 @@ static bool check_events(bool has_vcpu, bool ignore_unavailable, bool is_pattern return false; } - /* error for non-vcpu event */ - if (has_vcpu && !trace_event_is_vcpu(ev)) { - error_setg(errp, "event \"%s\" is not vCPU-specific", name); - return false; - } - /* error for unavailable event */ if (!ignore_unavailable && !trace_event_get_state_static(ev)) { error_setg(errp, "event \"%s\" is disabled", name); @@ -70,22 +51,13 @@ TraceEventInfoList *qmp_trace_event_get_state(const char *name, bool has_vcpu, int64_t vcpu, Error **errp) { - Error *err = NULL; TraceEventInfoList *events = NULL; TraceEventIter iter; TraceEvent *ev; bool is_pattern = trace_event_is_pattern(name); - CPUState *cpu; - - /* Check provided vcpu */ - cpu = get_cpu(has_vcpu, vcpu, &err); - if (err) { - error_propagate(errp, err); - return NULL; - } /* Check events */ - if (!check_events(has_vcpu, true, is_pattern, name, errp)) { + if (!check_events(true, is_pattern, name, errp)) { return NULL; } @@ -93,33 +65,17 @@ TraceEventInfoList *qmp_trace_event_get_state(const char *name, trace_event_iter_init_pattern(&iter, name); while ((ev = trace_event_iter_next(&iter)) != NULL) { TraceEventInfo *value; - bool is_vcpu = trace_event_is_vcpu(ev); - if (has_vcpu && !is_vcpu) { - continue; - } value = g_new(TraceEventInfo, 1); - value->vcpu = is_vcpu; value->name = g_strdup(trace_event_get_name(ev)); if (!trace_event_get_state_static(ev)) { value->state = TRACE_EVENT_STATE_UNAVAILABLE; } else { - if (has_vcpu) { - if (is_vcpu) { - if (trace_event_get_vcpu_state_dynamic(cpu, ev)) { - value->state = TRACE_EVENT_STATE_ENABLED; - } else { - value->state = TRACE_EVENT_STATE_DISABLED; - } - } - /* else: already skipped above */ + if (trace_event_get_state_dynamic(ev)) { + value->state = TRACE_EVENT_STATE_ENABLED; } else { - if (trace_event_get_state_dynamic(ev)) { - value->state = TRACE_EVENT_STATE_ENABLED; - } else { - value->state = TRACE_EVENT_STATE_DISABLED; - } + value->state = TRACE_EVENT_STATE_DISABLED; } } QAPI_LIST_PREPEND(events, value); @@ -133,21 +89,12 @@ void qmp_trace_event_set_state(const char *name, bool enable, bool has_vcpu, int64_t vcpu, Error **errp) { - Error *err = NULL; TraceEventIter iter; TraceEvent *ev; bool is_pattern = trace_event_is_pattern(name); - CPUState *cpu; - - /* Check provided vcpu */ - cpu = get_cpu(has_vcpu, vcpu, &err); - if (err) { - error_propagate(errp, err); - return; - } /* Check events */ - if (!check_events(has_vcpu, has_ignore_unavailable && ignore_unavailable, + if (!check_events(has_ignore_unavailable && ignore_unavailable, is_pattern, name, errp)) { return; } @@ -155,14 +102,9 @@ void qmp_trace_event_set_state(const char *name, bool enable, /* Apply changes (all errors checked above) */ trace_event_iter_init_pattern(&iter, name); while ((ev = trace_event_iter_next(&iter)) != NULL) { - if (!trace_event_get_state_static(ev) || - (has_vcpu && !trace_event_is_vcpu(ev))) { + if (!trace_event_get_state_static(ev)) { continue; } - if (has_vcpu) { - trace_event_set_vcpu_state_dynamic(cpu, ev, enable); - } else { - trace_event_set_state_dynamic(ev, enable); - } + trace_event_set_state_dynamic(ev, enable); } } diff --git a/trace/trace-hmp-cmds.c b/trace/trace-hmp-cmds.c index 792876c34a..86211fce27 100644 --- a/trace/trace-hmp-cmds.c +++ b/trace/trace-hmp-cmds.c @@ -37,16 +37,10 @@ void hmp_trace_event(Monitor *mon, const QDict *qdict) { const char *tp_name = qdict_get_str(qdict, "name"); bool new_state = qdict_get_bool(qdict, "option"); - bool has_vcpu = qdict_haskey(qdict, "vcpu"); - int vcpu = qdict_get_try_int(qdict, "vcpu", 0); Error *local_err = NULL; - if (vcpu < 0) { - monitor_printf(mon, "argument vcpu must be positive"); - return; - } - - qmp_trace_event_set_state(tp_name, new_state, true, true, has_vcpu, vcpu, &local_err); + qmp_trace_event_set_state(tp_name, new_state, + true, true, false, 0, &local_err); if (local_err) { error_report_err(local_err); } @@ -80,8 +74,6 @@ void hmp_trace_file(Monitor *mon, const QDict *qdict) void hmp_info_trace_events(Monitor *mon, const QDict *qdict) { const char *name = qdict_get_try_str(qdict, "name"); - bool has_vcpu = qdict_haskey(qdict, "vcpu"); - int vcpu = qdict_get_try_int(qdict, "vcpu", 0); TraceEventInfoList *events; TraceEventInfoList *elem; Error *local_err = NULL; @@ -89,12 +81,8 @@ void hmp_info_trace_events(Monitor *mon, const QDict *qdict) if (name == NULL) { name = "*"; } - if (vcpu < 0) { - monitor_printf(mon, "argument vcpu must be positive"); - return; - } - events = qmp_trace_event_get_state(name, has_vcpu, vcpu, &local_err); + events = qmp_trace_event_get_state(name, false, 0, &local_err); if (local_err) { error_report_err(local_err); return; From 333df1c6c7a388ddccaaa041b4fa2ce565d1255e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:53:58 +0100 Subject: [PATCH 14/25] trace: remove control-vcpu.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we no longer have vcpu controlled trace events we can excise the code that allows us to query its status. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Hajnoczi Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-9-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-8-alex.bennee@linaro.org> Signed-off-by: Stefan Hajnoczi --- scripts/tracetool/format/h.py | 5 +--- trace/control-vcpu.h | 47 ----------------------------------- trace/qmp.c | 2 +- 3 files changed, 2 insertions(+), 52 deletions(-) delete mode 100644 trace/control-vcpu.h diff --git a/scripts/tracetool/format/h.py b/scripts/tracetool/format/h.py index 285d7b03a9..ea126b07ea 100644 --- a/scripts/tracetool/format/h.py +++ b/scripts/tracetool/format/h.py @@ -16,10 +16,7 @@ from tracetool import out def generate(events, backend, group): - if group == "root": - header = "trace/control-vcpu.h" - else: - header = "trace/control.h" + header = "trace/control.h" out('/* This file is autogenerated by tracetool, do not edit. */', '', diff --git a/trace/control-vcpu.h b/trace/control-vcpu.h deleted file mode 100644 index 800fc5a219..0000000000 --- a/trace/control-vcpu.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Interface for configuring and controlling the state of tracing events. - * - * Copyright (C) 2011-2016 Lluís Vilanova - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - */ - -#ifndef TRACE__CONTROL_VCPU_H -#define TRACE__CONTROL_VCPU_H - -#include "control.h" -#include "event-internal.h" -#include "hw/core/cpu.h" - -/** - * trace_event_get_vcpu_state: - * @vcpu: Target vCPU. - * @id: Event identifier name. - * - * Get the tracing state of an event (both static and dynamic) for the given - * vCPU. - * - * If the event has the disabled property, the check will have no performance - * impact. - */ -#define trace_event_get_vcpu_state(vcpu, id) \ - ((id ##_ENABLED) && \ - trace_event_get_vcpu_state_dynamic_by_vcpu_id( \ - vcpu, _ ## id ## _EVENT.vcpu_id)) - -#include "control-internal.h" - -static inline bool -trace_event_get_vcpu_state_dynamic_by_vcpu_id(CPUState *vcpu, - uint32_t vcpu_id) -{ - /* it's on fast path, avoid consistency checks (asserts) */ - if (unlikely(trace_events_enabled_count)) { - return test_bit(vcpu_id, vcpu->trace_dstate); - } else { - return false; - } -} - -#endif diff --git a/trace/qmp.c b/trace/qmp.c index aa760f1fc4..3e3971c6a8 100644 --- a/trace/qmp.c +++ b/trace/qmp.c @@ -10,7 +10,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qapi/qapi-commands-trace.h" -#include "control-vcpu.h" +#include "control.h" static bool check_events(bool ignore_unavailable, bool is_pattern, From d0aaf08bb9453c12ab33bd5defa1815c34460595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:53:59 +0100 Subject: [PATCH 15/25] tcg: remove the final vestiges of dstate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we no longer have dynamic state affecting things we can remove the additional fields in cpu.h and simplify the TB hash calculation. For the benchmark: hyperfine -w 2 -m 20 \ "./arm-softmmu/qemu-system-arm -cpu cortex-a15 \ -machine type=virt,highmem=off \ -display none -m 2048 \ -serial mon:stdio \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-pci,netdev=unet \ -device virtio-scsi-pci \ -blockdev driver=raw,node-name=hd,discard=unmap,file.driver=host_device,file.filename=/dev/zen-disk/debian-bullseye-armhf \ -device scsi-hd,drive=hd -smp 4 \ -kernel /home/alex/lsrc/linux.git/builds/arm/arch/arm/boot/zImage \ -append 'console=ttyAMA0 root=/dev/sda2 systemd.unit=benchmark.service' \ -snapshot" It has a marginal effect on runtime, before: Time (mean ± σ): 26.279 s ± 2.438 s [User: 41.113 s, System: 1.843 s] Range (min … max): 24.420 s … 32.565 s 20 runs after: Time (mean ± σ): 24.440 s ± 2.885 s [User: 34.474 s, System: 2.028 s] Range (min … max): 21.663 s … 29.937 s 20 runs Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1358 Reviewed-by: Stefan Hajnoczi Reviewed-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-10-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-9-alex.bennee@linaro.org> Signed-off-by: Stefan Hajnoczi --- accel/tcg/cpu-exec.c | 7 +------ accel/tcg/tb-hash.h | 6 +++--- accel/tcg/tb-maint.c | 5 ++--- accel/tcg/translate-all.c | 6 ------ include/exec/exec-all.h | 3 --- include/hw/core/cpu.h | 5 ----- 6 files changed, 6 insertions(+), 26 deletions(-) diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 0e741960da..4a1dce98ff 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -175,7 +175,6 @@ struct tb_desc { tb_page_addr_t page_addr0; uint32_t flags; uint32_t cflags; - uint32_t trace_vcpu_dstate; }; static bool tb_lookup_cmp(const void *p, const void *d) @@ -187,7 +186,6 @@ static bool tb_lookup_cmp(const void *p, const void *d) tb_page_addr0(tb) == desc->page_addr0 && tb->cs_base == desc->cs_base && tb->flags == desc->flags && - tb->trace_vcpu_dstate == desc->trace_vcpu_dstate && tb_cflags(tb) == desc->cflags) { /* check next page if needed */ tb_page_addr_t tb_phys_page1 = tb_page_addr1(tb); @@ -228,7 +226,6 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, desc.cs_base = cs_base; desc.flags = flags; desc.cflags = cflags; - desc.trace_vcpu_dstate = *cpu->trace_dstate; desc.pc = pc; phys_pc = get_page_addr_code(desc.env, pc); if (phys_pc == -1) { @@ -236,7 +233,7 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, } desc.page_addr0 = phys_pc; h = tb_hash_func(phys_pc, (cflags & CF_PCREL ? 0 : pc), - flags, cflags, *cpu->trace_dstate); + flags, cflags); return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp); } @@ -263,7 +260,6 @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc, jc->array[hash].pc == pc && tb->cs_base == cs_base && tb->flags == flags && - tb->trace_vcpu_dstate == *cpu->trace_dstate && tb_cflags(tb) == cflags)) { return tb; } @@ -282,7 +278,6 @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc, tb->pc == pc && tb->cs_base == cs_base && tb->flags == flags && - tb->trace_vcpu_dstate == *cpu->trace_dstate && tb_cflags(tb) == cflags)) { return tb; } diff --git a/accel/tcg/tb-hash.h b/accel/tcg/tb-hash.h index 83dc610e4c..1d19c69caa 100644 --- a/accel/tcg/tb-hash.h +++ b/accel/tcg/tb-hash.h @@ -61,10 +61,10 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) #endif /* CONFIG_SOFTMMU */ static inline -uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags, - uint32_t cf_mask, uint32_t trace_vcpu_dstate) +uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, + uint32_t flags, uint32_t cf_mask) { - return qemu_xxhash7(phys_pc, pc, flags, cf_mask, trace_vcpu_dstate); + return qemu_xxhash6(phys_pc, pc, flags, cf_mask); } #endif diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c index 991746f80f..bc1961ea55 100644 --- a/accel/tcg/tb-maint.c +++ b/accel/tcg/tb-maint.c @@ -50,7 +50,6 @@ static bool tb_cmp(const void *ap, const void *bp) a->cs_base == b->cs_base && a->flags == b->flags && (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) && - a->trace_vcpu_dstate == b->trace_vcpu_dstate && tb_page_addr0(a) == tb_page_addr0(b) && tb_page_addr1(a) == tb_page_addr1(b)); } @@ -888,7 +887,7 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list) /* remove the TB from the hash list */ phys_pc = tb_page_addr0(tb); h = tb_hash_func(phys_pc, (orig_cflags & CF_PCREL ? 0 : tb->pc), - tb->flags, orig_cflags, tb->trace_vcpu_dstate); + tb->flags, orig_cflags); if (!qht_remove(&tb_ctx.htable, tb, h)) { return; } @@ -969,7 +968,7 @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, /* add in the hash table */ h = tb_hash_func(phys_pc, (tb->cflags & CF_PCREL ? 0 : tb->pc), - tb->flags, tb->cflags, tb->trace_vcpu_dstate); + tb->flags, tb->cflags); qht_insert(&tb_ctx.htable, tb, h, &existing_tb); /* remove TB from the page(s) if we couldn't insert it */ diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index c87648b99e..bf814b9e81 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -65,11 +65,6 @@ #include "internal.h" #include "perf.h" -/* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */ -QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS > - sizeof_field(TranslationBlock, trace_vcpu_dstate) - * BITS_PER_BYTE); - TBContext tb_ctx; /* @@ -352,7 +347,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu, tb->cs_base = cs_base; tb->flags = flags; tb->cflags = cflags; - tb->trace_vcpu_dstate = *cpu->trace_dstate; tb_set_page_addr0(tb, phys_pc); tb_set_page_addr1(tb, -1); tcg_ctx->gen_tb = tb; diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 4d2b151986..3b1b57f6ad 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -545,9 +545,6 @@ struct TranslationBlock { #define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */ #define CF_CLUSTER_SHIFT 24 - /* Per-vCPU dynamic tracing state used to generate this TB */ - uint32_t trace_vcpu_dstate; - /* * Above fields used for comparing */ diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 39150cf8f8..383456d1b3 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -266,7 +266,6 @@ typedef void (*run_on_cpu_func)(CPUState *cpu, run_on_cpu_data data); struct qemu_work_item; #define CPU_UNSET_NUMA_NODE_ID -1 -#define CPU_TRACE_DSTATE_MAX_EVENTS 32 /** * CPUState: @@ -407,10 +406,6 @@ struct CPUState { /* Use by accel-block: CPU is executing an ioctl() */ QemuLockCnt in_ioctl_lock; - /* Used for events with 'vcpu' and *without* the 'disabled' properties */ - DECLARE_BITMAP(trace_dstate_delayed, CPU_TRACE_DSTATE_MAX_EVENTS); - DECLARE_BITMAP(trace_dstate, CPU_TRACE_DSTATE_MAX_EVENTS); - DECLARE_BITMAP(plugin_mask, QEMU_PLUGIN_EV_MAX); #ifdef CONFIG_PLUGIN From 80106bc5f906ef965c397515772cc9d3955e6d2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:54:00 +0100 Subject: [PATCH 16/25] hw/9pfs: use qemu_xxhash4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need to pass zeros as we have helpers that do that for us. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Christian Schoenebeck Reviewed-by: Stefan Hajnoczi Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-id: 20230526165401.574474-11-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-10-alex.bennee@linaro.org> Signed-off-by: Stefan Hajnoczi --- hw/9pfs/9p.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index 9621ec1341..991645adca 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -738,15 +738,14 @@ static VariLenAffix affixForIndex(uint64_t index) return invertAffix(&prefix); /* convert prefix to suffix */ } -/* creative abuse of tb_hash_func7, which is based on xxhash */ static uint32_t qpp_hash(QppEntry e) { - return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0); + return qemu_xxhash4(e.ino_prefix, e.dev); } static uint32_t qpf_hash(QpfEntry e) { - return qemu_xxhash7(e.ino, e.dev, 0, 0, 0); + return qemu_xxhash4(e.ino, e.dev); } static bool qpd_cmp_func(const void *obj, const void *userp) From 367189efae8b53ec2ade37a1c079fd8f69244b9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 26 May 2023 17:54:01 +0100 Subject: [PATCH 17/25] accel/tcg: include cs_base in our hash calculations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We weren't using cs_base in the hash calculations before. Since the arm front end moved a chunk of flags in a378206a20 (target/arm: Move mode specific TB flags to tb->cs_base) they comprise of an important part of the execution state. Widen the tb_hash_func to include cs_base and expand to qemu_xxhash8() to accommodate it. My initial benchmark shows very little difference in the runtime. Before: armhf ➜ hyperfine -w 2 -m 20 "./arm-softmmu/qemu-system-arm -cpu cortex-a15 -machine type=virt,highmem=off -display none -m 2048 -serial mon:stdio -netdev user,id=unet,hostfwd=tcp::2222-:22 -device virtio-net-pci,netdev=unet -device virtio-scsi-pci -blockdev driver=raw,node-name=hd,discard=unmap,file.driver=host_device,file.filename=/dev/zen-disk/debian-bullseye-armhf -device scsi-hd,drive=hd -smp 4 -kernel /home/alex/lsrc/linux.git/builds/arm/arch/arm/boot/zImage -append 'console=ttyAMA0 root=/dev/sda2 systemd.unit=benchmark.service' -snapshot" Benchmark 1: ./arm-softmmu/qemu-system-arm -cpu cortex-a15 -machine type=virt,highmem=off -display none -m 2048 -serial mon:stdio -netdev user,id=unet,hostfwd=tcp::2222-:22 -device virtio-net-pci,netdev=unet -device virtio-scsi-pci -blockdev driver=raw,node-name=hd,discard=unmap,file.driver=host_device,file.filename=/dev/zen-disk/debian-bullseye-armhf -device scsi-hd,drive=hd -smp 4 -kernel /home/alex/lsrc/linux.git/builds/arm/arch/arm/boot/zImage -append 'console=ttyAMA0 root=/dev/sda2 systemd.unit=benchmark.service' -snapshot Time (mean ± σ): 24.627 s ± 2.708 s [User: 34.309 s, System: 1.797 s] Range (min … max): 22.345 s … 29.864 s 20 runs arm64 ➜ hyperfine -w 2 -n 20 "./qemu-system-aarch64 -cpu max,pauth-impdef=on -machine type=virt,virtualization=on,gic-version=3 -display none -serial mon:stdio -netdev user,id=unet,hostfwd=tcp::2222-:22,hostfwd=tcp::1234-:1234 -device virtio-net-pci,netdev=unet -device virtio-scsi-pci -blockdev driver=raw,node-name=hd,discard=unmap,file.driver=host_device,file.filename=/dev/zen-disk/debian-bullseye-arm64 -device scsi-hd,drive=hd -smp 4 -kernel ~/lsrc/linux.git/builds/arm64/arch/arm64/boot/Image.gz -append 'console=ttyAMA0 root=/dev/sda2 systemd.unit=benchmark-pigz.service' -snapshot" Benchmark 1: 20 Time (mean ± σ): 62.559 s ± 2.917 s [User: 189.115 s, System: 4.089 s] Range (min … max): 59.997 s … 70.153 s 10 runs After: armhf Benchmark 1: ./arm-softmmu/qemu-system-arm -cpu cortex-a15 -machine type=virt,highmem=off -display none -m 2048 -serial mon:stdio -netdev user,id=unet,hostfwd=tcp::2222-:22 -device virtio-net-pci,netdev=unet -device virtio-scsi-pci -blockdev driver=raw,node-name=hd,discard=unmap,file.driver=host_device,file.filename=/dev/zen-disk/debian-bullseye-armhf -device scsi-hd,drive=hd -smp 4 -kernel /home/alex/lsrc/linux.git/builds/arm/arch/arm/boot/zImage -append 'console=ttyAMA0 root=/dev/sda2 systemd.unit=benchmark.service' -snapshot Time (mean ± σ): 24.223 s ± 2.151 s [User: 34.284 s, System: 1.906 s] Range (min … max): 22.000 s … 28.476 s 20 runs arm64 hyperfine -w 2 -n 20 "./qemu-system-aarch64 -cpu max,pauth-impdef=on -machine type=virt,virtualization=on,gic-version=3 -display none -serial mon:stdio -netdev user,id=unet,hostfwd=tcp::2222-:22,hostfwd=tcp::1234-:1234 -device virtio-net-pci,netdev=unet -device virtio-scsi-pci -blockdev driver=raw,node-name=hd,discard=unmap,file.driver=host_device,file.filename=/dev/zen-disk/debian-bullseye-arm64 -device scsi-hd,drive=hd -smp 4 -kernel ~/lsrc/linux.git/builds/arm64/arch/arm64/boot/Image.gz -append 'console=ttyAMA0 root=/dev/sda2 systemd.unit=benchmark-pigz.service' -snapshot" Benchmark 1: 20 Time (mean ± σ): 62.769 s ± 1.978 s [User: 188.431 s, System: 5.269 s] Range (min … max): 60.285 s … 66.868 s 10 runs Signed-off-by: Alex Bennée Reviewed-by: Richard Henderson Message-id: 20230526165401.574474-12-alex.bennee@linaro.org Message-Id: <20230524133952.3971948-11-alex.bennee@linaro.org> Signed-off-by: Stefan Hajnoczi --- accel/tcg/cpu-exec.c | 2 +- accel/tcg/tb-hash.h | 4 ++-- accel/tcg/tb-maint.c | 4 ++-- include/qemu/xxhash.h | 23 +++++++++++++++++------ util/qsp.c | 2 +- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 4a1dce98ff..60ca9e229e 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -233,7 +233,7 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, } desc.page_addr0 = phys_pc; h = tb_hash_func(phys_pc, (cflags & CF_PCREL ? 0 : pc), - flags, cflags); + flags, cs_base, cflags); return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp); } diff --git a/accel/tcg/tb-hash.h b/accel/tcg/tb-hash.h index 1d19c69caa..2ba2193731 100644 --- a/accel/tcg/tb-hash.h +++ b/accel/tcg/tb-hash.h @@ -62,9 +62,9 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) static inline uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, - uint32_t flags, uint32_t cf_mask) + uint32_t flags, uint64_t flags2, uint32_t cf_mask) { - return qemu_xxhash6(phys_pc, pc, flags, cf_mask); + return qemu_xxhash8(phys_pc, pc, flags2, flags, cf_mask); } #endif diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c index bc1961ea55..892eecda2d 100644 --- a/accel/tcg/tb-maint.c +++ b/accel/tcg/tb-maint.c @@ -887,7 +887,7 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list) /* remove the TB from the hash list */ phys_pc = tb_page_addr0(tb); h = tb_hash_func(phys_pc, (orig_cflags & CF_PCREL ? 0 : tb->pc), - tb->flags, orig_cflags); + tb->flags, tb->cs_base, orig_cflags); if (!qht_remove(&tb_ctx.htable, tb, h)) { return; } @@ -968,7 +968,7 @@ TranslationBlock *tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, /* add in the hash table */ h = tb_hash_func(phys_pc, (tb->cflags & CF_PCREL ? 0 : tb->pc), - tb->flags, tb->cflags); + tb->flags, tb->cs_base, tb->cflags); qht_insert(&tb_ctx.htable, tb, h, &existing_tb); /* remove TB from the page(s) if we couldn't insert it */ diff --git a/include/qemu/xxhash.h b/include/qemu/xxhash.h index c2dcccadbf..0259bbef18 100644 --- a/include/qemu/xxhash.h +++ b/include/qemu/xxhash.h @@ -48,8 +48,8 @@ * xxhash32, customized for input variables that are not guaranteed to be * contiguous in memory. */ -static inline uint32_t -qemu_xxhash7(uint64_t ab, uint64_t cd, uint32_t e, uint32_t f, uint32_t g) +static inline uint32_t qemu_xxhash8(uint64_t ab, uint64_t cd, uint64_t ef, + uint32_t g, uint32_t h) { uint32_t v1 = QEMU_XXHASH_SEED + PRIME32_1 + PRIME32_2; uint32_t v2 = QEMU_XXHASH_SEED + PRIME32_2; @@ -59,6 +59,8 @@ qemu_xxhash7(uint64_t ab, uint64_t cd, uint32_t e, uint32_t f, uint32_t g) uint32_t b = ab >> 32; uint32_t c = cd; uint32_t d = cd >> 32; + uint32_t e = ef; + uint32_t f = ef >> 32; uint32_t h32; v1 += a * PRIME32_2; @@ -89,6 +91,9 @@ qemu_xxhash7(uint64_t ab, uint64_t cd, uint32_t e, uint32_t f, uint32_t g) h32 += g * PRIME32_3; h32 = rol32(h32, 17) * PRIME32_4; + h32 += h * PRIME32_3; + h32 = rol32(h32, 17) * PRIME32_4; + h32 ^= h32 >> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; @@ -100,23 +105,29 @@ qemu_xxhash7(uint64_t ab, uint64_t cd, uint32_t e, uint32_t f, uint32_t g) static inline uint32_t qemu_xxhash2(uint64_t ab) { - return qemu_xxhash7(ab, 0, 0, 0, 0); + return qemu_xxhash8(ab, 0, 0, 0, 0); } static inline uint32_t qemu_xxhash4(uint64_t ab, uint64_t cd) { - return qemu_xxhash7(ab, cd, 0, 0, 0); + return qemu_xxhash8(ab, cd, 0, 0, 0); } static inline uint32_t qemu_xxhash5(uint64_t ab, uint64_t cd, uint32_t e) { - return qemu_xxhash7(ab, cd, e, 0, 0); + return qemu_xxhash8(ab, cd, 0, e, 0); } static inline uint32_t qemu_xxhash6(uint64_t ab, uint64_t cd, uint32_t e, uint32_t f) { - return qemu_xxhash7(ab, cd, e, f, 0); + return qemu_xxhash8(ab, cd, 0, e, f); +} + +static inline uint32_t qemu_xxhash7(uint64_t ab, uint64_t cd, uint64_t ef, + uint32_t g) +{ + return qemu_xxhash8(ab, cd, ef, g, 0); } /* diff --git a/util/qsp.c b/util/qsp.c index 8562b14a87..2fe3764906 100644 --- a/util/qsp.c +++ b/util/qsp.c @@ -144,7 +144,7 @@ uint32_t do_qsp_callsite_hash(const QSPCallSite *callsite, uint64_t ab) uint32_t e = callsite->line; uint32_t f = callsite->type; - return qemu_xxhash6(ab, cd, e, f); + return qemu_xxhash8(ab, cd, 0, e, f); } static inline From cad2ccc395c7113fb30bc9390774b67b34f06c68 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 30 May 2023 09:19:40 +0200 Subject: [PATCH 18/25] block/blkio: use qemu_open() to support fd passing for virtio-blk Some virtio-blk drivers (e.g. virtio-blk-vhost-vdpa) supports the fd passing. Let's expose this to the user, so the management layer can pass the file descriptor of an already opened path. If the libblkio virtio-blk driver supports fd passing, let's always use qemu_open() to open the `path`, so we can handle fd passing from the management layer through the "/dev/fdset/N" special path. Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Message-id: 20230530071941.8954-2-sgarzare@redhat.com Signed-off-by: Stefan Hajnoczi --- block/blkio.c | 53 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/block/blkio.c b/block/blkio.c index 11be8787a3..527323d625 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -673,25 +673,60 @@ static int blkio_virtio_blk_common_open(BlockDriverState *bs, { const char *path = qdict_get_try_str(options, "path"); BDRVBlkioState *s = bs->opaque; - int ret; + bool fd_supported = false; + int fd, ret; if (!path) { error_setg(errp, "missing 'path' option"); return -EINVAL; } - ret = blkio_set_str(s->blkio, "path", path); - qdict_del(options, "path"); - if (ret < 0) { - error_setg_errno(errp, -ret, "failed to set path: %s", - blkio_get_error_msg()); - return ret; - } - if (!(flags & BDRV_O_NOCACHE)) { error_setg(errp, "cache.direct=off is not supported"); return -EINVAL; } + + if (blkio_get_int(s->blkio, "fd", &fd) == 0) { + fd_supported = true; + } + + /* + * If the libblkio driver supports fd passing, let's always use qemu_open() + * to open the `path`, so we can handle fd passing from the management + * layer through the "/dev/fdset/N" special path. + */ + if (fd_supported) { + int open_flags; + + if (flags & BDRV_O_RDWR) { + open_flags = O_RDWR; + } else { + open_flags = O_RDONLY; + } + + fd = qemu_open(path, open_flags, errp); + if (fd < 0) { + return -EINVAL; + } + + ret = blkio_set_int(s->blkio, "fd", fd); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to set fd: %s", + blkio_get_error_msg()); + qemu_close(fd); + return ret; + } + } else { + ret = blkio_set_str(s->blkio, "path", path); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to set path: %s", + blkio_get_error_msg()); + return ret; + } + } + + qdict_del(options, "path"); + return 0; } From 98b126f5e3228a346c774e569e26689943b401dd Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 30 May 2023 09:19:41 +0200 Subject: [PATCH 19/25] qapi: add '@fdset' feature for BlockdevOptionsVirtioBlkVhostVdpa The virtio-blk-vhost-vdpa driver in libblkio 1.3.0 supports the fd passing through the new 'fd' property. Since now we are using qemu_open() on '@path' if the virtio-blk driver supports the fd passing, let's announce it. In this way, the management layer can pass the file descriptor of an already opened vhost-vdpa character device. This is useful especially when the device can only be accessed with certain privileges. Add the '@fdset' feature only when the virtio-blk-vhost-vdpa driver in libblkio supports it. Suggested-by: Markus Armbruster Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Message-id: 20230530071941.8954-3-sgarzare@redhat.com Signed-off-by: Stefan Hajnoczi --- meson.build | 4 ++++ qapi/block-core.json | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/meson.build b/meson.build index bc76ea96bf..a61d3e9b06 100644 --- a/meson.build +++ b/meson.build @@ -2106,6 +2106,10 @@ config_host_data.set('CONFIG_LZO', lzo.found()) config_host_data.set('CONFIG_MPATH', mpathpersist.found()) config_host_data.set('CONFIG_MPATH_NEW_API', mpathpersist_new_api) config_host_data.set('CONFIG_BLKIO', blkio.found()) +if blkio.found() + config_host_data.set('CONFIG_BLKIO_VHOST_VDPA_FD', + blkio.version().version_compare('>=1.3.0')) +endif config_host_data.set('CONFIG_CURL', curl.found()) config_host_data.set('CONFIG_CURSES', curses.found()) config_host_data.set('CONFIG_GBM', gbm.found()) diff --git a/qapi/block-core.json b/qapi/block-core.json index 98d9116dae..4bf89171c6 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3955,10 +3955,16 @@ # # @path: path to the vhost-vdpa character device. # +# Features: +# @fdset: Member @path supports the special "/dev/fdset/N" path +# (since 8.1) +# # Since: 7.2 ## { 'struct': 'BlockdevOptionsVirtioBlkVhostVdpa', 'data': { 'path': 'str' }, + 'features': [ { 'name' :'fdset', + 'if': 'CONFIG_BLKIO_VHOST_VDPA_FD' } ], 'if': 'CONFIG_BLKIO' } ## From 242b74eb69d0e53b25e294331a192b7a458b8e46 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 17 May 2023 15:37:48 +0300 Subject: [PATCH 20/25] runstate: add runstate_get() It's necessary to restore the state after failed/cancelled migration in further commit. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Juan Quintela Message-Id: <20230517123752.21615-2-vsementsov@yandex-team.ru> Signed-off-by: Juan Quintela --- include/sysemu/runstate.h | 1 + softmmu/runstate.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h index f3ed52548e..85f5d9a419 100644 --- a/include/sysemu/runstate.h +++ b/include/sysemu/runstate.h @@ -6,6 +6,7 @@ bool runstate_check(RunState state); void runstate_set(RunState new_state); +RunState runstate_get(void); bool runstate_is_running(void); bool runstate_needs_reset(void); bool runstate_store(char *str, size_t size); diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 2f2396c819..1e6f0bcecc 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -221,6 +221,11 @@ void runstate_set(RunState new_state) current_run_state = new_state; } +RunState runstate_get(void) +{ + return current_run_state; +} + bool runstate_is_running(void) { return runstate_check(RUN_STATE_RUNNING); From c33f1829f891058442e9670325decff0c8a2e28c Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 17 May 2023 15:37:49 +0300 Subject: [PATCH 21/25] migration: never fail in global_state_store() Actually global_state_store() can never fail. Let's get rid of extra error paths. To make things clear, use new runstate_get() and use same approach for global_state_store() and global_state_store_running(). Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Juan Quintela Message-Id: <20230517123752.21615-3-vsementsov@yandex-team.ru> Signed-off-by: Juan Quintela --- include/migration/global_state.h | 2 +- migration/global_state.c | 23 +++++++++--------- migration/migration.c | 41 +++++++++++++++----------------- migration/savevm.c | 6 +---- 4 files changed, 32 insertions(+), 40 deletions(-) diff --git a/include/migration/global_state.h b/include/migration/global_state.h index 945eb35d5b..d7c2cd3216 100644 --- a/include/migration/global_state.h +++ b/include/migration/global_state.h @@ -16,7 +16,7 @@ #include "qapi/qapi-types-run-state.h" void register_global_state(void); -int global_state_store(void); +void global_state_store(void); void global_state_store_running(void); bool global_state_received(void); RunState global_state_get_runstate(void); diff --git a/migration/global_state.c b/migration/global_state.c index a33947ca32..4e2a9d8ec0 100644 --- a/migration/global_state.c +++ b/migration/global_state.c @@ -29,23 +29,22 @@ typedef struct { static GlobalState global_state; -int global_state_store(void) +static void global_state_do_store(RunState state) { - if (!runstate_store((char *)global_state.runstate, - sizeof(global_state.runstate))) { - error_report("runstate name too big: %s", global_state.runstate); - trace_migrate_state_too_big(); - return -EINVAL; - } - return 0; + const char *state_str = RunState_str(state); + assert(strlen(state_str) < sizeof(global_state.runstate)); + strpadcpy((char *)global_state.runstate, sizeof(global_state.runstate), + state_str, '\0'); +} + +void global_state_store(void) +{ + global_state_do_store(runstate_get()); } void global_state_store_running(void) { - const char *state = RunState_str(RUN_STATE_RUNNING); - assert(strlen(state) < sizeof(global_state.runstate)); - strpadcpy((char *)global_state.runstate, sizeof(global_state.runstate), - state, '\0'); + global_state_do_store(RUN_STATE_RUNNING); } bool global_state_received(void) diff --git a/migration/migration.c b/migration/migration.c index 5de7f734b9..c75d5aa479 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2288,27 +2288,26 @@ static void migration_completion(MigrationState *s) s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); s->vm_was_running = runstate_is_running(); - ret = global_state_store(); + global_state_store(); - if (!ret) { - ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); - trace_migration_completion_vm_stop(ret); - if (ret >= 0) { - ret = migration_maybe_pause(s, ¤t_active_state, - MIGRATION_STATUS_DEVICE); - } - if (ret >= 0) { - /* - * Inactivate disks except in COLO, and track that we - * have done so in order to remember to reactivate - * them if migration fails or is cancelled. - */ - s->block_inactive = !migrate_colo(); - migration_rate_set(RATE_LIMIT_DISABLED); - ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, - s->block_inactive); - } + ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + trace_migration_completion_vm_stop(ret); + if (ret >= 0) { + ret = migration_maybe_pause(s, ¤t_active_state, + MIGRATION_STATUS_DEVICE); } + if (ret >= 0) { + /* + * Inactivate disks except in COLO, and track that we + * have done so in order to remember to reactivate + * them if migration fails or is cancelled. + */ + s->block_inactive = !migrate_colo(); + migration_rate_set(RATE_LIMIT_DISABLED); + ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false, + s->block_inactive); + } + qemu_mutex_unlock_iothread(); if (ret < 0) { @@ -3088,9 +3087,7 @@ static void *bg_migration_thread(void *opaque) qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); s->vm_was_running = runstate_is_running(); - if (global_state_store()) { - goto fail; - } + global_state_store(); /* Forcibly stop VM before saving state of vCPUs and devices */ if (vm_stop_force_state(RUN_STATE_PAUSED)) { goto fail; diff --git a/migration/savevm.c b/migration/savevm.c index 03795ce8dc..bc284087f9 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2919,11 +2919,7 @@ bool save_snapshot(const char *name, bool overwrite, const char *vmstate, saved_vm_running = runstate_is_running(); - ret = global_state_store(); - if (ret) { - error_setg(errp, "Error saving global state"); - return false; - } + global_state_store(); vm_stop(RUN_STATE_SAVE_VM); bdrv_drain_all_begin(); From e76005a081d08d1e42d98811fba983c59b3f736b Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 17 May 2023 15:37:50 +0300 Subject: [PATCH 22/25] runstate: drop unused runstate_store() The function is unused since previous commit. Drop it. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Juan Quintela Message-Id: <20230517123752.21615-4-vsementsov@yandex-team.ru> Signed-off-by: Juan Quintela --- include/sysemu/runstate.h | 1 - softmmu/runstate.c | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h index 85f5d9a419..7beb29c2e2 100644 --- a/include/sysemu/runstate.h +++ b/include/sysemu/runstate.h @@ -9,7 +9,6 @@ void runstate_set(RunState new_state); RunState runstate_get(void); bool runstate_is_running(void); bool runstate_needs_reset(void); -bool runstate_store(char *str, size_t size); typedef void VMChangeStateHandler(void *opaque, bool running, RunState state); diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 1e6f0bcecc..0370230a5e 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -175,18 +175,6 @@ bool runstate_check(RunState state) return current_run_state == state; } -bool runstate_store(char *str, size_t size) -{ - const char *state = RunState_str(current_run_state); - size_t len = strlen(state) + 1; - - if (len > size) { - return false; - } - memcpy(str, state, len); - return true; -} - static void runstate_init(void) { const RunStateTransition *p; From f4584076fc318bb4ac762e3c09ff3544938fed5b Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 17 May 2023 15:37:51 +0300 Subject: [PATCH 23/25] migration: switch from .vm_was_running to .vm_old_state No logic change here, only refactoring. That's a preparation for next commit where we finally restore the stopped vm state on migration failure or cancellation. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Juan Quintela Message-Id: <20230517123752.21615-5-vsementsov@yandex-team.ru> Signed-off-by: Juan Quintela --- migration/migration.c | 11 ++++++----- migration/migration.h | 9 ++++++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index c75d5aa479..033162cda0 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1402,7 +1402,7 @@ void migrate_init(MigrationState *s) s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); s->total_time = 0; - s->vm_was_running = false; + s->vm_old_state = -1; s->iteration_initial_bytes = 0; s->threshold_size = 0; } @@ -2287,7 +2287,8 @@ static void migration_completion(MigrationState *s) qemu_mutex_lock_iothread(); s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); - s->vm_was_running = runstate_is_running(); + + s->vm_old_state = runstate_get(); global_state_store(); ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); @@ -2760,12 +2761,12 @@ static void migration_iteration_finish(MigrationState *s) case MIGRATION_STATUS_COLO: assert(migrate_colo()); migrate_start_colo_process(s); - s->vm_was_running = true; + s->vm_old_state = RUN_STATE_RUNNING; /* Fallthrough */ case MIGRATION_STATUS_FAILED: case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_CANCELLING: - if (s->vm_was_running) { + if (s->vm_old_state == RUN_STATE_RUNNING) { if (!runstate_check(RUN_STATE_SHUTDOWN)) { vm_start(); } @@ -3085,7 +3086,7 @@ static void *bg_migration_thread(void *opaque) * transition in vm_stop_force_state() we need to wakeup it up. */ qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); - s->vm_was_running = runstate_is_running(); + s->vm_old_state = runstate_get(); global_state_store(); /* Forcibly stop VM before saving state of vCPUs and devices */ diff --git a/migration/migration.h b/migration/migration.h index 48a46123a0..30c3e97635 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -25,6 +25,7 @@ #include "net/announce.h" #include "qom/object.h" #include "postcopy-ram.h" +#include "sysemu/runstate.h" struct PostcopyBlocktimeContext; @@ -317,12 +318,14 @@ struct MigrationState { int64_t expected_downtime; bool capabilities[MIGRATION_CAPABILITY__MAX]; int64_t setup_time; + /* - * Whether guest was running when we enter the completion stage. + * State before stopping the vm by vm_stop_force_state(). * If migration is interrupted by any reason, we need to continue - * running the guest on source. + * running the guest on source if it was running or restore its stopped + * state. */ - bool vm_was_running; + RunState vm_old_state; /* Flag set once the migration has been asked to enter postcopy */ bool start_postcopy; From a4c6275aa11f153a6a7e614dd867c62b904f8838 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 17 May 2023 15:37:52 +0300 Subject: [PATCH 24/25] migration: restore vmstate on migration failure 1. Otherwise failed migration just drops guest-panicked state, which is not good for management software. 2. We do keep different paused states like guest-panicked during migration with help of global_state state. 3. We do restore running state on source when migration is cancelled or failed. 4. "postmigrate" state is documented as "guest is paused following a successful 'migrate'", so originally it's only for successful path and we never documented current behavior. Let's restore paused states like guest-panicked in case of cancel or fail too. Allow same transitions like for inmigrate state. This commit changes the behavior that was introduced by commit 42da5550d6 "migration: set state to post-migrate on failure" and provides a bit different fix on related https://bugzilla.redhat.com/show_bug.cgi?id=1355683 Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Juan Quintela Message-Id: <20230517123752.21615-6-vsementsov@yandex-team.ru> Signed-off-by: Juan Quintela --- migration/migration.c | 2 +- softmmu/runstate.c | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 033162cda0..7c3425c6fe 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2772,7 +2772,7 @@ static void migration_iteration_finish(MigrationState *s) } } else { if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { - runstate_set(RUN_STATE_POSTMIGRATE); + runstate_set(s->vm_old_state); } } break; diff --git a/softmmu/runstate.c b/softmmu/runstate.c index 0370230a5e..1957caf73f 100644 --- a/softmmu/runstate.c +++ b/softmmu/runstate.c @@ -121,7 +121,13 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PAUSED }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH }, - { RUN_STATE_FINISH_MIGRATE, RUN_STATE_COLO}, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_COLO }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_INTERNAL_ERROR }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_IO_ERROR }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_SHUTDOWN }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_SUSPENDED }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_WATCHDOG }, + { RUN_STATE_FINISH_MIGRATE, RUN_STATE_GUEST_PANICKED }, { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING }, { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH }, From 3a8b81f2e6393828589699bb0b8ef557b9ae5937 Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Fri, 26 May 2023 13:59:08 +0200 Subject: [PATCH 25/25] migration: stop tracking ram writes when cancelling background migration Currently, it is only done when the iteration finishes successfully. Not cleaning up the userfaultfd write protection can lead to symptoms/issues such as the process hanging in memmove or GDB not being able to attach. Signed-off-by: Fiona Ebner Message-Id: <20230526115908.196171-1-f.ebner@proxmox.com> Signed-off-by: Juan Quintela --- migration/migration.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 7c3425c6fe..dc05c6f6ea 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2400,13 +2400,6 @@ static void bg_migration_completion(MigrationState *s) { int current_active_state = s->state; - /* - * Stop tracking RAM writes - un-protect memory, un-register UFFD - * memory ranges, flush kernel wait queues and wake up threads - * waiting for write fault to be resolved. - */ - ram_write_tracking_stop(); - if (s->state == MIGRATION_STATUS_ACTIVE) { /* * By this moment we have RAM content saved into the migration stream. @@ -2788,6 +2781,13 @@ static void migration_iteration_finish(MigrationState *s) static void bg_migration_iteration_finish(MigrationState *s) { + /* + * Stop tracking RAM writes - un-protect memory, un-register UFFD + * memory ranges, flush kernel wait queues and wake up threads + * waiting for write fault to be resolved. + */ + ram_write_tracking_stop(); + qemu_mutex_lock_iothread(); switch (s->state) { case MIGRATION_STATUS_COMPLETED: