From 4526687bf12624d957088cd40ee02540a5404546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 17 Feb 2025 18:34:55 +0100 Subject: [PATCH 01/42] vfio: Add property documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Investigate the git history to uncover when and why the VFIO properties were introduced and update the models. This is mostly targeting vfio-pci device, since vfio-platform, vfio-ap and vfio-ccw devices are simpler. Sort the properties based on the QEMU version in which they were introduced. Cc: Tony Krowiak Cc: Eric Farman Cc: Eric Auger Reviewed-by: Kirti Wankhede Reviewed-by: Anthony Krowiak Reviewed-by: Eric Farman # vfio-ccw Reviewed-by: Alex Williamson Reviewed-by: Eric Auger Link: https://lore.kernel.org/qemu-devel/20250217173455.449983-1-clg@redhat.com Signed-off-by: Cédric Le Goater --- hw/vfio/ap.c | 9 ++++ hw/vfio/ccw.c | 15 ++++++ hw/vfio/pci.c | 125 +++++++++++++++++++++++++++++++++++++++++++++ hw/vfio/platform.c | 24 +++++++++ 4 files changed, 173 insertions(+) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 30b08ad375..c7ab4ff57a 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -257,6 +257,15 @@ static void vfio_ap_class_init(ObjectClass *klass, void *data) dc->hotpluggable = true; device_class_set_legacy_reset(dc, vfio_ap_reset); dc->bus_type = TYPE_AP_BUS; + + object_class_property_set_description(klass, /* 3.1 */ + "sysfsdev", + "Host sysfs path of assigned device"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif } static const TypeInfo vfio_ap_info = { diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 67bc137f9b..ea766ae26c 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -717,6 +717,21 @@ static void vfio_ccw_class_init(ObjectClass *klass, void *data) cdc->handle_halt = vfio_ccw_handle_halt; cdc->handle_clear = vfio_ccw_handle_clear; cdc->handle_store = vfio_ccw_handle_store; + + object_class_property_set_description(klass, /* 2.10 */ + "sysfsdev", + "Host sysfs path of assigned device"); + object_class_property_set_description(klass, /* 3.0 */ + "force-orb-pfch", + "Force unlimited prefetch"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif + object_class_property_set_description(klass, /* 9.2 */ + "loadparm", + "Define which devices that can be used for booting"); } static const TypeInfo vfio_ccw_info = { diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 89d900e9cf..4f92b50b13 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3433,6 +3433,122 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) pdc->exit = vfio_exitfn; pdc->config_read = vfio_pci_read_config; pdc->config_write = vfio_pci_write_config; + + object_class_property_set_description(klass, /* 1.3 */ + "host", + "Host PCI address [domain:] of assigned device"); + object_class_property_set_description(klass, /* 1.3 */ + "x-intx-mmap-timeout-ms", + "When EOI is not provided by KVM/QEMU, wait time " + "(milliseconds) to re-enable device direct access " + "after INTx (DEBUG)"); + object_class_property_set_description(klass, /* 1.5 */ + "x-vga", + "Expose VGA address spaces for device"); + object_class_property_set_description(klass, /* 2.3 */ + "x-req", + "Disable device request notification support (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 and 2.5 */ + "x-no-mmap", + "Disable MMAP for device. Allows to trace MMIO " + "accesses (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-intx", + "Disable direct VFIO->KVM INTx injection. Allows to " + "trace INTx interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-msi", + "Disable direct VFIO->KVM MSI injection. Allows to " + "trace MSI interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-no-kvm-msix", + "Disable direct VFIO->KVM MSIx injection. Allows to " + "trace MSIx interrupts (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-vendor-id", + "Override PCI Vendor ID with provided value (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-device-id", + "Override PCI device ID with provided value (DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-sub-vendor-id", + "Override PCI Subsystem Vendor ID with provided value " + "(DEBUG)"); + object_class_property_set_description(klass, /* 2.5 */ + "x-pci-sub-device-id", + "Override PCI Subsystem Device ID with provided value " + "(DEBUG)"); + object_class_property_set_description(klass, /* 2.6 */ + "sysfsdev", + "Host sysfs path of assigned device"); + object_class_property_set_description(klass, /* 2.7 */ + "x-igd-opregion", + "Expose host IGD OpRegion to guest"); + object_class_property_set_description(klass, /* 2.7 (See c4c45e943e51) */ + "x-igd-gms", + "Override IGD data stolen memory size (32MiB units)"); + object_class_property_set_description(klass, /* 2.11 */ + "x-nv-gpudirect-clique", + "Add NVIDIA GPUDirect capability indicating P2P DMA " + "clique for device [0-15]"); + object_class_property_set_description(klass, /* 2.12 */ + "x-no-geforce-quirks", + "Disable GeForce quirks (for NVIDIA Quadro/GRID/Tesla). " + "Improves performance"); + object_class_property_set_description(klass, /* 2.12 */ + "display", + "Enable display support for device, ex. vGPU"); + object_class_property_set_description(klass, /* 2.12 */ + "x-msix-relocation", + "Specify MSI-X MMIO relocation to the end of specified " + "existing BAR or new BAR to avoid virtualization overhead " + "due to adjacent device registers"); + object_class_property_set_description(klass, /* 3.0 */ + "x-no-kvm-ioeventfd", + "Disable registration of ioeventfds with KVM (DEBUG)"); + object_class_property_set_description(klass, /* 3.0 */ + "x-no-vfio-ioeventfd", + "Disable linking of KVM ioeventfds to VFIO ioeventfds " + "(DEBUG)"); + object_class_property_set_description(klass, /* 3.1 */ + "x-balloon-allowed", + "Override allowing ballooning with device (DEBUG, DANGER)"); + object_class_property_set_description(klass, /* 3.2 */ + "xres", + "Set X display resolution the vGPU should use"); + object_class_property_set_description(klass, /* 3.2 */ + "yres", + "Set Y display resolution the vGPU should use"); + object_class_property_set_description(klass, /* 5.2 */ + "x-pre-copy-dirty-page-tracking", + "Disable dirty pages tracking during iterative phase " + "(DEBUG)"); + object_class_property_set_description(klass, /* 5.2, 8.0 non-experimetal */ + "enable-migration", + "Enale device migration. Also requires a host VFIO PCI " + "variant or mdev driver with migration support enabled"); + object_class_property_set_description(klass, /* 8.1 */ + "vf-token", + "Specify UUID VF token. Required for VF when PF is owned " + "by another VFIO driver"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif + object_class_property_set_description(klass, /* 9.1 */ + "x-device-dirty-page-tracking", + "Disable device dirty page tracking and use " + "container-based dirty page tracking (DEBUG)"); + object_class_property_set_description(klass, /* 9.1 */ + "migration-events", + "Emit VFIO migration QAPI event when a VFIO device " + "changes its migration state. For management applications"); + object_class_property_set_description(klass, /* 9.1 */ + "skip-vsc-check", + "Skip config space check for Vendor Specific Capability. " + "Setting to false will enforce strict checking of VSC content " + "(DEBUG)"); } static const TypeInfo vfio_pci_dev_info = { @@ -3461,6 +3577,15 @@ static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data) device_class_set_props(dc, vfio_pci_dev_nohotplug_properties); dc->hotpluggable = false; + + object_class_property_set_description(klass, /* 3.1 */ + "ramfb", + "Enable ramfb to provide pre-boot graphics for devices " + "enabling display option"); + object_class_property_set_description(klass, /* 8.2 */ + "x-ramfb-migrate", + "Override default migration support for ramfb support " + "(DEBUG)"); } static const TypeInfo vfio_pci_nohotplug_dev_info = { diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index f491f4dc95..d9faaa7395 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -672,6 +672,30 @@ static void vfio_platform_class_init(ObjectClass *klass, void *data) dc->desc = "VFIO-based platform device assignment"; sbc->connect_irq_notifier = vfio_start_irqfd_injection; set_bit(DEVICE_CATEGORY_MISC, dc->categories); + + object_class_property_set_description(klass, /* 2.4 */ + "host", + "Host device name of assigned device"); + object_class_property_set_description(klass, /* 2.4 and 2.5 */ + "x-no-mmap", + "Disable MMAP for device. Allows to trace MMIO " + "accesses (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 */ + "mmap-timeout-ms", + "When EOI is not provided by KVM/QEMU, wait time " + "(milliseconds) to re-enable device direct access " + "after level interrupt (DEBUG)"); + object_class_property_set_description(klass, /* 2.4 */ + "x-irqfd", + "Allow disabling irqfd support (DEBUG)"); + object_class_property_set_description(klass, /* 2.6 */ + "sysfsdev", + "Host sysfs path of assigned device"); +#ifdef CONFIG_IOMMUFD + object_class_property_set_description(klass, /* 9.0 */ + "iommufd", + "Set host IOMMUFD backend device"); +#endif } static const TypeInfo vfio_platform_dev_info = { From 3f8f6ef701ac6b3691b6003025005a970b2075de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 14 Feb 2025 17:19:36 +0100 Subject: [PATCH 02/42] vfio/ccw: Replace warn_once_pfch() with warn_report_once() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the common helper warn_report_once() instead of implementing its own. Cc: Eric Farman Reviewed-by: Eric Farman Link: https://lore.kernel.org/qemu-devel/20250214161936.1720039-1-clg@redhat.com Signed-off-by: Cédric Le Goater --- hw/vfio/ccw.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index ea766ae26c..e5e0d9e3e7 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -51,17 +51,8 @@ struct VFIOCCWDevice { EventNotifier crw_notifier; EventNotifier req_notifier; bool force_orb_pfch; - bool warned_orb_pfch; }; -static inline void warn_once_pfch(VFIOCCWDevice *vcdev, SubchDev *sch, - const char *msg) -{ - warn_report_once_cond(&vcdev->warned_orb_pfch, - "vfio-ccw (devno %x.%x.%04x): %s", - sch->cssid, sch->ssid, sch->devno, msg); -} - static void vfio_ccw_compute_needs_reset(VFIODevice *vdev) { vdev->needs_reset = false; @@ -83,7 +74,8 @@ static IOInstEnding vfio_ccw_handle_request(SubchDev *sch) if (!(sch->orb.ctrl0 & ORB_CTRL0_MASK_PFCH) && vcdev->force_orb_pfch) { sch->orb.ctrl0 |= ORB_CTRL0_MASK_PFCH; - warn_once_pfch(vcdev, sch, "PFCH flag forced"); + warn_report_once("vfio-ccw (devno %x.%x.%04x): PFCH flag forced", + sch->cssid, sch->ssid, sch->devno); } QEMU_BUILD_BUG_ON(sizeof(region->orb_area) != sizeof(ORB)); From 9461afd2008b0820fc45a6a7bc675df1b6791e4f Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 25 Feb 2025 14:52:25 -0700 Subject: [PATCH 03/42] hw/pci: Basic support for PCI power management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memory and IO BARs for devices are only accessible in the D0 power state. In other power states the PCI spec defines that the device responds to TLPs and messages with an Unsupported Request response. To approximate this behavior, consider the BARs as unmapped when the device is not in the D0 power state. This makes the BARs inaccessible and has the additional bonus for vfio-pci that we don't attempt to DMA map BARs for devices in a non-D0 power state. To support this, an interface is added for devices to register the PM capability, which allows central tracking to enforce valid transitions and unmap BARs in non-D0 states. NB. We currently have device models (eepro100 and pcie_pci_bridge) that register a PM capability but do not set wmask to enable writes to the power state field. In order to maintain migration compatibility, this new helper does not manage the wmask to enable guest writes to initiate a power state change. The contents and write access of the PM capability are still managed by the caller. Cc: Michael S. Tsirkin Cc: Marcel Apfelbaum Signed-off-by: Alex Williamson Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-2-alex.williamson@redhat.com Signed-off-by: Cédric Le Goater --- hw/pci/pci.c | 93 ++++++++++++++++++++++++++++++++++++- hw/pci/trace-events | 2 + include/hw/pci/pci.h | 3 ++ include/hw/pci/pci_device.h | 3 ++ 4 files changed, 99 insertions(+), 2 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 1d42847ef0..2ef7f61749 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -435,6 +435,84 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg) attrs, NULL); } +/* + * Register and track a PM capability. If wmask is also enabled for the power + * state field of the pmcsr register, guest writes may change the device PM + * state. BAR access is only enabled while the device is in the D0 state. + * Return the capability offset or negative error code. + */ +int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp) +{ + int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, errp); + + if (cap < 0) { + return cap; + } + + d->pm_cap = cap; + d->cap_present |= QEMU_PCI_CAP_PM; + + return cap; +} + +static uint8_t pci_pm_state(PCIDevice *d) +{ + uint16_t pmcsr; + + if (!(d->cap_present & QEMU_PCI_CAP_PM)) { + return 0; + } + + pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL); + + return pmcsr & PCI_PM_CTRL_STATE_MASK; +} + +/* + * Update the PM capability state based on the new value stored in config + * space respective to the old, pre-write state provided. If the new value + * is rejected (unsupported or invalid transition) restore the old value. + * Return the resulting PM state. + */ +static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t old) +{ + uint16_t pmc; + uint8_t new; + + if (!(d->cap_present & QEMU_PCI_CAP_PM) || + !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) { + return old; + } + + new = pci_pm_state(d); + if (new == old) { + return old; + } + + pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC); + + /* + * Transitions to D1 & D2 are only allowed if supported. Devices may + * only transition to higher D-states or to D0. + */ + if ((!(pmc & PCI_PM_CAP_D1) && new == 1) || + (!(pmc & PCI_PM_CAP_D2) && new == 2) || + (old && new && new < old)) { + pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL, + PCI_PM_CTRL_STATE_MASK); + pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL, + old); + trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d), + PCI_SLOT(d->devfn), PCI_FUNC(d->devfn), + old, new); + return old; + } + + trace_pci_pm_transition(d->name, pci_dev_bus_num(d), PCI_SLOT(d->devfn), + PCI_FUNC(d->devfn), old, new); + return new; +} + static void pci_reset_regions(PCIDevice *dev) { int r; @@ -474,6 +552,11 @@ static void pci_do_device_reset(PCIDevice *dev) pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) | pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE)); dev->config[PCI_CACHE_LINE_SIZE] = 0x0; + /* Default PM state is D0 */ + if (dev->cap_present & QEMU_PCI_CAP_PM) { + pci_word_test_and_clear_mask(dev->config + dev->pm_cap + PCI_PM_CTRL, + PCI_PM_CTRL_STATE_MASK); + } pci_reset_regions(dev); pci_update_mappings(dev); @@ -1606,7 +1689,7 @@ static void pci_update_mappings(PCIDevice *d) continue; new_addr = pci_bar_address(d, i, r->type, r->size); - if (!d->enabled) { + if (!d->enabled || pci_pm_state(d)) { new_addr = PCI_BAR_UNMAPPED; } @@ -1672,6 +1755,7 @@ uint32_t pci_default_read_config(PCIDevice *d, void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l) { + uint8_t new_pm_state, old_pm_state = pci_pm_state(d); int i, was_irq_disabled = pci_irq_disabled(d); uint32_t val = val_in; @@ -1684,11 +1768,16 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & wmask); d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear */ } + + new_pm_state = pci_pm_update(d, addr, l, old_pm_state); + if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) || ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) || ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) || - range_covers_byte(addr, l, PCI_COMMAND)) + range_covers_byte(addr, l, PCI_COMMAND) || + !!new_pm_state != !!old_pm_state) { pci_update_mappings(d); + } if (ranges_overlap(addr, l, PCI_COMMAND, 2)) { pci_update_irq_disabled(d, was_irq_disabled); diff --git a/hw/pci/trace-events b/hw/pci/trace-events index e98f575a9d..6a9968962f 100644 --- a/hw/pci/trace-events +++ b/hw/pci/trace-events @@ -1,6 +1,8 @@ # See docs/devel/tracing.rst for syntax documentation. # pci.c +pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x REJECTED PM transition D%d->D%d" +pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d" pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64 pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64 pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const char *parent_path) "IRQ %d @%s -> IRQ %d @%s" diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 4002bbeebd..c220cc8449 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -216,6 +216,8 @@ enum { QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR), #define QEMU_PCIE_EXT_TAG_BITNR 13 QEMU_PCIE_EXT_TAG = (1 << QEMU_PCIE_EXT_TAG_BITNR), +#define QEMU_PCI_CAP_PM_BITNR 14 + QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR), }; typedef struct PCIINTxRoute { @@ -676,5 +678,6 @@ static inline void pci_irq_deassert(PCIDevice *pci_dev) MSIMessage pci_get_msi_message(PCIDevice *dev, int vector); void pci_set_enabled(PCIDevice *pci_dev, bool state); void pci_set_power(PCIDevice *pci_dev, bool state); +int pci_pm_init(PCIDevice *pci_dev, uint8_t offset, Error **errp); #endif diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h index add208edfa..345b12eaac 100644 --- a/include/hw/pci/pci_device.h +++ b/include/hw/pci/pci_device.h @@ -105,6 +105,9 @@ struct PCIDevice { /* Capability bits */ uint32_t cap_present; + /* Offset of PM capability in config space */ + uint8_t pm_cap; + /* Offset of MSI-X capability in config space */ uint8_t msix_cap; From 0681ec253141d838210b3c5e6bc0d2d71f2e111e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 25 Feb 2025 14:52:26 -0700 Subject: [PATCH 04/42] pci: Use PCI PM capability initializer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch callers directly initializing the PCI PM capability with pci_add_capability() to use pci_pm_init(). Cc: Dmitry Fleytman Cc: Akihiko Odaki Cc: Jason Wang Cc: Stefan Weil Cc: Sriram Yagnaraman Cc: Keith Busch Cc: Klaus Jensen Cc: Jesper Devantier Cc: Michael S. Tsirkin Cc: Marcel Apfelbaum Cc: Cédric Le Goater Signed-off-by: Alex Williamson Reviewed-by: Eric Auger Reviewed-by: Akihiko Odaki Reviewed-by: Michael S. Tsirkin Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-3-alex.williamson@redhat.com Signed-off-by: Cédric Le Goater --- hw/net/e1000e.c | 3 +-- hw/net/eepro100.c | 4 +--- hw/net/igb.c | 3 +-- hw/nvme/ctrl.c | 3 +-- hw/pci-bridge/pcie_pci_bridge.c | 2 +- hw/vfio/pci.c | 7 ++++++- hw/virtio/virtio-pci.c | 3 +-- 7 files changed, 12 insertions(+), 13 deletions(-) diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c index f637853073..b72cbab7e8 100644 --- a/hw/net/e1000e.c +++ b/hw/net/e1000e.c @@ -372,8 +372,7 @@ static int e1000e_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc) { Error *local_err = NULL; - int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, - PCI_PM_SIZEOF, &local_err); + int ret = pci_pm_init(pdev, offset, &local_err); if (local_err) { error_report_err(local_err); diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c index 6d853229ae..29a39865a6 100644 --- a/hw/net/eepro100.c +++ b/hw/net/eepro100.c @@ -551,9 +551,7 @@ static void e100_pci_reset(EEPRO100State *s, Error **errp) if (info->power_management) { /* Power Management Capabilities */ int cfg_offset = 0xdc; - int r = pci_add_capability(&s->dev, PCI_CAP_ID_PM, - cfg_offset, PCI_PM_SIZEOF, - errp); + int r = pci_pm_init(&s->dev, cfg_offset, errp); if (r < 0) { return; } diff --git a/hw/net/igb.c b/hw/net/igb.c index c965fc2fb6..e318df40e0 100644 --- a/hw/net/igb.c +++ b/hw/net/igb.c @@ -356,8 +356,7 @@ static int igb_add_pm_capability(PCIDevice *pdev, uint8_t offset, uint16_t pmc) { Error *local_err = NULL; - int ret = pci_add_capability(pdev, PCI_CAP_ID_PM, offset, - PCI_PM_SIZEOF, &local_err); + int ret = pci_pm_init(pdev, offset, &local_err); if (local_err) { error_report_err(local_err); diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index e62c6a3588..518d02dc66 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8600,8 +8600,7 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset) Error *err = NULL; int ret; - ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset, - PCI_PM_SIZEOF, &err); + ret = pci_pm_init(pci_dev, offset, &err); if (err) { error_report_err(err); return ret; diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c index fd4514a595..9fa656b43b 100644 --- a/hw/pci-bridge/pcie_pci_bridge.c +++ b/hw/pci-bridge/pcie_pci_bridge.c @@ -52,7 +52,7 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp) goto cap_error; } - pos = pci_add_capability(d, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF, errp); + pos = pci_pm_init(d, 0, errp); if (pos < 0) { goto pm_error; } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 4f92b50b13..d33b795af0 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2216,7 +2216,12 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) case PCI_CAP_ID_PM: vfio_check_pm_reset(vdev, pos); vdev->pm_cap = pos; - ret = pci_add_capability(pdev, cap_id, pos, size, errp) >= 0; + ret = pci_pm_init(pdev, pos, errp) >= 0; + /* + * PCI-core config space emulation needs write access to the power + * state enabled for tracking BAR mapping relative to PM state. + */ + pci_set_word(pdev->wmask + pos + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK); break; case PCI_CAP_ID_AF: vfio_check_af_flr(vdev, pos); diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index c773a9130c..afe8b5551c 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -2204,8 +2204,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) pos = pcie_endpoint_cap_init(pci_dev, 0); assert(pos > 0); - pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0, - PCI_PM_SIZEOF, errp); + pos = pci_pm_init(pci_dev, 0, errp); if (pos < 0) { return; } From 05c6a8eff6298675080aa2692ee05a310b3483b4 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 25 Feb 2025 14:52:27 -0700 Subject: [PATCH 05/42] vfio/pci: Delete local pm_cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is now redundant to PCIDevice.pm_cap. Cc: Cédric Le Goater Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger Signed-off-by: Alex Williamson Reviewed-by: Michael S. Tsirkin Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-4-alex.williamson@redhat.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 9 ++++----- hw/vfio/pci.h | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index d33b795af0..a8db19d8d2 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2215,7 +2215,6 @@ static bool vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) break; case PCI_CAP_ID_PM: vfio_check_pm_reset(vdev, pos); - vdev->pm_cap = pos; ret = pci_pm_init(pdev, pos, errp) >= 0; /* * PCI-core config space emulation needs write access to the power @@ -2412,17 +2411,17 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_disable_interrupts(vdev); /* Make sure the device is in D0 */ - if (vdev->pm_cap) { + if (pdev->pm_cap) { uint16_t pmcsr; uint8_t state; - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (state) { pmcsr &= ~PCI_PM_CTRL_STATE_MASK; - vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); + vfio_pci_write_config(pdev, pdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); /* vfio handles the necessary delay here */ - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + pmcsr = vfio_pci_read_config(pdev, pdev->pm_cap + PCI_PM_CTRL, 2); state = pmcsr & PCI_PM_CTRL_STATE_MASK; if (state) { error_report("vfio: Unable to power on device, stuck in D%d", diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 43c166680a..d638c781f6 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -160,7 +160,6 @@ struct VFIOPCIDevice { int32_t bootindex; uint32_t igd_gms; OffAutoPCIBAR msix_relo; - uint8_t pm_cap; uint8_t nv_gpudirect_clique; bool pci_aer; bool req_enabled; From 8b8d08cf293b930d0f55b2d5385d8dd27e0c6b41 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 25 Feb 2025 14:52:28 -0700 Subject: [PATCH 06/42] pcie, virtio: Remove redundant pm_cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pm_cap on the PCIExpressDevice object can be distilled down to the new instance on the PCIDevice object. Cc: Michael S. Tsirkin Cc: Marcel Apfelbaum Reviewed-by: Michael S. Tsirkin Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger Signed-off-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-5-alex.williamson@redhat.com Signed-off-by: Cédric Le Goater --- hw/pci-bridge/pcie_pci_bridge.c | 1 - hw/virtio/virtio-pci.c | 8 +++----- include/hw/pci/pcie.h | 2 -- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/hw/pci-bridge/pcie_pci_bridge.c b/hw/pci-bridge/pcie_pci_bridge.c index 9fa656b43b..2429503cfb 100644 --- a/hw/pci-bridge/pcie_pci_bridge.c +++ b/hw/pci-bridge/pcie_pci_bridge.c @@ -56,7 +56,6 @@ static void pcie_pci_bridge_realize(PCIDevice *d, Error **errp) if (pos < 0) { goto pm_error; } - d->exp.pm_cap = pos; pci_set_word(d->config + pos + PCI_PM_PMC, 0x3); pcie_cap_arifwd_init(d); diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index afe8b5551c..3ca3f849d3 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -2209,8 +2209,6 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) return; } - pci_dev->exp.pm_cap = pos; - /* * Indicates that this function complies with revision 1.2 of the * PCI Power Management Interface Specification. @@ -2309,11 +2307,11 @@ static bool virtio_pci_no_soft_reset(PCIDevice *dev) { uint16_t pmcsr; - if (!pci_is_express(dev) || !dev->exp.pm_cap) { + if (!pci_is_express(dev) || !(dev->cap_present & QEMU_PCI_CAP_PM)) { return false; } - pmcsr = pci_get_word(dev->config + dev->exp.pm_cap + PCI_PM_CTRL); + pmcsr = pci_get_word(dev->config + dev->pm_cap + PCI_PM_CTRL); /* * When No_Soft_Reset bit is set and the device @@ -2342,7 +2340,7 @@ static void virtio_pci_bus_reset_hold(Object *obj, ResetType type) if (proxy->flags & VIRTIO_PCI_FLAG_INIT_PM) { pci_word_test_and_clear_mask( - dev->config + dev->exp.pm_cap + PCI_PM_CTRL, + dev->config + dev->pm_cap + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK); } } diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h index b8d59732bc..70a5de09de 100644 --- a/include/hw/pci/pcie.h +++ b/include/hw/pci/pcie.h @@ -58,8 +58,6 @@ typedef enum { struct PCIExpressDevice { /* Offset of express capability in config space */ uint8_t exp_cap; - /* Offset of Power Management capability in config space */ - uint8_t pm_cap; /* SLOT */ bool hpev_notified; /* Logical AND of conditions for hot plug event. From 518a69a598916749338de3852d41d961d4503115 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 25 Feb 2025 14:52:29 -0700 Subject: [PATCH 07/42] hw/vfio/pci: Re-order pre-reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want the device in the D0 power state going into reset, but the config write can enable the BARs in the address space, which are then removed from the address space once we clear the memory enable bit in the command register. Re-order to clear the command bit first, so the power state change doesn't enable the BARs. Cc: Cédric Le Goater Reviewed-by: Zhenzhong Duan Reviewed-by: Eric Auger Signed-off-by: Alex Williamson Reviewed-by: Michael S. Tsirkin Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-6-alex.williamson@redhat.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index a8db19d8d2..c1cee280ae 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2410,6 +2410,15 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_disable_interrupts(vdev); + /* + * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. + * Also put INTx Disable in known state. + */ + cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); + cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | + PCI_COMMAND_INTX_DISABLE); + vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); + /* Make sure the device is in D0 */ if (pdev->pm_cap) { uint16_t pmcsr; @@ -2429,15 +2438,6 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) } } } - - /* - * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master. - * Also put INTx Disable in known state. - */ - cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); - cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | - PCI_COMMAND_INTX_DISABLE); - vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); } void vfio_pci_post_reset(VFIOPCIDevice *vdev) From 515d80d66527b105493fad2df313693a72bf7560 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 28 Feb 2025 00:27:41 +0800 Subject: [PATCH 08/42] MAINTAINERS: Add myself as vfio-igd maintainer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As suggested by Cédric, I'm glad to be a maintainer of vfio-igd. Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250227162741.9860-1-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- MAINTAINERS | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 692628cd78..1a920e7dc4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2186,10 +2186,17 @@ M: Cédric Le Goater S: Supported F: hw/vfio/* F: include/hw/vfio/ -F: docs/igd-assign.txt F: docs/devel/migration/vfio.rst F: qapi/vfio.json +vfio-igd +M: Alex Williamson +M: Cédric Le Goater +M: Tomita Moeko +S: Supported +F: hw/vfio/igd.c +F: docs/igd-assign.txt + vfio-ccw M: Eric Farman M: Matthew Rosato From 8d8a30d1ac7cc9b35833106d749e1b3e8675bc53 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Wed, 5 Mar 2025 13:42:25 +0100 Subject: [PATCH 09/42] vfio-platform: Deprecate all forms of vfio-platform devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As an outcome of KVM forum 2024 "vfio-platform: live and let die?" talk, let's deprecate vfio-platform devices. Signed-off-by: Eric Auger Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250305124225.952791-1-eric.auger@redhat.com [ clg: Fixed spelling in vfio-amd-xgbe section ] Signed-off-by: Cédric Le Goater --- docs/about/deprecated.rst | 25 +++++++++++++++++++++++++ hw/vfio/amd-xgbe.c | 2 ++ hw/vfio/calxeda-xgmac.c | 2 ++ hw/vfio/platform.c | 1 + 4 files changed, 30 insertions(+) diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index abadf8de27..589951b136 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -434,6 +434,31 @@ Stream ``reconnect`` (since 9.2) The ``reconnect`` option only allows specifiying second granularity timeouts, which is not enough for all types of use cases, use ``reconnect-ms`` instead. +VFIO device options +''''''''''''''''''' + +``-device vfio-calxeda-xgmac`` (since 10.0) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The vfio-calxeda-xgmac device allows to assign a host Calxeda Highbank +10Gb XGMAC Ethernet controller device ("calxeda,hb-xgmac" compatibility +string) to a guest. Calxeda HW has been ewasted now and there is no point +keeping that device. + +``-device vfio-amd-xgbe`` (since 10.0) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The vfio-amd-xgbe device allows to assign a host AMD 10GbE controller +to a guest ("amd,xgbe-seattle-v1a" compatibility string). AMD "Seattle" +is not supported anymore and there is no point keeping that device. + +``-device vfio-platform`` (since 10.0) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The vfio-platform device allows to assign a host platform device +to a guest in a generic manner. Integrating a new device into +the vfio-platform infrastructure requires some adaptation at +both kernel and qemu level. No such attempt has been done for years +and the conclusion is that vfio-platform has not got any traction. +PCIe passthrough shall be the mainline solution. + CPU device properties ''''''''''''''''''''' diff --git a/hw/vfio/amd-xgbe.c b/hw/vfio/amd-xgbe.c index aaa96903db..5927503b5c 100644 --- a/hw/vfio/amd-xgbe.c +++ b/hw/vfio/amd-xgbe.c @@ -15,12 +15,14 @@ #include "hw/vfio/vfio-amd-xgbe.h" #include "migration/vmstate.h" #include "qemu/module.h" +#include "qemu/error-report.h" static void amd_xgbe_realize(DeviceState *dev, Error **errp) { VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); VFIOAmdXgbeDeviceClass *k = VFIO_AMD_XGBE_DEVICE_GET_CLASS(dev); + warn_report("-device vfio-amd-xgbe is deprecated"); vdev->compat = g_strdup("amd,xgbe-seattle-v1a"); vdev->num_compat = 1; diff --git a/hw/vfio/calxeda-xgmac.c b/hw/vfio/calxeda-xgmac.c index b016d42b49..a5ef262def 100644 --- a/hw/vfio/calxeda-xgmac.c +++ b/hw/vfio/calxeda-xgmac.c @@ -15,12 +15,14 @@ #include "hw/vfio/vfio-calxeda-xgmac.h" #include "migration/vmstate.h" #include "qemu/module.h" +#include "qemu/error-report.h" static void calxeda_xgmac_realize(DeviceState *dev, Error **errp) { VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev); VFIOCalxedaXgmacDeviceClass *k = VFIO_CALXEDA_XGMAC_DEVICE_GET_CLASS(dev); + warn_report("-device vfio-calxeda-xgmac is deprecated"); vdev->compat = g_strdup("calxeda,hb-xgmac"); vdev->num_compat = 1; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index d9faaa7395..67bc57409c 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -575,6 +575,7 @@ static void vfio_platform_realize(DeviceState *dev, Error **errp) VFIODevice *vbasedev = &vdev->vbasedev; int i; + warn_report("-device vfio-platform is deprecated"); qemu_mutex_init(&vdev->intp_mutex); trace_vfio_platform_realize(vbasedev->sysfsdev ? From d3237d0d85e9fba79b37c776b0ddda611c338f7d Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:28 +0100 Subject: [PATCH 10/42] migration: Clarify that {load, save}_cleanup handlers can run without setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's possible for {load,save}_cleanup SaveVMHandlers to get called without the corresponding {load,save}_setup handler being called first. One such example is if {load,save}_setup handler of a proceeding device returns error. In this case the migration core cleanup code will call all corresponding cleanup handlers, even for these devices which haven't had its setup handler called. Since this behavior can generate some surprises let's clearly document it in these SaveVMHandlers description. Reviewed-by: Fabiano Rosas Reviewed-by: Cédric Le Goater Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/991636623fb780350f493b5f045cb17e13ce4c0f.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/migration/register.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/migration/register.h b/include/migration/register.h index f60e797894..0b02927383 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -69,7 +69,9 @@ typedef struct SaveVMHandlers { /** * @save_cleanup * - * Uninitializes the data structures on the source + * Uninitializes the data structures on the source. + * Note that this handler can be called even if save_setup + * wasn't called earlier. * * @opaque: data pointer passed to register_savevm_live() */ @@ -244,6 +246,8 @@ typedef struct SaveVMHandlers { * @load_cleanup * * Uninitializes the data structures on the destination. + * Note that this handler can be called even if load_setup + * wasn't called earlier. * * @opaque: data pointer passed to register_savevm_live() * From 03c6468a136b3d79e7c6bb269ac7924fa8fef634 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:29 +0100 Subject: [PATCH 11/42] thread-pool: Remove thread_pool_submit() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function name conflicts with one used by a future generic thread pool function and it was only used by one test anyway. Update the trace event name in thread_pool_submit_aio() accordingly. Acked-by: Fabiano Rosas Reviewed-by: Cédric Le Goater Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/6830f07777f939edaf0a2d301c39adcaaf3817f0.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/block/thread-pool.h | 3 +-- tests/unit/test-thread-pool.c | 6 +++--- util/thread-pool.c | 7 +------ util/trace-events | 2 +- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h index 948ff5f30c..4f66940261 100644 --- a/include/block/thread-pool.h +++ b/include/block/thread-pool.h @@ -30,13 +30,12 @@ ThreadPool *thread_pool_new(struct AioContext *ctx); void thread_pool_free(ThreadPool *pool); /* - * thread_pool_submit* API: submit I/O requests in the thread's + * thread_pool_submit_{aio,co} API: submit I/O requests in the thread's * current AioContext. */ BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, BlockCompletionFunc *cb, void *opaque); int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg); -void thread_pool_submit(ThreadPoolFunc *func, void *arg); void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx); diff --git a/tests/unit/test-thread-pool.c b/tests/unit/test-thread-pool.c index 1483e53473..33407b595d 100644 --- a/tests/unit/test-thread-pool.c +++ b/tests/unit/test-thread-pool.c @@ -43,10 +43,10 @@ static void done_cb(void *opaque, int ret) active--; } -static void test_submit(void) +static void test_submit_no_complete(void) { WorkerTestData data = { .n = 0 }; - thread_pool_submit(worker_cb, &data); + thread_pool_submit_aio(worker_cb, &data, NULL, NULL); while (data.n == 0) { aio_poll(ctx, true); } @@ -236,7 +236,7 @@ int main(int argc, char **argv) ctx = qemu_get_current_aio_context(); g_test_init(&argc, &argv, NULL); - g_test_add_func("/thread-pool/submit", test_submit); + g_test_add_func("/thread-pool/submit-no-complete", test_submit_no_complete); g_test_add_func("/thread-pool/submit-aio", test_submit_aio); g_test_add_func("/thread-pool/submit-co", test_submit_co); g_test_add_func("/thread-pool/submit-many", test_submit_many); diff --git a/util/thread-pool.c b/util/thread-pool.c index 27eb777e85..2f751d55b3 100644 --- a/util/thread-pool.c +++ b/util/thread-pool.c @@ -256,7 +256,7 @@ BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, QLIST_INSERT_HEAD(&pool->head, req, all); - trace_thread_pool_submit(pool, req, arg); + trace_thread_pool_submit_aio(pool, req, arg); qemu_mutex_lock(&pool->lock); if (pool->idle_threads == 0 && pool->cur_threads < pool->max_threads) { @@ -290,11 +290,6 @@ int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg) return tpc.ret; } -void thread_pool_submit(ThreadPoolFunc *func, void *arg) -{ - thread_pool_submit_aio(func, arg, NULL, NULL); -} - void thread_pool_update_params(ThreadPool *pool, AioContext *ctx) { qemu_mutex_lock(&pool->lock); diff --git a/util/trace-events b/util/trace-events index 49a4962e18..5be12d7fab 100644 --- a/util/trace-events +++ b/util/trace-events @@ -14,7 +14,7 @@ aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p" reentrant_aio(void *ctx, const char *name) "ctx %p name %s" # thread-pool.c -thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" +thread_pool_submit_aio(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" thread_pool_cancel(void *req, void *opaque) "req %p opaque %p" From dc67daeed579ea52f045f78b88d9e5e712038ccf Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:30 +0100 Subject: [PATCH 12/42] thread-pool: Rename AIO pool functions to *_aio() and data types to *Aio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These names conflict with ones used by future generic thread pool equivalents. Generic names should belong to the generic pool type, not specific (AIO) type. Acked-by: Fabiano Rosas Reviewed-by: Cédric Le Goater Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/70f9e0fb4b01042258a1a57996c64d19779dc7f0.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/block/aio.h | 8 ++--- include/block/thread-pool.h | 8 ++--- util/async.c | 6 ++-- util/thread-pool.c | 58 ++++++++++++++++++------------------- util/trace-events | 4 +-- 5 files changed, 42 insertions(+), 42 deletions(-) diff --git a/include/block/aio.h b/include/block/aio.h index 43883a8a33..b2ab3514de 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -54,7 +54,7 @@ typedef void QEMUBHFunc(void *opaque); typedef bool AioPollFn(void *opaque); typedef void IOHandler(void *opaque); -struct ThreadPool; +struct ThreadPoolAio; struct LinuxAioState; typedef struct LuringState LuringState; @@ -207,7 +207,7 @@ struct AioContext { /* Thread pool for performing work and receiving completion callbacks. * Has its own locking. */ - struct ThreadPool *thread_pool; + struct ThreadPoolAio *thread_pool; #ifdef CONFIG_LINUX_AIO struct LinuxAioState *linux_aio; @@ -500,8 +500,8 @@ void aio_set_event_notifier_poll(AioContext *ctx, */ GSource *aio_get_g_source(AioContext *ctx); -/* Return the ThreadPool bound to this AioContext */ -struct ThreadPool *aio_get_thread_pool(AioContext *ctx); +/* Return the ThreadPoolAio bound to this AioContext */ +struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx); /* Setup the LinuxAioState bound to this AioContext */ struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp); diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h index 4f66940261..6f27eb085b 100644 --- a/include/block/thread-pool.h +++ b/include/block/thread-pool.h @@ -24,10 +24,10 @@ typedef int ThreadPoolFunc(void *opaque); -typedef struct ThreadPool ThreadPool; +typedef struct ThreadPoolAio ThreadPoolAio; -ThreadPool *thread_pool_new(struct AioContext *ctx); -void thread_pool_free(ThreadPool *pool); +ThreadPoolAio *thread_pool_new_aio(struct AioContext *ctx); +void thread_pool_free_aio(ThreadPoolAio *pool); /* * thread_pool_submit_{aio,co} API: submit I/O requests in the thread's @@ -36,7 +36,7 @@ void thread_pool_free(ThreadPool *pool); BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, BlockCompletionFunc *cb, void *opaque); int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg); +void thread_pool_update_params(ThreadPoolAio *pool, struct AioContext *ctx); -void thread_pool_update_params(ThreadPool *pool, struct AioContext *ctx); #endif diff --git a/util/async.c b/util/async.c index 0fe2943609..47e3d35a26 100644 --- a/util/async.c +++ b/util/async.c @@ -369,7 +369,7 @@ aio_ctx_finalize(GSource *source) QEMUBH *bh; unsigned flags; - thread_pool_free(ctx->thread_pool); + thread_pool_free_aio(ctx->thread_pool); #ifdef CONFIG_LINUX_AIO if (ctx->linux_aio) { @@ -435,10 +435,10 @@ GSource *aio_get_g_source(AioContext *ctx) return &ctx->source; } -ThreadPool *aio_get_thread_pool(AioContext *ctx) +ThreadPoolAio *aio_get_thread_pool(AioContext *ctx) { if (!ctx->thread_pool) { - ctx->thread_pool = thread_pool_new(ctx); + ctx->thread_pool = thread_pool_new_aio(ctx); } return ctx->thread_pool; } diff --git a/util/thread-pool.c b/util/thread-pool.c index 2f751d55b3..908194dc07 100644 --- a/util/thread-pool.c +++ b/util/thread-pool.c @@ -23,9 +23,9 @@ #include "block/thread-pool.h" #include "qemu/main-loop.h" -static void do_spawn_thread(ThreadPool *pool); +static void do_spawn_thread(ThreadPoolAio *pool); -typedef struct ThreadPoolElement ThreadPoolElement; +typedef struct ThreadPoolElementAio ThreadPoolElementAio; enum ThreadState { THREAD_QUEUED, @@ -33,9 +33,9 @@ enum ThreadState { THREAD_DONE, }; -struct ThreadPoolElement { +struct ThreadPoolElementAio { BlockAIOCB common; - ThreadPool *pool; + ThreadPoolAio *pool; ThreadPoolFunc *func; void *arg; @@ -47,13 +47,13 @@ struct ThreadPoolElement { int ret; /* Access to this list is protected by lock. */ - QTAILQ_ENTRY(ThreadPoolElement) reqs; + QTAILQ_ENTRY(ThreadPoolElementAio) reqs; /* This list is only written by the thread pool's mother thread. */ - QLIST_ENTRY(ThreadPoolElement) all; + QLIST_ENTRY(ThreadPoolElementAio) all; }; -struct ThreadPool { +struct ThreadPoolAio { AioContext *ctx; QEMUBH *completion_bh; QemuMutex lock; @@ -62,10 +62,10 @@ struct ThreadPool { QEMUBH *new_thread_bh; /* The following variables are only accessed from one AioContext. */ - QLIST_HEAD(, ThreadPoolElement) head; + QLIST_HEAD(, ThreadPoolElementAio) head; /* The following variables are protected by lock. */ - QTAILQ_HEAD(, ThreadPoolElement) request_list; + QTAILQ_HEAD(, ThreadPoolElementAio) request_list; int cur_threads; int idle_threads; int new_threads; /* backlog of threads we need to create */ @@ -76,14 +76,14 @@ struct ThreadPool { static void *worker_thread(void *opaque) { - ThreadPool *pool = opaque; + ThreadPoolAio *pool = opaque; qemu_mutex_lock(&pool->lock); pool->pending_threads--; do_spawn_thread(pool); while (pool->cur_threads <= pool->max_threads) { - ThreadPoolElement *req; + ThreadPoolElementAio *req; int ret; if (QTAILQ_EMPTY(&pool->request_list)) { @@ -131,7 +131,7 @@ static void *worker_thread(void *opaque) return NULL; } -static void do_spawn_thread(ThreadPool *pool) +static void do_spawn_thread(ThreadPoolAio *pool) { QemuThread t; @@ -148,14 +148,14 @@ static void do_spawn_thread(ThreadPool *pool) static void spawn_thread_bh_fn(void *opaque) { - ThreadPool *pool = opaque; + ThreadPoolAio *pool = opaque; qemu_mutex_lock(&pool->lock); do_spawn_thread(pool); qemu_mutex_unlock(&pool->lock); } -static void spawn_thread(ThreadPool *pool) +static void spawn_thread(ThreadPoolAio *pool) { pool->cur_threads++; pool->new_threads++; @@ -173,8 +173,8 @@ static void spawn_thread(ThreadPool *pool) static void thread_pool_completion_bh(void *opaque) { - ThreadPool *pool = opaque; - ThreadPoolElement *elem, *next; + ThreadPoolAio *pool = opaque; + ThreadPoolElementAio *elem, *next; defer_call_begin(); /* cb() may use defer_call() to coalesce work */ @@ -184,8 +184,8 @@ restart: continue; } - trace_thread_pool_complete(pool, elem, elem->common.opaque, - elem->ret); + trace_thread_pool_complete_aio(pool, elem, elem->common.opaque, + elem->ret); QLIST_REMOVE(elem, all); if (elem->common.cb) { @@ -217,10 +217,10 @@ restart: static void thread_pool_cancel(BlockAIOCB *acb) { - ThreadPoolElement *elem = (ThreadPoolElement *)acb; - ThreadPool *pool = elem->pool; + ThreadPoolElementAio *elem = (ThreadPoolElementAio *)acb; + ThreadPoolAio *pool = elem->pool; - trace_thread_pool_cancel(elem, elem->common.opaque); + trace_thread_pool_cancel_aio(elem, elem->common.opaque); QEMU_LOCK_GUARD(&pool->lock); if (elem->state == THREAD_QUEUED) { @@ -234,16 +234,16 @@ static void thread_pool_cancel(BlockAIOCB *acb) } static const AIOCBInfo thread_pool_aiocb_info = { - .aiocb_size = sizeof(ThreadPoolElement), + .aiocb_size = sizeof(ThreadPoolElementAio), .cancel_async = thread_pool_cancel, }; BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, BlockCompletionFunc *cb, void *opaque) { - ThreadPoolElement *req; + ThreadPoolElementAio *req; AioContext *ctx = qemu_get_current_aio_context(); - ThreadPool *pool = aio_get_thread_pool(ctx); + ThreadPoolAio *pool = aio_get_thread_pool(ctx); /* Assert that the thread submitting work is the same running the pool */ assert(pool->ctx == qemu_get_current_aio_context()); @@ -290,7 +290,7 @@ int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg) return tpc.ret; } -void thread_pool_update_params(ThreadPool *pool, AioContext *ctx) +void thread_pool_update_params(ThreadPoolAio *pool, AioContext *ctx) { qemu_mutex_lock(&pool->lock); @@ -317,7 +317,7 @@ void thread_pool_update_params(ThreadPool *pool, AioContext *ctx) qemu_mutex_unlock(&pool->lock); } -static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) +static void thread_pool_init_one(ThreadPoolAio *pool, AioContext *ctx) { if (!ctx) { ctx = qemu_get_aio_context(); @@ -337,14 +337,14 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) thread_pool_update_params(pool, ctx); } -ThreadPool *thread_pool_new(AioContext *ctx) +ThreadPoolAio *thread_pool_new_aio(AioContext *ctx) { - ThreadPool *pool = g_new(ThreadPool, 1); + ThreadPoolAio *pool = g_new(ThreadPoolAio, 1); thread_pool_init_one(pool, ctx); return pool; } -void thread_pool_free(ThreadPool *pool) +void thread_pool_free_aio(ThreadPoolAio *pool) { if (!pool) { return; diff --git a/util/trace-events b/util/trace-events index 5be12d7fab..bd8f25fb59 100644 --- a/util/trace-events +++ b/util/trace-events @@ -15,8 +15,8 @@ reentrant_aio(void *ctx, const char *name) "ctx %p name %s" # thread-pool.c thread_pool_submit_aio(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" -thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" -thread_pool_cancel(void *req, void *opaque) "req %p opaque %p" +thread_pool_complete_aio(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" +thread_pool_cancel_aio(void *req, void *opaque) "req %p opaque %p" # buffer.c buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd" From b5aa74968b27f37523c180e9c42ca007dbc7758f Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:31 +0100 Subject: [PATCH 13/42] thread-pool: Implement generic (non-AIO) pool support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migration code wants to manage device data sending threads in one place. QEMU has an existing thread pool implementation, however it is limited to queuing AIO operations only and essentially has a 1:1 mapping between the current AioContext and the AIO ThreadPool in use. Implement generic (non-AIO) ThreadPool by essentially wrapping Glib's GThreadPool. This brings a few new operations on a pool: * thread_pool_wait() operation waits until all the submitted work requests have finished. * thread_pool_set_max_threads() explicitly sets the maximum thread count in the pool. * thread_pool_adjust_max_threads_to_work() adjusts the maximum thread count in the pool to equal the number of still waiting in queue or unfinished work. Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/b1efaebdbea7cb7068b8fb74148777012383e12b.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/block/thread-pool.h | 51 ++++++++++++++++ util/thread-pool.c | 119 ++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h index 6f27eb085b..dd48cf07e8 100644 --- a/include/block/thread-pool.h +++ b/include/block/thread-pool.h @@ -38,5 +38,56 @@ BlockAIOCB *thread_pool_submit_aio(ThreadPoolFunc *func, void *arg, int coroutine_fn thread_pool_submit_co(ThreadPoolFunc *func, void *arg); void thread_pool_update_params(ThreadPoolAio *pool, struct AioContext *ctx); +/* ------------------------------------------- */ +/* Generic thread pool types and methods below */ +typedef struct ThreadPool ThreadPool; + +/* Create a new thread pool. Never returns NULL. */ +ThreadPool *thread_pool_new(void); + +/* + * Free the thread pool. + * Waits for all the previously submitted work to complete before performing + * the actual freeing operation. + */ +void thread_pool_free(ThreadPool *pool); + +/* + * Submit a new work (task) for the pool. + * + * @opaque_destroy is an optional GDestroyNotify for the @opaque argument + * to the work function at @func. + */ +void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, + void *opaque, GDestroyNotify opaque_destroy); + +/* + * Submit a new work (task) for the pool, making sure it starts getting + * processed immediately, launching a new thread for it if necessary. + * + * @opaque_destroy is an optional GDestroyNotify for the @opaque argument + * to the work function at @func. + */ +void thread_pool_submit_immediate(ThreadPool *pool, ThreadPoolFunc *func, + void *opaque, GDestroyNotify opaque_destroy); + +/* + * Wait for all previously submitted work to complete before returning. + * + * Can be used as a barrier between two sets of tasks executed on a thread + * pool without destroying it or in a performance sensitive path where the + * caller just wants to wait for all tasks to complete while deferring the + * pool free operation for later, less performance sensitive time. + */ +void thread_pool_wait(ThreadPool *pool); + +/* Set the maximum number of threads in the pool. */ +bool thread_pool_set_max_threads(ThreadPool *pool, int max_threads); + +/* + * Adjust the maximum number of threads in the pool to give each task its + * own thread (exactly one thread per task). + */ +bool thread_pool_adjust_max_threads_to_work(ThreadPool *pool); #endif diff --git a/util/thread-pool.c b/util/thread-pool.c index 908194dc07..d2ead6b728 100644 --- a/util/thread-pool.c +++ b/util/thread-pool.c @@ -374,3 +374,122 @@ void thread_pool_free_aio(ThreadPoolAio *pool) qemu_mutex_destroy(&pool->lock); g_free(pool); } + +struct ThreadPool { + GThreadPool *t; + size_t cur_work; + QemuMutex cur_work_lock; + QemuCond all_finished_cond; +}; + +typedef struct { + ThreadPoolFunc *func; + void *opaque; + GDestroyNotify opaque_destroy; +} ThreadPoolElement; + +static void thread_pool_func(gpointer data, gpointer user_data) +{ + ThreadPool *pool = user_data; + g_autofree ThreadPoolElement *el = data; + + el->func(el->opaque); + + if (el->opaque_destroy) { + el->opaque_destroy(el->opaque); + } + + QEMU_LOCK_GUARD(&pool->cur_work_lock); + + assert(pool->cur_work > 0); + pool->cur_work--; + + if (pool->cur_work == 0) { + qemu_cond_signal(&pool->all_finished_cond); + } +} + +ThreadPool *thread_pool_new(void) +{ + ThreadPool *pool = g_new(ThreadPool, 1); + + pool->cur_work = 0; + qemu_mutex_init(&pool->cur_work_lock); + qemu_cond_init(&pool->all_finished_cond); + + pool->t = g_thread_pool_new(thread_pool_func, pool, 0, TRUE, NULL); + /* + * g_thread_pool_new() can only return errors if initial thread(s) + * creation fails but we ask for 0 initial threads above. + */ + assert(pool->t); + + return pool; +} + +void thread_pool_free(ThreadPool *pool) +{ + /* + * With _wait = TRUE this effectively waits for all + * previously submitted work to complete first. + */ + g_thread_pool_free(pool->t, FALSE, TRUE); + + qemu_cond_destroy(&pool->all_finished_cond); + qemu_mutex_destroy(&pool->cur_work_lock); + + g_free(pool); +} + +void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, + void *opaque, GDestroyNotify opaque_destroy) +{ + ThreadPoolElement *el = g_new(ThreadPoolElement, 1); + + el->func = func; + el->opaque = opaque; + el->opaque_destroy = opaque_destroy; + + WITH_QEMU_LOCK_GUARD(&pool->cur_work_lock) { + pool->cur_work++; + } + + /* + * Ignore the return value since this function can only return errors + * if creation of an additional thread fails but even in this case the + * provided work is still getting queued (just for the existing threads). + */ + g_thread_pool_push(pool->t, el, NULL); +} + +void thread_pool_submit_immediate(ThreadPool *pool, ThreadPoolFunc *func, + void *opaque, GDestroyNotify opaque_destroy) +{ + thread_pool_submit(pool, func, opaque, opaque_destroy); + thread_pool_adjust_max_threads_to_work(pool); +} + +void thread_pool_wait(ThreadPool *pool) +{ + QEMU_LOCK_GUARD(&pool->cur_work_lock); + + while (pool->cur_work > 0) { + qemu_cond_wait(&pool->all_finished_cond, + &pool->cur_work_lock); + } +} + +bool thread_pool_set_max_threads(ThreadPool *pool, + int max_threads) +{ + assert(max_threads > 0); + + return g_thread_pool_set_max_threads(pool->t, max_threads, NULL); +} + +bool thread_pool_adjust_max_threads_to_work(ThreadPool *pool) +{ + QEMU_LOCK_GUARD(&pool->cur_work_lock); + + return thread_pool_set_max_threads(pool, pool->cur_work); +} From 4e55cb3cdeb099cb65f75f5d3b061e3e1319cf3b Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:32 +0100 Subject: [PATCH 14/42] migration: Add MIG_CMD_SWITCHOVER_START and its load handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This QEMU_VM_COMMAND sub-command and its switchover_start SaveVMHandler is used to mark the switchover point in main migration stream. It can be used to inform the destination that all pre-switchover main migration stream data has been sent/received so it can start to process post-switchover data that it might have received via other migration channels like the multifd ones. Add also the relevant MigrationState bit stream compatibility property and its hw_compat entry. Reviewed-by: Fabiano Rosas Reviewed-by: Zhang Chen # for the COLO part Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/311be6da85fc7e49a7598684d80aa631778dcbce.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/core/machine.c | 1 + include/migration/client-options.h | 4 +++ include/migration/register.h | 12 +++++++++ migration/colo.c | 3 +++ migration/migration-hmp-cmds.c | 2 ++ migration/migration.c | 2 ++ migration/migration.h | 2 ++ migration/options.c | 9 +++++++ migration/savevm.c | 39 ++++++++++++++++++++++++++++++ migration/savevm.h | 1 + migration/trace-events | 1 + scripts/analyze-migration.py | 11 +++++++++ 12 files changed, 87 insertions(+) diff --git a/hw/core/machine.c b/hw/core/machine.c index b68b8b94a3..d1ddc3a3db 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -44,6 +44,7 @@ GlobalProperty hw_compat_9_2[] = { { "virtio-balloon-pci-non-transitional", "vectors", "0" }, { "virtio-mem-pci", "vectors", "0" }, { "migration", "multifd-clean-tls-termination", "false" }, + { "migration", "send-switchover-start", "off"}, }; const size_t hw_compat_9_2_len = G_N_ELEMENTS(hw_compat_9_2); diff --git a/include/migration/client-options.h b/include/migration/client-options.h index 59f4b55cf4..289c9d7762 100644 --- a/include/migration/client-options.h +++ b/include/migration/client-options.h @@ -10,6 +10,10 @@ #ifndef QEMU_MIGRATION_CLIENT_OPTIONS_H #define QEMU_MIGRATION_CLIENT_OPTIONS_H + +/* properties */ +bool migrate_send_switchover_start(void); + /* capabilities */ bool migrate_background_snapshot(void); diff --git a/include/migration/register.h b/include/migration/register.h index 0b02927383..ff0faf5f68 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -279,6 +279,18 @@ typedef struct SaveVMHandlers { * otherwise */ bool (*switchover_ack_needed)(void *opaque); + + /** + * @switchover_start + * + * Notifies that the switchover has started. Called only on + * the destination. + * + * @opaque: data pointer passed to register_savevm_live() + * + * Returns zero to indicate success and negative for error + */ + int (*switchover_start)(void *opaque); } SaveVMHandlers; /** diff --git a/migration/colo.c b/migration/colo.c index 9a8e5fbe9b..c976b3ff34 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -452,6 +452,9 @@ static int colo_do_checkpoint_transaction(MigrationState *s, bql_unlock(); goto out; } + + qemu_savevm_maybe_send_switchover_start(s->to_dst_file); + /* Note: device state is saved into buffer */ ret = qemu_save_device_state(fb); diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 3347e34c48..49c26daed3 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -46,6 +46,8 @@ static void migration_global_dump(Monitor *mon) ms->send_configuration ? "on" : "off"); monitor_printf(mon, "send-section-footer: %s\n", ms->send_section_footer ? "on" : "off"); + monitor_printf(mon, "send-switchover-start: %s\n", + ms->send_switchover_start ? "on" : "off"); monitor_printf(mon, "clear-bitmap-shift: %u\n", ms->clear_bitmap_shift); } diff --git a/migration/migration.c b/migration/migration.c index c597aa707e..9e9db26667 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2891,6 +2891,8 @@ static bool migration_switchover_start(MigrationState *s, Error **errp) precopy_notify_complete(); + qemu_savevm_maybe_send_switchover_start(s->to_dst_file); + return true; } diff --git a/migration/migration.h b/migration/migration.h index 4639e2a7e4..7b4278e2a3 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -400,6 +400,8 @@ struct MigrationState { bool send_configuration; /* Whether we send section footer during migration */ bool send_section_footer; + /* Whether we send switchover start notification during migration */ + bool send_switchover_start; /* Needed by postcopy-pause state */ QemuSemaphore postcopy_pause_sem; diff --git a/migration/options.c b/migration/options.c index bb259d192a..b0ac2ea408 100644 --- a/migration/options.c +++ b/migration/options.c @@ -93,6 +93,8 @@ const Property migration_properties[] = { send_configuration, true), DEFINE_PROP_BOOL("send-section-footer", MigrationState, send_section_footer, true), + DEFINE_PROP_BOOL("send-switchover-start", MigrationState, + send_switchover_start, true), DEFINE_PROP_BOOL("multifd-flush-after-each-section", MigrationState, multifd_flush_after_each_section, false), DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState, @@ -209,6 +211,13 @@ bool migrate_auto_converge(void) return s->capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; } +bool migrate_send_switchover_start(void) +{ + MigrationState *s = migrate_get_current(); + + return s->send_switchover_start; +} + bool migrate_background_snapshot(void) { MigrationState *s = migrate_get_current(); diff --git a/migration/savevm.c b/migration/savevm.c index 4046faf009..faebf47ef5 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -90,6 +90,7 @@ enum qemu_vm_cmd { MIG_CMD_ENABLE_COLO, /* Enable COLO */ MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */ MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */ + MIG_CMD_SWITCHOVER_START, /* Switchover start notification */ MIG_CMD_MAX }; @@ -109,6 +110,7 @@ static struct mig_cmd_args { [MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" }, [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, [MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, + [MIG_CMD_SWITCHOVER_START] = { .len = 0, .name = "SWITCHOVER_START" }, [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, }; @@ -1201,6 +1203,19 @@ void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name) qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf); } +static void qemu_savevm_send_switchover_start(QEMUFile *f) +{ + trace_savevm_send_switchover_start(); + qemu_savevm_command_send(f, MIG_CMD_SWITCHOVER_START, 0, NULL); +} + +void qemu_savevm_maybe_send_switchover_start(QEMUFile *f) +{ + if (migrate_send_switchover_start()) { + qemu_savevm_send_switchover_start(f); + } +} + bool qemu_savevm_state_blocked(Error **errp) { SaveStateEntry *se; @@ -1687,6 +1702,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) ret = qemu_file_get_error(f); if (ret == 0) { + qemu_savevm_maybe_send_switchover_start(f); qemu_savevm_state_complete_precopy(f, false); ret = qemu_file_get_error(f); } @@ -2383,6 +2399,26 @@ static int loadvm_process_enable_colo(MigrationIncomingState *mis) return ret; } +static int loadvm_postcopy_handle_switchover_start(void) +{ + SaveStateEntry *se; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + int ret; + + if (!se->ops || !se->ops->switchover_start) { + continue; + } + + ret = se->ops->switchover_start(se->opaque); + if (ret < 0) { + return ret; + } + } + + return 0; +} + /* * Process an incoming 'QEMU_VM_COMMAND' * 0 just a normal return @@ -2481,6 +2517,9 @@ static int loadvm_process_command(QEMUFile *f) case MIG_CMD_ENABLE_COLO: return loadvm_process_enable_colo(mis); + + case MIG_CMD_SWITCHOVER_START: + return loadvm_postcopy_handle_switchover_start(); } return 0; diff --git a/migration/savevm.h b/migration/savevm.h index 7957460062..58f871a7ed 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -53,6 +53,7 @@ void qemu_savevm_send_postcopy_listen(QEMUFile *f); void qemu_savevm_send_postcopy_run(QEMUFile *f); void qemu_savevm_send_postcopy_resume(QEMUFile *f); void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name); +void qemu_savevm_maybe_send_switchover_start(QEMUFile *f); void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, uint16_t len, diff --git a/migration/trace-events b/migration/trace-events index 58c0f07f5b..c506e11a2e 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -39,6 +39,7 @@ savevm_send_postcopy_run(void) "" savevm_send_postcopy_resume(void) "" savevm_send_colo_enable(void) "" savevm_send_recv_bitmap(char *name) "%s" +savevm_send_switchover_start(void) "" savevm_state_setup(void) "" savevm_state_resume_prepare(void) "" savevm_state_header(void) "" diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py index 8e1fbf4c9d..67631ac43e 100755 --- a/scripts/analyze-migration.py +++ b/scripts/analyze-migration.py @@ -620,7 +620,9 @@ class MigrationDump(object): QEMU_VM_SUBSECTION = 0x05 QEMU_VM_VMDESCRIPTION = 0x06 QEMU_VM_CONFIGURATION = 0x07 + QEMU_VM_COMMAND = 0x08 QEMU_VM_SECTION_FOOTER= 0x7e + QEMU_MIG_CMD_SWITCHOVER_START = 0x0b def __init__(self, filename): self.section_classes = { @@ -685,6 +687,15 @@ class MigrationDump(object): elif section_type == self.QEMU_VM_SECTION_PART or section_type == self.QEMU_VM_SECTION_END: section_id = file.read32() self.sections[section_id].read() + elif section_type == self.QEMU_VM_COMMAND: + command_type = file.read16() + command_data_len = file.read16() + if command_type != self.QEMU_MIG_CMD_SWITCHOVER_START: + raise Exception("Unknown QEMU_VM_COMMAND: %x" % + (command_type)) + if command_data_len != 0: + raise Exception("Invalid SWITCHOVER_START length: %x" % + (command_data_len)) elif section_type == self.QEMU_VM_SECTION_FOOTER: read_section_id = file.read32() if read_section_id != section_id: From a30363db082a6947794be6e085ad9437798ec211 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:33 +0100 Subject: [PATCH 15/42] migration: Add qemu_loadvm_load_state_buffer() and its handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu_loadvm_load_state_buffer() and its load_state_buffer SaveVMHandler allow providing device state buffer to explicitly specified device via its idstr and instance id. Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/71ca753286b87831ced4afd422e2e2bed071af25.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/migration/register.h | 15 +++++++++++++++ migration/savevm.c | 23 +++++++++++++++++++++++ migration/savevm.h | 3 +++ 3 files changed, 41 insertions(+) diff --git a/include/migration/register.h b/include/migration/register.h index ff0faf5f68..58891aa54b 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -229,6 +229,21 @@ typedef struct SaveVMHandlers { */ int (*load_state)(QEMUFile *f, void *opaque, int version_id); + /** + * @load_state_buffer (invoked outside the BQL) + * + * Load device state buffer provided to qemu_loadvm_load_state_buffer(). + * + * @opaque: data pointer passed to register_savevm_live() + * @buf: the data buffer to load + * @len: the data length in buffer + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for errors. + */ + bool (*load_state_buffer)(void *opaque, char *buf, size_t len, + Error **errp); + /** * @load_setup * diff --git a/migration/savevm.c b/migration/savevm.c index faebf47ef5..7c1aa8ad7b 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -3060,6 +3060,29 @@ int qemu_loadvm_approve_switchover(void) return migrate_send_rp_switchover_ack(mis); } +bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id, + char *buf, size_t len, Error **errp) +{ + SaveStateEntry *se; + + se = find_se(idstr, instance_id); + if (!se) { + error_setg(errp, + "Unknown idstr %s or instance id %u for load state buffer", + idstr, instance_id); + return false; + } + + if (!se->ops || !se->ops->load_state_buffer) { + error_setg(errp, + "idstr %s / instance %u has no load state buffer operation", + idstr, instance_id); + return false; + } + + return se->ops->load_state_buffer(se->opaque, buf, len, errp); +} + bool save_snapshot(const char *name, bool overwrite, const char *vmstate, bool has_devices, strList *devices, Error **errp) { diff --git a/migration/savevm.h b/migration/savevm.h index 58f871a7ed..cb58434a94 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -71,4 +71,7 @@ int qemu_loadvm_approve_switchover(void); int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool in_postcopy); +bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id, + char *buf, size_t len, Error **errp); + #endif From 6a76eb4872f632974307bf12cb7f2416a77ad4a8 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Wed, 5 Mar 2025 17:49:10 +0100 Subject: [PATCH 16/42] migration: Always take BQL for migration_incoming_state_destroy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All callers to migration_incoming_state_destroy() other than postcopy_ram_listen_thread() do this call with BQL held. Since migration_incoming_state_destroy() ultimately calls "load_cleanup" SaveVMHandlers and it will soon call BQL-sensitive code it makes sense to always call that function under BQL rather than to have it deal with both cases (with BQL and without BQL). Add the necessary bql_lock() and bql_unlock() to postcopy_ram_listen_thread(). qemu_loadvm_state_main() in postcopy_ram_listen_thread() could call "load_state" SaveVMHandlers that are expecting BQL to be held. In principle, the only devices that should be arriving on migration channel serviced by postcopy_ram_listen_thread() are those that are postcopiable and whose load handlers are safe to be called without BQL being held. But nothing currently prevents the source from sending data for "unsafe" devices which would cause trouble there. Add a TODO comment there so it's clear that it would be good to improve handling of such (erroneous) case in the future. Acked-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/21bb5ca337b1d5a802e697f553f37faf296b5ff4.1741193259.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- migration/migration.c | 13 +++++++++++++ migration/savevm.c | 4 ++++ 2 files changed, 17 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index 9e9db26667..0bf70ea971 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -402,10 +402,23 @@ void migration_incoming_state_destroy(void) struct MigrationIncomingState *mis = migration_incoming_get_current(); multifd_recv_cleanup(); + /* * RAM state cleanup needs to happen after multifd cleanup, because * multifd threads can use some of its states (receivedmap). + * The VFIO load_cleanup() implementation is BQL-sensitive. It requires + * BQL must NOT be taken when recycling load threads, so that it won't + * block the load threads from making progress on address space + * modification operations. + * + * To make it work, we could try to not take BQL for all load_cleanup(), + * or conditionally unlock BQL only if bql_locked() in VFIO. + * + * Since most existing call sites take BQL for load_cleanup(), make + * it simple by taking BQL always as the rule, so that VFIO can unlock + * BQL and retake unconditionally. */ + assert(bql_locked()); qemu_loadvm_state_cleanup(); if (mis->to_src_file) { diff --git a/migration/savevm.c b/migration/savevm.c index 7c1aa8ad7b..3e86b572cf 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1986,6 +1986,8 @@ static void *postcopy_ram_listen_thread(void *opaque) * in qemu_file, and thus we must be blocking now. */ qemu_file_set_blocking(f, true); + + /* TODO: sanity check that only postcopiable data will be loaded here */ load_res = qemu_loadvm_state_main(f, mis); /* @@ -2046,7 +2048,9 @@ static void *postcopy_ram_listen_thread(void *opaque) * (If something broke then qemu will have to exit anyway since it's * got a bad migration state). */ + bql_lock(); migration_incoming_state_destroy(); + bql_unlock(); rcu_unregister_thread(); mis->have_listen_thread = false; From 18eb55546a54e443d94a4c49286348176ad4b00a Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:35 +0100 Subject: [PATCH 17/42] error: define g_autoptr() cleanup function for the Error type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Automatic memory management helps avoid memory safety issues. Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/a5843c5fa64d7e5239a4316092ec0ef0d10c2320.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/qapi/error.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/qapi/error.h b/include/qapi/error.h index f5fe216262..41e3816380 100644 --- a/include/qapi/error.h +++ b/include/qapi/error.h @@ -437,6 +437,8 @@ Error *error_copy(const Error *err); */ void error_free(Error *err); +G_DEFINE_AUTOPTR_CLEANUP_FUNC(Error, error_free) + /* * Convenience function to assert that *@errp is set, then silently free it. */ From b1937fd1eb8360d0dc0abb0a8da221d8edce3733 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:36 +0100 Subject: [PATCH 18/42] migration: Add thread pool of optional load threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some drivers might want to make use of auxiliary helper threads during VM state loading, for example to make sure that their blocking (sync) I/O operations don't block the rest of the migration process. Add a migration core managed thread pool to facilitate this use case. The migration core will wait for these threads to finish before (re)starting the VM at destination. Reviewed-by: Fabiano Rosas Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/b09fd70369b6159c75847e69f235cb908b02570c.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/migration/misc.h | 3 ++ include/qemu/typedefs.h | 2 + migration/migration.c | 2 +- migration/migration.h | 5 +++ migration/savevm.c | 95 +++++++++++++++++++++++++++++++++++++++- migration/savevm.h | 2 +- 6 files changed, 105 insertions(+), 4 deletions(-) diff --git a/include/migration/misc.h b/include/migration/misc.h index c660be8095..4c171f4e89 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -45,9 +45,12 @@ bool migrate_ram_is_ignored(RAMBlock *block); /* migration/block.c */ AnnounceParameters *migrate_announce_params(void); + /* migration/savevm.c */ void dump_vmstate_json_to_file(FILE *out_fp); +void qemu_loadvm_start_load_thread(MigrationLoadThread function, + void *opaque); /* migration/migration.c */ void migration_object_init(void); diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 3d84efcac4..fd23ff7771 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -131,5 +131,7 @@ typedef struct IRQState *qemu_irq; * Function types */ typedef void (*qemu_irq_handler)(void *opaque, int n, int level); +typedef bool (*MigrationLoadThread)(void *opaque, bool *should_quit, + Error **errp); #endif /* QEMU_TYPEDEFS_H */ diff --git a/migration/migration.c b/migration/migration.c index 0bf70ea971..1833cfe358 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -419,7 +419,7 @@ void migration_incoming_state_destroy(void) * BQL and retake unconditionally. */ assert(bql_locked()); - qemu_loadvm_state_cleanup(); + qemu_loadvm_state_cleanup(mis); if (mis->to_src_file) { /* Tell source that we are done */ diff --git a/migration/migration.h b/migration/migration.h index 7b4278e2a3..d53f7cad84 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -43,6 +43,7 @@ #define MIGRATION_THREAD_DST_PREEMPT "mig/dst/preempt" struct PostcopyBlocktimeContext; +typedef struct ThreadPool ThreadPool; #define MIGRATION_RESUME_ACK_VALUE (1) @@ -187,6 +188,10 @@ struct MigrationIncomingState { Coroutine *colo_incoming_co; QemuSemaphore colo_incoming_sem; + /* Optional load threads pool and its thread exit request flag */ + ThreadPool *load_threads; + bool load_threads_abort; + /* * PostcopyBlocktimeContext to keep information for postcopy * live migration, to calculate vCPU block time diff --git a/migration/savevm.c b/migration/savevm.c index 3e86b572cf..1abc365570 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -54,6 +54,7 @@ #include "qemu/job.h" #include "qemu/main-loop.h" #include "block/snapshot.h" +#include "block/thread-pool.h" #include "qemu/cutils.h" #include "io/channel-buffer.h" #include "io/channel-file.h" @@ -131,6 +132,35 @@ static struct mig_cmd_args { * generic extendable format with an exception for two old entities. */ +/***********************************************************/ +/* Optional load threads pool support */ + +static void qemu_loadvm_thread_pool_create(MigrationIncomingState *mis) +{ + assert(!mis->load_threads); + mis->load_threads = thread_pool_new(); + mis->load_threads_abort = false; +} + +static void qemu_loadvm_thread_pool_destroy(MigrationIncomingState *mis) +{ + qatomic_set(&mis->load_threads_abort, true); + + bql_unlock(); /* Load threads might be waiting for BQL */ + g_clear_pointer(&mis->load_threads, thread_pool_free); + bql_lock(); +} + +static bool qemu_loadvm_thread_pool_wait(MigrationState *s, + MigrationIncomingState *mis) +{ + bql_unlock(); /* Let load threads do work requiring BQL */ + thread_pool_wait(mis->load_threads); + bql_lock(); + + return !migrate_has_error(s); +} + /***********************************************************/ /* savevm/loadvm support */ @@ -2783,16 +2813,68 @@ static int qemu_loadvm_state_setup(QEMUFile *f, Error **errp) return 0; } -void qemu_loadvm_state_cleanup(void) +struct LoadThreadData { + MigrationLoadThread function; + void *opaque; +}; + +static int qemu_loadvm_load_thread(void *thread_opaque) +{ + struct LoadThreadData *data = thread_opaque; + MigrationIncomingState *mis = migration_incoming_get_current(); + g_autoptr(Error) local_err = NULL; + + if (!data->function(data->opaque, &mis->load_threads_abort, &local_err)) { + MigrationState *s = migrate_get_current(); + + /* + * Can't set load_threads_abort here since processing of main migration + * channel data could still be happening, resulting in launching of new + * load threads. + */ + + assert(local_err); + + /* + * In case of multiple load threads failing which thread error + * return we end setting is purely arbitrary. + */ + migrate_set_error(s, local_err); + } + + return 0; +} + +void qemu_loadvm_start_load_thread(MigrationLoadThread function, + void *opaque) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + struct LoadThreadData *data; + + /* We only set it from this thread so it's okay to read it directly */ + assert(!mis->load_threads_abort); + + data = g_new(struct LoadThreadData, 1); + data->function = function; + data->opaque = opaque; + + thread_pool_submit_immediate(mis->load_threads, qemu_loadvm_load_thread, + data, g_free); +} + +void qemu_loadvm_state_cleanup(MigrationIncomingState *mis) { SaveStateEntry *se; trace_loadvm_state_cleanup(); + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (se->ops && se->ops->load_cleanup) { se->ops->load_cleanup(se->opaque); } } + + qemu_loadvm_thread_pool_destroy(mis); } /* Return true if we should continue the migration, or false. */ @@ -2943,6 +3025,7 @@ out: int qemu_loadvm_state(QEMUFile *f) { + MigrationState *s = migrate_get_current(); MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; int ret; @@ -2952,6 +3035,8 @@ int qemu_loadvm_state(QEMUFile *f) return -EINVAL; } + qemu_loadvm_thread_pool_create(mis); + ret = qemu_loadvm_state_header(f); if (ret) { return ret; @@ -2983,12 +3068,18 @@ int qemu_loadvm_state(QEMUFile *f) /* When reaching here, it must be precopy */ if (ret == 0) { - if (migrate_has_error(migrate_get_current())) { + if (migrate_has_error(migrate_get_current()) || + !qemu_loadvm_thread_pool_wait(s, mis)) { ret = -EINVAL; } else { ret = qemu_file_get_error(f); } } + /* + * Set this flag unconditionally so we'll catch further attempts to + * start additional threads via an appropriate assert() + */ + qatomic_set(&mis->load_threads_abort, true); /* * Try to read in the VMDESC section as well, so that dumping tools that diff --git a/migration/savevm.h b/migration/savevm.h index cb58434a94..138c39a7f9 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -64,7 +64,7 @@ void qemu_savevm_live_state(QEMUFile *f); int qemu_save_device_state(QEMUFile *f); int qemu_loadvm_state(QEMUFile *f); -void qemu_loadvm_state_cleanup(void); +void qemu_loadvm_state_cleanup(MigrationIncomingState *mis); int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); int qemu_load_device_state(QEMUFile *f); int qemu_loadvm_approve_switchover(void); From 8050c435b70b6805f05441a8a7cc6a3d70c3ee71 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:37 +0100 Subject: [PATCH 19/42] migration/multifd: Split packet into header and RAM data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Read packet header first so in the future we will be able to differentiate between a RAM multifd packet and a device state multifd packet. Since these two are of different size we can't read the packet body until we know which packet type it is. Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/832ad055fe447561ac1ad565d61658660cb3f63f.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- migration/multifd.c | 55 ++++++++++++++++++++++++++++++++++++--------- migration/multifd.h | 5 +++++ 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 215ad0414a..3b47e63c2c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -209,10 +209,10 @@ void multifd_send_fill_packet(MultiFDSendParams *p) memset(packet, 0, p->packet_len); - packet->magic = cpu_to_be32(MULTIFD_MAGIC); - packet->version = cpu_to_be32(MULTIFD_VERSION); + packet->hdr.magic = cpu_to_be32(MULTIFD_MAGIC); + packet->hdr.version = cpu_to_be32(MULTIFD_VERSION); - packet->flags = cpu_to_be32(p->flags); + packet->hdr.flags = cpu_to_be32(p->flags); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); @@ -228,12 +228,12 @@ void multifd_send_fill_packet(MultiFDSendParams *p) p->flags, p->next_packet_size); } -static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +static int multifd_recv_unfill_packet_header(MultiFDRecvParams *p, + const MultiFDPacketHdr_t *hdr, + Error **errp) { - const MultiFDPacket_t *packet = p->packet; - uint32_t magic = be32_to_cpu(packet->magic); - uint32_t version = be32_to_cpu(packet->version); - int ret = 0; + uint32_t magic = be32_to_cpu(hdr->magic); + uint32_t version = be32_to_cpu(hdr->version); if (magic != MULTIFD_MAGIC) { error_setg(errp, "multifd: received packet magic %x, expected %x", @@ -247,7 +247,16 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) return -1; } - p->flags = be32_to_cpu(packet->flags); + p->flags = be32_to_cpu(hdr->flags); + + return 0; +} + +static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +{ + const MultiFDPacket_t *packet = p->packet; + int ret = 0; + p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; @@ -1165,14 +1174,18 @@ static void *multifd_recv_thread(void *opaque) } while (true) { + MultiFDPacketHdr_t hdr; uint32_t flags = 0; bool has_data = false; + uint8_t *pkt_buf; + size_t pkt_len; + p->normal_num = 0; if (use_packets) { struct iovec iov = { - .iov_base = (void *)p->packet, - .iov_len = p->packet_len + .iov_base = (void *)&hdr, + .iov_len = sizeof(hdr) }; if (multifd_recv_should_exit()) { @@ -1191,6 +1204,26 @@ static void *multifd_recv_thread(void *opaque) break; } + ret = multifd_recv_unfill_packet_header(p, &hdr, &local_err); + if (ret) { + break; + } + + pkt_buf = (uint8_t *)p->packet + sizeof(hdr); + pkt_len = p->packet_len - sizeof(hdr); + + ret = qio_channel_read_all_eof(p->c, (char *)pkt_buf, pkt_len, + &local_err); + if (!ret) { + /* EOF */ + error_setg(&local_err, "multifd: unexpected EOF after packet header"); + break; + } + + if (ret == -1) { + break; + } + qemu_mutex_lock(&p->mutex); ret = multifd_recv_unfill_packet(p, &local_err); if (ret) { diff --git a/migration/multifd.h b/migration/multifd.h index cf408ff721..f7156f66c0 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -69,6 +69,11 @@ typedef struct { uint32_t magic; uint32_t version; uint32_t flags; +} __attribute__((packed)) MultiFDPacketHdr_t; + +typedef struct { + MultiFDPacketHdr_t hdr; + /* maximum number of allocated pages */ uint32_t pages_alloc; /* non zero pages */ From f588f3c46ae278661cdad5f1198a455e3ec9f910 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:38 +0100 Subject: [PATCH 20/42] migration/multifd: Device state transfer support - receive side MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a basic support for receiving device state via multifd channels - channels that are shared with RAM transfers. Depending whether MULTIFD_FLAG_DEVICE_STATE flag is present or not in the packet header either device state (MultiFDPacketDeviceState_t) or RAM data (existing MultiFDPacket_t) is read. The received device state data is provided to qemu_loadvm_load_state_buffer() function for processing in the device's load_state_buffer handler. Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/9b86f806c134e7815ecce0eee84f0e0e34aa0146.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- migration/multifd.c | 101 +++++++++++++++++++++++++++++++++++++++----- migration/multifd.h | 19 ++++++++- 2 files changed, 108 insertions(+), 12 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3b47e63c2c..01f427d8ed 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -21,6 +21,7 @@ #include "file.h" #include "migration.h" #include "migration-stats.h" +#include "savevm.h" #include "socket.h" #include "tls.h" #include "qemu-file.h" @@ -252,14 +253,24 @@ static int multifd_recv_unfill_packet_header(MultiFDRecvParams *p, return 0; } -static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +static int multifd_recv_unfill_packet_device_state(MultiFDRecvParams *p, + Error **errp) +{ + MultiFDPacketDeviceState_t *packet = p->packet_dev_state; + + packet->instance_id = be32_to_cpu(packet->instance_id); + p->next_packet_size = be32_to_cpu(packet->next_packet_size); + + return 0; +} + +static int multifd_recv_unfill_packet_ram(MultiFDRecvParams *p, Error **errp) { const MultiFDPacket_t *packet = p->packet; int ret = 0; p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); - p->packets_recved++; /* Always unfill, old QEMUs (<9.0) send data along with SYNC */ ret = multifd_ram_unfill_packet(p, errp); @@ -270,6 +281,17 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) return ret; } +static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) +{ + p->packets_recved++; + + if (p->flags & MULTIFD_FLAG_DEVICE_STATE) { + return multifd_recv_unfill_packet_device_state(p, errp); + } + + return multifd_recv_unfill_packet_ram(p, errp); +} + static bool multifd_send_should_exit(void) { return qatomic_read(&multifd_send_state->exiting); @@ -1057,6 +1079,7 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->packet_len = 0; g_free(p->packet); p->packet = NULL; + g_clear_pointer(&p->packet_dev_state, g_free); g_free(p->normal); p->normal = NULL; g_free(p->zero); @@ -1158,6 +1181,34 @@ void multifd_recv_sync_main(void) trace_multifd_recv_sync_main(multifd_recv_state->packet_num); } +static int multifd_device_state_recv(MultiFDRecvParams *p, Error **errp) +{ + g_autofree char *dev_state_buf = NULL; + int ret; + + dev_state_buf = g_malloc(p->next_packet_size); + + ret = qio_channel_read_all(p->c, dev_state_buf, p->next_packet_size, errp); + if (ret != 0) { + return ret; + } + + if (p->packet_dev_state->idstr[sizeof(p->packet_dev_state->idstr) - 1] + != 0) { + error_setg(errp, "unterminated multifd device state idstr"); + return -1; + } + + if (!qemu_loadvm_load_state_buffer(p->packet_dev_state->idstr, + p->packet_dev_state->instance_id, + dev_state_buf, p->next_packet_size, + errp)) { + ret = -1; + } + + return ret; +} + static void *multifd_recv_thread(void *opaque) { MigrationState *s = migrate_get_current(); @@ -1176,6 +1227,7 @@ static void *multifd_recv_thread(void *opaque) while (true) { MultiFDPacketHdr_t hdr; uint32_t flags = 0; + bool is_device_state = false; bool has_data = false; uint8_t *pkt_buf; size_t pkt_len; @@ -1209,8 +1261,14 @@ static void *multifd_recv_thread(void *opaque) break; } - pkt_buf = (uint8_t *)p->packet + sizeof(hdr); - pkt_len = p->packet_len - sizeof(hdr); + is_device_state = p->flags & MULTIFD_FLAG_DEVICE_STATE; + if (is_device_state) { + pkt_buf = (uint8_t *)p->packet_dev_state + sizeof(hdr); + pkt_len = sizeof(*p->packet_dev_state) - sizeof(hdr); + } else { + pkt_buf = (uint8_t *)p->packet + sizeof(hdr); + pkt_len = p->packet_len - sizeof(hdr); + } ret = qio_channel_read_all_eof(p->c, (char *)pkt_buf, pkt_len, &local_err); @@ -1235,12 +1293,17 @@ static void *multifd_recv_thread(void *opaque) /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - /* - * Even if it's a SYNC packet, this needs to be set - * because older QEMUs (<9.0) still send data along with - * the SYNC packet. - */ - has_data = p->normal_num || p->zero_num; + if (is_device_state) { + has_data = p->next_packet_size > 0; + } else { + /* + * Even if it's a SYNC packet, this needs to be set + * because older QEMUs (<9.0) still send data along with + * the SYNC packet. + */ + has_data = p->normal_num || p->zero_num; + } + qemu_mutex_unlock(&p->mutex); } else { /* @@ -1269,14 +1332,29 @@ static void *multifd_recv_thread(void *opaque) } if (has_data) { - ret = multifd_recv_state->ops->recv(p, &local_err); + if (is_device_state) { + assert(use_packets); + ret = multifd_device_state_recv(p, &local_err); + } else { + ret = multifd_recv_state->ops->recv(p, &local_err); + } if (ret != 0) { break; } + } else if (is_device_state) { + error_setg(&local_err, + "multifd: received empty device state packet"); + break; } if (use_packets) { if (flags & MULTIFD_FLAG_SYNC) { + if (is_device_state) { + error_setg(&local_err, + "multifd: received SYNC device state packet"); + break; + } + qemu_sem_post(&multifd_recv_state->sem_sync); qemu_sem_wait(&p->sem_sync); } @@ -1345,6 +1423,7 @@ int multifd_recv_setup(Error **errp) p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; p->packet = g_malloc0(p->packet_len); + p->packet_dev_state = g_malloc0(sizeof(*p->packet_dev_state)); } p->name = g_strdup_printf(MIGRATION_THREAD_DST_MULTIFD, i); p->normal = g_new0(ram_addr_t, page_count); diff --git a/migration/multifd.h b/migration/multifd.h index f7156f66c0..d682c5a9b7 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -62,6 +62,12 @@ MultiFDRecvData *multifd_get_recv_data(void); #define MULTIFD_FLAG_UADK (8 << 1) #define MULTIFD_FLAG_QATZIP (16 << 1) +/* + * If set it means that this packet contains device state + * (MultiFDPacketDeviceState_t), not RAM data (MultiFDPacket_t). + */ +#define MULTIFD_FLAG_DEVICE_STATE (32 << 1) + /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) @@ -94,6 +100,16 @@ typedef struct { uint64_t offset[]; } __attribute__((packed)) MultiFDPacket_t; +typedef struct { + MultiFDPacketHdr_t hdr; + + char idstr[256]; + uint32_t instance_id; + + /* size of the next packet that contains the actual data */ + uint32_t next_packet_size; +} __attribute__((packed)) MultiFDPacketDeviceState_t; + typedef struct { /* number of used pages */ uint32_t num; @@ -227,8 +243,9 @@ typedef struct { /* thread local variables. No locking required */ - /* pointer to the packet */ + /* pointers to the possible packet types */ MultiFDPacket_t *packet; + MultiFDPacketDeviceState_t *packet_dev_state; /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets received through this channel */ From d19cc4dca0b21af95fee36a2ddad34eb4bd6b67f Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:39 +0100 Subject: [PATCH 21/42] migration/multifd: Make multifd_send() thread safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit multifd_send() function is currently not thread safe, make it thread safe by holding a lock during its execution. This way it will be possible to safely call it concurrently from multiple threads. Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/dd0f3bcc02ca96a7d523ca58ea69e495a33b453b.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- migration/multifd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/migration/multifd.c b/migration/multifd.c index 01f427d8ed..add6f86175 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -50,6 +50,10 @@ typedef struct { struct { MultiFDSendParams *params; + + /* multifd_send() body is not thread safe, needs serialization */ + QemuMutex multifd_send_mutex; + /* * Global number of generated multifd packets. * @@ -339,6 +343,8 @@ bool multifd_send(MultiFDSendData **send_data) return false; } + QEMU_LOCK_GUARD(&multifd_send_state->multifd_send_mutex); + /* We wait here, until at least one channel is ready */ qemu_sem_wait(&multifd_send_state->channels_ready); @@ -507,6 +513,7 @@ static void multifd_send_cleanup_state(void) socket_cleanup_outgoing_migration(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); + qemu_mutex_destroy(&multifd_send_state->multifd_send_mutex); g_free(multifd_send_state->params); multifd_send_state->params = NULL; g_free(multifd_send_state); @@ -887,6 +894,7 @@ bool multifd_send_setup(void) thread_count = migrate_multifd_channels(); multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); + qemu_mutex_init(&multifd_send_state->multifd_send_mutex); qemu_sem_init(&multifd_send_state->channels_created, 0); qemu_sem_init(&multifd_send_state->channels_ready, 0); qatomic_set(&multifd_send_state->exiting, 0); From 7ecfab1ddd3e6678c6a0b12d348d82cfaaa406ff Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:40 +0100 Subject: [PATCH 22/42] migration/multifd: Add an explicit MultiFDSendData destructor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This way if there are fields there that needs explicit disposal (like, for example, some attached buffers) they will be handled appropriately. Add a related assert to multifd_set_payload_type() in order to make sure that this function is only used to fill a previously empty MultiFDSendData with some payload, not the other way around. Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/6755205f2b95abbed251f87061feee1c0e410836.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- migration/multifd-nocomp.c | 3 +-- migration/multifd.c | 31 ++++++++++++++++++++++++++++--- migration/multifd.h | 5 +++++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c index 1325dba97c..e46e79d8b2 100644 --- a/migration/multifd-nocomp.c +++ b/migration/multifd-nocomp.c @@ -42,8 +42,7 @@ void multifd_ram_save_setup(void) void multifd_ram_save_cleanup(void) { - g_free(multifd_ram_send); - multifd_ram_send = NULL; + g_clear_pointer(&multifd_ram_send, multifd_send_data_free); } static void multifd_set_file_bitmap(MultiFDSendParams *p) diff --git a/migration/multifd.c b/migration/multifd.c index add6f86175..c8508cadab 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -123,6 +123,32 @@ MultiFDSendData *multifd_send_data_alloc(void) return g_malloc0(size_minus_payload + max_payload_size); } +void multifd_send_data_clear(MultiFDSendData *data) +{ + if (multifd_payload_empty(data)) { + return; + } + + switch (data->type) { + default: + /* Nothing to do */ + break; + } + + data->type = MULTIFD_PAYLOAD_NONE; +} + +void multifd_send_data_free(MultiFDSendData *data) +{ + if (!data) { + return; + } + + multifd_send_data_clear(data); + + g_free(data); +} + static bool multifd_use_packets(void) { return !migrate_mapped_ram(); @@ -496,8 +522,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) qemu_sem_destroy(&p->sem_sync); g_free(p->name); p->name = NULL; - g_free(p->data); - p->data = NULL; + g_clear_pointer(&p->data, multifd_send_data_free); p->packet_len = 0; g_free(p->packet); p->packet = NULL; @@ -695,7 +720,7 @@ static void *multifd_send_thread(void *opaque) (uint64_t)p->next_packet_size + p->packet_len); p->next_packet_size = 0; - multifd_set_payload_type(p->data, MULTIFD_PAYLOAD_NONE); + multifd_send_data_clear(p->data); /* * Making sure p->data is published before saying "we're diff --git a/migration/multifd.h b/migration/multifd.h index d682c5a9b7..8d639eec69 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -149,6 +149,9 @@ static inline bool multifd_payload_empty(MultiFDSendData *data) static inline void multifd_set_payload_type(MultiFDSendData *data, MultiFDPayloadType type) { + assert(multifd_payload_empty(data)); + assert(type != MULTIFD_PAYLOAD_NONE); + data->type = type; } @@ -365,6 +368,8 @@ static inline void multifd_send_prepare_header(MultiFDSendParams *p) void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); bool multifd_send(MultiFDSendData **send_data); MultiFDSendData *multifd_send_data_alloc(void); +void multifd_send_data_clear(MultiFDSendData *data); +void multifd_send_data_free(MultiFDSendData *data); static inline uint32_t multifd_ram_page_size(void) { From 0525b91a0b993f95d29b2ea84155e7e4366c120e Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:41 +0100 Subject: [PATCH 23/42] migration/multifd: Device state transfer support - send side MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new function multifd_queue_device_state() is provided for device to queue its state for transmission via a multifd channel. Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/ebd55768d3e5fecb5eb3f197bad9c0c07e5bc084.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/migration/misc.h | 4 ++ migration/meson.build | 1 + migration/multifd-device-state.c | 118 +++++++++++++++++++++++++++++++ migration/multifd-nocomp.c | 14 +++- migration/multifd.c | 42 +++++++++-- migration/multifd.h | 34 ++++++--- 6 files changed, 197 insertions(+), 16 deletions(-) create mode 100644 migration/multifd-device-state.c diff --git a/include/migration/misc.h b/include/migration/misc.h index 4c171f4e89..bd3b725fa0 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -118,4 +118,8 @@ bool migrate_is_uri(const char *uri); bool migrate_uri_parse(const char *uri, MigrationChannel **channel, Error **errp); +/* migration/multifd-device-state.c */ +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, + char *data, size_t len); + #endif diff --git a/migration/meson.build b/migration/meson.build index d3bfe84d62..9aa48b290e 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -25,6 +25,7 @@ system_ss.add(files( 'migration-hmp-cmds.c', 'migration.c', 'multifd.c', + 'multifd-device-state.c', 'multifd-nocomp.c', 'multifd-zlib.c', 'multifd-zero-page.c', diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c new file mode 100644 index 0000000000..e383e75b1a --- /dev/null +++ b/migration/multifd-device-state.c @@ -0,0 +1,118 @@ +/* + * Multifd device state migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/lockable.h" +#include "migration/misc.h" +#include "multifd.h" + +static struct { + QemuMutex queue_job_mutex; + + MultiFDSendData *send_data; +} *multifd_send_device_state; + +size_t multifd_device_state_payload_size(void) +{ + return sizeof(MultiFDDeviceState_t); +} + +void multifd_device_state_send_setup(void) +{ + assert(!multifd_send_device_state); + multifd_send_device_state = g_malloc(sizeof(*multifd_send_device_state)); + + qemu_mutex_init(&multifd_send_device_state->queue_job_mutex); + + multifd_send_device_state->send_data = multifd_send_data_alloc(); +} + +void multifd_device_state_send_cleanup(void) +{ + g_clear_pointer(&multifd_send_device_state->send_data, + multifd_send_data_free); + + qemu_mutex_destroy(&multifd_send_device_state->queue_job_mutex); + + g_clear_pointer(&multifd_send_device_state, g_free); +} + +void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state) +{ + g_clear_pointer(&device_state->idstr, g_free); + g_clear_pointer(&device_state->buf, g_free); +} + +static void multifd_device_state_fill_packet(MultiFDSendParams *p) +{ + MultiFDDeviceState_t *device_state = &p->data->u.device_state; + MultiFDPacketDeviceState_t *packet = p->packet_device_state; + + packet->hdr.flags = cpu_to_be32(p->flags); + strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr) - 1); + packet->idstr[sizeof(packet->idstr) - 1] = 0; + packet->instance_id = cpu_to_be32(device_state->instance_id); + packet->next_packet_size = cpu_to_be32(p->next_packet_size); +} + +static void multifd_prepare_header_device_state(MultiFDSendParams *p) +{ + p->iov[0].iov_len = sizeof(*p->packet_device_state); + p->iov[0].iov_base = p->packet_device_state; + p->iovs_num++; +} + +void multifd_device_state_send_prepare(MultiFDSendParams *p) +{ + MultiFDDeviceState_t *device_state = &p->data->u.device_state; + + assert(multifd_payload_device_state(p->data)); + + multifd_prepare_header_device_state(p); + + assert(!(p->flags & MULTIFD_FLAG_SYNC)); + + p->next_packet_size = device_state->buf_len; + if (p->next_packet_size > 0) { + p->iov[p->iovs_num].iov_base = device_state->buf; + p->iov[p->iovs_num].iov_len = p->next_packet_size; + p->iovs_num++; + } + + p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE; + + multifd_device_state_fill_packet(p); +} + +bool multifd_queue_device_state(char *idstr, uint32_t instance_id, + char *data, size_t len) +{ + /* Device state submissions can come from multiple threads */ + QEMU_LOCK_GUARD(&multifd_send_device_state->queue_job_mutex); + MultiFDDeviceState_t *device_state; + + assert(multifd_payload_empty(multifd_send_device_state->send_data)); + + multifd_set_payload_type(multifd_send_device_state->send_data, + MULTIFD_PAYLOAD_DEVICE_STATE); + device_state = &multifd_send_device_state->send_data->u.device_state; + device_state->idstr = g_strdup(idstr); + device_state->instance_id = instance_id; + device_state->buf = g_memdup2(data, len); + device_state->buf_len = len; + + if (!multifd_send(&multifd_send_device_state->send_data)) { + multifd_send_data_clear(multifd_send_device_state->send_data); + return false; + } + + return true; +} diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c index e46e79d8b2..c008046523 100644 --- a/migration/multifd-nocomp.c +++ b/migration/multifd-nocomp.c @@ -14,6 +14,7 @@ #include "exec/ramblock.h" #include "exec/target_page.h" #include "file.h" +#include "migration-stats.h" #include "multifd.h" #include "options.h" #include "qapi/error.h" @@ -85,6 +86,13 @@ static void multifd_nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) return; } +static void multifd_ram_prepare_header(MultiFDSendParams *p) +{ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + p->iovs_num++; +} + static void multifd_send_prepare_iovs(MultiFDSendParams *p) { MultiFDPages_t *pages = &p->data->u.ram; @@ -118,7 +126,7 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp) * Only !zerocopy needs the header in IOV; zerocopy will * send it separately. */ - multifd_send_prepare_header(p); + multifd_ram_prepare_header(p); } multifd_send_prepare_iovs(p); @@ -133,6 +141,8 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp) if (ret != 0) { return -1; } + + stat64_add(&mig_stats.multifd_bytes, p->packet_len); } return 0; @@ -431,7 +441,7 @@ int multifd_ram_flush_and_sync(QEMUFile *f) bool multifd_send_prepare_common(MultiFDSendParams *p) { MultiFDPages_t *pages = &p->data->u.ram; - multifd_send_prepare_header(p); + multifd_ram_prepare_header(p); multifd_send_zero_page_detect(p); if (!pages->normal_num) { diff --git a/migration/multifd.c b/migration/multifd.c index c8508cadab..3625c9a37c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" +#include "qemu/iov.h" #include "qemu/rcu.h" #include "exec/target_page.h" #include "system/system.h" @@ -19,6 +20,7 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "file.h" +#include "migration/misc.h" #include "migration.h" #include "migration-stats.h" #include "savevm.h" @@ -111,7 +113,9 @@ MultiFDSendData *multifd_send_data_alloc(void) * added to the union in the future are larger than * (MultiFDPages_t + flex array). */ - max_payload_size = MAX(multifd_ram_payload_size(), sizeof(MultiFDPayload)); + max_payload_size = MAX(multifd_ram_payload_size(), + multifd_device_state_payload_size()); + max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload)); /* * Account for any holes the compiler might insert. We can't pack @@ -130,6 +134,9 @@ void multifd_send_data_clear(MultiFDSendData *data) } switch (data->type) { + case MULTIFD_PAYLOAD_DEVICE_STATE: + multifd_send_data_clear_device_state(&data->u.device_state); + break; default: /* Nothing to do */ break; @@ -232,6 +239,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) return msg.id; } +/* Fills a RAM multifd packet */ void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; @@ -524,6 +532,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) p->name = NULL; g_clear_pointer(&p->data, multifd_send_data_free); p->packet_len = 0; + g_clear_pointer(&p->packet_device_state, g_free); g_free(p->packet); p->packet = NULL; multifd_send_state->ops->send_cleanup(p, errp); @@ -536,6 +545,7 @@ static void multifd_send_cleanup_state(void) { file_cleanup_outgoing_migration(); socket_cleanup_outgoing_migration(); + multifd_device_state_send_cleanup(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); qemu_mutex_destroy(&multifd_send_state->multifd_send_mutex); @@ -694,16 +704,32 @@ static void *multifd_send_thread(void *opaque) * qatomic_store_release() in multifd_send(). */ if (qatomic_load_acquire(&p->pending_job)) { + bool is_device_state = multifd_payload_device_state(p->data); + size_t total_size; + p->flags = 0; p->iovs_num = 0; assert(!multifd_payload_empty(p->data)); - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - break; + if (is_device_state) { + multifd_device_state_send_prepare(p); + } else { + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + break; + } } + /* + * The packet header in the zerocopy RAM case is accounted for + * in multifd_nocomp_send_prepare() - where it is actually + * being sent. + */ + total_size = iov_size(p->iov, p->iovs_num); + if (migrate_mapped_ram()) { + assert(!is_device_state); + ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num, &p->data->u.ram, &local_err); } else { @@ -716,8 +742,7 @@ static void *multifd_send_thread(void *opaque) break; } - stat64_add(&mig_stats.multifd_bytes, - (uint64_t)p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.multifd_bytes, total_size); p->next_packet_size = 0; multifd_send_data_clear(p->data); @@ -938,6 +963,9 @@ bool multifd_send_setup(void) p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; p->packet = g_malloc0(p->packet_len); + p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state)); + p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION); } p->name = g_strdup_printf(MIGRATION_THREAD_SRC_MULTIFD, i); p->write_flags = 0; @@ -973,6 +1001,8 @@ bool multifd_send_setup(void) assert(p->iov); } + multifd_device_state_send_setup(); + return true; err: diff --git a/migration/multifd.h b/migration/multifd.h index 8d639eec69..aa679d8bbe 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -127,13 +127,22 @@ struct MultiFDRecvData { off_t file_offset; }; +typedef struct { + char *idstr; + uint32_t instance_id; + char *buf; + size_t buf_len; +} MultiFDDeviceState_t; + typedef enum { MULTIFD_PAYLOAD_NONE, MULTIFD_PAYLOAD_RAM, + MULTIFD_PAYLOAD_DEVICE_STATE, } MultiFDPayloadType; typedef union MultiFDPayload { MultiFDPages_t ram; + MultiFDDeviceState_t device_state; } MultiFDPayload; struct MultiFDSendData { @@ -146,6 +155,11 @@ static inline bool multifd_payload_empty(MultiFDSendData *data) return data->type == MULTIFD_PAYLOAD_NONE; } +static inline bool multifd_payload_device_state(MultiFDSendData *data) +{ + return data->type == MULTIFD_PAYLOAD_DEVICE_STATE; +} + static inline void multifd_set_payload_type(MultiFDSendData *data, MultiFDPayloadType type) { @@ -198,8 +212,9 @@ typedef struct { /* thread local variables. No locking required */ - /* pointer to the packet */ + /* pointers to the possible packet types */ MultiFDPacket_t *packet; + MultiFDPacketDeviceState_t *packet_device_state; /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ @@ -358,13 +373,6 @@ bool multifd_send_prepare_common(MultiFDSendParams *p); void multifd_send_zero_page_detect(MultiFDSendParams *p); void multifd_recv_zero_page_process(MultiFDRecvParams *p); -static inline void multifd_send_prepare_header(MultiFDSendParams *p) -{ - p->iov[0].iov_len = p->packet_len; - p->iov[0].iov_base = p->packet; - p->iovs_num++; -} - void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); bool multifd_send(MultiFDSendData **send_data); MultiFDSendData *multifd_send_data_alloc(void); @@ -389,4 +397,14 @@ bool multifd_ram_sync_per_section(void); size_t multifd_ram_payload_size(void); void multifd_ram_fill_packet(MultiFDSendParams *p); int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); + +size_t multifd_device_state_payload_size(void); + +void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state); + +void multifd_device_state_send_setup(void); +void multifd_device_state_send_cleanup(void); + +void multifd_device_state_send_prepare(MultiFDSendParams *p); + #endif From 99fab22350e4eed1d8a238ed97c77b1a5ee77dd4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Mar 2025 23:03:42 +0100 Subject: [PATCH 24/42] migration/multifd: Make MultiFDSendData a struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The newly introduced device state buffer can be used for either storing VFIO's read() raw data, but already also possible to store generic device states. After noticing that device states may not easily provide a max buffer size (also the fact that RAM MultiFDPages_t after all also want to have flexibility on managing offset[] array), it may not be a good idea to stick with union on MultiFDSendData.. as it won't play well with such flexibility. Switch MultiFDSendData to a struct. It won't consume a lot more space in reality, after all the real buffers were already dynamically allocated, so it's so far only about the two structs (pages, device_state) that will be duplicated, but they're small. With this, we can remove the pretty hard to understand alloc size logic. Because now we can allocate offset[] together with the SendData, and properly free it when the SendData is freed. [MSS: Make sure to clear possible device state payload before freeing MultiFDSendData, remove placeholders for other patches not included] Signed-off-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Acked-by: Fabiano Rosas Link: https://lore.kernel.org/qemu-devel/7b02baba8e6ddb23ef7c349d312b9b631db09d7e.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- migration/multifd-device-state.c | 5 ----- migration/multifd-nocomp.c | 13 ++++++------- migration/multifd.c | 25 +++++++------------------ migration/multifd.h | 15 +++++++++------ 4 files changed, 22 insertions(+), 36 deletions(-) diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c index e383e75b1a..64d8ca1801 100644 --- a/migration/multifd-device-state.c +++ b/migration/multifd-device-state.c @@ -20,11 +20,6 @@ static struct { MultiFDSendData *send_data; } *multifd_send_device_state; -size_t multifd_device_state_payload_size(void) -{ - return sizeof(MultiFDDeviceState_t); -} - void multifd_device_state_send_setup(void) { assert(!multifd_send_device_state); diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c index c008046523..ffe75256c9 100644 --- a/migration/multifd-nocomp.c +++ b/migration/multifd-nocomp.c @@ -25,15 +25,14 @@ static MultiFDSendData *multifd_ram_send; -size_t multifd_ram_payload_size(void) +void multifd_ram_payload_alloc(MultiFDPages_t *pages) { - uint32_t n = multifd_ram_page_count(); + pages->offset = g_new0(ram_addr_t, multifd_ram_page_count()); +} - /* - * We keep an array of page offsets at the end of MultiFDPages_t, - * add space for it in the allocation. - */ - return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t); +void multifd_ram_payload_free(MultiFDPages_t *pages) +{ + g_clear_pointer(&pages->offset, g_free); } void multifd_ram_save_setup(void) diff --git a/migration/multifd.c b/migration/multifd.c index 3625c9a37c..dfb5189f0e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -105,26 +105,12 @@ struct { MultiFDSendData *multifd_send_data_alloc(void) { - size_t max_payload_size, size_minus_payload; + MultiFDSendData *new = g_new0(MultiFDSendData, 1); - /* - * MultiFDPages_t has a flexible array at the end, account for it - * when allocating MultiFDSendData. Use max() in case other types - * added to the union in the future are larger than - * (MultiFDPages_t + flex array). - */ - max_payload_size = MAX(multifd_ram_payload_size(), - multifd_device_state_payload_size()); - max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload)); + multifd_ram_payload_alloc(&new->u.ram); + /* Device state allocates its payload on-demand */ - /* - * Account for any holes the compiler might insert. We can't pack - * the structure because that misaligns the members and triggers - * Waddress-of-packed-member. - */ - size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload); - - return g_malloc0(size_minus_payload + max_payload_size); + return new; } void multifd_send_data_clear(MultiFDSendData *data) @@ -151,8 +137,11 @@ void multifd_send_data_free(MultiFDSendData *data) return; } + /* This also free's device state payload */ multifd_send_data_clear(data); + multifd_ram_payload_free(&data->u.ram); + g_free(data); } diff --git a/migration/multifd.h b/migration/multifd.h index aa679d8bbe..2d337e7b3b 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -115,9 +115,13 @@ typedef struct { uint32_t num; /* number of normal pages */ uint32_t normal_num; + /* + * Pointer to the ramblock. NOTE: it's caller's responsibility to make + * sure the pointer is always valid! + */ RAMBlock *block; - /* offset of each page */ - ram_addr_t offset[]; + /* offset array of each page, managed by multifd */ + ram_addr_t *offset; } MultiFDPages_t; struct MultiFDRecvData { @@ -140,7 +144,7 @@ typedef enum { MULTIFD_PAYLOAD_DEVICE_STATE, } MultiFDPayloadType; -typedef union MultiFDPayload { +typedef struct MultiFDPayload { MultiFDPages_t ram; MultiFDDeviceState_t device_state; } MultiFDPayload; @@ -394,12 +398,11 @@ void multifd_ram_save_cleanup(void); int multifd_ram_flush_and_sync(QEMUFile *f); bool multifd_ram_sync_per_round(void); bool multifd_ram_sync_per_section(void); -size_t multifd_ram_payload_size(void); +void multifd_ram_payload_alloc(MultiFDPages_t *pages); +void multifd_ram_payload_free(MultiFDPages_t *pages); void multifd_ram_fill_packet(MultiFDSendParams *p); int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp); -size_t multifd_device_state_payload_size(void); - void multifd_send_data_clear_device_state(MultiFDDeviceState_t *device_state); void multifd_device_state_send_setup(void); From a1131aa94256d1007b4267ff9e79c3b4ada6e2b9 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:43 +0100 Subject: [PATCH 25/42] migration/multifd: Add multifd_device_state_supported() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since device state transfer via multifd channels requires multifd channels with packets and is currently not compatible with multifd compression add an appropriate query function so device can learn whether it can actually make use of it. Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/1ff0d98b85f470e5a33687406e877583b8fab74e.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/migration/misc.h | 1 + migration/multifd-device-state.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/include/migration/misc.h b/include/migration/misc.h index bd3b725fa0..273ebfca62 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -121,5 +121,6 @@ bool migrate_uri_parse(const char *uri, MigrationChannel **channel, /* migration/multifd-device-state.c */ bool multifd_queue_device_state(char *idstr, uint32_t instance_id, char *data, size_t len); +bool multifd_device_state_supported(void); #endif diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c index 64d8ca1801..3097ffa310 100644 --- a/migration/multifd-device-state.c +++ b/migration/multifd-device-state.c @@ -13,6 +13,7 @@ #include "qemu/lockable.h" #include "migration/misc.h" #include "multifd.h" +#include "options.h" static struct { QemuMutex queue_job_mutex; @@ -111,3 +112,9 @@ bool multifd_queue_device_state(char *idstr, uint32_t instance_id, return true; } + +bool multifd_device_state_supported(void) +{ + return migrate_multifd() && !migrate_mapped_ram() && + migrate_multifd_compression() == MULTIFD_COMPRESSION_NONE; +} From 8305921a91a940023fe971c74eb1e06f2725ebab Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:44 +0100 Subject: [PATCH 26/42] migration: Add save_live_complete_precopy_thread handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This SaveVMHandler helps device provide its own asynchronous transmission of the remaining data at the end of a precopy phase via multifd channels, in parallel with the transfer done by save_live_complete_precopy handlers. These threads are launched only when multifd device state transfer is supported. Management of these threads in done in the multifd migration code, wrapping them in the generic thread pool. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Peter Xu Link: https://lore.kernel.org/qemu-devel/eac74a4ca7edd8968bbf72aa07b9041c76364a16.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- include/migration/misc.h | 17 ++++++ include/migration/register.h | 19 +++++++ include/qemu/typedefs.h | 3 ++ migration/multifd-device-state.c | 92 ++++++++++++++++++++++++++++++++ migration/savevm.c | 40 +++++++++++++- 5 files changed, 170 insertions(+), 1 deletion(-) diff --git a/include/migration/misc.h b/include/migration/misc.h index 273ebfca62..8fd36eba1d 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -119,8 +119,25 @@ bool migrate_uri_parse(const char *uri, MigrationChannel **channel, Error **errp); /* migration/multifd-device-state.c */ +typedef struct SaveLiveCompletePrecopyThreadData { + SaveLiveCompletePrecopyThreadHandler hdlr; + char *idstr; + uint32_t instance_id; + void *handler_opaque; +} SaveLiveCompletePrecopyThreadData; + bool multifd_queue_device_state(char *idstr, uint32_t instance_id, char *data, size_t len); bool multifd_device_state_supported(void); +void +multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr, + char *idstr, uint32_t instance_id, + void *opaque); + +bool multifd_device_state_save_thread_should_exit(void); + +void multifd_abort_device_state_save_threads(void); +bool multifd_join_device_state_save_threads(void); + #endif diff --git a/include/migration/register.h b/include/migration/register.h index 58891aa54b..c041ce32f2 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -105,6 +105,25 @@ typedef struct SaveVMHandlers { */ int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); + /** + * @save_live_complete_precopy_thread (invoked in a separate thread) + * + * Called at the end of a precopy phase from a separate worker thread + * in configurations where multifd device state transfer is supported + * in order to perform asynchronous transmission of the remaining data in + * parallel with @save_live_complete_precopy handlers. + * When postcopy is enabled, devices that support postcopy will skip this + * step. + * + * @d: a #SaveLiveCompletePrecopyThreadData containing parameters that the + * handler may need, including this device section idstr and instance_id, + * and opaque data pointer passed to register_savevm_live(). + * @errp: pointer to Error*, to store an error if it happens. + * + * Returns true to indicate success and false for errors. + */ + SaveLiveCompletePrecopyThreadHandler save_live_complete_precopy_thread; + /* This runs both outside and inside the BQL. */ /** diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index fd23ff7771..42ed4e6be1 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -108,6 +108,7 @@ typedef struct QString QString; typedef struct RAMBlock RAMBlock; typedef struct Range Range; typedef struct ReservedRegion ReservedRegion; +typedef struct SaveLiveCompletePrecopyThreadData SaveLiveCompletePrecopyThreadData; typedef struct SHPCDevice SHPCDevice; typedef struct SSIBus SSIBus; typedef struct TCGCPUOps TCGCPUOps; @@ -133,5 +134,7 @@ typedef struct IRQState *qemu_irq; typedef void (*qemu_irq_handler)(void *opaque, int n, int level); typedef bool (*MigrationLoadThread)(void *opaque, bool *should_quit, Error **errp); +typedef bool (*SaveLiveCompletePrecopyThreadHandler)(SaveLiveCompletePrecopyThreadData *d, + Error **errp); #endif /* QEMU_TYPEDEFS_H */ diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c index 3097ffa310..94222d0eb0 100644 --- a/migration/multifd-device-state.c +++ b/migration/multifd-device-state.c @@ -10,7 +10,10 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu/lockable.h" +#include "block/thread-pool.h" +#include "migration.h" #include "migration/misc.h" #include "multifd.h" #include "options.h" @@ -19,6 +22,9 @@ static struct { QemuMutex queue_job_mutex; MultiFDSendData *send_data; + + ThreadPool *threads; + bool threads_abort; } *multifd_send_device_state; void multifd_device_state_send_setup(void) @@ -29,10 +35,14 @@ void multifd_device_state_send_setup(void) qemu_mutex_init(&multifd_send_device_state->queue_job_mutex); multifd_send_device_state->send_data = multifd_send_data_alloc(); + + multifd_send_device_state->threads = thread_pool_new(); + multifd_send_device_state->threads_abort = false; } void multifd_device_state_send_cleanup(void) { + g_clear_pointer(&multifd_send_device_state->threads, thread_pool_free); g_clear_pointer(&multifd_send_device_state->send_data, multifd_send_data_free); @@ -118,3 +128,85 @@ bool multifd_device_state_supported(void) return migrate_multifd() && !migrate_mapped_ram() && migrate_multifd_compression() == MULTIFD_COMPRESSION_NONE; } + +static void multifd_device_state_save_thread_data_free(void *opaque) +{ + SaveLiveCompletePrecopyThreadData *data = opaque; + + g_clear_pointer(&data->idstr, g_free); + g_free(data); +} + +static int multifd_device_state_save_thread(void *opaque) +{ + SaveLiveCompletePrecopyThreadData *data = opaque; + g_autoptr(Error) local_err = NULL; + + if (!data->hdlr(data, &local_err)) { + MigrationState *s = migrate_get_current(); + + /* + * Can't call abort_device_state_save_threads() here since new + * save threads could still be in process of being launched + * (if, for example, the very first save thread launched exited + * with an error very quickly). + */ + + assert(local_err); + + /* + * In case of multiple save threads failing which thread error + * return we end setting is purely arbitrary. + */ + migrate_set_error(s, local_err); + } + + return 0; +} + +bool multifd_device_state_save_thread_should_exit(void) +{ + return qatomic_read(&multifd_send_device_state->threads_abort); +} + +void +multifd_spawn_device_state_save_thread(SaveLiveCompletePrecopyThreadHandler hdlr, + char *idstr, uint32_t instance_id, + void *opaque) +{ + SaveLiveCompletePrecopyThreadData *data; + + assert(multifd_device_state_supported()); + assert(multifd_send_device_state); + + assert(!qatomic_read(&multifd_send_device_state->threads_abort)); + + data = g_new(SaveLiveCompletePrecopyThreadData, 1); + data->hdlr = hdlr; + data->idstr = g_strdup(idstr); + data->instance_id = instance_id; + data->handler_opaque = opaque; + + thread_pool_submit_immediate(multifd_send_device_state->threads, + multifd_device_state_save_thread, + data, + multifd_device_state_save_thread_data_free); +} + +void multifd_abort_device_state_save_threads(void) +{ + assert(multifd_device_state_supported()); + + qatomic_set(&multifd_send_device_state->threads_abort, true); +} + +bool multifd_join_device_state_save_threads(void) +{ + MigrationState *s = migrate_get_current(); + + assert(multifd_device_state_supported()); + + thread_pool_wait(multifd_send_device_state->threads); + + return !migrate_has_error(s); +} diff --git a/migration/savevm.c b/migration/savevm.c index 1abc365570..5c4fdfd95e 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -37,6 +37,7 @@ #include "migration/register.h" #include "migration/global_state.h" #include "migration/channel-block.h" +#include "multifd.h" #include "ram.h" #include "qemu-file.h" #include "savevm.h" @@ -1527,6 +1528,24 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) int64_t start_ts_each, end_ts_each; SaveStateEntry *se; int ret; + bool multifd_device_state = multifd_device_state_supported(); + + if (multifd_device_state) { + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + SaveLiveCompletePrecopyThreadHandler hdlr; + + if (!se->ops || (in_postcopy && se->ops->has_postcopy && + se->ops->has_postcopy(se->opaque)) || + !se->ops->save_live_complete_precopy_thread) { + continue; + } + + hdlr = se->ops->save_live_complete_precopy_thread; + multifd_spawn_device_state_save_thread(hdlr, + se->idstr, se->instance_id, + se->opaque); + } + } QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (!se->ops || @@ -1552,16 +1571,35 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) save_section_footer(f, se); if (ret < 0) { qemu_file_set_error(f, ret); - return -1; + goto ret_fail_abort_threads; } end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME); trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id, end_ts_each - start_ts_each); } + if (multifd_device_state) { + if (migrate_has_error(migrate_get_current())) { + multifd_abort_device_state_save_threads(); + } + + if (!multifd_join_device_state_save_threads()) { + qemu_file_set_error(f, -EINVAL); + return -1; + } + } + trace_vmstate_downtime_checkpoint("src-iterable-saved"); return 0; + +ret_fail_abort_threads: + if (multifd_device_state) { + multifd_abort_device_state_save_threads(); + multifd_join_device_state_save_threads(); + } + + return -1; } int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, From 5963c219a0e60d3f20c09ba2d34671d5e9623e70 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:45 +0100 Subject: [PATCH 27/42] vfio/migration: Add load_device_config_state_start trace event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit And rename existing load_device_config_state trace event to load_device_config_state_end for consistency since it is triggered at the end of loading of the VFIO device config state. This way both the start and end points of particular device config loading operation (a long, BQL-serialized operation) are known. Reviewed-by: Cédric Le Goater Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/1b6c5a2097e64c272eb7e53f9e4cca4b79581b38.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration.c | 4 +++- hw/vfio/trace-events | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index adfa752db5..03890eaa48 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -285,6 +285,8 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque) VFIODevice *vbasedev = opaque; uint64_t data; + trace_vfio_load_device_config_state_start(vbasedev->name); + if (vbasedev->ops && vbasedev->ops->vfio_load_config) { int ret; @@ -303,7 +305,7 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque) return -EINVAL; } - trace_vfio_load_device_config_state(vbasedev->name); + trace_vfio_load_device_config_state_end(vbasedev->name); return qemu_file_get_error(f); } diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index c5385e1a4f..a02c668f28 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -150,7 +150,8 @@ vfio_display_edid_write_error(void) "" # migration.c vfio_load_cleanup(const char *name) " (%s)" -vfio_load_device_config_state(const char *name) " (%s)" +vfio_load_device_config_state_start(const char *name) " (%s)" +vfio_load_device_config_state_end(const char *name) " (%s)" vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d" vfio_migration_realize(const char *name) " (%s)" From bd846c5d583a63eaaf836403515c4bf3748f3bb3 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:46 +0100 Subject: [PATCH 28/42] vfio/migration: Convert bytes_transferred counter to atomic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So it can be safety accessed from multiple threads. This variable type needs to be changed to unsigned long since 32-bit host platforms lack the necessary addition atomics on 64-bit variables. Using 32-bit counters on 32-bit host platforms should not be a problem in practice since they can't realistically address more memory anyway. Reviewed-by: Cédric Le Goater Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/dc391771d2d9ad0f311994f0cb9e666da564aeaf.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 03890eaa48..5532787be6 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -55,7 +55,7 @@ */ #define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB) -static int64_t bytes_transferred; +static unsigned long bytes_transferred; static const char *mig_state_to_str(enum vfio_device_mig_state state) { @@ -391,7 +391,7 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); qemu_put_be64(f, data_size); qemu_put_buffer(f, migration->data_buffer, data_size); - bytes_transferred += data_size; + qatomic_add(&bytes_transferred, data_size); trace_vfio_save_block(migration->vbasedev->name, data_size); @@ -1013,12 +1013,12 @@ static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp) int64_t vfio_mig_bytes_transferred(void) { - return bytes_transferred; + return MIN(qatomic_read(&bytes_transferred), INT64_MAX); } void vfio_reset_bytes_transferred(void) { - bytes_transferred = 0; + qatomic_set(&bytes_transferred, 0); } /* From 47c7133629cc35b417b8f32512a4715ee53bfae3 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:47 +0100 Subject: [PATCH 29/42] vfio/migration: Add vfio_add_bytes_transferred() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This way bytes_transferred can also be incremented in other translation units than migration.c. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/d1fbc27ac2417b49892f354ba20f6c6b3f7209f8.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration.c | 7 ++++++- include/hw/vfio/vfio-common.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 5532787be6..51c056e152 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -391,7 +391,7 @@ static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration) qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); qemu_put_be64(f, data_size); qemu_put_buffer(f, migration->data_buffer, data_size); - qatomic_add(&bytes_transferred, data_size); + vfio_mig_add_bytes_transferred(data_size); trace_vfio_save_block(migration->vbasedev->name, data_size); @@ -1021,6 +1021,11 @@ void vfio_reset_bytes_transferred(void) qatomic_set(&bytes_transferred, 0); } +void vfio_mig_add_bytes_transferred(unsigned long val) +{ + qatomic_add(&bytes_transferred, val); +} + /* * Return true when either migration initialized or blocker registered. * Currently only return false when adding blocker fails which will diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index ac35136a11..5c84ebb002 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -274,6 +274,7 @@ void vfio_unblock_multiple_devices_migration(void); bool vfio_viommu_preset(VFIODevice *vbasedev); int64_t vfio_mig_bytes_transferred(void); void vfio_reset_bytes_transferred(void); +void vfio_mig_add_bytes_transferred(unsigned long val); bool vfio_device_state_is_running(VFIODevice *vbasedev); bool vfio_device_state_is_precopy(VFIODevice *vbasedev); From eb6608619a16ab949f3cf1138638d17afa45fe4d Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:48 +0100 Subject: [PATCH 30/42] vfio/migration: Move migration channel flags to vfio-common.h header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This way they can also be referenced in other translation units than migration.c. Reviewed-by: Cédric Le Goater Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/26a940f6b22c1b685818251b7a3ddbbca601b1d6.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration.c | 17 ----------------- include/hw/vfio/vfio-common.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 51c056e152..a9b0970604 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -31,23 +31,6 @@ #include "trace.h" #include "hw/hw.h" -/* - * Flags to be used as unique delimiters for VFIO devices in the migration - * stream. These flags are composed as: - * 0xffffffff => MSB 32-bit all 1s - * 0xef10 => Magic ID, represents emulated (virtual) function IO - * 0x0000 => 16-bits reserved for flags - * - * The beginning of state information is marked by _DEV_CONFIG_STATE, - * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a - * certain state information is marked by _END_OF_STATE. - */ -#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) -#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) -#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) -#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) -#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) - /* * This is an arbitrary size based on migration of mlx5 devices, where typically * total device migration size is on the order of 100s of MB. Testing with diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 5c84ebb002..bf5d520871 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -36,6 +36,23 @@ #define VFIO_MSG_PREFIX "vfio %s: " +/* + * Flags to be used as unique delimiters for VFIO devices in the migration + * stream. These flags are composed as: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => Magic ID, represents emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + * + * The beginning of state information is marked by _DEV_CONFIG_STATE, + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a + * certain state information is marked by _END_OF_STATE. + */ +#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) +#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) +#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) + enum { VFIO_DEVICE_TYPE_PCI = 0, VFIO_DEVICE_TYPE_PLATFORM = 1, From 961165122bc8a4cf58eaaab75ff09ba8fc950a88 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:49 +0100 Subject: [PATCH 31/42] vfio/migration: Multifd device state transfer support - basic types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add basic types and flags used by VFIO multifd device state transfer support. Since we'll be introducing a lot of multifd transfer specific code, add a new file migration-multifd.c to home it, wired into main VFIO migration code (migration.c) via migration-multifd.h header file. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/4eedd529e6617f80f3d6a66d7268a0db2bc173fa.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/meson.build | 1 + hw/vfio/migration-multifd.c | 33 +++++++++++++++++++++++++++++++++ hw/vfio/migration-multifd.h | 17 +++++++++++++++++ hw/vfio/migration.c | 1 + 4 files changed, 52 insertions(+) create mode 100644 hw/vfio/migration-multifd.c create mode 100644 hw/vfio/migration-multifd.h diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index bba776f75c..260d65febd 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -5,6 +5,7 @@ vfio_ss.add(files( 'container-base.c', 'container.c', 'migration.c', + 'migration-multifd.c', 'cpr.c', )) vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c new file mode 100644 index 0000000000..fa594b33fd --- /dev/null +++ b/hw/vfio/migration-multifd.c @@ -0,0 +1,33 @@ +/* + * Multifd VFIO migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/vfio/vfio-common.h" +#include "migration/misc.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/lockable.h" +#include "qemu/main-loop.h" +#include "qemu/thread.h" +#include "migration/qemu-file.h" +#include "migration-multifd.h" +#include "trace.h" + +#define VFIO_DEVICE_STATE_CONFIG_STATE (1) + +#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0) + +typedef struct VFIODeviceStatePacket { + uint32_t version; + uint32_t idx; + uint32_t flags; + uint8_t data[0]; +} QEMU_PACKED VFIODeviceStatePacket; diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h new file mode 100644 index 0000000000..5b221c6e16 --- /dev/null +++ b/hw/vfio/migration-multifd.h @@ -0,0 +1,17 @@ +/* + * Multifd VFIO migration + * + * Copyright (C) 2024,2025 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VFIO_MIGRATION_MULTIFD_H +#define HW_VFIO_MIGRATION_MULTIFD_H + +#include "hw/vfio/vfio-common.h" + +#endif diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index a9b0970604..dc1fe4e717 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -23,6 +23,7 @@ #include "migration/qemu-file.h" #include "migration/register.h" #include "migration/blocker.h" +#include "migration-multifd.h" #include "qapi/error.h" #include "qapi/qapi-events-vfio.h" #include "exec/ramlist.h" From 2efa35d34edfeca0d151de8283e52006e2c6cbd4 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:50 +0100 Subject: [PATCH 32/42] vfio/migration: Multifd device state transfer - add support checking function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add vfio_multifd_transfer_supported() function that tells whether the multifd device state transfer is supported. Reviewed-by: Cédric Le Goater Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/8ce50256f341b3d47342bb217cb5fbb2deb14639.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration-multifd.c | 6 ++++++ hw/vfio/migration-multifd.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index fa594b33fd..79fae0b629 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -31,3 +31,9 @@ typedef struct VFIODeviceStatePacket { uint32_t flags; uint8_t data[0]; } QEMU_PACKED VFIODeviceStatePacket; + +bool vfio_multifd_transfer_supported(void) +{ + return multifd_device_state_supported() && + migrate_send_switchover_start(); +} diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h index 5b221c6e16..1b60d5f67a 100644 --- a/hw/vfio/migration-multifd.h +++ b/hw/vfio/migration-multifd.h @@ -14,4 +14,6 @@ #include "hw/vfio/vfio-common.h" +bool vfio_multifd_transfer_supported(void); + #endif From ff2fd1f7e23f528f03f41b5475afb173718fea07 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:51 +0100 Subject: [PATCH 33/42] vfio/migration: Multifd setup/cleanup functions and associated VFIOMultifd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add multifd setup/cleanup functions and an associated VFIOMultifd data structure that will contain most of the receive-side data together with its init/cleanup methods. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/c0520523053b1087787152ddf2163257d3030be0.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration-multifd.c | 44 +++++++++++++++++++++++++++++++++++ hw/vfio/migration-multifd.h | 4 ++++ include/hw/vfio/vfio-common.h | 3 +++ 3 files changed, 51 insertions(+) diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index 79fae0b629..091dc43210 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -32,8 +32,52 @@ typedef struct VFIODeviceStatePacket { uint8_t data[0]; } QEMU_PACKED VFIODeviceStatePacket; +typedef struct VFIOMultifd { +} VFIOMultifd; + +static VFIOMultifd *vfio_multifd_new(void) +{ + VFIOMultifd *multifd = g_new(VFIOMultifd, 1); + + return multifd; +} + +static void vfio_multifd_free(VFIOMultifd *multifd) +{ + g_free(multifd); +} + +void vfio_multifd_cleanup(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + g_clear_pointer(&migration->multifd, vfio_multifd_free); +} + bool vfio_multifd_transfer_supported(void) { return multifd_device_state_supported() && migrate_send_switchover_start(); } + +bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev) +{ + return false; +} + +bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + /* Nothing further to check or do */ + return true; + } + + if (alloc_multifd) { + assert(!migration->multifd); + migration->multifd = vfio_multifd_new(); + } + + return true; +} diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h index 1b60d5f67a..2a7a76164f 100644 --- a/hw/vfio/migration-multifd.h +++ b/hw/vfio/migration-multifd.h @@ -14,6 +14,10 @@ #include "hw/vfio/vfio-common.h" +bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp); +void vfio_multifd_cleanup(VFIODevice *vbasedev); + bool vfio_multifd_transfer_supported(void); +bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev); #endif diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index bf5d520871..4038239069 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -78,6 +78,8 @@ typedef struct VFIORegion { uint8_t nr; /* cache the region number for debug */ } VFIORegion; +typedef struct VFIOMultifd VFIOMultifd; + typedef struct VFIOMigration { struct VFIODevice *vbasedev; VMChangeStateEntry *vm_state; @@ -89,6 +91,7 @@ typedef struct VFIOMigration { uint64_t mig_flags; uint64_t precopy_init_size; uint64_t precopy_dirty_size; + VFIOMultifd *multifd; bool initial_data_sent; bool event_save_iterate_started; From 6bcffb1cad5b2b45152c0faa1133c96d3f129914 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:52 +0100 Subject: [PATCH 34/42] vfio/migration: Setup and cleanup multifd transfer in these general methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire VFIO multifd transfer specific setup and cleanup functions into general VFIO load/save setup and cleanup methods. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/b1f864a65fafd4fdab1f89230df52e46ae41f2ac.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index dc1fe4e717..3c8286ae62 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -453,6 +453,10 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp) uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE; int ret; + if (!vfio_multifd_setup(vbasedev, false, errp)) { + return -EINVAL; + } + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); vfio_query_stop_copy_size(vbasedev, &stop_copy_size); @@ -509,6 +513,9 @@ static void vfio_save_cleanup(void *opaque) Error *local_err = NULL; int ret; + /* Currently a NOP, done for symmetry with load_cleanup() */ + vfio_multifd_cleanup(vbasedev); + /* * Changing device state from STOP_COPY to STOP can take time. Do it here, * after migration has completed, so it won't increase downtime. @@ -674,15 +681,28 @@ static void vfio_save_state(QEMUFile *f, void *opaque) static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp) { VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; - return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING, - vbasedev->migration->device_state, errp); + if (!vfio_multifd_setup(vbasedev, true, errp)) { + return -EINVAL; + } + + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING, + migration->device_state, errp); + if (ret) { + return ret; + } + + return 0; } static int vfio_load_cleanup(void *opaque) { VFIODevice *vbasedev = opaque; + vfio_multifd_cleanup(vbasedev); + vfio_migration_cleanup(vbasedev); trace_vfio_load_cleanup(vbasedev->name); From 3228d311ab1882f75b04d080d33a71fc7a0bcac5 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:53 +0100 Subject: [PATCH 35/42] vfio/migration: Multifd device state transfer support - received buffers queuing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The multifd received data needs to be reassembled since device state packets sent via different multifd channels can arrive out-of-order. Therefore, each VFIO device state packet carries a header indicating its position in the stream. The raw device state data is saved into a VFIOStateBuffer for later in-order loading into the device. The last such VFIO device state packet should have VFIO_DEVICE_STATE_CONFIG_STATE flag set and carry the device config state. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/e3bff515a8d61c582b94b409eb12a45b1a143a69.1741124640.git.maciej.szmigiero@oracle.com [ clg: - Reordered savevm_vfio_handlers - Added load_state_buffer documentation ] Signed-off-by: Cédric Le Goater --- docs/devel/migration/vfio.rst | 7 ++ hw/vfio/migration-multifd.c | 163 ++++++++++++++++++++++++++++++++++ hw/vfio/migration-multifd.h | 3 + hw/vfio/migration.c | 4 + hw/vfio/trace-events | 1 + 5 files changed, 178 insertions(+) diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index c49482eab6..8b1f28890a 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -76,6 +76,10 @@ VFIO implements the device hooks for the iterative approach as follows: * A ``load_state`` function that loads the config section and the data sections that are generated by the save functions above. +* A ``load_state_buffer`` function that loads the device state and the device + config that arrived via multifd channels. + It's used only in the multifd mode. + * ``cleanup`` functions for both save and load that perform any migration related cleanup. @@ -194,6 +198,9 @@ Live migration resume path (RESTORE_VM, _ACTIVE, _STOP) | For each device, .load_state() is called for that device section data + transmitted via the main migration channel. + For data transmitted via multifd channels .load_state_buffer() is called + instead. (RESTORE_VM, _ACTIVE, _RESUMING) | At the end, .load_cleanup() is called for each device and vCPUs are started diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index 091dc43210..79df11b7ba 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -32,18 +32,181 @@ typedef struct VFIODeviceStatePacket { uint8_t data[0]; } QEMU_PACKED VFIODeviceStatePacket; +/* type safety */ +typedef struct VFIOStateBuffers { + GArray *array; +} VFIOStateBuffers; + +typedef struct VFIOStateBuffer { + bool is_present; + char *data; + size_t len; +} VFIOStateBuffer; + typedef struct VFIOMultifd { + VFIOStateBuffers load_bufs; + QemuCond load_bufs_buffer_ready_cond; + QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ + uint32_t load_buf_idx; + uint32_t load_buf_idx_last; } VFIOMultifd; +static void vfio_state_buffer_clear(gpointer data) +{ + VFIOStateBuffer *lb = data; + + if (!lb->is_present) { + return; + } + + g_clear_pointer(&lb->data, g_free); + lb->is_present = false; +} + +static void vfio_state_buffers_init(VFIOStateBuffers *bufs) +{ + bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer)); + g_array_set_clear_func(bufs->array, vfio_state_buffer_clear); +} + +static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs) +{ + g_clear_pointer(&bufs->array, g_array_unref); +} + +static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs) +{ + assert(bufs->array); +} + +static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs) +{ + return bufs->array->len; +} + +static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs, + unsigned int size) +{ + g_array_set_size(bufs->array, size); +} + +static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, + unsigned int idx) +{ + return &g_array_index(bufs->array, VFIOStateBuffer, idx); +} + +/* called with load_bufs_mutex locked */ +static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, + VFIODeviceStatePacket *packet, + size_t packet_total_size, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIOStateBuffer *lb; + + vfio_state_buffers_assert_init(&multifd->load_bufs); + if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) { + vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1); + } + + lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx); + if (lb->is_present) { + error_setg(errp, "%s: state buffer %" PRIu32 " already filled", + vbasedev->name, packet->idx); + return false; + } + + assert(packet->idx >= multifd->load_buf_idx); + + lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet)); + lb->len = packet_total_size - sizeof(*packet); + lb->is_present = true; + + return true; +} + +bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, + Error **errp) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + error_setg(errp, + "%s: got device state packet but not doing multifd transfer", + vbasedev->name); + return false; + } + + assert(multifd); + + if (data_size < sizeof(*packet)) { + error_setg(errp, "%s: packet too short at %zu (min is %zu)", + vbasedev->name, data_size, sizeof(*packet)); + return false; + } + + if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) { + error_setg(errp, "%s: packet has unknown version %" PRIu32, + vbasedev->name, packet->version); + return false; + } + + if (packet->idx == UINT32_MAX) { + error_setg(errp, "%s: packet index is invalid", vbasedev->name); + return false; + } + + trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx); + + /* + * Holding BQL here would violate the lock order and can cause + * a deadlock once we attempt to lock load_bufs_mutex below. + */ + assert(!bql_locked()); + + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + /* config state packet should be the last one in the stream */ + if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) { + multifd->load_buf_idx_last = packet->idx; + } + + if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, + errp)) { + return false; + } + + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + } + + return true; +} + static VFIOMultifd *vfio_multifd_new(void) { VFIOMultifd *multifd = g_new(VFIOMultifd, 1); + vfio_state_buffers_init(&multifd->load_bufs); + + qemu_mutex_init(&multifd->load_bufs_mutex); + + multifd->load_buf_idx = 0; + multifd->load_buf_idx_last = UINT32_MAX; + qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); + return multifd; } static void vfio_multifd_free(VFIOMultifd *multifd) { + vfio_state_buffers_destroy(&multifd->load_bufs); + qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); + qemu_mutex_destroy(&multifd->load_bufs_mutex); + g_free(multifd); } diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h index 2a7a76164f..8c6320fcb4 100644 --- a/hw/vfio/migration-multifd.h +++ b/hw/vfio/migration-multifd.h @@ -20,4 +20,7 @@ void vfio_multifd_cleanup(VFIODevice *vbasedev); bool vfio_multifd_transfer_supported(void); bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev); +bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, + Error **errp); + #endif diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 3c8286ae62..2cdb92356e 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -802,6 +802,10 @@ static const SaveVMHandlers savevm_vfio_handlers = { .load_cleanup = vfio_load_cleanup, .load_state = vfio_load_state, .switchover_ack_needed = vfio_switchover_ack_needed, + /* + * Multifd support + */ + .load_state_buffer = vfio_multifd_load_state_buffer, }; /* ---------------------------------------------------------------------- */ diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a02c668f28..404ea079b2 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -154,6 +154,7 @@ vfio_load_device_config_state_start(const char *name) " (%s)" vfio_load_device_config_state_end(const char *name) " (%s)" vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d" +vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32 vfio_migration_realize(const char *name) " (%s)" vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s" vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s" From c59748c1ff924963a67af9efd7e1a1ee6f82d6d6 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:54 +0100 Subject: [PATCH 36/42] vfio/migration: Multifd device state transfer support - load thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a thread which loads the VFIO device state buffers that were received via multifd. Each VFIO device that has multifd device state transfer enabled has one such thread, which is created using migration core API qemu_loadvm_start_load_thread(). Since it's important to finish loading device state transferred via the main migration channel (via save_live_iterate SaveVMHandler) before starting loading the data asynchronously transferred via multifd the thread doing the actual loading of the multifd transferred data is only started from switchover_start SaveVMHandler. switchover_start handler is called when MIG_CMD_SWITCHOVER_START sub-command of QEMU_VM_COMMAND is received via the main migration channel. This sub-command is only sent after all save_live_iterate data have already been posted so it is safe to commence loading of the multifd-transferred device state upon receiving it - loading of save_live_iterate data happens synchronously in the main migration thread (much like the processing of MIG_CMD_SWITCHOVER_START) so by the time MIG_CMD_SWITCHOVER_START is processed all the proceeding data must have already been loaded. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/9abe612d775aaf42e31646796acd2363c723a57a.1741124640.git.maciej.szmigiero@oracle.com [ clg: - Reordered savevm_vfio_handlers - Added switchover_start documentation ] Signed-off-by: Cédric Le Goater --- docs/devel/migration/vfio.rst | 4 + hw/vfio/migration-multifd.c | 226 ++++++++++++++++++++++++++++++++++ hw/vfio/migration-multifd.h | 2 + hw/vfio/migration.c | 12 ++ hw/vfio/trace-events | 7 ++ 5 files changed, 251 insertions(+) diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index 8b1f28890a..d6cf60890c 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -67,6 +67,10 @@ VFIO implements the device hooks for the iterative approach as follows: * A ``switchover_ack_needed`` function that checks if the VFIO device uses "switchover-ack" migration capability when this capability is enabled. +* A ``switchover_start`` function that in the multifd mode starts a thread that + reassembles the multifd received data and loads it in-order into the device. + In the non-multifd mode this function is a NOP. + * A ``save_state`` function to save the device config space if it is present. * A ``save_live_complete_precopy`` function that sets the VFIO device in diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index 79df11b7ba..2eef27604e 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -44,8 +44,12 @@ typedef struct VFIOStateBuffer { } VFIOStateBuffer; typedef struct VFIOMultifd { + bool load_bufs_thread_running; + bool load_bufs_thread_want_exit; + VFIOStateBuffers load_bufs; QemuCond load_bufs_buffer_ready_cond; + QemuCond load_bufs_thread_finished_cond; QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ uint32_t load_buf_idx; uint32_t load_buf_idx_last; @@ -186,6 +190,178 @@ bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, return true; } +static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev, + Error **errp) +{ + error_setg(errp, "not yet there"); + return false; +} + +static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd) +{ + VFIOStateBuffer *lb; + unsigned int bufs_len; + + bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs); + if (multifd->load_buf_idx >= bufs_len) { + assert(multifd->load_buf_idx == bufs_len); + return NULL; + } + + lb = vfio_state_buffers_at(&multifd->load_bufs, + multifd->load_buf_idx); + if (!lb->is_present) { + return NULL; + } + + return lb; +} + +static bool vfio_load_state_buffer_write(VFIODevice *vbasedev, + VFIOStateBuffer *lb, + Error **errp) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + g_autofree char *buf = NULL; + char *buf_cur; + size_t buf_len; + + if (!lb->len) { + return true; + } + + trace_vfio_load_state_device_buffer_load_start(vbasedev->name, + multifd->load_buf_idx); + + /* lb might become re-allocated when we drop the lock */ + buf = g_steal_pointer(&lb->data); + buf_cur = buf; + buf_len = lb->len; + while (buf_len > 0) { + ssize_t wr_ret; + int errno_save; + + /* + * Loading data to the device takes a while, + * drop the lock during this process. + */ + qemu_mutex_unlock(&multifd->load_bufs_mutex); + wr_ret = write(migration->data_fd, buf_cur, buf_len); + errno_save = errno; + qemu_mutex_lock(&multifd->load_bufs_mutex); + + if (wr_ret < 0) { + error_setg(errp, + "%s: writing state buffer %" PRIu32 " failed: %d", + vbasedev->name, multifd->load_buf_idx, errno_save); + return false; + } + + assert(wr_ret <= buf_len); + buf_len -= wr_ret; + buf_cur += wr_ret; + } + + trace_vfio_load_state_device_buffer_load_end(vbasedev->name, + multifd->load_buf_idx); + + return true; +} + +static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd, + bool *should_quit) +{ + return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit); +} + +/* + * This thread is spawned by vfio_multifd_switchover_start() which gets + * called upon encountering the switchover point marker in main migration + * stream. + * + * It exits after either: + * * completing loading the remaining device state and device config, OR: + * * encountering some error while doing the above, OR: + * * being forcefully aborted by the migration core by it setting should_quit + * or by vfio_load_cleanup_load_bufs_thread() setting + * multifd->load_bufs_thread_want_exit. + */ +static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + bool ret = false; + + trace_vfio_load_bufs_thread_start(vbasedev->name); + + assert(multifd); + QEMU_LOCK_GUARD(&multifd->load_bufs_mutex); + + assert(multifd->load_bufs_thread_running); + + while (true) { + VFIOStateBuffer *lb; + + /* + * Always check cancellation first after the buffer_ready wait below in + * case that cond was signalled by vfio_load_cleanup_load_bufs_thread(). + */ + if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + + assert(multifd->load_buf_idx <= multifd->load_buf_idx_last); + + lb = vfio_load_state_buffer_get(multifd); + if (!lb) { + trace_vfio_load_state_device_buffer_starved(vbasedev->name, + multifd->load_buf_idx); + qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond, + &multifd->load_bufs_mutex); + continue; + } + + if (multifd->load_buf_idx == multifd->load_buf_idx_last) { + break; + } + + if (multifd->load_buf_idx == 0) { + trace_vfio_load_state_device_buffer_start(vbasedev->name); + } + + if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) { + goto thread_exit; + } + + if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) { + trace_vfio_load_state_device_buffer_end(vbasedev->name); + } + + multifd->load_buf_idx++; + } + + if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) { + goto thread_exit; + } + + ret = true; + +thread_exit: + /* + * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that + * this thread is exiting. + */ + multifd->load_bufs_thread_running = false; + qemu_cond_signal(&multifd->load_bufs_thread_finished_cond); + + trace_vfio_load_bufs_thread_end(vbasedev->name); + + return ret; +} + static VFIOMultifd *vfio_multifd_new(void) { VFIOMultifd *multifd = g_new(VFIOMultifd, 1); @@ -198,11 +374,41 @@ static VFIOMultifd *vfio_multifd_new(void) multifd->load_buf_idx_last = UINT32_MAX; qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); + multifd->load_bufs_thread_running = false; + multifd->load_bufs_thread_want_exit = false; + qemu_cond_init(&multifd->load_bufs_thread_finished_cond); + return multifd; } +/* + * Terminates vfio_load_bufs_thread by setting + * multifd->load_bufs_thread_want_exit and signalling all the conditions + * the thread could be blocked on. + * + * Waits for the thread to signal that it had finished. + */ +static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd) +{ + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + while (multifd->load_bufs_thread_running) { + multifd->load_bufs_thread_want_exit = true; + + qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + qemu_cond_wait(&multifd->load_bufs_thread_finished_cond, + &multifd->load_bufs_mutex); + } + } + bql_lock(); +} + static void vfio_multifd_free(VFIOMultifd *multifd) { + vfio_load_cleanup_load_bufs_thread(multifd); + + qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond); vfio_state_buffers_destroy(&multifd->load_bufs); qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); qemu_mutex_destroy(&multifd->load_bufs_mutex); @@ -244,3 +450,23 @@ bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) return true; } + +int vfio_multifd_switchover_start(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + + assert(multifd); + + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + assert(!multifd->load_bufs_thread_running); + multifd->load_bufs_thread_running = true; + } + bql_lock(); + + qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev); + + return 0; +} diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h index 8c6320fcb4..f0d28fcef2 100644 --- a/hw/vfio/migration-multifd.h +++ b/hw/vfio/migration-multifd.h @@ -23,4 +23,6 @@ bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev); bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, Error **errp); +int vfio_multifd_switchover_start(VFIODevice *vbasedev); + #endif diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 2cdb92356e..815ad8fc93 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -788,6 +788,17 @@ static bool vfio_switchover_ack_needed(void *opaque) return vfio_precopy_supported(vbasedev); } +static int vfio_switchover_start(void *opaque) +{ + VFIODevice *vbasedev = opaque; + + if (vfio_multifd_transfer_enabled(vbasedev)) { + return vfio_multifd_switchover_start(vbasedev); + } + + return 0; +} + static const SaveVMHandlers savevm_vfio_handlers = { .save_prepare = vfio_save_prepare, .save_setup = vfio_save_setup, @@ -806,6 +817,7 @@ static const SaveVMHandlers savevm_vfio_handlers = { * Multifd support */ .load_state_buffer = vfio_multifd_load_state_buffer, + .switchover_start = vfio_switchover_start, }; /* ---------------------------------------------------------------------- */ diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 404ea079b2..d6b7e34faa 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -149,12 +149,19 @@ vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u" vfio_display_edid_write_error(void) "" # migration.c +vfio_load_bufs_thread_start(const char *name) " (%s)" +vfio_load_bufs_thread_end(const char *name) " (%s)" vfio_load_cleanup(const char *name) " (%s)" vfio_load_device_config_state_start(const char *name) " (%s)" vfio_load_device_config_state_end(const char *name) " (%s)" vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size %"PRIu64" ret %d" vfio_load_state_device_buffer_incoming(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_start(const char *name) " (%s)" +vfio_load_state_device_buffer_starved(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_load_start(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_load_end(const char *name, uint32_t idx) " (%s) idx %"PRIu32 +vfio_load_state_device_buffer_end(const char *name) " (%s)" vfio_migration_realize(const char *name) " (%s)" vfio_migration_set_device_state(const char *name, const char *state) " (%s) state %s" vfio_migration_set_state(const char *name, const char *new_state, const char *recover_state) " (%s) new state %s, recover state %s" From fda70ed83d54abad6bf2d437a7c05204b0fad228 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:55 +0100 Subject: [PATCH 37/42] migration/qemu-file: Define g_autoptr() cleanup function for QEMUFile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Automatic memory management helps avoid memory safety issues. Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/2fd01d773a783d572dcf538a064a98cc09e75c12.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- migration/qemu-file.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 3e47a20621..f5b9f430e0 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -33,6 +33,8 @@ QEMUFile *qemu_file_new_input(QIOChannel *ioc); QEMUFile *qemu_file_new_output(QIOChannel *ioc); int qemu_fclose(QEMUFile *f); +G_DEFINE_AUTOPTR_CLEANUP_FUNC(QEMUFile, qemu_fclose) + /* * qemu_file_transferred: * From b659c07c534490369ca0954f0116b05c4a063065 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:56 +0100 Subject: [PATCH 38/42] vfio/migration: Multifd device state transfer support - config loading support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Load device config received via multifd using the existing machinery behind vfio_load_device_config_state(). Also, make sure to process the relevant main migration channel flags. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/5dbd3f3703ec1097da2cf82a7262233452146fee.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration-multifd.c | 49 +++++++++++++++++++++++++++++++++-- hw/vfio/migration.c | 9 ++++++- include/hw/vfio/vfio-common.h | 2 ++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index 2eef27604e..1d81233c75 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -17,6 +17,7 @@ #include "qemu/lockable.h" #include "qemu/main-loop.h" #include "qemu/thread.h" +#include "io/channel-buffer.h" #include "migration/qemu-file.h" #include "migration-multifd.h" #include "trace.h" @@ -193,8 +194,52 @@ bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev, Error **errp) { - error_setg(errp, "not yet there"); - return false; + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + VFIOStateBuffer *lb; + g_autoptr(QIOChannelBuffer) bioc = NULL; + g_autoptr(QEMUFile) f_out = NULL, f_in = NULL; + uint64_t mig_header; + int ret; + + assert(multifd->load_buf_idx == multifd->load_buf_idx_last); + lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx); + assert(lb->is_present); + + bioc = qio_channel_buffer_new(lb->len); + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load"); + + f_out = qemu_file_new_output(QIO_CHANNEL(bioc)); + qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len); + + ret = qemu_fflush(f_out); + if (ret) { + error_setg(errp, "%s: load config state flush failed: %d", + vbasedev->name, ret); + return false; + } + + qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL); + f_in = qemu_file_new_input(QIO_CHANNEL(bioc)); + + mig_header = qemu_get_be64(f_in); + if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) { + error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64, + vbasedev->name, mig_header); + return false; + } + + bql_lock(); + ret = vfio_load_device_config_state(f_in, vbasedev); + bql_unlock(); + + if (ret < 0) { + error_setg(errp, "%s: vfio_load_device_config_state() failed: %d", + vbasedev->name, ret); + return false; + } + + return true; } static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 815ad8fc93..2ca3fa08d4 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -264,7 +264,7 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque, return ret; } -static int vfio_load_device_config_state(QEMUFile *f, void *opaque) +int vfio_load_device_config_state(QEMUFile *f, void *opaque) { VFIODevice *vbasedev = opaque; uint64_t data; @@ -723,6 +723,13 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) switch (data) { case VFIO_MIG_FLAG_DEV_CONFIG_STATE: { + if (vfio_multifd_transfer_enabled(vbasedev)) { + error_report("%s: got DEV_CONFIG_STATE in main migration " + "channel but doing multifd transfer", + vbasedev->name); + return -EINVAL; + } + return vfio_load_device_config_state(f, opaque); } case VFIO_MIG_FLAG_DEV_SETUP_STATE: diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 4038239069..9d72ac1eae 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -298,6 +298,8 @@ void vfio_mig_add_bytes_transferred(unsigned long val); bool vfio_device_state_is_running(VFIODevice *vbasedev); bool vfio_device_state_is_precopy(VFIODevice *vbasedev); +int vfio_load_device_config_state(QEMUFile *f, void *opaque); + #ifdef CONFIG_LINUX int vfio_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info); From 6d644baef20303fa4b2b342f556e26c2262b439f Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:57 +0100 Subject: [PATCH 39/42] vfio/migration: Multifd device state transfer support - send side MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the multifd device state transfer via additional per-device thread inside save_live_complete_precopy_thread handler. Switch between doing the data transfer in the new handler and doing it in the old save_state handler depending if VFIO multifd transfer is enabled or not. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/4d727e2e0435e0022d50004e474077632830e08d.1741124640.git.maciej.szmigiero@oracle.com [ clg: - Reordered savevm_vfio_handlers - Updated save_live_complete_precopy* documentation ] Signed-off-by: Cédric Le Goater --- docs/devel/migration/vfio.rst | 19 ++++- hw/vfio/migration-multifd.c | 142 ++++++++++++++++++++++++++++++++++ hw/vfio/migration-multifd.h | 6 ++ hw/vfio/migration.c | 22 ++++-- hw/vfio/trace-events | 2 + include/hw/vfio/vfio-common.h | 6 ++ 6 files changed, 189 insertions(+), 8 deletions(-) diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index d6cf60890c..a803a09bc1 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -71,11 +71,23 @@ VFIO implements the device hooks for the iterative approach as follows: reassembles the multifd received data and loads it in-order into the device. In the non-multifd mode this function is a NOP. -* A ``save_state`` function to save the device config space if it is present. +* A ``save_state`` function to save the device config space if it is present + in the non-multifd mode. + In the multifd mode it just emits either a dummy EOS marker. * A ``save_live_complete_precopy`` function that sets the VFIO device in _STOP_COPY state and iteratively copies the data for the VFIO device until the vendor driver indicates that no data remains. + In the multifd mode it just emits a dummy EOS marker. + +* A ``save_live_complete_precopy_thread`` function that in the multifd mode + provides thread handler performing multifd device state transfer. + It sets the VFIO device to _STOP_COPY state, iteratively reads the data + from the VFIO device and queues it for multifd transmission until the vendor + driver indicates that no data remains. + After that, it saves the device config space and queues it for multifd + transfer too. + In the non-multifd mode this thread is a NOP. * A ``load_state`` function that loads the config section and the data sections that are generated by the save functions above. @@ -184,8 +196,11 @@ Live migration save path Then the VFIO device is put in _STOP_COPY state (FINISH_MIGRATE, _ACTIVE, _STOP_COPY) .save_live_complete_precopy() is called for each active device - For the VFIO device, iterate in .save_live_complete_precopy() until + For the VFIO device: in the non-multifd mode iterate in + .save_live_complete_precopy() until pending data is 0 + In the multifd mode this iteration is done in + .save_live_complete_precopy_thread() instead. | (POSTMIGRATE, _COMPLETED, _STOP_COPY) Migraton thread schedules cleanup bottom half and exits diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index 1d81233c75..bfb9a72fa4 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -496,6 +496,148 @@ bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) return true; } +void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f) +{ + assert(vfio_multifd_transfer_enabled(vbasedev)); + + /* + * Emit dummy NOP data on the main migration channel since the actual + * device state transfer is done via multifd channels. + */ + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); +} + +static bool +vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, + char *idstr, + uint32_t instance_id, + uint32_t idx, + Error **errp) +{ + g_autoptr(QIOChannelBuffer) bioc = NULL; + g_autoptr(QEMUFile) f = NULL; + int ret; + g_autofree VFIODeviceStatePacket *packet = NULL; + size_t packet_len; + + bioc = qio_channel_buffer_new(0); + qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); + + f = qemu_file_new_output(QIO_CHANNEL(bioc)); + + if (vfio_save_device_config_state(f, vbasedev, errp)) { + return false; + } + + ret = qemu_fflush(f); + if (ret) { + error_setg(errp, "%s: save config state flush failed: %d", + vbasedev->name, ret); + return false; + } + + packet_len = sizeof(*packet) + bioc->usage; + packet = g_malloc0(packet_len); + packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; + packet->idx = idx; + packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; + memcpy(&packet->data, bioc->data, bioc->usage); + + if (!multifd_queue_device_state(idstr, instance_id, + (char *)packet, packet_len)) { + error_setg(errp, "%s: multifd config data queuing failed", + vbasedev->name); + return false; + } + + vfio_mig_add_bytes_transferred(packet_len); + + return true; +} + +/* + * This thread is spawned by the migration core directly via + * .save_live_complete_precopy_thread SaveVMHandler. + * + * It exits after either: + * * completing saving the remaining device state and device config, OR: + * * encountering some error while doing the above, OR: + * * being forcefully aborted by the migration core by + * multifd_device_state_save_thread_should_exit() returning true. + */ +bool +vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, + Error **errp) +{ + VFIODevice *vbasedev = d->handler_opaque; + VFIOMigration *migration = vbasedev->migration; + bool ret = false; + g_autofree VFIODeviceStatePacket *packet = NULL; + uint32_t idx; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + /* Nothing to do, vfio_save_complete_precopy() does the transfer. */ + return true; + } + + trace_vfio_save_complete_precopy_thread_start(vbasedev->name, + d->idstr, d->instance_id); + + /* We reach here with device state STOP or STOP_COPY only */ + if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, + VFIO_DEVICE_STATE_STOP, errp)) { + goto thread_exit; + } + + packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); + packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; + + for (idx = 0; ; idx++) { + ssize_t data_size; + size_t packet_size; + + if (multifd_device_state_save_thread_should_exit()) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + + data_size = read(migration->data_fd, &packet->data, + migration->data_buffer_size); + if (data_size < 0) { + error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d", + vbasedev->name, idx, errno); + goto thread_exit; + } else if (data_size == 0) { + break; + } + + packet->idx = idx; + packet_size = sizeof(*packet) + data_size; + + if (!multifd_queue_device_state(d->idstr, d->instance_id, + (char *)packet, packet_size)) { + error_setg(errp, "%s: multifd data queuing failed", vbasedev->name); + goto thread_exit; + } + + vfio_mig_add_bytes_transferred(packet_size); + } + + if (!vfio_save_complete_precopy_thread_config_state(vbasedev, + d->idstr, + d->instance_id, + idx, errp)) { + goto thread_exit; + } + + ret = true; + +thread_exit: + trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret); + + return ret; +} + int vfio_multifd_switchover_start(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h index f0d28fcef2..a664051eb8 100644 --- a/hw/vfio/migration-multifd.h +++ b/hw/vfio/migration-multifd.h @@ -23,6 +23,12 @@ bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev); bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, Error **errp); +void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f); + +bool +vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, + Error **errp); + int vfio_multifd_switchover_start(VFIODevice *vbasedev); #endif diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 2ca3fa08d4..416643ddd6 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -120,10 +120,10 @@ static void vfio_migration_set_device_state(VFIODevice *vbasedev, vfio_migration_send_event(vbasedev); } -static int vfio_migration_set_state(VFIODevice *vbasedev, - enum vfio_device_mig_state new_state, - enum vfio_device_mig_state recover_state, - Error **errp) +int vfio_migration_set_state(VFIODevice *vbasedev, + enum vfio_device_mig_state new_state, + enum vfio_device_mig_state recover_state, + Error **errp) { VFIOMigration *migration = vbasedev->migration; uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) + @@ -238,8 +238,7 @@ static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev, return ret; } -static int vfio_save_device_config_state(QEMUFile *f, void *opaque, - Error **errp) +int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp) { VFIODevice *vbasedev = opaque; int ret; @@ -638,6 +637,11 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) int ret; Error *local_err = NULL; + if (vfio_multifd_transfer_enabled(vbasedev)) { + vfio_multifd_emit_dummy_eos(vbasedev, f); + return 0; + } + trace_vfio_save_complete_precopy_start(vbasedev->name); /* We reach here with device state STOP or STOP_COPY only */ @@ -669,6 +673,11 @@ static void vfio_save_state(QEMUFile *f, void *opaque) Error *local_err = NULL; int ret; + if (vfio_multifd_transfer_enabled(vbasedev)) { + vfio_multifd_emit_dummy_eos(vbasedev, f); + return; + } + ret = vfio_save_device_config_state(f, opaque, &local_err); if (ret) { error_prepend(&local_err, @@ -825,6 +834,7 @@ static const SaveVMHandlers savevm_vfio_handlers = { */ .load_state_buffer = vfio_multifd_load_state_buffer, .switchover_start = vfio_switchover_start, + .save_live_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread, }; /* ---------------------------------------------------------------------- */ diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index d6b7e34faa..9347e3a5f6 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -171,6 +171,8 @@ vfio_save_block_precopy_empty_hit(const char *name) " (%s)" vfio_save_cleanup(const char *name) " (%s)" vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d" vfio_save_complete_precopy_start(const char *name) " (%s)" +vfio_save_complete_precopy_thread_start(const char *name, const char *idstr, uint32_t instance_id) " (%s) idstr %s instance %"PRIu32 +vfio_save_complete_precopy_thread_end(const char *name, int ret) " (%s) ret %d" vfio_save_device_config_state(const char *name) " (%s)" vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size %"PRIu64" precopy dirty size %"PRIu64 vfio_save_iterate_start(const char *name) " (%s)" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9d72ac1eae..961931d9f4 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -298,6 +298,7 @@ void vfio_mig_add_bytes_transferred(unsigned long val); bool vfio_device_state_is_running(VFIODevice *vbasedev); bool vfio_device_state_is_precopy(VFIODevice *vbasedev); +int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp); int vfio_load_device_config_state(QEMUFile *f, void *opaque); #ifdef CONFIG_LINUX @@ -314,6 +315,11 @@ struct vfio_info_cap_header * vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); struct vfio_info_cap_header * vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id); + +int vfio_migration_set_state(VFIODevice *vbasedev, + enum vfio_device_mig_state new_state, + enum vfio_device_mig_state recover_state, + Error **errp); #endif bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp); From 623af41dd331d1a57a41bc3374e3d134adb33f4c Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:58 +0100 Subject: [PATCH 40/42] vfio/migration: Add x-migration-multifd-transfer VFIO property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This property allows configuring whether to transfer the particular device state via multifd channels when live migrating that device. It defaults to AUTO, which means that VFIO device state transfer via multifd channels is attempted in configurations that otherwise support it. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/d6dbb326e3d53c7104d62c96c9e3dd64e1c7b940.1741124640.git.maciej.szmigiero@oracle.com [ clg: Added documentation ] Signed-off-by: Cédric Le Goater --- docs/devel/migration/vfio.rst | 15 +++++++++++++++ hw/vfio/migration-multifd.c | 18 +++++++++++++++++- hw/vfio/pci.c | 7 +++++++ include/hw/vfio/vfio-common.h | 2 ++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index a803a09bc1..673e354754 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -232,3 +232,18 @@ Postcopy ======== Postcopy migration is currently not supported for VFIO devices. + +Multifd +======= + +Starting from QEMU version 10.0 there's a possibility to transfer VFIO device +_STOP_COPY state via multifd channels. This helps reduce downtime - especially +with multiple VFIO devices or with devices having a large migration state. +As an additional benefit, setting the VFIO device to _STOP_COPY state and +saving its config space is also parallelized (run in a separate thread) in +such migration mode. + +The multifd VFIO device state transfer is controlled by +"x-migration-multifd-transfer" VFIO device property. This property defaults to +AUTO, which means that VFIO device state transfer via multifd channels is +attempted in configurations that otherwise support it. diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index bfb9a72fa4..aacddc503b 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -476,18 +476,34 @@ bool vfio_multifd_transfer_supported(void) bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev) { - return false; + VFIOMigration *migration = vbasedev->migration; + + return migration->multifd_transfer; } bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) { VFIOMigration *migration = vbasedev->migration; + if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) { + migration->multifd_transfer = vfio_multifd_transfer_supported(); + } else { + migration->multifd_transfer = + vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON; + } + if (!vfio_multifd_transfer_enabled(vbasedev)) { /* Nothing further to check or do */ return true; } + if (!vfio_multifd_transfer_supported()) { + error_setg(errp, + "%s: Multifd device transfer requested but unsupported in the current config", + vbasedev->name); + return false; + } + if (alloc_multifd) { assert(!migration->multifd); migration->multifd = vfio_multifd_new(); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index c1cee280ae..1bbf15cea3 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3381,6 +3381,9 @@ static const Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, vbasedev.enable_migration, ON_OFF_AUTO_AUTO), + DEFINE_PROP_ON_OFF_AUTO("x-migration-multifd-transfer", VFIOPCIDevice, + vbasedev.migration_multifd_transfer, + ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, vbasedev.migration_events, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), @@ -3553,6 +3556,10 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) "Skip config space check for Vendor Specific Capability. " "Setting to false will enforce strict checking of VSC content " "(DEBUG)"); + object_class_property_set_description(klass, /* 10.0 */ + "x-migration-multifd-transfer", + "Transfer this device state via " + "multifd channels when live migrating it"); } static const TypeInfo vfio_pci_dev_info = { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 961931d9f4..04b123a6c9 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -91,6 +91,7 @@ typedef struct VFIOMigration { uint64_t mig_flags; uint64_t precopy_init_size; uint64_t precopy_dirty_size; + bool multifd_transfer; VFIOMultifd *multifd; bool initial_data_sent; @@ -153,6 +154,7 @@ typedef struct VFIODevice { bool no_mmap; bool ram_block_discard_allowed; OnOffAuto enable_migration; + OnOffAuto migration_multifd_transfer; bool migration_events; VFIODeviceOps *ops; unsigned int num_irqs; From 4c765ceaace4e7828e2790d8f4829f69989888de Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:03:59 +0100 Subject: [PATCH 41/42] vfio/migration: Make x-migration-multifd-transfer VFIO property mutable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DEFINE_PROP_ON_OFF_AUTO() property isn't runtime-mutable so using it would mean that the source VM would need to decide upfront at startup time whether it wants to do a multifd device state transfer at some point. Source VM can run for a long time before being migrated so it is desirable to have a fallback mechanism to the old way of transferring VFIO device state if it turns to be necessary. This brings this property to the same mutability level as ordinary migration parameters, which too can be adjusted at the run time. Signed-off-by: Maciej S. Szmigiero Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/f2f2d66bda477da3e6cb8c0311006cff36e8651d.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration-multifd.c | 4 ++++ hw/vfio/pci.c | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index aacddc503b..233724710b 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -485,6 +485,10 @@ bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) { VFIOMigration *migration = vbasedev->migration; + /* + * Make a copy of this setting at the start in case it is changed + * mid-migration. + */ if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) { migration->multifd_transfer = vfio_multifd_transfer_supported(); } else { diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 1bbf15cea3..fdbc15885d 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3357,6 +3357,8 @@ static void vfio_instance_init(Object *obj) pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; } +static PropertyInfo vfio_pci_migration_multifd_transfer_prop; + static const Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), @@ -3381,9 +3383,10 @@ static const Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, vbasedev.enable_migration, ON_OFF_AUTO_AUTO), - DEFINE_PROP_ON_OFF_AUTO("x-migration-multifd-transfer", VFIOPCIDevice, - vbasedev.migration_multifd_transfer, - ON_OFF_AUTO_AUTO), + DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice, + vbasedev.migration_multifd_transfer, + vfio_pci_migration_multifd_transfer_prop, OnOffAuto, + .set_default = true, .defval.i = ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, vbasedev.migration_events, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), @@ -3608,6 +3611,17 @@ static const TypeInfo vfio_pci_nohotplug_dev_info = { static void register_vfio_pci_dev_type(void) { + /* + * Ordinary ON_OFF_AUTO property isn't runtime-mutable, but source VM can + * run for a long time before being migrated so it is desirable to have a + * fallback mechanism to the old way of transferring VFIO device state if + * it turns to be necessary. + * The following makes this type of property have the same mutability level + * as ordinary migration parameters. + */ + vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto; + vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true; + type_register_static(&vfio_pci_dev_info); type_register_static(&vfio_pci_nohotplug_dev_info); } From 59a67e70950bcc2002d3a8d22a17743e0f70da96 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 4 Mar 2025 23:04:00 +0100 Subject: [PATCH 42/42] hw/core/machine: Add compat for x-migration-multifd-transfer VFIO property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a hw_compat entry for recently added x-migration-multifd-transfer VFIO property. Reviewed-by: Cédric Le Goater Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/92c354f0457c152d1f267cc258c6967fff551cb1.1741124640.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/core/machine.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/core/machine.c b/hw/core/machine.c index d1ddc3a3db..f52a4f2273 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -45,6 +45,7 @@ GlobalProperty hw_compat_9_2[] = { { "virtio-mem-pci", "vectors", "0" }, { "migration", "multifd-clean-tls-termination", "false" }, { "migration", "send-switchover-start", "off"}, + { "vfio-pci", "x-migration-multifd-transfer", "off" }, }; const size_t hw_compat_9_2_len = G_N_ELEMENTS(hw_compat_9_2);