diff --git a/MAINTAINERS b/MAINTAINERS
index e145017d53..1911949526 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2221,12 +2221,16 @@ M: Michael S. Tsirkin
 R: Stefano Garzarella
 S: Supported
 F: hw/*/*vhost*
-F: docs/interop/vhost-user.json
-F: docs/interop/vhost-user.rst
+F: docs/interop/vhost-user*
+F: docs/system/devices/vhost-user*
 F: contrib/vhost-user-*/
-F: backends/vhost-user.c
+F: backends/*vhost*
 F: include/system/vhost-user-backend.h
+F: include/hw/virtio/vhost*
+F: include/*/vhost*
 F: subprojects/libvhost-user/
+F: block/export/vhost-user*
+F: util/vhost-user-server.c
 
 vhost-shadow-virtqueue
 R: Eugenio PĂ©rez
@@ -2255,6 +2259,7 @@ F: include/hw/virtio/virtio-balloon.h
 F: system/balloon.c
 F: include/system/balloon.h
 F: tests/qtest/virtio-balloon-test.c
+F: tests/functional/test_virtio_balloon.py
 
 virtio-9p
 M: Christian Schoenebeck
diff --git a/backends/cryptodev-vhost.c b/backends/cryptodev-vhost.c
index 8718c97326..943680a23a 100644
--- a/backends/cryptodev-vhost.c
+++ b/backends/cryptodev-vhost.c
@@ -53,7 +53,7 @@ cryptodev_vhost_init(
     CryptoDevBackendVhost *crypto;
     Error *local_err = NULL;
 
-    crypto = g_new(CryptoDevBackendVhost, 1);
+    crypto = g_new0(CryptoDevBackendVhost, 1);
     crypto->dev.max_queues = 1;
     crypto->dev.nvqs = 1;
     crypto->dev.vqs = crypto->vqs;
diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 4c86620f6d..abadf8de27 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -277,13 +277,6 @@ deprecated; use the new name ``dtb-randomness`` instead. The new name better
 reflects the way this property affects all random data within the device
 tree blob, not just the ``kaslr-seed`` node.
 
-``pc-i440fx-2.4`` up to ``pc-i440fx-2.12`` (since 9.1)
-''''''''''''''''''''''''''''''''''''''''''''''''''''''
-
-These old machine types are quite neglected nowadays and thus might have
-various pitfalls with regards to live migration. Use a newer machine type
-instead.
-
 PPC 405 ``ref405ep`` machine (since 9.1)
 ''''''''''''''''''''''''''''''''''''''''
 
diff --git a/docs/about/removed-features.rst b/docs/about/removed-features.rst
index c6616ce05e..156c0c253c 100644
--- a/docs/about/removed-features.rst
+++ b/docs/about/removed-features.rst
@@ -972,6 +972,11 @@ from Linux in 2021, and is not supported anymore by QEMU either.
 System emulator machines
 ------------------------
 
+Note: Versioned machine types that were introduced in a QEMU version
+whose initial release was more than 6 years ago are considered
+obsolete and will be removed without further notice in this document.
+Please use newer machine types instead.
+
 ``s390-virtio`` (removed in 2.6)
 ''''''''''''''''''''''''''''''''
 
@@ -1006,12 +1011,6 @@ mips ``fulong2e`` machine alias (removed in 6.0)
 
 This machine has been renamed ``fuloong2e``.
 
-``pc-0.10`` up to ``pc-i440fx-2.3`` (removed in 4.0 up to 9.0)
-''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
-
-These machine types were very old and likely could not be used for live
-migration from old QEMU versions anymore. Use a newer machine type instead.
-
 Raspberry Pi ``raspi2`` and ``raspi3`` machines (removed in 6.2)
 ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
 
diff --git a/docs/devel/reset.rst b/docs/devel/reset.rst
index adefd59ef9..0b8b2fa5f4 100644
--- a/docs/devel/reset.rst
+++ b/docs/devel/reset.rst
@@ -143,6 +143,11 @@ The *exit* phase is executed only when the last reset operation ends.
 Therefore the object does not need to care how many of reset controllers it has
 and how many of them have started a reset.
 
+DMA-capable devices are expected to cancel all outstanding DMA operations
+during either the 'enter' or 'hold' phase. IOMMUs are expected to reset
+during the 'exit' phase; this sequencing makes sure no outstanding DMA
+request will fault.
+
 Handling reset in a resettable object
 -------------------------------------
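As background for the smmu and intel-iommu conversions below: under QEMU's
three-phase Resettable API, moving an IOMMU's state re-initialization into the
'exit' phase is a small class_init change. A minimal sketch, with hypothetical
MyIOMMUState/MyIOMMUClass/my_iommu_init_regs() names (only the phase plumbing
mirrors the real API):

    /* Reset late, in 'exit': DMA was already quiesced in 'enter'/'hold' */
    static void my_iommu_reset_exit(Object *obj, ResetType type)
    {
        MyIOMMUState *s = MY_IOMMU(obj);

        /*
         * Every DMA-capable device has cancelled its outstanding
         * transactions by now, so rebuilding the translation state here
         * cannot make an in-flight request fault.
         */
        my_iommu_init_regs(s);
    }

    static void my_iommu_class_init(ObjectClass *klass, void *data)
    {
        ResettableClass *rc = RESETTABLE_CLASS(klass);
        MyIOMMUClass *c = MY_IOMMU_CLASS(klass);

        /* enter = NULL, hold = NULL, exit = my_iommu_reset_exit */
        resettable_class_set_parent_phases(rc, NULL, NULL,
                                           my_iommu_reset_exit,
                                           &c->parent_phases);
    }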
diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt
index a47aad0bfa..ab2142807f 100644
--- a/docs/pcie_sriov.txt
+++ b/docs/pcie_sriov.txt
@@ -52,9 +52,11 @@ setting up a BAR for a VF.
    ...
    /* Add and initialize the SR/IOV capability */
-   pcie_sriov_pf_init(d, 0x200, "your_virtual_dev",
-                      vf_devid, initial_vfs, total_vfs,
-                      fun_offset, stride);
+   if (!pcie_sriov_pf_init(d, 0x200, "your_virtual_dev",
+                           vf_devid, initial_vfs, total_vfs,
+                           fun_offset, stride, errp)) {
+       return;
+   }
 
    /* Set up individual VF BARs (parameters as for normal BARs) */
    pcie_sriov_pf_init_vf_bar( ... )
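The updated document shows the new calling convention: pcie_sriov_pf_init()
can now fail (for instance when the VF devfn range would overflow), so a PF's
realize must check the result and unwind anything set up earlier. A hedged
sketch of a realize function following this pattern — all MY_* constants and
my_cleanup_msix() are hypothetical:

    static void my_pf_realize(PCIDevice *d, Error **errp)
    {
        /* ... MSI-X and PF BARs are assumed to be set up above ... */

        if (!pcie_sriov_pf_init(d, 0x200, "my_virtual_dev",
                                MY_VF_DEV_ID, MY_INITIAL_VFS, MY_TOTAL_VFS,
                                MY_VF_OFFSET, MY_VF_STRIDE, errp)) {
            /* Unwind whatever realize did before this point */
            my_cleanup_msix(d);            /* hypothetical helper */
            return;
        }

        /*
         * VF BARs must still be sized; the VFs now exist but stay
         * disabled until the guest sets PCI_SRIOV_CTRL_VFE.
         */
        pcie_sriov_pf_init_vf_bar(d, 0, PCI_BASE_ADDRESS_MEM_TYPE_64,
                                  MY_VF_BAR_SIZE);
    }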
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index dd74c2e558..8c1b407b82 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -924,7 +924,12 @@ static void smmu_base_realize(DeviceState *dev, Error **errp)
     }
 }
 
-static void smmu_base_reset_hold(Object *obj, ResetType type)
+/*
+ * Make sure the IOMMU is reset in 'exit' phase after
+ * all outstanding DMA requests have been quiesced during
+ * the 'enter' or 'hold' reset phases
+ */
+static void smmu_base_reset_exit(Object *obj, ResetType type)
 {
     SMMUState *s = ARM_SMMU(obj);
 
@@ -949,7 +954,7 @@ static void smmu_base_class_init(ObjectClass *klass, void *data)
     device_class_set_props(dc, smmu_dev_properties);
     device_class_set_parent_realize(dc, smmu_base_realize,
                                     &sbc->parent_realize);
-    rc->phases.hold = smmu_base_reset_hold;
+    rc->phases.exit = smmu_base_reset_exit;
 }
 
 static const TypeInfo smmu_base_info = {
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index c0cf5df0f6..b49a59b64c 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1870,13 +1870,19 @@ static void smmu_init_irq(SMMUv3State *s, SysBusDevice *dev)
     }
 }
 
-static void smmu_reset_hold(Object *obj, ResetType type)
+/*
+ * Make sure the IOMMU is reset in 'exit' phase after
+ * all outstanding DMA requests have been quiesced during
+ * the 'enter' or 'hold' reset phases
+ */
+static void smmu_reset_exit(Object *obj, ResetType type)
 {
     SMMUv3State *s = ARM_SMMUV3(obj);
     SMMUv3Class *c = ARM_SMMUV3_GET_CLASS(s);
 
-    if (c->parent_phases.hold) {
-        c->parent_phases.hold(obj, type);
+    trace_smmu_reset_exit();
+    if (c->parent_phases.exit) {
+        c->parent_phases.exit(obj, type);
     }
 
     smmuv3_init_regs(s);
@@ -1999,7 +2005,7 @@ static void smmuv3_class_init(ObjectClass *klass, void *data)
     SMMUv3Class *c = ARM_SMMUV3_CLASS(klass);
 
     dc->vmsd = &vmstate_smmuv3;
-    resettable_class_set_parent_phases(rc, NULL, smmu_reset_hold, NULL,
+    resettable_class_set_parent_phases(rc, NULL, NULL, smmu_reset_exit,
                                        &c->parent_phases);
     device_class_set_parent_realize(dc, smmu_realize,
                                     &c->parent_realize);
diff --git a/hw/arm/trace-events b/hw/arm/trace-events
index c64ad344bd..7790db780e 100644
--- a/hw/arm/trace-events
+++ b/hw/arm/trace-events
@@ -56,6 +56,7 @@ smmuv3_config_cache_inv(uint32_t sid) "Config cache INV for sid=0x%x"
 smmuv3_notify_flag_add(const char *iommu) "ADD SMMUNotifier node for iommu mr=%s"
 smmuv3_notify_flag_del(const char *iommu) "DEL SMMUNotifier node for iommu mr=%s"
 smmuv3_inv_notifiers_iova(const char *name, int asid, int vmid, uint64_t iova, uint8_t tg, uint64_t num_pages, int stage) "iommu mr=%s asid=%d vmid=%d iova=0x%"PRIx64" tg=%d num_pages=0x%"PRIx64" stage=%d"
+smmu_reset_exit(void) ""
 
 # strongarm.c
 strongarm_uart_update_parameters(const char *label, int speed, char parity, int data_bits, int stop_bits) "%s speed=%d parity=%c data=%d stop=%d"
diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
index cd116c0401..473895948b 100644
--- a/hw/cxl/cxl-component-utils.c
+++ b/hw/cxl/cxl-component-utils.c
@@ -243,8 +243,13 @@ static void hdm_init_common(uint32_t *reg_state, uint32_t *write_msk,
     ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, INTERLEAVE_4K, 1);
     ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY,
                      POISON_ON_ERR_CAP, 0);
-    ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 0);
-    ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 0);
+    if (type == CXL2_TYPE3_DEVICE) {
+        ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 1);
+        ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 1);
+    } else {
+        ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 3_6_12_WAY, 0);
+        ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, 16_WAY, 0);
+    }
     ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY, UIO, 0);
     ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY,
                      UIO_DECODER_COUNT, 0);
diff --git a/hw/cxl/cxl-device-utils.c b/hw/cxl/cxl-device-utils.c
index 035d034f6d..52ad1e4c3f 100644
--- a/hw/cxl/cxl-device-utils.c
+++ b/hw/cxl/cxl-device-utils.c
@@ -352,10 +352,8 @@ static void device_reg_init_common(CXLDeviceState *cxl_dstate)
     }
 }
 
-static void mailbox_reg_init_common(CXLDeviceState *cxl_dstate)
+static void mailbox_reg_init_common(CXLDeviceState *cxl_dstate, int msi_n)
 {
-    const uint8_t msi_n = 9;
-
     /* 2048 payload size */
     ARRAY_FIELD_DP32(cxl_dstate->mbox_reg_state32, CXL_DEV_MAILBOX_CAP,
                      PAYLOAD_SIZE, CXL_MAILBOX_PAYLOAD_SHIFT);
@@ -382,7 +380,7 @@ static void memdev_reg_init_common(CXLDeviceState *cxl_dstate)
     cxl_dstate->memdev_status = memdev_status_reg;
 }
 
-void cxl_device_register_init_t3(CXLType3Dev *ct3d)
+void cxl_device_register_init_t3(CXLType3Dev *ct3d, int msi_n)
 {
     CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate;
     uint64_t *cap_h = cxl_dstate->caps_reg_state64;
@@ -398,7 +396,7 @@ void cxl_device_register_init_t3(CXLType3Dev *ct3d)
     device_reg_init_common(cxl_dstate);
 
     cxl_device_cap_init(cxl_dstate, MAILBOX, 2, CXL_DEV_MAILBOX_VERSION);
-    mailbox_reg_init_common(cxl_dstate);
+    mailbox_reg_init_common(cxl_dstate, msi_n);
 
     cxl_device_cap_init(cxl_dstate, MEMORY_DEVICE, 0x4000,
                         CXL_MEM_DEV_STATUS_VERSION);
@@ -408,7 +406,7 @@ void cxl_device_register_init_t3(CXLType3Dev *ct3d)
                                    CXL_MAILBOX_MAX_PAYLOAD_SIZE);
 }
 
-void cxl_device_register_init_swcci(CSWMBCCIDev *sw)
+void cxl_device_register_init_swcci(CSWMBCCIDev *sw, int msi_n)
 {
     CXLDeviceState *cxl_dstate = &sw->cxl_dstate;
     uint64_t *cap_h = cxl_dstate->caps_reg_state64;
@@ -423,7 +421,7 @@ void cxl_device_register_init_swcci(CSWMBCCIDev *sw)
     device_reg_init_common(cxl_dstate);
 
     cxl_device_cap_init(cxl_dstate, MAILBOX, 2, 1);
-    mailbox_reg_init_common(cxl_dstate);
+    mailbox_reg_init_common(cxl_dstate, msi_n);
 
     cxl_device_cap_init(cxl_dstate, MEMORY_DEVICE, 0x4000, 1);
     memdev_reg_init_common(cxl_dstate);
diff --git a/hw/cxl/switch-mailbox-cci.c b/hw/cxl/switch-mailbox-cci.c
index 65cdac6cc1..833b824619 100644
--- a/hw/cxl/switch-mailbox-cci.c
+++ b/hw/cxl/switch-mailbox-cci.c
@@ -17,10 +17,12 @@
 #include "hw/qdev-properties.h"
 #include "hw/cxl/cxl.h"
 
+#define CXL_SWCCI_MSIX_MBOX 3
+
 static void cswmbcci_reset(DeviceState *dev)
 {
     CSWMBCCIDev *cswmb = CXL_SWITCH_MAILBOX_CCI(dev);
-    cxl_device_register_init_swcci(cswmb);
+    cxl_device_register_init_swcci(cswmb, CXL_SWCCI_MSIX_MBOX);
 }
 
 static void cswbcci_realize(PCIDevice *pci_dev, Error **errp)
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index e8e084c7cf..5b21cf134a 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1309,15 +1309,15 @@ static int amdvi_int_remap_msi(AMDVIState *iommu,
         ret = -AMDVI_IR_ERR;
         break;
     case AMDVI_IOAPIC_INT_TYPE_NMI:
-        pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK;
+        pass = dte[2] & AMDVI_DEV_NMI_PASS_MASK;
         trace_amdvi_ir_delivery_mode("nmi");
         break;
     case AMDVI_IOAPIC_INT_TYPE_INIT:
-        pass = dte[3] & AMDVI_DEV_INT_PASS_MASK;
+        pass = dte[2] & AMDVI_DEV_INT_PASS_MASK;
         trace_amdvi_ir_delivery_mode("init");
         break;
     case AMDVI_IOAPIC_INT_TYPE_EINT:
-        pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK;
+        pass = dte[2] & AMDVI_DEV_EINT_PASS_MASK;
         trace_amdvi_ir_delivery_mode("eint");
         break;
     default:
@@ -1593,9 +1593,9 @@ static void amdvi_pci_realize(PCIDevice *pdev, Error **errp)
     /* reset AMDVI specific capabilities, all r/o */
     pci_set_long(pdev->config + s->capab_offset, AMDVI_CAPAB_FEATURES);
     pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_LOW,
-                 AMDVI_BASE_ADDR & ~(0xffff0000));
+                 AMDVI_BASE_ADDR & MAKE_64BIT_MASK(14, 18));
     pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH,
-                 (AMDVI_BASE_ADDR & ~(0xffff)) >> 16);
+                 AMDVI_BASE_ADDR >> 32);
     pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_RANGE,
                  0xff000000);
     pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_MISC, 0);
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index e0dac4d9a9..28125130c6 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -187,7 +187,7 @@
         AMDVI_CAPAB_FLAG_HTTUNNEL | AMDVI_CAPAB_EFR_SUP)
 
 /* AMDVI default address */
-#define AMDVI_BASE_ADDR 0xfed80000
+#define AMDVI_BASE_ADDR 0xfed80000ULL
 
 /* page management constants */
 #define AMDVI_PAGE_SHIFT 12
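The two register writes above split a 64-bit MMIO base across two 32-bit
capability registers: the low register keeps base bits 31:14 (bits 13:0 are
flag fields, which MAKE_64BIT_MASK(14, 18) clears), and the high register gets
bits 63:32 — which is also why AMDVI_BASE_ADDR gains a ULL suffix, since
shifting a 32-bit constant right by 32 is undefined behaviour in C. A
self-contained sketch of the arithmetic (MAKE_64BIT_MASK re-declared locally
to keep it standalone):

    #include <stdint.h>
    #include <stdio.h>

    #define MAKE_64BIT_MASK(shift, length) \
        (((~0ULL) >> (64 - (length))) << (shift))
    #define AMDVI_BASE_ADDR 0xfed80000ULL

    int main(void)
    {
        /* Low register: base bits 31:14; bits 13:0 hold enable/type flags */
        uint32_t lo = AMDVI_BASE_ADDR & MAKE_64BIT_MASK(14, 18);
        /* High register: base bits 63:32 (safe only with a 64-bit base) */
        uint32_t hi = AMDVI_BASE_ADDR >> 32;

        /* Prints lo=0xfed80000 hi=0x00000000 */
        printf("lo=0x%08x hi=0x%08x\n", lo, hi);
        return 0;
    }

By contrast, the old low-register expression masked with ~0xffff0000, i.e. it
kept only the *low* 16 bits of the base — the bug this hunk fixes.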
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7fde0603bf..dffd7ee885 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4697,10 +4697,11 @@ static void vtd_init(IntelIOMMUState *s)
 /* Should not reset address_spaces when reset because devices will still use
  * the address space they got at first (won't ask the bus again).
  */
-static void vtd_reset(DeviceState *dev)
+static void vtd_reset_exit(Object *obj, ResetType type)
 {
-    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
+    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(obj);
 
+    trace_vtd_reset_exit();
     vtd_init(s);
     vtd_address_space_refresh_all(s);
 }
@@ -4864,8 +4865,13 @@ static void vtd_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
     X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass);
+    ResettableClass *rc = RESETTABLE_CLASS(klass);
 
-    device_class_set_legacy_reset(dc, vtd_reset);
+    /*
+     * Use 'exit' reset phase to make sure all DMA requests
+     * have been quiesced during 'enter' or 'hold' phase
+     */
+    rc->phases.exit = vtd_reset_exit;
     dc->vmsd = &vtd_vmstate;
     device_class_set_props(dc, vtd_properties);
     dc->hotpluggable = false;
diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index a8d354aabe..d0a236c74f 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -451,11 +451,44 @@ static HotplugHandler *microvm_get_hotplug_handler(MachineState *machine,
     return NULL;
 }
 
+static void microvm_machine_done(Notifier *notifier, void *data)
+{
+    MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState,
+                                            machine_done);
+    X86MachineState *x86ms = X86_MACHINE(mms);
+
+    acpi_setup_microvm(mms);
+    dt_setup_microvm(mms);
+    fw_cfg_add_e820(x86ms->fw_cfg);
+}
+
+static void microvm_powerdown_req(Notifier *notifier, void *data)
+{
+    MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState,
+                                            powerdown_req);
+    X86MachineState *x86ms = X86_MACHINE(mms);
+
+    if (x86ms->acpi_dev) {
+        Object *obj = OBJECT(x86ms->acpi_dev);
+        AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
+        adevc->send_event(ACPI_DEVICE_IF(x86ms->acpi_dev),
+                          ACPI_POWER_DOWN_STATUS);
+    }
+}
+
 static void microvm_machine_state_init(MachineState *machine)
 {
     MicrovmMachineState *mms = MICROVM_MACHINE(machine);
     X86MachineState *x86ms = X86_MACHINE(machine);
 
+    /* State */
+    mms->kernel_cmdline_fixed = false;
+
+    mms->machine_done.notify = microvm_machine_done;
+    qemu_add_machine_init_done_notifier(&mms->machine_done);
+    mms->powerdown_req.notify = microvm_powerdown_req;
+    qemu_register_powerdown_notifier(&mms->powerdown_req);
+
     microvm_memory_init(mms);
 
     x86_cpus_init(x86ms, CPU_VERSION_LATEST);
@@ -581,31 +614,6 @@ static void microvm_machine_set_auto_kernel_cmdline(Object *obj, bool value,
     mms->auto_kernel_cmdline = value;
 }
 
-static void microvm_machine_done(Notifier *notifier, void *data)
-{
-    MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState,
-                                            machine_done);
-    X86MachineState *x86ms = X86_MACHINE(mms);
-
-    acpi_setup_microvm(mms);
-    dt_setup_microvm(mms);
-    fw_cfg_add_e820(x86ms->fw_cfg);
-}
-
-static void microvm_powerdown_req(Notifier *notifier, void *data)
-{
-    MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState,
-                                            powerdown_req);
-    X86MachineState *x86ms = X86_MACHINE(mms);
-
-    if (x86ms->acpi_dev) {
-        Object *obj = OBJECT(x86ms->acpi_dev);
-        AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
-        adevc->send_event(ACPI_DEVICE_IF(x86ms->acpi_dev),
-                          ACPI_POWER_DOWN_STATUS);
-    }
-}
-
 static void microvm_machine_initfn(Object *obj)
 {
     MicrovmMachineState *mms = MICROVM_MACHINE(obj);
@@ -617,14 +625,6 @@ static void microvm_machine_initfn(Object *obj)
     mms->isa_serial = true;
     mms->option_roms = true;
     mms->auto_kernel_cmdline = true;
-
-    /* State */
-    mms->kernel_cmdline_fixed = false;
-
-    mms->machine_done.notify = microvm_machine_done;
-    qemu_add_machine_init_done_notifier(&mms->machine_done);
-    mms->powerdown_req.notify = microvm_powerdown_req;
-    qemu_register_powerdown_notifier(&mms->powerdown_req);
 }
 
 GlobalProperty microvm_properties[] = {
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 22641e6ddc..f199a8c7ad 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1241,6 +1241,9 @@ void pc_basic_device_init(struct PCMachineState *pcms,
     /* Super I/O */
     pc_superio_init(isa_bus, create_fdctrl, pcms->i8042_enabled,
                     pcms->vmport != ON_OFF_AUTO_ON, &error_fatal);
+
+    pcms->machine_done.notify = pc_machine_done;
+    qemu_add_machine_init_done_notifier(&pcms->machine_done);
 }
 
 void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus)
@@ -1714,9 +1717,6 @@ static void pc_machine_initfn(Object *obj)
     if (pcmc->pci_enabled) {
         cxl_machine_init(obj, &pcms->cxl_devices_state);
     }
-
-    pcms->machine_done.notify = pc_machine_done;
-    qemu_add_machine_init_done_notifier(&pcms->machine_done);
 }
 
 static void pc_machine_reset(MachineState *machine, ResetType type)
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 53c02d7ac8..ac9e1a10aa 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -68,6 +68,7 @@ vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low
 vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
 vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
 vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
+vtd_reset_exit(void) ""
 
 # amd_iommu.c
 amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32
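The microvm and pc hunks above both move notifier registration out of
instance_init — which also runs for throwaway objects created during QOM
introspection — into machine-init code that runs exactly once per started
machine. A minimal sketch of the pattern, with hypothetical MyMachineState
names and a hypothetical my_machine_finalize_tables() helper:

    static void my_machine_done(Notifier *notifier, void *data)
    {
        MyMachineState *ms = container_of(notifier, MyMachineState,
                                          machine_done);

        /* Runs once, after every device has been realized */
        my_machine_finalize_tables(ms);
    }

    static void my_machine_state_init(MachineState *machine)
    {
        MyMachineState *ms = MY_MACHINE(machine);

        /*
         * Register here rather than in instance_init, so transient
         * objects never leave a dangling notifier behind.
         */
        ms->machine_done.notify = my_machine_done;
        qemu_add_machine_init_done_notifier(&ms->machine_done);
    }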
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 0ae1704a34..6fffa21ead 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -30,6 +30,14 @@
 #include "hw/cxl/cxl.h"
 #include "hw/pci/msix.h"
 
+/* type3 device private */
+enum CXL_T3_MSIX_VECTOR {
+    CXL_T3_MSIX_PCIE_DOE_TABLE_ACCESS = 0,
+    CXL_T3_MSIX_EVENT_START = 2,
+    CXL_T3_MSIX_MBOX = CXL_T3_MSIX_EVENT_START + CXL_EVENT_TYPE_MAX,
+    CXL_T3_MSIX_VECTOR_NR
+};
+
 #define DWORD_BYTE 4
 #define CXL_CAPACITY_MULTIPLIER   (256 * MiB)
 
@@ -843,7 +851,6 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
     ComponentRegisters *regs = &cxl_cstate->crb;
     MemoryRegion *mr = &regs->component_registers;
     uint8_t *pci_conf = pci_dev->config;
-    unsigned short msix_num = 10;
     int i, rc;
     uint16_t count;
 
@@ -884,31 +891,32 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
                      &ct3d->cxl_dstate.device_registers);
 
     /* MSI(-X) Initialization */
-    rc = msix_init_exclusive_bar(pci_dev, msix_num, 4, NULL);
+    rc = msix_init_exclusive_bar(pci_dev, CXL_T3_MSIX_VECTOR_NR, 4, errp);
     if (rc) {
-        goto err_address_space_free;
+        goto err_free_special_ops;
     }
-    for (i = 0; i < msix_num; i++) {
+    for (i = 0; i < CXL_T3_MSIX_VECTOR_NR; i++) {
         msix_vector_use(pci_dev, i);
     }
 
     /* DOE Initialization */
-    pcie_doe_init(pci_dev, &ct3d->doe_cdat, 0x190, doe_cdat_prot, true, 0);
+    pcie_doe_init(pci_dev, &ct3d->doe_cdat, 0x190, doe_cdat_prot, true,
+                  CXL_T3_MSIX_PCIE_DOE_TABLE_ACCESS);
 
     cxl_cstate->cdat.build_cdat_table = ct3_build_cdat_table;
     cxl_cstate->cdat.free_cdat_table = ct3_free_cdat_table;
     cxl_cstate->cdat.private = ct3d;
     if (!cxl_doe_cdat_init(cxl_cstate, errp)) {
-        goto err_free_special_ops;
+        goto err_msix_uninit;
     }
 
     pcie_cap_deverr_init(pci_dev);
     /* Leave a bit of room for expansion */
-    rc = pcie_aer_init(pci_dev, PCI_ERR_VER, 0x200, PCI_ERR_SIZEOF, NULL);
+    rc = pcie_aer_init(pci_dev, PCI_ERR_VER, 0x200, PCI_ERR_SIZEOF, errp);
     if (rc) {
         goto err_release_cdat;
     }
-    cxl_event_init(&ct3d->cxl_dstate, 2);
+    cxl_event_init(&ct3d->cxl_dstate, CXL_T3_MSIX_EVENT_START);
 
     /* Set default value for patrol scrub attributes */
     ct3d->patrol_scrub_attrs.scrub_cycle_cap =
@@ -935,9 +943,10 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
 
 err_release_cdat:
     cxl_doe_cdat_release(cxl_cstate);
+err_msix_uninit:
+    msix_uninit_exclusive_bar(pci_dev);
 err_free_special_ops:
     g_free(regs->special_ops);
-err_address_space_free:
     if (ct3d->dc.host_dc) {
         cxl_destroy_dc_regions(ct3d);
         address_space_destroy(&ct3d->dc.host_dc_as);
@@ -959,6 +968,7 @@ static void ct3_exit(PCIDevice *pci_dev)
 
     pcie_aer_exit(pci_dev);
     cxl_doe_cdat_release(cxl_cstate);
+    msix_uninit_exclusive_bar(pci_dev);
     g_free(regs->special_ops);
     if (ct3d->dc.host_dc) {
         cxl_destroy_dc_regions(ct3d);
@@ -1090,10 +1100,17 @@ static bool cxl_type3_dpa(CXLType3Dev *ct3d, hwaddr host_addr, uint64_t *dpa)
             continue;
         }
 
-        *dpa = dpa_base +
-            ((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
-             ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & hpa_offset)
-              >> iw));
+        if (iw < 8) {
+            *dpa = dpa_base +
+                ((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
+                 ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & hpa_offset)
+                  >> iw));
+        } else {
+            *dpa = dpa_base +
+                ((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
+                 ((((MAKE_64BIT_MASK(ig + iw, 64 - ig - iw) & hpa_offset)
+                    >> (ig + iw)) / 3) << (ig + 8)));
+        }
 
         return true;
     }
@@ -1202,7 +1219,7 @@ static void ct3d_reset(DeviceState *dev)
 
     pcie_cap_fill_link_ep_usp(PCI_DEVICE(dev), ct3d->width, ct3d->speed);
     cxl_component_register_init_common(reg_state, write_msk, CXL2_TYPE3_DEVICE);
-    cxl_device_register_init_t3(ct3d);
+    cxl_device_register_init_t3(ct3d, CXL_T3_MSIX_MBOX);
 
     /*
      * Bring up an endpoint to target with MCTP over VDM.
diff --git a/hw/net/igb.c b/hw/net/igb.c
index 4d93ce629f..c965fc2fb6 100644
--- a/hw/net/igb.c
+++ b/hw/net/igb.c
@@ -446,9 +446,13 @@ static void igb_pci_realize(PCIDevice *pci_dev, Error **errp)
 
     pcie_ari_init(pci_dev, 0x150);
 
-    pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, TYPE_IGBVF,
-        IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS, IGB_MAX_VF_FUNCTIONS,
-        IGB_VF_OFFSET, IGB_VF_STRIDE);
+    if (!pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, TYPE_IGBVF,
+                            IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS,
+                            IGB_MAX_VF_FUNCTIONS, IGB_VF_OFFSET, IGB_VF_STRIDE,
+                            errp)) {
+        igb_cleanup_msix(s);
+        return;
+    }
 
     pcie_sriov_pf_init_vf_bar(pci_dev, IGBVF_MMIO_BAR_IDX,
         PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH,
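The new iw >= 8 branch in cxl_type3_dpa() above handles the 3-, 6- and 12-way
interleave encodings, where the number of ways is not a power of two, so the
upper HPA bits are divided by 3 instead of simply shifted. A standalone sketch
of both decode paths plus one worked value — the ig/iw handling mirrors the
hunk above, while the sample offsets are invented:

    #include <stdint.h>
    #include <stdio.h>

    #define MAKE_64BIT_MASK(shift, length) \
        (((~0ULL) >> (64 - (length))) << (shift))

    /* Decode an HPA offset into a DPA offset, as cxl_type3_dpa() does. */
    static uint64_t decode_dpa(uint64_t hpa_offset, int ig, int iw)
    {
        if (iw < 8) {
            /* Power-of-two ways: drop the iw selector bits by shifting */
            return (MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
                   ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw)
                     & hpa_offset) >> iw);
        }
        /* 3/6/12-way (encoded iw = 8/9/10): divide the upper bits by 3 */
        return (MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
               ((((MAKE_64BIT_MASK(ig + iw, 64 - ig - iw) & hpa_offset)
                  >> (ig + iw)) / 3) << (ig + 8));
    }

    int main(void)
    {
        /*
         * 3-way, 256-byte granule: HPA offset 0x300 is round 1 of the
         * interleave set, so it lands at DPA offset 0x100 on its target.
         */
        printf("0x%llx\n", (unsigned long long)decode_dpa(0x300, 0, 8));
        return 0;
    }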
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index d847429b1c..de87cfadff 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1352,18 +1352,25 @@ exit:
 
 static bool virtio_net_load_ebpf(VirtIONet *n, Error **errp)
 {
-    bool ret = false;
-
-    if (virtio_net_attach_ebpf_to_backend(n->nic, -1)) {
-        trace_virtio_net_rss_load(n, n->nr_ebpf_rss_fds, n->ebpf_rss_fds);
-        if (n->ebpf_rss_fds) {
-            ret = virtio_net_load_ebpf_fds(n, errp);
-        } else {
-            ret = ebpf_rss_load(&n->ebpf_rss, errp);
-        }
+    if (!virtio_net_attach_ebpf_to_backend(n->nic, -1)) {
+        return true;
     }
 
-    return ret;
+    trace_virtio_net_rss_load(n, n->nr_ebpf_rss_fds, n->ebpf_rss_fds);
+
+    /*
+     * If user explicitly gave QEMU RSS FDs to use, then
+     * failing to use them must be considered a fatal
+     * error. If no RSS FDs were provided, QEMU is trying
+     * eBPF on a "best effort" basis only, so report a
+     * warning and allow fallback to software RSS.
+     */
+    if (n->ebpf_rss_fds) {
+        return virtio_net_load_ebpf_fds(n, errp);
+    }
+
+    ebpf_rss_load(&n->ebpf_rss, &error_warn);
+    return true;
 }
 
 static void virtio_net_unload_ebpf(VirtIONet *n)
@@ -3913,23 +3920,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
     net_rx_pkt_init(&n->rx_pkt);
 
     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
-        Error *err = NULL;
-        if (!virtio_net_load_ebpf(n, &err)) {
-            /*
-             * If user explicitly gave QEMU RSS FDs to use, then
-             * failing to use them must be considered a fatal
-             * error. If no RSS FDs were provided, QEMU is trying
-             * eBPF on a "best effort" basis only, so report a
-             * warning and allow fallback to software RSS.
-             */
-            if (n->ebpf_rss_fds) {
-                error_propagate(errp, err);
-            } else {
-                warn_report("unable to load eBPF RSS: %s",
-                            error_get_pretty(err));
-                error_free(err);
-            }
-        }
+        virtio_net_load_ebpf(n, errp);
     }
 }
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 68903d1d70..8175751518 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -8481,7 +8481,8 @@ out:
     return pow2ceil(bar_size);
 }
 
-static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
+static bool nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
+                            Error **errp)
 {
     uint16_t vf_dev_id = n->params.use_intel_id ?
                          PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
@@ -8490,12 +8491,16 @@ static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
                                       le16_to_cpu(cap->vifrsm),
                                       NULL, NULL);
 
-    pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
-                       n->params.sriov_max_vfs, n->params.sriov_max_vfs,
-                       NVME_VF_OFFSET, NVME_VF_STRIDE);
+    if (!pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
+                            n->params.sriov_max_vfs, n->params.sriov_max_vfs,
+                            NVME_VF_OFFSET, NVME_VF_STRIDE, errp)) {
+        return false;
+    }
 
     pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
                               PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
+
+    return true;
 }
 
 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
@@ -8620,6 +8625,11 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
         return false;
     }
 
+    if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs &&
+        !nvme_init_sriov(n, pci_dev, 0x120, errp)) {
+        return false;
+    }
+
     nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
 
     pcie_cap_deverr_init(pci_dev);
@@ -8649,10 +8659,6 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
         nvme_init_pmr(n, pci_dev);
     }
 
-    if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
-        nvme_init_sriov(n, pci_dev, 0x120);
-    }
-
     return true;
 }
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index 57ec7084a4..66f27b9d71 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -15,6 +15,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/log.h"
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "hw/pci/pci.h"
@@ -260,6 +261,14 @@ static uint64_t msix_pba_mmio_read(void *opaque, hwaddr addr,
 static void msix_pba_mmio_write(void *opaque, hwaddr addr,
                                 uint64_t val, unsigned size)
 {
+    PCIDevice *dev = opaque;
+
+    qemu_log_mask(LOG_GUEST_ERROR,
+                  "PCI [%s:%02x:%02x.%x] attempt to write to MSI-X "
+                  "PBA at 0x%" FMT_PCIBUS ", ignoring.\n",
+                  pci_root_bus_path(dev), pci_dev_bus_num(dev),
+                  PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn),
+                  addr);
 }
 
 static const MemoryRegionOps msix_pba_mmio_ops = {
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 2afa423925..1d42847ef0 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -803,10 +803,17 @@ static bool migrate_is_not_pcie(void *opaque, int version_id)
     return !pci_is_express((PCIDevice *)opaque);
 }
 
+static int pci_post_load(void *opaque, int version_id)
+{
+    pcie_sriov_pf_post_load(opaque);
+    return 0;
+}
+
 const VMStateDescription vmstate_pci_device = {
     .name = "PCIDevice",
     .version_id = 2,
     .minimum_version_id = 1,
+    .post_load = pci_post_load,
     .fields = (const VMStateField[]) {
         VMSTATE_INT32_POSITIVE_LE(version_id, PCIDevice),
         VMSTATE_BUFFER_UNSAFE_INFO_TEST(config, PCIDevice,
@@ -1391,6 +1398,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
     assert(hdr_type != PCI_HEADER_TYPE_BRIDGE || region_num < 2);
 
     r = &pci_dev->io_regions[region_num];
+    assert(!r->size);
     r->addr = PCI_BAR_UNMAPPED;
     r->size = size;
     r->type = type;
@@ -2963,7 +2971,17 @@ MSIMessage pci_get_msi_message(PCIDevice *dev, int vector)
 
 void pci_set_power(PCIDevice *d, bool state)
 {
-    pci_set_enabled(d, state);
+    /*
+     * Don't change the enabled state of VFs when powering on/off the device.
+     *
+     * When powering on, VFs must not be enabled immediately but they must
+     * wait until the guest configures SR-IOV.
+     * When powering off, their corresponding PFs will be reset and disable
+     * VFs.
+     */
+    if (!pci_is_vf(d)) {
+        pci_set_enabled(d, state);
+    }
 }
 
 void pci_set_enabled(PCIDevice *d, bool state)
@@ -2977,7 +2995,7 @@ void pci_set_enabled(PCIDevice *d, bool state)
     memory_region_set_enabled(&d->bus_master_enable_region,
                               (pci_get_word(d->config + PCI_COMMAND)
                                & PCI_COMMAND_MASTER) && d->enabled);
-    if (!d->enabled) {
+    if (qdev_is_realized(&d->qdev)) {
         pci_device_reset(d);
     }
 }
diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
index e9b23221d7..1eb4358256 100644
--- a/hw/pci/pcie_sriov.c
+++ b/hw/pci/pcie_sriov.c
@@ -20,23 +20,37 @@
 #include "qapi/error.h"
 #include "trace.h"
 
-static PCIDevice *register_vf(PCIDevice *pf, int devfn,
-                              const char *name, uint16_t vf_num);
-static void unregister_vfs(PCIDevice *dev);
+static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs)
+{
+    for (uint16_t i = 0; i < total_vfs; i++) {
+        PCIDevice *vf = dev->exp.sriov_pf.vf[i];
+        object_unparent(OBJECT(vf));
+        object_unref(OBJECT(vf));
+    }
+    g_free(dev->exp.sriov_pf.vf);
+    dev->exp.sriov_pf.vf = NULL;
+}
 
-void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
+bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
                         const char *vfname, uint16_t vf_dev_id,
                         uint16_t init_vfs, uint16_t total_vfs,
-                        uint16_t vf_offset, uint16_t vf_stride)
+                        uint16_t vf_offset, uint16_t vf_stride,
+                        Error **errp)
 {
+    BusState *bus = qdev_get_parent_bus(&dev->qdev);
+    int32_t devfn = dev->devfn + vf_offset;
     uint8_t *cfg = dev->config + offset;
     uint8_t *wmask;
 
+    if (total_vfs &&
+        (uint32_t)devfn + (uint32_t)(total_vfs - 1) * vf_stride >= PCI_DEVFN_MAX) {
+        error_setg(errp, "VF addr overflows");
+        return false;
+    }
+
     pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1,
                         offset, PCI_EXT_CAP_SRIOV_SIZEOF);
     dev->exp.sriov_cap = offset;
-    dev->exp.sriov_pf.num_vfs = 0;
-    dev->exp.sriov_pf.vfname = g_strdup(vfname);
     dev->exp.sriov_pf.vf = NULL;
 
     pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset);
@@ -69,13 +83,37 @@ void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
     pci_set_word(wmask + PCI_SRIOV_SYS_PGSIZE, 0x553);
 
     qdev_prop_set_bit(&dev->qdev, "multifunction", true);
+
+    dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs);
+
+    for (uint16_t i = 0; i < total_vfs; i++) {
+        PCIDevice *vf = pci_new(devfn, vfname);
+        vf->exp.sriov_vf.pf = dev;
+        vf->exp.sriov_vf.vf_number = i;
+
+        if (!qdev_realize(&vf->qdev, bus, errp)) {
+            object_unparent(OBJECT(vf));
+            object_unref(vf);
+            unparent_vfs(dev, i);
+            return false;
+        }
+
+        /* set vid/did according to sr/iov spec - they are not used */
+        pci_config_set_vendor_id(vf->config, 0xffff);
+        pci_config_set_device_id(vf->config, 0xffff);
+
+        dev->exp.sriov_pf.vf[i] = vf;
+        devfn += vf_stride;
+    }
+
+    return true;
 }
 
 void pcie_sriov_pf_exit(PCIDevice *dev)
 {
-    unregister_vfs(dev);
-    g_free((char *)dev->exp.sriov_pf.vfname);
-    dev->exp.sriov_pf.vfname = NULL;
+    uint8_t *cfg = dev->config + dev->exp.sriov_cap;
+
+    unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF));
 }
 
 void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
@@ -141,80 +179,36 @@ void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
     }
 }
 
-static PCIDevice *register_vf(PCIDevice *pf, int devfn, const char *name,
-                              uint16_t vf_num)
-{
-    PCIDevice *dev = pci_new(devfn, name);
-    dev->exp.sriov_vf.pf = pf;
-    dev->exp.sriov_vf.vf_number = vf_num;
-    PCIBus *bus = pci_get_bus(pf);
-    Error *local_err = NULL;
-
-    qdev_realize(&dev->qdev, &bus->qbus, &local_err);
-    if (local_err) {
-        error_report_err(local_err);
-        return NULL;
-    }
-
-    /* set vid/did according to sr/iov spec - they are not used */
-    pci_config_set_vendor_id(dev->config, 0xffff);
-    pci_config_set_device_id(dev->config, 0xffff);
-
-    return dev;
-}
-
 static void register_vfs(PCIDevice *dev)
 {
     uint16_t num_vfs;
     uint16_t i;
     uint16_t sriov_cap = dev->exp.sriov_cap;
-    uint16_t vf_offset =
-        pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_OFFSET);
-    uint16_t vf_stride =
-        pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_STRIDE);
-    int32_t devfn = dev->devfn + vf_offset;
 
     assert(sriov_cap > 0);
     num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
-    if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
-        return;
-    }
-
-    dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs);
 
     trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn),
                              PCI_FUNC(dev->devfn), num_vfs);
     for (i = 0; i < num_vfs; i++) {
-        dev->exp.sriov_pf.vf[i] = register_vf(dev, devfn,
-                                              dev->exp.sriov_pf.vfname, i);
-        if (!dev->exp.sriov_pf.vf[i]) {
-            num_vfs = i;
-            break;
-        }
-        devfn += vf_stride;
+        pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
     }
-    dev->exp.sriov_pf.num_vfs = num_vfs;
+
+    pci_set_word(dev->wmask + sriov_cap + PCI_SRIOV_NUM_VF, 0);
 }
 
 static void unregister_vfs(PCIDevice *dev)
 {
-    uint16_t num_vfs = dev->exp.sriov_pf.num_vfs;
+    uint8_t *cfg = dev->config + dev->exp.sriov_cap;
     uint16_t i;
 
     trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
-                               PCI_FUNC(dev->devfn), num_vfs);
-    for (i = 0; i < num_vfs; i++) {
-        Error *err = NULL;
-        PCIDevice *vf = dev->exp.sriov_pf.vf[i];
-        if (!object_property_set_bool(OBJECT(vf), "realized", false, &err)) {
-            error_reportf_err(err, "Failed to unplug: ");
-        }
-        object_unparent(OBJECT(vf));
-        object_unref(OBJECT(vf));
+                               PCI_FUNC(dev->devfn));
+    for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
+        pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
     }
-    g_free(dev->exp.sriov_pf.vf);
-    dev->exp.sriov_pf.vf = NULL;
-    dev->exp.sriov_pf.num_vfs = 0;
+
+    pci_set_word(dev->wmask + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0xffff);
 }
 void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
@@ -235,15 +229,29 @@ void pcie_sriov_config_write(PCIDevice *dev, uint32_t address,
                              PCI_FUNC(dev->devfn), off, val, len);
 
     if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
-        if (dev->exp.sriov_pf.num_vfs) {
-            if (!(val & PCI_SRIOV_CTRL_VFE)) {
-                unregister_vfs(dev);
-            }
+        if (val & PCI_SRIOV_CTRL_VFE) {
+            register_vfs(dev);
         } else {
-            if (val & PCI_SRIOV_CTRL_VFE) {
-                register_vfs(dev);
-            }
+            unregister_vfs(dev);
         }
+    } else if (range_covers_byte(off, len, PCI_SRIOV_NUM_VF)) {
+        uint8_t *cfg = dev->config + sriov_cap;
+        uint8_t *wmask = dev->wmask + sriov_cap;
+        uint16_t num_vfs = pci_get_word(cfg + PCI_SRIOV_NUM_VF);
+        uint16_t wmask_val = PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI;
+
+        if (num_vfs <= pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)) {
+            wmask_val |= PCI_SRIOV_CTRL_VFE;
+        }
+
+        pci_set_word(wmask + PCI_SRIOV_CTRL, wmask_val);
+    }
+}
+
+void pcie_sriov_pf_post_load(PCIDevice *dev)
+{
+    if (dev->exp.sriov_cap) {
+        register_vfs(dev);
     }
 }
 
@@ -260,6 +268,8 @@ void pcie_sriov_pf_reset(PCIDevice *dev)
 
     unregister_vfs(dev);
     pci_set_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF, 0);
+    pci_set_word(dev->wmask + sriov_cap + PCI_SRIOV_CTRL,
+                 PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI);
 
     /*
      * Default is to use 4K pages, software can modify it
@@ -306,7 +316,7 @@ PCIDevice *pcie_sriov_get_pf(PCIDevice *dev)
 PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
 {
     assert(!pci_is_vf(dev));
-    if (n < dev->exp.sriov_pf.num_vfs) {
+    if (n < pcie_sriov_num_vfs(dev)) {
         return dev->exp.sriov_pf.vf[n];
     }
     return NULL;
@@ -314,5 +324,10 @@ PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
 
 uint16_t pcie_sriov_num_vfs(PCIDevice *dev)
 {
-    return dev->exp.sriov_pf.num_vfs;
+    uint16_t sriov_cap = dev->exp.sriov_cap;
+    uint8_t *cfg = dev->config + sriov_cap;
+
+    return sriov_cap &&
+           (pci_get_word(cfg + PCI_SRIOV_CTRL) & PCI_SRIOV_CTRL_VFE) ?
+           pci_get_word(cfg + PCI_SRIOV_NUM_VF) : 0;
 }
diff --git a/hw/pci/trace-events b/hw/pci/trace-events
index 19643aa8c6..e98f575a9d 100644
--- a/hw/pci/trace-events
+++ b/hw/pci/trace-events
@@ -14,7 +14,7 @@ msix_write_config(char *name, bool enabled, bool masked) "dev %s enabled %d mask
 
 # hw/pci/pcie_sriov.c
 sriov_register_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: creating %d vf devs"
-sriov_unregister_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: Unregistering %d vf devs"
+sriov_unregister_vfs(const char *name, int slot, int function) "%s %02x:%x: Unregistering vf devs"
 sriov_config_write(const char *name, int slot, int fun, uint32_t offset, uint32_t val, uint32_t len) "%s %02x:%x: sriov offset 0x%x val 0x%x len %d"
 
 # pcie.c
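With the rework above, SR-IOV state is fully derivable from config space: VF
objects are created once at realize and only flipped enabled/disabled, and
pcie_sriov_num_vfs() simply reads NumVFs whenever VF Enable is set. A
standalone sketch of that derivation over a raw config-space byte array —
offsets follow the SR-IOV capability layout, and get_word() is a local
stand-in for pci_get_word():

    #include <stdint.h>
    #include <stdio.h>

    /* Offsets within the SR-IOV capability (PCI_SRIOV_* in the real code) */
    #define SRIOV_CTRL     0x08
    #define SRIOV_NUM_VF   0x10
    #define SRIOV_CTRL_VFE 0x1    /* VF Enable */

    static uint16_t get_word(const uint8_t *cfg, int off)
    {
        return cfg[off] | (cfg[off + 1] << 8);
    }

    /* Mirrors pcie_sriov_num_vfs(): NumVFs counts only while VFE is set */
    static uint16_t sriov_num_vfs(const uint8_t *cap)
    {
        return (get_word(cap, SRIOV_CTRL) & SRIOV_CTRL_VFE) ?
               get_word(cap, SRIOV_NUM_VF) : 0;
    }

    int main(void)
    {
        uint8_t cap[0x40] = {0};

        cap[SRIOV_NUM_VF] = 4;                 /* guest wrote NumVFs = 4 */
        printf("%u\n", sriov_num_vfs(cap));    /* 0: VFs not enabled yet */
        cap[SRIOV_CTRL] |= SRIOV_CTRL_VFE;     /* guest sets VF Enable */
        printf("%u\n", sriov_num_vfs(cap));    /* 4 */
        return 0;
    }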
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 904227d9aa..e0a9d50edc 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1283,8 +1283,7 @@ static void spapr_dt_pci_device_cb(PCIBus *bus, PCIDevice *pdev,
     PciWalkFdt *p = opaque;
     int err;
 
-    if (p->err) {
-        /* Something's already broken, don't keep going */
+    if (p->err || !pdev->enabled) {
         return;
     }
 
@@ -1550,7 +1549,9 @@ static void spapr_pci_pre_plug(HotplugHandler *plug_handler,
      * hotplug, we do not allow functions to be hotplugged to a
      * slot that already has function 0 present
      */
-    if (plugged_dev->hotplugged && bus->devices[PCI_DEVFN(slotnr, 0)] &&
+    if (plugged_dev->hotplugged &&
+        !pci_is_vf(pdev) &&
+        bus->devices[PCI_DEVFN(slotnr, 0)] &&
         PCI_FUNC(pdev->devfn) != 0) {
         error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
                    " additional functions can no longer be exposed to guest.",
@@ -1572,6 +1573,14 @@ static void spapr_pci_plug(HotplugHandler *plug_handler,
     SpaprDrc *drc = drc_from_dev(phb, pdev);
     uint32_t slotnr = PCI_SLOT(pdev->devfn);
 
+    /*
+     * If DR or the PCI device is disabled, we don't need to do anything
+     * in the case of hotplug or coldplug callbacks.
+     */
+    if (!pdev->enabled) {
+        return;
+    }
+
     g_assert(drc);
 
     if (IS_PCI_BRIDGE(plugged_dev)) {
@@ -1647,6 +1656,11 @@ static void spapr_pci_unplug_request(HotplugHandler *plug_handler,
         SpaprDrc *drc = drc_from_dev(phb, pdev);
 
         g_assert(drc);
+
+        if (!drc->dev) {
+            return;
+        }
+
         g_assert(drc->dev == plugged_dev);
 
         if (!spapr_drc_unplug_requested(drc)) {
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index eead269cc2..913d72cc74 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -971,14 +971,7 @@ static void s390_pcihost_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
                    "this device");
     }
 
-    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
-        PCIDevice *pdev = PCI_DEVICE(dev);
-
-        if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
-            error_setg(errp, "multifunction not supported in s390");
-            return;
-        }
-    } else if (object_dynamic_cast(OBJECT(dev), TYPE_S390_PCI_DEVICE)) {
+    if (object_dynamic_cast(OBJECT(dev), TYPE_S390_PCI_DEVICE)) {
         S390PCIBusDevice *pbdev = S390_PCI_DEVICE(dev);
 
         if (!s390_pci_alloc_idx(s, pbdev)) {
@@ -1069,6 +1062,18 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
         pdev = PCI_DEVICE(dev);
 
+        /*
+         * Multifunction is not supported due to the lack of CLP. However,
+         * do not check for multifunction capability for SR-IOV devices because
+         * SR-IOV devices automatically add the multifunction capability whether
+         * or not the user intends to use functions other than the PF.
+         */
+        if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION &&
+            !pdev->exp.sriov_cap) {
+            error_setg(errp, "multifunction not supported in s390");
+            return;
+        }
+
         if (!dev->id) {
             /* In the case the PCI device does not define an id */
             /* we generate one based on the PCI address */
@@ -1080,6 +1085,16 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
 
         pbdev = s390_pci_find_dev_by_target(s, dev->id);
         if (!pbdev) {
+            /*
+             * VFs are automatically created by PF, and creating zpci for them
+             * will result in unexpected usage of fids. Currently QEMU does not
+             * support multifunction for s390x so we don't need zpci for VFs
+             * anyway.
+             */
+            if (pci_is_vf(pdev)) {
+                return;
+            }
+
             pbdev = s390_pci_device_new(s, dev->id, errp);
             if (!pbdev) {
                 return;
@@ -1167,7 +1182,10 @@ static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
         int32_t devfn;
 
         pbdev = s390_pci_find_dev_by_pci(s, PCI_DEVICE(dev));
-        g_assert(pbdev);
+        if (!pbdev) {
+            g_assert(pci_is_vf(pci_dev));
+            return;
+        }
 
         s390_pci_generate_plug_event(HP_EVENT_STANDBY_TO_RESERVED,
                                      pbdev->fh, pbdev->fid);
@@ -1206,7 +1224,11 @@ static void s390_pcihost_unplug_request(HotplugHandler *hotplug_dev,
          * we've checked the PCI device already (to prevent endless recursion).
          */
         pbdev = s390_pci_find_dev_by_pci(s, PCI_DEVICE(dev));
-        g_assert(pbdev);
+        if (!pbdev) {
+            g_assert(pci_is_vf(PCI_DEVICE(dev)));
+            return;
+        }
+
         pbdev->pci_unplug_request_processed = true;
         qdev_unplug(DEVICE(pbdev), errp);
     } else if (object_dynamic_cast(OBJECT(dev), TYPE_S390_PCI_DEVICE)) {
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index abbdc56b6d..7a4010ef4e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1404,6 +1404,7 @@ void vfio_reset_handler(void *opaque)
 {
     VFIODevice *vbasedev;
 
+    trace_vfio_reset_handler();
     QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
         if (vbasedev->dev->realized) {
             vbasedev->ops->vfio_compute_needs_reset(vbasedev);
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index cab1cf1de0..c5385e1a4f 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -120,6 +120,7 @@ vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype
 vfio_legacy_dma_unmap_overflow_workaround(void) ""
 vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64
 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
+vfio_reset_handler(void) ""
 
 # platform.c
 vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s"
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 04e36ae047..76f0d458b2 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -108,7 +108,7 @@ virtio_pci_notify_write(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)"
 virtio_pci_notify_write_pio(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)"
 
 # hw/virtio/virtio-iommu.c
-virtio_iommu_device_reset(void) "reset!"
+virtio_iommu_device_reset_exit(void) "reset!"
 virtio_iommu_system_reset(void) "system reset!"
 virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64
 virtio_iommu_device_status(uint8_t status) "driver status = %d"
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
index 3d03395a77..fa4147b773 100644
--- a/hw/virtio/vhost-iova-tree.c
+++ b/hw/virtio/vhost-iova-tree.c
@@ -28,12 +28,18 @@ struct VhostIOVATree {
 
     /* IOVA address to qemu memory maps. */
     IOVATree *iova_taddr_map;
+
+    /* Allocated IOVA addresses */
+    IOVATree *iova_map;
+
+    /* GPA->IOVA address memory maps */
+    IOVATree *gpa_iova_map;
 };
 
 /**
- * Create a new IOVA tree
+ * Create a new VhostIOVATree
  *
- * Returns the new IOVA tree
+ * Returns the new VhostIOVATree.
  */
 VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
 {
@@ -44,25 +50,29 @@ VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
     tree->iova_last = iova_last;
 
     tree->iova_taddr_map = iova_tree_new();
+    tree->iova_map = iova_tree_new();
+    tree->gpa_iova_map = gpa_tree_new();
     return tree;
 }
 
 /**
- * Delete an iova tree
+ * Delete a VhostIOVATree
  */
 void vhost_iova_tree_delete(VhostIOVATree *iova_tree)
 {
     iova_tree_destroy(iova_tree->iova_taddr_map);
+    iova_tree_destroy(iova_tree->iova_map);
+    iova_tree_destroy(iova_tree->gpa_iova_map);
     g_free(iova_tree);
 }
 
 /**
  * Find the IOVA address stored from a memory address
  *
- * @tree: The iova tree
+ * @tree: The VhostIOVATree
  * @map: The map with the memory address
  *
- * Return the stored mapping, or NULL if not found.
+ * Returns the stored IOVA->HVA mapping, or NULL if not found.
  */
 const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
                                         const DMAMap *map)
@@ -71,40 +81,111 @@ const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
 }
 
 /**
- * Allocate a new mapping
+ * Allocate a new IOVA range and add the mapping to the IOVA->HVA tree
  *
- * @tree: The iova tree
- * @map: The iova map
+ * @tree: The VhostIOVATree
+ * @map: The IOVA mapping
+ * @taddr: The translated address (HVA)
  *
  * Returns:
  * - IOVA_OK if the map fits in the container
 * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
 * - IOVA_ERR_NOMEM if tree cannot allocate more space.
 *
- * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK.
+ * It returns an assigned IOVA in map->iova if the return value is IOVA_OK.
 */
-int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map)
+int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map, hwaddr taddr)
 {
+    int ret;
+
     /* Some vhost devices do not like addr 0. Skip first page */
     hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size();
 
-    if (map->translated_addr + map->size < map->translated_addr ||
-        map->perm == IOMMU_NONE) {
+    if (taddr + map->size < taddr || map->perm == IOMMU_NONE) {
         return IOVA_ERR_INVALID;
     }
 
-    /* Allocate a node in IOVA address */
-    return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first,
-                               tree->iova_last);
+    /* Allocate a node in the IOVA-only tree */
+    ret = iova_tree_alloc_map(tree->iova_map, map, iova_first, tree->iova_last);
+    if (unlikely(ret != IOVA_OK)) {
+        return ret;
+    }
+
+    /* Insert a node in the IOVA->HVA tree */
+    map->translated_addr = taddr;
+    return iova_tree_insert(tree->iova_taddr_map, map);
 }
 
 /**
- * Remove existing mappings from iova tree
+ * Remove existing mappings from the IOVA-only and IOVA->HVA trees
 *
- * @iova_tree: The vhost iova tree
+ * @iova_tree: The VhostIOVATree
 * @map: The map to remove
 */
 void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map)
 {
     iova_tree_remove(iova_tree->iova_taddr_map, map);
+    iova_tree_remove(iova_tree->iova_map, map);
+}
+
+/**
+ * Find the IOVA address stored from a guest memory address (GPA)
+ *
+ * @tree: The VhostIOVATree
+ * @map: The map with the guest memory address
+ *
+ * Returns the stored GPA->IOVA mapping, or NULL if not found.
+ */
+const DMAMap *vhost_iova_tree_find_gpa(const VhostIOVATree *tree,
+                                       const DMAMap *map)
+{
+    return iova_tree_find_iova(tree->gpa_iova_map, map);
+}
+
+/**
+ * Allocate a new IOVA range and add the mapping to the GPA->IOVA tree
+ *
+ * @tree: The VhostIOVATree
+ * @map: The IOVA mapping
+ * @taddr: The translated address (GPA)
+ *
+ * Returns:
+ * - IOVA_OK if the map fits both containers
+ * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
+ * - IOVA_ERR_NOMEM if the IOVA-only tree cannot allocate more space
+ *
+ * It returns an assigned IOVA in map->iova if the return value is IOVA_OK.
+ */
+int vhost_iova_tree_map_alloc_gpa(VhostIOVATree *tree, DMAMap *map, hwaddr taddr)
+{
+    int ret;
+
+    /* Some vhost devices don't like addr 0. Skip first page */
+    hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size();
+
+    if (taddr + map->size < taddr || map->perm == IOMMU_NONE) {
+        return IOVA_ERR_INVALID;
+    }
+
+    /* Allocate a node in the IOVA-only tree */
+    ret = iova_tree_alloc_map(tree->iova_map, map, iova_first, tree->iova_last);
+    if (unlikely(ret != IOVA_OK)) {
+        return ret;
+    }
+
+    /* Insert a node in the GPA->IOVA tree */
+    map->translated_addr = taddr;
+    return gpa_tree_insert(tree->gpa_iova_map, map);
+}
+
+/**
+ * Remove existing mappings from the IOVA-only and GPA->IOVA trees
+ *
+ * @tree: The VhostIOVATree
+ * @map: The map to remove
+ */
+void vhost_iova_tree_remove_gpa(VhostIOVATree *iova_tree, DMAMap map)
+{
+    iova_tree_remove(iova_tree->gpa_iova_map, map);
+    iova_tree_remove(iova_tree->iova_map, map);
 }
diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
index 4adfd79ff0..0c4ba5abd5 100644
--- a/hw/virtio/vhost-iova-tree.h
+++ b/hw/virtio/vhost-iova-tree.h
@@ -21,7 +21,13 @@ G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete);
 
 const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
                                         const DMAMap *map);
-int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
+int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map,
+                              hwaddr taddr);
 void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map);
+const DMAMap *vhost_iova_tree_find_gpa(const VhostIOVATree *iova_tree,
+                                       const DMAMap *map);
+int vhost_iova_tree_map_alloc_gpa(VhostIOVATree *iova_tree, DMAMap *map,
+                                  hwaddr taddr);
+void vhost_iova_tree_remove_gpa(VhostIOVATree *iova_tree, DMAMap map);
 
 #endif
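The two-step allocation above (reserve an IOVA range in the allocator tree,
then record the translation in the matching mapping tree) is what callers rely
on for error handling: when the second step fails, map->translated_addr has
already been overwritten with the caller's taddr, which is how the vhost-vdpa
code below tells "allocation failed" apart from "insertion failed". A sketch
of the calling convention — the function name and addresses are made up, the
API calls match the header above:

    /*
     * Hypothetical caller of vhost_iova_tree_map_alloc_gpa(); 'tree' is an
     * already-created VhostIOVATree and 'gpa'/'size' describe a guest RAM
     * section, as in vhost_vdpa_listener_region_add().
     */
    static bool map_guest_section(VhostIOVATree *tree, hwaddr gpa,
                                  hwaddr size, Error **errp)
    {
        DMAMap map = {
            .size = size - 1,               /* DMAMap sizes are inclusive */
            .perm = IOMMU_RW,
        };
        int r = vhost_iova_tree_map_alloc_gpa(tree, &map, gpa);

        if (unlikely(r != IOVA_OK)) {
            error_setg(errp, "cannot allocate IOVA (%d)", r);
            if (map.translated_addr == gpa) {
                /* IOVA was reserved but the GPA->IOVA insert failed */
                vhost_iova_tree_remove_gpa(tree, map);
            }
            return false;
        }

        /* map.iova now holds the address to program into the device */
        return true;
    }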
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 37aca8b431..2481d49345 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -78,24 +78,39 @@ uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
  * @vaddr: Translated IOVA addresses
  * @iovec: Source qemu's VA addresses
  * @num: Length of iovec and minimum length of vaddr
+ * @gpas: Descriptors' GPAs, if backed by guest memory
  */
 static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
                                      hwaddr *addrs, const struct iovec *iovec,
-                                     size_t num)
+                                     size_t num, const hwaddr *gpas)
 {
     if (num == 0) {
         return true;
     }
 
     for (size_t i = 0; i < num; ++i) {
-        DMAMap needle = {
-            .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
-            .size = iovec[i].iov_len,
-        };
         Int128 needle_last, map_last;
         size_t off;
+        const DMAMap *map;
+        DMAMap needle;
+
+        /* Check if the descriptor is backed by guest memory */
+        if (gpas) {
+            /* Search the GPA->IOVA tree */
+            needle = (DMAMap) {
+                .translated_addr = gpas[i],
+                .size = iovec[i].iov_len,
+            };
+            map = vhost_iova_tree_find_gpa(svq->iova_tree, &needle);
+        } else {
+            /* Search the IOVA->HVA tree */
+            needle = (DMAMap) {
+                .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
+                .size = iovec[i].iov_len,
+            };
+            map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
+        }
 
-        const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
         /*
          * Map cannot be NULL since iova map contains all guest space and
          * qemu already has a physical address mapped
@@ -130,6 +145,7 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
  * @sg: Cache for hwaddr
  * @iovec: The iovec from the guest
  * @num: iovec length
+ * @addr: Descriptors' GPAs, if backed by guest memory
  * @more_descs: True if more descriptors come in the chain
  * @write: True if they are writeable descriptors
  *
@@ -137,7 +153,8 @@ static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
  */
 static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
                                         const struct iovec *iovec, size_t num,
-                                        bool more_descs, bool write)
+                                        const hwaddr *addr, bool more_descs,
+                                        bool write)
 {
     uint16_t i = svq->free_head, last = svq->free_head;
     unsigned n;
@@ -149,7 +166,7 @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
         return true;
     }
 
-    ok = vhost_svq_translate_addr(svq, sg, iovec, num);
+    ok = vhost_svq_translate_addr(svq, sg, iovec, num, addr);
     if (unlikely(!ok)) {
         return false;
     }
@@ -165,17 +182,18 @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
         descs[i].len = cpu_to_le32(iovec[n].iov_len);
 
         last = i;
-        i = cpu_to_le16(svq->desc_next[i]);
+        i = svq->desc_next[i];
     }
 
-    svq->free_head = le16_to_cpu(svq->desc_next[last]);
+    svq->free_head = svq->desc_next[last];
     return true;
 }
 
 static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
                                 const struct iovec *out_sg, size_t out_num,
+                                const hwaddr *out_addr,
                                 const struct iovec *in_sg, size_t in_num,
-                                unsigned *head)
+                                const hwaddr *in_addr, unsigned *head)
 {
     unsigned avail_idx;
     vring_avail_t *avail = svq->vring.avail;
@@ -191,13 +209,14 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
         return false;
     }
 
-    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0,
-                                     false);
+    ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
+                                     in_num > 0, false);
     if (unlikely(!ok)) {
         return false;
     }
 
-    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true);
+    ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr, false,
+                                     true);
     if (unlikely(!ok)) {
         return false;
     }
@@ -228,10 +247,12 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
     smp_mb();
 
     if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
-        uint16_t avail_event = *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]);
+        uint16_t avail_event = le16_to_cpu(
+                *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]));
         needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1);
     } else {
-        needs_kick = !(svq->vring.used->flags & VRING_USED_F_NO_NOTIFY);
+        needs_kick =
+                !(svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY));
     }
 
     if (!needs_kick) {
@@ -247,8 +268,9 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
  * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
  */
 int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
-                  size_t out_num, const struct iovec *in_sg, size_t in_num,
-                  VirtQueueElement *elem)
+                  size_t out_num, const hwaddr *out_addr,
+                  const struct iovec *in_sg, size_t in_num,
+                  const hwaddr *in_addr, VirtQueueElement *elem)
 {
     unsigned qemu_head;
     unsigned ndescs = in_num + out_num;
@@ -258,7 +280,8 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
         return -ENOSPC;
     }
 
-    ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head);
+    ok = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
+                             in_addr, &qemu_head);
     if (unlikely(!ok)) {
         return -EINVAL;
     }
@@ -274,8 +297,8 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
 static int vhost_svq_add_element(VhostShadowVirtqueue *svq,
                                  VirtQueueElement *elem)
 {
-    return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg,
-                         elem->in_num, elem);
+    return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->out_addr,
+                         elem->in_sg, elem->in_num, elem->in_addr, elem);
 }
 
 /**
@@ -365,7 +388,7 @@ static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
         return true;
     }
 
-    svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx);
+    svq->shadow_used_idx = le16_to_cpu(*(volatile uint16_t *)used_idx);
 
     return svq->last_used_idx != svq->shadow_used_idx;
 }
@@ -383,7 +406,7 @@ static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
 {
     if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
         uint16_t *used_event = (uint16_t *)&svq->vring.avail->ring[svq->vring.num];
-        *used_event = svq->shadow_used_idx;
+        *used_event = cpu_to_le16(svq->shadow_used_idx);
     } else {
         svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
     }
@@ -408,7 +431,7 @@ static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
                                              uint16_t num, uint16_t i)
 {
     for (uint16_t j = 0; j < (num - 1); ++j) {
-        i = le16_to_cpu(svq->desc_next[i]);
+        i = svq->desc_next[i];
     }
 
     return i;
@@ -683,7 +706,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
     svq->desc_state = g_new0(SVQDescState, svq->vring.num);
     svq->desc_next = g_new0(uint16_t, svq->vring.num);
     for (unsigned i = 0; i < svq->vring.num - 1; i++) {
-        svq->desc_next[i] = cpu_to_le16(i + 1);
+        svq->desc_next[i] = i + 1;
     }
 }
vubs->parent_realize(dev, errp); diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 3cdaa12ed5..7efbde3d4c 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -360,14 +360,20 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, llsize = int128_sub(llend, int128_make64(iova)); if (s->shadow_data) { int r; + hwaddr gpa = section->offset_within_address_space; - mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr, mem_region.size = int128_get64(llsize) - 1, mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly), - r = vhost_iova_tree_map_alloc(s->iova_tree, &mem_region); + r = vhost_iova_tree_map_alloc_gpa(s->iova_tree, &mem_region, gpa); if (unlikely(r != IOVA_OK)) { error_report("Can't allocate a mapping (%d)", r); + + if (mem_region.translated_addr == gpa) { + error_report("Insertion to GPA->IOVA tree failed"); + /* Remove the mapping from the IOVA-only tree */ + goto fail_map; + } goto fail; } @@ -386,7 +392,7 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, fail_map: if (s->shadow_data) { - vhost_iova_tree_remove(s->iova_tree, mem_region); + vhost_iova_tree_remove_gpa(s->iova_tree, mem_region); } fail: @@ -440,21 +446,18 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, if (s->shadow_data) { const DMAMap *result; - const void *vaddr = memory_region_get_ram_ptr(section->mr) + - section->offset_within_region + - (iova - section->offset_within_address_space); DMAMap mem_region = { - .translated_addr = (hwaddr)(uintptr_t)vaddr, + .translated_addr = section->offset_within_address_space, .size = int128_get64(llsize) - 1, }; - result = vhost_iova_tree_find_iova(s->iova_tree, &mem_region); + result = vhost_iova_tree_find_gpa(s->iova_tree, &mem_region); if (!result) { /* The memory listener map wasn't mapped */ return; } iova = result->iova; - vhost_iova_tree_remove(s->iova_tree, *result); + vhost_iova_tree_remove_gpa(s->iova_tree, *result); } vhost_vdpa_iotlb_batch_begin_once(s); /* @@ -1142,16 +1145,23 @@ static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, * * @v: Vhost-vdpa device * @needle: The area to search iova + * @taddr: The translated address (HVA) * @errorp: Error pointer */ static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, - Error **errp) + hwaddr taddr, Error **errp) { int r; - r = vhost_iova_tree_map_alloc(v->shared->iova_tree, needle); + r = vhost_iova_tree_map_alloc(v->shared->iova_tree, needle, taddr); if (unlikely(r != IOVA_OK)) { error_setg(errp, "Cannot allocate iova (%d)", r); + + if (needle->translated_addr == taddr) { + error_append_hint(errp, "Insertion to IOVA->HVA tree failed"); + /* Remove the mapping from the IOVA-only tree */ + vhost_iova_tree_remove(v->shared->iova_tree, *needle); + } return false; } @@ -1192,11 +1202,11 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, vhost_svq_get_vring_addr(svq, &svq_addr); driver_region = (DMAMap) { - .translated_addr = svq_addr.desc_user_addr, .size = driver_size - 1, .perm = IOMMU_RO, }; - ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); + ok = vhost_vdpa_svq_map_ring(v, &driver_region, svq_addr.desc_user_addr, + errp); if (unlikely(!ok)) { error_prepend(errp, "Cannot create vq driver region: "); return false; @@ -1206,11 +1216,11 @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, addr->avail_user_addr = driver_region.iova + avail_offset; device_region = (DMAMap) { - .translated_addr = svq_addr.used_user_addr, .size = device_size - 1, .perm = IOMMU_RW, }; - ok = 
vhost_vdpa_svq_map_ring(v, &device_region, errp); + ok = vhost_vdpa_svq_map_ring(v, &device_region, svq_addr.used_user_addr, + errp); if (unlikely(!ok)) { error_prepend(errp, "Cannot create vq device region: "); vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr); diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index ad05768ded..2eb5a14fa2 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -31,7 +31,7 @@ #include "trace.h" #include "qemu/error-report.h" #include "migration/misc.h" - +#include "system/reset.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" @@ -910,6 +910,8 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) } reset_stats(s); + s->stats_last_update = 0; + qemu_register_resettable(OBJECT(dev)); } static void virtio_balloon_device_unrealize(DeviceState *dev) @@ -917,6 +919,7 @@ static void virtio_balloon_device_unrealize(DeviceState *dev) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOBalloon *s = VIRTIO_BALLOON(dev); + qemu_unregister_resettable(OBJECT(dev)); if (s->free_page_bh) { qemu_bh_delete(s->free_page_bh); object_unref(OBJECT(s->iothread)); @@ -987,6 +990,27 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) } } +static ResettableState *virtio_balloon_get_reset_state(Object *obj) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(obj); + return &s->reset_state; +} + +static void virtio_balloon_reset_enter(Object *obj, ResetType type) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(obj); + + /* + * When waking up from standby/suspend-to-ram, do not reset stats. + */ + if (type == RESET_TYPE_WAKEUP) { + return; + } + + reset_stats(s); + s->stats_last_update = 0; +} + static void virtio_balloon_instance_init(Object *obj) { VirtIOBalloon *s = VIRTIO_BALLOON(obj); @@ -1038,6 +1062,7 @@ static void virtio_balloon_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + ResettableClass *rc = RESETTABLE_CLASS(klass); device_class_set_props(dc, virtio_balloon_properties); dc->vmsd = &vmstate_virtio_balloon; @@ -1050,6 +1075,9 @@ static void virtio_balloon_class_init(ObjectClass *klass, void *data) vdc->get_features = virtio_balloon_get_features; vdc->set_status = virtio_balloon_set_status; vdc->vmsd = &vmstate_virtio_balloon_device; + + rc->get_state = virtio_balloon_get_reset_state; + rc->phases.enter = virtio_balloon_reset_enter; } static const TypeInfo virtio_balloon_info = { diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index f41104a952..b6e7e01ef7 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -1504,11 +1504,11 @@ static void virtio_iommu_device_unrealize(DeviceState *dev) virtio_cleanup(vdev); } -static void virtio_iommu_device_reset(VirtIODevice *vdev) +static void virtio_iommu_device_reset_exit(Object *obj, ResetType type) { - VirtIOIOMMU *s = VIRTIO_IOMMU(vdev); + VirtIOIOMMU *s = VIRTIO_IOMMU(obj); - trace_virtio_iommu_device_reset(); + trace_virtio_iommu_device_reset_exit(); if (s->domains) { g_tree_destroy(s->domains); @@ -1668,6 +1668,7 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + ResettableClass *rc = RESETTABLE_CLASS(klass); device_class_set_props(dc, virtio_iommu_properties); dc->vmsd = &vmstate_virtio_iommu; @@ -1675,7 +1676,12 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data) 
set_bit(DEVICE_CATEGORY_MISC, dc->categories); vdc->realize = virtio_iommu_device_realize; vdc->unrealize = virtio_iommu_device_unrealize; - vdc->reset = virtio_iommu_device_reset; + + /* + * Use 'exit' reset phase to make sure all DMA requests + * have been quiesced during 'enter' or 'hold' phase + */ + rc->phases.exit = virtio_iommu_device_reset_exit; vdc->get_config = virtio_iommu_get_config; vdc->set_config = virtio_iommu_set_config; vdc->get_features = virtio_iommu_get_features; diff --git a/hw/virtio/virtio-nsm.c b/hw/virtio/virtio-nsm.c index 098e1aeac6..b22aa74e34 100644 --- a/hw/virtio/virtio-nsm.c +++ b/hw/virtio/virtio-nsm.c @@ -1596,7 +1596,7 @@ static void handle_input(VirtIODevice *vdev, VirtQueue *vq) g_free(req.iov_base); g_free(res.iov_base); virtqueue_push(vq, out_elem, 0); - virtqueue_push(vq, in_elem, in_elem->in_sg->iov_len); + virtqueue_push(vq, in_elem, sz); virtio_notify(vdev, vq); return; diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 561b375dc8..3a0ee7e8e7 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -264,8 +264,8 @@ void cxl_device_register_block_init(Object *obj, CXLDeviceState *dev, typedef struct CXLType3Dev CXLType3Dev; typedef struct CSWMBCCIDev CSWMBCCIDev; /* Set up default values for the register block */ -void cxl_device_register_init_t3(CXLType3Dev *ct3d); -void cxl_device_register_init_swcci(CSWMBCCIDev *sw); +void cxl_device_register_init_t3(CXLType3Dev *ct3d, int msi_n); +void cxl_device_register_init_swcci(CSWMBCCIDev *sw, int msi_n); /* * CXL r3.1 Section 8.2.8.1: CXL Device Capabilities Array Register diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h index 450cbef6c2..c5d2d318d3 100644 --- a/include/hw/pci/pcie_sriov.h +++ b/include/hw/pci/pcie_sriov.h @@ -16,9 +16,7 @@ #include "hw/pci/pci.h" typedef struct PCIESriovPF { - uint16_t num_vfs; /* Number of virtual functions created */ uint8_t vf_bar_type[PCI_NUM_REGIONS]; /* Store type for each VF bar */ - const char *vfname; /* Reference to the device type used for the VFs */ PCIDevice **vf; /* Pointer to an array of num_vfs VF devices */ } PCIESriovPF; @@ -27,10 +25,11 @@ typedef struct PCIESriovVF { uint16_t vf_number; /* Logical VF number of this function */ } PCIESriovVF; -void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, +bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, const char *vfname, uint16_t vf_dev_id, uint16_t init_vfs, uint16_t total_vfs, - uint16_t vf_offset, uint16_t vf_stride); + uint16_t vf_offset, uint16_t vf_stride, + Error **errp); void pcie_sriov_pf_exit(PCIDevice *dev); /* Set up a VF bar in the SR/IOV bar area */ @@ -58,6 +57,8 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize); void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, uint32_t val, int len); +void pcie_sriov_pf_post_load(PCIDevice *dev); + /* Reset SR/IOV */ void pcie_sriov_pf_reset(PCIDevice *dev); diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h index b12c18a43b..0456c211c6 100644 --- a/include/hw/virtio/virtio-balloon.h +++ b/include/hw/virtio/virtio-balloon.h @@ -16,6 +16,7 @@ #define QEMU_VIRTIO_BALLOON_H #include "standard-headers/linux/virtio_balloon.h" +#include "hw/resettable.h" #include "hw/virtio/virtio.h" #include "system/iothread.h" #include "qom/object.h" @@ -71,6 +72,9 @@ struct VirtIOBalloon { bool qemu_4_0_config_size; uint32_t poison_val; + + /* State of the resettable container */ + ResettableState reset_state; }; #endif 
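The balloon and IOMMU hunks above replace the legacy VirtioDeviceClass reset callback with the three-phase Resettable interface. For readers new to that interface, here is a minimal sketch of the pattern, assuming a hypothetical MyDevState device (the MYDEV() cast macro and QOM type registration are omitted); this is illustration only, not code from this series:

    #include "qemu/osdep.h"
    #include "hw/qdev-core.h"
    #include "hw/resettable.h"

    typedef struct MyDevState {
        DeviceState parent_obj;
        /* Storage the Resettable machinery uses to track nested resets */
        ResettableState reset_state;
        uint32_t counter;
    } MyDevState;

    static ResettableState *mydev_get_reset_state(Object *obj)
    {
        return &MYDEV(obj)->reset_state;
    }

    static void mydev_reset_enter(Object *obj, ResetType type)
    {
        /* As virtio-balloon does above, keep state across a wakeup */
        if (type == RESET_TYPE_WAKEUP) {
            return;
        }
        MYDEV(obj)->counter = 0;
    }

    static void mydev_class_init(ObjectClass *klass, void *data)
    {
        ResettableClass *rc = RESETTABLE_CLASS(klass);

        rc->get_state = mydev_get_reset_state;
        rc->phases.enter = mydev_reset_enter;
    }

Note the asymmetry between the two conversions: virtio-balloon hooks phases.enter to drop its volatile stats early, while virtio-iommu hooks phases.exit so that its mappings outlive the 'enter' and 'hold' phases in which DMA-capable devices quiesce their outstanding requests.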
diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
index 44a45931d5..16d354a814 100644
--- a/include/qemu/iova-tree.h
+++ b/include/qemu/iova-tree.h
@@ -40,6 +40,28 @@ typedef struct DMAMap {
 } QEMU_PACKED DMAMap;
 typedef gboolean (*iova_tree_iterator)(DMAMap *map);

+/**
+ * gpa_tree_new:
+ *
+ * Create a new GPA->IOVA tree.
+ *
+ * Returns: the tree pointer on success, or NULL otherwise.
+ */
+IOVATree *gpa_tree_new(void);
+
+/**
+ * gpa_tree_insert:
+ *
+ * @tree: The GPA->IOVA tree we're inserting the mapping to
+ * @map: The GPA->IOVA mapping to insert
+ *
+ * Inserts a GPA range into the GPA->IOVA tree. If the new range
+ * overlaps an existing one, IOVA_ERR_OVERLAP will be returned.
+ *
+ * Return: 0 if successful, < 0 otherwise.
+ */
+int gpa_tree_insert(IOVATree *tree, const DMAMap *map);
+
 /**
  * iova_tree_new:
  *
diff --git a/net/vhost-user.c b/net/vhost-user.c
index 12555518e8..0b235e50c6 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -16,6 +16,7 @@
 #include "chardev/char-fe.h"
 #include "qapi/error.h"
 #include "qapi/qapi-commands-net.h"
+#include "qapi/qapi-events-net.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
 #include "qemu/option.h"
@@ -271,6 +272,7 @@ static void chr_closed_bh(void *opaque)
     if (err) {
         error_report_err(err);
     }
+    qapi_event_send_netdev_vhost_user_disconnected(name);
 }

 static void net_vhost_user_event(void *opaque, QEMUChrEvent event)
@@ -300,6 +302,7 @@ static void net_vhost_user_event(void *opaque, QEMUChrEvent event)
                                          net_vhost_user_watch, s);
         qmp_set_link(name, true, &err);
         s->started = true;
+        qapi_event_send_netdev_vhost_user_connected(name, chr->label);
         break;
     case CHR_EVENT_CLOSED:
         /* a close event may happen during a read/write, but vhost
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 231b45246c..bd01866878 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -510,14 +510,20 @@ static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
                                   bool write)
 {
     DMAMap map = {};
+    hwaddr taddr = (hwaddr)(uintptr_t)buf;
     int r;

-    map.translated_addr = (hwaddr)(uintptr_t)buf;
     map.size = size - 1;
     map.perm = write ? IOMMU_RW : IOMMU_RO,
-    r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &map);
+    r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &map, taddr);
     if (unlikely(r != IOVA_OK)) {
         error_report("Cannot map injected element");
+
+        if (map.translated_addr == taddr) {
+            error_report("Insertion to IOVA->HVA tree failed");
+            /* Remove the mapping from the IOVA-only tree */
+            goto dma_map_err;
+        }
         return r;
     }

@@ -643,7 +649,7 @@ static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s,
     VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
     int r;

-    r = vhost_svq_add(svq, out_sg, out_num, in_sg, in_num, NULL);
+    r = vhost_svq_add(svq, out_sg, out_num, NULL, in_sg, in_num, NULL, NULL);
     if (unlikely(r != 0)) {
         if (unlikely(r == -ENOSPC)) {
             qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
diff --git a/qapi/net.json b/qapi/net.json
index 2739a2f423..310cc4fd19 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -1031,3 +1031,43 @@
 ##
 { 'event': 'NETDEV_STREAM_DISCONNECTED',
   'data': { 'netdev-id': 'str' } }
+
+##
+# @NETDEV_VHOST_USER_CONNECTED:
+#
+# Emitted when the vhost-user chardev is connected
+#
+# @netdev-id: QEMU netdev id that is connected
+#
+# @chardev-id: The character device id used by the QEMU netdev
+#
+# Since: 10.0
+#
+# .. qmp-example::
+#
+#     <- { "timestamp": {"seconds": 1739538638, "microseconds": 354181 },
+#          "event": "NETDEV_VHOST_USER_CONNECTED",
+#          "data": { "netdev-id": "netdev0", "chardev-id": "chr0" } }
+#
+##
+{ 'event': 'NETDEV_VHOST_USER_CONNECTED',
+  'data': { 'netdev-id': 'str', 'chardev-id': 'str' } }
+
+##
+# @NETDEV_VHOST_USER_DISCONNECTED:
+#
+# Emitted when the vhost-user chardev is disconnected
+#
+# @netdev-id: QEMU netdev id that is disconnected
+#
+# Since: 10.0
+#
+# .. qmp-example::
+#
+#     <- { "timestamp": { "seconds": 1739538634, "microseconds": 920450 },
+#          "event": "NETDEV_VHOST_USER_DISCONNECTED",
+#          "data": { "netdev-id": "netdev0" } }
+#
+##
+{ 'event': 'NETDEV_VHOST_USER_DISCONNECTED',
+  'data': { 'netdev-id': 'str' } }
diff --git a/tests/functional/meson.build b/tests/functional/meson.build
index b516d21cba..111d8bab26 100644
--- a/tests/functional/meson.build
+++ b/tests/functional/meson.build
@@ -46,6 +46,7 @@ test_timeouts = {
   'riscv64_tuxrun' : 120,
   's390x_ccw_virtio' : 420,
   'sh4_tuxrun' : 240,
+  'virtio_balloon': 120,
   'x86_64_kvm_xen' : 180,
 }

@@ -285,6 +286,7 @@ tests_x86_64_system_thorough = [
   'linux_initrd',
   'multiprocess',
   'netdev_ethtool',
+  'virtio_balloon',
   'virtio_gpu',
   'x86_64_hotplug_blk',
   'x86_64_hotplug_cpu',
diff --git a/tests/functional/test_virtio_balloon.py b/tests/functional/test_virtio_balloon.py
new file mode 100755
index 0000000000..67b48e1b4e
--- /dev/null
+++ b/tests/functional/test_virtio_balloon.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+#
+# virtio-balloon tests
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+import time
+
+from qemu_test import QemuSystemTest, Asset
+from qemu_test import wait_for_console_pattern
+from qemu_test import exec_command_and_wait_for_pattern
+
+UNSET_STATS_VALUE = 18446744073709551615
+
+
+class VirtioBalloonx86(QemuSystemTest):
+
+    ASSET_KERNEL = Asset(
+        ('https://archives.fedoraproject.org/pub/archive/fedora/linux/releases'
+         '/31/Server/x86_64/os/images/pxeboot/vmlinuz'),
+        'd4738d03dbbe083ca610d0821d0a8f1488bebbdccef54ce33e3adb35fda00129')
+
+    ASSET_INITRD = Asset(
+        ('https://archives.fedoraproject.org/pub/archive/fedora/linux/releases'
+         '/31/Server/x86_64/os/images/pxeboot/initrd.img'),
+        '277cd6c7adf77c7e63d73bbb2cded8ef9e2d3a2f100000e92ff1f8396513cd8b')
+
+    ASSET_DISKIMAGE = Asset(
+        ('https://archives.fedoraproject.org/pub/archive/fedora/linux/releases'
+         '/31/Cloud/x86_64/images/Fedora-Cloud-Base-31-1.9.x86_64.qcow2'),
+        'e3c1b309d9203604922d6e255c2c5d098a309c2d46215d8fc026954f3c5c27a0')
+
+    DEFAULT_KERNEL_PARAMS = ('root=/dev/vda1 console=ttyS0 net.ifnames=0 '
+                             'rd.rescue')
+
+    def wait_for_console_pattern(self, success_message, vm=None):
+        wait_for_console_pattern(
+            self,
+            success_message,
+            failure_message="Kernel panic - not syncing",
+            vm=vm,
+        )
+
+    def mount_root(self):
+        self.wait_for_console_pattern('Entering emergency mode.')
+        prompt = '# '
+        self.wait_for_console_pattern(prompt)
+
+        exec_command_and_wait_for_pattern(self, 'mount /dev/vda1 /sysroot',
+                                          prompt)
+        exec_command_and_wait_for_pattern(self, 'chroot /sysroot',
+                                          prompt)
+        exec_command_and_wait_for_pattern(self, "modprobe virtio-balloon",
+                                          prompt)
+
+    def assert_initial_stats(self):
+        ret = self.vm.qmp('qom-get',
+                          {'path': '/machine/peripheral/balloon',
+                           'property': 'guest-stats'})['return']
+        when = ret.get('last-update')
+        assert when == 0
+        stats = ret.get('stats')
+        for name, val in stats.items():
+            assert val == UNSET_STATS_VALUE
+
+    def assert_running_stats(self, then):
+        ret = self.vm.qmp('qom-get',
+                          {'path': '/machine/peripheral/balloon',
+                           'property': 'guest-stats'})['return']
+        when = ret.get('last-update')
+        now = time.time()
+
+        assert when > then and when < now
+        stats = ret.get('stats')
+        # Stats we expect this particular kernel to have set
+        expectData = [
+            "stat-available-memory",
+            "stat-disk-caches",
+            "stat-free-memory",
+            "stat-htlb-pgalloc",
+            "stat-htlb-pgfail",
+            "stat-major-faults",
+            "stat-minor-faults",
+            "stat-swap-in",
+            "stat-swap-out",
+            "stat-total-memory",
+        ]
+        for name, val in stats.items():
+            if name in expectData:
+                assert val != UNSET_STATS_VALUE
+            else:
+                assert val == UNSET_STATS_VALUE
+
+    def test_virtio_balloon_stats(self):
+        self.set_machine('q35')
+        kernel_path = self.ASSET_KERNEL.fetch()
+        initrd_path = self.ASSET_INITRD.fetch()
+        diskimage_path = self.ASSET_DISKIMAGE.fetch()
+
+        self.vm.set_console()
+        self.vm.add_args("-S")
+        self.vm.add_args("-cpu", "max")
+        self.vm.add_args("-m", "2G")
+        # Slow down the BIOS phase with a boot menu, so that after a
+        # system reset we can reliably catch the clean stats again in
+        # the BIOS phase before the guest OS launches
+        self.vm.add_args("-boot", "menu=on")
+        self.vm.add_args("-machine", "q35,accel=kvm:tcg")
+        self.vm.add_args("-device", "virtio-balloon,id=balloon")
+        self.vm.add_args('-drive',
+                         f'file={diskimage_path},if=none,id=drv0,snapshot=on')
+        self.vm.add_args('-device', 'virtio-blk-pci,bus=pcie.0,' +
+                         'drive=drv0,id=virtio-disk0,bootindex=1')
+
+        self.vm.add_args(
+            "-kernel",
+            kernel_path,
+            "-initrd",
+            initrd_path,
+            "-append",
+            self.DEFAULT_KERNEL_PARAMS
+        )
+        self.vm.launch()
+
+        # Poll stats every 100ms
+        self.vm.qmp('qom-set',
+                    {'path': '/machine/peripheral/balloon',
+                     'property': 'guest-stats-polling-interval',
+                     'value': 100 })
+
+        # We've not run any guest code yet, neither BIOS nor guest OS,
+        # so stats should all be at their default values
+        self.assert_initial_stats()
+
+        self.vm.qmp('cont')
+
+        then = time.time()
+        self.mount_root()
+        self.assert_running_stats(then)
+
+        # There is a race window between these two commands, where we
+        # rely on '-boot menu=on' to (hopefully) ensure we're still
+        # executing the BIOS when QEMU processes the 'stop', and thus
+        # have not loaded the virtio-balloon driver in the guest
+        self.vm.qmp('system_reset')
+        self.vm.qmp('stop')
+
+        # If the above assumption held, we're in the BIOS now and
+        # stats should all be back at their default values
+        self.assert_initial_stats()
+        self.vm.qmp('cont')
+
+        then = time.time()
+        self.mount_root()
+        self.assert_running_stats(then)
+
+
+if __name__ == '__main__':
+    QemuSystemTest.main()
diff --git a/tests/qtest/libqos/pci.c b/tests/qtest/libqos/pci.c
index b23d72346b..a59197b992 100644
--- a/tests/qtest/libqos/pci.c
+++ b/tests/qtest/libqos/pci.c
@@ -328,8 +328,6 @@ bool qpci_msix_pending(QPCIDevice *dev, uint16_t entry)

     g_assert(dev->msix_enabled);
     pba_entry = qpci_io_readl(dev, dev->msix_pba_bar, dev->msix_pba_off + off);
-    qpci_io_writel(dev, dev->msix_pba_bar, dev->msix_pba_off + off,
-                   pba_entry & ~(1 << bit_n));
     return (pba_entry & (1 << bit_n)) != 0;
 }
diff --git a/util/iova-tree.c b/util/iova-tree.c
index 06295e2755..5b0c95ff15 100644
--- a/util/iova-tree.c
+++ b/util/iova-tree.c
@@ -257,3 +257,49 @@ void iova_tree_destroy(IOVATree *tree)
     g_tree_destroy(tree->tree);
     g_free(tree);
 }
+
+static int gpa_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
+{
+    const DMAMap *m1 = a, *m2 = b;
+
+    if (m1->translated_addr > m2->translated_addr + m2->size) {
+        return 1;
+    }
+
+    if (m1->translated_addr + m1->size < m2->translated_addr) {
+        return -1;
+    }
+
+    /* Overlapped */
+    return 0;
+}
+
+IOVATree *gpa_tree_new(void)
+{
+    IOVATree *gpa_tree = g_new0(IOVATree, 1);
+
+    gpa_tree->tree = g_tree_new_full(gpa_tree_compare, NULL, g_free, NULL);
+
+    return gpa_tree;
+}
+
+int gpa_tree_insert(IOVATree *tree, const DMAMap *map)
+{
+    DMAMap *new;
+
+    if (map->translated_addr + map->size < map->translated_addr ||
+        map->perm == IOMMU_NONE) {
+        return IOVA_ERR_INVALID;
+    }
+
+    /* We don't allow inserting ranges that overlap with existing ones */
+    if (iova_tree_find(tree, map)) {
+        return IOVA_ERR_OVERLAP;
+    }
+
+    new = g_new0(DMAMap, 1);
+    memcpy(new, map, sizeof(*new));
+    iova_tree_insert_internal(tree->tree, new);
+
+    return IOVA_OK;
+}
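To make the intended use of the new GPA tree concrete, here is a minimal usage sketch (a hypothetical caller, not part of the patch). The GPA is the search key stored in translated_addr, the IOVA rides along as payload, and DMAMap sizes are inclusive, hence the '- 1':

    #include "qemu/osdep.h"
    #include "qemu/iova-tree.h"

    /* Hypothetical helper: record a 4 KiB GPA->IOVA translation */
    static int map_one_page(IOVATree *gpa_tree, hwaddr gpa, hwaddr iova)
    {
        DMAMap map = {
            .translated_addr = gpa,   /* GPA is the tree's search key */
            .iova = iova,             /* IOVA previously picked by the allocator */
            .size = 4096 - 1,         /* DMAMap sizes are inclusive */
            .perm = IOMMU_RW,         /* IOMMU_NONE would be rejected */
        };

        /* Returns IOVA_OK, IOVA_ERR_INVALID or IOVA_ERR_OVERLAP */
        return gpa_tree_insert(gpa_tree, &map);
    }

Because gpa_tree_insert() refuses overlapping or wrapping ranges instead of replacing existing entries, the vhost-vdpa hunks above can detect a failed GPA->IOVA insertion (the DMAMap still carries the caller's translated_addr on that path) and unwind the corresponding IOVA-only tree entry.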