From 5aed8b0f0be25d2554f8bd76211e43b51e58f736 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:21 +0800 Subject: [PATCH 01/21] vfio/igd: Remove GTT write quirk in IO BAR 4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IO BAR4 of IGD devices contains a pair of 32-bit address/data registers, MMIO_Index (0x0) and MMIO_Data (0x4), which provide access to the MMIO BAR0 (GTTMMADR) from IO space. These registers are probably only used by the VBIOS, and are not documented by intel. The observed layout of MMIO_Index register is: 31 2 1 0 +-------------------------------------------------------------------+ | Offset | Rsvd | Sel | +-------------------------------------------------------------------+ - Offset: Byte offset in specified region, 4-byte aligned. - Sel: Region selector 0: MMIO register region (first half of MMIO BAR0) 1: GTT region (second half of MMIO BAR0). Pre Gen11 only. Currently, QEMU implements a quirk that adjusts the guest Data Stolen Memory (DSM) region address to be (addr - host BDSM + guest BDSM) when programming GTT entries via IO BAR4, assuming guest still programs GTT with host DSM address, which is not the case. Guest's BDSM register is emulated and initialized to 0 at startup by QEMU, then SeaBIOS programs its value[1]. As result, the address programmed to GTT entries by VBIOS running in guest are valid GPA, and this unnecessary adjustment brings inconsistency. [1] https://gitlab.com/qemu-project/seabios/-/blob/1.12-stable/src/fw/pciinit.c#L319-332 Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-2-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 191 +------------------------------------------------- 1 file changed, 1 insertion(+), 190 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index b1a237edd6..ca3a32f4f2 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -106,12 +106,6 @@ static int igd_gen(VFIOPCIDevice *vdev) return -1; } -typedef struct VFIOIGDQuirk { - struct VFIOPCIDevice *vdev; - uint32_t index; - uint64_t bdsm; -} VFIOIGDQuirk; - #define IGD_GMCH 0x50 /* Graphics Control Register */ #define IGD_BDSM 0x5c /* Base Data of Stolen Memory */ #define IGD_BDSM_GEN11 0xc0 /* Base Data of Stolen Memory of gen 11 and later */ @@ -300,129 +294,6 @@ static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev, return ret; } -/* - * IGD Gen8 and newer support up to 8MB for the GTT and use a 64bit PTE - * entry, older IGDs use 2MB and 32bit. Each PTE maps a 4k page. Therefore - * we either have 2M/4k * 4 = 2k or 8M/4k * 8 = 16k as the maximum iobar index - * for programming the GTT. - * - * See linux:include/drm/i915_drm.h for shift and mask values. - */ -static int vfio_igd_gtt_max(VFIOPCIDevice *vdev) -{ - uint32_t gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch)); - int gen = igd_gen(vdev); - uint64_t ggms_size = igd_gtt_memory_size(gen, gmch); - - return (ggms_size / (4 * KiB)) * (gen < 8 ? 4 : 8); -} - -/* - * The IGD ROM will make use of stolen memory (GGMS) for support of VESA modes. - * Somehow the host stolen memory range is used for this, but how the ROM gets - * it is a mystery, perhaps it's hardcoded into the ROM. Thankfully though, it - * reprograms the GTT through the IOBAR where we can trap it and transpose the - * programming to the VM allocated buffer. That buffer gets reserved by the VM - * firmware via the fw_cfg entry added below. Here we're just monitoring the - * IOBAR address and data registers to detect a write sequence targeting the - * GTTADR. This code is developed by observed behavior and doesn't have a - * direct spec reference, unfortunately. - */ -static uint64_t vfio_igd_quirk_data_read(void *opaque, - hwaddr addr, unsigned size) -{ - VFIOIGDQuirk *igd = opaque; - VFIOPCIDevice *vdev = igd->vdev; - - igd->index = ~0; - - return vfio_region_read(&vdev->bars[4].region, addr + 4, size); -} - -static void vfio_igd_quirk_data_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIOIGDQuirk *igd = opaque; - VFIOPCIDevice *vdev = igd->vdev; - uint64_t val = data; - int gen = igd_gen(vdev); - - /* - * Programming the GGMS starts at index 0x1 and uses every 4th index (ie. - * 0x1, 0x5, 0x9, 0xd,...). For pre-Gen8 each 4-byte write is a whole PTE - * entry, with 0th bit enable set. For Gen8 and up, PTEs are 64bit, so - * entries 0x5 & 0xd are the high dword, in our case zero. Each PTE points - * to a 4k page, which we translate to a page from the VM allocated region, - * pointed to by the BDSM register. If this is not set, we fail. - * - * We trap writes to the full configured GTT size, but we typically only - * see the vBIOS writing up to (nearly) the 1MB barrier. In fact it often - * seems to miss the last entry for an even 1MB GTT. Doing a gratuitous - * write of that last entry does work, but is hopefully unnecessary since - * we clear the previous GTT on initialization. - */ - if ((igd->index % 4 == 1) && igd->index < vfio_igd_gtt_max(vdev)) { - if (gen < 8 || (igd->index % 8 == 1)) { - uint64_t base; - - if (gen < 11) { - base = pci_get_long(vdev->pdev.config + IGD_BDSM); - } else { - base = pci_get_quad(vdev->pdev.config + IGD_BDSM_GEN11); - } - if (!base) { - hw_error("vfio-igd: Guest attempted to program IGD GTT before " - "BIOS reserved stolen memory. Unsupported BIOS?"); - } - - val = data - igd->bdsm + base; - } else { - val = 0; /* upper 32bits of pte, we only enable below 4G PTEs */ - } - - trace_vfio_pci_igd_bar4_write(vdev->vbasedev.name, - igd->index, data, val); - } - - vfio_region_write(&vdev->bars[4].region, addr + 4, val, size); - - igd->index = ~0; -} - -static const MemoryRegionOps vfio_igd_data_quirk = { - .read = vfio_igd_quirk_data_read, - .write = vfio_igd_quirk_data_write, - .endianness = DEVICE_LITTLE_ENDIAN, -}; - -static uint64_t vfio_igd_quirk_index_read(void *opaque, - hwaddr addr, unsigned size) -{ - VFIOIGDQuirk *igd = opaque; - VFIOPCIDevice *vdev = igd->vdev; - - igd->index = ~0; - - return vfio_region_read(&vdev->bars[4].region, addr, size); -} - -static void vfio_igd_quirk_index_write(void *opaque, hwaddr addr, - uint64_t data, unsigned size) -{ - VFIOIGDQuirk *igd = opaque; - VFIOPCIDevice *vdev = igd->vdev; - - igd->index = data; - - vfio_region_write(&vdev->bars[4].region, addr, data, size); -} - -static const MemoryRegionOps vfio_igd_index_quirk = { - .read = vfio_igd_quirk_index_read, - .write = vfio_igd_quirk_index_write, - .endianness = DEVICE_LITTLE_ENDIAN, -}; - #define IGD_GGC_MMIO_OFFSET 0x108040 #define IGD_BDSM_MMIO_OFFSET 0x1080C0 @@ -494,14 +365,11 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) g_autofree struct vfio_region_info *opregion = NULL; g_autofree struct vfio_region_info *host = NULL; g_autofree struct vfio_region_info *lpc = NULL; - VFIOQuirk *quirk; - VFIOIGDQuirk *igd; PCIDevice *lpc_bridge; - int i, ret, gen; + int ret, gen; uint64_t ggms_size, gms_size; uint64_t *bdsm_size; uint32_t gmch; - uint16_t cmd_orig, cmd; Error *err = NULL; /* @@ -634,32 +502,6 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) return; } - /* Setup our quirk to munge GTT addresses to the VM allocated buffer */ - quirk = vfio_quirk_alloc(2); - igd = quirk->data = g_malloc0(sizeof(*igd)); - igd->vdev = vdev; - igd->index = ~0; - if (gen < 11) { - igd->bdsm = vfio_pci_read_config(&vdev->pdev, IGD_BDSM, 4); - } else { - igd->bdsm = vfio_pci_read_config(&vdev->pdev, IGD_BDSM_GEN11, 4); - igd->bdsm |= - (uint64_t)vfio_pci_read_config(&vdev->pdev, IGD_BDSM_GEN11 + 4, 4) << 32; - } - igd->bdsm &= ~((1 * MiB) - 1); /* 1MB aligned */ - - memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk, - igd, "vfio-igd-index-quirk", 4); - memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, - 0, &quirk->mem[0], 1); - - memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_igd_data_quirk, - igd, "vfio-igd-data-quirk", 4); - memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, - 4, &quirk->mem[1], 1); - - QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); - /* * Allow user to override dsm size using x-igd-gms option, in multiples of * 32MiB. This option should only be used when the desired size cannot be @@ -717,37 +559,6 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) pci_set_quad(vdev->emulated_config_bits + IGD_BDSM_GEN11, ~0); } - /* - * This IOBAR gives us access to GTTADR, which allows us to write to - * the GTT itself. So let's go ahead and write zero to all the GTT - * entries to avoid spurious DMA faults. Be sure I/O access is enabled - * before talking to the device. - */ - if (pread(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig), - vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) { - error_report("IGD device %s - failed to read PCI command register", - vdev->vbasedev.name); - } - - cmd = cmd_orig | PCI_COMMAND_IO; - - if (pwrite(vdev->vbasedev.fd, &cmd, sizeof(cmd), - vdev->config_offset + PCI_COMMAND) != sizeof(cmd)) { - error_report("IGD device %s - failed to write PCI command register", - vdev->vbasedev.name); - } - - for (i = 1; i < vfio_igd_gtt_max(vdev); i += 4) { - vfio_region_write(&vdev->bars[4].region, 0, i, 4); - vfio_region_write(&vdev->bars[4].region, 4, 0, 4); - } - - if (pwrite(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig), - vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) { - error_report("IGD device %s - failed to restore PCI command register", - vdev->vbasedev.name); - } - trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, (ggms_size + gms_size) / MiB); } From 5faec6a5e2202d3460b6b0550ec4ad897be3c76f Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:22 +0800 Subject: [PATCH 02/21] vfio/igd: Do not include GTT stolen size in etc/igd-bdsm-size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Though GTT Stolen Memory (GSM) is right below Data Stolen Memory (DSM) in host address space, direct access to GSM is prohibited, and it is not mapped to guest address space. Both host and guest accesses GSM indirectly through the second half of MMIO BAR0 (GTTMMADR). Guest firmware only need to reserve a memory region for DSM and program the BDSM register with the base address of that region, that's actually what both SeaBIOS[1] and IgdAssignmentDxe does now. [1] https://gitlab.com/qemu-project/seabios/-/blob/1.12-stable/src/fw/pciinit.c#L319-332 Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-3-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index ca3a32f4f2..dda4c7bb5d 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -112,28 +112,8 @@ static int igd_gen(VFIOPCIDevice *vdev) #define IGD_GMCH_GEN6_GMS_SHIFT 3 /* SNB_GMCH in i915 */ #define IGD_GMCH_GEN6_GMS_MASK 0x1f -#define IGD_GMCH_GEN6_GGMS_SHIFT 8 -#define IGD_GMCH_GEN6_GGMS_MASK 0x3 #define IGD_GMCH_GEN8_GMS_SHIFT 8 /* BDW_GMCH in i915 */ #define IGD_GMCH_GEN8_GMS_MASK 0xff -#define IGD_GMCH_GEN8_GGMS_SHIFT 6 -#define IGD_GMCH_GEN8_GGMS_MASK 0x3 - -static uint64_t igd_gtt_memory_size(int gen, uint16_t gmch) -{ - uint64_t ggms; - - if (gen < 8) { - ggms = (gmch >> IGD_GMCH_GEN6_GGMS_SHIFT) & IGD_GMCH_GEN6_GGMS_MASK; - } else { - ggms = (gmch >> IGD_GMCH_GEN8_GGMS_SHIFT) & IGD_GMCH_GEN8_GGMS_MASK; - if (ggms != 0) { - ggms = 1ULL << ggms; - } - } - - return ggms * MiB; -} static uint64_t igd_stolen_memory_size(int gen, uint32_t gmch) { @@ -367,7 +347,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) g_autofree struct vfio_region_info *lpc = NULL; PCIDevice *lpc_bridge; int ret, gen; - uint64_t ggms_size, gms_size; + uint64_t gms_size; uint64_t *bdsm_size; uint32_t gmch; Error *err = NULL; @@ -527,7 +507,6 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) } } - ggms_size = igd_gtt_memory_size(gen, gmch); gms_size = igd_stolen_memory_size(gen, gmch); /* @@ -539,7 +518,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) * config offset 0x5C. */ bdsm_size = g_malloc(sizeof(*bdsm_size)); - *bdsm_size = cpu_to_le64(ggms_size + gms_size); + *bdsm_size = cpu_to_le64(gms_size); fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size", bdsm_size, sizeof(*bdsm_size)); @@ -559,6 +538,5 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) pci_set_quad(vdev->emulated_config_bits + IGD_BDSM_GEN11, ~0); } - trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, - (ggms_size + gms_size) / MiB); + trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, (gms_size / MiB)); } From ae9d9ec4a643aae8704b4252b402488bde7b4be4 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:23 +0800 Subject: [PATCH 03/21] vfio/igd: Consolidate OpRegion initialization into a single function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both x-igd-opregion option and legacy mode require identical steps to set up OpRegion for IGD devices. Consolidate these steps into a single vfio_pci_igd_setup_opregion function. The function call in pci.c is wrapped with ifdef temporarily to prevent build error for non-x86 archs, it will be removed after we decouple it from legacy mode. Additionally, move vfio_pci_igd_opregion_init to igd.c to prevent it from being compiled in non-x86 builds. Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-4-tomitamoeko@gmail.com [ clg: Fixed spelling in vfio_pci_igd_setup_opregion() ] Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 101 +++++++++++++++++++++++++++++++++++-------- hw/vfio/pci-quirks.c | 50 --------------------- hw/vfio/pci.c | 22 ++-------- hw/vfio/pci.h | 4 +- 4 files changed, 88 insertions(+), 89 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index dda4c7bb5d..113ad56ad4 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -106,6 +106,7 @@ static int igd_gen(VFIOPCIDevice *vdev) return -1; } +#define IGD_ASLS 0xfc /* ASL Storage Register */ #define IGD_GMCH 0x50 /* Graphics Control Register */ #define IGD_BDSM 0x5c /* Base Data of Stolen Memory */ #define IGD_BDSM_GEN11 0xc0 /* Base Data of Stolen Memory of gen 11 and later */ @@ -138,6 +139,82 @@ static uint64_t igd_stolen_memory_size(int gen, uint32_t gmch) return 0; } +/* + * The OpRegion includes the Video BIOS Table, which seems important for + * telling the driver what sort of outputs it has. Without this, the device + * may work in the guest, but we may not get output. This also requires BIOS + * support to reserve and populate a section of guest memory sufficient for + * the table and to write the base address of that memory to the ASLS register + * of the IGD device. + */ +static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, + struct vfio_region_info *info, + Error **errp) +{ + int ret; + + vdev->igd_opregion = g_malloc0(info->size); + ret = pread(vdev->vbasedev.fd, vdev->igd_opregion, + info->size, info->offset); + if (ret != info->size) { + error_setg(errp, "failed to read IGD OpRegion"); + g_free(vdev->igd_opregion); + vdev->igd_opregion = NULL; + return false; + } + + /* + * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to + * allocate 32bit reserved memory for, copy these contents into, and write + * the reserved memory base address to the device ASLS register at 0xFC. + * Alignment of this reserved region seems flexible, but using a 4k page + * alignment seems to work well. This interface assumes a single IGD + * device, which may be at VM address 00:02.0 in legacy mode or another + * address in UPT mode. + * + * NB, there may be future use cases discovered where the VM should have + * direct interaction with the host OpRegion, in which case the write to + * the ASLS register would trigger MemoryRegion setup to enable that. + */ + fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion", + vdev->igd_opregion, info->size); + + trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); + + pci_set_long(vdev->pdev.config + IGD_ASLS, 0); + pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); + pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); + + return true; +} + +bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp) +{ + g_autofree struct vfio_region_info *opregion = NULL; + int ret; + + /* Hotplugging is not supported for opregion access */ + if (vdev->pdev.qdev.hotplugged) { + error_setg(errp, "IGD OpRegion is not supported on hotplugged device"); + return false; + } + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); + if (ret) { + error_setg_errno(errp, -ret, + "Device does not supports IGD OpRegion feature"); + return false; + } + + if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) { + return false; + } + + return true; +} + /* * The rather short list of registers that we copy from the host devices. * The LPC/ISA bridge values are definitely needed to support the vBIOS, the @@ -342,7 +419,6 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) { g_autofree struct vfio_region_info *rom = NULL; - g_autofree struct vfio_region_info *opregion = NULL; g_autofree struct vfio_region_info *host = NULL; g_autofree struct vfio_region_info *lpc = NULL; PCIDevice *lpc_bridge; @@ -418,15 +494,6 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) * Check whether we have all the vfio device specific regions to * support legacy mode (added in Linux v4.6). If not, bail. */ - ret = vfio_get_dev_region_info(&vdev->vbasedev, - VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, - VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); - if (ret) { - error_report("IGD device %s does not support OpRegion access," - "legacy mode disabled", vdev->vbasedev.name); - return; - } - ret = vfio_get_dev_region_info(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host); @@ -459,6 +526,13 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) return; } + /* Setup OpRegion access */ + if (!vfio_pci_igd_setup_opregion(vdev, &err)) { + error_append_hint(&err, "IGD legacy mode disabled\n"); + error_report_err(err); + return; + } + /* Create our LPC/ISA bridge */ ret = vfio_pci_igd_lpc_init(vdev, lpc); if (ret) { @@ -475,13 +549,6 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) return; } - /* Setup OpRegion access */ - if (!vfio_pci_igd_opregion_init(vdev, opregion, &err)) { - error_append_hint(&err, "IGD legacy mode disabled\n"); - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - return; - } - /* * Allow user to override dsm size using x-igd-gms option, in multiples of * 32MiB. This option should only be used when the desired size cannot be diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index c53591fe2b..37966e17f0 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1114,56 +1114,6 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name); } -#define IGD_ASLS 0xfc /* ASL Storage Register */ - -/* - * The OpRegion includes the Video BIOS Table, which seems important for - * telling the driver what sort of outputs it has. Without this, the device - * may work in the guest, but we may not get output. This also requires BIOS - * support to reserve and populate a section of guest memory sufficient for - * the table and to write the base address of that memory to the ASLS register - * of the IGD device. - */ -bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, - struct vfio_region_info *info, Error **errp) -{ - int ret; - - vdev->igd_opregion = g_malloc0(info->size); - ret = pread(vdev->vbasedev.fd, vdev->igd_opregion, - info->size, info->offset); - if (ret != info->size) { - error_setg(errp, "failed to read IGD OpRegion"); - g_free(vdev->igd_opregion); - vdev->igd_opregion = NULL; - return false; - } - - /* - * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to - * allocate 32bit reserved memory for, copy these contents into, and write - * the reserved memory base address to the device ASLS register at 0xFC. - * Alignment of this reserved region seems flexible, but using a 4k page - * alignment seems to work well. This interface assumes a single IGD - * device, which may be at VM address 00:02.0 in legacy mode or another - * address in UPT mode. - * - * NB, there may be future use cases discovered where the VM should have - * direct interaction with the host OpRegion, in which case the write to - * the ASLS register would trigger MemoryRegion setup to enable that. - */ - fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion", - vdev->igd_opregion, info->size); - - trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); - - pci_set_long(vdev->pdev.config + IGD_ASLS, 0); - pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); - pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); - - return true; -} - /* * Common quirk probe entry points. */ diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index fdbc15885d..419dc2c4c8 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3136,30 +3136,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bar_quirk_setup(vdev, i); } +#ifdef CONFIG_VFIO_IGD if (!vdev->igd_opregion && vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) { - g_autofree struct vfio_region_info *opregion = NULL; - - if (vdev->pdev.qdev.hotplugged) { - error_setg(errp, - "cannot support IGD OpRegion feature on hotplugged " - "device"); - goto out_unset_idev; - } - - ret = vfio_get_dev_region_info(vbasedev, - VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, - VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); - if (ret) { - error_setg_errno(errp, -ret, - "does not support requested IGD OpRegion feature"); - goto out_unset_idev; - } - - if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) { + if (!vfio_pci_igd_setup_opregion(vdev, errp)) { goto out_unset_idev; } } +#endif /* QEMU emulates all of MSI & MSIX */ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index d638c781f6..f660b0d80f 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -227,9 +227,7 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); -bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, - struct vfio_region_info *info, - Error **errp); +bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp); void vfio_display_reset(VFIOPCIDevice *vdev); bool vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); From eabaa7b468eac63cb9acf47717f030c679532e6c Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:24 +0800 Subject: [PATCH 04/21] vfio/igd: Move LPC bridge initialization to a separate function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new option will soon be introduced to decouple the LPC bridge/Host bridge ID quirk from legacy mode. To prepare for this, move the LPC bridge initialization into a separate function. Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-5-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 122 +++++++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 52 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 113ad56ad4..7feed7dfa9 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -351,6 +351,72 @@ static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev, return ret; } +static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) +{ + g_autofree struct vfio_region_info *host = NULL; + g_autofree struct vfio_region_info *lpc = NULL; + PCIDevice *lpc_bridge; + int ret; + + /* + * Copying IDs or creating new devices are not supported on hotplug + */ + if (vdev->pdev.qdev.hotplugged) { + error_setg(errp, "IGD LPC is not supported on hotplugged device"); + return false; + } + + /* + * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we + * can stuff host values into, so if there's already one there and it's not + * one we can hack on, this quirk is no-go. Sorry Q35. + */ + lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), + 0, PCI_DEVFN(0x1f, 0)); + if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge), + "vfio-pci-igd-lpc-bridge")) { + error_setg(errp, + "Cannot create LPC bridge due to existing device at 1f.0"); + return false; + } + + /* + * Check whether we have all the vfio device specific regions to + * support LPC quirk (added in Linux v4.6). + */ + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc); + if (ret) { + error_setg(errp, "IGD LPC bridge access is not supported by kernel"); + return false; + } + + ret = vfio_get_dev_region_info(&vdev->vbasedev, + VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, + VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host); + if (ret) { + error_setg(errp, "IGD host bridge access is not supported by kernel"); + return false; + } + + /* Create/modify LPC bridge */ + ret = vfio_pci_igd_lpc_init(vdev, lpc); + if (ret) { + error_setg(errp, "Failed to create/modify LPC bridge for IGD"); + return false; + } + + /* Stuff some host values into the VM PCI host bridge */ + ret = vfio_pci_igd_host_init(vdev, host); + if (ret) { + error_setg(errp, "Failed to modify host bridge for IGD"); + return false; + } + + return true; +} + #define IGD_GGC_MMIO_OFFSET 0x108040 #define IGD_BDSM_MMIO_OFFSET 0x1080C0 @@ -419,9 +485,6 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) { g_autofree struct vfio_region_info *rom = NULL; - g_autofree struct vfio_region_info *host = NULL; - g_autofree struct vfio_region_info *lpc = NULL; - PCIDevice *lpc_bridge; int ret, gen; uint64_t gms_size; uint64_t *bdsm_size; @@ -440,20 +503,6 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) return; } - /* - * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we - * can stuff host values into, so if there's already one there and it's not - * one we can hack on, legacy mode is no-go. Sorry Q35. - */ - lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev), - 0, PCI_DEVFN(0x1f, 0)); - if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge), - "vfio-pci-igd-lpc-bridge")) { - error_report("IGD device %s cannot support legacy mode due to existing " - "devices at address 1f.0", vdev->vbasedev.name); - return; - } - /* * IGD is not a standard, they like to change their specs often. We * only attempt to support back to SandBridge and we hope that newer @@ -490,28 +539,6 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) return; } - /* - * Check whether we have all the vfio device specific regions to - * support legacy mode (added in Linux v4.6). If not, bail. - */ - ret = vfio_get_dev_region_info(&vdev->vbasedev, - VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, - VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host); - if (ret) { - error_report("IGD device %s does not support host bridge access," - "legacy mode disabled", vdev->vbasedev.name); - return; - } - - ret = vfio_get_dev_region_info(&vdev->vbasedev, - VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, - VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc); - if (ret) { - error_report("IGD device %s does not support LPC bridge access," - "legacy mode disabled", vdev->vbasedev.name); - return; - } - gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4); /* @@ -533,19 +560,10 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) return; } - /* Create our LPC/ISA bridge */ - ret = vfio_pci_igd_lpc_init(vdev, lpc); - if (ret) { - error_report("IGD device %s failed to create LPC bridge, " - "legacy mode disabled", vdev->vbasedev.name); - return; - } - - /* Stuff some host values into the VM PCI host bridge */ - ret = vfio_pci_igd_host_init(vdev, host); - if (ret) { - error_report("IGD device %s failed to modify host bridge, " - "legacy mode disabled", vdev->vbasedev.name); + /* Setup LPC bridge / Host bridge PCI IDs */ + if (!vfio_pci_igd_setup_lpc_bridge(vdev, &err)) { + error_append_hint(&err, "IGD legacy mode disabled\n"); + error_report_err(err); return; } From b22ab580d284558e298228e40f11be057cd3576c Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:25 +0800 Subject: [PATCH 05/21] vfio/pci: Add placeholder for device-specific config space quirks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IGD devices require device-specific quirk to be applied to their PCI config space. Currently, it is put in the BAR4 quirk that does nothing to BAR4 itself. Add a placeholder for PCI config space quirks to hold that quirk later. Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-6-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci-quirks.c | 5 +++++ hw/vfio/pci.c | 4 ++++ hw/vfio/pci.h | 1 + 3 files changed, 10 insertions(+) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 37966e17f0..78aef7d60e 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1117,6 +1117,11 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) /* * Common quirk probe entry points. */ +bool vfio_config_quirk_setup(VFIOPCIDevice *vdev, Error **errp) +{ + return true; +} + void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) { vfio_vga_probe_ati_3c3_quirk(vdev); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 419dc2c4c8..ff1e720dba 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3128,6 +3128,10 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto out_unset_idev; } + if (!vfio_config_quirk_setup(vdev, errp)) { + goto out_unset_idev; + } + if (vdev->vga) { vfio_vga_quirk_setup(vdev); } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index f660b0d80f..d125e73865 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -204,6 +204,7 @@ uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size); void vfio_vga_write(void *opaque, hwaddr addr, uint64_t data, unsigned size); bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev); +bool vfio_config_quirk_setup(VFIOPCIDevice *vdev, Error **errp); void vfio_vga_quirk_setup(VFIOPCIDevice *vdev); void vfio_vga_quirk_exit(VFIOPCIDevice *vdev); void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev); From 9267f96ad6f26100e17a34f169a79d1ad9c1c7aa Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:26 +0800 Subject: [PATCH 06/21] vfio/igd: Refactor vfio_probe_igd_bar4_quirk into pci config quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The actual IO BAR4 write quirk in vfio_probe_igd_bar4_quirk was removed in previous change, leaving the function not matching its name, so move it into the newly introduced vfio_config_quirk_setup. There is no functional change in this commit. For now, to align with current legacy mode behavior, it returns and proceeds on error. Later it will fail on error after decoupling the quirks from legacy mode. Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-7-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 21 ++++++++++++--------- hw/vfio/pci-quirks.c | 6 +++++- hw/vfio/pci.h | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 7feed7dfa9..65a3dbb5af 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -482,7 +482,8 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, bdsm_quirk, next); } -void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) +bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, + Error **errp G_GNUC_UNUSED) { g_autofree struct vfio_region_info *rom = NULL; int ret, gen; @@ -497,10 +498,10 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) * PCI bus address. */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || - !vfio_is_vga(vdev) || nr != 4 || + !vfio_is_vga(vdev) || &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev), 0, PCI_DEVFN(0x2, 0))) { - return; + return true; } /* @@ -512,7 +513,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) if (gen == -1) { error_report("IGD device %s is unsupported in legacy mode, " "try SandyBridge or newer", vdev->vbasedev.name); - return; + return true; } /* @@ -525,7 +526,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) if ((ret || !rom->size) && !vdev->pdev.romfile) { error_report("IGD device %s has no ROM, legacy mode disabled", vdev->vbasedev.name); - return; + return true; } /* @@ -536,7 +537,7 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) error_report("IGD device %s hotplugged, ROM disabled, " "legacy mode disabled", vdev->vbasedev.name); vdev->rom_read_failed = true; - return; + return true; } gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4); @@ -550,21 +551,21 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); error_report("IGD device %s failed to enable VGA access, " "legacy mode disabled", vdev->vbasedev.name); - return; + return true; } /* Setup OpRegion access */ if (!vfio_pci_igd_setup_opregion(vdev, &err)) { error_append_hint(&err, "IGD legacy mode disabled\n"); error_report_err(err); - return; + return true; } /* Setup LPC bridge / Host bridge PCI IDs */ if (!vfio_pci_igd_setup_lpc_bridge(vdev, &err)) { error_append_hint(&err, "IGD legacy mode disabled\n"); error_report_err(err); - return; + return true; } /* @@ -624,4 +625,6 @@ void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr) } trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, (gms_size / MiB)); + + return true; } diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 78aef7d60e..f998761abc 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -1119,6 +1119,11 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) */ bool vfio_config_quirk_setup(VFIOPCIDevice *vdev, Error **errp) { +#ifdef CONFIG_VFIO_IGD + if (!vfio_probe_igd_config_quirk(vdev, errp)) { + return false; + } +#endif return true; } @@ -1170,7 +1175,6 @@ void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) vfio_probe_rtl8168_bar2_quirk(vdev, nr); #ifdef CONFIG_VFIO_IGD vfio_probe_igd_bar0_quirk(vdev, nr); - vfio_probe_igd_bar4_quirk(vdev, nr); #endif } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index d125e73865..fc7ead7727 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -216,7 +216,7 @@ bool vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp); void vfio_quirk_reset(VFIOPCIDevice *vdev); VFIOQuirk *vfio_quirk_alloc(int nr_mem); void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr); -void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr); +bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp); extern const PropertyInfo qdev_prop_nv_gpudirect_clique; From 2fedccf03c889562855bfbe0628ac577938c50a2 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:27 +0800 Subject: [PATCH 07/21] vfio/igd: Decouple common quirks from legacy mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far, IGD-specific quirks all require enabling legacy mode, which is toggled by assigning IGD to 00:02.0. However, some quirks, like the BDSM and GGC register quirks, should be applied to all supported IGD devices. A new config option, x-igd-legacy-mode=[on|off|auto], is introduced to control the legacy mode only quirks. The default value is "auto", which keeps current behavior that enables legacy mode implicitly and continues on error when all following conditions are met. * Machine type is i440fx * IGD device is at guest BDF 00:02.0 If any one of the conditions above is not met, the default behavior is equivalent to "off", QEMU will fail immediately if any error occurs. Users can also use "on" to force enabling legacy mode. It checks if all the conditions above are met and set up legacy mode. QEMU will also fail immediately on error in this case. Additionally, the hotplug check in legacy mode is removed as hotplugging IGD device is never supported, and it will be checked when enabling the OpRegion quirk. Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-8-tomitamoeko@gmail.com [ clg: - Changed warn_report() by info_report() in vfio_probe_igd_config_quirk() as suggested by Alex W. - Fixed spelling in vfio_probe_igd_config_quirk () ] Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 127 +++++++++++++++++++++++++++++--------------------- hw/vfio/pci.c | 2 + hw/vfio/pci.h | 1 + 3 files changed, 77 insertions(+), 53 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 65a3dbb5af..ee36875310 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -15,6 +15,7 @@ #include "qemu/error-report.h" #include "qapi/error.h" #include "qapi/qmp/qerror.h" +#include "hw/boards.h" #include "hw/hw.h" #include "hw/nvram/fw_cfg.h" #include "pci.h" @@ -432,9 +433,7 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) * bus address. */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || - !vfio_is_vga(vdev) || nr != 0 || - &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev), - 0, PCI_DEVFN(0x2, 0))) { + !vfio_is_vga(vdev) || nr != 0) { return; } @@ -482,14 +481,13 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, bdsm_quirk, next); } -bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, - Error **errp G_GNUC_UNUSED) +bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) { - g_autofree struct vfio_region_info *rom = NULL; int ret, gen; uint64_t gms_size; uint64_t *bdsm_size; uint32_t gmch; + bool legacy_mode_enabled = false; Error *err = NULL; /* @@ -498,9 +496,7 @@ bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, * PCI bus address. */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || - !vfio_is_vga(vdev) || - &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev), - 0, PCI_DEVFN(0x2, 0))) { + !vfio_is_vga(vdev)) { return true; } @@ -516,56 +512,67 @@ bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, return true; } - /* - * Most of what we're doing here is to enable the ROM to run, so if - * there's no ROM, there's no point in setting up this quirk. - * NB. We only seem to get BIOS ROMs, so a UEFI VM would need CSM support. - */ - ret = vfio_get_region_info(&vdev->vbasedev, - VFIO_PCI_ROM_REGION_INDEX, &rom); - if ((ret || !rom->size) && !vdev->pdev.romfile) { - error_report("IGD device %s has no ROM, legacy mode disabled", - vdev->vbasedev.name); - return true; - } - - /* - * Ignore the hotplug corner case, mark the ROM failed, we can't - * create the devices we need for legacy mode in the hotplug scenario. - */ - if (vdev->pdev.qdev.hotplugged) { - error_report("IGD device %s hotplugged, ROM disabled, " - "legacy mode disabled", vdev->vbasedev.name); - vdev->rom_read_failed = true; - return true; - } - gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4); /* - * If IGD VGA Disable is clear (expected) and VGA is not already enabled, - * try to enable it. Probably shouldn't be using legacy mode without VGA, - * but also no point in us enabling VGA if disabled in hardware. + * For backward compatibility, enable legacy mode when + * - Machine type is i440fx (pc_piix) + * - IGD device is at guest BDF 00:02.0 + * - Not manually disabled by x-igd-legacy-mode=off */ - if (!(gmch & 0x2) && !vdev->vga && !vfio_populate_vga(vdev, &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - error_report("IGD device %s failed to enable VGA access, " - "legacy mode disabled", vdev->vbasedev.name); - return true; - } + if ((vdev->igd_legacy_mode != ON_OFF_AUTO_OFF) && + !strcmp(MACHINE_GET_CLASS(qdev_get_machine())->family, "pc_piix") && + (&vdev->pdev == pci_find_device(pci_device_root_bus(&vdev->pdev), + 0, PCI_DEVFN(0x2, 0)))) { + /* + * IGD legacy mode requires: + * - VBIOS in ROM BAR or file + * - VGA IO/MMIO ranges are claimed by IGD + * - OpRegion + * - Same LPC bridge and Host bridge VID/DID/SVID/SSID as host + */ + g_autofree struct vfio_region_info *rom = NULL; - /* Setup OpRegion access */ - if (!vfio_pci_igd_setup_opregion(vdev, &err)) { - error_append_hint(&err, "IGD legacy mode disabled\n"); - error_report_err(err); - return true; - } + legacy_mode_enabled = true; + info_report("IGD legacy mode enabled, " + "use x-igd-legacy-mode=off to disable it if unwanted."); - /* Setup LPC bridge / Host bridge PCI IDs */ - if (!vfio_pci_igd_setup_lpc_bridge(vdev, &err)) { - error_append_hint(&err, "IGD legacy mode disabled\n"); - error_report_err(err); - return true; + /* + * Most of what we're doing here is to enable the ROM to run, so if + * there's no ROM, there's no point in setting up this quirk. + * NB. We only seem to get BIOS ROMs, so UEFI VM would need CSM support. + */ + ret = vfio_get_region_info(&vdev->vbasedev, + VFIO_PCI_ROM_REGION_INDEX, &rom); + if ((ret || !rom->size) && !vdev->pdev.romfile) { + error_setg(&err, "Device has no ROM"); + goto error; + } + + /* + * If IGD VGA Disable is clear (expected) and VGA is not already + * enabled, try to enable it. Probably shouldn't be using legacy mode + * without VGA, but also no point in us enabling VGA if disabled in + * hardware. + */ + if (!(gmch & 0x2) && !vdev->vga && !vfio_populate_vga(vdev, &err)) { + error_setg(&err, "Unable to enable VGA access"); + goto error; + } + + /* Setup OpRegion access */ + if (!vfio_pci_igd_setup_opregion(vdev, &err)) { + goto error; + } + + /* Setup LPC bridge / Host bridge PCI IDs */ + if (!vfio_pci_igd_setup_lpc_bridge(vdev, &err)) { + goto error; + } + } else if (vdev->igd_legacy_mode == ON_OFF_AUTO_ON) { + error_setg(&err, + "Machine is not i440fx or assigned BDF is not 00:02.0"); + goto error; } /* @@ -627,4 +634,18 @@ bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, (gms_size / MiB)); return true; + +error: + /* + * When legacy mode is implicity enabled, continue on error, + * to keep compatibility + */ + if (legacy_mode_enabled && (vdev->igd_legacy_mode == ON_OFF_AUTO_AUTO)) { + error_report_err(err); + error_report("IGD legacy mode disabled"); + return true; + } + + error_propagate(errp, err); + return false; } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index ff1e720dba..444a33d94b 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3369,6 +3369,8 @@ static const Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice, + igd_legacy_mode, ON_OFF_AUTO_AUTO), DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, vbasedev.enable_migration, ON_OFF_AUTO_AUTO), DEFINE_PROP("x-migration-multifd-transfer", VFIOPCIDevice, diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index fc7ead7727..3e66b19d8f 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -158,6 +158,7 @@ struct VFIOPCIDevice { uint32_t display_xres; uint32_t display_yres; int32_t bootindex; + OnOffAuto igd_legacy_mode; uint32_t igd_gms; OffAutoPCIBAR msix_relo; uint8_t nv_gpudirect_clique; From 434ac62ef2443f25956dafcfcbf29663fd0cd3e1 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:28 +0800 Subject: [PATCH 08/21] vfio/igd: Handle x-igd-opregion option in config quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both enable OpRegion option (x-igd-opregion) and legacy mode require setting up OpRegion copy for IGD devices. As the config quirk no longer depends on legacy mode, we can now handle x-igd-opregion option there instead of in vfio_realize. Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-9-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 14 +++++++++----- hw/vfio/pci.c | 9 --------- hw/vfio/pci.h | 2 -- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index ee36875310..12e07517b4 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -189,7 +189,7 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, return true; } -bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp) +static bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp) { g_autofree struct vfio_region_info *opregion = NULL; int ret; @@ -560,10 +560,8 @@ bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) goto error; } - /* Setup OpRegion access */ - if (!vfio_pci_igd_setup_opregion(vdev, &err)) { - goto error; - } + /* Enable OpRegion quirk */ + vdev->features |= VFIO_FEATURE_ENABLE_IGD_OPREGION; /* Setup LPC bridge / Host bridge PCI IDs */ if (!vfio_pci_igd_setup_lpc_bridge(vdev, &err)) { @@ -575,6 +573,12 @@ bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) goto error; } + /* Setup OpRegion access */ + if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) && + !vfio_pci_igd_setup_opregion(vdev, errp)) { + goto error; + } + /* * Allow user to override dsm size using x-igd-gms option, in multiples of * 32MiB. This option should only be used when the desired size cannot be diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 444a33d94b..e2897bdcd5 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3140,15 +3140,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) vfio_bar_quirk_setup(vdev, i); } -#ifdef CONFIG_VFIO_IGD - if (!vdev->igd_opregion && - vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) { - if (!vfio_pci_igd_setup_opregion(vdev, errp)) { - goto out_unset_idev; - } - } -#endif - /* QEMU emulates all of MSI & MSIX */ if (pdev->cap_present & QEMU_PCI_CAP_MSIX) { memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff, diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 3e66b19d8f..816bdbf844 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -229,8 +229,6 @@ int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp); -bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp); - void vfio_display_reset(VFIOPCIDevice *vdev); bool vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); void vfio_display_finalize(VFIOPCIDevice *vdev); From 674f61117d3652c969bd9d04201615bb69614fa2 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:29 +0800 Subject: [PATCH 09/21] vfio/igd: Introduce x-igd-lpc option for LPC bridge ID quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LPC bridge/Host bridge IDs quirk is also not dependent on legacy mode. Recent Windows driver no longer depends on these IDs, as well as Linux i915 driver, while UEFI GOP seems still needs them. Make it an option to allow users enabling and disabling it as needed. Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-10-tomitamoeko@gmail.com [ clg: - Fixed spelling in vfio_probe_igd_config_quirk() ] Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 14 ++++++++------ hw/vfio/pci.c | 2 ++ hw/vfio/pci.h | 3 +++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 12e07517b4..8a88dbab13 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -560,13 +560,9 @@ bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) goto error; } - /* Enable OpRegion quirk */ + /* Enable OpRegion and LPC bridge quirk */ vdev->features |= VFIO_FEATURE_ENABLE_IGD_OPREGION; - - /* Setup LPC bridge / Host bridge PCI IDs */ - if (!vfio_pci_igd_setup_lpc_bridge(vdev, &err)) { - goto error; - } + vdev->features |= VFIO_FEATURE_ENABLE_IGD_LPC; } else if (vdev->igd_legacy_mode == ON_OFF_AUTO_ON) { error_setg(&err, "Machine is not i440fx or assigned BDF is not 00:02.0"); @@ -579,6 +575,12 @@ bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) goto error; } + /* Setup LPC bridge / Host bridge PCI IDs */ + if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_LPC) && + !vfio_pci_igd_setup_lpc_bridge(vdev, errp)) { + goto error; + } + /* * Allow user to override dsm size using x-igd-gms option, in multiples of * 32MiB. This option should only be used when the desired size cannot be diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e2897bdcd5..3cb7806f2f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3360,6 +3360,8 @@ static const Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features, + VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false), DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice, igd_legacy_mode, ON_OFF_AUTO_AUTO), DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 816bdbf844..d94ecaba68 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -154,6 +154,9 @@ struct VFIOPCIDevice { #define VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT 2 #define VFIO_FEATURE_ENABLE_IGD_OPREGION \ (1 << VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT) +#define VFIO_FEATURE_ENABLE_IGD_LPC_BIT 3 +#define VFIO_FEATURE_ENABLE_IGD_LPC \ + (1 << VFIO_FEATURE_ENABLE_IGD_LPC_BIT) OnOffAuto display; uint32_t display_xres; uint32_t display_yres; From e46b7af508c27a4e830818ac05fa3e2e4c33b416 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Fri, 7 Mar 2025 02:01:30 +0800 Subject: [PATCH 10/21] vfio/igd: Fix broken KVMGT OpRegion support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KVMGT/GVT-g vGPU also exposes OpRegion. But unlike IGD passthrough, it only needs the OpRegion quirk. A previous change moved x-igd-opregion handling to config quirk breaks KVMGT functionality as it brings extra checks and applied other quirks. Here we check if the device is mdev (KVMGT) or not (passthrough), and then applies corresponding quirks. As before, users must manually specify x-igd-opregion=on to enable it on KVMGT devices. In the future, we may check the VID/DID and enable OpRegion automatically. Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Tested-by: Alex Williamson Reviewed-by: Corvin Köhne Link: https://lore.kernel.org/qemu-devel/20250306180131.32970-11-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 8a88dbab13..265fffc2aa 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -481,7 +481,7 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, bdsm_quirk, next); } -bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) +static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) { int ret, gen; uint64_t gms_size; @@ -655,3 +655,28 @@ error: error_propagate(errp, err); return false; } + +/* + * KVMGT/GVT-g vGPU exposes an emulated OpRegion. So far, users have to specify + * x-igd-opregion=on to enable the access. + * TODO: Check VID/DID and enable opregion access automatically + */ +static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) +{ + if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) && + !vfio_pci_igd_setup_opregion(vdev, errp)) { + return false; + } + + return true; +} + +bool vfio_probe_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) +{ + /* KVMGT/GVT-g vGPU is exposed as mdev */ + if (vdev->vbasedev.mdev) { + return vfio_pci_kvmgt_config_quirk(vdev, errp); + } + + return vfio_pci_igd_config_quirk(vdev, errp); +} From cbb2e105268d0c5b22ba6fc936af1ae6bae3f959 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 10 Mar 2025 13:53:10 +0100 Subject: [PATCH 11/21] vfio/migration: Use BE byte order for device state wire packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire data commonly use BE byte order (including in the existing migration protocol), use it also for for VFIO device state packets. This will allow VFIO multifd device state transfer between hosts with different endianness. Although currently there is no such use case, it's good to have it now for completeness. Reviewed-by: Avihai Horon Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/dcfc04cc1a50655650dbac8398e2742ada84ee39.1741611079.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/migration-multifd.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index 233724710b..378f6f3bf0 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -13,6 +13,7 @@ #include "hw/vfio/vfio-common.h" #include "migration/misc.h" #include "qapi/error.h" +#include "qemu/bswap.h" #include "qemu/error-report.h" #include "qemu/lockable.h" #include "qemu/main-loop.h" @@ -155,12 +156,16 @@ bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, return false; } + packet->version = be32_to_cpu(packet->version); if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) { error_setg(errp, "%s: packet has unknown version %" PRIu32, vbasedev->name, packet->version); return false; } + packet->idx = be32_to_cpu(packet->idx); + packet->flags = be32_to_cpu(packet->flags); + if (packet->idx == UINT32_MAX) { error_setg(errp, "%s: packet index is invalid", vbasedev->name); return false; @@ -558,9 +563,9 @@ vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, packet_len = sizeof(*packet) + bioc->usage; packet = g_malloc0(packet_len); - packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; - packet->idx = idx; - packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; + packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT); + packet->idx = cpu_to_be32(idx); + packet->flags = cpu_to_be32(VFIO_DEVICE_STATE_CONFIG_STATE); memcpy(&packet->data, bioc->data, bioc->usage); if (!multifd_queue_device_state(idstr, instance_id, @@ -610,7 +615,7 @@ vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, } packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); - packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; + packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT); for (idx = 0; ; idx++) { ssize_t data_size; @@ -631,7 +636,7 @@ vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, break; } - packet->idx = idx; + packet->idx = cpu_to_be32(idx); packet_size = sizeof(*packet) + data_size; if (!multifd_queue_device_state(d->idstr, d->instance_id, From c6cd30feadb6907cf69d35babec74274e690164f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 11 Mar 2025 09:57:36 +0100 Subject: [PATCH 12/21] system: Declare qemu_[min/max]rampagesize() in 'system/hostmem.h' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both qemu_minrampagesize() and qemu_maxrampagesize() are related to host memory backends, having the following call stack: qemu_minrampagesize() -> find_min_backend_pagesize() -> object_dynamic_cast(obj, TYPE_MEMORY_BACKEND) qemu_maxrampagesize() -> find_max_backend_pagesize() -> object_dynamic_cast(obj, TYPE_MEMORY_BACKEND) Having TYPE_MEMORY_BACKEND defined in "system/hostmem.h": include/system/hostmem.h:23:#define TYPE_MEMORY_BACKEND "memory-backend" Move their prototype declaration to "system/hostmem.h". Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Pierrick Bouvier Reviewed-by: Eric Auger Message-Id: <20250308230917.18907-7-philmd@linaro.org> Acked-by: David Hildenbrand Link: https://lore.kernel.org/qemu-devel/20250311085743.21724-2-philmd@linaro.org Signed-off-by: Cédric Le Goater --- hw/ppc/spapr_caps.c | 1 + hw/s390x/s390-virtio-ccw.c | 1 + hw/vfio/spapr.c | 1 + include/exec/ram_addr.h | 3 --- include/system/hostmem.h | 3 +++ 5 files changed, 6 insertions(+), 3 deletions(-) diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index 904bff87ce..9e53d0c1fd 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -34,6 +34,7 @@ #include "kvm_ppc.h" #include "migration/vmstate.h" #include "system/tcg.h" +#include "system/hostmem.h" #include "hw/ppc/spapr.h" diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index a9b3db19f6..75b32182eb 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -41,6 +41,7 @@ #include "hw/s390x/tod.h" #include "system/system.h" #include "system/cpus.h" +#include "system/hostmem.h" #include "target/s390x/kvm/pv.h" #include "migration/blocker.h" #include "qapi/visitor.h" diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index ad4c499eaf..237f96dd3f 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -15,6 +15,7 @@ #include #endif #include "system/kvm.h" +#include "system/hostmem.h" #include "exec/address-spaces.h" #include "hw/vfio/vfio-common.h" diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 3d8df4edf1..e4c28fbec9 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -102,9 +102,6 @@ static inline unsigned long int ramblock_recv_bitmap_offset(void *host_addr, bool ramblock_is_pmem(RAMBlock *rb); -long qemu_minrampagesize(void); -long qemu_maxrampagesize(void); - /** * qemu_ram_alloc_from_file, * qemu_ram_alloc_from_fd: Allocate a ram block from the specified backing diff --git a/include/system/hostmem.h b/include/system/hostmem.h index 5c21ca55c0..62642e602c 100644 --- a/include/system/hostmem.h +++ b/include/system/hostmem.h @@ -93,4 +93,7 @@ bool host_memory_backend_is_mapped(HostMemoryBackend *backend); size_t host_memory_backend_pagesize(HostMemoryBackend *memdev); char *host_memory_backend_get_name(HostMemoryBackend *backend); +long qemu_minrampagesize(void); +long qemu_maxrampagesize(void); + #endif From a19ce97ed16c2892c6340fb46e693aa934ab7ebd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 11 Mar 2025 09:57:37 +0100 Subject: [PATCH 13/21] hw/vfio/spapr: Do not include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit is already included by "system/kvm.h" in the next line. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Pierrick Bouvier Reviewed-by: Richard Henderson Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Message-Id: <20250307180337.14811-3-philmd@linaro.org> Link: https://lore.kernel.org/qemu-devel/20250311085743.21724-3-philmd@linaro.org Signed-off-by: Cédric Le Goater --- hw/vfio/spapr.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 237f96dd3f..1a5d1611f2 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -11,9 +11,6 @@ #include "qemu/osdep.h" #include #include -#ifdef CONFIG_KVM -#include -#endif #include "system/kvm.h" #include "system/hostmem.h" #include "exec/address-spaces.h" From 514c29678715c6b27e09fef8e3594dc1dba6820b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 11 Mar 2025 09:57:38 +0100 Subject: [PATCH 14/21] hw/vfio/common: Include missing 'system/tcg.h' header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Always include necessary headers explicitly, to avoid when refactoring unrelated ones: hw/vfio/common.c:1176:45: error: implicit declaration of function ‘tcg_enabled’; 1176 | tcg_enabled() ? DIRTY_CLIENTS_ALL : | ^~~~~~~~~~~ Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Pierrick Bouvier Reviewed-by: Richard Henderson Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Message-Id: <20250307180337.14811-2-philmd@linaro.org> Link: https://lore.kernel.org/qemu-devel/20250311085743.21724-4-philmd@linaro.org Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 7a4010ef4e..b1596b6bf6 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -42,6 +42,7 @@ #include "migration/misc.h" #include "migration/blocker.h" #include "migration/qemu-file.h" +#include "system/tcg.h" #include "system/tpm.h" VFIODeviceList vfio_device_list = From 80ce7bb5cf0157fa835ecaabeaed90fc282830c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 11 Mar 2025 09:57:39 +0100 Subject: [PATCH 15/21] hw/vfio/common: Get target page size using runtime helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prefer runtime helpers to get target page size. Reviewed-by: Richard Henderson Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20250305153929.43687-3-philmd@linaro.org> Link: https://lore.kernel.org/qemu-devel/20250311085743.21724-5-philmd@linaro.org Signed-off-by: Cédric Le Goater --- hw/vfio/common.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b1596b6bf6..1a0d9290f8 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -30,6 +30,7 @@ #include "exec/address-spaces.h" #include "exec/memory.h" #include "exec/ram_addr.h" +#include "exec/target_page.h" #include "hw/hw.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -393,13 +394,14 @@ static void vfio_register_ram_discard_listener(VFIOContainerBase *bcontainer, MemoryRegionSection *section) { RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + int target_page_size = qemu_target_page_size(); VFIORamDiscardListener *vrdl; /* Ignore some corner cases not relevant in practice. */ - g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE)); + g_assert(QEMU_IS_ALIGNED(section->offset_within_region, target_page_size)); g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, - TARGET_PAGE_SIZE)); - g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); + target_page_size)); + g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), target_page_size)); vrdl = g_new0(VFIORamDiscardListener, 1); vrdl->bcontainer = bcontainer; From 5731baee6c3ca1fbd23ce9dfe1a825aafd473e7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 11 Mar 2025 09:57:40 +0100 Subject: [PATCH 16/21] hw/vfio: Compile some common objects once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some files don't rely on any target-specific knowledge and can be compiled once: - helpers.c - container-base.c - migration.c (removing unnecessary "exec/ram_addr.h") - migration-multifd.c - cpr.c Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Pierrick Bouvier Reviewed-by: Richard Henderson Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Message-Id: <20250308230917.18907-4-philmd@linaro.org> Link: https://lore.kernel.org/qemu-devel/20250311085743.21724-6-philmd@linaro.org Signed-off-by: Cédric Le Goater --- hw/vfio/meson.build | 13 ++++++++----- hw/vfio/migration.c | 1 - 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 260d65febd..8e376cfcbf 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -1,12 +1,7 @@ vfio_ss = ss.source_set() vfio_ss.add(files( - 'helpers.c', 'common.c', - 'container-base.c', 'container.c', - 'migration.c', - 'migration-multifd.c', - 'cpr.c', )) vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( @@ -25,3 +20,11 @@ vfio_ss.add(when: 'CONFIG_VFIO_AP', if_true: files('ap.c')) vfio_ss.add(when: 'CONFIG_VFIO_IGD', if_true: files('igd.c')) specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss) + +system_ss.add(when: 'CONFIG_VFIO', if_true: files( + 'helpers.c', + 'container-base.c', + 'migration.c', + 'migration-multifd.c', + 'cpr.c', +)) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 416643ddd6..fbff46cfc3 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -27,7 +27,6 @@ #include "qapi/error.h" #include "qapi/qapi-events-vfio.h" #include "exec/ramlist.h" -#include "exec/ram_addr.h" #include "pci.h" #include "trace.h" #include "hw/hw.h" From 761d63ccaf6768dc528f2e0e1a422caa12fbe216 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 11 Mar 2025 09:57:41 +0100 Subject: [PATCH 17/21] hw/vfio: Compile more objects once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These files depend on the VFIO symbol in their Kconfig definition. They don't rely on target specific definitions, move them to system_ss[] to build them once. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Pierrick Bouvier Reviewed-by: Richard Henderson Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Message-Id: <20250308230917.18907-5-philmd@linaro.org> Link: https://lore.kernel.org/qemu-devel/20250311085743.21724-7-philmd@linaro.org Signed-off-by: Cédric Le Goater --- hw/vfio/meson.build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 8e376cfcbf..784eae4b55 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -14,13 +14,13 @@ vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( )) vfio_ss.add(when: 'CONFIG_VFIO_CCW', if_true: files('ccw.c')) vfio_ss.add(when: 'CONFIG_VFIO_PLATFORM', if_true: files('platform.c')) -vfio_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) -vfio_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c')) vfio_ss.add(when: 'CONFIG_VFIO_AP', if_true: files('ap.c')) vfio_ss.add(when: 'CONFIG_VFIO_IGD', if_true: files('igd.c')) specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss) +system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) +system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c')) system_ss.add(when: 'CONFIG_VFIO', if_true: files( 'helpers.c', 'container-base.c', From d5c0be1a94fb3733ee04b8d5cde6641ad54ee2ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 11 Mar 2025 09:57:42 +0100 Subject: [PATCH 18/21] hw/vfio: Compile iommufd.c once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removing unused "exec/ram_addr.h" header allow to compile iommufd.c once for all targets. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Pierrick Bouvier Reviewed-by: Richard Henderson Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Message-Id: <20250308230917.18907-6-philmd@linaro.org> Link: https://lore.kernel.org/qemu-devel/20250311085743.21724-8-philmd@linaro.org Signed-off-by: Cédric Le Goater --- hw/vfio/iommufd.c | 1 - hw/vfio/meson.build | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index df61edffc0..42c8412bbf 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -25,7 +25,6 @@ #include "qemu/cutils.h" #include "qemu/chardev_open.h" #include "pci.h" -#include "exec/ram_addr.h" static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 784eae4b55..5c9ec7e897 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -4,9 +4,6 @@ vfio_ss.add(files( 'container.c', )) vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) -vfio_ss.add(when: 'CONFIG_IOMMUFD', if_true: files( - 'iommufd.c', -)) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', 'pci-quirks.c', @@ -28,3 +25,6 @@ system_ss.add(when: 'CONFIG_VFIO', if_true: files( 'migration-multifd.c', 'cpr.c', )) +system_ss.add(when: ['CONFIG_VFIO', 'CONFIG_IOMMUFD'], if_true: files( + 'iommufd.c', +)) From 28ea52dd639b75e212ad32cc704a1dd212286914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 11 Mar 2025 09:57:43 +0100 Subject: [PATCH 19/21] hw/vfio: Compile display.c once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit display.c doesn't rely on target specific definitions, move it to system_ss[] to build it once. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Pierrick Bouvier Reviewed-by: Richard Henderson Reviewed-by: Cédric Le Goater Reviewed-by: Eric Auger Message-Id: <20250308230917.18907-8-philmd@linaro.org> Link: https://lore.kernel.org/qemu-devel/20250311085743.21724-9-philmd@linaro.org Signed-off-by: Cédric Le Goater --- hw/vfio/meson.build | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 5c9ec7e897..a8939c8386 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -5,7 +5,6 @@ vfio_ss.add(files( )) vfio_ss.add(when: 'CONFIG_PSERIES', if_true: files('spapr.c')) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( - 'display.c', 'pci-quirks.c', 'pci.c', )) @@ -28,3 +27,6 @@ system_ss.add(when: 'CONFIG_VFIO', if_true: files( system_ss.add(when: ['CONFIG_VFIO', 'CONFIG_IOMMUFD'], if_true: files( 'iommufd.c', )) +system_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( + 'display.c', +)) From 63316f973ad9f1465b1cb820c5be954260663c8a Mon Sep 17 00:00:00 2001 From: Vasilis Liaskovitis Date: Tue, 11 Mar 2025 00:58:33 +0100 Subject: [PATCH 20/21] vfio/pci-quirks: Exclude non-ioport BAR from ATI quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ATI BAR4 quirk is targeting an ioport BAR. Older devices may have a BAR4 which is not an ioport, causing a segfault here. Test the BAR type to skip these devices. Similar to "8f419c5b: vfio/pci-quirks: Exclude non-ioport BAR from NVIDIA quirk" Untested, as I don't have the card to test. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2856 Signed-off-by: Vasilis Liaskovitis Reviewed-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250310235833.41026-1-vliaskovitis@suse.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci-quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index f998761abc..3f002252ac 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -403,7 +403,7 @@ static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr) /* This windows doesn't seem to be used except by legacy VGA code */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || - !vdev->vga || nr != 4) { + !vdev->vga || nr != 4 || !vdev->bars[4].ioport) { return; } From 4d9607481560e6c8e1508a0aafe94f86a0503c8c Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 11 Mar 2025 17:48:07 +0000 Subject: [PATCH 21/21] vfio/pci: Drop debug commentary from x-device-dirty-page-tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intent behind the x-device-dirty-page-tracking option is twofold: 1) development/testing in the presence of VFs with VF dirty page tracking 2) deliberately choosing platform dirty tracker over the VF one. Item 2) scenario is useful when VF dirty tracker is not as fast as IOMMU, or there's some limitations around it (e.g. number of them is limited; aggregated address space under tracking is limited), efficiency/scalability (e.g. 1 pagetable in IOMMU dirty tracker to scan vs N VFs) or just troubleshooting. Given item 2 it is not restricted to debugging, hence drop the debug parenthesis from the option description. Signed-off-by: Joao Martins Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250311174807.79825-1-joao.m.martins@oracle.com [ clg: Fixed subject spelling ] Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 3cb7806f2f..7f1532fbed 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3532,7 +3532,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) object_class_property_set_description(klass, /* 9.1 */ "x-device-dirty-page-tracking", "Disable device dirty page tracking and use " - "container-based dirty page tracking (DEBUG)"); + "container-based dirty page tracking"); object_class_property_set_description(klass, /* 9.1 */ "migration-events", "Emit VFIO migration QAPI event when a VFIO device "