Andrea Fioraldi 2023-09-25 11:50:05 +02:00
commit 09e79261f6
1000 changed files with 38123 additions and 15916 deletions


@ -68,7 +68,7 @@ variables:
#############################################################
# Stage 2: fine tune execution of jobs in specific scenarios
# where the catch all logic is inapprorpaite
# where the catch all logic is inappropriate
#############################################################
# Optional jobs should not be run unless manually triggered


@ -15,8 +15,10 @@
stage: build
image: registry.gitlab.com/libvirt/libvirt-ci/cirrus-run:master
needs: []
# 20 mins larger than "timeout_in" in cirrus/build.yml
# as there's often a 5-10 minute delay before Cirrus CI
# actually starts the task
timeout: 80m
allow_failure: true
script:
- source .gitlab-ci.d/cirrus/$NAME.vars
- sed -e "s|[@]CI_REPOSITORY_URL@|$CI_REPOSITORY_URL|g"


@ -16,6 +16,8 @@ env:
TEST_TARGETS: "@TEST_TARGETS@"
build_task:
# A little shorter than GitLab timeout in ../cirrus.yml
timeout_in: 60m
install_script:
- @UPDATE_COMMAND@
- @INSTALL_COMMAND@ @PKGS@


@ -298,11 +298,9 @@ F: hw/openrisc/
F: tests/tcg/openrisc/
PowerPC TCG CPUs
M: Nicholas Piggin <npiggin@gmail.com>
M: Daniel Henrique Barboza <danielhb413@gmail.com>
R: Cédric Le Goater <clg@kaod.org>
R: David Gibson <david@gibson.dropbear.id.au>
R: Greg Kurz <groug@kaod.org>
R: Nicholas Piggin <npiggin@gmail.com>
L: qemu-ppc@nongnu.org
S: Odd Fixes
F: target/ppc/
@ -438,10 +436,9 @@ F: target/mips/kvm*
F: target/mips/sysemu/
PPC KVM CPUs
M: Daniel Henrique Barboza <danielhb413@gmail.com>
M: Nicholas Piggin <npiggin@gmail.com>
R: Daniel Henrique Barboza <danielhb413@gmail.com>
R: Cédric Le Goater <clg@kaod.org>
R: David Gibson <david@gibson.dropbear.id.au>
R: Greg Kurz <groug@kaod.org>
S: Odd Fixes
F: target/ppc/kvm.c
@ -543,14 +540,6 @@ F: include/sysemu/xen.h
F: include/sysemu/xen-mapcache.h
F: stubs/xen-hw-stub.c
Guest CPU Cores (HAXM)
---------------------
X86 HAXM CPUs
S: Orphan
F: accel/stubs/hax-stub.c
F: include/sysemu/hax.h
F: target/i386/hax/
Guest CPU Cores (NVMM)
----------------------
NetBSD Virtual Machine Monitor (NVMM) CPU support
@ -1034,6 +1023,16 @@ S: Maintained
F: hw/ssi/xlnx-versal-ospi.c
F: include/hw/ssi/xlnx-versal-ospi.h
Xilinx Versal CFI
M: Francisco Iglesias <francisco.iglesias@amd.com>
S: Maintained
F: hw/misc/xlnx-cfi-if.c
F: include/hw/misc/xlnx-cfi-if.h
F: hw/misc/xlnx-versal-cfu.c
F: include/hw/misc/xlnx-versal-cfu.h
F: hw/misc/xlnx-versal-cframe-reg.c
F: include/hw/misc/xlnx-versal-cframe-reg.h
STM32F100
M: Alexandre Iooss <erdnaxe@crans.org>
L: qemu-arm@nongnu.org
@ -1428,10 +1427,10 @@ F: include/hw/rtc/m48t59.h
F: tests/avocado/ppc_prep_40p.py
sPAPR (pseries)
M: Daniel Henrique Barboza <danielhb413@gmail.com>
M: Nicholas Piggin <npiggin@gmail.com>
R: Daniel Henrique Barboza <danielhb413@gmail.com>
R: Cédric Le Goater <clg@kaod.org>
R: David Gibson <david@gibson.dropbear.id.au>
R: Greg Kurz <groug@kaod.org>
R: Harsh Prateek Bora <harshpb@linux.ibm.com>
L: qemu-ppc@nongnu.org
S: Odd Fixes
@ -1450,8 +1449,8 @@ F: tests/avocado/ppc_pseries.py
PowerNV (Non-Virtualized)
M: Cédric Le Goater <clg@kaod.org>
M: Nicholas Piggin <npiggin@gmail.com>
R: Frédéric Barrat <fbarrat@linux.ibm.com>
R: Nicholas Piggin <npiggin@gmail.com>
L: qemu-ppc@nongnu.org
S: Odd Fixes
F: docs/system/ppc/powernv.rst
@ -1495,12 +1494,9 @@ F: include/hw/pci-host/mv64361.h
Virtual Open Firmware (VOF)
M: Alexey Kardashevskiy <aik@ozlabs.ru>
R: Cédric Le Goater <clg@kaod.org>
R: Daniel Henrique Barboza <danielhb413@gmail.com>
R: David Gibson <david@gibson.dropbear.id.au>
R: Greg Kurz <groug@kaod.org>
L: qemu-ppc@nongnu.org
S: Maintained
S: Odd Fixes
F: hw/ppc/spapr_vof*
F: hw/ppc/vof*
F: include/hw/ppc/vof*
@ -2256,6 +2252,13 @@ F: tests/qtest/nvme-test.c
F: docs/system/devices/nvme.rst
T: git git://git.infradead.org/qemu-nvme.git nvme-next
ufs
M: Jeuk Kim <jeuk20.kim@samsung.com>
S: Supported
F: hw/ufs/*
F: include/block/ufs.h
F: tests/qtest/ufs-test.c
megasas
M: Hannes Reinecke <hare@suse.com>
L: qemu-block@nongnu.org
@ -2948,12 +2951,17 @@ W: http://info.iet.unipi.it/~luigi/netmap/
S: Maintained
F: net/netmap.c
AF_XDP network backend
R: Ilya Maximets <i.maximets@ovn.org>
F: net/af-xdp.c
Host Memory Backends
M: David Hildenbrand <david@redhat.com>
M: Igor Mammedov <imammedo@redhat.com>
S: Maintained
F: backends/hostmem*.c
F: include/sysemu/hostmem.h
F: docs/system/vm-templating.rst
T: git https://gitlab.com/ehabkost/qemu.git machine-next
Cryptodev Backends
@ -3701,6 +3709,7 @@ S: Supported
F: block/parallels.c
F: block/parallels-ext.c
F: docs/interop/parallels.txt
T: git https://src.openvz.org/scm/~den/qemu.git parallels
qed
M: Stefan Hajnoczi <stefanha@redhat.com>


@ -164,14 +164,6 @@ ifneq ($(filter $(ninja-targets), $(ninja-cmd-goals)),)
endif
endif
ifeq ($(CONFIG_PLUGIN),y)
.PHONY: plugins
plugins:
$(call quiet-command,\
$(MAKE) $(SUBDIR_MAKEFLAGS) -C contrib/plugins V="$(V)", \
"BUILD", "example plugins")
endif # $(CONFIG_PLUGIN)
else # config-host.mak does not exist
ifneq ($(filter-out $(UNCHECKED_GOALS),$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fail))
$(error Please call configure before running make)
@ -184,15 +176,20 @@ include $(SRC_PATH)/tests/Makefile.include
all: recurse-all
ROMS_RULES=$(foreach t, all clean distclean, $(addsuffix /$(t), $(ROMS)))
.PHONY: $(ROMS_RULES)
$(ROMS_RULES):
SUBDIR_RULES=$(foreach t, all clean distclean, $(addsuffix /$(t), $(SUBDIRS)))
.PHONY: $(SUBDIR_RULES)
$(SUBDIR_RULES):
$(call quiet-command,$(MAKE) $(SUBDIR_MAKEFLAGS) -C $(dir $@) V="$(V)" TARGET_DIR="$(dir $@)" $(notdir $@),)
ifneq ($(filter contrib/plugins, $(SUBDIRS)),)
.PHONY: plugins
plugins: contrib/plugins/all
endif
.PHONY: recurse-all recurse-clean
recurse-all: $(addsuffix /all, $(ROMS))
recurse-clean: $(addsuffix /clean, $(ROMS))
recurse-distclean: $(addsuffix /distclean, $(ROMS))
recurse-all: $(addsuffix /all, $(SUBDIRS))
recurse-clean: $(addsuffix /clean, $(SUBDIRS))
recurse-distclean: $(addsuffix /distclean, $(SUBDIRS))
######################################################################
@ -296,7 +293,7 @@ help:
$(call print-help,cscope,Generate cscope index)
$(call print-help,sparse,Run sparse on the QEMU source)
@echo ''
ifeq ($(CONFIG_PLUGIN),y)
ifneq ($(filter contrib/plugins, $(SUBDIRS)),)
@echo 'Plugin targets:'
$(call print-help,plugins,Build the example TCG plugins)
@echo ''
@ -316,7 +313,7 @@ endif
@echo 'Documentation targets:'
$(call print-help,html man,Build documentation in specified format)
@echo ''
ifdef CONFIG_WIN32
ifneq ($(filter msi, $(ninja-targets)),)
@echo 'Windows targets:'
$(call print-help,installer,Build NSIS-based installer for QEMU)
$(call print-help,msi,Build MSI-based installer for qemu-ga)


@ -4,9 +4,6 @@ config WHPX
config NVMM
bool
config HAX
bool
config HVF
bool


@ -3763,6 +3763,7 @@ static void kvm_accel_instance_init(Object *obj)
/* KVM dirty ring is by default off */
s->kvm_dirty_ring_size = 0;
s->kvm_dirty_ring_with_bitmap = false;
s->kvm_eager_split_size = 0;
s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
s->notify_window = 0;
s->xen_version = 0;


@ -1,24 +0,0 @@
/*
* QEMU HAXM support
*
* Copyright (c) 2015, Intel Corporation
*
* Copyright 2016 Google, Inc.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "sysemu/hax.h"
bool hax_allowed;
int hax_sync_vcpus(void)
{
return 0;
}


@ -1,5 +1,4 @@
sysemu_stubs_ss = ss.source_set()
sysemu_stubs_ss.add(when: 'CONFIG_HAX', if_false: files('hax-stub.c'))
sysemu_stubs_ss.add(when: 'CONFIG_XEN', if_false: files('xen-stub.c'))
sysemu_stubs_ss.add(when: 'CONFIG_KVM', if_false: files('kvm-stub.c'))
sysemu_stubs_ss.add(when: 'CONFIG_TCG', if_false: files('tcg-stub.c'))


@ -1200,6 +1200,7 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
write_flags = read_flags;
if (is_ram) {
iotlb = memory_region_get_ram_addr(section->mr) + xlat;
assert(!(iotlb & ~TARGET_PAGE_MASK));
/*
* Computing is_clean is expensive; avoid all that unless
* the page is actually writable.
@ -1262,16 +1263,18 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
/* refill the tlb */
/*
* At this point iotlb contains a physical section number in the lower
* TARGET_PAGE_BITS, and either
* + the ram_addr_t of the page base of the target RAM (RAM)
* + the offset within section->mr of the page base (I/O, ROMD)
* When memory region is ram, iotlb contains a TARGET_PAGE_BITS
* aligned ram_addr_t of the page base of the target RAM.
* Otherwise, iotlb contains
* - a physical section number in the lower TARGET_PAGE_BITS
* - the offset within section->mr of the page base (I/O, ROMD) with the
* TARGET_PAGE_BITS masked off.
* We subtract addr_page (which is page aligned and thus won't
* disturb the low bits) to give an offset which can be added to the
* (non-page-aligned) vaddr of the eventual memory access to get
* the MemoryRegion offset for the access. Note that the vaddr we
* subtract here is that of the page base, and not the same as the
* vaddr we add back in io_readx()/io_writex()/get_page_addr_code().
* vaddr we add back in io_prepare()/get_page_addr_code().
*/
desc->fulltlb[index] = *full;
full = &desc->fulltlb[index];
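The subtraction this comment describes is easy to misread, so here is a
standalone sketch (not part of the patch; 4 KiB pages and made-up
addresses assumed) of why adding the raw, non-page-aligned access vaddr
back to the stored value recovers the MemoryRegion offset:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t page_off_mask = 0xfffULL;  /* assume 4 KiB pages */
        uint64_t iotlb     = 0x40000000ULL;       /* page base within section->mr */
        uint64_t addr_page = 0x7f0000001000ULL;   /* page-aligned guest vaddr */
        uint64_t stored    = iotlb - addr_page;   /* what the TLB entry keeps */
        uint64_t access    = 0x7f00000012a4ULL;   /* an access inside that page */

        /* stored + access == iotlb + the in-page offset of the access */
        assert(stored + access == iotlb + (access & page_off_mask));
        return 0;
    }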
@ -1354,117 +1357,42 @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
mmu_idx, retaddr);
}
static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
vaddr addr, unsigned size,
MMUAccessType access_type,
int mmu_idx, MemTxAttrs attrs,
MemTxResult response,
uintptr_t retaddr)
static MemoryRegionSection *
io_prepare(hwaddr *out_offset, CPUArchState *env, hwaddr xlat,
MemTxAttrs attrs, vaddr addr, uintptr_t retaddr)
{
CPUState *cpu = env_cpu(env);
MemoryRegionSection *section;
hwaddr mr_offset;
section = iotlb_to_section(cpu, xlat, attrs);
mr_offset = (xlat & TARGET_PAGE_MASK) + addr;
cpu->mem_io_pc = retaddr;
if (!cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
*out_offset = mr_offset;
return section;
}
static void io_failed(CPUArchState *env, CPUTLBEntryFull *full, vaddr addr,
unsigned size, MMUAccessType access_type, int mmu_idx,
MemTxResult response, uintptr_t retaddr)
{
CPUState *cpu = env_cpu(env);
if (!cpu->ignore_memory_transaction_failures) {
CPUClass *cc = CPU_GET_CLASS(cpu);
if (!cpu->ignore_memory_transaction_failures &&
cc->tcg_ops->do_transaction_failed) {
if (cc->tcg_ops->do_transaction_failed) {
hwaddr physaddr = full->phys_addr | (addr & ~TARGET_PAGE_MASK);
cc->tcg_ops->do_transaction_failed(cpu, physaddr, addr, size,
access_type, mmu_idx, attrs,
response, retaddr);
access_type, mmu_idx,
full->attrs, response, retaddr);
}
}
/*
* Save a potentially trashed CPUTLBEntryFull for later lookup by plugin.
* This is read by tlb_plugin_lookup if the fulltlb entry doesn't match
* because of the side effect of io_writex changing memory layout.
*/
static void save_iotlb_data(CPUState *cs, MemoryRegionSection *section,
hwaddr mr_offset)
{
#ifdef CONFIG_PLUGIN
SavedIOTLB *saved = &cs->saved_iotlb;
saved->section = section;
saved->mr_offset = mr_offset;
#endif
}
static uint64_t io_readx(CPUArchState *env, CPUTLBEntryFull *full,
int mmu_idx, vaddr addr, uintptr_t retaddr,
MMUAccessType access_type, MemOp op)
{
CPUState *cpu = env_cpu(env);
hwaddr mr_offset;
MemoryRegionSection *section;
MemoryRegion *mr;
uint64_t val;
MemTxResult r;
section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
mr = section->mr;
mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
cpu->mem_io_pc = retaddr;
if (!cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
/*
* The memory_region_dispatch may trigger a flush/resize
* so for plugins we save the iotlb_data just in case.
*/
save_iotlb_data(cpu, section, mr_offset);
{
QEMU_IOTHREAD_LOCK_GUARD();
r = memory_region_dispatch_read(mr, mr_offset, &val, op, full->attrs);
}
if (r != MEMTX_OK) {
hwaddr physaddr = mr_offset +
section->offset_within_address_space -
section->offset_within_region;
cpu_transaction_failed(cpu, physaddr, addr, memop_size(op), access_type,
mmu_idx, full->attrs, r, retaddr);
}
return val;
}
static void io_writex(CPUArchState *env, CPUTLBEntryFull *full,
int mmu_idx, uint64_t val, vaddr addr,
uintptr_t retaddr, MemOp op)
{
CPUState *cpu = env_cpu(env);
hwaddr mr_offset;
MemoryRegionSection *section;
MemoryRegion *mr;
MemTxResult r;
section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
mr = section->mr;
mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
if (!cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
cpu->mem_io_pc = retaddr;
/*
* The memory_region_dispatch may trigger a flush/resize
* so for plugins we save the iotlb_data just in case.
*/
save_iotlb_data(cpu, section, mr_offset);
{
QEMU_IOTHREAD_LOCK_GUARD();
r = memory_region_dispatch_write(mr, mr_offset, val, op, full->attrs);
}
if (r != MEMTX_OK) {
hwaddr physaddr = mr_offset +
section->offset_within_address_space -
section->offset_within_region;
cpu_transaction_failed(cpu, physaddr, addr, memop_size(op),
MMU_DATA_STORE, mmu_idx, full->attrs, r,
retaddr);
}
}
/* Return true if ADDR is present in the victim tlb, and has been copied
@ -1733,45 +1661,41 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, vaddr addr,
* in the softmmu lookup code (or helper). We don't handle re-fills or
* checking the victim table. This is purely informational.
*
* This almost never fails as the memory access being instrumented
* should have just filled the TLB. The one corner case is io_writex
* which can cause TLB flushes and potential resizing of the TLBs
* losing the information we need. In those cases we need to recover
* data from a copy of the CPUTLBEntryFull. As long as this always occurs
* from the same thread (which a mem callback will be) this is safe.
* The one corner case is i/o write, which can cause changes to the
* address space. Those changes, and the corresponding tlb flush,
* should be delayed until the next TB, so even then this ought not fail.
* But check, Just in Case.
*/
bool tlb_plugin_lookup(CPUState *cpu, vaddr addr, int mmu_idx,
bool is_store, struct qemu_plugin_hwaddr *data)
{
CPUArchState *env = cpu->env_ptr;
CPUTLBEntry *tlbe = tlb_entry(env, mmu_idx, addr);
uintptr_t index = tlb_index(env, mmu_idx, addr);
uint64_t tlb_addr = is_store ? tlb_addr_write(tlbe) : tlbe->addr_read;
MMUAccessType access_type = is_store ? MMU_DATA_STORE : MMU_DATA_LOAD;
uint64_t tlb_addr = tlb_read_idx(tlbe, access_type);
CPUTLBEntryFull *full;
if (unlikely(!tlb_hit(tlb_addr, addr))) {
return false;
}
full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
data->phys_addr = full->phys_addr | (addr & ~TARGET_PAGE_MASK);
if (likely(tlb_hit(tlb_addr, addr))) {
/* We must have an iotlb entry for MMIO */
if (tlb_addr & TLB_MMIO) {
CPUTLBEntryFull *full;
full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
MemoryRegionSection *section =
iotlb_to_section(cpu, full->xlat_section & ~TARGET_PAGE_MASK,
full->attrs);
data->is_io = true;
data->v.io.section =
iotlb_to_section(cpu, full->xlat_section, full->attrs);
data->v.io.offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
data->mr = section->mr;
} else {
data->is_io = false;
data->v.ram.hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
data->mr = NULL;
}
return true;
} else {
SavedIOTLB *saved = &cpu->saved_iotlb;
data->is_io = true;
data->v.io.section = saved->section;
data->v.io.offset = saved->mr_offset;
return true;
}
}
#endif
/*
@ -2115,45 +2039,88 @@ static void *atomic_mmu_lookup(CPUArchState *env, vaddr addr, MemOpIdx oi,
* Load @size bytes from @addr, which is memory-mapped i/o.
* The bytes are concatenated in big-endian order with @ret_be.
*/
static uint64_t int_ld_mmio_beN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t ret_be, vaddr addr, int size,
int mmu_idx, MMUAccessType type, uintptr_t ra,
MemoryRegion *mr, hwaddr mr_offset)
{
do {
MemOp this_mop;
unsigned this_size;
uint64_t val;
MemTxResult r;
/* Read aligned pieces up to 8 bytes. */
this_mop = ctz32(size | (int)addr | 8);
this_size = 1 << this_mop;
this_mop |= MO_BE;
r = memory_region_dispatch_read(mr, mr_offset, &val,
this_mop, full->attrs);
if (unlikely(r != MEMTX_OK)) {
io_failed(env, full, addr, this_size, type, mmu_idx, r, ra);
}
if (this_size == 8) {
return val;
}
ret_be = (ret_be << (this_size * 8)) | val;
addr += this_size;
mr_offset += this_size;
size -= this_size;
} while (size);
return ret_be;
}
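The ctz32(size | (int)addr | 8) trick above always selects the largest
naturally aligned power-of-two piece, capped at eight bytes, so every
MMIO access it issues is aligned. A standalone sketch (not part of the
patch; __builtin_ctz stands in for QEMU's ctz32):

    #include <stdio.h>

    int main(void)
    {
        unsigned addr = 0x1002, size = 6;

        while (size) {
            /* largest aligned power-of-two piece, capped at 8 bytes */
            unsigned piece = 1u << __builtin_ctz(size | addr | 8);
            printf("%u bytes at 0x%x\n", piece, addr);
            addr += piece;
            size -= piece;
        }
        return 0;  /* prints "2 bytes at 0x1002" then "4 bytes at 0x1004" */
    }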
static uint64_t do_ld_mmio_beN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t ret_be, vaddr addr, int size,
int mmu_idx, MMUAccessType type, uintptr_t ra)
{
uint64_t t;
MemoryRegionSection *section;
MemoryRegion *mr;
hwaddr mr_offset;
MemTxAttrs attrs;
uint64_t ret;
tcg_debug_assert(size > 0 && size <= 8);
do {
/* Read aligned pieces up to 8 bytes. */
switch ((size | (int)addr) & 7) {
case 1:
case 3:
case 5:
case 7:
t = io_readx(env, full, mmu_idx, addr, ra, type, MO_UB);
ret_be = (ret_be << 8) | t;
size -= 1;
addr += 1;
break;
case 2:
case 6:
t = io_readx(env, full, mmu_idx, addr, ra, type, MO_BEUW);
ret_be = (ret_be << 16) | t;
size -= 2;
addr += 2;
break;
case 4:
t = io_readx(env, full, mmu_idx, addr, ra, type, MO_BEUL);
ret_be = (ret_be << 32) | t;
size -= 4;
addr += 4;
break;
case 0:
return io_readx(env, full, mmu_idx, addr, ra, type, MO_BEUQ);
default:
qemu_build_not_reached();
attrs = full->attrs;
section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
mr = section->mr;
qemu_mutex_lock_iothread();
ret = int_ld_mmio_beN(env, full, ret_be, addr, size, mmu_idx,
type, ra, mr, mr_offset);
qemu_mutex_unlock_iothread();
return ret;
}
} while (size);
return ret_be;
static Int128 do_ld16_mmio_beN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t ret_be, vaddr addr, int size,
int mmu_idx, uintptr_t ra)
{
MemoryRegionSection *section;
MemoryRegion *mr;
hwaddr mr_offset;
MemTxAttrs attrs;
uint64_t a, b;
tcg_debug_assert(size > 8 && size <= 16);
attrs = full->attrs;
section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
mr = section->mr;
qemu_mutex_lock_iothread();
a = int_ld_mmio_beN(env, full, ret_be, addr, size - 8, mmu_idx,
MMU_DATA_LOAD, ra, mr, mr_offset);
b = int_ld_mmio_beN(env, full, ret_be, addr + size - 8, 8, mmu_idx,
MMU_DATA_LOAD, ra, mr, mr_offset + size - 8);
qemu_mutex_unlock_iothread();
return int128_make128(b, a);
}
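Because the value is big-endian, the half read from the lower address is
the more significant one, which is why the first int_ld_mmio_beN() result
becomes the high argument of int128_make128(). A standalone sketch for
size == 16 (be64_load is a local stand-in, not a QEMU API):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t be64_load(const uint8_t *p)
    {
        uint64_t v = 0;
        for (int i = 0; i < 8; i++) {
            v = (v << 8) | p[i];
        }
        return v;
    }

    int main(void)
    {
        uint8_t buf[16];  /* 16 bytes in guest address order */
        for (int i = 0; i < 16; i++) {
            buf[i] = (uint8_t)(i * 0x11);
        }

        uint64_t a = be64_load(buf);      /* addr .. addr+7: most significant */
        uint64_t b = be64_load(buf + 8);  /* addr+8 .. addr+15: least significant */

        assert(a == 0x0011223344556677ULL);
        assert(b == 0x8899aabbccddeeffULL);
        /* int128_make128(b, a): low 64 bits = b, high 64 bits = a */
        return 0;
    }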
/**
@ -2298,7 +2265,6 @@ static uint64_t do_ld_beN(CPUArchState *env, MMULookupPageData *p,
unsigned tmp, half_size;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
return do_ld_mmio_beN(env, p->full, ret_be, p->addr, p->size,
mmu_idx, type, ra);
}
@ -2349,12 +2315,7 @@ static Int128 do_ld16_beN(CPUArchState *env, MMULookupPageData *p,
MemOp atom;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
a = do_ld_mmio_beN(env, p->full, a, p->addr, size - 8,
mmu_idx, MMU_DATA_LOAD, ra);
b = do_ld_mmio_beN(env, p->full, 0, p->addr + 8, 8,
mmu_idx, MMU_DATA_LOAD, ra);
return int128_make128(b, a);
return do_ld16_mmio_beN(env, p->full, a, p->addr, size, mmu_idx, ra);
}
/*
@ -2399,7 +2360,7 @@ static uint8_t do_ld_1(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
MMUAccessType type, uintptr_t ra)
{
if (unlikely(p->flags & TLB_MMIO)) {
return io_readx(env, p->full, mmu_idx, p->addr, ra, type, MO_UB);
return do_ld_mmio_beN(env, p->full, 0, p->addr, 1, mmu_idx, type, ra);
} else {
return *(uint8_t *)p->haddr;
}
@ -2411,7 +2372,6 @@ static uint16_t do_ld_2(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
uint16_t ret;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
ret = do_ld_mmio_beN(env, p->full, 0, p->addr, 2, mmu_idx, type, ra);
if ((memop & MO_BSWAP) == MO_LE) {
ret = bswap16(ret);
@ -2432,7 +2392,6 @@ static uint32_t do_ld_4(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
uint32_t ret;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
ret = do_ld_mmio_beN(env, p->full, 0, p->addr, 4, mmu_idx, type, ra);
if ((memop & MO_BSWAP) == MO_LE) {
ret = bswap32(ret);
@ -2453,7 +2412,6 @@ static uint64_t do_ld_8(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
uint64_t ret;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
ret = do_ld_mmio_beN(env, p->full, 0, p->addr, 8, mmu_idx, type, ra);
if ((memop & MO_BSWAP) == MO_LE) {
ret = bswap64(ret);
@ -2612,12 +2570,8 @@ static Int128 do_ld16_mmu(CPUArchState *env, vaddr addr,
crosspage = mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD, &l);
if (likely(!crosspage)) {
if (unlikely(l.page[0].flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
a = do_ld_mmio_beN(env, l.page[0].full, 0, addr, 8,
l.mmu_idx, MMU_DATA_LOAD, ra);
b = do_ld_mmio_beN(env, l.page[0].full, 0, addr + 8, 8,
l.mmu_idx, MMU_DATA_LOAD, ra);
ret = int128_make128(b, a);
ret = do_ld16_mmio_beN(env, l.page[0].full, 0, addr, 16,
l.mmu_idx, ra);
if ((l.memop & MO_BSWAP) == MO_LE) {
ret = bswap128(ret);
}
@ -2759,46 +2713,88 @@ Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
* The bytes to store are extracted in little-endian order from @val_le;
* return the bytes of @val_le beyond @p->size that have not been stored.
*/
static uint64_t int_st_mmio_leN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t val_le, vaddr addr, int size,
int mmu_idx, uintptr_t ra,
MemoryRegion *mr, hwaddr mr_offset)
{
do {
MemOp this_mop;
unsigned this_size;
MemTxResult r;
/* Store aligned pieces up to 8 bytes. */
this_mop = ctz32(size | (int)addr | 8);
this_size = 1 << this_mop;
this_mop |= MO_LE;
r = memory_region_dispatch_write(mr, mr_offset, val_le,
this_mop, full->attrs);
if (unlikely(r != MEMTX_OK)) {
io_failed(env, full, addr, this_size, MMU_DATA_STORE,
mmu_idx, r, ra);
}
if (this_size == 8) {
return 0;
}
val_le >>= this_size * 8;
addr += this_size;
mr_offset += this_size;
size -= this_size;
} while (size);
return val_le;
}
static uint64_t do_st_mmio_leN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t val_le, vaddr addr, int size,
int mmu_idx, uintptr_t ra)
{
MemoryRegionSection *section;
hwaddr mr_offset;
MemoryRegion *mr;
MemTxAttrs attrs;
uint64_t ret;
tcg_debug_assert(size > 0 && size <= 8);
do {
/* Store aligned pieces up to 8 bytes. */
switch ((size | (int)addr) & 7) {
case 1:
case 3:
case 5:
case 7:
io_writex(env, full, mmu_idx, val_le, addr, ra, MO_UB);
val_le >>= 8;
size -= 1;
addr += 1;
break;
case 2:
case 6:
io_writex(env, full, mmu_idx, val_le, addr, ra, MO_LEUW);
val_le >>= 16;
size -= 2;
addr += 2;
break;
case 4:
io_writex(env, full, mmu_idx, val_le, addr, ra, MO_LEUL);
val_le >>= 32;
size -= 4;
addr += 4;
break;
case 0:
io_writex(env, full, mmu_idx, val_le, addr, ra, MO_LEUQ);
return 0;
default:
qemu_build_not_reached();
}
} while (size);
attrs = full->attrs;
section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
mr = section->mr;
return val_le;
qemu_mutex_lock_iothread();
ret = int_st_mmio_leN(env, full, val_le, addr, size, mmu_idx,
ra, mr, mr_offset);
qemu_mutex_unlock_iothread();
return ret;
}
static uint64_t do_st16_mmio_leN(CPUArchState *env, CPUTLBEntryFull *full,
Int128 val_le, vaddr addr, int size,
int mmu_idx, uintptr_t ra)
{
MemoryRegionSection *section;
MemoryRegion *mr;
hwaddr mr_offset;
MemTxAttrs attrs;
uint64_t ret;
tcg_debug_assert(size > 8 && size <= 16);
attrs = full->attrs;
section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
mr = section->mr;
qemu_mutex_lock_iothread();
int_st_mmio_leN(env, full, int128_getlo(val_le), addr, 8,
mmu_idx, ra, mr, mr_offset);
ret = int_st_mmio_leN(env, full, int128_gethi(val_le), addr + 8,
size - 8, mmu_idx, ra, mr, mr_offset + 8);
qemu_mutex_unlock_iothread();
return ret;
}
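This is the little-endian mirror image: the low half of the Int128 goes
to the lower address, and whatever part of the high half does not fit in
@size is returned unstored, matching the int128_gethi(val_le) >>
((size - 8) * 8) expression used for TLB_DISCARD_WRITE later in this
file. A standalone sketch (not part of the patch) for size == 12:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* stand-ins for int128_getlo()/int128_gethi() of a 128-bit value */
        uint64_t lo = 0x7766554433221100ULL;
        uint64_t hi = 0xffeeddccbbaa9988ULL;
        int size = 12;                 /* store only the low 12 bytes */

        /* lo is stored at addr, the low size-8 bytes of hi at addr + 8 */
        uint64_t leftover = hi >> ((size - 8) * 8);

        assert(leftover == 0xffeeddccULL);  /* the four bytes never stored */
        (void)lo;
        return 0;
    }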
/*
@ -2812,7 +2808,6 @@ static uint64_t do_st_leN(CPUArchState *env, MMULookupPageData *p,
unsigned tmp, half_size;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
return do_st_mmio_leN(env, p->full, val_le, p->addr,
p->size, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
@ -2867,11 +2862,8 @@ static uint64_t do_st16_leN(CPUArchState *env, MMULookupPageData *p,
MemOp atom;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, p->full, int128_getlo(val_le),
p->addr, 8, mmu_idx, ra);
return do_st_mmio_leN(env, p->full, int128_gethi(val_le),
p->addr + 8, size - 8, mmu_idx, ra);
return do_st16_mmio_leN(env, p->full, val_le, p->addr,
size, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
return int128_gethi(val_le) >> ((size - 8) * 8);
}
@ -2915,7 +2907,7 @@ static void do_st_1(CPUArchState *env, MMULookupPageData *p, uint8_t val,
int mmu_idx, uintptr_t ra)
{
if (unlikely(p->flags & TLB_MMIO)) {
io_writex(env, p->full, mmu_idx, val, p->addr, ra, MO_UB);
do_st_mmio_leN(env, p->full, val, p->addr, 1, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
/* nothing */
} else {
@ -2930,7 +2922,6 @@ static void do_st_2(CPUArchState *env, MMULookupPageData *p, uint16_t val,
if ((memop & MO_BSWAP) != MO_LE) {
val = bswap16(val);
}
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, p->full, val, p->addr, 2, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
/* nothing */
@ -2950,7 +2941,6 @@ static void do_st_4(CPUArchState *env, MMULookupPageData *p, uint32_t val,
if ((memop & MO_BSWAP) != MO_LE) {
val = bswap32(val);
}
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, p->full, val, p->addr, 4, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
/* nothing */
@ -2970,7 +2960,6 @@ static void do_st_8(CPUArchState *env, MMULookupPageData *p, uint64_t val,
if ((memop & MO_BSWAP) != MO_LE) {
val = bswap64(val);
}
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, p->full, val, p->addr, 8, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
/* nothing */
@ -3098,11 +3087,7 @@ static void do_st16_mmu(CPUArchState *env, vaddr addr, Int128 val,
if ((l.memop & MO_BSWAP) != MO_LE) {
val = bswap128(val);
}
a = int128_getlo(val);
b = int128_gethi(val);
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, l.page[0].full, a, addr, 8, l.mmu_idx, ra);
do_st_mmio_leN(env, l.page[0].full, b, addr + 8, 8, l.mmu_idx, ra);
do_st16_mmio_leN(env, l.page[0].full, val, addr, 16, l.mmu_idx, ra);
} else if (unlikely(l.page[0].flags & TLB_DISCARD_WRITE)) {
/* nothing */
} else {


@ -11,7 +11,9 @@ tcg_ss.add(files(
))
tcg_ss.add(when: 'CONFIG_USER_ONLY', if_true: files('user-exec.c'))
tcg_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_false: files('user-exec-stub.c'))
tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c')])
if get_option('plugins')
tcg_ss.add(files('plugin-gen.c'))
endif
tcg_ss.add(when: libdw, if_true: files('debuginfo.c'))
tcg_ss.add(when: 'CONFIG_LINUX', if_true: files('perf.c'))
specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)


@ -1,5 +1,5 @@
/*
* Translation Block Maintaince
* Translation Block Maintenance
*
* Copyright (c) 2003 Fabrice Bellard
*


@ -100,14 +100,9 @@ static void *mttcg_cpu_thread_fn(void *arg)
break;
case EXCP_HALTED:
/*
* during start-up the vCPU is reset and the thread is
* kicked several times. If we don't ensure we go back
* to sleep in the halted state we won't cleanly
* start-up when the vCPU is enabled.
*
* cpu->halted should ensure we sleep in wait_io_event
* Usually cpu->halted is set, but may have already been
* reset by another thread by the time we arrive here.
*/
g_assert(cpu->halted);
break;
case EXCP_ATOMIC:
qemu_mutex_unlock_iothread();


@ -1042,6 +1042,32 @@ DO_CMP2(64)
#undef DO_CMP1
#undef DO_CMP2
#define DO_CMP1(NAME, TYPE, OP) \
void HELPER(NAME)(void *d, void *a, uint64_t b64, uint32_t desc) \
{ \
intptr_t oprsz = simd_oprsz(desc); \
TYPE inv = simd_data(desc), b = b64; \
for (intptr_t i = 0; i < oprsz; i += sizeof(TYPE)) { \
*(TYPE *)(d + i) = -((*(TYPE *)(a + i) OP b) ^ inv); \
} \
clear_high(d, oprsz, desc); \
}
#define DO_CMP2(SZ) \
DO_CMP1(gvec_eqs##SZ, uint##SZ##_t, ==) \
DO_CMP1(gvec_lts##SZ, int##SZ##_t, <) \
DO_CMP1(gvec_les##SZ, int##SZ##_t, <=) \
DO_CMP1(gvec_ltus##SZ, uint##SZ##_t, <) \
DO_CMP1(gvec_leus##SZ, uint##SZ##_t, <=)
DO_CMP2(8)
DO_CMP2(16)
DO_CMP2(32)
DO_CMP2(64)
#undef DO_CMP1
#undef DO_CMP2
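Only ==, < and <= helpers (signed and unsigned) are generated; inv,
taken from simd_data(desc), is 0 or 1, and -((lane OP b) ^ inv) produces
an all-ones or all-zeroes lane while letting inv flip the predicate, so
!=, >= and > need no helpers of their own. A standalone sketch (not part
of the patch) of one expanded instantiation, with desc decoding and
clear_high elided:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* roughly what DO_CMP1(gvec_lts32, int32_t, <) expands to */
    static void gvec_lts32_sketch(int32_t *d, const int32_t *a, int32_t b,
                                  size_t lanes, uint32_t inv)
    {
        for (size_t i = 0; i < lanes; i++) {
            d[i] = -(int32_t)((a[i] < b) ^ inv);  /* all-ones if true, else 0 */
        }
    }

    int main(void)
    {
        int32_t a[4] = { -2, -1, 0, 1 }, d[4];

        gvec_lts32_sketch(d, a, 0, 4, 0);  /* lane < 0 */
        assert(d[0] == -1 && d[1] == -1 && d[2] == 0 && d[3] == 0);

        gvec_lts32_sketch(d, a, 0, 4, 1);  /* inverted: lane >= 0 */
        assert(d[0] == 0 && d[1] == 0 && d[2] == -1 && d[3] == -1);
        return 0;
    }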
void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
{
intptr_t oprsz = simd_oprsz(desc);


@ -297,6 +297,31 @@ DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_eqs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_eqs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_eqs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_eqs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_lts8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_lts16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_lts32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_lts64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_les8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_les16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_les32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_les64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_ltus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_ltus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_ltus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_ltus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_leus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_leus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_leus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_leus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
//// --- Begin LibAFL code ---


@ -38,7 +38,7 @@ typedef struct st_sample st_sample;
typedef void (t_sample) (struct st_sample *dst, const void *src, int samples);
typedef void (f_sample) (void *dst, const struct st_sample *src, int samples);
/* indices: [stereo][signed][swap endiannes][8, 16 or 32-bits] */
/* indices: [stereo][signed][swap endianness][8, 16 or 32-bits] */
extern t_sample *mixeng_conv[2][2][2][3];
extern f_sample *mixeng_clip[2][2][2][3];


@ -252,10 +252,11 @@ static void cryptodev_backend_throttle_timer_cb(void *opaque)
continue;
}
throttle_account(&backend->ts, true, ret);
throttle_account(&backend->ts, THROTTLE_WRITE, ret);
cryptodev_backend_operation(backend, op_info);
if (throttle_enabled(&backend->tc) &&
throttle_schedule_timer(&backend->ts, &backend->tt, true)) {
throttle_schedule_timer(&backend->ts, &backend->tt,
THROTTLE_WRITE)) {
break;
}
}
@ -271,7 +272,7 @@ int cryptodev_backend_crypto_operation(
goto do_account;
}
if (throttle_schedule_timer(&backend->ts, &backend->tt, true) ||
if (throttle_schedule_timer(&backend->ts, &backend->tt, THROTTLE_WRITE) ||
!QTAILQ_EMPTY(&backend->opinfos)) {
QTAILQ_INSERT_TAIL(&backend->opinfos, op_info, next);
return 0;
@ -283,7 +284,7 @@ do_account:
return ret;
}
throttle_account(&backend->ts, true, ret);
throttle_account(&backend->ts, THROTTLE_WRITE, ret);
return cryptodev_backend_operation(backend, op_info);
}
@ -341,8 +342,7 @@ static void cryptodev_backend_set_throttle(CryptoDevBackend *backend, int field,
if (!enabled) {
throttle_init(&backend->ts);
throttle_timers_init(&backend->tt, qemu_get_aio_context(),
QEMU_CLOCK_REALTIME,
cryptodev_backend_throttle_timer_cb, /* FIXME */
QEMU_CLOCK_REALTIME, NULL,
cryptodev_backend_throttle_timer_cb, backend);
}


@ -18,6 +18,8 @@
#include "sysemu/hostmem.h"
#include "qom/object_interfaces.h"
#include "qom/object.h"
#include "qapi/visitor.h"
#include "qapi/qapi-visit-common.h"
OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendFile, MEMORY_BACKEND_FILE)
@ -31,6 +33,7 @@ struct HostMemoryBackendFile {
bool discard_data;
bool is_pmem;
bool readonly;
OnOffAuto rom;
};
static void
@ -53,15 +56,39 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
return;
}
switch (fb->rom) {
case ON_OFF_AUTO_AUTO:
/* Traditionally, opening the file readonly always resulted in ROM. */
fb->rom = fb->readonly ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
break;
case ON_OFF_AUTO_ON:
if (!fb->readonly) {
error_setg(errp, "property 'rom' = 'on' is not supported with"
" 'readonly' = 'off'");
return;
}
break;
case ON_OFF_AUTO_OFF:
if (fb->readonly && backend->share) {
error_setg(errp, "property 'rom' = 'off' is incompatible with"
" 'readonly' = 'on' and 'share' = 'on'");
return;
}
break;
default:
assert(false);
}
name = host_memory_backend_get_name(backend);
ram_flags = backend->share ? RAM_SHARED : 0;
ram_flags |= fb->readonly ? RAM_READONLY_FD : 0;
ram_flags |= fb->rom == ON_OFF_AUTO_ON ? RAM_READONLY : 0;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
ram_flags |= fb->is_pmem ? RAM_PMEM : 0;
ram_flags |= RAM_NAMED_FILE;
memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), name,
backend->size, fb->align, ram_flags,
fb->mem_path, fb->offset, fb->readonly,
errp);
fb->mem_path, fb->offset, errp);
g_free(name);
#endif
}
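The switch above separates two cases that a bare readonly=on used to
conflate: an actual ROM whose contents the guest can never write
(rom=on, the traditional behaviour), and private writable RAM whose
backing file merely stays read-only on disk (rom=off with share=off),
the copy-on-write setup used for VM templating. A plausible invocation
of the latter, assuming the property spelling introduced here:

    -object memory-backend-file,id=pc.ram,mem-path=template.ram,size=4G,share=off,readonly=on,rom=off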
@ -201,6 +228,32 @@ static void file_memory_backend_set_readonly(Object *obj, bool value,
fb->readonly = value;
}
static void file_memory_backend_get_rom(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
{
HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(obj);
OnOffAuto rom = fb->rom;
visit_type_OnOffAuto(v, name, &rom, errp);
}
static void file_memory_backend_set_rom(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
{
HostMemoryBackend *backend = MEMORY_BACKEND(obj);
HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(obj);
if (host_memory_backend_mr_inited(backend)) {
error_setg(errp, "cannot change property '%s' of %s.", name,
object_get_typename(obj));
return;
}
visit_type_OnOffAuto(v, name, &fb->rom, errp);
}
static void file_backend_unparent(Object *obj)
{
HostMemoryBackend *backend = MEMORY_BACKEND(obj);
@ -243,6 +296,10 @@ file_backend_class_init(ObjectClass *oc, void *data)
object_class_property_add_bool(oc, "readonly",
file_memory_backend_get_readonly,
file_memory_backend_set_readonly);
object_class_property_add(oc, "rom", "OnOffAuto",
file_memory_backend_get_rom, file_memory_backend_set_rom, NULL, NULL);
object_class_property_set_description(oc, "rom",
"Whether to create Read Only Memory (ROM)");
}
static void file_backend_instance_finalize(Object *o)


@ -238,7 +238,7 @@ struct ptm_lockstorage {
} req; /* request */
struct {
ptm_res tpm_result;
} resp; /* reponse */
} resp; /* response */
} u;
};


@ -112,12 +112,8 @@ static int tpm_util_request(int fd,
void *response,
size_t responselen)
{
fd_set readfds;
GPollFD fds[1] = { {.fd = fd, .events = G_IO_IN } };
int n;
struct timeval tv = {
.tv_sec = 1,
.tv_usec = 0,
};
n = write(fd, request, requestlen);
if (n < 0) {
@ -127,11 +123,8 @@ static int tpm_util_request(int fd,
return -EFAULT;
}
FD_ZERO(&readfds);
FD_SET(fd, &readfds);
/* wait for a second */
n = select(fd + 1, &readfds, NULL, NULL, &tv);
n = RETRY_ON_EINTR(g_poll(fds, 1, 1000));
if (n != 1) {
return -errno;
}

block.c

@ -91,9 +91,11 @@ static BlockDriverState *bdrv_open_inherit(const char *filename,
static bool bdrv_recurse_has_child(BlockDriverState *bs,
BlockDriverState *child);
static void bdrv_replace_child_noperm(BdrvChild *child,
BlockDriverState *new_bs);
static void bdrv_remove_child(BdrvChild *child, Transaction *tran);
static void GRAPH_WRLOCK
bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs);
static void GRAPH_WRLOCK
bdrv_remove_child(BdrvChild *child, Transaction *tran);
static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
BlockReopenQueue *queue,
@ -415,7 +417,7 @@ BlockDriverState *bdrv_new(void)
for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
QLIST_INIT(&bs->op_blockers[i]);
}
qemu_co_mutex_init(&bs->reqs_lock);
qemu_mutex_init(&bs->reqs_lock);
qemu_mutex_init(&bs->dirty_bitmap_mutex);
bs->refcnt = 1;
bs->aio_context = qemu_get_aio_context();
@ -661,8 +663,10 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
blk = blk_co_new_open(filename, NULL, options,
BDRV_O_RDWR | BDRV_O_RESIZE, errp);
if (!blk) {
error_prepend(errp, "Protocol driver '%s' does not support image "
"creation, and opening the image failed: ",
error_prepend(errp, "Protocol driver '%s' does not support creating "
"new images, so an existing image must be selected as "
"the target; however, opening the given target as an "
"existing image failed: ",
drv->format_name);
return -EINVAL;
}
@ -1697,7 +1701,9 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
open_failed:
bs->drv = NULL;
if (bs->file != NULL) {
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, bs->file);
bdrv_graph_wrunlock();
assert(!bs->file);
}
g_free(bs->opaque);
@ -2113,7 +2119,6 @@ static int bdrv_fill_options(QDict **options, const char *filename,
typedef struct BlockReopenQueueEntry {
bool prepared;
bool perms_checked;
BDRVReopenState state;
QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
@ -2199,7 +2204,8 @@ static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
return false;
}
static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
static bool GRAPH_RDLOCK
bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
{
BdrvChild *a, *b;
GLOBAL_STATE_CODE();
@ -2224,7 +2230,8 @@ static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
return false;
}
static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
static void GRAPH_RDLOCK
bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
BdrvChild *c, BdrvChildRole role,
BlockReopenQueue *reopen_queue,
uint64_t parent_perm, uint64_t parent_shared,
@ -2252,8 +2259,8 @@ static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
* simplest way to satisfy this criteria: use only result of
* bdrv_topological_dfs() or NULL as @list parameter.
*/
static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
BlockDriverState *bs)
static GSList * GRAPH_RDLOCK
bdrv_topological_dfs(GSList *list, GHashTable *found, BlockDriverState *bs)
{
BdrvChild *child;
g_autoptr(GHashTable) local_found = NULL;
@ -2316,7 +2323,7 @@ static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
tran_add(tran, &bdrv_child_set_pem_drv, s);
}
static void bdrv_drv_set_perm_commit(void *opaque)
static void GRAPH_RDLOCK bdrv_drv_set_perm_commit(void *opaque)
{
BlockDriverState *bs = opaque;
uint64_t cumulative_perms, cumulative_shared_perms;
@ -2329,7 +2336,7 @@ static void bdrv_drv_set_perm_commit(void *opaque)
}
}
static void bdrv_drv_set_perm_abort(void *opaque)
static void GRAPH_RDLOCK bdrv_drv_set_perm_abort(void *opaque)
{
BlockDriverState *bs = opaque;
GLOBAL_STATE_CODE();
@ -2344,9 +2351,13 @@ TransactionActionDrv bdrv_drv_set_perm_drv = {
.commit = bdrv_drv_set_perm_commit,
};
static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
uint64_t shared_perm, Transaction *tran,
Error **errp)
/*
* After calling this function, the transaction @tran may only be completed
* while holding a reader lock for the graph.
*/
static int GRAPH_RDLOCK
bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared_perm,
Transaction *tran, Error **errp)
{
GLOBAL_STATE_CODE();
if (!bs->drv) {
@ -2372,20 +2383,22 @@ typedef struct BdrvReplaceChildState {
BlockDriverState *old_bs;
} BdrvReplaceChildState;
static void bdrv_replace_child_commit(void *opaque)
static void GRAPH_WRLOCK bdrv_replace_child_commit(void *opaque)
{
BdrvReplaceChildState *s = opaque;
GLOBAL_STATE_CODE();
bdrv_unref(s->old_bs);
bdrv_schedule_unref(s->old_bs);
}
static void bdrv_replace_child_abort(void *opaque)
static void GRAPH_WRLOCK bdrv_replace_child_abort(void *opaque)
{
BdrvReplaceChildState *s = opaque;
BlockDriverState *new_bs = s->child->bs;
GLOBAL_STATE_CODE();
assert_bdrv_graph_writable();
/* old_bs reference is transparently moved from @s to @s->child */
if (!s->child->bs) {
/*
@ -2402,6 +2415,7 @@ static void bdrv_replace_child_abort(void *opaque)
}
assert(s->child->quiesced_parent);
bdrv_replace_child_noperm(s->child, s->old_bs);
bdrv_unref(new_bs);
}
@ -2419,9 +2433,13 @@ static TransactionActionDrv bdrv_replace_child_drv = {
* Both @child->bs and @new_bs (if non-NULL) must be drained. @new_bs must be
* kept drained until the transaction is completed.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
*
* The function doesn't update permissions, caller is responsible for this.
*/
static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
static void GRAPH_WRLOCK
bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
Transaction *tran)
{
BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
@ -2438,6 +2456,7 @@ static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
if (new_bs) {
bdrv_ref(new_bs);
}
bdrv_replace_child_noperm(child, new_bs);
/* old_bs reference is transparently moved from @child to @s */
}
@ -2445,8 +2464,12 @@ static void bdrv_replace_child_tran(BdrvChild *child, BlockDriverState *new_bs,
/*
* Refresh permissions in @bs subtree. The function is intended to be called
* after some graph modification that was done without permission update.
*
* After calling this function, the transaction @tran may only be completed
* while holding a reader lock for the graph.
*/
static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
static int GRAPH_RDLOCK
bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
Transaction *tran, Error **errp)
{
BlockDriver *drv = bs->drv;
@ -2520,9 +2543,13 @@ static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
/*
* @list is a product of bdrv_topological_dfs() (may be called several times) -
* a topologically sorted subgraph.
*
* After calling this function, the transaction @tran may only be completed
* while holding a reader lock for the graph.
*/
static int bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q,
Transaction *tran, Error **errp)
static int GRAPH_RDLOCK
bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q, Transaction *tran,
Error **errp)
{
int ret;
BlockDriverState *bs;
@ -2548,9 +2575,13 @@ static int bdrv_do_refresh_perms(GSList *list, BlockReopenQueue *q,
* @list is any list of nodes. List is completed by all subtrees and
* topologically sorted. It's not a problem if some node occurs in the @list
* several times.
*
* After calling this function, the transaction @tran may only be completed
* while holding a reader lock for the graph.
*/
static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
Transaction *tran, Error **errp)
static int GRAPH_RDLOCK
bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q, Transaction *tran,
Error **errp)
{
g_autoptr(GHashTable) found = g_hash_table_new(NULL, NULL);
g_autoptr(GSList) refresh_list = NULL;
@ -2609,9 +2640,14 @@ char *bdrv_perm_names(uint64_t perm)
}
/* @tran is allowed to be NULL. In this case no rollback is possible */
static int bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran,
Error **errp)
/*
* @tran is allowed to be NULL. In this case no rollback is possible.
*
* After calling this function, the transaction @tran may only be completed
* while holding a reader lock for the graph.
*/
static int GRAPH_RDLOCK
bdrv_refresh_perms(BlockDriverState *bs, Transaction *tran, Error **errp)
{
int ret;
Transaction *local_tran = NULL;
@ -2857,8 +2893,8 @@ uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
* If @new_bs is non-NULL, the parent of @child must already be drained through
* @child and the caller must hold the AioContext lock for @new_bs.
*/
static void bdrv_replace_child_noperm(BdrvChild *child,
BlockDriverState *new_bs)
static void GRAPH_WRLOCK
bdrv_replace_child_noperm(BdrvChild *child, BlockDriverState *new_bs)
{
BlockDriverState *old_bs = child->bs;
int new_bs_quiesce_counter;
@ -2893,8 +2929,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
}
/* TODO Pull this up into the callers to avoid polling here */
bdrv_graph_wrlock(new_bs);
if (old_bs) {
if (child->klass->detach) {
child->klass->detach(child);
@ -2910,7 +2944,6 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
child->klass->attach(child);
}
}
bdrv_graph_wrunlock();
/*
* If the parent was drained through this BdrvChild previously, but new_bs
@ -2945,12 +2978,14 @@ typedef struct BdrvAttachChildCommonState {
AioContext *old_child_ctx;
} BdrvAttachChildCommonState;
static void bdrv_attach_child_common_abort(void *opaque)
static void GRAPH_WRLOCK bdrv_attach_child_common_abort(void *opaque)
{
BdrvAttachChildCommonState *s = opaque;
BlockDriverState *bs = s->child->bs;
GLOBAL_STATE_CODE();
assert_bdrv_graph_writable();
bdrv_replace_child_noperm(s->child, NULL);
if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
@ -2975,7 +3010,7 @@ static void bdrv_attach_child_common_abort(void *opaque)
tran_commit(tran);
}
bdrv_unref(bs);
bdrv_schedule_unref(bs);
bdrv_child_free(s->child);
}
@ -2989,13 +3024,17 @@ static TransactionActionDrv bdrv_attach_child_common_drv = {
*
* Function doesn't update permissions, caller is responsible for this.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
*
* Returns new created child.
*
* The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
* @child_bs can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
*/
static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
static BdrvChild * GRAPH_WRLOCK
bdrv_attach_child_common(BlockDriverState *child_bs,
const char *child_name,
const BdrvChildClass *child_class,
BdrvChildRole child_role,
@ -3104,8 +3143,12 @@ static BdrvChild *bdrv_attach_child_common(BlockDriverState *child_bs,
* The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
* @child_bs can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
*/
static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs,
static BdrvChild * GRAPH_WRLOCK
bdrv_attach_child_noperm(BlockDriverState *parent_bs,
BlockDriverState *child_bs,
const char *child_name,
const BdrvChildClass *child_class,
@ -3156,6 +3199,8 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
GLOBAL_STATE_CODE();
bdrv_graph_wrlock(child_bs);
child = bdrv_attach_child_common(child_bs, child_name, child_class,
child_role, perm, shared_perm, opaque,
tran, errp);
@ -3168,6 +3213,7 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
out:
tran_finalize(tran, ret);
bdrv_graph_wrunlock();
bdrv_unref(child_bs);
@ -3213,7 +3259,7 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
out:
tran_finalize(tran, ret);
bdrv_unref(child_bs);
bdrv_schedule_unref(child_bs);
return ret < 0 ? NULL : child;
}
@ -3243,7 +3289,7 @@ void bdrv_root_unref_child(BdrvChild *child)
NULL);
}
bdrv_unref(child_bs);
bdrv_schedule_unref(child_bs);
}
typedef struct BdrvSetInheritsFrom {
@ -3287,7 +3333,8 @@ static void bdrv_set_inherits_from(BlockDriverState *bs,
* @root that point to @root, where necessary.
* @tran is allowed to be NULL. In this case no rollback is possible
*/
static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
static void GRAPH_WRLOCK
bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
Transaction *tran)
{
BdrvChild *c;
@ -3325,7 +3372,8 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
}
static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
static void GRAPH_RDLOCK
bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
{
BdrvChild *c;
GLOBAL_STATE_CODE();
@ -3366,13 +3414,20 @@ static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
* Sets the bs->backing or bs->file link of a BDS. A new reference is created;
* callers which don't need their own reference any more must call bdrv_unref().
*
* If the respective child is already present (i.e. we're detaching a node),
* that child node must be drained.
*
* Function doesn't update permissions, caller is responsible for this.
*
* The caller must hold the AioContext lock for @child_bs. Both @parent_bs and
* @child_bs can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
*/
static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
static int GRAPH_WRLOCK
bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
BlockDriverState *child_bs,
bool is_backing,
Transaction *tran, Error **errp)
@ -3426,6 +3481,7 @@ static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
}
if (child) {
assert(child->bs->quiesce_counter);
bdrv_unset_inherits_from(parent_bs, child, tran);
bdrv_remove_child(child, tran);
}
@ -3452,9 +3508,7 @@ static int bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
}
out:
bdrv_graph_rdlock_main_loop();
bdrv_refresh_limits(parent_bs, tran, NULL);
bdrv_graph_rdunlock_main_loop();
return 0;
}
@ -3463,8 +3517,15 @@ out:
* The caller must hold the AioContext lock for @backing_hd. Both @bs and
* @backing_hd can move to a different AioContext in this function. Callers must
* make sure that their AioContext locking is still correct after this.
*
* If a backing child is already present (i.e. we're detaching a node), that
* child node must be drained.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
*/
static int bdrv_set_backing_noperm(BlockDriverState *bs,
static int GRAPH_WRLOCK
bdrv_set_backing_noperm(BlockDriverState *bs,
BlockDriverState *backing_hd,
Transaction *tran, Error **errp)
{
@ -3481,6 +3542,10 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs,
GLOBAL_STATE_CODE();
assert(bs->quiesce_counter > 0);
if (bs->backing) {
assert(bs->backing->bs->quiesce_counter > 0);
}
bdrv_graph_wrlock(backing_hd);
ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
if (ret < 0) {
@ -3490,18 +3555,22 @@ int bdrv_set_backing_hd_drained(BlockDriverState *bs,
ret = bdrv_refresh_perms(bs, tran, errp);
out:
tran_finalize(tran, ret);
bdrv_graph_wrunlock();
return ret;
}
int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
Error **errp)
{
BlockDriverState *drain_bs = bs->backing ? bs->backing->bs : bs;
int ret;
GLOBAL_STATE_CODE();
bdrv_drained_begin(bs);
bdrv_ref(drain_bs);
bdrv_drained_begin(drain_bs);
ret = bdrv_set_backing_hd_drained(bs, backing_hd, errp);
bdrv_drained_end(bs);
bdrv_drained_end(drain_bs);
bdrv_unref(drain_bs);
return ret;
}
@ -3711,11 +3780,13 @@ BdrvChild *bdrv_open_child(const char *filename,
return NULL;
}
bdrv_graph_wrlock(NULL);
ctx = bdrv_get_aio_context(bs);
aio_context_acquire(ctx);
child = bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
errp);
aio_context_release(ctx);
bdrv_graph_wrunlock();
return child;
}
@ -3900,6 +3971,9 @@ bdrv_open_inherit(const char *filename, const char *reference, QDict *options,
GLOBAL_STATE_CODE();
assert(!qemu_in_coroutine());
/* TODO We'll eventually have to take a writer lock in this function */
GRAPH_RDLOCK_GUARD_MAINLOOP();
if (reference) {
bool options_non_empty = options ? qdict_size(options) : false;
qobject_unref(options);
@ -4539,7 +4613,10 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
* reconfiguring the fd and that's why it does it in raw_check_perm(), not
* in raw_reopen_prepare() which is called with "old" permissions.
*/
bdrv_graph_rdlock_main_loop();
ret = bdrv_list_refresh_perms(refresh_list, bs_queue, tran, errp);
bdrv_graph_rdunlock_main_loop();
if (ret < 0) {
goto abort;
}
@ -4560,7 +4637,9 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
aio_context_release(ctx);
}
bdrv_graph_wrlock(NULL);
tran_commit(tran);
bdrv_graph_wrunlock();
QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) {
BlockDriverState *bs = bs_entry->state.bs;
@ -4577,7 +4656,10 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
goto cleanup;
abort:
bdrv_graph_wrlock(NULL);
tran_abort(tran);
bdrv_graph_wrunlock();
QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
if (bs_entry->prepared) {
ctx = bdrv_get_aio_context(bs_entry->state.bs);
@ -4643,6 +4725,9 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
* true and reopen_state->new_backing_bs contains a pointer to the new
* backing BlockDriverState (or NULL).
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
*
* Return 0 on success, otherwise return < 0 and set @errp.
*
* The caller must hold the AioContext lock of @reopen_state->bs.
@ -4727,6 +4812,11 @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
reopen_state->old_file_bs = old_child_bs;
}
if (old_child_bs) {
bdrv_ref(old_child_bs);
bdrv_drained_begin(old_child_bs);
}
old_ctx = bdrv_get_aio_context(bs);
ctx = bdrv_get_aio_context(new_child_bs);
if (old_ctx != ctx) {
@ -4734,14 +4824,23 @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
aio_context_acquire(ctx);
}
bdrv_graph_wrlock(new_child_bs);
ret = bdrv_set_file_or_backing_noperm(bs, new_child_bs, is_backing,
tran, errp);
bdrv_graph_wrunlock();
if (old_ctx != ctx) {
aio_context_release(ctx);
aio_context_acquire(old_ctx);
}
if (old_child_bs) {
bdrv_drained_end(old_child_bs);
bdrv_unref(old_child_bs);
}
return ret;
}
@ -4762,6 +4861,9 @@ static int bdrv_reopen_parse_file_or_backing(BDRVReopenState *reopen_state,
* commit() for any other BDS that have been left in a prepare() state
*
* The caller must hold the AioContext lock of @reopen_state->bs.
*
* After calling this function, the transaction @change_child_tran may only be
* completed while holding a writer lock for the graph.
*/
static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
BlockReopenQueue *queue,
@ -5063,9 +5165,11 @@ static void bdrv_close(BlockDriverState *bs)
bs->drv = NULL;
}
bdrv_graph_wrlock(NULL);
QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
bdrv_unref_child(bs, child);
}
bdrv_graph_wrunlock();
assert(!bs->backing);
assert(!bs->file);
@ -5120,7 +5224,7 @@ void bdrv_close_all(void)
assert(QTAILQ_EMPTY(&all_bdrv_states));
}
static bool should_update_child(BdrvChild *c, BlockDriverState *to)
static bool GRAPH_RDLOCK should_update_child(BdrvChild *c, BlockDriverState *to)
{
GQueue *queue;
GHashTable *found;
@ -5209,33 +5313,37 @@ static TransactionActionDrv bdrv_remove_child_drv = {
.commit = bdrv_remove_child_commit,
};
/* Function doesn't update permissions, caller is responsible for this. */
static void bdrv_remove_child(BdrvChild *child, Transaction *tran)
/*
* Function doesn't update permissions, caller is responsible for this.
*
* @child->bs (if non-NULL) must be drained.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
*/
static void GRAPH_WRLOCK bdrv_remove_child(BdrvChild *child, Transaction *tran)
{
if (!child) {
return;
}
if (child->bs) {
BlockDriverState *bs = child->bs;
bdrv_drained_begin(bs);
assert(child->quiesced_parent);
bdrv_replace_child_tran(child, NULL, tran);
bdrv_drained_end(bs);
}
tran_add(tran, &bdrv_remove_child_drv, child);
}
static void undrain_on_clean_cb(void *opaque)
{
bdrv_drained_end(opaque);
}
static TransactionActionDrv undrain_on_clean = {
.clean = undrain_on_clean_cb,
};
static int bdrv_replace_node_noperm(BlockDriverState *from,
/*
* Both @from and @to (if non-NULL) must be drained. @to must be kept drained
* until the transaction is completed.
*
* After calling this function, the transaction @tran may only be completed
* while holding a writer lock for the graph.
*/
static int GRAPH_WRLOCK
bdrv_replace_node_noperm(BlockDriverState *from,
BlockDriverState *to,
bool auto_skip, Transaction *tran,
Error **errp)
@ -5244,10 +5352,8 @@ static int bdrv_replace_node_noperm(BlockDriverState *from,
GLOBAL_STATE_CODE();
bdrv_drained_begin(from);
bdrv_drained_begin(to);
tran_add(tran, &undrain_on_clean, from);
tran_add(tran, &undrain_on_clean, to);
assert(from->quiesce_counter);
assert(to->quiesce_counter);
QLIST_FOREACH_SAFE(c, &from->parents, next_parent, next) {
assert(c->bs == from);
@ -5310,6 +5416,9 @@ static int bdrv_replace_node_common(BlockDriverState *from,
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
bdrv_drained_begin(from);
bdrv_drained_begin(to);
bdrv_graph_wrlock(to);
/*
* Do the replacement without permission update.
@ -5323,6 +5432,7 @@ static int bdrv_replace_node_common(BlockDriverState *from,
}
if (detach_subchain) {
/* to_cow_parent is already drained because from is drained */
bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran);
}
@ -5338,7 +5448,9 @@ static int bdrv_replace_node_common(BlockDriverState *from,
out:
tran_finalize(tran, ret);
bdrv_graph_wrunlock();
bdrv_drained_end(to);
bdrv_drained_end(from);
bdrv_unref(from);
@ -5388,6 +5500,22 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
assert(!bs_new->backing);
old_context = bdrv_get_aio_context(bs_top);
bdrv_drained_begin(bs_top);
/*
* bdrv_drained_begin() requires that only the AioContext of the drained
* node is locked, and at this point it can still differ from the AioContext
* of bs_top.
*/
new_context = bdrv_get_aio_context(bs_new);
aio_context_release(old_context);
aio_context_acquire(new_context);
bdrv_drained_begin(bs_new);
aio_context_release(new_context);
aio_context_acquire(old_context);
new_context = NULL;
bdrv_graph_wrlock(bs_top);
child = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
&child_of_bds, bdrv_backing_role(bs_new),
@ -5398,10 +5526,9 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
}
/*
* bdrv_attach_child_noperm could change the AioContext of bs_top.
* bdrv_replace_node_noperm calls bdrv_drained_begin, so let's temporarily
* hold the new AioContext, since bdrv_drained_begin calls BDRV_POLL_WHILE
* that assumes the new lock is taken.
* bdrv_attach_child_noperm could change the AioContext of bs_top and
* bs_new, but at least they are in the same AioContext now. This is the
* AioContext that we need to lock for the rest of the function.
*/
new_context = bdrv_get_aio_context(bs_top);
@ -5419,9 +5546,11 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
out:
tran_finalize(tran, ret);
bdrv_graph_rdlock_main_loop();
bdrv_refresh_limits(bs_top, NULL, NULL);
bdrv_graph_rdunlock_main_loop();
bdrv_graph_wrunlock();
bdrv_drained_end(bs_top);
bdrv_drained_end(bs_new);
if (new_context && old_context != new_context) {
aio_context_release(new_context);
@ -5445,6 +5574,7 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
bdrv_ref(old_bs);
bdrv_drained_begin(old_bs);
bdrv_drained_begin(new_bs);
bdrv_graph_wrlock(new_bs);
bdrv_replace_child_tran(child, new_bs, tran);
@ -5455,6 +5585,7 @@ int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
tran_finalize(tran, ret);
bdrv_graph_wrunlock();
bdrv_drained_end(old_bs);
bdrv_drained_end(new_bs);
bdrv_unref(old_bs);
@ -5476,6 +5607,8 @@ static void bdrv_delete(BlockDriverState *bs)
bdrv_close(bs);
qemu_mutex_destroy(&bs->reqs_lock);
g_free(bs);
}
@ -5808,9 +5941,11 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
backing_file_str = base->filename;
}
bdrv_graph_rdlock_main_loop();
QLIST_FOREACH(c, &top->parents, next_parent) {
updated_children = g_slist_prepend(updated_children, c);
}
bdrv_graph_rdunlock_main_loop();
/*
* It seems correct to pass detach_subchain=true here, but it triggers
@ -6733,6 +6868,7 @@ int bdrv_activate(BlockDriverState *bs, Error **errp)
BdrvDirtyBitmap *bm;
GLOBAL_STATE_CODE();
GRAPH_RDLOCK_GUARD_MAINLOOP();
if (!bs->drv) {
return -ENOMEDIUM;
@ -6863,6 +6999,7 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs)
uint64_t cumulative_perms, cumulative_shared_perms;
GLOBAL_STATE_CODE();
GRAPH_RDLOCK_GUARD_MAINLOOP();
if (!bs->drv) {
return -ENOMEDIUM;
@ -7041,6 +7178,23 @@ void bdrv_unref(BlockDriverState *bs)
}
}
/*
* Release a BlockDriverState reference while holding the graph write lock.
*
* Calling bdrv_unref() directly is forbidden while holding the graph lock
* because bdrv_close() both involves polling and taking the graph lock
* internally. bdrv_schedule_unref() instead delays decreasing the refcount and
* possibly closing @bs until the graph lock is released.
*/
void bdrv_schedule_unref(BlockDriverState *bs)
{
if (!bs) {
return;
}
aio_bh_schedule_oneshot(qemu_get_aio_context(),
(QEMUBHFunc *) bdrv_unref, bs);
}
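/*
 * Illustrative sketch (hypothetical caller, not from this patch): how code
 * that holds the graph write lock is expected to use this helper.
 */
#if 0
static void example_detach_and_unref(BlockDriverState *bs)
{
    bdrv_graph_wrlock(NULL);
    /* ... detach @bs from the graph under the write lock ... */
    bdrv_schedule_unref(bs);    /* bdrv_unref() is forbidden here */
    bdrv_graph_wrunlock();      /* runs the scheduled BH via aio_bh_poll() */
}
#endif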
struct BdrvOpBlocker {
Error *reason;
QLIST_ENTRY(BdrvOpBlocker) list;
@ -7537,17 +7691,21 @@ static bool bdrv_change_aio_context(BlockDriverState *bs, AioContext *ctx,
return true;
}
bdrv_graph_rdlock_main_loop();
QLIST_FOREACH(c, &bs->parents, next_parent) {
if (!bdrv_parent_change_aio_context(c, ctx, visited, tran, errp)) {
bdrv_graph_rdunlock_main_loop();
return false;
}
}
QLIST_FOREACH(c, &bs->children, next) {
if (!bdrv_child_change_aio_context(c, ctx, visited, tran, errp)) {
bdrv_graph_rdunlock_main_loop();
return false;
}
}
bdrv_graph_rdunlock_main_loop();
state = g_new(BdrvStateSetAioContext, 1);
*state = (BdrvStateSetAioContext) {
@ -7589,7 +7747,7 @@ int bdrv_try_change_aio_context(BlockDriverState *bs, AioContext *ctx,
/*
* Recursion phase: go through all nodes of the graph.
* Take care of checking that all nodes support changing AioContext
* and drain them, builing a linear list of callbacks to run if everything
* and drain them, building a linear list of callbacks to run if everything
* is successful (the transaction itself).
*/
tran = tran_new();

View File

@ -251,7 +251,9 @@ static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags,
ret = 0;
fail_log:
if (ret < 0) {
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, s->log_file);
bdrv_graph_wrunlock();
s->log_file = NULL;
}
fail:
@ -263,8 +265,10 @@ static void blk_log_writes_close(BlockDriverState *bs)
{
BDRVBlkLogWritesState *s = bs->opaque;
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, s->log_file);
s->log_file = NULL;
bdrv_graph_wrunlock();
}
static int64_t coroutine_fn GRAPH_RDLOCK

View File

@ -151,8 +151,10 @@ static void blkverify_close(BlockDriverState *bs)
{
BDRVBlkverifyState *s = bs->opaque;
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, s->test_file);
s->test_file = NULL;
bdrv_graph_wrunlock();
}
static int64_t coroutine_fn GRAPH_RDLOCK

View File

@ -33,8 +33,6 @@
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
typedef struct BlockBackendAioNotifier {
void (*attached_aio_context)(AioContext *new_context, void *opaque);
void (*detach_aio_context)(void *opaque);
@ -103,7 +101,6 @@ typedef struct BlockBackendAIOCB {
} BlockBackendAIOCB;
static const AIOCBInfo block_backend_aiocb_info = {
.get_aio_context = blk_aiocb_get_aio_context,
.aiocb_size = sizeof(BlockBackendAIOCB),
};
@ -121,6 +118,10 @@ static QTAILQ_HEAD(, BlockBackend) block_backends =
static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
static int coroutine_mixed_fn GRAPH_RDLOCK
blk_set_perm_locked(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
Error **errp);
static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format,
int *child_flags, QDict *child_options,
int parent_flags, QDict *parent_options)
@ -186,7 +187,7 @@ static void blk_vm_state_changed(void *opaque, bool running, RunState state)
*
* If an error is returned, the VM cannot be allowed to be resumed.
*/
static void blk_root_activate(BdrvChild *child, Error **errp)
static void GRAPH_RDLOCK blk_root_activate(BdrvChild *child, Error **errp)
{
BlockBackend *blk = child->opaque;
Error *local_err = NULL;
@ -207,7 +208,7 @@ static void blk_root_activate(BdrvChild *child, Error **errp)
*/
saved_shared_perm = blk->shared_perm;
blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
blk_set_perm_locked(blk, blk->perm, BLK_PERM_ALL, &local_err);
if (local_err) {
error_propagate(errp, local_err);
blk->disable_perm = true;
@ -226,7 +227,7 @@ static void blk_root_activate(BdrvChild *child, Error **errp)
return;
}
blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
blk_set_perm_locked(blk, blk->perm, blk->shared_perm, &local_err);
if (local_err) {
error_propagate(errp, local_err);
blk->disable_perm = true;
@ -259,7 +260,7 @@ static bool blk_can_inactivate(BlockBackend *blk)
return blk->force_allow_inactivate;
}
static int blk_root_inactivate(BdrvChild *child)
static int GRAPH_RDLOCK blk_root_inactivate(BdrvChild *child)
{
BlockBackend *blk = child->opaque;
@ -911,7 +912,10 @@ void blk_remove_bs(BlockBackend *blk)
blk_drain(blk);
root = blk->root;
blk->root = NULL;
bdrv_graph_wrlock(NULL);
bdrv_root_unref_child(root);
bdrv_graph_wrunlock();
}
/*
@ -953,7 +957,8 @@ int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp)
/*
* Sets the permission bitmasks that the user of the BlockBackend needs.
*/
int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
static int coroutine_mixed_fn GRAPH_RDLOCK
blk_set_perm_locked(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
Error **errp)
{
int ret;
@ -972,6 +977,15 @@ int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
return 0;
}
int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
Error **errp)
{
GLOBAL_STATE_CODE();
GRAPH_RDLOCK_GUARD_MAINLOOP();
return blk_set_perm_locked(blk, perm, shared_perm, errp);
}
void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
{
GLOBAL_STATE_CODE();
@ -1341,7 +1355,7 @@ blk_co_do_preadv_part(BlockBackend *blk, int64_t offset, int64_t bytes,
/* throttling disk I/O */
if (blk->public.throttle_group_member.throttle_state) {
throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
bytes, false);
bytes, THROTTLE_READ);
}
ret = bdrv_co_preadv_part(blk->root, offset, bytes, qiov, qiov_offset,
@ -1415,7 +1429,7 @@ blk_co_do_pwritev_part(BlockBackend *blk, int64_t offset, int64_t bytes,
/* throttling disk I/O */
if (blk->public.throttle_group_member.throttle_state) {
throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
bytes, true);
bytes, THROTTLE_WRITE);
}
if (!blk->enable_write_cache) {
@ -1533,7 +1547,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
acb->blk = blk;
acb->ret = ret;
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
error_callback_bh, acb);
return &acb->common;
}
@ -1545,16 +1559,8 @@ typedef struct BlkAioEmAIOCB {
bool has_returned;
} BlkAioEmAIOCB;
static AioContext *blk_aio_em_aiocb_get_aio_context(BlockAIOCB *acb_)
{
BlkAioEmAIOCB *acb = container_of(acb_, BlkAioEmAIOCB, common);
return blk_get_aio_context(acb->rwco.blk);
}
static const AIOCBInfo blk_aio_em_aiocb_info = {
.aiocb_size = sizeof(BlkAioEmAIOCB),
.get_aio_context = blk_aio_em_aiocb_get_aio_context,
};
static void blk_aio_complete(BlkAioEmAIOCB *acb)
@ -1595,11 +1601,11 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset,
acb->has_returned = false;
co = qemu_coroutine_create(co_entry, acb);
aio_co_enter(blk_get_aio_context(blk), co);
aio_co_enter(qemu_get_current_aio_context(), co);
acb->has_returned = true;
if (acb->rwco.ret != NOT_DONE) {
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
blk_aio_complete_bh, acb);
}
@ -1901,11 +1907,11 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
acb->has_returned = false;
co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
aio_co_enter(blk_get_aio_context(blk), co);
aio_co_enter(qemu_get_current_aio_context(), co);
acb->has_returned = true;
if (acb->rwco.ret != NOT_DONE) {
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
blk_aio_complete_bh, acb);
}
@ -1942,11 +1948,11 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
acb->has_returned = false;
co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
aio_co_enter(blk_get_aio_context(blk), co);
aio_co_enter(qemu_get_current_aio_context(), co);
acb->has_returned = true;
if (acb->rwco.ret != NOT_DONE) {
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
blk_aio_complete_bh, acb);
}
@ -1982,10 +1988,10 @@ BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
acb->has_returned = false;
co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
aio_co_enter(blk_get_aio_context(blk), co);
aio_co_enter(qemu_get_current_aio_context(), co);
acb->has_returned = true;
if (acb->rwco.ret != NOT_DONE) {
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
blk_aio_complete_bh, acb);
}
@ -2434,12 +2440,6 @@ AioContext *blk_get_aio_context(BlockBackend *blk)
return blk->ctx;
}
static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
{
BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
return blk_get_aio_context(blk_acb->blk);
}
int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
Error **errp)
{

View File

@ -67,7 +67,7 @@ typedef struct BlockCopyCallState {
QLIST_ENTRY(BlockCopyCallState) list;
/*
* Fields that report information about return values and erros.
* Fields that report information about return values and errors.
* Protected by lock in BlockCopyState.
*/
bool error_is_read;
@ -462,7 +462,7 @@ static coroutine_fn int block_copy_task_run(AioTaskPool *pool,
* Do copy of cluster-aligned chunk. Requested region is allowed to exceed
* s->len only to cover last cluster when s->len is not aligned to clusters.
*
* No sync here: nor bitmap neighter intersecting requests handling, only copy.
* No sync here: neither bitmap nor intersecting requests handling, only copy.
*
* @method is an in-out argument, so that copy_range can be either extended to
* a full-size buffer or disabled if the copy_range attempt fails. The output

View File

@ -341,8 +341,8 @@ static void cbw_refresh_filename(BlockDriverState *bs)
bs->file->bs->filename);
}
static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
BdrvChildRole role,
static void GRAPH_RDLOCK
cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
BlockReopenQueue *reopen_queue,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
@ -503,7 +503,7 @@ static void cbw_close(BlockDriverState *bs)
s->bcs = NULL;
}
BlockDriver bdrv_cbw_filter = {
static BlockDriver bdrv_cbw_filter = {
.format_name = "copy-before-write",
.instance_size = sizeof(BDRVCopyBeforeWriteState),

View File

@ -777,7 +777,7 @@ block_crypto_get_specific_info_luks(BlockDriverState *bs, Error **errp)
return spec_info;
}
static int
static int GRAPH_RDLOCK
block_crypto_amend_prepare(BlockDriverState *bs, Error **errp)
{
BlockCrypto *crypto = bs->opaque;
@ -793,7 +793,7 @@ block_crypto_amend_prepare(BlockDriverState *bs, Error **errp)
return ret;
}
static void
static void GRAPH_RDLOCK
block_crypto_amend_cleanup(BlockDriverState *bs)
{
BlockCrypto *crypto = bs->opaque;
@ -841,6 +841,8 @@ block_crypto_amend_options_luks(BlockDriverState *bs,
QCryptoBlockAmendOptions *amend_options = NULL;
int ret = -EINVAL;
assume_graph_lock(); /* FIXME */
assert(crypto);
assert(crypto->block);

View File

@ -138,7 +138,7 @@ static void vduse_blk_enable_queue(VduseDev *dev, VduseVirtq *vq)
aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
on_vduse_vq_kick, NULL, NULL, NULL, vq);
/* Make sure we don't miss any kick afer reconnecting */
/* Make sure we don't miss any kick after reconnecting */
eventfd_write(vduse_queue_get_fd(vq), 1);
}

View File

@ -1,5 +1,5 @@
/*
* Sharing QEMU block devices via vhost-user protocal
* Sharing QEMU block devices via vhost-user protocol
*
* Parts of the code based on nbd/server.c.
*

View File

@ -1,5 +1,5 @@
/*
* Sharing QEMU block devices via vhost-user protocal
* Sharing QEMU block devices via vhost-user protocol
*
* Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
* Copyright (c) 2020 Red Hat, Inc.

View File

@ -1159,9 +1159,9 @@ static int raw_reopen_prepare(BDRVReopenState *state,
* As part of reopen prepare we also want to create new fd by
* raw_reconfigure_getfd(). But it wants updated "perm", when in
* bdrv_reopen_multiple() .bdrv_reopen_prepare() callback called prior to
* permission update. Happily, permission update is always a part (a seprate
* stage) of bdrv_reopen_multiple() so we can rely on this fact and
* reconfigure fd in raw_check_perm().
* permission update. Happily, permission update is always a part
* (a separate stage) of bdrv_reopen_multiple() so we can rely on this
* fact and reconfigure fd in raw_check_perm().
*/
s->reopen_state = state;
@ -1412,11 +1412,9 @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
BlockZoneModel zoned;
int ret;
bs->bl.zoned = BLK_Z_NONE;
ret = get_sysfs_zoned_model(st, &zoned);
if (ret < 0 || zoned == BLK_Z_NONE) {
return;
goto no_zoned;
}
bs->bl.zoned = zoned;
@ -1437,10 +1435,10 @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
if (ret < 0) {
error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
"sysfs attribute");
return;
goto no_zoned;
} else if (!ret) {
error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
return;
goto no_zoned;
}
bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
@ -1448,10 +1446,10 @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
if (ret < 0) {
error_setg_errno(errp, -ret, "Unable to read nr_zones "
"sysfs attribute");
return;
goto no_zoned;
} else if (!ret) {
error_setg(errp, "Read 0 from nr_zones sysfs attribute");
return;
goto no_zoned;
}
bs->bl.nr_zones = ret;
@ -1472,10 +1470,15 @@ static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
if (ret < 0) {
error_setg_errno(errp, -ret, "report wps failed");
bs->wps = NULL;
return;
goto no_zoned;
}
qemu_co_mutex_init(&bs->wps->colock);
return;
no_zoned:
bs->bl.zoned = BLK_Z_NONE;
g_free(bs->wps);
bs->wps = NULL;
}
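/*
 * Resulting invariant (illustration, an assumption drawn from the code
 * above): after any failure path, bs->bl.zoned == BLK_Z_NONE and
 * bs->wps == NULL, so raw_co_prw() can key the write-pointer locking off
 * bs->bl.zoned alone.
 */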
#else /* !defined(CONFIG_BLKZONED) */
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
@ -2452,9 +2455,10 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
if (fd_open(bs) < 0)
return -EIO;
#if defined(CONFIG_BLKZONED)
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) {
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
bs->bl.zoned != BLK_Z_NONE) {
qemu_co_mutex_lock(&bs->wps->colock);
if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
if (type & QEMU_AIO_ZONE_APPEND) {
int index = offset / bs->bl.zone_size;
offset = bs->wps->wp[index];
}
@ -2502,11 +2506,10 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
out:
#if defined(CONFIG_BLKZONED)
{
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
bs->bl.zoned != BLK_Z_NONE) {
BlockZoneWps *wps = bs->wps;
if (ret == 0) {
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
&& wps && bs->bl.zone_size) {
uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
if (!BDRV_ZT_IS_CONV(*wp)) {
if (type & QEMU_AIO_ZONE_APPEND) {
@ -2519,17 +2522,12 @@ out:
*wp = offset + bytes;
}
}
}
} else {
if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
update_zones_wp(bs, s->fd, 0, 1);
}
}
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) {
qemu_co_mutex_unlock(&wps->colock);
}
}
#endif
return ret;
}
@ -3374,7 +3372,7 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
* of an array of zone descriptors.
* zones is an array of zone descriptors to hold zone information on reply;
* offset can be any byte within the entire size of the device;
* nr_zones is the maxium number of sectors the command should operate on.
* nr_zones is the maximum number of sectors the command should operate on.
*/
#if defined(CONFIG_BLKZONED)
static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,

View File

@ -95,7 +95,7 @@ static uint32_t reader_count(void)
QEMU_LOCK_GUARD(&aio_context_list_lock);
/* rd can temporarly be negative, but the total will *always* be >= 0 */
/* rd can temporarily be negative, but the total will *always* be >= 0 */
rd = orphaned_reader_count;
QTAILQ_FOREACH(brdv_graph, &aio_context_list, next_aio) {
rd += qatomic_read(&brdv_graph->reader_count);
@ -163,19 +163,31 @@ void bdrv_graph_wrlock(BlockDriverState *bs)
void bdrv_graph_wrunlock(void)
{
GLOBAL_STATE_CODE();
QEMU_LOCK_GUARD(&aio_context_list_lock);
assert(qatomic_read(&has_writer));
WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
/*
* No need for memory barriers, this works in pair with
* the slow path of rdlock() and both take the lock.
*/
qatomic_store_release(&has_writer, 0);
/* Wake up all coroutine that are waiting to read the graph */
/* Wake up all coroutines that are waiting to read the graph */
qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
}
/*
* Run any BHs that were scheduled during the wrlock section and that
* callers might expect to have finished (in particular, this is important
* for bdrv_schedule_unref()).
*
* Do this only after restarting coroutines so that nested event loops in
* BHs don't deadlock if their condition relies on the coroutine making
* progress.
*/
aio_bh_poll(qemu_get_aio_context());
}
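/*
 * Illustrative pairing (assumption based on the comment above): a BH queued
 * with bdrv_schedule_unref() inside the locked section has run by the time
 * bdrv_graph_wrunlock() returns:
 *
 *     bdrv_graph_wrlock(NULL);
 *     bdrv_schedule_unref(bs);
 *     bdrv_graph_wrunlock();    <- aio_bh_poll() executes the unref
 */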
void coroutine_fn bdrv_graph_co_rdlock(void)
{
BdrvGraphRWlock *bdrv_graph;

View File

@ -342,7 +342,7 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
* timer callback), it is a bug in the caller that should be fixed. */
assert(data.done);
/* Reaquire the AioContext of bs if we dropped it */
/* Reacquire the AioContext of bs if we dropped it */
if (ctx != co_ctx) {
aio_context_acquire(ctx);
}
@ -591,10 +591,16 @@ static void coroutine_fn tracked_request_end(BdrvTrackedRequest *req)
qatomic_dec(&req->bs->serialising_in_flight);
}
qemu_co_mutex_lock(&req->bs->reqs_lock);
qemu_mutex_lock(&req->bs->reqs_lock);
QLIST_REMOVE(req, list);
qemu_mutex_unlock(&req->bs->reqs_lock);
/*
* At this point qemu_co_queue_wait(&req->wait_queue, ...) won't be called
* anymore because the request has been removed from the list, so it's safe
* to restart the queue outside reqs_lock to minimize the critical section.
*/
qemu_co_queue_restart_all(&req->wait_queue);
qemu_co_mutex_unlock(&req->bs->reqs_lock);
}
/**
@ -621,9 +627,9 @@ static void coroutine_fn tracked_request_begin(BdrvTrackedRequest *req,
qemu_co_queue_init(&req->wait_queue);
qemu_co_mutex_lock(&bs->reqs_lock);
qemu_mutex_lock(&bs->reqs_lock);
QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
qemu_co_mutex_unlock(&bs->reqs_lock);
qemu_mutex_unlock(&bs->reqs_lock);
}
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
@ -787,9 +793,9 @@ bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
return;
}
qemu_co_mutex_lock(&bs->reqs_lock);
qemu_mutex_lock(&bs->reqs_lock);
bdrv_wait_serialising_requests_locked(self);
qemu_co_mutex_unlock(&bs->reqs_lock);
qemu_mutex_unlock(&bs->reqs_lock);
}
void coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
@ -797,12 +803,12 @@ void coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
{
IO_CODE();
qemu_co_mutex_lock(&req->bs->reqs_lock);
qemu_mutex_lock(&req->bs->reqs_lock);
tracked_request_set_serialising(req, align);
bdrv_wait_serialising_requests_locked(req);
qemu_co_mutex_unlock(&req->bs->reqs_lock);
qemu_mutex_unlock(&req->bs->reqs_lock);
}
int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
@ -2944,25 +2950,18 @@ int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
/**************************************************************/
/* async I/Os */
/**
* Synchronously cancels an acb. Must be called with the BQL held and the acb
* must be processed with the BQL held too (IOThreads are not allowed).
*
* Use bdrv_aio_cancel_async() instead when possible.
*/
void bdrv_aio_cancel(BlockAIOCB *acb)
{
IO_CODE();
GLOBAL_STATE_CODE();
qemu_aio_ref(acb);
bdrv_aio_cancel_async(acb);
while (acb->refcnt > 1) {
if (acb->aiocb_info->get_aio_context) {
aio_poll(acb->aiocb_info->get_aio_context(acb), true);
} else if (acb->bs) {
/* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
* assert that we're not using an I/O thread. Thread-safe
* code should use bdrv_aio_cancel_async exclusively.
*/
assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
aio_poll(bdrv_get_aio_context(acb->bs), true);
} else {
abort();
}
}
AIO_WAIT_WHILE_UNLOCKED(NULL, acb->refcnt > 1);
qemu_aio_unref(acb);
}
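/*
 * Illustration (assumption, not from this patch): thread-safe code should
 * prefer the asynchronous variant, which does not block:
 *
 *     bdrv_aio_cancel_async(acb);
 *
 * The completion callback still fires, with -ECANCELED or with the real
 * result if the request had already finished.
 */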
@ -2996,7 +2995,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
goto early_exit;
}
qemu_co_mutex_lock(&bs->reqs_lock);
qemu_mutex_lock(&bs->reqs_lock);
current_gen = qatomic_read(&bs->write_gen);
/* Wait until any previous flushes are completed */
@ -3006,7 +3005,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
/* Flushes reach this point in nondecreasing current_gen order. */
bs->active_flush_req = true;
qemu_co_mutex_unlock(&bs->reqs_lock);
qemu_mutex_unlock(&bs->reqs_lock);
/* Write back all layers by calling one driver function */
if (bs->drv->bdrv_co_flush) {
@ -3094,11 +3093,11 @@ out:
bs->flushed_gen = current_gen;
}
qemu_co_mutex_lock(&bs->reqs_lock);
qemu_mutex_lock(&bs->reqs_lock);
bs->active_flush_req = false;
/* Return value is ignored - it's ok if wait queue is empty */
qemu_co_queue_next(&bs->flush_queue);
qemu_co_mutex_unlock(&bs->reqs_lock);
qemu_mutex_unlock(&bs->reqs_lock);
early_exit:
bdrv_dec_in_flight(bs);

View File

@ -1058,6 +1058,7 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
return NULL;
}
/* Must use malloc(): this is freed via scsi_free_scsi_task() */
acb->task = malloc(sizeof(struct scsi_task));
if (acb->task == NULL) {
error_report("iSCSI: Failed to allocate task for scsi command. %s",

View File

@ -227,7 +227,7 @@ static void qemu_laio_process_completions(LinuxAioState *s)
/* If we are nested we have to notify the level above that we are done
* by setting event_max to zero, upper level will then jump out of it's
* own `for` loop. If we are the last all counters droped to zero. */
* own `for` loop. If we are the last all counters dropped to zero. */
s->event_max = 0;
s->event_idx = 0;
}

View File

@ -4,41 +4,41 @@ block_ss.add(files(
'aio_task.c',
'amend.c',
'backup.c',
'copy-before-write.c',
'blkdebug.c',
'blklogwrites.c',
'blkverify.c',
'block-backend.c',
'block-copy.c',
'graph-lock.c',
'commit.c',
'copy-before-write.c',
'copy-on-read.c',
'preallocate.c',
'progress_meter.c',
'create.c',
'crypto.c',
'dirty-bitmap.c',
'filter-compress.c',
'graph-lock.c',
'io.c',
'mirror.c',
'nbd.c',
'null.c',
'plug.c',
'preallocate.c',
'progress_meter.c',
'qapi.c',
'qcow2.c',
'qcow2-bitmap.c',
'qcow2-cache.c',
'qcow2-cluster.c',
'qcow2-refcount.c',
'qcow2-snapshot.c',
'qcow2-threads.c',
'qcow2.c',
'quorum.c',
'raw-format.c',
'reqlist.c',
'snapshot.c',
'snapshot-access.c',
'throttle-groups.c',
'throttle.c',
'throttle-groups.c',
'write-threshold.c',
), zstd, zlib, gnutls)

View File

@ -502,7 +502,7 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
job_pause_point(&s->common.job);
/* Find the number of consective dirty chunks following the first dirty
/* Find the number of consecutive dirty chunks following the first dirty
* one, and wait for in flight requests in them. */
bdrv_dirty_bitmap_lock(s->dirty_bitmap);
while (nb_chunks * s->granularity < s->buf_size) {
@ -702,8 +702,12 @@ static int mirror_exit_common(Job *job)
* mirror_top_bs from now on, so keep it drained. */
bdrv_drained_begin(mirror_top_bs);
bs_opaque->stop = true;
bdrv_graph_rdlock_main_loop();
bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
&error_abort);
bdrv_graph_rdunlock_main_loop();
if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
BlockDriverState *backing = s->is_none_mode ? src : s->base;
BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs);
@ -1670,6 +1674,8 @@ static BlockJob *mirror_start_job(
uint64_t target_perms, target_shared_perms;
int ret;
GLOBAL_STATE_CODE();
if (granularity == 0) {
granularity = bdrv_get_default_bitmap_granularity(target);
}
@ -1906,8 +1912,10 @@ fail:
}
bs_opaque->stop = true;
bdrv_graph_rdlock_main_loop();
bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
&error_abort);
bdrv_graph_rdunlock_main_loop();
bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort);
bdrv_unref(mirror_top_bs);

View File

@ -352,7 +352,7 @@ int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs,
}
qio_channel_set_blocking(s->ioc, false, NULL);
qio_channel_attach_aio_context(s->ioc, bdrv_get_aio_context(bs));
qio_channel_set_follow_coroutine_ctx(s->ioc, true);
/* successfully connected */
WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
@ -397,7 +397,6 @@ static void coroutine_fn GRAPH_RDLOCK nbd_reconnect_attempt(BDRVNBDState *s)
/* Finalize previous connection if any */
if (s->ioc) {
qio_channel_detach_aio_context(s->ioc);
yank_unregister_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name),
nbd_yank, s->bs);
object_unref(OBJECT(s->ioc));
@ -2089,10 +2088,6 @@ static void nbd_attach_aio_context(BlockDriverState *bs,
* the reconnect_delay_timer cannot be active here.
*/
assert(!s->reconnect_delay_timer);
if (s->ioc) {
qio_channel_attach_aio_context(s->ioc, new_context);
}
}
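/*
 * Illustration (assumption): qio_channel_set_follow_coroutine_ctx(s->ioc,
 * true) in nbd_co_do_establish_connection() makes the channel follow the
 * AioContext of whichever coroutine performs I/O on it, so the manual
 * attach/detach calls removed here are no longer needed.
 */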
static void nbd_detach_aio_context(BlockDriverState *bs)
@ -2101,10 +2096,6 @@ static void nbd_detach_aio_context(BlockDriverState *bs)
assert(!s->open_timer);
assert(!s->reconnect_delay_timer);
if (s->ioc) {
qio_channel_detach_aio_context(s->ioc);
}
}
static BlockDriver bdrv_nbd = {

View File

@ -136,6 +136,12 @@ static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num,
return MIN(nb_sectors, ret);
}
static uint32_t host_cluster_index(BDRVParallelsState *s, int64_t off)
{
off -= s->data_start << BDRV_SECTOR_BITS;
return off / s->cluster_size;
}
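/*
 * Worked example (illustrative numbers only): with data_start == 2048
 * sectors (1 MiB into the file) and cluster_size == 1 MiB, a host offset
 * of 4 MiB maps to cluster index (4 MiB - 1 MiB) / 1 MiB == 3.
 */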
static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
int nb_sectors, int *pnum)
{
@ -172,13 +178,82 @@ static void parallels_set_bat_entry(BDRVParallelsState *s,
bitmap_set(s->bat_dirty_bmap, bat_entry_off(index) / s->bat_dirty_block, 1);
}
static int mark_used(BlockDriverState *bs, unsigned long *bitmap,
uint32_t bitmap_size, int64_t off, uint32_t count)
{
BDRVParallelsState *s = bs->opaque;
uint32_t cluster_index = host_cluster_index(s, off);
unsigned long next_used;
if (cluster_index + count > bitmap_size) {
return -E2BIG;
}
next_used = find_next_bit(bitmap, bitmap_size, cluster_index);
if (next_used < cluster_index + count) {
return -EBUSY;
}
bitmap_set(bitmap, cluster_index, count);
return 0;
}
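/*
 * Illustrative behaviour (hypothetical values): marking the same cluster
 * twice reports a duplicate, which the consistency checks below rely on:
 *
 *     mark_used(bs, bitmap, size, host_off, 1);        -> 0
 *     mark_used(bs, bitmap, size, host_off, 1);        -> -EBUSY
 *     mark_used(bs, bitmap, size, past_end_off, 1);    -> -E2BIG
 */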
/*
 * Collect the used bitmap. The image can contain errors; we should fill
 * the bitmap anyway, as much as we can. This information will be used for
* error resolution.
*/
static int parallels_fill_used_bitmap(BlockDriverState *bs)
{
BDRVParallelsState *s = bs->opaque;
int64_t payload_bytes;
uint32_t i;
int err = 0;
payload_bytes = bdrv_getlength(bs->file->bs);
if (payload_bytes < 0) {
return payload_bytes;
}
payload_bytes -= s->data_start * BDRV_SECTOR_SIZE;
if (payload_bytes < 0) {
return -EINVAL;
}
s->used_bmap_size = DIV_ROUND_UP(payload_bytes, s->cluster_size);
if (s->used_bmap_size == 0) {
return 0;
}
s->used_bmap = bitmap_try_new(s->used_bmap_size);
if (s->used_bmap == NULL) {
return -ENOMEM;
}
for (i = 0; i < s->bat_size; i++) {
int err2;
int64_t host_off = bat2sect(s, i) << BDRV_SECTOR_BITS;
if (host_off == 0) {
continue;
}
err2 = mark_used(bs, s->used_bmap, s->used_bmap_size, host_off, 1);
if (err2 < 0 && err == 0) {
err = err2;
}
}
return err;
}
static void parallels_free_used_bitmap(BlockDriverState *bs)
{
BDRVParallelsState *s = bs->opaque;
s->used_bmap_size = 0;
g_free(s->used_bmap);
}
static int64_t coroutine_fn GRAPH_RDLOCK
allocate_clusters(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, int *pnum)
{
int ret = 0;
BDRVParallelsState *s = bs->opaque;
int64_t pos, space, idx, to_allocate, i, len;
int64_t i, pos, idx, to_allocate, first_free, host_off;
pos = block_status(s, sector_num, nb_sectors, pnum);
if (pos > 0) {
@ -188,7 +263,8 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
idx = sector_num / s->tracks;
to_allocate = DIV_ROUND_UP(sector_num + *pnum, s->tracks) - idx;
/* This function is called only by parallels_co_writev(), which will never
/*
* This function is called only by parallels_co_writev(), which will never
* pass a sector_num at or beyond the end of the image (because the block
* layer never passes such a sector_num to that function). Therefore, idx
* is always below s->bat_size.
@ -196,24 +272,25 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
* exceed the image end. Therefore, idx + to_allocate cannot exceed
* s->bat_size.
* Note that s->bat_size is an unsigned int, therefore idx + to_allocate
* will always fit into a uint32_t. */
* will always fit into a uint32_t.
*/
assert(idx < s->bat_size && idx + to_allocate <= s->bat_size);
space = to_allocate * s->tracks;
len = bdrv_co_getlength(bs->file->bs);
if (len < 0) {
return len;
}
if (s->data_end + space > (len >> BDRV_SECTOR_BITS)) {
space += s->prealloc_size;
first_free = find_first_zero_bit(s->used_bmap, s->used_bmap_size);
if (first_free == s->used_bmap_size) {
uint32_t new_usedsize;
int64_t bytes = to_allocate * s->cluster_size;
bytes += s->prealloc_size * BDRV_SECTOR_SIZE;
host_off = s->data_end * BDRV_SECTOR_SIZE;
/*
* We require the expanded size to read back as zero. If the
* user permitted truncation, we try that; but if it fails, we
* force the safer-but-slower fallocate.
*/
if (s->prealloc_mode == PRL_PREALLOC_MODE_TRUNCATE) {
ret = bdrv_co_truncate(bs->file,
(s->data_end + space) << BDRV_SECTOR_BITS,
ret = bdrv_co_truncate(bs->file, host_off + bytes,
false, PREALLOC_MODE_OFF,
BDRV_REQ_ZERO_WRITE, NULL);
if (ret == -ENOTSUP) {
@ -221,22 +298,53 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
}
}
if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
ret = bdrv_co_pwrite_zeroes(bs->file,
s->data_end << BDRV_SECTOR_BITS,
space << BDRV_SECTOR_BITS, 0);
ret = bdrv_co_pwrite_zeroes(bs->file, host_off, bytes, 0);
}
if (ret < 0) {
return ret;
}
new_usedsize = s->used_bmap_size + bytes / s->cluster_size;
s->used_bmap = bitmap_zero_extend(s->used_bmap, s->used_bmap_size,
new_usedsize);
s->used_bmap_size = new_usedsize;
} else {
int64_t next_used;
next_used = find_next_bit(s->used_bmap, s->used_bmap_size, first_free);
/* Not enough contiguous clusters in the middle, adjust the size */
if (next_used - first_free < to_allocate) {
to_allocate = next_used - first_free;
*pnum = (idx + to_allocate) * s->tracks - sector_num;
}
/* Try to read from backing to fill empty clusters
host_off = s->data_start * BDRV_SECTOR_SIZE;
host_off += first_free * s->cluster_size;
/*
 * No need to preallocate if we are using the tail area from the branch
 * above. Otherwise we are likely reusing a hole. Preallocate
* the space if required by the prealloc_mode.
*/
if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE &&
host_off < s->data_end * BDRV_SECTOR_SIZE) {
ret = bdrv_co_pwrite_zeroes(bs->file, host_off,
s->cluster_size * to_allocate, 0);
if (ret < 0) {
return ret;
}
}
}
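/*
 * Worked example for the branch above (hypothetical bitmap): with
 * used_bmap = {1,1,0,0,1} and to_allocate == 3, first_free == 2 and
 * next_used == 4, so to_allocate is clamped to 2 and the remaining
 * cluster is allocated by a later call.
 */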
/*
* Try to read from backing to fill empty clusters
* FIXME: 1. previous write_zeroes may be redundant
* 2. most of data we read from backing will be rewritten by
* parallels_co_writev. On aligned-to-cluster write we do not need
* this read at all.
* 3. it would be good to combine write of data from backing and new
* data into one write call */
* data into one write call.
*/
if (bs->backing) {
int64_t nb_cow_sectors = to_allocate * s->tracks;
int64_t nb_cow_bytes = nb_cow_sectors << BDRV_SECTOR_BITS;
@ -257,9 +365,18 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
}
}
ret = mark_used(bs, s->used_bmap, s->used_bmap_size, host_off, to_allocate);
if (ret < 0) {
/* Image consistency is broken. Alarm! */
return ret;
}
for (i = 0; i < to_allocate; i++) {
parallels_set_bat_entry(s, idx + i, s->data_end / s->off_multiplier);
s->data_end += s->tracks;
parallels_set_bat_entry(s, idx + i,
host_off / BDRV_SECTOR_SIZE / s->off_multiplier);
host_off += s->cluster_size;
}
if (host_off > s->data_end * BDRV_SECTOR_SIZE) {
s->data_end = host_off / BDRV_SECTOR_SIZE;
}
return bat2sect(s, idx) + sector_num % s->tracks;
@ -420,6 +537,64 @@ parallels_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
return ret;
}
static int coroutine_fn GRAPH_RDLOCK
parallels_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
int ret = 0;
uint32_t cluster, count;
BDRVParallelsState *s = bs->opaque;
/*
 * The image does not support a ZERO mark inside the BAT, which means that
* stale data could be exposed from the backing file.
*/
if (bs->backing) {
return -ENOTSUP;
}
if (!QEMU_IS_ALIGNED(offset, s->cluster_size)) {
return -ENOTSUP;
} else if (!QEMU_IS_ALIGNED(bytes, s->cluster_size)) {
return -ENOTSUP;
}
cluster = offset / s->cluster_size;
count = bytes / s->cluster_size;
qemu_co_mutex_lock(&s->lock);
for (; count > 0; cluster++, count--) {
int64_t host_off = bat2sect(s, cluster) << BDRV_SECTOR_BITS;
if (host_off == 0) {
continue;
}
ret = bdrv_co_pdiscard(bs->file, host_off, s->cluster_size);
if (ret < 0) {
goto done;
}
parallels_set_bat_entry(s, cluster, 0);
bitmap_clear(s->used_bmap, host_cluster_index(s, host_off), 1);
}
done:
qemu_co_mutex_unlock(&s->lock);
return ret;
}
static int coroutine_fn GRAPH_RDLOCK
parallels_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
BdrvRequestFlags flags)
{
/*
 * The zero flag is missing from the Parallels format specification. We can
 * resort to discard if we have no backing file (this condition is checked
 * inside parallels_co_pdiscard()).
*/
return parallels_co_pdiscard(bs, offset, bytes);
}
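/*
 * Illustration (assumption): zeroing a cluster-aligned region discards it
 * and clears the BAT entries, so later reads return zeroes because there
 * is no backing file that could expose stale data:
 *
 *     parallels_co_pwrite_zeroes(bs, n * s->cluster_size,
 *                                s->cluster_size, 0);
 */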
static void parallels_check_unclean(BlockDriverState *bs,
BdrvCheckResult *res,
BdrvCheckMode fix)
@ -440,6 +615,91 @@ static void parallels_check_unclean(BlockDriverState *bs,
}
}
/*
* Returns true if data_off is correct, otherwise false. In both cases
* correct_offset is set to the proper value.
*/
static bool parallels_test_data_off(BDRVParallelsState *s,
int64_t file_nb_sectors,
uint32_t *correct_offset)
{
uint32_t data_off, min_off;
bool old_magic;
/*
* There are two slightly different image formats: with "WithoutFreeSpace"
* or "WithouFreSpacExt" magic words. Call the first one as "old magic".
* In such images data_off field can be zero. In this case the offset is
* calculated as the end of BAT table plus some padding to ensure sector
* size alignment.
*/
old_magic = !memcmp(s->header->magic, HEADER_MAGIC, 16);
min_off = DIV_ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE);
if (!old_magic) {
min_off = ROUND_UP(min_off, s->cluster_size / BDRV_SECTOR_SIZE);
}
if (correct_offset) {
*correct_offset = min_off;
}
data_off = le32_to_cpu(s->header->data_off);
if (data_off == 0 && old_magic) {
return true;
}
if (data_off < min_off || data_off > file_nb_sectors) {
return false;
}
if (correct_offset) {
*correct_offset = data_off;
}
return true;
}
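/*
 * Illustration (assumption): an old-magic image whose header has
 * data_off == 0 is reported as correct and *correct_offset becomes the
 * sector-aligned end of the BAT; a data_off pointing past the end of the
 * file is reported as incorrect, which makes the callers schedule a repair.
 */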
static int coroutine_fn GRAPH_RDLOCK
parallels_check_data_off(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix)
{
BDRVParallelsState *s = bs->opaque;
int64_t file_size;
uint32_t data_off;
file_size = bdrv_co_nb_sectors(bs->file->bs);
if (file_size < 0) {
res->check_errors++;
return file_size;
}
if (parallels_test_data_off(s, file_size, &data_off)) {
return 0;
}
res->corruptions++;
if (fix & BDRV_FIX_ERRORS) {
int err;
s->header->data_off = cpu_to_le32(data_off);
s->data_start = data_off;
parallels_free_used_bitmap(bs);
err = parallels_fill_used_bitmap(bs);
if (err == -ENOMEM) {
res->check_errors++;
return err;
}
res->corruptions_fixed++;
}
fprintf(stderr, "%s data_off field has incorrect value\n",
fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
return 0;
}
static int coroutine_fn GRAPH_RDLOCK
parallels_check_outside_image(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix)
@ -484,13 +744,13 @@ parallels_check_outside_image(BlockDriverState *bs, BdrvCheckResult *res,
static int coroutine_fn GRAPH_RDLOCK
parallels_check_leak(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix)
BdrvCheckMode fix, bool explicit)
{
BDRVParallelsState *s = bs->opaque;
int64_t size;
int ret;
size = bdrv_getlength(bs->file->bs);
size = bdrv_co_getlength(bs->file->bs);
if (size < 0) {
res->check_errors++;
return size;
@ -499,10 +759,13 @@ parallels_check_leak(BlockDriverState *bs, BdrvCheckResult *res,
if (size > res->image_end_offset) {
int64_t count;
count = DIV_ROUND_UP(size - res->image_end_offset, s->cluster_size);
fprintf(stderr, "%s space leaked at the end of the image %" PRId64 "\n",
if (explicit) {
fprintf(stderr,
"%s space leaked at the end of the image %" PRId64 "\n",
fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR",
size - res->image_end_offset);
res->leaks += count;
}
if (fix & BDRV_FIX_LEAKS) {
Error *local_err = NULL;
@ -517,13 +780,149 @@ parallels_check_leak(BlockDriverState *bs, BdrvCheckResult *res,
res->check_errors++;
return ret;
}
if (explicit) {
res->leaks_fixed += count;
}
}
}
return 0;
}
static int coroutine_fn GRAPH_RDLOCK
parallels_check_duplicate(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix)
{
BDRVParallelsState *s = bs->opaque;
int64_t host_off, host_sector, guest_sector;
unsigned long *bitmap;
uint32_t i, bitmap_size, bat_entry;
int n, ret = 0;
uint64_t *buf = NULL;
bool fixed = false;
/*
* Create a bitmap of used clusters.
* If a bit is set, there is a BAT entry pointing to this cluster.
* Loop through the BAT entries, check bits relevant to an entry offset.
 * If the bit is set, this entry is duplicated. Otherwise set the bit.
 *
 * We shouldn't worry about newly allocated clusters outside the image
 * because they are created higher than any existing cluster pointed to by
* a BAT entry.
*/
bitmap_size = host_cluster_index(s, res->image_end_offset);
if (bitmap_size == 0) {
return 0;
}
if (res->image_end_offset % s->cluster_size) {
/* An unaligned image end leads to a bitmap shorter by 1 */
bitmap_size++;
}
bitmap = bitmap_new(bitmap_size);
buf = qemu_blockalign(bs, s->cluster_size);
for (i = 0; i < s->bat_size; i++) {
host_off = bat2sect(s, i) << BDRV_SECTOR_BITS;
if (host_off == 0) {
continue;
}
ret = mark_used(bs, bitmap, bitmap_size, host_off, 1);
assert(ret != -E2BIG);
if (ret == 0) {
continue;
}
/* this cluster duplicates another one */
fprintf(stderr, "%s duplicate offset in BAT entry %u\n",
fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
res->corruptions++;
if (!(fix & BDRV_FIX_ERRORS)) {
continue;
}
/*
 * Reset the entry and allocate a new cluster for the relevant guest
 * offset. In this way we let the lower layer place the new cluster
 * properly. Copy the original cluster to the allocated one. But first,
 * save the old offset value so that we can repair the BAT entry if an
 * error occurs.
*/
bat_entry = s->bat_bitmap[i];
parallels_set_bat_entry(s, i, 0);
ret = bdrv_co_pread(bs->file, host_off, s->cluster_size, buf, 0);
if (ret < 0) {
res->check_errors++;
goto out_repair_bat;
}
guest_sector = (i * (int64_t)s->cluster_size) >> BDRV_SECTOR_BITS;
host_sector = allocate_clusters(bs, guest_sector, s->tracks, &n);
if (host_sector < 0) {
res->check_errors++;
goto out_repair_bat;
}
host_off = host_sector << BDRV_SECTOR_BITS;
ret = bdrv_co_pwrite(bs->file, host_off, s->cluster_size, buf, 0);
if (ret < 0) {
res->check_errors++;
goto out_repair_bat;
}
if (host_off + s->cluster_size > res->image_end_offset) {
res->image_end_offset = host_off + s->cluster_size;
}
/*
 * In the future allocate_clusters() will reuse holes inside the image.
 * Keep the used clusters bitmap content consistent for the newly
 * allocated clusters too.
*
* Note, clusters allocated outside the current image are not
* considered, and the bitmap size doesn't change. This specifically
* means that -E2BIG is OK.
*/
ret = mark_used(bs, bitmap, bitmap_size, host_off, 1);
if (ret == -EBUSY) {
res->check_errors++;
goto out_repair_bat;
}
fixed = true;
res->corruptions_fixed++;
}
if (fixed) {
/*
 * When new clusters are allocated, the file size increases by 128 MB.
 * We need to truncate the file to the right size. Let the leak-fix code
 * do its job without changing res.
*/
ret = parallels_check_leak(bs, res, fix, false);
}
out_free:
g_free(buf);
g_free(bitmap);
return ret;
/*
 * We can get here only from places where i and bat_entry have
 * meaningful values.
*/
out_repair_bat:
s->bat_bitmap[i] = bat_entry;
goto out_free;
}
static void parallels_collect_statistics(BlockDriverState *bs,
BdrvCheckResult *res,
BdrvCheckMode fix)
@ -565,12 +964,22 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
WITH_QEMU_LOCK_GUARD(&s->lock) {
parallels_check_unclean(bs, res, fix);
ret = parallels_check_data_off(bs, res, fix);
if (ret < 0) {
return ret;
}
ret = parallels_check_outside_image(bs, res, fix);
if (ret < 0) {
return ret;
}
ret = parallels_check_leak(bs, res, fix);
ret = parallels_check_leak(bs, res, fix, true);
if (ret < 0) {
return ret;
}
ret = parallels_check_duplicate(bs, res, fix);
if (ret < 0) {
return ret;
}
@ -792,16 +1201,58 @@ static int parallels_update_header(BlockDriverState *bs)
return bdrv_pwrite_sync(bs->file, 0, size, s->header, 0);
}
static int parallels_opts_prealloc(BlockDriverState *bs, QDict *options,
Error **errp)
{
int err;
char *buf;
int64_t bytes;
BDRVParallelsState *s = bs->opaque;
Error *local_err = NULL;
QemuOpts *opts = qemu_opts_create(&parallels_runtime_opts, NULL, 0, errp);
if (!opts) {
return -ENOMEM;
}
err = -EINVAL;
if (!qemu_opts_absorb_qdict(opts, options, errp)) {
goto done;
}
bytes = qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0);
s->prealloc_size = bytes >> BDRV_SECTOR_BITS;
buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE);
/* prealloc_mode can be downgraded later during allocate_clusters */
s->prealloc_mode = qapi_enum_parse(&prealloc_mode_lookup, buf,
PRL_PREALLOC_MODE_FALLOCATE,
&local_err);
g_free(buf);
if (local_err != NULL) {
error_propagate(errp, local_err);
goto done;
}
err = 0;
done:
qemu_opts_del(opts);
return err;
}
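/*
 * Illustration (hypothetical command line; option names are taken from the
 * macros above, the mode value is an assumption):
 *
 *     -blockdev driver=parallels,file.filename=disk.hds,\
 *               prealloc-mode=falloc,prealloc-size=128M
 */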
static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
BDRVParallelsState *s = bs->opaque;
ParallelsHeader ph;
int ret, size, i;
int64_t file_nb_sectors;
QemuOpts *opts = NULL;
Error *local_err = NULL;
char *buf;
int64_t file_nb_sectors, sector;
uint32_t data_start;
bool need_check = false;
ret = parallels_opts_prealloc(bs, options, errp);
if (ret < 0) {
return ret;
}
ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
if (ret < 0) {
@ -815,7 +1266,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
ret = bdrv_pread(bs->file, 0, sizeof(ph), &ph, 0);
if (ret < 0) {
goto fail;
return ret;
}
bs->total_sectors = le64_to_cpu(ph.nb_sectors);
@ -835,38 +1286,26 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
s->tracks = le32_to_cpu(ph.tracks);
if (s->tracks == 0) {
error_setg(errp, "Invalid image: Zero sectors per track");
ret = -EINVAL;
goto fail;
return -EINVAL;
}
if (s->tracks > INT32_MAX/513) {
error_setg(errp, "Invalid image: Too big cluster");
ret = -EFBIG;
goto fail;
return -EFBIG;
}
s->prealloc_size = MAX(s->tracks, s->prealloc_size);
s->cluster_size = s->tracks << BDRV_SECTOR_BITS;
s->bat_size = le32_to_cpu(ph.bat_entries);
if (s->bat_size > INT_MAX / sizeof(uint32_t)) {
error_setg(errp, "Catalog too large");
ret = -EFBIG;
goto fail;
return -EFBIG;
}
size = bat_entry_off(s->bat_size);
s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file->bs));
s->header = qemu_try_blockalign(bs->file->bs, s->header_size);
if (s->header == NULL) {
ret = -ENOMEM;
goto fail;
}
s->data_end = le32_to_cpu(ph.data_off);
if (s->data_end == 0) {
s->data_end = ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE);
}
if (s->data_end < s->header_size) {
/* there is not enough unused space to fit to block align between BAT
and actual data. We can't avoid read-modify-write... */
s->header_size = size;
return -ENOMEM;
}
ret = bdrv_pread(bs->file, 0, s->header_size, s->header, 0);
@ -875,56 +1314,23 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
}
s->bat_bitmap = (uint32_t *)(s->header + 1);
for (i = 0; i < s->bat_size; i++) {
int64_t off = bat2sect(s, i);
if (off >= file_nb_sectors) {
if (flags & BDRV_O_CHECK) {
continue;
}
error_setg(errp, "parallels: Offset %" PRIi64 " in BAT[%d] entry "
"is larger than file size (%" PRIi64 ")",
off << BDRV_SECTOR_BITS, i,
file_nb_sectors << BDRV_SECTOR_BITS);
ret = -EINVAL;
goto fail;
}
if (off >= s->data_end) {
s->data_end = off + s->tracks;
}
}
if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) {
/* Image was not closed correctly. The check is mandatory */
s->header_unclean = true;
if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
error_setg(errp, "parallels: Image was not closed correctly; "
"cannot be opened read/write");
ret = -EACCES;
goto fail;
}
need_check = s->header_unclean = true;
}
opts = qemu_opts_create(&parallels_runtime_opts, NULL, 0, errp);
if (!opts) {
goto fail_options;
{
bool ok = parallels_test_data_off(s, file_nb_sectors, &data_start);
need_check = need_check || !ok;
}
if (!qemu_opts_absorb_qdict(opts, options, errp)) {
goto fail_options;
}
s->prealloc_size =
qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0);
s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS);
buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE);
/* prealloc_mode can be downgraded later during allocate_clusters */
s->prealloc_mode = qapi_enum_parse(&prealloc_mode_lookup, buf,
PRL_PREALLOC_MODE_FALLOCATE,
&local_err);
g_free(buf);
if (local_err != NULL) {
error_propagate(errp, local_err);
goto fail_options;
s->data_start = data_start;
s->data_end = s->data_start;
if (s->data_end < (s->header_size >> BDRV_SECTOR_BITS)) {
/*
 * There is not enough unused space to block-align the data area after
 * the BAT. We can't avoid read-modify-write...
*/
s->header_size = size;
}
if (ph.ext_off) {
@ -962,17 +1368,60 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
bdrv_get_device_or_node_name(bs));
ret = migrate_add_blocker(s->migration_blocker, errp);
if (ret < 0) {
error_free(s->migration_blocker);
error_setg(errp, "Migration blocker error");
goto fail;
}
qemu_co_mutex_init(&s->lock);
for (i = 0; i < s->bat_size; i++) {
sector = bat2sect(s, i);
if (sector + s->tracks > s->data_end) {
s->data_end = sector + s->tracks;
}
}
need_check = need_check || s->data_end > file_nb_sectors;
if (!need_check) {
ret = parallels_fill_used_bitmap(bs);
if (ret == -ENOMEM) {
goto fail;
}
need_check = need_check || ret < 0; /* These are correctable errors */
}
/*
* We don't repair the image here if it's opened for checks. Also we don't
* want to change inactive images and can't change readonly images.
*/
if ((flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) || !(flags & BDRV_O_RDWR)) {
return 0;
}
/* Repair the image if corruption was detected. */
if (need_check) {
BdrvCheckResult res;
ret = bdrv_check(bs, &res, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not repair corrupted image");
migrate_del_blocker(s->migration_blocker);
goto fail;
}
}
return 0;
fail_format:
error_setg(errp, "Image not in Parallels format");
fail_options:
ret = -EINVAL;
return -EINVAL;
fail:
/*
* "s" object was allocated by g_malloc0 so we can safely
* try to free its fields even they were not allocated.
*/
parallels_free_used_bitmap(bs);
error_free(s->migration_blocker);
g_free(s->bat_dirty_bmap);
qemu_vfree(s->header);
return ret;
}
@ -991,6 +1440,8 @@ static void parallels_close(BlockDriverState *bs)
PREALLOC_MODE_OFF, 0, NULL);
}
parallels_free_used_bitmap(bs);
g_free(s->bat_dirty_bmap);
qemu_vfree(s->header);
@ -998,24 +1449,34 @@ static void parallels_close(BlockDriverState *bs)
error_free(s->migration_blocker);
}
static bool parallels_is_support_dirty_bitmaps(BlockDriverState *bs)
{
return 1;
}
static BlockDriver bdrv_parallels = {
.format_name = "parallels",
.instance_size = sizeof(BDRVParallelsState),
.create_opts = &parallels_create_opts,
.is_format = true,
.supports_backing = true,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_supports_persistent_dirty_bitmap = parallels_is_support_dirty_bitmaps,
.bdrv_probe = parallels_probe,
.bdrv_open = parallels_open,
.bdrv_close = parallels_close,
.bdrv_child_perm = bdrv_default_perms,
.bdrv_co_block_status = parallels_co_block_status,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_flush_to_os = parallels_co_flush_to_os,
.bdrv_co_readv = parallels_co_readv,
.bdrv_co_writev = parallels_co_writev,
.is_format = true,
.supports_backing = true,
.bdrv_co_create = parallels_co_create,
.bdrv_co_create_opts = parallels_co_create_opts,
.bdrv_co_check = parallels_co_check,
.create_opts = &parallels_create_opts,
.bdrv_co_pdiscard = parallels_co_pdiscard,
.bdrv_co_pwrite_zeroes = parallels_co_pwrite_zeroes,
};
static void bdrv_parallels_init(void)

View File

@ -72,9 +72,13 @@ typedef struct BDRVParallelsState {
unsigned long *bat_dirty_bmap;
unsigned int bat_dirty_block;
unsigned long *used_bmap;
unsigned long used_bmap_size;
uint32_t *bat_bitmap;
unsigned int bat_size;
int64_t data_start;
int64_t data_end;
uint64_t prealloc_size;
ParallelsPreallocMode prealloc_mode;

View File

@ -75,8 +75,14 @@ typedef struct BDRVPreallocateState {
* be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
* BLK_PERM_WRITE permissions on file child.
*/
/* Gives up the resize permission on children when parents don't need it */
QEMUBH *drop_resize_bh;
} BDRVPreallocateState;
static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
static void preallocate_drop_resize_bh(void *opaque);
#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
static QemuOptsList runtime_opts = {
@ -142,6 +148,7 @@ static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
* For this to work, mark them invalid.
*/
s->file_end = s->zero_start = s->data_end = -EINVAL;
s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs);
ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
if (ret < 0) {
@ -162,26 +169,42 @@ static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
return 0;
}
static void preallocate_close(BlockDriverState *bs)
static int preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp)
{
int ret;
BDRVPreallocateState *s = bs->opaque;
if (s->data_end < 0) {
return;
}
int ret;
if (s->file_end < 0) {
s->file_end = bdrv_getlength(bs->file->bs);
if (s->file_end < 0) {
return;
error_setg_errno(errp, -s->file_end, "Failed to get file length");
return s->file_end;
}
}
if (s->data_end < s->file_end) {
ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
NULL);
s->file_end = ret < 0 ? ret : s->data_end;
if (ret < 0) {
error_setg_errno(errp, -ret, "Failed to drop preallocation");
s->file_end = ret;
return ret;
}
s->file_end = s->data_end;
}
return 0;
}
static void preallocate_close(BlockDriverState *bs)
{
BDRVPreallocateState *s = bs->opaque;
qemu_bh_cancel(s->drop_resize_bh);
qemu_bh_delete(s->drop_resize_bh);
if (s->data_end >= 0) {
preallocate_truncate_to_real_size(bs, NULL);
}
}
@ -198,6 +221,7 @@ static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
BlockReopenQueue *queue, Error **errp)
{
PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
int ret;
if (!preallocate_absorb_opts(opts, reopen_state->options,
reopen_state->bs->file->bs, errp)) {
@ -205,6 +229,19 @@ static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
return -EINVAL;
}
/*
* Drop the preallocation already here if reopening read-only. The child
* might also be reopened read-only and then scheduling a BH during the
* permission update is too late.
*/
if ((reopen_state->flags & BDRV_O_RDWR) == 0) {
ret = preallocate_drop_resize(reopen_state->bs, errp);
if (ret < 0) {
g_free(opts);
return ret;
}
}
reopen_state->opaque = opts;
return 0;
@ -462,58 +499,61 @@ preallocate_co_getlength(BlockDriverState *bs)
return ret;
}
static int preallocate_check_perm(BlockDriverState *bs,
uint64_t perm, uint64_t shared, Error **errp)
static int preallocate_drop_resize(BlockDriverState *bs, Error **errp)
{
BDRVPreallocateState *s = bs->opaque;
int ret;
if (s->data_end < 0) {
return 0;
}
if (s->data_end >= 0 && !can_write_resize(perm)) {
/*
* Lose permissions.
* We should truncate in check_perm, as in set_perm bs->file->perm will
* be already changed, and we should not violate it.
* Before switching children to be read-only, truncate them to remove
* the preallocation and let them have the real size.
*/
if (s->file_end < 0) {
s->file_end = bdrv_getlength(bs->file->bs);
if (s->file_end < 0) {
error_setg(errp, "Failed to get file length");
return s->file_end;
}
}
if (s->data_end < s->file_end) {
int ret = bdrv_truncate(bs->file, s->data_end, true,
PREALLOC_MODE_OFF, 0, NULL);
ret = preallocate_truncate_to_real_size(bs, errp);
if (ret < 0) {
error_setg(errp, "Failed to drop preallocation");
s->file_end = ret;
return ret;
}
s->file_end = s->data_end;
}
}
/*
* We'll drop our permissions and will allow other users to take write and
* resize permissions (see preallocate_child_perm). Anyone will be able to
* change the child, so mark all states invalid. We'll regain control if a
* parent requests write access again.
*/
s->data_end = s->file_end = s->zero_start = -EINVAL;
bdrv_graph_rdlock_main_loop();
bdrv_child_refresh_perms(bs, bs->file, NULL);
bdrv_graph_rdunlock_main_loop();
return 0;
}
static void preallocate_drop_resize_bh(void *opaque)
{
/*
* In case of errors, we'll simply keep the exclusive lock on the image
* indefinitely.
*/
preallocate_drop_resize(opaque, NULL);
}
static void preallocate_set_perm(BlockDriverState *bs,
uint64_t perm, uint64_t shared)
{
BDRVPreallocateState *s = bs->opaque;
if (can_write_resize(perm)) {
qemu_bh_cancel(s->drop_resize_bh);
if (s->data_end < 0) {
s->data_end = s->file_end = s->zero_start =
bdrv_getlength(bs->file->bs);
bs->file->bs->total_sectors * BDRV_SECTOR_SIZE;
}
} else {
/*
* We drop our permissions, as well as allow shared
* permissions (see preallocate_child_perm), anyone will be able to
* change the child, so mark all states invalid. We'll regain control if
* get good permissions back.
*/
s->data_end = s->file_end = s->zero_start = -EINVAL;
qemu_bh_schedule(s->drop_resize_bh);
}
}
@ -521,10 +561,16 @@ static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
BdrvChildRole role, BlockReopenQueue *reopen_queue,
uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
{
BDRVPreallocateState *s = bs->opaque;
bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
if (can_write_resize(perm)) {
/* This should come by default, but let's enforce: */
/*
* We need exclusive write and resize permissions on the child not only when
* the parent can write to it, but also after the parent gave up write
* permissions until preallocate_drop_resize() has completed.
*/
if (can_write_resize(perm) || s->data_end != -EINVAL) {
*nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
/*
@ -535,7 +581,7 @@ static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
}
}
BlockDriver bdrv_preallocate_filter = {
static BlockDriver bdrv_preallocate_filter = {
.format_name = "preallocate",
.instance_size = sizeof(BDRVPreallocateState),
@ -554,7 +600,6 @@ BlockDriver bdrv_preallocate_filter = {
.bdrv_co_flush = preallocate_co_flush,
.bdrv_co_truncate = preallocate_co_truncate,
.bdrv_check_perm = preallocate_check_perm,
.bdrv_set_perm = preallocate_set_perm,
.bdrv_child_perm = preallocate_child_perm,
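
Read as a whole, the preallocate hunks above stop truncating the image inside .bdrv_check_perm and instead schedule a bottom half from .bdrv_set_perm; the BH later calls preallocate_drop_resize() from the main loop, where changing the child's size and permissions is safe. A minimal sketch of that QEMUBH lifecycle (qemu_bh_new/qemu_bh_schedule/qemu_bh_cancel/qemu_bh_delete are QEMU's real API; DemoState and the callback body are illustrative stand-ins, not the real BDRVPreallocateState):

typedef struct DemoState {
    QEMUBH *drop_resize_bh;              /* created once, at open */
} DemoState;

static void demo_drop_resize_bh(void *opaque)
{
    /* runs later from the main loop, outside the permission update */
}

static void demo_open(DemoState *s)
{
    s->drop_resize_bh = qemu_bh_new(demo_drop_resize_bh, s);
}

static void demo_set_perm(DemoState *s, bool writable)
{
    if (writable) {
        qemu_bh_cancel(s->drop_resize_bh);    /* keep the permissions */
    } else {
        qemu_bh_schedule(s->drop_resize_bh);  /* give them up, later */
    }
}

static void demo_close(DemoState *s)
{
    qemu_bh_cancel(s->drop_resize_bh);
    qemu_bh_delete(s->drop_resize_bh);
}

Deferring to a BH sidesteps the reentrancy problem the old check_perm comment described: by the time the BH runs, the permission update that triggered it has completed.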

View File

@ -48,7 +48,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
{
ImageInfo **p_image_info;
ImageInfo *backing_info;
BlockDriverState *bs0, *backing;
BlockDriverState *backing;
BlockDeviceInfo *info;
ERRP_GUARD();
@ -145,7 +145,6 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
info->write_threshold = bdrv_write_threshold_get(bs);
bs0 = bs;
p_image_info = &info->image;
info->backing_file_depth = 0;
@ -153,7 +152,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
* Skip automatically inserted nodes that the user isn't aware of for
* query-block (blk != NULL), but not for query-named-block-nodes
*/
bdrv_query_image_info(bs0, p_image_info, flat, blk != NULL, errp);
bdrv_query_image_info(bs, p_image_info, flat, blk != NULL, errp);
if (*errp) {
qapi_free_BlockDeviceInfo(info);
return NULL;
@ -309,33 +308,6 @@ out:
aio_context_release(bdrv_get_aio_context(bs));
}
/**
* bdrv_query_block_node_info:
* @bs: block node to examine
* @p_info: location to store node information
* @errp: location to store error information
*
* Store image information about @bs in @p_info.
*
* @p_info will be set only on success. On error, store error in @errp.
*/
void bdrv_query_block_node_info(BlockDriverState *bs,
BlockNodeInfo **p_info,
Error **errp)
{
BlockNodeInfo *info;
ERRP_GUARD();
info = g_new0(BlockNodeInfo, 1);
bdrv_do_query_node_info(bs, info, errp);
if (*errp) {
qapi_free_BlockNodeInfo(info);
return;
}
*p_info = info;
}
/**
* bdrv_query_image_info:
* @bs: block node to examine

View File

@ -549,7 +549,10 @@ qcow_co_block_status(BlockDriverState *bs, bool want_zero,
if (!cluster_offset) {
return 0;
}
if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypto) {
if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
return BDRV_BLOCK_DATA | BDRV_BLOCK_COMPRESSED;
}
if (s->crypto) {
return BDRV_BLOCK_DATA;
}
*map = cluster_offset | index_in_cluster;

View File

@ -2645,7 +2645,7 @@ rebuild_refcount_structure(BlockDriverState *bs, BdrvCheckResult *res,
* repeat all this until the reftable stops growing.
*
* (This loop will terminate, because with every cluster the
* reftable grows, it can accomodate a multitude of more refcounts,
* reftable grows, it can accommodate a multitude of more refcounts,
* so that at some point this must be able to cover the reftable
* and all refblocks describing it.)
*

View File

@ -1880,7 +1880,7 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
g_free(s->image_data_file);
if (open_data_file && has_data_file(bs)) {
bdrv_graph_co_rdunlock();
bdrv_unref_child(bs, s->data_file);
bdrv_co_unref_child(bs, s->data_file);
bdrv_graph_co_rdlock();
s->data_file = NULL;
}
@ -2162,6 +2162,9 @@ qcow2_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
{
status |= BDRV_BLOCK_RECURSE;
}
if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
status |= BDRV_BLOCK_COMPRESSED;
}
return status;
}
@ -2790,7 +2793,9 @@ static void qcow2_do_close(BlockDriverState *bs, bool close_data_file)
g_free(s->image_backing_format);
if (close_data_file && has_data_file(bs)) {
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, s->data_file);
bdrv_graph_wrunlock();
s->data_file = NULL;
}
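
The qcow and qcow2 hunks above both surface compression through a dedicated BDRV_BLOCK_COMPRESSED status bit instead of reporting plain BDRV_BLOCK_DATA. A self-contained sketch of how such OR-combined status flags are composed and tested (the flag values here are made up for illustration; QEMU's real BDRV_BLOCK_* constants live in its block headers):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag values, not QEMU's actual bit assignments */
#define DEMO_BLOCK_DATA        (1 << 0)
#define DEMO_BLOCK_COMPRESSED  (1 << 1)

static int demo_block_status(bool compressed, bool encrypted)
{
    if (compressed) {
        /* compressed clusters carry data but no raw host offset to map */
        return DEMO_BLOCK_DATA | DEMO_BLOCK_COMPRESSED;
    }
    if (encrypted) {
        return DEMO_BLOCK_DATA;
    }
    return DEMO_BLOCK_DATA; /* plus a mappable offset, in the real driver */
}

int main(void)
{
    int st = demo_block_status(true, false);
    printf("compressed: %s\n", (st & DEMO_BLOCK_COMPRESSED) ? "yes" : "no");
    return 0;
}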

View File

@ -1037,12 +1037,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
close_exit:
/* cleanup on error */
bdrv_graph_wrlock(NULL);
for (i = 0; i < s->num_children; i++) {
if (!opened[i]) {
continue;
}
bdrv_unref_child(bs, s->children[i]);
}
bdrv_graph_wrunlock();
g_free(s->children);
g_free(opened);
exit:
@ -1055,15 +1057,17 @@ static void quorum_close(BlockDriverState *bs)
BDRVQuorumState *s = bs->opaque;
int i;
bdrv_graph_wrlock(NULL);
for (i = 0; i < s->num_children; i++) {
bdrv_unref_child(bs, s->children[i]);
}
bdrv_graph_wrunlock();
g_free(s->children);
}
static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
Error **errp)
static void GRAPH_WRLOCK
quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, Error **errp)
{
BDRVQuorumState *s = bs->opaque;
BdrvChild *child;
@ -1089,8 +1093,6 @@ static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
}
s->next_child_index++;
bdrv_drained_begin(bs);
/* We can safely add the child now */
bdrv_ref(child_bs);
@ -1098,18 +1100,15 @@ static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
BDRV_CHILD_DATA, errp);
if (child == NULL) {
s->next_child_index--;
goto out;
return;
}
s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
s->children[s->num_children++] = child;
quorum_refresh_flags(bs);
out:
bdrv_drained_end(bs);
}
static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
Error **errp)
static void GRAPH_WRLOCK
quorum_del_child(BlockDriverState *bs, BdrvChild *child, Error **errp)
{
BDRVQuorumState *s = bs->opaque;
char indexstr[INDEXSTR_LEN];
@ -1139,16 +1138,14 @@ static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
s->next_child_index--;
}
bdrv_drained_begin(bs);
/* We can safely remove this child now */
memmove(&s->children[i], &s->children[i + 1],
(s->num_children - i - 1) * sizeof(BdrvChild *));
s->children = g_renew(BdrvChild *, s->children, --s->num_children);
bdrv_unref_child(bs, child);
quorum_refresh_flags(bs);
bdrv_drained_end(bs);
}
static void quorum_gather_child_options(BlockDriverState *bs, QDict *target,
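
A pattern that recurs throughout this commit (quorum above; replication, snapshot, vmdk and blockjob further down): every graph-changing call such as bdrv_unref_child() or bdrv_attach_child() is now bracketed by the block-graph writer lock, and every early-error return inside the bracket has to release it. Condensed from the replication hunk below as a sketch (the bdrv_* calls are QEMU's real API; the surrounding names are placeholders):

bdrv_graph_wrlock(NULL);
child = bdrv_attach_child(bs, child_bs, "child", &child_of_bds,
                          BDRV_CHILD_DATA, &local_err);
if (local_err) {
    error_propagate(errp, local_err);
    bdrv_graph_wrunlock();   /* early exits must drop the lock too */
    return;
}
/* ... more graph changes under the same lock ... */
bdrv_graph_wrunlock();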

View File

@ -542,12 +542,15 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
return;
}
bdrv_graph_wrlock(bs);
bdrv_ref(hidden_disk->bs);
s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
&child_of_bds, BDRV_CHILD_DATA,
&local_err);
if (local_err) {
error_propagate(errp, local_err);
bdrv_graph_wrunlock();
aio_context_release(aio_context);
return;
}
@ -558,10 +561,13 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
BDRV_CHILD_DATA, &local_err);
if (local_err) {
error_propagate(errp, local_err);
bdrv_graph_wrunlock();
aio_context_release(aio_context);
return;
}
bdrv_graph_wrunlock();
/* start backup job now */
error_setg(&s->blocker,
"Block device is in use by internal backup job");
@ -666,10 +672,13 @@ static void replication_done(void *opaque, int ret)
if (ret == 0) {
s->stage = BLOCK_REPLICATION_DONE;
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, s->secondary_disk);
s->secondary_disk = NULL;
bdrv_unref_child(bs, s->hidden_disk);
s->hidden_disk = NULL;
bdrv_graph_wrunlock();
s->error = 0;
} else {
s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;

View File

@ -108,7 +108,7 @@ static void snapshot_access_child_perm(BlockDriverState *bs, BdrvChild *c,
*nshared = BLK_PERM_ALL;
}
BlockDriver bdrv_snapshot_access_drv = {
static BlockDriver bdrv_snapshot_access_drv = {
.format_name = "snapshot-access",
.bdrv_open = snapshot_access_open,

View File

@ -281,7 +281,9 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
}
/* .bdrv_open() will re-attach it */
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, fallback);
bdrv_graph_wrunlock();
ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp);
open_ret = drv->bdrv_open(bs, options, bs->open_flags, &local_err);

View File

@ -54,6 +54,7 @@ static int stream_prepare(Job *job)
{
StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs);
BlockDriverState *unfiltered_bs_cow = bdrv_cow_bs(unfiltered_bs);
BlockDriverState *base;
BlockDriverState *unfiltered_base;
Error *local_err = NULL;
@ -64,13 +65,18 @@ static int stream_prepare(Job *job)
s->cor_filter_bs = NULL;
/*
* bdrv_set_backing_hd() requires that unfiltered_bs is drained. Drain
* already here and use bdrv_set_backing_hd_drained() instead because
* the polling during drained_begin() might change the graph, and if we do
* this only later, we may end up working with the wrong base node (or it
* might even have gone away by the time we want to use it).
* bdrv_set_backing_hd() requires that the unfiltered_bs and the COW child
* of unfiltered_bs is drained. Drain already here and use
* bdrv_set_backing_hd_drained() instead because the polling during
* drained_begin() might change the graph, and if we do this only later, we
* may end up working with the wrong base node (or it might even have gone
* away by the time we want to use it).
*/
bdrv_drained_begin(unfiltered_bs);
if (unfiltered_bs_cow) {
bdrv_ref(unfiltered_bs_cow);
bdrv_drained_begin(unfiltered_bs_cow);
}
base = bdrv_filter_or_cow_bs(s->above_base);
unfiltered_base = bdrv_skip_filters(base);
@ -100,6 +106,10 @@ static int stream_prepare(Job *job)
}
out:
if (unfiltered_bs_cow) {
bdrv_drained_end(unfiltered_bs_cow);
bdrv_unref(unfiltered_bs_cow);
}
bdrv_drained_end(unfiltered_bs);
return ret;
}
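
The stream hunk above widens the drained section to the COW child of unfiltered_bs and, crucially, takes a reference before draining so the node cannot vanish while drained (drained_begin() may poll, and the poll may change the graph). The ordering matters; as a sketch of the real bdrv_* calls:

if (unfiltered_bs_cow) {
    bdrv_ref(unfiltered_bs_cow);            /* pin the node first ... */
    bdrv_drained_begin(unfiltered_bs_cow);  /* ... then drain it */
}

/* ... graph manipulation while everything relevant is drained ... */

if (unfiltered_bs_cow) {
    bdrv_drained_end(unfiltered_bs_cow);    /* undo in reverse order */
    bdrv_unref(unfiltered_bs_cow);
}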

View File

@ -37,7 +37,7 @@
static void throttle_group_obj_init(Object *obj);
static void throttle_group_obj_complete(UserCreatable *obj, Error **errp);
static void timer_cb(ThrottleGroupMember *tgm, bool is_write);
static void timer_cb(ThrottleGroupMember *tgm, ThrottleDirection direction);
/* The ThrottleGroup structure (with its ThrottleState) is shared
* among different ThrottleGroupMembers and it's independent from
@ -73,8 +73,8 @@ struct ThrottleGroup {
QemuMutex lock; /* This lock protects the following four fields */
ThrottleState ts;
QLIST_HEAD(, ThrottleGroupMember) head;
ThrottleGroupMember *tokens[2];
bool any_timer_armed[2];
ThrottleGroupMember *tokens[THROTTLE_MAX];
bool any_timer_armed[THROTTLE_MAX];
QEMUClockType clock_type;
/* This field is protected by the global QEMU mutex */
@ -197,13 +197,13 @@ static ThrottleGroupMember *throttle_group_next_tgm(ThrottleGroupMember *tgm)
* This assumes that tg->lock is held.
*
* @tgm: the ThrottleGroupMember
* @is_write: the type of operation (read/write)
* @direction: the ThrottleDirection
* @ret: whether the ThrottleGroupMember has pending requests.
*/
static inline bool tgm_has_pending_reqs(ThrottleGroupMember *tgm,
bool is_write)
ThrottleDirection direction)
{
return tgm->pending_reqs[is_write];
return tgm->pending_reqs[direction];
}
/* Return the next ThrottleGroupMember in the round-robin sequence with pending
@ -212,12 +212,12 @@ static inline bool tgm_has_pending_reqs(ThrottleGroupMember *tgm,
* This assumes that tg->lock is held.
*
* @tgm: the current ThrottleGroupMember
* @is_write: the type of operation (read/write)
* @direction: the ThrottleDirection
* @ret: the next ThrottleGroupMember with pending requests, or tgm if
* there is none.
*/
static ThrottleGroupMember *next_throttle_token(ThrottleGroupMember *tgm,
bool is_write)
ThrottleDirection direction)
{
ThrottleState *ts = tgm->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
@ -227,16 +227,16 @@ static ThrottleGroupMember *next_throttle_token(ThrottleGroupMember *tgm,
* it's being drained. Skip the round-robin search and return tgm
* immediately if it has pending requests. Otherwise we could be
* forcing it to wait for other members' throttled requests. */
if (tgm_has_pending_reqs(tgm, is_write) &&
if (tgm_has_pending_reqs(tgm, direction) &&
qatomic_read(&tgm->io_limits_disabled)) {
return tgm;
}
start = token = tg->tokens[is_write];
start = token = tg->tokens[direction];
/* get next bs round in round robin style */
token = throttle_group_next_tgm(token);
while (token != start && !tgm_has_pending_reqs(token, is_write)) {
while (token != start && !tgm_has_pending_reqs(token, direction)) {
token = throttle_group_next_tgm(token);
}
@ -244,12 +244,12 @@ static ThrottleGroupMember *next_throttle_token(ThrottleGroupMember *tgm,
* then decide the token is the current tgm because chances are
* the current tgm got the current request queued.
*/
if (token == start && !tgm_has_pending_reqs(token, is_write)) {
if (token == start && !tgm_has_pending_reqs(token, direction)) {
token = tgm;
}
/* Either we return the original TGM, or one with pending requests */
assert(token == tgm || tgm_has_pending_reqs(token, is_write));
assert(token == tgm || tgm_has_pending_reqs(token, direction));
return token;
}
@ -261,11 +261,11 @@ static ThrottleGroupMember *next_throttle_token(ThrottleGroupMember *tgm,
* This assumes that tg->lock is held.
*
* @tgm: the current ThrottleGroupMember
* @is_write: the type of operation (read/write)
* @direction: the ThrottleDirection
* @ret: whether the I/O request needs to be throttled or not
*/
static bool throttle_group_schedule_timer(ThrottleGroupMember *tgm,
bool is_write)
ThrottleDirection direction)
{
ThrottleState *ts = tgm->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
@ -277,16 +277,16 @@ static bool throttle_group_schedule_timer(ThrottleGroupMember *tgm,
}
/* Check if any of the timers in this group is already armed */
if (tg->any_timer_armed[is_write]) {
if (tg->any_timer_armed[direction]) {
return true;
}
must_wait = throttle_schedule_timer(ts, tt, is_write);
must_wait = throttle_schedule_timer(ts, tt, direction);
/* If a timer just got armed, set tgm as the current token */
if (must_wait) {
tg->tokens[is_write] = tgm;
tg->any_timer_armed[is_write] = true;
tg->tokens[direction] = tgm;
tg->any_timer_armed[direction] = true;
}
return must_wait;
@ -296,15 +296,15 @@ static bool throttle_group_schedule_timer(ThrottleGroupMember *tgm,
* any request was actually pending.
*
* @tgm: the current ThrottleGroupMember
* @is_write: the type of operation (read/write)
* @direction: the ThrottleDirection
*/
static bool coroutine_fn throttle_group_co_restart_queue(ThrottleGroupMember *tgm,
bool is_write)
ThrottleDirection direction)
{
bool ret;
qemu_co_mutex_lock(&tgm->throttled_reqs_lock);
ret = qemu_co_queue_next(&tgm->throttled_reqs[is_write]);
ret = qemu_co_queue_next(&tgm->throttled_reqs[direction]);
qemu_co_mutex_unlock(&tgm->throttled_reqs_lock);
return ret;
@ -315,9 +315,10 @@ static bool coroutine_fn throttle_group_co_restart_queue(ThrottleGroupMember *tg
* This assumes that tg->lock is held.
*
* @tgm: the current ThrottleGroupMember
* @is_write: the type of operation (read/write)
* @direction: the ThrottleDirection
*/
static void schedule_next_request(ThrottleGroupMember *tgm, bool is_write)
static void schedule_next_request(ThrottleGroupMember *tgm,
ThrottleDirection direction)
{
ThrottleState *ts = tgm->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
@ -325,27 +326,27 @@ static void schedule_next_request(ThrottleGroupMember *tgm, bool is_write)
ThrottleGroupMember *token;
/* Check if there's any pending request to schedule next */
token = next_throttle_token(tgm, is_write);
if (!tgm_has_pending_reqs(token, is_write)) {
token = next_throttle_token(tgm, direction);
if (!tgm_has_pending_reqs(token, direction)) {
return;
}
/* Set a timer for the request if it needs to be throttled */
must_wait = throttle_group_schedule_timer(token, is_write);
must_wait = throttle_group_schedule_timer(token, direction);
/* If it doesn't have to wait, queue it for immediate execution */
if (!must_wait) {
/* Give preference to requests from the current tgm */
if (qemu_in_coroutine() &&
throttle_group_co_restart_queue(tgm, is_write)) {
throttle_group_co_restart_queue(tgm, direction)) {
token = tgm;
} else {
ThrottleTimers *tt = &token->throttle_timers;
int64_t now = qemu_clock_get_ns(tg->clock_type);
timer_mod(tt->timers[is_write], now);
tg->any_timer_armed[is_write] = true;
timer_mod(tt->timers[direction], now);
tg->any_timer_armed[direction] = true;
}
tg->tokens[is_write] = token;
tg->tokens[direction] = token;
}
}
@ -355,48 +356,49 @@ static void schedule_next_request(ThrottleGroupMember *tgm, bool is_write)
*
* @tgm: the current ThrottleGroupMember
* @bytes: the number of bytes for this I/O
* @is_write: the type of operation (read/write)
* @direction: the ThrottleDirection
*/
void coroutine_fn throttle_group_co_io_limits_intercept(ThrottleGroupMember *tgm,
int64_t bytes,
bool is_write)
ThrottleDirection direction)
{
bool must_wait;
ThrottleGroupMember *token;
ThrottleGroup *tg = container_of(tgm->throttle_state, ThrottleGroup, ts);
assert(bytes >= 0);
assert(direction < THROTTLE_MAX);
qemu_mutex_lock(&tg->lock);
/* First we check if this I/O has to be throttled. */
token = next_throttle_token(tgm, is_write);
must_wait = throttle_group_schedule_timer(token, is_write);
token = next_throttle_token(tgm, direction);
must_wait = throttle_group_schedule_timer(token, direction);
/* Wait if there's a timer set or queued requests of this type */
if (must_wait || tgm->pending_reqs[is_write]) {
tgm->pending_reqs[is_write]++;
if (must_wait || tgm->pending_reqs[direction]) {
tgm->pending_reqs[direction]++;
qemu_mutex_unlock(&tg->lock);
qemu_co_mutex_lock(&tgm->throttled_reqs_lock);
qemu_co_queue_wait(&tgm->throttled_reqs[is_write],
qemu_co_queue_wait(&tgm->throttled_reqs[direction],
&tgm->throttled_reqs_lock);
qemu_co_mutex_unlock(&tgm->throttled_reqs_lock);
qemu_mutex_lock(&tg->lock);
tgm->pending_reqs[is_write]--;
tgm->pending_reqs[direction]--;
}
/* The I/O will be executed, so do the accounting */
throttle_account(tgm->throttle_state, is_write, bytes);
throttle_account(tgm->throttle_state, direction, bytes);
/* Schedule the next request */
schedule_next_request(tgm, is_write);
schedule_next_request(tgm, direction);
qemu_mutex_unlock(&tg->lock);
}
typedef struct {
ThrottleGroupMember *tgm;
bool is_write;
ThrottleDirection direction;
} RestartData;
static void coroutine_fn throttle_group_restart_queue_entry(void *opaque)
@ -405,16 +407,16 @@ static void coroutine_fn throttle_group_restart_queue_entry(void *opaque)
ThrottleGroupMember *tgm = data->tgm;
ThrottleState *ts = tgm->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
bool is_write = data->is_write;
ThrottleDirection direction = data->direction;
bool empty_queue;
empty_queue = !throttle_group_co_restart_queue(tgm, is_write);
empty_queue = !throttle_group_co_restart_queue(tgm, direction);
/* If the request queue was empty then we have to take care of
* scheduling the next one */
if (empty_queue) {
qemu_mutex_lock(&tg->lock);
schedule_next_request(tgm, is_write);
schedule_next_request(tgm, direction);
qemu_mutex_unlock(&tg->lock);
}
@ -424,18 +426,19 @@ static void coroutine_fn throttle_group_restart_queue_entry(void *opaque)
aio_wait_kick();
}
static void throttle_group_restart_queue(ThrottleGroupMember *tgm, bool is_write)
static void throttle_group_restart_queue(ThrottleGroupMember *tgm,
ThrottleDirection direction)
{
Coroutine *co;
RestartData *rd = g_new0(RestartData, 1);
rd->tgm = tgm;
rd->is_write = is_write;
rd->direction = direction;
/* This function is called when a timer is fired or when
* throttle_group_restart_tgm() is called. Either way, there can
* be no timer pending on this tgm at this point */
assert(!timer_pending(tgm->throttle_timers.timers[is_write]));
assert(!timer_pending(tgm->throttle_timers.timers[direction]));
qatomic_inc(&tgm->restart_pending);
@ -445,18 +448,18 @@ static void throttle_group_restart_queue(ThrottleGroupMember *tgm, bool is_write
void throttle_group_restart_tgm(ThrottleGroupMember *tgm)
{
int i;
ThrottleDirection dir;
if (tgm->throttle_state) {
for (i = 0; i < 2; i++) {
QEMUTimer *t = tgm->throttle_timers.timers[i];
for (dir = THROTTLE_READ; dir < THROTTLE_MAX; dir++) {
QEMUTimer *t = tgm->throttle_timers.timers[dir];
if (timer_pending(t)) {
/* If there's a pending timer on this tgm, fire it now */
timer_del(t);
timer_cb(tgm, i);
timer_cb(tgm, dir);
} else {
/* Else run the next request from the queue manually */
throttle_group_restart_queue(tgm, i);
throttle_group_restart_queue(tgm, dir);
}
}
}
@ -500,30 +503,30 @@ void throttle_group_get_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg)
* because it had been throttled.
*
* @tgm: the ThrottleGroupMember whose request had been throttled
* @is_write: the type of operation (read/write)
* @direction: the ThrottleDirection
*/
static void timer_cb(ThrottleGroupMember *tgm, bool is_write)
static void timer_cb(ThrottleGroupMember *tgm, ThrottleDirection direction)
{
ThrottleState *ts = tgm->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
/* The timer has just been fired, so we can update the flag */
qemu_mutex_lock(&tg->lock);
tg->any_timer_armed[is_write] = false;
tg->any_timer_armed[direction] = false;
qemu_mutex_unlock(&tg->lock);
/* Run the request that was waiting for this timer */
throttle_group_restart_queue(tgm, is_write);
throttle_group_restart_queue(tgm, direction);
}
static void read_timer_cb(void *opaque)
{
timer_cb(opaque, false);
timer_cb(opaque, THROTTLE_READ);
}
static void write_timer_cb(void *opaque)
{
timer_cb(opaque, true);
timer_cb(opaque, THROTTLE_WRITE);
}
/* Register a ThrottleGroupMember from the throttling group, also initializing
@ -541,7 +544,7 @@ void throttle_group_register_tgm(ThrottleGroupMember *tgm,
const char *groupname,
AioContext *ctx)
{
int i;
ThrottleDirection dir;
ThrottleState *ts = throttle_group_incref(groupname);
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
@ -551,10 +554,11 @@ void throttle_group_register_tgm(ThrottleGroupMember *tgm,
QEMU_LOCK_GUARD(&tg->lock);
/* If the ThrottleGroup is new set this ThrottleGroupMember as the token */
for (i = 0; i < 2; i++) {
if (!tg->tokens[i]) {
tg->tokens[i] = tgm;
for (dir = THROTTLE_READ; dir < THROTTLE_MAX; dir++) {
if (!tg->tokens[dir]) {
tg->tokens[dir] = tgm;
}
qemu_co_queue_init(&tgm->throttled_reqs[dir]);
}
QLIST_INSERT_HEAD(&tg->head, tgm, round_robin);
@ -566,8 +570,6 @@ void throttle_group_register_tgm(ThrottleGroupMember *tgm,
write_timer_cb,
tgm);
qemu_co_mutex_init(&tgm->throttled_reqs_lock);
qemu_co_queue_init(&tgm->throttled_reqs[0]);
qemu_co_queue_init(&tgm->throttled_reqs[1]);
}
/* Unregister a ThrottleGroupMember from its group, removing it from the list,
@ -585,7 +587,7 @@ void throttle_group_unregister_tgm(ThrottleGroupMember *tgm)
ThrottleState *ts = tgm->throttle_state;
ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
ThrottleGroupMember *token;
int i;
ThrottleDirection dir;
if (!ts) {
/* Discard already unregistered tgm */
@ -596,17 +598,17 @@ void throttle_group_unregister_tgm(ThrottleGroupMember *tgm)
AIO_WAIT_WHILE(tgm->aio_context, qatomic_read(&tgm->restart_pending) > 0);
WITH_QEMU_LOCK_GUARD(&tg->lock) {
for (i = 0; i < 2; i++) {
assert(tgm->pending_reqs[i] == 0);
assert(qemu_co_queue_empty(&tgm->throttled_reqs[i]));
assert(!timer_pending(tgm->throttle_timers.timers[i]));
if (tg->tokens[i] == tgm) {
for (dir = THROTTLE_READ; dir < THROTTLE_MAX; dir++) {
assert(tgm->pending_reqs[dir] == 0);
assert(qemu_co_queue_empty(&tgm->throttled_reqs[dir]));
assert(!timer_pending(tgm->throttle_timers.timers[dir]));
if (tg->tokens[dir] == tgm) {
token = throttle_group_next_tgm(tgm);
/* Take care of the case where this is the last tgm in the group */
if (token == tgm) {
token = NULL;
}
tg->tokens[i] = token;
tg->tokens[dir] = token;
}
}
@ -631,19 +633,20 @@ void throttle_group_detach_aio_context(ThrottleGroupMember *tgm)
{
ThrottleGroup *tg = container_of(tgm->throttle_state, ThrottleGroup, ts);
ThrottleTimers *tt = &tgm->throttle_timers;
int i;
ThrottleDirection dir;
/* Requests must have been drained */
assert(tgm->pending_reqs[0] == 0 && tgm->pending_reqs[1] == 0);
assert(qemu_co_queue_empty(&tgm->throttled_reqs[0]));
assert(qemu_co_queue_empty(&tgm->throttled_reqs[1]));
for (dir = THROTTLE_READ; dir < THROTTLE_MAX; dir++) {
assert(tgm->pending_reqs[dir] == 0);
assert(qemu_co_queue_empty(&tgm->throttled_reqs[dir]));
}
/* Kick off next ThrottleGroupMember, if necessary */
WITH_QEMU_LOCK_GUARD(&tg->lock) {
for (i = 0; i < 2; i++) {
if (timer_pending(tt->timers[i])) {
tg->any_timer_armed[i] = false;
schedule_next_request(tgm, i);
for (dir = THROTTLE_READ; dir < THROTTLE_MAX; dir++) {
if (timer_pending(tt->timers[dir])) {
tg->any_timer_armed[dir] = false;
schedule_next_request(tgm, dir);
}
}
}
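
The whole throttle-groups diff above is a mechanical migration from bool is_write to a ThrottleDirection enum, so arrays are sized and iterated by named direction rather than by 0/1. A self-contained sketch of the pattern (the enum mirrors QEMU's ThrottleDirection; the member struct is illustrative):

#include <stdio.h>

/* Illustrative copy of QEMU's ThrottleDirection */
typedef enum {
    THROTTLE_READ = 0,
    THROTTLE_WRITE,
    THROTTLE_MAX,
} ThrottleDirection;

typedef struct {
    unsigned pending[THROTTLE_MAX];  /* was: pending[2], indexed by is_write */
} DemoMember;

int main(void)
{
    DemoMember m = { .pending = { 3, 1 } };
    ThrottleDirection dir;

    /* was: for (i = 0; i < 2; i++) ... */
    for (dir = THROTTLE_READ; dir < THROTTLE_MAX; dir++) {
        printf("%s: %u pending\n",
               dir == THROTTLE_READ ? "read" : "write", m.pending[dir]);
    }
    return 0;
}

Indexing by the enum keeps the arrays, the loops and THROTTLE_MAX in sync if a direction is ever added.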

View File

@ -118,7 +118,7 @@ throttle_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
{
ThrottleGroupMember *tgm = bs->opaque;
throttle_group_co_io_limits_intercept(tgm, bytes, false);
throttle_group_co_io_limits_intercept(tgm, bytes, THROTTLE_READ);
return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
}
@ -128,7 +128,7 @@ throttle_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
QEMUIOVector *qiov, BdrvRequestFlags flags)
{
ThrottleGroupMember *tgm = bs->opaque;
throttle_group_co_io_limits_intercept(tgm, bytes, true);
throttle_group_co_io_limits_intercept(tgm, bytes, THROTTLE_WRITE);
return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
}
@ -138,7 +138,7 @@ throttle_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
BdrvRequestFlags flags)
{
ThrottleGroupMember *tgm = bs->opaque;
throttle_group_co_io_limits_intercept(tgm, bytes, true);
throttle_group_co_io_limits_intercept(tgm, bytes, THROTTLE_WRITE);
return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
}
@ -147,7 +147,7 @@ static int coroutine_fn GRAPH_RDLOCK
throttle_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
ThrottleGroupMember *tgm = bs->opaque;
throttle_group_co_io_limits_intercept(tgm, bytes, true);
throttle_group_co_io_limits_intercept(tgm, bytes, THROTTLE_WRITE);
return bdrv_co_pdiscard(bs->file, offset, bytes);
}

View File

@ -1077,7 +1077,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
goto fail;
}
/* endian convert populated BAT field entires */
/* endian convert populated BAT field entries */
for (i = 0; i < s->bat_entries; i++) {
s->bat[i] = le64_to_cpu(s->bat[i]);
}

View File

@ -212,7 +212,7 @@ typedef struct QEMU_PACKED VHDXLogDataSector {
uint32_t sequence_high; /* 4 MSB of 8 byte sequence_number */
uint8_t data[4084]; /* raw data, bytes 8-4091 (inclusive).
see the data descriptor field for the
other mising bytes */
other missing bytes */
uint32_t sequence_low; /* 4 LSB of 8 byte sequence_number */
} VHDXLogDataSector;
@ -257,7 +257,7 @@ typedef struct QEMU_PACKED VHDXMetadataTableHeader {
#define VHDX_META_FLAGS_IS_USER 0x01 /* max 1024 entries */
#define VHDX_META_FLAGS_IS_VIRTUAL_DISK 0x02 /* virtual disk metadata if set,
otherwise file metdata */
otherwise file metadata */
#define VHDX_META_FLAGS_IS_REQUIRED 0x04 /* parse must understand this
entry to open the file */
typedef struct QEMU_PACKED VHDXMetadataTableEntry {

View File

@ -272,6 +272,7 @@ static void vmdk_free_extents(BlockDriverState *bs)
BDRVVmdkState *s = bs->opaque;
VmdkExtent *e;
bdrv_graph_wrlock(NULL);
for (i = 0; i < s->num_extents; i++) {
e = &s->extents[i];
g_free(e->l1_table);
@ -282,6 +283,8 @@ static void vmdk_free_extents(BlockDriverState *bs)
bdrv_unref_child(bs, e->file);
}
}
bdrv_graph_wrunlock();
g_free(s->extents);
}
@ -1207,7 +1210,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
bs, &child_of_bds, extent_role, false,
&local_err);
g_free(extent_path);
if (local_err) {
if (!extent_file) {
error_propagate(errp, local_err);
ret = -EINVAL;
goto out;
@ -1220,7 +1223,9 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
ret = vmdk_add_extent(bs, extent_file, true, sectors,
0, 0, 0, 0, 0, &extent, errp);
if (ret < 0) {
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, extent_file);
bdrv_graph_wrunlock();
goto out;
}
extent->flat_start_offset = flat_offset << 9;
@ -1235,20 +1240,26 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
}
g_free(buf);
if (ret) {
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, extent_file);
bdrv_graph_wrunlock();
goto out;
}
extent = &s->extents[s->num_extents - 1];
} else if (!strcmp(type, "SESPARSE")) {
ret = vmdk_open_se_sparse(bs, extent_file, bs->open_flags, errp);
if (ret) {
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, extent_file);
bdrv_graph_wrunlock();
goto out;
}
extent = &s->extents[s->num_extents - 1];
} else {
error_setg(errp, "Unsupported extent type '%s'", type);
bdrv_graph_wrlock(NULL);
bdrv_unref_child(bs, extent_file);
bdrv_graph_wrunlock();
ret = -ENOTSUP;
goto out;
}
@ -1309,6 +1320,8 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
BDRVVmdkState *s = bs->opaque;
uint32_t magic;
GRAPH_RDLOCK_GUARD_MAINLOOP();
ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
if (ret < 0) {
return ret;
@ -1770,6 +1783,8 @@ vmdk_co_block_status(BlockDriverState *bs, bool want_zero,
if (extent->flat) {
ret |= BDRV_BLOCK_RECURSE;
}
} else {
ret |= BDRV_BLOCK_COMPRESSED;
}
*file = extent->file->bs;
break;

View File

@ -510,7 +510,7 @@ get_image_offset(BlockDriverState *bs, uint64_t offset, bool write, int *err)
miss sparse read optimization, but it's not a problem in terms of
correctness. */
if (write && (s->last_bitmap_offset != bitmap_offset)) {
uint8_t bitmap[s->bitmap_size];
g_autofree uint8_t *bitmap = g_malloc(s->bitmap_size);
int r;
s->last_bitmap_offset = bitmap_offset;
@ -558,7 +558,7 @@ alloc_block(BlockDriverState *bs, int64_t offset)
int64_t bat_offset;
uint32_t index, bat_value;
int ret;
uint8_t bitmap[s->bitmap_size];
g_autofree uint8_t *bitmap = g_malloc(s->bitmap_size);
/* Check if sector_num is valid */
if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
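
Both vpc hunks above replace a variable-length array (uint8_t bitmap[s->bitmap_size]) with a heap allocation: s->bitmap_size comes from the image file, so a VLA could overflow the stack. g_autofree frees the buffer automatically when it goes out of scope. A self-contained sketch (requires GLib):

#include <glib.h>
#include <string.h>

static void demo(size_t bitmap_size)
{
    /* was: uint8_t bitmap[bitmap_size];  -- stack VLA, size untrusted */
    g_autofree guint8 *bitmap = g_malloc(bitmap_size);

    memset(bitmap, 0, bitmap_size);
    /* ... use bitmap ...; g_free() runs automatically on every return path */
}

int main(void)
{
    demo(4096);
    return 0;
}

Build with something like: gcc demo.c $(pkg-config --cflags --libs glib-2.0).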

View File

@ -1378,6 +1378,9 @@ static void external_snapshot_action(TransactionAction *action,
AioContext *aio_context;
uint64_t perm, shared;
/* TODO We'll eventually have to take a writer lock in this function */
GRAPH_RDLOCK_GUARD_MAINLOOP();
tran_add(tran, &external_snapshot_drv, state);
/* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar
@ -2521,6 +2524,9 @@ void qmp_block_commit(const char *job_id, const char *device,
int job_flags = JOB_DEFAULT;
uint64_t top_perm, top_shared;
/* TODO We'll eventually have to take a writer lock in this function */
GRAPH_RDLOCK_GUARD_MAINLOOP();
if (!has_speed) {
speed = 0;
}
@ -3539,8 +3545,8 @@ out:
aio_context_release(aio_context);
}
static BdrvChild *bdrv_find_child(BlockDriverState *parent_bs,
const char *child_name)
static BdrvChild * GRAPH_RDLOCK
bdrv_find_child(BlockDriverState *parent_bs, const char *child_name)
{
BdrvChild *child;
@ -3559,9 +3565,11 @@ void qmp_x_blockdev_change(const char *parent, const char *child,
BlockDriverState *parent_bs, *new_bs = NULL;
BdrvChild *p_child;
bdrv_graph_wrlock(NULL);
parent_bs = bdrv_lookup_bs(parent, parent, errp);
if (!parent_bs) {
return;
goto out;
}
if (!child == !node) {
@ -3570,7 +3578,7 @@ void qmp_x_blockdev_change(const char *parent, const char *child,
} else {
error_setg(errp, "Either child or node must be specified");
}
return;
goto out;
}
if (child) {
@ -3578,7 +3586,7 @@ void qmp_x_blockdev_change(const char *parent, const char *child,
if (!p_child) {
error_setg(errp, "Node '%s' does not have child '%s'",
parent, child);
return;
goto out;
}
bdrv_del_child(parent_bs, p_child, errp);
}
@ -3587,10 +3595,13 @@ void qmp_x_blockdev_change(const char *parent, const char *child,
new_bs = bdrv_find_node(node);
if (!new_bs) {
error_setg(errp, "Node '%s' not found", node);
return;
goto out;
}
bdrv_add_child(parent_bs, new_bs, errp);
}
out:
bdrv_graph_wrunlock();
}
BlockJobInfoList *qmp_query_block_jobs(Error **errp)
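
qmp_x_blockdev_change() above now holds the graph writer lock for its entire body, so each bare return became goto out, funnelling every exit through a single unlock site. A self-contained sketch of the idiom, with a plain pthread mutex standing in for the graph lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t graph_lock = PTHREAD_MUTEX_INITIALIZER;

static bool change_child(bool parent_found, bool child_found)
{
    bool ret = false;

    pthread_mutex_lock(&graph_lock);
    if (!parent_found) {
        goto out;               /* no early return: fall through to unlock */
    }
    if (!child_found) {
        goto out;
    }
    ret = true;                 /* the actual graph change would go here */
out:
    pthread_mutex_unlock(&graph_lock);
    return ret;
}

int main(void)
{
    printf("%d\n", change_child(true, false));  /* prints 0, lock released */
    return 0;
}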

View File

@ -198,6 +198,7 @@ void block_job_remove_all_bdrv(BlockJob *job)
* one to make sure that such a concurrent access does not attempt
* to process an already freed BdrvChild.
*/
bdrv_graph_wrlock(NULL);
while (job->nodes) {
GSList *l = job->nodes;
BdrvChild *c = l->data;
@ -209,6 +210,7 @@ void block_job_remove_all_bdrv(BlockJob *job)
g_slist_free_1(l);
}
bdrv_graph_wrunlock();
}
bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs)

View File

@ -51,10 +51,8 @@ do { \
unlock_user(p1, arg1, 0); \
} while (0)
extern struct iovec *lock_iovec(int type, abi_ulong target_addr, int count,
int copy);
extern void unlock_iovec(struct iovec *vec, abi_ulong target_addr, int count,
int copy);
struct iovec *lock_iovec(int type, abi_ulong target_addr, int count, int copy);
void unlock_iovec(struct iovec *vec, abi_ulong target_addr, int count, int copy);
int safe_open(const char *path, int flags, mode_t mode);
int safe_openat(int fd, const char *path, int flags, mode_t mode);

View File

@ -149,7 +149,7 @@
#define TARGET_ELAST 90 /* Must be equal largest errno */
/* Internal errors: */
#define TARGET_EJUSTRETURN 254 /* Just return without modifing regs */
#define TARGET_EJUSTRETURN 254 /* Just return without modifying regs */
#define TARGET_ERESTART 255 /* Restart syscall */
#include "special-errno.h"

View File

@ -72,7 +72,7 @@ typedef struct target_siginfo {
int32_t _mqd;
} _mesgp;
/* SIGPOLL -- Not really genreated in FreeBSD ??? */
/* SIGPOLL -- Not really generated in FreeBSD ??? */
struct {
int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
} _poll;

View File

@ -25,7 +25,7 @@
#include "qemu/guest-random.h"
/*
* The inital FreeBSD stack is as follows:
* The initial FreeBSD stack is as follows:
* (see kern/kern_exec.c exec_copyout_strings() )
*
* Hi Address -> char **ps_argvstr (struct ps_strings for ps, w, etc.)
@ -59,7 +59,7 @@ static inline int setup_initial_stack(struct bsd_binprm *bprm,
/* Save some space for ps_strings. */
p -= sizeof(struct target_ps_strings);
/* Add machine depedent sigcode. */
/* Add machine dependent sigcode. */
p -= TARGET_SZSIGCODE;
if (setup_sigtramp(p, (unsigned)offsetof(struct target_sigframe, sf_uc),
TARGET_FREEBSD_NR_sigreturn)) {

View File

@ -26,7 +26,7 @@
struct target_priority {
uint8_t pri_class; /* Scheduling class. */
uint8_t pri_level; /* Normal priority level. */
uint8_t pri_native; /* Priority before propogation. */
uint8_t pri_native; /* Priority before propagation. */
uint8_t pri_user; /* User priority based on p_cpu and p_nice. */
};

View File

@ -116,7 +116,7 @@ extern const char *qemu_uname_release;
/*
* TARGET_ARG_MAX defines the number of bytes allocated for arguments
* and envelope for the new program. 256k should suffice for a reasonable
* maxiumum env+arg in 32-bit environments, bump it up to 512k for !ILP32
* maximum env+arg in 32-bit environments, bump it up to 512k for !ILP32
* platforms.
*/
#if TARGET_ABI_BITS > 32

View File

@ -49,11 +49,11 @@ void target_to_host_sigset(sigset_t *d, const target_sigset_t *s);
* union in target_siginfo is valid. This only applies between
* host_to_target_siginfo_noswap() and tswap_siginfo(); it does not appear
* either within host siginfo_t or in target_siginfo structures which we get
* from the guest userspace program. Linux kenrels use this internally, but BSD
* from the guest userspace program. Linux kernels use this internally, but BSD
kernels don't do this, but it's a useful abstraction.
*
* The linux-user version of this uses the top 16 bits, but FreeBSD's SI_USER
* and other signal indepenent SI_ codes have bit 16 set, so we only use the top
* and other signal independent SI_ codes have bit 16 set, so we only use the top
* byte instead.
*
* For FreeBSD, we have si_pid, si_uid, si_status, and si_addr always. Linux and

View File

@ -44,7 +44,7 @@ static inline int sas_ss_flags(TaskState *ts, unsigned long sp)
}
/*
* The BSD ABIs use the same singal numbers across all the CPU architectures, so
* The BSD ABIs use the same signal numbers across all the CPU architectures, so
* (unlike Linux) these functions are just the identity mapping. This might not
* be true for XyzBSD running on AbcBSD, which doesn't currently work.
*/
@ -241,7 +241,7 @@ static inline void host_to_target_siginfo_noswap(target_siginfo_t *tinfo,
#endif
/*
* Unsure that this can actually be generated, and our support for
* capsicum is somewhere between weak and non-existant, but if we get
* capsicum is somewhere between weak and non-existent, but if we get
* one, then we know what to save.
*/
#ifdef QEMU_SI_CAPSICUM
@ -319,7 +319,7 @@ int block_signals(void)
/*
* It's OK to block everything including SIGSEGV, because we won't run any
* further guest code before unblocking signals in
* process_pending_signals(). We depend on the FreeBSD behaivor here where
* process_pending_signals(). We depend on the FreeBSD behavior here where
* this will only affect this thread's signal mask. We don't use
* pthread_sigmask which might seem more correct because that routine also
* does odd things with SIGCANCEL to implement pthread_cancel().

View File

@ -1,4 +1,4 @@
# See docs/tracing.txt for syntax documentation.
# See docs/devel/tracing.rst for syntax documentation.
# bsd-user/signal.c
user_setup_frame(void *env, uint64_t frame_addr) "env=%p frame_addr=0x%"PRIx64

View File

@ -710,7 +710,7 @@ static void tcp_chr_telnet_init(Chardev *chr)
if (!s->is_tn3270) {
init->buflen = 12;
/* Prep the telnet negotion to put telnet in binary,
/* Prep the telnet negotiation to put telnet in binary,
* no echo, single char mode */
IACSET(init->buf, 0xff, 0xfb, 0x01); /* IAC WILL ECHO */
IACSET(init->buf, 0xff, 0xfb, 0x03); /* IAC WILL Suppress go ahead */
@ -718,7 +718,7 @@ static void tcp_chr_telnet_init(Chardev *chr)
IACSET(init->buf, 0xff, 0xfd, 0x00); /* IAC DO Binary */
} else {
init->buflen = 21;
/* Prep the TN3270 negotion based on RFC1576 */
/* Prep the TN3270 negotiation based on RFC1576 */
IACSET(init->buf, 0xff, 0xfd, 0x19); /* IAC DO EOR */
IACSET(init->buf, 0xff, 0xfb, 0x19); /* IAC WILL EOR */
IACSET(init->buf, 0xff, 0xfd, 0x00); /* IAC DO BINARY */
@ -1298,7 +1298,7 @@ static bool qmp_chardev_validate_socket(ChardevSocket *sock,
return false;
}
/* Validate any options which have a dependancy on client vs server */
/* Validate any options which have a dependency on client vs server */
if (!sock->has_server || sock->server) {
if (sock->has_reconnect) {
error_setg(errp,

View File

@ -1115,7 +1115,7 @@ ChardevReturn *qmp_chardev_change(const char *id, ChardevBackend *backend,
return NULL;
}
/* change successfull, clean up */
/* change successful, clean up */
chr_new->handover_yank_instance = false;
/*

View File

@ -26,7 +26,7 @@ chardev_ss.add(when: 'CONFIG_WIN32', if_true: files(
'char-win.c',
))
chardev_ss = chardev_ss.apply(config_host, strict: false)
chardev_ss = chardev_ss.apply(config_targetos, strict: false)
system_ss.add(files(
'char-hmp-cmds.c',

configure vendored
View File

@ -245,10 +245,9 @@ for opt do
esac
done
default_cflags='-O2 -g'
git_submodules_action="update"
git="git"
debug_tcg="no"
docs="auto"
EXESUF=""
prefix="/usr/local"
@ -257,6 +256,7 @@ softmmu="yes"
linux_user=""
bsd_user=""
plugins="$default_feature"
subdirs=""
ninja=""
python=
download="enabled"
@ -288,7 +288,7 @@ static="no"
# ${cross_prefix}gcc (if cross-prefix specified)
# system compiler
if test -z "${CC}${cross_prefix}"; then
cc="$host_cc"
cc="cc"
else
cc="${CC-${cross_prefix}gcc}"
fi
@ -374,45 +374,14 @@ fi
# OS specific
mingw32="no"
bsd="no"
linux="no"
solaris="no"
case $targetos in
windows)
mingw32="yes"
plugins="no"
pie="no"
;;
gnu/kfreebsd)
bsd="yes"
;;
freebsd)
bsd="yes"
# needed for kinfo_getvmmap(3) in libutil.h
;;
dragonfly)
bsd="yes"
;;
netbsd)
bsd="yes"
;;
openbsd)
bsd="yes"
;;
darwin)
bsd="yes"
darwin="yes"
;;
sunos)
solaris="yes"
;;
haiku)
pie="no"
;;
linux)
linux="yes"
;;
esac
if test ! -z "$cpu" ; then
@ -583,16 +552,16 @@ if test -n "$linux_arch" && ! test -d "$source_path/linux-headers/asm-$linux_arc
fi
check_py_version() {
# We require python >= 3.7.
# We require python >= 3.8.
# NB: a True python conditional creates a non-zero return code (Failure)
"$1" -c 'import sys; sys.exit(sys.version_info < (3,7))'
"$1" -c 'import sys; sys.exit(sys.version_info < (3,8))'
}
first_python=
if test -z "${PYTHON}"; then
# A bare 'python' is traditionally python 2.x, but some distros
# have it as python 3.x, so check in both places.
for binary in python3 python python3.11 python3.10 python3.9 python3.8 python3.7; do
for binary in python3 python python3.11 python3.10 python3.9 python3.8; do
if has "$binary"; then
python=$(command -v "$binary")
if check_py_version "$python"; then
@ -627,7 +596,7 @@ do
fi
done
if test "$mingw32" = "yes" ; then
if test "$targetos" = "windows" ; then
EXESUF=".exe"
prefix="/qemu"
bindir=""
@ -761,16 +730,13 @@ for opt do
# configure to be used by RPM and similar macros that set
# lots of directory switches by default.
;;
--enable-debug-tcg) debug_tcg="yes"
;;
--disable-debug-tcg) debug_tcg="no"
;;
--enable-debug)
# Enable debugging options that aren't excessively noisy
debug_tcg="yes"
meson_option_parse --enable-debug-tcg ""
meson_option_parse --enable-debug-graph-lock ""
meson_option_parse --enable-debug-mutex ""
meson_option_add -Doptimization=0
default_cflags='-O0 -g'
;;
--disable-tcg) tcg="disabled"
plugins="no"
@ -812,7 +778,7 @@ for opt do
;;
--enable-download) download="enabled"; git_submodules_action=update;
;;
--enable-plugins) if test "$mingw32" = "yes"; then
--enable-plugins) if test "$targetos" = "windows"; then
error_exit "TCG plugins not currently supported on Windows platforms"
else
plugins="yes"
@ -884,12 +850,16 @@ fi
default_target_list=""
mak_wilds=""
if [ -n "$host_arch" ] && [ -d "$source_path/common-user/host/$host_arch" ]; then
if [ "$linux_user" != no ]; then
if [ "$targetos" = linux ] && [ -n "$host_arch" ]; then
if [ "$targetos" = linux ]; then
linux_user=yes
elif [ "$linux_user" = yes ]; then
error_exit "linux-user not supported on this architecture"
fi
if [ "$linux_user" = "yes" ]; then
mak_wilds="${mak_wilds} $source_path/configs/targets/*-linux-user.mak"
fi
fi
if [ "$bsd_user" != no ]; then
if [ "$bsd_user" = "" ]; then
@ -898,16 +868,18 @@ if [ "$bsd_user" != no ]; then
if [ "$bsd_user" = yes ] && ! [ -d "$source_path/bsd-user/$targetos" ]; then
error_exit "bsd-user not supported on this host OS"
fi
if [ "$bsd_user" = "yes" ]; then
mak_wilds="${mak_wilds} $source_path/configs/targets/*-bsd-user.mak"
fi
fi
else
if [ "$linux_user" = yes ] || [ "$bsd_user" = yes ]; then
error_exit "user mode emulation not supported on this architecture"
fi
fi
if [ "$softmmu" = "yes" ]; then
mak_wilds="${mak_wilds} $source_path/configs/targets/*-softmmu.mak"
fi
if [ "$linux_user" = "yes" ]; then
mak_wilds="${mak_wilds} $source_path/configs/targets/*-linux-user.mak"
fi
if [ "$bsd_user" = "yes" ]; then
mak_wilds="${mak_wilds} $source_path/configs/targets/*-bsd-user.mak"
fi
for config in $mak_wilds; do
target="$(basename "$config" .mak)"
@ -934,8 +906,8 @@ Advanced options (experts only):
-Dmesonoptname=val passthrough option to meson unmodified
--cross-prefix=PREFIX use PREFIX for compile tools, PREFIX can be blank [$cross_prefix]
--cc=CC use C compiler CC [$cc]
--host-cc=CC use C compiler CC [$host_cc] for code run at
build time
--host-cc=CC when cross compiling, use C compiler CC for code run
at build time [$host_cc]
--cxx=CXX use C++ compiler CXX [$cxx]
--objcc=OBJCC use Objective-C compiler OBJCC [$objcc]
--extra-cflags=CFLAGS append extra C compiler flags CFLAGS
@ -971,7 +943,6 @@ cat << EOF
linux-user all linux usermode emulation targets
bsd-user all BSD usermode emulation targets
pie Position Independent Executables
debug-tcg TCG debugging (default is disabled)
NOTE: The object files are built at the place where configure is launched
EOF
@ -994,7 +965,7 @@ then
fi
if ! check_py_version "$python"; then
error_exit "Cannot use '$python', Python >= 3.7 is required." \
error_exit "Cannot use '$python', Python >= 3.8 is required." \
"Use --python=/path/to/python to specify a supported Python." \
"Maybe try:" \
" openSUSE Leap 15.3+: zypper install python39" \
@ -1087,7 +1058,7 @@ fi
# by default. Only enable by default for git builds
if test -z "$werror" ; then
if test -e "$source_path/.git" && \
{ test "$linux" = "yes" || test "$mingw32" = "yes"; }; then
{ test "$targetos" = linux || test "$targetos" = "windows"; }; then
werror="yes"
else
werror="no"
@ -1110,6 +1081,9 @@ if test "$static" = "yes" ; then
fi
fi
test "$plugins" = "" && plugins=yes
if test "$plugins" = "yes"; then
subdirs="$subdirs contrib/plugins"
fi
cat > $TMPC << EOF
@ -1165,14 +1139,6 @@ else
done
fi
# see if system emulation was really requested
case " $target_list " in
*"-softmmu "*) softmmu=yes
;;
*) softmmu=no
;;
esac
if test "$tcg" = "auto"; then
if test -z "$target_list"; then
tcg="disabled"
@ -1670,12 +1636,11 @@ done
echo "# Automatically generated by configure - do not modify" > Makefile.prereqs
# Mac OS X ships with a broken assembler
roms=
if have_target i386-softmmu x86_64-softmmu && \
test "$targetos" != "darwin" && test "$targetos" != "sunos" && \
test "$targetos" != "haiku" && \
probe_target_compiler i386-softmmu; then
roms="pc-bios/optionrom"
subdirs="$subdirs pc-bios/optionrom"
config_mak=pc-bios/optionrom/config.mak
echo "# Automatically generated by configure - do not modify" > $config_mak
echo "TOPSRC_DIR=$source_path" >> $config_mak
@ -1684,7 +1649,7 @@ fi
if have_target ppc-softmmu ppc64-softmmu && \
probe_target_compiler ppc-softmmu; then
roms="$roms pc-bios/vof"
subdirs="$subdirs pc-bios/vof"
config_mak=pc-bios/vof/config.mak
echo "# Automatically generated by configure - do not modify" > $config_mak
echo "SRC_DIR=$source_path/pc-bios/vof" >> $config_mak
@ -1703,7 +1668,7 @@ if have_target s390x-softmmu && probe_target_compiler s390x-softmmu && \
echo "WARNING: Your compiler does not support the z900!"
echo " The s390-ccw bios will only work with guest CPUs >= z10."
fi
roms="$roms pc-bios/s390-ccw"
subdirs="$subdirs pc-bios/s390-ccw"
config_mak=pc-bios/s390-ccw/config-host.mak
echo "# Automatically generated by configure - do not modify" > $config_mak
echo "SRC_PATH=$source_path/pc-bios/s390-ccw" >> $config_mak
@ -1722,41 +1687,15 @@ echo >> $config_host_mak
echo all: >> $config_host_mak
if test "$debug_tcg" = "yes" ; then
echo "CONFIG_DEBUG_TCG=y" >> $config_host_mak
fi
if test "$mingw32" = "yes" ; then
echo "CONFIG_WIN32=y" >> $config_host_mak
if test "$targetos" = "windows"; then
echo "QEMU_GA_MANUFACTURER=${QEMU_GA_MANUFACTURER-QEMU}" >> $config_host_mak
echo "QEMU_GA_DISTRO=${QEMU_GA_DISTRO-Linux}" >> $config_host_mak
echo "QEMU_GA_VERSION=${QEMU_GA_VERSION-$(cat "$source_path"/VERSION)}" >> $config_host_mak
else
echo "CONFIG_POSIX=y" >> $config_host_mak
fi
if test "$linux" = "yes" ; then
echo "CONFIG_LINUX=y" >> $config_host_mak
fi
if test "$darwin" = "yes" ; then
echo "CONFIG_DARWIN=y" >> $config_host_mak
fi
if test "$solaris" = "yes" ; then
echo "CONFIG_SOLARIS=y" >> $config_host_mak
fi
echo "SRC_PATH=$source_path" >> $config_host_mak
echo "TARGET_DIRS=$target_list" >> $config_host_mak
# XXX: suppress that
if [ "$bsd" = "yes" ] ; then
echo "CONFIG_BSD=y" >> $config_host_mak
fi
if test "$plugins" = "yes" ; then
echo "CONFIG_PLUGIN=y" >> $config_host_mak
fi
if test -n "$gdb_bin"; then
gdb_version=$($gdb_bin --version | head -n 1)
if version_ge ${gdb_version##* } 9.1; then
@ -1772,6 +1711,13 @@ if test "$container" != no; then
echo "RUNC=$runc" >> $config_host_mak
fi
echo "SUBDIRS=$subdirs" >> $config_host_mak
echo "PYTHON=$python" >> $config_host_mak
echo "GENISOIMAGE=$genisoimage" >> $config_host_mak
echo "MESON=$meson" >> $config_host_mak
echo "NINJA=$ninja" >> $config_host_mak
echo "EXESUF=$EXESUF" >> $config_host_mak
if test "$as_shared_lib" = "yes" ; then
echo "AS_SHARED_LIB=y" >> $config_host_mak
fi
@ -1779,17 +1725,8 @@ if test "$as_static_lib" = "yes" ; then
echo "AS_STATIC_LIB=y" >> $config_host_mak
fi
echo "ROMS=$roms" >> $config_host_mak
echo "PYTHON=$python" >> $config_host_mak
echo "GENISOIMAGE=$genisoimage" >> $config_host_mak
echo "MESON=$meson" >> $config_host_mak
echo "NINJA=$ninja" >> $config_host_mak
echo "PKG_CONFIG=${pkg_config}" >> $config_host_mak
echo "CC=$cc" >> $config_host_mak
echo "EXESUF=$EXESUF" >> $config_host_mak
# use included Linux headers for KVM architectures
if test "$linux" = "yes" && test -n "$linux_arch"; then
if test "$targetos" = "linux" && test -n "$linux_arch"; then
symlink "$source_path/linux-headers/asm-$linux_arch" linux-headers/asm
fi
@ -1810,12 +1747,21 @@ if test "$ccache_cpp2" = "yes"; then
echo "export CCACHE_CPP2=y" >> $config_host_mak
fi
# contrib/plugins configuration
echo "# Automatically generated by configure - do not modify" > contrib/plugins/$config_host_mak
echo "SRC_PATH=$source_path/contrib/plugins" >> contrib/plugins/$config_host_mak
echo "PKG_CONFIG=${pkg_config}" >> contrib/plugins/$config_host_mak
echo "CC=$cc $CPU_CFLAGS" >> contrib/plugins/$config_host_mak
echo "CFLAGS=${CFLAGS-$default_cflags} $EXTRA_CFLAGS" >> contrib/plugins/$config_host_mak
if test "$targetos" = darwin; then
echo "CONFIG_DARWIN=y" >> contrib/plugins/$config_host_mak
fi
# tests/tcg configuration
(config_host_mak=tests/tcg/config-host.mak
mkdir -p tests/tcg
echo "# Automatically generated by configure - do not modify" > $config_host_mak
echo "SRC_PATH=$source_path" >> $config_host_mak
echo "HOST_CC=$host_cc" >> $config_host_mak
# versioned checked in the main config_host.mak above
if test -n "$gdb_bin"; then
@ -1908,7 +1854,6 @@ if test "$skip_meson" = no; then
echo "windres = [$(meson_quote $windres)]" >> $cross
echo "windmc = [$(meson_quote $windmc)]" >> $cross
if test "$cross_compile" = "yes"; then
cross_arg="--cross-file config-meson.cross"
echo "[host_machine]" >> $cross
echo "system = '$targetos'" >> $cross
case "$cpu" in
@ -1925,6 +1870,14 @@ if test "$skip_meson" = no; then
else
echo "endian = 'little'" >> $cross
fi
cross_arg="--cross-file config-meson.cross"
native="config-meson.native.new"
echo "# Automatically generated by configure - do not modify" > $native
echo "[binaries]" >> $native
echo "c = [$(meson_quote $host_cc)]" >> $native
mv $native config-meson.native
cross_arg="$cross_arg --native-file config-meson.native"
else
cross_arg="--native-file config-meson.cross"
fi
@ -1944,6 +1897,7 @@ if test "$skip_meson" = no; then
test "$cfi" != false && meson_option_add "-Dcfi=$cfi"
test "$docs" != auto && meson_option_add "-Ddocs=$docs"
test -n "${LIB_FUZZING_ENGINE+xxx}" && meson_option_add "-Dfuzzing_engine=$LIB_FUZZING_ENGINE"
test "$plugins" = yes && meson_option_add "-Dplugins=true"
test "$qemu_suffix" != qemu && meson_option_add "-Dqemu_suffix=$qemu_suffix"
test "$smbd" != '' && meson_option_add "-Dsmbd=$smbd"
test "$tcg" != enabled && meson_option_add "-Dtcg=$tcg"

View File

@ -6,11 +6,11 @@
# programs that the main configure has already done for us.
#
BUILD_DIR := $(CURDIR)/../..
include config-host.mak
include $(BUILD_DIR)/config-host.mak
TOP_SRC_PATH = $(SRC_PATH)/../..
VPATH += $(SRC_PATH)/contrib/plugins
VPATH += $(SRC_PATH)
NAMES :=
NAMES += execlog
@ -26,21 +26,25 @@ SONAMES := $(addsuffix .so,$(addprefix lib,$(NAMES)))
# The main QEMU uses Glib extensively so it's perfectly fine to use it
# in plugins (which many examples do).
CFLAGS := $(shell $(PKG_CONFIG) --cflags glib-2.0)
CFLAGS += -fPIC -Wall
CFLAGS += $(if $(CONFIG_DEBUG_TCG), -ggdb -O0)
CFLAGS += -I$(SRC_PATH)/include/qemu
PLUGIN_CFLAGS := $(shell $(PKG_CONFIG) --cflags glib-2.0)
PLUGIN_CFLAGS += -fPIC -Wall
PLUGIN_CFLAGS += -I$(TOP_SRC_PATH)/include/qemu
all: $(SONAMES)
%.o: %.c
$(CC) $(CFLAGS) -c -o $@ $<
$(CC) $(CFLAGS) $(PLUGIN_CFLAGS) -c -o $@ $<
lib%.so: %.o
$(CC) -shared -Wl,-soname,$@ -o $@ $^ $(LDLIBS)
ifeq ($(CONFIG_DARWIN),y)
$(CC) -bundle -Wl,-undefined,dynamic_lookup -o $@ $^ $(LDLIBS)
else
$(CC) -shared -o $@ $^ $(LDLIBS)
endif
clean:
rm -f *.o *.so *.d
rm -Rf .libs
.PHONY: all clean
.SECONDARY:

View File

@ -350,7 +350,7 @@ static int in_cache(Cache *cache, uint64_t addr)
* @cache: The cache under simulation
* @addr: The address of the requested memory location
*
* Returns true if the requsted data is hit in the cache and false when missed.
* Returns true if the requested data is hit in the cache and false when missed.
* The cache is updated on miss for the next access.
*/
static bool access_cache(Cache *cache, uint64_t addr)
@ -545,8 +545,8 @@ static void append_stats_line(GString *line, uint64_t l1_daccess,
l1_dmiss_rate = ((double) l1_dmisses) / (l1_daccess) * 100.0;
l1_imiss_rate = ((double) l1_imisses) / (l1_iaccess) * 100.0;
g_string_append_printf(line, "%-14lu %-12lu %9.4lf%% %-14lu %-12lu"
" %9.4lf%%",
g_string_append_printf(line, "%-14" PRIu64 " %-12" PRIu64 " %9.4lf%%"
" %-14" PRIu64 " %-12" PRIu64 " %9.4lf%%",
l1_daccess,
l1_dmisses,
l1_daccess ? l1_dmiss_rate : 0.0,
@ -556,7 +556,8 @@ static void append_stats_line(GString *line, uint64_t l1_daccess,
if (use_l2) {
l2_miss_rate = ((double) l2_misses) / (l2_access) * 100.0;
g_string_append_printf(line, " %-12lu %-11lu %10.4lf%%",
g_string_append_printf(line,
" %-12" PRIu64 " %-11" PRIu64 " %10.4lf%%",
l2_access,
l2_misses,
l2_access ? l2_miss_rate : 0.0);
@ -662,8 +663,8 @@ static void log_top_insns(void)
if (insn->symbol) {
g_string_append_printf(rep, " (%s)", insn->symbol);
}
g_string_append_printf(rep, ", %ld, %s\n", insn->l1_dmisses,
insn->disas_str);
g_string_append_printf(rep, ", %" PRId64 ", %s\n",
insn->l1_dmisses, insn->disas_str);
}
miss_insns = g_list_sort(miss_insns, icmp);
@ -675,8 +676,8 @@ static void log_top_insns(void)
if (insn->symbol) {
g_string_append_printf(rep, " (%s)", insn->symbol);
}
g_string_append_printf(rep, ", %ld, %s\n", insn->l1_imisses,
insn->disas_str);
g_string_append_printf(rep, ", %" PRId64 ", %s\n",
insn->l1_imisses, insn->disas_str);
}
if (!use_l2) {
@ -692,8 +693,8 @@ static void log_top_insns(void)
if (insn->symbol) {
g_string_append_printf(rep, " (%s)", insn->symbol);
}
g_string_append_printf(rep, ", %ld, %s\n", insn->l2_misses,
insn->disas_str);
g_string_append_printf(rep, ", %" PRId64 ", %s\n",
insn->l2_misses, insn->disas_str);
}
finish:
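
The plugin hunks above and below replace %ld with the <inttypes.h> format macros: uint64_t and int64_t are long on 64-bit Linux but long long on 64-bit Windows and 32-bit hosts, so %ld is not portable. A self-contained example:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t misses = 42;
    int64_t delta = -7;

    /* was: printf("%ld %ld\n", misses, delta);
     * breaks where int64_t is long long (e.g. 64-bit Windows) */
    printf("%" PRIu64 " %" PRId64 "\n", misses, delta);
    return 0;
}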

View File

@ -48,7 +48,7 @@ static void printf_header(unsigned long count)
uint64_t start_code = qemu_plugin_start_code();
uint64_t end_code = qemu_plugin_end_code();
uint64_t entry = qemu_plugin_entry_code();
fprintf(fp, "0, 0x%lx, 0x%lx, 0x%lx, %s\n",
fprintf(fp, "0, 0x%" PRIx64 ", 0x%" PRIx64 ", 0x%" PRIx64 ", %s\n",
start_code, end_code, entry, path);
fprintf(fp, "BB Table: %ld bbs\n", count);
}

View File

@ -181,7 +181,8 @@ static void plugin_exit(qemu_plugin_id_t id, void *p)
switch (class->what) {
case COUNT_CLASS:
if (class->count || verbose) {
g_string_append_printf(report, "Class: %-24s\t(%ld hits)\n",
g_string_append_printf(report,
"Class: %-24s\t(%" PRId64 " hits)\n",
class->class,
class->count);
}
@ -208,7 +209,8 @@ static void plugin_exit(qemu_plugin_id_t id, void *p)
i++, counts = g_list_next(counts)) {
InsnExecCount *rec = (InsnExecCount *) counts->data;
g_string_append_printf(report,
"Instr: %-24s\t(%ld hits)\t(op=0x%08x/%s)\n",
"Instr: %-24s\t(%" PRId64 " hits)"
"\t(op=0x%08x/%s)\n",
rec->insn,
rec->count,
rec->opcode,

View File

@ -108,7 +108,7 @@ static void report_divergance(ExecState *us, ExecState *them)
/*
* If we have diverged before, did we get back on track or are we
* totally loosing it?
* totally losing it?
*/
if (divergence_log) {
DivergeState *last = (DivergeState *) divergence_log->data;
@ -134,7 +134,9 @@ static void report_divergance(ExecState *us, ExecState *them)
/* Output short log entry of going out of sync... */
if (verbose || divrec.distance == 1 || diverged) {
g_string_printf(out, "@ 0x%016lx vs 0x%016lx (%d/%d since last)\n",
g_string_printf(out,
"@ 0x%016" PRIx64 " vs 0x%016" PRIx64
" (%d/%d since last)\n",
us->pc, them->pc, g_slist_length(divergence_log),
divrec.distance);
qemu_plugin_outs(out->str);
@ -144,7 +146,9 @@ static void report_divergance(ExecState *us, ExecState *them)
int i;
GSList *entry;
g_string_printf(out, "Δ insn_count @ 0x%016lx (%ld) vs 0x%016lx (%ld)\n",
g_string_printf(out,
"Δ insn_count @ 0x%016" PRIx64
" (%ld) vs 0x%016" PRIx64 " (%ld)\n",
us->pc, us->insn_count, them->pc, them->insn_count);
for (entry = log, i = 0;
@ -152,7 +156,8 @@ static void report_divergance(ExecState *us, ExecState *them)
entry = g_slist_next(entry), i++) {
ExecInfo *prev = (ExecInfo *) entry->data;
g_string_append_printf(out,
" previously @ 0x%016lx/%ld (%ld insns)\n",
" previously @ 0x%016" PRIx64 "/%" PRId64
" (%ld insns)\n",
prev->block->pc, prev->block->insns,
prev->insn_count);
}

View File

@ -1071,6 +1071,7 @@ static gboolean
protocol_features_cb(gint fd, GIOCondition condition, gpointer user_data)
{
const uint64_t protocol_edid = (1 << VHOST_USER_GPU_PROTOCOL_F_EDID);
const uint64_t protocol_dmabuf2 = (1 << VHOST_USER_GPU_PROTOCOL_F_DMABUF2);
VuGpu *g = user_data;
uint64_t protocol_features;
VhostUserGpuMsg msg = {
@ -1082,7 +1083,7 @@ protocol_features_cb(gint fd, GIOCondition condition, gpointer user_data)
return G_SOURCE_CONTINUE;
}
protocol_features &= protocol_edid;
protocol_features &= (protocol_edid | protocol_dmabuf2);
msg = (VhostUserGpuMsg) {
.request = VHOST_USER_GPU_SET_PROTOCOL_FEATURES,
@ -1100,6 +1101,8 @@ protocol_features_cb(gint fd, GIOCondition condition, gpointer user_data)
exit(EXIT_FAILURE);
}
g->use_modifiers = !!(protocol_features & protocol_dmabuf2);
return G_SOURCE_REMOVE;
}

View File

@ -318,6 +318,37 @@ virgl_resource_detach_backing(VuGpu *g,
vg_cleanup_mapping_iov(g, res_iovs, num_iovs);
}
static int
virgl_get_resource_info_modifiers(uint32_t resource_id,
struct virgl_renderer_resource_info *info,
uint64_t *modifiers)
{
int ret;
#ifdef VIRGL_RENDERER_RESOURCE_INFO_EXT_VERSION
struct virgl_renderer_resource_info_ext info_ext;
ret = virgl_renderer_resource_get_info_ext(resource_id, &info_ext);
if (ret < 0) {
return ret;
}
*info = info_ext.base;
*modifiers = info_ext.modifiers;
#else
ret = virgl_renderer_resource_get_info(resource_id, info);
if (ret < 0) {
return ret;
}
/*
* Before virgl_renderer_resource_get_info_ext,
* getting the modifiers was not possible.
*/
*modifiers = 0;
#endif
return 0;
}
static void
virgl_cmd_set_scanout(VuGpu *g,
struct virtio_gpu_ctrl_command *cmd)
@ -338,7 +369,9 @@ virgl_cmd_set_scanout(VuGpu *g,
memset(&info, 0, sizeof(info));
if (ss.resource_id && ss.r.width && ss.r.height) {
ret = virgl_renderer_resource_get_info(ss.resource_id, &info);
uint64_t modifiers = 0;
ret = virgl_get_resource_info_modifiers(ss.resource_id, &info,
&modifiers);
if (ret == -1) {
g_critical("%s: illegal resource specified %d\n",
__func__, ss.resource_id);
@ -354,8 +387,6 @@ virgl_cmd_set_scanout(VuGpu *g,
}
assert(fd >= 0);
VhostUserGpuMsg msg = {
.request = VHOST_USER_GPU_DMABUF_SCANOUT,
.size = sizeof(VhostUserGpuDMABUFScanout),
.payload.dmabuf_scanout.scanout_id = ss.scanout_id,
.payload.dmabuf_scanout.x = ss.r.x,
.payload.dmabuf_scanout.y = ss.r.y,
@ -367,6 +398,20 @@ virgl_cmd_set_scanout(VuGpu *g,
.payload.dmabuf_scanout.fd_flags = info.flags,
.payload.dmabuf_scanout.fd_drm_fourcc = info.drm_fourcc
};
if (g->use_modifiers) {
/*
* The message uses all the fields set in dmabuf_scanout, plus
* modifiers, which is appended after VhostUserGpuDMABUFScanout.
*/
msg.request = VHOST_USER_GPU_DMABUF_SCANOUT2;
msg.size = sizeof(VhostUserGpuDMABUFScanout2);
msg.payload.dmabuf_scanout2.modifier = modifiers;
} else {
msg.request = VHOST_USER_GPU_DMABUF_SCANOUT;
msg.size = sizeof(VhostUserGpuDMABUFScanout);
}
vg_send_msg(g, &msg, fd);
close(fd);
} else {

View File

@ -37,6 +37,7 @@ typedef enum VhostUserGpuRequest {
VHOST_USER_GPU_DMABUF_SCANOUT,
VHOST_USER_GPU_DMABUF_UPDATE,
VHOST_USER_GPU_GET_EDID,
VHOST_USER_GPU_DMABUF_SCANOUT2,
} VhostUserGpuRequest;
typedef struct VhostUserGpuDisplayInfoReply {
@ -84,6 +85,11 @@ typedef struct VhostUserGpuDMABUFScanout {
int fd_drm_fourcc;
} QEMU_PACKED VhostUserGpuDMABUFScanout;
typedef struct VhostUserGpuDMABUFScanout2 {
struct VhostUserGpuDMABUFScanout dmabuf_scanout;
uint64_t modifier;
} QEMU_PACKED VhostUserGpuDMABUFScanout2;
typedef struct VhostUserGpuEdidRequest {
uint32_t scanout_id;
} QEMU_PACKED VhostUserGpuEdidRequest;
@ -98,6 +104,7 @@ typedef struct VhostUserGpuMsg {
VhostUserGpuScanout scanout;
VhostUserGpuUpdate update;
VhostUserGpuDMABUFScanout dmabuf_scanout;
VhostUserGpuDMABUFScanout2 dmabuf_scanout2;
VhostUserGpuEdidRequest edid_req;
struct virtio_gpu_resp_edid resp_edid;
struct virtio_gpu_resp_display_info display_info;
@ -112,6 +119,7 @@ static VhostUserGpuMsg m __attribute__ ((unused));
#define VHOST_USER_GPU_MSG_FLAG_REPLY 0x4
#define VHOST_USER_GPU_PROTOCOL_F_EDID 0
#define VHOST_USER_GPU_PROTOCOL_F_DMABUF2 1
struct virtio_gpu_scanout {
uint32_t width, height;
@ -132,6 +140,7 @@ typedef struct VuGpu {
bool virgl;
bool virgl_inited;
bool edid_inited;
bool use_modifiers;
uint32_t inflight;
struct virtio_gpu_scanout scanout[VIRTIO_GPU_MAX_SCANOUTS];

cpu.c
View File

@ -734,11 +734,7 @@ int cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
bool target_words_bigendian(void)
{
#if TARGET_BIG_ENDIAN
return true;
#else
return false;
#endif
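    /*
     * Note: this relies on TARGET_BIG_ENDIAN always being defined to 0
     * or 1 (rather than merely defined or undefined), so the macro
     * itself can be returned as the boolean result.
     */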
return TARGET_BIG_ENDIAN;
}
const char *target_name(void)

View File

@ -272,7 +272,7 @@ AES_Td3[x] = Si[x].[09, 0d, 0b, 0e];
AES_Td4[x] = Si[x].[01, 01, 01, 01];
*/
static const uint32_t AES_Te0[256] = {
const uint32_t AES_Te0[256] = {
0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
@ -607,7 +607,7 @@ static const uint32_t AES_Te4[256] = {
0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
};
static const uint32_t AES_Td0[256] = {
const uint32_t AES_Td0[256] = {
0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
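As an aside, each Te0 entry packs the S-box output multiplied by [02, 01, 01, 03] in GF(2^8), mirroring the Td-table comments above. A small self-check sketch; the S[0] = 0x63 value is standard AES, not taken from this hunk:

#include <assert.h>
#include <stdint.h>

/* Multiply by x in GF(2^8) with the AES reduction polynomial 0x11b. */
static uint8_t xtime(uint8_t b)
{
    return (b << 1) ^ ((b & 0x80) ? 0x1b : 0);
}

/* AES_Te0[x] = S[x].[02, 01, 01, 03] */
static uint32_t te0_from_sbox(uint8_t s)
{
    return ((uint32_t)xtime(s) << 24) | ((uint32_t)s << 16) |
           ((uint32_t)s << 8) | (uint8_t)(xtime(s) ^ s);
}

int main(void)
{
    assert(te0_from_sbox(0x63) == 0xc66363a5); /* first entry above */
    return 0;
}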

View File

@ -73,7 +73,7 @@ qcrypto_afalg_comm_alloc(const char *type, const char *name,
QCryptoAFAlg *afalg;
afalg = g_new0(QCryptoAFAlg, 1);
/* initilize crypto API socket */
/* initialize crypto API socket */
afalg->opfd = -1;
afalg->tfmfd = qcrypto_afalg_socket_bind(type, name, errp);
if (afalg->tfmfd == -1) {

View File

@ -244,7 +244,7 @@ qcrypto_block_luks_has_format(const uint8_t *buf,
*
* When calculating ESSIV IVs, the cipher length used by ESSIV
* may be different from the cipher length used for the block
* encryption, becauses dm-crypt uses the hash digest length
* encryption, because dm-crypt uses the hash digest length
* as the key size. ie, if you have AES 128 as the block cipher
* and SHA 256 as ESSIV hash, then ESSIV will use AES 256 as
* the cipher since that gets a key length matching the digest
@ -393,7 +393,7 @@ qcrypto_block_luks_from_disk_endian(QCryptoBlockLUKSHeader *hdr)
}
/*
* Stores the main LUKS header, taking care of endianess
* Stores the main LUKS header, taking care of endianness
*/
static int
qcrypto_block_luks_store_header(QCryptoBlock *block,
@ -423,7 +423,7 @@ qcrypto_block_luks_store_header(QCryptoBlock *block,
}
/*
* Loads the main LUKS header,and byteswaps it to native endianess
* Loads the main LUKS header, and byteswaps it to native endianness
* And run basic sanity checks on it
*/
static int

crypto/clmul.c (new file)
View File

@ -0,0 +1,111 @@
/*
* Carry-less multiply operations.
* SPDX-License-Identifier: GPL-2.0-or-later
*
* Copyright (C) 2023 Linaro, Ltd.
*/
#include "qemu/osdep.h"
#include "crypto/clmul.h"
uint64_t clmul_8x8_low(uint64_t n, uint64_t m)
{
uint64_t r = 0;
for (int i = 0; i < 8; ++i) {
uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
r ^= m & mask;
m = (m << 1) & 0xfefefefefefefefeull;
n >>= 1;
}
return r;
}
static uint64_t clmul_8x4_even_int(uint64_t n, uint64_t m)
{
uint64_t r = 0;
for (int i = 0; i < 8; ++i) {
uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;
r ^= m & mask;
n >>= 1;
m <<= 1;
}
return r;
}
uint64_t clmul_8x4_even(uint64_t n, uint64_t m)
{
n &= 0x00ff00ff00ff00ffull;
m &= 0x00ff00ff00ff00ffull;
return clmul_8x4_even_int(n, m);
}
uint64_t clmul_8x4_odd(uint64_t n, uint64_t m)
{
return clmul_8x4_even(n >> 8, m >> 8);
}
static uint64_t unpack_8_to_16(uint64_t x)
{
return (x & 0x000000ff)
| ((x & 0x0000ff00) << 8)
| ((x & 0x00ff0000) << 16)
| ((x & 0xff000000) << 24);
}
uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
{
return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
}
uint64_t clmul_16x2_even(uint64_t n, uint64_t m)
{
uint64_t r = 0;
n &= 0x0000ffff0000ffffull;
m &= 0x0000ffff0000ffffull;
for (int i = 0; i < 16; ++i) {
uint64_t mask = (n & 0x0000000100000001ull) * 0xffffffffull;
r ^= m & mask;
n >>= 1;
m <<= 1;
}
return r;
}
uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
{
return clmul_16x2_even(n >> 16, m >> 16);
}
uint64_t clmul_32(uint32_t n, uint32_t m32)
{
uint64_t r = 0;
uint64_t m = m32;
for (int i = 0; i < 32; ++i) {
r ^= n & 1 ? m : 0;
n >>= 1;
m <<= 1;
}
return r;
}
Int128 clmul_64_gen(uint64_t n, uint64_t m)
{
uint64_t rl = 0, rh = 0;
/* Bit 0 can only influence the low 64-bit result. */
if (n & 1) {
rl = m;
}
for (int i = 1; i < 64; ++i) {
uint64_t mask = -((n >> i) & 1);
rl ^= (m << i) & mask;
rh ^= (m >> (64 - i)) & mask;
}
return int128_make128(rl, rh);
}
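A carry-less product XORs shifted copies of the multiplicand instead of adding them, i.e. polynomial multiplication over GF(2). A standalone sanity check, with clmul_32 duplicated from above so the snippet compiles on its own:

#include <assert.h>
#include <stdint.h>

static uint64_t clmul32(uint32_t n, uint32_t m32)
{
    uint64_t r = 0, m = m32;
    for (int i = 0; i < 32; ++i) {
        r ^= (n & 1) ? m : 0;
        n >>= 1;
        m <<= 1;
    }
    return r;
}

int main(void)
{
    /* Ordinary multiply: 3 * 3 = 9; carry-less: (x+1)(x+1) = x^2 + 1. */
    assert(clmul32(3, 3) == 5);
    /* (x^2+1)(x+1) = x^3 + x^2 + x + 1; here it matches 5 * 3 = 15. */
    assert(clmul32(5, 3) == 15);
    return 0;
}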

View File

@ -76,7 +76,7 @@ enum QCryptoDERTagEnc {
/**
* qcrypto_der_encode_length:
* @src_len: the length of source data
* @dst: distination to save the encoded 'length', if dst is NULL, only compute
* @dst: destination to save the encoded 'length', if dst is NULL, only compute
* the expected buffer size in bytes.
* @dst_len: output parameter, indicates how many bytes were written.
*
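For reference, the DER definite-length rule this helper implements is small: a length below 0x80 is one byte, and anything longer is 0x80|n followed by n big-endian length bytes (5 encodes as 05, 300 as 82 01 2c). A hedged sketch of the rule itself, not of the qcrypto implementation:

#include <stddef.h>
#include <stdint.h>

/* Returns the number of bytes written to dst. */
static size_t der_encode_length(size_t len, uint8_t *dst)
{
    if (len < 0x80) {
        dst[0] = len;               /* short form */
        return 1;
    }
    size_t n = 0;
    for (size_t t = len; t; t >>= 8) {
        n++;                        /* count significant length bytes */
    }
    dst[0] = 0x80 | n;              /* long form: 0x80 | byte count */
    for (size_t i = 0; i < n; i++) {
        dst[1 + i] = len >> (8 * (n - 1 - i));
    }
    return 1 + n;
}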

View File

@ -249,7 +249,7 @@ void qcrypto_der_encode_octet_str(QCryptoEncodeContext *ctx,
* Start encoding an octet string. All fields between
* qcrypto_der_encode_octet_str_begin and qcrypto_der_encode_octet_str_end
* are encoded as an octet string. This is useful when we need to encode an
* encoded SEQUNCE as OCTET STRING.
* encoded SEQUENCE as OCTET STRING.
*/
void qcrypto_der_encode_octet_str_begin(QCryptoEncodeContext *ctx);
@ -260,7 +260,7 @@ void qcrypto_der_encode_octet_str_begin(QCryptoEncodeContext *ctx);
* Finish encoding an octet string. All fields between
* qcrypto_der_encode_octet_str_begin and qcrypto_der_encode_octet_str_end
* are encoded as an octet string. This is useful when we need to encode an
* encoded SEQUNCE as OCTET STRING.
* encoded SEQUENCE as OCTET STRING.
*/
void qcrypto_der_encode_octet_str_end(QCryptoEncodeContext *ctx);
@ -275,7 +275,7 @@ size_t qcrypto_der_encode_ctx_buffer_len(QCryptoEncodeContext *ctx);
/**
* qcrypto_der_encode_ctx_flush_and_free:
* @ctx: the encode context.
* @dst: the distination to save the encoded data, the length of dst should
* @dst: the destination to save the encoded data, the length of dst should
* not be less than qcrypto_der_encode_ctx_buffer_len
*
* Flush all encoded data into dst, then free ctx.

View File

@ -28,7 +28,7 @@ struct QCryptoHmacDriver {
void (*hmac_free)(QCryptoHmac *hmac);
};
extern void *qcrypto_hmac_ctx_new(QCryptoHashAlgorithm alg,
void *qcrypto_hmac_ctx_new(QCryptoHashAlgorithm alg,
const uint8_t *key, size_t nkey,
Error **errp);
extern QCryptoHmacDriver qcrypto_hmac_lib_driver;
@ -37,8 +37,7 @@ extern QCryptoHmacDriver qcrypto_hmac_lib_driver;
#include "afalgpriv.h"
extern QCryptoAFAlg *
qcrypto_afalg_hmac_ctx_new(QCryptoHashAlgorithm alg,
QCryptoAFAlg *qcrypto_afalg_hmac_ctx_new(QCryptoHashAlgorithm alg,
const uint8_t *key, size_t nkey,
Error **errp);
extern QCryptoHmacDriver qcrypto_hmac_afalg_driver;

View File

@ -48,9 +48,12 @@ if have_afalg
endif
crypto_ss.add(when: gnutls, if_true: files('tls-cipher-suites.c'))
util_ss.add(files('sm4.c'))
util_ss.add(files('aes.c'))
util_ss.add(files('init.c'))
util_ss.add(files(
'aes.c',
'clmul.c',
'init.c',
'sm4.c',
))
if gnutls.found()
util_ss.add(gnutls)
endif

View File

@ -47,3 +47,13 @@ uint8_t const sm4_sbox[] = {
0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48,
};
uint32_t const sm4_ck[] = {
0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269,
0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9,
0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249,
0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9,
0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229,
0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299,
0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209,
0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279
};
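These constants are not magic: the SM4 specification defines byte j (most significant first) of ck[i] as (4*i + j) * 7 mod 256. A small generator sketch that can be checked against the table above:

#include <assert.h>
#include <stdint.h>

/* Rebuild ck[i] from the SM4 definition: byte j is (4*i + j) * 7 mod 256. */
static uint32_t sm4_ck_word(int i)
{
    uint32_t w = 0;
    for (int j = 0; j < 4; j++) {
        w = (w << 8) | (((4 * i + j) * 7) & 0xff);
    }
    return w;
}

int main(void)
{
    assert(sm4_ck_word(0) == 0x00070e15);  /* first entry above */
    assert(sm4_ck_word(9) == 0xfc030a11);  /* tenth entry above */
    return 0;
}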

View File

@ -52,7 +52,7 @@ Those hosts are officially supported, with various accelerators:
* - SPARC
- tcg
* - x86
- hax, hvf (64 bit only), kvm, nvmm, tcg, whpx (64 bit only), xen
- hvf (64 bit only), kvm, nvmm, tcg, whpx (64 bit only), xen
Other host architectures are not supported. It is possible to build QEMU system
emulation on an unsupported host architecture using the configure

View File

@ -105,12 +105,6 @@ Use ``-machine hpet=off`` instead.
The ``-no-acpi`` setting has been turned into a machine property.
Use ``-machine acpi=off`` instead.
``-accel hax`` (since 8.0)
''''''''''''''''''''''''''
The HAXM project has been retired (see https://github.com/intel/haxm#status).
Use "whpx" (on Windows) or "hvf" (on macOS) instead.
``-async-teardown`` (since 8.1)
'''''''''''''''''''''''''''''''

View File

@ -8,7 +8,7 @@ QEMU can be used in several different ways. The most common is for
:ref:`System Emulation`, where it provides a virtual model of an
entire machine (CPU, memory and emulated devices) to run a guest OS.
In this mode the CPU may be fully emulated, or it may work with a
hypervisor such as KVM, Xen, Hax or Hypervisor.Framework to allow the
hypervisor such as KVM, Xen or Hypervisor.Framework to allow the
guest to run directly on the host CPU.
The second supported way to use QEMU is :ref:`User Mode Emulation`,

View File

@ -659,15 +659,18 @@ Use ``Icelake-Server`` instead.
System accelerators
-------------------
Userspace local APIC with KVM (x86, removed 8.0)
''''''''''''''''''''''''''''''''''''''''''''''''
Userspace local APIC with KVM (x86, removed in 8.0)
'''''''''''''''''''''''''''''''''''''''''''''''''''
``-M kernel-irqchip=off`` cannot be used on KVM if the CPU model includes
a local APIC. The ``split`` setting is supported, as is using ``-M
kernel-irqchip=off`` when the CPU does not have a local APIC.
System accelerators
-------------------
HAXM (``-accel hax``) (removed in 8.2)
''''''''''''''''''''''''''''''''''''''
The HAXM project has been retired (see https://github.com/intel/haxm#status).
Use "whpx" (on Windows) or "hvf" (on macOS) instead.
MIPS "Trap-and-Emulate" KVM support (removed in 8.0)
''''''''''''''''''''''''''''''''''''''''''''''''''''

View File

@ -460,17 +460,13 @@ Built by configure:
``config-host.mak``
When configure has determined the characteristics of the build host it
will write them to this file for use in ``Makefile`` and to a smaller
extent ``meson.build``. These include the paths to various tools and a
variety of ``CONFIG_*`` variables related to optionally enabled features.
will write the paths to various tools to this file, for use in ``Makefile``
and to a smaller extent ``meson.build``.
``config-host.mak`` is also used as a dependency checking mechanism. If make
sees that the modification timestamp on configure is newer than that on
``config-host.mak``, then configure will be re-run.
The variables defined here apply to all QEMU
build outputs.
``config-meson.cross``
A Meson "cross file" (or native file) used to communicate the paths to

View File

@ -316,6 +316,6 @@ variable::
host_kconfig = \
(have_tpm ? ['CONFIG_TPM=y'] : []) + \
('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \
(targetos == 'linux' ? ['CONFIG_LINUX=y'] : []) + \
(have_ivshmem ? ['CONFIG_IVSHMEM=y'] : []) + \
...

View File

@ -409,8 +409,9 @@ the initial messages sent to the emulation process is a guest memory
table. Each entry in this table consists of a file descriptor and size
that the emulation process can ``mmap()`` to directly access guest
memory, similar to ``vhost_user_set_mem_table()``. Note guest memory
must be backed by file descriptors, such as when QEMU is given the
*-mem-path* command line option.
must be backed by shared file-backed memory, for example, using
*-object memory-backend-file,share=on* and setting that memory backend
as RAM for the machine.
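For example, one invocation that satisfies this requirement (the object id, size, machine type and path are illustrative)::

    qemu-system-x86_64 \
        -object memory-backend-file,id=sysmem,size=4G,mem-path=/dev/shm/qemu-ram,share=on \
        -machine q35,memory-backend=sysmem \
        ...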
IOMMU operations
^^^^^^^^^^^^^^^^

View File

@ -184,7 +184,7 @@ in reset.
{
MyDevClass *myclass = MYDEV_CLASS(class);
ResettableClass *rc = RESETTABLE_CLASS(class);
resettable_class_set_parent_reset_phases(rc,
resettable_class_set_parent_phases(rc,
mydev_reset_enter,
mydev_reset_hold,
mydev_reset_exit,
@ -193,11 +193,10 @@ in reset.
In the above example, we override all three phases. It is possible to override
only some of them by passing NULL instead of a function pointer to
``resettable_class_set_parent_reset_phases()``. For example, the following will
``resettable_class_set_parent_phases()``. For example, the following will
only override the *enter* phase and leave *hold* and *exit* untouched::
resettable_class_set_parent_reset_phases(rc, mydev_reset_enter,
NULL, NULL,
resettable_class_set_parent_phases(rc, mydev_reset_enter, NULL, NULL,
&myclass->parent_phases);
This is equivalent to providing a trivial implementation of the hold and exit

View File

@ -23,9 +23,21 @@ and recommends that the initial bytes are sent and loaded in the destination
before stopping the source VM. Enabling this migration capability guarantees
that, and can thus potentially reduce downtime even further.
Note that currently VFIO migration is supported only for a single device. This
is due to VFIO migration's lack of P2P support. However, P2P support is planned
to be added later on.
To support migration of multiple devices that might do P2P transactions between
themselves, VFIO migration uAPI defines an intermediate P2P quiescent state.
While in the P2P quiescent state, P2P DMA transactions cannot be initiated by
the device, but the device can respond to incoming ones. Additionally, all
outstanding P2P transactions are guaranteed to have been completed by the time
the device enters this state.
All the devices that support P2P migration are first transitioned to the P2P
quiescent state and only then are they stopped or started. This makes migration
safe P2P-wise, since starting and stopping the devices is not done atomically
for all the devices together.
Thus, multiple VFIO devices migration is allowed only if all the devices
support P2P migration. Single VFIO device migration is allowed regardless of
P2P migration support.
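For orientation, these are the device states defined by ``enum vfio_device_mig_state`` in the Linux uAPI (listed here only for convenience; ``linux-headers/linux/vfio.h`` is authoritative)::

    enum vfio_device_mig_state {
        VFIO_DEVICE_STATE_ERROR = 0,
        VFIO_DEVICE_STATE_STOP = 1,
        VFIO_DEVICE_STATE_RUNNING = 2,
        VFIO_DEVICE_STATE_STOP_COPY = 3,
        VFIO_DEVICE_STATE_RESUMING = 4,
        VFIO_DEVICE_STATE_RUNNING_P2P = 5,
        VFIO_DEVICE_STATE_PRE_COPY = 6,
        VFIO_DEVICE_STATE_PRE_COPY_P2P = 7,
    };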
A detailed description of the UAPI for VFIO device migration can be found in
the comment for the ``vfio_device_mig_state`` structure in the header file
@ -132,11 +144,12 @@ will be blocked.
Flow of state changes during Live migration
===========================================
Below is the flow of state change during live migration.
Below is the state change flow during live migration for a VFIO device that
supports both precopy and P2P migration. The flow for devices that don't
support it is similar, except that the relevant states for precopy and P2P are
skipped.
The values in the parentheses represent the VM state, the migration state, and
the VFIO device state, respectively.
The text in the square brackets represents the flow if the VFIO device supports
pre-copy.
Live migration save path
------------------------
@ -148,37 +161,45 @@ Live migration save path
|
migrate_init spawns migration_thread
Migration thread then calls each device's .save_setup()
(RUNNING, _SETUP, _RUNNING [_PRE_COPY])
(RUNNING, _SETUP, _PRE_COPY)
|
(RUNNING, _ACTIVE, _RUNNING [_PRE_COPY])
(RUNNING, _ACTIVE, _PRE_COPY)
If device is active, get pending_bytes by .state_pending_{estimate,exact}()
If total pending_bytes >= threshold_size, call .save_live_iterate()
[Data of VFIO device for pre-copy phase is copied]
Data of VFIO device for pre-copy phase is copied
Iterate till total pending bytes converge and are less than threshold
|
On migration completion, vCPU stops and calls .save_live_complete_precopy for
each active device. The VFIO device is then transitioned into _STOP_COPY state
(FINISH_MIGRATE, _DEVICE, _STOP_COPY)
On migration completion, the vCPUs and the VFIO device are stopped
The VFIO device is first put in P2P quiescent state
(FINISH_MIGRATE, _ACTIVE, _PRE_COPY_P2P)
|
For the VFIO device, iterate in .save_live_complete_precopy until
Then the VFIO device is put in _STOP_COPY state
(FINISH_MIGRATE, _ACTIVE, _STOP_COPY)
.save_live_complete_precopy() is called for each active device
For the VFIO device, iterate in .save_live_complete_precopy() until
pending data is 0
(FINISH_MIGRATE, _DEVICE, _STOP)
|
(FINISH_MIGRATE, _COMPLETED, _STOP)
(POSTMIGRATE, _COMPLETED, _STOP_COPY)
Migration thread schedules cleanup bottom half and exits
|
.save_cleanup() is called
(POSTMIGRATE, _COMPLETED, _STOP)
Live migration resume path
--------------------------
::
Incoming migration calls .load_setup for each device
Incoming migration calls .load_setup() for each device
(RESTORE_VM, _ACTIVE, _STOP)
|
For each device, .load_state is called for that device section data
For each device, .load_state() is called for that device section data
(RESTORE_VM, _ACTIVE, _RESUMING)
|
At the end, .load_cleanup is called for each device and vCPUs are started
At the end, .load_cleanup() is called for each device and vCPUs are started
The VFIO device is first put in P2P quiescent state
(RUNNING, _ACTIVE, _RUNNING_P2P)
|
(RUNNING, _NONE, _RUNNING)
Postcopy

View File

@ -134,6 +134,19 @@ VhostUserGpuEdidRequest
:scanout-id: ``u32``, the scanout to get edid from
VhostUserGpuDMABUFScanout2
^^^^^^^^^^^^^^^^^^^^^^^^^^
+----------------+----------+
| dmabuf_scanout | modifier |
+----------------+----------+
:dmabuf_scanout: ``VhostUserGpuDMABUFScanout``, filled as described in the
VhostUserGpuDMABUFScanout structure.
:modifier: ``u64``, the DMABUF modifiers
C structure
-----------
@ -164,6 +177,7 @@ Protocol features
.. code:: c
#define VHOST_USER_GPU_PROTOCOL_F_EDID 0
#define VHOST_USER_GPU_PROTOCOL_F_DMABUF2 1
New messages and communication changes are negotiated thanks to the
``VHOST_USER_GPU_GET_PROTOCOL_FEATURES`` and
@ -263,3 +277,13 @@ Message types
Retrieve the EDID data for a given scanout.
This message requires the ``VHOST_USER_GPU_PROTOCOL_F_EDID`` protocol
feature to be supported.
``VHOST_USER_GPU_DMABUF_SCANOUT2``
:id: 12
:request payload: ``VhostUserGpuDMABUFScanout2``
:reply payload: N/A
Same as VHOST_USER_GPU_DMABUF_SCANOUT, but with the dmabuf modifiers
appended to the message, which the original message did not carry.
This message requires the ``VHOST_USER_GPU_PROTOCOL_F_DMABUF2`` protocol
feature to be supported.
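For reference, the wire layout matches the structure this commit adds in ``contrib/vhost-user-gpu/vugpu.h``:

.. code:: c

    typedef struct VhostUserGpuDMABUFScanout2 {
        struct VhostUserGpuDMABUFScanout dmabuf_scanout;
        uint64_t modifier;
    } QEMU_PACKED VhostUserGpuDMABUFScanout2;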

View File

@ -117,13 +117,13 @@ to support the multiple thread compression migration:
{qemu} migrate_set_capability compress on
3. Set the compression thread count on source:
{qemu} migrate_set_parameter compress_threads 12
{qemu} migrate_set_parameter compress-threads 12
4. Set the compression level on the source:
{qemu} migrate_set_parameter compress_level 1
{qemu} migrate_set_parameter compress-level 1
5. Set the decompression thread count on destination:
{qemu} migrate_set_parameter decompress_threads 3
{qemu} migrate_set_parameter decompress-threads 3
6. Start outgoing migration:
{qemu} migrate -d tcp:destination.host:4444
@ -133,9 +133,9 @@ to support the multiple thread compression migration:
The following are the default settings:
compress: off
compress_threads: 8
decompress_threads: 2
compress_level: 1 (which means best speed)
compress-threads: 8
decompress-threads: 2
compress-level: 1 (which means best speed)
So, only the first two steps are required to use the multiple
thread compression in migration. You can do more if the default

Some files were not shown because too many files have changed in this diff.