diff --git a/MAINTAINERS b/MAINTAINERS index caba73ec41..be151f0024 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1748,8 +1748,8 @@ F: tests/qtest/intel-hda-test.c F: tests/qtest/fuzz-sb16-test.c Xilinx CAN -M: Vikram Garhwal -M: Francisco Iglesias +M: Vikram Garhwal +M: Francisco Iglesias S: Maintained F: hw/net/can/xlnx-* F: include/hw/net/xlnx-* diff --git a/VERSION b/VERSION index 628ab8965f..7038194808 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -7.1.90 +7.1.91 diff --git a/block.c b/block.c index c5e20c0bea..a18f052374 100644 --- a/block.c +++ b/block.c @@ -1543,7 +1543,7 @@ const BdrvChildClass child_of_bds = { AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c) { - GLOBAL_STATE_CODE(); + IO_CODE(); return c->klass->get_parent_aio_context(c); } diff --git a/block/blkio.c b/block/blkio.c index 620fab28a7..5eae3adfaf 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -993,7 +993,6 @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) { \ .format_name = name, \ .protocol_name = name, \ - .has_variable_length = true, \ .instance_size = sizeof(BDRVBlkioState), \ .bdrv_file_open = blkio_file_open, \ .bdrv_close = blkio_close, \ diff --git a/block/block-backend.c b/block/block-backend.c index c0c7d56c8d..b48c91f4e1 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -311,6 +311,7 @@ static void blk_root_detach(BdrvChild *child) static AioContext *blk_root_get_parent_aio_context(BdrvChild *c) { BlockBackend *blk = c->opaque; + IO_CODE(); return blk_get_aio_context(blk); } @@ -2157,6 +2158,11 @@ static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context, return ret; } } + /* + * Make blk->ctx consistent with the root node before we invoke any + * other operations like drain that might inquire blk->ctx + */ + blk->ctx = new_context; if (tgm->throttle_state) { bdrv_drained_begin(bs); throttle_group_detach_aio_context(tgm); @@ -2165,9 +2171,10 @@ static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context, } bdrv_unref(bs); + } else { + blk->ctx = new_context; } - blk->ctx = new_context; return 0; } diff --git a/block/io.c b/block/io.c index 34b30e304e..b9424024f9 100644 --- a/block/io.c +++ b/block/io.c @@ -71,9 +71,10 @@ static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c, void bdrv_parent_drained_end_single(BdrvChild *c) { int drained_end_counter = 0; + AioContext *ctx = bdrv_child_get_parent_aio_context(c); IO_OR_GS_CODE(); bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter); - BDRV_POLL_WHILE(c->bs, qatomic_read(&drained_end_counter) > 0); + AIO_WAIT_WHILE(ctx, qatomic_read(&drained_end_counter) > 0); } static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore, @@ -116,13 +117,14 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll) { + AioContext *ctx = bdrv_child_get_parent_aio_context(c); IO_OR_GS_CODE(); c->parent_quiesce_counter++; if (c->klass->drained_begin) { c->klass->drained_begin(c); } if (poll) { - BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c)); + AIO_WAIT_WHILE(ctx, bdrv_parent_drained_poll_single(c)); } } diff --git a/block/mirror.c b/block/mirror.c index 1a75a47cc3..251adc5ae0 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -82,6 +82,7 @@ typedef struct MirrorBlockJob { int max_iov; bool initial_zeroing_ongoing; int in_active_write_counter; + int64_t active_write_bytes_in_flight; bool prepared; bool in_drain; } MirrorBlockJob; @@ -304,19 +305,21 @@ 
static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, } static inline void coroutine_fn -mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) +mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) { MirrorOp *op; QTAILQ_FOREACH(op, &s->ops_in_flight, next) { - /* Do not wait on pseudo ops, because it may in turn wait on + /* + * Do not wait on pseudo ops, because it may in turn wait on * some other operation to start, which may in fact be the * caller of this function. Since there is only one pseudo op * at any given time, we will always find some real operation - * to wait on. */ - if (!op->is_pseudo_op && op->is_in_flight && - op->is_active_write == active) - { + * to wait on. + * Also, do not wait on active operations, because they do not + * use up in-flight slots. + */ + if (!op->is_pseudo_op && op->is_in_flight && !op->is_active_write) { qemu_co_queue_wait(&op->waiting_requests, NULL); return; } @@ -324,13 +327,6 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) abort(); } -static inline void coroutine_fn -mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) -{ - /* Only non-active operations use up in-flight slots */ - mirror_wait_for_any_operation(s, false); -} - /* Perform a mirror copy operation. * * *op->bytes_handled is set to the number of bytes copied after and @@ -494,6 +490,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) } bdrv_dirty_bitmap_unlock(s->dirty_bitmap); + /* + * Wait for concurrent requests to @offset. The next loop will limit the + * copied area based on in_flight_bitmap so we only copy an area that does + * not overlap with concurrent in-flight requests. Still, we would like to + * copy something, so wait until there are at least no more requests to the + * very beginning of the area. + */ mirror_wait_on_conflicts(NULL, s, offset, 1); job_pause_point(&s->common.job); @@ -988,12 +991,6 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) int64_t cnt, delta; bool should_complete; - /* Do not start passive operations while there are active - * writes in progress */ - while (s->in_active_write_counter) { - mirror_wait_for_any_operation(s, true); - } - if (s->ret < 0) { ret = s->ret; goto immediate_exit; @@ -1010,7 +1007,9 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) /* cnt is the number of dirty bytes remaining and s->bytes_in_flight is * the number of bytes currently being processed; together those are * the current remaining operation length */ - job_progress_set_remaining(&s->common.job, s->bytes_in_flight + cnt); + job_progress_set_remaining(&s->common.job, + s->bytes_in_flight + cnt + + s->active_write_bytes_in_flight); /* Note that even when no rate limit is applied we need to yield * periodically with no pending I/O so that bdrv_drain_all() returns. 
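As an aside on the mirror.c hunks above: do_sync_target_write() raises the job's remaining total before issuing an active write and only bumps current progress after it completes, so mirror_run() must fold active_write_bytes_in_flight into the total it periodically recomputes, or current progress could otherwise overtake the total. A minimal sketch of that arithmetic, with hypothetical byte counts and an invented helper name (not part of the patch):

/*
 * Hypothetical illustration of the new remaining-work computation in
 * mirror_run(); the helper name and the byte counts are invented.
 */
#include <assert.h>
#include <stdint.h>

static int64_t mirror_remaining(int64_t dirty_cnt, int64_t bytes_in_flight,
                                int64_t active_write_bytes_in_flight)
{
    /* Mirrors the argument now passed to job_progress_set_remaining() */
    return bytes_in_flight + dirty_cnt + active_write_bytes_in_flight;
}

int main(void)
{
    /*
     * 64 KiB still dirty, 128 KiB of background copies in flight, plus one
     * 4 KiB active write: the total stays 4 KiB ahead of current progress
     * until the active write's own job_progress_update(4096) lands.
     */
    assert(mirror_remaining(64 * 1024, 128 * 1024, 4096) == 200704);
    return 0;
}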
@@ -1071,6 +1070,10 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) s->in_drain = true; bdrv_drained_begin(bs); + + /* Must be zero because we are drained */ + assert(s->in_active_write_counter == 0); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); if (cnt > 0 || mirror_flush(s) < 0) { bdrv_drained_end(bs); @@ -1306,6 +1309,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, } job_progress_increase_remaining(&job->common.job, bytes); + job->active_write_bytes_in_flight += bytes; switch (method) { case MIRROR_METHOD_COPY: @@ -1327,6 +1331,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, abort(); } + job->active_write_bytes_in_flight -= bytes; if (ret >= 0) { job_progress_update(&job->common.job, bytes); } else { @@ -1375,6 +1380,19 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s, s->in_active_write_counter++; + /* + * Wait for concurrent requests affecting the area. If there are already + * running requests that are copying off now-to-be stale data in the area, + * we must wait for them to finish before we begin writing fresh data to the + * target so that the write operations appear in the correct order. + * Note that background requests (see mirror_iteration()) in contrast only + * wait for conflicting requests at the start of the dirty area, and then + * (based on the in_flight_bitmap) truncate the area to copy so it will not + * conflict with any requests beyond that. For active writes, however, we + * cannot truncate that area. The request from our parent must be blocked + * until the area is copied in full. Therefore, we must wait for the whole + * area to become free of concurrent requests. + */ mirror_wait_on_conflicts(op, s, offset, bytes); bitmap_set(s->in_flight_bitmap, start_chunk, end_chunk - start_chunk); @@ -1420,11 +1438,13 @@ static int coroutine_fn bdrv_mirror_top_do_write(BlockDriverState *bs, MirrorOp *op = NULL; MirrorBDSOpaque *s = bs->opaque; int ret = 0; - bool copy_to_target; + bool copy_to_target = false; - copy_to_target = s->job->ret >= 0 && - !job_is_cancelled(&s->job->common.job) && - s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + if (s->job) { + copy_to_target = s->job->ret >= 0 && + !job_is_cancelled(&s->job->common.job) && + s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + } if (copy_to_target) { op = active_write_prepare(s->job, offset, bytes); @@ -1469,11 +1489,13 @@ static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs, QEMUIOVector bounce_qiov; void *bounce_buf; int ret = 0; - bool copy_to_target; + bool copy_to_target = false; - copy_to_target = s->job->ret >= 0 && - !job_is_cancelled(&s->job->common.job) && - s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + if (s->job) { + copy_to_target = s->job->ret >= 0 && + !job_is_cancelled(&s->job->common.job) && + s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + } if (copy_to_target) { /* The guest might concurrently modify the data to write; but diff --git a/blockjob.c b/blockjob.c index 2d86014fa5..f51d4e18f3 100644 --- a/blockjob.c +++ b/blockjob.c @@ -173,7 +173,8 @@ static bool child_job_change_aio_ctx(BdrvChild *c, AioContext *ctx, static AioContext *child_job_get_parent_aio_context(BdrvChild *c) { BlockJob *job = c->opaque; - GLOBAL_STATE_CODE(); + IO_CODE(); + JOB_LOCK_GUARD(); return job->job.aio_context; } diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index 8ca630e5ad..b17b29288c 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -1016,8 
+1016,6 @@ static void icc_pmr_write(CPUARMState *env, const ARMCPRegInfo *ri, trace_gicv3_icc_pmr_write(gicv3_redist_affid(cs), value); - value &= icc_fullprio_mask(cs); - if (arm_feature(env, ARM_FEATURE_EL3) && !arm_is_secure(env) && (env->cp15.scr_el3 & SCR_FIQ)) { /* NS access and Group 0 is inaccessible to NS: return the @@ -1029,6 +1027,7 @@ static void icc_pmr_write(CPUARMState *env, const ARMCPRegInfo *ri, } value = (value >> 1) | 0x80; } + value &= icc_fullprio_mask(cs); cs->icc_pmr_el1 = value; gicv3_cpuif_update(cs); } diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h index bb42ed9559..c7bd4a2088 100644 --- a/include/block/block-global-state.h +++ b/include/block/block-global-state.h @@ -220,7 +220,6 @@ void coroutine_fn bdrv_co_lock(BlockDriverState *bs); */ void coroutine_fn bdrv_co_unlock(BlockDriverState *bs); -AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c); bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, GHashTable *visited, Transaction *tran, Error **errp); diff --git a/include/block/block-io.h b/include/block/block-io.h index 770ddeb7c8..b099d7db45 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -171,6 +171,8 @@ void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event); */ AioContext *bdrv_get_aio_context(BlockDriverState *bs); +AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c); + /** * Move the current coroutine to the AioContext of @bs and return the old * AioContext of the coroutine. Increase bs->in_flight so that draining @bs diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 5a2cc077a0..31ae91e56e 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -911,8 +911,6 @@ struct BdrvChildClass { GHashTable *visited, Transaction *tran, Error **errp); - AioContext *(*get_parent_aio_context)(BdrvChild *child); - /* * I/O API functions. These functions are thread-safe. * @@ -929,6 +927,8 @@ struct BdrvChildClass { */ const char *(*get_name)(BdrvChild *child); + AioContext *(*get_parent_aio_context)(BdrvChild *child); + /* * If this pair of functions is implemented, the parent doesn't issue new * requests after returning from .drained_begin() until .drained_end() is diff --git a/qapi/block-core.json b/qapi/block-core.json index 6d904004f8..95ac4fa634 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3704,7 +3704,7 @@ # # Driver specific block device options for the nvme-io_uring backend. # -# @path: path to the image file +# @path: path to the NVMe namespace's character device (e.g. /dev/ng0n1). # # Since: 7.2 ## diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index be499b3f09..28d04b8258 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -445,32 +445,51 @@ static inline MemOp mo_b_d32(int b, MemOp ot) return b & 1 ? (ot == MO_16 ? MO_16 : MO_32) : MO_8; } -static void gen_op_mov_reg_v(DisasContext *s, MemOp ot, int reg, TCGv t0) +/* Compute the result of writing t0 to the OT-sized register REG. + * + * If DEST is NULL, store the result into the register and return the + * register's TCGv. + * + * If DEST is not NULL, store the result into DEST and return the + * register's TCGv. 
+ */ +static TCGv gen_op_deposit_reg_v(DisasContext *s, MemOp ot, int reg, TCGv dest, TCGv t0) { switch(ot) { case MO_8: - if (!byte_reg_is_xH(s, reg)) { - tcg_gen_deposit_tl(cpu_regs[reg], cpu_regs[reg], t0, 0, 8); - } else { - tcg_gen_deposit_tl(cpu_regs[reg - 4], cpu_regs[reg - 4], t0, 8, 8); + if (byte_reg_is_xH(s, reg)) { + dest = dest ? dest : cpu_regs[reg - 4]; + tcg_gen_deposit_tl(dest, cpu_regs[reg - 4], t0, 8, 8); + return cpu_regs[reg - 4]; } + dest = dest ? dest : cpu_regs[reg]; + tcg_gen_deposit_tl(dest, cpu_regs[reg], t0, 0, 8); break; case MO_16: - tcg_gen_deposit_tl(cpu_regs[reg], cpu_regs[reg], t0, 0, 16); + dest = dest ? dest : cpu_regs[reg]; + tcg_gen_deposit_tl(dest, cpu_regs[reg], t0, 0, 16); break; case MO_32: /* For x86_64, this sets the higher half of register to zero. For i386, this is equivalent to a mov. */ - tcg_gen_ext32u_tl(cpu_regs[reg], t0); + dest = dest ? dest : cpu_regs[reg]; + tcg_gen_ext32u_tl(dest, t0); break; #ifdef TARGET_X86_64 case MO_64: - tcg_gen_mov_tl(cpu_regs[reg], t0); + dest = dest ? dest : cpu_regs[reg]; + tcg_gen_mov_tl(dest, t0); break; #endif default: tcg_abort(); } + return cpu_regs[reg]; +} + +static void gen_op_mov_reg_v(DisasContext *s, MemOp ot, int reg, TCGv t0) +{ + gen_op_deposit_reg_v(s, ot, reg, NULL, t0); } static inline @@ -3767,7 +3786,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) case 0x1b0: case 0x1b1: /* cmpxchg Ev, Gv */ { - TCGv oldv, newv, cmpv; + TCGv oldv, newv, cmpv, dest; ot = mo_b_d(b, dflag); modrm = x86_ldub_code(env, s); @@ -3778,7 +3797,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) cmpv = tcg_temp_new(); gen_op_mov_v_reg(s, ot, newv, reg); tcg_gen_mov_tl(cmpv, cpu_regs[R_EAX]); - + gen_extu(ot, cmpv); if (s->prefix & PREFIX_LOCK) { if (mod == 3) { goto illegal_op; @@ -3786,32 +3805,43 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) gen_lea_modrm(env, s, modrm); tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, cmpv, newv, s->mem_index, ot | MO_LE); - gen_op_mov_reg_v(s, ot, R_EAX, oldv); } else { if (mod == 3) { rm = (modrm & 7) | REX_B(s); gen_op_mov_v_reg(s, ot, oldv, rm); + gen_extu(ot, oldv); + + /* + * Unlike the memory case, where "the destination operand receives + * a write cycle without regard to the result of the comparison", + * rm must not be touched altogether if the write fails, including + * not zero-extending it on 64-bit processors. So, precompute + * the result of a successful writeback and perform the movcond + * directly on cpu_regs. Also need to write accumulator first, in + * case rm is part of RAX too. + */ + dest = gen_op_deposit_reg_v(s, ot, rm, newv, newv); + tcg_gen_movcond_tl(TCG_COND_EQ, dest, oldv, cmpv, newv, dest); } else { gen_lea_modrm(env, s, modrm); gen_op_ld_v(s, ot, oldv, s->A0); - rm = 0; /* avoid warning */ - } - gen_extu(ot, oldv); - gen_extu(ot, cmpv); - /* store value = (old == cmp ? 
new : old); */ - tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv); - if (mod == 3) { - gen_op_mov_reg_v(s, ot, R_EAX, oldv); - gen_op_mov_reg_v(s, ot, rm, newv); - } else { - /* Perform an unconditional store cycle like physical cpu; - must be before changing accumulator to ensure - idempotency if the store faults and the instruction - is restarted */ + + /* + * Perform an unconditional store cycle like physical cpu; + * must be before changing accumulator to ensure + * idempotency if the store faults and the instruction + * is restarted + */ + tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv); gen_op_st_v(s, ot, newv, s->A0); - gen_op_mov_reg_v(s, ot, R_EAX, oldv); } } + /* + * Write EAX only if the cmpxchg fails; reuse newv as the destination, + * since it's dead here. + */ + dest = gen_op_deposit_reg_v(s, ot, R_EAX, newv, oldv); + tcg_gen_movcond_tl(TCG_COND_EQ, dest, oldv, cmpv, dest, newv); tcg_gen_mov_tl(cpu_cc_src, oldv); tcg_gen_mov_tl(s->cc_srcT, cmpv); tcg_gen_sub_tl(cpu_cc_dst, cmpv, oldv); @@ -5220,7 +5250,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) case 0x9e: /* sahf */ if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) goto illegal_op; - gen_op_mov_v_reg(s, MO_8, s->T0, R_AH); + tcg_gen_shri_tl(s->T0, cpu_regs[R_EAX], 8); gen_compute_eflags(s); tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O); tcg_gen_andi_tl(s->T0, s->T0, CC_S | CC_Z | CC_A | CC_P | CC_C); @@ -5232,7 +5262,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) gen_compute_eflags(s); /* Note: gen_compute_eflags() only gives the condition codes */ tcg_gen_ori_tl(s->T0, cpu_cc_src, 0x02); - gen_op_mov_reg_v(s, MO_8, R_AH, s->T0); + tcg_gen_deposit_tl(cpu_regs[R_EAX], cpu_regs[R_EAX], s->T0, 8, 8); break; case 0xf5: /* cmc */ gen_compute_eflags(s); diff --git a/tests/qemu-iotests/151 b/tests/qemu-iotests/151 index 93d14193d0..b4d1bc2553 100755 --- a/tests/qemu-iotests/151 +++ b/tests/qemu-iotests/151 @@ -19,7 +19,11 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>.
# +import math import os +import subprocess +import time +from typing import List, Optional import iotests from iotests import qemu_img @@ -50,7 +54,7 @@ class TestActiveMirror(iotests.QMPTestCase): self.vm = iotests.VM() self.vm.add_drive_raw(self.vm.qmp_to_opts(blk_source)) self.vm.add_blockdev(self.vm.qmp_to_opts(blk_target)) - self.vm.add_device('virtio-blk,drive=source') + self.vm.add_device('virtio-blk,id=vblk,drive=source') self.vm.launch() def tearDown(self): @@ -192,6 +196,227 @@ class TestActiveMirror(iotests.QMPTestCase): self.potential_writes_in_flight = False +class TestThrottledWithNbdExportBase(iotests.QMPTestCase): + image_len = 128 * 1024 * 1024 # MB + iops: Optional[int] = None + background_processes: List['subprocess.Popen[str]'] = [] + + def setUp(self): + # Must be set by subclasses + self.assertIsNotNone(self.iops) + + qemu_img('create', '-f', iotests.imgfmt, source_img, '128M') + qemu_img('create', '-f', iotests.imgfmt, target_img, '128M') + + self.vm = iotests.VM() + self.vm.launch() + + result = self.vm.qmp('object-add', **{ + 'qom-type': 'throttle-group', + 'id': 'thrgr', + 'limits': { + 'iops-total': self.iops, + 'iops-total-max': self.iops + } + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-add', **{ + 'node-name': 'source-node', + 'driver': 'throttle', + 'throttle-group': 'thrgr', + 'file': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': source_img + } + } + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-add', **{ + 'node-name': 'target-node', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': target_img + } + }) + self.assert_qmp(result, 'return', {}) + + self.nbd_sock = iotests.file_path('nbd.sock', + base_dir=iotests.sock_dir) + self.nbd_url = f'nbd+unix:///source-node?socket={self.nbd_sock}' + + result = self.vm.qmp('nbd-server-start', addr={ + 'type': 'unix', + 'data': { + 'path': self.nbd_sock + } + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('block-export-add', id='exp0', type='nbd', + node_name='source-node', writable=True) + self.assert_qmp(result, 'return', {}) + + def tearDown(self): + # Wait for background requests to settle + try: + while True: + p = self.background_processes.pop() + while True: + try: + p.wait(timeout=0.0) + break + except subprocess.TimeoutExpired: + self.vm.qtest(f'clock_step {1 * 1000 * 1000 * 1000}') + except IndexError: + pass + + # Cancel ongoing block jobs + for job in self.vm.qmp('query-jobs')['return']: + self.vm.qmp('block-job-cancel', device=job['id'], force=True) + + while True: + self.vm.qtest(f'clock_step {1 * 1000 * 1000 * 1000}') + if len(self.vm.qmp('query-jobs')['return']) == 0: + break + + self.vm.shutdown() + os.remove(source_img) + os.remove(target_img) + + +class TestLowThrottledWithNbdExport(TestThrottledWithNbdExportBase): + iops = 16 + + def testUnderLoad(self): + ''' + Throttle the source node, then issue a whole bunch of external requests + while the mirror job (in write-blocking mode) is running. We want to + see background requests being issued even while the source is under + full load by active writes, so that progress can be made towards READY. + ''' + + # Fill the first half of the source image; do not fill the second half, + # that is where we will have active requests occur. 
This ensures that + # active mirroring itself will not directly contribute to the job's + # progress (because when the job was started, those areas were not + # intended to be copied, so active mirroring will only lead to not + # losing progress, but also not making any). + self.vm.hmp_qemu_io('source-node', + f'aio_write -P 1 0 {self.image_len // 2}') + self.vm.qtest(f'clock_step {1 * 1000 * 1000 * 1000}') + + # Launch the mirror job + mirror_buf_size = 65536 + result = self.vm.qmp('blockdev-mirror', + job_id='mirror', + filter_node_name='mirror-node', + device='source-node', + target='target-node', + sync='full', + copy_mode='write-blocking', + buf_size=mirror_buf_size) + self.assert_qmp(result, 'return', {}) + + # We create the external requests via qemu-io processes on the NBD + # server. Have their offset start in the middle of the image so they + # do not overlap with the background requests (which start from the + # beginning). + active_request_offset = self.image_len // 2 + active_request_len = 4096 + + # Create enough requests to saturate the node for 5 seconds + for _ in range(0, 5 * self.iops): + req = f'write -P 42 {active_request_offset} {active_request_len}' + active_request_offset += active_request_len + p = iotests.qemu_io_popen('-f', 'nbd', self.nbd_url, '-c', req) + self.background_processes += [p] + + # Now advance the clock one I/O operation at a time by the 4 seconds + # (i.e. one less than 5). We expect the mirror job to issue background + # operations here, even though active requests are still in flight. + # The active requests will take precedence, however, because they have + # been issued earlier than mirror's background requests. + # Once the active requests we have started above are done (i.e. after 5 + # virtual seconds), we expect those background requests to be worked + # on. We only advance 4 seconds here to avoid race conditions. + for _ in range(0, 4 * self.iops): + step = math.ceil(1 * 1000 * 1000 * 1000 / self.iops) + self.vm.qtest(f'clock_step {step}') + + # Note how much remains to be done until the mirror job is finished + job_status = self.vm.qmp('query-jobs')['return'][0] + start_remaining = job_status['total-progress'] - \ + job_status['current-progress'] + + # Create a whole bunch of more active requests + for _ in range(0, 10 * self.iops): + req = f'write -P 42 {active_request_offset} {active_request_len}' + active_request_offset += active_request_len + p = iotests.qemu_io_popen('-f', 'nbd', self.nbd_url, '-c', req) + self.background_processes += [p] + + # Let the clock advance more. After 1 second, as noted above, we + # expect the background requests to be worked on. Give them a couple + # of seconds (specifically 4) to see their impact. + for _ in range(0, 5 * self.iops): + step = math.ceil(1 * 1000 * 1000 * 1000 / self.iops) + self.vm.qtest(f'clock_step {step}') + + # Note how much remains to be done now. We expect this number to be + # reduced thanks to those background requests. + job_status = self.vm.qmp('query-jobs')['return'][0] + end_remaining = job_status['total-progress'] - \ + job_status['current-progress'] + + # See that indeed progress was being made on the job, even while the + # node was saturated with active requests + self.assertGreater(start_remaining - end_remaining, 0) + + +class TestHighThrottledWithNbdExport(TestThrottledWithNbdExportBase): + iops = 1024 + + def testActiveOnCreation(self): + ''' + Issue requests on the mirror source node right as the mirror is + instated. 
It's possible that requests occur before the actual job is + created, but after the node has been put into the graph. Write + requests across the node must in that case be forwarded to the source + node without attempting to mirror them (there is no job object yet, so + attempting to access it would cause a segfault). + We do this with a lightly throttled node (i.e. quite high IOPS limit). + Using throttling seems to increase reproductivity, but if the limit is + too low, all requests allowed per second will be submitted before + mirror_start_job() gets to the problematic point. + ''' + + # Let qemu-img bench create write requests (enough for two seconds on + # the virtual clock) + bench_args = ['bench', '-w', '-d', '1024', '-f', 'nbd', + '-c', str(self.iops * 2), self.nbd_url] + p = iotests.qemu_tool_popen(iotests.qemu_img_args + bench_args) + self.background_processes += [p] + + # Give qemu-img bench time to start up and issue requests + time.sleep(1.0) + # Flush the request queue, so new requests can come in right as we + # start blockdev-mirror + self.vm.qtest(f'clock_step {1 * 1000 * 1000 * 1000}') + + result = self.vm.qmp('blockdev-mirror', + job_id='mirror', + device='source-node', + target='target-node', + sync='full', + copy_mode='write-blocking') + self.assert_qmp(result, 'return', {}) + + if __name__ == '__main__': iotests.main(supported_fmts=['qcow2', 'raw'], supported_protocols=['file']) diff --git a/tests/qemu-iotests/151.out b/tests/qemu-iotests/151.out index 89968f35d7..3f8a935a08 100644 --- a/tests/qemu-iotests/151.out +++ b/tests/qemu-iotests/151.out @@ -1,5 +1,5 @@ -.... +...... ---------------------------------------------------------------------- -Ran 4 tests +Ran 6 tests OK diff --git a/tests/qemu-iotests/tests/stream-under-throttle b/tests/qemu-iotests/tests/stream-under-throttle new file mode 100755 index 0000000000..8d2d9e1684 --- /dev/null +++ b/tests/qemu-iotests/tests/stream-under-throttle @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# group: rw +# +# Test streaming with throttle nodes on top +# +# Copyright (C) 2022 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import asyncio +import os +from typing import List +import iotests +from iotests import qemu_img_create, qemu_io + + +image_size = 256 * 1024 * 1024 +base_img = os.path.join(iotests.test_dir, 'base.img') +top_img = os.path.join(iotests.test_dir, 'top.img') + + +class TcgVM(iotests.VM): + ''' + Variant of iotests.VM that uses -accel tcg. Simply using + iotests.VM.add_args('-accel', 'tcg') is not sufficient, because that will + put -accel qtest before -accel tcg, and -accel arguments are prioritized in + the order they appear. 
+ ''' + @property + def _base_args(self) -> List[str]: + # Put -accel tcg first so it takes precedence + return ['-accel', 'tcg'] + super()._base_args + + +class TestStreamWithThrottle(iotests.QMPTestCase): + def setUp(self) -> None: + ''' + Create a simple backing chain between two images, write something to + the base image. Attach them to the VM underneath two throttle nodes, + one of which has actually no limits set, but the other does. Then put + a virtio-blk device on top. + This test configuration has been taken from + https://gitlab.com/qemu-project/qemu/-/issues/1215 + ''' + qemu_img_create('-f', iotests.imgfmt, base_img, str(image_size)) + qemu_img_create('-f', iotests.imgfmt, '-b', base_img, '-F', + iotests.imgfmt, top_img, str(image_size)) + + # Write something to stream + qemu_io(base_img, '-c', f'write 0 {image_size}') + + blockdev = { + 'driver': 'throttle', + 'node-name': 'throttled-node', + 'throttle-group': 'thrgr-limited', + 'file': { + 'driver': 'throttle', + 'throttle-group': 'thrgr-unlimited', + 'file': { + 'driver': iotests.imgfmt, + 'node-name': 'unthrottled-node', + 'file': { + 'driver': 'file', + 'filename': top_img + } + } + } + } + + # Issue 1215 is not reproducible in qtest mode, which is why we need to + # create an -accel tcg VM + self.vm = TcgVM() + self.vm.add_object('iothread,id=iothr0') + self.vm.add_object('throttle-group,id=thrgr-unlimited') + self.vm.add_object('throttle-group,id=thrgr-limited,' + 'x-iops-total=10000,x-bps-total=104857600') + self.vm.add_blockdev(self.vm.qmp_to_opts(blockdev)) + self.vm.add_device('virtio-blk,iothread=iothr0,drive=throttled-node') + self.vm.launch() + + def tearDown(self) -> None: + self.vm.shutdown() + os.remove(top_img) + os.remove(base_img) + + def test_stream(self) -> None: + ''' + Do a simple stream beneath the two throttle nodes. Should complete + with no problems. + ''' + result = self.vm.qmp('block-stream', + job_id='stream', + device='unthrottled-node') + self.assert_qmp(result, 'return', {}) + + # Should succeed and not time out + try: + self.vm.run_job('stream') + except asyncio.TimeoutError: + # VM may be stuck, kill it before tearDown() + self.vm.kill() + raise + + +if __name__ == '__main__': + # Must support backing images + iotests.main(supported_fmts=['qcow', 'qcow2', 'qed'], + supported_protocols=['file'], + required_fmts=['throttle']) diff --git a/tests/qemu-iotests/tests/stream-under-throttle.out b/tests/qemu-iotests/tests/stream-under-throttle.out new file mode 100644 index 0000000000..ae1213e6f8 --- /dev/null +++ b/tests/qemu-iotests/tests/stream-under-throttle.out @@ -0,0 +1,5 @@ +. 
+---------------------------------------------------------------------- +Ran 1 tests + +OK diff --git a/tests/tcg/x86_64/Makefile.target b/tests/tcg/x86_64/Makefile.target index 6895db1c43..4eac78293f 100644 --- a/tests/tcg/x86_64/Makefile.target +++ b/tests/tcg/x86_64/Makefile.target @@ -11,6 +11,7 @@ include $(SRC_PATH)/tests/tcg/i386/Makefile.target ifeq ($(filter %-linux-user, $(TARGET)),$(TARGET)) X86_64_TESTS += vsyscall X86_64_TESTS += noexec +X86_64_TESTS += cmpxchg TESTS=$(MULTIARCH_TESTS) $(X86_64_TESTS) test-x86_64 else TESTS=$(MULTIARCH_TESTS) diff --git a/tests/tcg/x86_64/cmpxchg.c b/tests/tcg/x86_64/cmpxchg.c new file mode 100644 index 0000000000..5891735161 --- /dev/null +++ b/tests/tcg/x86_64/cmpxchg.c @@ -0,0 +1,42 @@ +#include <assert.h> + +static int mem; + +static unsigned long test_cmpxchgb(unsigned long orig) +{ + unsigned long ret; + mem = orig; + asm("cmpxchgb %b[cmp],%[mem]" + : [ mem ] "+m"(mem), [ rax ] "=a"(ret) + : [ cmp ] "r"(0x77), "a"(orig)); + return ret; +} + +static unsigned long test_cmpxchgw(unsigned long orig) +{ + unsigned long ret; + mem = orig; + asm("cmpxchgw %w[cmp],%[mem]" + : [ mem ] "+m"(mem), [ rax ] "=a"(ret) + : [ cmp ] "r"(0x7777), "a"(orig)); + return ret; +} + +static unsigned long test_cmpxchgl(unsigned long orig) +{ + unsigned long ret; + mem = orig; + asm("cmpxchgl %[cmp],%[mem]" + : [ mem ] "+m"(mem), [ rax ] "=a"(ret) + : [ cmp ] "r"(0x77777777u), "a"(orig)); + return ret; +} + +int main() +{ + unsigned long test = 0xdeadbeef12345678ull; + assert(test == test_cmpxchgb(test)); + assert(test == test_cmpxchgw(test)); + assert(test == test_cmpxchgl(test)); + return 0; +}
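The cmpxchg test above covers only the success path: the accumulator's low bytes always match memory, so each assert checks that RAX is left entirely untouched, including its high 32 bits for the 32-bit form. A companion sketch for the failure path, where the accumulator is written back and the 32-bit form zero-extends; the function name and constants here are invented, not part of the patch:

#include <assert.h>

static int mem2;

static unsigned long fail_cmpxchgl(unsigned long orig)
{
    unsigned long ret;
    mem2 = 0x11223344;  /* differs from EAX, so the comparison fails */
    asm("cmpxchgl %[cmp],%[mem]"
        : [ mem ] "+m"(mem2), [ rax ] "=a"(ret)
        : [ cmp ] "r"(0x77777777u), "a"(orig));
    return ret;
}

int main(void)
{
    /*
     * EAX (0x12345678) != mem2: the old memory value is loaded into EAX,
     * which on x86-64 zero-extends into RAX, so the input's high bits
     * must be gone afterwards.
     */
    assert(fail_cmpxchgl(0xdeadbeef12345678ull) == 0x11223344ul);
    return 0;
}

Note that this failure path behaved correctly even before the fix: writing the loaded value to EAX (with zero-extension) is exactly what the architecture mandates there. Only the success path wrongly clobbered RAX's high bits.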