diff --git a/MAINTAINERS b/MAINTAINERS index caba73ec41..be151f0024 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1748,8 +1748,8 @@ F: tests/qtest/intel-hda-test.c F: tests/qtest/fuzz-sb16-test.c Xilinx CAN -M: Vikram Garhwal -M: Francisco Iglesias +M: Vikram Garhwal +M: Francisco Iglesias S: Maintained F: hw/net/can/xlnx-* F: include/hw/net/xlnx-* diff --git a/VERSION b/VERSION index 628ab8965f..7038194808 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -7.1.90 +7.1.91 diff --git a/block.c b/block.c index c5e20c0bea..a18f052374 100644 --- a/block.c +++ b/block.c @@ -1543,7 +1543,7 @@ const BdrvChildClass child_of_bds = { AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c) { - GLOBAL_STATE_CODE(); + IO_CODE(); return c->klass->get_parent_aio_context(c); } diff --git a/block/blkio.c b/block/blkio.c index 620fab28a7..5eae3adfaf 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -993,7 +993,6 @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp) { \ .format_name = name, \ .protocol_name = name, \ - .has_variable_length = true, \ .instance_size = sizeof(BDRVBlkioState), \ .bdrv_file_open = blkio_file_open, \ .bdrv_close = blkio_close, \ diff --git a/block/block-backend.c b/block/block-backend.c index c0c7d56c8d..b48c91f4e1 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -311,6 +311,7 @@ static void blk_root_detach(BdrvChild *child) static AioContext *blk_root_get_parent_aio_context(BdrvChild *c) { BlockBackend *blk = c->opaque; + IO_CODE(); return blk_get_aio_context(blk); } @@ -2157,6 +2158,11 @@ static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context, return ret; } } + /* + * Make blk->ctx consistent with the root node before we invoke any + * other operations like drain that might inquire blk->ctx + */ + blk->ctx = new_context; if (tgm->throttle_state) { bdrv_drained_begin(bs); throttle_group_detach_aio_context(tgm); @@ -2165,9 +2171,10 @@ static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context, } bdrv_unref(bs); + } else { + blk->ctx = new_context; } - blk->ctx = new_context; return 0; } diff --git a/block/io.c b/block/io.c index 34b30e304e..b9424024f9 100644 --- a/block/io.c +++ b/block/io.c @@ -71,9 +71,10 @@ static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c, void bdrv_parent_drained_end_single(BdrvChild *c) { int drained_end_counter = 0; + AioContext *ctx = bdrv_child_get_parent_aio_context(c); IO_OR_GS_CODE(); bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter); - BDRV_POLL_WHILE(c->bs, qatomic_read(&drained_end_counter) > 0); + AIO_WAIT_WHILE(ctx, qatomic_read(&drained_end_counter) > 0); } static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore, @@ -116,13 +117,14 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll) { + AioContext *ctx = bdrv_child_get_parent_aio_context(c); IO_OR_GS_CODE(); c->parent_quiesce_counter++; if (c->klass->drained_begin) { c->klass->drained_begin(c); } if (poll) { - BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c)); + AIO_WAIT_WHILE(ctx, bdrv_parent_drained_poll_single(c)); } } diff --git a/block/mirror.c b/block/mirror.c index 1a75a47cc3..251adc5ae0 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -82,6 +82,7 @@ typedef struct MirrorBlockJob { int max_iov; bool initial_zeroing_ongoing; int in_active_write_counter; + int64_t active_write_bytes_in_flight; bool prepared; bool in_drain; } MirrorBlockJob; @@ -304,19 +305,21 @@ 
static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, } static inline void coroutine_fn -mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) +mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) { MirrorOp *op; QTAILQ_FOREACH(op, &s->ops_in_flight, next) { - /* Do not wait on pseudo ops, because it may in turn wait on + /* + * Do not wait on pseudo ops, because it may in turn wait on * some other operation to start, which may in fact be the * caller of this function. Since there is only one pseudo op * at any given time, we will always find some real operation - * to wait on. */ - if (!op->is_pseudo_op && op->is_in_flight && - op->is_active_write == active) - { + * to wait on. + * Also, do not wait on active operations, because they do not + * use up in-flight slots. + */ + if (!op->is_pseudo_op && op->is_in_flight && !op->is_active_write) { qemu_co_queue_wait(&op->waiting_requests, NULL); return; } @@ -324,13 +327,6 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) abort(); } -static inline void coroutine_fn -mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) -{ - /* Only non-active operations use up in-flight slots */ - mirror_wait_for_any_operation(s, false); -} - /* Perform a mirror copy operation. * * *op->bytes_handled is set to the number of bytes copied after and @@ -494,6 +490,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) } bdrv_dirty_bitmap_unlock(s->dirty_bitmap); + /* + * Wait for concurrent requests to @offset. The next loop will limit the + * copied area based on in_flight_bitmap so we only copy an area that does + * not overlap with concurrent in-flight requests. Still, we would like to + * copy something, so wait until there are at least no more requests to the + * very beginning of the area. + */ mirror_wait_on_conflicts(NULL, s, offset, 1); job_pause_point(&s->common.job); @@ -988,12 +991,6 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) int64_t cnt, delta; bool should_complete; - /* Do not start passive operations while there are active - * writes in progress */ - while (s->in_active_write_counter) { - mirror_wait_for_any_operation(s, true); - } - if (s->ret < 0) { ret = s->ret; goto immediate_exit; @@ -1010,7 +1007,9 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) /* cnt is the number of dirty bytes remaining and s->bytes_in_flight is * the number of bytes currently being processed; together those are * the current remaining operation length */ - job_progress_set_remaining(&s->common.job, s->bytes_in_flight + cnt); + job_progress_set_remaining(&s->common.job, + s->bytes_in_flight + cnt + + s->active_write_bytes_in_flight); /* Note that even when no rate limit is applied we need to yield * periodically with no pending I/O so that bdrv_drain_all() returns. 
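As an aside on the mirror.c hunks above: do_sync_target_write() raises the job's remaining total before issuing an active write and only bumps current progress after it completes, so mirror_run() must fold active_write_bytes_in_flight into the total it periodically recomputes, or current progress could otherwise overtake the total. A minimal sketch of that arithmetic, with hypothetical byte counts and an invented helper name (not part of the patch):

/*
 * Hypothetical illustration of the new remaining-work computation in
 * mirror_run(); the helper name and the byte counts are invented.
 */
#include <assert.h>
#include <stdint.h>

static int64_t mirror_remaining(int64_t dirty_cnt, int64_t bytes_in_flight,
                                int64_t active_write_bytes_in_flight)
{
    /* Mirrors the argument now passed to job_progress_set_remaining() */
    return bytes_in_flight + dirty_cnt + active_write_bytes_in_flight;
}

int main(void)
{
    /*
     * 64 KiB still dirty, 128 KiB of background copies in flight, plus one
     * 4 KiB active write: the total stays 4 KiB ahead of current progress
     * until the active write's own job_progress_update(4096) lands.
     */
    assert(mirror_remaining(64 * 1024, 128 * 1024, 4096) == 200704);
    return 0;
}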
@@ -1071,6 +1070,10 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) s->in_drain = true; bdrv_drained_begin(bs); + + /* Must be zero because we are drained */ + assert(s->in_active_write_counter == 0); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); if (cnt > 0 || mirror_flush(s) < 0) { bdrv_drained_end(bs); @@ -1306,6 +1309,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, } job_progress_increase_remaining(&job->common.job, bytes); + job->active_write_bytes_in_flight += bytes; switch (method) { case MIRROR_METHOD_COPY: @@ -1327,6 +1331,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, abort(); } + job->active_write_bytes_in_flight -= bytes; if (ret >= 0) { job_progress_update(&job->common.job, bytes); } else { @@ -1375,6 +1380,19 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s, s->in_active_write_counter++; + /* + * Wait for concurrent requests affecting the area. If there are already + * running requests that are copying off now-to-be stale data in the area, + * we must wait for them to finish before we begin writing fresh data to the + * target so that the write operations appear in the correct order. + * Note that background requests (see mirror_iteration()) in contrast only + * wait for conflicting requests at the start of the dirty area, and then + * (based on the in_flight_bitmap) truncate the area to copy so it will not + * conflict with any requests beyond that. For active writes, however, we + * cannot truncate that area. The request from our parent must be blocked + * until the area is copied in full. Therefore, we must wait for the whole + * area to become free of concurrent requests. + */ mirror_wait_on_conflicts(op, s, offset, bytes); bitmap_set(s->in_flight_bitmap, start_chunk, end_chunk - start_chunk); @@ -1420,11 +1438,13 @@ static int coroutine_fn bdrv_mirror_top_do_write(BlockDriverState *bs, MirrorOp *op = NULL; MirrorBDSOpaque *s = bs->opaque; int ret = 0; - bool copy_to_target; + bool copy_to_target = false; - copy_to_target = s->job->ret >= 0 && - !job_is_cancelled(&s->job->common.job) && - s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + if (s->job) { + copy_to_target = s->job->ret >= 0 && + !job_is_cancelled(&s->job->common.job) && + s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + } if (copy_to_target) { op = active_write_prepare(s->job, offset, bytes); @@ -1469,11 +1489,13 @@ static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs, QEMUIOVector bounce_qiov; void *bounce_buf; int ret = 0; - bool copy_to_target; + bool copy_to_target = false; - copy_to_target = s->job->ret >= 0 && - !job_is_cancelled(&s->job->common.job) && - s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + if (s->job) { + copy_to_target = s->job->ret >= 0 && + !job_is_cancelled(&s->job->common.job) && + s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; + } if (copy_to_target) { /* The guest might concurrently modify the data to write; but diff --git a/blockjob.c b/blockjob.c index 2d86014fa5..f51d4e18f3 100644 --- a/blockjob.c +++ b/blockjob.c @@ -173,7 +173,8 @@ static bool child_job_change_aio_ctx(BdrvChild *c, AioContext *ctx, static AioContext *child_job_get_parent_aio_context(BdrvChild *c) { BlockJob *job = c->opaque; - GLOBAL_STATE_CODE(); + IO_CODE(); + JOB_LOCK_GUARD(); return job->job.aio_context; } diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index 8ca630e5ad..b17b29288c 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -1016,8 
+1016,6 @@ static void icc_pmr_write(CPUARMState *env, const ARMCPRegInfo *ri, trace_gicv3_icc_pmr_write(gicv3_redist_affid(cs), value); - value &= icc_fullprio_mask(cs); - if (arm_feature(env, ARM_FEATURE_EL3) && !arm_is_secure(env) && (env->cp15.scr_el3 & SCR_FIQ)) { /* NS access and Group 0 is inaccessible to NS: return the @@ -1029,6 +1027,7 @@ static void icc_pmr_write(CPUARMState *env, const ARMCPRegInfo *ri, } value = (value >> 1) | 0x80; } + value &= icc_fullprio_mask(cs); cs->icc_pmr_el1 = value; gicv3_cpuif_update(cs); } diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h index bb42ed9559..c7bd4a2088 100644 --- a/include/block/block-global-state.h +++ b/include/block/block-global-state.h @@ -220,7 +220,6 @@ void coroutine_fn bdrv_co_lock(BlockDriverState *bs); */ void coroutine_fn bdrv_co_unlock(BlockDriverState *bs); -AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c); bool bdrv_child_change_aio_context(BdrvChild *c, AioContext *ctx, GHashTable *visited, Transaction *tran, Error **errp); diff --git a/include/block/block-io.h b/include/block/block-io.h index 770ddeb7c8..b099d7db45 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -171,6 +171,8 @@ void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event); */ AioContext *bdrv_get_aio_context(BlockDriverState *bs); +AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c); + /** * Move the current coroutine to the AioContext of @bs and return the old * AioContext of the coroutine. Increase bs->in_flight so that draining @bs diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 5a2cc077a0..31ae91e56e 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -911,8 +911,6 @@ struct BdrvChildClass { GHashTable *visited, Transaction *tran, Error **errp); - AioContext *(*get_parent_aio_context)(BdrvChild *child); - /* * I/O API functions. These functions are thread-safe. * @@ -929,6 +927,8 @@ struct BdrvChildClass { */ const char *(*get_name)(BdrvChild *child); + AioContext *(*get_parent_aio_context)(BdrvChild *child); + /* * If this pair of functions is implemented, the parent doesn't issue new * requests after returning from .drained_begin() until .drained_end() is diff --git a/qapi/block-core.json b/qapi/block-core.json index 6d904004f8..95ac4fa634 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3704,7 +3704,7 @@ # # Driver specific block device options for the nvme-io_uring backend. # -# @path: path to the image file +# @path: path to the NVMe namespace's character device (e.g. /dev/ng0n1). # # Since: 7.2 ## diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index be499b3f09..28d04b8258 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -445,32 +445,51 @@ static inline MemOp mo_b_d32(int b, MemOp ot) return b & 1 ? (ot == MO_16 ? MO_16 : MO_32) : MO_8; } -static void gen_op_mov_reg_v(DisasContext *s, MemOp ot, int reg, TCGv t0) +/* Compute the result of writing t0 to the OT-sized register REG. + * + * If DEST is NULL, store the result into the register and return the + * register's TCGv. + * + * If DEST is not NULL, store the result into DEST and return the + * register's TCGv. 
+ */ +static TCGv gen_op_deposit_reg_v(DisasContext *s, MemOp ot, int reg, TCGv dest, TCGv t0) { switch(ot) { case MO_8: - if (!byte_reg_is_xH(s, reg)) { - tcg_gen_deposit_tl(cpu_regs[reg], cpu_regs[reg], t0, 0, 8); - } else { - tcg_gen_deposit_tl(cpu_regs[reg - 4], cpu_regs[reg - 4], t0, 8, 8); + if (byte_reg_is_xH(s, reg)) { + dest = dest ? dest : cpu_regs[reg - 4]; + tcg_gen_deposit_tl(dest, cpu_regs[reg - 4], t0, 8, 8); + return cpu_regs[reg - 4]; } + dest = dest ? dest : cpu_regs[reg]; + tcg_gen_deposit_tl(dest, cpu_regs[reg], t0, 0, 8); break; case MO_16: - tcg_gen_deposit_tl(cpu_regs[reg], cpu_regs[reg], t0, 0, 16); + dest = dest ? dest : cpu_regs[reg]; + tcg_gen_deposit_tl(dest, cpu_regs[reg], t0, 0, 16); break; case MO_32: /* For x86_64, this sets the higher half of register to zero. For i386, this is equivalent to a mov. */ - tcg_gen_ext32u_tl(cpu_regs[reg], t0); + dest = dest ? dest : cpu_regs[reg]; + tcg_gen_ext32u_tl(dest, t0); break; #ifdef TARGET_X86_64 case MO_64: - tcg_gen_mov_tl(cpu_regs[reg], t0); + dest = dest ? dest : cpu_regs[reg]; + tcg_gen_mov_tl(dest, t0); break; #endif default: tcg_abort(); } + return cpu_regs[reg]; +} + +static void gen_op_mov_reg_v(DisasContext *s, MemOp ot, int reg, TCGv t0) +{ + gen_op_deposit_reg_v(s, ot, reg, NULL, t0); } static inline @@ -3767,7 +3786,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) case 0x1b0: case 0x1b1: /* cmpxchg Ev, Gv */ { - TCGv oldv, newv, cmpv; + TCGv oldv, newv, cmpv, dest; ot = mo_b_d(b, dflag); modrm = x86_ldub_code(env, s); @@ -3778,7 +3797,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) cmpv = tcg_temp_new(); gen_op_mov_v_reg(s, ot, newv, reg); tcg_gen_mov_tl(cmpv, cpu_regs[R_EAX]); - + gen_extu(ot, cmpv); if (s->prefix & PREFIX_LOCK) { if (mod == 3) { goto illegal_op; @@ -3786,32 +3805,43 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) gen_lea_modrm(env, s, modrm); tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, cmpv, newv, s->mem_index, ot | MO_LE); - gen_op_mov_reg_v(s, ot, R_EAX, oldv); } else { if (mod == 3) { rm = (modrm & 7) | REX_B(s); gen_op_mov_v_reg(s, ot, oldv, rm); + gen_extu(ot, oldv); + + /* + * Unlike the memory case, where "the destination operand receives + * a write cycle without regard to the result of the comparison", + * rm must not be touched altogether if the write fails, including + * not zero-extending it on 64-bit processors. So, precompute + * the result of a successful writeback and perform the movcond + * directly on cpu_regs. Also need to write accumulator first, in + * case rm is part of RAX too. + */ + dest = gen_op_deposit_reg_v(s, ot, rm, newv, newv); + tcg_gen_movcond_tl(TCG_COND_EQ, dest, oldv, cmpv, newv, dest); } else { gen_lea_modrm(env, s, modrm); gen_op_ld_v(s, ot, oldv, s->A0); - rm = 0; /* avoid warning */ - } - gen_extu(ot, oldv); - gen_extu(ot, cmpv); - /* store value = (old == cmp ? 
new : old); */ - tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv); - if (mod == 3) { - gen_op_mov_reg_v(s, ot, R_EAX, oldv); - gen_op_mov_reg_v(s, ot, rm, newv); - } else { - /* Perform an unconditional store cycle like physical cpu; - must be before changing accumulator to ensure - idempotency if the store faults and the instruction - is restarted */ + + /* + * Perform an unconditional store cycle like physical cpu; + * must be before changing accumulator to ensure + * idempotency if the store faults and the instruction + * is restarted + */ + tcg_gen_movcond_tl(TCG_COND_EQ, newv, oldv, cmpv, newv, oldv); gen_op_st_v(s, ot, newv, s->A0); - gen_op_mov_reg_v(s, ot, R_EAX, oldv); } } + /* + * Write EAX only if the cmpxchg fails; reuse newv as the destination, + * since it's dead here. + */ + dest = gen_op_deposit_reg_v(s, ot, R_EAX, newv, oldv); + tcg_gen_movcond_tl(TCG_COND_EQ, dest, oldv, cmpv, dest, newv); tcg_gen_mov_tl(cpu_cc_src, oldv); tcg_gen_mov_tl(s->cc_srcT, cmpv); tcg_gen_sub_tl(cpu_cc_dst, cmpv, oldv); @@ -5220,7 +5250,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) case 0x9e: /* sahf */ if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM)) goto illegal_op; - gen_op_mov_v_reg(s, MO_8, s->T0, R_AH); + tcg_gen_shri_tl(s->T0, cpu_regs[R_EAX], 8); gen_compute_eflags(s); tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O); tcg_gen_andi_tl(s->T0, s->T0, CC_S | CC_Z | CC_A | CC_P | CC_C); @@ -5232,7 +5262,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu) gen_compute_eflags(s); /* Note: gen_compute_eflags() only gives the condition codes */ tcg_gen_ori_tl(s->T0, cpu_cc_src, 0x02); - gen_op_mov_reg_v(s, MO_8, R_AH, s->T0); + tcg_gen_deposit_tl(cpu_regs[R_EAX], cpu_regs[R_EAX], s->T0, 8, 8); break; case 0xf5: /* cmc */ gen_compute_eflags(s); diff --git a/tests/qemu-iotests/151 b/tests/qemu-iotests/151 index 93d14193d0..b4d1bc2553 100755 --- a/tests/qemu-iotests/151 +++ b/tests/qemu-iotests/151 @@ -19,7 +19,11 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>.
# +import math import os +import subprocess +import time +from typing import List, Optional import iotests from iotests import qemu_img @@ -50,7 +54,7 @@ class TestActiveMirror(iotests.QMPTestCase): self.vm = iotests.VM() self.vm.add_drive_raw(self.vm.qmp_to_opts(blk_source)) self.vm.add_blockdev(self.vm.qmp_to_opts(blk_target)) - self.vm.add_device('virtio-blk,drive=source') + self.vm.add_device('virtio-blk,id=vblk,drive=source') self.vm.launch() def tearDown(self): @@ -192,6 +196,227 @@ class TestActiveMirror(iotests.QMPTestCase): self.potential_writes_in_flight = False +class TestThrottledWithNbdExportBase(iotests.QMPTestCase): + image_len = 128 * 1024 * 1024 # MB + iops: Optional[int] = None + background_processes: List['subprocess.Popen[str]'] = [] + + def setUp(self): + # Must be set by subclasses + self.assertIsNotNone(self.iops) + + qemu_img('create', '-f', iotests.imgfmt, source_img, '128M') + qemu_img('create', '-f', iotests.imgfmt, target_img, '128M') + + self.vm = iotests.VM() + self.vm.launch() + + result = self.vm.qmp('object-add', **{ + 'qom-type': 'throttle-group', + 'id': 'thrgr', + 'limits': { + 'iops-total': self.iops, + 'iops-total-max': self.iops + } + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-add', **{ + 'node-name': 'source-node', + 'driver': 'throttle', + 'throttle-group': 'thrgr', + 'file': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': source_img + } + } + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-add', **{ + 'node-name': 'target-node', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': target_img + } + }) + self.assert_qmp(result, 'return', {}) + + self.nbd_sock = iotests.file_path('nbd.sock', + base_dir=iotests.sock_dir) + self.nbd_url = f'nbd+unix:///source-node?socket={self.nbd_sock}' + + result = self.vm.qmp('nbd-server-start', addr={ + 'type': 'unix', + 'data': { + 'path': self.nbd_sock + } + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('block-export-add', id='exp0', type='nbd', + node_name='source-node', writable=True) + self.assert_qmp(result, 'return', {}) + + def tearDown(self): + # Wait for background requests to settle + try: + while True: + p = self.background_processes.pop() + while True: + try: + p.wait(timeout=0.0) + break + except subprocess.TimeoutExpired: + self.vm.qtest(f'clock_step {1 * 1000 * 1000 * 1000}') + except IndexError: + pass + + # Cancel ongoing block jobs + for job in self.vm.qmp('query-jobs')['return']: + self.vm.qmp('block-job-cancel', device=job['id'], force=True) + + while True: + self.vm.qtest(f'clock_step {1 * 1000 * 1000 * 1000}') + if len(self.vm.qmp('query-jobs')['return']) == 0: + break + + self.vm.shutdown() + os.remove(source_img) + os.remove(target_img) + + +class TestLowThrottledWithNbdExport(TestThrottledWithNbdExportBase): + iops = 16 + + def testUnderLoad(self): + ''' + Throttle the source node, then issue a whole bunch of external requests + while the mirror job (in write-blocking mode) is running. We want to + see background requests being issued even while the source is under + full load by active writes, so that progress can be made towards READY. + ''' + + # Fill the first half of the source image; do not fill the second half, + # that is where we will have active requests occur. 
This ensures that + # active mirroring itself will not directly contribute to the job's + # progress (because when the job was started, those areas were not + # intended to be copied, so active mirroring will only lead to not + # losing progress, but also not making any). + self.vm.hmp_qemu_io('source-node', + f'aio_write -P 1 0 {self.image_len // 2}') + self.vm.qtest(f'clock_step {1 * 1000 * 1000 * 1000}') + + # Launch the mirror job + mirror_buf_size = 65536 + result = self.vm.qmp('blockdev-mirror', + job_id='mirror', + filter_node_name='mirror-node', + device='source-node', + target='target-node', + sync='full', + copy_mode='write-blocking', + buf_size=mirror_buf_size) + self.assert_qmp(result, 'return', {}) + + # We create the external requests via qemu-io processes on the NBD + # server. Have their offset start in the middle of the image so they + # do not overlap with the background requests (which start from the + # beginning). + active_request_offset = self.image_len // 2 + active_request_len = 4096 + + # Create enough requests to saturate the node for 5 seconds + for _ in range(0, 5 * self.iops): + req = f'write -P 42 {active_request_offset} {active_request_len}' + active_request_offset += active_request_len + p = iotests.qemu_io_popen('-f', 'nbd', self.nbd_url, '-c', req) + self.background_processes += [p] + + # Now advance the clock one I/O operation at a time by the 4 seconds + # (i.e. one less than 5). We expect the mirror job to issue background + # operations here, even though active requests are still in flight. + # The active requests will take precedence, however, because they have + # been issued earlier than mirror's background requests. + # Once the active requests we have started above are done (i.e. after 5 + # virtual seconds), we expect those background requests to be worked + # on. We only advance 4 seconds here to avoid race conditions. + for _ in range(0, 4 * self.iops): + step = math.ceil(1 * 1000 * 1000 * 1000 / self.iops) + self.vm.qtest(f'clock_step {step}') + + # Note how much remains to be done until the mirror job is finished + job_status = self.vm.qmp('query-jobs')['return'][0] + start_remaining = job_status['total-progress'] - \ + job_status['current-progress'] + + # Create a whole bunch of more active requests + for _ in range(0, 10 * self.iops): + req = f'write -P 42 {active_request_offset} {active_request_len}' + active_request_offset += active_request_len + p = iotests.qemu_io_popen('-f', 'nbd', self.nbd_url, '-c', req) + self.background_processes += [p] + + # Let the clock advance more. After 1 second, as noted above, we + # expect the background requests to be worked on. Give them a couple + # of seconds (specifically 4) to see their impact. + for _ in range(0, 5 * self.iops): + step = math.ceil(1 * 1000 * 1000 * 1000 / self.iops) + self.vm.qtest(f'clock_step {step}') + + # Note how much remains to be done now. We expect this number to be + # reduced thanks to those background requests. + job_status = self.vm.qmp('query-jobs')['return'][0] + end_remaining = job_status['total-progress'] - \ + job_status['current-progress'] + + # See that indeed progress was being made on the job, even while the + # node was saturated with active requests + self.assertGreater(start_remaining - end_remaining, 0) + + +class TestHighThrottledWithNbdExport(TestThrottledWithNbdExportBase): + iops = 1024 + + def testActiveOnCreation(self): + ''' + Issue requests on the mirror source node right as the mirror is + instated. 
It's possible that requests occur before the actual job is + created, but after the node has been put into the graph. Write + requests across the node must in that case be forwarded to the source + node without attempting to mirror them (there is no job object yet, so + attempting to access it would cause a segfault). + We do this with a lightly throttled node (i.e. quite high IOPS limit). + Using throttling seems to increase reproductivity, but if the limit is + too low, all requests allowed per second will be submitted before + mirror_start_job() gets to the problematic point. + ''' + + # Let qemu-img bench create write requests (enough for two seconds on + # the virtual clock) + bench_args = ['bench', '-w', '-d', '1024', '-f', 'nbd', + '-c', str(self.iops * 2), self.nbd_url] + p = iotests.qemu_tool_popen(iotests.qemu_img_args + bench_args) + self.background_processes += [p] + + # Give qemu-img bench time to start up and issue requests + time.sleep(1.0) + # Flush the request queue, so new requests can come in right as we + # start blockdev-mirror + self.vm.qtest(f'clock_step {1 * 1000 * 1000 * 1000}') + + result = self.vm.qmp('blockdev-mirror', + job_id='mirror', + device='source-node', + target='target-node', + sync='full', + copy_mode='write-blocking') + self.assert_qmp(result, 'return', {}) + + if __name__ == '__main__': iotests.main(supported_fmts=['qcow2', 'raw'], supported_protocols=['file']) diff --git a/tests/qemu-iotests/151.out b/tests/qemu-iotests/151.out index 89968f35d7..3f8a935a08 100644 --- a/tests/qemu-iotests/151.out +++ b/tests/qemu-iotests/151.out @@ -1,5 +1,5 @@ -.... +...... ---------------------------------------------------------------------- -Ran 4 tests +Ran 6 tests OK diff --git a/tests/qemu-iotests/tests/stream-under-throttle b/tests/qemu-iotests/tests/stream-under-throttle new file mode 100755 index 0000000000..8d2d9e1684 --- /dev/null +++ b/tests/qemu-iotests/tests/stream-under-throttle @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# group: rw +# +# Test streaming with throttle nodes on top +# +# Copyright (C) 2022 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import asyncio +import os +from typing import List +import iotests +from iotests import qemu_img_create, qemu_io + + +image_size = 256 * 1024 * 1024 +base_img = os.path.join(iotests.test_dir, 'base.img') +top_img = os.path.join(iotests.test_dir, 'top.img') + + +class TcgVM(iotests.VM): + ''' + Variant of iotests.VM that uses -accel tcg. Simply using + iotests.VM.add_args('-accel', 'tcg') is not sufficient, because that will + put -accel qtest before -accel tcg, and -accel arguments are prioritized in + the order they appear. 
+ ''' + @property + def _base_args(self) -> List[str]: + # Put -accel tcg first so it takes precedence + return ['-accel', 'tcg'] + super()._base_args + + +class TestStreamWithThrottle(iotests.QMPTestCase): + def setUp(self) -> None: + ''' + Create a simple backing chain between two images, write something to + the base image. Attach them to the VM underneath two throttle nodes, + one of which has actually no limits set, but the other does. Then put + a virtio-blk device on top. + This test configuration has been taken from + https://gitlab.com/qemu-project/qemu/-/issues/1215 + ''' + qemu_img_create('-f', iotests.imgfmt, base_img, str(image_size)) + qemu_img_create('-f', iotests.imgfmt, '-b', base_img, '-F', + iotests.imgfmt, top_img, str(image_size)) + + # Write something to stream + qemu_io(base_img, '-c', f'write 0 {image_size}') + + blockdev = { + 'driver': 'throttle', + 'node-name': 'throttled-node', + 'throttle-group': 'thrgr-limited', + 'file': { + 'driver': 'throttle', + 'throttle-group': 'thrgr-unlimited', + 'file': { + 'driver': iotests.imgfmt, + 'node-name': 'unthrottled-node', + 'file': { + 'driver': 'file', + 'filename': top_img + } + } + } + } + + # Issue 1215 is not reproducible in qtest mode, which is why we need to + # create an -accel tcg VM + self.vm = TcgVM() + self.vm.add_object('iothread,id=iothr0') + self.vm.add_object('throttle-group,id=thrgr-unlimited') + self.vm.add_object('throttle-group,id=thrgr-limited,' + 'x-iops-total=10000,x-bps-total=104857600') + self.vm.add_blockdev(self.vm.qmp_to_opts(blockdev)) + self.vm.add_device('virtio-blk,iothread=iothr0,drive=throttled-node') + self.vm.launch() + + def tearDown(self) -> None: + self.vm.shutdown() + os.remove(top_img) + os.remove(base_img) + + def test_stream(self) -> None: + ''' + Do a simple stream beneath the two throttle nodes. Should complete + with no problems. + ''' + result = self.vm.qmp('block-stream', + job_id='stream', + device='unthrottled-node') + self.assert_qmp(result, 'return', {}) + + # Should succeed and not time out + try: + self.vm.run_job('stream') + except asyncio.TimeoutError: + # VM may be stuck, kill it before tearDown() + self.vm.kill() + raise + + +if __name__ == '__main__': + # Must support backing images + iotests.main(supported_fmts=['qcow', 'qcow2', 'qed'], + supported_protocols=['file'], + required_fmts=['throttle']) diff --git a/tests/qemu-iotests/tests/stream-under-throttle.out b/tests/qemu-iotests/tests/stream-under-throttle.out new file mode 100644 index 0000000000..ae1213e6f8 --- /dev/null +++ b/tests/qemu-iotests/tests/stream-under-throttle.out @@ -0,0 +1,5 @@ +. 
+---------------------------------------------------------------------- +Ran 1 tests + +OK diff --git a/tests/tcg/x86_64/Makefile.target b/tests/tcg/x86_64/Makefile.target index 6895db1c43..4eac78293f 100644 --- a/tests/tcg/x86_64/Makefile.target +++ b/tests/tcg/x86_64/Makefile.target @@ -11,6 +11,7 @@ include $(SRC_PATH)/tests/tcg/i386/Makefile.target ifeq ($(filter %-linux-user, $(TARGET)),$(TARGET)) X86_64_TESTS += vsyscall X86_64_TESTS += noexec +X86_64_TESTS += cmpxchg TESTS=$(MULTIARCH_TESTS) $(X86_64_TESTS) test-x86_64 else TESTS=$(MULTIARCH_TESTS) diff --git a/tests/tcg/x86_64/cmpxchg.c b/tests/tcg/x86_64/cmpxchg.c new file mode 100644 index 0000000000..5891735161 --- /dev/null +++ b/tests/tcg/x86_64/cmpxchg.c @@ -0,0 +1,42 @@ +#include <assert.h> + +static int mem; + +static unsigned long test_cmpxchgb(unsigned long orig) +{ + unsigned long ret; + mem = orig; + asm("cmpxchgb %b[cmp],%[mem]" + : [ mem ] "+m"(mem), [ rax ] "=a"(ret) + : [ cmp ] "r"(0x77), "a"(orig)); + return ret; +} + +static unsigned long test_cmpxchgw(unsigned long orig) +{ + unsigned long ret; + mem = orig; + asm("cmpxchgw %w[cmp],%[mem]" + : [ mem ] "+m"(mem), [ rax ] "=a"(ret) + : [ cmp ] "r"(0x7777), "a"(orig)); + return ret; +} + +static unsigned long test_cmpxchgl(unsigned long orig) +{ + unsigned long ret; + mem = orig; + asm("cmpxchgl %[cmp],%[mem]" + : [ mem ] "+m"(mem), [ rax ] "=a"(ret) + : [ cmp ] "r"(0x77777777u), "a"(orig)); + return ret; +} + +int main() +{ + unsigned long test = 0xdeadbeef12345678ull; + assert(test == test_cmpxchgb(test)); + assert(test == test_cmpxchgw(test)); + assert(test == test_cmpxchgl(test)); + return 0; +}
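The cmpxchg test above covers only the success path: the accumulator's low bytes always match memory, so each assert checks that RAX is left entirely untouched, including its high 32 bits for the 32-bit form. A companion sketch for the failure path, where the accumulator is written back and the 32-bit form zero-extends; the function name and constants here are invented, not part of the patch:

#include <assert.h>

static int mem2;

static unsigned long fail_cmpxchgl(unsigned long orig)
{
    unsigned long ret;
    mem2 = 0x11223344;  /* differs from EAX, so the comparison fails */
    asm("cmpxchgl %[cmp],%[mem]"
        : [ mem ] "+m"(mem2), [ rax ] "=a"(ret)
        : [ cmp ] "r"(0x77777777u), "a"(orig));
    return ret;
}

int main(void)
{
    /*
     * EAX (0x12345678) != mem2: the old memory value is loaded into EAX,
     * which on x86-64 zero-extends into RAX, so the input's high bits
     * must be gone afterwards.
     */
    assert(fail_cmpxchgl(0xdeadbeef12345678ull) == 0x11223344ul);
    return 0;
}

Note that this failure path behaved correctly even before the fix: writing the loaded value to EAX (with zero-extension) is exactly what the architecture mandates there. Only the success path wrongly clobbered RAX's high bits.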