Merge tag 'migration-20250110-pull-request' of https://gitlab.com/farosas/qemu into staging

Migration pull request

- compression:
  Shameer's fix for CONFIG_UADK build
  Yuan Liu fixes for zero-page, QPL, qatzip

- multifd sync cleanups, prereq. for VFIO and postcopy work

- fixes for 9.2 regressions:
  multifd with pre-9.0 -> post-9.1 migrations (#2720)
  s390x migration (#2704)

- fix for assertions during paused migrations; rework of
  late-block-activate logic (#2395, #686)

- fixes for compressed arrays creation and parsing, mostly affecting
  s390x

# -----BEGIN PGP SIGNATURE-----
#
# iQJEBAABCAAuFiEEqhtIsKIjJqWkw2TPx5jcdBvsMZ0FAmeBDgkQHGZhcm9zYXNA
# c3VzZS5kZQAKCRDHmNx0G+wxnSlUEACl31wY+77JxWnBva/eDDwnJ9HiCrqsoqaZ
# YIJJXNlk4lYJWNdZRt6p27exzWrQwm+kWKPECeCakgCMlfhnKCvejGq7iV/fJY4o
# D8hjE3t1htQ8mfblY1+bqzg3Rml59KwXxiqAwvlljbNWdkXruv026dq9vgJMzFhi
# ia043fOO1tYULIoawgmwmLEHnztht0v+ZTZ1v5KQbrH655tpxls/8kHc6v5PXEpA
# 3PSmCrCQh1dPtkYRjuJ9yHyfU+/T8tYwIjrU6VR1wQW7MBNkjtqNudaqAFiuyuqn
# P8gh4rAQrMhA9y+aq6xSoJP8XGkuOHxLQtlNutlmtbcQyZ7JqgLmK9ZLdoPf21sK
# //erV63NoyaciYB9Nk3NXflwroc6zyvo8A584kGNPwBznZOJLESP4SPvVm/nlE29
# vbyq8AWHRjFiqqf6P0ttQLAFkusZJzM1Y9UakF51hyVBX70yfqLG20XXZtIq/aZA
# GbBB2Fo0MIlbmWaur3vLsSzn7B8d++Gl9TTGcK/eIXJ1ANCuCxGv9fbXJQlP5F4I
# 3OAoSmAVJ2eqw4v0+2WMiEa8yUA5drNnDSI3VRkG+0K9jRfHKXki466/QQdGrNw7
# 8GuuzLBNai3gEKbavDU0Be73r982KjXeYXj7RuAkQfm0d4H7tiwtg91Cd1dPKfzh
# mhpmOFJDCg==
# =joNM
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 10 Jan 2025 07:09:45 EST
# gpg:                using RSA key AA1B48B0A22326A5A4C364CFC798DC741BEC319D
# gpg:                issuer "farosas@suse.de"
# gpg: Good signature from "Fabiano Rosas <farosas@suse.de>" [unknown]
# gpg:                 aka "Fabiano Almeida Rosas <fabiano.rosas@suse.com>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: AA1B 48B0 A223 26A5 A4C3  64CF C798 DC74 1BEC 319D

* tag 'migration-20250110-pull-request' of https://gitlab.com/farosas/qemu: (25 commits)
  multifd: bugfix for incorrect migration data with qatzip compression
  multifd: bugfix for incorrect migration data with QPL compression
  multifd: bugfix for migration using compression methods
  s390x: Fix CSS migration
  migration: Fix arrays of pointers in JSON writer
  migration: Dump correct JSON format for nullptr replacement
  migration: Rename vmstate_info_nullptr
  migration: Fix parsing of s390 stream
  migration: Remove unused argument in vmsd_desc_field_end
  migration: Add more error handling to analyze-migration.py
  migration/block: Rewrite disk activation
  migration/block: Fix possible race with block_inactive
  migration/block: Apply late-block-active behavior to postcopy
  migration/block: Make late-block-active the default
  qmp/cont: Only activate disks if migration completed
  migration: Add helper to get target runstate
  migration/multifd: Fix compat with QEMU < 9.0
  migration/multifd: Document the reason to sync for save_setup()
  migration/multifd: Cleanup src flushes on condition check
  migration/multifd: Remove sync processing on postcopy
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
commit 3214bec13d
Stefan Hajnoczi, 2025-01-10 13:39:19 -05:00
22 changed files with 602 additions and 273 deletions

hw/s390x/s390-virtio-ccw.c

@@ -1244,6 +1244,7 @@ static void ccw_machine_2_9_instance_options(MachineState *machine)
     s390_cpudef_featoff_greater(12, 1, S390_FEAT_ZPCI);
     s390_cpudef_featoff_greater(12, 1, S390_FEAT_ADAPTER_INT_SUPPRESSION);
     s390_cpudef_featoff_greater(12, 1, S390_FEAT_ADAPTER_EVENT_NOTIFICATION);
+    css_migration_enabled = false;
 }

@@ -1256,7 +1257,6 @@ static void ccw_machine_2_9_class_options(MachineClass *mc)
     ccw_machine_2_10_class_options(mc);
     compat_props_add(mc->compat_props, hw_compat_2_9, hw_compat_2_9_len);
     compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
-    css_migration_enabled = false;
 }

 DEFINE_CCW_MACHINE(2, 9);

include/migration/misc.h

@@ -104,4 +104,8 @@ bool migration_incoming_postcopy_advised(void);
 /* True if background snapshot is active */
 bool migration_in_bg_snapshot(void);

+/* Wrapper for block active/inactive operations */
+bool migration_block_activate(Error **errp);
+bool migration_block_inactivate(void);
+
 #endif

migration/block-active.c (new file, 94 lines)

@@ -0,0 +1,94 @@
+/*
+ * Block activation tracking for migration purpose
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2024 Red Hat, Inc.
+ */
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "qapi/error.h"
+#include "migration/migration.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+/*
+ * Migration-only cache to remember the block layer activation status.
+ * Protected by BQL.
+ *
+ * We need this because..
+ *
+ * - Migration can fail after block devices are invalidated (during
+ *   switchover phase).  When that happens, we need to be able to recover
+ *   the block drive status by re-activating them.
+ *
+ * - Currently bdrv_inactivate_all() is not safe to be invoked on top of
+ *   invalidated drives (even if bdrv_activate_all() is actually safe to be
+ *   called any time!).  It means remembering this could help migration to
+ *   make sure it won't invalidate twice in a row, crashing QEMU.  It can
+ *   happen when we migrate a PAUSED VM from host1 to host2, then migrate
+ *   again to host3 without starting it.  TODO: a cleaner solution is to
+ *   allow safe invoke of bdrv_inactivate_all() at anytime, like
+ *   bdrv_activate_all().
+ *
+ * For freshly started QEMU, the flag is initialized to TRUE reflecting the
+ * scenario where QEMU owns block device ownerships.
+ *
+ * For incoming QEMU taking a migration stream, the flag is initialized to
+ * FALSE reflecting that the incoming side doesn't own the block devices,
+ * not until switchover happens.
+ */
+static bool migration_block_active;
+
+/* Setup the disk activation status */
+void migration_block_active_setup(bool active)
+{
+    migration_block_active = active;
+}
+
+bool migration_block_activate(Error **errp)
+{
+    ERRP_GUARD();
+
+    assert(bql_locked());
+
+    if (migration_block_active) {
+        trace_migration_block_activation("active-skipped");
+        return true;
+    }
+
+    trace_migration_block_activation("active");
+
+    bdrv_activate_all(errp);
+    if (*errp) {
+        error_report_err(error_copy(*errp));
+        return false;
+    }
+
+    migration_block_active = true;
+    return true;
+}
+
+bool migration_block_inactivate(void)
+{
+    int ret;
+
+    assert(bql_locked());
+
+    if (!migration_block_active) {
+        trace_migration_block_activation("inactive-skipped");
+        return true;
+    }
+
+    trace_migration_block_activation("inactive");
+
+    ret = bdrv_inactivate_all();
+    if (ret) {
+        error_report("%s: bdrv_inactivate_all() failed: %d",
+                     __func__, ret);
+        return false;
+    }
+
+    migration_block_active = false;
+    return true;
+}
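
The comment above motivates the cache with a concrete failure: migrating a PAUSED VM host1 -> host2 -> host3 used to invalidate the drives twice in a row and crash. Below is a minimal, self-contained sketch of the same guard pattern; the demo_* names are stand-ins, not QEMU APIs:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the block layer: inactivating twice is not safe. */
static bool drives_active = true;

static void demo_bdrv_inactivate_all(void)
{
    assert(drives_active);      /* a double inactivation would abort */
    drives_active = false;
}

/* The migration-only cache, mirroring the pattern in block-active.c. */
static bool demo_block_active = true;

static bool demo_migration_block_inactivate(void)
{
    if (!demo_block_active) {
        printf("inactive-skipped\n");   /* second request is a no-op */
        return true;
    }
    demo_bdrv_inactivate_all();
    demo_block_active = false;
    printf("inactive\n");
    return true;
}

int main(void)
{
    /* A PAUSED VM migrated twice without ever restarting: switchover of
     * each hop requests inactivation, but only the first call acts. */
    demo_migration_block_inactivate();
    demo_migration_block_inactivate();
    return 0;
}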

migration/colo.c

@@ -836,7 +836,7 @@ static void *colo_process_incoming_thread(void *opaque)
     /* Make sure all file formats throw away their mutable metadata */
     bql_lock();
-    bdrv_activate_all(&local_err);
+    migration_block_activate(&local_err);
     bql_unlock();
     if (local_err) {
         error_report_err(local_err);

migration/meson.build

@@ -11,6 +11,7 @@ migration_files = files(
 system_ss.add(files(
   'block-dirty-bitmap.c',
+  'block-active.c',
   'channel.c',
   'channel-block.c',
   'cpu-throttle.c',

migration/migration.c

@@ -135,6 +135,21 @@ static bool migration_needs_multiple_sockets(void)
     return migrate_multifd() || migrate_postcopy_preempt();
 }

+static RunState migration_get_target_runstate(void)
+{
+    /*
+     * When the global state is not migrated, it means we don't know the
+     * runstate of the src QEMU.  We don't have much choice but assuming
+     * the VM is running.  NOTE: this is pretty rare case, so far only Xen
+     * uses it.
+     */
+    if (!global_state_received()) {
+        return RUN_STATE_RUNNING;
+    }
+
+    return global_state_get_runstate();
+}
+
 static bool transport_supports_multi_channels(MigrationAddress *addr)
 {
     if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
@@ -723,30 +738,10 @@ static void qemu_start_incoming_migration(const char *uri, bool has_channels,

 static void process_incoming_migration_bh(void *opaque)
 {
-    Error *local_err = NULL;
     MigrationIncomingState *mis = opaque;

     trace_vmstate_downtime_checkpoint("dst-precopy-bh-enter");

-    /* If capability late_block_activate is set:
-     * Only fire up the block code now if we're going to restart the
-     * VM, else 'cont' will do it.
-     * This causes file locking to happen; so we don't want it to happen
-     * unless we really are starting the VM.
-     */
-    if (!migrate_late_block_activate() ||
-        (autostart && (!global_state_received() ||
-            runstate_is_live(global_state_get_runstate())))) {
-        /* Make sure all file formats throw away their mutable metadata.
-         * If we get an error here, just don't restart the VM yet. */
-        bdrv_activate_all(&local_err);
-        if (local_err) {
-            error_report_err(local_err);
-            local_err = NULL;
-            autostart = false;
-        }
-    }
-
     /*
      * This must happen after all error conditions are dealt with and
      * we're sure the VM is going to be running on this host.
@@ -759,10 +754,23 @@ static void process_incoming_migration_bh(void *opaque)

     dirty_bitmap_mig_before_vm_start();

-    if (!global_state_received() ||
-        runstate_is_live(global_state_get_runstate())) {
+    if (runstate_is_live(migration_get_target_runstate())) {
         if (autostart) {
-            vm_start();
+            /*
+             * Block activation is always delayed until VM starts, either
+             * here (which means we need to start the dest VM right now..),
+             * or until qmp_cont() later.
+             *
+             * We used to have cap 'late-block-activate' but now we do this
+             * unconditionally, as it has no harm but only benefit.  E.g.,
+             * it's not part of migration ABI on the time of disk activation.
+             *
+             * Make sure all file formats throw away their mutable
+             * metadata.  If error, don't restart the VM yet.
+             */
+            if (migration_block_activate(NULL)) {
+                vm_start();
+            }
         } else {
             runstate_set(RUN_STATE_PAUSED);
         }
@@ -1547,16 +1555,6 @@ static void migrate_fd_cancel(MigrationState *s)
             }
         }
     }
-
-    if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
-        Error *local_err = NULL;
-
-        bdrv_activate_all(&local_err);
-        if (local_err) {
-            error_report_err(local_err);
-        } else {
-            s->block_inactive = false;
-        }
-    }
 }

 void migration_add_notifier_mode(NotifierWithReturn *notify,
@@ -1840,6 +1838,12 @@ void qmp_migrate_incoming(const char *uri, bool has_channels,
             return;
         }

+        /*
+         * Newly setup incoming QEMU.  Mark the block active state to reflect
+         * that the src currently owns the disks.
+         */
+        migration_block_active_setup(false);
+
         once = false;
     }
@@ -2492,7 +2496,6 @@ static int postcopy_start(MigrationState *ms, Error **errp)
     QIOChannelBuffer *bioc;
     QEMUFile *fb;
     uint64_t bandwidth = migrate_max_postcopy_bandwidth();
-    bool restart_block = false;
     int cur_state = MIGRATION_STATUS_ACTIVE;

     if (migrate_postcopy_preempt()) {
@@ -2528,13 +2531,10 @@ static int postcopy_start(MigrationState *ms, Error **errp)
         goto fail;
     }

-    ret = bdrv_inactivate_all();
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "%s: Failed in bdrv_inactivate_all()",
-                         __func__);
+    if (!migration_block_inactivate()) {
+        error_setg(errp, "%s: Failed in bdrv_inactivate_all()", __func__);
         goto fail;
     }
-    restart_block = true;

     /*
      * Cause any non-postcopiable, but iterative devices to
@@ -2604,8 +2604,6 @@ static int postcopy_start(MigrationState *ms, Error **errp)
         goto fail_closefb;
     }

-    restart_block = false;
-
     /* Now send that blob */
     if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
         error_setg(errp, "%s: Failed to send packaged data", __func__);
@@ -2650,17 +2648,7 @@ fail_closefb:
 fail:
     migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                       MIGRATION_STATUS_FAILED);
-    if (restart_block) {
-        /* A failure happened early enough that we know the destination hasn't
-         * accessed block devices, so we're safe to recover.
-         */
-        Error *local_err = NULL;
-
-        bdrv_activate_all(&local_err);
-        if (local_err) {
-            error_report_err(local_err);
-        }
-    }
+    migration_block_activate(NULL);
     migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
     bql_unlock();
     return -1;
@@ -2729,14 +2717,11 @@ static int migration_completion_precopy(MigrationState *s,
         goto out_unlock;
     }

-    /*
-     * Inactivate disks except in COLO, and track that we have done so in order
-     * to remember to reactivate them if migration fails or is cancelled.
-     */
-    s->block_inactive = !migrate_colo();
     migration_rate_set(RATE_LIMIT_DISABLED);
+
+    /* Inactivate disks except in COLO */
     ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
-                                             s->block_inactive);
+                                             !migrate_colo());
 out_unlock:
     bql_unlock();
     return ret;
@@ -2761,31 +2746,6 @@ static void migration_completion_postcopy(MigrationState *s)
     trace_migration_completion_postcopy_end_after_complete();
 }

-static void migration_completion_failed(MigrationState *s,
-                                        int current_active_state)
-{
-    if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE ||
-                              s->state == MIGRATION_STATUS_DEVICE)) {
-        /*
-         * If not doing postcopy, vm_start() will be called: let's
-         * regain control on images.
-         */
-        Error *local_err = NULL;
-
-        bql_lock();
-        bdrv_activate_all(&local_err);
-        if (local_err) {
-            error_report_err(local_err);
-        } else {
-            s->block_inactive = false;
-        }
-        bql_unlock();
-    }
-
-    migrate_set_state(&s->state, current_active_state,
-                      MIGRATION_STATUS_FAILED);
-}
-
 /**
  * migration_completion: Used by migration_thread when there's not much left.
  *   The caller 'breaks' the loop when this returns.
@@ -2839,7 +2799,8 @@ fail:
         error_free(local_err);
     }

-    migration_completion_failed(s, current_active_state);
+    migrate_set_state(&s->state, current_active_state,
+                      MIGRATION_STATUS_FAILED);
 }

 /**
@@ -3269,6 +3230,11 @@ static void migration_iteration_finish(MigrationState *s)
     case MIGRATION_STATUS_FAILED:
     case MIGRATION_STATUS_CANCELLED:
     case MIGRATION_STATUS_CANCELLING:
+        /*
+         * Re-activate the block drives if they're inactivated.  Note, COLO
+         * shouldn't use block_active at all, so it should be no-op there.
+         */
+        migration_block_activate(NULL);
         if (runstate_is_live(s->vm_old_state)) {
             if (!runstate_check(RUN_STATE_SHUTDOWN)) {
                 vm_start();
@@ -3842,6 +3808,8 @@ static void migration_instance_init(Object *obj)
     ms->state = MIGRATION_STATUS_NONE;
     ms->mbps = -1;
     ms->pages_per_second = -1;
+    /* Freshly started QEMU owns all the block devices */
+    migration_block_active_setup(true);
     qemu_sem_init(&ms->pause_sem, 0);
     qemu_mutex_init(&ms->error_mutex);

migration/migration.h

@@ -370,9 +370,6 @@ struct MigrationState {
     /* Flag set once the migration thread is running (and needs joining) */
     bool migration_thread_running;

-    /* Flag set once the migration thread called bdrv_inactivate_all */
-    bool block_inactive;
-
     /* Migration is waiting for guest to unplug device */
     QemuSemaphore wait_unplug_sem;
@@ -556,4 +553,7 @@ void migration_bitmap_sync_precopy(bool last_stage);
 /* migration/block-dirty-bitmap.c */
 void dirty_bitmap_mig_init(void);

+/* migration/block-active.c */
+void migration_block_active_setup(bool active);
+
 #endif

migration/multifd-nocomp.c

@@ -20,6 +20,7 @@
 #include "qemu/cutils.h"
 #include "qemu/error-report.h"
 #include "trace.h"
+#include "qemu-file.h"

 static MultiFDSendData *multifd_ram_send;

@@ -343,8 +344,53 @@ retry:
     return true;
 }

-int multifd_ram_flush_and_sync(void)
+/*
+ * We have two modes for multifd flushes:
+ *
+ * - Per-section mode: this is the legacy way to flush, it requires one
+ *   MULTIFD_FLAG_SYNC message for each RAM_SAVE_FLAG_EOS.
+ *
+ * - Per-round mode: this is the modern way to flush, it requires one
+ *   MULTIFD_FLAG_SYNC message only for each round of RAM scan.  Normally
+ *   it's paired with a new RAM_SAVE_FLAG_MULTIFD_FLUSH message in network
+ *   based migrations.
+ *
+ * One thing to mention is mapped-ram always use the modern way to sync.
+ */
+
+/* Do we need a per-section multifd flush (legacy way)? */
+bool multifd_ram_sync_per_section(void)
+{
+    if (!migrate_multifd()) {
+        return false;
+    }
+
+    if (migrate_mapped_ram()) {
+        return false;
+    }
+
+    return migrate_multifd_flush_after_each_section();
+}
+
+/* Do we need a per-round multifd flush (modern way)? */
+bool multifd_ram_sync_per_round(void)
+{
+    if (!migrate_multifd()) {
+        return false;
+    }
+
+    if (migrate_mapped_ram()) {
+        return true;
+    }
+
+    return !migrate_multifd_flush_after_each_section();
+}
+
+int multifd_ram_flush_and_sync(QEMUFile *f)
 {
+    MultiFDSyncReq req;
+    int ret;
+
     if (!migrate_multifd()) {
         return 0;
     }
@@ -356,12 +402,37 @@ int multifd_ram_flush_and_sync(void)
         }
     }

-    return multifd_send_sync_main();
+    /* File migrations only need to sync with threads */
+    req = migrate_mapped_ram() ? MULTIFD_SYNC_LOCAL : MULTIFD_SYNC_ALL;
+
+    ret = multifd_send_sync_main(req);
+    if (ret) {
+        return ret;
+    }
+
+    /* If we don't need to sync with remote at all, nothing else to do */
+    if (req == MULTIFD_SYNC_LOCAL) {
+        return 0;
+    }
+
+    /*
+     * Old QEMUs don't understand RAM_SAVE_FLAG_MULTIFD_FLUSH, it relies
+     * on RAM_SAVE_FLAG_EOS instead.
+     */
+    if (migrate_multifd_flush_after_each_section()) {
+        return 0;
+    }
+
+    qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
+    qemu_fflush(f);
+
+    return 0;
 }

 bool multifd_send_prepare_common(MultiFDSendParams *p)
 {
     MultiFDPages_t *pages = &p->data->u.ram;
+
+    multifd_send_prepare_header(p);
+
     multifd_send_zero_page_detect(p);

     if (!pages->normal_num) {
@@ -369,8 +440,6 @@ bool multifd_send_prepare_common(MultiFDSendParams *p)
         return false;
     }

-    multifd_send_prepare_header(p);
-
     return true;
 }
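
The two predicates above partition the configuration space cleanly: at most one of them is true for a given setup, and mapped-ram always lands on the per-round (modern) side. A standalone truth table of exactly that logic, with plain booleans standing in for the migrate_*() capability queries:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors multifd_ram_sync_per_section() with explicit inputs. */
static bool sync_per_section(bool multifd, bool mapped_ram, bool per_sec_cap)
{
    if (!multifd) return false;
    if (mapped_ram) return false;
    return per_sec_cap;
}

/* Mirrors multifd_ram_sync_per_round() with explicit inputs. */
static bool sync_per_round(bool multifd, bool mapped_ram, bool per_sec_cap)
{
    if (!multifd) return false;
    if (mapped_ram) return true;   /* mapped-ram always syncs the modern way */
    return !per_sec_cap;
}

int main(void)
{
    printf("multifd mapped per-sec-cap | per-section per-round\n");
    for (int m = 0; m <= 1; m++)
        for (int r = 0; r <= 1; r++)
            for (int s = 0; s <= 1; s++)
                printf("%7d %6d %11d | %11d %9d\n", m, r, s,
                       sync_per_section(m, r, s), sync_per_round(m, r, s));
    return 0;
}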

migration/multifd-qatzip.c

@@ -373,6 +373,7 @@ static int qatzip_recv(MultiFDRecvParams *p, Error **errp)
     /* Copy each page to its appropriate location. */
     for (int i = 0; i < p->normal_num; i++) {
         memcpy(p->host + p->normal[i], q->out_buf + page_size * i, page_size);
+        ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
     }
     return 0;
 }

migration/multifd-qpl.c

@@ -679,6 +679,7 @@ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp)
         qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]);
         assert(qpl->zlen[i] <= multifd_ram_page_size());
         zbuf_len += qpl->zlen[i];
+        ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
     }

     /* read compressed pages */

migration/multifd-uadk.c

@@ -169,7 +169,7 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp)
             .src_len = page_size,
             .dst = buf,
             /* Set dst_len to double the src in case compressed out >= page_size */
-            .dst_len = p->page_size * 2,
+            .dst_len = page_size * 2,
         };

         if (uadk_data->handle) {

migration/multifd.c

@@ -252,9 +252,8 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
     p->packet_num = be64_to_cpu(packet->packet_num);
     p->packets_recved++;

-    if (!(p->flags & MULTIFD_FLAG_SYNC)) {
-        ret = multifd_ram_unfill_packet(p, errp);
-    }
+    /* Always unfill, old QEMUs (<9.0) send data along with SYNC */
+    ret = multifd_ram_unfill_packet(p, errp);

     trace_multifd_recv_unfill(p->id, p->packet_num, p->flags,
                               p->next_packet_size);
@@ -523,11 +522,13 @@ static int multifd_zero_copy_flush(QIOChannel *c)
     return ret;
 }

-int multifd_send_sync_main(void)
+int multifd_send_sync_main(MultiFDSyncReq req)
 {
     int i;
     bool flush_zero_copy;

+    assert(req != MULTIFD_SYNC_NONE);
+
     flush_zero_copy = migrate_zero_copy_send();

     for (i = 0; i < migrate_multifd_channels(); i++) {
@@ -543,8 +544,8 @@ int multifd_send_sync_main(void)
          * We should be the only user so far, so not possible to be set by
          * others concurrently.
          */
-        assert(qatomic_read(&p->pending_sync) == false);
-        qatomic_set(&p->pending_sync, true);
+        assert(qatomic_read(&p->pending_sync) == MULTIFD_SYNC_NONE);
+        qatomic_set(&p->pending_sync, req);
         qemu_sem_post(&p->sem);
     }
     for (i = 0; i < migrate_multifd_channels(); i++) {
@@ -635,14 +636,17 @@ static void *multifd_send_thread(void *opaque)
              */
             qatomic_store_release(&p->pending_job, false);
         } else {
+            MultiFDSyncReq req = qatomic_read(&p->pending_sync);
+
             /*
              * If not a normal job, must be a sync request.  Note that
              * pending_sync is a standalone flag (unlike pending_job), so
              * it doesn't require explicit memory barriers.
              */
-            assert(qatomic_read(&p->pending_sync));
+            assert(req != MULTIFD_SYNC_NONE);

-            if (use_packets) {
+            /* Only push the SYNC message if it involves a remote sync */
+            if (req == MULTIFD_SYNC_ALL) {
                 p->flags = MULTIFD_FLAG_SYNC;
                 multifd_send_fill_packet(p);
                 ret = qio_channel_write_all(p->c, (void *)p->packet,
@@ -654,7 +658,7 @@ static void *multifd_send_thread(void *opaque)
                 stat64_add(&mig_stats.multifd_bytes, p->packet_len);
             }

-            qatomic_set(&p->pending_sync, false);
+            qatomic_set(&p->pending_sync, MULTIFD_SYNC_NONE);
             qemu_sem_post(&p->sem_sync);
         }
     }
@@ -1151,9 +1155,13 @@ static void *multifd_recv_thread(void *opaque)
             flags = p->flags;
             /* recv methods don't know how to handle the SYNC flag */
             p->flags &= ~MULTIFD_FLAG_SYNC;
-            if (!(flags & MULTIFD_FLAG_SYNC)) {
-                has_data = p->normal_num || p->zero_num;
-            }
+
+            /*
+             * Even if it's a SYNC packet, this needs to be set
+             * because older QEMUs (<9.0) still send data along with
+             * the SYNC packet.
+             */
+            has_data = p->normal_num || p->zero_num;
             qemu_mutex_unlock(&p->mutex);
         } else {
             /*
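
The sender-thread branch above is the whole contract of a sync request from that thread's point of view: a LOCAL request only quiesces the thread, while ALL additionally puts a SYNC packet on the wire. A tiny standalone sketch of that decision (printf stands in for the channel write; not QEMU code):

#include <stdio.h>

typedef enum {
    MULTIFD_SYNC_NONE = 0,
    MULTIFD_SYNC_LOCAL,   /* sender thread quiesces, nothing on the wire */
    MULTIFD_SYNC_ALL,     /* also push MULTIFD_FLAG_SYNC to the channel */
} MultiFDSyncReq;

static void demo_handle_sync(MultiFDSyncReq req)
{
    if (req == MULTIFD_SYNC_ALL) {
        printf("write MULTIFD_FLAG_SYNC packet to channel\n");
    }
    printf("post sem_sync: thread quiesced\n");
}

int main(void)
{
    demo_handle_sync(MULTIFD_SYNC_LOCAL);  /* e.g. mapped-ram file migration */
    demo_handle_sync(MULTIFD_SYNC_ALL);    /* e.g. socket migration */
    return 0;
}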

migration/multifd.h

@@ -19,6 +19,22 @@
 typedef struct MultiFDRecvData MultiFDRecvData;
 typedef struct MultiFDSendData MultiFDSendData;

+typedef enum {
+    /* No sync request */
+    MULTIFD_SYNC_NONE = 0,
+    /* Sync locally on the sender threads without pushing messages */
+    MULTIFD_SYNC_LOCAL,
+    /*
+     * Sync not only on the sender threads, but also push MULTIFD_FLAG_SYNC
+     * message to the wire for each iochannel (which is for a remote sync).
+     *
+     * When remote sync is used, need to be paired with a follow up
+     * RAM_SAVE_FLAG_EOS / RAM_SAVE_FLAG_MULTIFD_FLUSH message on the main
+     * channel.
+     */
+    MULTIFD_SYNC_ALL,
+} MultiFDSyncReq;
+
 bool multifd_send_setup(void);
 void multifd_send_shutdown(void);
 void multifd_send_channel_created(void);
@@ -28,7 +44,7 @@ void multifd_recv_shutdown(void);
 bool multifd_recv_all_channels_created(void);
 void multifd_recv_new_channel(QIOChannel *ioc, Error **errp);
 void multifd_recv_sync_main(void);
-int multifd_send_sync_main(void);
+int multifd_send_sync_main(MultiFDSyncReq req);
 bool multifd_queue_page(RAMBlock *block, ram_addr_t offset);
 bool multifd_recv(void);
 MultiFDRecvData *multifd_get_recv_data(void);
@@ -143,7 +159,7 @@ typedef struct {
     /* multifd flags for each packet */
     uint32_t flags;
     /*
-     * The sender thread has work to do if either of below boolean is set.
+     * The sender thread has work to do if either of below field is set.
      *
      * @pending_job:  a job is pending
      * @pending_sync: a sync request is pending
@@ -152,7 +168,8 @@ typedef struct {
      * cleared by the multifd sender threads.
      */
     bool pending_job;
-    bool pending_sync;
+    MultiFDSyncReq pending_sync;
+
     MultiFDSendData *data;

     /* thread local variables. No locking required */
@@ -337,7 +354,9 @@ static inline uint32_t multifd_ram_page_count(void)

 void multifd_ram_save_setup(void);
 void multifd_ram_save_cleanup(void);
-int multifd_ram_flush_and_sync(void);
+int multifd_ram_flush_and_sync(QEMUFile *f);
+bool multifd_ram_sync_per_round(void);
+bool multifd_ram_sync_per_section(void);
 size_t multifd_ram_payload_size(void);
 void multifd_ram_fill_packet(MultiFDSendParams *p);
 int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp);

migration/ram.c

@@ -71,27 +71,6 @@
 /***********************************************************/
 /* ram save/restore */

-/*
- * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
- * worked for pages that were filled with the same char.  We switched
- * it to only search for the zero value.  And to avoid confusion with
- * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
- *
- * RAM_SAVE_FLAG_FULL was obsoleted in 2009.
- *
- * RAM_SAVE_FLAG_COMPRESS_PAGE (0x100) was removed in QEMU 9.1.
- */
-#define RAM_SAVE_FLAG_FULL     0x01
-#define RAM_SAVE_FLAG_ZERO     0x02
-#define RAM_SAVE_FLAG_MEM_SIZE 0x04
-#define RAM_SAVE_FLAG_PAGE     0x08
-#define RAM_SAVE_FLAG_EOS      0x10
-#define RAM_SAVE_FLAG_CONTINUE 0x20
-#define RAM_SAVE_FLAG_XBZRLE   0x40
-/* 0x80 is reserved in rdma.h for RAM_SAVE_FLAG_HOOK */
-#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
-/* We can't use any flag that is bigger than 0x200 */
-
 /*
  * mapped-ram migration supports O_DIRECT, so we need to make sure the
  * userspace buffer, the IO operation size and the file offset are
@@ -1323,19 +1302,12 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
         pss->page = 0;
         pss->block = QLIST_NEXT_RCU(pss->block, next);
         if (!pss->block) {
-            if (migrate_multifd() &&
-                (!migrate_multifd_flush_after_each_section() ||
-                 migrate_mapped_ram())) {
+            if (multifd_ram_sync_per_round()) {
                 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
-                int ret = multifd_ram_flush_and_sync();
+                int ret = multifd_ram_flush_and_sync(f);
                 if (ret < 0) {
                     return ret;
                 }
-
-                if (!migrate_mapped_ram()) {
-                    qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
-                    qemu_fflush(f);
-                }
             }

             /* Hit the end of the list */
@@ -3064,19 +3036,39 @@ static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp)
         migration_ops->ram_save_target_page = ram_save_target_page_legacy;
     }

+    /*
+     * This operation is unfortunate..
+     *
+     * For legacy QEMUs using per-section sync
+     * =======================================
+     *
+     * This must exist because the EOS below requires the SYNC messages
+     * per-channel to work.
+     *
+     * For modern QEMUs using per-round sync
+     * =====================================
+     *
+     * Logically such sync is not needed, and recv threads should not run
+     * until setup ready (using things like channels_ready on src).  Then
+     * we should be all fine.
+     *
+     * However even if we add channels_ready to recv side in new QEMUs, old
+     * QEMU won't have them so this sync will still be needed to make sure
+     * multifd recv threads won't start processing guest pages early before
+     * ram_load_setup() is properly done.
+     *
+     * Let's stick with this.  Fortunately the overhead is low to sync
+     * during setup because the VM is running, so at least it's not
+     * accounted as part of downtime.
+     */
     bql_unlock();
-    ret = multifd_ram_flush_and_sync();
+    ret = multifd_ram_flush_and_sync(f);
     bql_lock();
     if (ret < 0) {
         error_setg(errp, "%s: multifd synchronization failed", __func__);
         return ret;
     }

-    if (migrate_multifd() && !migrate_multifd_flush_after_each_section()
-        && !migrate_mapped_ram()) {
-        qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
-    }
-
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
     ret = qemu_fflush(f);
     if (ret < 0) {
@@ -3209,9 +3201,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)

 out:
     if (ret >= 0 && migration_is_running()) {
-        if (migrate_multifd() && migrate_multifd_flush_after_each_section() &&
-            !migrate_mapped_ram()) {
-            ret = multifd_ram_flush_and_sync();
+        if (multifd_ram_sync_per_section()) {
+            ret = multifd_ram_flush_and_sync(f);
             if (ret < 0) {
                 return ret;
             }
@@ -3283,10 +3274,16 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
         }
     }

-    ret = multifd_ram_flush_and_sync();
-    if (ret < 0) {
-        return ret;
+    if (multifd_ram_sync_per_section()) {
+        /*
+         * Only the old dest QEMU will need this sync, because each EOS
+         * will require one SYNC message on each channel.
+         */
+        ret = multifd_ram_flush_and_sync(f);
+        if (ret < 0) {
+            return ret;
+        }
     }

     if (migrate_mapped_ram()) {
         ram_save_file_bmap(f);
@@ -3796,15 +3793,7 @@ int ram_load_postcopy(QEMUFile *f, int channel)
                              TARGET_PAGE_SIZE);
             }
             break;
-        case RAM_SAVE_FLAG_MULTIFD_FLUSH:
-            multifd_recv_sync_main();
-            break;
         case RAM_SAVE_FLAG_EOS:
-            /* normal exit */
-            if (migrate_multifd() &&
-                migrate_multifd_flush_after_each_section()) {
-                multifd_recv_sync_main();
-            }
             break;
         default:
             error_report("Unknown combination of migration flags: 0x%x"

migration/ram.h

@@ -33,6 +33,34 @@
 #include "exec/cpu-common.h"
 #include "io/channel.h"

+/*
+ * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
+ * worked for pages that were filled with the same char.  We switched
+ * it to only search for the zero value.  And to avoid confusion with
+ * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
+ *
+ * RAM_SAVE_FLAG_FULL (0x01) was obsoleted in 2009.
+ *
+ * RAM_SAVE_FLAG_COMPRESS_PAGE (0x100) was removed in QEMU 9.1.
+ *
+ * RAM_SAVE_FLAG_HOOK is only used in RDMA.  Whenever this is found in the
+ * data stream, the flags will be passed to rdma functions in the
+ * incoming-migration side.
+ *
+ * We can't use any flag that is bigger than 0x200, because the flags are
+ * always assumed to be encoded in a ramblock address offset, which is
+ * multiple of PAGE_SIZE.  Here it means QEMU supports migration with any
+ * architecture that has PAGE_SIZE>=1K (0x400).
+ */
+#define RAM_SAVE_FLAG_ZERO            0x002
+#define RAM_SAVE_FLAG_MEM_SIZE        0x004
+#define RAM_SAVE_FLAG_PAGE            0x008
+#define RAM_SAVE_FLAG_EOS             0x010
+#define RAM_SAVE_FLAG_CONTINUE        0x020
+#define RAM_SAVE_FLAG_XBZRLE          0x040
+#define RAM_SAVE_FLAG_HOOK            0x080
+#define RAM_SAVE_FLAG_MULTIFD_FLUSH   0x200
+
 extern XBZRLECacheStats xbzrle_counters;

 /* Should be holding either ram_list.mutex, or the RCU lock. */
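
The size constraint in the flags comment above comes from how the stream multiplexes these flags with page addresses: a page-aligned offset has its low bits free, so both ride in one 64-bit word. A small sketch of that encoding, assuming 4 KiB pages (the helper layout is illustrative, not QEMU's actual send path):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE         4096ULL   /* any PAGE_SIZE >= 1K (0x400) works */
#define RAM_SAVE_FLAG_PAGE     0x008
#define RAM_SAVE_FLAG_CONTINUE 0x020

int main(void)
{
    uint64_t offset = 42 * DEMO_PAGE_SIZE;   /* page-aligned ramblock offset */

    /* Sender: the low bits of a page-aligned offset are zero, so flags
     * below PAGE_SIZE can share the same 64-bit word. */
    uint64_t wire = offset | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE;

    /* Receiver: mask the word back apart into address and flags. */
    uint64_t addr  = wire & ~(DEMO_PAGE_SIZE - 1);
    uint64_t flags = wire & (DEMO_PAGE_SIZE - 1);

    printf("addr=0x%" PRIx64 " flags=0x%" PRIx64 "\n", addr, flags);
    return 0;
}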

migration/rdma.h

@@ -33,13 +33,6 @@ void rdma_start_incoming_migration(InetSocketAddress *host_port, Error **errp);
 #define RAM_CONTROL_ROUND     1
 #define RAM_CONTROL_FINISH    3

-/*
- * Whenever this is found in the data stream, the flags
- * will be passed to rdma functions in the incoming-migration
- * side.
- */
-#define RAM_SAVE_FLAG_HOOK     0x80
-
 #define RAM_SAVE_CONTROL_NOT_SUPP -1000
 #define RAM_SAVE_CONTROL_DELAYED  -2000

migration/savevm.c

@@ -1547,15 +1547,16 @@ int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
     }

     if (inactivate_disks) {
-        /* Inactivate before sending QEMU_VM_EOF so that the
-         * bdrv_activate_all() on the other end won't fail. */
-        ret = bdrv_inactivate_all();
-        if (ret) {
-            error_setg(&local_err, "%s: bdrv_inactivate_all() failed (%d)",
-                       __func__, ret);
+        /*
+         * Inactivate before sending QEMU_VM_EOF so that the
+         * bdrv_activate_all() on the other end won't fail.
+         */
+        if (!migration_block_inactivate()) {
+            error_setg(&local_err, "%s: bdrv_inactivate_all() failed",
+                       __func__);
             migrate_set_error(ms, local_err);
             error_report_err(local_err);
-            qemu_file_set_error(f, ret);
+            qemu_file_set_error(f, -EFAULT);
             return ret;
         }
     }
@@ -2121,7 +2122,6 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)

 static void loadvm_postcopy_handle_run_bh(void *opaque)
 {
-    Error *local_err = NULL;
     MigrationIncomingState *mis = opaque;

     trace_vmstate_downtime_checkpoint("dst-postcopy-bh-enter");
@@ -2137,22 +2137,20 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)

     trace_vmstate_downtime_checkpoint("dst-postcopy-bh-announced");

-    /* Make sure all file formats throw away their mutable metadata.
-     * If we get an error here, just don't restart the VM yet. */
-    bdrv_activate_all(&local_err);
-    if (local_err) {
-        error_report_err(local_err);
-        local_err = NULL;
-        autostart = false;
-    }
-
-    trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cache-invalidated");
-
     dirty_bitmap_mig_before_vm_start();

     if (autostart) {
-        /* Hold onto your hats, starting the CPU */
+        /*
+         * Make sure all file formats throw away their mutable metadata.
+         * If we get an error here, just don't restart the VM yet.
+         */
+        bool success = migration_block_activate(NULL);
+
+        trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cache-invalidated");
+
+        if (success) {
             vm_start();
+        }
     } else {
         /* leave it paused and let management decide when to start the CPU */
         runstate_set(RUN_STATE_PAUSED);
@@ -3192,11 +3190,7 @@ void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
      * side of the migration take control of the images.
      */
     if (live && !saved_vm_running) {
-        ret = bdrv_inactivate_all();
-        if (ret) {
-            error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
-                       __func__, ret);
-        }
+        migration_block_inactivate();
     }
 }

migration/trace-events

@@ -383,3 +383,6 @@ migration_pagecache_insert(void) "Error allocating page"
 # cpu-throttle.c
 cpu_throttle_set(int new_throttle_pct) "set guest CPU throttled by %d%%"
 cpu_throttle_dirty_sync(void) ""
+
+# block-active.c
+migration_block_activation(const char *name) "%s"

migration/vmstate-types.c

@@ -338,7 +338,7 @@ static int put_nullptr(QEMUFile *f, void *pv, size_t size,
 }

 const VMStateInfo vmstate_info_nullptr = {
-    .name = "uint64",
+    .name = "nullptr",
     .get = get_nullptr,
     .put = put_nullptr,
 };

migration/vmstate.c

@@ -51,6 +51,36 @@ vmstate_field_exists(const VMStateDescription *vmsd, const VMStateField *field,
     return result;
 }

+/*
+ * Create a fake nullptr field when there's a NULL pointer detected in the
+ * array of a VMS_ARRAY_OF_POINTER VMSD field.  It's needed because we
+ * can't dereference the NULL pointer.
+ */
+static const VMStateField *
+vmsd_create_fake_nullptr_field(const VMStateField *field)
+{
+    VMStateField *fake = g_new0(VMStateField, 1);
+
+    /* It can only happen on an array of pointers! */
+    assert(field->flags & VMS_ARRAY_OF_POINTER);
+
+    /* Some of fake's properties should match the original's */
+    fake->name = field->name;
+    fake->version_id = field->version_id;
+
+    /* Do not need "field_exists" check as it always exists (which is null) */
+    fake->field_exists = NULL;
+
+    /* See vmstate_info_nullptr - use 1 byte to represent nullptr */
+    fake->size = 1;
+    fake->info = &vmstate_info_nullptr;
+    fake->flags = VMS_SINGLE;
+
+    /* All the rest fields shouldn't matter.. */
+    return (const VMStateField *)fake;
+}
+
 static int vmstate_n_elems(void *opaque, const VMStateField *field)
 {
     int n_elems = 1;
@@ -143,23 +173,39 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
         }
         for (i = 0; i < n_elems; i++) {
             void *curr_elem = first_elem + size * i;
+            const VMStateField *inner_field;

             if (field->flags & VMS_ARRAY_OF_POINTER) {
                 curr_elem = *(void **)curr_elem;
             }
+
             if (!curr_elem && size) {
-                /* if null pointer check placeholder and do not follow */
-                assert(field->flags & VMS_ARRAY_OF_POINTER);
-                ret = vmstate_info_nullptr.get(f, curr_elem, size, NULL);
-            } else if (field->flags & VMS_STRUCT) {
-                ret = vmstate_load_state(f, field->vmsd, curr_elem,
-                                         field->vmsd->version_id);
-            } else if (field->flags & VMS_VSTRUCT) {
-                ret = vmstate_load_state(f, field->vmsd, curr_elem,
-                                         field->struct_version_id);
+                /*
+                 * If null pointer found (which should only happen in
+                 * an array of pointers), use null placeholder and do
+                 * not follow.
+                 */
+                inner_field = vmsd_create_fake_nullptr_field(field);
             } else {
-                ret = field->info->get(f, curr_elem, size, field);
+                inner_field = field;
             }
+
+            if (inner_field->flags & VMS_STRUCT) {
+                ret = vmstate_load_state(f, inner_field->vmsd, curr_elem,
+                                         inner_field->vmsd->version_id);
+            } else if (inner_field->flags & VMS_VSTRUCT) {
+                ret = vmstate_load_state(f, inner_field->vmsd, curr_elem,
+                                         inner_field->struct_version_id);
+            } else {
+                ret = inner_field->info->get(f, curr_elem, size,
+                                             inner_field);
+            }
+
+            /* If we used a fake temp field.. free it now */
+            if (inner_field != field) {
+                g_clear_pointer((gpointer *)&inner_field, g_free);
+            }
+
             if (ret >= 0) {
                 ret = qemu_file_get_error(f);
             }
@@ -311,7 +357,7 @@ static void vmsd_desc_field_start(const VMStateDescription *vmsd,

 static void vmsd_desc_field_end(const VMStateDescription *vmsd,
                                 JSONWriter *vmdesc,
-                                const VMStateField *field, size_t size, int i)
+                                const VMStateField *field, size_t size)
 {
     if (!vmdesc) {
         return;
@@ -379,37 +425,89 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
             int size = vmstate_size(opaque, field);
             uint64_t old_offset, written_bytes;
             JSONWriter *vmdesc_loop = vmdesc;
+            bool is_prev_null = false;

             trace_vmstate_save_state_loop(vmsd->name, field->name, n_elems);
             if (field->flags & VMS_POINTER) {
                 first_elem = *(void **)first_elem;
                 assert(first_elem || !n_elems || !size);
             }
+
             for (i = 0; i < n_elems; i++) {
                 void *curr_elem = first_elem + size * i;
+                const VMStateField *inner_field;
+                bool is_null;
+                int max_elems = n_elems - i;

-                vmsd_desc_field_start(vmsd, vmdesc_loop, field, i, n_elems);
                 old_offset = qemu_file_transferred(f);
                 if (field->flags & VMS_ARRAY_OF_POINTER) {
                     assert(curr_elem);
                     curr_elem = *(void **)curr_elem;
                 }
+
                 if (!curr_elem && size) {
-                    /* if null pointer write placeholder and do not follow */
-                    assert(field->flags & VMS_ARRAY_OF_POINTER);
-                    ret = vmstate_info_nullptr.put(f, curr_elem, size, NULL,
-                                                   NULL);
-                } else if (field->flags & VMS_STRUCT) {
-                    ret = vmstate_save_state(f, field->vmsd, curr_elem,
-                                             vmdesc_loop);
-                } else if (field->flags & VMS_VSTRUCT) {
-                    ret = vmstate_save_state_v(f, field->vmsd, curr_elem,
-                                               vmdesc_loop,
-                                               field->struct_version_id, errp);
+                    /*
+                     * If null pointer found (which should only happen in
+                     * an array of pointers), use null placeholder and do
+                     * not follow.
+                     */
+                    inner_field = vmsd_create_fake_nullptr_field(field);
+                    is_null = true;
                 } else {
-                    ret = field->info->put(f, curr_elem, size, field,
-                                           vmdesc_loop);
+                    inner_field = field;
+                    is_null = false;
                 }
+
+                /*
+                 * Due to the fake nullptr handling above, if there's mixed
+                 * null/non-null data, it doesn't make sense to emit a
+                 * compressed array representation spanning the entire array
+                 * because the field types will be different (e.g. struct
+                 * vs. nullptr).  Search ahead for the next null/non-null
+                 * element and start a new compressed array if found.
+                 */
+                if (field->flags & VMS_ARRAY_OF_POINTER &&
+                    is_null != is_prev_null) {
+
+                    is_prev_null = is_null;
+                    vmdesc_loop = vmdesc;
+
+                    for (int j = i + 1; j < n_elems; j++) {
+                        void *elem = *(void **)(first_elem + size * j);
+                        bool elem_is_null = !elem && size;
+
+                        if (is_null != elem_is_null) {
+                            max_elems = j - i;
+                            break;
+                        }
+                    }
+                }
+
+                vmsd_desc_field_start(vmsd, vmdesc_loop, inner_field,
+                                      i, max_elems);
+
+                if (inner_field->flags & VMS_STRUCT) {
+                    ret = vmstate_save_state(f, inner_field->vmsd,
+                                             curr_elem, vmdesc_loop);
+                } else if (inner_field->flags & VMS_VSTRUCT) {
+                    ret = vmstate_save_state_v(f, inner_field->vmsd,
+                                               curr_elem, vmdesc_loop,
+                                               inner_field->struct_version_id,
+                                               errp);
+                } else {
+                    ret = inner_field->info->put(f, curr_elem, size,
+                                                 inner_field, vmdesc_loop);
+                }
+
+                written_bytes = qemu_file_transferred(f) - old_offset;
+                vmsd_desc_field_end(vmsd, vmdesc_loop, inner_field,
+                                    written_bytes);
+
+                /* If we used a fake temp field.. free it now */
+                if (inner_field != field) {
+                    g_clear_pointer((gpointer *)&inner_field, g_free);
+                }
+
                 if (ret) {
                     error_setg(errp, "Save of field %s/%s failed",
                                vmsd->name, field->name);
@@ -419,9 +517,6 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
                     return ret;
                 }

-                written_bytes = qemu_file_transferred(f) - old_offset;
-                vmsd_desc_field_end(vmsd, vmdesc_loop, field, written_bytes, i);
-
                 /* Compressed arrays only care about the first element */
                 if (vmdesc_loop && vmsd_can_compress(field)) {
                     vmdesc_loop = NULL;
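
To see what the look-ahead buys, consider saving a VMS_ARRAY_OF_POINTER field whose elements are {p, NULL, NULL, q}: the writer must emit three compressed segments (1 struct, 2 nullptr, 1 struct) rather than one segment of mixed types. A standalone sketch of just that segmentation logic (an illustration, not the QEMU function):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    /* A pointer array mixing null and non-null elements. */
    int x = 1, y = 2;
    void *elems[] = { &x, NULL, NULL, &y };
    int n_elems = 4;

    /* Mirror of the save loop: whenever nullness flips, look ahead for
     * the next flip and start a fresh compressed segment of max_elems. */
    bool is_prev_null = false, first = true;
    for (int i = 0; i < n_elems; i++) {
        bool is_null = !elems[i];
        int max_elems = n_elems - i;

        if (first || is_null != is_prev_null) {
            for (int j = i + 1; j < n_elems; j++) {
                if ((!elems[j]) != is_null) {
                    max_elems = j - i;
                    break;
                }
            }
            printf("segment at index %d: %d %s element(s)\n",
                   i, max_elems, is_null ? "nullptr" : "struct");
            is_prev_null = is_null;
            first = false;
        }
    }
    return 0;
}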

monitor/qmp-cmds.c

@@ -31,6 +31,7 @@
 #include "qapi/type-helpers.h"
 #include "hw/mem/memory-device.h"
 #include "hw/intc/intc.h"
+#include "migration/misc.h"

 NameInfo *qmp_query_name(Error **errp)
 {
@@ -96,21 +97,18 @@ void qmp_cont(Error **errp)
         }
     }

-    /* Continuing after completed migration. Images have been inactivated to
-     * allow the destination to take control. Need to get control back now.
-     *
-     * If there are no inactive block nodes (e.g. because the VM was just
-     * paused rather than completing a migration), bdrv_inactivate_all() simply
-     * doesn't do anything. */
-    bdrv_activate_all(&local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        return;
-    }
-
     if (runstate_check(RUN_STATE_INMIGRATE)) {
         autostart = 1;
     } else {
+        /*
+         * Continuing after completed migration. Images have been
+         * inactivated to allow the destination to take control. Need to
+         * get control back now.
+         */
+        if (!migration_block_activate(&local_err)) {
+            error_propagate(errp, local_err);
+            return;
+        }
+
         vm_start();
     }
 }

scripts/analyze-migration.py

@@ -65,6 +65,9 @@ class MigrationFile(object):
     def tell(self):
         return self.file.tell()

+    def seek(self, a, b):
+        return self.file.seek(a, b)
+
     # The VMSD description is at the end of the file, after EOF. Look for
     # the last NULL byte, then for the beginning brace of JSON.
     def read_migration_debug_json(self):
@@ -272,11 +275,24 @@ class S390StorageAttributes(object):
         self.section_key = section_key

     def read(self):
+        pos = 0
         while True:
             addr_flags = self.file.read64()
             flags = addr_flags & 0xfff
-            if (flags & (self.STATTR_FLAG_DONE | self.STATTR_FLAG_EOS)):
+
+            if flags & self.STATTR_FLAG_DONE:
+                pos = self.file.tell()
+                continue
+            elif flags & self.STATTR_FLAG_EOS:
                 return
+            else:
+                # No EOS came after DONE, that's OK, but rewind the
+                # stream because this is not our data.
+                if pos:
+                    self.file.seek(pos, os.SEEK_SET)
+                    return
+                raise Exception("Unknown flags %x", flags)
+
             if (flags & self.STATTR_FLAG_ERROR):
                 raise Exception("Error in migration stream")
             count = self.file.read64()
@@ -401,6 +417,28 @@ class VMSDFieldIntLE(VMSDFieldInt):
         super(VMSDFieldIntLE, self).__init__(desc, file)
         self.dtype = '<i%d' % self.size

+class VMSDFieldNull(VMSDFieldGeneric):
+    NULL_PTR_MARKER = b'0'
+
+    def __init__(self, desc, file):
+        super(VMSDFieldNull, self).__init__(desc, file)
+
+    def __repr__(self):
+        # A NULL pointer is encoded in the stream as a '0' to
+        # disambiguate from a mere 0x0 value and avoid consumers
+        # trying to follow the NULL pointer. Displaying '0', 0x30 or
+        # 0x0 when analyzing the JSON debug stream could become
+        # confusing, so use an explicit term instead.
+        return "nullptr"
+
+    def __str__(self):
+        return self.__repr__()
+
+    def read(self):
+        super(VMSDFieldNull, self).read()
+        assert(self.data == self.NULL_PTR_MARKER)
+        return self.data
+
 class VMSDFieldBool(VMSDFieldGeneric):
     def __init__(self, desc, file):
         super(VMSDFieldBool, self).__init__(desc, file)
@@ -429,6 +467,9 @@ class VMSDFieldStruct(VMSDFieldGeneric):
         super(VMSDFieldStruct, self).__init__(desc, file)
         self.data = collections.OrderedDict()

+        if 'fields' not in self.desc['struct']:
+            raise Exception("No fields in struct. VMSD:\n%s" % self.desc)
+
         # When we see compressed array elements, unfold them here
         new_fields = []
         for field in self.desc['struct']['fields']:
@@ -461,15 +502,25 @@ class VMSDFieldStruct(VMSDFieldGeneric):
                 field['data'] = reader(field, self.file)
                 field['data'].read()

-            if 'index' in field:
-                if field['name'] not in self.data:
-                    self.data[field['name']] = []
-                a = self.data[field['name']]
-                if len(a) != int(field['index']):
-                    raise Exception("internal index of data field unmatched (%d/%d)" % (len(a), int(field['index'])))
-                a.append(field['data'])
+            fname = field['name']
+            fdata = field['data']
+
+            # The field could be:
+            # i) a single data entry, e.g. uint64
+            # ii) an array, indicated by it containing the 'index' key
+            #
+            # However, the overall data after parsing the whole
+            # stream, could be a mix of arrays and single data fields,
+            # all sharing the same field name due to how QEMU breaks
+            # up arrays with NULL pointers into multiple compressed
+            # array segments.
+            if fname not in self.data:
+                self.data[fname] = fdata
+            elif type(self.data[fname]) == list:
+                self.data[fname].append(fdata)
             else:
-                self.data[field['name']] = field['data']
+                tmp = self.data[fname]
+                self.data[fname] = [tmp, fdata]

         if 'subsections' in self.desc['struct']:
             for subsection in self.desc['struct']['subsections']:
@@ -477,6 +528,10 @@ class VMSDFieldStruct(VMSDFieldGeneric):
                     raise Exception("Subsection %s not found at offset %x" % ( subsection['vmsd_name'], self.file.tell()))
                 name = self.file.readstr()
                 version_id = self.file.read32()
+
+                if not subsection:
+                    raise Exception("Empty description for subsection: %s" % name)
+
                 self.data[name] = VMSDSection(self.file, version_id, subsection, (name, 0))
                 self.data[name].read()
@@ -535,6 +590,7 @@ vmsd_field_readers = {
     "bitmap" : VMSDFieldGeneric,
     "struct" : VMSDFieldStruct,
     "capability": VMSDFieldCap,
+    "nullptr": VMSDFieldNull,
     "unknown" : VMSDFieldGeneric,
 }
@@ -574,10 +630,13 @@ class MigrationDump(object):
         }
         self.filename = filename
         self.vmsd_desc = None
+        self.vmsd_json = ""

-    def read(self, desc_only = False, dump_memory = False, write_memory = False):
+    def read(self, desc_only = False, dump_memory = False,
+             write_memory = False):
         # Read in the whole file
         file = MigrationFile(self.filename)
+        self.vmsd_json = file.read_migration_debug_json()

         # File magic
         data = file.read32()
@@ -635,9 +694,11 @@ class MigrationDump(object):
         file.close()

     def load_vmsd_json(self, file):
-        vmsd_json = file.read_migration_debug_json()
-        self.vmsd_desc = json.loads(vmsd_json, object_pairs_hook=collections.OrderedDict)
+        self.vmsd_desc = json.loads(self.vmsd_json,
+                                    object_pairs_hook=collections.OrderedDict)
         for device in self.vmsd_desc['devices']:
+            if 'fields' not in device:
+                raise Exception("vmstate for device %s has no fields" % device['name'])
             key = (device['name'], device['instance_id'])
             value = ( VMSDSection, device )
             self.section_classes[key] = value
@@ -666,10 +727,15 @@ args = parser.parse_args()

 jsonenc = JSONEncoder(indent=4, separators=(',', ': '))

-if args.extract:
-    dump = MigrationDump(args.file)
+if not any([args.extract, args.dump == "state", args.dump == "desc"]):
+    raise Exception("Please specify either -x, -d state or -d desc")
+
+try:
+    dump = MigrationDump(args.file)

+    if args.extract:
         dump.read(desc_only = True)
         print("desc.json")
         f = open("desc.json", "w")
         f.truncate()
@@ -683,14 +749,12 @@ jsonenc = JSONEncoder(indent=4, separators=(',', ': '))
         f.truncate()
         f.write(jsonenc.encode(dict))
         f.close()
-elif args.dump == "state":
-    dump = MigrationDump(args.file)
-    dump.read(dump_memory = args.memory)
-    dict = dump.getDict()
-    print(jsonenc.encode(dict))
-elif args.dump == "desc":
-    dump = MigrationDump(args.file)
-    dump.read(desc_only = True)
-    print(jsonenc.encode(dump.vmsd_desc))
-else:
-    raise Exception("Please specify either -x, -d state or -d desc")
+    elif args.dump == "state":
+        dump.read(dump_memory = args.memory)
+        dict = dump.getDict()
+        print(jsonenc.encode(dict))
+    elif args.dump == "desc":
+        dump.read(desc_only = True)
+        print(jsonenc.encode(dump.vmsd_desc))
+except Exception:
+    raise Exception("Full JSON dump:\n%s", dump.vmsd_json)