From a0b9c5f75c05c12d30328a305377383652e62e63 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Tue, 9 Nov 2021 18:50:14 +0100
Subject: [PATCH 1/5] target/i386: sgx: mark device not user creatable

The device is created by the machine based on the sgx-epc property.
It should not be created by users.

Reported-by: Thomas Huth
Signed-off-by: Paolo Bonzini
---
 hw/i386/sgx-epc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/i386/sgx-epc.c b/hw/i386/sgx-epc.c
index 55e2217eae..e508827e78 100644
--- a/hw/i386/sgx-epc.c
+++ b/hw/i386/sgx-epc.c
@@ -154,6 +154,7 @@ static void sgx_epc_class_init(ObjectClass *oc, void *data)
     dc->realize = sgx_epc_realize;
     dc->unrealize = sgx_epc_unrealize;
     dc->desc = "SGX EPC section";
+    dc->user_creatable = false;
     device_class_set_props(dc, sgx_epc_properties);
 
     mdc->get_addr = sgx_epc_md_get_addr;

From ef149763a8fcce70b85dfda27cc1222ecf765750 Mon Sep 17 00:00:00 2001
From: Greg Kurz
Date: Tue, 9 Nov 2021 19:35:22 +0100
Subject: [PATCH 2/5] rcu: Introduce force_rcu notifier

The drain_call_rcu() function can be blocked as long as an RCU reader
stays in a read-side critical section. This is typically what happens
when a TCG vCPU is executing a busy loop. It can deadlock the QEMU
monitor as reported in
https://gitlab.com/qemu-project/qemu/-/issues/650 .

This can be avoided by allowing drain_call_rcu() to enforce an RCU
grace period. Since each reader might need to do specific actions to
end a read-side critical section, do it with notifiers.

Prepare ground for this by adding a notifier list to the RCU reader
struct and use it in wait_for_readers() if drain_call_rcu() is in
progress. An API is added for readers to register their notifiers.

This is largely based on a draft from Paolo Bonzini.

Suggested-by: Paolo Bonzini
Signed-off-by: Greg Kurz
Reviewed-by: Richard Henderson
Message-Id: <20211109183523.47726-2-groug@kaod.org>
Signed-off-by: Paolo Bonzini
---
 include/qemu/rcu.h | 15 +++++++++++++++
 util/rcu.c         | 19 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
index 515d327cf1..e69efbd47f 100644
--- a/include/qemu/rcu.h
+++ b/include/qemu/rcu.h
@@ -27,6 +27,7 @@
 #include "qemu/thread.h"
 #include "qemu/queue.h"
 #include "qemu/atomic.h"
+#include "qemu/notify.h"
 #include "qemu/sys_membarrier.h"
 
 #ifdef __cplusplus
@@ -66,6 +67,13 @@ struct rcu_reader_data {
 
     /* Data used for registry, protected by rcu_registry_lock */
     QLIST_ENTRY(rcu_reader_data) node;
+
+    /*
+     * NotifierList used to force an RCU grace period.  Accessed under
+     * rcu_registry_lock.  Note that the notifier is called _outside_
+     * the thread!
+     */
+    NotifierList force_rcu;
 };
 
 extern __thread struct rcu_reader_data rcu_reader;
@@ -180,6 +188,13 @@ G_DEFINE_AUTOPTR_CLEANUP_FUNC(RCUReadAuto, rcu_read_auto_unlock)
 #define RCU_READ_LOCK_GUARD() \
     g_autoptr(RCUReadAuto) _rcu_read_auto __attribute__((unused)) = rcu_read_auto_lock()
 
+/*
+ * Force-RCU notifiers tell readers that they should exit their
+ * read-side critical section.
+ */
+void rcu_add_force_rcu_notifier(Notifier *n);
+void rcu_remove_force_rcu_notifier(Notifier *n);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/util/rcu.c b/util/rcu.c
index 13ac0f75cb..c91da9f137 100644
--- a/util/rcu.c
+++ b/util/rcu.c
@@ -46,6 +46,7 @@
 unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
 
 QemuEvent rcu_gp_event;
+static int in_drain_call_rcu;
 static QemuMutex rcu_registry_lock;
 static QemuMutex rcu_sync_lock;
 
@@ -107,6 +108,8 @@ static void wait_for_readers(void)
                  * get some extra futex wakeups.
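For reference, a minimal usage sketch of the new API (not part of the patch):
only the rcu_*_thread() and rcu_*_force_rcu_notifier() calls come from this
series, the thread function and notifier names are made up.

/* Illustrative reader thread; my_force_rcu and reader_thread_fn are
 * hypothetical names. */
static void my_force_rcu(Notifier *notifier, void *data)
{
    /* Runs in the thread calling drain_call_rcu(): ask this reader to
     * leave its read-side critical section soon, e.g. by setting a flag
     * or queueing work that its loop will notice. */
}

static void *reader_thread_fn(void *arg)
{
    Notifier force_rcu;

    rcu_register_thread();
    force_rcu.notify = my_force_rcu;
    rcu_add_force_rcu_notifier(&force_rcu);

    /* ... work under rcu_read_lock()/rcu_read_unlock() ... */

    rcu_remove_force_rcu_notifier(&force_rcu);
    rcu_unregister_thread();
    return NULL;
}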
                 */
                qatomic_set(&index->waiting, false);
+            } else if (qatomic_read(&in_drain_call_rcu)) {
+                notifier_list_notify(&index->force_rcu, NULL);
             }
         }
 
@@ -339,8 +342,10 @@ void drain_call_rcu(void)
      * assumed.
      */
 
+    qatomic_inc(&in_drain_call_rcu);
     call_rcu1(&rcu_drain.rcu, drain_rcu_callback);
     qemu_event_wait(&rcu_drain.drain_complete_event);
+    qatomic_dec(&in_drain_call_rcu);
 
     if (locked) {
         qemu_mutex_lock_iothread();
@@ -363,6 +368,20 @@ void rcu_unregister_thread(void)
     qemu_mutex_unlock(&rcu_registry_lock);
 }
 
+void rcu_add_force_rcu_notifier(Notifier *n)
+{
+    qemu_mutex_lock(&rcu_registry_lock);
+    notifier_list_add(&rcu_reader.force_rcu, n);
+    qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+void rcu_remove_force_rcu_notifier(Notifier *n)
+{
+    qemu_mutex_lock(&rcu_registry_lock);
+    notifier_remove(n);
+    qemu_mutex_unlock(&rcu_registry_lock);
+}
+
 static void rcu_init_complete(void)
 {
     QemuThread thread;

From dd47a8f654d84f666b235ce8891e17ee76f9be8b Mon Sep 17 00:00:00 2001
From: Greg Kurz
Date: Tue, 9 Nov 2021 19:35:23 +0100
Subject: [PATCH 3/5] accel/tcg: Register a force_rcu notifier

A TCG vCPU doing a busy loop systematically hangs the QEMU monitor
if the user passes 'device_add' without argument. This is because
drain_call_rcu(), which is called from qmp_device_add(), cannot return
if readers don't exit read-side critical sections. That is typically
what busy-looping TCG vCPUs do:

int cpu_exec(CPUState *cpu)
{
[...]
    rcu_read_lock();
[...]
    while (!cpu_handle_exception(cpu, &ret)) {
        // Busy loop keeps vCPU here
    }
[...]
    rcu_read_unlock();
    return ret;
}

For MTTCG, have all vCPU threads register a force_rcu notifier that
will kick them out of the loop using async_run_on_cpu(). The notifier
is called with the rcu_registry_lock mutex held; using
async_run_on_cpu() ensures there are no deadlocks.

For RR, a single thread runs all vCPUs. Just register a single
notifier that kicks the current vCPU to the next one.

For MTTCG:
Suggested-by: Paolo Bonzini
For RR:
Suggested-by: Richard Henderson
Fixes: 7bed89958bfb ("device_core: use drain_call_rcu in in qmp_device_add")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/650
Signed-off-by: Greg Kurz
Reviewed-by: Richard Henderson
Message-Id: <20211109183523.47726-3-groug@kaod.org>
Signed-off-by: Paolo Bonzini
---
 accel/tcg/tcg-accel-ops-mttcg.c | 26 ++++++++++++++++++++++++++
 accel/tcg/tcg-accel-ops-rr.c    | 10 ++++++++++
 2 files changed, 36 insertions(+)

diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
index 847d2079d2..29632bd4c0 100644
--- a/accel/tcg/tcg-accel-ops-mttcg.c
+++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -28,6 +28,7 @@
 #include "sysemu/tcg.h"
 #include "sysemu/replay.h"
 #include "qemu/main-loop.h"
+#include "qemu/notify.h"
 #include "qemu/guest-random.h"
 #include "exec/exec-all.h"
 #include "hw/boards.h"
@@ -35,6 +36,26 @@
 #include "tcg-accel-ops.h"
 #include "tcg-accel-ops-mttcg.h"
 
+typedef struct MttcgForceRcuNotifier {
+    Notifier notifier;
+    CPUState *cpu;
+} MttcgForceRcuNotifier;
+
+static void do_nothing(CPUState *cpu, run_on_cpu_data d)
+{
+}
+
+static void mttcg_force_rcu(Notifier *notify, void *data)
+{
+    CPUState *cpu = container_of(notify, MttcgForceRcuNotifier, notifier)->cpu;
+
+    /*
+     * Called with rcu_registry_lock held, using async_run_on_cpu() ensures
+     * that there are no deadlocks.
+     */
+    async_run_on_cpu(cpu, do_nothing, RUN_ON_CPU_NULL);
+}
+
 /*
  * In the multi-threaded case each vCPU has its own thread. The TLS
  * variable current_cpu can be used deep in the code to find the
@@ -43,12 +64,16 @@
 
 static void *mttcg_cpu_thread_fn(void *arg)
 {
+    MttcgForceRcuNotifier force_rcu;
     CPUState *cpu = arg;
 
     assert(tcg_enabled());
    g_assert(!icount_enabled());
 
     rcu_register_thread();
+    force_rcu.notifier.notify = mttcg_force_rcu;
+    force_rcu.cpu = cpu;
+    rcu_add_force_rcu_notifier(&force_rcu.notifier);
     tcg_register_thread();
 
     qemu_mutex_lock_iothread();
@@ -100,6 +125,7 @@ static void *mttcg_cpu_thread_fn(void *arg)
 
     tcg_cpus_destroy(cpu);
     qemu_mutex_unlock_iothread();
+    rcu_remove_force_rcu_notifier(&force_rcu.notifier);
     rcu_unregister_thread();
     return NULL;
 }
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index a5fd26190e..bf59f53dbc 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -28,6 +28,7 @@
 #include "sysemu/tcg.h"
 #include "sysemu/replay.h"
 #include "qemu/main-loop.h"
+#include "qemu/notify.h"
 #include "qemu/guest-random.h"
 #include "exec/exec-all.h"
 
@@ -133,6 +134,11 @@ static void rr_deal_with_unplugged_cpus(void)
     }
 }
 
+static void rr_force_rcu(Notifier *notify, void *data)
+{
+    rr_kick_next_cpu();
+}
+
 /*
  * In the single-threaded case each vCPU is simulated in turn. If
  * there is more than a single vCPU we create a simple timer to kick
@@ -143,10 +149,13 @@
 
 static void *rr_cpu_thread_fn(void *arg)
 {
+    Notifier force_rcu;
     CPUState *cpu = arg;
 
     assert(tcg_enabled());
     rcu_register_thread();
+    force_rcu.notify = rr_force_rcu;
+    rcu_add_force_rcu_notifier(&force_rcu);
     tcg_register_thread();
 
     qemu_mutex_lock_iothread();
@@ -255,6 +264,7 @@ static void *rr_cpu_thread_fn(void *arg)
         rr_deal_with_unplugged_cpus();
     }
 
+    rcu_remove_force_rcu_notifier(&force_rcu);
     rcu_unregister_thread();
     return NULL;
 }
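As a reproduction note (not part of the series), the hang could be seen from
the HMP monitor of any TCG guest whose vCPU sits in a tight guest-side loop;
the command line below is illustrative only:

  # e.g. qemu-system-x86_64 -accel tcg -m 1G guest.img
  (qemu) device_add
  # before this series the monitor blocked forever in drain_call_rcu();
  # with it, the command returns promptly with the usual missing-argument error.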
From bd989ed44f847cba20b46a743770c152e188f365 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Wed, 10 Nov 2021 13:29:03 +0100
Subject: [PATCH 4/5] numa: avoid crash with SGX and "info numa"

Add the MEMORY_DEVICE_INFO_KIND_SGX_EPC case, so that enclave memory
is included in the output of "info numa" instead of crashing the
monitor.

Fixes: a7c565a941 ("sgx-epc: Add the fill_device_info() callback support", 2021-09-30)
Signed-off-by: Paolo Bonzini
---
 hw/core/numa.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/hw/core/numa.c b/hw/core/numa.c
index 510d096a88..e6050b2273 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -756,6 +756,7 @@ static void numa_stat_memory_devices(NumaNodeMem node_mem[])
     PCDIMMDeviceInfo *pcdimm_info;
     VirtioPMEMDeviceInfo *vpi;
     VirtioMEMDeviceInfo *vmi;
+    SgxEPCDeviceInfo *se;
 
     for (info = info_list; info; info = info->next) {
         MemoryDeviceInfo *value = info->value;
@@ -781,6 +782,12 @@ static void numa_stat_memory_devices(NumaNodeMem node_mem[])
             node_mem[vmi->node].node_mem += vmi->size;
             node_mem[vmi->node].node_plugged_mem += vmi->size;
             break;
+        case MEMORY_DEVICE_INFO_KIND_SGX_EPC:
+            se = value->u.sgx_epc.data;
+            /* TODO: once we support numa, assign to right node */
+            node_mem[0].node_mem += se->size;
+            node_mem[0].node_plugged_mem += se->size;
+            break;
         default:
             g_assert_not_reached();
         }
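For context (not part of the patch), with this case handled an SGX guest's
EPC section is simply accounted to node 0; the monitor output below is
indicative only, the exact formatting of "info numa" may differ:

  (qemu) info numa
  1 nodes
  node 0 cpus: 0 1
  node 0 size: 4096 MB
  node 0 plugged: 64 MB    <- now also counts the sgx-epc memory device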
From 2c3132279b9a962c27adaea53b4c8e8480385706 Mon Sep 17 00:00:00 2001
From: Yang Zhong
Date: Mon, 1 Nov 2021 12:20:09 -0400
Subject: [PATCH 5/5] sgx: Reset the vEPC regions during VM reboot

For bare-metal SGX on real hardware, the hardware provides guarantees
about SGX state at reboot.  For instance, all pages start out
uninitialized.  The vepc driver provides a similar guarantee today for
freshly-opened vepc instances, but guests such as Windows expect all
pages to be in uninitialized state on startup, including after every
guest reboot.

QEMU can invoke the ioctl to bring its vEPC pages back to uninitialized
state.  There is a possibility that some pages fail to be removed if
they are SECS pages and their child pages are in separate vEPC regions.
Therefore, the ioctl returns the number of EREMOVE failures, telling
QEMU to try the ioctl again after it's done with all vEPC regions.

The related kernel patches:
Link: https://lkml.kernel.org/r/20211021201155.1523989-3-pbonzini@redhat.com

Signed-off-by: Yang Zhong
Message-Id: <20211101162009.62161-6-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini
---
 hw/i386/sgx.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c
index 11607568b6..8fef3dd8fa 100644
--- a/hw/i386/sgx.c
+++ b/hw/i386/sgx.c
@@ -21,6 +21,8 @@
 #include "qapi/qapi-commands-misc-target.h"
 #include "exec/address-spaces.h"
 #include "sysemu/hw_accel.h"
+#include "sysemu/reset.h"
+#include <sys/ioctl.h>
 
 #define SGX_MAX_EPC_SECTIONS 8
 #define SGX_CPUID_EPC_INVALID 0x0
@@ -29,6 +31,11 @@
 #define SGX_CPUID_EPC_SECTION 0x1
 #define SGX_CPUID_EPC_MASK 0xF
 
+#define SGX_MAGIC 0xA4
+#define SGX_IOC_VEPC_REMOVE_ALL _IO(SGX_MAGIC, 0x04)
+
+#define RETRY_NUM 2
+
 static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high)
 {
     return (low & MAKE_64BIT_MASK(12, 20)) +
@@ -59,6 +66,46 @@ static uint64_t sgx_calc_host_epc_section_size(void)
     return size;
 }
 
+static void sgx_epc_reset(void *opaque)
+{
+    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+    HostMemoryBackend *hostmem;
+    SGXEPCDevice *epc;
+    int failures;
+    int fd, i, j, r;
+    static bool warned = false;
+
+    /*
+     * The second pass is needed to remove SECS pages that could not
+     * be removed during the first.
+     */
+    for (i = 0; i < RETRY_NUM; i++) {
+        failures = 0;
+        for (j = 0; j < pcms->sgx_epc.nr_sections; j++) {
+            epc = pcms->sgx_epc.sections[j];
+            hostmem = MEMORY_BACKEND(epc->hostmem);
+            fd = memory_region_get_fd(host_memory_backend_get_memory(hostmem));
+
+            r = ioctl(fd, SGX_IOC_VEPC_REMOVE_ALL);
+            if (r == -ENOTTY && !warned) {
+                warned = true;
+                warn_report("kernel does not support SGX_IOC_VEPC_REMOVE_ALL");
+                warn_report("SGX might operate incorrectly in the guest after reset");
+                break;
+            } else if (r > 0) {
+                /* SECS pages remain */
+                failures++;
+                if (i == 1) {
+                    error_report("cannot reset vEPC section %d", j);
+                }
+            }
+        }
+        if (!failures) {
+            break;
+        }
+    }
+}
+
 SGXInfo *qmp_query_sgx_capabilities(Error **errp)
 {
     SGXInfo *info = NULL;
@@ -190,4 +237,7 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms)
     }
 
     memory_region_set_size(&sgx_epc->mr, sgx_epc->size);
+
+    /* register the reset callback for sgx epc */
+    qemu_register_reset(sgx_epc_reset, NULL);
 }
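To illustrate the kernel-side contract that sgx_epc_reset() relies on, here
is a standalone sketch (not part of the patch); the /dev/sgx_vepc path and
the harness around the ioctl are assumptions, only the ioctl number mirrors
the definition in the patch:

/* Hypothetical user-space demo of SGX_IOC_VEPC_REMOVE_ALL retry semantics. */
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define SGX_MAGIC 0xA4
#define SGX_IOC_VEPC_REMOVE_ALL _IO(SGX_MAGIC, 0x04)

int main(void)
{
    int fd = open("/dev/sgx_vepc", O_RDWR);   /* path is illustrative */
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /*
     * A positive return value is the number of pages that could not be
     * removed yet (typically SECS pages that still had children on the
     * first pass), so a second pass is attempted, mirroring RETRY_NUM in
     * the patch.  A return of -1 with errno set (e.g. ENOTTY) means the
     * kernel does not support the ioctl.
     */
    for (int pass = 0; pass < 2; pass++) {
        int left = ioctl(fd, SGX_IOC_VEPC_REMOVE_ALL);
        printf("pass %d: return value %d\n", pass, left);
        if (left <= 0) {
            break;
        }
    }
    close(fd);
    return 0;
}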