From a0b9c5f75c05c12d30328a305377383652e62e63 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Tue, 9 Nov 2021 18:50:14 +0100
Subject: [PATCH 1/5] target/i386: sgx: mark device not user creatable

The device is created by the machine based on the sgx-epc property.
It should not be created by users.

Reported-by: Thomas Huth
Signed-off-by: Paolo Bonzini
---
 hw/i386/sgx-epc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/i386/sgx-epc.c b/hw/i386/sgx-epc.c
index 55e2217eae..e508827e78 100644
--- a/hw/i386/sgx-epc.c
+++ b/hw/i386/sgx-epc.c
@@ -154,6 +154,7 @@ static void sgx_epc_class_init(ObjectClass *oc, void *data)
     dc->realize = sgx_epc_realize;
     dc->unrealize = sgx_epc_unrealize;
     dc->desc = "SGX EPC section";
+    dc->user_creatable = false;
     device_class_set_props(dc, sgx_epc_properties);
 
     mdc->get_addr = sgx_epc_md_get_addr;

From ef149763a8fcce70b85dfda27cc1222ecf765750 Mon Sep 17 00:00:00 2001
From: Greg Kurz
Date: Tue, 9 Nov 2021 19:35:22 +0100
Subject: [PATCH 2/5] rcu: Introduce force_rcu notifier

The drain_call_rcu() function can be blocked as long as an RCU reader
stays in a read-side critical section. This is typically what happens
when a TCG vCPU is executing a busy loop. It can deadlock the QEMU
monitor as reported in
https://gitlab.com/qemu-project/qemu/-/issues/650 .

This can be avoided by allowing drain_call_rcu() to enforce an RCU
grace period. Since each reader might need to do specific actions to
end a read-side critical section, do it with notifiers.

Prepare ground for this by adding a notifier list to the RCU reader
struct and use it in wait_for_readers() if drain_call_rcu() is in
progress. An API is added for readers to register their notifiers.

This is largely based on a draft from Paolo Bonzini.

Suggested-by: Paolo Bonzini
Signed-off-by: Greg Kurz
Reviewed-by: Richard Henderson
Message-Id: <20211109183523.47726-2-groug@kaod.org>
Signed-off-by: Paolo Bonzini
---
 include/qemu/rcu.h | 15 +++++++++++++++
 util/rcu.c         | 19 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
index 515d327cf1..e69efbd47f 100644
--- a/include/qemu/rcu.h
+++ b/include/qemu/rcu.h
@@ -27,6 +27,7 @@
 #include "qemu/thread.h"
 #include "qemu/queue.h"
 #include "qemu/atomic.h"
+#include "qemu/notify.h"
 #include "qemu/sys_membarrier.h"
 
 #ifdef __cplusplus
@@ -66,6 +67,13 @@ struct rcu_reader_data {
 
     /* Data used for registry, protected by rcu_registry_lock */
     QLIST_ENTRY(rcu_reader_data) node;
+
+    /*
+     * NotifierList used to force an RCU grace period.  Accessed under
+     * rcu_registry_lock.  Note that the notifier is called _outside_
+     * the thread!
+     */
+    NotifierList force_rcu;
 };
 
 extern __thread struct rcu_reader_data rcu_reader;
@@ -180,6 +188,13 @@ G_DEFINE_AUTOPTR_CLEANUP_FUNC(RCUReadAuto, rcu_read_auto_unlock)
 #define RCU_READ_LOCK_GUARD() \
     g_autoptr(RCUReadAuto) _rcu_read_auto __attribute__((unused)) = rcu_read_auto_lock()
 
+/*
+ * Force-RCU notifiers tell readers that they should exit their
+ * read-side critical section.
+ */
+void rcu_add_force_rcu_notifier(Notifier *n);
+void rcu_remove_force_rcu_notifier(Notifier *n);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/util/rcu.c b/util/rcu.c
index 13ac0f75cb..c91da9f137 100644
--- a/util/rcu.c
+++ b/util/rcu.c
@@ -46,6 +46,7 @@
 unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
 
 QemuEvent rcu_gp_event;
+static int in_drain_call_rcu;
 static QemuMutex rcu_registry_lock;
 static QemuMutex rcu_sync_lock;
 
@@ -107,6 +108,8 @@ static void wait_for_readers(void)
                  * get some extra futex wakeups.
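For reference, a minimal usage sketch of the new API (not part of the patch):
only the rcu_*_thread() and rcu_*_force_rcu_notifier() calls come from this
series, the thread function and notifier names are made up.

/* Illustrative reader thread; my_force_rcu and reader_thread_fn are
 * hypothetical names. */
static void my_force_rcu(Notifier *notifier, void *data)
{
    /* Runs in the thread calling drain_call_rcu(): ask this reader to
     * leave its read-side critical section soon, e.g. by setting a flag
     * or queueing work that its loop will notice. */
}

static void *reader_thread_fn(void *arg)
{
    Notifier force_rcu;

    rcu_register_thread();
    force_rcu.notify = my_force_rcu;
    rcu_add_force_rcu_notifier(&force_rcu);

    /* ... work under rcu_read_lock()/rcu_read_unlock() ... */

    rcu_remove_force_rcu_notifier(&force_rcu);
    rcu_unregister_thread();
    return NULL;
}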
                 */
                qatomic_set(&index->waiting, false);
+            } else if (qatomic_read(&in_drain_call_rcu)) {
+                notifier_list_notify(&index->force_rcu, NULL);
             }
         }
 
@@ -339,8 +342,10 @@ void drain_call_rcu(void)
      * assumed.
      */
 
+    qatomic_inc(&in_drain_call_rcu);
     call_rcu1(&rcu_drain.rcu, drain_rcu_callback);
     qemu_event_wait(&rcu_drain.drain_complete_event);
+    qatomic_dec(&in_drain_call_rcu);
 
     if (locked) {
         qemu_mutex_lock_iothread();
@@ -363,6 +368,20 @@ void rcu_unregister_thread(void)
     qemu_mutex_unlock(&rcu_registry_lock);
 }
 
+void rcu_add_force_rcu_notifier(Notifier *n)
+{
+    qemu_mutex_lock(&rcu_registry_lock);
+    notifier_list_add(&rcu_reader.force_rcu, n);
+    qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+void rcu_remove_force_rcu_notifier(Notifier *n)
+{
+    qemu_mutex_lock(&rcu_registry_lock);
+    notifier_remove(n);
+    qemu_mutex_unlock(&rcu_registry_lock);
+}
+
 static void rcu_init_complete(void)
 {
     QemuThread thread;

From dd47a8f654d84f666b235ce8891e17ee76f9be8b Mon Sep 17 00:00:00 2001
From: Greg Kurz
Date: Tue, 9 Nov 2021 19:35:23 +0100
Subject: [PATCH 3/5] accel/tcg: Register a force_rcu notifier

A TCG vCPU doing a busy loop systematically hangs the QEMU monitor
if the user passes 'device_add' without argument. This is because
drain_call_rcu(), which is called from qmp_device_add(), cannot return
if readers don't exit read-side critical sections. That is typically
what busy-looping TCG vCPUs do:

int cpu_exec(CPUState *cpu)
{
[...]
    rcu_read_lock();
[...]
    while (!cpu_handle_exception(cpu, &ret)) {
        // Busy loop keeps vCPU here
    }
[...]
    rcu_read_unlock();
    return ret;
}

For MTTCG, have all vCPU threads register a force_rcu notifier that
will kick them out of the loop using async_run_on_cpu(). The notifier
is called with the rcu_registry_lock mutex held; using
async_run_on_cpu() ensures there are no deadlocks.

For RR, a single thread runs all vCPUs. Just register a single
notifier that kicks the current vCPU to the next one.

For MTTCG:
Suggested-by: Paolo Bonzini
For RR:
Suggested-by: Richard Henderson
Fixes: 7bed89958bfb ("device_core: use drain_call_rcu in in qmp_device_add")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/650
Signed-off-by: Greg Kurz
Reviewed-by: Richard Henderson
Message-Id: <20211109183523.47726-3-groug@kaod.org>
Signed-off-by: Paolo Bonzini
---
 accel/tcg/tcg-accel-ops-mttcg.c | 26 ++++++++++++++++++++++++++
 accel/tcg/tcg-accel-ops-rr.c    | 10 ++++++++++
 2 files changed, 36 insertions(+)

diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
index 847d2079d2..29632bd4c0 100644
--- a/accel/tcg/tcg-accel-ops-mttcg.c
+++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -28,6 +28,7 @@
 #include "sysemu/tcg.h"
 #include "sysemu/replay.h"
 #include "qemu/main-loop.h"
+#include "qemu/notify.h"
 #include "qemu/guest-random.h"
 #include "exec/exec-all.h"
 #include "hw/boards.h"
@@ -35,6 +36,26 @@
 #include "tcg-accel-ops.h"
 #include "tcg-accel-ops-mttcg.h"
 
+typedef struct MttcgForceRcuNotifier {
+    Notifier notifier;
+    CPUState *cpu;
+} MttcgForceRcuNotifier;
+
+static void do_nothing(CPUState *cpu, run_on_cpu_data d)
+{
+}
+
+static void mttcg_force_rcu(Notifier *notify, void *data)
+{
+    CPUState *cpu = container_of(notify, MttcgForceRcuNotifier, notifier)->cpu;
+
+    /*
+     * Called with rcu_registry_lock held, using async_run_on_cpu() ensures
+     * that there are no deadlocks.
+     */
+    async_run_on_cpu(cpu, do_nothing, RUN_ON_CPU_NULL);
+}
+
 /*
  * In the multi-threaded case each vCPU has its own thread. The TLS
  * variable current_cpu can be used deep in the code to find the
@@ -43,12 +64,16 @@
 
 static void *mttcg_cpu_thread_fn(void *arg)
 {
+    MttcgForceRcuNotifier force_rcu;
     CPUState *cpu = arg;
 
     assert(tcg_enabled());
    g_assert(!icount_enabled());
 
     rcu_register_thread();
+    force_rcu.notifier.notify = mttcg_force_rcu;
+    force_rcu.cpu = cpu;
+    rcu_add_force_rcu_notifier(&force_rcu.notifier);
     tcg_register_thread();
 
     qemu_mutex_lock_iothread();
@@ -100,6 +125,7 @@ static void *mttcg_cpu_thread_fn(void *arg)
 
     tcg_cpus_destroy(cpu);
     qemu_mutex_unlock_iothread();
+    rcu_remove_force_rcu_notifier(&force_rcu.notifier);
     rcu_unregister_thread();
     return NULL;
 }
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index a5fd26190e..bf59f53dbc 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -28,6 +28,7 @@
 #include "sysemu/tcg.h"
 #include "sysemu/replay.h"
 #include "qemu/main-loop.h"
+#include "qemu/notify.h"
 #include "qemu/guest-random.h"
 #include "exec/exec-all.h"
 
@@ -133,6 +134,11 @@ static void rr_deal_with_unplugged_cpus(void)
     }
 }
 
+static void rr_force_rcu(Notifier *notify, void *data)
+{
+    rr_kick_next_cpu();
+}
+
 /*
  * In the single-threaded case each vCPU is simulated in turn. If
  * there is more than a single vCPU we create a simple timer to kick
@@ -143,10 +149,13 @@
 
 static void *rr_cpu_thread_fn(void *arg)
 {
+    Notifier force_rcu;
     CPUState *cpu = arg;
 
     assert(tcg_enabled());
     rcu_register_thread();
+    force_rcu.notify = rr_force_rcu;
+    rcu_add_force_rcu_notifier(&force_rcu);
     tcg_register_thread();
 
     qemu_mutex_lock_iothread();
@@ -255,6 +264,7 @@ static void *rr_cpu_thread_fn(void *arg)
         rr_deal_with_unplugged_cpus();
     }
 
+    rcu_remove_force_rcu_notifier(&force_rcu);
     rcu_unregister_thread();
     return NULL;
 }
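As a reproduction note (not part of the series), the hang could be seen from
the HMP monitor of any TCG guest whose vCPU sits in a tight guest-side loop;
the command line below is illustrative only:

  # e.g. qemu-system-x86_64 -accel tcg -m 1G guest.img
  (qemu) device_add
  # before this series the monitor blocked forever in drain_call_rcu();
  # with it, the command returns promptly with the usual missing-argument error.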
From bd989ed44f847cba20b46a743770c152e188f365 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Wed, 10 Nov 2021 13:29:03 +0100
Subject: [PATCH 4/5] numa: avoid crash with SGX and "info numa"

Add the MEMORY_DEVICE_INFO_KIND_SGX_EPC case, so that enclave memory
is included in the output of "info numa" instead of crashing the
monitor.

Fixes: a7c565a941 ("sgx-epc: Add the fill_device_info() callback support", 2021-09-30)
Signed-off-by: Paolo Bonzini
---
 hw/core/numa.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/hw/core/numa.c b/hw/core/numa.c
index 510d096a88..e6050b2273 100644
--- a/hw/core/numa.c
+++ b/hw/core/numa.c
@@ -756,6 +756,7 @@ static void numa_stat_memory_devices(NumaNodeMem node_mem[])
     PCDIMMDeviceInfo *pcdimm_info;
     VirtioPMEMDeviceInfo *vpi;
     VirtioMEMDeviceInfo *vmi;
+    SgxEPCDeviceInfo *se;
 
     for (info = info_list; info; info = info->next) {
         MemoryDeviceInfo *value = info->value;
@@ -781,6 +782,12 @@ static void numa_stat_memory_devices(NumaNodeMem node_mem[])
             node_mem[vmi->node].node_mem += vmi->size;
             node_mem[vmi->node].node_plugged_mem += vmi->size;
             break;
+        case MEMORY_DEVICE_INFO_KIND_SGX_EPC:
+            se = value->u.sgx_epc.data;
+            /* TODO: once we support numa, assign to right node */
+            node_mem[0].node_mem += se->size;
+            node_mem[0].node_plugged_mem += se->size;
+            break;
         default:
             g_assert_not_reached();
         }
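For context (not part of the patch), with this case handled an SGX guest's
EPC section is simply accounted to node 0; the monitor output below is
indicative only, the exact formatting of "info numa" may differ:

  (qemu) info numa
  1 nodes
  node 0 cpus: 0 1
  node 0 size: 4096 MB
  node 0 plugged: 64 MB    <- now also counts the sgx-epc memory device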
From 2c3132279b9a962c27adaea53b4c8e8480385706 Mon Sep 17 00:00:00 2001
From: Yang Zhong
Date: Mon, 1 Nov 2021 12:20:09 -0400
Subject: [PATCH 5/5] sgx: Reset the vEPC regions during VM reboot

For bare-metal SGX on real hardware, the hardware provides guarantees
about SGX state at reboot.  For instance, all pages start out
uninitialized.  The vepc driver provides a similar guarantee today for
freshly-opened vepc instances, but guests such as Windows expect all
pages to be in uninitialized state on startup, including after every
guest reboot.

QEMU can invoke the ioctl to bring its vEPC pages back to uninitialized
state.  There is a possibility that some pages fail to be removed if
they are SECS pages and their child pages are in separate vEPC regions.
Therefore, the ioctl returns the number of EREMOVE failures, telling
QEMU to try the ioctl again after it's done with all vEPC regions.

The related kernel patches:
Link: https://lkml.kernel.org/r/20211021201155.1523989-3-pbonzini@redhat.com

Signed-off-by: Yang Zhong
Message-Id: <20211101162009.62161-6-yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini
---
 hw/i386/sgx.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c
index 11607568b6..8fef3dd8fa 100644
--- a/hw/i386/sgx.c
+++ b/hw/i386/sgx.c
@@ -21,6 +21,8 @@
 #include "qapi/qapi-commands-misc-target.h"
 #include "exec/address-spaces.h"
 #include "sysemu/hw_accel.h"
+#include "sysemu/reset.h"
+#include <sys/ioctl.h>
 
 #define SGX_MAX_EPC_SECTIONS 8
 #define SGX_CPUID_EPC_INVALID 0x0
@@ -29,6 +31,11 @@
 #define SGX_CPUID_EPC_SECTION 0x1
 #define SGX_CPUID_EPC_MASK 0xF
 
+#define SGX_MAGIC 0xA4
+#define SGX_IOC_VEPC_REMOVE_ALL _IO(SGX_MAGIC, 0x04)
+
+#define RETRY_NUM 2
+
 static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high)
 {
     return (low & MAKE_64BIT_MASK(12, 20)) +
@@ -59,6 +66,46 @@ static uint64_t sgx_calc_host_epc_section_size(void)
     return size;
 }
 
+static void sgx_epc_reset(void *opaque)
+{
+    PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+    HostMemoryBackend *hostmem;
+    SGXEPCDevice *epc;
+    int failures;
+    int fd, i, j, r;
+    static bool warned = false;
+
+    /*
+     * The second pass is needed to remove SECS pages that could not
+     * be removed during the first.
+     */
+    for (i = 0; i < RETRY_NUM; i++) {
+        failures = 0;
+        for (j = 0; j < pcms->sgx_epc.nr_sections; j++) {
+            epc = pcms->sgx_epc.sections[j];
+            hostmem = MEMORY_BACKEND(epc->hostmem);
+            fd = memory_region_get_fd(host_memory_backend_get_memory(hostmem));
+
+            r = ioctl(fd, SGX_IOC_VEPC_REMOVE_ALL);
+            if (r == -ENOTTY && !warned) {
+                warned = true;
+                warn_report("kernel does not support SGX_IOC_VEPC_REMOVE_ALL");
+                warn_report("SGX might operate incorrectly in the guest after reset");
+                break;
+            } else if (r > 0) {
+                /* SECS pages remain */
+                failures++;
+                if (i == 1) {
+                    error_report("cannot reset vEPC section %d", j);
+                }
+            }
+        }
+        if (!failures) {
+            break;
+        }
+    }
+}
+
 SGXInfo *qmp_query_sgx_capabilities(Error **errp)
 {
     SGXInfo *info = NULL;
@@ -190,4 +237,7 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms)
     }
 
     memory_region_set_size(&sgx_epc->mr, sgx_epc->size);
+
+    /* register the reset callback for sgx epc */
+    qemu_register_reset(sgx_epc_reset, NULL);
 }
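To illustrate the kernel-side contract that sgx_epc_reset() relies on, here
is a standalone sketch (not part of the patch); the /dev/sgx_vepc path and
the harness around the ioctl are assumptions, only the ioctl number mirrors
the definition in the patch:

/* Hypothetical user-space demo of SGX_IOC_VEPC_REMOVE_ALL retry semantics. */
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define SGX_MAGIC 0xA4
#define SGX_IOC_VEPC_REMOVE_ALL _IO(SGX_MAGIC, 0x04)

int main(void)
{
    int fd = open("/dev/sgx_vepc", O_RDWR);   /* path is illustrative */
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /*
     * A positive return value is the number of pages that could not be
     * removed yet (typically SECS pages that still had children on the
     * first pass), so a second pass is attempted, mirroring RETRY_NUM in
     * the patch.  A return of -1 with errno set (e.g. ENOTTY) means the
     * kernel does not support the ioctl.
     */
    for (int pass = 0; pass < 2; pass++) {
        int left = ioctl(fd, SGX_IOC_VEPC_REMOVE_ALL);
        printf("pass %d: return value %d\n", pass, left);
        if (left <= 0) {
            break;
        }
    }
    close(fd);
    return 0;
}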