- Reorg cpu_tb_exec around setjmp.

- Use __attribute__((target)) for buffer_is_zero. - Add perfmap and jitdump for perf support. -----BEGIN PGP SIGNATURE----- iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmPFtXgdHHJpY2hhcmQu aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV+62Af/YLdd77+IO8wcPN1n 2esKNlrrYQ2/x40Hede6L4pp+Qz6M810YgNbhMchBS3u8YGIFzmVLaiz5N4nBGTq ricOit6Jj2KonlxWaxgma1BW36ydBIwphC20WjrLTVgdNGvxmy4JKbfklTd4oIZ+ +Kf1VTN1bbt7vfDntd454ck2rhtVZBEvYTynikA3LJ1W7EVN5etDUnrN3VwbaTTK 0aY8MbDfGUASyrDzmaRcF5F90K+V2STuEUs1whgOuTBusZ6n3+FSDhW8CuDhsy/l rXtPkQba3rtUWfJ1+o7bx3Gg4nbfn/9lxIfGwhQ79ywD9InjqLqu9etwVyhijx+n k+tvPw== =xFfF -----END PGP SIGNATURE----- Merge tag 'pull-tcg-20230116' of https://gitlab.com/rth7680/qemu into staging - Reorg cpu_tb_exec around setjmp. - Use __attribute__((target)) for buffer_is_zero. - Add perfmap and jitdump for perf support. # -----BEGIN PGP SIGNATURE----- # # iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmPFtXgdHHJpY2hhcmQu # aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV+62Af/YLdd77+IO8wcPN1n # 2esKNlrrYQ2/x40Hede6L4pp+Qz6M810YgNbhMchBS3u8YGIFzmVLaiz5N4nBGTq # ricOit6Jj2KonlxWaxgma1BW36ydBIwphC20WjrLTVgdNGvxmy4JKbfklTd4oIZ+ # +Kf1VTN1bbt7vfDntd454ck2rhtVZBEvYTynikA3LJ1W7EVN5etDUnrN3VwbaTTK # 0aY8MbDfGUASyrDzmaRcF5F90K+V2STuEUs1whgOuTBusZ6n3+FSDhW8CuDhsy/l # rXtPkQba3rtUWfJ1+o7bx3Gg4nbfn/9lxIfGwhQ79ywD9InjqLqu9etwVyhijx+n # k+tvPw== # =xFfF # -----END PGP SIGNATURE----- # gpg: Signature made Mon 16 Jan 2023 20:37:12 GMT # gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F # gpg: issuer "richard.henderson@linaro.org" # gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full] # Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F * tag 'pull-tcg-20230116' of https://gitlab.com/rth7680/qemu: accel/tcg: Split out cpu_exec_{setjmp,loop} util/bufferiszero: Use __attribute__((target)) for avx2/avx512 tcg: add perfmap and jitdump accel/tcg: Add debuginfo support linux-user: Clean up when exiting due to a signal Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2023-01-17 10:24:15 +00:00 · 2023-01-17 10:24:15 +00:00 · 7c9236d6d6
commit 7c9236d6d6
parent 2f8d6a88e4 61710a7e23
19 changed files with 763 additions and 101 deletions
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@ -909,64 +909,10 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
 /* main execution loop */
-int cpu_exec(CPUState *cpu)
+static int __attribute__((noinline))
 cpu_exec_loop(CPUState *cpu, SyncClocks *sc)
 {
    int ret;
    SyncClocks sc = { 0 };
    /* replay_interrupt may need current_cpu */
    current_cpu = cpu;
    if (cpu_handle_halt(cpu)) {
        return EXCP_HALTED;
    }
    rcu_read_lock();
    cpu_exec_enter(cpu);
    /* Calculate difference between guest clock and host clock.
     * This delay includes the delay of the last cycle, so
     * what we have to do is sleep until it is 0. As for the
     * advance/delay we gain here, we try to fix it next time.
     */
    init_delay_params(&sc, cpu);
    /* prepare setjmp context for exception handling */
    if (sigsetjmp(cpu->jmp_env, 0) != 0) {
 #if defined(__clang__)
        /*
         * Some compilers wrongly smash all local variables after
         * siglongjmp (the spec requires that only non-volatile locals
         * which are changed between the sigsetjmp and siglongjmp are
         * permitted to be trashed). There were bug reports for gcc
         * 4.5.0 and clang.  The bug is fixed in all versions of gcc
         * that we support, but is still unfixed in clang:
         *   https://bugs.llvm.org/show_bug.cgi?id=21183
         *
         * Reload an essential local variable here for those compilers.
         * Newer versions of gcc would complain about this code (-Wclobbered),
         * so we only perform the workaround for clang.
         */
        cpu = current_cpu;
 #else
        /* Non-buggy compilers preserve this; assert the correct value. */
        g_assert(cpu == current_cpu);
 #endif
 #ifndef CONFIG_SOFTMMU
        clear_helper_retaddr();
        if (have_mmap_lock()) {
            mmap_unlock();
        }
 #endif
        if (qemu_mutex_iothread_locked()) {
            qemu_mutex_unlock_iothread();
        }
        qemu_plugin_disable_mem_helpers(cpu);
        assert_no_pages_locked();
    }
    /* if an exception is pending, we execute it here */
    while (!cpu_handle_exception(cpu, &ret)) {
@ -1033,9 +979,60 @@ int cpu_exec(CPUState *cpu)
            /* Try to align the host and virtual clocks
               if the guest is in advance */
-            align_clocks(&sc, cpu);
+            align_clocks(sc, cpu);
        }
    }
    return ret;
 }
 static int cpu_exec_setjmp(CPUState *cpu, SyncClocks *sc)
 {
    /* Prepare setjmp context for exception handling. */
    if (unlikely(sigsetjmp(cpu->jmp_env, 0) != 0)) {
        /* Non-buggy compilers preserve this; assert the correct value. */
        g_assert(cpu == current_cpu);
 #ifndef CONFIG_SOFTMMU
        clear_helper_retaddr();
        if (have_mmap_lock()) {
            mmap_unlock();
        }
 #endif
        if (qemu_mutex_iothread_locked()) {
            qemu_mutex_unlock_iothread();
        }
        qemu_plugin_disable_mem_helpers(cpu);
        assert_no_pages_locked();
    }
    return cpu_exec_loop(cpu, sc);
 }
 int cpu_exec(CPUState *cpu)
 {
    int ret;
    SyncClocks sc = { 0 };
    /* replay_interrupt may need current_cpu */
    current_cpu = cpu;
    if (cpu_handle_halt(cpu)) {
        return EXCP_HALTED;
    }
    rcu_read_lock();
    cpu_exec_enter(cpu);
    /*
     * Calculate difference between guest clock and host clock.
     * This delay includes the delay of the last cycle, so
     * what we have to do is sleep until it is 0. As for the
     * advance/delay we gain here, we try to fix it next time.
     */
    init_delay_params(&sc, cpu);
    ret = cpu_exec_setjmp(cpu, &sc);
    cpu_exec_exit(cpu);
    rcu_read_unlock();
--- a/accel/tcg/debuginfo.c
+++ b/accel/tcg/debuginfo.c
@ -0,0 +1,96 @@
 /*
 * Debug information support.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */
 #include "qemu/osdep.h"
 #include "qemu/lockable.h"
 #include <elfutils/libdwfl.h>
 #include "debuginfo.h"
 static QemuMutex lock;
 static Dwfl *dwfl;
 static const Dwfl_Callbacks dwfl_callbacks = {
    .find_elf = NULL,
    .find_debuginfo = dwfl_standard_find_debuginfo,
    .section_address = NULL,
    .debuginfo_path = NULL,
 };
 __attribute__((constructor))
 static void debuginfo_init(void)
 {
    qemu_mutex_init(&lock);
 }
 void debuginfo_report_elf(const char *name, int fd, uint64_t bias)
 {
    QEMU_LOCK_GUARD(&lock);
    if (dwfl) {
        dwfl_report_begin_add(dwfl);
    } else {
        dwfl = dwfl_begin(&dwfl_callbacks);
    }
    if (dwfl) {
        dwfl_report_elf(dwfl, name, name, fd, bias, true);
        dwfl_report_end(dwfl, NULL, NULL);
    }
 }
 void debuginfo_lock(void)
 {
    qemu_mutex_lock(&lock);
 }
 void debuginfo_query(struct debuginfo_query *q, size_t n)
 {
    const char *symbol, *file;
    Dwfl_Module *dwfl_module;
    Dwfl_Line *dwfl_line;
    GElf_Off dwfl_offset;
    GElf_Sym dwfl_sym;
    size_t i;
    int line;
    if (!dwfl) {
        return;
    }
    for (i = 0; i < n; i++) {
        dwfl_module = dwfl_addrmodule(dwfl, q[i].address);
        if (!dwfl_module) {
            continue;
        }
        if (q[i].flags & DEBUGINFO_SYMBOL) {
            symbol = dwfl_module_addrinfo(dwfl_module, q[i].address,
                                          &dwfl_offset, &dwfl_sym,
                                          NULL, NULL, NULL);
            if (symbol) {
                q[i].symbol = symbol;
                q[i].offset = dwfl_offset;
            }
        }
        if (q[i].flags & DEBUGINFO_LINE) {
            dwfl_line = dwfl_module_getsrc(dwfl_module, q[i].address);
            if (dwfl_line) {
                file = dwfl_lineinfo(dwfl_line, NULL, &line, 0, NULL, NULL);
                if (file) {
                    q[i].file = file;
                    q[i].line = line;
                }
            }
        }
    }
 }
 void debuginfo_unlock(void)
 {
    qemu_mutex_unlock(&lock);
 }
--- a/accel/tcg/debuginfo.h
+++ b/accel/tcg/debuginfo.h
@ -0,0 +1,77 @@
 /*
 * Debug information support.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */
 #ifndef ACCEL_TCG_DEBUGINFO_H
 #define ACCEL_TCG_DEBUGINFO_H
 /*
 * Debuginfo describing a certain address.
 */
 struct debuginfo_query {
    uint64_t address;    /* Input: address. */
    int flags;           /* Input: debuginfo subset. */
    const char *symbol;  /* Symbol that the address is part of. */
    uint64_t offset;     /* Offset from the symbol. */
    const char *file;    /* Source file associated with the address. */
    int line;            /* Line number in the source file. */
 };
 /*
 * Debuginfo subsets.
 */
 #define DEBUGINFO_SYMBOL BIT(1)
 #define DEBUGINFO_LINE   BIT(2)
 #if defined(CONFIG_TCG) && defined(CONFIG_LIBDW)
 /*
 * Load debuginfo for the specified guest ELF image.
 * Return true on success, false on failure.
 */
 void debuginfo_report_elf(const char *name, int fd, uint64_t bias);
 /*
 * Take the debuginfo lock.
 */
 void debuginfo_lock(void);
 /*
 * Fill each on N Qs with the debuginfo about Q->ADDRESS as specified by
 * Q->FLAGS:
 *
 * - DEBUGINFO_SYMBOL: update Q->SYMBOL and Q->OFFSET. If symbol debuginfo is
 *                     missing, then leave them as is.
 * - DEBUINFO_LINE: update Q->FILE and Q->LINE. If line debuginfo is missing,
 *                  then leave them as is.
 *
 * This function must be called under the debuginfo lock. The results can be
 * accessed only until the debuginfo lock is released.
 */
 void debuginfo_query(struct debuginfo_query *q, size_t n);
 /*
 * Release the debuginfo lock.
 */
 void debuginfo_unlock(void);
 #else
 static inline void debuginfo_report_elf(const char *image_name, int image_fd,
                                        uint64_t load_bias)
 {
 }
 static inline void debuginfo_lock(void)
 {
 }
 static inline void debuginfo_query(struct debuginfo_query *q, size_t n)
 {
 }
 static inline void debuginfo_unlock(void)
 {
 }
 #endif
 #endif
--- a/accel/tcg/meson.build
+++ b/accel/tcg/meson.build
@ -12,6 +12,8 @@ tcg_ss.add(files(
 tcg_ss.add(when: 'CONFIG_USER_ONLY', if_true: files('user-exec.c'))
 tcg_ss.add(when: 'CONFIG_SOFTMMU', if_false: files('user-exec-stub.c'))
 tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c')])
 tcg_ss.add(when: libdw, if_true: files('debuginfo.c'))
 tcg_ss.add(when: 'CONFIG_LINUX', if_true: files('perf.c'))
 specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
 specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
--- a/accel/tcg/perf.c
+++ b/accel/tcg/perf.c
@ -0,0 +1,375 @@
 /*
 * Linux perf perf-<pid>.map and jit-<pid>.dump integration.
 *
 * The jitdump spec can be found at [1].
 *
 * [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/tools/perf/Documentation/jitdump-specification.txt
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */
 #include "qemu/osdep.h"
 #include "elf.h"
 #include "exec/exec-all.h"
 #include "qemu/timer.h"
 #include "tcg/tcg.h"
 #include "debuginfo.h"
 #include "perf.h"
 static FILE *safe_fopen_w(const char *path)
 {
    int saved_errno;
    FILE *f;
    int fd;
    /* Delete the old file, if any. */
    unlink(path);
    /* Avoid symlink attacks by using O_CREAT | O_EXCL. */
    fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
    if (fd == -1) {
        return NULL;
    }
    /* Convert fd to FILE*. */
    f = fdopen(fd, "w");
    if (f == NULL) {
        saved_errno = errno;
        close(fd);
        errno = saved_errno;
        return NULL;
    }
    return f;
 }
 static FILE *perfmap;
 void perf_enable_perfmap(void)
 {
    char map_file[32];
    snprintf(map_file, sizeof(map_file), "/tmp/perf-%d.map", getpid());
    perfmap = safe_fopen_w(map_file);
    if (perfmap == NULL) {
        warn_report("Could not open %s: %s, proceeding without perfmap",
                    map_file, strerror(errno));
    }
 }
 /* Get PC and size of code JITed for guest instruction #INSN. */
 static void get_host_pc_size(uintptr_t *host_pc, uint16_t *host_size,
                             const void *start, size_t insn)
 {
    uint16_t start_off = insn ? tcg_ctx->gen_insn_end_off[insn - 1] : 0;
    if (host_pc) {
        *host_pc = (uintptr_t)start + start_off;
    }
    if (host_size) {
        *host_size = tcg_ctx->gen_insn_end_off[insn] - start_off;
    }
 }
 static const char *pretty_symbol(const struct debuginfo_query *q, size_t *len)
 {
    static __thread char buf[64];
    int tmp;
    if (!q->symbol) {
        tmp = snprintf(buf, sizeof(buf), "guest-0x%"PRIx64, q->address);
        if (len) {
            *len = MIN(tmp + 1, sizeof(buf));
        }
        return buf;
    }
    if (!q->offset) {
        if (len) {
            *len = strlen(q->symbol) + 1;
        }
        return q->symbol;
    }
    tmp = snprintf(buf, sizeof(buf), "%s+0x%"PRIx64, q->symbol, q->offset);
    if (len) {
        *len = MIN(tmp + 1, sizeof(buf));
    }
    return buf;
 }
 static void write_perfmap_entry(const void *start, size_t insn,
                                const struct debuginfo_query *q)
 {
    uint16_t host_size;
    uintptr_t host_pc;
    get_host_pc_size(&host_pc, &host_size, start, insn);
    fprintf(perfmap, "%"PRIxPTR" %"PRIx16" %s\n",
            host_pc, host_size, pretty_symbol(q, NULL));
 }
 static FILE *jitdump;
 #define JITHEADER_MAGIC 0x4A695444
 #define JITHEADER_VERSION 1
 struct jitheader {
    uint32_t magic;
    uint32_t version;
    uint32_t total_size;
    uint32_t elf_mach;
    uint32_t pad1;
    uint32_t pid;
    uint64_t timestamp;
    uint64_t flags;
 };
 enum jit_record_type {
    JIT_CODE_LOAD = 0,
    JIT_CODE_DEBUG_INFO = 2,
 };
 struct jr_prefix {
    uint32_t id;
    uint32_t total_size;
    uint64_t timestamp;
 };
 struct jr_code_load {
    struct jr_prefix p;
    uint32_t pid;
    uint32_t tid;
    uint64_t vma;
    uint64_t code_addr;
    uint64_t code_size;
    uint64_t code_index;
 };
 struct debug_entry {
    uint64_t addr;
    int lineno;
    int discrim;
    const char name[];
 };
 struct jr_code_debug_info {
    struct jr_prefix p;
    uint64_t code_addr;
    uint64_t nr_entry;
    struct debug_entry entries[];
 };
 static uint32_t get_e_machine(void)
 {
    Elf64_Ehdr elf_header;
    FILE *exe;
    size_t n;
    QEMU_BUILD_BUG_ON(offsetof(Elf32_Ehdr, e_machine) !=
                      offsetof(Elf64_Ehdr, e_machine));
    exe = fopen("/proc/self/exe", "r");
    if (exe == NULL) {
        return EM_NONE;
    }
    n = fread(&elf_header, sizeof(elf_header), 1, exe);
    fclose(exe);
    if (n != 1) {
        return EM_NONE;
    }
    return elf_header.e_machine;
 }
 void perf_enable_jitdump(void)
 {
    struct jitheader header;
    char jitdump_file[32];
    void *perf_marker;
    if (!use_rt_clock) {
        warn_report("CLOCK_MONOTONIC is not available, proceeding without jitdump");
        return;
    }
    snprintf(jitdump_file, sizeof(jitdump_file), "jit-%d.dump", getpid());
    jitdump = safe_fopen_w(jitdump_file);
    if (jitdump == NULL) {
        warn_report("Could not open %s: %s, proceeding without jitdump",
                    jitdump_file, strerror(errno));
        return;
    }
    /*
     * `perf inject` will see that the mapped file name in the corresponding
     * PERF_RECORD_MMAP or PERF_RECORD_MMAP2 event is of the form jit-%d.dump
     * and will process it as a jitdump file.
     */
    perf_marker = mmap(NULL, qemu_real_host_page_size(), PROT_READ | PROT_EXEC,
                       MAP_PRIVATE, fileno(jitdump), 0);
    if (perf_marker == MAP_FAILED) {
        warn_report("Could not map %s: %s, proceeding without jitdump",
                    jitdump_file, strerror(errno));
        fclose(jitdump);
        jitdump = NULL;
        return;
    }
    header.magic = JITHEADER_MAGIC;
    header.version = JITHEADER_VERSION;
    header.total_size = sizeof(header);
    header.elf_mach = get_e_machine();
    header.pad1 = 0;
    header.pid = getpid();
    header.timestamp = get_clock();
    header.flags = 0;
    fwrite(&header, sizeof(header), 1, jitdump);
 }
 void perf_report_prologue(const void *start, size_t size)
 {
    if (perfmap) {
        fprintf(perfmap, "%"PRIxPTR" %zx tcg-prologue-buffer\n",
                (uintptr_t)start, size);
    }
 }
 /* Write a JIT_CODE_DEBUG_INFO jitdump entry. */
 static void write_jr_code_debug_info(const void *start,
                                     const struct debuginfo_query *q,
                                     size_t icount)
 {
    struct jr_code_debug_info rec;
    struct debug_entry ent;
    uintptr_t host_pc;
    int insn;
    /* Write the header. */
    rec.p.id = JIT_CODE_DEBUG_INFO;
    rec.p.total_size = sizeof(rec) + sizeof(ent) + 1;
    rec.p.timestamp = get_clock();
    rec.code_addr = (uintptr_t)start;
    rec.nr_entry = 1;
    for (insn = 0; insn < icount; insn++) {
        if (q[insn].file) {
            rec.p.total_size += sizeof(ent) + strlen(q[insn].file) + 1;
            rec.nr_entry++;
        }
    }
    fwrite(&rec, sizeof(rec), 1, jitdump);
    /* Write the main debug entries. */
    for (insn = 0; insn < icount; insn++) {
        if (q[insn].file) {
            get_host_pc_size(&host_pc, NULL, start, insn);
            ent.addr = host_pc;
            ent.lineno = q[insn].line;
            ent.discrim = 0;
            fwrite(&ent, sizeof(ent), 1, jitdump);
            fwrite(q[insn].file, strlen(q[insn].file) + 1, 1, jitdump);
        }
    }
    /* Write the trailing debug_entry. */
    ent.addr = (uintptr_t)start + tcg_ctx->gen_insn_end_off[icount - 1];
    ent.lineno = 0;
    ent.discrim = 0;
    fwrite(&ent, sizeof(ent), 1, jitdump);
    fwrite("", 1, 1, jitdump);
 }
 /* Write a JIT_CODE_LOAD jitdump entry. */
 static void write_jr_code_load(const void *start, uint16_t host_size,
                               const struct debuginfo_query *q)
 {
    static uint64_t code_index;
    struct jr_code_load rec;
    const char *symbol;
    size_t symbol_size;
    symbol = pretty_symbol(q, &symbol_size);
    rec.p.id = JIT_CODE_LOAD;
    rec.p.total_size = sizeof(rec) + symbol_size + host_size;
    rec.p.timestamp = get_clock();
    rec.pid = getpid();
    rec.tid = qemu_get_thread_id();
    rec.vma = (uintptr_t)start;
    rec.code_addr = (uintptr_t)start;
    rec.code_size = host_size;
    rec.code_index = code_index++;
    fwrite(&rec, sizeof(rec), 1, jitdump);
    fwrite(symbol, symbol_size, 1, jitdump);
    fwrite(start, host_size, 1, jitdump);
 }
 void perf_report_code(uint64_t guest_pc, TranslationBlock *tb,
                      const void *start)
 {
    struct debuginfo_query *q;
    size_t insn;
    if (!perfmap && !jitdump) {
        return;
    }
    q = g_try_malloc0_n(tb->icount, sizeof(*q));
    if (!q) {
        return;
    }
    debuginfo_lock();
    /* Query debuginfo for each guest instruction. */
    for (insn = 0; insn < tb->icount; insn++) {
        /* FIXME: This replicates the restore_state_to_opc() logic. */
        q[insn].address = tcg_ctx->gen_insn_data[insn][0];
        if (TARGET_TB_PCREL) {
            q[insn].address |= (guest_pc & TARGET_PAGE_MASK);
        } else {
 #if defined(TARGET_I386)
            q[insn].address -= tb->cs_base;
 #endif
        }
        q[insn].flags = DEBUGINFO_SYMBOL | (jitdump ? DEBUGINFO_LINE : 0);
    }
    debuginfo_query(q, tb->icount);
    /* Emit perfmap entries if needed. */
    if (perfmap) {
        flockfile(perfmap);
        for (insn = 0; insn < tb->icount; insn++) {
            write_perfmap_entry(start, insn, &q[insn]);
        }
        funlockfile(perfmap);
    }
    /* Emit jitdump entries if needed. */
    if (jitdump) {
        flockfile(jitdump);
        write_jr_code_debug_info(start, q, tb->icount);
        write_jr_code_load(start, tcg_ctx->gen_insn_end_off[tb->icount - 1],
                           q);
        funlockfile(jitdump);
    }
    debuginfo_unlock();
    g_free(q);
 }
 void perf_exit(void)
 {
    if (perfmap) {
        fclose(perfmap);
        perfmap = NULL;
    }
    if (jitdump) {
        fclose(jitdump);
        jitdump = NULL;
    }
 }
--- a/accel/tcg/perf.h
+++ b/accel/tcg/perf.h
@ -0,0 +1,49 @@
 /*
 * Linux perf perf-<pid>.map and jit-<pid>.dump integration.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */
 #ifndef ACCEL_TCG_PERF_H
 #define ACCEL_TCG_PERF_H
 #if defined(CONFIG_TCG) && defined(CONFIG_LINUX)
 /* Start writing perf-<pid>.map. */
 void perf_enable_perfmap(void);
 /* Start writing jit-<pid>.dump. */
 void perf_enable_jitdump(void);
 /* Add information about TCG prologue to profiler maps. */
 void perf_report_prologue(const void *start, size_t size);
 /* Add information about JITted guest code to profiler maps. */
 void perf_report_code(uint64_t guest_pc, TranslationBlock *tb,
                      const void *start);
 /* Stop writing perf-<pid>.map and/or jit-<pid>.dump. */
 void perf_exit(void);
 #else
 static inline void perf_enable_perfmap(void)
 {
 }
 static inline void perf_enable_jitdump(void)
 {
 }
 static inline void perf_report_prologue(const void *start, size_t size)
 {
 }
 static inline void perf_report_code(uint64_t guest_pc, TranslationBlock *tb,
                                    const void *start)
 {
 }
 static inline void perf_exit(void)
 {
 }
 #endif
 #endif
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@ -62,6 +62,7 @@
 #include "tb-hash.h"
 #include "tb-context.h"
 #include "internal.h"
 #include "perf.h"
 /* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */
 QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >
@ -406,6 +407,12 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
    }
    tb->tc.size = gen_code_size;
    /*
     * For TARGET_TB_PCREL, attribute all executions of the generated
     * code to its first mapping.
     */
    perf_report_code(pc, tb, tcg_splitwx_to_rx(gen_code_buf));
 #ifdef CONFIG_PROFILER
    qatomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);
    qatomic_set(&prof->code_in_len, prof->code_in_len + tb->size);
--- a/docs/devel/tcg.rst
+++ b/docs/devel/tcg.rst
@ -188,3 +188,26 @@ memory areas instead calls out to C code for device emulation.
 Finally, the MMU helps tracking dirty pages and pages pointed to by
 translation blocks.
 Profiling JITted code
 ---------------------
 The Linux ``perf`` tool will treat all JITted code as a single block as
 unlike the main code it can't use debug information to link individual
 program counter samples with larger functions. To overcome this
 limitation you can use the ``-perfmap`` or the ``-jitdump`` option to generate
 map files. ``-perfmap`` is lightweight and produces only guest-host mappings.
 ``-jitdump`` additionally saves JITed code and guest debug information (if
 available); its output needs to be integrated with the ``perf.data`` file
 before the final report can be viewed.
 .. code::
  perf record $QEMU -perfmap $REMAINING_ARGS
  perf report
  perf record -k 1 $QEMU -jitdump $REMAINING_ARGS
  DEBUGINFOD_URLS= perf inject -j -i perf.data -o perf.data.jitted
  perf report -i perf.data.jitted
 Note that qemu-system generates mappings only for ``-kernel`` files in ELF
 format.
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@ -61,6 +61,7 @@
 #include "hw/boards.h"
 #include "qemu/cutils.h"
 #include "sysemu/runstate.h"
 #include "accel/tcg/debuginfo.h"
 #include <zlib.h>
@ -503,6 +504,10 @@ ssize_t load_elf_ram_sym(const char *filename,
                         clear_lsb, data_swab, as, load_rom, sym_cb);
    }
    if (ret != ELF_LOAD_FAILED) {
        debuginfo_report_elf(filename, fd, 0);
    }
 fail:
    close(fd);
    return ret;
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@ -19,6 +19,7 @@
 #include "qemu/selfmap.h"
 #include "qapi/error.h"
 #include "target_signal.h"
 #include "accel/tcg/debuginfo.h"
 #ifdef _ARCH_PPC64
 #undef ARCH_DLINFO
@ -3261,6 +3262,8 @@ static void load_elf_image(const char *image_name, int image_fd,
        load_symbols(ehdr, image_fd, load_bias);
    }
    debuginfo_report_elf(image_name, image_fd, load_bias);
    mmap_unlock();
    close(image_fd);
--- a/linux-user/exit.c
+++ b/linux-user/exit.c
@ -17,6 +17,7 @@
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
 #include "qemu/osdep.h"
 #include "accel/tcg/perf.h"
 #include "exec/gdbstub.h"
 #include "qemu.h"
 #include "user-internals.h"
@ -38,4 +39,5 @@ void preexit_cleanup(CPUArchState *env, int code)
 #endif
        gdb_exit(code);
        qemu_plugin_user_exit();
        perf_exit();
 }
--- a/linux-user/main.c
+++ b/linux-user/main.c
@ -53,6 +53,7 @@
 #include "signal-common.h"
 #include "loader.h"
 #include "user-mmap.h"
 #include "accel/tcg/perf.h"
 #ifdef CONFIG_SEMIHOSTING
 #include "semihosting/semihost.h"
@ -423,6 +424,16 @@ static void handle_arg_abi_call0(const char *arg)
 }
 #endif
 static void handle_arg_perfmap(const char *arg)
 {
    perf_enable_perfmap();
 }
 static void handle_arg_jitdump(const char *arg)
 {
    perf_enable_jitdump();
 }
 static QemuPluginList plugins = QTAILQ_HEAD_INITIALIZER(plugins);
 #ifdef CONFIG_PLUGIN
@ -493,6 +504,10 @@ static const struct qemu_argument arg_table[] = {
    {"xtensa-abi-call0", "QEMU_XTENSA_ABI_CALL0", false, handle_arg_abi_call0,
     "",           "assume CALL0 Xtensa ABI"},
 #endif
    {"perfmap",    "QEMU_PERFMAP",     false, handle_arg_perfmap,
     "",           "Generate a /tmp/perf-${pid}.map file for perf"},
    {"jitdump",    "QEMU_JITDUMP",     false, handle_arg_jitdump,
     "",           "Generate a jit-${pid}.dump file for perf"},
    {NULL, NULL, false, NULL, NULL, NULL}
 };
--- a/linux-user/meson.build
+++ b/linux-user/meson.build
@ -22,6 +22,7 @@ linux_user_ss.add(files(
  'uname.c',
 ))
 linux_user_ss.add(rt)
 linux_user_ss.add(libdw)
 linux_user_ss.add(when: 'TARGET_HAS_BFLT', if_true: files('flatload.c'))
 linux_user_ss.add(when: 'TARGET_I386', if_true: files('vm86.c'))
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@ -695,7 +695,7 @@ void cpu_loop_exit_sigbus(CPUState *cpu, target_ulong addr,
 /* abort execution with signal */
 static G_NORETURN
-void dump_core_and_abort(int target_sig)
+void dump_core_and_abort(CPUArchState *cpu_env, int target_sig)
 {
    CPUState *cpu = thread_cpu;
    CPUArchState *env = cpu->env_ptr;
@ -724,6 +724,8 @@ void dump_core_and_abort(int target_sig)
            target_sig, strsignal(host_sig), "core dumped" );
    }
    preexit_cleanup(cpu_env, 128 + target_sig);
    /* The proper exit code for dying from an uncaught signal is
     * -<signal>.  The kernel doesn't allow exit() or _exit() to pass
     * a negative value.  To get the proper exit code we need to
@ -1058,12 +1060,12 @@ static void handle_pending_signal(CPUArchState *cpu_env, int sig,
                   sig != TARGET_SIGURG &&
                   sig != TARGET_SIGWINCH &&
                   sig != TARGET_SIGCONT) {
-            dump_core_and_abort(sig);
+            dump_core_and_abort(cpu_env, sig);
        }
    } else if (handler == TARGET_SIG_IGN) {
        /* ignore sig */
    } else if (handler == TARGET_SIG_ERR) {
-        dump_core_and_abort(sig);
+        dump_core_and_abort(cpu_env, sig);
    } else {
        /* compute the blocked signals during the handler execution */
        sigset_t *blocked_set;
--- a/meson.build
+++ b/meson.build
@ -1648,6 +1648,12 @@ if libbpf.found() and not cc.links('''
  endif
 endif
 # libdw
 libdw = dependency('libdw',
                   method: 'pkg-config',
                   kwargs: static_kwargs,
                   required: false)
 #################
 # config-host.h #
 #################
@ -1923,6 +1929,7 @@ config_host_data.set('CONFIG_DBUS_DISPLAY', dbus_display)
 config_host_data.set('CONFIG_CFI', get_option('cfi'))
 config_host_data.set('CONFIG_SELINUX', selinux.found())
 config_host_data.set('CONFIG_XEN_BACKEND', xen.found())
 config_host_data.set('CONFIG_LIBDW', libdw.found())
 if xen.found()
  # protect from xen.version() having less than three components
  xen_version = xen.version().split('.') + ['0', '0']
@ -2331,11 +2338,9 @@ config_host_data.set('CONFIG_CPUID_H', have_cpuid_h)
 config_host_data.set('CONFIG_AVX2_OPT', get_option('avx2') \
  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX2') \
  .require(cc.links('''
    #pragma GCC push_options
    #pragma GCC target("avx2")
    #include <cpuid.h>
    #include <immintrin.h>
-    static int bar(void *a) {
+    static int __attribute__((target("avx2"))) bar(void *a) {
      __m256i x = *(__m256i *)a;
      return _mm256_testz_si256(x, x);
    }
@ -2345,11 +2350,9 @@ config_host_data.set('CONFIG_AVX2_OPT', get_option('avx2') \
 config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512F') \
  .require(cc.links('''
    #pragma GCC push_options
    #pragma GCC target("avx512f")
    #include <cpuid.h>
    #include <immintrin.h>
-    static int bar(void *a) {
+    static int __attribute__((target("avx512f"))) bar(void *a) {
      __m512i x = *(__m512i *)a;
      return _mm512_test_epi64_mask(x, x);
    }
@ -3976,6 +3979,7 @@ summary_info += {'libudev':           libudev}
 # Dummy dependency, keep .found()
 summary_info += {'FUSE lseek':        fuse_lseek.found()}
 summary_info += {'selinux':           selinux}
 summary_info += {'libdw':             libdw}
 summary(summary_info, bool_yn: true, section: 'Dependencies')
 if not supported_cpus.contains(cpu)
--- a/qemu-options.hx
+++ b/qemu-options.hx
@ -4838,6 +4838,26 @@ SRST
    Enable synchronization profiling.
 ERST
 #if defined(CONFIG_TCG) && defined(CONFIG_LINUX)
 DEF("perfmap", 0, QEMU_OPTION_perfmap,
    "-perfmap        generate a /tmp/perf-${pid}.map file for perf\n",
    QEMU_ARCH_ALL)
 SRST
 ``-perfmap``
    Generate a map file for Linux perf tools that will allow basic profiling
    information to be broken down into basic blocks.
 ERST
 DEF("jitdump", 0, QEMU_OPTION_jitdump,
    "-jitdump        generate a jit-${pid}.dump file for perf\n",
    QEMU_ARCH_ALL)
 SRST
 ``-jitdump``
    Generate a dump file for Linux perf tools that maps basic blocks to symbol
    names, line numbers and JITted code.
 ERST
 #endif
 DEFHEADING()
 DEFHEADING(Generic object creation:)
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@ -96,6 +96,9 @@
 #include "fsdev/qemu-fsdev.h"
 #endif
 #include "sysemu/qtest.h"
 #ifdef CONFIG_TCG
 #include "accel/tcg/perf.h"
 #endif
 #include "disas/disas.h"
@ -2926,6 +2929,14 @@ void qemu_init(int argc, char **argv)
            case QEMU_OPTION_DFILTER:
                qemu_set_dfilter_ranges(optarg, &error_fatal);
                break;
 #if defined(CONFIG_TCG) && defined(CONFIG_LINUX)
            case QEMU_OPTION_perfmap:
                perf_enable_perfmap();
                break;
            case QEMU_OPTION_jitdump:
                perf_enable_jitdump();
                break;
 #endif
            case QEMU_OPTION_seed:
                qemu_guest_random_seed_main(optarg, &error_fatal);
                break;
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@ -61,6 +61,7 @@
 #include "exec/log.h"
 #include "tcg/tcg-ldst.h"
 #include "tcg-internal.h"
 #include "accel/tcg/perf.h"
 /* Forward declarations for functions declared in tcg-target.c.inc and
   used here. */
@ -913,6 +914,7 @@ void tcg_prologue_init(TCGContext *s)
 #endif
    prologue_size = tcg_current_code_size(s);
    perf_report_prologue(s->code_gen_ptr, prologue_size);
 #ifndef CONFIG_TCG_INTERPRETER
    flush_idcache_range((uintptr_t)tcg_splitwx_to_rx(s->code_buf),
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@ -64,18 +64,11 @@ buffer_zero_int(const void *buf, size_t len)
 }
 #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
-/* Do not use push_options pragmas unnecessarily, because clang
+#include <immintrin.h>
 * does not support them.
 */
 #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
 #pragma GCC push_options
 #pragma GCC target("sse2")
 #endif
 #include <emmintrin.h>
 /* Note that each of these vectorized functions require len >= 64.  */
-static bool
+static bool __attribute__((target("sse2")))
 buffer_zero_sse2(const void *buf, size_t len)
 {
    __m128i t = _mm_loadu_si128(buf);
@ -104,20 +97,9 @@ buffer_zero_sse2(const void *buf, size_t len)
    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
 }
 #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
 #pragma GCC pop_options
 #endif
 #ifdef CONFIG_AVX2_OPT
-/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,
+static bool __attribute__((target("sse4")))
 * the includes have to be within the corresponding push_options region, and
 * therefore the regions themselves have to be ordered with increasing ISA.
 */
 #pragma GCC push_options
 #pragma GCC target("sse4")
 #include <smmintrin.h>
 static bool
 buffer_zero_sse4(const void *buf, size_t len)
 {
    __m128i t = _mm_loadu_si128(buf);
@ -145,12 +127,7 @@ buffer_zero_sse4(const void *buf, size_t len)
    return _mm_testz_si128(t, t);
 }
-#pragma GCC pop_options
+static bool __attribute__((target("avx2")))
 #pragma GCC push_options
 #pragma GCC target("avx2")
 #include <immintrin.h>
 static bool
 buffer_zero_avx2(const void *buf, size_t len)
 {
    /* Begin with an unaligned head of 32 bytes.  */
@ -176,15 +153,10 @@ buffer_zero_avx2(const void *buf, size_t len)
    return _mm256_testz_si256(t, t);
 }
 #pragma GCC pop_options
 #endif /* CONFIG_AVX2_OPT */
 #ifdef CONFIG_AVX512F_OPT
-#pragma GCC push_options
+static bool __attribute__((target("avx512f")))
 #pragma GCC target("avx512f")
 #include <immintrin.h>
 static bool
 buffer_zero_avx512(const void *buf, size_t len)
 {
    /* Begin with an unaligned head of 64 bytes.  */
@ -210,8 +182,7 @@ buffer_zero_avx512(const void *buf, size_t len)
    return !_mm512_test_epi64_mask(t, t);
 }
-#pragma GCC pop_options
+#endif /* CONFIG_AVX512F_OPT */
 #endif
 /* Note that for test_buffer_is_zero_next_accel, the most preferred