From 06639f8ff53d1dbfa709377499e6c30eca9c3c9a Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Tue, 25 Oct 2022 22:10:15 +0800 Subject: [PATCH 01/17] chardev/char-win-stdio: Pass Ctrl+C to guest with a multiplexed monitor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At present when pressing Ctrl+C from a guest running on QEMU Windows with a multiplexed monitor, e.g.: -serial mon:stdio, QEMU executable just exits. This behavior is inconsistent with the Linux version. Such behavior is caused by unconditionally setting the input mode ENABLE_PROCESSED_INPUT for a console's input buffer. Fix this by testing whether the chardev is allowed to do so. Signed-off-by: Bin Meng Reviewed-by: Marc-André Lureau Message-Id: <20221025141015.612291-1-bin.meng@windriver.com> --- chardev/char-win-stdio.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/chardev/char-win-stdio.c b/chardev/char-win-stdio.c index a4771ab82e..eb830eabd9 100644 --- a/chardev/char-win-stdio.c +++ b/chardev/char-win-stdio.c @@ -146,6 +146,8 @@ static void qemu_chr_open_stdio(Chardev *chr, bool *be_opened, Error **errp) { + ChardevStdio *opts = backend->u.stdio.data; + bool stdio_allow_signal = !opts->has_signal || opts->signal; WinStdioChardev *stdio = WIN_STDIO_CHARDEV(chr); DWORD dwMode; int is_console = 0; @@ -193,7 +195,11 @@ static void qemu_chr_open_stdio(Chardev *chr, if (is_console) { /* set the terminal in raw mode */ /* ENABLE_QUICK_EDIT_MODE | ENABLE_EXTENDED_FLAGS */ - dwMode |= ENABLE_PROCESSED_INPUT; + if (stdio_allow_signal) { + dwMode |= ENABLE_PROCESSED_INPUT; + } else { + dwMode &= ~ENABLE_PROCESSED_INPUT; + } } SetConsoleMode(stdio->hStdIn, dwMode); From a216ec85b78ea96b51950665879524132f6e678c Mon Sep 17 00:00:00 2001 From: Fiona Ebner Date: Thu, 13 Oct 2022 10:41:00 +0200 Subject: [PATCH 02/17] migration/channel-block: fix return value for qio_channel_block_{readv,writev} in the error case. The documentation in include/io/channel.h states that -1 or QIO_CHANNEL_ERR_BLOCK should be returned upon error. Simply passing along the return value from the bdrv-functions has the potential to confuse the call sides. Non-blocking mode is not implemented currently, so -1 it is. Signed-off-by: Fiona Ebner Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/channel-block.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/migration/channel-block.c b/migration/channel-block.c index c55c8c93ce..f4ab53acdb 100644 --- a/migration/channel-block.c +++ b/migration/channel-block.c @@ -62,7 +62,8 @@ qio_channel_block_readv(QIOChannel *ioc, qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov); ret = bdrv_readv_vmstate(bioc->bs, &qiov, bioc->offset); if (ret < 0) { - return ret; + error_setg_errno(errp, -ret, "bdrv_readv_vmstate failed"); + return -1; } bioc->offset += qiov.size; @@ -86,7 +87,8 @@ qio_channel_block_writev(QIOChannel *ioc, qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov); ret = bdrv_writev_vmstate(bioc->bs, &qiov, bioc->offset); if (ret < 0) { - return ret; + error_setg_errno(errp, -ret, "bdrv_writev_vmstate failed"); + return -1; } bioc->offset += qiov.size; From 4cc47b439594327b213f9b6a67803f1a503c2cb7 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 25 Oct 2022 01:47:28 -0300 Subject: [PATCH 03/17] migration/multifd/zero-copy: Create helper function for flushing Move flushing code from multifd_send_sync_main() to a new helper, and call it in multifd_send_sync_main(). Signed-off-by: Leonardo Bras Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/multifd.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 586ddc9d65..509bbbe3bf 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -566,6 +566,23 @@ void multifd_save_cleanup(void) multifd_send_state = NULL; } +static int multifd_zero_copy_flush(QIOChannel *c) +{ + int ret; + Error *err = NULL; + + ret = qio_channel_flush(c, &err); + if (ret < 0) { + error_report_err(err); + return -1; + } + if (ret == 1) { + dirty_sync_missed_zero_copy(); + } + + return ret; +} + int multifd_send_sync_main(QEMUFile *f) { int i; @@ -616,17 +633,8 @@ int multifd_send_sync_main(QEMUFile *f) qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); - if (flush_zero_copy && p->c) { - int ret; - Error *err = NULL; - - ret = qio_channel_flush(p->c, &err); - if (ret < 0) { - error_report_err(err); - return -1; - } else if (ret == 1) { - dirty_sync_missed_zero_copy(); - } + if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) { + return -1; } } for (i = 0; i < migrate_multifd_channels(); i++) { From 4934a5dd7c68f5ab15f17498db4fc20ed6db9578 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 14:24:26 -0400 Subject: [PATCH 04/17] migration: Fix possible infinite loop of ram save process When starting ram saving procedure (especially at the completion phase), always set last_seen_block to non-NULL to make sure we can always correctly detect the case where "we've migrated all the dirty pages". Then we'll guarantee both last_seen_block and pss.block will be valid always before the loop starts. See the comment in the code for some details. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/ram.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..1d42414ecc 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2546,14 +2546,22 @@ static int ram_find_and_save_block(RAMState *rs) return pages; } + /* + * Always keep last_seen_block/last_page valid during this procedure, + * because find_dirty_block() relies on these values (e.g., we compare + * last_seen_block with pss.block to see whether we searched all the + * ramblocks) to detect the completion of migration. Having NULL value + * of last_seen_block can conditionally cause below loop to run forever. + */ + if (!rs->last_seen_block) { + rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); + rs->last_page = 0; + } + pss.block = rs->last_seen_block; pss.page = rs->last_page; pss.complete_round = false; - if (!pss.block) { - pss.block = QLIST_FIRST_RCU(&ram_list.blocks); - } - do { again = true; found = get_queued_page(rs, &pss); From f5816b5c86ed399c99ce8662a4ed96aab32c5eef Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 14:24:27 -0400 Subject: [PATCH 05/17] migration: Fix race on qemu_file_shutdown() In qemu_file_shutdown(), there's a possible race if with current order of operation. There're two major things to do: (1) Do real shutdown() (e.g. shutdown() syscall on socket) (2) Update qemufile's last_error We must do (2) before (1) otherwise there can be a race condition like: page receiver other thread ------------- ------------ qemu_get_buffer() do shutdown() returns 0 (buffer all zero) (meanwhile we didn't check this retcode) try to detect IO error last_error==NULL, IO okay install ALL-ZERO page set last_error --> guest crash! To fix this, we can also check retval of qemu_get_buffer(), but not all APIs can be properly checked and ultimately we still need to go back to qemu_file_get_error(). E.g. qemu_get_byte() doesn't return error. Maybe some day a rework of qemufile API is really needed, but for now keep using qemu_file_get_error() and fix it by not allowing that race condition to happen. Here shutdown() is indeed special because the last_error was emulated. For real -EIO errors it'll always be set when e.g. sendmsg() error triggers so we won't miss those ones, only shutdown() is a bit tricky here. Cc: Daniel P. Berrange Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/qemu-file.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 4f400c2e52..2d5f74ffc2 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -79,6 +79,30 @@ int qemu_file_shutdown(QEMUFile *f) int ret = 0; f->shutdown = true; + + /* + * We must set qemufile error before the real shutdown(), otherwise + * there can be a race window where we thought IO all went though + * (because last_error==NULL) but actually IO has already stopped. + * + * If without correct ordering, the race can happen like this: + * + * page receiver other thread + * ------------- ------------ + * qemu_get_buffer() + * do shutdown() + * returns 0 (buffer all zero) + * (we didn't check this retcode) + * try to detect IO error + * last_error==NULL, IO okay + * install ALL-ZERO page + * set last_error + * --> guest crash! + */ + if (!f->last_error) { + qemu_file_set_error(f, -EIO); + } + if (!qio_channel_has_feature(f->ioc, QIO_CHANNEL_FEATURE_SHUTDOWN)) { return -ENOSYS; @@ -88,9 +112,6 @@ int qemu_file_shutdown(QEMUFile *f) ret = -EIO; } - if (!f->last_error) { - qemu_file_set_error(f, -EIO); - } return ret; } From afed4273b5c9438dfbaa0b4762d0433f295ccdc1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 14:24:28 -0400 Subject: [PATCH 06/17] migration: Disallow postcopy preempt to be used with compress The preempt mode requires the capability to assign channel for each of the page, while the compression logic will currently assign pages to different compress thread/local-channel so potentially they're incompatible. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index 739bb683f3..f3ed77a7d0 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1337,6 +1337,17 @@ static bool migrate_caps_check(bool *cap_list, error_setg(errp, "Postcopy preempt requires postcopy-ram"); return false; } + + /* + * Preempt mode requires urgent pages to be sent in separate + * channel, OTOH compression logic will disorder all pages into + * different compression channels, which is not compatible with the + * preempt assumptions on channel assignments. + */ + if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) { + error_setg(errp, "Postcopy preempt not compatible with compress"); + return false; + } } return true; From cedb70eafb4fd51d9c714981509d97b9f4055be5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 14:24:29 -0400 Subject: [PATCH 07/17] migration: Use non-atomic ops for clear log bitmap Since we already have bitmap_mutex to protect either the dirty bitmap or the clear log bitmap, we don't need atomic operations to set/clear/test on the clear log bitmap. Switching all ops from atomic to non-atomic versions, meanwhile touch up the comments to show which lock is in charge. Introduced non-atomic version of bitmap_test_and_clear_atomic(), mostly the same as the atomic version but simplified a few places, e.g. dropped the "old_bits" variable, and also the explicit memory barriers. Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Peter Xu Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- include/exec/ram_addr.h | 11 +++++----- include/exec/ramblock.h | 3 +++ include/qemu/bitmap.h | 1 + util/bitmap.c | 45 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 5 deletions(-) diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 1500680458..f4fb6a2111 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -42,7 +42,8 @@ static inline long clear_bmap_size(uint64_t pages, uint8_t shift) } /** - * clear_bmap_set: set clear bitmap for the page range + * clear_bmap_set: set clear bitmap for the page range. Must be with + * bitmap_mutex held. * * @rb: the ramblock to operate on * @start: the start page number @@ -55,12 +56,12 @@ static inline void clear_bmap_set(RAMBlock *rb, uint64_t start, { uint8_t shift = rb->clear_bmap_shift; - bitmap_set_atomic(rb->clear_bmap, start >> shift, - clear_bmap_size(npages, shift)); + bitmap_set(rb->clear_bmap, start >> shift, clear_bmap_size(npages, shift)); } /** - * clear_bmap_test_and_clear: test clear bitmap for the page, clear if set + * clear_bmap_test_and_clear: test clear bitmap for the page, clear if set. + * Must be with bitmap_mutex held. * * @rb: the ramblock to operate on * @page: the page number to check @@ -71,7 +72,7 @@ static inline bool clear_bmap_test_and_clear(RAMBlock *rb, uint64_t page) { uint8_t shift = rb->clear_bmap_shift; - return bitmap_test_and_clear_atomic(rb->clear_bmap, page >> shift, 1); + return bitmap_test_and_clear(rb->clear_bmap, page >> shift, 1); } static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset) diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 6cbedf9e0c..adc03df59c 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -53,6 +53,9 @@ struct RAMBlock { * and split clearing of dirty bitmap on the remote node (e.g., * KVM). The bitmap will be set only when doing global sync. * + * It is only used during src side of ram migration, and it is + * protected by the global ram_state.bitmap_mutex. + * * NOTE: this bitmap is different comparing to the other bitmaps * in that one bit can represent multiple guest pages (which is * decided by the `clear_bmap_shift' variable below). On diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h index 82a1d2f41f..3ccb00865f 100644 --- a/include/qemu/bitmap.h +++ b/include/qemu/bitmap.h @@ -253,6 +253,7 @@ void bitmap_set(unsigned long *map, long i, long len); void bitmap_set_atomic(unsigned long *map, long i, long len); void bitmap_clear(unsigned long *map, long start, long nr); bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr); +bool bitmap_test_and_clear(unsigned long *map, long start, long nr); void bitmap_copy_and_clear_atomic(unsigned long *dst, unsigned long *src, long nr); unsigned long bitmap_find_next_zero_area(unsigned long *map, diff --git a/util/bitmap.c b/util/bitmap.c index f81d8057a7..8d12e90a5a 100644 --- a/util/bitmap.c +++ b/util/bitmap.c @@ -240,6 +240,51 @@ void bitmap_clear(unsigned long *map, long start, long nr) } } +bool bitmap_test_and_clear(unsigned long *map, long start, long nr) +{ + unsigned long *p = map + BIT_WORD(start); + const long size = start + nr; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + bool dirty = false; + + assert(start >= 0 && nr >= 0); + + /* First word */ + if (nr - bits_to_clear > 0) { + if ((*p) & mask_to_clear) { + dirty = true; + } + *p &= ~mask_to_clear; + nr -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + p++; + } + + /* Full words */ + if (bits_to_clear == BITS_PER_LONG) { + while (nr >= BITS_PER_LONG) { + if (*p) { + dirty = true; + *p = 0; + } + nr -= BITS_PER_LONG; + p++; + } + } + + /* Last word */ + if (nr) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + if ((*p) & mask_to_clear) { + dirty = true; + } + *p &= ~mask_to_clear; + } + + return dirty; +} + bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr) { unsigned long *p = map + BIT_WORD(start); From 6f39c90b86b9d3772779f873ed88aaa75a220aba Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 14:24:30 -0400 Subject: [PATCH 08/17] migration: Disable multifd explicitly with compression Multifd thread model does not work for compression, explicitly disable it. Note that previuosly even we can enable both of them, nothing will go wrong, because the compression code has higher priority so multifd feature will just be ignored. Now we'll fail even earlier at config time so the user should be aware of the consequence better. Note that there can be a slight chance of breaking existing users, but let's assume they're not majority and not serious users, or they should have found that multifd is not working already. With that, we can safely drop the check in ram_save_target_page() for using multifd, because when multifd=on then compression=off, then the removed check on save_page_use_compression() will also always return false too. Signed-off-by: Peter Xu Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Juan Quintela Signed-off-by: Juan Quintela --- migration/migration.c | 7 +++++++ migration/ram.c | 11 +++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index f3ed77a7d0..f485eea5fb 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1350,6 +1350,13 @@ static bool migrate_caps_check(bool *cap_list, } } + if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) { + if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) { + error_setg(errp, "Multifd is not compatible with compress"); + return false; + } + } + return true; } diff --git a/migration/ram.c b/migration/ram.c index 1d42414ecc..1338e47665 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2305,13 +2305,12 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) } /* - * Do not use multifd for: - * 1. Compression as the first page in the new block should be posted out - * before sending the compressed page - * 2. In postcopy as one whole host page should be placed + * Do not use multifd in postcopy as one whole host page should be + * placed. Meanwhile postcopy requires atomic update of pages, so even + * if host page size == guest page size the dest guest during run may + * still see partially copied pages which is data corruption. */ - if (!save_page_use_compression(rs) && migrate_use_multifd() - && !migration_in_postcopy()) { + if (migrate_use_multifd() && !migration_in_postcopy()) { return ram_save_multifd_page(rs, block, offset); } From b5280437a7f49cf617cdd99bbbe2c7bd1652408b Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Mon, 3 Oct 2022 05:15:56 +0200 Subject: [PATCH 09/17] migration: Block migration comment or code is wrong And it appears that what is wrong is the code. During bulk stage we need to make sure that some block is dirty, but no games with max_size at all. Signed-off-by: Juan Quintela Reviewed-by: Stefan Hajnoczi --- migration/block.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/block.c b/migration/block.c index 3577c815a9..4347da1526 100644 --- a/migration/block.c +++ b/migration/block.c @@ -880,8 +880,8 @@ static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, blk_mig_unlock(); /* Report at least one block pending during bulk phase */ - if (pending <= max_size && !block_mig_state.bulk_completed) { - pending = max_size + BLK_MIG_BLOCK_SIZE; + if (!pending && !block_mig_state.bulk_completed) { + pending = BLK_MIG_BLOCK_SIZE; } trace_migration_block_save_pending(pending); From 93e2da36ed944d05e78905e95983a44624ed064c Mon Sep 17 00:00:00 2001 From: Strahinja Jankovic Date: Mon, 21 Nov 2022 11:45:12 +0000 Subject: [PATCH 10/17] hw/sd: Fix sun4i allwinner-sdhost for U-Boot Trying to run U-Boot for Cubieboard (Allwinner A10) fails because it cannot access SD card. The problem is that FIFO register in current allwinner-sdhost implementation is at the address corresponding to Allwinner H3, but not A10. Linux kernel is not affected since Linux driver uses DMA access and does not use FIFO register for reading/writing. This patch adds new class parameter `is_sun4i` and based on that parameter uses register at offset 0x100 either as FIFO register (if sun4i) or as threshold register (if not sun4i; in this case register at 0x200 is FIFO register). Tested with U-Boot and Linux kernel image built for Cubieboard and OrangePi PC. Signed-off-by: Strahinja Jankovic Reviewed-by: Peter Maydell Message-id: 20221112214900.24152-1-strahinja.p.jankovic@gmail.com Signed-off-by: Peter Maydell --- hw/sd/allwinner-sdhost.c | 67 ++++++++++++++++++++++---------- include/hw/sd/allwinner-sdhost.h | 1 + 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/hw/sd/allwinner-sdhost.c b/hw/sd/allwinner-sdhost.c index 455d6eabf6..51e5e90830 100644 --- a/hw/sd/allwinner-sdhost.c +++ b/hw/sd/allwinner-sdhost.c @@ -65,7 +65,7 @@ enum { REG_SD_DLBA = 0x84, /* Descriptor List Base Address */ REG_SD_IDST = 0x88, /* Internal DMA Controller Status */ REG_SD_IDIE = 0x8C, /* Internal DMA Controller IRQ Enable */ - REG_SD_THLDC = 0x100, /* Card Threshold Control */ + REG_SD_THLDC = 0x100, /* Card Threshold Control / FIFO (sun4i only)*/ REG_SD_DSBD = 0x10C, /* eMMC DDR Start Bit Detection Control */ REG_SD_RES_CRC = 0x110, /* Response CRC from card/eMMC */ REG_SD_DATA7_CRC = 0x114, /* CRC Data 7 from card/eMMC */ @@ -415,10 +415,29 @@ static void allwinner_sdhost_dma(AwSdHostState *s) } } +static uint32_t allwinner_sdhost_fifo_read(AwSdHostState *s) +{ + uint32_t res = 0; + + if (sdbus_data_ready(&s->sdbus)) { + sdbus_read_data(&s->sdbus, &res, sizeof(uint32_t)); + le32_to_cpus(&res); + allwinner_sdhost_update_transfer_cnt(s, sizeof(uint32_t)); + allwinner_sdhost_auto_stop(s); + allwinner_sdhost_update_irq(s); + } else { + qemu_log_mask(LOG_GUEST_ERROR, "%s: no data ready on SD bus\n", + __func__); + } + + return res; +} + static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset, unsigned size) { AwSdHostState *s = AW_SDHOST(opaque); + AwSdHostClass *sc = AW_SDHOST_GET_CLASS(s); uint32_t res = 0; switch (offset) { @@ -508,8 +527,12 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset, case REG_SD_IDIE: /* Internal DMA Controller Interrupt Enable */ res = s->dmac_irq; break; - case REG_SD_THLDC: /* Card Threshold Control */ - res = s->card_threshold; + case REG_SD_THLDC: /* Card Threshold Control or FIFO register (sun4i) */ + if (sc->is_sun4i) { + res = allwinner_sdhost_fifo_read(s); + } else { + res = s->card_threshold; + } break; case REG_SD_DSBD: /* eMMC DDR Start Bit Detection Control */ res = s->startbit_detect; @@ -531,16 +554,7 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset, res = s->status_crc; break; case REG_SD_FIFO: /* Read/Write FIFO */ - if (sdbus_data_ready(&s->sdbus)) { - sdbus_read_data(&s->sdbus, &res, sizeof(uint32_t)); - le32_to_cpus(&res); - allwinner_sdhost_update_transfer_cnt(s, sizeof(uint32_t)); - allwinner_sdhost_auto_stop(s); - allwinner_sdhost_update_irq(s); - } else { - qemu_log_mask(LOG_GUEST_ERROR, "%s: no data ready on SD bus\n", - __func__); - } + res = allwinner_sdhost_fifo_read(s); break; default: qemu_log_mask(LOG_GUEST_ERROR, "%s: out-of-bounds offset %" @@ -553,11 +567,20 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset, return res; } +static void allwinner_sdhost_fifo_write(AwSdHostState *s, uint64_t value) +{ + uint32_t u32 = cpu_to_le32(value); + sdbus_write_data(&s->sdbus, &u32, sizeof(u32)); + allwinner_sdhost_update_transfer_cnt(s, sizeof(u32)); + allwinner_sdhost_auto_stop(s); + allwinner_sdhost_update_irq(s); +} + static void allwinner_sdhost_write(void *opaque, hwaddr offset, uint64_t value, unsigned size) { AwSdHostState *s = AW_SDHOST(opaque); - uint32_t u32; + AwSdHostClass *sc = AW_SDHOST_GET_CLASS(s); trace_allwinner_sdhost_write(offset, value, size); @@ -657,18 +680,18 @@ static void allwinner_sdhost_write(void *opaque, hwaddr offset, s->dmac_irq = value; allwinner_sdhost_update_irq(s); break; - case REG_SD_THLDC: /* Card Threshold Control */ - s->card_threshold = value; + case REG_SD_THLDC: /* Card Threshold Control or FIFO (sun4i) */ + if (sc->is_sun4i) { + allwinner_sdhost_fifo_write(s, value); + } else { + s->card_threshold = value; + } break; case REG_SD_DSBD: /* eMMC DDR Start Bit Detection Control */ s->startbit_detect = value; break; case REG_SD_FIFO: /* Read/Write FIFO */ - u32 = cpu_to_le32(value); - sdbus_write_data(&s->sdbus, &u32, sizeof(u32)); - allwinner_sdhost_update_transfer_cnt(s, sizeof(u32)); - allwinner_sdhost_auto_stop(s); - allwinner_sdhost_update_irq(s); + allwinner_sdhost_fifo_write(s, value); break; case REG_SD_RES_CRC: /* Response CRC from card/eMMC */ case REG_SD_DATA7_CRC: /* CRC Data 7 from card/eMMC */ @@ -834,12 +857,14 @@ static void allwinner_sdhost_sun4i_class_init(ObjectClass *klass, void *data) { AwSdHostClass *sc = AW_SDHOST_CLASS(klass); sc->max_desc_size = 8 * KiB; + sc->is_sun4i = true; } static void allwinner_sdhost_sun5i_class_init(ObjectClass *klass, void *data) { AwSdHostClass *sc = AW_SDHOST_CLASS(klass); sc->max_desc_size = 64 * KiB; + sc->is_sun4i = false; } static const TypeInfo allwinner_sdhost_info = { diff --git a/include/hw/sd/allwinner-sdhost.h b/include/hw/sd/allwinner-sdhost.h index bfe08ff4ef..30c1e60404 100644 --- a/include/hw/sd/allwinner-sdhost.h +++ b/include/hw/sd/allwinner-sdhost.h @@ -130,6 +130,7 @@ struct AwSdHostClass { /** Maximum buffer size in bytes per DMA descriptor */ size_t max_desc_size; + bool is_sun4i; }; From 69e7e60d011846f066af97589660eef52898519a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 21 Nov 2022 11:45:13 +0000 Subject: [PATCH 11/17] hw/intc: clean-up access to GIC multi-byte registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gic_dist_readb was returning a word value which just happened to work as a result of the way we OR the data together. Lets fix it so only the explicit byte is returned for each part of GICD_TYPER. I've changed the return type to uint8_t although the overflow is only detected with an explicit -Wconversion. Signed-off-by: Alex Bennée Signed-off-by: Peter Maydell Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell --- hw/intc/arm_gic.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c index 492b2421ab..1a04144c38 100644 --- a/hw/intc/arm_gic.c +++ b/hw/intc/arm_gic.c @@ -941,7 +941,7 @@ static void gic_complete_irq(GICState *s, int cpu, int irq, MemTxAttrs attrs) gic_update(s); } -static uint32_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs) +static uint8_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs) { GICState *s = (GICState *)opaque; uint32_t res; @@ -955,6 +955,7 @@ static uint32_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs) cm = 1 << cpu; if (offset < 0x100) { if (offset == 0) { /* GICD_CTLR */ + /* We rely here on the only non-zero bits being in byte 0 */ if (s->security_extn && !attrs.secure) { /* The NS bank of this register is just an alias of the * EnableGrp1 bit in the S bank version. @@ -964,11 +965,14 @@ static uint32_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs) return s->ctlr; } } - if (offset == 4) - /* Interrupt Controller Type Register */ - return ((s->num_irq / 32) - 1) - | ((s->num_cpu - 1) << 5) - | (s->security_extn << 10); + if (offset == 4) { + /* GICD_TYPER byte 0 */ + return ((s->num_irq / 32) - 1) | ((s->num_cpu - 1) << 5); + } + if (offset == 5) { + /* GICD_TYPER byte 1 */ + return (s->security_extn << 2); + } if (offset < 0x08) return 0; if (offset >= 0x80) { From 3d5af538a4fa8456a7e54b8115afe3d6358c1ce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 21 Nov 2022 11:45:13 +0000 Subject: [PATCH 12/17] hw/intc: add implementation of GICD_IIDR to Arm GIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit a66a24585f (hw/intc/arm_gic: Implement read of GICC_IIDR) implemented this for the CPU interface register. The fact we don't implement it shows up when running Xen with -d guest_error which is definitely wrong because the guest is perfectly entitled to read it. Signed-off-by: Alex Bennée Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Peter Maydell Signed-off-by: Peter Maydell --- hw/intc/arm_gic.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c index 1a04144c38..7a34bc0998 100644 --- a/hw/intc/arm_gic.c +++ b/hw/intc/arm_gic.c @@ -973,8 +973,18 @@ static uint8_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs) /* GICD_TYPER byte 1 */ return (s->security_extn << 2); } - if (offset < 0x08) + if (offset == 8) { + /* GICD_IIDR byte 0 */ + return 0x3b; /* Arm JEP106 identity */ + } + if (offset == 9) { + /* GICD_IIDR byte 1 */ + return 0x04; /* Arm JEP106 identity */ + } + if (offset < 0x0c) { + /* All other bytes in this range are RAZ */ return 0; + } if (offset >= 0x80) { /* Interrupt Group Registers: these RAZ/WI if this is an NS * access to a GIC with the security extensions, or if the GIC From c4462523ff0790cbefb1c206cc34c85ec686b1d5 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Mon, 21 Nov 2022 11:45:13 +0000 Subject: [PATCH 13/17] tests/avocado/boot_linux.py: Bump aarch64 virt test timeout to 720s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two tests tests/avocado/boot_linux.py:BootLinuxAarch64.test_virt_tcg_gicv2 tests/avocado/boot_linux.py:BootLinuxAarch64.test_virt_tcg_gicv3 take quite a long time to run, and the current timeout of 240s is not enough for the tests to complete on slow machines: we've seen these tests time out in the gitlab CI in the 'avocado-system-alpine' CI job, for instance. The timeout is also insufficient for running the test with a debug build of QEMU: on my machine the tests take over 10 minutes to run in that config. Push the timeout up to 720s so that the test definitely has enough time to complete. Signed-off-by: Peter Maydell Reviewed-by: Thomas Huth Reviewed-by: Philippe Mathieu-Daudé --- tests/avocado/boot_linux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/avocado/boot_linux.py b/tests/avocado/boot_linux.py index 571d33882a..32adae6ff6 100644 --- a/tests/avocado/boot_linux.py +++ b/tests/avocado/boot_linux.py @@ -64,7 +64,7 @@ class BootLinuxAarch64(LinuxTest): :avocado: tags=machine:virt :avocado: tags=machine:gic-version=2 """ - timeout = 240 + timeout = 720 def add_common_args(self): self.vm.add_args('-bios', From 312b71abce3005ca7294dc0db7d548dc7cc41fbf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 21 Nov 2022 11:45:13 +0000 Subject: [PATCH 14/17] target/arm: Limit LPA2 effective output address when TCR.DS == 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With LPA2, the effective output address size is at most 48 bits when TCR.DS == 0. This case is currently unhandled in the page table walker, where we happily assume LVA/64k granule when outputsize > 48 and param.ds == 0, resulting in the wrong conversion to be used from a page table descriptor to a physical address. if (outputsize > 48) { if (param.ds) { descaddr |= extract64(descriptor, 8, 2) << 50; } else { descaddr |= extract64(descriptor, 12, 4) << 48; } So cap the outputsize to 48 when TCR.DS is cleared, as per the architecture. Cc: Peter Maydell Cc: Philippe Mathieu-Daudé Cc: Richard Henderson Signed-off-by: Ard Biesheuvel Reviewed-by: Richard Henderson Message-id: 20221116170316.259695-1-ardb@kernel.org Signed-off-by: Peter Maydell --- target/arm/ptw.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/target/arm/ptw.c b/target/arm/ptw.c index 3745ac9723..9a6277d862 100644 --- a/target/arm/ptw.c +++ b/target/arm/ptw.c @@ -1222,6 +1222,14 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw, ps = MIN(ps, param.ps); assert(ps < ARRAY_SIZE(pamax_map)); outputsize = pamax_map[ps]; + + /* + * With LPA2, the effective output address (OA) size is at most 48 bits + * unless TCR.DS == 1 + */ + if (!param.ds && param.gran != Gran64K) { + outputsize = MIN(outputsize, 48); + } } else { param = aa32_va_parameters(env, address, mmu_idx); level = 1; From bd142b2391a4369f39f9f4e8c48b15ed18280446 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Nov 2022 11:55:52 -0500 Subject: [PATCH 15/17] rtl8139: avoid clobbering tx descriptor bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The device turns the Tx Descriptor into a Tx Status descriptor after fully reading the descriptor. This involves clearing Tx Own (bit 31) to indicate that the driver has ownership of the descriptor again as well as several other bits. The code keeps the first dword of the Tx Descriptor in the txdw0 local variable. txdw0 is reused to build the first word of the Tx Status descriptor. Later on the code uses txdw0 again, incorrectly assuming that it still contains the first dword of the Tx Descriptor. The tx offloading code misbehaves because it sees bogus bits in txdw0. Use a separate local variable for Tx Status and preserve Tx Descriptor in txdw0. Acked-by: Jason Wang Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Stefan Hajnoczi Message-Id: <20221117165554.1773409-2-stefanha@redhat.com> --- hw/net/rtl8139.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c index e6643e3c9d..ffef3789b5 100644 --- a/hw/net/rtl8139.c +++ b/hw/net/rtl8139.c @@ -2027,18 +2027,21 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s) s->currCPlusTxDesc = 0; } + /* Build the Tx Status Descriptor */ + uint32_t tx_status = txdw0; + /* transfer ownership to target */ - txdw0 &= ~CP_TX_OWN; + tx_status &= ~CP_TX_OWN; /* reset error indicator bits */ - txdw0 &= ~CP_TX_STATUS_UNF; - txdw0 &= ~CP_TX_STATUS_TES; - txdw0 &= ~CP_TX_STATUS_OWC; - txdw0 &= ~CP_TX_STATUS_LNKF; - txdw0 &= ~CP_TX_STATUS_EXC; + tx_status &= ~CP_TX_STATUS_UNF; + tx_status &= ~CP_TX_STATUS_TES; + tx_status &= ~CP_TX_STATUS_OWC; + tx_status &= ~CP_TX_STATUS_LNKF; + tx_status &= ~CP_TX_STATUS_EXC; /* update ring data */ - val = cpu_to_le32(txdw0); + val = cpu_to_le32(tx_status); pci_dma_write(d, cplus_tx_ring_desc, (uint8_t *)&val, 4); /* Now decide if descriptor being processed is holding the last segment of packet */ From c74831a02c818f89d10f5475cd0fb9ba40bfb2a8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Nov 2022 11:55:53 -0500 Subject: [PATCH 16/17] rtl8139: keep Tx command mode 0 and 1 separate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two Tx Descriptor formats called mode 0 and mode 1. The mode is determined by the Large Send bit. CP_TX_IPCS (bit 18) is defined in mode 1 but the code checks the bit unconditionally. In mode 0 bit 18 is part of the Large Send MSS value. Explicitly check the Large Send bit to distinguish Tx command modes. This avoids bugs where modes are confused. Note that I didn't find any actual bugs aside from needlessly computing the IP checksum when the Large Send bit is enabled. Acked-by: Jason Wang Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Stefan Hajnoczi Message-Id: <20221117165554.1773409-3-stefanha@redhat.com> --- hw/net/rtl8139.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c index ffef3789b5..6dd7a8e6e0 100644 --- a/hw/net/rtl8139.c +++ b/hw/net/rtl8139.c @@ -2135,7 +2135,7 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s) } ip_data_len -= hlen; - if (txdw0 & CP_TX_IPCS) + if (!(txdw0 & CP_TX_LGSEN) && (txdw0 & CP_TX_IPCS)) { DPRINTF("+++ C+ mode need IP checksum\n"); @@ -2268,7 +2268,7 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s) /* Stop sending this frame */ saved_size = 0; } - else if (txdw0 & (CP_TX_TCPCS|CP_TX_UDPCS)) + else if (!(txdw0 & CP_TX_LGSEN) && (txdw0 & (CP_TX_TCPCS|CP_TX_UDPCS))) { DPRINTF("+++ C+ mode need TCP or UDP checksum\n"); From 6d71357a3b651ec9db126e4862b77e13165427f5 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Nov 2022 11:55:54 -0500 Subject: [PATCH 17/17] rtl8139: honor large send MSS value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Large-Send Task Offload Tx Descriptor (9.2.1 Transmit) has a Large-Send MSS value where the driver specifies the MSS. See the datasheet here: http://realtek.info/pdf/rtl8139cp.pdf The code ignores this value and uses a hardcoded MSS of 1500 bytes instead. When the MTU is less than 1500 bytes the hardcoded value results in IP fragmentation and poor performance. Use the Large-Send MSS value to correctly size Large-Send packets. Jason Wang noticed that the Large-Send MSS value mask was incorrect so it is adjusted to match the datasheet and Linux 8139cp driver. This issue was discussed in the past here: https://lore.kernel.org/all/20161114162505.GD26664@stefanha-x1.localdomain/ Reported-by: Russell King - ARM Linux Reported-by: Tobias Fiebig Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1312 Acked-by: Jason Wang Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Stefan Hajnoczi Message-Id: <20221117165554.1773409-4-stefanha@redhat.com> --- hw/net/rtl8139.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c index 6dd7a8e6e0..700b1b66b6 100644 --- a/hw/net/rtl8139.c +++ b/hw/net/rtl8139.c @@ -77,7 +77,6 @@ ( ( input ) & ( size - 1 ) ) #define ETHER_TYPE_LEN 2 -#define ETH_MTU 1500 #define VLAN_TCI_LEN 2 #define VLAN_HLEN (ETHER_TYPE_LEN + VLAN_TCI_LEN) @@ -1934,8 +1933,9 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s) #define CP_TX_LS (1<<28) /* large send packet flag */ #define CP_TX_LGSEN (1<<27) -/* large send MSS mask, bits 16...25 */ -#define CP_TC_LGSEN_MSS_MASK ((1 << 12) - 1) +/* large send MSS mask, bits 16...26 */ +#define CP_TC_LGSEN_MSS_SHIFT 16 +#define CP_TC_LGSEN_MSS_MASK ((1 << 11) - 1) /* IP checksum offload flag */ #define CP_TX_IPCS (1<<18) @@ -2152,10 +2152,11 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s) goto skip_offload; } - int large_send_mss = (txdw0 >> 16) & CP_TC_LGSEN_MSS_MASK; + int large_send_mss = (txdw0 >> CP_TC_LGSEN_MSS_SHIFT) & + CP_TC_LGSEN_MSS_MASK; - DPRINTF("+++ C+ mode offloaded task TSO MTU=%d IP data %d " - "frame data %d specified MSS=%d\n", ETH_MTU, + DPRINTF("+++ C+ mode offloaded task TSO IP data %d " + "frame data %d specified MSS=%d\n", ip_data_len, saved_size - ETH_HLEN, large_send_mss); int tcp_send_offset = 0; @@ -2180,25 +2181,22 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s) goto skip_offload; } - /* ETH_MTU = ip header len + tcp header len + payload */ int tcp_data_len = ip_data_len - tcp_hlen; - int tcp_chunk_size = ETH_MTU - hlen - tcp_hlen; DPRINTF("+++ C+ mode TSO IP data len %d TCP hlen %d TCP " - "data len %d TCP chunk size %d\n", ip_data_len, - tcp_hlen, tcp_data_len, tcp_chunk_size); + "data len %d\n", ip_data_len, tcp_hlen, tcp_data_len); /* note the cycle below overwrites IP header data, but restores it from saved_ip_header before sending packet */ int is_last_frame = 0; - for (tcp_send_offset = 0; tcp_send_offset < tcp_data_len; tcp_send_offset += tcp_chunk_size) + for (tcp_send_offset = 0; tcp_send_offset < tcp_data_len; tcp_send_offset += large_send_mss) { - uint16_t chunk_size = tcp_chunk_size; + uint16_t chunk_size = large_send_mss; /* check if this is the last frame */ - if (tcp_send_offset + tcp_chunk_size >= tcp_data_len) + if (tcp_send_offset + large_send_mss >= tcp_data_len) { is_last_frame = 1; chunk_size = tcp_data_len - tcp_send_offset; @@ -2247,7 +2245,7 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s) ip->ip_len = cpu_to_be16(hlen + tcp_hlen + chunk_size); /* increment IP id for subsequent frames */ - ip->ip_id = cpu_to_be16(tcp_send_offset/tcp_chunk_size + be16_to_cpu(ip->ip_id)); + ip->ip_id = cpu_to_be16(tcp_send_offset/large_send_mss + be16_to_cpu(ip->ip_id)); ip->ip_sum = 0; ip->ip_sum = ip_checksum(eth_payload_data, hlen);