From 06639f8ff53d1dbfa709377499e6c30eca9c3c9a Mon Sep 17 00:00:00 2001
From: Bin Meng <bin.meng@windriver.com>
Date: Tue, 25 Oct 2022 22:10:15 +0800
Subject: [PATCH 01/17] chardev/char-win-stdio: Pass Ctrl+C to guest with a
 multiplexed monitor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At present when pressing Ctrl+C from a guest running on QEMU Windows
with a multiplexed monitor, e.g.: -serial mon:stdio, QEMU executable
just exits. This behavior is inconsistent with the Linux version.

Such behavior is caused by unconditionally setting the input mode
ENABLE_PROCESSED_INPUT for a console's input buffer. Fix this by
testing whether the chardev is allowed to do so.

Signed-off-by: Bin Meng <bin.meng@windriver.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Message-Id: <20221025141015.612291-1-bin.meng@windriver.com>
---
 chardev/char-win-stdio.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/chardev/char-win-stdio.c b/chardev/char-win-stdio.c
index a4771ab82e..eb830eabd9 100644
--- a/chardev/char-win-stdio.c
+++ b/chardev/char-win-stdio.c
@@ -146,6 +146,8 @@ static void qemu_chr_open_stdio(Chardev *chr,
                                 bool *be_opened,
                                 Error **errp)
 {
+    ChardevStdio *opts = backend->u.stdio.data;
+    bool stdio_allow_signal = !opts->has_signal || opts->signal;
     WinStdioChardev *stdio = WIN_STDIO_CHARDEV(chr);
     DWORD              dwMode;
     int                is_console = 0;
@@ -193,7 +195,11 @@ static void qemu_chr_open_stdio(Chardev *chr,
     if (is_console) {
         /* set the terminal in raw mode */
         /* ENABLE_QUICK_EDIT_MODE | ENABLE_EXTENDED_FLAGS */
-        dwMode |= ENABLE_PROCESSED_INPUT;
+        if (stdio_allow_signal) {
+            dwMode |= ENABLE_PROCESSED_INPUT;
+        } else {
+            dwMode &= ~ENABLE_PROCESSED_INPUT;
+        }
     }
 
     SetConsoleMode(stdio->hStdIn, dwMode);

From a216ec85b78ea96b51950665879524132f6e678c Mon Sep 17 00:00:00 2001
From: Fiona Ebner <f.ebner@proxmox.com>
Date: Thu, 13 Oct 2022 10:41:00 +0200
Subject: [PATCH 02/17] migration/channel-block: fix return value for
 qio_channel_block_{readv,writev}

in the error case. The documentation in include/io/channel.h states
that -1 or QIO_CHANNEL_ERR_BLOCK should be returned upon error. Simply
passing along the return value from the bdrv-functions has the
potential to confuse the call sides. Non-blocking mode is not
implemented currently, so -1 it is.

Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/channel-block.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/migration/channel-block.c b/migration/channel-block.c
index c55c8c93ce..f4ab53acdb 100644
--- a/migration/channel-block.c
+++ b/migration/channel-block.c
@@ -62,7 +62,8 @@ qio_channel_block_readv(QIOChannel *ioc,
     qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov);
     ret = bdrv_readv_vmstate(bioc->bs, &qiov, bioc->offset);
     if (ret < 0) {
-        return ret;
+        error_setg_errno(errp, -ret, "bdrv_readv_vmstate failed");
+        return -1;
     }
 
     bioc->offset += qiov.size;
@@ -86,7 +87,8 @@ qio_channel_block_writev(QIOChannel *ioc,
     qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov);
     ret = bdrv_writev_vmstate(bioc->bs, &qiov, bioc->offset);
     if (ret < 0) {
-        return ret;
+        error_setg_errno(errp, -ret, "bdrv_writev_vmstate failed");
+        return -1;
     }
 
     bioc->offset += qiov.size;

From 4cc47b439594327b213f9b6a67803f1a503c2cb7 Mon Sep 17 00:00:00 2001
From: Leonardo Bras <leobras@redhat.com>
Date: Tue, 25 Oct 2022 01:47:28 -0300
Subject: [PATCH 03/17] migration/multifd/zero-copy: Create helper function for
 flushing

Move flushing code from multifd_send_sync_main() to a new helper, and call
it in multifd_send_sync_main().

Signed-off-by: Leonardo Bras <leobras@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/multifd.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index 586ddc9d65..509bbbe3bf 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -566,6 +566,23 @@ void multifd_save_cleanup(void)
     multifd_send_state = NULL;
 }
 
+static int multifd_zero_copy_flush(QIOChannel *c)
+{
+    int ret;
+    Error *err = NULL;
+
+    ret = qio_channel_flush(c, &err);
+    if (ret < 0) {
+        error_report_err(err);
+        return -1;
+    }
+    if (ret == 1) {
+        dirty_sync_missed_zero_copy();
+    }
+
+    return ret;
+}
+
 int multifd_send_sync_main(QEMUFile *f)
 {
     int i;
@@ -616,17 +633,8 @@ int multifd_send_sync_main(QEMUFile *f)
         qemu_mutex_unlock(&p->mutex);
         qemu_sem_post(&p->sem);
 
-        if (flush_zero_copy && p->c) {
-            int ret;
-            Error *err = NULL;
-
-            ret = qio_channel_flush(p->c, &err);
-            if (ret < 0) {
-                error_report_err(err);
-                return -1;
-            } else if (ret == 1) {
-                dirty_sync_missed_zero_copy();
-            }
+        if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) {
+            return -1;
         }
     }
     for (i = 0; i < migrate_multifd_channels(); i++) {

From 4934a5dd7c68f5ab15f17498db4fc20ed6db9578 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 4 Oct 2022 14:24:26 -0400
Subject: [PATCH 04/17] migration: Fix possible infinite loop of ram save
 process

When starting ram saving procedure (especially at the completion phase),
always set last_seen_block to non-NULL to make sure we can always correctly
detect the case where "we've migrated all the dirty pages".

Then we'll guarantee both last_seen_block and pss.block will be valid
always before the loop starts.

See the comment in the code for some details.

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/ram.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..1d42414ecc 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2546,14 +2546,22 @@ static int ram_find_and_save_block(RAMState *rs)
         return pages;
     }
 
+    /*
+     * Always keep last_seen_block/last_page valid during this procedure,
+     * because find_dirty_block() relies on these values (e.g., we compare
+     * last_seen_block with pss.block to see whether we searched all the
+     * ramblocks) to detect the completion of migration.  Having NULL value
+     * of last_seen_block can conditionally cause below loop to run forever.
+     */
+    if (!rs->last_seen_block) {
+        rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
+        rs->last_page = 0;
+    }
+
     pss.block = rs->last_seen_block;
     pss.page = rs->last_page;
     pss.complete_round = false;
 
-    if (!pss.block) {
-        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
-    }
-
     do {
         again = true;
         found = get_queued_page(rs, &pss);

From f5816b5c86ed399c99ce8662a4ed96aab32c5eef Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 4 Oct 2022 14:24:27 -0400
Subject: [PATCH 05/17] migration: Fix race on qemu_file_shutdown()

In qemu_file_shutdown(), there's a possible race if with current order of
operation.  There're two major things to do:

  (1) Do real shutdown() (e.g. shutdown() syscall on socket)
  (2) Update qemufile's last_error

We must do (2) before (1) otherwise there can be a race condition like:

      page receiver                     other thread
      -------------                     ------------
      qemu_get_buffer()
                                        do shutdown()
        returns 0 (buffer all zero)
        (meanwhile we didn't check this retcode)
      try to detect IO error
        last_error==NULL, IO okay
      install ALL-ZERO page
                                        set last_error
      --> guest crash!

To fix this, we can also check retval of qemu_get_buffer(), but not all
APIs can be properly checked and ultimately we still need to go back to
qemu_file_get_error().  E.g. qemu_get_byte() doesn't return error.

Maybe some day a rework of qemufile API is really needed, but for now keep
using qemu_file_get_error() and fix it by not allowing that race condition
to happen.  Here shutdown() is indeed special because the last_error was
emulated.  For real -EIO errors it'll always be set when e.g. sendmsg()
error triggers so we won't miss those ones, only shutdown() is a bit tricky
here.

Cc: Daniel P. Berrange <berrange@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/qemu-file.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 4f400c2e52..2d5f74ffc2 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -79,6 +79,30 @@ int qemu_file_shutdown(QEMUFile *f)
     int ret = 0;
 
     f->shutdown = true;
+
+    /*
+     * We must set qemufile error before the real shutdown(), otherwise
+     * there can be a race window where we thought IO all went though
+     * (because last_error==NULL) but actually IO has already stopped.
+     *
+     * If without correct ordering, the race can happen like this:
+     *
+     *      page receiver                     other thread
+     *      -------------                     ------------
+     *      qemu_get_buffer()
+     *                                        do shutdown()
+     *        returns 0 (buffer all zero)
+     *        (we didn't check this retcode)
+     *      try to detect IO error
+     *        last_error==NULL, IO okay
+     *      install ALL-ZERO page
+     *                                        set last_error
+     *      --> guest crash!
+     */
+    if (!f->last_error) {
+        qemu_file_set_error(f, -EIO);
+    }
+
     if (!qio_channel_has_feature(f->ioc,
                                  QIO_CHANNEL_FEATURE_SHUTDOWN)) {
         return -ENOSYS;
@@ -88,9 +112,6 @@ int qemu_file_shutdown(QEMUFile *f)
         ret = -EIO;
     }
 
-    if (!f->last_error) {
-        qemu_file_set_error(f, -EIO);
-    }
     return ret;
 }
 

From afed4273b5c9438dfbaa0b4762d0433f295ccdc1 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 4 Oct 2022 14:24:28 -0400
Subject: [PATCH 06/17] migration: Disallow postcopy preempt to be used with
 compress

The preempt mode requires the capability to assign channel for each of the
page, while the compression logic will currently assign pages to different
compress thread/local-channel so potentially they're incompatible.

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/migration.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index 739bb683f3..f3ed77a7d0 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1337,6 +1337,17 @@ static bool migrate_caps_check(bool *cap_list,
             error_setg(errp, "Postcopy preempt requires postcopy-ram");
             return false;
         }
+
+        /*
+         * Preempt mode requires urgent pages to be sent in separate
+         * channel, OTOH compression logic will disorder all pages into
+         * different compression channels, which is not compatible with the
+         * preempt assumptions on channel assignments.
+         */
+        if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) {
+            error_setg(errp, "Postcopy preempt not compatible with compress");
+            return false;
+        }
     }
 
     return true;

From cedb70eafb4fd51d9c714981509d97b9f4055be5 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 4 Oct 2022 14:24:29 -0400
Subject: [PATCH 07/17] migration: Use non-atomic ops for clear log bitmap

Since we already have bitmap_mutex to protect either the dirty bitmap or
the clear log bitmap, we don't need atomic operations to set/clear/test on
the clear log bitmap.  Switching all ops from atomic to non-atomic
versions, meanwhile touch up the comments to show which lock is in charge.

Introduced non-atomic version of bitmap_test_and_clear_atomic(), mostly the
same as the atomic version but simplified a few places, e.g. dropped the
"old_bits" variable, and also the explicit memory barriers.

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 include/exec/ram_addr.h | 11 +++++-----
 include/exec/ramblock.h |  3 +++
 include/qemu/bitmap.h   |  1 +
 util/bitmap.c           | 45 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 1500680458..f4fb6a2111 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -42,7 +42,8 @@ static inline long clear_bmap_size(uint64_t pages, uint8_t shift)
 }
 
 /**
- * clear_bmap_set: set clear bitmap for the page range
+ * clear_bmap_set: set clear bitmap for the page range.  Must be with
+ * bitmap_mutex held.
  *
  * @rb: the ramblock to operate on
  * @start: the start page number
@@ -55,12 +56,12 @@ static inline void clear_bmap_set(RAMBlock *rb, uint64_t start,
 {
     uint8_t shift = rb->clear_bmap_shift;
 
-    bitmap_set_atomic(rb->clear_bmap, start >> shift,
-                      clear_bmap_size(npages, shift));
+    bitmap_set(rb->clear_bmap, start >> shift, clear_bmap_size(npages, shift));
 }
 
 /**
- * clear_bmap_test_and_clear: test clear bitmap for the page, clear if set
+ * clear_bmap_test_and_clear: test clear bitmap for the page, clear if set.
+ * Must be with bitmap_mutex held.
  *
  * @rb: the ramblock to operate on
  * @page: the page number to check
@@ -71,7 +72,7 @@ static inline bool clear_bmap_test_and_clear(RAMBlock *rb, uint64_t page)
 {
     uint8_t shift = rb->clear_bmap_shift;
 
-    return bitmap_test_and_clear_atomic(rb->clear_bmap, page >> shift, 1);
+    return bitmap_test_and_clear(rb->clear_bmap, page >> shift, 1);
 }
 
 static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset)
diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index 6cbedf9e0c..adc03df59c 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -53,6 +53,9 @@ struct RAMBlock {
      * and split clearing of dirty bitmap on the remote node (e.g.,
      * KVM).  The bitmap will be set only when doing global sync.
      *
+     * It is only used during src side of ram migration, and it is
+     * protected by the global ram_state.bitmap_mutex.
+     *
      * NOTE: this bitmap is different comparing to the other bitmaps
      * in that one bit can represent multiple guest pages (which is
      * decided by the `clear_bmap_shift' variable below).  On
diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h
index 82a1d2f41f..3ccb00865f 100644
--- a/include/qemu/bitmap.h
+++ b/include/qemu/bitmap.h
@@ -253,6 +253,7 @@ void bitmap_set(unsigned long *map, long i, long len);
 void bitmap_set_atomic(unsigned long *map, long i, long len);
 void bitmap_clear(unsigned long *map, long start, long nr);
 bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr);
+bool bitmap_test_and_clear(unsigned long *map, long start, long nr);
 void bitmap_copy_and_clear_atomic(unsigned long *dst, unsigned long *src,
                                   long nr);
 unsigned long bitmap_find_next_zero_area(unsigned long *map,
diff --git a/util/bitmap.c b/util/bitmap.c
index f81d8057a7..8d12e90a5a 100644
--- a/util/bitmap.c
+++ b/util/bitmap.c
@@ -240,6 +240,51 @@ void bitmap_clear(unsigned long *map, long start, long nr)
     }
 }
 
+bool bitmap_test_and_clear(unsigned long *map, long start, long nr)
+{
+    unsigned long *p = map + BIT_WORD(start);
+    const long size = start + nr;
+    int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+    unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+    bool dirty = false;
+
+    assert(start >= 0 && nr >= 0);
+
+    /* First word */
+    if (nr - bits_to_clear > 0) {
+        if ((*p) & mask_to_clear) {
+            dirty = true;
+        }
+        *p &= ~mask_to_clear;
+        nr -= bits_to_clear;
+        bits_to_clear = BITS_PER_LONG;
+        p++;
+    }
+
+    /* Full words */
+    if (bits_to_clear == BITS_PER_LONG) {
+        while (nr >= BITS_PER_LONG) {
+            if (*p) {
+                dirty = true;
+                *p = 0;
+            }
+            nr -= BITS_PER_LONG;
+            p++;
+        }
+    }
+
+    /* Last word */
+    if (nr) {
+        mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+        if ((*p) & mask_to_clear) {
+            dirty = true;
+        }
+        *p &= ~mask_to_clear;
+    }
+
+    return dirty;
+}
+
 bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr)
 {
     unsigned long *p = map + BIT_WORD(start);

From 6f39c90b86b9d3772779f873ed88aaa75a220aba Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 4 Oct 2022 14:24:30 -0400
Subject: [PATCH 08/17] migration: Disable multifd explicitly with compression

Multifd thread model does not work for compression, explicitly disable it.

Note that previuosly even we can enable both of them, nothing will go
wrong, because the compression code has higher priority so multifd feature
will just be ignored.  Now we'll fail even earlier at config time so the
user should be aware of the consequence better.

Note that there can be a slight chance of breaking existing users, but
let's assume they're not majority and not serious users, or they should
have found that multifd is not working already.

With that, we can safely drop the check in ram_save_target_page() for using
multifd, because when multifd=on then compression=off, then the removed
check on save_page_use_compression() will also always return false too.

Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/migration.c |  7 +++++++
 migration/ram.c       | 11 +++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index f3ed77a7d0..f485eea5fb 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1350,6 +1350,13 @@ static bool migrate_caps_check(bool *cap_list,
         }
     }
 
+    if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
+        if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) {
+            error_setg(errp, "Multifd is not compatible with compress");
+            return false;
+        }
+    }
+
     return true;
 }
 
diff --git a/migration/ram.c b/migration/ram.c
index 1d42414ecc..1338e47665 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2305,13 +2305,12 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
     }
 
     /*
-     * Do not use multifd for:
-     * 1. Compression as the first page in the new block should be posted out
-     *    before sending the compressed page
-     * 2. In postcopy as one whole host page should be placed
+     * Do not use multifd in postcopy as one whole host page should be
+     * placed.  Meanwhile postcopy requires atomic update of pages, so even
+     * if host page size == guest page size the dest guest during run may
+     * still see partially copied pages which is data corruption.
      */
-    if (!save_page_use_compression(rs) && migrate_use_multifd()
-        && !migration_in_postcopy()) {
+    if (migrate_use_multifd() && !migration_in_postcopy()) {
         return ram_save_multifd_page(rs, block, offset);
     }
 

From b5280437a7f49cf617cdd99bbbe2c7bd1652408b Mon Sep 17 00:00:00 2001
From: Juan Quintela <quintela@redhat.com>
Date: Mon, 3 Oct 2022 05:15:56 +0200
Subject: [PATCH 09/17] migration: Block migration comment or code is wrong

And it appears that what is wrong is the code. During bulk stage we
need to make sure that some block is dirty, but no games with
max_size at all.

Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 migration/block.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/migration/block.c b/migration/block.c
index 3577c815a9..4347da1526 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -880,8 +880,8 @@ static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
     blk_mig_unlock();
 
     /* Report at least one block pending during bulk phase */
-    if (pending <= max_size && !block_mig_state.bulk_completed) {
-        pending = max_size + BLK_MIG_BLOCK_SIZE;
+    if (!pending && !block_mig_state.bulk_completed) {
+        pending = BLK_MIG_BLOCK_SIZE;
     }
 
     trace_migration_block_save_pending(pending);

From 93e2da36ed944d05e78905e95983a44624ed064c Mon Sep 17 00:00:00 2001
From: Strahinja Jankovic <strahinjapjankovic@gmail.com>
Date: Mon, 21 Nov 2022 11:45:12 +0000
Subject: [PATCH 10/17] hw/sd: Fix sun4i allwinner-sdhost for U-Boot

Trying to run U-Boot for Cubieboard (Allwinner A10) fails because it cannot
access SD card. The problem is that FIFO register in current
allwinner-sdhost implementation is at the address corresponding to
Allwinner H3, but not A10.
Linux kernel is not affected since Linux driver uses DMA access and does
not use FIFO register for reading/writing.

This patch adds new class parameter `is_sun4i` and based on that
parameter uses register at offset 0x100 either as FIFO register (if
sun4i) or as threshold register (if not sun4i; in this case register at
0x200 is FIFO register).

Tested with U-Boot and Linux kernel image built for Cubieboard and
OrangePi PC.

Signed-off-by: Strahinja Jankovic <strahinja.p.jankovic@gmail.com>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Message-id: 20221112214900.24152-1-strahinja.p.jankovic@gmail.com
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/sd/allwinner-sdhost.c         | 67 ++++++++++++++++++++++----------
 include/hw/sd/allwinner-sdhost.h |  1 +
 2 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/hw/sd/allwinner-sdhost.c b/hw/sd/allwinner-sdhost.c
index 455d6eabf6..51e5e90830 100644
--- a/hw/sd/allwinner-sdhost.c
+++ b/hw/sd/allwinner-sdhost.c
@@ -65,7 +65,7 @@ enum {
     REG_SD_DLBA       = 0x84,  /* Descriptor List Base Address */
     REG_SD_IDST       = 0x88,  /* Internal DMA Controller Status */
     REG_SD_IDIE       = 0x8C,  /* Internal DMA Controller IRQ Enable */
-    REG_SD_THLDC      = 0x100, /* Card Threshold Control */
+    REG_SD_THLDC      = 0x100, /* Card Threshold Control / FIFO (sun4i only)*/
     REG_SD_DSBD       = 0x10C, /* eMMC DDR Start Bit Detection Control */
     REG_SD_RES_CRC    = 0x110, /* Response CRC from card/eMMC */
     REG_SD_DATA7_CRC  = 0x114, /* CRC Data 7 from card/eMMC */
@@ -415,10 +415,29 @@ static void allwinner_sdhost_dma(AwSdHostState *s)
     }
 }
 
+static uint32_t allwinner_sdhost_fifo_read(AwSdHostState *s)
+{
+    uint32_t res = 0;
+
+    if (sdbus_data_ready(&s->sdbus)) {
+        sdbus_read_data(&s->sdbus, &res, sizeof(uint32_t));
+        le32_to_cpus(&res);
+        allwinner_sdhost_update_transfer_cnt(s, sizeof(uint32_t));
+        allwinner_sdhost_auto_stop(s);
+        allwinner_sdhost_update_irq(s);
+    } else {
+        qemu_log_mask(LOG_GUEST_ERROR, "%s: no data ready on SD bus\n",
+                      __func__);
+    }
+
+    return res;
+}
+
 static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
                                       unsigned size)
 {
     AwSdHostState *s = AW_SDHOST(opaque);
+    AwSdHostClass *sc = AW_SDHOST_GET_CLASS(s);
     uint32_t res = 0;
 
     switch (offset) {
@@ -508,8 +527,12 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
     case REG_SD_IDIE:      /* Internal DMA Controller Interrupt Enable */
         res = s->dmac_irq;
         break;
-    case REG_SD_THLDC:     /* Card Threshold Control */
-        res = s->card_threshold;
+    case REG_SD_THLDC:     /* Card Threshold Control or FIFO register (sun4i) */
+        if (sc->is_sun4i) {
+            res = allwinner_sdhost_fifo_read(s);
+        } else {
+            res = s->card_threshold;
+        }
         break;
     case REG_SD_DSBD:      /* eMMC DDR Start Bit Detection Control */
         res = s->startbit_detect;
@@ -531,16 +554,7 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
         res = s->status_crc;
         break;
     case REG_SD_FIFO:      /* Read/Write FIFO */
-        if (sdbus_data_ready(&s->sdbus)) {
-            sdbus_read_data(&s->sdbus, &res, sizeof(uint32_t));
-            le32_to_cpus(&res);
-            allwinner_sdhost_update_transfer_cnt(s, sizeof(uint32_t));
-            allwinner_sdhost_auto_stop(s);
-            allwinner_sdhost_update_irq(s);
-        } else {
-            qemu_log_mask(LOG_GUEST_ERROR, "%s: no data ready on SD bus\n",
-                          __func__);
-        }
+        res = allwinner_sdhost_fifo_read(s);
         break;
     default:
         qemu_log_mask(LOG_GUEST_ERROR, "%s: out-of-bounds offset %"
@@ -553,11 +567,20 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
     return res;
 }
 
+static void allwinner_sdhost_fifo_write(AwSdHostState *s, uint64_t value)
+{
+    uint32_t u32 = cpu_to_le32(value);
+    sdbus_write_data(&s->sdbus, &u32, sizeof(u32));
+    allwinner_sdhost_update_transfer_cnt(s, sizeof(u32));
+    allwinner_sdhost_auto_stop(s);
+    allwinner_sdhost_update_irq(s);
+}
+
 static void allwinner_sdhost_write(void *opaque, hwaddr offset,
                                    uint64_t value, unsigned size)
 {
     AwSdHostState *s = AW_SDHOST(opaque);
-    uint32_t u32;
+    AwSdHostClass *sc = AW_SDHOST_GET_CLASS(s);
 
     trace_allwinner_sdhost_write(offset, value, size);
 
@@ -657,18 +680,18 @@ static void allwinner_sdhost_write(void *opaque, hwaddr offset,
         s->dmac_irq = value;
         allwinner_sdhost_update_irq(s);
         break;
-    case REG_SD_THLDC:     /* Card Threshold Control */
-        s->card_threshold = value;
+    case REG_SD_THLDC:     /* Card Threshold Control or FIFO (sun4i) */
+        if (sc->is_sun4i) {
+            allwinner_sdhost_fifo_write(s, value);
+        } else {
+            s->card_threshold = value;
+        }
         break;
     case REG_SD_DSBD:      /* eMMC DDR Start Bit Detection Control */
         s->startbit_detect = value;
         break;
     case REG_SD_FIFO:      /* Read/Write FIFO */
-        u32 = cpu_to_le32(value);
-        sdbus_write_data(&s->sdbus, &u32, sizeof(u32));
-        allwinner_sdhost_update_transfer_cnt(s, sizeof(u32));
-        allwinner_sdhost_auto_stop(s);
-        allwinner_sdhost_update_irq(s);
+        allwinner_sdhost_fifo_write(s, value);
         break;
     case REG_SD_RES_CRC:   /* Response CRC from card/eMMC */
     case REG_SD_DATA7_CRC: /* CRC Data 7 from card/eMMC */
@@ -834,12 +857,14 @@ static void allwinner_sdhost_sun4i_class_init(ObjectClass *klass, void *data)
 {
     AwSdHostClass *sc = AW_SDHOST_CLASS(klass);
     sc->max_desc_size = 8 * KiB;
+    sc->is_sun4i = true;
 }
 
 static void allwinner_sdhost_sun5i_class_init(ObjectClass *klass, void *data)
 {
     AwSdHostClass *sc = AW_SDHOST_CLASS(klass);
     sc->max_desc_size = 64 * KiB;
+    sc->is_sun4i = false;
 }
 
 static const TypeInfo allwinner_sdhost_info = {
diff --git a/include/hw/sd/allwinner-sdhost.h b/include/hw/sd/allwinner-sdhost.h
index bfe08ff4ef..30c1e60404 100644
--- a/include/hw/sd/allwinner-sdhost.h
+++ b/include/hw/sd/allwinner-sdhost.h
@@ -130,6 +130,7 @@ struct AwSdHostClass {
 
     /** Maximum buffer size in bytes per DMA descriptor */
     size_t max_desc_size;
+    bool   is_sun4i;
 
 };
 

From 69e7e60d011846f066af97589660eef52898519a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 21 Nov 2022 11:45:13 +0000
Subject: [PATCH 11/17] hw/intc: clean-up access to GIC multi-byte registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gic_dist_readb was returning a word value which just happened to work
as a result of the way we OR the data together. Lets fix it so only
the explicit byte is returned for each part of GICD_TYPER. I've
changed the return type to uint8_t although the overflow is only
detected with an explicit -Wconversion.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/intc/arm_gic.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c
index 492b2421ab..1a04144c38 100644
--- a/hw/intc/arm_gic.c
+++ b/hw/intc/arm_gic.c
@@ -941,7 +941,7 @@ static void gic_complete_irq(GICState *s, int cpu, int irq, MemTxAttrs attrs)
     gic_update(s);
 }
 
-static uint32_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs)
+static uint8_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs)
 {
     GICState *s = (GICState *)opaque;
     uint32_t res;
@@ -955,6 +955,7 @@ static uint32_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs)
     cm = 1 << cpu;
     if (offset < 0x100) {
         if (offset == 0) {      /* GICD_CTLR */
+            /* We rely here on the only non-zero bits being in byte 0 */
             if (s->security_extn && !attrs.secure) {
                 /* The NS bank of this register is just an alias of the
                  * EnableGrp1 bit in the S bank version.
@@ -964,11 +965,14 @@ static uint32_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs)
                 return s->ctlr;
             }
         }
-        if (offset == 4)
-            /* Interrupt Controller Type Register */
-            return ((s->num_irq / 32) - 1)
-                    | ((s->num_cpu - 1) << 5)
-                    | (s->security_extn << 10);
+        if (offset == 4) {
+            /* GICD_TYPER byte 0 */
+            return ((s->num_irq / 32) - 1) | ((s->num_cpu - 1) << 5);
+        }
+        if (offset == 5) {
+            /* GICD_TYPER byte 1 */
+            return (s->security_extn << 2);
+        }
         if (offset < 0x08)
             return 0;
         if (offset >= 0x80) {

From 3d5af538a4fa8456a7e54b8115afe3d6358c1ce5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Mon, 21 Nov 2022 11:45:13 +0000
Subject: [PATCH 12/17] hw/intc: add implementation of GICD_IIDR to Arm GIC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

a66a24585f (hw/intc/arm_gic: Implement read of GICC_IIDR) implemented
this for the CPU interface register. The fact we don't implement it
shows up when running Xen with -d guest_error which is definitely
wrong because the guest is perfectly entitled to read it.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 hw/intc/arm_gic.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c
index 1a04144c38..7a34bc0998 100644
--- a/hw/intc/arm_gic.c
+++ b/hw/intc/arm_gic.c
@@ -973,8 +973,18 @@ static uint8_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs)
             /* GICD_TYPER byte 1 */
             return (s->security_extn << 2);
         }
-        if (offset < 0x08)
+        if (offset == 8) {
+            /* GICD_IIDR byte 0 */
+            return 0x3b; /* Arm JEP106 identity */
+        }
+        if (offset == 9) {
+            /* GICD_IIDR byte 1 */
+            return 0x04; /* Arm JEP106 identity */
+        }
+        if (offset < 0x0c) {
+            /* All other bytes in this range are RAZ */
             return 0;
+        }
         if (offset >= 0x80) {
             /* Interrupt Group Registers: these RAZ/WI if this is an NS
              * access to a GIC with the security extensions, or if the GIC

From c4462523ff0790cbefb1c206cc34c85ec686b1d5 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 21 Nov 2022 11:45:13 +0000
Subject: [PATCH 13/17] tests/avocado/boot_linux.py: Bump aarch64 virt test
 timeout to 720s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The two tests
tests/avocado/boot_linux.py:BootLinuxAarch64.test_virt_tcg_gicv2
tests/avocado/boot_linux.py:BootLinuxAarch64.test_virt_tcg_gicv3

take quite a long time to run, and the current timeout of 240s
is not enough for the tests to complete on slow machines:
we've seen these tests time out in the gitlab CI in the
'avocado-system-alpine' CI job, for instance. The timeout
is also insufficient for running the test with a debug build
of QEMU: on my machine the tests take over 10 minutes to run
in that config.

Push the timeout up to 720s so that the test definitely has
enough time to complete.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
 tests/avocado/boot_linux.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/avocado/boot_linux.py b/tests/avocado/boot_linux.py
index 571d33882a..32adae6ff6 100644
--- a/tests/avocado/boot_linux.py
+++ b/tests/avocado/boot_linux.py
@@ -64,7 +64,7 @@ class BootLinuxAarch64(LinuxTest):
     :avocado: tags=machine:virt
     :avocado: tags=machine:gic-version=2
     """
-    timeout = 240
+    timeout = 720
 
     def add_common_args(self):
         self.vm.add_args('-bios',

From 312b71abce3005ca7294dc0db7d548dc7cc41fbf Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 21 Nov 2022 11:45:13 +0000
Subject: [PATCH 14/17] target/arm: Limit LPA2 effective output address when
 TCR.DS == 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With LPA2, the effective output address size is at most 48 bits when
TCR.DS == 0. This case is currently unhandled in the page table walker,
where we happily assume LVA/64k granule when outputsize > 48 and
param.ds == 0, resulting in the wrong conversion to be used from a
page table descriptor to a physical address.

    if (outputsize > 48) {
        if (param.ds) {
            descaddr |= extract64(descriptor, 8, 2) << 50;
        } else {
            descaddr |= extract64(descriptor, 12, 4) << 48;
        }

So cap the outputsize to 48 when TCR.DS is cleared, as per the
architecture.

Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20221116170316.259695-1-ardb@kernel.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/ptw.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 3745ac9723..9a6277d862 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -1222,6 +1222,14 @@ static bool get_phys_addr_lpae(CPUARMState *env, S1Translate *ptw,
         ps = MIN(ps, param.ps);
         assert(ps < ARRAY_SIZE(pamax_map));
         outputsize = pamax_map[ps];
+
+        /*
+         * With LPA2, the effective output address (OA) size is at most 48 bits
+         * unless TCR.DS == 1
+         */
+        if (!param.ds && param.gran != Gran64K) {
+            outputsize = MIN(outputsize, 48);
+        }
     } else {
         param = aa32_va_parameters(env, address, mmu_idx);
         level = 1;

From bd142b2391a4369f39f9f4e8c48b15ed18280446 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 17 Nov 2022 11:55:52 -0500
Subject: [PATCH 15/17] rtl8139: avoid clobbering tx descriptor bits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The device turns the Tx Descriptor into a Tx Status descriptor after
fully reading the descriptor. This involves clearing Tx Own (bit 31) to
indicate that the driver has ownership of the descriptor again as well
as several other bits.

The code keeps the first dword of the Tx Descriptor in the txdw0 local
variable. txdw0 is reused to build the first word of the Tx Status
descriptor. Later on the code uses txdw0 again, incorrectly assuming
that it still contains the first dword of the Tx Descriptor. The tx
offloading code misbehaves because it sees bogus bits in txdw0.

Use a separate local variable for Tx Status and preserve Tx Descriptor
in txdw0.

Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20221117165554.1773409-2-stefanha@redhat.com>
---
 hw/net/rtl8139.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c
index e6643e3c9d..ffef3789b5 100644
--- a/hw/net/rtl8139.c
+++ b/hw/net/rtl8139.c
@@ -2027,18 +2027,21 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
             s->currCPlusTxDesc = 0;
     }
 
+    /* Build the Tx Status Descriptor */
+    uint32_t tx_status = txdw0;
+
     /* transfer ownership to target */
-    txdw0 &= ~CP_TX_OWN;
+    tx_status &= ~CP_TX_OWN;
 
     /* reset error indicator bits */
-    txdw0 &= ~CP_TX_STATUS_UNF;
-    txdw0 &= ~CP_TX_STATUS_TES;
-    txdw0 &= ~CP_TX_STATUS_OWC;
-    txdw0 &= ~CP_TX_STATUS_LNKF;
-    txdw0 &= ~CP_TX_STATUS_EXC;
+    tx_status &= ~CP_TX_STATUS_UNF;
+    tx_status &= ~CP_TX_STATUS_TES;
+    tx_status &= ~CP_TX_STATUS_OWC;
+    tx_status &= ~CP_TX_STATUS_LNKF;
+    tx_status &= ~CP_TX_STATUS_EXC;
 
     /* update ring data */
-    val = cpu_to_le32(txdw0);
+    val = cpu_to_le32(tx_status);
     pci_dma_write(d, cplus_tx_ring_desc, (uint8_t *)&val, 4);
 
     /* Now decide if descriptor being processed is holding the last segment of packet */

From c74831a02c818f89d10f5475cd0fb9ba40bfb2a8 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 17 Nov 2022 11:55:53 -0500
Subject: [PATCH 16/17] rtl8139: keep Tx command mode 0 and 1 separate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are two Tx Descriptor formats called mode 0 and mode 1. The mode
is determined by the Large Send bit.

CP_TX_IPCS (bit 18) is defined in mode 1 but the code checks the bit
unconditionally. In mode 0 bit 18 is part of the Large Send MSS value.

Explicitly check the Large Send bit to distinguish Tx command modes.
This avoids bugs where modes are confused. Note that I didn't find any
actual bugs aside from needlessly computing the IP checksum when the
Large Send bit is enabled.

Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20221117165554.1773409-3-stefanha@redhat.com>
---
 hw/net/rtl8139.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c
index ffef3789b5..6dd7a8e6e0 100644
--- a/hw/net/rtl8139.c
+++ b/hw/net/rtl8139.c
@@ -2135,7 +2135,7 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
             }
             ip_data_len -= hlen;
 
-            if (txdw0 & CP_TX_IPCS)
+            if (!(txdw0 & CP_TX_LGSEN) && (txdw0 & CP_TX_IPCS))
             {
                 DPRINTF("+++ C+ mode need IP checksum\n");
 
@@ -2268,7 +2268,7 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
                 /* Stop sending this frame */
                 saved_size = 0;
             }
-            else if (txdw0 & (CP_TX_TCPCS|CP_TX_UDPCS))
+            else if (!(txdw0 & CP_TX_LGSEN) && (txdw0 & (CP_TX_TCPCS|CP_TX_UDPCS)))
             {
                 DPRINTF("+++ C+ mode need TCP or UDP checksum\n");
 

From 6d71357a3b651ec9db126e4862b77e13165427f5 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 17 Nov 2022 11:55:54 -0500
Subject: [PATCH 17/17] rtl8139: honor large send MSS value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Large-Send Task Offload Tx Descriptor (9.2.1 Transmit) has a
Large-Send MSS value where the driver specifies the MSS. See the
datasheet here:
http://realtek.info/pdf/rtl8139cp.pdf

The code ignores this value and uses a hardcoded MSS of 1500 bytes
instead. When the MTU is less than 1500 bytes the hardcoded value
results in IP fragmentation and poor performance.

Use the Large-Send MSS value to correctly size Large-Send packets.

Jason Wang <jasowang@redhat.com> noticed that the Large-Send MSS value
mask was incorrect so it is adjusted to match the datasheet and Linux
8139cp driver.

This issue was discussed in the past here:
https://lore.kernel.org/all/20161114162505.GD26664@stefanha-x1.localdomain/

Reported-by: Russell King - ARM Linux <linux@armlinux.org.uk>
Reported-by: Tobias Fiebig <tobias+git@fiebig.nl>
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1312
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20221117165554.1773409-4-stefanha@redhat.com>
---
 hw/net/rtl8139.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c
index 6dd7a8e6e0..700b1b66b6 100644
--- a/hw/net/rtl8139.c
+++ b/hw/net/rtl8139.c
@@ -77,7 +77,6 @@
     ( ( input ) & ( size - 1 )  )
 
 #define ETHER_TYPE_LEN 2
-#define ETH_MTU     1500
 
 #define VLAN_TCI_LEN 2
 #define VLAN_HLEN (ETHER_TYPE_LEN + VLAN_TCI_LEN)
@@ -1934,8 +1933,9 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
 #define CP_TX_LS (1<<28)
 /* large send packet flag */
 #define CP_TX_LGSEN (1<<27)
-/* large send MSS mask, bits 16...25 */
-#define CP_TC_LGSEN_MSS_MASK ((1 << 12) - 1)
+/* large send MSS mask, bits 16...26 */
+#define CP_TC_LGSEN_MSS_SHIFT 16
+#define CP_TC_LGSEN_MSS_MASK ((1 << 11) - 1)
 
 /* IP checksum offload flag */
 #define CP_TX_IPCS (1<<18)
@@ -2152,10 +2152,11 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
                     goto skip_offload;
                 }
 
-                int large_send_mss = (txdw0 >> 16) & CP_TC_LGSEN_MSS_MASK;
+                int large_send_mss = (txdw0 >> CP_TC_LGSEN_MSS_SHIFT) &
+                                     CP_TC_LGSEN_MSS_MASK;
 
-                DPRINTF("+++ C+ mode offloaded task TSO MTU=%d IP data %d "
-                    "frame data %d specified MSS=%d\n", ETH_MTU,
+                DPRINTF("+++ C+ mode offloaded task TSO IP data %d "
+                    "frame data %d specified MSS=%d\n",
                     ip_data_len, saved_size - ETH_HLEN, large_send_mss);
 
                 int tcp_send_offset = 0;
@@ -2180,25 +2181,22 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
                     goto skip_offload;
                 }
 
-                /* ETH_MTU = ip header len + tcp header len + payload */
                 int tcp_data_len = ip_data_len - tcp_hlen;
-                int tcp_chunk_size = ETH_MTU - hlen - tcp_hlen;
 
                 DPRINTF("+++ C+ mode TSO IP data len %d TCP hlen %d TCP "
-                    "data len %d TCP chunk size %d\n", ip_data_len,
-                    tcp_hlen, tcp_data_len, tcp_chunk_size);
+                    "data len %d\n", ip_data_len, tcp_hlen, tcp_data_len);
 
                 /* note the cycle below overwrites IP header data,
                    but restores it from saved_ip_header before sending packet */
 
                 int is_last_frame = 0;
 
-                for (tcp_send_offset = 0; tcp_send_offset < tcp_data_len; tcp_send_offset += tcp_chunk_size)
+                for (tcp_send_offset = 0; tcp_send_offset < tcp_data_len; tcp_send_offset += large_send_mss)
                 {
-                    uint16_t chunk_size = tcp_chunk_size;
+                    uint16_t chunk_size = large_send_mss;
 
                     /* check if this is the last frame */
-                    if (tcp_send_offset + tcp_chunk_size >= tcp_data_len)
+                    if (tcp_send_offset + large_send_mss >= tcp_data_len)
                     {
                         is_last_frame = 1;
                         chunk_size = tcp_data_len - tcp_send_offset;
@@ -2247,7 +2245,7 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s)
                     ip->ip_len = cpu_to_be16(hlen + tcp_hlen + chunk_size);
 
                     /* increment IP id for subsequent frames */
-                    ip->ip_id = cpu_to_be16(tcp_send_offset/tcp_chunk_size + be16_to_cpu(ip->ip_id));
+                    ip->ip_id = cpu_to_be16(tcp_send_offset/large_send_mss + be16_to_cpu(ip->ip_id));
 
                     ip->ip_sum = 0;
                     ip->ip_sum = ip_checksum(eth_payload_data, hlen);