From db0d4017f9b9e87f962b35dd19a4912bbfcd3cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 6 Jan 2025 10:57:34 -0500 Subject: [PATCH 1/5] net: parameterize the removing client from nc list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change is used in later commits so we can avoid the removal of the netclient if it is delayed. No functional change intended. Reviewed-by: Si-Wei Liu Acked-by: Jason Wang Signed-off-by: Eugenio Pérez Signed-off-by: Jason Wang --- net/net.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/net.c b/net/net.c index a3996d5c62..4eb78a1299 100644 --- a/net/net.c +++ b/net/net.c @@ -381,9 +381,12 @@ NetClientState *qemu_get_peer(NetClientState *nc, int queue_index) return ncs->peer; } -static void qemu_cleanup_net_client(NetClientState *nc) +static void qemu_cleanup_net_client(NetClientState *nc, + bool remove_from_net_clients) { - QTAILQ_REMOVE(&net_clients, nc, next); + if (remove_from_net_clients) { + QTAILQ_REMOVE(&net_clients, nc, next); + } if (nc->info->cleanup) { nc->info->cleanup(nc); @@ -442,14 +445,14 @@ void qemu_del_net_client(NetClientState *nc) } for (i = 0; i < queues; i++) { - qemu_cleanup_net_client(ncs[i]); + qemu_cleanup_net_client(ncs[i], true); } return; } for (i = 0; i < queues; i++) { - qemu_cleanup_net_client(ncs[i]); + qemu_cleanup_net_client(ncs[i], true); qemu_free_net_client(ncs[i]); } } @@ -474,7 +477,7 @@ void qemu_del_nic(NICState *nic) for (i = queues - 1; i >= 0; i--) { NetClientState *nc = qemu_get_subqueue(nic, i); - qemu_cleanup_net_client(nc); + qemu_cleanup_net_client(nc, true); qemu_free_net_client(nc); } From e7891c575fb294618b172119a91c892b8f4384a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Mon, 6 Jan 2025 10:57:35 -0500 Subject: [PATCH 2/5] net: move backend cleanup to NIC cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a0d7215e33 ("vhost-vdpa: do not cleanup the vdpa/vhost-net structures if peer nic is present") effectively delayed the backend cleanup, allowing the frontend or the guest to access it resources as long as the frontend is still visible to the guest. However it does not clean up the resources until the qemu process is over. This causes an effective leak if the device is deleted with device_del, as there is no way to close the vdpa device. This makes impossible to re-add that device to this or other QEMU instances until the first instance of QEMU is finished. Move the cleanup from qemu_cleanup to the NIC deletion and to net_cleanup. Fixes: a0d7215e33 ("vhost-vdpa: do not cleanup the vdpa/vhost-net structures if peer nic is present") Reported-by: Lei Yang Signed-off-by: Eugenio Pérez Signed-off-by: Jonah Palmer Signed-off-by: Jason Wang --- net/net.c | 33 +++++++++++++++++++++++++++------ net/vhost-vdpa.c | 8 -------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/net/net.c b/net/net.c index 4eb78a1299..39d6f28158 100644 --- a/net/net.c +++ b/net/net.c @@ -428,7 +428,13 @@ void qemu_del_net_client(NetClientState *nc) object_unparent(OBJECT(nf)); } - /* If there is a peer NIC, delete and cleanup client, but do not free. */ + /* + * If there is a peer NIC, transfer ownership to it. Delete the client + * from net_client list but do not cleanup nor free. This way NIC can + * still access to members of the backend. + * + * The cleanup and free will be done when the NIC is free. + */ if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_NIC) { NICState *nic = qemu_get_nic(nc->peer); if (nic->peer_deleted) { @@ -438,16 +444,13 @@ void qemu_del_net_client(NetClientState *nc) for (i = 0; i < queues; i++) { ncs[i]->peer->link_down = true; + QTAILQ_REMOVE(&net_clients, ncs[i], next); } if (nc->peer->info->link_status_changed) { nc->peer->info->link_status_changed(nc->peer); } - for (i = 0; i < queues; i++) { - qemu_cleanup_net_client(ncs[i], true); - } - return; } @@ -465,8 +468,12 @@ void qemu_del_nic(NICState *nic) for (i = 0; i < queues; i++) { NetClientState *nc = qemu_get_subqueue(nic, i); - /* If this is a peer NIC and peer has already been deleted, free it now. */ + /* + * If this is a peer NIC and peer has already been deleted, clean it up + * and free it now. + */ if (nic->peer_deleted) { + qemu_cleanup_net_client(nc->peer, false); qemu_free_net_client(nc->peer); } else if (nc->peer) { /* if there are RX packets pending, complete them */ @@ -1681,6 +1688,9 @@ void net_cleanup(void) * of the latest NET_CLIENT_DRIVER_NIC, and operate on *p as we walk * the list. * + * However, the NIC may have peers that trust to be clean beyond this + * point. For example, if they have been removed with device_del. + * * The 'nc' variable isn't part of the list traversal; it's purely * for convenience as too much '(*p)->' has a tendency to make the * readers' eyes bleed. @@ -1688,6 +1698,17 @@ void net_cleanup(void) while (*p) { nc = *p; if (nc->info->type == NET_CLIENT_DRIVER_NIC) { + NICState *nic = qemu_get_nic(nc); + + if (nic->peer_deleted) { + int queues = MAX(nic->conf->peers.queues, 1); + + for (int i = 0; i < queues; i++) { + nc = qemu_get_subqueue(nic, i); + qemu_cleanup_net_client(nc->peer, false); + } + } + /* Skip NET_CLIENT_DRIVER_NIC entries */ p = &QTAILQ_NEXT(nc, next); } else { diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index bd01866878..f7a54f46aa 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -224,14 +224,6 @@ static void vhost_vdpa_cleanup(NetClientState *nc) { VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc); - /* - * If a peer NIC is attached, do not cleanup anything. - * Cleanup will happen as a part of qemu_cleanup() -> net_cleanup() - * when the guest is shutting down. - */ - if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_NIC) { - return; - } munmap(s->cvq_cmd_out_buffer, vhost_vdpa_net_cvq_cmd_page_len()); munmap(s->status, vhost_vdpa_net_cvq_cmd_page_len()); if (s->vhost_net) { From 9dc64bd5a4bebdc820e7e8484cb30e02befdc774 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Sun, 28 Apr 2024 20:11:22 +0900 Subject: [PATCH 3/5] util/iov: Do not assert offset is in iov iov_from_buf(), iov_to_buf(), iov_memset(), and iov_copy() asserts that the given offset fits in the iov while tolerating the specified number of bytes to operate with to be greater than the size of iov. This is inconsistent so remove the assertions. Asserting the offset fits in the iov makes sense if it is expected that there are other operations that process the content before the offset and the content is processed in order. Under this expectation, the offset should point to the end of bytes that are previously processed and fit in the iov. However, this expectation depends on the details of the caller, and did not hold true at least one case and required code to check iov_size(), which is added with commit 83ddb3dbba2e ("hw/net/net_tx_pkt: Fix overrun in update_sctp_checksum()"). Adding such a check is inefficient and error-prone. These functions already tolerate the specified number of bytes to operate with to be greater than the size of iov to avoid such checks so remove the assertions to tolerate invalid offset as well. They return the number of bytes they operated with so their callers can still check the returned value to ensure there are sufficient space at the given offset. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- include/qemu/iov.h | 5 +++-- util/iov.c | 5 ----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/include/qemu/iov.h b/include/qemu/iov.h index 44f9db5cee..9535673c13 100644 --- a/include/qemu/iov.h +++ b/include/qemu/iov.h @@ -31,7 +31,7 @@ size_t iov_size(const struct iovec *iov, const unsigned int iov_cnt); * only part of data will be copied, up to the end of the iovec. * Number of bytes actually copied will be returned, which is * min(bytes, iov_size(iov)-offset) - * `Offset' must point to the inside of iovec. + * Returns 0 when `offset' points to the outside of iovec. */ size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt, size_t offset, const void *buf, size_t bytes); @@ -67,11 +67,12 @@ iov_to_buf(const struct iovec *iov, const unsigned int iov_cnt, /** * Set data bytes pointed out by iovec `iov' of size `iov_cnt' elements, * starting at byte offset `start', to value `fillc', repeating it - * `bytes' number of times. `Offset' must point to the inside of iovec. + * `bytes' number of times. * If `bytes' is large enough, only last bytes portion of iovec, * up to the end of it, will be filled with the specified value. * Function return actual number of bytes processed, which is * min(size, iov_size(iov) - offset). + * Returns 0 when `offset' points to the outside of iovec. */ size_t iov_memset(const struct iovec *iov, const unsigned int iov_cnt, size_t offset, int fillc, size_t bytes); diff --git a/util/iov.c b/util/iov.c index 7777116123..f8536f0474 100644 --- a/util/iov.c +++ b/util/iov.c @@ -37,7 +37,6 @@ size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt, offset -= iov[i].iov_len; } } - assert(offset == 0); return done; } @@ -56,7 +55,6 @@ size_t iov_to_buf_full(const struct iovec *iov, const unsigned int iov_cnt, offset -= iov[i].iov_len; } } - assert(offset == 0); return done; } @@ -75,7 +73,6 @@ size_t iov_memset(const struct iovec *iov, const unsigned int iov_cnt, offset -= iov[i].iov_len; } } - assert(offset == 0); return done; } @@ -277,7 +274,6 @@ unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, bytes -= len; offset = 0; } - assert(offset == 0); return j; } @@ -348,7 +344,6 @@ size_t qemu_iovec_concat_iov(QEMUIOVector *dst, soffset -= src_iov[i].iov_len; } } - assert(soffset == 0); /* offset beyond end of src */ return done; } From 2938c36937559554a2408b008470c9a76cb9271a Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Sun, 28 Apr 2024 20:11:23 +0900 Subject: [PATCH 4/5] Revert "hw/net/net_tx_pkt: Fix overrun in update_sctp_checksum()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 83ddb3dbba2ee0f1767442ae6ee665058aeb1093. The added check is no longer necessary due to a change of iov_from_buf(). Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/net_tx_pkt.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index 1f79b82b77..903238dca2 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -141,10 +141,6 @@ bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt) uint32_t csum = 0; struct iovec *pl_start_frag = pkt->vec + NET_TX_PKT_PL_START_FRAG; - if (iov_size(pl_start_frag, pkt->payload_frags) < 8 + sizeof(csum)) { - return false; - } - if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, &csum, sizeof(csum)) < sizeof(csum)) { return false; } From ac2ff9b840ce82cc7d5fd9ce4fd3019a434d7dc9 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 8 Oct 2024 15:52:06 +0900 Subject: [PATCH 5/5] tap-linux: Open ipvtap and macvtap ipvtap and macvtap create a file for each interface unlike tuntap, which creates one file shared by all interfaces. Try to open a file dedicated to the interface first for ipvtap and macvtap. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- net/tap-linux.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/net/tap-linux.c b/net/tap-linux.c index 1226d5fda2..22ec2f45d2 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -45,10 +45,21 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int len = sizeof(struct virtio_net_hdr); unsigned int features; - fd = RETRY_ON_EINTR(open(PATH_NET_TUN, O_RDWR)); + + ret = if_nametoindex(ifname); + if (ret) { + g_autofree char *file = g_strdup_printf("/dev/tap%d", ret); + fd = open(file, O_RDWR); + } else { + fd = -1; + } + if (fd < 0) { - error_setg_errno(errp, errno, "could not open %s", PATH_NET_TUN); - return -1; + fd = RETRY_ON_EINTR(open(PATH_NET_TUN, O_RDWR)); + if (fd < 0) { + error_setg_errno(errp, errno, "could not open %s", PATH_NET_TUN); + return -1; + } } memset(&ifr, 0, sizeof(ifr)); ifr.ifr_flags = IFF_TAP | IFF_NO_PI;