From 8d689f6aae8be096b4a1859be07c1b083865f755 Mon Sep 17 00:00:00 2001 From: "timothee.cocault@gmail.com" Date: Mon, 10 Apr 2023 17:27:48 +0200 Subject: [PATCH 01/50] e1000e: Fix tx/rx counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bytes and packets counter registers are cleared on read. Copying the "total counter" registers to the "good counter" registers has side effects. If the "total" register is never read by the OS, it only gets incremented. This leads to exponential growth of the "good" register. This commit increments the counters individually to avoid this. Signed-off-by: Timothée Cocault Signed-off-by: Jason Wang --- hw/net/e1000.c | 5 ++--- hw/net/e1000e_core.c | 5 ++--- hw/net/e1000x_common.c | 5 ++--- hw/net/igb_core.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/hw/net/e1000.c b/hw/net/e1000.c index 23d660619f..59bacb5d3b 100644 --- a/hw/net/e1000.c +++ b/hw/net/e1000.c @@ -637,9 +637,8 @@ xmit_seg(E1000State *s) e1000x_inc_reg_if_not_full(s->mac_reg, TPT); e1000x_grow_8reg_if_not_full(s->mac_reg, TOTL, s->tx.size + 4); - s->mac_reg[GPTC] = s->mac_reg[TPT]; - s->mac_reg[GOTCL] = s->mac_reg[TOTL]; - s->mac_reg[GOTCH] = s->mac_reg[TOTH]; + e1000x_inc_reg_if_not_full(s->mac_reg, GPTC); + e1000x_grow_8reg_if_not_full(s->mac_reg, GOTCL, s->tx.size + 4); } static void diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index c0c09b6965..cfa3f55e96 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -711,9 +711,8 @@ e1000e_on_tx_done_update_stats(E1000ECore *core, struct NetTxPkt *tx_pkt) g_assert_not_reached(); } - core->mac[GPTC] = core->mac[TPT]; - core->mac[GOTCL] = core->mac[TOTL]; - core->mac[GOTCH] = core->mac[TOTH]; + e1000x_inc_reg_if_not_full(core->mac, GPTC); + e1000x_grow_8reg_if_not_full(core->mac, GOTCL, tot_len); } static void diff --git a/hw/net/e1000x_common.c b/hw/net/e1000x_common.c index b844af590a..4c8e7dcf70 100644 --- a/hw/net/e1000x_common.c +++ b/hw/net/e1000x_common.c @@ -220,15 +220,14 @@ e1000x_update_rx_total_stats(uint32_t *mac, e1000x_increase_size_stats(mac, PRCregs, data_fcs_size); e1000x_inc_reg_if_not_full(mac, TPR); - mac[GPRC] = mac[TPR]; + e1000x_inc_reg_if_not_full(mac, GPRC); /* TOR - Total Octets Received: * This register includes bytes received in a packet from the field through the field, inclusively. * Always include FCS length (4) in size. */ e1000x_grow_8reg_if_not_full(mac, TORL, data_size + 4); - mac[GORCL] = mac[TORL]; - mac[GORCH] = mac[TORH]; + e1000x_grow_8reg_if_not_full(mac, GORCL, data_size + 4); } void diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index d733fed6cf..826e7a6cf1 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -538,9 +538,8 @@ igb_on_tx_done_update_stats(IGBCore *core, struct NetTxPkt *tx_pkt, int qn) g_assert_not_reached(); } - core->mac[GPTC] = core->mac[TPT]; - core->mac[GOTCL] = core->mac[TOTL]; - core->mac[GOTCH] = core->mac[TOTH]; + e1000x_inc_reg_if_not_full(core->mac, GPTC); + e1000x_grow_8reg_if_not_full(core->mac, GOTCL, tot_len); if (core->mac[MRQC] & 1) { uint16_t pool = qn % IGB_NUM_VM_POOLS; From 163246e1ce9607251ed52df1131af25d608de782 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:52 +0900 Subject: [PATCH 02/50] hw/net/net_tx_pkt: Decouple implementation from PCI This is intended to be followed by another change for the interface. It also fixes the leak of memory mapping when the specified memory is partially mapped. Fixes: e263cd49c7 ("Packet abstraction for VMWARE network devices") Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/net_tx_pkt.c | 53 ++++++++++++++++++++++++++++----------------- hw/net/net_tx_pkt.h | 9 ++++++++ 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index 8dc8568ba2..aca12ff035 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -384,10 +384,9 @@ void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt *pkt, } } -bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, hwaddr pa, - size_t len) +static bool net_tx_pkt_add_raw_fragment_common(struct NetTxPkt *pkt, + void *base, size_t len) { - hwaddr mapped_len = 0; struct iovec *ventry; assert(pkt); @@ -395,23 +394,12 @@ bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, hwaddr pa, return false; } - if (!len) { - return true; - } - ventry = &pkt->raw[pkt->raw_frags]; - mapped_len = len; + ventry->iov_base = base; + ventry->iov_len = len; + pkt->raw_frags++; - ventry->iov_base = pci_dma_map(pkt->pci_dev, pa, - &mapped_len, DMA_DIRECTION_TO_DEVICE); - - if ((ventry->iov_base != NULL) && (len == mapped_len)) { - ventry->iov_len = mapped_len; - pkt->raw_frags++; - return true; - } else { - return false; - } + return true; } bool net_tx_pkt_has_fragments(struct NetTxPkt *pkt) @@ -465,8 +453,9 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt, PCIDevice *pci_dev) assert(pkt->raw); for (i = 0; i < pkt->raw_frags; i++) { assert(pkt->raw[i].iov_base); - pci_dma_unmap(pkt->pci_dev, pkt->raw[i].iov_base, - pkt->raw[i].iov_len, DMA_DIRECTION_TO_DEVICE, 0); + net_tx_pkt_unmap_frag_pci(pkt->pci_dev, + pkt->raw[i].iov_base, + pkt->raw[i].iov_len); } } pkt->pci_dev = pci_dev; @@ -476,6 +465,30 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt, PCIDevice *pci_dev) pkt->l4proto = 0; } +void net_tx_pkt_unmap_frag_pci(void *context, void *base, size_t len) +{ + pci_dma_unmap(context, base, len, DMA_DIRECTION_TO_DEVICE, 0); +} + +bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, hwaddr pa, + size_t len) +{ + dma_addr_t mapped_len = len; + void *base = pci_dma_map(pkt->pci_dev, pa, &mapped_len, + DMA_DIRECTION_TO_DEVICE); + if (!base) { + return false; + } + + if (mapped_len != len || + !net_tx_pkt_add_raw_fragment_common(pkt, base, len)) { + net_tx_pkt_unmap_frag_pci(pkt->pci_dev, base, mapped_len); + return false; + } + + return true; +} + static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt, struct iovec *iov, uint32_t iov_len, uint16_t csl) diff --git a/hw/net/net_tx_pkt.h b/hw/net/net_tx_pkt.h index e5ce6f20bc..5eb123ef90 100644 --- a/hw/net/net_tx_pkt.h +++ b/hw/net/net_tx_pkt.h @@ -153,6 +153,15 @@ void net_tx_pkt_dump(struct NetTxPkt *pkt); */ void net_tx_pkt_reset(struct NetTxPkt *pkt, PCIDevice *dev); +/** + * Unmap a fragment mapped from a PCI device. + * + * @context: PCI device owning fragment + * @base: pointer to fragment + * @len: length of fragment + */ +void net_tx_pkt_unmap_frag_pci(void *context, void *base, size_t len); + /** * Send packet to qemu. handles sw offloads if vhdr is not supported. * From a51db5802744b274ab40385dd9fe8354722fcc4d Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:53 +0900 Subject: [PATCH 03/50] hw/net/net_tx_pkt: Decouple interface from PCI This allows to use the network packet abstractions even if PCI is not used. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 13 ++++++++----- hw/net/igb_core.c | 13 ++++++------- hw/net/net_tx_pkt.c | 36 +++++++++++++----------------------- hw/net/net_tx_pkt.h | 31 ++++++++++++++++++++----------- hw/net/vmxnet3.c | 14 +++++++------- 5 files changed, 54 insertions(+), 53 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index cfa3f55e96..15821a75e0 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -746,7 +746,8 @@ e1000e_process_tx_desc(E1000ECore *core, addr = le64_to_cpu(dp->buffer_addr); if (!tx->skip_cp) { - if (!net_tx_pkt_add_raw_fragment(tx->tx_pkt, addr, split_size)) { + if (!net_tx_pkt_add_raw_fragment_pci(tx->tx_pkt, core->owner, + addr, split_size)) { tx->skip_cp = true; } } @@ -764,7 +765,7 @@ e1000e_process_tx_desc(E1000ECore *core, } tx->skip_cp = false; - net_tx_pkt_reset(tx->tx_pkt, core->owner); + net_tx_pkt_reset(tx->tx_pkt, net_tx_pkt_unmap_frag_pci, core->owner); tx->sum_needed = 0; tx->cptse = 0; @@ -3421,7 +3422,7 @@ e1000e_core_pci_realize(E1000ECore *core, qemu_add_vm_change_state_handler(e1000e_vm_state_change, core); for (i = 0; i < E1000E_NUM_QUEUES; i++) { - net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner, E1000E_MAX_TX_FRAGS); + net_tx_pkt_init(&core->tx[i].tx_pkt, E1000E_MAX_TX_FRAGS); } net_rx_pkt_init(&core->rx_pkt); @@ -3446,7 +3447,8 @@ e1000e_core_pci_uninit(E1000ECore *core) qemu_del_vm_change_state_handler(core->vmstate); for (i = 0; i < E1000E_NUM_QUEUES; i++) { - net_tx_pkt_reset(core->tx[i].tx_pkt, core->owner); + net_tx_pkt_reset(core->tx[i].tx_pkt, + net_tx_pkt_unmap_frag_pci, core->owner); net_tx_pkt_uninit(core->tx[i].tx_pkt); } @@ -3571,7 +3573,8 @@ static void e1000e_reset(E1000ECore *core, bool sw) e1000x_reset_mac_addr(core->owner_nic, core->mac, core->permanent_mac); for (i = 0; i < ARRAY_SIZE(core->tx); i++) { - net_tx_pkt_reset(core->tx[i].tx_pkt, core->owner); + net_tx_pkt_reset(core->tx[i].tx_pkt, + net_tx_pkt_unmap_frag_pci, core->owner); memset(&core->tx[i].props, 0, sizeof(core->tx[i].props)); core->tx[i].skip_cp = false; } diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 826e7a6cf1..abfdce9aaf 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -597,7 +597,8 @@ igb_process_tx_desc(IGBCore *core, length = cmd_type_len & 0xFFFF; if (!tx->skip_cp) { - if (!net_tx_pkt_add_raw_fragment(tx->tx_pkt, buffer_addr, length)) { + if (!net_tx_pkt_add_raw_fragment_pci(tx->tx_pkt, dev, + buffer_addr, length)) { tx->skip_cp = true; } } @@ -616,7 +617,7 @@ igb_process_tx_desc(IGBCore *core, tx->first = true; tx->skip_cp = false; - net_tx_pkt_reset(tx->tx_pkt, dev); + net_tx_pkt_reset(tx->tx_pkt, net_tx_pkt_unmap_frag_pci, dev); } } @@ -842,8 +843,6 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr) d = core->owner; } - net_tx_pkt_reset(txr->tx->tx_pkt, d); - while (!igb_ring_empty(core, txi)) { base = igb_ring_head_descr(core, txi); @@ -861,6 +860,8 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr) core->mac[EICR] |= eic; igb_set_interrupt_cause(core, E1000_ICR_TXDW); } + + net_tx_pkt_reset(txr->tx->tx_pkt, net_tx_pkt_unmap_frag_pci, d); } static uint32_t @@ -3954,7 +3955,7 @@ igb_core_pci_realize(IGBCore *core, core->vmstate = qemu_add_vm_change_state_handler(igb_vm_state_change, core); for (i = 0; i < IGB_NUM_QUEUES; i++) { - net_tx_pkt_init(&core->tx[i].tx_pkt, NULL, E1000E_MAX_TX_FRAGS); + net_tx_pkt_init(&core->tx[i].tx_pkt, E1000E_MAX_TX_FRAGS); } net_rx_pkt_init(&core->rx_pkt); @@ -3979,7 +3980,6 @@ igb_core_pci_uninit(IGBCore *core) qemu_del_vm_change_state_handler(core->vmstate); for (i = 0; i < IGB_NUM_QUEUES; i++) { - net_tx_pkt_reset(core->tx[i].tx_pkt, NULL); net_tx_pkt_uninit(core->tx[i].tx_pkt); } @@ -4158,7 +4158,6 @@ static void igb_reset(IGBCore *core, bool sw) for (i = 0; i < ARRAY_SIZE(core->tx); i++) { tx = &core->tx[i]; - net_tx_pkt_reset(tx->tx_pkt, NULL); memset(tx->ctx, 0, sizeof(tx->ctx)); tx->first = true; tx->skip_cp = false; diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index aca12ff035..cc36750c9b 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -16,12 +16,12 @@ */ #include "qemu/osdep.h" -#include "net_tx_pkt.h" #include "net/eth.h" #include "net/checksum.h" #include "net/tap.h" #include "net/net.h" #include "hw/pci/pci_device.h" +#include "net_tx_pkt.h" enum { NET_TX_PKT_VHDR_FRAG = 0, @@ -32,8 +32,6 @@ enum { /* TX packet private context */ struct NetTxPkt { - PCIDevice *pci_dev; - struct virtio_net_hdr virt_hdr; struct iovec *raw; @@ -59,13 +57,10 @@ struct NetTxPkt { uint8_t l4proto; }; -void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev, - uint32_t max_frags) +void net_tx_pkt_init(struct NetTxPkt **pkt, uint32_t max_frags) { struct NetTxPkt *p = g_malloc0(sizeof *p); - p->pci_dev = pci_dev; - p->vec = g_new(struct iovec, max_frags + NET_TX_PKT_PL_START_FRAG); p->raw = g_new(struct iovec, max_frags); @@ -384,8 +379,7 @@ void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt *pkt, } } -static bool net_tx_pkt_add_raw_fragment_common(struct NetTxPkt *pkt, - void *base, size_t len) +bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, void *base, size_t len) { struct iovec *ventry; assert(pkt); @@ -433,7 +427,8 @@ void net_tx_pkt_dump(struct NetTxPkt *pkt) #endif } -void net_tx_pkt_reset(struct NetTxPkt *pkt, PCIDevice *pci_dev) +void net_tx_pkt_reset(struct NetTxPkt *pkt, + NetTxPktFreeFrag callback, void *context) { int i; @@ -453,12 +448,9 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt, PCIDevice *pci_dev) assert(pkt->raw); for (i = 0; i < pkt->raw_frags; i++) { assert(pkt->raw[i].iov_base); - net_tx_pkt_unmap_frag_pci(pkt->pci_dev, - pkt->raw[i].iov_base, - pkt->raw[i].iov_len); + callback(context, pkt->raw[i].iov_base, pkt->raw[i].iov_len); } } - pkt->pci_dev = pci_dev; pkt->raw_frags = 0; pkt->hdr_len = 0; @@ -470,19 +462,17 @@ void net_tx_pkt_unmap_frag_pci(void *context, void *base, size_t len) pci_dma_unmap(context, base, len, DMA_DIRECTION_TO_DEVICE, 0); } -bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, hwaddr pa, - size_t len) +bool net_tx_pkt_add_raw_fragment_pci(struct NetTxPkt *pkt, PCIDevice *pci_dev, + dma_addr_t pa, size_t len) { dma_addr_t mapped_len = len; - void *base = pci_dma_map(pkt->pci_dev, pa, &mapped_len, - DMA_DIRECTION_TO_DEVICE); + void *base = pci_dma_map(pci_dev, pa, &mapped_len, DMA_DIRECTION_TO_DEVICE); if (!base) { return false; } - if (mapped_len != len || - !net_tx_pkt_add_raw_fragment_common(pkt, base, len)) { - net_tx_pkt_unmap_frag_pci(pkt->pci_dev, base, mapped_len); + if (mapped_len != len || !net_tx_pkt_add_raw_fragment(pkt, base, len)) { + net_tx_pkt_unmap_frag_pci(pci_dev, base, mapped_len); return false; } @@ -710,7 +700,7 @@ static void net_tx_pkt_udp_fragment_fix(struct NetTxPkt *pkt, } static bool net_tx_pkt_do_sw_fragmentation(struct NetTxPkt *pkt, - NetTxPktCallback callback, + NetTxPktSend callback, void *context) { uint8_t gso_type = pkt->virt_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN; @@ -807,7 +797,7 @@ bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc) } bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload, - NetTxPktCallback callback, void *context) + NetTxPktSend callback, void *context) { assert(pkt); diff --git a/hw/net/net_tx_pkt.h b/hw/net/net_tx_pkt.h index 5eb123ef90..4d7233e975 100644 --- a/hw/net/net_tx_pkt.h +++ b/hw/net/net_tx_pkt.h @@ -26,17 +26,16 @@ struct NetTxPkt; -typedef void (* NetTxPktCallback)(void *, const struct iovec *, int, const struct iovec *, int); +typedef void (*NetTxPktFreeFrag)(void *, void *, size_t); +typedef void (*NetTxPktSend)(void *, const struct iovec *, int, const struct iovec *, int); /** * Init function for tx packet functionality * * @pkt: packet pointer - * @pci_dev: PCI device processing this packet * @max_frags: max tx ip fragments */ -void net_tx_pkt_init(struct NetTxPkt **pkt, PCIDevice *pci_dev, - uint32_t max_frags); +void net_tx_pkt_init(struct NetTxPkt **pkt, uint32_t max_frags); /** * Clean all tx packet resources. @@ -95,12 +94,11 @@ net_tx_pkt_setup_vlan_header(struct NetTxPkt *pkt, uint16_t vlan) * populate data fragment into pkt context. * * @pkt: packet - * @pa: physical address of fragment + * @base: pointer to fragment * @len: length of fragment * */ -bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, hwaddr pa, - size_t len); +bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, void *base, size_t len); /** * Fix ip header fields and calculate IP header and pseudo header checksums. @@ -148,10 +146,11 @@ void net_tx_pkt_dump(struct NetTxPkt *pkt); * reset tx packet private context (needed to be called between packets) * * @pkt: packet - * @dev: PCI device processing the next packet - * + * @callback: function to free the fragments + * @context: pointer to be passed to the callback */ -void net_tx_pkt_reset(struct NetTxPkt *pkt, PCIDevice *dev); +void net_tx_pkt_reset(struct NetTxPkt *pkt, + NetTxPktFreeFrag callback, void *context); /** * Unmap a fragment mapped from a PCI device. @@ -162,6 +161,16 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt, PCIDevice *dev); */ void net_tx_pkt_unmap_frag_pci(void *context, void *base, size_t len); +/** + * map data fragment from PCI device and populate it into pkt context. + * + * @pci_dev: PCI device owning fragment + * @pa: physical address of fragment + * @len: length of fragment + */ +bool net_tx_pkt_add_raw_fragment_pci(struct NetTxPkt *pkt, PCIDevice *pci_dev, + dma_addr_t pa, size_t len); + /** * Send packet to qemu. handles sw offloads if vhdr is not supported. * @@ -182,7 +191,7 @@ bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc); * @ret: operation result */ bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload, - NetTxPktCallback callback, void *context); + NetTxPktSend callback, void *context); /** * parse raw packet data and analyze offload requirements. diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index f7b874c139..9acff310e7 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -651,9 +651,8 @@ static void vmxnet3_process_tx_queue(VMXNET3State *s, int qidx) data_len = (txd.len > 0) ? txd.len : VMXNET3_MAX_TX_BUF_SIZE; data_pa = txd.addr; - if (!net_tx_pkt_add_raw_fragment(s->tx_pkt, - data_pa, - data_len)) { + if (!net_tx_pkt_add_raw_fragment_pci(s->tx_pkt, PCI_DEVICE(s), + data_pa, data_len)) { s->skip_current_tx_pkt = true; } } @@ -678,7 +677,8 @@ static void vmxnet3_process_tx_queue(VMXNET3State *s, int qidx) vmxnet3_complete_packet(s, qidx, txd_idx); s->tx_sop = true; s->skip_current_tx_pkt = false; - net_tx_pkt_reset(s->tx_pkt, PCI_DEVICE(s)); + net_tx_pkt_reset(s->tx_pkt, + net_tx_pkt_unmap_frag_pci, PCI_DEVICE(s)); } } } @@ -1159,7 +1159,7 @@ static void vmxnet3_deactivate_device(VMXNET3State *s) { if (s->device_active) { VMW_CBPRN("Deactivating vmxnet3..."); - net_tx_pkt_reset(s->tx_pkt, PCI_DEVICE(s)); + net_tx_pkt_reset(s->tx_pkt, net_tx_pkt_unmap_frag_pci, PCI_DEVICE(s)); net_tx_pkt_uninit(s->tx_pkt); net_rx_pkt_uninit(s->rx_pkt); s->device_active = false; @@ -1519,7 +1519,7 @@ static void vmxnet3_activate_device(VMXNET3State *s) /* Preallocate TX packet wrapper */ VMW_CFPRN("Max TX fragments is %u", s->max_tx_frags); - net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s), s->max_tx_frags); + net_tx_pkt_init(&s->tx_pkt, s->max_tx_frags); net_rx_pkt_init(&s->rx_pkt); /* Read rings memory locations for RX queues */ @@ -2399,7 +2399,7 @@ static int vmxnet3_post_load(void *opaque, int version_id) { VMXNET3State *s = opaque; - net_tx_pkt_init(&s->tx_pkt, PCI_DEVICE(s), s->max_tx_frags); + net_tx_pkt_init(&s->tx_pkt, s->max_tx_frags); net_rx_pkt_init(&s->rx_pkt); if (s->msix_used) { From f3f9b726afba1f53663768603189e574f80b5907 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:54 +0900 Subject: [PATCH 04/50] e1000x: Fix BPRC and MPRC Before this change, e1000 and the common code updated BPRC and MPRC depending on the matched filter, but e1000e and igb decided to update those counters by deriving the packet type independently. This inconsistency caused a multicast packet to be counted twice. Updating BPRC and MPRC depending on are fundamentally flawed anyway as a filter can be used for different types of packets. For example, it is possible to filter broadcast packets with MTA. Always determine what counters to update by inspecting the packets. Fixes: 3b27430177 ("e1000: Implementing various counters") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/e1000.c | 6 +++--- hw/net/e1000e_core.c | 20 +++----------------- hw/net/e1000x_common.c | 25 +++++++++++++++++++------ hw/net/e1000x_common.h | 5 +++-- hw/net/igb_core.c | 22 +++++----------------- 5 files changed, 33 insertions(+), 45 deletions(-) diff --git a/hw/net/e1000.c b/hw/net/e1000.c index 59bacb5d3b..18eb6d8876 100644 --- a/hw/net/e1000.c +++ b/hw/net/e1000.c @@ -826,12 +826,10 @@ receive_filter(E1000State *s, const uint8_t *buf, int size) } if (ismcast && (rctl & E1000_RCTL_MPE)) { /* promiscuous mcast */ - e1000x_inc_reg_if_not_full(s->mac_reg, MPRC); return 1; } if (isbcast && (rctl & E1000_RCTL_BAM)) { /* broadcast enabled */ - e1000x_inc_reg_if_not_full(s->mac_reg, BPRC); return 1; } @@ -922,6 +920,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) size_t desc_offset; size_t desc_size; size_t total_size; + eth_pkt_types_e pkt_type; if (!e1000x_hw_rx_enabled(s->mac_reg)) { return -1; @@ -971,6 +970,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) size -= 4; } + pkt_type = get_eth_packet_type(PKT_GET_ETH_HDR(filter_buf)); rdh_start = s->mac_reg[RDH]; desc_offset = 0; total_size = size + e1000x_fcs_len(s->mac_reg); @@ -1036,7 +1036,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) } } while (desc_offset < total_size); - e1000x_update_rx_total_stats(s->mac_reg, size, total_size); + e1000x_update_rx_total_stats(s->mac_reg, pkt_type, size, total_size); n = E1000_ICS_RXT0; if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH]) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 15821a75e0..c2d864a504 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1488,24 +1488,10 @@ e1000e_write_to_rx_buffers(E1000ECore *core, } static void -e1000e_update_rx_stats(E1000ECore *core, - size_t data_size, - size_t data_fcs_size) +e1000e_update_rx_stats(E1000ECore *core, size_t pkt_size, size_t pkt_fcs_size) { - e1000x_update_rx_total_stats(core->mac, data_size, data_fcs_size); - - switch (net_rx_pkt_get_packet_type(core->rx_pkt)) { - case ETH_PKT_BCAST: - e1000x_inc_reg_if_not_full(core->mac, BPRC); - break; - - case ETH_PKT_MCAST: - e1000x_inc_reg_if_not_full(core->mac, MPRC); - break; - - default: - break; - } + eth_pkt_types_e pkt_type = net_rx_pkt_get_packet_type(core->rx_pkt); + e1000x_update_rx_total_stats(core->mac, pkt_type, pkt_size, pkt_fcs_size); } static inline bool diff --git a/hw/net/e1000x_common.c b/hw/net/e1000x_common.c index 4c8e7dcf70..7694673bcc 100644 --- a/hw/net/e1000x_common.c +++ b/hw/net/e1000x_common.c @@ -80,7 +80,6 @@ bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf) f = mta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3]; f = (((buf[5] << 8) | buf[4]) >> f) & 0xfff; if (mac[MTA + (f >> 5)] & (1 << (f & 0x1f))) { - e1000x_inc_reg_if_not_full(mac, MPRC); return true; } @@ -212,13 +211,14 @@ e1000x_rxbufsize(uint32_t rctl) void e1000x_update_rx_total_stats(uint32_t *mac, - size_t data_size, - size_t data_fcs_size) + eth_pkt_types_e pkt_type, + size_t pkt_size, + size_t pkt_fcs_size) { static const int PRCregs[6] = { PRC64, PRC127, PRC255, PRC511, PRC1023, PRC1522 }; - e1000x_increase_size_stats(mac, PRCregs, data_fcs_size); + e1000x_increase_size_stats(mac, PRCregs, pkt_fcs_size); e1000x_inc_reg_if_not_full(mac, TPR); e1000x_inc_reg_if_not_full(mac, GPRC); /* TOR - Total Octets Received: @@ -226,8 +226,21 @@ e1000x_update_rx_total_stats(uint32_t *mac, * Address> field through the field, inclusively. * Always include FCS length (4) in size. */ - e1000x_grow_8reg_if_not_full(mac, TORL, data_size + 4); - e1000x_grow_8reg_if_not_full(mac, GORCL, data_size + 4); + e1000x_grow_8reg_if_not_full(mac, TORL, pkt_size + 4); + e1000x_grow_8reg_if_not_full(mac, GORCL, pkt_size + 4); + + switch (pkt_type) { + case ETH_PKT_BCAST: + e1000x_inc_reg_if_not_full(mac, BPRC); + break; + + case ETH_PKT_MCAST: + e1000x_inc_reg_if_not_full(mac, MPRC); + break; + + default: + break; + } } void diff --git a/hw/net/e1000x_common.h b/hw/net/e1000x_common.h index 911abd8a90..0298e06283 100644 --- a/hw/net/e1000x_common.h +++ b/hw/net/e1000x_common.h @@ -91,8 +91,9 @@ e1000x_update_regs_on_link_up(uint32_t *mac, uint16_t *phy) } void e1000x_update_rx_total_stats(uint32_t *mac, - size_t data_size, - size_t data_fcs_size); + eth_pkt_types_e pkt_type, + size_t pkt_size, + size_t pkt_fcs_size); void e1000x_core_prepare_eeprom(uint16_t *eeprom, const uint16_t *templ, diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index abfdce9aaf..464a41d0aa 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1438,29 +1438,17 @@ igb_write_to_rx_buffers(IGBCore *core, static void igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi, - size_t data_size, size_t data_fcs_size) + size_t pkt_size, size_t pkt_fcs_size) { - e1000x_update_rx_total_stats(core->mac, data_size, data_fcs_size); - - switch (net_rx_pkt_get_packet_type(core->rx_pkt)) { - case ETH_PKT_BCAST: - e1000x_inc_reg_if_not_full(core->mac, BPRC); - break; - - case ETH_PKT_MCAST: - e1000x_inc_reg_if_not_full(core->mac, MPRC); - break; - - default: - break; - } + eth_pkt_types_e pkt_type = net_rx_pkt_get_packet_type(core->rx_pkt); + e1000x_update_rx_total_stats(core->mac, pkt_type, pkt_size, pkt_fcs_size); if (core->mac[MRQC] & 1) { uint16_t pool = rxi->idx % IGB_NUM_VM_POOLS; - core->mac[PVFGORC0 + (pool * 64)] += data_size + 4; + core->mac[PVFGORC0 + (pool * 64)] += pkt_size + 4; core->mac[PVFGPRC0 + (pool * 64)]++; - if (net_rx_pkt_get_packet_type(core->rx_pkt) == ETH_PKT_MCAST) { + if (pkt_type == ETH_PKT_MCAST) { core->mac[PVFMPRC0 + (pool * 64)]++; } } From ed447c60b341f1714b3c800d7f9c68898e873f78 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:55 +0900 Subject: [PATCH 05/50] igb: Fix Rx packet type encoding igb's advanced descriptor uses a packet type encoding different from one used in e1000e's extended descriptor. Fix the logic to encode Rx packet type accordingly. Fixes: 3a977deebe ("Intrdocue igb device emulation") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 40 ++++++++++++++++++++-------------------- hw/net/igb_regs.h | 5 +++++ 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 464a41d0aa..dbd1192a8e 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1227,7 +1227,6 @@ igb_build_rx_metadata(IGBCore *core, struct virtio_net_hdr *vhdr; bool hasip4, hasip6; EthL4HdrProto l4hdr_proto; - uint32_t pkt_type; *status_flags = E1000_RXD_STAT_DD; @@ -1266,28 +1265,29 @@ igb_build_rx_metadata(IGBCore *core, trace_e1000e_rx_metadata_ack(); } - if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) { - trace_e1000e_rx_metadata_ipv6_filtering_disabled(); - pkt_type = E1000_RXD_PKT_MAC; - } else if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP || - l4hdr_proto == ETH_L4_HDR_PROTO_UDP) { - pkt_type = hasip4 ? E1000_RXD_PKT_IP4_XDP : E1000_RXD_PKT_IP6_XDP; - } else if (hasip4 || hasip6) { - pkt_type = hasip4 ? E1000_RXD_PKT_IP4 : E1000_RXD_PKT_IP6; - } else { - pkt_type = E1000_RXD_PKT_MAC; - } - - trace_e1000e_rx_metadata_pkt_type(pkt_type); - if (pkt_info) { - if (rss_info->enabled) { - *pkt_info = rss_info->type; + *pkt_info = rss_info->enabled ? rss_info->type : 0; + + if (hasip4) { + *pkt_info |= E1000_ADVRXD_PKT_IP4; } - *pkt_info |= (pkt_type << 4); - } else { - *status_flags |= E1000_RXD_PKT_TYPE(pkt_type); + if (hasip6) { + *pkt_info |= E1000_ADVRXD_PKT_IP6; + } + + switch (l4hdr_proto) { + case ETH_L4_HDR_PROTO_TCP: + *pkt_info |= E1000_ADVRXD_PKT_TCP; + break; + + case ETH_L4_HDR_PROTO_UDP: + *pkt_info |= E1000_ADVRXD_PKT_UDP; + break; + + default: + break; + } } if (hdr_info) { diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h index c5c5b3c3b8..21ee9a3b2d 100644 --- a/hw/net/igb_regs.h +++ b/hw/net/igb_regs.h @@ -641,6 +641,11 @@ union e1000_adv_rx_desc { #define E1000_STATUS_NUM_VFS_SHIFT 14 +#define E1000_ADVRXD_PKT_IP4 BIT(4) +#define E1000_ADVRXD_PKT_IP6 BIT(6) +#define E1000_ADVRXD_PKT_TCP BIT(8) +#define E1000_ADVRXD_PKT_UDP BIT(9) + static inline uint8_t igb_ivar_entry_rx(uint8_t i) { return i < 8 ? i * 4 : (i - 8) * 4 + 2; From e209716749cda1581cfc8e582591c0216c30ab0d Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:56 +0900 Subject: [PATCH 06/50] igb: Do not require CTRL.VME for tx VLAN tagging While the datasheet of e1000e says it checks CTRL.VME for tx VLAN tagging, igb's datasheet has no such statements. It also says for "CTRL.VLE": > This register only affects the VLAN Strip in Rx it does not have any > influence in the Tx path in the 82576. (Appendix A. Changes from the 82575) There is no "CTRL.VLE" so it is more likely that it is a mistake of CTRL.VME. Fixes: fba7c3b788 ("igb: respect VMVIR and VMOLR for VLAN") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index dbd1192a8e..96a118b6c1 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -402,7 +402,7 @@ igb_tx_insert_vlan(IGBCore *core, uint16_t qn, struct igb_tx *tx, } } - if (insert_vlan && e1000x_vlan_enabled(core->mac)) { + if (insert_vlan) { net_tx_pkt_setup_vlan_header_ex(tx->tx_pkt, vlan, core->mac[VET] & 0xffff); } From f0b1df5c4502b5ec89f83417924935ab201511d0 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:57 +0900 Subject: [PATCH 07/50] igb: Clear IMS bits when committing ICR access The datasheet says contradicting statements regarding ICR accesses so it is not reliable to determine the behavior of ICR accesses. However, e1000e does clear IMS bits when reading ICR accesses and Linux also expects ICR accesses will clear IMS bits according to: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/ethernet/intel/igb/igb_main.c?h=v6.2#n8048 Fixes: 3a977deebe ("Intrdocue igb device emulation") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 96a118b6c1..eaca5bd2b6 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -2452,16 +2452,16 @@ igb_set_ims(IGBCore *core, int index, uint32_t val) static void igb_commit_icr(IGBCore *core) { /* - * If GPIE.NSICR = 0, then the copy of IAM to IMS will occur only if at + * If GPIE.NSICR = 0, then the clear of IMS will occur only if at * least one bit is set in the IMS and there is a true interrupt as * reflected in ICR.INTA. */ if ((core->mac[GPIE] & E1000_GPIE_NSICR) || (core->mac[IMS] && (core->mac[ICR] & E1000_ICR_INT_ASSERTED))) { - igb_set_ims(core, IMS, core->mac[IAM]); - } else { - igb_update_interrupt_state(core); + igb_clear_ims_bits(core, core->mac[IAM]); } + + igb_update_interrupt_state(core); } static void igb_set_icr(IGBCore *core, int index, uint32_t val) From 2f0fa232b8c330df029120a6824c8be3d4eb5cae Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:58 +0900 Subject: [PATCH 08/50] net/net_rx_pkt: Use iovec for net_rx_pkt_set_protocols() igb does not properly ensure the buffer passed to net_rx_pkt_set_protocols() is contiguous for the entire L2/L3/L4 header. Allow it to pass scattered data to net_rx_pkt_set_protocols(). Fixes: 3a977deebe ("Intrdocue igb device emulation") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 2 +- hw/net/net_rx_pkt.c | 14 +++++--------- hw/net/net_rx_pkt.h | 10 ++++++---- hw/net/virtio-net.c | 7 +++++-- hw/net/vmxnet3.c | 7 ++++++- include/net/eth.h | 6 +++--- net/eth.c | 18 ++++++++---------- 7 files changed, 34 insertions(+), 30 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index eaca5bd2b6..21a8d9ada4 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1650,7 +1650,7 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, ehdr = PKT_GET_ETH_HDR(filter_buf); net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr)); - net_rx_pkt_set_protocols(core->rx_pkt, filter_buf, size); + net_rx_pkt_set_protocols(core->rx_pkt, iov, iovcnt, iov_ofs); queues = igb_receive_assign(core, ehdr, size, &rss_info, external_tx); if (!queues) { diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index 39cdea06de..63be6e05ad 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -103,7 +103,7 @@ net_rx_pkt_pull_data(struct NetRxPkt *pkt, iov, iovcnt, ploff, pkt->tot_len); } - eth_get_protocols(pkt->vec, pkt->vec_len, &pkt->hasip4, &pkt->hasip6, + eth_get_protocols(pkt->vec, pkt->vec_len, 0, &pkt->hasip4, &pkt->hasip6, &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off, &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info); @@ -186,17 +186,13 @@ size_t net_rx_pkt_get_total_len(struct NetRxPkt *pkt) return pkt->tot_len; } -void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data, - size_t len) +void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, + const struct iovec *iov, size_t iovcnt, + size_t iovoff) { - const struct iovec iov = { - .iov_base = (void *)data, - .iov_len = len - }; - assert(pkt); - eth_get_protocols(&iov, 1, &pkt->hasip4, &pkt->hasip6, + eth_get_protocols(iov, iovcnt, iovoff, &pkt->hasip4, &pkt->hasip6, &pkt->l3hdr_off, &pkt->l4hdr_off, &pkt->l5hdr_off, &pkt->ip6hdr_info, &pkt->ip4hdr_info, &pkt->l4hdr_info); } diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h index d00b484900..a06f5c2675 100644 --- a/hw/net/net_rx_pkt.h +++ b/hw/net/net_rx_pkt.h @@ -55,12 +55,14 @@ size_t net_rx_pkt_get_total_len(struct NetRxPkt *pkt); * parse and set packet analysis results * * @pkt: packet - * @data: pointer to the data buffer to be parsed - * @len: data length + * @iov: received data scatter-gather list + * @iovcnt: number of elements in iov + * @iovoff: data start offset in the iov * */ -void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, const void *data, - size_t len); +void net_rx_pkt_set_protocols(struct NetRxPkt *pkt, + const struct iovec *iov, size_t iovcnt, + size_t iovoff); /** * fetches packet analysis results diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 8ce239a34d..6df6b7329d 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1834,9 +1834,12 @@ static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf, VIRTIO_NET_HASH_REPORT_UDPv6, VIRTIO_NET_HASH_REPORT_UDPv6_EX }; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = size + }; - net_rx_pkt_set_protocols(pkt, buf + n->host_hdr_len, - size - n->host_hdr_len); + net_rx_pkt_set_protocols(pkt, &iov, 1, n->host_hdr_len); net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto); net_hash_type = virtio_net_get_hash_type(hasip4, hasip6, l4hdr_proto, n->rss_data.hash_types); diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index 9acff310e7..05f41b6dfa 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -2001,7 +2001,12 @@ vmxnet3_receive(NetClientState *nc, const uint8_t *buf, size_t size) get_eth_packet_type(PKT_GET_ETH_HDR(buf))); if (vmxnet3_rx_filter_may_indicate(s, buf, size)) { - net_rx_pkt_set_protocols(s->rx_pkt, buf, size); + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = size + }; + + net_rx_pkt_set_protocols(s->rx_pkt, &iov, 1, 0); vmxnet3_rx_need_csum_calculate(s->rx_pkt, buf, size); net_rx_pkt_attach_data(s->rx_pkt, buf, size, s->rx_vlan_stripping); bytes_indicated = vmxnet3_indicate_packet(s) ? size : -1; diff --git a/include/net/eth.h b/include/net/eth.h index c5ae4493b4..9f19c3a695 100644 --- a/include/net/eth.h +++ b/include/net/eth.h @@ -312,10 +312,10 @@ eth_get_l2_hdr_length(const void *p) } static inline uint32_t -eth_get_l2_hdr_length_iov(const struct iovec *iov, int iovcnt) +eth_get_l2_hdr_length_iov(const struct iovec *iov, size_t iovcnt, size_t iovoff) { uint8_t p[sizeof(struct eth_header) + sizeof(struct vlan_header)]; - size_t copied = iov_to_buf(iov, iovcnt, 0, p, ARRAY_SIZE(p)); + size_t copied = iov_to_buf(iov, iovcnt, iovoff, p, ARRAY_SIZE(p)); if (copied < ARRAY_SIZE(p)) { return copied; @@ -397,7 +397,7 @@ typedef struct eth_l4_hdr_info_st { bool has_tcp_data; } eth_l4_hdr_info; -void eth_get_protocols(const struct iovec *iov, int iovcnt, +void eth_get_protocols(const struct iovec *iov, size_t iovcnt, size_t iovoff, bool *hasip4, bool *hasip6, size_t *l3hdr_off, size_t *l4hdr_off, diff --git a/net/eth.c b/net/eth.c index 70bcd8e355..d7b30df79f 100644 --- a/net/eth.c +++ b/net/eth.c @@ -136,7 +136,7 @@ _eth_tcp_has_data(bool is_ip4, return l4len > TCP_HEADER_DATA_OFFSET(tcp); } -void eth_get_protocols(const struct iovec *iov, int iovcnt, +void eth_get_protocols(const struct iovec *iov, size_t iovcnt, size_t iovoff, bool *hasip4, bool *hasip6, size_t *l3hdr_off, size_t *l4hdr_off, @@ -147,26 +147,24 @@ void eth_get_protocols(const struct iovec *iov, int iovcnt, { int proto; bool fragment = false; - size_t l2hdr_len = eth_get_l2_hdr_length_iov(iov, iovcnt); size_t input_size = iov_size(iov, iovcnt); size_t copied; uint8_t ip_p; *hasip4 = *hasip6 = false; + *l3hdr_off = iovoff + eth_get_l2_hdr_length_iov(iov, iovcnt, iovoff); l4hdr_info->proto = ETH_L4_HDR_PROTO_INVALID; - proto = eth_get_l3_proto(iov, iovcnt, l2hdr_len); - - *l3hdr_off = l2hdr_len; + proto = eth_get_l3_proto(iov, iovcnt, *l3hdr_off); if (proto == ETH_P_IP) { struct ip_header *iphdr = &ip4hdr_info->ip4_hdr; - if (input_size < l2hdr_len) { + if (input_size < *l3hdr_off) { return; } - copied = iov_to_buf(iov, iovcnt, l2hdr_len, iphdr, sizeof(*iphdr)); + copied = iov_to_buf(iov, iovcnt, *l3hdr_off, iphdr, sizeof(*iphdr)); if (copied < sizeof(*iphdr) || IP_HEADER_VERSION(iphdr) != IP_HEADER_VERSION_4) { return; @@ -175,17 +173,17 @@ void eth_get_protocols(const struct iovec *iov, int iovcnt, *hasip4 = true; ip_p = iphdr->ip_p; ip4hdr_info->fragment = IP4_IS_FRAGMENT(iphdr); - *l4hdr_off = l2hdr_len + IP_HDR_GET_LEN(iphdr); + *l4hdr_off = *l3hdr_off + IP_HDR_GET_LEN(iphdr); fragment = ip4hdr_info->fragment; } else if (proto == ETH_P_IPV6) { - if (!eth_parse_ipv6_hdr(iov, iovcnt, l2hdr_len, ip6hdr_info)) { + if (!eth_parse_ipv6_hdr(iov, iovcnt, *l3hdr_off, ip6hdr_info)) { return; } *hasip6 = true; ip_p = ip6hdr_info->l4proto; - *l4hdr_off = l2hdr_len + ip6hdr_info->full_hdr_len; + *l4hdr_off = *l3hdr_off + ip6hdr_info->full_hdr_len; fragment = ip6hdr_info->fragment; } else { return; From 310a128eae12339f97f6c940a7ddf92f40d283e4 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:42:59 +0900 Subject: [PATCH 09/50] e1000e: Always copy ethernet header e1000e_receive_internal() used to check the iov length to determine copy the iovs to a contiguous buffer, but the check is flawed in two ways: - It does not ensure that iovcnt > 0. - It does not take virtio-net header into consideration. The size of this copy is just 18 octets, which can be even less than the code size required for checks. This (wrong) optimization is probably not worth so just remove it. Fixes: 6f3fbe4ed0 ("net: Introduce e1000e device emulation") Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index c2d864a504..14b94db59c 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1686,12 +1686,9 @@ static ssize_t e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, bool has_vnet) { - static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4); - uint32_t n = 0; - uint8_t min_buf[ETH_ZLEN]; + uint8_t buf[ETH_ZLEN]; struct iovec min_iov; - uint8_t *filter_buf; size_t size, orig_size; size_t iov_ofs = 0; E1000E_RxRing rxr; @@ -1714,24 +1711,21 @@ e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, net_rx_pkt_unset_vhdr(core->rx_pkt); } - filter_buf = iov->iov_base + iov_ofs; orig_size = iov_size(iov, iovcnt); size = orig_size - iov_ofs; /* Pad to minimum Ethernet frame length */ - if (size < sizeof(min_buf)) { - iov_to_buf(iov, iovcnt, iov_ofs, min_buf, size); - memset(&min_buf[size], 0, sizeof(min_buf) - size); + if (size < sizeof(buf)) { + iov_to_buf(iov, iovcnt, iov_ofs, buf, size); + memset(&buf[size], 0, sizeof(buf) - size); e1000x_inc_reg_if_not_full(core->mac, RUC); - min_iov.iov_base = filter_buf = min_buf; - min_iov.iov_len = size = sizeof(min_buf); + min_iov.iov_base = buf; + min_iov.iov_len = size = sizeof(buf); iovcnt = 1; iov = &min_iov; iov_ofs = 0; - } else if (iov->iov_len < maximum_ethernet_hdr_len) { - /* This is very unlikely, but may happen. */ - iov_to_buf(iov, iovcnt, iov_ofs, min_buf, maximum_ethernet_hdr_len); - filter_buf = min_buf; + } else { + iov_to_buf(iov, iovcnt, iov_ofs, buf, ETH_HLEN + 4); } /* Discard oversized packets if !LPE and !SBP. */ @@ -1740,9 +1734,9 @@ e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, } net_rx_pkt_set_packet_type(core->rx_pkt, - get_eth_packet_type(PKT_GET_ETH_HDR(filter_buf))); + get_eth_packet_type(PKT_GET_ETH_HDR(buf))); - if (!e1000e_receive_filter(core, filter_buf, size)) { + if (!e1000e_receive_filter(core, buf, size)) { trace_e1000e_rx_flt_dropped(); return orig_size; } From dc9ef1bf454811646b3ee6387f1b96f63f538a18 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:00 +0900 Subject: [PATCH 10/50] igb: Always copy ethernet header igb_receive_internal() used to check the iov length to determine copy the iovs to a contiguous buffer, but the check is flawed in two ways: - It does not ensure that iovcnt > 0. - It does not take virtio-net header into consideration. The size of this copy is just 22 octets, which can be even less than the code size required for checks. This (wrong) optimization is probably not worth so just remove it. Removing this also allows igb to assume aligned accesses for the ethernet header. Fixes: 3a977deebe ("Intrdocue igb device emulation") Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 21a8d9ada4..1123df9e77 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -67,6 +67,11 @@ typedef struct IGBTxPktVmdqCallbackContext { NetClientState *nc; } IGBTxPktVmdqCallbackContext; +typedef struct L2Header { + struct eth_header eth; + struct vlan_header vlan; +} L2Header; + static ssize_t igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, bool has_vnet, bool *external_tx); @@ -961,15 +966,16 @@ igb_rx_is_oversized(IGBCore *core, uint16_t qn, size_t size) return size > (lpe ? max_ethernet_lpe_size : max_ethernet_vlan_size); } -static uint16_t igb_receive_assign(IGBCore *core, const struct eth_header *ehdr, +static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, size_t size, E1000E_RSSInfo *rss_info, bool *external_tx) { static const int ta_shift[] = { 4, 3, 2, 0 }; + const struct eth_header *ehdr = &l2_header->eth; uint32_t f, ra[2], *macp, rctl = core->mac[RCTL]; uint16_t queues = 0; uint16_t oversized = 0; - uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(ehdr)->h_tci) & VLAN_VID_MASK; + uint16_t vid = be16_to_cpu(l2_header->vlan.h_tci) & VLAN_VID_MASK; bool accepted = false; int i; @@ -1590,14 +1596,13 @@ static ssize_t igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, bool has_vnet, bool *external_tx) { - static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4); - uint16_t queues = 0; uint32_t n = 0; - uint8_t min_buf[ETH_ZLEN]; + union { + L2Header l2_header; + uint8_t octets[ETH_ZLEN]; + } buf; struct iovec min_iov; - struct eth_header *ehdr; - uint8_t *filter_buf; size_t size, orig_size; size_t iov_ofs = 0; E1000E_RxRing rxr; @@ -1623,24 +1628,21 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, net_rx_pkt_unset_vhdr(core->rx_pkt); } - filter_buf = iov->iov_base + iov_ofs; orig_size = iov_size(iov, iovcnt); size = orig_size - iov_ofs; /* Pad to minimum Ethernet frame length */ - if (size < sizeof(min_buf)) { - iov_to_buf(iov, iovcnt, iov_ofs, min_buf, size); - memset(&min_buf[size], 0, sizeof(min_buf) - size); + if (size < sizeof(buf)) { + iov_to_buf(iov, iovcnt, iov_ofs, &buf, size); + memset(&buf.octets[size], 0, sizeof(buf) - size); e1000x_inc_reg_if_not_full(core->mac, RUC); - min_iov.iov_base = filter_buf = min_buf; - min_iov.iov_len = size = sizeof(min_buf); + min_iov.iov_base = &buf; + min_iov.iov_len = size = sizeof(buf); iovcnt = 1; iov = &min_iov; iov_ofs = 0; - } else if (iov->iov_len < maximum_ethernet_hdr_len) { - /* This is very unlikely, but may happen. */ - iov_to_buf(iov, iovcnt, iov_ofs, min_buf, maximum_ethernet_hdr_len); - filter_buf = min_buf; + } else { + iov_to_buf(iov, iovcnt, iov_ofs, &buf, sizeof(buf.l2_header)); } /* Discard oversized packets if !LPE and !SBP. */ @@ -1648,11 +1650,12 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, return orig_size; } - ehdr = PKT_GET_ETH_HDR(filter_buf); - net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr)); + net_rx_pkt_set_packet_type(core->rx_pkt, + get_eth_packet_type(&buf.l2_header.eth)); net_rx_pkt_set_protocols(core->rx_pkt, iov, iovcnt, iov_ofs); - queues = igb_receive_assign(core, ehdr, size, &rss_info, external_tx); + queues = igb_receive_assign(core, &buf.l2_header, size, + &rss_info, external_tx); if (!queues) { trace_e1000e_rx_flt_dropped(); return orig_size; From 8e6c718a6aaa5073842130a1b643d66479601580 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:01 +0900 Subject: [PATCH 11/50] Fix references to igb Avocado test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: 9f95111474 ("tests/avocado: re-factor igb test to avoid timeouts") Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- MAINTAINERS | 2 +- docs/system/devices/igb.rst | 2 +- scripts/ci/org.centos/stream/8/x86_64/test-avocado | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1b6466496d..35c4e277af 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2279,7 +2279,7 @@ R: Sriram Yagnaraman S: Maintained F: docs/system/devices/igb.rst F: hw/net/igb* -F: tests/avocado/igb.py +F: tests/avocado/netdev-ethtool.py F: tests/qtest/igb-test.c F: tests/qtest/libqos/igb.c diff --git a/docs/system/devices/igb.rst b/docs/system/devices/igb.rst index 0bcdd85747..07d95dd985 100644 --- a/docs/system/devices/igb.rst +++ b/docs/system/devices/igb.rst @@ -60,7 +60,7 @@ Avocado test and can be ran with the following command: .. code:: shell - make check-avocado AVOCADO_TESTS=tests/avocado/igb.py + make check-avocado AVOCADO_TESTS=tests/avocado/netdev-ethtool.py References ========== diff --git a/scripts/ci/org.centos/stream/8/x86_64/test-avocado b/scripts/ci/org.centos/stream/8/x86_64/test-avocado index 7bb5b317b6..f43449377a 100755 --- a/scripts/ci/org.centos/stream/8/x86_64/test-avocado +++ b/scripts/ci/org.centos/stream/8/x86_64/test-avocado @@ -30,7 +30,7 @@ make get-vm-images tests/avocado/cpu_queries.py:QueryCPUModelExpansion.test \ tests/avocado/empty_cpu_model.py:EmptyCPUModel.test \ tests/avocado/hotplug_cpu.py:HotPlugCPU.test \ - tests/avocado/igb.py:IGB.test \ + tests/avocado/netdev-ethtool.py:NetDevEthtool.test_igb_nomsi \ tests/avocado/info_usernet.py:InfoUsernet.test_hostfwd \ tests/avocado/intel_iommu.py:IntelIOMMU.test_intel_iommu \ tests/avocado/intel_iommu.py:IntelIOMMU.test_intel_iommu_pt \ From 808f976c85b62b3bb1a6fdea697ca9f5e0d35539 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:02 +0900 Subject: [PATCH 12/50] tests/avocado: Remove unused imports Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- tests/avocado/netdev-ethtool.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/avocado/netdev-ethtool.py b/tests/avocado/netdev-ethtool.py index f7e9464184..8de118e313 100644 --- a/tests/avocado/netdev-ethtool.py +++ b/tests/avocado/netdev-ethtool.py @@ -7,7 +7,6 @@ from avocado import skip from avocado_qemu import QemuSystemTest -from avocado_qemu import exec_command, exec_command_and_wait_for_pattern from avocado_qemu import wait_for_console_pattern class NetDevEthtool(QemuSystemTest): From 1531fb4d8d2f62fe40b995dbb8ce16676e0af9d3 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:03 +0900 Subject: [PATCH 13/50] tests/avocado: Remove test_igb_nomsi_kvm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is unlikely to find more bugs with KVM so remove test_igb_nomsi_kvm to save time to run it. Signed-off-by: Akihiko Odaki Reviewed-by: Thomas Huth Acked-by: Alex Bennée Signed-off-by: Jason Wang --- tests/avocado/netdev-ethtool.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/avocado/netdev-ethtool.py b/tests/avocado/netdev-ethtool.py index 8de118e313..6da800f62b 100644 --- a/tests/avocado/netdev-ethtool.py +++ b/tests/avocado/netdev-ethtool.py @@ -29,7 +29,7 @@ class NetDevEthtool(QemuSystemTest): # URL into a unique one return self.fetch_asset(name=name, locations=(url), asset_hash=sha1) - def common_test_code(self, netdev, extra_args=None, kvm=False): + def common_test_code(self, netdev, extra_args=None): # This custom kernel has drivers for all the supported network # devices we can emulate in QEMU @@ -57,9 +57,6 @@ class NetDevEthtool(QemuSystemTest): '-drive', drive, '-device', netdev) - if kvm: - self.vm.add_args('-accel', 'kvm') - self.vm.set_console(console_index=0) self.vm.launch() @@ -86,13 +83,6 @@ class NetDevEthtool(QemuSystemTest): """ self.common_test_code("igb", "pci=nomsi") - def test_igb_nomsi_kvm(self): - """ - :avocado: tags=device:igb - """ - self.require_accelerator('kvm') - self.common_test_code("igb", "pci=nomsi", True) - # It seems the other popular cards we model in QEMU currently fail # the pattern test with: # From f0f3ac41d527a2c7bcee26195ae9038f760913f4 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:04 +0900 Subject: [PATCH 14/50] hw/net/net_tx_pkt: Remove net_rx_pkt_get_l4_info This function is not used. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/net_rx_pkt.c | 5 ----- hw/net/net_rx_pkt.h | 9 --------- 2 files changed, 14 deletions(-) diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index 63be6e05ad..6125a063d7 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -236,11 +236,6 @@ eth_ip4_hdr_info *net_rx_pkt_get_ip4_info(struct NetRxPkt *pkt) return &pkt->ip4hdr_info; } -eth_l4_hdr_info *net_rx_pkt_get_l4_info(struct NetRxPkt *pkt) -{ - return &pkt->l4hdr_info; -} - static inline void _net_rx_rss_add_chunk(uint8_t *rss_input, size_t *bytes_written, void *ptr, size_t size) diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h index a06f5c2675..ce8dbdb284 100644 --- a/hw/net/net_rx_pkt.h +++ b/hw/net/net_rx_pkt.h @@ -119,15 +119,6 @@ eth_ip6_hdr_info *net_rx_pkt_get_ip6_info(struct NetRxPkt *pkt); */ eth_ip4_hdr_info *net_rx_pkt_get_ip4_info(struct NetRxPkt *pkt); -/** - * fetches L4 header analysis results - * - * Return: pointer to analysis results structure which is stored in internal - * packet area. - * - */ -eth_l4_hdr_info *net_rx_pkt_get_l4_info(struct NetRxPkt *pkt); - typedef enum { NetPktRssIpV4, NetPktRssIpV4Tcp, From 0b11783014a0876344b14872d05ad55b21838e12 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:05 +0900 Subject: [PATCH 15/50] net/eth: Rename eth_setup_vlan_headers_ex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old eth_setup_vlan_headers has no user so remove it and rename eth_setup_vlan_headers_ex. Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- hw/net/net_tx_pkt.c | 2 +- include/net/eth.h | 9 +-------- net/eth.c | 2 +- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index cc36750c9b..ce6b102391 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -368,7 +368,7 @@ void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt *pkt, bool is_new; assert(pkt); - eth_setup_vlan_headers_ex(pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base, + eth_setup_vlan_headers(pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base, vlan, vlan_ethtype, &is_new); /* update l2hdrlen */ diff --git a/include/net/eth.h b/include/net/eth.h index 9f19c3a695..e8af5742be 100644 --- a/include/net/eth.h +++ b/include/net/eth.h @@ -351,16 +351,9 @@ eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff, uint16_t eth_get_l3_proto(const struct iovec *l2hdr_iov, int iovcnt, size_t l2hdr_len); -void eth_setup_vlan_headers_ex(struct eth_header *ehdr, uint16_t vlan_tag, +void eth_setup_vlan_headers(struct eth_header *ehdr, uint16_t vlan_tag, uint16_t vlan_ethtype, bool *is_new); -static inline void -eth_setup_vlan_headers(struct eth_header *ehdr, uint16_t vlan_tag, - bool *is_new) -{ - eth_setup_vlan_headers_ex(ehdr, vlan_tag, ETH_P_VLAN, is_new); -} - uint8_t eth_get_gso_type(uint16_t l3_proto, uint8_t *l3_hdr, uint8_t l4proto); diff --git a/net/eth.c b/net/eth.c index d7b30df79f..b6ff89c460 100644 --- a/net/eth.c +++ b/net/eth.c @@ -21,7 +21,7 @@ #include "net/checksum.h" #include "net/tap.h" -void eth_setup_vlan_headers_ex(struct eth_header *ehdr, uint16_t vlan_tag, +void eth_setup_vlan_headers(struct eth_header *ehdr, uint16_t vlan_tag, uint16_t vlan_ethtype, bool *is_new) { struct vlan_header *vhdr = PKT_GET_VLAN_HDR(ehdr); From e9e5b930691e5995210ca82aa6b9516733c20577 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:06 +0900 Subject: [PATCH 16/50] e1000x: Share more Rx filtering logic This saves some code and enables tracepoint for e1000's VLAN filtering. Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/e1000.c | 35 +++++-------------------------- hw/net/e1000e_core.c | 47 +++++------------------------------------- hw/net/e1000x_common.c | 44 +++++++++++++++++++++++++++++++++------ hw/net/e1000x_common.h | 4 +++- hw/net/igb_core.c | 41 +++--------------------------------- hw/net/trace-events | 4 ++-- 6 files changed, 56 insertions(+), 119 deletions(-) diff --git a/hw/net/e1000.c b/hw/net/e1000.c index 18eb6d8876..aae5f0bdc0 100644 --- a/hw/net/e1000.c +++ b/hw/net/e1000.c @@ -804,36 +804,11 @@ start_xmit(E1000State *s) } static int -receive_filter(E1000State *s, const uint8_t *buf, int size) +receive_filter(E1000State *s, const void *buf) { - uint32_t rctl = s->mac_reg[RCTL]; - int isbcast = is_broadcast_ether_addr(buf); - int ismcast = is_multicast_ether_addr(buf); - - if (e1000x_is_vlan_packet(buf, le16_to_cpu(s->mac_reg[VET])) && - e1000x_vlan_rx_filter_enabled(s->mac_reg)) { - uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(buf)->h_tci); - uint32_t vfta = - ldl_le_p((uint32_t *)(s->mac_reg + VFTA) + - ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK)); - if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) { - return 0; - } - } - - if (!isbcast && !ismcast && (rctl & E1000_RCTL_UPE)) { /* promiscuous ucast */ - return 1; - } - - if (ismcast && (rctl & E1000_RCTL_MPE)) { /* promiscuous mcast */ - return 1; - } - - if (isbcast && (rctl & E1000_RCTL_BAM)) { /* broadcast enabled */ - return 1; - } - - return e1000x_rx_group_filter(s->mac_reg, buf); + return (!e1000x_is_vlan_packet(buf, s->mac_reg[VET]) || + e1000x_rx_vlan_filter(s->mac_reg, PKT_GET_VLAN_HDR(buf))) && + e1000x_rx_group_filter(s->mac_reg, buf); } static void @@ -949,7 +924,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) return size; } - if (!receive_filter(s, filter_buf, size)) { + if (!receive_filter(s, filter_buf)) { return size; } diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 14b94db59c..41d2435074 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1034,48 +1034,11 @@ e1000e_rx_l4_cso_enabled(E1000ECore *core) } static bool -e1000e_receive_filter(E1000ECore *core, const uint8_t *buf, int size) +e1000e_receive_filter(E1000ECore *core, const void *buf) { - uint32_t rctl = core->mac[RCTL]; - - if (e1000x_is_vlan_packet(buf, core->mac[VET]) && - e1000x_vlan_rx_filter_enabled(core->mac)) { - uint16_t vid = lduw_be_p(&PKT_GET_VLAN_HDR(buf)->h_tci); - uint32_t vfta = - ldl_le_p((uint32_t *)(core->mac + VFTA) + - ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK)); - if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) { - trace_e1000e_rx_flt_vlan_mismatch(vid); - return false; - } else { - trace_e1000e_rx_flt_vlan_match(vid); - } - } - - switch (net_rx_pkt_get_packet_type(core->rx_pkt)) { - case ETH_PKT_UCAST: - if (rctl & E1000_RCTL_UPE) { - return true; /* promiscuous ucast */ - } - break; - - case ETH_PKT_BCAST: - if (rctl & E1000_RCTL_BAM) { - return true; /* broadcast enabled */ - } - break; - - case ETH_PKT_MCAST: - if (rctl & E1000_RCTL_MPE) { - return true; /* promiscuous mcast */ - } - break; - - default: - g_assert_not_reached(); - } - - return e1000x_rx_group_filter(core->mac, buf); + return (!e1000x_is_vlan_packet(buf, core->mac[VET]) || + e1000x_rx_vlan_filter(core->mac, PKT_GET_VLAN_HDR(buf))) && + e1000x_rx_group_filter(core->mac, buf); } static inline void @@ -1736,7 +1699,7 @@ e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(PKT_GET_ETH_HDR(buf))); - if (!e1000e_receive_filter(core, buf, size)) { + if (!e1000e_receive_filter(core, buf)) { trace_e1000e_rx_flt_dropped(); return orig_size; } diff --git a/hw/net/e1000x_common.c b/hw/net/e1000x_common.c index 7694673bcc..6cc23138a8 100644 --- a/hw/net/e1000x_common.c +++ b/hw/net/e1000x_common.c @@ -58,32 +58,64 @@ bool e1000x_is_vlan_packet(const void *buf, uint16_t vet) return res; } -bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf) +bool e1000x_rx_vlan_filter(uint32_t *mac, const struct vlan_header *vhdr) +{ + if (e1000x_vlan_rx_filter_enabled(mac)) { + uint16_t vid = lduw_be_p(&vhdr->h_tci); + uint32_t vfta = + ldl_le_p((uint32_t *)(mac + VFTA) + + ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK)); + if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) { + trace_e1000x_rx_flt_vlan_mismatch(vid); + return false; + } + + trace_e1000x_rx_flt_vlan_match(vid); + } + + return true; +} + +bool e1000x_rx_group_filter(uint32_t *mac, const struct eth_header *ehdr) { static const int mta_shift[] = { 4, 3, 2, 0 }; uint32_t f, ra[2], *rp, rctl = mac[RCTL]; + if (is_broadcast_ether_addr(ehdr->h_dest)) { + if (rctl & E1000_RCTL_BAM) { + return true; + } + } else if (is_multicast_ether_addr(ehdr->h_dest)) { + if (rctl & E1000_RCTL_MPE) { + return true; + } + } else { + if (rctl & E1000_RCTL_UPE) { + return true; + } + } + for (rp = mac + RA; rp < mac + RA + 32; rp += 2) { if (!(rp[1] & E1000_RAH_AV)) { continue; } ra[0] = cpu_to_le32(rp[0]); ra[1] = cpu_to_le32(rp[1]); - if (!memcmp(buf, (uint8_t *)ra, ETH_ALEN)) { + if (!memcmp(ehdr->h_dest, (uint8_t *)ra, ETH_ALEN)) { trace_e1000x_rx_flt_ucast_match((int)(rp - mac - RA) / 2, - MAC_ARG(buf)); + MAC_ARG(ehdr->h_dest)); return true; } } - trace_e1000x_rx_flt_ucast_mismatch(MAC_ARG(buf)); + trace_e1000x_rx_flt_ucast_mismatch(MAC_ARG(ehdr->h_dest)); f = mta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3]; - f = (((buf[5] << 8) | buf[4]) >> f) & 0xfff; + f = (((ehdr->h_dest[5] << 8) | ehdr->h_dest[4]) >> f) & 0xfff; if (mac[MTA + (f >> 5)] & (1 << (f & 0x1f))) { return true; } - trace_e1000x_rx_flt_inexact_mismatch(MAC_ARG(buf), + trace_e1000x_rx_flt_inexact_mismatch(MAC_ARG(ehdr->h_dest), (rctl >> E1000_RCTL_MO_SHIFT) & 3, f >> 5, mac[MTA + (f >> 5)]); diff --git a/hw/net/e1000x_common.h b/hw/net/e1000x_common.h index 0298e06283..be291684de 100644 --- a/hw/net/e1000x_common.h +++ b/hw/net/e1000x_common.h @@ -107,7 +107,9 @@ bool e1000x_rx_ready(PCIDevice *d, uint32_t *mac); bool e1000x_is_vlan_packet(const void *buf, uint16_t vet); -bool e1000x_rx_group_filter(uint32_t *mac, const uint8_t *buf); +bool e1000x_rx_vlan_filter(uint32_t *mac, const struct vlan_header *vhdr); + +bool e1000x_rx_group_filter(uint32_t *mac, const struct eth_header *ehdr); bool e1000x_hw_rx_enabled(uint32_t *mac); diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 1123df9e77..934db3c3e5 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -976,7 +976,6 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, uint16_t queues = 0; uint16_t oversized = 0; uint16_t vid = be16_to_cpu(l2_header->vlan.h_tci) & VLAN_VID_MASK; - bool accepted = false; int i; memset(rss_info, 0, sizeof(E1000E_RSSInfo)); @@ -986,16 +985,8 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, } if (e1000x_is_vlan_packet(ehdr, core->mac[VET] & 0xffff) && - e1000x_vlan_rx_filter_enabled(core->mac)) { - uint32_t vfta = - ldl_le_p((uint32_t *)(core->mac + VFTA) + - ((vid >> E1000_VFTA_ENTRY_SHIFT) & E1000_VFTA_ENTRY_MASK)); - if ((vfta & (1 << (vid & E1000_VFTA_ENTRY_BIT_SHIFT_MASK))) == 0) { - trace_e1000e_rx_flt_vlan_mismatch(vid); - return queues; - } else { - trace_e1000e_rx_flt_vlan_match(vid); - } + !e1000x_rx_vlan_filter(core->mac, PKT_GET_VLAN_HDR(ehdr))) { + return queues; } if (core->mac[MRQC] & 1) { @@ -1103,33 +1094,7 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, } } } else { - switch (net_rx_pkt_get_packet_type(core->rx_pkt)) { - case ETH_PKT_UCAST: - if (rctl & E1000_RCTL_UPE) { - accepted = true; /* promiscuous ucast */ - } - break; - - case ETH_PKT_BCAST: - if (rctl & E1000_RCTL_BAM) { - accepted = true; /* broadcast enabled */ - } - break; - - case ETH_PKT_MCAST: - if (rctl & E1000_RCTL_MPE) { - accepted = true; /* promiscuous mcast */ - } - break; - - default: - g_assert_not_reached(); - } - - if (!accepted) { - accepted = e1000x_rx_group_filter(core->mac, ehdr->h_dest); - } - + bool accepted = e1000x_rx_group_filter(core->mac, ehdr); if (!accepted) { for (macp = core->mac + RA2; macp < core->mac + RA2 + 16; macp += 2) { if (!(macp[1] & E1000_RAH_AV)) { diff --git a/hw/net/trace-events b/hw/net/trace-events index d35554fce8..a34d196ff7 100644 --- a/hw/net/trace-events +++ b/hw/net/trace-events @@ -106,6 +106,8 @@ e1000_receiver_overrun(size_t s, uint32_t rdh, uint32_t rdt) "Receiver overrun: # e1000x_common.c e1000x_rx_can_recv_disabled(bool link_up, bool rx_enabled, bool pci_master) "link_up: %d, rx_enabled %d, pci_master %d" e1000x_vlan_is_vlan_pkt(bool is_vlan_pkt, uint16_t eth_proto, uint16_t vet) "Is VLAN packet: %d, ETH proto: 0x%X, VET: 0x%X" +e1000x_rx_flt_vlan_mismatch(uint16_t vid) "VID mismatch: 0x%X" +e1000x_rx_flt_vlan_match(uint16_t vid) "VID match: 0x%X" e1000x_rx_flt_ucast_match(uint32_t idx, uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3, uint8_t b4, uint8_t b5) "unicast match[%d]: %02x:%02x:%02x:%02x:%02x:%02x" e1000x_rx_flt_ucast_mismatch(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3, uint8_t b4, uint8_t b5) "unicast mismatch: %02x:%02x:%02x:%02x:%02x:%02x" e1000x_rx_flt_inexact_mismatch(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3, uint8_t b4, uint8_t b5, uint32_t mo, uint32_t mta, uint32_t mta_val) "inexact mismatch: %02x:%02x:%02x:%02x:%02x:%02x MO %d MTA[%d] 0x%x" @@ -154,8 +156,6 @@ e1000e_rx_can_recv_rings_full(void) "Cannot receive: all rings are full" e1000e_rx_can_recv(void) "Can receive" e1000e_rx_has_buffers(int ridx, uint32_t free_desc, size_t total_size, uint32_t desc_buf_size) "ring #%d: free descr: %u, packet size %zu, descr buffer size %u" e1000e_rx_null_descriptor(void) "Null RX descriptor!!" -e1000e_rx_flt_vlan_mismatch(uint16_t vid) "VID mismatch: 0x%X" -e1000e_rx_flt_vlan_match(uint16_t vid) "VID match: 0x%X" e1000e_rx_desc_ps_read(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3) "buffers: [0x%"PRIx64", 0x%"PRIx64", 0x%"PRIx64", 0x%"PRIx64"]" e1000e_rx_desc_ps_write(uint16_t a0, uint16_t a1, uint16_t a2, uint16_t a3) "bytes written: [%u, %u, %u, %u]" e1000e_rx_desc_buff_sizes(uint32_t b0, uint32_t b1, uint32_t b2, uint32_t b3) "buffer sizes: [%u, %u, %u, %u]" From 743495144208cd1871ce0971ae6c57925141c855 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:07 +0900 Subject: [PATCH 17/50] e1000x: Take CRC into consideration for size check Section 13.7.15 Receive Length Error Count says: > Packets over 1522 bytes are oversized if LongPacketEnable is 0b > (RCTL.LPE). If LongPacketEnable (LPE) is 1b, then an incoming packet > is considered oversized if it exceeds 16384 bytes. > These lengths are based on bytes in the received packet from > through , inclusively. As QEMU processes packets without CRC, the number of bytes for CRC need to be subtracted. This change adds some size definitions to be used to derive the new size thresholds to eth.h. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/e1000x_common.c | 10 +++++----- include/net/eth.h | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hw/net/e1000x_common.c b/hw/net/e1000x_common.c index 6cc23138a8..212873fd77 100644 --- a/hw/net/e1000x_common.c +++ b/hw/net/e1000x_common.c @@ -140,16 +140,16 @@ bool e1000x_hw_rx_enabled(uint32_t *mac) bool e1000x_is_oversized(uint32_t *mac, size_t size) { + size_t header_size = sizeof(struct eth_header) + sizeof(struct vlan_header); /* this is the size past which hardware will drop packets when setting LPE=0 */ - static const int maximum_ethernet_vlan_size = 1522; + size_t maximum_short_size = header_size + ETH_MTU; /* this is the size past which hardware will drop packets when setting LPE=1 */ - static const int maximum_ethernet_lpe_size = 16 * KiB; + size_t maximum_large_size = 16 * KiB - ETH_FCS_LEN; - if ((size > maximum_ethernet_lpe_size || - (size > maximum_ethernet_vlan_size - && !(mac[RCTL] & E1000_RCTL_LPE))) + if ((size > maximum_large_size || + (size > maximum_short_size && !(mac[RCTL] & E1000_RCTL_LPE))) && !(mac[RCTL] & E1000_RCTL_SBP)) { e1000x_inc_reg_if_not_full(mac, ROC); trace_e1000x_rx_oversized(size); diff --git a/include/net/eth.h b/include/net/eth.h index e8af5742be..05f56931e7 100644 --- a/include/net/eth.h +++ b/include/net/eth.h @@ -32,6 +32,8 @@ #define ETH_ALEN 6 #define ETH_HLEN 14 #define ETH_ZLEN 60 /* Min. octets in frame without FCS */ +#define ETH_FCS_LEN 4 +#define ETH_MTU 1500 struct eth_header { uint8_t h_dest[ETH_ALEN]; /* destination eth addr */ From 5052fc9eb17b5ae6fd956834c68c3f7201dea8df Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:08 +0900 Subject: [PATCH 18/50] e1000x: Rename TcpIpv6 into TcpIpv6Ex e1000e and igb employs NetPktRssIpV6TcpEx for RSS hash if TcpIpv6 MRQC bit is set. Moreover, igb also has a MRQC bit for NetPktRssIpV6Tcp though it is not implemented yet. Rename it to TcpIpv6Ex to avoid confusion. Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 8 ++++---- hw/net/e1000x_regs.h | 22 +++++++++++----------- hw/net/igb_core.c | 8 ++++---- hw/net/trace-events | 2 +- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 41d2435074..38d465a203 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -537,7 +537,7 @@ e1000e_rss_get_hash_type(E1000ECore *core, struct NetRxPkt *pkt) ip6info->rss_ex_dst_valid, ip6info->rss_ex_src_valid, core->mac[MRQC], - E1000_MRQC_EN_TCPIPV6(core->mac[MRQC]), + E1000_MRQC_EN_TCPIPV6EX(core->mac[MRQC]), E1000_MRQC_EN_IPV6EX(core->mac[MRQC]), E1000_MRQC_EN_IPV6(core->mac[MRQC])); @@ -546,8 +546,8 @@ e1000e_rss_get_hash_type(E1000ECore *core, struct NetRxPkt *pkt) ip6info->rss_ex_src_valid))) { if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && - E1000_MRQC_EN_TCPIPV6(core->mac[MRQC])) { - return E1000_MRQ_RSS_TYPE_IPV6TCP; + E1000_MRQC_EN_TCPIPV6EX(core->mac[MRQC])) { + return E1000_MRQ_RSS_TYPE_IPV6TCPEX; } if (E1000_MRQC_EN_IPV6EX(core->mac[MRQC])) { @@ -581,7 +581,7 @@ e1000e_rss_calc_hash(E1000ECore *core, case E1000_MRQ_RSS_TYPE_IPV4TCP: type = NetPktRssIpV4Tcp; break; - case E1000_MRQ_RSS_TYPE_IPV6TCP: + case E1000_MRQ_RSS_TYPE_IPV6TCPEX: type = NetPktRssIpV6TcpEx; break; case E1000_MRQ_RSS_TYPE_IPV6: diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h index 6d3c4c6d3a..13760c66d3 100644 --- a/hw/net/e1000x_regs.h +++ b/hw/net/e1000x_regs.h @@ -290,18 +290,18 @@ #define E1000_RETA_IDX(hash) ((hash) & (BIT(7) - 1)) #define E1000_RETA_VAL(reta, hash) (((uint8_t *)(reta))[E1000_RETA_IDX(hash)]) -#define E1000_MRQC_EN_TCPIPV4(mrqc) ((mrqc) & BIT(16)) -#define E1000_MRQC_EN_IPV4(mrqc) ((mrqc) & BIT(17)) -#define E1000_MRQC_EN_TCPIPV6(mrqc) ((mrqc) & BIT(18)) -#define E1000_MRQC_EN_IPV6EX(mrqc) ((mrqc) & BIT(19)) -#define E1000_MRQC_EN_IPV6(mrqc) ((mrqc) & BIT(20)) +#define E1000_MRQC_EN_TCPIPV4(mrqc) ((mrqc) & BIT(16)) +#define E1000_MRQC_EN_IPV4(mrqc) ((mrqc) & BIT(17)) +#define E1000_MRQC_EN_TCPIPV6EX(mrqc) ((mrqc) & BIT(18)) +#define E1000_MRQC_EN_IPV6EX(mrqc) ((mrqc) & BIT(19)) +#define E1000_MRQC_EN_IPV6(mrqc) ((mrqc) & BIT(20)) -#define E1000_MRQ_RSS_TYPE_NONE (0) -#define E1000_MRQ_RSS_TYPE_IPV4TCP (1) -#define E1000_MRQ_RSS_TYPE_IPV4 (2) -#define E1000_MRQ_RSS_TYPE_IPV6TCP (3) -#define E1000_MRQ_RSS_TYPE_IPV6EX (4) -#define E1000_MRQ_RSS_TYPE_IPV6 (5) +#define E1000_MRQ_RSS_TYPE_NONE (0) +#define E1000_MRQ_RSS_TYPE_IPV4TCP (1) +#define E1000_MRQ_RSS_TYPE_IPV4 (2) +#define E1000_MRQ_RSS_TYPE_IPV6TCPEX (3) +#define E1000_MRQ_RSS_TYPE_IPV6EX (4) +#define E1000_MRQ_RSS_TYPE_IPV6 (5) #define E1000_ICR_ASSERTED BIT(31) #define E1000_EIAC_MASK 0x01F00000 diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 934db3c3e5..209fdad862 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -301,7 +301,7 @@ igb_rss_get_hash_type(IGBCore *core, struct NetRxPkt *pkt) ip6info->rss_ex_dst_valid, ip6info->rss_ex_src_valid, core->mac[MRQC], - E1000_MRQC_EN_TCPIPV6(core->mac[MRQC]), + E1000_MRQC_EN_TCPIPV6EX(core->mac[MRQC]), E1000_MRQC_EN_IPV6EX(core->mac[MRQC]), E1000_MRQC_EN_IPV6(core->mac[MRQC])); @@ -310,8 +310,8 @@ igb_rss_get_hash_type(IGBCore *core, struct NetRxPkt *pkt) ip6info->rss_ex_src_valid))) { if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && - E1000_MRQC_EN_TCPIPV6(core->mac[MRQC])) { - return E1000_MRQ_RSS_TYPE_IPV6TCP; + E1000_MRQC_EN_TCPIPV6EX(core->mac[MRQC])) { + return E1000_MRQ_RSS_TYPE_IPV6TCPEX; } if (E1000_MRQC_EN_IPV6EX(core->mac[MRQC])) { @@ -343,7 +343,7 @@ igb_rss_calc_hash(IGBCore *core, struct NetRxPkt *pkt, E1000E_RSSInfo *info) case E1000_MRQ_RSS_TYPE_IPV4TCP: type = NetPktRssIpV4Tcp; break; - case E1000_MRQ_RSS_TYPE_IPV6TCP: + case E1000_MRQ_RSS_TYPE_IPV6TCPEX: type = NetPktRssIpV6TcpEx; break; case E1000_MRQ_RSS_TYPE_IPV6: diff --git a/hw/net/trace-events b/hw/net/trace-events index a34d196ff7..64d776bc2a 100644 --- a/hw/net/trace-events +++ b/hw/net/trace-events @@ -179,7 +179,7 @@ e1000e_rx_rss_disabled(void) "RSS is disabled" e1000e_rx_rss_type(uint32_t type) "RSS type is %u" e1000e_rx_rss_ip4(int l4hdr_proto, uint32_t mrqc, bool tcpipv4_enabled, bool ipv4_enabled) "RSS IPv4: L4 header protocol %d, mrqc 0x%X, tcpipv4 enabled %d, ipv4 enabled %d" e1000e_rx_rss_ip6_rfctl(uint32_t rfctl) "RSS IPv6: rfctl 0x%X" -e1000e_rx_rss_ip6(bool ex_dis, bool new_ex_dis, int l4hdr_proto, bool has_ext_headers, bool ex_dst_valid, bool ex_src_valid, uint32_t mrqc, bool tcpipv6_enabled, bool ipv6ex_enabled, bool ipv6_enabled) "RSS IPv6: ex_dis: %d, new_ex_dis: %d, L4 header protocol %d, has_ext_headers %d, ex_dst_valid %d, ex_src_valid %d, mrqc 0x%X, tcpipv6 enabled %d, ipv6ex enabled %d, ipv6 enabled %d" +e1000e_rx_rss_ip6(bool ex_dis, bool new_ex_dis, int l4hdr_proto, bool has_ext_headers, bool ex_dst_valid, bool ex_src_valid, uint32_t mrqc, bool tcpipv6ex_enabled, bool ipv6ex_enabled, bool ipv6_enabled) "RSS IPv6: ex_dis: %d, new_ex_dis: %d, L4 header protocol %d, has_ext_headers %d, ex_dst_valid %d, ex_src_valid %d, mrqc 0x%X, tcpipv6ex enabled %d, ipv6ex enabled %d, ipv6 enabled %d" e1000e_rx_metadata_protocols(bool hasip4, bool hasip6, int l4hdr_protocol) "protocols: ip4: %d, ip6: %d, l4hdr: %d" e1000e_rx_metadata_vlan(uint16_t vlan_tag) "VLAN tag is 0x%X" From 8b876b99a1b60ee657e3ef57b5c04d8836034d60 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:09 +0900 Subject: [PATCH 19/50] e1000e: Always log status after building rx metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this change, the status flags may not be traced e.g. if checksum offloading is disabled. Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 38d465a203..6a213c0224 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1244,9 +1244,8 @@ e1000e_build_rx_metadata(E1000ECore *core, trace_e1000e_rx_metadata_l4_cso_disabled(); } - trace_e1000e_rx_metadata_status_flags(*status_flags); - func_exit: + trace_e1000e_rx_metadata_status_flags(*status_flags); *status_flags = cpu_to_le32(*status_flags); } From d5241351bd4fdb40cafc87024c4f924267a7ba1c Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:10 +0900 Subject: [PATCH 20/50] igb: Always log status after building rx metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this change, the status flags may not be traced e.g. if checksum offloading is disabled. Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- hw/net/igb_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 209fdad862..946b917f91 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1303,9 +1303,8 @@ igb_build_rx_metadata(IGBCore *core, trace_e1000e_rx_metadata_l4_cso_disabled(); } - trace_e1000e_rx_metadata_status_flags(*status_flags); - func_exit: + trace_e1000e_rx_metadata_status_flags(*status_flags); *status_flags = cpu_to_le32(*status_flags); } From a09cc21e80ebbb0fecc8eea33281290144234c9b Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:11 +0900 Subject: [PATCH 21/50] igb: Remove goto The goto is a bit confusing as it changes the control flow only if L4 protocol is not recognized. It is also different from e1000e, and noisy when comparing e1000e and igb. Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 946b917f91..bae51cbb63 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1297,7 +1297,7 @@ igb_build_rx_metadata(IGBCore *core, break; default: - goto func_exit; + break; } } else { trace_e1000e_rx_metadata_l4_cso_disabled(); From 5d92e88a506387837431cee069ca8fee4f7c29ae Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:12 +0900 Subject: [PATCH 22/50] igb: Read DCMD.VLE of the first Tx descriptor Section 7.2.2.3 Advanced Transmit Data Descriptor says: > For frames that spans multiple descriptors, all fields apart from > DCMD.EOP, DCMD.RS, DCMD.DEXT, DTALEN, Address and DTYP are valid only > in the first descriptors and are ignored in the subsequent ones. Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index bae51cbb63..162ef26789 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -613,7 +613,7 @@ igb_process_tx_desc(IGBCore *core, idx = (tx->first_olinfo_status >> 4) & 1; igb_tx_insert_vlan(core, queue_index, tx, tx->ctx[idx].vlan_macip_lens >> 16, - !!(cmd_type_len & E1000_TXD_CMD_VLE)); + !!(tx->first_cmd_type_len & E1000_TXD_CMD_VLE)); if (igb_tx_pkt_send(core, tx, queue_index)) { igb_on_tx_done_update_stats(core, tx->tx_pkt, queue_index); From 5c867340252cd5c4c3dcd50b51da961f2f6cb83d Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:13 +0900 Subject: [PATCH 23/50] e1000e: Reset packet state after emptying Tx queue Keeping Tx packet state after the transmit queue is emptied has some problems: - The datasheet says the descriptors can be reused after the transmit queue is emptied, but the Tx packet state may keep references to them. - The Tx packet state cannot be migrated so it can be reset anytime the migration happens. Always reset Tx packet state always after the queue is emptied. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 6a213c0224..7dce448657 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -959,6 +959,8 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr) if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) { e1000e_set_interrupt_cause(core, cause); } + + net_tx_pkt_reset(txr->tx->tx_pkt, net_tx_pkt_unmap_frag_pci, core->owner); } static bool @@ -3389,8 +3391,6 @@ e1000e_core_pci_uninit(E1000ECore *core) qemu_del_vm_change_state_handler(core->vmstate); for (i = 0; i < E1000E_NUM_QUEUES; i++) { - net_tx_pkt_reset(core->tx[i].tx_pkt, - net_tx_pkt_unmap_frag_pci, core->owner); net_tx_pkt_uninit(core->tx[i].tx_pkt); } @@ -3515,8 +3515,6 @@ static void e1000e_reset(E1000ECore *core, bool sw) e1000x_reset_mac_addr(core->owner_nic, core->mac, core->permanent_mac); for (i = 0; i < ARRAY_SIZE(core->tx); i++) { - net_tx_pkt_reset(core->tx[i].tx_pkt, - net_tx_pkt_unmap_frag_pci, core->owner); memset(&core->tx[i].props, 0, sizeof(core->tx[i].props)); core->tx[i].skip_cp = false; } From 1c5618a267640c8d315bf25ab0b84b24980d2458 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:14 +0900 Subject: [PATCH 24/50] vmxnet3: Reset packet state after emptying Tx queue Keeping Tx packet state after the transmit queue is emptied but this behavior is unreliable as the state can be reset anytime the migration happens. Always reset Tx packet state always after the queue is emptied. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/vmxnet3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index 05f41b6dfa..18b9edfdb2 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -681,6 +681,8 @@ static void vmxnet3_process_tx_queue(VMXNET3State *s, int qidx) net_tx_pkt_unmap_frag_pci, PCI_DEVICE(s)); } } + + net_tx_pkt_reset(s->tx_pkt, net_tx_pkt_unmap_frag_pci, PCI_DEVICE(s)); } static inline void @@ -1159,7 +1161,6 @@ static void vmxnet3_deactivate_device(VMXNET3State *s) { if (s->device_active) { VMW_CBPRN("Deactivating vmxnet3..."); - net_tx_pkt_reset(s->tx_pkt, net_tx_pkt_unmap_frag_pci, PCI_DEVICE(s)); net_tx_pkt_uninit(s->tx_pkt); net_rx_pkt_uninit(s->rx_pkt); s->device_active = false; From 4847dabf6743477dd29da9962466403dfcf625f3 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:15 +0900 Subject: [PATCH 25/50] igb: Add more definitions for Tx descriptor Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 4 ++-- hw/net/igb_regs.h | 32 +++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 162ef26789..56a53872cf 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -418,7 +418,7 @@ igb_setup_tx_offloads(IGBCore *core, struct igb_tx *tx) { if (tx->first_cmd_type_len & E1000_ADVTXD_DCMD_TSE) { uint32_t idx = (tx->first_olinfo_status >> 4) & 1; - uint32_t mss = tx->ctx[idx].mss_l4len_idx >> 16; + uint32_t mss = tx->ctx[idx].mss_l4len_idx >> E1000_ADVTXD_MSS_SHIFT; if (!net_tx_pkt_build_vheader(tx->tx_pkt, true, true, mss)) { return false; } @@ -612,7 +612,7 @@ igb_process_tx_desc(IGBCore *core, if (!tx->skip_cp && net_tx_pkt_parse(tx->tx_pkt)) { idx = (tx->first_olinfo_status >> 4) & 1; igb_tx_insert_vlan(core, queue_index, tx, - tx->ctx[idx].vlan_macip_lens >> 16, + tx->ctx[idx].vlan_macip_lens >> IGB_TX_FLAGS_VLAN_SHIFT, !!(tx->first_cmd_type_len & E1000_TXD_CMD_VLE)); if (igb_tx_pkt_send(core, tx, queue_index)) { diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h index 21ee9a3b2d..eb995d8b2e 100644 --- a/hw/net/igb_regs.h +++ b/hw/net/igb_regs.h @@ -42,11 +42,6 @@ union e1000_adv_tx_desc { } wb; }; -#define E1000_ADVTXD_DTYP_CTXT 0x00200000 /* Advanced Context Descriptor */ -#define E1000_ADVTXD_DTYP_DATA 0x00300000 /* Advanced Data Descriptor */ -#define E1000_ADVTXD_DCMD_DEXT 0x20000000 /* Descriptor Extension (1=Adv) */ -#define E1000_ADVTXD_DCMD_TSE 0x80000000 /* TCP/UDP Segmentation Enable */ - #define E1000_ADVTXD_POTS_IXSM 0x00000100 /* Insert TCP/UDP Checksum */ #define E1000_ADVTXD_POTS_TXSM 0x00000200 /* Insert TCP/UDP Checksum */ @@ -151,6 +146,10 @@ union e1000_adv_rx_desc { #define IGB_82576_VF_DEV_ID 0x10CA #define IGB_I350_VF_DEV_ID 0x1520 +/* VLAN info */ +#define IGB_TX_FLAGS_VLAN_MASK 0xffff0000 +#define IGB_TX_FLAGS_VLAN_SHIFT 16 + /* from igb/e1000_82575.h */ #define E1000_MRQC_ENABLE_RSS_MQ 0x00000002 @@ -160,6 +159,29 @@ union e1000_adv_rx_desc { #define E1000_MRQC_RSS_FIELD_IPV6_UDP 0x00800000 #define E1000_MRQC_RSS_FIELD_IPV6_UDP_EX 0x01000000 +/* Adv Transmit Descriptor Config Masks */ +#define E1000_ADVTXD_MAC_TSTAMP 0x00080000 /* IEEE1588 Timestamp packet */ +#define E1000_ADVTXD_DTYP_CTXT 0x00200000 /* Advanced Context Descriptor */ +#define E1000_ADVTXD_DTYP_DATA 0x00300000 /* Advanced Data Descriptor */ +#define E1000_ADVTXD_DCMD_EOP 0x01000000 /* End of Packet */ +#define E1000_ADVTXD_DCMD_IFCS 0x02000000 /* Insert FCS (Ethernet CRC) */ +#define E1000_ADVTXD_DCMD_RS 0x08000000 /* Report Status */ +#define E1000_ADVTXD_DCMD_DEXT 0x20000000 /* Descriptor extension (1=Adv) */ +#define E1000_ADVTXD_DCMD_VLE 0x40000000 /* VLAN pkt enable */ +#define E1000_ADVTXD_DCMD_TSE 0x80000000 /* TCP Seg enable */ +#define E1000_ADVTXD_PAYLEN_SHIFT 14 /* Adv desc PAYLEN shift */ + +#define E1000_ADVTXD_MACLEN_SHIFT 9 /* Adv ctxt desc mac len shift */ +#define E1000_ADVTXD_TUCMD_L4T_UDP 0x00000000 /* L4 Packet TYPE of UDP */ +#define E1000_ADVTXD_TUCMD_IPV4 0x00000400 /* IP Packet Type: 1=IPv4 */ +#define E1000_ADVTXD_TUCMD_L4T_TCP 0x00000800 /* L4 Packet TYPE of TCP */ +#define E1000_ADVTXD_TUCMD_L4T_SCTP 0x00001000 /* L4 packet TYPE of SCTP */ +/* IPSec Encrypt Enable for ESP */ +#define E1000_ADVTXD_L4LEN_SHIFT 8 /* Adv ctxt L4LEN shift */ +#define E1000_ADVTXD_MSS_SHIFT 16 /* Adv ctxt MSS shift */ +/* Adv ctxt IPSec SA IDX mask */ +/* Adv ctxt IPSec ESP len mask */ + /* Additional Transmit Descriptor Control definitions */ #define E1000_TXDCTL_QUEUE_ENABLE 0x02000000 /* Enable specific Tx Queue */ From ff2b24c86258c6257bdb27bb16fefb11aea5ea57 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:16 +0900 Subject: [PATCH 26/50] igb: Share common VF constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The constants need to be consistent between the PF and VF. Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb.c | 10 +++++----- hw/net/igb_common.h | 8 ++++++++ hw/net/igbvf.c | 7 ------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/hw/net/igb.c b/hw/net/igb.c index 51a7e9133e..1c989d7677 100644 --- a/hw/net/igb.c +++ b/hw/net/igb.c @@ -433,16 +433,16 @@ static void igb_pci_realize(PCIDevice *pci_dev, Error **errp) pcie_ari_init(pci_dev, 0x150, 1); - pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, "igbvf", + pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, TYPE_IGBVF, IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS, IGB_MAX_VF_FUNCTIONS, IGB_VF_OFFSET, IGB_VF_STRIDE); - pcie_sriov_pf_init_vf_bar(pci_dev, 0, + pcie_sriov_pf_init_vf_bar(pci_dev, IGBVF_MMIO_BAR_IDX, PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH, - 16 * KiB); - pcie_sriov_pf_init_vf_bar(pci_dev, 3, + IGBVF_MMIO_SIZE); + pcie_sriov_pf_init_vf_bar(pci_dev, IGBVF_MSIX_BAR_IDX, PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH, - 16 * KiB); + IGBVF_MSIX_SIZE); igb_init_net_peer(s, pci_dev, macaddr); diff --git a/hw/net/igb_common.h b/hw/net/igb_common.h index 69ac490f75..f2a9065791 100644 --- a/hw/net/igb_common.h +++ b/hw/net/igb_common.h @@ -28,6 +28,14 @@ #include "igb_regs.h" +#define TYPE_IGBVF "igbvf" + +#define IGBVF_MMIO_BAR_IDX (0) +#define IGBVF_MSIX_BAR_IDX (3) + +#define IGBVF_MMIO_SIZE (16 * 1024) +#define IGBVF_MSIX_SIZE (16 * 1024) + #define defreg(x) x = (E1000_##x >> 2) #define defreg_indexed(x, i) x##i = (E1000_##x(i) >> 2) #define defreg_indexeda(x, i) x##i##_A = (E1000_##x##_A(i) >> 2) diff --git a/hw/net/igbvf.c b/hw/net/igbvf.c index 70beb7af50..284ea61184 100644 --- a/hw/net/igbvf.c +++ b/hw/net/igbvf.c @@ -50,15 +50,8 @@ #include "trace.h" #include "qapi/error.h" -#define TYPE_IGBVF "igbvf" OBJECT_DECLARE_SIMPLE_TYPE(IgbVfState, IGBVF) -#define IGBVF_MMIO_BAR_IDX (0) -#define IGBVF_MSIX_BAR_IDX (3) - -#define IGBVF_MMIO_SIZE (16 * 1024) -#define IGBVF_MSIX_SIZE (16 * 1024) - struct IgbVfState { PCIDevice parent_obj; From 882e54da90a7634d6ccd6d451e2f6950bd68fe04 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:17 +0900 Subject: [PATCH 27/50] igb: Fix igb_mac_reg_init coding style alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- hw/net/igb_core.c | 96 +++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 56a53872cf..20645c4764 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -4027,54 +4027,54 @@ static const uint32_t igb_mac_reg_init[] = { [VMOLR0 ... VMOLR0 + 7] = 0x2600 | E1000_VMOLR_STRCRC, [RPLOLR] = E1000_RPLOLR_STRCRC, [RLPML] = 0x2600, - [TXCTL0] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL1] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL2] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL3] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL4] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL5] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL6] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL7] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL8] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL9] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL10] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL11] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL12] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL13] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL14] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, - [TXCTL15] = E1000_DCA_TXCTRL_DATA_RRO_EN | - E1000_DCA_TXCTRL_TX_WB_RO_EN | - E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL0] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL1] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL2] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL3] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL4] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL5] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL6] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL7] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL8] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL9] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL10] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL11] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL12] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL13] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL14] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, + [TXCTL15] = E1000_DCA_TXCTRL_DATA_RRO_EN | + E1000_DCA_TXCTRL_TX_WB_RO_EN | + E1000_DCA_TXCTRL_DESC_RRO_EN, }; static void igb_reset(IGBCore *core, bool sw) From fe619f2005b15b14f718c56a57152c5c1c06b4c8 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:18 +0900 Subject: [PATCH 28/50] igb: Clear EICR bits for delayed MSI-X interrupts Section 7.3.4.1 says: > When auto-clear is enabled for an interrupt cause, the EICR bit is > set when a cause event mapped to this vector occurs. When the EITR > Counter reaches zero, the MSI-X message is sent on PCIe. Then the > EICR bit is cleared and enabled to be set by a new cause event Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/igb_core.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 20645c4764..edda07e564 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -97,23 +97,31 @@ igb_lower_legacy_irq(IGBCore *core) pci_set_irq(core->owner, 0); } -static void igb_msix_notify(IGBCore *core, unsigned int vector) +static void igb_msix_notify(IGBCore *core, unsigned int cause) { PCIDevice *dev = core->owner; uint16_t vfn; + uint32_t effective_eiac; + unsigned int vector; - vfn = 8 - (vector + 2) / IGBVF_MSIX_VEC_NUM; + vfn = 8 - (cause + 2) / IGBVF_MSIX_VEC_NUM; if (vfn < pcie_sriov_num_vfs(core->owner)) { dev = pcie_sriov_get_vf_at_index(core->owner, vfn); assert(dev); - vector = (vector + 2) % IGBVF_MSIX_VEC_NUM; - } else if (vector >= IGB_MSIX_VEC_NUM) { + vector = (cause + 2) % IGBVF_MSIX_VEC_NUM; + } else if (cause >= IGB_MSIX_VEC_NUM) { qemu_log_mask(LOG_GUEST_ERROR, "igb: Tried to use vector unavailable for PF"); return; + } else { + vector = cause; } msix_notify(dev, vector); + + trace_e1000e_irq_icr_clear_eiac(core->mac[EICR], core->mac[EIAC]); + effective_eiac = core->mac[EIAC] & BIT(cause); + core->mac[EICR] &= ~effective_eiac; } static inline void @@ -1834,7 +1842,6 @@ igb_eitr_should_postpone(IGBCore *core, int idx) static void igb_send_msix(IGBCore *core) { uint32_t causes = core->mac[EICR] & core->mac[EIMS]; - uint32_t effective_eiac; int vector; for (vector = 0; vector < IGB_INTR_NUM; ++vector) { @@ -1842,10 +1849,6 @@ static void igb_send_msix(IGBCore *core) trace_e1000e_irq_msix_notify_vec(vector); igb_msix_notify(core, vector); - - trace_e1000e_irq_icr_clear_eiac(core->mac[EICR], core->mac[EIAC]); - effective_eiac = core->mac[EIAC] & BIT(vector); - core->mac[EICR] &= ~effective_eiac; } } } From 54ced75e497109b1cd9aa3ff75b863e7fea0358a Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:19 +0900 Subject: [PATCH 29/50] e1000e: Rename a variable in e1000e_receive_internal() Rename variable "n" to "causes", which properly represents the content of the variable. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 7dce448657..aea70b74d9 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1650,7 +1650,7 @@ static ssize_t e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, bool has_vnet) { - uint32_t n = 0; + uint32_t causes = 0; uint8_t buf[ETH_ZLEN]; struct iovec min_iov; size_t size, orig_size; @@ -1723,32 +1723,32 @@ e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, /* Perform small receive detection (RSRPD) */ if (total_size < core->mac[RSRPD]) { - n |= E1000_ICS_SRPD; + causes |= E1000_ICS_SRPD; } /* Perform ACK receive detection */ if (!(core->mac[RFCTL] & E1000_RFCTL_ACK_DIS) && (e1000e_is_tcp_ack(core, core->rx_pkt))) { - n |= E1000_ICS_ACK; + causes |= E1000_ICS_ACK; } /* Check if receive descriptor minimum threshold hit */ rdmts_hit = e1000e_rx_descr_threshold_hit(core, rxr.i); - n |= e1000e_rx_wb_interrupt_cause(core, rxr.i->idx, rdmts_hit); + causes |= e1000e_rx_wb_interrupt_cause(core, rxr.i->idx, rdmts_hit); trace_e1000e_rx_written_to_guest(rxr.i->idx); } else { - n |= E1000_ICS_RXO; + causes |= E1000_ICS_RXO; retval = 0; trace_e1000e_rx_not_written_to_guest(rxr.i->idx); } - if (!e1000e_intrmgr_delay_rx_causes(core, &n)) { - trace_e1000e_rx_interrupt_set(n); - e1000e_set_interrupt_cause(core, n); + if (!e1000e_intrmgr_delay_rx_causes(core, &causes)) { + trace_e1000e_rx_interrupt_set(causes); + e1000e_set_interrupt_cause(core, causes); } else { - trace_e1000e_rx_interrupt_delayed(n); + trace_e1000e_rx_interrupt_delayed(causes); } return retval; From 5c30aea4c93e90ed67ee6d279d8bdf7328953638 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:20 +0900 Subject: [PATCH 30/50] igb: Rename a variable in igb_receive_internal() Rename variable "n" to "causes", which properly represents the content of the variable. Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index edda07e564..c954369964 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1569,7 +1569,7 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, bool has_vnet, bool *external_tx) { uint16_t queues = 0; - uint32_t n = 0; + uint32_t causes = 0; union { L2Header l2_header; uint8_t octets[ETH_ZLEN]; @@ -1649,19 +1649,19 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, e1000x_fcs_len(core->mac); if (!igb_has_rxbufs(core, rxr.i, total_size)) { - n |= E1000_ICS_RXO; + causes |= E1000_ICS_RXO; trace_e1000e_rx_not_written_to_guest(rxr.i->idx); continue; } - n |= E1000_ICR_RXDW; + causes |= E1000_ICR_RXDW; igb_rx_fix_l4_csum(core, core->rx_pkt); igb_write_packet_to_guest(core, core->rx_pkt, &rxr, &rss_info); /* Check if receive descriptor minimum threshold hit */ if (igb_rx_descr_threshold_hit(core, rxr.i)) { - n |= E1000_ICS_RXDMT0; + causes |= E1000_ICS_RXDMT0; } core->mac[EICR] |= igb_rx_wb_eic(core, rxr.i->idx); @@ -1669,8 +1669,8 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, trace_e1000e_rx_written_to_guest(rxr.i->idx); } - trace_e1000e_rx_interrupt_set(n); - igb_set_interrupt_cause(core, n); + trace_e1000e_rx_interrupt_set(causes); + igb_set_interrupt_cause(core, causes); return orig_size; } From 85427bf3884f42626208865f3fe594b3919566fb Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:21 +0900 Subject: [PATCH 31/50] net/eth: Use void pointers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The uses of uint8_t pointers were misleading as they are never accessed as an array of octets and it even require more strict alignment to access as struct eth_header. Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- include/net/eth.h | 4 ++-- net/eth.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/net/eth.h b/include/net/eth.h index 05f56931e7..95ff24d6b8 100644 --- a/include/net/eth.h +++ b/include/net/eth.h @@ -342,12 +342,12 @@ eth_get_pkt_tci(const void *p) size_t eth_strip_vlan(const struct iovec *iov, int iovcnt, size_t iovoff, - uint8_t *new_ehdr_buf, + void *new_ehdr_buf, uint16_t *payload_offset, uint16_t *tci); size_t eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff, - uint16_t vet, uint8_t *new_ehdr_buf, + uint16_t vet, void *new_ehdr_buf, uint16_t *payload_offset, uint16_t *tci); uint16_t diff --git a/net/eth.c b/net/eth.c index b6ff89c460..f7ffbda600 100644 --- a/net/eth.c +++ b/net/eth.c @@ -226,11 +226,11 @@ void eth_get_protocols(const struct iovec *iov, size_t iovcnt, size_t iovoff, size_t eth_strip_vlan(const struct iovec *iov, int iovcnt, size_t iovoff, - uint8_t *new_ehdr_buf, + void *new_ehdr_buf, uint16_t *payload_offset, uint16_t *tci) { struct vlan_header vlan_hdr; - struct eth_header *new_ehdr = (struct eth_header *) new_ehdr_buf; + struct eth_header *new_ehdr = new_ehdr_buf; size_t copied = iov_to_buf(iov, iovcnt, iovoff, new_ehdr, sizeof(*new_ehdr)); @@ -276,7 +276,7 @@ eth_strip_vlan(const struct iovec *iov, int iovcnt, size_t iovoff, size_t eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff, - uint16_t vet, uint8_t *new_ehdr_buf, + uint16_t vet, void *new_ehdr_buf, uint16_t *payload_offset, uint16_t *tci) { struct vlan_header vlan_hdr; From aaa8a15c96a8a7341a3667dcc932f8c5f227d6af Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:22 +0900 Subject: [PATCH 32/50] net/eth: Always add VLAN tag It is possible to have another VLAN tag even if the packet is already tagged. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/net_tx_pkt.c | 16 +++++++--------- include/net/eth.h | 4 ++-- net/eth.c | 22 ++++++---------------- 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index ce6b102391..af8f77a3f0 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -40,7 +40,10 @@ struct NetTxPkt { struct iovec *vec; - uint8_t l2_hdr[ETH_MAX_L2_HDR_LEN]; + struct { + struct eth_header eth; + struct vlan_header vlan[3]; + } l2_hdr; union { struct ip_header ip; struct ip6_header ip6; @@ -365,18 +368,13 @@ bool net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable, void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt *pkt, uint16_t vlan, uint16_t vlan_ethtype) { - bool is_new; assert(pkt); eth_setup_vlan_headers(pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base, - vlan, vlan_ethtype, &is_new); + &pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len, + vlan, vlan_ethtype); - /* update l2hdrlen */ - if (is_new) { - pkt->hdr_len += sizeof(struct vlan_header); - pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len += - sizeof(struct vlan_header); - } + pkt->hdr_len += sizeof(struct vlan_header); } bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, void *base, size_t len) diff --git a/include/net/eth.h b/include/net/eth.h index 95ff24d6b8..048e434685 100644 --- a/include/net/eth.h +++ b/include/net/eth.h @@ -353,8 +353,8 @@ eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff, uint16_t eth_get_l3_proto(const struct iovec *l2hdr_iov, int iovcnt, size_t l2hdr_len); -void eth_setup_vlan_headers(struct eth_header *ehdr, uint16_t vlan_tag, - uint16_t vlan_ethtype, bool *is_new); +void eth_setup_vlan_headers(struct eth_header *ehdr, size_t *ehdr_size, + uint16_t vlan_tag, uint16_t vlan_ethtype); uint8_t eth_get_gso_type(uint16_t l3_proto, uint8_t *l3_hdr, uint8_t l4proto); diff --git a/net/eth.c b/net/eth.c index f7ffbda600..5307978486 100644 --- a/net/eth.c +++ b/net/eth.c @@ -21,26 +21,16 @@ #include "net/checksum.h" #include "net/tap.h" -void eth_setup_vlan_headers(struct eth_header *ehdr, uint16_t vlan_tag, - uint16_t vlan_ethtype, bool *is_new) +void eth_setup_vlan_headers(struct eth_header *ehdr, size_t *ehdr_size, + uint16_t vlan_tag, uint16_t vlan_ethtype) { struct vlan_header *vhdr = PKT_GET_VLAN_HDR(ehdr); - switch (be16_to_cpu(ehdr->h_proto)) { - case ETH_P_VLAN: - case ETH_P_DVLAN: - /* vlan hdr exists */ - *is_new = false; - break; - - default: - /* No VLAN header, put a new one */ - vhdr->h_proto = ehdr->h_proto; - ehdr->h_proto = cpu_to_be16(vlan_ethtype); - *is_new = true; - break; - } + memmove(vhdr + 1, vhdr, *ehdr_size - ETH_HLEN); vhdr->h_tci = cpu_to_be16(vlan_tag); + vhdr->h_proto = ehdr->h_proto; + ehdr->h_proto = cpu_to_be16(vlan_ethtype); + *ehdr_size += sizeof(*vhdr); } uint8_t From 7edf2f1d54454c0c0e5596b1112318cb91ec1afe Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:23 +0900 Subject: [PATCH 33/50] hw/net/net_rx_pkt: Enforce alignment for eth_header eth_strip_vlan and eth_strip_vlan_ex refers to ehdr_buf as struct eth_header. Enforce alignment for the structure. Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/net_rx_pkt.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index 6125a063d7..1de42b4f51 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -23,7 +23,10 @@ struct NetRxPkt { struct virtio_net_hdr virt_hdr; - uint8_t ehdr_buf[sizeof(struct eth_header) + sizeof(struct vlan_header)]; + struct { + struct eth_header eth; + struct vlan_header vlan; + } ehdr_buf; struct iovec *vec; uint16_t vec_len_total; uint16_t vec_len; @@ -89,7 +92,7 @@ net_rx_pkt_pull_data(struct NetRxPkt *pkt, if (pkt->ehdr_buf_len) { net_rx_pkt_iovec_realloc(pkt, iovcnt + 1); - pkt->vec[0].iov_base = pkt->ehdr_buf; + pkt->vec[0].iov_base = &pkt->ehdr_buf; pkt->vec[0].iov_len = pkt->ehdr_buf_len; pkt->tot_len = pllen + pkt->ehdr_buf_len; @@ -120,7 +123,7 @@ void net_rx_pkt_attach_iovec(struct NetRxPkt *pkt, assert(pkt); if (strip_vlan) { - pkt->ehdr_buf_len = eth_strip_vlan(iov, iovcnt, iovoff, pkt->ehdr_buf, + pkt->ehdr_buf_len = eth_strip_vlan(iov, iovcnt, iovoff, &pkt->ehdr_buf, &ploff, &tci); } else { pkt->ehdr_buf_len = 0; @@ -142,7 +145,7 @@ void net_rx_pkt_attach_iovec_ex(struct NetRxPkt *pkt, if (strip_vlan) { pkt->ehdr_buf_len = eth_strip_vlan_ex(iov, iovcnt, iovoff, vet, - pkt->ehdr_buf, + &pkt->ehdr_buf, &ploff, &tci); } else { pkt->ehdr_buf_len = 0; From c6e33a2c529088ef2be787d0704d5706bb154450 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:24 +0900 Subject: [PATCH 34/50] tests/qtest/libqos/igb: Set GPIE.Multiple_MSIX GPIE.Multiple_MSIX is not set by default, and needs to be set to get interrupts from multiple MSI-X vectors. Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- tests/qtest/libqos/igb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/qtest/libqos/igb.c b/tests/qtest/libqos/igb.c index 12fb531bf0..a603468beb 100644 --- a/tests/qtest/libqos/igb.c +++ b/tests/qtest/libqos/igb.c @@ -114,6 +114,7 @@ static void igb_pci_start_hw(QOSGraphObject *obj) e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN); /* Enable all interrupts */ + e1000e_macreg_write(&d->e1000e, E1000_GPIE, E1000_GPIE_MSIX_MODE); e1000e_macreg_write(&d->e1000e, E1000_IMS, 0xFFFFFFFF); e1000e_macreg_write(&d->e1000e, E1000_EIMS, 0xFFFFFFFF); From 191e8bde88a47303eed697a1fb56d19eb0a2a759 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:25 +0900 Subject: [PATCH 35/50] igb: Implement MSI-X single vector mode Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index c954369964..6d55b43fb4 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1873,7 +1873,7 @@ igb_update_interrupt_state(IGBCore *core) icr = core->mac[ICR] & core->mac[IMS]; - if (msix_enabled(core->owner)) { + if (core->mac[GPIE] & E1000_GPIE_MSIX_MODE) { if (icr) { causes = 0; if (icr & E1000_ICR_DRSTA) { @@ -1908,7 +1908,12 @@ igb_update_interrupt_state(IGBCore *core) trace_e1000e_irq_pending_interrupts(core->mac[ICR] & core->mac[IMS], core->mac[ICR], core->mac[IMS]); - if (msi_enabled(core->owner)) { + if (msix_enabled(core->owner)) { + if (icr) { + trace_e1000e_irq_msix_notify_vec(0); + msix_notify(core->owner, 0); + } + } else if (msi_enabled(core->owner)) { if (icr) { msi_notify(core->owner, 0); } From abc9a29d6bccb2886f86f6242fc228b7eaf5cd87 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:26 +0900 Subject: [PATCH 36/50] igb: Use UDP for RSS hash e1000e does not support using UDP for RSS hash, but igb does. Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 16 ++++++++++++++++ hw/net/igb_regs.h | 3 +++ 2 files changed, 19 insertions(+) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 6d55b43fb4..41a2e5bf7b 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -287,6 +287,11 @@ igb_rss_get_hash_type(IGBCore *core, struct NetRxPkt *pkt) return E1000_MRQ_RSS_TYPE_IPV4TCP; } + if (l4hdr_proto == ETH_L4_HDR_PROTO_UDP && + (core->mac[MRQC] & E1000_MRQC_RSS_FIELD_IPV4_UDP)) { + return E1000_MRQ_RSS_TYPE_IPV4UDP; + } + if (E1000_MRQC_EN_IPV4(core->mac[MRQC])) { return E1000_MRQ_RSS_TYPE_IPV4; } @@ -322,6 +327,11 @@ igb_rss_get_hash_type(IGBCore *core, struct NetRxPkt *pkt) return E1000_MRQ_RSS_TYPE_IPV6TCPEX; } + if (l4hdr_proto == ETH_L4_HDR_PROTO_UDP && + (core->mac[MRQC] & E1000_MRQC_RSS_FIELD_IPV6_UDP)) { + return E1000_MRQ_RSS_TYPE_IPV6UDP; + } + if (E1000_MRQC_EN_IPV6EX(core->mac[MRQC])) { return E1000_MRQ_RSS_TYPE_IPV6EX; } @@ -360,6 +370,12 @@ igb_rss_calc_hash(IGBCore *core, struct NetRxPkt *pkt, E1000E_RSSInfo *info) case E1000_MRQ_RSS_TYPE_IPV6EX: type = NetPktRssIpV6Ex; break; + case E1000_MRQ_RSS_TYPE_IPV4UDP: + type = NetPktRssIpV4Udp; + break; + case E1000_MRQ_RSS_TYPE_IPV6UDP: + type = NetPktRssIpV6Udp; + break; default: assert(false); return 0; diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h index eb995d8b2e..e6ac26dc0e 100644 --- a/hw/net/igb_regs.h +++ b/hw/net/igb_regs.h @@ -659,6 +659,9 @@ union e1000_adv_rx_desc { #define E1000_RSS_QUEUE(reta, hash) (E1000_RETA_VAL(reta, hash) & 0x0F) +#define E1000_MRQ_RSS_TYPE_IPV4UDP 7 +#define E1000_MRQ_RSS_TYPE_IPV6UDP 8 + #define E1000_STATUS_IOV_MODE 0x00040000 #define E1000_STATUS_NUM_VFS_SHIFT 14 From 907209e3111dd62a553a19319b422ff8aba5b9c0 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:27 +0900 Subject: [PATCH 37/50] igb: Implement Rx SCTP CSO Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 5 +++ hw/net/igb_core.c | 15 ++++++++- hw/net/igb_regs.h | 1 + hw/net/net_rx_pkt.c | 72 ++++++++++++++++++++++++++++++++++--------- include/net/eth.h | 4 ++- include/qemu/crc32c.h | 1 + net/eth.c | 4 +++ util/crc32c.c | 8 +++++ 8 files changed, 93 insertions(+), 17 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index aea70b74d9..0b939ff5a3 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1114,6 +1114,11 @@ e1000e_verify_csum_in_sw(E1000ECore *core, return; } + if (l4hdr_proto != ETH_L4_HDR_PROTO_TCP && + l4hdr_proto != ETH_L4_HDR_PROTO_UDP) { + return; + } + if (!net_rx_pkt_validate_l4_csum(pkt, &csum_valid)) { trace_e1000e_rx_metadata_l4_csum_validation_failed(); return; diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 41a2e5bf7b..95d46d6e6d 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1220,7 +1220,7 @@ igb_build_rx_metadata(IGBCore *core, uint16_t *vlan_tag) { struct virtio_net_hdr *vhdr; - bool hasip4, hasip6; + bool hasip4, hasip6, csum_valid; EthL4HdrProto l4hdr_proto; *status_flags = E1000_RXD_STAT_DD; @@ -1280,6 +1280,10 @@ igb_build_rx_metadata(IGBCore *core, *pkt_info |= E1000_ADVRXD_PKT_UDP; break; + case ETH_L4_HDR_PROTO_SCTP: + *pkt_info |= E1000_ADVRXD_PKT_SCTP; + break; + default: break; } @@ -1312,6 +1316,15 @@ igb_build_rx_metadata(IGBCore *core, if (igb_rx_l4_cso_enabled(core)) { switch (l4hdr_proto) { + case ETH_L4_HDR_PROTO_SCTP: + if (!net_rx_pkt_validate_l4_csum(pkt, &csum_valid)) { + trace_e1000e_rx_metadata_l4_csum_validation_failed(); + goto func_exit; + } + if (!csum_valid) { + *status_flags |= E1000_RXDEXT_STATERR_TCPE; + } + /* fall through */ case ETH_L4_HDR_PROTO_TCP: *status_flags |= E1000_RXD_STAT_TCPCS; break; diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h index e6ac26dc0e..4b4ebd3369 100644 --- a/hw/net/igb_regs.h +++ b/hw/net/igb_regs.h @@ -670,6 +670,7 @@ union e1000_adv_rx_desc { #define E1000_ADVRXD_PKT_IP6 BIT(6) #define E1000_ADVRXD_PKT_TCP BIT(8) #define E1000_ADVRXD_PKT_UDP BIT(9) +#define E1000_ADVRXD_PKT_SCTP BIT(10) static inline uint8_t igb_ivar_entry_rx(uint8_t i) { diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index 1de42b4f51..3575c8b9f9 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -16,6 +16,7 @@ */ #include "qemu/osdep.h" +#include "qemu/crc32c.h" #include "trace.h" #include "net_rx_pkt.h" #include "net/checksum.h" @@ -554,32 +555,73 @@ _net_rx_pkt_calc_l4_csum(struct NetRxPkt *pkt) return csum; } +static bool +_net_rx_pkt_validate_sctp_sum(struct NetRxPkt *pkt) +{ + size_t csum_off; + size_t off = pkt->l4hdr_off; + size_t vec_len = pkt->vec_len; + struct iovec *vec; + uint32_t calculated = 0; + uint32_t original; + bool valid; + + for (vec = pkt->vec; vec->iov_len < off; vec++) { + off -= vec->iov_len; + vec_len--; + } + + csum_off = off + 8; + + if (!iov_to_buf(vec, vec_len, csum_off, &original, sizeof(original))) { + return false; + } + + if (!iov_from_buf(vec, vec_len, csum_off, + &calculated, sizeof(calculated))) { + return false; + } + + calculated = crc32c(0xffffffff, + (uint8_t *)vec->iov_base + off, vec->iov_len - off); + calculated = iov_crc32c(calculated ^ 0xffffffff, vec + 1, vec_len - 1); + valid = calculated == le32_to_cpu(original); + iov_from_buf(vec, vec_len, csum_off, &original, sizeof(original)); + + return valid; +} + bool net_rx_pkt_validate_l4_csum(struct NetRxPkt *pkt, bool *csum_valid) { - uint16_t csum; + uint32_t csum; trace_net_rx_pkt_l4_csum_validate_entry(); - if (pkt->l4hdr_info.proto != ETH_L4_HDR_PROTO_TCP && - pkt->l4hdr_info.proto != ETH_L4_HDR_PROTO_UDP) { - trace_net_rx_pkt_l4_csum_validate_not_xxp(); - return false; - } - - if (pkt->l4hdr_info.proto == ETH_L4_HDR_PROTO_UDP && - pkt->l4hdr_info.hdr.udp.uh_sum == 0) { - trace_net_rx_pkt_l4_csum_validate_udp_with_no_checksum(); - return false; - } - if (pkt->hasip4 && pkt->ip4hdr_info.fragment) { trace_net_rx_pkt_l4_csum_validate_ip4_fragment(); return false; } - csum = _net_rx_pkt_calc_l4_csum(pkt); + switch (pkt->l4hdr_info.proto) { + case ETH_L4_HDR_PROTO_UDP: + if (pkt->l4hdr_info.hdr.udp.uh_sum == 0) { + trace_net_rx_pkt_l4_csum_validate_udp_with_no_checksum(); + return false; + } + /* fall through */ + case ETH_L4_HDR_PROTO_TCP: + csum = _net_rx_pkt_calc_l4_csum(pkt); + *csum_valid = ((csum == 0) || (csum == 0xFFFF)); + break; - *csum_valid = ((csum == 0) || (csum == 0xFFFF)); + case ETH_L4_HDR_PROTO_SCTP: + *csum_valid = _net_rx_pkt_validate_sctp_sum(pkt); + break; + + default: + trace_net_rx_pkt_l4_csum_validate_not_xxp(); + return false; + } trace_net_rx_pkt_l4_csum_validate_csum(*csum_valid); diff --git a/include/net/eth.h b/include/net/eth.h index 048e434685..75e7f1551c 100644 --- a/include/net/eth.h +++ b/include/net/eth.h @@ -224,6 +224,7 @@ struct tcp_hdr { #define IP_HEADER_VERSION_6 (6) #define IP_PROTO_TCP (6) #define IP_PROTO_UDP (17) +#define IP_PROTO_SCTP (132) #define IPTOS_ECN_MASK 0x03 #define IPTOS_ECN(x) ((x) & IPTOS_ECN_MASK) #define IPTOS_ECN_CE 0x03 @@ -379,7 +380,8 @@ typedef struct eth_ip4_hdr_info_st { typedef enum EthL4HdrProto { ETH_L4_HDR_PROTO_INVALID, ETH_L4_HDR_PROTO_TCP, - ETH_L4_HDR_PROTO_UDP + ETH_L4_HDR_PROTO_UDP, + ETH_L4_HDR_PROTO_SCTP } EthL4HdrProto; typedef struct eth_l4_hdr_info_st { diff --git a/include/qemu/crc32c.h b/include/qemu/crc32c.h index 5b78884c38..88b4d2b3b3 100644 --- a/include/qemu/crc32c.h +++ b/include/qemu/crc32c.h @@ -30,5 +30,6 @@ uint32_t crc32c(uint32_t crc, const uint8_t *data, unsigned int length); +uint32_t iov_crc32c(uint32_t crc, const struct iovec *iov, size_t iov_cnt); #endif diff --git a/net/eth.c b/net/eth.c index 5307978486..7f02aea010 100644 --- a/net/eth.c +++ b/net/eth.c @@ -211,6 +211,10 @@ void eth_get_protocols(const struct iovec *iov, size_t iovcnt, size_t iovoff, *l5hdr_off = *l4hdr_off + sizeof(l4hdr_info->hdr.udp); } break; + + case IP_PROTO_SCTP: + l4hdr_info->proto = ETH_L4_HDR_PROTO_SCTP; + break; } } diff --git a/util/crc32c.c b/util/crc32c.c index 762657d853..ea7f345de8 100644 --- a/util/crc32c.c +++ b/util/crc32c.c @@ -113,3 +113,11 @@ uint32_t crc32c(uint32_t crc, const uint8_t *data, unsigned int length) return crc^0xffffffff; } +uint32_t iov_crc32c(uint32_t crc, const struct iovec *iov, size_t iov_cnt) +{ + while (iov_cnt--) { + crc = crc32c(crc, iov->iov_base, iov->iov_len) ^ 0xffffffff; + iov++; + } + return crc ^ 0xffffffff; +} From f199b13bc113c46eaddcf9f375d13f1e400b4e35 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:28 +0900 Subject: [PATCH 38/50] igb: Implement Tx SCTP CSO Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 12 +++++++----- hw/net/net_tx_pkt.c | 18 ++++++++++++++++++ hw/net/net_tx_pkt.h | 8 ++++++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 95d46d6e6d..5eacf1cd8c 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -440,8 +440,9 @@ igb_tx_insert_vlan(IGBCore *core, uint16_t qn, struct igb_tx *tx, static bool igb_setup_tx_offloads(IGBCore *core, struct igb_tx *tx) { + uint32_t idx = (tx->first_olinfo_status >> 4) & 1; + if (tx->first_cmd_type_len & E1000_ADVTXD_DCMD_TSE) { - uint32_t idx = (tx->first_olinfo_status >> 4) & 1; uint32_t mss = tx->ctx[idx].mss_l4len_idx >> E1000_ADVTXD_MSS_SHIFT; if (!net_tx_pkt_build_vheader(tx->tx_pkt, true, true, mss)) { return false; @@ -452,10 +453,11 @@ igb_setup_tx_offloads(IGBCore *core, struct igb_tx *tx) return true; } - if (tx->first_olinfo_status & E1000_ADVTXD_POTS_TXSM) { - if (!net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0)) { - return false; - } + if ((tx->first_olinfo_status & E1000_ADVTXD_POTS_TXSM) && + !((tx->ctx[idx].type_tucmd_mlhl & E1000_ADVTXD_TUCMD_L4T_SCTP) ? + net_tx_pkt_update_sctp_checksum(tx->tx_pkt) : + net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0))) { + return false; } if (tx->first_olinfo_status & E1000_ADVTXD_POTS_IXSM) { diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c index af8f77a3f0..2e5f58b3c9 100644 --- a/hw/net/net_tx_pkt.c +++ b/hw/net/net_tx_pkt.c @@ -16,6 +16,7 @@ */ #include "qemu/osdep.h" +#include "qemu/crc32c.h" #include "net/eth.h" #include "net/checksum.h" #include "net/tap.h" @@ -135,6 +136,23 @@ void net_tx_pkt_update_ip_checksums(struct NetTxPkt *pkt) pkt->virt_hdr.csum_offset, &csum, sizeof(csum)); } +bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt) +{ + uint32_t csum = 0; + struct iovec *pl_start_frag = pkt->vec + NET_TX_PKT_PL_START_FRAG; + + if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, &csum, sizeof(csum)) < sizeof(csum)) { + return false; + } + + csum = cpu_to_le32(iov_crc32c(0xffffffff, pl_start_frag, pkt->payload_frags)); + if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, &csum, sizeof(csum)) < sizeof(csum)) { + return false; + } + + return true; +} + static void net_tx_pkt_calculate_hdr_len(struct NetTxPkt *pkt) { pkt->hdr_len = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len + diff --git a/hw/net/net_tx_pkt.h b/hw/net/net_tx_pkt.h index 4d7233e975..0a716e74a5 100644 --- a/hw/net/net_tx_pkt.h +++ b/hw/net/net_tx_pkt.h @@ -116,6 +116,14 @@ void net_tx_pkt_update_ip_checksums(struct NetTxPkt *pkt); */ void net_tx_pkt_update_ip_hdr_checksum(struct NetTxPkt *pkt); +/** + * Calculate the SCTP checksum. + * + * @pkt: packet + * + */ +bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt); + /** * get length of all populated data. * From 7e64a9cabb6b8fe12d315e355e0c362e1453f227 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:29 +0900 Subject: [PATCH 39/50] igb: Strip the second VLAN tag for extended VLAN Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 3 ++- hw/net/igb_core.c | 14 ++++++++++-- hw/net/net_rx_pkt.c | 15 +++++------- hw/net/net_rx_pkt.h | 19 ++++++++-------- include/net/eth.h | 4 ++-- net/eth.c | 54 ++++++++++++++++++++++++++++---------------- 6 files changed, 66 insertions(+), 43 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 0b939ff5a3..d601386992 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -1711,7 +1711,8 @@ e1000e_receive_internal(E1000ECore *core, const struct iovec *iov, int iovcnt, } net_rx_pkt_attach_iovec_ex(core->rx_pkt, iov, iovcnt, iov_ofs, - e1000x_vlan_enabled(core->mac), core->mac[VET]); + e1000x_vlan_enabled(core->mac) ? 0 : -1, + core->mac[VET], 0); e1000e_rss_parse_packet(core, core->rx_pkt, &rss_info); e1000e_rx_ring_init(core, &rxr, rss_info.queue); diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 5eacf1cd8c..688eaf7319 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -1611,6 +1611,7 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, E1000E_RxRing rxr; E1000E_RSSInfo rss_info; size_t total_size; + int strip_vlan_index; int i; trace_e1000e_rx_receive_iov(iovcnt); @@ -1672,9 +1673,18 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, igb_rx_ring_init(core, &rxr, i); + if (!igb_rx_strip_vlan(core, rxr.i)) { + strip_vlan_index = -1; + } else if (core->mac[CTRL_EXT] & BIT(26)) { + strip_vlan_index = 1; + } else { + strip_vlan_index = 0; + } + net_rx_pkt_attach_iovec_ex(core->rx_pkt, iov, iovcnt, iov_ofs, - igb_rx_strip_vlan(core, rxr.i), - core->mac[VET] & 0xffff); + strip_vlan_index, + core->mac[VET] & 0xffff, + core->mac[VET] >> 16); total_size = net_rx_pkt_get_total_len(core->rx_pkt) + e1000x_fcs_len(core->mac); diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index 3575c8b9f9..32e5f3f9cf 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -137,20 +137,17 @@ void net_rx_pkt_attach_iovec(struct NetRxPkt *pkt, void net_rx_pkt_attach_iovec_ex(struct NetRxPkt *pkt, const struct iovec *iov, int iovcnt, - size_t iovoff, bool strip_vlan, - uint16_t vet) + size_t iovoff, int strip_vlan_index, + uint16_t vet, uint16_t vet_ext) { uint16_t tci = 0; uint16_t ploff = iovoff; assert(pkt); - if (strip_vlan) { - pkt->ehdr_buf_len = eth_strip_vlan_ex(iov, iovcnt, iovoff, vet, - &pkt->ehdr_buf, - &ploff, &tci); - } else { - pkt->ehdr_buf_len = 0; - } + pkt->ehdr_buf_len = eth_strip_vlan_ex(iov, iovcnt, iovoff, + strip_vlan_index, vet, vet_ext, + &pkt->ehdr_buf, + &ploff, &tci); pkt->tci = tci; diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h index ce8dbdb284..55ec67a1a7 100644 --- a/hw/net/net_rx_pkt.h +++ b/hw/net/net_rx_pkt.h @@ -223,18 +223,19 @@ void net_rx_pkt_attach_iovec(struct NetRxPkt *pkt, /** * attach scatter-gather data to rx packet * -* @pkt: packet -* @iov: received data scatter-gather list -* @iovcnt number of elements in iov -* @iovoff data start offset in the iov -* @strip_vlan: should the module strip vlan from data -* @vet: VLAN tag Ethernet type +* @pkt: packet +* @iov: received data scatter-gather list +* @iovcnt: number of elements in iov +* @iovoff: data start offset in the iov +* @strip_vlan_index: index of Q tag if it is to be stripped. negative otherwise. +* @vet: VLAN tag Ethernet type +* @vet_ext: outer VLAN tag Ethernet type * */ void net_rx_pkt_attach_iovec_ex(struct NetRxPkt *pkt, - const struct iovec *iov, int iovcnt, - size_t iovoff, bool strip_vlan, - uint16_t vet); + const struct iovec *iov, int iovcnt, + size_t iovoff, int strip_vlan_index, + uint16_t vet, uint16_t vet_ext); /** * attach data to rx packet diff --git a/include/net/eth.h b/include/net/eth.h index 75e7f1551c..3b80b6e07f 100644 --- a/include/net/eth.h +++ b/include/net/eth.h @@ -347,8 +347,8 @@ eth_strip_vlan(const struct iovec *iov, int iovcnt, size_t iovoff, uint16_t *payload_offset, uint16_t *tci); size_t -eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff, - uint16_t vet, void *new_ehdr_buf, +eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff, int index, + uint16_t vet, uint16_t vet_ext, void *new_ehdr_buf, uint16_t *payload_offset, uint16_t *tci); uint16_t diff --git a/net/eth.c b/net/eth.c index 7f02aea010..649e66bb1f 100644 --- a/net/eth.c +++ b/net/eth.c @@ -269,36 +269,50 @@ eth_strip_vlan(const struct iovec *iov, int iovcnt, size_t iovoff, } size_t -eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff, - uint16_t vet, void *new_ehdr_buf, +eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff, int index, + uint16_t vet, uint16_t vet_ext, void *new_ehdr_buf, uint16_t *payload_offset, uint16_t *tci) { struct vlan_header vlan_hdr; - struct eth_header *new_ehdr = (struct eth_header *) new_ehdr_buf; + uint16_t *new_ehdr_proto; + size_t new_ehdr_size; + size_t copied; - size_t copied = iov_to_buf(iov, iovcnt, iovoff, - new_ehdr, sizeof(*new_ehdr)); + switch (index) { + case 0: + new_ehdr_proto = &PKT_GET_ETH_HDR(new_ehdr_buf)->h_proto; + new_ehdr_size = sizeof(struct eth_header); + copied = iov_to_buf(iov, iovcnt, iovoff, new_ehdr_buf, new_ehdr_size); + break; - if (copied < sizeof(*new_ehdr)) { + case 1: + new_ehdr_proto = &PKT_GET_VLAN_HDR(new_ehdr_buf)->h_proto; + new_ehdr_size = sizeof(struct eth_header) + sizeof(struct vlan_header); + copied = iov_to_buf(iov, iovcnt, iovoff, new_ehdr_buf, new_ehdr_size); + if (be16_to_cpu(PKT_GET_ETH_HDR(new_ehdr_buf)->h_proto) != vet_ext) { + return 0; + } + break; + + default: return 0; } - if (be16_to_cpu(new_ehdr->h_proto) == vet) { - copied = iov_to_buf(iov, iovcnt, iovoff + sizeof(*new_ehdr), - &vlan_hdr, sizeof(vlan_hdr)); - - if (copied < sizeof(vlan_hdr)) { - return 0; - } - - new_ehdr->h_proto = vlan_hdr.h_proto; - - *tci = be16_to_cpu(vlan_hdr.h_tci); - *payload_offset = iovoff + sizeof(*new_ehdr) + sizeof(vlan_hdr); - return sizeof(struct eth_header); + if (copied < new_ehdr_size || be16_to_cpu(*new_ehdr_proto) != vet) { + return 0; } - return 0; + copied = iov_to_buf(iov, iovcnt, iovoff + new_ehdr_size, + &vlan_hdr, sizeof(vlan_hdr)); + if (copied < sizeof(vlan_hdr)) { + return 0; + } + + *new_ehdr_proto = vlan_hdr.h_proto; + *payload_offset = iovoff + new_ehdr_size + sizeof(vlan_hdr); + *tci = be16_to_cpu(vlan_hdr.h_tci); + + return new_ehdr_size; } void From 6aa262f8e39d2fcb5c0088badc6ee930e52bb36c Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:30 +0900 Subject: [PATCH 40/50] igb: Filter with the second VLAN tag for extended VLAN Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/igb_core.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 688eaf7319..5345f57031 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -69,7 +69,7 @@ typedef struct IGBTxPktVmdqCallbackContext { typedef struct L2Header { struct eth_header eth; - struct vlan_header vlan; + struct vlan_header vlan[2]; } L2Header; static ssize_t @@ -1001,7 +1001,7 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, uint32_t f, ra[2], *macp, rctl = core->mac[RCTL]; uint16_t queues = 0; uint16_t oversized = 0; - uint16_t vid = be16_to_cpu(l2_header->vlan.h_tci) & VLAN_VID_MASK; + size_t vlan_num = 0; int i; memset(rss_info, 0, sizeof(E1000E_RSSInfo)); @@ -1010,8 +1010,19 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, *external_tx = true; } - if (e1000x_is_vlan_packet(ehdr, core->mac[VET] & 0xffff) && - !e1000x_rx_vlan_filter(core->mac, PKT_GET_VLAN_HDR(ehdr))) { + if (core->mac[CTRL_EXT] & BIT(26)) { + if (be16_to_cpu(ehdr->h_proto) == core->mac[VET] >> 16 && + be16_to_cpu(l2_header->vlan[0].h_proto) == (core->mac[VET] & 0xffff)) { + vlan_num = 2; + } + } else { + if (be16_to_cpu(ehdr->h_proto) == (core->mac[VET] & 0xffff)) { + vlan_num = 1; + } + } + + if (vlan_num && + !e1000x_rx_vlan_filter(core->mac, l2_header->vlan + vlan_num - 1)) { return queues; } @@ -1065,7 +1076,9 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, if (e1000x_vlan_rx_filter_enabled(core->mac)) { uint16_t mask = 0; - if (e1000x_is_vlan_packet(ehdr, core->mac[VET] & 0xffff)) { + if (vlan_num) { + uint16_t vid = be16_to_cpu(l2_header->vlan[vlan_num - 1].h_tci) & VLAN_VID_MASK; + for (i = 0; i < E1000_VLVF_ARRAY_SIZE; i++) { if ((core->mac[VLVF0 + i] & E1000_VLVF_VLANID_MASK) == vid && (core->mac[VLVF0 + i] & E1000_VLVF_VLANID_ENABLE)) { From bb97003e731993fc1107bf6c663a68dbe1f60621 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:31 +0900 Subject: [PATCH 41/50] igb: Implement igb-specific oversize check igb has a configurable size limit for LPE, and uses different limits depending on whether the packet is treated as a VLAN packet. Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 5345f57031..c04ec01117 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -980,16 +980,13 @@ igb_rx_l4_cso_enabled(IGBCore *core) return !!(core->mac[RXCSUM] & E1000_RXCSUM_TUOFLD); } -static bool -igb_rx_is_oversized(IGBCore *core, uint16_t qn, size_t size) +static bool igb_rx_is_oversized(IGBCore *core, const struct eth_header *ehdr, + size_t size, size_t vlan_num, + bool lpe, uint16_t rlpml) { - uint16_t pool = qn % IGB_NUM_VM_POOLS; - bool lpe = !!(core->mac[VMOLR0 + pool] & E1000_VMOLR_LPE); - int max_ethernet_lpe_size = - core->mac[VMOLR0 + pool] & E1000_VMOLR_RLPML_MASK; - int max_ethernet_vlan_size = 1522; - - return size > (lpe ? max_ethernet_lpe_size : max_ethernet_vlan_size); + size_t vlan_header_size = sizeof(struct vlan_header) * vlan_num; + size_t header_size = sizeof(struct eth_header) + vlan_header_size; + return lpe ? size + ETH_FCS_LEN > rlpml : size > header_size + ETH_MTU; } static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, @@ -1002,6 +999,8 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, uint16_t queues = 0; uint16_t oversized = 0; size_t vlan_num = 0; + bool lpe; + uint16_t rlpml; int i; memset(rss_info, 0, sizeof(E1000E_RSSInfo)); @@ -1021,6 +1020,14 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, } } + lpe = !!(core->mac[RCTL] & E1000_RCTL_LPE); + rlpml = core->mac[RLPML]; + if (!(core->mac[RCTL] & E1000_RCTL_SBP) && + igb_rx_is_oversized(core, ehdr, size, vlan_num, lpe, rlpml)) { + trace_e1000x_rx_oversized(size); + return queues; + } + if (vlan_num && !e1000x_rx_vlan_filter(core->mac, l2_header->vlan + vlan_num - 1)) { return queues; @@ -1106,7 +1113,11 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, queues &= core->mac[VFRE]; if (queues) { for (i = 0; i < IGB_NUM_VM_POOLS; i++) { - if ((queues & BIT(i)) && igb_rx_is_oversized(core, i, size)) { + lpe = !!(core->mac[VMOLR0 + i] & E1000_VMOLR_LPE); + rlpml = core->mac[VMOLR0 + i] & E1000_VMOLR_RLPML_MASK; + if ((queues & BIT(i)) && + igb_rx_is_oversized(core, ehdr, size, vlan_num, + lpe, rlpml)) { oversized |= BIT(i); } } @@ -1662,11 +1673,6 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, iov_to_buf(iov, iovcnt, iov_ofs, &buf, sizeof(buf.l2_header)); } - /* Discard oversized packets if !LPE and !SBP. */ - if (e1000x_is_oversized(core->mac, size)) { - return orig_size; - } - net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(&buf.l2_header.eth)); net_rx_pkt_set_protocols(core->rx_pkt, iov, iovcnt, iov_ofs); From 3a9926d939f86243e9fff28b516411236999e3c4 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:32 +0900 Subject: [PATCH 42/50] igb: Implement Rx PTP2 timestamp Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/igb_common.h | 16 +++--- hw/net/igb_core.c | 129 ++++++++++++++++++++++++++++++++------------ hw/net/igb_regs.h | 23 ++++++++ 3 files changed, 127 insertions(+), 41 deletions(-) diff --git a/hw/net/igb_common.h b/hw/net/igb_common.h index f2a9065791..5c261ba9d3 100644 --- a/hw/net/igb_common.h +++ b/hw/net/igb_common.h @@ -51,7 +51,7 @@ defreg_indexeda(x, 0), defreg_indexeda(x, 1), \ defreg_indexeda(x, 2), defreg_indexeda(x, 3) -#define defregv(x) defreg_indexed(x, 0), defreg_indexed(x, 1), \ +#define defreg8(x) defreg_indexed(x, 0), defreg_indexed(x, 1), \ defreg_indexed(x, 2), defreg_indexed(x, 3), \ defreg_indexed(x, 4), defreg_indexed(x, 5), \ defreg_indexed(x, 6), defreg_indexed(x, 7) @@ -122,6 +122,8 @@ enum { defreg(EICS), defreg(EIMS), defreg(EIMC), defreg(EIAM), defreg(EICR), defreg(IVAR_MISC), defreg(GPIE), + defreg(TSYNCRXCFG), defreg8(ETQF), + defreg(RXPBS), defregd(RDBAL), defregd(RDBAH), defregd(RDLEN), defregd(SRRCTL), defregd(RDH), defregd(RDT), defregd(RXDCTL), defregd(RXCTL), defregd(RQDPC), defreg(RA2), @@ -133,15 +135,15 @@ enum { defreg(VT_CTL), - defregv(P2VMAILBOX), defregv(V2PMAILBOX), defreg(MBVFICR), defreg(MBVFIMR), + defreg8(P2VMAILBOX), defreg8(V2PMAILBOX), defreg(MBVFICR), defreg(MBVFIMR), defreg(VFLRE), defreg(VFRE), defreg(VFTE), defreg(WVBR), defreg(QDE), defreg(DTXSWC), defreg_indexed(VLVF, 0), - defregv(VMOLR), defreg(RPLOLR), defregv(VMBMEM), defregv(VMVIR), + defreg8(VMOLR), defreg(RPLOLR), defreg8(VMBMEM), defreg8(VMVIR), - defregv(PVTCTRL), defregv(PVTEICS), defregv(PVTEIMS), defregv(PVTEIMC), - defregv(PVTEIAC), defregv(PVTEIAM), defregv(PVTEICR), defregv(PVFGPRC), - defregv(PVFGPTC), defregv(PVFGORC), defregv(PVFGOTC), defregv(PVFMPRC), - defregv(PVFGPRLBC), defregv(PVFGPTLBC), defregv(PVFGORLBC), defregv(PVFGOTLBC), + defreg8(PVTCTRL), defreg8(PVTEICS), defreg8(PVTEIMS), defreg8(PVTEIMC), + defreg8(PVTEIAC), defreg8(PVTEIAM), defreg8(PVTEICR), defreg8(PVFGPRC), + defreg8(PVFGPTC), defreg8(PVFGORC), defreg8(PVFGOTC), defreg8(PVFMPRC), + defreg8(PVFGPRLBC), defreg8(PVFGPTLBC), defreg8(PVFGORLBC), defreg8(PVFGOTLBC), defreg(MTA_A), diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index c04ec01117..43d23c7621 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -72,6 +72,24 @@ typedef struct L2Header { struct vlan_header vlan[2]; } L2Header; +typedef struct PTP2 { + uint8_t message_id_transport_specific; + uint8_t version_ptp; + uint16_t message_length; + uint8_t subdomain_number; + uint8_t reserved0; + uint16_t flags; + uint64_t correction; + uint8_t reserved1[5]; + uint8_t source_communication_technology; + uint32_t source_uuid_lo; + uint16_t source_uuid_hi; + uint16_t source_port_id; + uint16_t sequence_id; + uint8_t control; + uint8_t log_message_period; +} PTP2; + static ssize_t igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, bool has_vnet, bool *external_tx); @@ -989,9 +1007,11 @@ static bool igb_rx_is_oversized(IGBCore *core, const struct eth_header *ehdr, return lpe ? size + ETH_FCS_LEN > rlpml : size > header_size + ETH_MTU; } -static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, - size_t size, E1000E_RSSInfo *rss_info, - bool *external_tx) +static uint16_t igb_receive_assign(IGBCore *core, const struct iovec *iov, + size_t iovcnt, size_t iov_ofs, + const L2Header *l2_header, size_t size, + E1000E_RSSInfo *rss_info, + uint16_t *etqf, bool *ts, bool *external_tx) { static const int ta_shift[] = { 4, 3, 2, 0 }; const struct eth_header *ehdr = &l2_header->eth; @@ -999,11 +1019,13 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, uint16_t queues = 0; uint16_t oversized = 0; size_t vlan_num = 0; + PTP2 ptp2; bool lpe; uint16_t rlpml; int i; memset(rss_info, 0, sizeof(E1000E_RSSInfo)); + *ts = false; if (external_tx) { *external_tx = true; @@ -1028,6 +1050,26 @@ static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header, return queues; } + for (*etqf = 0; *etqf < 8; (*etqf)++) { + if ((core->mac[ETQF0 + *etqf] & E1000_ETQF_FILTER_ENABLE) && + be16_to_cpu(ehdr->h_proto) == (core->mac[ETQF0 + *etqf] & E1000_ETQF_ETYPE_MASK)) { + if ((core->mac[ETQF0 + *etqf] & E1000_ETQF_1588) && + (core->mac[TSYNCRXCTL] & E1000_TSYNCRXCTL_ENABLED) && + !(core->mac[TSYNCRXCTL] & E1000_TSYNCRXCTL_VALID) && + iov_to_buf(iov, iovcnt, iov_ofs + ETH_HLEN, &ptp2, sizeof(ptp2)) >= sizeof(ptp2) && + (ptp2.version_ptp & 15) == 2 && + ptp2.message_id_transport_specific == ((core->mac[TSYNCRXCFG] >> 8) & 255)) { + e1000x_timestamp(core->mac, core->timadj, RXSTMPL, RXSTMPH); + *ts = true; + core->mac[TSYNCRXCTL] |= E1000_TSYNCRXCTL_VALID; + core->mac[RXSATRL] = le32_to_cpu(ptp2.source_uuid_lo); + core->mac[RXSATRH] = le16_to_cpu(ptp2.source_uuid_hi) | + (le16_to_cpu(ptp2.sequence_id) << 16); + } + break; + } + } + if (vlan_num && !e1000x_rx_vlan_filter(core->mac, l2_header->vlan + vlan_num - 1)) { return queues; @@ -1238,7 +1280,7 @@ static void igb_build_rx_metadata(IGBCore *core, struct NetRxPkt *pkt, bool is_eop, - const E1000E_RSSInfo *rss_info, + const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts, uint16_t *pkt_info, uint16_t *hdr_info, uint32_t *rss, uint32_t *status_flags, @@ -1289,29 +1331,33 @@ igb_build_rx_metadata(IGBCore *core, if (pkt_info) { *pkt_info = rss_info->enabled ? rss_info->type : 0; - if (hasip4) { - *pkt_info |= E1000_ADVRXD_PKT_IP4; - } + if (etqf < 8) { + *pkt_info |= (BIT(11) | etqf) << 4; + } else { + if (hasip4) { + *pkt_info |= E1000_ADVRXD_PKT_IP4; + } - if (hasip6) { - *pkt_info |= E1000_ADVRXD_PKT_IP6; - } + if (hasip6) { + *pkt_info |= E1000_ADVRXD_PKT_IP6; + } - switch (l4hdr_proto) { - case ETH_L4_HDR_PROTO_TCP: - *pkt_info |= E1000_ADVRXD_PKT_TCP; - break; + switch (l4hdr_proto) { + case ETH_L4_HDR_PROTO_TCP: + *pkt_info |= E1000_ADVRXD_PKT_TCP; + break; - case ETH_L4_HDR_PROTO_UDP: - *pkt_info |= E1000_ADVRXD_PKT_UDP; - break; + case ETH_L4_HDR_PROTO_UDP: + *pkt_info |= E1000_ADVRXD_PKT_UDP; + break; - case ETH_L4_HDR_PROTO_SCTP: - *pkt_info |= E1000_ADVRXD_PKT_SCTP; - break; + case ETH_L4_HDR_PROTO_SCTP: + *pkt_info |= E1000_ADVRXD_PKT_SCTP; + break; - default: - break; + default: + break; + } } } @@ -1319,6 +1365,10 @@ igb_build_rx_metadata(IGBCore *core, *hdr_info = 0; } + if (ts) { + *status_flags |= BIT(16); + } + /* RX CSO information */ if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) { trace_e1000e_rx_metadata_ipv6_sum_disabled(); @@ -1374,7 +1424,7 @@ func_exit: static inline void igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc, struct NetRxPkt *pkt, - const E1000E_RSSInfo *rss_info, + const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts, uint16_t length) { uint32_t status_flags, rss; @@ -1385,7 +1435,7 @@ igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc, desc->csum = 0; igb_build_rx_metadata(core, pkt, pkt != NULL, - rss_info, + rss_info, etqf, ts, NULL, NULL, &rss, &status_flags, &ip_id, &desc->special); @@ -1396,7 +1446,7 @@ igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc, static inline void igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc, struct NetRxPkt *pkt, - const E1000E_RSSInfo *rss_info, + const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts, uint16_t length) { memset(&desc->wb, 0, sizeof(desc->wb)); @@ -1404,7 +1454,7 @@ igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc, desc->wb.upper.length = cpu_to_le16(length); igb_build_rx_metadata(core, pkt, pkt != NULL, - rss_info, + rss_info, etqf, ts, &desc->wb.lower.lo_dword.pkt_info, &desc->wb.lower.lo_dword.hdr_info, &desc->wb.lower.hi_dword.rss, @@ -1415,12 +1465,15 @@ igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc, static inline void igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc, -struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info, uint16_t length) + struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info, + uint16_t etqf, bool ts, uint16_t length) { if (igb_rx_use_legacy_descriptor(core)) { - igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, length); + igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info, + etqf, ts, length); } else { - igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info, length); + igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info, + etqf, ts, length); } } @@ -1497,7 +1550,8 @@ igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi) static void igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt, const E1000E_RxRing *rxr, - const E1000E_RSSInfo *rss_info) + const E1000E_RSSInfo *rss_info, + uint16_t etqf, bool ts) { PCIDevice *d; dma_addr_t base; @@ -1579,7 +1633,7 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt, } igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL, - rss_info, written); + rss_info, etqf, ts, written); igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len); igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN); @@ -1634,6 +1688,8 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, size_t iov_ofs = 0; E1000E_RxRing rxr; E1000E_RSSInfo rss_info; + uint16_t etqf; + bool ts; size_t total_size; int strip_vlan_index; int i; @@ -1677,8 +1733,9 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, get_eth_packet_type(&buf.l2_header.eth)); net_rx_pkt_set_protocols(core->rx_pkt, iov, iovcnt, iov_ofs); - queues = igb_receive_assign(core, &buf.l2_header, size, - &rss_info, external_tx); + queues = igb_receive_assign(core, iov, iovcnt, iov_ofs, + &buf.l2_header, size, + &rss_info, &etqf, &ts, external_tx); if (!queues) { trace_e1000e_rx_flt_dropped(); return orig_size; @@ -1717,7 +1774,7 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, causes |= E1000_ICR_RXDW; igb_rx_fix_l4_csum(core, core->rx_pkt); - igb_write_packet_to_guest(core, core->rx_pkt, &rxr, &rss_info); + igb_write_packet_to_guest(core, core->rx_pkt, &rxr, &rss_info, etqf, ts); /* Check if receive descriptor minimum threshold hit */ if (igb_rx_descr_threshold_hit(core, rxr.i)) { @@ -3305,6 +3362,8 @@ static const readops igb_macreg_readops[] = { [EIAM] = igb_mac_readreg, [IVAR0 ... IVAR0 + 7] = igb_mac_readreg, igb_getreg(IVAR_MISC), + igb_getreg(TSYNCRXCFG), + [ETQF0 ... ETQF0 + 7] = igb_mac_readreg, igb_getreg(VT_CTL), [P2VMAILBOX0 ... P2VMAILBOX7] = igb_mac_readreg, [V2PMAILBOX0 ... V2PMAILBOX7] = igb_mac_vfmailbox_read, @@ -3712,6 +3771,8 @@ static const writeops igb_macreg_writeops[] = { [EIMS] = igb_set_eims, [IVAR0 ... IVAR0 + 7] = igb_mac_writereg, igb_putreg(IVAR_MISC), + igb_putreg(TSYNCRXCFG), + [ETQF0 ... ETQF0 + 7] = igb_mac_writereg, igb_putreg(VT_CTL), [P2VMAILBOX0 ... P2VMAILBOX7] = igb_set_pfmailbox, [V2PMAILBOX0 ... V2PMAILBOX7] = igb_set_vfmailbox, diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h index 4b4ebd3369..894705599d 100644 --- a/hw/net/igb_regs.h +++ b/hw/net/igb_regs.h @@ -210,6 +210,15 @@ union e1000_adv_rx_desc { #define E1000_DCA_TXCTRL_CPUID_SHIFT 24 /* Tx CPUID now in the last byte */ #define E1000_DCA_RXCTRL_CPUID_SHIFT 24 /* Rx CPUID now in the last byte */ +/* ETQF register bit definitions */ +#define E1000_ETQF_FILTER_ENABLE BIT(26) +#define E1000_ETQF_1588 BIT(30) +#define E1000_ETQF_IMM_INT BIT(29) +#define E1000_ETQF_QUEUE_ENABLE BIT(31) +#define E1000_ETQF_QUEUE_SHIFT 16 +#define E1000_ETQF_QUEUE_MASK 0x00070000 +#define E1000_ETQF_ETYPE_MASK 0x0000FFFF + #define E1000_DTXSWC_MAC_SPOOF_MASK 0x000000FF /* Per VF MAC spoof control */ #define E1000_DTXSWC_VLAN_SPOOF_MASK 0x0000FF00 /* Per VF VLAN spoof control */ #define E1000_DTXSWC_LLE_MASK 0x00FF0000 /* Per VF Local LB enables */ @@ -384,6 +393,20 @@ union e1000_adv_rx_desc { #define E1000_FRTIMER 0x01048 /* Free Running Timer - RW */ #define E1000_FCRTV 0x02460 /* Flow Control Refresh Timer Value - RW */ +#define E1000_TSYNCRXCFG 0x05F50 /* Time Sync Rx Configuration - RW */ + +/* Filtering Registers */ +#define E1000_SAQF(_n) (0x5980 + 4 * (_n)) +#define E1000_DAQF(_n) (0x59A0 + 4 * (_n)) +#define E1000_SPQF(_n) (0x59C0 + 4 * (_n)) +#define E1000_FTQF(_n) (0x59E0 + 4 * (_n)) +#define E1000_SAQF0 E1000_SAQF(0) +#define E1000_DAQF0 E1000_DAQF(0) +#define E1000_SPQF0 E1000_SPQF(0) +#define E1000_FTQF0 E1000_FTQF(0) +#define E1000_SYNQF(_n) (0x055FC + (4 * (_n))) /* SYN Packet Queue Fltr */ +#define E1000_ETQF(_n) (0x05CB0 + (4 * (_n))) /* EType Queue Fltr */ + #define E1000_RQDPC(_n) (0x0C030 + ((_n) * 0x40)) #define E1000_RXPBS 0x02404 /* Rx Packet Buffer Size - RW */ From 3dfc616eabc18f036511178bf7551f34abfd19cb Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:33 +0900 Subject: [PATCH 43/50] igb: Implement Tx timestamp Signed-off-by: Akihiko Odaki Reviewed-by: Sriram Yagnaraman Signed-off-by: Jason Wang --- hw/net/igb_core.c | 7 +++++++ hw/net/igb_regs.h | 3 +++ 2 files changed, 10 insertions(+) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 43d23c7621..49d1917926 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -659,6 +659,13 @@ igb_process_tx_desc(IGBCore *core, tx->ctx[idx].vlan_macip_lens >> IGB_TX_FLAGS_VLAN_SHIFT, !!(tx->first_cmd_type_len & E1000_TXD_CMD_VLE)); + if ((tx->first_cmd_type_len & E1000_ADVTXD_MAC_TSTAMP) && + (core->mac[TSYNCTXCTL] & E1000_TSYNCTXCTL_ENABLED) && + !(core->mac[TSYNCTXCTL] & E1000_TSYNCTXCTL_VALID)) { + core->mac[TSYNCTXCTL] |= E1000_TSYNCTXCTL_VALID; + e1000x_timestamp(core->mac, core->timadj, TXSTMPL, TXSTMPH); + } + if (igb_tx_pkt_send(core, tx, queue_index)) { igb_on_tx_done_update_stats(core, tx->tx_pkt, queue_index); } diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h index 894705599d..82ff195dfc 100644 --- a/hw/net/igb_regs.h +++ b/hw/net/igb_regs.h @@ -322,6 +322,9 @@ union e1000_adv_rx_desc { /* E1000_EITR_CNT_IGNR is only for 82576 and newer */ #define E1000_EITR_CNT_IGNR 0x80000000 /* Don't reset counters on write */ +#define E1000_TSYNCTXCTL_VALID 0x00000001 /* tx timestamp valid */ +#define E1000_TSYNCTXCTL_ENABLED 0x00000010 /* enable tx timestampping */ + /* PCI Express Control */ #define E1000_GCR_CMPL_TMOUT_MASK 0x0000F000 #define E1000_GCR_CMPL_TMOUT_10ms 0x00001000 From ad431f0f8254e808a20987804cff1b38de089e47 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:34 +0900 Subject: [PATCH 44/50] e1000e: Notify only new interrupts In MSI-X mode, if there are interrupts already notified but not cleared and a new interrupt arrives, e1000e incorrectly notifies the notified ones again along with the new one. To fix this issue, replace e1000e_update_interrupt_state() with two new functions: e1000e_raise_interrupts() and e1000e_lower_interrupts(). These functions don't only raise or lower interrupts, but it also performs register writes which updates the interrupt state. Before it performs a register write, these function determines the interrupts already raised, and compares with the interrupts raised after the register write to determine the interrupts to notify. The introduction of these functions made tracepoints which assumes that the caller of e1000e_update_interrupt_state() performs register writes obsolete. These tracepoints are now removed, and alternative ones are added to the new functions. Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/e1000e_core.c | 155 +++++++++++++++++++------------------------ hw/net/e1000e_core.h | 2 - hw/net/trace-events | 2 + 3 files changed, 70 insertions(+), 89 deletions(-) diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index d601386992..9f185d099c 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -165,14 +165,14 @@ e1000e_intrmgr_on_throttling_timer(void *opaque) timer->running = false; - if (msi_enabled(timer->core->owner)) { - trace_e1000e_irq_msi_notify_postponed(); - /* Clear msi_causes_pending to fire MSI eventually */ - timer->core->msi_causes_pending = 0; - e1000e_set_interrupt_cause(timer->core, 0); - } else { - trace_e1000e_irq_legacy_notify_postponed(); - e1000e_set_interrupt_cause(timer->core, 0); + if (timer->core->mac[IMS] & timer->core->mac[ICR]) { + if (msi_enabled(timer->core->owner)) { + trace_e1000e_irq_msi_notify_postponed(); + msi_notify(timer->core->owner, 0); + } else { + trace_e1000e_irq_legacy_notify_postponed(); + e1000e_raise_legacy_irq(timer->core); + } } } @@ -366,10 +366,6 @@ static void e1000e_intrmgr_fire_all_timers(E1000ECore *core) { int i; - uint32_t val = e1000e_intmgr_collect_delayed_causes(core); - - trace_e1000e_irq_adding_delayed_causes(val, core->mac[ICR]); - core->mac[ICR] |= val; if (core->itr.running) { timer_del(core->itr.timer); @@ -1974,13 +1970,6 @@ void(*e1000e_phyreg_writeops[E1000E_PHY_PAGES][E1000E_PHY_PAGE_SIZE]) } }; -static inline void -e1000e_clear_ims_bits(E1000ECore *core, uint32_t bits) -{ - trace_e1000e_irq_clear_ims(bits, core->mac[IMS], core->mac[IMS] & ~bits); - core->mac[IMS] &= ~bits; -} - static inline bool e1000e_postpone_interrupt(E1000IntrDelayTimer *timer) { @@ -2038,7 +2027,6 @@ e1000e_msix_notify_one(E1000ECore *core, uint32_t cause, uint32_t int_cfg) effective_eiac = core->mac[EIAC] & cause; core->mac[ICR] &= ~effective_eiac; - core->msi_causes_pending &= ~effective_eiac; if (!(core->mac[CTRL_EXT] & E1000_CTRL_EXT_IAME)) { core->mac[IMS] &= ~effective_eiac; @@ -2130,33 +2118,17 @@ e1000e_fix_icr_asserted(E1000ECore *core) trace_e1000e_irq_fix_icr_asserted(core->mac[ICR]); } -static void -e1000e_send_msi(E1000ECore *core, bool msix) +static void e1000e_raise_interrupts(E1000ECore *core, + size_t index, uint32_t causes) { - uint32_t causes = core->mac[ICR] & core->mac[IMS] & ~E1000_ICR_ASSERTED; - - core->msi_causes_pending &= causes; - causes ^= core->msi_causes_pending; - if (causes == 0) { - return; - } - core->msi_causes_pending |= causes; - - if (msix) { - e1000e_msix_notify(core, causes); - } else { - if (!e1000e_itr_should_postpone(core)) { - trace_e1000e_irq_msi_notify(causes); - msi_notify(core->owner, 0); - } - } -} - -static void -e1000e_update_interrupt_state(E1000ECore *core) -{ - bool interrupts_pending; bool is_msix = msix_enabled(core->owner); + uint32_t old_causes = core->mac[IMS] & core->mac[ICR]; + uint32_t raised_causes; + + trace_e1000e_irq_set(index << 2, + core->mac[index], core->mac[index] | causes); + + core->mac[index] |= causes; /* Set ICR[OTHER] for MSI-X */ if (is_msix) { @@ -2178,40 +2150,58 @@ e1000e_update_interrupt_state(E1000ECore *core) */ core->mac[ICS] = core->mac[ICR]; - interrupts_pending = (core->mac[IMS] & core->mac[ICR]) ? true : false; - if (!interrupts_pending) { - core->msi_causes_pending = 0; + trace_e1000e_irq_pending_interrupts(core->mac[ICR] & core->mac[IMS], + core->mac[ICR], core->mac[IMS]); + + raised_causes = core->mac[IMS] & core->mac[ICR] & ~old_causes; + if (!raised_causes) { + return; } + if (is_msix) { + e1000e_msix_notify(core, raised_causes & ~E1000_ICR_ASSERTED); + } else if (!e1000e_itr_should_postpone(core)) { + if (msi_enabled(core->owner)) { + trace_e1000e_irq_msi_notify(raised_causes); + msi_notify(core->owner, 0); + } else { + e1000e_raise_legacy_irq(core); + } + } +} + +static void e1000e_lower_interrupts(E1000ECore *core, + size_t index, uint32_t causes) +{ + trace_e1000e_irq_clear(index << 2, + core->mac[index], core->mac[index] & ~causes); + + core->mac[index] &= ~causes; + + /* + * Make sure ICR and ICS registers have the same value. + * The spec says that the ICS register is write-only. However in practice, + * on real hardware ICS is readable, and for reads it has the same value as + * ICR (except that ICS does not have the clear on read behaviour of ICR). + * + * The VxWorks PRO/1000 driver uses this behaviour. + */ + core->mac[ICS] = core->mac[ICR]; + trace_e1000e_irq_pending_interrupts(core->mac[ICR] & core->mac[IMS], core->mac[ICR], core->mac[IMS]); - if (is_msix || msi_enabled(core->owner)) { - if (interrupts_pending) { - e1000e_send_msi(core, is_msix); - } - } else { - if (interrupts_pending) { - if (!e1000e_itr_should_postpone(core)) { - e1000e_raise_legacy_irq(core); - } - } else { - e1000e_lower_legacy_irq(core); - } + if (!(core->mac[IMS] & core->mac[ICR]) && + !msix_enabled(core->owner) && !msi_enabled(core->owner)) { + e1000e_lower_legacy_irq(core); } } static void e1000e_set_interrupt_cause(E1000ECore *core, uint32_t val) { - trace_e1000e_irq_set_cause_entry(val, core->mac[ICR]); - val |= e1000e_intmgr_collect_delayed_causes(core); - core->mac[ICR] |= val; - - trace_e1000e_irq_set_cause_exit(val, core->mac[ICR]); - - e1000e_update_interrupt_state(core); + e1000e_raise_interrupts(core, ICR, val); } static inline void @@ -2477,30 +2467,27 @@ e1000e_set_ics(E1000ECore *core, int index, uint32_t val) static void e1000e_set_icr(E1000ECore *core, int index, uint32_t val) { - uint32_t icr = 0; if ((core->mac[ICR] & E1000_ICR_ASSERTED) && (core->mac[CTRL_EXT] & E1000_CTRL_EXT_IAME)) { trace_e1000e_irq_icr_process_iame(); - e1000e_clear_ims_bits(core, core->mac[IAM]); + e1000e_lower_interrupts(core, IMS, core->mac[IAM]); } - icr = core->mac[ICR] & ~val; /* * Windows driver expects that the "receive overrun" bit and other * ones to be cleared when the "Other" bit (#24) is cleared. */ - icr = (val & E1000_ICR_OTHER) ? (icr & ~E1000_ICR_OTHER_CAUSES) : icr; - trace_e1000e_irq_icr_write(val, core->mac[ICR], icr); - core->mac[ICR] = icr; - e1000e_update_interrupt_state(core); + if (val & E1000_ICR_OTHER) { + val |= E1000_ICR_OTHER_CAUSES; + } + e1000e_lower_interrupts(core, ICR, val); } static void e1000e_set_imc(E1000ECore *core, int index, uint32_t val) { trace_e1000e_irq_ims_clear_set_imc(val); - e1000e_clear_ims_bits(core, val); - e1000e_update_interrupt_state(core); + e1000e_lower_interrupts(core, IMS, val); } static void @@ -2521,9 +2508,6 @@ e1000e_set_ims(E1000ECore *core, int index, uint32_t val) uint32_t valid_val = val & ims_valid_mask; - trace_e1000e_irq_set_ims(val, core->mac[IMS], core->mac[IMS] | valid_val); - core->mac[IMS] |= valid_val; - if ((valid_val & ims_ext_mask) && (core->mac[CTRL_EXT] & E1000_CTRL_EXT_PBA_CLR) && msix_enabled(core->owner)) { @@ -2536,7 +2520,7 @@ e1000e_set_ims(E1000ECore *core, int index, uint32_t val) e1000e_intrmgr_fire_all_timers(core); } - e1000e_update_interrupt_state(core); + e1000e_raise_interrupts(core, IMS, valid_val); } static void @@ -2609,28 +2593,25 @@ static uint32_t e1000e_mac_icr_read(E1000ECore *core, int index) { uint32_t ret = core->mac[ICR]; - trace_e1000e_irq_icr_read_entry(ret); if (core->mac[IMS] == 0) { trace_e1000e_irq_icr_clear_zero_ims(); - core->mac[ICR] = 0; + e1000e_lower_interrupts(core, ICR, 0xffffffff); } if (!msix_enabled(core->owner)) { trace_e1000e_irq_icr_clear_nonmsix_icr_read(); - core->mac[ICR] = 0; + e1000e_lower_interrupts(core, ICR, 0xffffffff); } if ((core->mac[ICR] & E1000_ICR_ASSERTED) && (core->mac[CTRL_EXT] & E1000_CTRL_EXT_IAME)) { trace_e1000e_irq_icr_clear_iame(); - core->mac[ICR] = 0; + e1000e_lower_interrupts(core, ICR, 0xffffffff); trace_e1000e_irq_icr_process_iame(); - e1000e_clear_ims_bits(core, core->mac[IAM]); + e1000e_lower_interrupts(core, IMS, core->mac[IAM]); } - trace_e1000e_irq_icr_read_exit(core->mac[ICR]); - e1000e_update_interrupt_state(core); return ret; } diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h index 213a70530d..66b025cc43 100644 --- a/hw/net/e1000e_core.h +++ b/hw/net/e1000e_core.h @@ -111,8 +111,6 @@ struct E1000Core { PCIDevice *owner; void (*owner_start_recv)(PCIDevice *d); - uint32_t msi_causes_pending; - int64_t timadj; }; diff --git a/hw/net/trace-events b/hw/net/trace-events index 64d776bc2a..d171dc8179 100644 --- a/hw/net/trace-events +++ b/hw/net/trace-events @@ -205,6 +205,8 @@ e1000e_irq_msix_notify_postponed_vec(int idx) "Sending MSI-X postponed by EITR[% e1000e_irq_legacy_notify(bool level) "IRQ line state: %d" e1000e_irq_msix_notify_vec(uint32_t vector) "MSI-X notify vector 0x%x" e1000e_irq_postponed_by_xitr(uint32_t reg) "Interrupt postponed by [E]ITR register 0x%x" +e1000e_irq_clear(uint32_t offset, uint32_t old, uint32_t new) "Clearing interrupt register 0x%x: 0x%x --> 0x%x" +e1000e_irq_set(uint32_t offset, uint32_t old, uint32_t new) "Setting interrupt register 0x%x: 0x%x --> 0x%x" e1000e_irq_clear_ims(uint32_t bits, uint32_t old_ims, uint32_t new_ims) "Clearing IMS bits 0x%x: 0x%x --> 0x%x" e1000e_irq_set_ims(uint32_t bits, uint32_t old_ims, uint32_t new_ims) "Setting IMS bits 0x%x: 0x%x --> 0x%x" e1000e_irq_fix_icr_asserted(uint32_t new_val) "ICR_ASSERTED bit fixed: 0x%x" From 5844562b177e2067b8ebf78d1845334e0c759896 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:35 +0900 Subject: [PATCH 45/50] igb: Notify only new interrupts This follows the corresponding change for e1000e. This fixes: tests/avocado/netdev-ethtool.py:NetDevEthtool.test_igb Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/igb_core.c | 201 ++++++++---------- hw/net/trace-events | 11 +- .../org.centos/stream/8/x86_64/test-avocado | 1 + tests/avocado/netdev-ethtool.py | 4 - 4 files changed, 87 insertions(+), 130 deletions(-) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 49d1917926..823dde8f28 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -94,10 +94,7 @@ static ssize_t igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, bool has_vnet, bool *external_tx); -static inline void -igb_set_interrupt_cause(IGBCore *core, uint32_t val); - -static void igb_update_interrupt_state(IGBCore *core); +static void igb_raise_interrupts(IGBCore *core, size_t index, uint32_t causes); static void igb_reset(IGBCore *core, bool sw); static inline void @@ -913,8 +910,8 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr) } if (eic) { - core->mac[EICR] |= eic; - igb_set_interrupt_cause(core, E1000_ICR_TXDW); + igb_raise_interrupts(core, EICR, eic); + igb_raise_interrupts(core, ICR, E1000_ICR_TXDW); } net_tx_pkt_reset(txr->tx->tx_pkt, net_tx_pkt_unmap_frag_pci, d); @@ -1686,6 +1683,7 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, { uint16_t queues = 0; uint32_t causes = 0; + uint32_t ecauses = 0; union { L2Header l2_header; uint8_t octets[ETH_ZLEN]; @@ -1788,13 +1786,14 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt, causes |= E1000_ICS_RXDMT0; } - core->mac[EICR] |= igb_rx_wb_eic(core, rxr.i->idx); + ecauses |= igb_rx_wb_eic(core, rxr.i->idx); trace_e1000e_rx_written_to_guest(rxr.i->idx); } trace_e1000e_rx_interrupt_set(causes); - igb_set_interrupt_cause(core, causes); + igb_raise_interrupts(core, EICR, ecauses); + igb_raise_interrupts(core, ICR, causes); return orig_size; } @@ -1854,7 +1853,7 @@ void igb_core_set_link_status(IGBCore *core) } if (core->mac[STATUS] != old_status) { - igb_set_interrupt_cause(core, E1000_ICR_LSC); + igb_raise_interrupts(core, ICR, E1000_ICR_LSC); } } @@ -1934,13 +1933,6 @@ igb_set_rx_control(IGBCore *core, int index, uint32_t val) } } -static inline void -igb_clear_ims_bits(IGBCore *core, uint32_t bits) -{ - trace_e1000e_irq_clear_ims(bits, core->mac[IMS], core->mac[IMS] & ~bits); - core->mac[IMS] &= ~bits; -} - static inline bool igb_postpone_interrupt(IGBIntrDelayTimer *timer) { @@ -1963,9 +1955,8 @@ igb_eitr_should_postpone(IGBCore *core, int idx) return igb_postpone_interrupt(&core->eitr[idx]); } -static void igb_send_msix(IGBCore *core) +static void igb_send_msix(IGBCore *core, uint32_t causes) { - uint32_t causes = core->mac[EICR] & core->mac[EIMS]; int vector; for (vector = 0; vector < IGB_INTR_NUM; ++vector) { @@ -1988,124 +1979,116 @@ igb_fix_icr_asserted(IGBCore *core) trace_e1000e_irq_fix_icr_asserted(core->mac[ICR]); } -static void -igb_update_interrupt_state(IGBCore *core) +static void igb_raise_interrupts(IGBCore *core, size_t index, uint32_t causes) { - uint32_t icr; - uint32_t causes; + uint32_t old_causes = core->mac[ICR] & core->mac[IMS]; + uint32_t old_ecauses = core->mac[EICR] & core->mac[EIMS]; + uint32_t raised_causes; + uint32_t raised_ecauses; uint32_t int_alloc; - icr = core->mac[ICR] & core->mac[IMS]; + trace_e1000e_irq_set(index << 2, + core->mac[index], core->mac[index] | causes); + + core->mac[index] |= causes; if (core->mac[GPIE] & E1000_GPIE_MSIX_MODE) { - if (icr) { - causes = 0; - if (icr & E1000_ICR_DRSTA) { - int_alloc = core->mac[IVAR_MISC] & 0xff; - if (int_alloc & E1000_IVAR_VALID) { - causes |= BIT(int_alloc & 0x1f); - } + raised_causes = core->mac[ICR] & core->mac[IMS] & ~old_causes; + + if (raised_causes & E1000_ICR_DRSTA) { + int_alloc = core->mac[IVAR_MISC] & 0xff; + if (int_alloc & E1000_IVAR_VALID) { + core->mac[EICR] |= BIT(int_alloc & 0x1f); } - /* Check if other bits (excluding the TCP Timer) are enabled. */ - if (icr & ~E1000_ICR_DRSTA) { - int_alloc = (core->mac[IVAR_MISC] >> 8) & 0xff; - if (int_alloc & E1000_IVAR_VALID) { - causes |= BIT(int_alloc & 0x1f); - } - trace_e1000e_irq_add_msi_other(core->mac[EICR]); + } + /* Check if other bits (excluding the TCP Timer) are enabled. */ + if (raised_causes & ~E1000_ICR_DRSTA) { + int_alloc = (core->mac[IVAR_MISC] >> 8) & 0xff; + if (int_alloc & E1000_IVAR_VALID) { + core->mac[EICR] |= BIT(int_alloc & 0x1f); } - core->mac[EICR] |= causes; } - if ((core->mac[EICR] & core->mac[EIMS])) { - igb_send_msix(core); + raised_ecauses = core->mac[EICR] & core->mac[EIMS] & ~old_ecauses; + if (!raised_ecauses) { + return; } + + igb_send_msix(core, raised_ecauses); } else { igb_fix_icr_asserted(core); - if (icr) { - core->mac[EICR] |= (icr & E1000_ICR_DRSTA) | E1000_EICR_OTHER; - } else { - core->mac[EICR] &= ~E1000_EICR_OTHER; + raised_causes = core->mac[ICR] & core->mac[IMS] & ~old_causes; + if (!raised_causes) { + return; } - trace_e1000e_irq_pending_interrupts(core->mac[ICR] & core->mac[IMS], - core->mac[ICR], core->mac[IMS]); + core->mac[EICR] |= (raised_causes & E1000_ICR_DRSTA) | E1000_EICR_OTHER; if (msix_enabled(core->owner)) { - if (icr) { - trace_e1000e_irq_msix_notify_vec(0); - msix_notify(core->owner, 0); - } + trace_e1000e_irq_msix_notify_vec(0); + msix_notify(core->owner, 0); } else if (msi_enabled(core->owner)) { - if (icr) { - msi_notify(core->owner, 0); - } + trace_e1000e_irq_msi_notify(raised_causes); + msi_notify(core->owner, 0); } else { - if (icr) { - igb_raise_legacy_irq(core); - } else { - igb_lower_legacy_irq(core); - } + igb_raise_legacy_irq(core); } } } -static void -igb_set_interrupt_cause(IGBCore *core, uint32_t val) +static void igb_lower_interrupts(IGBCore *core, size_t index, uint32_t causes) { - trace_e1000e_irq_set_cause_entry(val, core->mac[ICR]); + trace_e1000e_irq_clear(index << 2, + core->mac[index], core->mac[index] & ~causes); - core->mac[ICR] |= val; + core->mac[index] &= ~causes; - trace_e1000e_irq_set_cause_exit(val, core->mac[ICR]); + trace_e1000e_irq_pending_interrupts(core->mac[ICR] & core->mac[IMS], + core->mac[ICR], core->mac[IMS]); - igb_update_interrupt_state(core); + if (!(core->mac[ICR] & core->mac[IMS]) && + !(core->mac[GPIE] & E1000_GPIE_MSIX_MODE)) { + core->mac[EICR] &= ~E1000_EICR_OTHER; + + if (!msix_enabled(core->owner) && !msi_enabled(core->owner)) { + igb_lower_legacy_irq(core); + } + } } static void igb_set_eics(IGBCore *core, int index, uint32_t val) { bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE); + uint32_t mask = msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK; trace_igb_irq_write_eics(val, msix); - - core->mac[EICS] |= - val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK); - - /* - * TODO: Move to igb_update_interrupt_state if EICS is modified in other - * places. - */ - core->mac[EICR] = core->mac[EICS]; - - igb_update_interrupt_state(core); + igb_raise_interrupts(core, EICR, val & mask); } static void igb_set_eims(IGBCore *core, int index, uint32_t val) { bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE); + uint32_t mask = msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK; trace_igb_irq_write_eims(val, msix); - - core->mac[EIMS] |= - val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK); - - igb_update_interrupt_state(core); + igb_raise_interrupts(core, EIMS, val & mask); } static void mailbox_interrupt_to_vf(IGBCore *core, uint16_t vfn) { uint32_t ent = core->mac[VTIVAR_MISC + vfn]; + uint32_t causes; if ((ent & E1000_IVAR_VALID)) { - core->mac[EICR] |= (ent & 0x3) << (22 - vfn * IGBVF_MSIX_VEC_NUM); - igb_update_interrupt_state(core); + causes = (ent & 0x3) << (22 - vfn * IGBVF_MSIX_VEC_NUM); + igb_raise_interrupts(core, EICR, causes); } } static void mailbox_interrupt_to_pf(IGBCore *core) { - igb_set_interrupt_cause(core, E1000_ICR_VMMB); + igb_raise_interrupts(core, ICR, E1000_ICR_VMMB); } static void igb_set_pfmailbox(IGBCore *core, int index, uint32_t val) @@ -2196,13 +2179,12 @@ static void igb_w1c(IGBCore *core, int index, uint32_t val) static void igb_set_eimc(IGBCore *core, int index, uint32_t val) { bool msix = !!(core->mac[GPIE] & E1000_GPIE_MSIX_MODE); + uint32_t mask = msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK; + + trace_igb_irq_write_eimc(val, msix); /* Interrupts are disabled via a write to EIMC and reflected in EIMS. */ - core->mac[EIMS] &= - ~(val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK)); - - trace_igb_irq_write_eimc(val, core->mac[EIMS], msix); - igb_update_interrupt_state(core); + igb_lower_interrupts(core, EIMS, val & mask); } static void igb_set_eiac(IGBCore *core, int index, uint32_t val) @@ -2242,11 +2224,10 @@ static void igb_set_eicr(IGBCore *core, int index, uint32_t val) * TODO: In IOV mode, only bit zero of this vector is available for the PF * function. */ - core->mac[EICR] &= - ~(val & (msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK)); + uint32_t mask = msix ? E1000_EICR_MSIX_MASK : E1000_EICR_LEGACY_MASK; trace_igb_irq_write_eicr(val, msix); - igb_update_interrupt_state(core); + igb_lower_interrupts(core, EICR, val & mask); } static void igb_set_vtctrl(IGBCore *core, int index, uint32_t val) @@ -2346,7 +2327,7 @@ igb_autoneg_timer(void *opaque) igb_update_flowctl_status(core); /* signal link status change to the guest */ - igb_set_interrupt_cause(core, E1000_ICR_LSC); + igb_raise_interrupts(core, ICR, E1000_ICR_LSC); } } @@ -2419,7 +2400,7 @@ igb_set_mdic(IGBCore *core, int index, uint32_t val) core->mac[MDIC] = val | E1000_MDIC_READY; if (val & E1000_MDIC_INT_EN) { - igb_set_interrupt_cause(core, E1000_ICR_MDAC); + igb_raise_interrupts(core, ICR, E1000_ICR_MDAC); } } @@ -2527,28 +2508,23 @@ static void igb_set_ics(IGBCore *core, int index, uint32_t val) { trace_e1000e_irq_write_ics(val); - igb_set_interrupt_cause(core, val); + igb_raise_interrupts(core, ICR, val); } static void igb_set_imc(IGBCore *core, int index, uint32_t val) { trace_e1000e_irq_ims_clear_set_imc(val); - igb_clear_ims_bits(core, val); - igb_update_interrupt_state(core); + igb_lower_interrupts(core, IMS, val); } static void igb_set_ims(IGBCore *core, int index, uint32_t val) { - uint32_t valid_val = val & 0x77D4FBFD; - - trace_e1000e_irq_set_ims(val, core->mac[IMS], core->mac[IMS] | valid_val); - core->mac[IMS] |= valid_val; - igb_update_interrupt_state(core); + igb_raise_interrupts(core, IMS, val & 0x77D4FBFD); } -static void igb_commit_icr(IGBCore *core) +static void igb_nsicr(IGBCore *core) { /* * If GPIE.NSICR = 0, then the clear of IMS will occur only if at @@ -2557,19 +2533,14 @@ static void igb_commit_icr(IGBCore *core) */ if ((core->mac[GPIE] & E1000_GPIE_NSICR) || (core->mac[IMS] && (core->mac[ICR] & E1000_ICR_INT_ASSERTED))) { - igb_clear_ims_bits(core, core->mac[IAM]); + igb_lower_interrupts(core, IMS, core->mac[IAM]); } - - igb_update_interrupt_state(core); } static void igb_set_icr(IGBCore *core, int index, uint32_t val) { - uint32_t icr = core->mac[ICR] & ~val; - - trace_igb_irq_icr_write(val, core->mac[ICR], icr); - core->mac[ICR] = icr; - igb_commit_icr(core); + igb_nsicr(core); + igb_lower_interrupts(core, ICR, val); } static uint32_t @@ -2620,21 +2591,19 @@ static uint32_t igb_mac_icr_read(IGBCore *core, int index) { uint32_t ret = core->mac[ICR]; - trace_e1000e_irq_icr_read_entry(ret); if (core->mac[GPIE] & E1000_GPIE_NSICR) { trace_igb_irq_icr_clear_gpie_nsicr(); - core->mac[ICR] = 0; + igb_lower_interrupts(core, ICR, 0xffffffff); } else if (core->mac[IMS] == 0) { trace_e1000e_irq_icr_clear_zero_ims(); - core->mac[ICR] = 0; + igb_lower_interrupts(core, ICR, 0xffffffff); } else if (!msix_enabled(core->owner)) { trace_e1000e_irq_icr_clear_nonmsix_icr_read(); - core->mac[ICR] = 0; + igb_lower_interrupts(core, ICR, 0xffffffff); } - trace_e1000e_irq_icr_read_exit(core->mac[ICR]); - igb_commit_icr(core); + igb_nsicr(core); return ret; } diff --git a/hw/net/trace-events b/hw/net/trace-events index d171dc8179..e4a98b2c7d 100644 --- a/hw/net/trace-events +++ b/hw/net/trace-events @@ -207,21 +207,14 @@ e1000e_irq_msix_notify_vec(uint32_t vector) "MSI-X notify vector 0x%x" e1000e_irq_postponed_by_xitr(uint32_t reg) "Interrupt postponed by [E]ITR register 0x%x" e1000e_irq_clear(uint32_t offset, uint32_t old, uint32_t new) "Clearing interrupt register 0x%x: 0x%x --> 0x%x" e1000e_irq_set(uint32_t offset, uint32_t old, uint32_t new) "Setting interrupt register 0x%x: 0x%x --> 0x%x" -e1000e_irq_clear_ims(uint32_t bits, uint32_t old_ims, uint32_t new_ims) "Clearing IMS bits 0x%x: 0x%x --> 0x%x" -e1000e_irq_set_ims(uint32_t bits, uint32_t old_ims, uint32_t new_ims) "Setting IMS bits 0x%x: 0x%x --> 0x%x" e1000e_irq_fix_icr_asserted(uint32_t new_val) "ICR_ASSERTED bit fixed: 0x%x" e1000e_irq_add_msi_other(uint32_t new_val) "ICR_OTHER bit added: 0x%x" e1000e_irq_pending_interrupts(uint32_t pending, uint32_t icr, uint32_t ims) "ICR PENDING: 0x%x (ICR: 0x%x, IMS: 0x%x)" -e1000e_irq_set_cause_entry(uint32_t val, uint32_t icr) "Going to set IRQ cause 0x%x, ICR: 0x%x" -e1000e_irq_set_cause_exit(uint32_t val, uint32_t icr) "Set IRQ cause 0x%x, ICR: 0x%x" -e1000e_irq_icr_write(uint32_t bits, uint32_t old_icr, uint32_t new_icr) "Clearing ICR bits 0x%x: 0x%x --> 0x%x" e1000e_irq_write_ics(uint32_t val) "Adding ICR bits 0x%x" e1000e_irq_icr_process_iame(void) "Clearing IMS bits due to IAME" e1000e_irq_read_ics(uint32_t ics) "Current ICS: 0x%x" e1000e_irq_read_ims(uint32_t ims) "Current IMS: 0x%x" e1000e_irq_icr_clear_nonmsix_icr_read(void) "Clearing ICR on read due to non MSI-X int" -e1000e_irq_icr_read_entry(uint32_t icr) "Starting ICR read. Current ICR: 0x%x" -e1000e_irq_icr_read_exit(uint32_t icr) "Ending ICR read. Current ICR: 0x%x" e1000e_irq_icr_clear_zero_ims(void) "Clearing ICR on read due to zero IMS" e1000e_irq_icr_clear_iame(void) "Clearing ICR on read due to IAME" e1000e_irq_iam_clear_eiame(uint32_t iam, uint32_t cause) "Clearing IMS due to EIAME, IAM: 0x%X, cause: 0x%X" @@ -237,7 +230,6 @@ e1000e_irq_tidv_fpd_not_running(void) "FPD written while TIDV was not running" e1000e_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = %u" e1000e_irq_itr_set(uint32_t val) "ITR = %u" e1000e_irq_fire_all_timers(uint32_t val) "Firing all delay/throttling timers on all interrupts enable (0x%X written to IMS)" -e1000e_irq_adding_delayed_causes(uint32_t val, uint32_t icr) "Merging delayed causes 0x%X to ICR 0x%X" e1000e_irq_msix_pending_clearing(uint32_t cause, uint32_t int_cfg, uint32_t vec) "Clearing MSI-X pending bit for cause 0x%x, IVAR config 0x%x, vector %u" e1000e_wrn_msix_vec_wrong(uint32_t cause, uint32_t cfg) "Invalid configuration for cause 0x%x: 0x%x" @@ -290,12 +282,11 @@ igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint3 igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X" igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled" -igb_irq_icr_write(uint32_t bits, uint32_t old_icr, uint32_t new_icr) "Clearing ICR bits 0x%x: 0x%x --> 0x%x" igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x" igb_irq_read_iam(uint32_t icr) "Current IAM: 0x%x" igb_irq_write_eics(uint32_t val, bool msix) "Update EICS: 0x%x MSI-X: %d" igb_irq_write_eims(uint32_t val, bool msix) "Update EIMS: 0x%x MSI-X: %d" -igb_irq_write_eimc(uint32_t val, uint32_t eims, bool msix) "Update EIMC: 0x%x EIMS: 0x%x MSI-X: %d" +igb_irq_write_eimc(uint32_t val, bool msix) "Update EIMC: 0x%x MSI-X: %d" igb_irq_write_eiac(uint32_t val) "Update EIAC: 0x%x" igb_irq_write_eiam(uint32_t val, bool msix) "Update EIAM: 0x%x MSI-X: %d" igb_irq_write_eicr(uint32_t val, bool msix) "Update EICR: 0x%x MSI-X: %d" diff --git a/scripts/ci/org.centos/stream/8/x86_64/test-avocado b/scripts/ci/org.centos/stream/8/x86_64/test-avocado index f43449377a..73e7a1a312 100755 --- a/scripts/ci/org.centos/stream/8/x86_64/test-avocado +++ b/scripts/ci/org.centos/stream/8/x86_64/test-avocado @@ -30,6 +30,7 @@ make get-vm-images tests/avocado/cpu_queries.py:QueryCPUModelExpansion.test \ tests/avocado/empty_cpu_model.py:EmptyCPUModel.test \ tests/avocado/hotplug_cpu.py:HotPlugCPU.test \ + tests/avocado/netdev-ethtool.py:NetDevEthtool.test_igb \ tests/avocado/netdev-ethtool.py:NetDevEthtool.test_igb_nomsi \ tests/avocado/info_usernet.py:InfoUsernet.test_hostfwd \ tests/avocado/intel_iommu.py:IntelIOMMU.test_intel_iommu \ diff --git a/tests/avocado/netdev-ethtool.py b/tests/avocado/netdev-ethtool.py index 6da800f62b..5f33288f81 100644 --- a/tests/avocado/netdev-ethtool.py +++ b/tests/avocado/netdev-ethtool.py @@ -67,10 +67,6 @@ class NetDevEthtool(QemuSystemTest): # no need to gracefully shutdown, just finish self.vm.kill() - # Skip testing for MSI for now. Allegedly it was fixed by: - # 28e96556ba (igb: Allocate MSI-X vector when testing) - # but I'm seeing oops in the kernel - @skip("Kernel bug with MSI enabled") def test_igb(self): """ :avocado: tags=device:igb From efb1fd7a73aac29b02514c8aa46b752a7c7caa73 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:36 +0900 Subject: [PATCH 46/50] igb: Clear-on-read ICR when ICR.INTA is set For GPIE.NSICR, Section 7.3.2.1.2 says: > ICR bits are cleared on register read. If GPIE.NSICR = 0b, then the > clear on read occurs only if no bit is set in the IMS or at least one > bit is set in the IMS and there is a true interrupt as reflected in > ICR.INTA. e1000e does similar though it checks for CTRL_EXT.IAME, which does not exist on igb. Suggested-by: Sriram Yagnaraman Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- hw/net/igb_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c index 823dde8f28..d00b1caa6a 100644 --- a/hw/net/igb_core.c +++ b/hw/net/igb_core.c @@ -2598,6 +2598,8 @@ igb_mac_icr_read(IGBCore *core, int index) } else if (core->mac[IMS] == 0) { trace_e1000e_irq_icr_clear_zero_ims(); igb_lower_interrupts(core, ICR, 0xffffffff); + } else if (core->mac[ICR] & E1000_ICR_INT_ASSERTED) { + igb_lower_interrupts(core, ICR, 0xffffffff); } else if (!msix_enabled(core->owner)) { trace_e1000e_irq_icr_clear_nonmsix_icr_read(); igb_lower_interrupts(core, ICR, 0xffffffff); From 9472640633494af67a418e6258e9b2941db1f5c1 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:37 +0900 Subject: [PATCH 47/50] vmxnet3: Do not depend on PC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vmxnet3 has no dependency on PC, and VMware Fusion actually makes it available on Apple Silicon according to: https://kb.vmware.com/s/article/90364 Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- hw/net/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/net/Kconfig b/hw/net/Kconfig index 18c7851efe..98e00be4f9 100644 --- a/hw/net/Kconfig +++ b/hw/net/Kconfig @@ -56,7 +56,7 @@ config RTL8139_PCI config VMXNET3_PCI bool - default y if PCI_DEVICES && PC_PCI + default y if PCI_DEVICES depends on PCI config SMC91C111 From c8110e767f2deb99029513d79d95a1c84bbeac9c Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:38 +0900 Subject: [PATCH 48/50] MAINTAINERS: Add a reviewer for network packet abstractions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have made significant changes for network packet abstractions so add me as a reviewer. Signed-off-by: Akihiko Odaki Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Jason Wang --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 35c4e277af..6addabdec4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2237,6 +2237,7 @@ F: tests/qtest/fuzz-megasas-test.c Network packet abstractions M: Dmitry Fleytman +R: Akihiko Odaki S: Maintained F: include/net/eth.h F: net/eth.c From c37d98f3a940c832dafd3b77d3f9619df64c8ac4 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Tue, 23 May 2023 11:43:39 +0900 Subject: [PATCH 49/50] docs/system/devices/igb: Note igb is tested for DPDK Signed-off-by: Akihiko Odaki Signed-off-by: Jason Wang --- docs/system/devices/igb.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/system/devices/igb.rst b/docs/system/devices/igb.rst index 07d95dd985..04e79dfe54 100644 --- a/docs/system/devices/igb.rst +++ b/docs/system/devices/igb.rst @@ -14,7 +14,8 @@ Limitations =========== This igb implementation was tested with Linux Test Project [2]_ and Windows HLK -[3]_ during the initial development. The command used when testing with LTP is: +[3]_ during the initial development. Later it was also tested with DPDK Test +Suite [4]_. The command used when testing with LTP is: .. code-block:: shell @@ -22,8 +23,8 @@ This igb implementation was tested with Linux Test Project [2]_ and Windows HLK Be aware that this implementation lacks many functionalities available with the actual hardware, and you may experience various failures if you try to use it -with a different operating system other than Linux and Windows or if you try -functionalities not covered by the tests. +with a different operating system other than DPDK, Linux, and Windows or if you +try functionalities not covered by the tests. Using igb ========= @@ -32,7 +33,7 @@ Using igb should be nothing different from using another network device. See :ref:`Network_emulation` in general. However, you may also need to perform additional steps to activate SR-IOV -feature on your guest. For Linux, refer to [4]_. +feature on your guest. For Linux, refer to [5]_. Developing igb ============== @@ -68,4 +69,5 @@ References .. [1] https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eb-gigabit-ethernet-controller-datasheet.pdf .. [2] https://github.com/linux-test-project/ltp .. [3] https://learn.microsoft.com/en-us/windows-hardware/test/hlk/ -.. [4] https://docs.kernel.org/PCI/pci-iov-howto.html +.. [4] https://doc.dpdk.org/dts/gsg/ +.. [5] https://docs.kernel.org/PCI/pci-iov-howto.html From 792676c165159c11412346870fd58fd243ab2166 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 13 Apr 2023 13:19:46 -0400 Subject: [PATCH 50/50] rtl8139: fix large_send_mss divide-by-zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the driver sets large_send_mss to 0 then a divide-by-zero occurs. Even if the division wasn't a problem, the for loop that emits MSS-sized packets would never terminate. Solve these issues by skipping offloading when large_send_mss=0. This issue was found by OSS-Fuzz as part of Alexander Bulekov's device fuzzing work. The reproducer is: $ cat << EOF | ./qemu-system-i386 -display none -machine accel=qtest, -m \ 512M,slots=1,maxmem=0xffff000000000000 -machine q35 -nodefaults -device \ rtl8139,netdev=net0 -netdev user,id=net0 -device \ pc-dimm,id=nv1,memdev=mem1,addr=0xb800a64602800000 -object \ memory-backend-ram,id=mem1,size=2M -qtest stdio outl 0xcf8 0x80000814 outl 0xcfc 0xe0000000 outl 0xcf8 0x80000804 outw 0xcfc 0x06 write 0xe0000037 0x1 0x04 write 0xe00000e0 0x2 0x01 write 0x1 0x1 0x04 write 0x3 0x1 0x98 write 0xa 0x1 0x8c write 0xb 0x1 0x02 write 0xc 0x1 0x46 write 0xd 0x1 0xa6 write 0xf 0x1 0xb8 write 0xb800a646028c000c 0x1 0x08 write 0xb800a646028c000e 0x1 0x47 write 0xb800a646028c0010 0x1 0x02 write 0xb800a646028c0017 0x1 0x06 write 0xb800a646028c0036 0x1 0x80 write 0xe00000d9 0x1 0x40 EOF Buglink: https://gitlab.com/qemu-project/qemu/-/issues/1582 Closes: https://gitlab.com/qemu-project/qemu/-/issues/1582 Cc: qemu-stable@nongnu.org Cc: Peter Maydell Fixes: 6d71357a3b65 ("rtl8139: honor large send MSS value") Reported-by: Alexander Bulekov Reviewed-by: Philippe Mathieu-Daudé Tested-by: Alexander Bulekov Signed-off-by: Stefan Hajnoczi Signed-off-by: Jason Wang --- hw/net/rtl8139.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c index 5a5aaf868d..5f1a4d359b 100644 --- a/hw/net/rtl8139.c +++ b/hw/net/rtl8139.c @@ -2154,6 +2154,9 @@ static int rtl8139_cplus_transmit_one(RTL8139State *s) int large_send_mss = (txdw0 >> CP_TC_LGSEN_MSS_SHIFT) & CP_TC_LGSEN_MSS_MASK; + if (large_send_mss == 0) { + goto skip_offload; + } DPRINTF("+++ C+ mode offloaded task TSO IP data %d " "frame data %d specified MSS=%d\n",