From 23a4b3ebc74bbc6b6f44edf2255864042f122b82 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Fri, 29 Nov 2024 09:27:39 -0700 Subject: [PATCH 01/10] hw/nvme: Add OCP SMART / Health Information Extended Log Page The Open Compute Project [1] includes a Datacenter NVMe SSD Specification [2]. The most recent version of this specification (as of November 2024) is 2.6.1. This specification layers on top of the NVM Express specifications [3] to provide additional functionality. A key part of of this is the 512 Byte OCP SMART / Health Information Extended log page that is defined in Section 4.8.6 of the specification. We add a controller argument (ocp) that toggles on/off the SMART log extended structure. To accommodate different vendor specific specifications like OCP, we add a multiplexing function (nvme_vendor_specific_log) which will route to the different log functions based on arguments and log ids. We only return the OCP extended SMART log when the command is 0xC0 and ocp has been turned on in the nvme argumants. Though we add the whole nvme SMART log extended structure, we only populate the physical_media_units_{read,written}, log_page_version and log_page_uuid. This patch is based on work done by Joel but has been modified enough that he requested a co-developed-by tag rather than a signed-off-by. [1]: https://www.opencompute.org/ [2]: https://www.opencompute.org/documents/datacenter-nvme-ssd-specification-v2-6-1-pdf [3]: https://nvmexpress.org/specifications/ Signed-off-by: Stephen Bates Co-developed-by: Joel Granados Reviewed-by: Klaus Jensen Signed-off-by: Klaus Jensen --- docs/system/devices/nvme.rst | 7 +++++ hw/nvme/ctrl.c | 59 ++++++++++++++++++++++++++++++++++++ hw/nvme/nvme.h | 1 + include/block/nvme.h | 41 +++++++++++++++++++++++++ 4 files changed, 108 insertions(+) diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst index d2b1ca9645..6509b35fcb 100644 --- a/docs/system/devices/nvme.rst +++ b/docs/system/devices/nvme.rst @@ -53,6 +53,13 @@ parameters. Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID previously used. +``ocp`` (default: ``off``) + The Open Compute Project defines the Datacenter NVMe SSD Specification that + sits on top of NVMe. It describes additional commands and NVMe behaviors + specific for the Datacenter. When this option is ``on`` OCP features such as + the SMART / Health information extended log become available in the + controller. We emulate version 5 of this log page. + Additional Namespaces --------------------- diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 8175751518..11687e597a 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -4917,6 +4917,45 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats) stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; } +static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae, + uint32_t buf_len, uint64_t off, + NvmeRequest *req) +{ + NvmeNamespace *ns = NULL; + NvmeSmartLogExtended smart_l = { 0 }; + struct nvme_stats stats = { 0 }; + uint32_t trans_len; + + if (off >= sizeof(smart_l)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + /* accumulate all stats from all namespaces */ + for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) { + ns = nvme_ns(n, i); + if (ns) { + nvme_set_blk_stats(ns, &stats); + } + } + + smart_l.physical_media_units_written[0] = cpu_to_le64(stats.units_written); + smart_l.physical_media_units_read[0] = cpu_to_le64(stats.units_read); + smart_l.log_page_version = 0x0005; + + static const uint8_t guid[16] = { + 0xC5, 0xAF, 0x10, 0x28, 0xEA, 0xBF, 0xF2, 0xA4, + 0x9C, 0x4F, 0x6F, 0x7C, 0xC9, 0x14, 0xD5, 0xAF + }; + memcpy(smart_l.log_page_guid, guid, sizeof(smart_l.log_page_guid)); + + if (!rae) { + nvme_clear_events(n, NVME_AER_TYPE_SMART); + } + + trans_len = MIN(sizeof(smart_l) - off, buf_len); + return nvme_c2h(n, (uint8_t *) &smart_l + off, trans_len, req); +} + static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint64_t off, NvmeRequest *req) { @@ -5146,6 +5185,23 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req); } +static uint16_t nvme_vendor_specific_log(NvmeCtrl *n, uint8_t rae, + uint32_t buf_len, uint64_t off, + NvmeRequest *req, uint8_t lid) +{ + switch (lid) { + case NVME_OCP_EXTENDED_SMART_INFO: + if (n->params.ocp) { + return nvme_ocp_extended_smart_info(n, rae, buf_len, off, req); + } + break; + /* add a case for each additional vendor specific log id */ + } + + trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); + return NVME_INVALID_FIELD | NVME_DNR; +} + static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss) { size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr) @@ -5396,6 +5452,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) return nvme_smart_info(n, rae, len, off, req); case NVME_LOG_FW_SLOT_INFO: return nvme_fw_log_info(n, len, off, req); + case NVME_LOG_VENDOR_START...NVME_LOG_VENDOR_END: + return nvme_vendor_specific_log(n, rae, len, off, req, lid); case NVME_LOG_CHANGED_NSLIST: return nvme_changed_nslist(n, rae, len, off, req); case NVME_LOG_CMD_EFFECTS: @@ -8971,6 +9029,7 @@ static const Property nvme_props[] = { DEFINE_PROP_BOOL("atomic.dn", NvmeCtrl, params.atomic_dn, 0), DEFINE_PROP_UINT16("atomic.awun", NvmeCtrl, params.atomic_awun, 0), DEFINE_PROP_UINT16("atomic.awupf", NvmeCtrl, params.atomic_awupf, 0), + DEFINE_PROP_BOOL("ocp", NvmeCtrl, params.ocp, false), }; static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name, diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 7242206910..e307e733e4 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -545,6 +545,7 @@ typedef struct NvmeParams { uint32_t sriov_max_vq_per_vf; uint32_t sriov_max_vi_per_vf; bool msix_exclusive_bar; + bool ocp; struct { bool mem; diff --git a/include/block/nvme.h b/include/block/nvme.h index f4d108841b..975d321c5c 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1015,6 +1015,40 @@ typedef struct QEMU_PACKED NvmeSmartLog { uint8_t reserved2[320]; } NvmeSmartLog; +typedef struct QEMU_PACKED NvmeSmartLogExtended { + uint64_t physical_media_units_written[2]; + uint64_t physical_media_units_read[2]; + uint64_t bad_user_blocks; + uint64_t bad_system_nand_blocks; + uint64_t xor_recovery_count; + uint64_t uncorrectable_read_error_count; + uint64_t soft_ecc_error_count; + uint64_t end2end_correction_counts; + uint8_t system_data_percent_used; + uint8_t refresh_counts[7]; + uint64_t user_data_erase_counts; + uint16_t thermal_throttling_stat_and_count; + uint16_t dssd_spec_version[3]; + uint64_t pcie_correctable_error_count; + uint32_t incomplete_shutdowns; + uint32_t rsvd116; + uint8_t percent_free_blocks; + uint8_t rsvd121[7]; + uint16_t capacity_health; + uint8_t nvme_errata_ver; + uint8_t rsvd131[5]; + uint64_t unaligned_io; + uint64_t security_ver_num; + uint64_t total_nuse; + uint64_t plp_start_count[2]; + uint64_t endurance_estimate[2]; + uint64_t pcie_retraining_count; + uint64_t power_state_change_count; + uint8_t rsvd208[286]; + uint16_t log_page_version; + uint64_t log_page_guid[2]; +} NvmeSmartLogExtended; + #define NVME_SMART_WARN_MAX 6 enum NvmeSmartWarn { NVME_SMART_SPARE = 1 << 0, @@ -1052,6 +1086,12 @@ enum NvmeLogIdentifier { NVME_LOG_FDP_RUH_USAGE = 0x21, NVME_LOG_FDP_STATS = 0x22, NVME_LOG_FDP_EVENTS = 0x23, + NVME_LOG_VENDOR_START = 0xc0, + NVME_LOG_VENDOR_END = 0xff, +}; + +enum NvmeOcpLogIdentifier { + NVME_OCP_EXTENDED_SMART_INFO = 0xc0, }; typedef struct QEMU_PACKED NvmePSD { @@ -1899,6 +1939,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); + QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLogExtended) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096); From cd59f50ab017183805a0dd82f5e85159ecc355ce Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Dec 2024 13:53:02 +0100 Subject: [PATCH 02/10] hw/nvme: always initialize a subsystem If no nvme-subsys is explicitly configured, instantiate one. Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 36 +++++++++++++++------------- hw/nvme/ns.c | 64 +++++++++++++++++--------------------------------- 2 files changed, 42 insertions(+), 58 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 11687e597a..5a7ccbcc1b 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -8823,15 +8823,13 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->psd[0].enlat = cpu_to_le32(0x10); id->psd[0].exlat = cpu_to_le32(0x4); - if (n->subsys) { - id->cmic |= NVME_CMIC_MULTI_CTRL; - ctratt |= NVME_CTRATT_ENDGRPS; + id->cmic |= NVME_CMIC_MULTI_CTRL; + ctratt |= NVME_CTRATT_ENDGRPS; - id->endgidmax = cpu_to_le16(0x1); + id->endgidmax = cpu_to_le16(0x1); - if (n->subsys->endgrp.fdp.enabled) { - ctratt |= NVME_CTRATT_FDPS; - } + if (n->subsys->endgrp.fdp.enabled) { + ctratt |= NVME_CTRATT_FDPS; } id->ctratt = cpu_to_le32(ctratt); @@ -8860,7 +8858,15 @@ static int nvme_init_subsys(NvmeCtrl *n, Error **errp) int cntlid; if (!n->subsys) { - return 0; + DeviceState *dev = qdev_new(TYPE_NVME_SUBSYS); + + qdev_prop_set_string(dev, "nqn", n->params.serial); + + if (!qdev_realize(dev, NULL, errp)) { + return -1; + } + + n->subsys = NVME_SUBSYS(dev); } cntlid = nvme_subsys_register_ctrl(n, errp); @@ -8950,17 +8956,15 @@ static void nvme_exit(PCIDevice *pci_dev) nvme_ctrl_reset(n, NVME_RESET_FUNCTION); - if (n->subsys) { - for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { - ns = nvme_ns(n, i); - if (ns) { - ns->attached--; - } + for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { + ns = nvme_ns(n, i); + if (ns) { + ns->attached--; } - - nvme_subsys_unregister_ctrl(n->subsys, n); } + nvme_subsys_unregister_ctrl(n->subsys, n); + g_free(n->cq); g_free(n->sq); g_free(n->aer_reqs); diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c index 410df29591..94cabc6a5b 100644 --- a/hw/nvme/ns.c +++ b/hw/nvme/ns.c @@ -727,25 +727,14 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) uint32_t nsid = ns->params.nsid; int i; - if (!n->subsys) { - /* If no subsys, the ns cannot be attached to more than one ctrl. */ - ns->params.shared = false; - if (ns->params.detached) { - error_setg(errp, "detached requires that the nvme device is " - "linked to an nvme-subsys device"); - return; - } - } else { - /* - * If this namespace belongs to a subsystem (through a link on the - * controller device), reparent the device. - */ - if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) { - return; - } - ns->subsys = subsys; - ns->endgrp = &subsys->endgrp; + assert(subsys); + + /* reparent to subsystem bus */ + if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) { + return; } + ns->subsys = subsys; + ns->endgrp = &subsys->endgrp; if (nvme_ns_setup(ns, errp)) { return; @@ -753,7 +742,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) if (!nsid) { for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { - if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) { + if (nvme_subsys_ns(subsys, i)) { continue; } @@ -765,38 +754,29 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) error_setg(errp, "no free namespace id"); return; } - } else { - if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) { - error_setg(errp, "namespace id '%d' already allocated", nsid); - return; - } + } else if (nvme_subsys_ns(subsys, nsid)) { + error_setg(errp, "namespace id '%d' already allocated", nsid); + return; } - if (subsys) { - subsys->namespaces[nsid] = ns; + subsys->namespaces[nsid] = ns; - ns->id_ns.endgid = cpu_to_le16(0x1); - ns->id_ns_ind.endgrpid = cpu_to_le16(0x1); + ns->id_ns.endgid = cpu_to_le16(0x1); + ns->id_ns_ind.endgrpid = cpu_to_le16(0x1); - if (ns->params.detached) { - return; - } + if (ns->params.detached) { + return; + } - if (ns->params.shared) { - for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) { - NvmeCtrl *ctrl = subsys->ctrls[i]; + if (ns->params.shared) { + for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) { + NvmeCtrl *ctrl = subsys->ctrls[i]; - if (ctrl && ctrl != SUBSYS_SLOT_RSVD) { - nvme_attach_ns(ctrl, ns); - } + if (ctrl && ctrl != SUBSYS_SLOT_RSVD) { + nvme_attach_ns(ctrl, ns); } - - return; } - } - - nvme_attach_ns(n, ns); } static const Property nvme_ns_props[] = { From e7047adf1ebf4b5ad63e42c799d8334dcd3d139d Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Dec 2024 13:53:03 +0100 Subject: [PATCH 03/10] hw/nvme: make oacs dynamic Virtualization Management needs sriov-related parameters. Only report support for the command when that conditions are true. Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 25 ++++++++++++++++++------- hw/nvme/nvme.h | 4 ++++ include/block/nvme.h | 3 ++- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 5a7ccbcc1b..4ee8588ca9 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -266,7 +266,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS, }; -static const uint32_t nvme_cse_acs[256] = { +static const uint32_t nvme_cse_acs_default[256] = { [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP, @@ -278,7 +278,6 @@ static const uint32_t nvme_cse_acs[256] = { [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC, - [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP, @@ -5174,7 +5173,7 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, } } - memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs)); + memcpy(log.acs, n->cse.acs, sizeof(log.acs)); if (src_iocs) { memcpy(log.iocs, src_iocs, sizeof(log.iocs)); @@ -7300,7 +7299,7 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, nvme_adm_opc_str(req->cmd.opcode)); - if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { + if (!(n->cse.acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); return NVME_INVALID_OPCODE | NVME_DNR; } @@ -8740,6 +8739,9 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) uint64_t cap = ldq_le_p(&n->bar.cap); NvmeSecCtrlEntry *sctrl = nvme_sctrl(n); uint32_t ctratt; + uint16_t oacs; + + memcpy(n->cse.acs, nvme_cse_acs_default, sizeof(n->cse.acs)); id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID)); id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID)); @@ -8770,9 +8772,18 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); - id->oacs = - cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF | - NVME_OACS_DIRECTIVES); + + oacs = NVME_OACS_NMS | NVME_OACS_FORMAT | NVME_OACS_DBBUF | + NVME_OACS_DIRECTIVES; + + if (n->params.sriov_max_vfs) { + oacs |= NVME_OACS_VMS; + + n->cse.acs[NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP; + } + + id->oacs = cpu_to_le16(oacs); + id->cntrltype = 0x1; /* diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index e307e733e4..b86cad388f 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -584,6 +584,10 @@ typedef struct NvmeCtrl { uint64_t dbbuf_eis; bool dbbuf_enabled; + struct { + uint32_t acs[256]; + } cse; + struct { MemoryRegion mem; uint8_t *buf; diff --git a/include/block/nvme.h b/include/block/nvme.h index 975d321c5c..80fbcb420d 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1232,8 +1232,9 @@ enum NvmeIdCtrlOacs { NVME_OACS_SECURITY = 1 << 0, NVME_OACS_FORMAT = 1 << 1, NVME_OACS_FW = 1 << 2, - NVME_OACS_NS_MGMT = 1 << 3, + NVME_OACS_NMS = 1 << 3, NVME_OACS_DIRECTIVES = 1 << 5, + NVME_OACS_VMS = 1 << 7, NVME_OACS_DBBUF = 1 << 8, }; From 9cf6ec06592dea1973e66cd5cedf96fc59639047 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Dec 2024 13:53:04 +0100 Subject: [PATCH 04/10] hw/nvme: add knob for doorbell buffer config support Add a 'dbcs' knob to allow Doorbell Buffer Config command to be disabled. Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 11 ++++++++--- hw/nvme/nvme.h | 1 + include/block/nvme.h | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 4ee8588ca9..1ad76da943 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -278,7 +278,6 @@ static const uint32_t nvme_cse_acs_default[256] = { [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC, - [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP, @@ -8773,8 +8772,13 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); - oacs = NVME_OACS_NMS | NVME_OACS_FORMAT | NVME_OACS_DBBUF | - NVME_OACS_DIRECTIVES; + oacs = NVME_OACS_NMS | NVME_OACS_FORMAT | NVME_OACS_DIRECTIVES; + + if (n->params.dbcs) { + oacs |= NVME_OACS_DBCS; + + n->cse.acs[NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP; + } if (n->params.sriov_max_vfs) { oacs |= NVME_OACS_VMS; @@ -9024,6 +9028,7 @@ static const Property nvme_props[] = { DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false), + DEFINE_PROP_BOOL("dbcs", NvmeCtrl, params.dbcs, true), DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0), DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl, params.auto_transition_zones, true), diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index b86cad388f..b8d063a027 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -539,6 +539,7 @@ typedef struct NvmeParams { bool auto_transition_zones; bool legacy_cmb; bool ioeventfd; + bool dbcs; uint16_t sriov_max_vfs; uint16_t sriov_vq_flexible; uint16_t sriov_vi_flexible; diff --git a/include/block/nvme.h b/include/block/nvme.h index 80fbcb420d..63eb74460e 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1235,7 +1235,7 @@ enum NvmeIdCtrlOacs { NVME_OACS_NMS = 1 << 3, NVME_OACS_DIRECTIVES = 1 << 5, NVME_OACS_VMS = 1 << 7, - NVME_OACS_DBBUF = 1 << 8, + NVME_OACS_DBCS = 1 << 8, }; enum NvmeIdCtrlOncs { From b202fb549dc487c5611564e5d03286748586aa34 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Dec 2024 13:53:05 +0100 Subject: [PATCH 05/10] nvme: fix iocs status code values The status codes related to I/O Command Sets are in the wrong group. Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 4 ++-- include/block/nvme.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 1ad76da943..2b73f60160 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -5681,7 +5681,7 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active) return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req); } - return NVME_INVALID_CMD_SET | NVME_DNR; + return NVME_INVALID_IOCS | NVME_DNR; } static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req, @@ -6647,7 +6647,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) case NVME_COMMAND_SET_PROFILE: if (dw11 & 0x1ff) { trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff); - return NVME_CMD_SET_CMB_REJECTED | NVME_DNR; + return NVME_IOCS_COMBINATION_REJECTED | NVME_DNR; } break; case NVME_FDP_MODE: diff --git a/include/block/nvme.h b/include/block/nvme.h index 63eb74460e..aecfc9ce66 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -906,8 +906,6 @@ enum NvmeStatusCodes { NVME_SGL_DESCR_TYPE_INVALID = 0x0011, NVME_INVALID_USE_OF_CMB = 0x0012, NVME_INVALID_PRP_OFFSET = 0x0013, - NVME_CMD_SET_CMB_REJECTED = 0x002b, - NVME_INVALID_CMD_SET = 0x002c, NVME_FDP_DISABLED = 0x0029, NVME_INVALID_PHID_LIST = 0x002a, NVME_LBA_RANGE = 0x0080, @@ -940,6 +938,8 @@ enum NvmeStatusCodes { NVME_INVALID_SEC_CTRL_STATE = 0x0120, NVME_INVALID_NUM_RESOURCES = 0x0121, NVME_INVALID_RESOURCE_ID = 0x0122, + NVME_IOCS_COMBINATION_REJECTED = 0x012b, + NVME_INVALID_IOCS = 0x012c, NVME_CONFLICTING_ATTRS = 0x0180, NVME_INVALID_PROT_INFO = 0x0181, NVME_WRITE_TO_RO = 0x0182, From d96a32de3fd0880a9340590fd279288e42e983c1 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Dec 2024 13:53:06 +0100 Subject: [PATCH 06/10] hw/nvme: be compliant wrt. dsm processing limits The specification states that, > The controller shall set all three processing limit fields (i.e., the > DMRL, DMRSL and DMSL fields) to non-zero values or shall clear all > three processing limit fields to 0h. So, set the DMRL and DMSL fields in addition to DMRSL. Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 24 +++++++++++++++--------- include/block/nvme.h | 2 ++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 2b73f60160..86e1c48fab 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -5639,7 +5639,9 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) switch (c->csi) { case NVME_CSI_NVM: id_nvm->vsl = n->params.vsl; + id_nvm->dmrl = NVME_ID_CTRL_NVM_DMRL_MAX; id_nvm->dmrsl = cpu_to_le32(n->dmrsl); + id_nvm->dmsl = NVME_ID_CTRL_NVM_DMRL_MAX * n->dmrsl; break; case NVME_CSI_ZONED: @@ -6696,18 +6698,23 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req) return NVME_NO_COMPLETE; } -static void nvme_update_dmrsl(NvmeCtrl *n) +static void nvme_update_dsm_limits(NvmeCtrl *n, NvmeNamespace *ns) { - int nsid; + if (ns) { + n->dmrsl = + MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); - for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { - NvmeNamespace *ns = nvme_ns(n, nsid); + return; + } + + for (uint32_t nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { + ns = nvme_ns(n, nsid); if (!ns) { continue; } - n->dmrsl = MIN_NON_ZERO(n->dmrsl, - BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); + n->dmrsl = + MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); } } @@ -6795,7 +6802,7 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) ctrl->namespaces[nsid] = NULL; ns->attached--; - nvme_update_dmrsl(ctrl); + nvme_update_dsm_limits(ctrl, NULL); break; @@ -8902,8 +8909,7 @@ void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns) n->namespaces[nsid] = ns; ns->attached++; - n->dmrsl = MIN_NON_ZERO(n->dmrsl, - BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); + nvme_update_dsm_limits(n, ns); } static void nvme_realize(PCIDevice *pci_dev, Error **errp) diff --git a/include/block/nvme.h b/include/block/nvme.h index aecfc9ce66..763b2b2f0e 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1207,6 +1207,8 @@ typedef struct NvmeIdCtrlZoned { uint8_t rsvd1[4095]; } NvmeIdCtrlZoned; +#define NVME_ID_CTRL_NVM_DMRL_MAX 255 + typedef struct NvmeIdCtrlNvm { uint8_t vsl; uint8_t wzsl; From 6ccca4b6bb9f994cc04e71004e1767a3476d2b23 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Dec 2024 13:53:07 +0100 Subject: [PATCH 07/10] hw/nvme: rework csi handling The controller incorrectly allows a zoned namespace to be attached even if CS.CSS is configured to only support the NVM command set for I/O queues. Rework handling of namespace command sets in general by attaching supported namespaces when the controller is started instead of, like now, statically when realized. Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 213 ++++++++++++++++++++++++------------------- hw/nvme/ns.c | 14 --- hw/nvme/nvme.h | 5 +- include/block/nvme.h | 10 +- 4 files changed, 131 insertions(+), 111 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 86e1c48fab..21496c6b6b 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -277,15 +277,14 @@ static const uint32_t nvme_cse_acs_default[256] = { [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, - [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC, + [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC | + NVME_CMD_EFF_CCC, [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP, }; -static const uint32_t nvme_cse_iocs_none[256]; - -static const uint32_t nvme_cse_iocs_nvm[256] = { +static const uint32_t nvme_cse_iocs_nvm_default[256] = { [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, @@ -298,7 +297,7 @@ static const uint32_t nvme_cse_iocs_nvm[256] = { [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, }; -static const uint32_t nvme_cse_iocs_zoned[256] = { +static const uint32_t nvme_cse_iocs_zoned_default[256] = { [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, @@ -307,6 +306,9 @@ static const uint32_t nvme_cse_iocs_zoned[256] = { [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP, [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP, @@ -4603,6 +4605,61 @@ static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req) }; } +static uint16_t __nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req) +{ + switch (req->cmd.opcode) { + case NVME_CMD_WRITE: + return nvme_write(n, req); + case NVME_CMD_READ: + return nvme_read(n, req); + case NVME_CMD_COMPARE: + return nvme_compare(n, req); + case NVME_CMD_WRITE_ZEROES: + return nvme_write_zeroes(n, req); + case NVME_CMD_DSM: + return nvme_dsm(n, req); + case NVME_CMD_VERIFY: + return nvme_verify(n, req); + case NVME_CMD_COPY: + return nvme_copy(n, req); + case NVME_CMD_IO_MGMT_RECV: + return nvme_io_mgmt_recv(n, req); + case NVME_CMD_IO_MGMT_SEND: + return nvme_io_mgmt_send(n, req); + } + + g_assert_not_reached(); +} + +static uint16_t nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req) +{ + if (!(n->cse.iocs.nvm[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { + trace_pci_nvme_err_invalid_opc(req->cmd.opcode); + return NVME_INVALID_OPCODE | NVME_DNR; + } + + return __nvme_io_cmd_nvm(n, req); +} + +static uint16_t nvme_io_cmd_zoned(NvmeCtrl *n, NvmeRequest *req) +{ + if (!(n->cse.iocs.zoned[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { + trace_pci_nvme_err_invalid_opc(req->cmd.opcode); + return NVME_INVALID_OPCODE | NVME_DNR; + } + + switch (req->cmd.opcode) { + case NVME_CMD_ZONE_APPEND: + return nvme_zone_append(n, req); + case NVME_CMD_ZONE_MGMT_SEND: + return nvme_zone_mgmt_send(n, req); + case NVME_CMD_ZONE_MGMT_RECV: + return nvme_zone_mgmt_recv(n, req); + } + + return __nvme_io_cmd_nvm(n, req); +} + static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) { NvmeNamespace *ns; @@ -4644,11 +4701,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } - if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { - trace_pci_nvme_err_invalid_opc(req->cmd.opcode); - return NVME_INVALID_OPCODE | NVME_DNR; - } - if (ns->status) { return ns->status; } @@ -4659,36 +4711,14 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) req->ns = ns; - switch (req->cmd.opcode) { - case NVME_CMD_WRITE_ZEROES: - return nvme_write_zeroes(n, req); - case NVME_CMD_ZONE_APPEND: - return nvme_zone_append(n, req); - case NVME_CMD_WRITE: - return nvme_write(n, req); - case NVME_CMD_READ: - return nvme_read(n, req); - case NVME_CMD_COMPARE: - return nvme_compare(n, req); - case NVME_CMD_DSM: - return nvme_dsm(n, req); - case NVME_CMD_VERIFY: - return nvme_verify(n, req); - case NVME_CMD_COPY: - return nvme_copy(n, req); - case NVME_CMD_ZONE_MGMT_SEND: - return nvme_zone_mgmt_send(n, req); - case NVME_CMD_ZONE_MGMT_RECV: - return nvme_zone_mgmt_recv(n, req); - case NVME_CMD_IO_MGMT_RECV: - return nvme_io_mgmt_recv(n, req); - case NVME_CMD_IO_MGMT_SEND: - return nvme_io_mgmt_send(n, req); - default: - g_assert_not_reached(); + switch (ns->csi) { + case NVME_CSI_NVM: + return nvme_io_cmd_nvm(n, req); + case NVME_CSI_ZONED: + return nvme_io_cmd_zoned(n, req); } - return NVME_INVALID_OPCODE | NVME_DNR; + g_assert_not_reached(); } static void nvme_cq_notifier(EventNotifier *e) @@ -5147,7 +5177,7 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, uint64_t off, NvmeRequest *req) { NvmeEffectsLog log = {}; - const uint32_t *src_iocs = NULL; + const uint32_t *iocs = NULL; uint32_t trans_len; if (off >= sizeof(log)) { @@ -5157,25 +5187,26 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) { case NVME_CC_CSS_NVM: - src_iocs = nvme_cse_iocs_nvm; - /* fall through */ - case NVME_CC_CSS_ADMIN_ONLY: + iocs = n->cse.iocs.nvm; break; - case NVME_CC_CSS_CSI: + + case NVME_CC_CSS_ALL: switch (csi) { case NVME_CSI_NVM: - src_iocs = nvme_cse_iocs_nvm; + iocs = n->cse.iocs.nvm; break; case NVME_CSI_ZONED: - src_iocs = nvme_cse_iocs_zoned; + iocs = n->cse.iocs.zoned; break; } + + break; } memcpy(log.acs, n->cse.acs, sizeof(log.acs)); - if (src_iocs) { - memcpy(log.iocs, src_iocs, sizeof(log.iocs)); + if (iocs) { + memcpy(log.iocs, iocs, sizeof(log.iocs)); } trans_len = MIN(sizeof(log) - off, buf_len); @@ -6718,25 +6749,29 @@ static void nvme_update_dsm_limits(NvmeCtrl *n, NvmeNamespace *ns) } } -static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns) +static bool nvme_csi_supported(NvmeCtrl *n, uint8_t csi) { - uint32_t cc = ldl_le_p(&n->bar.cc); + uint32_t cc; - ns->iocs = nvme_cse_iocs_none; - switch (ns->csi) { + switch (csi) { case NVME_CSI_NVM: - if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) { - ns->iocs = nvme_cse_iocs_nvm; - } - break; + return true; + case NVME_CSI_ZONED: - if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) { - ns->iocs = nvme_cse_iocs_zoned; - } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) { - ns->iocs = nvme_cse_iocs_nvm; - } - break; + cc = ldl_le_p(&n->bar.cc); + + return NVME_CC_CSS(cc) == NVME_CC_CSS_ALL; } + + g_assert_not_reached(); +} + +static void nvme_detach_ns(NvmeCtrl *n, NvmeNamespace *ns) +{ + assert(ns->attached > 0); + + n->namespaces[ns->params.nsid] = NULL; + ns->attached--; } static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) @@ -6781,7 +6816,7 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) switch (sel) { case NVME_NS_ATTACHMENT_ATTACH: - if (nvme_ns(ctrl, nsid)) { + if (nvme_ns(n, nsid)) { return NVME_NS_ALREADY_ATTACHED | NVME_DNR; } @@ -6789,19 +6824,17 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) return NVME_NS_PRIVATE | NVME_DNR; } + if (!nvme_csi_supported(n, ns->csi)) { + return NVME_IOCS_NOT_SUPPORTED | NVME_DNR; + } + nvme_attach_ns(ctrl, ns); - nvme_select_iocs_ns(ctrl, ns); + nvme_update_dsm_limits(ctrl, ns); break; case NVME_NS_ATTACHMENT_DETACH: - if (!nvme_ns(ctrl, nsid)) { - return NVME_NS_NOT_ATTACHED | NVME_DNR; - } - - ctrl->namespaces[nsid] = NULL; - ns->attached--; - + nvme_detach_ns(ctrl, ns); nvme_update_dsm_limits(ctrl, NULL); break; @@ -7652,21 +7685,6 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n) } } -static void nvme_select_iocs(NvmeCtrl *n) -{ - NvmeNamespace *ns; - int i; - - for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { - ns = nvme_ns(n, i); - if (!ns) { - continue; - } - - nvme_select_iocs_ns(n, ns); - } -} - static int nvme_start_ctrl(NvmeCtrl *n) { uint64_t cap = ldq_le_p(&n->bar.cap); @@ -7733,7 +7751,18 @@ static int nvme_start_ctrl(NvmeCtrl *n) nvme_set_timestamp(n, 0ULL); - nvme_select_iocs(n); + /* verify that the command sets of attached namespaces are supported */ + for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) { + NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i); + + if (ns && nvme_csi_supported(n, ns->csi) && !ns->params.detached) { + if (!ns->attached || ns->params.shared) { + nvme_attach_ns(n, ns); + } + } + } + + nvme_update_dsm_limits(n, NULL); return 0; } @@ -8748,6 +8777,9 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) uint16_t oacs; memcpy(n->cse.acs, nvme_cse_acs_default, sizeof(n->cse.acs)); + memcpy(n->cse.iocs.nvm, nvme_cse_iocs_nvm_default, sizeof(n->cse.iocs.nvm)); + memcpy(n->cse.iocs.zoned, nvme_cse_iocs_zoned_default, + sizeof(n->cse.iocs.zoned)); id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID)); id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID)); @@ -8859,9 +8891,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_MQES(cap, n->params.mqes); NVME_CAP_SET_CQR(cap, 1); NVME_CAP_SET_TO(cap, 0xf); - NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM); - NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP); - NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY); + NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NCSS); + NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_IOCSS); NVME_CAP_SET_MPSMAX(cap, 4); NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0); NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0); @@ -8908,8 +8939,6 @@ void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns) n->namespaces[nsid] = ns; ns->attached++; - - nvme_update_dsm_limits(n, ns); } static void nvme_realize(PCIDevice *pci_dev, Error **errp) @@ -8965,7 +8994,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) return; } - nvme_attach_ns(n, ns); + n->subsys->namespaces[ns->params.nsid] = ns; } } diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c index 94cabc6a5b..98c1e75a5d 100644 --- a/hw/nvme/ns.c +++ b/hw/nvme/ns.c @@ -763,20 +763,6 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) ns->id_ns.endgid = cpu_to_le16(0x1); ns->id_ns_ind.endgrpid = cpu_to_le16(0x1); - - if (ns->params.detached) { - return; - } - - if (ns->params.shared) { - for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) { - NvmeCtrl *ctrl = subsys->ctrls[i]; - - if (ctrl && ctrl != SUBSYS_SLOT_RSVD) { - nvme_attach_ns(ctrl, ns); - } - } - } } static const Property nvme_ns_props[] = { diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index b8d063a027..6f782ba188 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -237,7 +237,6 @@ typedef struct NvmeNamespace { NvmeLBAF lbaf; unsigned int nlbaf; size_t lbasz; - const uint32_t *iocs; uint8_t csi; uint16_t status; int attached; @@ -587,6 +586,10 @@ typedef struct NvmeCtrl { struct { uint32_t acs[256]; + struct { + uint32_t nvm[256]; + uint32_t zoned[256]; + } iocs; } cse; struct { diff --git a/include/block/nvme.h b/include/block/nvme.h index 763b2b2f0e..366739f79e 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -142,9 +142,9 @@ enum NvmeCapMask { ((cap) |= (uint64_t)((val) & CAP_CMBS_MASK) << CAP_CMBS_SHIFT) enum NvmeCapCss { - NVME_CAP_CSS_NVM = 1 << 0, - NVME_CAP_CSS_CSI_SUPP = 1 << 6, - NVME_CAP_CSS_ADMIN_ONLY = 1 << 7, + NVME_CAP_CSS_NCSS = 1 << 0, + NVME_CAP_CSS_IOCSS = 1 << 6, + NVME_CAP_CSS_NOIOCSS = 1 << 7, }; enum NvmeCcShift { @@ -177,7 +177,7 @@ enum NvmeCcMask { enum NvmeCcCss { NVME_CC_CSS_NVM = 0x0, - NVME_CC_CSS_CSI = 0x6, + NVME_CC_CSS_ALL = 0x6, NVME_CC_CSS_ADMIN_ONLY = 0x7, }; @@ -938,6 +938,8 @@ enum NvmeStatusCodes { NVME_INVALID_SEC_CTRL_STATE = 0x0120, NVME_INVALID_NUM_RESOURCES = 0x0121, NVME_INVALID_RESOURCE_ID = 0x0122, + NVME_IOCS_NOT_SUPPORTED = 0x0129, + NVME_IOCS_NOT_ENABLED = 0x012a, NVME_IOCS_COMBINATION_REJECTED = 0x012b, NVME_INVALID_IOCS = 0x012c, NVME_CONFLICTING_ATTRS = 0x0180, From 304babd9401d8ce8fe139a70fe464332eef2cee0 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Dec 2024 13:53:08 +0100 Subject: [PATCH 08/10] hw/nvme: only set command abort requested when cancelled due to Abort The Command Abort Requested status code should only be set if the command was explicitly cancelled due to an Abort command. Or, in the case the cancel was due to Submission Queue deletion, set the status code to Command Aborted due to SQ Deletion. Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 21496c6b6b..07cd632985 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1783,10 +1783,6 @@ static void nvme_aio_err(NvmeRequest *req, int ret) break; } - if (ret == -ECANCELED) { - status = NVME_CMD_ABORT_REQ; - } - trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status); error_setg_errno(&local_err, -ret, "aio failed"); @@ -4827,6 +4823,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) while (!QTAILQ_EMPTY(&sq->out_req_list)) { r = QTAILQ_FIRST(&sq->out_req_list); assert(r->aiocb); + r->status = NVME_CMD_ABORT_SQ_DEL; blk_aio_cancel(r->aiocb); } @@ -6137,6 +6134,7 @@ static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req) QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) { if (r->cqe.cid == cid) { if (r->aiocb) { + r->status = NVME_CMD_ABORT_REQ; blk_aio_cancel_async(r->aiocb); } break; From 6fc39228ffe9d54388f6d1080b502634df13bb72 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Dec 2024 13:53:09 +0100 Subject: [PATCH 09/10] hw/nvme: set error status code explicitly for misc commands The nvme_aio_err() does not handle Verify, Compare, Copy and other misc commands and defaults to setting the error status code to Internal Device Error. For some of these commands, we know better, so set it explicitly. For the commands using the nvme_misc_cb() callback (Copy, Flush, ...), if no status code has explicitly been set by the lower handlers, default to Internal Device Error as previously. Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 28 ++++++++++++++++++++++------ include/block/nvme.h | 1 + 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 07cd632985..b7222fd9ac 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1771,7 +1771,6 @@ static void nvme_aio_err(NvmeRequest *req, int ret) case NVME_CMD_READ: status = NVME_UNRECOVERED_READ; break; - case NVME_CMD_FLUSH: case NVME_CMD_WRITE: case NVME_CMD_WRITE_ZEROES: case NVME_CMD_ZONE_APPEND: @@ -2157,11 +2156,16 @@ static inline bool nvme_is_write(NvmeRequest *req) static void nvme_misc_cb(void *opaque, int ret) { NvmeRequest *req = opaque; + uint16_t cid = nvme_cid(req); - trace_pci_nvme_misc_cb(nvme_cid(req)); + trace_pci_nvme_misc_cb(cid); if (ret) { - nvme_aio_err(req, ret); + if (!req->status) { + req->status = NVME_INTERNAL_DEV_ERROR; + } + + trace_pci_nvme_err_aio(cid, strerror(-ret), req->status); } nvme_enqueue_req_completion(nvme_cq(req), req); @@ -2264,7 +2268,10 @@ static void nvme_verify_cb(void *opaque, int ret) if (ret) { block_acct_failed(stats, acct); - nvme_aio_err(req, ret); + req->status = NVME_UNRECOVERED_READ; + + trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status); + goto out; } @@ -2363,7 +2370,10 @@ static void nvme_compare_mdata_cb(void *opaque, int ret) if (ret) { block_acct_failed(stats, acct); - nvme_aio_err(req, ret); + req->status = NVME_UNRECOVERED_READ; + + trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status); + goto out; } @@ -2445,7 +2455,10 @@ static void nvme_compare_data_cb(void *opaque, int ret) if (ret) { block_acct_failed(stats, acct); - nvme_aio_err(req, ret); + req->status = NVME_UNRECOVERED_READ; + + trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status); + goto out; } @@ -2924,6 +2937,7 @@ static void nvme_copy_out_completed_cb(void *opaque, int ret) if (ret < 0) { iocb->ret = ret; + req->status = NVME_WRITE_FAULT; goto out; } else if (iocb->ret < 0) { goto out; @@ -2988,6 +3002,7 @@ static void nvme_copy_in_completed_cb(void *opaque, int ret) if (ret < 0) { iocb->ret = ret; + req->status = NVME_UNRECOVERED_READ; goto out; } else if (iocb->ret < 0) { goto out; @@ -3510,6 +3525,7 @@ static void nvme_flush_ns_cb(void *opaque, int ret) if (ret < 0) { iocb->ret = ret; + iocb->req->status = NVME_WRITE_FAULT; goto out; } else if (iocb->ret < 0) { goto out; diff --git a/include/block/nvme.h b/include/block/nvme.h index 366739f79e..358e516e38 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -906,6 +906,7 @@ enum NvmeStatusCodes { NVME_SGL_DESCR_TYPE_INVALID = 0x0011, NVME_INVALID_USE_OF_CMB = 0x0012, NVME_INVALID_PRP_OFFSET = 0x0013, + NVME_COMMAND_INTERRUPTED = 0x0021, NVME_FDP_DISABLED = 0x0029, NVME_INVALID_PHID_LIST = 0x002a, NVME_LBA_RANGE = 0x0080, From cad58ada8f104bf342097a7a683ef594ac949c8d Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Dec 2024 13:53:10 +0100 Subject: [PATCH 10/10] hw/nvme: remove nvme_aio_err() nvme_rw_complete_cb() is the only remaining user of nvme_aio_err(), so open code the status code setting instead. Reviewed-by: Jesper Wendel Devantier Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 60 +++++++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index b7222fd9ac..e62c6a3588 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1762,42 +1762,6 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, return NVME_SUCCESS; } -static void nvme_aio_err(NvmeRequest *req, int ret) -{ - uint16_t status = NVME_SUCCESS; - Error *local_err = NULL; - - switch (req->cmd.opcode) { - case NVME_CMD_READ: - status = NVME_UNRECOVERED_READ; - break; - case NVME_CMD_WRITE: - case NVME_CMD_WRITE_ZEROES: - case NVME_CMD_ZONE_APPEND: - case NVME_CMD_COPY: - status = NVME_WRITE_FAULT; - break; - default: - status = NVME_INTERNAL_DEV_ERROR; - break; - } - - trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status); - - error_setg_errno(&local_err, -ret, "aio failed"); - error_report_err(local_err); - - /* - * Set the command status code to the first encountered error but allow a - * subsequent Internal Device Error to trump it. - */ - if (req->status && status != NVME_INTERNAL_DEV_ERROR) { - return; - } - - req->status = status; -} - static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba) { return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 : @@ -2182,8 +2146,30 @@ void nvme_rw_complete_cb(void *opaque, int ret) trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk)); if (ret) { + Error *err = NULL; + block_acct_failed(stats, acct); - nvme_aio_err(req, ret); + + switch (req->cmd.opcode) { + case NVME_CMD_READ: + req->status = NVME_UNRECOVERED_READ; + break; + + case NVME_CMD_WRITE: + case NVME_CMD_WRITE_ZEROES: + case NVME_CMD_ZONE_APPEND: + req->status = NVME_WRITE_FAULT; + break; + + default: + req->status = NVME_INTERNAL_DEV_ERROR; + break; + } + + trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status); + + error_setg_errno(&err, -ret, "aio failed"); + error_report_err(err); } else { block_acct_done(stats, acct); }