From 45c171ad98883b1716d1ec238e87aaa1653c779c Mon Sep 17 00:00:00 2001 From: Lee Miller Date: Sat, 3 Feb 2024 04:40:39 +0200 Subject: [PATCH] Remove custom kmod-nvme package - use upstream one --- .buildbot/openwrt/build.sh | 1 - feed/kmod-nvme/Makefile | 32 - feed/kmod-nvme/src/Makefile | 28 - feed/kmod-nvme/src/core.c | 4831 ---------------------------------- feed/kmod-nvme/src/fabrics.h | 192 -- feed/kmod-nvme/src/nvme.h | 893 ------- feed/kmod-nvme/src/pci.c | 3310 ----------------------- feed/kmod-nvme/src/trace.h | 175 -- 8 files changed, 9462 deletions(-) delete mode 100644 feed/kmod-nvme/Makefile delete mode 100644 feed/kmod-nvme/src/Makefile delete mode 100644 feed/kmod-nvme/src/core.c delete mode 100644 feed/kmod-nvme/src/fabrics.h delete mode 100644 feed/kmod-nvme/src/nvme.h delete mode 100644 feed/kmod-nvme/src/pci.c delete mode 100644 feed/kmod-nvme/src/trace.h diff --git a/.buildbot/openwrt/build.sh b/.buildbot/openwrt/build.sh index 76f59f7..c79f633 100755 --- a/.buildbot/openwrt/build.sh +++ b/.buildbot/openwrt/build.sh @@ -37,7 +37,6 @@ echo "CONFIG_PACKAGE_COLLECTD_ENCRYPTED_NETWORK=y" >> .config ${MAKE} package/i2c-tools/compile make package/kmod-i2c-mux-pinctrl/compile -make package/kmod-nvme/compile make package/kmod-rtc-pcf85063/compile make package/nvme-cli/compile make package/ansible-core/compile diff --git a/feed/kmod-nvme/Makefile b/feed/kmod-nvme/Makefile deleted file mode 100644 index 1d29026..0000000 --- a/feed/kmod-nvme/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -include $(TOPDIR)/rules.mk -include $(INCLUDE_DIR)/kernel.mk - -PKG_NAME:=nvme - -include $(INCLUDE_DIR)/package.mk - -define KernelPackage/$(PKG_NAME) - SUBMENU:=$(BLOCK_MENU) - TITLE:=NVM Express block device - DEPENDS:=@PCI_SUPPORT - FILES:= \ - $(PKG_BUILD_DIR)/nvme-core.ko \ - $(PKG_BUILD_DIR)/nvme.ko - AUTOLOAD:=$(call AutoLoad,30,nvme-core nvme) - KCONFIG:= -endef - -define KernelPackage/nvme/description - Kernel module for NVM Express solid state drives directly - connected to the PCI or PCI Express bus. -endef - -EXTRA_KCONFIG:= \ - CONFIG_NVME_CORE=m \ - CONFIG_BLK_DEV_NVME=m \ - CONFIG_NVME_MULTIPATH=n \ - CONFIG_NVME_HWMON=n - -include ../kmod.mk - -$(eval $(call KernelPackage,$(PKG_NAME))) diff --git a/feed/kmod-nvme/src/Makefile b/feed/kmod-nvme/src/Makefile deleted file mode 100644 index d7f6a87..0000000 --- a/feed/kmod-nvme/src/Makefile +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -ccflags-y += -I$(src) - -obj-$(CONFIG_NVME_CORE) += nvme-core.o -obj-$(CONFIG_BLK_DEV_NVME) += nvme.o -obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o -obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o -obj-$(CONFIG_NVME_FC) += nvme-fc.o -obj-$(CONFIG_NVME_TCP) += nvme-tcp.o - -nvme-core-y := core.o -nvme-core-$(CONFIG_TRACING) += trace.o -nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o -nvme-core-$(CONFIG_NVM) += lightnvm.o -nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o -nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o -nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o - -nvme-y += pci.o - -nvme-fabrics-y += fabrics.o - -nvme-rdma-y += rdma.o - -nvme-fc-y += fc.o - -nvme-tcp-y += tcp.o diff --git a/feed/kmod-nvme/src/core.c b/feed/kmod-nvme/src/core.c deleted file mode 100644 index ab060b4..0000000 --- a/feed/kmod-nvme/src/core.c +++ /dev/null @@ -1,4831 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NVM Express device driver - * Copyright (c) 2011-2014, Intel Corporation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "nvme.h" -#include "fabrics.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -#define NVME_MINORS (1U << MINORBITS) - -unsigned int admin_timeout = 60; -module_param(admin_timeout, uint, 0644); -MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); -EXPORT_SYMBOL_GPL(admin_timeout); - -unsigned int nvme_io_timeout = 30; -module_param_named(io_timeout, nvme_io_timeout, uint, 0644); -MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); -EXPORT_SYMBOL_GPL(nvme_io_timeout); - -static unsigned char shutdown_timeout = 5; -module_param(shutdown_timeout, byte, 0644); -MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); - -static u8 nvme_max_retries = 5; -module_param_named(max_retries, nvme_max_retries, byte, 0644); -MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); - -static unsigned long default_ps_max_latency_us = 100000; -module_param(default_ps_max_latency_us, ulong, 0644); -MODULE_PARM_DESC(default_ps_max_latency_us, - "max power saving latency for new devices; use PM QOS to change per device"); - -static bool force_apst; -module_param(force_apst, bool, 0644); -MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); - -static bool streams; -module_param(streams, bool, 0644); -MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); - -/* - * nvme_wq - hosts nvme related works that are not reset or delete - * nvme_reset_wq - hosts nvme reset works - * nvme_delete_wq - hosts nvme delete works - * - * nvme_wq will host works such as scan, aen handling, fw activation, - * keep-alive, periodic reconnects etc. nvme_reset_wq - * runs reset works which also flush works hosted on nvme_wq for - * serialization purposes. nvme_delete_wq host controller deletion - * works which flush reset works for serialization. - */ -struct workqueue_struct *nvme_wq; -EXPORT_SYMBOL_GPL(nvme_wq); - -struct workqueue_struct *nvme_reset_wq; -EXPORT_SYMBOL_GPL(nvme_reset_wq); - -struct workqueue_struct *nvme_delete_wq; -EXPORT_SYMBOL_GPL(nvme_delete_wq); - -static LIST_HEAD(nvme_subsystems); -static DEFINE_MUTEX(nvme_subsystems_lock); - -static DEFINE_IDA(nvme_instance_ida); -static dev_t nvme_chr_devt; -static struct class *nvme_class; -static struct class *nvme_subsys_class; - -static void nvme_put_subsystem(struct nvme_subsystem *subsys); -static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, - unsigned nsid); - -static void nvme_update_bdev_size(struct gendisk *disk) -{ - struct block_device *bdev = bdget_disk(disk, 0); - - if (bdev) { - bd_set_nr_sectors(bdev, get_capacity(disk)); - bdput(bdev); - } -} - -/* - * Prepare a queue for teardown. - * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. 
- */ -static void nvme_set_queue_dying(struct nvme_ns *ns) -{ - if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - return; - - blk_set_queue_dying(ns->queue); - blk_mq_unquiesce_queue(ns->queue); - - set_capacity(ns->disk, 0); - nvme_update_bdev_size(ns->disk); -} - -static void nvme_queue_scan(struct nvme_ctrl *ctrl) -{ - /* - * Only new queue scan work when admin and IO queues are both alive - */ - if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset) - queue_work(nvme_wq, &ctrl->scan_work); -} - -/* - * Use this function to proceed with scheduling reset_work for a controller - * that had previously been set to the resetting state. This is intended for - * code paths that can't be interrupted by other reset attempts. A hot removal - * may prevent this from succeeding. - */ -int nvme_try_sched_reset(struct nvme_ctrl *ctrl) -{ - if (ctrl->state != NVME_CTRL_RESETTING) - return -EBUSY; - if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) - return -EBUSY; - return 0; -} -EXPORT_SYMBOL_GPL(nvme_try_sched_reset); - -int nvme_reset_ctrl(struct nvme_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) - return -EBUSY; - return 0; -} -EXPORT_SYMBOL_GPL(nvme_reset_ctrl); - -int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) -{ - int ret; - - ret = nvme_reset_ctrl(ctrl); - if (!ret) { - flush_work(&ctrl->reset_work); - if (ctrl->state != NVME_CTRL_LIVE) - ret = -ENETRESET; - } - - return ret; -} -EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); - -static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) -{ - dev_info(ctrl->device, - "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn); - - flush_work(&ctrl->reset_work); - nvme_stop_ctrl(ctrl); - nvme_remove_namespaces(ctrl); - ctrl->ops->delete_ctrl(ctrl); - nvme_uninit_ctrl(ctrl); -} - -static void nvme_delete_ctrl_work(struct work_struct *work) -{ - struct nvme_ctrl *ctrl = - container_of(work, struct nvme_ctrl, delete_work); - - nvme_do_delete_ctrl(ctrl); -} - -int nvme_delete_ctrl(struct nvme_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) - return -EBUSY; - return 0; -} -EXPORT_SYMBOL_GPL(nvme_delete_ctrl); - -static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) -{ - /* - * Keep a reference until nvme_do_delete_ctrl() complete, - * since ->delete_ctrl can free the controller. 
- */ - nvme_get_ctrl(ctrl); - if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) - nvme_do_delete_ctrl(ctrl); - nvme_put_ctrl(ctrl); -} - -static blk_status_t nvme_error_status(u16 status) -{ - switch (status & 0x7ff) { - case NVME_SC_SUCCESS: - return BLK_STS_OK; - case NVME_SC_CAP_EXCEEDED: - return BLK_STS_NOSPC; - case NVME_SC_LBA_RANGE: - case NVME_SC_CMD_INTERRUPTED: - case NVME_SC_NS_NOT_READY: - return BLK_STS_TARGET; - case NVME_SC_BAD_ATTRIBUTES: - case NVME_SC_ONCS_NOT_SUPPORTED: - case NVME_SC_INVALID_OPCODE: - case NVME_SC_INVALID_FIELD: - case NVME_SC_INVALID_NS: - return BLK_STS_NOTSUPP; - case NVME_SC_WRITE_FAULT: - case NVME_SC_READ_ERROR: - case NVME_SC_UNWRITTEN_BLOCK: - case NVME_SC_ACCESS_DENIED: - case NVME_SC_READ_ONLY: - case NVME_SC_COMPARE_FAILED: - return BLK_STS_MEDIUM; - case NVME_SC_GUARD_CHECK: - case NVME_SC_APPTAG_CHECK: - case NVME_SC_REFTAG_CHECK: - case NVME_SC_INVALID_PI: - return BLK_STS_PROTECTION; - case NVME_SC_RESERVATION_CONFLICT: - return BLK_STS_NEXUS; - case NVME_SC_HOST_PATH_ERROR: - return BLK_STS_TRANSPORT; - case NVME_SC_ZONE_TOO_MANY_ACTIVE: - return BLK_STS_ZONE_ACTIVE_RESOURCE; - case NVME_SC_ZONE_TOO_MANY_OPEN: - return BLK_STS_ZONE_OPEN_RESOURCE; - default: - return BLK_STS_IOERR; - } -} - -static void nvme_retry_req(struct request *req) -{ - struct nvme_ns *ns = req->q->queuedata; - unsigned long delay = 0; - u16 crd; - - /* The mask and shift result must be <= 3 */ - crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; - if (ns && crd) - delay = ns->ctrl->crdt[crd - 1] * 100; - - nvme_req(req)->retries++; - blk_mq_requeue_request(req, false); - blk_mq_delay_kick_requeue_list(req->q, delay); -} - -enum nvme_disposition { - COMPLETE, - RETRY, - FAILOVER, -}; - -static inline enum nvme_disposition nvme_decide_disposition(struct request *req) -{ - if (likely(nvme_req(req)->status == 0)) - return COMPLETE; - - if (blk_noretry_request(req) || - (nvme_req(req)->status & NVME_SC_DNR) || - nvme_req(req)->retries >= nvme_max_retries) - return COMPLETE; - - if (req->cmd_flags & REQ_NVME_MPATH) { - if (nvme_is_path_error(nvme_req(req)->status) || - blk_queue_dying(req->q)) - return FAILOVER; - } else { - if (blk_queue_dying(req->q)) - return COMPLETE; - } - - return RETRY; -} - -static inline void nvme_end_req(struct request *req) -{ - blk_status_t status = nvme_error_status(nvme_req(req)->status); - - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - req_op(req) == REQ_OP_ZONE_APPEND) - req->__sector = nvme_lba_to_sect(req->q->queuedata, - le64_to_cpu(nvme_req(req)->result.u64)); - - nvme_trace_bio_complete(req, status); - blk_mq_end_request(req, status); -} - -void nvme_complete_rq(struct request *req) -{ - trace_nvme_complete_rq(req); - nvme_cleanup_cmd(req); - - if (nvme_req(req)->ctrl->kas) - nvme_req(req)->ctrl->comp_seen = true; - - switch (nvme_decide_disposition(req)) { - case COMPLETE: - nvme_end_req(req); - return; - case RETRY: - nvme_retry_req(req); - return; - case FAILOVER: - nvme_failover_req(req); - return; - } -} -EXPORT_SYMBOL_GPL(nvme_complete_rq); - -bool nvme_cancel_request(struct request *req, void *data, bool reserved) -{ - dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, - "Cancelling I/O %d", req->tag); - - /* don't abort one completed request */ - if (blk_mq_request_completed(req)) - return true; - - nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; - nvme_req(req)->flags |= NVME_REQ_CANCELLED; - blk_mq_complete_request(req); - return true; -} -EXPORT_SYMBOL_GPL(nvme_cancel_request); - -void nvme_cancel_tagset(struct 
nvme_ctrl *ctrl) -{ - if (ctrl->tagset) { - blk_mq_tagset_busy_iter(ctrl->tagset, - nvme_cancel_request, ctrl); - blk_mq_tagset_wait_completed_request(ctrl->tagset); - } -} -EXPORT_SYMBOL_GPL(nvme_cancel_tagset); - -void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl) -{ - if (ctrl->admin_tagset) { - blk_mq_tagset_busy_iter(ctrl->admin_tagset, - nvme_cancel_request, ctrl); - blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); - } -} -EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset); - -bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, - enum nvme_ctrl_state new_state) -{ - enum nvme_ctrl_state old_state; - unsigned long flags; - bool changed = false; - - spin_lock_irqsave(&ctrl->lock, flags); - - old_state = ctrl->state; - switch (new_state) { - case NVME_CTRL_LIVE: - switch (old_state) { - case NVME_CTRL_NEW: - case NVME_CTRL_RESETTING: - case NVME_CTRL_CONNECTING: - changed = true; - fallthrough; - default: - break; - } - break; - case NVME_CTRL_RESETTING: - switch (old_state) { - case NVME_CTRL_NEW: - case NVME_CTRL_LIVE: - changed = true; - fallthrough; - default: - break; - } - break; - case NVME_CTRL_CONNECTING: - switch (old_state) { - case NVME_CTRL_NEW: - case NVME_CTRL_RESETTING: - changed = true; - fallthrough; - default: - break; - } - break; - case NVME_CTRL_DELETING: - switch (old_state) { - case NVME_CTRL_LIVE: - case NVME_CTRL_RESETTING: - case NVME_CTRL_CONNECTING: - changed = true; - fallthrough; - default: - break; - } - break; - case NVME_CTRL_DELETING_NOIO: - switch (old_state) { - case NVME_CTRL_DELETING: - case NVME_CTRL_DEAD: - changed = true; - fallthrough; - default: - break; - } - break; - case NVME_CTRL_DEAD: - switch (old_state) { - case NVME_CTRL_DELETING: - changed = true; - fallthrough; - default: - break; - } - break; - default: - break; - } - - if (changed) { - ctrl->state = new_state; - wake_up_all(&ctrl->state_wq); - } - - spin_unlock_irqrestore(&ctrl->lock, flags); - if (changed && ctrl->state == NVME_CTRL_LIVE) - nvme_kick_requeue_lists(ctrl); - return changed; -} -EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); - -/* - * Returns true for sink states that can't ever transition back to live. - */ -static bool nvme_state_terminal(struct nvme_ctrl *ctrl) -{ - switch (ctrl->state) { - case NVME_CTRL_NEW: - case NVME_CTRL_LIVE: - case NVME_CTRL_RESETTING: - case NVME_CTRL_CONNECTING: - return false; - case NVME_CTRL_DELETING: - case NVME_CTRL_DELETING_NOIO: - case NVME_CTRL_DEAD: - return true; - default: - WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state); - return true; - } -} - -/* - * Waits for the controller state to be resetting, or returns false if it is - * not possible to ever transition to that state. 
- */ -bool nvme_wait_reset(struct nvme_ctrl *ctrl) -{ - wait_event(ctrl->state_wq, - nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) || - nvme_state_terminal(ctrl)); - return ctrl->state == NVME_CTRL_RESETTING; -} -EXPORT_SYMBOL_GPL(nvme_wait_reset); - -static void nvme_free_ns_head(struct kref *ref) -{ - struct nvme_ns_head *head = - container_of(ref, struct nvme_ns_head, ref); - - nvme_mpath_remove_disk(head); - ida_simple_remove(&head->subsys->ns_ida, head->instance); - cleanup_srcu_struct(&head->srcu); - nvme_put_subsystem(head->subsys); - kfree(head); -} - -static void nvme_put_ns_head(struct nvme_ns_head *head) -{ - kref_put(&head->ref, nvme_free_ns_head); -} - -static void nvme_free_ns(struct kref *kref) -{ - struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - - if (ns->ndev) - nvme_nvm_unregister(ns); - - put_disk(ns->disk); - nvme_put_ns_head(ns->head); - nvme_put_ctrl(ns->ctrl); - kfree(ns); -} - -void nvme_put_ns(struct nvme_ns *ns) -{ - kref_put(&ns->kref, nvme_free_ns); -} -EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); - -static inline void nvme_clear_nvme_request(struct request *req) -{ - nvme_req(req)->retries = 0; - nvme_req(req)->flags = 0; - req->rq_flags |= RQF_DONTPREP; -} - -static inline unsigned int nvme_req_op(struct nvme_command *cmd) -{ - return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; -} - -static inline void nvme_init_request(struct request *req, - struct nvme_command *cmd) -{ - if (req->q->queuedata) - req->timeout = NVME_IO_TIMEOUT; - else /* no queuedata implies admin queue */ - req->timeout = ADMIN_TIMEOUT; - - req->cmd_flags |= REQ_FAILFAST_DRIVER; - nvme_clear_nvme_request(req); - nvme_req(req)->cmd = cmd; -} - -struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags) -{ - struct request *req; - - req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); - if (!IS_ERR(req)) - nvme_init_request(req, cmd); - return req; -} -EXPORT_SYMBOL_GPL(nvme_alloc_request); - -struct request *nvme_alloc_request_qid(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) -{ - struct request *req; - - req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, - qid ? qid - 1 : 0); - if (!IS_ERR(req)) - nvme_init_request(req, cmd); - return req; -} -EXPORT_SYMBOL_GPL(nvme_alloc_request_qid); - -static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - - c.directive.opcode = nvme_admin_directive_send; - c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); - c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; - c.directive.dtype = NVME_DIR_IDENTIFY; - c.directive.tdtype = NVME_DIR_STREAMS; - c.directive.endir = enable ? 
NVME_DIR_ENDIR : 0; - - return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); -} - -static int nvme_disable_streams(struct nvme_ctrl *ctrl) -{ - return nvme_toggle_streams(ctrl, false); -} - -static int nvme_enable_streams(struct nvme_ctrl *ctrl) -{ - return nvme_toggle_streams(ctrl, true); -} - -static int nvme_get_stream_params(struct nvme_ctrl *ctrl, - struct streams_directive_params *s, u32 nsid) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - memset(s, 0, sizeof(*s)); - - c.directive.opcode = nvme_admin_directive_recv; - c.directive.nsid = cpu_to_le32(nsid); - c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s))); - c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; - c.directive.dtype = NVME_DIR_STREAMS; - - return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); -} - -static int nvme_configure_directives(struct nvme_ctrl *ctrl) -{ - struct streams_directive_params s; - int ret; - - if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) - return 0; - if (!streams) - return 0; - - ret = nvme_enable_streams(ctrl); - if (ret) - return ret; - - ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); - if (ret) - goto out_disable_stream; - - ctrl->nssa = le16_to_cpu(s.nssa); - if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { - dev_info(ctrl->device, "too few streams (%u) available\n", - ctrl->nssa); - goto out_disable_stream; - } - - ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); - dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); - return 0; - -out_disable_stream: - nvme_disable_streams(ctrl); - return ret; -} - -/* - * Check if 'req' has a write hint associated with it. If it does, assign - * a valid namespace stream to the write. - */ -static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, - struct request *req, u16 *control, - u32 *dsmgmt) -{ - enum rw_hint streamid = req->write_hint; - - if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) - streamid = 0; - else { - streamid--; - if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) - return; - - *control |= NVME_RW_DTYPE_STREAMS; - *dsmgmt |= streamid << 16; - } - - if (streamid < ARRAY_SIZE(req->q->write_hints)) - req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; -} - -static inline void nvme_setup_passthrough(struct request *req, - struct nvme_command *cmd) -{ - memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); - /* passthru commands should let the driver set the SGL flags */ - cmd->common.flags &= ~NVME_CMD_SGL_ALL; -} - -static inline void nvme_setup_flush(struct nvme_ns *ns, - struct nvme_command *cmnd) -{ - cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); -} - -static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmnd) -{ - unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; - struct nvme_dsm_range *range; - struct bio *bio; - - /* - * Some devices do not consider the DSM 'Number of Ranges' field when - * determining how much data to DMA. Always allocate memory for maximum - * number of segments to prevent device reading beyond end of buffer. - */ - static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES; - - range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN); - if (!range) { - /* - * If we fail allocation our range, fallback to the controller - * discard page. If that's also busy, it's safe to return - * busy, as we know we can make progress once that's freed. 
- */ - if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy)) - return BLK_STS_RESOURCE; - - range = page_address(ns->ctrl->discard_page); - } - - __rq_for_each_bio(bio, req) { - u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); - u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; - - if (n < segments) { - range[n].cattr = cpu_to_le32(0); - range[n].nlb = cpu_to_le32(nlb); - range[n].slba = cpu_to_le64(slba); - } - n++; - } - - if (WARN_ON_ONCE(n != segments)) { - if (virt_to_page(range) == ns->ctrl->discard_page) - clear_bit_unlock(0, &ns->ctrl->discard_page_busy); - else - kfree(range); - return BLK_STS_IOERR; - } - - cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); - cmnd->dsm.nr = cpu_to_le32(segments - 1); - cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - - req->special_vec.bv_page = virt_to_page(range); - req->special_vec.bv_offset = offset_in_page(range); - req->special_vec.bv_len = alloc_size; - req->rq_flags |= RQF_SPECIAL_PAYLOAD; - - return BLK_STS_OK; -} - -static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) -{ - if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) - return nvme_setup_discard(ns, req, cmnd); - - cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; - cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); - cmnd->write_zeroes.slba = - cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); - cmnd->write_zeroes.length = - cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - if (nvme_ns_has_pi(ns)) - cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT); - else - cmnd->write_zeroes.control = 0; - return BLK_STS_OK; -} - -static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd, - enum nvme_opcode op) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - u16 control = 0; - u32 dsmgmt = 0; - - if (req->cmd_flags & REQ_FUA) - control |= NVME_RW_FUA; - if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - if (req->cmd_flags & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - cmnd->rw.opcode = op; - cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); - cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); - cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - - if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) - nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); - - if (ns->ms) { - /* - * If formated with metadata, the block layer always provides a - * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else - * we enable the PRACT bit for protection information or set the - * namespace capacity to zero to prevent any I/O. 
- */ - if (!blk_integrity_rq(req)) { - if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) - return BLK_STS_NOTSUPP; - control |= NVME_RW_PRINFO_PRACT; - } - - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; - if (op == nvme_cmd_zone_append) - control |= NVME_RW_APPEND_PIREMAP; - cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); - break; - } - } - - cmnd->rw.control = cpu_to_le16(control); - cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - return 0; -} - -void nvme_cleanup_cmd(struct request *req) -{ - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - struct nvme_ns *ns = req->rq_disk->private_data; - struct page *page = req->special_vec.bv_page; - - if (page == ns->ctrl->discard_page) - clear_bit_unlock(0, &ns->ctrl->discard_page_busy); - else - kfree(page_address(page) + req->special_vec.bv_offset); - } -} -EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); - -blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmd) -{ - struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; - blk_status_t ret = BLK_STS_OK; - - if (!(req->rq_flags & RQF_DONTPREP)) - nvme_clear_nvme_request(req); - - memset(cmd, 0, sizeof(*cmd)); - switch (req_op(req)) { - case REQ_OP_DRV_IN: - case REQ_OP_DRV_OUT: - nvme_setup_passthrough(req, cmd); - break; - case REQ_OP_FLUSH: - nvme_setup_flush(ns, cmd); - break; - case REQ_OP_ZONE_RESET_ALL: - case REQ_OP_ZONE_RESET: - ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); - break; - case REQ_OP_ZONE_OPEN: - ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); - break; - case REQ_OP_ZONE_CLOSE: - ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); - break; - case REQ_OP_ZONE_FINISH: - ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); - break; - case REQ_OP_WRITE_ZEROES: - ret = nvme_setup_write_zeroes(ns, req, cmd); - break; - case REQ_OP_DISCARD: - ret = nvme_setup_discard(ns, req, cmd); - break; - case REQ_OP_READ: - ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); - break; - case REQ_OP_WRITE: - ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); - break; - case REQ_OP_ZONE_APPEND: - ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); - break; - default: - WARN_ON_ONCE(1); - return BLK_STS_IOERR; - } - - if (!(ctrl->quirks & NVME_QUIRK_SKIP_CID_GEN)) - nvme_req(req)->genctr++; - cmd->common.command_id = nvme_cid(req); - trace_nvme_setup_cmd(req, cmd); - return ret; -} -EXPORT_SYMBOL_GPL(nvme_setup_cmd); - -static void nvme_end_sync_rq(struct request *rq, blk_status_t error) -{ - struct completion *waiting = rq->end_io_data; - - rq->end_io_data = NULL; - complete(waiting); -} - -static void nvme_execute_rq_polled(struct request_queue *q, - struct gendisk *bd_disk, struct request *rq, int at_head) -{ - DECLARE_COMPLETION_ONSTACK(wait); - - WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)); - - rq->cmd_flags |= REQ_HIPRI; - rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq); - - while (!completion_done(&wait)) { - blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); - cond_resched(); - } -} - -/* - * Returns 0 on success. 
If the result is negative, it's a Linux error code; - * if the result is positive, it's an NVM Express status code - */ -int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags, bool poll) -{ - struct request *req; - int ret; - - if (qid == NVME_QID_ANY) - req = nvme_alloc_request(q, cmd, flags); - else - req = nvme_alloc_request_qid(q, cmd, flags, qid); - if (IS_ERR(req)) - return PTR_ERR(req); - - if (timeout) - req->timeout = timeout; - - if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); - if (ret) - goto out; - } - - if (poll) - nvme_execute_rq_polled(req->q, NULL, req, at_head); - else - blk_execute_rq(req->q, NULL, req, at_head); - if (result) - *result = nvme_req(req)->result; - if (nvme_req(req)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else - ret = nvme_req(req)->status; - out: - blk_mq_free_request(req); - return ret; -} -EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); - -int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen) -{ - return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, - NVME_QID_ANY, 0, 0, false); -} -EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); - -static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, - unsigned len, u32 seed, bool write) -{ - struct bio_integrity_payload *bip; - int ret = -ENOMEM; - void *buf; - - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - goto out; - - ret = -EFAULT; - if (write && copy_from_user(buf, ubuf, len)) - goto out_free_meta; - - bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); - if (IS_ERR(bip)) { - ret = PTR_ERR(bip); - goto out_free_meta; - } - - bip->bip_iter.bi_size = len; - bip->bip_iter.bi_sector = seed; - ret = bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)); - if (ret == len) - return buf; - ret = -ENOMEM; -out_free_meta: - kfree(buf); -out: - return ERR_PTR(ret); -} - -static u32 nvme_known_admin_effects(u8 opcode) -{ - switch (opcode) { - case nvme_admin_format_nvm: - return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC | - NVME_CMD_EFFECTS_CSE_MASK; - case nvme_admin_sanitize_nvm: - return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK; - default: - break; - } - return 0; -} - -u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) -{ - u32 effects = 0; - - if (ns) { - if (ns->head->effects) - effects = le32_to_cpu(ns->head->effects->iocs[opcode]); - if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) - dev_warn(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", - opcode, effects); - return 0; - } - - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->acs[opcode]); - effects |= nvme_known_admin_effects(opcode); - - return effects; -} -EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); - -static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 opcode) -{ - u32 effects = nvme_command_effects(ctrl, ns, opcode); - - /* - * For simplicity, IO to all namespaces is quiesced even if the command - * effects say only one namespace is affected. 
- */ - if (effects & NVME_CMD_EFFECTS_CSE_MASK) { - mutex_lock(&ctrl->scan_lock); - mutex_lock(&ctrl->subsys->lock); - nvme_mpath_start_freeze(ctrl->subsys); - nvme_mpath_wait_freeze(ctrl->subsys); - nvme_start_freeze(ctrl); - nvme_wait_freeze(ctrl); - } - return effects; -} - -static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) -{ - if (effects & NVME_CMD_EFFECTS_CSE_MASK) { - nvme_unfreeze(ctrl); - nvme_mpath_unfreeze(ctrl->subsys); - mutex_unlock(&ctrl->subsys->lock); - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); - mutex_unlock(&ctrl->scan_lock); - } - if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_identify(ctrl); - if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { - nvme_queue_scan(ctrl); - flush_work(&ctrl->scan_work); - } -} - -void nvme_execute_passthru_rq(struct request *rq) -{ - struct nvme_command *cmd = nvme_req(rq)->cmd; - struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; - struct nvme_ns *ns = rq->q->queuedata; - struct gendisk *disk = ns ? ns->disk : NULL; - u32 effects; - - effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - blk_execute_rq(rq->q, disk, rq, 0); - nvme_passthru_end(ctrl, effects); -} -EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); - -static int nvme_submit_user_cmd(struct request_queue *q, - struct nvme_command *cmd, void __user *ubuffer, - unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) -{ - bool write = nvme_is_write(cmd); - struct nvme_ns *ns = q->queuedata; - struct gendisk *disk = ns ? ns->disk : NULL; - struct request *req; - struct bio *bio = NULL; - void *meta = NULL; - int ret; - - req = nvme_alloc_request(q, cmd, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - if (timeout) - req->timeout = timeout; - nvme_req(req)->flags |= NVME_REQ_USERCMD; - - if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, - GFP_KERNEL); - if (ret) - goto out; - bio = req->bio; - bio->bi_disk = disk; - if (disk && meta_buffer && meta_len) { - meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, - meta_seed, write); - if (IS_ERR(meta)) { - ret = PTR_ERR(meta); - goto out_unmap; - } - req->cmd_flags |= REQ_INTEGRITY; - } - } - - nvme_execute_passthru_rq(req); - if (nvme_req(req)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else - ret = nvme_req(req)->status; - if (result) - *result = le64_to_cpu(nvme_req(req)->result.u64); - if (meta && !ret && !write) { - if (copy_to_user(meta_buffer, meta, meta_len)) - ret = -EFAULT; - } - kfree(meta); - out_unmap: - if (bio) - blk_rq_unmap_user(bio); - out: - blk_mq_free_request(req); - return ret; -} - -static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) -{ - struct nvme_ctrl *ctrl = rq->end_io_data; - unsigned long flags; - bool startka = false; - - blk_mq_free_request(rq); - - if (status) { - dev_err(ctrl->device, - "failed nvme_keep_alive_end_io error=%d\n", - status); - return; - } - - ctrl->comp_seen = false; - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->state == NVME_CTRL_LIVE || - ctrl->state == NVME_CTRL_CONNECTING) - startka = true; - spin_unlock_irqrestore(&ctrl->lock, flags); - if (startka) - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); -} - -static int nvme_keep_alive(struct nvme_ctrl *ctrl) -{ - struct request *rq; - - rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, - BLK_MQ_REQ_RESERVED); - if (IS_ERR(rq)) - return PTR_ERR(rq); - - rq->timeout = ctrl->kato * HZ; - rq->end_io_data = ctrl; - - 
blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); - - return 0; -} - -static void nvme_keep_alive_work(struct work_struct *work) -{ - struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), - struct nvme_ctrl, ka_work); - bool comp_seen = ctrl->comp_seen; - - if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) { - dev_dbg(ctrl->device, - "reschedule traffic based keep-alive timer\n"); - ctrl->comp_seen = false; - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); - return; - } - - if (nvme_keep_alive(ctrl)) { - /* allocation failure, reset the controller */ - dev_err(ctrl->device, "keep-alive failed\n"); - nvme_reset_ctrl(ctrl); - return; - } -} - -static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) -{ - if (unlikely(ctrl->kato == 0)) - return; - - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); -} - -void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) -{ - if (unlikely(ctrl->kato == 0)) - return; - - cancel_delayed_work_sync(&ctrl->ka_work); -} -EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); - -/* - * In NVMe 1.0 the CNS field was just a binary controller or namespace - * flag, thus sending any new CNS opcodes has a big chance of not working. - * Qemu unfortunately had that bug after reporting a 1.1 version compliance - * (but not for any later version). - */ -static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl) -{ - if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS) - return ctrl->vs < NVME_VS(1, 2, 0); - return ctrl->vs < NVME_VS(1, 1, 0); -} - -static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify; - c.identify.cns = NVME_ID_CNS_CTRL; - - *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ctrl)); - if (error) - kfree(*id); - return error; -} - -static bool nvme_multi_css(struct nvme_ctrl *ctrl) -{ - return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; -} - -static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, - struct nvme_ns_id_desc *cur, bool *csi_seen) -{ - const char *warn_str = "ctrl returned bogus length:"; - void *data = cur; - - switch (cur->nidt) { - case NVME_NIDT_EUI64: - if (cur->nidl != NVME_NIDT_EUI64_LEN) { - dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n", - warn_str, cur->nidl); - return -1; - } - if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) - return NVME_NIDT_EUI64_LEN; - memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); - return NVME_NIDT_EUI64_LEN; - case NVME_NIDT_NGUID: - if (cur->nidl != NVME_NIDT_NGUID_LEN) { - dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n", - warn_str, cur->nidl); - return -1; - } - if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) - return NVME_NIDT_NGUID_LEN; - memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); - return NVME_NIDT_NGUID_LEN; - case NVME_NIDT_UUID: - if (cur->nidl != NVME_NIDT_UUID_LEN) { - dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n", - warn_str, cur->nidl); - return -1; - } - if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) - return NVME_NIDT_UUID_LEN; - uuid_copy(&ids->uuid, data + sizeof(*cur)); - return NVME_NIDT_UUID_LEN; - case NVME_NIDT_CSI: - if (cur->nidl != NVME_NIDT_CSI_LEN) { - dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", - warn_str, cur->nidl); - return -1; - } - memcpy(&ids->csi, data + sizeof(*cur), 
NVME_NIDT_CSI_LEN); - *csi_seen = true; - return NVME_NIDT_CSI_LEN; - default: - /* Skip unknown types */ - return cur->nidl; - } -} - -static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, - struct nvme_ns_ids *ids) -{ - struct nvme_command c = { }; - bool csi_seen = false; - int status, pos, len; - void *data; - - if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl)) - return 0; - if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST) - return 0; - - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cpu_to_le32(nsid); - c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; - - data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); - if (!data) - return -ENOMEM; - - status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, - NVME_IDENTIFY_DATA_SIZE); - if (status) { - dev_warn(ctrl->device, - "Identify Descriptors failed (%d)\n", status); - goto free_data; - } - - for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { - struct nvme_ns_id_desc *cur = data + pos; - - if (cur->nidl == 0) - break; - - len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); - if (len < 0) - break; - - len += sizeof(*cur); - } - - if (nvme_multi_css(ctrl) && !csi_seen) { - dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", - nsid); - status = -EINVAL; - } - -free_data: - kfree(data); - return status; -} - -static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, - struct nvme_ns_ids *ids, struct nvme_id_ns **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cpu_to_le32(nsid); - c.identify.cns = NVME_ID_CNS_NS; - - *id = kmalloc(sizeof(**id), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); - if (error) { - dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); - goto out_free_id; - } - - error = NVME_SC_INVALID_NS | NVME_SC_DNR; - if ((*id)->ncap == 0) /* namespace not allocated or attached */ - goto out_free_id; - - - if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) { - dev_info(ctrl->device, - "Ignoring bogus Namespace Identifiers\n"); - } else { - if (ctrl->vs >= NVME_VS(1, 1, 0) && - !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); - if (ctrl->vs >= NVME_VS(1, 2, 0) && - !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); - } - - return 0; - -out_free_id: - kfree(*id); - return error; -} - -static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, - unsigned int dword11, void *buffer, size_t buflen, u32 *result) -{ - union nvme_result res = { 0 }; - struct nvme_command c; - int ret; - - memset(&c, 0, sizeof(c)); - c.features.opcode = op; - c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); - - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, - buffer, buflen, 0, NVME_QID_ANY, 0, 0, false); - if (ret >= 0 && result) - *result = le32_to_cpu(res.u32); - return ret; -} - -int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, - unsigned int dword11, void *buffer, size_t buflen, - u32 *result) -{ - return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer, - buflen, result); -} -EXPORT_SYMBOL_GPL(nvme_set_features); - -int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, - unsigned int dword11, void *buffer, size_t buflen, - u32 *result) -{ - return nvme_features(dev, 
nvme_admin_get_features, fid, dword11, buffer, - buflen, result); -} -EXPORT_SYMBOL_GPL(nvme_get_features); - -int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) -{ - u32 q_count = (*count - 1) | ((*count - 1) << 16); - u32 result; - int status, nr_io_queues; - - status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, - &result); - if (status < 0) - return status; - - /* - * Degraded controllers might return an error when setting the queue - * count. We still want to be able to bring them online and offer - * access to the admin queue, as that might be only way to fix them up. - */ - if (status > 0) { - dev_err(ctrl->device, "Could not set queue count (%d)\n", status); - *count = 0; - } else { - nr_io_queues = min(result & 0xffff, result >> 16) + 1; - *count = min(*count, nr_io_queues); - } - - return 0; -} -EXPORT_SYMBOL_GPL(nvme_set_queue_count); - -#define NVME_AEN_SUPPORTED \ - (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \ - NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE) - -static void nvme_enable_aen(struct nvme_ctrl *ctrl) -{ - u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED; - int status; - - if (!supported_aens) - return; - - status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens, - NULL, 0, &result); - if (status) - dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", - supported_aens); - - queue_work(nvme_wq, &ctrl->async_event_work); -} - -/* - * Convert integer values from ioctl structures to user pointers, silently - * ignoring the upper bits in the compat case to match behaviour of 32-bit - * kernels. - */ -static void __user *nvme_to_user_ptr(uintptr_t ptrval) -{ - if (in_compat_syscall()) - ptrval = (compat_uptr_t)ptrval; - return (void __user *)ptrval; -} - -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_user_io io; - struct nvme_command c; - unsigned length, meta_len; - void __user *metadata; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - if (io.flags) - return -EINVAL; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - break; - default: - return -EINVAL; - } - - length = (io.nblocks + 1) << ns->lba_shift; - - if ((io.control & NVME_RW_PRINFO_PRACT) && - ns->ms == sizeof(struct t10_pi_tuple)) { - /* - * Protection information is stripped/inserted by the - * controller. 
- */ - if (nvme_to_user_ptr(io.metadata)) - return -EINVAL; - meta_len = 0; - metadata = NULL; - } else { - meta_len = (io.nblocks + 1) * ns->ms; - metadata = nvme_to_user_ptr(io.metadata); - } - - if (ns->features & NVME_NS_EXT_LBAS) { - length += meta_len; - meta_len = 0; - } else if (meta_len) { - if ((io.metadata & 3) || !io.metadata) - return -EINVAL; - } - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->head->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - - return nvme_submit_user_cmd(ns->queue, &c, - nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); -} - -static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) -{ - struct nvme_passthru_cmd cmd; - struct nvme_command c; - unsigned timeout = 0; - u64 result; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - if (cmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); - - if (status >= 0) { - if (put_user(result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) -{ - struct nvme_passthru_cmd64 cmd; - struct nvme_command c; - unsigned timeout = 0; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - if (cmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); - - if (status >= 0) { - if (put_user(cmd.result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -/* - * Issue ioctl requests on the first available path. Note that unlike normal - * block layer requests we will not retry failed request on another controller. 
- */ -struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, - struct nvme_ns_head **head, int *srcu_idx) -{ -#ifdef CONFIG_NVME_MULTIPATH - if (disk->fops == &nvme_ns_head_ops) { - struct nvme_ns *ns; - - *head = disk->private_data; - *srcu_idx = srcu_read_lock(&(*head)->srcu); - ns = nvme_find_path(*head); - if (!ns) - srcu_read_unlock(&(*head)->srcu, *srcu_idx); - return ns; - } -#endif - *head = NULL; - *srcu_idx = -1; - return disk->private_data; -} - -void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) -{ - if (head) - srcu_read_unlock(&head->srcu, idx); -} - -static bool is_ctrl_ioctl(unsigned int cmd) -{ - if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) - return true; - if (is_sed_ioctl(cmd)) - return true; - return false; -} - -static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, - struct nvme_ns_head *head, - int srcu_idx) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - int ret; - - nvme_get_ctrl(ns->ctrl); - nvme_put_ns_from_disk(head, srcu_idx); - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - ret = nvme_user_cmd(ctrl, NULL, argp); - break; - case NVME_IOCTL_ADMIN64_CMD: - ret = nvme_user_cmd64(ctrl, NULL, argp); - break; - default: - ret = sed_ioctl(ctrl->opal_dev, cmd, argp); - break; - } - nvme_put_ctrl(ctrl); - return ret; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct nvme_ns_head *head = NULL; - void __user *argp = (void __user *)arg; - struct nvme_ns *ns; - int srcu_idx, ret; - - ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); - if (unlikely(!ns)) - return -EWOULDBLOCK; - - /* - * Handle ioctls that apply to the controller instead of the namespace - * seperately and drop the ns SRCU reference early. This avoids a - * deadlock when deleting namespaces using the passthrough interface. - */ - if (is_ctrl_ioctl(cmd)) - return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); - - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - ret = ns->head->ns_id; - break; - case NVME_IOCTL_IO_CMD: - ret = nvme_user_cmd(ns->ctrl, ns, argp); - break; - case NVME_IOCTL_SUBMIT_IO: - ret = nvme_submit_io(ns, argp); - break; - case NVME_IOCTL_IO64_CMD: - ret = nvme_user_cmd64(ns->ctrl, ns, argp); - break; - default: - if (ns->ndev) - ret = nvme_nvm_ioctl(ns, cmd, arg); - else - ret = -ENOTTY; - } - - nvme_put_ns_from_disk(head, srcu_idx); - return ret; -} - -#ifdef CONFIG_COMPAT -struct nvme_user_io32 { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nblocks; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 slba; - __u32 dsmgmt; - __u32 reftag; - __u16 apptag; - __u16 appmask; -} __attribute__((__packed__)); - -#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) - -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - /* - * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO - * between 32 bit programs and 64 bit kernel. - * The cause is that the results of sizeof(struct nvme_user_io), - * which is used to define NVME_IOCTL_SUBMIT_IO, - * are not same between 32 bit compiler and 64 bit compiler. - * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling - * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs. - * Other IOCTL numbers are same between 32 bit and 64 bit. - * So there is nothing to do regarding to other IOCTL numbers. 
- */ - if (cmd == NVME_IOCTL_SUBMIT_IO32) - return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); - - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif /* CONFIG_COMPAT */ - -static int nvme_open(struct block_device *bdev, fmode_t mode) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - -#ifdef CONFIG_NVME_MULTIPATH - /* should never be called due to GENHD_FL_HIDDEN */ - if (WARN_ON_ONCE(ns->head->disk)) - goto fail; -#endif - if (!kref_get_unless_zero(&ns->kref)) - goto fail; - if (!try_module_get(ns->ctrl->ops->module)) - goto fail_put_ns; - - return 0; - -fail_put_ns: - nvme_put_ns(ns); -fail: - return -ENXIO; -} - -static void nvme_release(struct gendisk *disk, fmode_t mode) -{ - struct nvme_ns *ns = disk->private_data; - - module_put(ns->ctrl->ops->module); - nvme_put_ns(ns); -} - -static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - /* some standard values */ - geo->heads = 1 << 6; - geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bdev->bd_disk) >> 11; - return 0; -} - -#ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, - u32 max_integrity_segments) -{ - struct blk_integrity integrity; - - memset(&integrity, 0, sizeof(integrity)); - switch (pi_type) { - case NVME_NS_DPS_PI_TYPE3: - integrity.profile = &t10_pi_type3_crc; - integrity.tag_size = sizeof(u16) + sizeof(u32); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - integrity.profile = &t10_pi_type1_crc; - integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; - break; - default: - integrity.profile = NULL; - break; - } - integrity.tuple_size = ms; - blk_integrity_register(disk, &integrity); - blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); -} -#else -static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, - u32 max_integrity_segments) -{ -} -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - -static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - struct request_queue *queue = disk->queue; - u32 size = queue_logical_block_size(queue); - - if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); - return; - } - - if (ctrl->nr_streams && ns->sws && ns->sgs) - size *= ns->sws * ns->sgs; - - BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < - NVME_DSM_MAX_RANGES); - - queue->limits.discard_alignment = 0; - queue->limits.discard_granularity = size; - - /* If discard is already enabled, don't reset queue limits */ - if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) - return; - - blk_queue_max_discard_sectors(queue, UINT_MAX); - blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); - - if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) - blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); -} - -/* - * Even though NVMe spec explicitly states that MDTS is not applicable to the - * write-zeroes, we are cautious and limit the size to the controllers - * max_hw_sectors value, which is based on the MDTS field and possibly other - * limiting factors. 
- */ -static void nvme_config_write_zeroes(struct request_queue *q, - struct nvme_ctrl *ctrl) -{ - if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) && - !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) - blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors); -} - -static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) -{ - return !uuid_is_null(&ids->uuid) || - memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || - memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); -} - -static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) -{ - return uuid_equal(&a->uuid, &b->uuid) && - memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && - memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && - a->csi == b->csi; -} - -static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u32 *phys_bs, u32 *io_opt) -{ - struct streams_directive_params s; - int ret; - - if (!ctrl->nr_streams) - return 0; - - ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); - if (ret) - return ret; - - ns->sws = le32_to_cpu(s.sws); - ns->sgs = le16_to_cpu(s.sgs); - - if (ns->sws) { - *phys_bs = ns->sws * (1 << ns->lba_shift); - if (ns->sgs) - *io_opt = *phys_bs * ns->sgs; - } - - return 0; -} - -static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - - /* - * The PI implementation requires the metadata size to be equal to the - * t10 pi tuple size. - */ - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); - if (ns->ms == sizeof(struct t10_pi_tuple)) - ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; - else - ns->pi_type = 0; - - ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); - if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - return 0; - if (ctrl->ops->flags & NVME_F_FABRICS) { - /* - * The NVMe over Fabrics specification only supports metadata as - * part of the extended data LBA. We rely on HCA/HBA support to - * remap the separate metadata buffer from the block layer. - */ - if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) - return -EINVAL; - if (ctrl->max_integrity_segments) - ns->features |= - (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); - } else { - /* - * For PCIe controllers, we can't easily remap the separate - * metadata buffer from the block layer and thus require a - * separate metadata buffer for block layer metadata/PI support. - * We allow extended LBAs for the passthrough interface, though. 
- */ - if (id->flbas & NVME_NS_FLBAS_META_EXT) - ns->features |= NVME_NS_EXT_LBAS; - else - ns->features |= NVME_NS_METADATA_SUPPORTED; - } - - return 0; -} - -static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, - struct request_queue *q) -{ - bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; - - if (ctrl->max_hw_sectors) { - u32 max_segments = - (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; - - max_segments = min_not_zero(max_segments, ctrl->max_segments); - blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); - blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); - } - blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); - blk_queue_dma_alignment(q, 3); - blk_queue_write_cache(q, vwc, vwc); -} - -static void nvme_update_disk_info(struct gendisk *disk, - struct nvme_ns *ns, struct nvme_id_ns *id) -{ - sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze)); - unsigned short bs = 1 << ns->lba_shift; - u32 atomic_bs, phys_bs, io_opt = 0; - - /* - * The block layer can't support LBA sizes larger than the page size - * yet, so catch this early and don't allow block I/O. - */ - if (ns->lba_shift > PAGE_SHIFT) { - capacity = 0; - bs = (1 << 9); - } - - blk_integrity_unregister(disk); - - atomic_bs = phys_bs = bs; - nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt); - if (id->nabo == 0) { - /* - * Bit 1 indicates whether NAWUPF is defined for this namespace - * and whether it should be used instead of AWUPF. If NAWUPF == - * 0 then AWUPF must be used instead. - */ - if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) - atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; - else - atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; - } - - if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { - /* NPWG = Namespace Preferred Write Granularity */ - phys_bs = bs * (1 + le16_to_cpu(id->npwg)); - /* NOWS = Namespace Optimal Write Size */ - io_opt = bs * (1 + le16_to_cpu(id->nows)); - } - - blk_queue_logical_block_size(disk->queue, bs); - /* - * Linux filesystems assume writing a single physical block is - * an atomic operation. Hence limit the physical block size to the - * value of the Atomic Write Unit Power Fail parameter. - */ - blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); - blk_queue_io_min(disk->queue, phys_bs); - blk_queue_io_opt(disk->queue, io_opt); - - /* - * Register a metadata profile for PI, or the plain non-integrity NVMe - * metadata masquerading as Type 0 if supported, otherwise reject block - * I/O to namespaces with metadata except when the namespace supports - * PI, as it can strip/insert in that case. 
- */ - if (ns->ms) { - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && - (ns->features & NVME_NS_METADATA_SUPPORTED)) - nvme_init_integrity(disk, ns->ms, ns->pi_type, - ns->ctrl->max_integrity_segments); - else if (!nvme_ns_has_pi(ns)) - capacity = 0; - } - - set_capacity_revalidate_and_notify(disk, capacity, false); - - nvme_config_discard(disk, ns); - nvme_config_write_zeroes(disk->queue, ns->ctrl); - - if (id->nsattr & NVME_NS_ATTR_RO) - set_disk_ro(disk, true); -} - -static inline bool nvme_first_scan(struct gendisk *disk) -{ - /* nvme_alloc_ns() scans the disk prior to adding it */ - return !(disk->flags & GENHD_FL_UP); -} - -static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - u32 iob; - - if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && - is_power_of_2(ctrl->max_hw_sectors)) - iob = ctrl->max_hw_sectors; - else - iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); - - if (!iob) - return; - - if (!is_power_of_2(iob)) { - if (nvme_first_scan(ns->disk)) - pr_warn("%s: ignoring unaligned IO boundary:%u\n", - ns->disk->disk_name, iob); - return; - } - - if (blk_queue_is_zoned(ns->disk->queue)) { - if (nvme_first_scan(ns->disk)) - pr_warn("%s: ignoring zoned namespace IO boundary\n", - ns->disk->disk_name); - return; - } - - blk_queue_chunk_sectors(ns->queue, iob); -} - -static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) -{ - unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; - int ret; - - blk_mq_freeze_queue(ns->disk->queue); - ns->lba_shift = id->lbaf[lbaf].ds; - nvme_set_queue_limits(ns->ctrl, ns->queue); - - if (ns->head->ids.csi == NVME_CSI_ZNS) { - ret = nvme_update_zone_info(ns, lbaf); - if (ret) - goto out_unfreeze; - } - - ret = nvme_configure_metadata(ns, id); - if (ret) - goto out_unfreeze; - nvme_set_chunk_sectors(ns, id); - nvme_update_disk_info(ns->disk, ns, id); - blk_mq_unfreeze_queue(ns->disk->queue); - - if (blk_queue_is_zoned(ns->queue)) { - ret = nvme_revalidate_zones(ns); - if (ret && !nvme_first_scan(ns->disk)) - return ret; - } - -#ifdef CONFIG_NVME_MULTIPATH - if (ns->head->disk) { - blk_mq_freeze_queue(ns->head->disk->queue); - nvme_update_disk_info(ns->head->disk, ns, id); - blk_stack_limits(&ns->head->disk->queue->limits, - &ns->queue->limits, 0); - blk_queue_update_readahead(ns->head->disk->queue); - nvme_update_bdev_size(ns->head->disk); - blk_mq_unfreeze_queue(ns->head->disk->queue); - } -#endif - return 0; - -out_unfreeze: - blk_mq_unfreeze_queue(ns->disk->queue); - return ret; -} - -static char nvme_pr_type(enum pr_type type) -{ - switch (type) { - case PR_WRITE_EXCLUSIVE: - return 1; - case PR_EXCLUSIVE_ACCESS: - return 2; - case PR_WRITE_EXCLUSIVE_REG_ONLY: - return 3; - case PR_EXCLUSIVE_ACCESS_REG_ONLY: - return 4; - case PR_WRITE_EXCLUSIVE_ALL_REGS: - return 5; - case PR_EXCLUSIVE_ACCESS_ALL_REGS: - return 6; - default: - return 0; - } -}; - -static int nvme_pr_command(struct block_device *bdev, u32 cdw10, - u64 key, u64 sa_key, u8 op) -{ - struct nvme_ns_head *head = NULL; - struct nvme_ns *ns; - struct nvme_command c; - int srcu_idx, ret; - u8 data[16] = { 0, }; - - ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); - if (unlikely(!ns)) - return -EWOULDBLOCK; - - put_unaligned_le64(key, &data[0]); - put_unaligned_le64(sa_key, &data[8]); - - memset(&c, 0, sizeof(c)); - c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->head->ns_id); - c.common.cdw10 = cpu_to_le32(cdw10); - - ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); - 
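/*
 * Editor's note: standalone illustration, not part of the deleted driver
 * file. The persistent-reservation helpers around this point (register,
 * reserve, preempt, clear, release) pack CDW10 as: action in bits 2:0, IEKEY
 * in bit 3 and the reservation type from nvme_pr_type() in bits 15:8. The
 * values below are hypothetical.
 */
#include <stdio.h>

int main(void)
{
	unsigned rtype = 1;	/* Write Exclusive, per nvme_pr_type() above */
	int ignore_key = 1;

	/* reserve (acquire, action 0): type in 15:8, IEKEY in bit 3 */
	unsigned cdw10_reserve = (rtype << 8) | (ignore_key ? 1u << 3 : 0);

	/* preempt and abort (acquire, action 2): type in 15:8, action in 2:0 */
	unsigned cdw10_preempt_abort = (rtype << 8) | 2;

	printf("reserve=0x%08x preempt+abort=0x%08x\n",
	       cdw10_reserve, cdw10_preempt_abort);
	return 0;
}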
nvme_put_ns_from_disk(head, srcu_idx); - return ret; -} - -static int nvme_pr_register(struct block_device *bdev, u64 old, - u64 new, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = old ? 2 : 0; - cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; - cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); -} - -static int nvme_pr_reserve(struct block_device *bdev, u64 key, - enum pr_type type, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = nvme_pr_type(type) << 8; - cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); -} - -static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, - enum pr_type type, bool abort) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); -} - -static int nvme_pr_clear(struct block_device *bdev, u64 key) -{ - u32 cdw10 = 1 | (key ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); -} - -static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); -} - -static const struct pr_ops nvme_pr_ops = { - .pr_register = nvme_pr_register, - .pr_reserve = nvme_pr_reserve, - .pr_release = nvme_pr_release, - .pr_preempt = nvme_pr_preempt, - .pr_clear = nvme_pr_clear, -}; - -#ifdef CONFIG_BLK_SED_OPAL -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, - bool send) -{ - struct nvme_ctrl *ctrl = data; - struct nvme_command cmd; - - memset(&cmd, 0, sizeof(cmd)); - if (send) - cmd.common.opcode = nvme_admin_security_send; - else - cmd.common.opcode = nvme_admin_security_recv; - cmd.common.nsid = 0; - cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); - cmd.common.cdw11 = cpu_to_le32(len); - - return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, - ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false); -} -EXPORT_SYMBOL_GPL(nvme_sec_submit); -#endif /* CONFIG_BLK_SED_OPAL */ - -static const struct block_device_operations nvme_fops = { - .owner = THIS_MODULE, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, - .open = nvme_open, - .release = nvme_release, - .getgeo = nvme_getgeo, - .report_zones = nvme_report_zones, - .pr_ops = &nvme_pr_ops, -}; - -#ifdef CONFIG_NVME_MULTIPATH -static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) -{ - struct nvme_ns_head *head = bdev->bd_disk->private_data; - - if (!kref_get_unless_zero(&head->ref)) - return -ENXIO; - return 0; -} - -static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) -{ - nvme_put_ns_head(disk->private_data); -} - -const struct block_device_operations nvme_ns_head_ops = { - .owner = THIS_MODULE, - .submit_bio = nvme_ns_head_submit_bio, - .open = nvme_ns_head_open, - .release = nvme_ns_head_release, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, - .getgeo = nvme_getgeo, - .report_zones = nvme_report_zones, - .pr_ops = &nvme_pr_ops, -}; -#endif /* CONFIG_NVME_MULTIPATH */ - -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) -{ - unsigned long timeout = - ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - u32 csts, bit = enabled ? 
NVME_CSTS_RDY : 0; - int ret; - - while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { - if (csts == ~0) - return -ENODEV; - if ((csts & NVME_CSTS_RDY) == bit) - break; - - usleep_range(1000, 2000); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(ctrl->device, - "Device not ready; aborting %s, CSTS=0x%x\n", - enabled ? "initialisation" : "reset", csts); - return -ENODEV; - } - } - - return ret; -} - -/* - * If the device has been passed off to us in an enabled state, just clear - * the enabled bit. The spec says we should set the 'shutdown notification - * bits', but doing so may cause the device to complete commands to the - * admin queue ... and we don't know what memory that might be pointing at! - */ -int nvme_disable_ctrl(struct nvme_ctrl *ctrl) -{ - int ret; - - ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; - ctrl->ctrl_config &= ~NVME_CC_ENABLE; - - ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); - if (ret) - return ret; - - if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) - msleep(NVME_QUIRK_DELAY_AMOUNT); - - return nvme_wait_ready(ctrl, ctrl->cap, false); -} -EXPORT_SYMBOL_GPL(nvme_disable_ctrl); - -int nvme_enable_ctrl(struct nvme_ctrl *ctrl) -{ - unsigned dev_page_min; - int ret; - - ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); - if (ret) { - dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); - return ret; - } - dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; - - if (NVME_CTRL_PAGE_SHIFT < dev_page_min) { - dev_err(ctrl->device, - "Minimum device page size %u too large for host (%u)\n", - 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT); - return -ENODEV; - } - - if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) - ctrl->ctrl_config = NVME_CC_CSS_CSI; - else - ctrl->ctrl_config = NVME_CC_CSS_NVM; - ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; - ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; - ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - ctrl->ctrl_config |= NVME_CC_ENABLE; - - ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); - if (ret) - return ret; - return nvme_wait_ready(ctrl, ctrl->cap, true); -} -EXPORT_SYMBOL_GPL(nvme_enable_ctrl); - -int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) -{ - unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); - u32 csts; - int ret; - - ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; - ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; - - ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); - if (ret) - return ret; - - while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { - if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) - break; - - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(ctrl->device, - "Device shutdown incomplete; abort shutdown\n"); - return -ENODEV; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); - -static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) -{ - __le64 ts; - int ret; - - if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) - return 0; - - ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); - ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), - NULL); - if (ret) - dev_warn_once(ctrl->device, - "could not set timestamp (%d)\n", ret); - return ret; -} - -static int nvme_configure_acre(struct nvme_ctrl *ctrl) -{ - struct nvme_feat_host_behavior *host; - int ret; - - /* Don't bother enabling the feature if retry delay 
is not reported */ - if (!ctrl->crdt[0]) - return 0; - - host = kzalloc(sizeof(*host), GFP_KERNEL); - if (!host) - return 0; - - host->acre = NVME_ENABLE_ACRE; - ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, - host, sizeof(*host), NULL); - kfree(host); - return ret; -} - -static int nvme_configure_apst(struct nvme_ctrl *ctrl) -{ - /* - * APST (Autonomous Power State Transition) lets us program a - * table of power state transitions that the controller will - * perform automatically. We configure it with a simple - * heuristic: we are willing to spend at most 2% of the time - * transitioning between power states. Therefore, when running - * in any given state, we will enter the next lower-power - * non-operational state after waiting 50 * (enlat + exlat) - * microseconds, as long as that state's exit latency is under - * the requested maximum latency. - * - * We will not autonomously enter any non-operational state for - * which the total latency exceeds ps_max_latency_us. Users - * can set ps_max_latency_us to zero to turn off APST. - */ - - unsigned apste; - struct nvme_feat_auto_pst *table; - u64 max_lat_us = 0; - int max_ps = -1; - int ret; - - /* - * If APST isn't supported or if we haven't been initialized yet, - * then don't do anything. - */ - if (!ctrl->apsta) - return 0; - - if (ctrl->npss > 31) { - dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); - return 0; - } - - table = kzalloc(sizeof(*table), GFP_KERNEL); - if (!table) - return 0; - - if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { - /* Turn off APST. */ - apste = 0; - dev_dbg(ctrl->device, "APST disabled\n"); - } else { - __le64 target = cpu_to_le64(0); - int state; - - /* - * Walk through all states from lowest- to highest-power. - * According to the spec, lower-numbered states use more - * power. NPSS, despite the name, is the index of the - * lowest-power state, not the number of states. - */ - for (state = (int)ctrl->npss; state >= 0; state--) { - u64 total_latency_us, exit_latency_us, transition_ms; - - if (target) - table->entries[state] = target; - - /* - * Don't allow transitions to the deepest state - * if it's quirked off. - */ - if (state == ctrl->npss && - (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) - continue; - - /* - * Is this state a useful non-operational state for - * higher-power states to autonomously transition to? - */ - if (!(ctrl->psd[state].flags & - NVME_PS_FLAGS_NON_OP_STATE)) - continue; - - exit_latency_us = - (u64)le32_to_cpu(ctrl->psd[state].exit_lat); - if (exit_latency_us > ctrl->ps_max_latency_us) - continue; - - total_latency_us = - exit_latency_us + - le32_to_cpu(ctrl->psd[state].entry_lat); - - /* - * This state is good. Use it as the APST idle - * target for higher power states. 
- */ - transition_ms = total_latency_us + 19; - do_div(transition_ms, 20); - if (transition_ms > (1 << 24) - 1) - transition_ms = (1 << 24) - 1; - - target = cpu_to_le64((state << 3) | - (transition_ms << 8)); - - if (max_ps == -1) - max_ps = state; - - if (total_latency_us > max_lat_us) - max_lat_us = total_latency_us; - } - - apste = 1; - - if (max_ps == -1) { - dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); - } else { - dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", - max_ps, max_lat_us, (int)sizeof(*table), table); - } - } - - ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, - table, sizeof(*table), NULL); - if (ret) - dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); - - kfree(table); - return ret; -} - -static void nvme_set_latency_tolerance(struct device *dev, s32 val) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - u64 latency; - - switch (val) { - case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT: - case PM_QOS_LATENCY_ANY: - latency = U64_MAX; - break; - - default: - latency = val; - } - - if (ctrl->ps_max_latency_us != latency) { - ctrl->ps_max_latency_us = latency; - if (ctrl->state == NVME_CTRL_LIVE) - nvme_configure_apst(ctrl); - } -} - -struct nvme_core_quirk_entry { - /* - * NVMe model and firmware strings are padded with spaces. For - * simplicity, strings in the quirk table are padded with NULLs - * instead. - */ - u16 vid; - const char *mn; - const char *fr; - unsigned long quirks; -}; - -static const struct nvme_core_quirk_entry core_quirks[] = { - { - /* - * This Toshiba device seems to die using any APST states. See: - * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11 - */ - .vid = 0x1179, - .mn = "THNSF5256GPUK TOSHIBA", - .quirks = NVME_QUIRK_NO_APST, - }, - { - /* - * This LiteON CL1-3D*-Q11 firmware version has a race - * condition associated with actions related to suspend to idle - * LiteON has resolved the problem in future firmware - */ - .vid = 0x14a4, - .fr = "22301111", - .quirks = NVME_QUIRK_SIMPLE_SUSPEND, - }, - { - /* - * This Kioxia CD6-V Series / HPE PE8030 device times out and - * aborts I/O during any load, but more easily reproducible - * with discards (fstrim). - * - * The device is left in a state where it is also not possible - * to use "nvme set-feature" to disable APST, but booting with - * nvme_core.default_ps_max_latency=0 works. - */ - .vid = 0x1e0f, - .mn = "KCD6XVUL6T40", - .quirks = NVME_QUIRK_NO_APST, - }, - { - /* - * The external Samsung X5 SSD fails initialization without a - * delay before checking if it is ready and has a whole set of - * other problems. To make this even more interesting, it - * shares the PCI ID with internal Samsung 970 Evo Plus that - * does not need or want these quirks. - */ - .vid = 0x144d, - .mn = "Samsung Portable SSD X5", - .quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | - NVME_QUIRK_NO_DEEPEST_PS | - NVME_QUIRK_IGNORE_DEV_SUBNQN, - } -}; - -/* match is null-terminated but idstr is space-padded. 
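/*
 * Editor's note: standalone illustration, not part of the deleted driver
 * file. It redoes the APST table-entry math from nvme_configure_apst() above:
 * idling for 50 * (enlat + exlat) microseconds keeps transition overhead
 * around 2%, and 50 * t microseconds equals t / 20 milliseconds, hence the
 * round-up divide by 20. The sample latencies are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long entry_lat_us = 1000;	/* ENLAT of the idle state */
	unsigned long long exit_lat_us = 5000;	/* EXLAT of the idle state */
	unsigned state = 4;			/* target power state index */

	unsigned long long total_us = entry_lat_us + exit_lat_us;
	unsigned long long transition_ms = (total_us + 19) / 20;	/* 300 ms */

	if (transition_ms > (1u << 24) - 1)	/* idle-time field is 24 bits */
		transition_ms = (1u << 24) - 1;

	/* entry layout used above: state in bits 7:3, idle time (ms) from bit 8 */
	unsigned long long entry =
		((unsigned long long)state << 3) | (transition_ms << 8);

	printf("transition_ms=%llu entry=0x%llx\n", transition_ms, entry);
	return 0;
}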
*/ -static bool string_matches(const char *idstr, const char *match, size_t len) -{ - size_t matchlen; - - if (!match) - return true; - - matchlen = strlen(match); - WARN_ON_ONCE(matchlen > len); - - if (memcmp(idstr, match, matchlen)) - return false; - - for (; matchlen < len; matchlen++) - if (idstr[matchlen] != ' ') - return false; - - return true; -} - -static bool quirk_matches(const struct nvme_id_ctrl *id, - const struct nvme_core_quirk_entry *q) -{ - return q->vid == le16_to_cpu(id->vid) && - string_matches(id->mn, q->mn, sizeof(id->mn)) && - string_matches(id->fr, q->fr, sizeof(id->fr)); -} - -static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, - struct nvme_id_ctrl *id) -{ - size_t nqnlen; - int off; - - if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) { - nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); - if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { - strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); - return; - } - - if (ctrl->vs >= NVME_VS(1, 2, 1)) - dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); - } - - /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ - off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, - "nqn.2014.08.org.nvmexpress:%04x%04x", - le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); - memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); - off += sizeof(id->sn); - memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); - off += sizeof(id->mn); - memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); -} - -static void nvme_release_subsystem(struct device *dev) -{ - struct nvme_subsystem *subsys = - container_of(dev, struct nvme_subsystem, dev); - - if (subsys->instance >= 0) - ida_simple_remove(&nvme_instance_ida, subsys->instance); - kfree(subsys); -} - -static void nvme_destroy_subsystem(struct kref *ref) -{ - struct nvme_subsystem *subsys = - container_of(ref, struct nvme_subsystem, ref); - - mutex_lock(&nvme_subsystems_lock); - list_del(&subsys->entry); - mutex_unlock(&nvme_subsystems_lock); - - ida_destroy(&subsys->ns_ida); - device_del(&subsys->dev); - put_device(&subsys->dev); -} - -static void nvme_put_subsystem(struct nvme_subsystem *subsys) -{ - kref_put(&subsys->ref, nvme_destroy_subsystem); -} - -static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) -{ - struct nvme_subsystem *subsys; - - lockdep_assert_held(&nvme_subsystems_lock); - - /* - * Fail matches for discovery subsystems. This results - * in each discovery controller bound to a unique subsystem. - * This avoids issues with validating controller values - * that can only be true when there is a single unique subsystem. - * There may be multiple and completely independent entities - * that provide discovery controllers. 
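/*
 * Editor's note: standalone illustration, not part of the deleted driver
 * file. It mimics the fallback NQN that nvme_init_subnqn() above builds when
 * a controller reports no usable SUBNQN: a fixed prefix, VID and SSVID in
 * hex, then the raw space-padded serial and model strings. The buffer size
 * assumes NVMF_NQN_SIZE is 223; all identify data here is hypothetical.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char subnqn[224] = { 0 };	/* NVMF_NQN_SIZE + 1 for a terminator */
	unsigned vid = 0x1234, ssvid = 0xabcd;
	char sn[20], mn[40];		/* sized like id->sn and id->mn */
	int off;

	memset(sn, ' ', sizeof(sn));
	memset(mn, ' ', sizeof(mn));
	memcpy(sn, "S0MENUMBER", 10);
	memcpy(mn, "Example NVMe Model", 18);

	off = snprintf(subnqn, sizeof(subnqn),
		       "nqn.2014.08.org.nvmexpress:%04x%04x", vid, ssvid);
	memcpy(subnqn + off, sn, sizeof(sn));
	off += sizeof(sn);
	memcpy(subnqn + off, mn, sizeof(mn));

	printf("%s\n", subnqn);
	return 0;
}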
- */ - if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) - return NULL; - - list_for_each_entry(subsys, &nvme_subsystems, entry) { - if (strcmp(subsys->subnqn, subsysnqn)) - continue; - if (!kref_get_unless_zero(&subsys->ref)) - continue; - return subsys; - } - - return NULL; -} - -#define SUBSYS_ATTR_RO(_name, _mode, _show) \ - struct device_attribute subsys_attr_##_name = \ - __ATTR(_name, _mode, _show, NULL) - -static ssize_t nvme_subsys_show_nqn(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_subsystem *subsys = - container_of(dev, struct nvme_subsystem, dev); - - return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); -} -static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); - -#define nvme_subsys_show_str_function(field) \ -static ssize_t subsys_##field##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct nvme_subsystem *subsys = \ - container_of(dev, struct nvme_subsystem, dev); \ - return sysfs_emit(buf, "%.*s\n", \ - (int)sizeof(subsys->field), subsys->field); \ -} \ -static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); - -nvme_subsys_show_str_function(model); -nvme_subsys_show_str_function(serial); -nvme_subsys_show_str_function(firmware_rev); - -static struct attribute *nvme_subsys_attrs[] = { - &subsys_attr_model.attr, - &subsys_attr_serial.attr, - &subsys_attr_firmware_rev.attr, - &subsys_attr_subsysnqn.attr, -#ifdef CONFIG_NVME_MULTIPATH - &subsys_attr_iopolicy.attr, -#endif - NULL, -}; - -static struct attribute_group nvme_subsys_attrs_group = { - .attrs = nvme_subsys_attrs, -}; - -static const struct attribute_group *nvme_subsys_attrs_groups[] = { - &nvme_subsys_attrs_group, - NULL, -}; - -static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl) -{ - return ctrl->opts && ctrl->opts->discovery_nqn; -} - -static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, - struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) -{ - struct nvme_ctrl *tmp; - - lockdep_assert_held(&nvme_subsystems_lock); - - list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { - if (nvme_state_terminal(tmp)) - continue; - - if (tmp->cntlid == ctrl->cntlid) { - dev_err(ctrl->device, - "Duplicate cntlid %u with %s, rejecting\n", - ctrl->cntlid, dev_name(tmp->device)); - return false; - } - - if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || - nvme_discovery_ctrl(ctrl)) - continue; - - dev_err(ctrl->device, - "Subsystem does not support multiple controllers\n"); - return false; - } - - return true; -} - -static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) -{ - struct nvme_subsystem *subsys, *found; - int ret; - - subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); - if (!subsys) - return -ENOMEM; - - subsys->instance = -1; - mutex_init(&subsys->lock); - kref_init(&subsys->ref); - INIT_LIST_HEAD(&subsys->ctrls); - INIT_LIST_HEAD(&subsys->nsheads); - nvme_init_subnqn(subsys, ctrl, id); - memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); - memcpy(subsys->model, id->mn, sizeof(subsys->model)); - memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); - subsys->vendor_id = le16_to_cpu(id->vid); - subsys->cmic = id->cmic; - subsys->awupf = le16_to_cpu(id->awupf); -#ifdef CONFIG_NVME_MULTIPATH - subsys->iopolicy = NVME_IOPOLICY_NUMA; -#endif - - subsys->dev.class = nvme_subsys_class; - subsys->dev.release = nvme_release_subsystem; - subsys->dev.groups = nvme_subsys_attrs_groups; - dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); - device_initialize(&subsys->dev); - - 
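/*
 * Editor's note: not part of the deleted driver file. This is roughly what
 * nvme_subsys_show_str_function(model) above expands to after preprocessing,
 * spelled out to make the sysfs attribute macros easier to follow.
 */
static ssize_t subsys_model_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%.*s\n",
			(int)sizeof(subsys->model), subsys->model);
}
static SUBSYS_ATTR_RO(model, S_IRUGO, subsys_model_show);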
mutex_lock(&nvme_subsystems_lock); - found = __nvme_find_get_subsystem(subsys->subnqn); - if (found) { - put_device(&subsys->dev); - subsys = found; - - if (!nvme_validate_cntlid(subsys, ctrl, id)) { - ret = -EINVAL; - goto out_put_subsystem; - } - } else { - ret = device_add(&subsys->dev); - if (ret) { - dev_err(ctrl->device, - "failed to register subsystem device.\n"); - put_device(&subsys->dev); - goto out_unlock; - } - ida_init(&subsys->ns_ida); - list_add_tail(&subsys->entry, &nvme_subsystems); - } - - ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, - dev_name(ctrl->device)); - if (ret) { - dev_err(ctrl->device, - "failed to create sysfs link from subsystem.\n"); - goto out_put_subsystem; - } - - if (!found) - subsys->instance = ctrl->instance; - ctrl->subsys = subsys; - list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); - mutex_unlock(&nvme_subsystems_lock); - return 0; - -out_put_subsystem: - nvme_put_subsystem(subsys); -out_unlock: - mutex_unlock(&nvme_subsystems_lock); - return ret; -} - -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, - void *log, size_t size, u64 offset) -{ - struct nvme_command c = { }; - u32 dwlen = nvme_bytes_to_numd(size); - - c.get_log_page.opcode = nvme_admin_get_log_page; - c.get_log_page.nsid = cpu_to_le32(nsid); - c.get_log_page.lid = log_page; - c.get_log_page.lsp = lsp; - c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); - c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); - c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); - c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); - c.get_log_page.csi = csi; - - return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); -} - -static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, - struct nvme_effects_log **log) -{ - struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi); - int ret; - - if (cel) - goto out; - - cel = kzalloc(sizeof(*cel), GFP_KERNEL); - if (!cel) - return -ENOMEM; - - ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi, - cel, sizeof(*cel), 0); - if (ret) { - kfree(cel); - return ret; - } - - xa_store(&ctrl->cels, csi, cel, GFP_KERNEL); -out: - *log = cel; - return 0; -} - -/* - * Initialize the cached copies of the Identify data and various controller - * register in our nvme_ctrl structure. This should be called as soon as - * the admin queue is fully up and running. - */ -int nvme_init_identify(struct nvme_ctrl *ctrl) -{ - struct nvme_id_ctrl *id; - int ret, page_shift; - u32 max_hw_sectors; - bool prev_apst_enabled; - - ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); - if (ret) { - dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); - return ret; - } - page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; - ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); - - if (ctrl->vs >= NVME_VS(1, 1, 0)) - ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); - - ret = nvme_identify_ctrl(ctrl, &id); - if (ret) { - dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); - return -EIO; - } - - if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { - ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); - if (ret < 0) - goto out_free; - } - - if (!(ctrl->ops->flags & NVME_F_FABRICS)) - ctrl->cntlid = le16_to_cpu(id->cntlid); - - if (!ctrl->identified) { - int i; - - ret = nvme_init_subsystem(ctrl, id); - if (ret) - goto out_free; - - /* - * Check for quirks. Quirk can depend on firmware version, - * so, in principle, the set of quirks present can change - * across a reset. 
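/*
 * Editor's note: standalone illustration, not part of the deleted driver
 * file. It shows the Get Log Page field packing done by nvme_get_log() above,
 * assuming nvme_bytes_to_numd() returns the zero-based dword count
 * (size / 4 - 1). Sample transfer size and offset are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long size = 1024 * 1024;	/* bytes to read from the log */
	unsigned long long offset = 4096;	/* byte offset into the log */

	unsigned numd = (unsigned)(size / 4) - 1;	/* zero-based dwords */
	unsigned numdl = numd & 0xffff;			/* low 16 bits  -> NUMDL */
	unsigned numdu = numd >> 16;			/* high 16 bits -> NUMDU */
	unsigned lpol = (unsigned)offset;		/* lower offset dword */
	unsigned lpou = (unsigned)(offset >> 32);	/* upper offset dword */

	printf("numdl=0x%x numdu=0x%x lpol=%u lpou=%u\n",
	       numdl, numdu, lpol, lpou);
	return 0;
}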
As a possible future enhancement, we - * could re-scan for quirks every time we reinitialize - * the device, but we'd have to make sure that the driver - * behaves intelligently if the quirks change. - */ - for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { - if (quirk_matches(id, &core_quirks[i])) - ctrl->quirks |= core_quirks[i].quirks; - } - } - - if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { - dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); - ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; - } - - ctrl->crdt[0] = le16_to_cpu(id->crdt1); - ctrl->crdt[1] = le16_to_cpu(id->crdt2); - ctrl->crdt[2] = le16_to_cpu(id->crdt3); - - ctrl->oacs = le16_to_cpu(id->oacs); - ctrl->oncs = le16_to_cpu(id->oncs); - ctrl->mtfa = le16_to_cpu(id->mtfa); - ctrl->oaes = le32_to_cpu(id->oaes); - ctrl->wctemp = le16_to_cpu(id->wctemp); - ctrl->cctemp = le16_to_cpu(id->cctemp); - - atomic_set(&ctrl->abort_limit, id->acl + 1); - ctrl->vwc = id->vwc; - if (id->mdts) - max_hw_sectors = 1 << (id->mdts + page_shift - 9); - else - max_hw_sectors = UINT_MAX; - ctrl->max_hw_sectors = - min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); - - nvme_set_queue_limits(ctrl, ctrl->admin_q); - ctrl->sgls = le32_to_cpu(id->sgls); - ctrl->kas = le16_to_cpu(id->kas); - ctrl->max_namespaces = le32_to_cpu(id->mnan); - ctrl->ctratt = le32_to_cpu(id->ctratt); - - if (id->rtd3e) { - /* us -> s */ - u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; - - ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, - shutdown_timeout, 60); - - if (ctrl->shutdown_timeout != shutdown_timeout) - dev_info(ctrl->device, - "Shutdown timeout set to %u seconds\n", - ctrl->shutdown_timeout); - } else - ctrl->shutdown_timeout = shutdown_timeout; - - ctrl->npss = id->npss; - ctrl->apsta = id->apsta; - prev_apst_enabled = ctrl->apst_enabled; - if (ctrl->quirks & NVME_QUIRK_NO_APST) { - if (force_apst && id->apsta) { - dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); - ctrl->apst_enabled = true; - } else { - ctrl->apst_enabled = false; - } - } else { - ctrl->apst_enabled = id->apsta; - } - memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); - - if (ctrl->ops->flags & NVME_F_FABRICS) { - ctrl->icdoff = le16_to_cpu(id->icdoff); - ctrl->ioccsz = le32_to_cpu(id->ioccsz); - ctrl->iorcsz = le32_to_cpu(id->iorcsz); - ctrl->maxcmd = le16_to_cpu(id->maxcmd); - - /* - * In fabrics we need to verify the cntlid matches the - * admin connect - */ - if (ctrl->cntlid != le16_to_cpu(id->cntlid)) { - dev_err(ctrl->device, - "Mismatching cntlid: Connect %u vs Identify " - "%u, rejecting\n", - ctrl->cntlid, le16_to_cpu(id->cntlid)); - ret = -EINVAL; - goto out_free; - } - - if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) { - dev_err(ctrl->device, - "keep-alive support is mandatory for fabrics\n"); - ret = -EINVAL; - goto out_free; - } - } else { - ctrl->hmpre = le32_to_cpu(id->hmpre); - ctrl->hmmin = le32_to_cpu(id->hmmin); - ctrl->hmminds = le32_to_cpu(id->hmminds); - ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); - } - - ret = nvme_mpath_init_identify(ctrl, id); - kfree(id); - - if (ret < 0) - return ret; - - if (ctrl->apst_enabled && !prev_apst_enabled) - dev_pm_qos_expose_latency_tolerance(ctrl->device); - else if (!ctrl->apst_enabled && prev_apst_enabled) - dev_pm_qos_hide_latency_tolerance(ctrl->device); - - ret = nvme_configure_apst(ctrl); - if (ret < 0) - return ret; - - ret = nvme_configure_timestamp(ctrl); - if (ret < 0) - return 
ret; - - ret = nvme_configure_directives(ctrl); - if (ret < 0) - return ret; - - ret = nvme_configure_acre(ctrl); - if (ret < 0) - return ret; - - if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { - ret = nvme_hwmon_init(ctrl); - if (ret < 0) - return ret; - } - - ctrl->identified = true; - - return 0; - -out_free: - kfree(id); - return ret; -} -EXPORT_SYMBOL_GPL(nvme_init_identify); - -static int nvme_dev_open(struct inode *inode, struct file *file) -{ - struct nvme_ctrl *ctrl = - container_of(inode->i_cdev, struct nvme_ctrl, cdev); - - switch (ctrl->state) { - case NVME_CTRL_LIVE: - break; - default: - return -EWOULDBLOCK; - } - - nvme_get_ctrl(ctrl); - if (!try_module_get(ctrl->ops->module)) { - nvme_put_ctrl(ctrl); - return -EINVAL; - } - - file->private_data = ctrl; - return 0; -} - -static int nvme_dev_release(struct inode *inode, struct file *file) -{ - struct nvme_ctrl *ctrl = - container_of(inode->i_cdev, struct nvme_ctrl, cdev); - - module_put(ctrl->ops->module); - nvme_put_ctrl(ctrl); - return 0; -} - -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) -{ - struct nvme_ns *ns; - int ret; - - down_read(&ctrl->namespaces_rwsem); - if (list_empty(&ctrl->namespaces)) { - ret = -ENOTTY; - goto out_unlock; - } - - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); - if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { - dev_warn(ctrl->device, - "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); - ret = -EINVAL; - goto out_unlock; - } - - dev_warn(ctrl->device, - "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); - kref_get(&ns->kref); - up_read(&ctrl->namespaces_rwsem); - - ret = nvme_user_cmd(ctrl, ns, argp); - nvme_put_ns(ns); - return ret; - -out_unlock: - up_read(&ctrl->namespaces_rwsem); - return ret; -} - -static long nvme_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ctrl *ctrl = file->private_data; - void __user *argp = (void __user *)arg; - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); - case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); - case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp); - case NVME_IOCTL_RESET: - dev_warn(ctrl->device, "resetting controller\n"); - return nvme_reset_ctrl_sync(ctrl); - case NVME_IOCTL_SUBSYS_RESET: - return nvme_reset_subsystem(ctrl); - case NVME_IOCTL_RESCAN: - nvme_queue_scan(ctrl); - return 0; - default: - return -ENOTTY; - } -} - -static const struct file_operations nvme_dev_fops = { - .owner = THIS_MODULE, - .open = nvme_dev_open, - .release = nvme_dev_release, - .unlocked_ioctl = nvme_dev_ioctl, - .compat_ioctl = compat_ptr_ioctl, -}; - -static ssize_t nvme_sysfs_reset(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - int ret; - - ret = nvme_reset_ctrl_sync(ctrl); - if (ret < 0) - return ret; - return count; -} -static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); - -static ssize_t nvme_sysfs_rescan(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - nvme_queue_scan(ctrl); - return count; -} -static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); - -static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (disk->fops == &nvme_fops) - return 
nvme_get_ns_from_dev(dev)->head; - else - return disk->private_data; -} - -static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct nvme_ns_head *head = dev_to_ns_head(dev); - struct nvme_ns_ids *ids = &head->ids; - struct nvme_subsystem *subsys = head->subsys; - int serial_len = sizeof(subsys->serial); - int model_len = sizeof(subsys->model); - - if (!uuid_is_null(&ids->uuid)) - return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid); - - if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - return sysfs_emit(buf, "eui.%16phN\n", ids->nguid); - - if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - return sysfs_emit(buf, "eui.%8phN\n", ids->eui64); - - while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || - subsys->serial[serial_len - 1] == '\0')) - serial_len--; - while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || - subsys->model[model_len - 1] == '\0')) - model_len--; - - return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, - serial_len, subsys->serial, model_len, subsys->model, - head->ns_id); -} -static DEVICE_ATTR_RO(wwid); - -static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); -} -static DEVICE_ATTR_RO(nguid); - -static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; - - /* For backward compatibility expose the NGUID to userspace if - * we have no UUID set - */ - if (uuid_is_null(&ids->uuid)) { - dev_warn_ratelimited(dev, - "No UUID available providing old NGUID\n"); - return sysfs_emit(buf, "%pU\n", ids->nguid); - } - return sysfs_emit(buf, "%pU\n", &ids->uuid); -} -static DEVICE_ATTR_RO(uuid); - -static ssize_t eui_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); -} -static DEVICE_ATTR_RO(eui); - -static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id); -} -static DEVICE_ATTR_RO(nsid); - -static struct attribute *nvme_ns_id_attrs[] = { - &dev_attr_wwid.attr, - &dev_attr_uuid.attr, - &dev_attr_nguid.attr, - &dev_attr_eui.attr, - &dev_attr_nsid.attr, -#ifdef CONFIG_NVME_MULTIPATH - &dev_attr_ana_grpid.attr, - &dev_attr_ana_state.attr, -#endif - NULL, -}; - -static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, - struct attribute *a, int n) -{ - struct device *dev = container_of(kobj, struct device, kobj); - struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; - - if (a == &dev_attr_uuid.attr) { - if (uuid_is_null(&ids->uuid) && - !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - return 0; - } - if (a == &dev_attr_nguid.attr) { - if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - return 0; - } - if (a == &dev_attr_eui.attr) { - if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - return 0; - } -#ifdef CONFIG_NVME_MULTIPATH - if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { - if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */ - return 0; - if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) - return 0; - } -#endif - return a->mode; -} - -static const struct attribute_group nvme_ns_id_attr_group = { - .attrs = nvme_ns_id_attrs, - .is_visible = nvme_ns_id_attrs_are_visible, -}; - -const struct attribute_group *nvme_ns_id_attr_groups[] = { - &nvme_ns_id_attr_group, -#ifdef 
CONFIG_NVM - &nvme_nvm_attr_group, -#endif - NULL, -}; - -#define nvme_show_str_function(field) \ -static ssize_t field##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sysfs_emit(buf, "%.*s\n", \ - (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ -} \ -static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); - -nvme_show_str_function(model); -nvme_show_str_function(serial); -nvme_show_str_function(firmware_rev); - -#define nvme_show_int_function(field) \ -static ssize_t field##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sysfs_emit(buf, "%d\n", ctrl->field); \ -} \ -static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); - -nvme_show_int_function(cntlid); -nvme_show_int_function(numa_node); -nvme_show_int_function(queue_count); -nvme_show_int_function(sqsize); - -static ssize_t nvme_sysfs_delete(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (device_remove_file_self(dev, attr)) - nvme_delete_ctrl_sync(ctrl); - return count; -} -static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); - -static ssize_t nvme_sysfs_show_transport(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name); -} -static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); - -static ssize_t nvme_sysfs_show_state(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - static const char *const state_name[] = { - [NVME_CTRL_NEW] = "new", - [NVME_CTRL_LIVE] = "live", - [NVME_CTRL_RESETTING] = "resetting", - [NVME_CTRL_CONNECTING] = "connecting", - [NVME_CTRL_DELETING] = "deleting", - [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", - [NVME_CTRL_DEAD] = "dead", - }; - - if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && - state_name[ctrl->state]) - return sysfs_emit(buf, "%s\n", state_name[ctrl->state]); - - return sysfs_emit(buf, "unknown state\n"); -} - -static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); - -static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); -} -static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); - -static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn); -} -static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); - -static ssize_t nvme_sysfs_show_hostid(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id); -} -static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL); - -static ssize_t nvme_sysfs_show_address(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); -} -static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); - -static ssize_t 
nvme_ctrl_loss_tmo_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - struct nvmf_ctrl_options *opts = ctrl->opts; - - if (ctrl->opts->max_reconnects == -1) - return sysfs_emit(buf, "off\n"); - return sysfs_emit(buf, "%d\n", - opts->max_reconnects * opts->reconnect_delay); -} - -static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - struct nvmf_ctrl_options *opts = ctrl->opts; - int ctrl_loss_tmo, err; - - err = kstrtoint(buf, 10, &ctrl_loss_tmo); - if (err) - return -EINVAL; - - else if (ctrl_loss_tmo < 0) - opts->max_reconnects = -1; - else - opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, - opts->reconnect_delay); - return count; -} -static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, - nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); - -static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (ctrl->opts->reconnect_delay == -1) - return sysfs_emit(buf, "off\n"); - return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay); -} - -static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - unsigned int v; - int err; - - err = kstrtou32(buf, 10, &v); - if (err) - return err; - - ctrl->opts->reconnect_delay = v; - return count; -} -static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, - nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); - -static struct attribute *nvme_dev_attrs[] = { - &dev_attr_reset_controller.attr, - &dev_attr_rescan_controller.attr, - &dev_attr_model.attr, - &dev_attr_serial.attr, - &dev_attr_firmware_rev.attr, - &dev_attr_cntlid.attr, - &dev_attr_delete_controller.attr, - &dev_attr_transport.attr, - &dev_attr_subsysnqn.attr, - &dev_attr_address.attr, - &dev_attr_state.attr, - &dev_attr_numa_node.attr, - &dev_attr_queue_count.attr, - &dev_attr_sqsize.attr, - &dev_attr_hostnqn.attr, - &dev_attr_hostid.attr, - &dev_attr_ctrl_loss_tmo.attr, - &dev_attr_reconnect_delay.attr, - NULL -}; - -static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, - struct attribute *a, int n) -{ - struct device *dev = container_of(kobj, struct device, kobj); - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) - return 0; - if (a == &dev_attr_address.attr && !ctrl->ops->get_address) - return 0; - if (a == &dev_attr_hostnqn.attr && !ctrl->opts) - return 0; - if (a == &dev_attr_hostid.attr && !ctrl->opts) - return 0; - if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) - return 0; - if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) - return 0; - - return a->mode; -} - -static struct attribute_group nvme_dev_attrs_group = { - .attrs = nvme_dev_attrs, - .is_visible = nvme_dev_attrs_are_visible, -}; - -static const struct attribute_group *nvme_dev_attr_groups[] = { - &nvme_dev_attrs_group, - NULL, -}; - -static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, - unsigned nsid) -{ - struct nvme_ns_head *h; - - lockdep_assert_held(&subsys->lock); - - list_for_each_entry(h, &subsys->nsheads, entry) { - if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) - return h; - } - - return NULL; -} - -static int nvme_subsys_check_duplicate_ids(struct 
nvme_subsystem *subsys, - struct nvme_ns_ids *ids) -{ - struct nvme_ns_head *h; - - lockdep_assert_held(&subsys->lock); - - list_for_each_entry(h, &subsys->nsheads, entry) { - if (nvme_ns_ids_valid(ids) && nvme_ns_ids_equal(ids, &h->ids)) - return -EINVAL; - } - - return 0; -} - -static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, - unsigned nsid, struct nvme_ns_ids *ids) -{ - struct nvme_ns_head *head; - size_t size = sizeof(*head); - int ret = -ENOMEM; - -#ifdef CONFIG_NVME_MULTIPATH - size += num_possible_nodes() * sizeof(struct nvme_ns *); -#endif - - head = kzalloc(size, GFP_KERNEL); - if (!head) - goto out; - ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); - if (ret < 0) - goto out_free_head; - head->instance = ret; - INIT_LIST_HEAD(&head->list); - ret = init_srcu_struct(&head->srcu); - if (ret) - goto out_ida_remove; - head->subsys = ctrl->subsys; - head->ns_id = nsid; - head->ids = *ids; - kref_init(&head->ref); - - ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids); - if (ret) { - dev_err(ctrl->device, - "duplicate IDs for nsid %d\n", nsid); - goto out_cleanup_srcu; - } - - if (head->ids.csi) { - ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); - if (ret) - goto out_cleanup_srcu; - } else - head->effects = ctrl->effects; - - ret = nvme_mpath_alloc_disk(ctrl, head); - if (ret) - goto out_cleanup_srcu; - - list_add_tail(&head->entry, &ctrl->subsys->nsheads); - - kref_get(&ctrl->subsys->ref); - - return head; -out_cleanup_srcu: - cleanup_srcu_struct(&head->srcu); -out_ida_remove: - ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); -out_free_head: - kfree(head); -out: - if (ret > 0) - ret = blk_status_to_errno(nvme_error_status(ret)); - return ERR_PTR(ret); -} - -static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, - struct nvme_ns_ids *ids, bool is_shared) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_ns_head *head = NULL; - int ret = 0; - - mutex_lock(&ctrl->subsys->lock); - head = nvme_find_ns_head(ctrl->subsys, nsid); - if (!head) { - head = nvme_alloc_ns_head(ctrl, nsid, ids); - if (IS_ERR(head)) { - ret = PTR_ERR(head); - goto out_unlock; - } - head->shared = is_shared; - } else { - ret = -EINVAL; - if (!is_shared || !head->shared) { - dev_err(ctrl->device, - "Duplicate unshared namespace %d\n", nsid); - goto out_put_ns_head; - } - if (!nvme_ns_ids_equal(&head->ids, ids)) { - dev_err(ctrl->device, - "IDs don't match for shared namespace %d\n", - nsid); - goto out_put_ns_head; - } - } - - list_add_tail(&ns->siblings, &head->list); - ns->head = head; - mutex_unlock(&ctrl->subsys->lock); - return 0; - -out_put_ns_head: - nvme_put_ns_head(head); -out_unlock: - mutex_unlock(&ctrl->subsys->lock); - return ret; -} - -struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) -{ - struct nvme_ns *ns, *ret = NULL; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->head->ns_id == nsid) { - if (!kref_get_unless_zero(&ns->kref)) - continue; - ret = ns; - break; - } - if (ns->head->ns_id > nsid) - break; - } - up_read(&ctrl->namespaces_rwsem); - return ret; -} -EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); - -/* - * Add the namespace to the controller list while keeping the list ordered. 
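/*
 * Editor's note: standalone illustration, not part of the deleted driver
 * file. It models the insertion rule used by nvme_ns_add_to_ctrl_list(),
 * defined just below: walk the ascending list from the tail and insert after
 * the first entry with a smaller NSID. A plain array stands in for the
 * list_head here; the NSIDs are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned nsids[8] = { 1, 3, 7 };
	unsigned count = 3, new_nsid = 5, pos = 0, i;

	for (i = count; i > 0; i--) {		/* reverse walk */
		if (nsids[i - 1] < new_nsid) {
			pos = i;
			break;
		}
	}
	for (i = count; i > pos; i--)		/* make room and insert */
		nsids[i] = nsids[i - 1];
	nsids[pos] = new_nsid;
	count++;

	for (i = 0; i < count; i++)
		printf("%u ", nsids[i]);	/* prints: 1 3 5 7 */
	printf("\n");
	return 0;
}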
- */ -static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) -{ - struct nvme_ns *tmp; - - list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) { - if (tmp->head->ns_id < ns->head->ns_id) { - list_add(&ns->list, &tmp->list); - return; - } - } - list_add(&ns->list, &ns->ctrl->namespaces); -} - -static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, - struct nvme_ns_ids *ids) -{ - struct nvme_ns *ns; - struct gendisk *disk; - struct nvme_id_ns *id; - char disk_name[DISK_NAME_LEN]; - int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret; - - if (nvme_identify_ns(ctrl, nsid, ids, &id)) - return; - - ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); - if (!ns) - goto out_free_id; - - ns->queue = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ns->queue)) - goto out_free_ns; - - if (ctrl->opts && ctrl->opts->data_digest) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); - - blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); - if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) - blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); - - ns->queue->queuedata = ns; - ns->ctrl = ctrl; - kref_init(&ns->kref); - - ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED); - if (ret) - goto out_free_queue; - nvme_set_disk_name(disk_name, ns, ctrl, &flags); - - disk = alloc_disk_node(0, node); - if (!disk) - goto out_unlink_ns; - - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->flags = flags; - memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); - ns->disk = disk; - - if (nvme_update_ns_info(ns, id)) - goto out_put_disk; - - if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - ret = nvme_nvm_register(ns, disk_name, node); - if (ret) { - dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_put_disk; - } - } - - down_write(&ctrl->namespaces_rwsem); - nvme_ns_add_to_ctrl_list(ns); - up_write(&ctrl->namespaces_rwsem); - nvme_get_ctrl(ctrl); - - device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); - - nvme_mpath_add_disk(ns, id); - nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); - kfree(id); - - return; - out_put_disk: - /* prevent double queue cleanup */ - ns->disk->queue = NULL; - put_disk(ns->disk); - out_unlink_ns: - mutex_lock(&ctrl->subsys->lock); - list_del_rcu(&ns->siblings); - if (list_empty(&ns->head->list)) - list_del_init(&ns->head->entry); - mutex_unlock(&ctrl->subsys->lock); - nvme_put_ns_head(ns->head); - out_free_queue: - blk_cleanup_queue(ns->queue); - out_free_ns: - kfree(ns); - out_free_id: - kfree(id); -} - -static void nvme_ns_remove(struct nvme_ns *ns) -{ - if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) - return; - - set_capacity(ns->disk, 0); - nvme_fault_inject_fini(&ns->fault_inject); - - mutex_lock(&ns->ctrl->subsys->lock); - list_del_rcu(&ns->siblings); - if (list_empty(&ns->head->list)) - list_del_init(&ns->head->entry); - mutex_unlock(&ns->ctrl->subsys->lock); - - synchronize_rcu(); /* guarantee not available in head->list */ - nvme_mpath_clear_current_path(ns); - synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ - - if (ns->disk->flags & GENHD_FL_UP) { - del_gendisk(ns->disk); - blk_cleanup_queue(ns->queue); - if (blk_get_integrity(ns->disk)) - blk_integrity_unregister(ns->disk); - } - - down_write(&ns->ctrl->namespaces_rwsem); - list_del_init(&ns->list); - up_write(&ns->ctrl->namespaces_rwsem); - - nvme_mpath_check_last_path(ns); - nvme_put_ns(ns); -} - -static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid) -{ - 
struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid); - - if (ns) { - nvme_ns_remove(ns); - nvme_put_ns(ns); - } -} - -static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) -{ - struct nvme_id_ns *id; - int ret = NVME_SC_INVALID_NS | NVME_SC_DNR; - - if (test_bit(NVME_NS_DEAD, &ns->flags)) - goto out; - - ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id); - if (ret) - goto out; - - ret = NVME_SC_INVALID_NS | NVME_SC_DNR; - if (!nvme_ns_ids_equal(&ns->head->ids, ids)) { - dev_err(ns->ctrl->device, - "identifiers changed for nsid %d\n", ns->head->ns_id); - goto out_free_id; - } - - ret = nvme_update_ns_info(ns, id); - -out_free_id: - kfree(id); -out: - /* - * Only remove the namespace if we got a fatal error back from the - * device, otherwise ignore the error and just move on. - * - * TODO: we should probably schedule a delayed retry here. - */ - if (ret > 0 && (ret & NVME_SC_DNR)) - nvme_ns_remove(ns); - else - revalidate_disk_size(ns->disk, true); -} - -static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) -{ - struct nvme_ns_ids ids = { }; - struct nvme_ns *ns; - - if (nvme_identify_ns_descs(ctrl, nsid, &ids)) - return; - - ns = nvme_find_get_ns(ctrl, nsid); - if (ns) { - nvme_validate_ns(ns, &ids); - nvme_put_ns(ns); - return; - } - - switch (ids.csi) { - case NVME_CSI_NVM: - nvme_alloc_ns(ctrl, nsid, &ids); - break; - case NVME_CSI_ZNS: - if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { - dev_warn(ctrl->device, - "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", - nsid); - break; - } - if (!nvme_multi_css(ctrl)) { - dev_warn(ctrl->device, - "command set not reported for nsid: %d\n", - nsid); - break; - } - nvme_alloc_ns(ctrl, nsid, &ids); - break; - default: - dev_warn(ctrl->device, "unknown csi %u for nsid %u\n", - ids.csi, nsid); - break; - } -} - -static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, - unsigned nsid) -{ - struct nvme_ns *ns, *next; - LIST_HEAD(rm_list); - - down_write(&ctrl->namespaces_rwsem); - list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { - if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags)) - list_move_tail(&ns->list, &rm_list); - } - up_write(&ctrl->namespaces_rwsem); - - list_for_each_entry_safe(ns, next, &rm_list, list) - nvme_ns_remove(ns); - -} - -static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) -{ - const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32); - __le32 *ns_list; - u32 prev = 0; - int ret = 0, i; - - if (nvme_ctrl_limited_cns(ctrl)) - return -EOPNOTSUPP; - - ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); - if (!ns_list) - return -ENOMEM; - - for (;;) { - struct nvme_command cmd = { - .identify.opcode = nvme_admin_identify, - .identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST, - .identify.nsid = cpu_to_le32(prev), - }; - - ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list, - NVME_IDENTIFY_DATA_SIZE); - if (ret) - goto free; - - for (i = 0; i < nr_entries; i++) { - u32 nsid = le32_to_cpu(ns_list[i]); - - if (!nsid) /* end of the list? 
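/*
 * Editor's note: standalone illustration, not part of the deleted driver
 * file. It spells out the paging arithmetic of nvme_scan_ns_list() above:
 * each NVME_ID_CNS_NS_ACTIVE_LIST buffer carries 4096 / 4 = 1024 NSIDs, and
 * the last NSID seen becomes the base for the next Identify. The namespace
 * count is made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned identify_data_size = 4096;	/* NVME_IDENTIFY_DATA_SIZE */
	unsigned nr_entries = identify_data_size / 4;	/* 1024 NSIDs per page */
	unsigned active_namespaces = 2500;

	/* ceil(2500 / 1024) = 3 Identify round trips to list them all */
	unsigned pages = (active_namespaces + nr_entries - 1) / nr_entries;

	printf("%u entries per page, %u page(s) needed\n", nr_entries, pages);
	return 0;
}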
*/ - goto out; - nvme_validate_or_alloc_ns(ctrl, nsid); - while (++prev < nsid) - nvme_ns_remove_by_nsid(ctrl, prev); - } - } - out: - nvme_remove_invalid_namespaces(ctrl, prev); - free: - kfree(ns_list); - return ret; -} - -static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl) -{ - struct nvme_id_ctrl *id; - u32 nn, i; - - if (nvme_identify_ctrl(ctrl, &id)) - return; - nn = le32_to_cpu(id->nn); - kfree(id); - - for (i = 1; i <= nn; i++) - nvme_validate_or_alloc_ns(ctrl, i); - - nvme_remove_invalid_namespaces(ctrl, nn); -} - -static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) -{ - size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32); - __le32 *log; - int error; - - log = kzalloc(log_size, GFP_KERNEL); - if (!log) - return; - - /* - * We need to read the log to clear the AEN, but we don't want to rely - * on it for the changed namespace information as userspace could have - * raced with us in reading the log page, which could cause us to miss - * updates. - */ - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, - NVME_CSI_NVM, log, log_size, 0); - if (error) - dev_warn(ctrl->device, - "reading changed ns log failed: %d\n", error); - - kfree(log); -} - -static void nvme_scan_work(struct work_struct *work) -{ - struct nvme_ctrl *ctrl = - container_of(work, struct nvme_ctrl, scan_work); - - /* No tagset on a live ctrl means IO queues could not created */ - if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) - return; - - if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { - dev_info(ctrl->device, "rescanning namespaces.\n"); - nvme_clear_changed_ns_log(ctrl); - } - - mutex_lock(&ctrl->scan_lock); - if (nvme_scan_ns_list(ctrl) != 0) - nvme_scan_ns_sequential(ctrl); - mutex_unlock(&ctrl->scan_lock); -} - -/* - * This function iterates the namespace list unlocked to allow recovery from - * controller failure. It is up to the caller to ensure the namespace list is - * not modified by scan work while this function is executing. - */ -void nvme_remove_namespaces(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns, *next; - LIST_HEAD(ns_list); - - /* - * make sure to requeue I/O to all namespaces as these - * might result from the scan itself and must complete - * for the scan_work to make progress - */ - nvme_mpath_clear_ctrl_paths(ctrl); - - /* prevent racing with ns scanning */ - flush_work(&ctrl->scan_work); - - /* - * The dead states indicates the controller was not gracefully - * disconnected. In that case, we won't be able to flush any data while - * removing the namespaces' disks; fail all the queues now to avoid - * potentially having to clean up the failed sync later. 
- */ - if (ctrl->state == NVME_CTRL_DEAD) - nvme_kill_queues(ctrl); - - /* this is a no-op when called from the controller reset handler */ - nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); - - down_write(&ctrl->namespaces_rwsem); - list_splice_init(&ctrl->namespaces, &ns_list); - up_write(&ctrl->namespaces_rwsem); - - list_for_each_entry_safe(ns, next, &ns_list, list) - nvme_ns_remove(ns); -} -EXPORT_SYMBOL_GPL(nvme_remove_namespaces); - -static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) -{ - struct nvme_ctrl *ctrl = - container_of(dev, struct nvme_ctrl, ctrl_device); - struct nvmf_ctrl_options *opts = ctrl->opts; - int ret; - - ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name); - if (ret) - return ret; - - if (opts) { - ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr); - if (ret) - return ret; - - ret = add_uevent_var(env, "NVME_TRSVCID=%s", - opts->trsvcid ?: "none"); - if (ret) - return ret; - - ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", - opts->host_traddr ?: "none"); - } - return ret; -} - -static void nvme_aen_uevent(struct nvme_ctrl *ctrl) -{ - char *envp[2] = { NULL, NULL }; - u32 aen_result = ctrl->aen_result; - - ctrl->aen_result = 0; - if (!aen_result) - return; - - envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); - if (!envp[0]) - return; - kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); - kfree(envp[0]); -} - -static void nvme_async_event_work(struct work_struct *work) -{ - struct nvme_ctrl *ctrl = - container_of(work, struct nvme_ctrl, async_event_work); - - nvme_aen_uevent(ctrl); - - /* - * The transport drivers must guarantee AER submission here is safe by - * flushing ctrl async_event_work after changing the controller state - * from LIVE and before freeing the admin queue. 
- */ - if (ctrl->state == NVME_CTRL_LIVE) - ctrl->ops->submit_async_event(ctrl); -} - -static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) -{ - - u32 csts; - - if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) - return false; - - if (csts == ~0) - return false; - - return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP)); -} - -static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) -{ - struct nvme_fw_slot_info_log *log; - - log = kmalloc(sizeof(*log), GFP_KERNEL); - if (!log) - return; - - if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM, - log, sizeof(*log), 0)) - dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); - kfree(log); -} - -static void nvme_fw_act_work(struct work_struct *work) -{ - struct nvme_ctrl *ctrl = container_of(work, - struct nvme_ctrl, fw_act_work); - unsigned long fw_act_timeout; - - if (ctrl->mtfa) - fw_act_timeout = jiffies + - msecs_to_jiffies(ctrl->mtfa * 100); - else - fw_act_timeout = jiffies + - msecs_to_jiffies(admin_timeout * 1000); - - nvme_stop_queues(ctrl); - while (nvme_ctrl_pp_status(ctrl)) { - if (time_after(jiffies, fw_act_timeout)) { - dev_warn(ctrl->device, - "Fw activation timeout, reset controller\n"); - nvme_try_sched_reset(ctrl); - return; - } - msleep(100); - } - - if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) - return; - - nvme_start_queues(ctrl); - /* read FW slot information to clear the AER */ - nvme_get_fw_slot_info(ctrl); -} - -static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) -{ - u32 aer_notice_type = (result & 0xff00) >> 8; - - trace_nvme_async_event(ctrl, aer_notice_type); - - switch (aer_notice_type) { - case NVME_AER_NOTICE_NS_CHANGED: - set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events); - nvme_queue_scan(ctrl); - break; - case NVME_AER_NOTICE_FW_ACT_STARTING: - /* - * We are (ab)using the RESETTING state to prevent subsequent - * recovery actions from interfering with the controller's - * firmware activation. 
- */ - if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) - queue_work(nvme_wq, &ctrl->fw_act_work); - break; -#ifdef CONFIG_NVME_MULTIPATH - case NVME_AER_NOTICE_ANA: - if (!ctrl->ana_log_buf) - break; - queue_work(nvme_wq, &ctrl->ana_work); - break; -#endif - case NVME_AER_NOTICE_DISC_CHANGED: - ctrl->aen_result = result; - break; - default: - dev_warn(ctrl->device, "async event result %08x\n", result); - } -} - -void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, - volatile union nvme_result *res) -{ - u32 result = le32_to_cpu(res->u32); - u32 aer_type = result & 0x07; - - if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) - return; - - switch (aer_type) { - case NVME_AER_NOTICE: - nvme_handle_aen_notice(ctrl, result); - break; - case NVME_AER_ERROR: - case NVME_AER_SMART: - case NVME_AER_CSS: - case NVME_AER_VS: - trace_nvme_async_event(ctrl, aer_type); - ctrl->aen_result = result; - break; - default: - break; - } - queue_work(nvme_wq, &ctrl->async_event_work); -} -EXPORT_SYMBOL_GPL(nvme_complete_async_event); - -void nvme_stop_ctrl(struct nvme_ctrl *ctrl) -{ - nvme_mpath_stop(ctrl); - nvme_stop_keep_alive(ctrl); - flush_work(&ctrl->async_event_work); - cancel_work_sync(&ctrl->fw_act_work); - if (ctrl->ops->stop_ctrl) - ctrl->ops->stop_ctrl(ctrl); -} -EXPORT_SYMBOL_GPL(nvme_stop_ctrl); - -void nvme_start_ctrl(struct nvme_ctrl *ctrl) -{ - nvme_start_keep_alive(ctrl); - - nvme_enable_aen(ctrl); - - if (ctrl->queue_count > 1) { - nvme_queue_scan(ctrl); - nvme_start_queues(ctrl); - nvme_mpath_update(ctrl); - } -} -EXPORT_SYMBOL_GPL(nvme_start_ctrl); - -void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) -{ - nvme_fault_inject_fini(&ctrl->fault_inject); - dev_pm_qos_hide_latency_tolerance(ctrl->device); - cdev_device_del(&ctrl->cdev, ctrl->device); - nvme_put_ctrl(ctrl); -} -EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); - -static void nvme_free_cels(struct nvme_ctrl *ctrl) -{ - struct nvme_effects_log *cel; - unsigned long i; - - xa_for_each (&ctrl->cels, i, cel) { - xa_erase(&ctrl->cels, i); - kfree(cel); - } - - xa_destroy(&ctrl->cels); -} - -static void nvme_free_ctrl(struct device *dev) -{ - struct nvme_ctrl *ctrl = - container_of(dev, struct nvme_ctrl, ctrl_device); - struct nvme_subsystem *subsys = ctrl->subsys; - - if (!subsys || ctrl->instance != subsys->instance) - ida_simple_remove(&nvme_instance_ida, ctrl->instance); - - nvme_free_cels(ctrl); - nvme_mpath_uninit(ctrl); - __free_page(ctrl->discard_page); - - if (subsys) { - mutex_lock(&nvme_subsystems_lock); - list_del(&ctrl->subsys_entry); - sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); - mutex_unlock(&nvme_subsystems_lock); - } - - ctrl->ops->free_ctrl(ctrl); - - if (subsys) - nvme_put_subsystem(subsys); -} - -/* - * Initialize a NVMe controller structures. This needs to be called during - * earliest initialization so that we have the initialized structured around - * during probing. 
- */ -int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, - const struct nvme_ctrl_ops *ops, unsigned long quirks) -{ - int ret; - - ctrl->state = NVME_CTRL_NEW; - spin_lock_init(&ctrl->lock); - mutex_init(&ctrl->scan_lock); - INIT_LIST_HEAD(&ctrl->namespaces); - xa_init(&ctrl->cels); - init_rwsem(&ctrl->namespaces_rwsem); - ctrl->dev = dev; - ctrl->ops = ops; - ctrl->quirks = quirks; - ctrl->numa_node = NUMA_NO_NODE; - INIT_WORK(&ctrl->scan_work, nvme_scan_work); - INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); - INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); - INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); - init_waitqueue_head(&ctrl->state_wq); - - INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); - memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); - ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; - - BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) > - PAGE_SIZE); - ctrl->discard_page = alloc_page(GFP_KERNEL); - if (!ctrl->discard_page) { - ret = -ENOMEM; - goto out; - } - - ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); - if (ret < 0) - goto out; - ctrl->instance = ret; - - device_initialize(&ctrl->ctrl_device); - ctrl->device = &ctrl->ctrl_device; - ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); - ctrl->device->class = nvme_class; - ctrl->device->parent = ctrl->dev; - ctrl->device->groups = nvme_dev_attr_groups; - ctrl->device->release = nvme_free_ctrl; - dev_set_drvdata(ctrl->device, ctrl); - ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); - if (ret) - goto out_release_instance; - - nvme_get_ctrl(ctrl); - cdev_init(&ctrl->cdev, &nvme_dev_fops); - ctrl->cdev.owner = ops->module; - ret = cdev_device_add(&ctrl->cdev, ctrl->device); - if (ret) - goto out_free_name; - - /* - * Initialize latency tolerance controls. The sysfs files won't - * be visible to userspace unless the device actually supports APST. - */ - ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; - dev_pm_qos_update_user_latency_tolerance(ctrl->device, - min(default_ps_max_latency_us, (unsigned long)S32_MAX)); - - nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); - nvme_mpath_init_ctrl(ctrl); - - return 0; -out_free_name: - nvme_put_ctrl(ctrl); - kfree_const(ctrl->device->kobj.name); -out_release_instance: - ida_simple_remove(&nvme_instance_ida, ctrl->instance); -out: - if (ctrl->discard_page) - __free_page(ctrl->discard_page); - return ret; -} -EXPORT_SYMBOL_GPL(nvme_init_ctrl); - -/** - * nvme_kill_queues(): Ends all namespace queues - * @ctrl: the dead controller that needs to end - * - * Call this function when the driver determines it is unable to get the - * controller in a state capable of servicing IO. 
- */ -void nvme_kill_queues(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - - /* Forcibly unquiesce queues to avoid blocking dispatch */ - if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) - blk_mq_unquiesce_queue(ctrl->admin_q); - - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_set_queue_dying(ns); - - up_read(&ctrl->namespaces_rwsem); -} -EXPORT_SYMBOL_GPL(nvme_kill_queues); - -void nvme_unfreeze(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_unfreeze_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); -} -EXPORT_SYMBOL_GPL(nvme_unfreeze); - -int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { - timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); - if (timeout <= 0) - break; - } - up_read(&ctrl->namespaces_rwsem); - return timeout; -} -EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); - -void nvme_wait_freeze(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_freeze_queue_wait(ns->queue); - up_read(&ctrl->namespaces_rwsem); -} -EXPORT_SYMBOL_GPL(nvme_wait_freeze); - -void nvme_start_freeze(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - blk_freeze_queue_start(ns->queue); - up_read(&ctrl->namespaces_rwsem); -} -EXPORT_SYMBOL_GPL(nvme_start_freeze); - -void nvme_stop_queues(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_quiesce_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); -} -EXPORT_SYMBOL_GPL(nvme_stop_queues); - -void nvme_start_queues(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_unquiesce_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); -} -EXPORT_SYMBOL_GPL(nvme_start_queues); - -void nvme_sync_io_queues(struct nvme_ctrl *ctrl) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - blk_sync_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); -} -EXPORT_SYMBOL_GPL(nvme_sync_io_queues); - -void nvme_sync_queues(struct nvme_ctrl *ctrl) -{ - nvme_sync_io_queues(ctrl); - if (ctrl->admin_q) - blk_sync_queue(ctrl->admin_q); -} -EXPORT_SYMBOL_GPL(nvme_sync_queues); - -struct nvme_ctrl *nvme_ctrl_from_file(struct file *file) -{ - if (file->f_op != &nvme_dev_fops) - return NULL; - return file->private_data; -} -EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU); - -/* - * Check we didn't inadvertently grow the command structure sizes: - */ -static inline void _nvme_check_size(void) -{ - BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_identify) != 64); - BUILD_BUG_ON(sizeof(struct nvme_features) != 64); - BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64); - BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); - BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64); - BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64); - BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); - BUILD_BUG_ON(sizeof(struct 
nvme_get_log_page_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); - BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); - BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); - BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64); -} - - -static int __init nvme_core_init(void) -{ - int result = -ENOMEM; - - _nvme_check_size(); - - nvme_wq = alloc_workqueue("nvme-wq", - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); - if (!nvme_wq) - goto out; - - nvme_reset_wq = alloc_workqueue("nvme-reset-wq", - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); - if (!nvme_reset_wq) - goto destroy_wq; - - nvme_delete_wq = alloc_workqueue("nvme-delete-wq", - WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); - if (!nvme_delete_wq) - goto destroy_reset_wq; - - result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); - if (result < 0) - goto destroy_delete_wq; - - nvme_class = class_create(THIS_MODULE, "nvme"); - if (IS_ERR(nvme_class)) { - result = PTR_ERR(nvme_class); - goto unregister_chrdev; - } - nvme_class->dev_uevent = nvme_class_uevent; - - nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); - if (IS_ERR(nvme_subsys_class)) { - result = PTR_ERR(nvme_subsys_class); - goto destroy_class; - } - return 0; - -destroy_class: - class_destroy(nvme_class); -unregister_chrdev: - unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); -destroy_delete_wq: - destroy_workqueue(nvme_delete_wq); -destroy_reset_wq: - destroy_workqueue(nvme_reset_wq); -destroy_wq: - destroy_workqueue(nvme_wq); -out: - return result; -} - -static void __exit nvme_core_exit(void) -{ - class_destroy(nvme_subsys_class); - class_destroy(nvme_class); - unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); - destroy_workqueue(nvme_delete_wq); - destroy_workqueue(nvme_reset_wq); - destroy_workqueue(nvme_wq); - ida_destroy(&nvme_instance_ida); -} - -MODULE_LICENSE("GPL"); -MODULE_VERSION("1.0"); -module_init(nvme_core_init); -module_exit(nvme_core_exit); diff --git a/feed/kmod-nvme/src/fabrics.h b/feed/kmod-nvme/src/fabrics.h deleted file mode 100644 index 78467cb..0000000 --- a/feed/kmod-nvme/src/fabrics.h +++ /dev/null @@ -1,192 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * NVMe over Fabrics common host code. - * Copyright (c) 2015-2016 HGST, a Western Digital Company. - */ -#ifndef _NVME_FABRICS_H -#define _NVME_FABRICS_H 1 - -#include -#include - -#define NVMF_MIN_QUEUE_SIZE 16 -#define NVMF_MAX_QUEUE_SIZE 1024 -#define NVMF_DEF_QUEUE_SIZE 128 -#define NVMF_DEF_RECONNECT_DELAY 10 -/* default to 600 seconds of reconnect attempts before giving up */ -#define NVMF_DEF_CTRL_LOSS_TMO 600 - -/* - * Define a host as seen by the target. We allocate one at boot, but also - * allow the override it when creating controllers. This is both to provide - * persistence of the Host NQN over multiple boots, and to allow using - * multiple ones, for example in a container scenario. Because we must not - * use different Host NQNs with the same Host ID we generate a Host ID and - * use this structure to keep track of the relation between the two. 
- */ -struct nvmf_host { - struct kref ref; - struct list_head list; - char nqn[NVMF_NQN_SIZE]; - uuid_t id; -}; - -/** - * enum nvmf_parsing_opts - used to define the sysfs parsing options used. - */ -enum { - NVMF_OPT_ERR = 0, - NVMF_OPT_TRANSPORT = 1 << 0, - NVMF_OPT_NQN = 1 << 1, - NVMF_OPT_TRADDR = 1 << 2, - NVMF_OPT_TRSVCID = 1 << 3, - NVMF_OPT_QUEUE_SIZE = 1 << 4, - NVMF_OPT_NR_IO_QUEUES = 1 << 5, - NVMF_OPT_TL_RETRY_COUNT = 1 << 6, - NVMF_OPT_KATO = 1 << 7, - NVMF_OPT_HOSTNQN = 1 << 8, - NVMF_OPT_RECONNECT_DELAY = 1 << 9, - NVMF_OPT_HOST_TRADDR = 1 << 10, - NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, - NVMF_OPT_HOST_ID = 1 << 12, - NVMF_OPT_DUP_CONNECT = 1 << 13, - NVMF_OPT_DISABLE_SQFLOW = 1 << 14, - NVMF_OPT_HDR_DIGEST = 1 << 15, - NVMF_OPT_DATA_DIGEST = 1 << 16, - NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, - NVMF_OPT_NR_POLL_QUEUES = 1 << 18, - NVMF_OPT_TOS = 1 << 19, -}; - -/** - * struct nvmf_ctrl_options - Used to hold the options specified - * with the parsing opts enum. - * @mask: Used by the fabrics library to parse through sysfs options - * on adding a NVMe controller. - * @transport: Holds the fabric transport "technology name" (for a lack of - * better description) that will be used by an NVMe controller - * being added. - * @subsysnqn: Hold the fully qualified NQN subystem name (format defined - * in the NVMe specification, "NVMe Qualified Names"). - * @traddr: The transport-specific TRADDR field for a port on the - * subsystem which is adding a controller. - * @trsvcid: The transport-specific TRSVCID field for a port on the - * subsystem which is adding a controller. - * @host_traddr: A transport-specific field identifying the NVME host port - * to use for the connection to the controller. - * @queue_size: Number of IO queue elements. - * @nr_io_queues: Number of controller IO queues that will be established. - * @reconnect_delay: Time between two consecutive reconnect attempts. - * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. - * @kato: Keep-alive timeout. - * @host: Virtual NVMe host, contains the NQN and Host ID. - * @max_reconnects: maximum number of allowed reconnect attempts before removing - * the controller, (-1) means reconnect forever, zero means remove - * immediately; - * @disable_sqflow: disable controller sq flow control - * @hdr_digest: generate/verify header digest (TCP) - * @data_digest: generate/verify data digest (TCP) - * @nr_write_queues: number of queues for write I/O - * @nr_poll_queues: number of queues for polling I/O - * @tos: type of service - */ -struct nvmf_ctrl_options { - unsigned mask; - char *transport; - char *subsysnqn; - char *traddr; - char *trsvcid; - char *host_traddr; - size_t queue_size; - unsigned int nr_io_queues; - unsigned int reconnect_delay; - bool discovery_nqn; - bool duplicate_connect; - unsigned int kato; - struct nvmf_host *host; - int max_reconnects; - bool disable_sqflow; - bool hdr_digest; - bool data_digest; - unsigned int nr_write_queues; - unsigned int nr_poll_queues; - int tos; -}; - -/* - * struct nvmf_transport_ops - used to register a specific - * fabric implementation of NVMe fabrics. - * @entry: Used by the fabrics library to add the new - * registration entry to its linked-list internal tree. - * @module: Transport module reference - * @name: Name of the NVMe fabric driver implementation. - * @required_opts: sysfs command-line options that must be specified - * when adding a new NVMe controller. 
- * @allowed_opts: sysfs command-line options that can be specified - * when adding a new NVMe controller. - * @create_ctrl(): function pointer that points to a non-NVMe - * implementation-specific fabric technology - * that would go into starting up that fabric - * for the purpose of conneciton to an NVMe controller - * using that fabric technology. - * - * Notes: - * 1. At minimum, 'required_opts' and 'allowed_opts' should - * be set to the same enum parsing options defined earlier. - * 2. create_ctrl() must be defined (even if it does nothing) - * 3. struct nvmf_transport_ops must be statically allocated in the - * modules .bss section so that a pure module_get on @module - * prevents the memory from beeing freed. - */ -struct nvmf_transport_ops { - struct list_head entry; - struct module *module; - const char *name; - int required_opts; - int allowed_opts; - struct nvme_ctrl *(*create_ctrl)(struct device *dev, - struct nvmf_ctrl_options *opts); -}; - -static inline bool -nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, - struct nvmf_ctrl_options *opts) -{ - if (ctrl->state == NVME_CTRL_DELETING || - ctrl->state == NVME_CTRL_DELETING_NOIO || - ctrl->state == NVME_CTRL_DEAD || - strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || - strcmp(opts->host->nqn, ctrl->opts->host->nqn) || - memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t))) - return false; - - return true; -} - -int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); -int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); -int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); -int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); -int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll); -int nvmf_register_transport(struct nvmf_transport_ops *ops); -void nvmf_unregister_transport(struct nvmf_transport_ops *ops); -void nvmf_free_options(struct nvmf_ctrl_options *opts); -int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); -bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); -blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, - struct request *rq); -bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live); -bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, - struct nvmf_ctrl_options *opts); - -static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live) -{ - if (likely(ctrl->state == NVME_CTRL_LIVE || - ctrl->state == NVME_CTRL_DELETING)) - return true; - return __nvmf_check_ready(ctrl, rq, queue_live); -} - -#endif /* _NVME_FABRICS_H */ diff --git a/feed/kmod-nvme/src/nvme.h b/feed/kmod-nvme/src/nvme.h deleted file mode 100644 index 58cf9e3..0000000 --- a/feed/kmod-nvme/src/nvme.h +++ /dev/null @@ -1,893 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2011-2014, Intel Corporation. 
- */ - -#ifndef _NVME_H -#define _NVME_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -extern unsigned int nvme_io_timeout; -#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) - -extern unsigned int admin_timeout; -#define ADMIN_TIMEOUT (admin_timeout * HZ) - -#define NVME_DEFAULT_KATO 5 -#define NVME_KATO_GRACE 10 - -#ifdef CONFIG_ARCH_NO_SG_CHAIN -#define NVME_INLINE_SG_CNT 0 -#define NVME_INLINE_METADATA_SG_CNT 0 -#else -#define NVME_INLINE_SG_CNT 2 -#define NVME_INLINE_METADATA_SG_CNT 1 -#endif - -/* - * Default to a 4K page size, with the intention to update this - * path in the future to accommodate architectures with differing - * kernel and IO page sizes. - */ -#define NVME_CTRL_PAGE_SHIFT 12 -#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT) - -extern struct workqueue_struct *nvme_wq; -extern struct workqueue_struct *nvme_reset_wq; -extern struct workqueue_struct *nvme_delete_wq; - -enum { - NVME_NS_LBA = 0, - NVME_NS_LIGHTNVM = 1, -}; - -/* - * List of workarounds for devices that required behavior not specified in - * the standard. - */ -enum nvme_quirks { - /* - * Prefers I/O aligned to a stripe size specified in a vendor - * specific Identify field. - */ - NVME_QUIRK_STRIPE_SIZE = (1 << 0), - - /* - * The controller doesn't handle Identify value others than 0 or 1 - * correctly. - */ - NVME_QUIRK_IDENTIFY_CNS = (1 << 1), - - /* - * The controller deterministically returns O's on reads to - * logical blocks that deallocate was called on. - */ - NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2), - - /* - * The controller needs a delay before starts checking the device - * readiness, which is done by reading the NVME_CSTS_RDY bit. - */ - NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3), - - /* - * APST should not be used. - */ - NVME_QUIRK_NO_APST = (1 << 4), - - /* - * The deepest sleep state should not be used. - */ - NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), - - /* - * Supports the LighNVM command set if indicated in vs[1]. - */ - NVME_QUIRK_LIGHTNVM = (1 << 6), - - /* - * Set MEDIUM priority on SQ creation - */ - NVME_QUIRK_MEDIUM_PRIO_SQ = (1 << 7), - - /* - * Ignore device provided subnqn. - */ - NVME_QUIRK_IGNORE_DEV_SUBNQN = (1 << 8), - - /* - * Broken Write Zeroes. - */ - NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9), - - /* - * Force simple suspend/resume path. - */ - NVME_QUIRK_SIMPLE_SUSPEND = (1 << 10), - - /* - * Use only one interrupt vector for all queues - */ - NVME_QUIRK_SINGLE_VECTOR = (1 << 11), - - /* - * Use non-standard 128 bytes SQEs. - */ - NVME_QUIRK_128_BYTES_SQES = (1 << 12), - - /* - * Prevent tag overlap between queues - */ - NVME_QUIRK_SHARED_TAGS = (1 << 13), - - /* - * Don't change the value of the temperature threshold feature - */ - NVME_QUIRK_NO_TEMP_THRESH_CHANGE = (1 << 14), - - /* - * The controller doesn't handle the Identify Namespace - * Identification Descriptor list subcommand despite claiming - * NVMe 1.3 compliance. - */ - NVME_QUIRK_NO_NS_DESC_LIST = (1 << 15), - - /* - * The controller requires the command_id value be be limited, so skip - * encoding the generation sequence number. - */ - NVME_QUIRK_SKIP_CID_GEN = (1 << 17), - - /* - * Reports garbage in the namespace identifiers (eui64, nguid, uuid). - */ - NVME_QUIRK_BOGUS_NID = (1 << 18), -}; - -/* - * Common request structure for NVMe passthrough. All drivers must have - * this structure as the first member of their request-private data. 
- */ -struct nvme_request { - struct nvme_command *cmd; - union nvme_result result; - u8 genctr; - u8 retries; - u8 flags; - u16 status; - struct nvme_ctrl *ctrl; -}; - -/* - * Mark a bio as coming in through the mpath node. - */ -#define REQ_NVME_MPATH REQ_DRV - -enum { - NVME_REQ_CANCELLED = (1 << 0), - NVME_REQ_USERCMD = (1 << 1), -}; - -static inline struct nvme_request *nvme_req(struct request *req) -{ - return blk_mq_rq_to_pdu(req); -} - -static inline u16 nvme_req_qid(struct request *req) -{ - if (!req->q->queuedata) - return 0; - return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; -} - -/* The below value is the specific amount of delay needed before checking - * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the - * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was - * found empirically. - */ -#define NVME_QUIRK_DELAY_AMOUNT 2300 - -/* - * enum nvme_ctrl_state: Controller state - * - * @NVME_CTRL_NEW: New controller just allocated, initial state - * @NVME_CTRL_LIVE: Controller is connected and I/O capable - * @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset) - * @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the - * transport - * @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion) - * @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not - * disabled/failed immediately. This state comes - * after all async event processing took place and - * before ns removal and the controller deletion - * progress - * @NVME_CTRL_DEAD: Controller is non-present/unresponsive during - * shutdown or removal. In this case we forcibly - * kill all inflight I/O as they have no chance to - * complete - */ -enum nvme_ctrl_state { - NVME_CTRL_NEW, - NVME_CTRL_LIVE, - NVME_CTRL_RESETTING, - NVME_CTRL_CONNECTING, - NVME_CTRL_DELETING, - NVME_CTRL_DELETING_NOIO, - NVME_CTRL_DEAD, -}; - -struct nvme_fault_inject { -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - struct fault_attr attr; - struct dentry *parent; - bool dont_retry; /* DNR, do not retry */ - u16 status; /* status code */ -#endif -}; - -struct nvme_ctrl { - bool comp_seen; - enum nvme_ctrl_state state; - bool identified; - spinlock_t lock; - struct mutex scan_lock; - const struct nvme_ctrl_ops *ops; - struct request_queue *admin_q; - struct request_queue *connect_q; - struct request_queue *fabrics_q; - struct device *dev; - int instance; - int numa_node; - struct blk_mq_tag_set *tagset; - struct blk_mq_tag_set *admin_tagset; - struct list_head namespaces; - struct rw_semaphore namespaces_rwsem; - struct device ctrl_device; - struct device *device; /* char device */ - struct cdev cdev; - struct work_struct reset_work; - struct work_struct delete_work; - wait_queue_head_t state_wq; - - struct nvme_subsystem *subsys; - struct list_head subsys_entry; - - struct opal_dev *opal_dev; - - char name[12]; - u16 cntlid; - - u32 ctrl_config; - u16 mtfa; - u32 queue_count; - - u64 cap; - u32 max_hw_sectors; - u32 max_segments; - u32 max_integrity_segments; -#ifdef CONFIG_BLK_DEV_ZONED - u32 max_zone_append; -#endif - u16 crdt[3]; - u16 oncs; - u16 oacs; - u16 nssa; - u16 nr_streams; - u16 sqsize; - u32 max_namespaces; - atomic_t abort_limit; - u8 vwc; - u32 vs; - u32 sgls; - u16 kas; - u8 npss; - u8 apsta; - u16 wctemp; - u16 cctemp; - u32 oaes; - u32 aen_result; - u32 ctratt; - unsigned int shutdown_timeout; - unsigned int kato; - bool subsystem; - unsigned long quirks; - struct nvme_id_power_state psd[32]; - struct nvme_effects_log *effects; - struct xarray 
cels; - struct work_struct scan_work; - struct work_struct async_event_work; - struct delayed_work ka_work; - struct nvme_command ka_cmd; - struct work_struct fw_act_work; - unsigned long events; - -#ifdef CONFIG_NVME_MULTIPATH - /* asymmetric namespace access: */ - u8 anacap; - u8 anatt; - u32 anagrpmax; - u32 nanagrpid; - struct mutex ana_lock; - struct nvme_ana_rsp_hdr *ana_log_buf; - size_t ana_log_size; - struct timer_list anatt_timer; - struct work_struct ana_work; -#endif - - /* Power saving configuration */ - u64 ps_max_latency_us; - bool apst_enabled; - - /* PCIe only: */ - u32 hmpre; - u32 hmmin; - u32 hmminds; - u16 hmmaxd; - - /* Fabrics only */ - u32 ioccsz; - u32 iorcsz; - u16 icdoff; - u16 maxcmd; - int nr_reconnects; - struct nvmf_ctrl_options *opts; - - struct page *discard_page; - unsigned long discard_page_busy; - - struct nvme_fault_inject fault_inject; -}; - -enum nvme_iopolicy { - NVME_IOPOLICY_NUMA, - NVME_IOPOLICY_RR, -}; - -struct nvme_subsystem { - int instance; - struct device dev; - /* - * Because we unregister the device on the last put we need - * a separate refcount. - */ - struct kref ref; - struct list_head entry; - struct mutex lock; - struct list_head ctrls; - struct list_head nsheads; - char subnqn[NVMF_NQN_SIZE]; - char serial[20]; - char model[40]; - char firmware_rev[8]; - u8 cmic; - u16 vendor_id; - u16 awupf; /* 0's based awupf value. */ - struct ida ns_ida; -#ifdef CONFIG_NVME_MULTIPATH - enum nvme_iopolicy iopolicy; -#endif -}; - -/* - * Container structure for uniqueue namespace identifiers. - */ -struct nvme_ns_ids { - u8 eui64[8]; - u8 nguid[16]; - uuid_t uuid; - u8 csi; -}; - -/* - * Anchor structure for namespaces. There is one for each namespace in a - * NVMe subsystem that any of our controllers can see, and the namespace - * structure for each controller is chained of it. For private namespaces - * there is a 1:1 relation to our namespace structures, that is ->list - * only ever has a single entry for private namespaces. 
- */ -struct nvme_ns_head { - struct list_head list; - struct srcu_struct srcu; - struct nvme_subsystem *subsys; - unsigned ns_id; - struct nvme_ns_ids ids; - struct list_head entry; - struct kref ref; - bool shared; - int instance; - struct nvme_effects_log *effects; -#ifdef CONFIG_NVME_MULTIPATH - struct gendisk *disk; - struct bio_list requeue_list; - spinlock_t requeue_lock; - struct work_struct requeue_work; - struct mutex lock; - unsigned long flags; -#define NVME_NSHEAD_DISK_LIVE 0 - struct nvme_ns __rcu *current_path[]; -#endif -}; - -enum nvme_ns_features { - NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ - NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ -}; - -struct nvme_ns { - struct list_head list; - - struct nvme_ctrl *ctrl; - struct request_queue *queue; - struct gendisk *disk; -#ifdef CONFIG_NVME_MULTIPATH - enum nvme_ana_state ana_state; - u32 ana_grpid; -#endif - struct list_head siblings; - struct nvm_dev *ndev; - struct kref kref; - struct nvme_ns_head *head; - - int lba_shift; - u16 ms; - u16 sgs; - u32 sws; - u8 pi_type; -#ifdef CONFIG_BLK_DEV_ZONED - u64 zsze; -#endif - unsigned long features; - unsigned long flags; -#define NVME_NS_REMOVING 0 -#define NVME_NS_DEAD 1 -#define NVME_NS_ANA_PENDING 2 - - struct nvme_fault_inject fault_inject; - -}; - -/* NVMe ns supports metadata actions by the controller (generate/strip) */ -static inline bool nvme_ns_has_pi(struct nvme_ns *ns) -{ - return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); -} - -struct nvme_ctrl_ops { - const char *name; - struct module *module; - unsigned int flags; -#define NVME_F_FABRICS (1 << 0) -#define NVME_F_METADATA_SUPPORTED (1 << 1) -#define NVME_F_PCI_P2PDMA (1 << 2) - int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); - int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); - int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); - void (*free_ctrl)(struct nvme_ctrl *ctrl); - void (*submit_async_event)(struct nvme_ctrl *ctrl); - void (*delete_ctrl)(struct nvme_ctrl *ctrl); - void (*stop_ctrl)(struct nvme_ctrl *ctrl); - int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); -}; - -/* - * nvme command_id is constructed as such: - * | xxxx | xxxxxxxxxxxx | - * gen request tag - */ -#define nvme_genctr_mask(gen) (gen & 0xf) -#define nvme_cid_install_genctr(gen) (nvme_genctr_mask(gen) << 12) -#define nvme_genctr_from_cid(cid) ((cid & 0xf000) >> 12) -#define nvme_tag_from_cid(cid) (cid & 0xfff) - -static inline u16 nvme_cid(struct request *rq) -{ - return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag; -} - -static inline struct request *nvme_find_rq(struct blk_mq_tags *tags, - u16 command_id) -{ - u8 genctr = nvme_genctr_from_cid(command_id); - u16 tag = nvme_tag_from_cid(command_id); - struct request *rq; - - rq = blk_mq_tag_to_rq(tags, tag); - if (unlikely(!rq)) { - pr_err("could not locate request for tag %#x\n", - tag); - return NULL; - } - if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) { - dev_err(nvme_req(rq)->ctrl->device, - "request %#x genctr mismatch (got %#x expected %#x)\n", - tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr)); - return NULL; - } - return rq; -} - -static inline struct request *nvme_cid_to_rq(struct blk_mq_tags *tags, - u16 command_id) -{ - return blk_mq_tag_to_rq(tags, nvme_tag_from_cid(command_id)); -} - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS -void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, - const char *dev_name); -void 
nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject); -void nvme_should_fail(struct request *req); -#else -static inline void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, - const char *dev_name) -{ -} -static inline void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inj) -{ -} -static inline void nvme_should_fail(struct request *req) {} -#endif - -static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) -{ - if (!ctrl->subsystem) - return -ENOTTY; - return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); -} - -/* - * Convert a 512B sector number to a device logical block number. - */ -static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector) -{ - return sector >> (ns->lba_shift - SECTOR_SHIFT); -} - -/* - * Convert a device logical block number to a 512B sector number. - */ -static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba) -{ - return lba << (ns->lba_shift - SECTOR_SHIFT); -} - -/* - * Convert byte length to nvme's 0-based num dwords - */ -static inline u32 nvme_bytes_to_numd(size_t len) -{ - return (len >> 2) - 1; -} - -static inline bool nvme_is_ana_error(u16 status) -{ - switch (status & 0x7ff) { - case NVME_SC_ANA_TRANSITION: - case NVME_SC_ANA_INACCESSIBLE: - case NVME_SC_ANA_PERSISTENT_LOSS: - return true; - default: - return false; - } -} - -static inline bool nvme_is_path_error(u16 status) -{ - /* check for a status code type of 'path related status' */ - return (status & 0x700) == 0x300; -} - -/* - * Fill in the status and result information from the CQE, and then figure out - * if blk-mq will need to use IPI magic to complete the request, and if yes do - * so. If not let the caller complete the request without an indirect function - * call. - */ -static inline bool nvme_try_complete_req(struct request *req, __le16 status, - union nvme_result result) -{ - struct nvme_request *rq = nvme_req(req); - - rq->status = le16_to_cpu(status) >> 1; - rq->result = result; - /* inject error when permitted by fault injection framework */ - nvme_should_fail(req); - if (unlikely(blk_should_fake_timeout(req->q))) - return true; - return blk_mq_complete_request_remote(req); -} - -static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl) -{ - get_device(ctrl->device); -} - -static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) -{ - put_device(ctrl->device); -} - -static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) -{ - return !qid && - nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH; -} - -void nvme_complete_rq(struct request *req); -bool nvme_cancel_request(struct request *req, void *data, bool reserved); -void nvme_cancel_tagset(struct nvme_ctrl *ctrl); -void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); -bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, - enum nvme_ctrl_state new_state); -bool nvme_wait_reset(struct nvme_ctrl *ctrl); -int nvme_disable_ctrl(struct nvme_ctrl *ctrl); -int nvme_enable_ctrl(struct nvme_ctrl *ctrl); -int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); -int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, - const struct nvme_ctrl_ops *ops, unsigned long quirks); -void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); -void nvme_start_ctrl(struct nvme_ctrl *ctrl); -void nvme_stop_ctrl(struct nvme_ctrl *ctrl); -int nvme_init_identify(struct nvme_ctrl *ctrl); - -void nvme_remove_namespaces(struct nvme_ctrl *ctrl); - -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, - bool send); - -void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 
status, - volatile union nvme_result *res); - -void nvme_stop_queues(struct nvme_ctrl *ctrl); -void nvme_start_queues(struct nvme_ctrl *ctrl); -void nvme_kill_queues(struct nvme_ctrl *ctrl); -void nvme_sync_queues(struct nvme_ctrl *ctrl); -void nvme_sync_io_queues(struct nvme_ctrl *ctrl); -void nvme_unfreeze(struct nvme_ctrl *ctrl); -void nvme_wait_freeze(struct nvme_ctrl *ctrl); -int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); -void nvme_start_freeze(struct nvme_ctrl *ctrl); - -#define NVME_QID_ANY -1 -struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags); -struct request *nvme_alloc_request_qid(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); -void nvme_cleanup_cmd(struct request *req); -blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmd); -int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buf, unsigned bufflen); -int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags, bool poll); -int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, - unsigned int dword11, void *buffer, size_t buflen, - u32 *result); -int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, - unsigned int dword11, void *buffer, size_t buflen, - u32 *result); -int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); -void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); -int nvme_reset_ctrl(struct nvme_ctrl *ctrl); -int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); -int nvme_try_sched_reset(struct nvme_ctrl *ctrl); -int nvme_delete_ctrl(struct nvme_ctrl *ctrl); - -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, - void *log, size_t size, u64 offset); -struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, - struct nvme_ns_head **head, int *srcu_idx); -void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); - -extern const struct attribute_group *nvme_ns_id_attr_groups[]; -extern const struct block_device_operations nvme_ns_head_ops; - -#ifdef CONFIG_NVME_MULTIPATH -static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) -{ - return ctrl->ana_log_buf != NULL; -} - -void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); -void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); -void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); -void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags); -void nvme_failover_req(struct request *req); -void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); -int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); -void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); -void nvme_mpath_remove_disk(struct nvme_ns_head *head); -int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); -void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); -void nvme_mpath_update(struct nvme_ctrl *ctrl); -void nvme_mpath_uninit(struct nvme_ctrl *ctrl); -void nvme_mpath_stop(struct nvme_ctrl *ctrl); -bool nvme_mpath_clear_current_path(struct nvme_ns *ns); -void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); -struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); -blk_qc_t nvme_ns_head_submit_bio(struct bio *bio); - -static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) -{ - 
struct nvme_ns_head *head = ns->head; - - if (head->disk && list_empty(&head->list)) - kblockd_schedule_work(&head->requeue_work); -} - -static inline void nvme_trace_bio_complete(struct request *req, - blk_status_t status) -{ - struct nvme_ns *ns = req->q->queuedata; - - if (req->cmd_flags & REQ_NVME_MPATH) - trace_block_bio_complete(ns->head->disk->queue, req->bio); -} - -extern struct device_attribute dev_attr_ana_grpid; -extern struct device_attribute dev_attr_ana_state; -extern struct device_attribute subsys_attr_iopolicy; - -#else -static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) -{ - return false; -} -/* - * Without the multipath code enabled, multiple controller per subsystems are - * visible as devices and thus we cannot use the subsystem instance. - */ -static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags) -{ - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); -} - -static inline void nvme_failover_req(struct request *req) -{ -} -static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) -{ -} -static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, - struct nvme_ns_head *head) -{ - return 0; -} -static inline void nvme_mpath_add_disk(struct nvme_ns *ns, - struct nvme_id_ns *id) -{ -} -static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) -{ -} -static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns) -{ - return false; -} -static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) -{ -} -static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) -{ -} -static inline void nvme_trace_bio_complete(struct request *req, - blk_status_t status) -{ -} -static inline void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) -{ -} -static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, - struct nvme_id_ctrl *id) -{ - if (ctrl->subsys->cmic & (1 << 3)) - dev_warn(ctrl->device, -"Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n"); - return 0; -} -static inline void nvme_mpath_update(struct nvme_ctrl *ctrl) -{ -} -static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) -{ -} -static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) -{ -} -static inline void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) -{ -} -static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) -{ -} -static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) -{ -} -#endif /* CONFIG_NVME_MULTIPATH */ - -int nvme_revalidate_zones(struct nvme_ns *ns); -#ifdef CONFIG_BLK_DEV_ZONED -int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf); -int nvme_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); - -blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmnd, - enum nvme_zone_mgmt_action action); -#else -#define nvme_report_zones NULL - -static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd, - enum nvme_zone_mgmt_action action) -{ - return BLK_STS_NOTSUPP; -} - -static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) -{ - dev_warn(ns->ctrl->device, - "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); - return -EPROTONOSUPPORT; -} -#endif - -#ifdef CONFIG_NVM -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); -void nvme_nvm_unregister(struct nvme_ns *ns); -extern const 
struct attribute_group nvme_nvm_attr_group; -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); -#else -static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, - int node) -{ - return 0; -} - -static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; -static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, - unsigned long arg) -{ - return -ENOTTY; -} -#endif /* CONFIG_NVM */ - -static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) -{ - return dev_to_disk(dev)->private_data; -} - -#ifdef CONFIG_NVME_HWMON -int nvme_hwmon_init(struct nvme_ctrl *ctrl); -#else -static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl) -{ - return 0; -} -#endif - -u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 opcode); -void nvme_execute_passthru_rq(struct request *rq); -struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); -struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); -void nvme_put_ns(struct nvme_ns *ns); - -#endif /* _NVME_H */ diff --git a/feed/kmod-nvme/src/pci.c b/feed/kmod-nvme/src/pci.c deleted file mode 100644 index ce12965..0000000 --- a/feed/kmod-nvme/src/pci.c +++ /dev/null @@ -1,3310 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * NVM Express device driver - * Copyright (c) 2011-2014, Intel Corporation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" -#include "nvme.h" - -#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) -#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) - -#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) - -/* - * These can be higher, but we need to ensure that any command doesn't - * require an sg allocation that needs more than a page of data. - */ -#define NVME_MAX_KB_SZ 4096 -#define NVME_MAX_SEGS 127 - -static int use_threaded_interrupts; -module_param(use_threaded_interrupts, int, 0); - -static bool use_cmb_sqes = true; -module_param(use_cmb_sqes, bool, 0444); -MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); - -static unsigned int max_host_mem_size_mb = 128; -module_param(max_host_mem_size_mb, uint, 0444); -MODULE_PARM_DESC(max_host_mem_size_mb, - "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); - -static unsigned int sgl_threshold = SZ_32K; -module_param(sgl_threshold, uint, 0644); -MODULE_PARM_DESC(sgl_threshold, - "Use SGLs when average request segment size is larger or equal to " - "this size. 
Use 0 to disable SGLs."); - -static int io_queue_depth_set(const char *val, const struct kernel_param *kp); -static const struct kernel_param_ops io_queue_depth_ops = { - .set = io_queue_depth_set, - .get = param_get_uint, -}; - -static unsigned int io_queue_depth = 1024; -module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); -MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); - -static int io_queue_count_set(const char *val, const struct kernel_param *kp) -{ - unsigned int n; - int ret; - - ret = kstrtouint(val, 10, &n); - if (ret != 0 || n > num_possible_cpus()) - return -EINVAL; - return param_set_uint(val, kp); -} - -static const struct kernel_param_ops io_queue_count_ops = { - .set = io_queue_count_set, - .get = param_get_uint, -}; - -static unsigned int write_queues; -module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644); -MODULE_PARM_DESC(write_queues, - "Number of queues to use for writes. If not set, reads and writes " - "will share a queue set."); - -static unsigned int poll_queues; -module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644); -MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); - -static bool noacpi; -module_param(noacpi, bool, 0444); -MODULE_PARM_DESC(noacpi, "disable acpi bios quirks"); - -struct nvme_dev; -struct nvme_queue; - -static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); -static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); - -/* - * Represents an NVM Express device. Each nvme_dev is a PCI function. - */ -struct nvme_dev { - struct nvme_queue *queues; - struct blk_mq_tag_set tagset; - struct blk_mq_tag_set admin_tagset; - u32 __iomem *dbs; - struct device *dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; - unsigned online_queues; - unsigned max_qid; - unsigned io_queues[HCTX_MAX_TYPES]; - unsigned int num_vecs; - u32 q_depth; - int io_sqes; - u32 db_stride; - void __iomem *bar; - unsigned long bar_mapped_size; - struct work_struct remove_work; - struct mutex shutdown_lock; - bool subsystem; - u64 cmb_size; - bool cmb_use_sqes; - u32 cmbsz; - u32 cmbloc; - struct nvme_ctrl ctrl; - u32 last_ps; - - mempool_t *iod_mempool; - - /* shadow doorbell buffer support: */ - u32 *dbbuf_dbs; - dma_addr_t dbbuf_dbs_dma_addr; - u32 *dbbuf_eis; - dma_addr_t dbbuf_eis_dma_addr; - - /* host memory buffer support: */ - u64 host_mem_size; - u32 nr_host_mem_descs; - dma_addr_t host_mem_descs_dma; - struct nvme_host_mem_buf_desc *host_mem_descs; - void **host_mem_desc_bufs; - unsigned int nr_allocated_queues; - unsigned int nr_write_queues; - unsigned int nr_poll_queues; -}; - -static int io_queue_depth_set(const char *val, const struct kernel_param *kp) -{ - int ret; - u32 n; - - ret = kstrtou32(val, 10, &n); - if (ret != 0 || n < 2) - return -EINVAL; - - return param_set_uint(val, kp); -} - -static inline unsigned int sq_idx(unsigned int qid, u32 stride) -{ - return qid * 2 * stride; -} - -static inline unsigned int cq_idx(unsigned int qid, u32 stride) -{ - return (qid * 2 + 1) * stride; -} - -static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) -{ - return container_of(ctrl, struct nvme_dev, ctrl); -} - -/* - * An NVM Express queue. Each device has at least two (one for admin - * commands and one for I/O commands). 
- */ -struct nvme_queue { - struct nvme_dev *dev; - spinlock_t sq_lock; - void *sq_cmds; - /* only used for poll queues: */ - spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; - struct nvme_completion *cqes; - dma_addr_t sq_dma_addr; - dma_addr_t cq_dma_addr; - u32 __iomem *q_db; - u32 q_depth; - u16 cq_vector; - u16 sq_tail; - u16 last_sq_tail; - u16 cq_head; - u16 qid; - u8 cq_phase; - u8 sqes; - unsigned long flags; -#define NVMEQ_ENABLED 0 -#define NVMEQ_SQ_CMB 1 -#define NVMEQ_DELETE_ERROR 2 -#define NVMEQ_POLLED 3 - u32 *dbbuf_sq_db; - u32 *dbbuf_cq_db; - u32 *dbbuf_sq_ei; - u32 *dbbuf_cq_ei; - struct completion delete_done; -}; - -/* - * The nvme_iod describes the data in an I/O. - * - * The sg pointer contains the list of PRP/SGL chunk allocations in addition - * to the actual struct scatterlist. - */ -struct nvme_iod { - struct nvme_request req; - struct nvme_command cmd; - struct nvme_queue *nvmeq; - bool use_sgl; - int aborted; - int npages; /* In the PRP list. 0 means small pool in use */ - int nents; /* Used in scatterlist */ - dma_addr_t first_dma; - unsigned int dma_len; /* length of single DMA segment mapping */ - dma_addr_t meta_dma; - struct scatterlist *sg; -}; - -static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) -{ - return dev->nr_allocated_queues * 8 * dev->db_stride; -} - -static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) -{ - unsigned int mem_size = nvme_dbbuf_size(dev); - - if (dev->dbbuf_dbs) - return 0; - - dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, - &dev->dbbuf_dbs_dma_addr, - GFP_KERNEL); - if (!dev->dbbuf_dbs) - return -ENOMEM; - dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size, - &dev->dbbuf_eis_dma_addr, - GFP_KERNEL); - if (!dev->dbbuf_eis) { - dma_free_coherent(dev->dev, mem_size, - dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); - dev->dbbuf_dbs = NULL; - return -ENOMEM; - } - - return 0; -} - -static void nvme_dbbuf_dma_free(struct nvme_dev *dev) -{ - unsigned int mem_size = nvme_dbbuf_size(dev); - - if (dev->dbbuf_dbs) { - dma_free_coherent(dev->dev, mem_size, - dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); - dev->dbbuf_dbs = NULL; - } - if (dev->dbbuf_eis) { - dma_free_coherent(dev->dev, mem_size, - dev->dbbuf_eis, dev->dbbuf_eis_dma_addr); - dev->dbbuf_eis = NULL; - } -} - -static void nvme_dbbuf_init(struct nvme_dev *dev, - struct nvme_queue *nvmeq, int qid) -{ - if (!dev->dbbuf_dbs || !qid) - return; - - nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)]; - nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)]; - nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)]; - nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)]; -} - -static void nvme_dbbuf_free(struct nvme_queue *nvmeq) -{ - if (!nvmeq->qid) - return; - - nvmeq->dbbuf_sq_db = NULL; - nvmeq->dbbuf_cq_db = NULL; - nvmeq->dbbuf_sq_ei = NULL; - nvmeq->dbbuf_cq_ei = NULL; -} - -static void nvme_dbbuf_set(struct nvme_dev *dev) -{ - struct nvme_command c; - unsigned int i; - - if (!dev->dbbuf_dbs) - return; - - memset(&c, 0, sizeof(c)); - c.dbbuf.opcode = nvme_admin_dbbuf; - c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr); - c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr); - - if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) { - dev_warn(dev->ctrl.device, "unable to set dbbuf\n"); - /* Free memory and continue on */ - nvme_dbbuf_dma_free(dev); - - for (i = 1; i <= dev->online_queues; i++) - nvme_dbbuf_free(&dev->queues[i]); - } -} - -static inline int nvme_dbbuf_need_event(u16 event_idx, u16 
new_idx, u16 old) -{ - return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old); -} - -/* Update dbbuf and return true if an MMIO is required */ -static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, - volatile u32 *dbbuf_ei) -{ - if (dbbuf_db) { - u16 old_value; - - /* - * Ensure that the queue is written before updating - * the doorbell in memory - */ - wmb(); - - old_value = *dbbuf_db; - *dbbuf_db = value; - - /* - * Ensure that the doorbell is updated before reading the event - * index from memory. The controller needs to provide similar - * ordering to ensure the envent index is updated before reading - * the doorbell. - */ - mb(); - - if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value)) - return false; - } - - return true; -} - -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. - */ -static int nvme_pci_npages_prp(void) -{ - unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); -} - -/* - * Calculates the number of pages needed for the SGL segments. For example a 4k - * page can accommodate 256 SGL descriptors. - */ -static int nvme_pci_npages_sgl(void) -{ - return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), - PAGE_SIZE); -} - -static size_t nvme_pci_iod_alloc_size(void) -{ - size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - - return sizeof(__le64 *) * npages + - sizeof(struct scatterlist) * NVME_MAX_SEGS; -} - -static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) -{ - struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = &dev->queues[0]; - - WARN_ON(hctx_idx != 0); - WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); - - hctx->driver_data = nvmeq; - return 0; -} - -static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) -{ - struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; - - WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); - hctx->driver_data = nvmeq; - return 0; -} - -static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct nvme_dev *dev = set->driver_data; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - int queue_idx = (set == &dev->tagset) ? 
hctx_idx + 1 : 0; - struct nvme_queue *nvmeq = &dev->queues[queue_idx]; - - BUG_ON(!nvmeq); - iod->nvmeq = nvmeq; - - nvme_req(req)->ctrl = &dev->ctrl; - return 0; -} - -static int queue_irq_offset(struct nvme_dev *dev) -{ - /* if we have more than 1 vec, admin queue offsets us by 1 */ - if (dev->num_vecs > 1) - return 1; - - return 0; -} - -static int nvme_pci_map_queues(struct blk_mq_tag_set *set) -{ - struct nvme_dev *dev = set->driver_data; - int i, qoff, offset; - - offset = queue_irq_offset(dev); - for (i = 0, qoff = 0; i < set->nr_maps; i++) { - struct blk_mq_queue_map *map = &set->map[i]; - - map->nr_queues = dev->io_queues[i]; - if (!map->nr_queues) { - BUG_ON(i == HCTX_TYPE_DEFAULT); - continue; - } - - /* - * The poll queue(s) doesn't have an IRQ (and hence IRQ - * affinity), so use the regular blk-mq cpu mapping - */ - map->queue_offset = qoff; - if (i != HCTX_TYPE_POLL && offset) - blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); - else - blk_mq_map_queues(map); - qoff += map->nr_queues; - offset += map->nr_queues; - } - - return 0; -} - -/* - * Write sq tail if we are asked to, or if the next command would wrap. - */ -static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) -{ - if (!write_sq) { - u16 next_tail = nvmeq->sq_tail + 1; - - if (next_tail == nvmeq->q_depth) - next_tail = 0; - if (next_tail != nvmeq->last_sq_tail) - return; - } - - if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, - nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) - writel(nvmeq->sq_tail, nvmeq->q_db); - nvmeq->last_sq_tail = nvmeq->sq_tail; -} - -/** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell - * @nvmeq: The queue to use - * @cmd: The command to send - * @write_sq: whether to write to the SQ doorbell - */ -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, - bool write_sq) -{ - spin_lock(&nvmeq->sq_lock); - memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), - cmd, sizeof(*cmd)); - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - nvme_write_sq_db(nvmeq, write_sq); - spin_unlock(&nvmeq->sq_lock); -} - -static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) -{ - struct nvme_queue *nvmeq = hctx->driver_data; - - spin_lock(&nvmeq->sq_lock); - if (nvmeq->sq_tail != nvmeq->last_sq_tail) - nvme_write_sq_db(nvmeq, true); - spin_unlock(&nvmeq->sq_lock); -} - -static void **nvme_pci_iod_list(struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); -} - -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - int nseg = blk_rq_nr_phys_segments(req); - unsigned int avg_seg_size; - - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); - - if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) - return false; - if (!iod->nvmeq->qid) - return false; - if (!sgl_threshold || avg_seg_size < sgl_threshold) - return false; - return true; -} - -static void nvme_free_prps(struct nvme_dev *dev, struct request *req) -{ - const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - dma_addr_t dma_addr = iod->first_dma; - int i; - - for (i = 0; i < iod->npages; i++) { - __le64 *prp_list = nvme_pci_iod_list(req)[i]; - dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); - - dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); - dma_addr = next_dma_addr; - } - -} - -static void nvme_free_sgls(struct nvme_dev 
*dev, struct request *req) -{ - const int last_sg = SGES_PER_PAGE - 1; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - dma_addr_t dma_addr = iod->first_dma; - int i; - - for (i = 0; i < iod->npages; i++) { - struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i]; - dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr); - - dma_pool_free(dev->prp_page_pool, sg_list, dma_addr); - dma_addr = next_dma_addr; - } - -} - -static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - - if (is_pci_p2pdma_page(sg_page(iod->sg))) - pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents, - rq_dma_dir(req)); - else - dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req)); -} - -static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - - if (iod->dma_len) { - dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, - rq_dma_dir(req)); - return; - } - - WARN_ON_ONCE(!iod->nents); - - nvme_unmap_sg(dev, req); - if (iod->npages == 0) - dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], - iod->first_dma); - else if (iod->use_sgl) - nvme_free_sgls(dev, req); - else - nvme_free_prps(dev, req); - mempool_free(iod->sg, dev->iod_mempool); -} - -static void nvme_print_sgl(struct scatterlist *sgl, int nents) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - dma_addr_t phys = sg_phys(sg); - pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " - "dma_address:%pad dma_length:%d\n", - i, &phys, sg->offset, sg->length, &sg_dma_address(sg), - sg_dma_len(sg)); - } -} - -static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; - int length = blk_rq_payload_bytes(req); - struct scatterlist *sg = iod->sg; - int dma_len = sg_dma_len(sg); - u64 dma_addr = sg_dma_address(sg); - int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); - __le64 *prp_list; - void **list = nvme_pci_iod_list(req); - dma_addr_t prp_dma; - int nprps, i; - - length -= (NVME_CTRL_PAGE_SIZE - offset); - if (length <= 0) { - iod->first_dma = 0; - goto done; - } - - dma_len -= (NVME_CTRL_PAGE_SIZE - offset); - if (dma_len) { - dma_addr += (NVME_CTRL_PAGE_SIZE - offset); - } else { - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } - - if (length <= NVME_CTRL_PAGE_SIZE) { - iod->first_dma = dma_addr; - goto done; - } - - nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); - if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; - iod->npages = 0; - } else { - pool = dev->prp_page_pool; - iod->npages = 1; - } - - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) { - iod->first_dma = dma_addr; - iod->npages = -1; - return BLK_STS_RESOURCE; - } - list[0] = prp_list; - iod->first_dma = prp_dma; - i = 0; - for (;;) { - if (i == NVME_CTRL_PAGE_SIZE >> 3) { - __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) - goto free_prps; - list[iod->npages++] = prp_list; - prp_list[0] = old_prp_list[i - 1]; - old_prp_list[i - 1] = cpu_to_le64(prp_dma); - i = 1; - } - prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= NVME_CTRL_PAGE_SIZE; - dma_addr += NVME_CTRL_PAGE_SIZE; - length -= NVME_CTRL_PAGE_SIZE; - if (length <= 0) - break; - if (dma_len > 0) - continue; - if (unlikely(dma_len < 0)) - goto bad_sgl; - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - 
dma_len = sg_dma_len(sg); - } -done: - cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); - return BLK_STS_OK; -free_prps: - nvme_free_prps(dev, req); - return BLK_STS_RESOURCE; -bad_sgl: - WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), - "Invalid SGL for payload:%d nents:%d\n", - blk_rq_payload_bytes(req), iod->nents); - return BLK_STS_IOERR; -} - -static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, - struct scatterlist *sg) -{ - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); - sge->type = NVME_SGL_FMT_DATA_DESC << 4; -} - -static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, - dma_addr_t dma_addr, int entries) -{ - sge->addr = cpu_to_le64(dma_addr); - if (entries < SGES_PER_PAGE) { - sge->length = cpu_to_le32(entries * sizeof(*sge)); - sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; - } else { - sge->length = cpu_to_le32(PAGE_SIZE); - sge->type = NVME_SGL_FMT_SEG_DESC << 4; - } -} - -static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmd, int entries) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; - struct nvme_sgl_desc *sg_list; - struct scatterlist *sg = iod->sg; - dma_addr_t sgl_dma; - int i = 0; - - /* setting the transfer type as SGL */ - cmd->flags = NVME_CMD_SGL_METABUF; - - if (entries == 1) { - nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); - return BLK_STS_OK; - } - - if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { - pool = dev->prp_small_pool; - iod->npages = 0; - } else { - pool = dev->prp_page_pool; - iod->npages = 1; - } - - sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); - if (!sg_list) { - iod->npages = -1; - return BLK_STS_RESOURCE; - } - - nvme_pci_iod_list(req)[0] = sg_list; - iod->first_dma = sgl_dma; - - nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); - - do { - if (i == SGES_PER_PAGE) { - struct nvme_sgl_desc *old_sg_desc = sg_list; - struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; - - sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); - if (!sg_list) - goto free_sgls; - - i = 0; - nvme_pci_iod_list(req)[iod->npages++] = sg_list; - sg_list[i++] = *link; - nvme_pci_sgl_set_seg(link, sgl_dma, entries); - } - - nvme_pci_sgl_set_data(&sg_list[i++], sg); - sg = sg_next(sg); - } while (--entries > 0); - - return BLK_STS_OK; -free_sgls: - nvme_free_sgls(dev, req); - return BLK_STS_RESOURCE; -} - -static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd, - struct bio_vec *bv) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); - unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; - - iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->first_dma)) - return BLK_STS_RESOURCE; - iod->dma_len = bv->bv_len; - - cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); - if (bv->bv_len > first_prp_len) - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); - return BLK_STS_OK; -} - -static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd, - struct bio_vec *bv) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - - iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->first_dma)) - return BLK_STS_RESOURCE; - iod->dma_len = bv->bv_len; - - cmnd->flags = NVME_CMD_SGL_METABUF; - 
cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); - cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); - cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; - return BLK_STS_OK; -} - -static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - blk_status_t ret = BLK_STS_RESOURCE; - int nr_mapped; - - if (blk_rq_nr_phys_segments(req) == 1) { - struct bio_vec bv = req_bvec(req); - - if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) - return nvme_setup_prp_simple(dev, req, - &cmnd->rw, &bv); - - if (iod->nvmeq->qid && sgl_threshold && - dev->ctrl.sgls & ((1 << 0) | (1 << 1))) - return nvme_setup_sgl_simple(dev, req, - &cmnd->rw, &bv); - } - } - - iod->dma_len = 0; - iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); - if (!iod->sg) - return BLK_STS_RESOURCE; - sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); - if (!iod->nents) - goto out_free_sg; - - if (is_pci_p2pdma_page(sg_page(iod->sg))) - nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg, - iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); - else - nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, - rq_dma_dir(req), DMA_ATTR_NO_WARN); - if (!nr_mapped) - goto out_free_sg; - - iod->use_sgl = nvme_pci_use_sgls(dev, req); - if (iod->use_sgl) - ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); - else - ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); - if (ret != BLK_STS_OK) - goto out_unmap_sg; - return BLK_STS_OK; - -out_unmap_sg: - nvme_unmap_sg(dev, req); -out_free_sg: - mempool_free(iod->sg, dev->iod_mempool); - return ret; -} - -static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - - iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req), - rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->meta_dma)) - return BLK_STS_IOERR; - cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); - return BLK_STS_OK; -} - -/* - * NOTE: ns is NULL when called on the admin queue. - */ -static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - struct nvme_ns *ns = hctx->queue->queuedata; - struct nvme_queue *nvmeq = hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; - struct request *req = bd->rq; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_command *cmnd = &iod->cmd; - blk_status_t ret; - - iod->aborted = 0; - iod->npages = -1; - iod->nents = 0; - - /* - * We should not need to do this, but we're still using this to - * ensure we can drain requests on a dying queue. 
- */ - if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) - return BLK_STS_IOERR; - - ret = nvme_setup_cmd(ns, req, cmnd); - if (ret) - return ret; - - if (blk_rq_nr_phys_segments(req)) { - ret = nvme_map_data(dev, req, cmnd); - if (ret) - goto out_free_cmd; - } - - if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req, cmnd); - if (ret) - goto out_unmap_data; - } - - blk_mq_start_request(req); - nvme_submit_cmd(nvmeq, cmnd, bd->last); - return BLK_STS_OK; -out_unmap_data: - nvme_unmap_data(dev, req); -out_free_cmd: - nvme_cleanup_cmd(req); - return ret; -} - -static void nvme_pci_complete_rq(struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_dev *dev = iod->nvmeq->dev; - - if (blk_integrity_rq(req)) - dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req)->bv_len, rq_data_dir(req)); - if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req); - nvme_complete_rq(req); -} - -/* We read the CQE phase first to check if the rest of the entry is valid */ -static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) -{ - struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head]; - - return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase; -} - -static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) -{ - u16 head = nvmeq->cq_head; - - if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, - nvmeq->dbbuf_cq_ei)) - writel(head, nvmeq->q_db + nvmeq->dev->db_stride); -} - -static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq) -{ - if (!nvmeq->qid) - return nvmeq->dev->admin_tagset.tags[0]; - return nvmeq->dev->tagset.tags[nvmeq->qid - 1]; -} - -static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) -{ - struct nvme_completion *cqe = &nvmeq->cqes[idx]; - __u16 command_id = READ_ONCE(cqe->command_id); - struct request *req; - - /* - * AEN requests are special as they don't time out and can - * survive any kind of queue freeze and often don't respond to - * aborts. We don't even bother to allocate a struct request - * for them but rather special case them here. - */ - if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) { - nvme_complete_async_event(&nvmeq->dev->ctrl, - cqe->status, &cqe->result); - return; - } - - req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); - if (unlikely(!req)) { - dev_warn(nvmeq->dev->ctrl.device, - "invalid id %d completed on queue %d\n", - command_id, le16_to_cpu(cqe->sq_id)); - return; - } - - trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); - if (!nvme_try_complete_req(req, cqe->status, cqe->result)) - nvme_pci_complete_rq(req); -} - -static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) -{ - u32 tmp = nvmeq->cq_head + 1; - - if (tmp == nvmeq->q_depth) { - nvmeq->cq_head = 0; - nvmeq->cq_phase ^= 1; - } else { - nvmeq->cq_head = tmp; - } -} - -static inline int nvme_process_cq(struct nvme_queue *nvmeq) -{ - int found = 0; - - while (nvme_cqe_pending(nvmeq)) { - found++; - /* - * load-load control dependency between phase and the rest of - * the cqe requires a full read memory barrier - */ - dma_rmb(); - nvme_handle_cqe(nvmeq, nvmeq->cq_head); - nvme_update_cq_head(nvmeq); - } - - if (found) - nvme_ring_cq_doorbell(nvmeq); - return found; -} - -static irqreturn_t nvme_irq(int irq, void *data) -{ - struct nvme_queue *nvmeq = data; - irqreturn_t ret = IRQ_NONE; - - /* - * The rmb/wmb pair ensures we see all updates from a previous run of - * the irq handler, even if that was on another CPU. 
- */ - rmb(); - if (nvme_process_cq(nvmeq)) - ret = IRQ_HANDLED; - wmb(); - - return ret; -} - -static irqreturn_t nvme_irq_check(int irq, void *data) -{ - struct nvme_queue *nvmeq = data; - - if (nvme_cqe_pending(nvmeq)) - return IRQ_WAKE_THREAD; - return IRQ_NONE; -} - -/* - * Poll for completions for any interrupt driven queue - * Can be called from any context. - */ -static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) -{ - struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); - - WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); - - disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - nvme_process_cq(nvmeq); - enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); -} - -static int nvme_poll(struct blk_mq_hw_ctx *hctx) -{ - struct nvme_queue *nvmeq = hctx->driver_data; - bool found; - - if (!nvme_cqe_pending(nvmeq)) - return 0; - - spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq); - spin_unlock(&nvmeq->cq_poll_lock); - - return found; -} - -static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) -{ - struct nvme_dev *dev = to_nvme_dev(ctrl); - struct nvme_queue *nvmeq = &dev->queues[0]; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; - nvme_submit_cmd(nvmeq, &c, true); -} - -static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.delete_queue.opcode = opcode; - c.delete_queue.qid = cpu_to_le16(id); - - return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); -} - -static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq, s16 vector) -{ - struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG; - - if (!test_bit(NVMEQ_POLLED, &nvmeq->flags)) - flags |= NVME_CQ_IRQ_ENABLED; - - /* - * Note: we (ab)use the fact that the prp fields survive if no data - * is attached to the request. - */ - memset(&c, 0, sizeof(c)); - c.create_cq.opcode = nvme_admin_create_cq; - c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); - c.create_cq.cqid = cpu_to_le16(qid); - c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); - c.create_cq.cq_flags = cpu_to_le16(flags); - c.create_cq.irq_vector = cpu_to_le16(vector); - - return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); -} - -static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq) -{ - struct nvme_ctrl *ctrl = &dev->ctrl; - struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG; - - /* - * Some drives have a bug that auto-enables WRRU if MEDIUM isn't - * set. Since URGENT priority is zeroes, it makes all queues - * URGENT. - */ - if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ) - flags |= NVME_SQ_PRIO_MEDIUM; - - /* - * Note: we (ab)use the fact that the prp fields survive if no data - * is attached to the request. 
- */ - memset(&c, 0, sizeof(c)); - c.create_sq.opcode = nvme_admin_create_sq; - c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); - c.create_sq.sqid = cpu_to_le16(qid); - c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); - c.create_sq.sq_flags = cpu_to_le16(flags); - c.create_sq.cqid = cpu_to_le16(qid); - - return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); -} - -static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) -{ - return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); -} - -static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) -{ - return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); -} - -static void abort_endio(struct request *req, blk_status_t error) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = iod->nvmeq; - - dev_warn(nvmeq->dev->ctrl.device, - "Abort status: 0x%x", nvme_req(req)->status); - atomic_inc(&nvmeq->dev->ctrl.abort_limit); - blk_mq_free_request(req); -} - -static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) -{ - /* If true, indicates loss of adapter communication, possibly by a - * NVMe Subsystem reset. - */ - bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); - - /* If there is a reset/reinit ongoing, we shouldn't reset again. */ - switch (dev->ctrl.state) { - case NVME_CTRL_RESETTING: - case NVME_CTRL_CONNECTING: - return false; - default: - break; - } - - /* We shouldn't reset unless the controller is on fatal error state - * _or_ if we lost the communication with it. - */ - if (!(csts & NVME_CSTS_CFS) && !nssro) - return false; - - return true; -} - -static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) -{ - /* Read a config register to help see what died. */ - u16 pci_status; - int result; - - result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, - &pci_status); - if (result == PCIBIOS_SUCCESSFUL) - dev_warn(dev->ctrl.device, - "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", - csts, pci_status); - else - dev_warn(dev->ctrl.device, - "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", - csts, result); -} - -static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = iod->nvmeq; - struct nvme_dev *dev = nvmeq->dev; - struct request *abort_req; - struct nvme_command cmd; - u32 csts = readl(dev->bar + NVME_REG_CSTS); - - /* If PCI error recovery process is happening, we cannot reset or - * the recovery mechanism will surely fail. - */ - mb(); - if (pci_channel_offline(to_pci_dev(dev->dev))) - return BLK_EH_RESET_TIMER; - - /* - * Reset immediately if the controller is failed - */ - if (nvme_should_reset(dev, csts)) { - nvme_warn_reset(dev, csts); - nvme_dev_disable(dev, false); - nvme_reset_ctrl(&dev->ctrl); - return BLK_EH_DONE; - } - - /* - * Did we miss an interrupt? - */ - if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) - nvme_poll(req->mq_hctx); - else - nvme_poll_irqdisable(nvmeq); - - if (blk_mq_request_completed(req)) { - dev_warn(dev->ctrl.device, - "I/O %d QID %d timeout, completion polled\n", - req->tag, nvmeq->qid); - return BLK_EH_DONE; - } - - /* - * Shutdown immediately if controller times out while starting. The - * reset work will see the pci device disabled when it gets the forced - * cancellation error. All outstanding requests are completed on - * shutdown, so we return BLK_EH_DONE. 
- */ - switch (dev->ctrl.state) { - case NVME_CTRL_CONNECTING: - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - fallthrough; - case NVME_CTRL_DELETING: - dev_warn_ratelimited(dev->ctrl.device, - "I/O %d QID %d timeout, disable controller\n", - req->tag, nvmeq->qid); - nvme_req(req)->flags |= NVME_REQ_CANCELLED; - nvme_dev_disable(dev, true); - return BLK_EH_DONE; - case NVME_CTRL_RESETTING: - return BLK_EH_RESET_TIMER; - default: - break; - } - - /* - * Shutdown the controller immediately and schedule a reset if the - * command was already aborted once before and still hasn't been - * returned to the driver, or if this is the admin queue. - */ - if (!nvmeq->qid || iod->aborted) { - dev_warn(dev->ctrl.device, - "I/O %d QID %d timeout, reset controller\n", - req->tag, nvmeq->qid); - nvme_req(req)->flags |= NVME_REQ_CANCELLED; - nvme_dev_disable(dev, false); - nvme_reset_ctrl(&dev->ctrl); - - return BLK_EH_DONE; - } - - if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { - atomic_inc(&dev->ctrl.abort_limit); - return BLK_EH_RESET_TIMER; - } - iod->aborted = 1; - - memset(&cmd, 0, sizeof(cmd)); - cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = nvme_cid(req); - cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - - dev_warn(nvmeq->dev->ctrl.device, - "I/O %d QID %d timeout, aborting\n", - req->tag, nvmeq->qid); - - abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, - BLK_MQ_REQ_NOWAIT); - if (IS_ERR(abort_req)) { - atomic_inc(&dev->ctrl.abort_limit); - return BLK_EH_RESET_TIMER; - } - - abort_req->end_io_data = NULL; - blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); - - /* - * The aborted req will be completed on receiving the abort req. - * We enable the timer again. If hit twice, it'll cause a device reset, - * as the device then is in a faulty state. 
- */ - return BLK_EH_RESET_TIMER; -} - -static void nvme_free_queue(struct nvme_queue *nvmeq) -{ - dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq), - (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - if (!nvmeq->sq_cmds) - return; - - if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { - pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev), - nvmeq->sq_cmds, SQ_SIZE(nvmeq)); - } else { - dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - } -} - -static void nvme_free_queues(struct nvme_dev *dev, int lowest) -{ - int i; - - for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { - dev->ctrl.queue_count--; - nvme_free_queue(&dev->queues[i]); - } -} - -/** - * nvme_suspend_queue - put queue into suspended state - * @nvmeq: queue to suspend - */ -static int nvme_suspend_queue(struct nvme_queue *nvmeq) -{ - if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) - return 1; - - /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ - mb(); - - nvmeq->dev->online_queues--; - if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); - if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) - pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); - return 0; -} - -static void nvme_suspend_io_queues(struct nvme_dev *dev) -{ - int i; - - for (i = dev->ctrl.queue_count - 1; i > 0; i--) - nvme_suspend_queue(&dev->queues[i]); -} - -static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) -{ - struct nvme_queue *nvmeq = &dev->queues[0]; - - if (shutdown) - nvme_shutdown_ctrl(&dev->ctrl); - else - nvme_disable_ctrl(&dev->ctrl); - - nvme_poll_irqdisable(nvmeq); -} - -/* - * Called only on a device that has been disabled and after all other threads - * that can check this device's completion queues have synced, except - * nvme_poll(). This is the last chance for the driver to see a natural - * completion before nvme_cancel_request() terminates all incomplete requests. 
- */ -static void nvme_reap_pending_cqes(struct nvme_dev *dev) -{ - int i; - - for (i = dev->ctrl.queue_count - 1; i > 0; i--) { - spin_lock(&dev->queues[i].cq_poll_lock); - nvme_process_cq(&dev->queues[i]); - spin_unlock(&dev->queues[i].cq_poll_lock); - } -} - -static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, - int entry_size) -{ - int q_depth = dev->q_depth; - unsigned q_size_aligned = roundup(q_depth * entry_size, - NVME_CTRL_PAGE_SIZE); - - if (q_size_aligned * nr_io_queues > dev->cmb_size) { - u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); - - mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE); - q_depth = div_u64(mem_per_q, entry_size); - - /* - * Ensure the reduced q_depth is above some threshold where it - * would be better to map queues in system memory with the - * original depth - */ - if (q_depth < 64) - return -ENOMEM; - } - - return q_depth; -} - -static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, - int qid) -{ - struct pci_dev *pdev = to_pci_dev(dev->dev); - - if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { - nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq)); - if (nvmeq->sq_cmds) { - nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, - nvmeq->sq_cmds); - if (nvmeq->sq_dma_addr) { - set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); - return 0; - } - - pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq)); - } - } - - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq), - &nvmeq->sq_dma_addr, GFP_KERNEL); - if (!nvmeq->sq_cmds) - return -ENOMEM; - return 0; -} - -static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) -{ - struct nvme_queue *nvmeq = &dev->queues[qid]; - - if (dev->ctrl.queue_count > qid) - return 0; - - nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES; - nvmeq->q_depth = depth; - nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq), - &nvmeq->cq_dma_addr, GFP_KERNEL); - if (!nvmeq->cqes) - goto free_nvmeq; - - if (nvme_alloc_sq_cmds(dev, nvmeq, qid)) - goto free_cqdma; - - nvmeq->dev = dev; - spin_lock_init(&nvmeq->sq_lock); - spin_lock_init(&nvmeq->cq_poll_lock); - nvmeq->cq_head = 0; - nvmeq->cq_phase = 1; - nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - nvmeq->qid = qid; - dev->ctrl.queue_count++; - - return 0; - - free_cqdma: - dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, - nvmeq->cq_dma_addr); - free_nvmeq: - return -ENOMEM; -} - -static int queue_request_irq(struct nvme_queue *nvmeq) -{ - struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); - int nr = nvmeq->dev->ctrl.instance; - - if (use_threaded_interrupts) { - return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check, - nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid); - } else { - return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq, - NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid); - } -} - -static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) -{ - struct nvme_dev *dev = nvmeq->dev; - - nvmeq->sq_tail = 0; - nvmeq->last_sq_tail = 0; - nvmeq->cq_head = 0; - nvmeq->cq_phase = 1; - nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq)); - nvme_dbbuf_init(dev, nvmeq, qid); - dev->online_queues++; - wmb(); /* ensure the first interrupt sees the initialization */ -} - -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) -{ - struct nvme_dev *dev = nvmeq->dev; - int result; - u16 vector = 0; - - clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); - - /* - * A queue's vector matches the queue identifier unless the 
controller - * has only one vector available. - */ - if (!polled) - vector = dev->num_vecs == 1 ? 0 : qid; - else - set_bit(NVMEQ_POLLED, &nvmeq->flags); - - result = adapter_alloc_cq(dev, qid, nvmeq, vector); - if (result) - return result; - - result = adapter_alloc_sq(dev, qid, nvmeq); - if (result < 0) - return result; - if (result) - goto release_cq; - - nvmeq->cq_vector = vector; - nvme_init_queue(nvmeq, qid); - - if (!polled) { - result = queue_request_irq(nvmeq); - if (result < 0) - goto release_sq; - } - - set_bit(NVMEQ_ENABLED, &nvmeq->flags); - return result; - -release_sq: - dev->online_queues--; - adapter_delete_sq(dev, qid); -release_cq: - adapter_delete_cq(dev, qid); - return result; -} - -static const struct blk_mq_ops nvme_mq_admin_ops = { - .queue_rq = nvme_queue_rq, - .complete = nvme_pci_complete_rq, - .init_hctx = nvme_admin_init_hctx, - .init_request = nvme_init_request, - .timeout = nvme_timeout, -}; - -static const struct blk_mq_ops nvme_mq_ops = { - .queue_rq = nvme_queue_rq, - .complete = nvme_pci_complete_rq, - .commit_rqs = nvme_commit_rqs, - .init_hctx = nvme_init_hctx, - .init_request = nvme_init_request, - .map_queues = nvme_pci_map_queues, - .timeout = nvme_timeout, - .poll = nvme_poll, -}; - -static void nvme_dev_remove_admin(struct nvme_dev *dev) -{ - if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { - /* - * If the controller was reset during removal, it's possible - * user requests may be waiting on a stopped queue. Start the - * queue to flush these to completion. - */ - blk_mq_unquiesce_queue(dev->ctrl.admin_q); - blk_cleanup_queue(dev->ctrl.admin_q); - blk_mq_free_tag_set(&dev->admin_tagset); - } -} - -static int nvme_alloc_admin_tags(struct nvme_dev *dev) -{ - if (!dev->ctrl.admin_q) { - dev->admin_tagset.ops = &nvme_mq_admin_ops; - dev->admin_tagset.nr_hw_queues = 1; - - dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; - dev->admin_tagset.timeout = ADMIN_TIMEOUT; - dev->admin_tagset.numa_node = dev->ctrl.numa_node; - dev->admin_tagset.cmd_size = sizeof(struct nvme_iod); - dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; - dev->admin_tagset.driver_data = dev; - - if (blk_mq_alloc_tag_set(&dev->admin_tagset)) - return -ENOMEM; - dev->ctrl.admin_tagset = &dev->admin_tagset; - - dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); - if (IS_ERR(dev->ctrl.admin_q)) { - blk_mq_free_tag_set(&dev->admin_tagset); - dev->ctrl.admin_q = NULL; - return -ENOMEM; - } - if (!blk_get_queue(dev->ctrl.admin_q)) { - nvme_dev_remove_admin(dev); - dev->ctrl.admin_q = NULL; - return -ENODEV; - } - } else - blk_mq_unquiesce_queue(dev->ctrl.admin_q); - - return 0; -} - -static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) -{ - return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); -} - -static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) -{ - struct pci_dev *pdev = to_pci_dev(dev->dev); - - if (size <= dev->bar_mapped_size) - return 0; - if (size > pci_resource_len(pdev, 0)) - return -ENOMEM; - if (dev->bar) - iounmap(dev->bar); - dev->bar = ioremap(pci_resource_start(pdev, 0), size); - if (!dev->bar) { - dev->bar_mapped_size = 0; - return -ENOMEM; - } - dev->bar_mapped_size = size; - dev->dbs = dev->bar + NVME_REG_DBS; - - return 0; -} - -static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) -{ - int result; - u32 aqa; - struct nvme_queue *nvmeq; - - result = nvme_remap_bar(dev, db_bar_size(dev, 0)); - if (result < 0) - return result; - - dev->subsystem = readl(dev->bar + NVME_REG_VS) >= 
NVME_VS(1, 1, 0) ? - NVME_CAP_NSSRC(dev->ctrl.cap) : 0; - - if (dev->subsystem && - (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) - writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - - result = nvme_disable_ctrl(&dev->ctrl); - if (result < 0) - return result; - - result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); - if (result) - return result; - - dev->ctrl.numa_node = dev_to_node(dev->dev); - - nvmeq = &dev->queues[0]; - aqa = nvmeq->q_depth - 1; - aqa |= aqa << 16; - - writel(aqa, dev->bar + NVME_REG_AQA); - lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); - lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - - result = nvme_enable_ctrl(&dev->ctrl); - if (result) - return result; - - nvmeq->cq_vector = 0; - nvme_init_queue(nvmeq, 0); - result = queue_request_irq(nvmeq); - if (result) { - dev->online_queues--; - return result; - } - - set_bit(NVMEQ_ENABLED, &nvmeq->flags); - return result; -} - -static int nvme_create_io_queues(struct nvme_dev *dev) -{ - unsigned i, max, rw_queues; - int ret = 0; - - for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { - if (nvme_alloc_queue(dev, i, dev->q_depth)) { - ret = -ENOMEM; - break; - } - } - - max = min(dev->max_qid, dev->ctrl.queue_count - 1); - if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { - rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + - dev->io_queues[HCTX_TYPE_READ]; - } else { - rw_queues = max; - } - - for (i = dev->online_queues; i <= max; i++) { - bool polled = i > rw_queues; - - ret = nvme_create_queue(&dev->queues[i], i, polled); - if (ret) - break; - } - - /* - * Ignore failing Create SQ/CQ commands, we can continue with less - * than the desired amount of queues, and even a controller without - * I/O queues can still be used to issue admin commands. This might - * be useful to upgrade a buggy firmware for example. - */ - return ret >= 0 ? 
0 : ret; -} - -static ssize_t nvme_cmb_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", - ndev->cmbloc, ndev->cmbsz); -} -static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); - -static u64 nvme_cmb_size_unit(struct nvme_dev *dev) -{ - u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; - - return 1ULL << (12 + 4 * szu); -} - -static u32 nvme_cmb_size(struct nvme_dev *dev) -{ - return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; -} - -static void nvme_map_cmb(struct nvme_dev *dev) -{ - u64 size, offset; - resource_size_t bar_size; - struct pci_dev *pdev = to_pci_dev(dev->dev); - int bar; - - if (dev->cmb_size) - return; - - if (NVME_CAP_CMBS(dev->ctrl.cap)) - writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC); - - dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); - if (!dev->cmbsz) - return; - dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); - - size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); - offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); - bar = NVME_CMB_BIR(dev->cmbloc); - bar_size = pci_resource_len(pdev, bar); - - if (offset > bar_size) - return; - - /* - * Tell the controller about the host side address mapping the CMB, - * and enable CMB decoding for the NVMe 1.4+ scheme: - */ - if (NVME_CAP_CMBS(dev->ctrl.cap)) { - hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE | - (pci_bus_address(pdev, bar) + offset), - dev->bar + NVME_REG_CMBMSC); - } - - /* - * Controllers may support a CMB size larger than their BAR, - * for example, due to being behind a bridge. Reduce the CMB to - * the reported size of the BAR - */ - if (size > bar_size - offset) - size = bar_size - offset; - - if (pci_p2pdma_add_resource(pdev, bar, size, offset)) { - dev_warn(dev->ctrl.device, - "failed to register the CMB\n"); - return; - } - - dev->cmb_size = size; - dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS); - - if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == - (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) - pci_p2pmem_publish(pdev, true); - - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); -} - -static inline void nvme_release_cmb(struct nvme_dev *dev) -{ - if (dev->cmb_size) { - sysfs_remove_file_from_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL); - dev->cmb_size = 0; - } -} - -static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) -{ - u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT; - u64 dma_addr = dev->host_mem_descs_dma; - struct nvme_command c; - int ret; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_set_features; - c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); - c.features.dword11 = cpu_to_le32(bits); - c.features.dword12 = cpu_to_le32(host_mem_size); - c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); - c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); - c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); - - ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); - if (ret) { - dev_warn(dev->ctrl.device, - "failed to set host mem (err %d, flags %#x).\n", - ret, bits); - } - return ret; -} - -static void nvme_free_host_mem(struct nvme_dev *dev) -{ - int i; - - for (i = 0; i < dev->nr_host_mem_descs; i++) { - struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; - size_t size = 
le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE; - - dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], - le64_to_cpu(desc->addr), - DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); - } - - kfree(dev->host_mem_desc_bufs); - dev->host_mem_desc_bufs = NULL; - dma_free_coherent(dev->dev, - dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs), - dev->host_mem_descs, dev->host_mem_descs_dma); - dev->host_mem_descs = NULL; - dev->nr_host_mem_descs = 0; -} - -static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, - u32 chunk_size) -{ - struct nvme_host_mem_buf_desc *descs; - u32 max_entries, len; - dma_addr_t descs_dma; - int i = 0; - void **bufs; - u64 size, tmp; - - tmp = (preferred + chunk_size - 1); - do_div(tmp, chunk_size); - max_entries = tmp; - - if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) - max_entries = dev->ctrl.hmmaxd; - - descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs), - &descs_dma, GFP_KERNEL); - if (!descs) - goto out; - - bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL); - if (!bufs) - goto out_free_descs; - - for (size = 0; size < preferred && i < max_entries; size += len) { - dma_addr_t dma_addr; - - len = min_t(u64, chunk_size, preferred - size); - bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, - DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); - if (!bufs[i]) - break; - - descs[i].addr = cpu_to_le64(dma_addr); - descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE); - i++; - } - - if (!size) - goto out_free_bufs; - - dev->nr_host_mem_descs = i; - dev->host_mem_size = size; - dev->host_mem_descs = descs; - dev->host_mem_descs_dma = descs_dma; - dev->host_mem_desc_bufs = bufs; - return 0; - -out_free_bufs: - while (--i >= 0) { - size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; - - dma_free_attrs(dev->dev, size, bufs[i], - le64_to_cpu(descs[i].addr), - DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); - } - - kfree(bufs); -out_free_descs: - dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs, - descs_dma); -out: - dev->host_mem_descs = NULL; - return -ENOMEM; -} - -static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) -{ - u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); - u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); - u64 chunk_size; - - /* start big and work our way down */ - for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { - if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { - if (!min || dev->host_mem_size >= min) - return 0; - nvme_free_host_mem(dev); - } - } - - return -ENOMEM; -} - -static int nvme_setup_host_mem(struct nvme_dev *dev) -{ - u64 max = (u64)max_host_mem_size_mb * SZ_1M; - u64 preferred = (u64)dev->ctrl.hmpre * 4096; - u64 min = (u64)dev->ctrl.hmmin * 4096; - u32 enable_bits = NVME_HOST_MEM_ENABLE; - int ret; - - preferred = min(preferred, max); - if (min > max) { - dev_warn(dev->ctrl.device, - "min host memory (%lld MiB) above limit (%d MiB).\n", - min >> ilog2(SZ_1M), max_host_mem_size_mb); - nvme_free_host_mem(dev); - return 0; - } - - /* - * If we already have a buffer allocated check if we can reuse it. 
- */ - if (dev->host_mem_descs) { - if (dev->host_mem_size >= min) - enable_bits |= NVME_HOST_MEM_RETURN; - else - nvme_free_host_mem(dev); - } - - if (!dev->host_mem_descs) { - if (nvme_alloc_host_mem(dev, min, preferred)) { - dev_warn(dev->ctrl.device, - "failed to allocate host memory buffer.\n"); - return 0; /* controller must work without HMB */ - } - - dev_info(dev->ctrl.device, - "allocated %lld MiB host memory buffer.\n", - dev->host_mem_size >> ilog2(SZ_1M)); - } - - ret = nvme_set_host_mem(dev, enable_bits); - if (ret) - nvme_free_host_mem(dev); - return ret; -} - -/* - * nirqs is the number of interrupts available for write and read - * queues. The core already reserved an interrupt for the admin queue. - */ -static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) -{ - struct nvme_dev *dev = affd->priv; - unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues; - - /* - * If there is no interrupt available for queues, ensure that - * the default queue is set to 1. The affinity set size is - * also set to one, but the irq core ignores it for this case. - * - * If only one interrupt is available or 'write_queue' == 0, combine - * write and read queues. - * - * If 'write_queues' > 0, ensure it leaves room for at least one read - * queue. - */ - if (!nrirqs) { - nrirqs = 1; - nr_read_queues = 0; - } else if (nrirqs == 1 || !nr_write_queues) { - nr_read_queues = 0; - } else if (nr_write_queues >= nrirqs) { - nr_read_queues = 1; - } else { - nr_read_queues = nrirqs - nr_write_queues; - } - - dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; - affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; - dev->io_queues[HCTX_TYPE_READ] = nr_read_queues; - affd->set_size[HCTX_TYPE_READ] = nr_read_queues; - affd->nr_sets = nr_read_queues ? 2 : 1; -} - -static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) -{ - struct pci_dev *pdev = to_pci_dev(dev->dev); - struct irq_affinity affd = { - .pre_vectors = 1, - .calc_sets = nvme_calc_irq_sets, - .priv = dev, - }; - unsigned int irq_queues, poll_queues; - - /* - * Poll queues don't need interrupts, but we need at least one I/O queue - * left over for non-polled I/O. - */ - poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1); - dev->io_queues[HCTX_TYPE_POLL] = poll_queues; - - /* - * Initialize for the single interrupt case, will be updated in - * nvme_calc_irq_sets(). - */ - dev->io_queues[HCTX_TYPE_DEFAULT] = 1; - dev->io_queues[HCTX_TYPE_READ] = 0; - - /* - * We need interrupts for the admin queue and each non-polled I/O queue, - * but some Apple controllers require all queues to use the first - * vector. 
- */ - irq_queues = 1; - if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) - irq_queues += (nr_io_queues - poll_queues); - return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); -} - -static void nvme_disable_io_queues(struct nvme_dev *dev) -{ - if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq)) - __nvme_disable_io_queues(dev, nvme_admin_delete_cq); -} - -static unsigned int nvme_max_io_queues(struct nvme_dev *dev) -{ - return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; -} - -static int nvme_setup_io_queues(struct nvme_dev *dev) -{ - struct nvme_queue *adminq = &dev->queues[0]; - struct pci_dev *pdev = to_pci_dev(dev->dev); - unsigned int nr_io_queues; - unsigned long size; - int result; - - /* - * Sample the module parameters once at reset time so that we have - * stable values to work with. - */ - dev->nr_write_queues = write_queues; - dev->nr_poll_queues = poll_queues; - - /* - * If tags are shared with admin queue (Apple bug), then - * make sure we only use one IO queue. - */ - if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) - nr_io_queues = 1; - else - nr_io_queues = min(nvme_max_io_queues(dev), - dev->nr_allocated_queues - 1); - - result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); - if (result < 0) - return result; - - if (nr_io_queues == 0) - return 0; - - clear_bit(NVMEQ_ENABLED, &adminq->flags); - - if (dev->cmb_use_sqes) { - result = nvme_cmb_qdepth(dev, nr_io_queues, - sizeof(struct nvme_command)); - if (result > 0) - dev->q_depth = result; - else - dev->cmb_use_sqes = false; - } - - do { - size = db_bar_size(dev, nr_io_queues); - result = nvme_remap_bar(dev, size); - if (!result) - break; - if (!--nr_io_queues) - return -ENOMEM; - } while (1); - adminq->q_db = dev->dbs; - - retry: - /* Deregister the admin queue's interrupt */ - pci_free_irq(pdev, 0, adminq); - - /* - * If we enable msix early due to not intx, disable it again before - * setting up the full range we need. - */ - pci_free_irq_vectors(pdev); - - result = nvme_setup_irqs(dev, nr_io_queues); - if (result <= 0) - return -EIO; - - dev->num_vecs = result; - result = max(result - 1, 1); - dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; - - /* - * Should investigate if there's a performance win from allocating - * more queues than interrupt vectors; it might allow the submission - * path to scale better, even if the receive path is limited by the - * number of interrupts. 
- */ - result = queue_request_irq(adminq); - if (result) - return result; - set_bit(NVMEQ_ENABLED, &adminq->flags); - - result = nvme_create_io_queues(dev); - if (result || dev->online_queues < 2) - return result; - - if (dev->online_queues - 1 < dev->max_qid) { - nr_io_queues = dev->online_queues - 1; - nvme_disable_io_queues(dev); - nvme_suspend_io_queues(dev); - goto retry; - } - dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", - dev->io_queues[HCTX_TYPE_DEFAULT], - dev->io_queues[HCTX_TYPE_READ], - dev->io_queues[HCTX_TYPE_POLL]); - return 0; -} - -static void nvme_del_queue_end(struct request *req, blk_status_t error) -{ - struct nvme_queue *nvmeq = req->end_io_data; - - blk_mq_free_request(req); - complete(&nvmeq->delete_done); -} - -static void nvme_del_cq_end(struct request *req, blk_status_t error) -{ - struct nvme_queue *nvmeq = req->end_io_data; - - if (error) - set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); - - nvme_del_queue_end(req, error); -} - -static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) -{ - struct request_queue *q = nvmeq->dev->ctrl.admin_q; - struct request *req; - struct nvme_command cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.delete_queue.opcode = opcode; - cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - - req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->end_io_data = nvmeq; - - init_completion(&nvmeq->delete_done); - blk_execute_rq_nowait(q, NULL, req, false, - opcode == nvme_admin_delete_cq ? - nvme_del_cq_end : nvme_del_queue_end); - return 0; -} - -static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) -{ - int nr_queues = dev->online_queues - 1, sent = 0; - unsigned long timeout; - - retry: - timeout = ADMIN_TIMEOUT; - while (nr_queues > 0) { - if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) - break; - nr_queues--; - sent++; - } - while (sent) { - struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent]; - - timeout = wait_for_completion_io_timeout(&nvmeq->delete_done, - timeout); - if (timeout == 0) - return false; - - sent--; - if (nr_queues) - goto retry; - } - return true; -} - -static void nvme_dev_add(struct nvme_dev *dev) -{ - int ret; - - if (!dev->ctrl.tagset) { - dev->tagset.ops = &nvme_mq_ops; - dev->tagset.nr_hw_queues = dev->online_queues - 1; - dev->tagset.nr_maps = 2; /* default + read */ - if (dev->io_queues[HCTX_TYPE_POLL]) - dev->tagset.nr_maps++; - dev->tagset.timeout = NVME_IO_TIMEOUT; - dev->tagset.numa_node = dev->ctrl.numa_node; - dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth, - BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = sizeof(struct nvme_iod); - dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; - dev->tagset.driver_data = dev; - - /* - * Some Apple controllers requires tags to be unique - * across admin and IO queue, so reserve the first 32 - * tags of the IO queue. 
- */ - if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) - dev->tagset.reserved_tags = NVME_AQ_DEPTH; - - ret = blk_mq_alloc_tag_set(&dev->tagset); - if (ret) { - dev_warn(dev->ctrl.device, - "IO queues tagset allocation failed %d\n", ret); - return; - } - dev->ctrl.tagset = &dev->tagset; - } else { - blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); - - /* Free previously allocated queues that are no longer usable */ - nvme_free_queues(dev, dev->online_queues); - } - - nvme_dbbuf_set(dev); -} - -static int nvme_pci_enable(struct nvme_dev *dev) -{ - int result = -ENOMEM; - struct pci_dev *pdev = to_pci_dev(dev->dev); - - if (pci_enable_device_mem(pdev)) - return result; - - pci_set_master(pdev); - - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64))) - goto disable; - - if (readl(dev->bar + NVME_REG_CSTS) == -1) { - result = -ENODEV; - goto disable; - } - - /* - * Some devices and/or platforms don't advertise or work with INTx - * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll - * adjust this later. - */ - result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); - if (result < 0) - return result; - - dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - - dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, - io_queue_depth); - dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ - dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); - dev->dbs = dev->bar + 4096; - - /* - * Some Apple controllers require a non-standard SQE size. - * Interestingly they also seem to ignore the CC:IOSQES register - * so we don't bother updating it here. - */ - if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES) - dev->io_sqes = 7; - else - dev->io_sqes = NVME_NVM_IOSQES; - - /* - * Temporary fix for the Apple controller found in the MacBook8,1 and - * some MacBook7,1 to avoid controller resets and data loss. 
- */ - if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { - dev->q_depth = 2; - dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " - "set queue depth=%u to work around controller resets\n", - dev->q_depth); - } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && - (pdev->device == 0xa821 || pdev->device == 0xa822) && - NVME_CAP_MQES(dev->ctrl.cap) == 0) { - dev->q_depth = 64; - dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " - "set queue depth=%u\n", dev->q_depth); - } - - /* - * Controllers with the shared tags quirk need the IO queue to be - * big enough so that we get 32 tags for the admin queue - */ - if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) && - (dev->q_depth < (NVME_AQ_DEPTH + 2))) { - dev->q_depth = NVME_AQ_DEPTH + 2; - dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", - dev->q_depth); - } - - - nvme_map_cmb(dev); - - pci_enable_pcie_error_reporting(pdev); - pci_save_state(pdev); - return 0; - - disable: - pci_disable_device(pdev); - return result; -} - -static void nvme_dev_unmap(struct nvme_dev *dev) -{ - if (dev->bar) - iounmap(dev->bar); - pci_release_mem_regions(to_pci_dev(dev->dev)); -} - -static void nvme_pci_disable(struct nvme_dev *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev->dev); - - pci_free_irq_vectors(pdev); - - if (pci_is_enabled(pdev)) { - pci_disable_pcie_error_reporting(pdev); - pci_disable_device(pdev); - } -} - -static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) -{ - bool dead = true, freeze = false; - struct pci_dev *pdev = to_pci_dev(dev->dev); - - mutex_lock(&dev->shutdown_lock); - if (pci_is_enabled(pdev)) { - u32 csts = readl(dev->bar + NVME_REG_CSTS); - - if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) { - freeze = true; - nvme_start_freeze(&dev->ctrl); - } - dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || - pdev->error_state != pci_channel_io_normal); - } - - /* - * Give the controller a chance to complete all entered requests if - * doing a safe shutdown. - */ - if (!dead && shutdown && freeze) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - - nvme_stop_queues(&dev->ctrl); - - if (!dead && dev->ctrl.queue_count > 0) { - nvme_disable_io_queues(dev); - nvme_disable_admin_queue(dev, shutdown); - } - nvme_suspend_io_queues(dev); - nvme_suspend_queue(&dev->queues[0]); - nvme_pci_disable(dev); - nvme_reap_pending_cqes(dev); - - blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); - blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); - blk_mq_tagset_wait_completed_request(&dev->tagset); - blk_mq_tagset_wait_completed_request(&dev->admin_tagset); - - /* - * The driver will not be starting up queues again if shutting down so - * must flush all entered requests to their failed completion to avoid - * deadlocking blk-mq hot-cpu notifier. 
- */ - if (shutdown) { - nvme_start_queues(&dev->ctrl); - if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) - blk_mq_unquiesce_queue(dev->ctrl.admin_q); - } - mutex_unlock(&dev->shutdown_lock); -} - -static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) -{ - if (!nvme_wait_reset(&dev->ctrl)) - return -EBUSY; - nvme_dev_disable(dev, shutdown); - return 0; -} - -static int nvme_setup_prp_pools(struct nvme_dev *dev) -{ - dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, - NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE, 0); - if (!dev->prp_page_pool) - return -ENOMEM; - - /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, - 256, 256, 0); - if (!dev->prp_small_pool) { - dma_pool_destroy(dev->prp_page_pool); - return -ENOMEM; - } - return 0; -} - -static void nvme_release_prp_pools(struct nvme_dev *dev) -{ - dma_pool_destroy(dev->prp_page_pool); - dma_pool_destroy(dev->prp_small_pool); -} - -static void nvme_free_tagset(struct nvme_dev *dev) -{ - if (dev->tagset.tags) - blk_mq_free_tag_set(&dev->tagset); - dev->ctrl.tagset = NULL; -} - -static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) -{ - struct nvme_dev *dev = to_nvme_dev(ctrl); - - nvme_dbbuf_dma_free(dev); - nvme_free_tagset(dev); - if (dev->ctrl.admin_q) - blk_put_queue(dev->ctrl.admin_q); - free_opal_dev(dev->ctrl.opal_dev); - mempool_destroy(dev->iod_mempool); - put_device(dev->dev); - kfree(dev->queues); - kfree(dev); -} - -static void nvme_remove_dead_ctrl(struct nvme_dev *dev) -{ - /* - * Set state to deleting now to avoid blocking nvme_wait_reset(), which - * may be holding this pci_dev's device lock. - */ - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - nvme_get_ctrl(&dev->ctrl); - nvme_dev_disable(dev, false); - nvme_kill_queues(&dev->ctrl); - if (!queue_work(nvme_wq, &dev->remove_work)) - nvme_put_ctrl(&dev->ctrl); -} - -static void nvme_reset_work(struct work_struct *work) -{ - struct nvme_dev *dev = - container_of(work, struct nvme_dev, ctrl.reset_work); - bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); - int result; - - if (dev->ctrl.state != NVME_CTRL_RESETTING) { - dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", - dev->ctrl.state); - result = -ENODEV; - goto out; - } - - /* - * If we're called to reset a live controller first shut it down before - * moving on. - */ - if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) - nvme_dev_disable(dev, false); - nvme_sync_queues(&dev->ctrl); - - mutex_lock(&dev->shutdown_lock); - result = nvme_pci_enable(dev); - if (result) - goto out_unlock; - - result = nvme_pci_configure_admin_queue(dev); - if (result) - goto out_unlock; - - result = nvme_alloc_admin_tags(dev); - if (result) - goto out_unlock; - - /* - * Limit the max command size to prevent iod->sg allocations going - * over a single page. - */ - dev->ctrl.max_hw_sectors = min_t(u32, - NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); - dev->ctrl.max_segments = NVME_MAX_SEGS; - - /* - * Don't limit the IOMMU merged segment size. - */ - dma_set_max_seg_size(dev->dev, 0xffffffff); - dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); - - mutex_unlock(&dev->shutdown_lock); - - /* - * Introduce CONNECTING state from nvme-fc/rdma transports to mark the - * initializing procedure here. 
- */ - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { - dev_warn(dev->ctrl.device, - "failed to mark controller CONNECTING\n"); - result = -EBUSY; - goto out; - } - - /* - * We do not support an SGL for metadata (yet), so we are limited to a - * single integrity segment for the separate metadata pointer. - */ - dev->ctrl.max_integrity_segments = 1; - - result = nvme_init_identify(&dev->ctrl); - if (result) - goto out; - - if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) { - if (!dev->ctrl.opal_dev) - dev->ctrl.opal_dev = - init_opal_dev(&dev->ctrl, &nvme_sec_submit); - else if (was_suspend) - opal_unlock_from_suspend(dev->ctrl.opal_dev); - } else { - free_opal_dev(dev->ctrl.opal_dev); - dev->ctrl.opal_dev = NULL; - } - - if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { - result = nvme_dbbuf_dma_alloc(dev); - if (result) - dev_warn(dev->dev, - "unable to allocate dma for dbbuf\n"); - } - - if (dev->ctrl.hmpre) { - result = nvme_setup_host_mem(dev); - if (result < 0) - goto out; - } - - result = nvme_setup_io_queues(dev); - if (result) - goto out; - - /* - * Keep the controller around but remove all namespaces if we don't have - * any working I/O queue. - */ - if (dev->online_queues < 2) { - dev_warn(dev->ctrl.device, "IO queues not created\n"); - nvme_kill_queues(&dev->ctrl); - nvme_remove_namespaces(&dev->ctrl); - nvme_free_tagset(dev); - } else { - nvme_start_queues(&dev->ctrl); - nvme_wait_freeze(&dev->ctrl); - nvme_dev_add(dev); - nvme_unfreeze(&dev->ctrl); - } - - /* - * If only admin queue live, keep it to do further investigation or - * recovery. - */ - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { - dev_warn(dev->ctrl.device, - "failed to mark controller live state\n"); - result = -ENODEV; - goto out; - } - - nvme_start_ctrl(&dev->ctrl); - return; - - out_unlock: - mutex_unlock(&dev->shutdown_lock); - out: - if (result) - dev_warn(dev->ctrl.device, - "Removing after probe failure status: %d\n", result); - nvme_remove_dead_ctrl(dev); -} - -static void nvme_remove_dead_ctrl_work(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); - struct pci_dev *pdev = to_pci_dev(dev->dev); - - if (pci_get_drvdata(pdev)) - device_release_driver(&pdev->dev); - nvme_put_ctrl(&dev->ctrl); -} - -static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) -{ - *val = readl(to_nvme_dev(ctrl)->bar + off); - return 0; -} - -static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) -{ - writel(val, to_nvme_dev(ctrl)->bar + off); - return 0; -} - -static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) -{ - *val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off); - return 0; -} - -static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size) -{ - struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev); - - return snprintf(buf, size, "%s\n", dev_name(&pdev->dev)); -} - -static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { - .name = "pcie", - .module = THIS_MODULE, - .flags = NVME_F_METADATA_SUPPORTED | - NVME_F_PCI_P2PDMA, - .reg_read32 = nvme_pci_reg_read32, - .reg_write32 = nvme_pci_reg_write32, - .reg_read64 = nvme_pci_reg_read64, - .free_ctrl = nvme_pci_free_ctrl, - .submit_async_event = nvme_pci_submit_async_event, - .get_address = nvme_pci_get_address, -}; - -static int nvme_dev_map(struct nvme_dev *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev->dev); - - if (pci_request_mem_regions(pdev, "nvme")) - return -ENODEV; - - if (nvme_remap_bar(dev, NVME_REG_DBS + 
4096)) - goto release; - - return 0; - release: - pci_release_mem_regions(pdev); - return -ENODEV; -} - -static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) -{ - if (pdev->vendor == 0x144d && pdev->device == 0xa802) { - /* - * Several Samsung devices seem to drop off the PCIe bus - * randomly when APST is on and uses the deepest sleep state. - * This has been observed on a Samsung "SM951 NVMe SAMSUNG - * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD - * 950 PRO 256GB", but it seems to be restricted to two Dell - * laptops. - */ - if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") && - (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") || - dmi_match(DMI_PRODUCT_NAME, "Precision 5510"))) - return NVME_QUIRK_NO_DEEPEST_PS; - } else if (pdev->vendor == 0x144d && pdev->device == 0xa804) { - /* - * Samsung SSD 960 EVO drops off the PCIe bus after system - * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as - * within few minutes after bootup on a Coffee Lake board - - * ASUS PRIME Z370-A - */ - if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") && - (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") || - dmi_match(DMI_BOARD_NAME, "PRIME Z370-A"))) - return NVME_QUIRK_NO_APST; - } else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 || - pdev->device == 0xa808 || pdev->device == 0xa809)) || - (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) { - /* - * Forcing to use host managed nvme power settings for - * lowest idle power with quick resume latency on - * Samsung and Toshiba SSDs based on suspend behavior - * on Coffee Lake board for LENOVO C640 - */ - if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) && - dmi_match(DMI_BOARD_NAME, "LNVNB161216")) - return NVME_QUIRK_SIMPLE_SUSPEND; - } - - return 0; -} - -#ifdef CONFIG_ACPI -static bool nvme_acpi_storage_d3(struct pci_dev *dev) -{ - struct acpi_device *adev = ACPI_COMPANION(&dev->dev); - u8 val; - - /* - * Look for _DSD property specifying that the storage device on the port - * must use D3 to support deep platform power savings during - * suspend-to-idle. 
- */ - - if (!adev) - return false; - if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable", - &val)) - return false; - return val == 1; -} -#else -static inline bool nvme_acpi_storage_d3(struct pci_dev *dev) -{ - return false; -} -#endif /* CONFIG_ACPI */ - -static void nvme_async_probe(void *data, async_cookie_t cookie) -{ - struct nvme_dev *dev = data; - - flush_work(&dev->ctrl.reset_work); - flush_work(&dev->ctrl.scan_work); - nvme_put_ctrl(&dev->ctrl); -} - -static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - int node, result = -ENOMEM; - struct nvme_dev *dev; - unsigned long quirks = id->driver_data; - size_t alloc_size; - - node = dev_to_node(&pdev->dev); - if (node == NUMA_NO_NODE) - set_dev_node(&pdev->dev, first_memory_node); - - dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); - if (!dev) - return -ENOMEM; - - dev->nr_write_queues = write_queues; - dev->nr_poll_queues = poll_queues; - dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1; - dev->queues = kcalloc_node(dev->nr_allocated_queues, - sizeof(struct nvme_queue), GFP_KERNEL, node); - if (!dev->queues) - goto free; - - dev->dev = get_device(&pdev->dev); - pci_set_drvdata(pdev, dev); - - result = nvme_dev_map(dev); - if (result) - goto put_pci; - - INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); - INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - mutex_init(&dev->shutdown_lock); - - result = nvme_setup_prp_pools(dev); - if (result) - goto unmap; - - quirks |= check_vendor_combination_bug(pdev); - - if (!noacpi && nvme_acpi_storage_d3(pdev)) { - /* - * Some systems use a bios work around to ask for D3 on - * platforms that support kernel managed suspend. - */ - dev_info(&pdev->dev, - "platform quirk: setting simple suspend\n"); - quirks |= NVME_QUIRK_SIMPLE_SUSPEND; - } - - /* - * Double check that our mempool alloc size will cover the biggest - * command we support. - */ - alloc_size = nvme_pci_iod_alloc_size(); - WARN_ON_ONCE(alloc_size > PAGE_SIZE); - - dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, - mempool_kfree, - (void *) alloc_size, - GFP_KERNEL, node); - if (!dev->iod_mempool) { - result = -ENOMEM; - goto release_pools; - } - - result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, - quirks); - if (result) - goto release_mempool; - - dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - - nvme_reset_ctrl(&dev->ctrl); - async_schedule(nvme_async_probe, dev); - - return 0; - - release_mempool: - mempool_destroy(dev->iod_mempool); - release_pools: - nvme_release_prp_pools(dev); - unmap: - nvme_dev_unmap(dev); - put_pci: - put_device(dev->dev); - free: - kfree(dev->queues); - kfree(dev); - return result; -} - -static void nvme_reset_prepare(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - - /* - * We don't need to check the return value from waiting for the reset - * state as pci_dev device lock is held, making it impossible to race - * with ->remove(). - */ - nvme_disable_prepare_reset(dev, false); - nvme_sync_queues(&dev->ctrl); -} - -static void nvme_reset_done(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - - if (!nvme_try_sched_reset(&dev->ctrl)) - flush_work(&dev->ctrl.reset_work); -} - -static void nvme_shutdown(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - - nvme_disable_prepare_reset(dev, true); -} - -/* - * The driver's remove may be called on a device in a partially initialized - * state. 
This function must not have any dependencies on the device state in - * order to proceed. - */ -static void nvme_remove(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - pci_set_drvdata(pdev, NULL); - - if (!pci_device_is_present(pdev)) { - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); - nvme_dev_disable(dev, true); - } - - flush_work(&dev->ctrl.reset_work); - nvme_stop_ctrl(&dev->ctrl); - nvme_remove_namespaces(&dev->ctrl); - nvme_dev_disable(dev, true); - nvme_release_cmb(dev); - nvme_free_host_mem(dev); - nvme_dev_remove_admin(dev); - nvme_free_queues(dev, 0); - nvme_release_prp_pools(dev); - nvme_dev_unmap(dev); - nvme_uninit_ctrl(&dev->ctrl); -} - -#ifdef CONFIG_PM_SLEEP -static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps) -{ - return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps); -} - -static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps) -{ - return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL); -} - -static int nvme_resume(struct device *dev) -{ - struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); - struct nvme_ctrl *ctrl = &ndev->ctrl; - - if (ndev->last_ps == U32_MAX || - nvme_set_power_state(ctrl, ndev->last_ps) != 0) - return nvme_try_sched_reset(&ndev->ctrl); - return 0; -} - -static int nvme_suspend(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct nvme_dev *ndev = pci_get_drvdata(pdev); - struct nvme_ctrl *ctrl = &ndev->ctrl; - int ret = -EBUSY; - - ndev->last_ps = U32_MAX; - - /* - * The platform does not remove power for a kernel managed suspend so - * use host managed nvme power settings for lowest idle power if - * possible. This should have quicker resume latency than a full device - * shutdown. But if the firmware is involved after the suspend or the - * device does not support any non-default power states, shut down the - * device fully. - * - * If ASPM is not enabled for the device, shut down the device and allow - * the PCI bus layer to put it into D3 in order to take the PCIe link - * down, so as to allow the platform to achieve its minimum low-power - * state (which may not be possible if the link is up). - * - * If a host memory buffer is enabled, shut down the device as the NVMe - * specification allows the device to access the host memory buffer in - * host DRAM from all power states, but hosts will fail access to DRAM - * during S3. - */ - if (pm_suspend_via_firmware() || !ctrl->npss || - !pcie_aspm_enabled(pdev) || - ndev->nr_host_mem_descs || - (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) - return nvme_disable_prepare_reset(ndev, true); - - nvme_start_freeze(ctrl); - nvme_wait_freeze(ctrl); - nvme_sync_queues(ctrl); - - if (ctrl->state != NVME_CTRL_LIVE) - goto unfreeze; - - ret = nvme_get_power_state(ctrl, &ndev->last_ps); - if (ret < 0) - goto unfreeze; - - /* - * A saved state prevents pci pm from generically controlling the - * device's power. If we're using protocol specific settings, we don't - * want pci interfering. - */ - pci_save_state(pdev); - - ret = nvme_set_power_state(ctrl, ctrl->npss); - if (ret < 0) - goto unfreeze; - - if (ret) { - /* discard the saved state */ - pci_load_saved_state(pdev, NULL); - - /* - * Clearing npss forces a controller reset on resume. The - * correct value will be rediscovered then. 
- */ - ret = nvme_disable_prepare_reset(ndev, true); - ctrl->npss = 0; - } -unfreeze: - nvme_unfreeze(ctrl); - return ret; -} - -static int nvme_simple_suspend(struct device *dev) -{ - struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); - - return nvme_disable_prepare_reset(ndev, true); -} - -static int nvme_simple_resume(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct nvme_dev *ndev = pci_get_drvdata(pdev); - - return nvme_try_sched_reset(&ndev->ctrl); -} - -static const struct dev_pm_ops nvme_dev_pm_ops = { - .suspend = nvme_suspend, - .resume = nvme_resume, - .freeze = nvme_simple_suspend, - .thaw = nvme_simple_resume, - .poweroff = nvme_simple_suspend, - .restore = nvme_simple_resume, -}; -#endif /* CONFIG_PM_SLEEP */ - -static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, - pci_channel_state_t state) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - - /* - * A frozen channel requires a reset. When detected, this method will - * shutdown the controller to quiesce. The controller will be restarted - * after the slot reset through driver's slot_reset callback. - */ - switch (state) { - case pci_channel_io_normal: - return PCI_ERS_RESULT_CAN_RECOVER; - case pci_channel_io_frozen: - dev_warn(dev->ctrl.device, - "frozen state error detected, reset controller\n"); - nvme_dev_disable(dev, false); - return PCI_ERS_RESULT_NEED_RESET; - case pci_channel_io_perm_failure: - dev_warn(dev->ctrl.device, - "failure state error detected, request disconnect\n"); - return PCI_ERS_RESULT_DISCONNECT; - } - return PCI_ERS_RESULT_NEED_RESET; -} - -static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - - dev_info(dev->ctrl.device, "restart after slot reset\n"); - pci_restore_state(pdev); - nvme_reset_ctrl(&dev->ctrl); - return PCI_ERS_RESULT_RECOVERED; -} - -static void nvme_error_resume(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - - flush_work(&dev->ctrl.reset_work); -} - -static const struct pci_error_handlers nvme_err_handler = { - .error_detected = nvme_error_detected, - .slot_reset = nvme_slot_reset, - .resume = nvme_error_resume, - .reset_prepare = nvme_reset_prepare, - .reset_done = nvme_reset_done, -}; - -static const struct pci_device_id nvme_id_table[] = { - { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */ - .driver_data = NVME_QUIRK_STRIPE_SIZE | - NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */ - .driver_data = NVME_QUIRK_STRIPE_SIZE | - NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ - .driver_data = NVME_QUIRK_STRIPE_SIZE | - NVME_QUIRK_DEALLOCATE_ZEROES | - NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ - .driver_data = NVME_QUIRK_STRIPE_SIZE | - NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ - .driver_data = NVME_QUIRK_NO_DEEPEST_PS | - NVME_QUIRK_MEDIUM_PRIO_SQ | - NVME_QUIRK_NO_TEMP_THRESH_CHANGE | - NVME_QUIRK_DISABLE_WRITE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */ - .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ - .driver_data = NVME_QUIRK_IDENTIFY_CNS | - NVME_QUIRK_DISABLE_WRITE_ZEROES | - NVME_QUIRK_BOGUS_NID, }, - { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ - .driver_data = NVME_QUIRK_BOGUS_NID, }, - { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ - 
.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, }, - { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | - NVME_QUIRK_NO_NS_DESC_LIST, }, - { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, - { PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */ - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, - { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, - { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, - { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | - NVME_QUIRK_DISABLE_WRITE_ZEROES| - NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */ - .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | - NVME_QUIRK_BOGUS_NID, }, - { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */ - .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | - NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ - .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | - NVME_QUIRK_BOGUS_NID, }, - { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ - .driver_data = NVME_QUIRK_NO_DEEPEST_PS | - NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */ - .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, - { PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */ - .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, - { PCI_DEVICE(0x2646, 0x2262), /* KINGSTON SKC2000 NVMe SSD */ - .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, - { PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */ - .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, - { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), - .driver_data = NVME_QUIRK_SINGLE_VECTOR }, - { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, - { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), - .driver_data = NVME_QUIRK_SINGLE_VECTOR | - NVME_QUIRK_128_BYTES_SQES | - NVME_QUIRK_SHARED_TAGS | - NVME_QUIRK_SKIP_CID_GEN }, - { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, - { 0, } -}; -MODULE_DEVICE_TABLE(pci, nvme_id_table); - -static struct pci_driver nvme_driver = { - .name = "nvme", - .id_table = nvme_id_table, - .probe = nvme_probe, - .remove = nvme_remove, - .shutdown = nvme_shutdown, -#ifdef CONFIG_PM_SLEEP - .driver = { - .pm = &nvme_dev_pm_ops, - }, -#endif - .sriov_configure = pci_sriov_configure_simple, - .err_handler = &nvme_err_handler, -}; - -static int __init nvme_init(void) -{ - BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); - BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); - BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); - BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - - return pci_register_driver(&nvme_driver); -} - -static void __exit nvme_exit(void) -{ - pci_unregister_driver(&nvme_driver); - flush_workqueue(nvme_wq); -} - -MODULE_AUTHOR("Matthew Wilcox "); -MODULE_LICENSE("GPL"); -MODULE_VERSION("1.0"); -module_init(nvme_init); -module_exit(nvme_exit); diff --git a/feed/kmod-nvme/src/trace.h b/feed/kmod-nvme/src/trace.h deleted file mode 100644 index aa8b0f8..0000000 --- a/feed/kmod-nvme/src/trace.h +++ /dev/null @@ -1,175 +0,0 @@ -/* SPDX-License-Identifier: 
GPL-2.0 */ -/* - * NVM Express device driver tracepoints - * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH - */ - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM nvme - -#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_NVME_H - -#include -#include -#include - -#include "nvme.h" - -const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, - u8 *cdw10); -const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, - u8 *cdw10); -const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype, - u8 *spc); - -#define parse_nvme_cmd(qid, opcode, fctype, cdw10) \ - ((opcode) == nvme_fabrics_command ? \ - nvme_trace_parse_fabrics_cmd(p, fctype, cdw10) : \ - ((qid) ? \ - nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ - nvme_trace_parse_admin_cmd(p, opcode, cdw10))) - -const char *nvme_trace_disk_name(struct trace_seq *p, char *name); -#define __print_disk_name(name) \ - nvme_trace_disk_name(p, name) - -#ifndef TRACE_HEADER_MULTI_READ -static inline void __assign_disk_name(char *name, struct gendisk *disk) -{ - if (disk) - memcpy(name, disk->disk_name, DISK_NAME_LEN); - else - memset(name, 0, DISK_NAME_LEN); -} -#endif - -TRACE_EVENT(nvme_setup_cmd, - TP_PROTO(struct request *req, struct nvme_command *cmd), - TP_ARGS(req, cmd), - TP_STRUCT__entry( - __array(char, disk, DISK_NAME_LEN) - __field(int, ctrl_id) - __field(int, qid) - __field(u8, opcode) - __field(u8, flags) - __field(u8, fctype) - __field(u16, cid) - __field(u32, nsid) - __field(bool, metadata) - __array(u8, cdw10, 24) - ), - TP_fast_assign( - __entry->ctrl_id = nvme_req(req)->ctrl->instance; - __entry->qid = nvme_req_qid(req); - __entry->opcode = cmd->common.opcode; - __entry->flags = cmd->common.flags; - __entry->cid = cmd->common.command_id; - __entry->nsid = le32_to_cpu(cmd->common.nsid); - __entry->metadata = !!blk_integrity_rq(req); - __entry->fctype = cmd->fabrics.fctype; - __assign_disk_name(__entry->disk, req->rq_disk); - memcpy(__entry->cdw10, &cmd->common.cdw10, - sizeof(__entry->cdw10)); - ), - TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%x, cmd=(%s %s)", - __entry->ctrl_id, __print_disk_name(__entry->disk), - __entry->qid, __entry->cid, __entry->nsid, - __entry->flags, __entry->metadata, - show_opcode_name(__entry->qid, __entry->opcode, - __entry->fctype), - parse_nvme_cmd(__entry->qid, __entry->opcode, - __entry->fctype, __entry->cdw10)) -); - -TRACE_EVENT(nvme_complete_rq, - TP_PROTO(struct request *req), - TP_ARGS(req), - TP_STRUCT__entry( - __array(char, disk, DISK_NAME_LEN) - __field(int, ctrl_id) - __field(int, qid) - __field(int, cid) - __field(u64, result) - __field(u8, retries) - __field(u8, flags) - __field(u16, status) - ), - TP_fast_assign( - __entry->ctrl_id = nvme_req(req)->ctrl->instance; - __entry->qid = nvme_req_qid(req); - __entry->cid = nvme_req(req)->cmd->common.command_id; - __entry->result = le64_to_cpu(nvme_req(req)->result.u64); - __entry->retries = nvme_req(req)->retries; - __entry->flags = nvme_req(req)->flags; - __entry->status = nvme_req(req)->status; - __assign_disk_name(__entry->disk, req->rq_disk); - ), - TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%#llx, retries=%u, flags=0x%x, status=%#x", - __entry->ctrl_id, __print_disk_name(__entry->disk), - __entry->qid, __entry->cid, __entry->result, - __entry->retries, __entry->flags, __entry->status) - -); - -#define aer_name(aer) { aer, #aer } - -TRACE_EVENT(nvme_async_event, - TP_PROTO(struct nvme_ctrl *ctrl, u32 result), - TP_ARGS(ctrl, result), - 
TP_STRUCT__entry( - __field(int, ctrl_id) - __field(u32, result) - ), - TP_fast_assign( - __entry->ctrl_id = ctrl->instance; - __entry->result = result; - ), - TP_printk("nvme%d: NVME_AEN=%#08x [%s]", - __entry->ctrl_id, __entry->result, - __print_symbolic(__entry->result, - aer_name(NVME_AER_NOTICE_NS_CHANGED), - aer_name(NVME_AER_NOTICE_ANA), - aer_name(NVME_AER_NOTICE_FW_ACT_STARTING), - aer_name(NVME_AER_NOTICE_DISC_CHANGED), - aer_name(NVME_AER_ERROR), - aer_name(NVME_AER_SMART), - aer_name(NVME_AER_CSS), - aer_name(NVME_AER_VS)) - ) -); - -#undef aer_name - -TRACE_EVENT(nvme_sq, - TP_PROTO(struct request *req, __le16 sq_head, int sq_tail), - TP_ARGS(req, sq_head, sq_tail), - TP_STRUCT__entry( - __field(int, ctrl_id) - __array(char, disk, DISK_NAME_LEN) - __field(int, qid) - __field(u16, sq_head) - __field(u16, sq_tail) - ), - TP_fast_assign( - __entry->ctrl_id = nvme_req(req)->ctrl->instance; - __assign_disk_name(__entry->disk, req->rq_disk); - __entry->qid = nvme_req_qid(req); - __entry->sq_head = le16_to_cpu(sq_head); - __entry->sq_tail = sq_tail; - ), - TP_printk("nvme%d: %sqid=%d, head=%u, tail=%u", - __entry->ctrl_id, __print_disk_name(__entry->disk), - __entry->qid, __entry->sq_head, __entry->sq_tail - ) -); - -#endif /* _TRACE_NVME_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include